{ "best_metric": 0.427830308675766, "best_model_checkpoint": "../checkpoints/iccad-contest-results/checkpoint-110000", "epoch": 4.991530615922442, "eval_steps": 500, "global_step": 138500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0001801996612246369, "grad_norm": 0.279486745595932, "learning_rate": 6.005284650492435e-08, "loss": 1.012, "step": 5 }, { "epoch": 0.0003603993224492738, "grad_norm": 0.22080670297145844, "learning_rate": 1.201056930098487e-07, "loss": 0.9384, "step": 10 }, { "epoch": 0.0005405989836739107, "grad_norm": 0.16033534705638885, "learning_rate": 1.80158539514773e-07, "loss": 0.993, "step": 15 }, { "epoch": 0.0007207986448985476, "grad_norm": 0.21021625399589539, "learning_rate": 2.402113860196974e-07, "loss": 0.9073, "step": 20 }, { "epoch": 0.0009009983061231845, "grad_norm": 0.1960284262895584, "learning_rate": 3.002642325246217e-07, "loss": 0.9033, "step": 25 }, { "epoch": 0.0010811979673478213, "grad_norm": 0.31484127044677734, "learning_rate": 3.60317079029546e-07, "loss": 1.0241, "step": 30 }, { "epoch": 0.0012613976285724582, "grad_norm": 0.35444363951683044, "learning_rate": 4.2036992553447037e-07, "loss": 0.9982, "step": 35 }, { "epoch": 0.0014415972897970951, "grad_norm": 0.25069698691368103, "learning_rate": 4.804227720393948e-07, "loss": 0.9843, "step": 40 }, { "epoch": 0.001621796951021732, "grad_norm": 0.23986054956912994, "learning_rate": 5.40475618544319e-07, "loss": 0.9416, "step": 45 }, { "epoch": 0.001801996612246369, "grad_norm": 0.2893970012664795, "learning_rate": 6.005284650492434e-07, "loss": 0.9448, "step": 50 }, { "epoch": 0.0019821962734710057, "grad_norm": 0.19641688466072083, "learning_rate": 6.605813115541676e-07, "loss": 0.8997, "step": 55 }, { "epoch": 0.0021623959346956426, "grad_norm": 0.2509602904319763, "learning_rate": 7.20634158059092e-07, "loss": 0.9826, "step": 60 }, { "epoch": 0.0023425955959202795, "grad_norm": 
0.25828176736831665, "learning_rate": 7.806870045640163e-07, "loss": 0.9222, "step": 65 }, { "epoch": 0.0025227952571449165, "grad_norm": 0.19463542103767395, "learning_rate": 8.407398510689407e-07, "loss": 0.8906, "step": 70 }, { "epoch": 0.0027029949183695534, "grad_norm": 0.1942373514175415, "learning_rate": 9.007926975738651e-07, "loss": 0.9204, "step": 75 }, { "epoch": 0.0028831945795941903, "grad_norm": 0.2071942389011383, "learning_rate": 9.608455440787895e-07, "loss": 0.9334, "step": 80 }, { "epoch": 0.0030633942408188272, "grad_norm": 0.2774100601673126, "learning_rate": 1.0208983905837138e-06, "loss": 0.9543, "step": 85 }, { "epoch": 0.003243593902043464, "grad_norm": 0.21016931533813477, "learning_rate": 1.080951237088638e-06, "loss": 0.9632, "step": 90 }, { "epoch": 0.003423793563268101, "grad_norm": 0.24393318593502045, "learning_rate": 1.1410040835935624e-06, "loss": 0.8989, "step": 95 }, { "epoch": 0.003603993224492738, "grad_norm": 0.22489190101623535, "learning_rate": 1.2010569300984867e-06, "loss": 0.9678, "step": 100 }, { "epoch": 0.003784192885717375, "grad_norm": 0.2544313669204712, "learning_rate": 1.261109776603411e-06, "loss": 0.9295, "step": 105 }, { "epoch": 0.003964392546942011, "grad_norm": 0.21903349459171295, "learning_rate": 1.3211626231083353e-06, "loss": 0.9345, "step": 110 }, { "epoch": 0.004144592208166649, "grad_norm": 0.2654012143611908, "learning_rate": 1.3812154696132596e-06, "loss": 0.9683, "step": 115 }, { "epoch": 0.004324791869391285, "grad_norm": 0.25590232014656067, "learning_rate": 1.441268316118184e-06, "loss": 0.9614, "step": 120 }, { "epoch": 0.004504991530615923, "grad_norm": 0.25431954860687256, "learning_rate": 1.5013211626231084e-06, "loss": 1.0293, "step": 125 }, { "epoch": 0.004685191191840559, "grad_norm": 0.18761098384857178, "learning_rate": 1.5613740091280327e-06, "loss": 0.8426, "step": 130 }, { "epoch": 0.004865390853065196, "grad_norm": 0.21217921376228333, "learning_rate": 1.621426855632957e-06, "loss": 
0.9276, "step": 135 }, { "epoch": 0.005045590514289833, "grad_norm": 0.23869900405406952, "learning_rate": 1.6814797021378815e-06, "loss": 0.8987, "step": 140 }, { "epoch": 0.00522579017551447, "grad_norm": 0.18920210003852844, "learning_rate": 1.7415325486428058e-06, "loss": 0.9036, "step": 145 }, { "epoch": 0.005405989836739107, "grad_norm": 0.2015441209077835, "learning_rate": 1.8015853951477303e-06, "loss": 0.8491, "step": 150 }, { "epoch": 0.005586189497963744, "grad_norm": 0.18248215317726135, "learning_rate": 1.8616382416526544e-06, "loss": 0.9322, "step": 155 }, { "epoch": 0.005766389159188381, "grad_norm": 0.23022598028182983, "learning_rate": 1.921691088157579e-06, "loss": 0.8488, "step": 160 }, { "epoch": 0.005946588820413018, "grad_norm": 0.18739941716194153, "learning_rate": 1.981743934662503e-06, "loss": 0.8359, "step": 165 }, { "epoch": 0.0061267884816376544, "grad_norm": 0.21552830934524536, "learning_rate": 2.0417967811674277e-06, "loss": 0.8829, "step": 170 }, { "epoch": 0.006306988142862292, "grad_norm": 0.15109090507030487, "learning_rate": 2.101849627672352e-06, "loss": 0.7759, "step": 175 }, { "epoch": 0.006487187804086928, "grad_norm": 0.2417735457420349, "learning_rate": 2.161902474177276e-06, "loss": 0.9047, "step": 180 }, { "epoch": 0.006667387465311566, "grad_norm": 0.23814047873020172, "learning_rate": 2.2219553206822005e-06, "loss": 0.8425, "step": 185 }, { "epoch": 0.006847587126536202, "grad_norm": 0.23794889450073242, "learning_rate": 2.282008167187125e-06, "loss": 0.7744, "step": 190 }, { "epoch": 0.007027786787760839, "grad_norm": 0.18542109429836273, "learning_rate": 2.342061013692049e-06, "loss": 0.7658, "step": 195 }, { "epoch": 0.007207986448985476, "grad_norm": 0.18955494463443756, "learning_rate": 2.4021138601969734e-06, "loss": 0.8884, "step": 200 }, { "epoch": 0.0073881861102101124, "grad_norm": 0.17132216691970825, "learning_rate": 2.4621667067018977e-06, "loss": 0.8408, "step": 205 }, { "epoch": 0.00756838577143475, 
"grad_norm": 0.20857086777687073, "learning_rate": 2.522219553206822e-06, "loss": 0.826, "step": 210 }, { "epoch": 0.007748585432659386, "grad_norm": 0.23217755556106567, "learning_rate": 2.5822723997117463e-06, "loss": 0.7692, "step": 215 }, { "epoch": 0.007928785093884023, "grad_norm": 0.26247748732566833, "learning_rate": 2.6423252462166706e-06, "loss": 0.758, "step": 220 }, { "epoch": 0.008108984755108661, "grad_norm": 0.1590353399515152, "learning_rate": 2.7023780927215953e-06, "loss": 0.7827, "step": 225 }, { "epoch": 0.008289184416333297, "grad_norm": 0.1754877269268036, "learning_rate": 2.762430939226519e-06, "loss": 0.8534, "step": 230 }, { "epoch": 0.008469384077557934, "grad_norm": 0.2057289332151413, "learning_rate": 2.822483785731444e-06, "loss": 0.7856, "step": 235 }, { "epoch": 0.00864958373878257, "grad_norm": 0.15028229355812073, "learning_rate": 2.882536632236368e-06, "loss": 0.7429, "step": 240 }, { "epoch": 0.008829783400007209, "grad_norm": 0.20948627591133118, "learning_rate": 2.9425894787412925e-06, "loss": 0.7745, "step": 245 }, { "epoch": 0.009009983061231845, "grad_norm": 0.2172427922487259, "learning_rate": 3.0026423252462168e-06, "loss": 0.7531, "step": 250 }, { "epoch": 0.009190182722456482, "grad_norm": 0.2265809327363968, "learning_rate": 3.062695171751141e-06, "loss": 0.8168, "step": 255 }, { "epoch": 0.009370382383681118, "grad_norm": 0.216866135597229, "learning_rate": 3.1227480182560654e-06, "loss": 0.7149, "step": 260 }, { "epoch": 0.009550582044905756, "grad_norm": 0.21732446551322937, "learning_rate": 3.1828008647609896e-06, "loss": 0.6542, "step": 265 }, { "epoch": 0.009730781706130393, "grad_norm": 0.21542949974536896, "learning_rate": 3.242853711265914e-06, "loss": 0.7051, "step": 270 }, { "epoch": 0.00991098136735503, "grad_norm": 0.19147822260856628, "learning_rate": 3.3029065577708387e-06, "loss": 0.7055, "step": 275 }, { "epoch": 0.010091181028579666, "grad_norm": 0.24729080498218536, "learning_rate": 
3.362959404275763e-06, "loss": 0.7251, "step": 280 }, { "epoch": 0.010271380689804302, "grad_norm": 0.21902547776699066, "learning_rate": 3.423012250780687e-06, "loss": 0.7087, "step": 285 }, { "epoch": 0.01045158035102894, "grad_norm": 0.1997164785861969, "learning_rate": 3.4830650972856115e-06, "loss": 0.7187, "step": 290 }, { "epoch": 0.010631780012253577, "grad_norm": 0.16925355792045593, "learning_rate": 3.543117943790536e-06, "loss": 0.6483, "step": 295 }, { "epoch": 0.010811979673478214, "grad_norm": 0.11117614805698395, "learning_rate": 3.6031707902954605e-06, "loss": 0.6396, "step": 300 }, { "epoch": 0.01099217933470285, "grad_norm": 0.15845130383968353, "learning_rate": 3.6632236368003844e-06, "loss": 0.6586, "step": 305 }, { "epoch": 0.011172378995927488, "grad_norm": 0.15920360386371613, "learning_rate": 3.7232764833053087e-06, "loss": 0.6414, "step": 310 }, { "epoch": 0.011352578657152125, "grad_norm": 0.1967982053756714, "learning_rate": 3.7833293298102334e-06, "loss": 0.6027, "step": 315 }, { "epoch": 0.011532778318376761, "grad_norm": 0.1459701806306839, "learning_rate": 3.843382176315158e-06, "loss": 0.6115, "step": 320 }, { "epoch": 0.011712977979601398, "grad_norm": 0.1541188508272171, "learning_rate": 3.903435022820082e-06, "loss": 0.6935, "step": 325 }, { "epoch": 0.011893177640826036, "grad_norm": 0.15788233280181885, "learning_rate": 3.963487869325006e-06, "loss": 0.6226, "step": 330 }, { "epoch": 0.012073377302050672, "grad_norm": 0.15022051334381104, "learning_rate": 4.023540715829931e-06, "loss": 0.6914, "step": 335 }, { "epoch": 0.012253576963275309, "grad_norm": 0.16778388619422913, "learning_rate": 4.083593562334855e-06, "loss": 0.6599, "step": 340 }, { "epoch": 0.012433776624499945, "grad_norm": 0.154036745429039, "learning_rate": 4.143646408839779e-06, "loss": 0.6181, "step": 345 }, { "epoch": 0.012613976285724584, "grad_norm": 0.15167488157749176, "learning_rate": 4.203699255344704e-06, "loss": 0.6736, "step": 350 }, { "epoch": 
0.01279417594694922, "grad_norm": 0.1559118628501892, "learning_rate": 4.263752101849628e-06, "loss": 0.6087, "step": 355 }, { "epoch": 0.012974375608173857, "grad_norm": 0.11976916342973709, "learning_rate": 4.323804948354552e-06, "loss": 0.6182, "step": 360 }, { "epoch": 0.013154575269398493, "grad_norm": 0.12271628528833389, "learning_rate": 4.383857794859476e-06, "loss": 0.594, "step": 365 }, { "epoch": 0.013334774930623131, "grad_norm": 0.16896136105060577, "learning_rate": 4.443910641364401e-06, "loss": 0.6635, "step": 370 }, { "epoch": 0.013514974591847768, "grad_norm": 0.1389530748128891, "learning_rate": 4.503963487869326e-06, "loss": 0.62, "step": 375 }, { "epoch": 0.013695174253072404, "grad_norm": 0.1768377125263214, "learning_rate": 4.56401633437425e-06, "loss": 0.6179, "step": 380 }, { "epoch": 0.01387537391429704, "grad_norm": 0.12114609777927399, "learning_rate": 4.6240691808791735e-06, "loss": 0.5923, "step": 385 }, { "epoch": 0.014055573575521677, "grad_norm": 0.13337160646915436, "learning_rate": 4.684122027384098e-06, "loss": 0.6363, "step": 390 }, { "epoch": 0.014235773236746315, "grad_norm": 0.12390133738517761, "learning_rate": 4.744174873889023e-06, "loss": 0.5967, "step": 395 }, { "epoch": 0.014415972897970952, "grad_norm": 0.13579072058200836, "learning_rate": 4.804227720393947e-06, "loss": 0.6168, "step": 400 }, { "epoch": 0.014596172559195588, "grad_norm": 0.13865922391414642, "learning_rate": 4.8642805668988715e-06, "loss": 0.6619, "step": 405 }, { "epoch": 0.014776372220420225, "grad_norm": 0.13145041465759277, "learning_rate": 4.924333413403795e-06, "loss": 0.6132, "step": 410 }, { "epoch": 0.014956571881644863, "grad_norm": 0.13919544219970703, "learning_rate": 4.98438625990872e-06, "loss": 0.6158, "step": 415 }, { "epoch": 0.0151367715428695, "grad_norm": 0.14985574781894684, "learning_rate": 5.044439106413644e-06, "loss": 0.6162, "step": 420 }, { "epoch": 0.015316971204094136, "grad_norm": 0.16457071900367737, "learning_rate": 
5.104491952918569e-06, "loss": 0.6272, "step": 425 }, { "epoch": 0.015497170865318773, "grad_norm": 0.09835664927959442, "learning_rate": 5.164544799423493e-06, "loss": 0.5721, "step": 430 }, { "epoch": 0.01567737052654341, "grad_norm": 0.12352655827999115, "learning_rate": 5.224597645928417e-06, "loss": 0.5868, "step": 435 }, { "epoch": 0.015857570187768046, "grad_norm": 0.11715588718652725, "learning_rate": 5.284650492433341e-06, "loss": 0.5729, "step": 440 }, { "epoch": 0.016037769848992686, "grad_norm": 0.14185546338558197, "learning_rate": 5.344703338938266e-06, "loss": 0.6224, "step": 445 }, { "epoch": 0.016217969510217322, "grad_norm": 0.10686635971069336, "learning_rate": 5.404756185443191e-06, "loss": 0.5762, "step": 450 }, { "epoch": 0.01639816917144196, "grad_norm": 0.1367388218641281, "learning_rate": 5.4648090319481145e-06, "loss": 0.6108, "step": 455 }, { "epoch": 0.016578368832666595, "grad_norm": 0.1376735419034958, "learning_rate": 5.524861878453038e-06, "loss": 0.6249, "step": 460 }, { "epoch": 0.01675856849389123, "grad_norm": 0.1213606670498848, "learning_rate": 5.584914724957963e-06, "loss": 0.5733, "step": 465 }, { "epoch": 0.016938768155115868, "grad_norm": 0.14439453184604645, "learning_rate": 5.644967571462888e-06, "loss": 0.636, "step": 470 }, { "epoch": 0.017118967816340504, "grad_norm": 0.1540927290916443, "learning_rate": 5.7050204179678125e-06, "loss": 0.5924, "step": 475 }, { "epoch": 0.01729916747756514, "grad_norm": 0.13569463789463043, "learning_rate": 5.765073264472736e-06, "loss": 0.5908, "step": 480 }, { "epoch": 0.017479367138789777, "grad_norm": 0.13307730853557587, "learning_rate": 5.82512611097766e-06, "loss": 0.5795, "step": 485 }, { "epoch": 0.017659566800014417, "grad_norm": 0.17746399343013763, "learning_rate": 5.885178957482585e-06, "loss": 0.5938, "step": 490 }, { "epoch": 0.017839766461239054, "grad_norm": 0.11576178669929504, "learning_rate": 5.94523180398751e-06, "loss": 0.5914, "step": 495 }, { "epoch": 
0.01801996612246369, "grad_norm": 0.12623916566371918, "learning_rate": 6.0052846504924335e-06, "loss": 0.5679, "step": 500 }, { "epoch": 0.01801996612246369, "eval_loss": 0.6039146184921265, "eval_runtime": 3.5675, "eval_samples_per_second": 28.031, "eval_steps_per_second": 7.008, "step": 500 }, { "epoch": 0.018200165783688327, "grad_norm": 0.13079528510570526, "learning_rate": 6.065337496997358e-06, "loss": 0.5946, "step": 505 }, { "epoch": 0.018380365444912963, "grad_norm": 0.1083323135972023, "learning_rate": 6.125390343502282e-06, "loss": 0.5892, "step": 510 }, { "epoch": 0.0185605651061376, "grad_norm": 0.16727234423160553, "learning_rate": 6.185443190007207e-06, "loss": 0.5907, "step": 515 }, { "epoch": 0.018740764767362236, "grad_norm": 0.12499808520078659, "learning_rate": 6.245496036512131e-06, "loss": 0.6076, "step": 520 }, { "epoch": 0.018920964428586873, "grad_norm": 0.16318917274475098, "learning_rate": 6.3055488830170554e-06, "loss": 0.5995, "step": 525 }, { "epoch": 0.019101164089811513, "grad_norm": 0.12780463695526123, "learning_rate": 6.365601729521979e-06, "loss": 0.5618, "step": 530 }, { "epoch": 0.01928136375103615, "grad_norm": 0.11932645738124847, "learning_rate": 6.425654576026904e-06, "loss": 0.5856, "step": 535 }, { "epoch": 0.019461563412260786, "grad_norm": 0.1256912350654602, "learning_rate": 6.485707422531828e-06, "loss": 0.5552, "step": 540 }, { "epoch": 0.019641763073485422, "grad_norm": 0.09648074209690094, "learning_rate": 6.545760269036752e-06, "loss": 0.5326, "step": 545 }, { "epoch": 0.01982196273471006, "grad_norm": 0.14793181419372559, "learning_rate": 6.605813115541677e-06, "loss": 0.5664, "step": 550 }, { "epoch": 0.020002162395934695, "grad_norm": 0.12733140587806702, "learning_rate": 6.665865962046601e-06, "loss": 0.571, "step": 555 }, { "epoch": 0.02018236205715933, "grad_norm": 0.12296765297651291, "learning_rate": 6.725918808551526e-06, "loss": 0.5242, "step": 560 }, { "epoch": 0.020362561718383968, "grad_norm": 
0.13922452926635742, "learning_rate": 6.78597165505645e-06, "loss": 0.5367, "step": 565 }, { "epoch": 0.020542761379608605, "grad_norm": 0.14491942524909973, "learning_rate": 6.846024501561374e-06, "loss": 0.5708, "step": 570 }, { "epoch": 0.020722961040833245, "grad_norm": 0.1201525330543518, "learning_rate": 6.906077348066299e-06, "loss": 0.6137, "step": 575 }, { "epoch": 0.02090316070205788, "grad_norm": 0.11606668680906296, "learning_rate": 6.966130194571223e-06, "loss": 0.5537, "step": 580 }, { "epoch": 0.021083360363282518, "grad_norm": 0.14227990806102753, "learning_rate": 7.026183041076147e-06, "loss": 0.5886, "step": 585 }, { "epoch": 0.021263560024507154, "grad_norm": 0.13029618561267853, "learning_rate": 7.086235887581072e-06, "loss": 0.5999, "step": 590 }, { "epoch": 0.02144375968573179, "grad_norm": 0.11412752419710159, "learning_rate": 7.1462887340859955e-06, "loss": 0.5678, "step": 595 }, { "epoch": 0.021623959346956427, "grad_norm": 0.12800736725330353, "learning_rate": 7.206341580590921e-06, "loss": 0.6086, "step": 600 }, { "epoch": 0.021804159008181064, "grad_norm": 0.14154621958732605, "learning_rate": 7.266394427095845e-06, "loss": 0.539, "step": 605 }, { "epoch": 0.0219843586694057, "grad_norm": 0.10308407992124557, "learning_rate": 7.326447273600769e-06, "loss": 0.5246, "step": 610 }, { "epoch": 0.02216455833063034, "grad_norm": 0.15192610025405884, "learning_rate": 7.3865001201056936e-06, "loss": 0.5498, "step": 615 }, { "epoch": 0.022344757991854976, "grad_norm": 0.15246666967868805, "learning_rate": 7.446552966610617e-06, "loss": 0.5671, "step": 620 }, { "epoch": 0.022524957653079613, "grad_norm": 0.14070971310138702, "learning_rate": 7.506605813115541e-06, "loss": 0.5181, "step": 625 }, { "epoch": 0.02270515731430425, "grad_norm": 0.1048644632101059, "learning_rate": 7.566658659620467e-06, "loss": 0.5661, "step": 630 }, { "epoch": 0.022885356975528886, "grad_norm": 0.12176632136106491, "learning_rate": 7.626711506125391e-06, "loss": 
0.6039, "step": 635 }, { "epoch": 0.023065556636753522, "grad_norm": 0.11277682334184647, "learning_rate": 7.686764352630316e-06, "loss": 0.5394, "step": 640 }, { "epoch": 0.02324575629797816, "grad_norm": 0.12266416847705841, "learning_rate": 7.74681719913524e-06, "loss": 0.5814, "step": 645 }, { "epoch": 0.023425955959202795, "grad_norm": 0.1094772145152092, "learning_rate": 7.806870045640164e-06, "loss": 0.5854, "step": 650 }, { "epoch": 0.023606155620427435, "grad_norm": 0.16953280568122864, "learning_rate": 7.866922892145088e-06, "loss": 0.5464, "step": 655 }, { "epoch": 0.023786355281652072, "grad_norm": 0.16109640896320343, "learning_rate": 7.926975738650012e-06, "loss": 0.5889, "step": 660 }, { "epoch": 0.02396655494287671, "grad_norm": 0.1488220989704132, "learning_rate": 7.987028585154936e-06, "loss": 0.57, "step": 665 }, { "epoch": 0.024146754604101345, "grad_norm": 0.15228040516376495, "learning_rate": 8.047081431659861e-06, "loss": 0.5437, "step": 670 }, { "epoch": 0.02432695426532598, "grad_norm": 0.11605563014745712, "learning_rate": 8.107134278164785e-06, "loss": 0.5315, "step": 675 }, { "epoch": 0.024507153926550618, "grad_norm": 0.1317128986120224, "learning_rate": 8.16718712466971e-06, "loss": 0.5731, "step": 680 }, { "epoch": 0.024687353587775254, "grad_norm": 0.14042159914970398, "learning_rate": 8.227239971174635e-06, "loss": 0.5542, "step": 685 }, { "epoch": 0.02486755324899989, "grad_norm": 0.16029533743858337, "learning_rate": 8.287292817679558e-06, "loss": 0.5782, "step": 690 }, { "epoch": 0.025047752910224527, "grad_norm": 0.1571711301803589, "learning_rate": 8.347345664184484e-06, "loss": 0.5533, "step": 695 }, { "epoch": 0.025227952571449167, "grad_norm": 0.13758143782615662, "learning_rate": 8.407398510689408e-06, "loss": 0.5316, "step": 700 }, { "epoch": 0.025408152232673804, "grad_norm": 0.11138833314180374, "learning_rate": 8.467451357194332e-06, "loss": 0.5075, "step": 705 }, { "epoch": 0.02558835189389844, "grad_norm": 
0.14913310110569, "learning_rate": 8.527504203699256e-06, "loss": 0.5363, "step": 710 }, { "epoch": 0.025768551555123077, "grad_norm": 0.11564284563064575, "learning_rate": 8.58755705020418e-06, "loss": 0.5616, "step": 715 }, { "epoch": 0.025948751216347713, "grad_norm": 0.15782320499420166, "learning_rate": 8.647609896709103e-06, "loss": 0.5368, "step": 720 }, { "epoch": 0.02612895087757235, "grad_norm": 0.16397272050380707, "learning_rate": 8.707662743214029e-06, "loss": 0.6325, "step": 725 }, { "epoch": 0.026309150538796986, "grad_norm": 0.16753605008125305, "learning_rate": 8.767715589718953e-06, "loss": 0.5532, "step": 730 }, { "epoch": 0.026489350200021623, "grad_norm": 0.14978809654712677, "learning_rate": 8.827768436223878e-06, "loss": 0.5431, "step": 735 }, { "epoch": 0.026669549861246263, "grad_norm": 0.15160630643367767, "learning_rate": 8.887821282728802e-06, "loss": 0.576, "step": 740 }, { "epoch": 0.0268497495224709, "grad_norm": 0.1296982318162918, "learning_rate": 8.947874129233726e-06, "loss": 0.5488, "step": 745 }, { "epoch": 0.027029949183695536, "grad_norm": 0.19447606801986694, "learning_rate": 9.007926975738652e-06, "loss": 0.5878, "step": 750 }, { "epoch": 0.027210148844920172, "grad_norm": 0.13086961209774017, "learning_rate": 9.067979822243575e-06, "loss": 0.5713, "step": 755 }, { "epoch": 0.02739034850614481, "grad_norm": 0.1761922836303711, "learning_rate": 9.1280326687485e-06, "loss": 0.5605, "step": 760 }, { "epoch": 0.027570548167369445, "grad_norm": 0.13089734315872192, "learning_rate": 9.188085515253423e-06, "loss": 0.5187, "step": 765 }, { "epoch": 0.02775074782859408, "grad_norm": 0.14569701254367828, "learning_rate": 9.248138361758347e-06, "loss": 0.5301, "step": 770 }, { "epoch": 0.027930947489818718, "grad_norm": 0.16195152699947357, "learning_rate": 9.308191208263273e-06, "loss": 0.5554, "step": 775 }, { "epoch": 0.028111147151043354, "grad_norm": 0.13685005903244019, "learning_rate": 9.368244054768196e-06, "loss": 0.5719, 
"step": 780 }, { "epoch": 0.028291346812267994, "grad_norm": 0.1750328093767166, "learning_rate": 9.42829690127312e-06, "loss": 0.5635, "step": 785 }, { "epoch": 0.02847154647349263, "grad_norm": 0.16423344612121582, "learning_rate": 9.488349747778046e-06, "loss": 0.6248, "step": 790 }, { "epoch": 0.028651746134717267, "grad_norm": 0.15460895001888275, "learning_rate": 9.54840259428297e-06, "loss": 0.5727, "step": 795 }, { "epoch": 0.028831945795941904, "grad_norm": 0.15125848352909088, "learning_rate": 9.608455440787894e-06, "loss": 0.5535, "step": 800 }, { "epoch": 0.02901214545716654, "grad_norm": 0.18755701184272766, "learning_rate": 9.66850828729282e-06, "loss": 0.5292, "step": 805 }, { "epoch": 0.029192345118391177, "grad_norm": 0.14782796800136566, "learning_rate": 9.728561133797743e-06, "loss": 0.5155, "step": 810 }, { "epoch": 0.029372544779615813, "grad_norm": 0.1731768250465393, "learning_rate": 9.788613980302667e-06, "loss": 0.5752, "step": 815 }, { "epoch": 0.02955274444084045, "grad_norm": 0.17426101863384247, "learning_rate": 9.84866682680759e-06, "loss": 0.5796, "step": 820 }, { "epoch": 0.02973294410206509, "grad_norm": 0.1710687130689621, "learning_rate": 9.908719673312515e-06, "loss": 0.5363, "step": 825 }, { "epoch": 0.029913143763289726, "grad_norm": 0.12665097415447235, "learning_rate": 9.96877251981744e-06, "loss": 0.5302, "step": 830 }, { "epoch": 0.030093343424514363, "grad_norm": 0.20195025205612183, "learning_rate": 1.0028825366322364e-05, "loss": 0.6092, "step": 835 }, { "epoch": 0.030273543085739, "grad_norm": 0.18237920105457306, "learning_rate": 1.0088878212827288e-05, "loss": 0.5281, "step": 840 }, { "epoch": 0.030453742746963636, "grad_norm": 0.17698000371456146, "learning_rate": 1.0148931059332214e-05, "loss": 0.5705, "step": 845 }, { "epoch": 0.030633942408188272, "grad_norm": 0.16873596608638763, "learning_rate": 1.0208983905837137e-05, "loss": 0.5534, "step": 850 }, { "epoch": 0.03081414206941291, "grad_norm": 
0.18587124347686768, "learning_rate": 1.0269036752342063e-05, "loss": 0.5482, "step": 855 }, { "epoch": 0.030994341730637545, "grad_norm": 0.15320129692554474, "learning_rate": 1.0329089598846985e-05, "loss": 0.584, "step": 860 }, { "epoch": 0.03117454139186218, "grad_norm": 0.15630275011062622, "learning_rate": 1.0389142445351909e-05, "loss": 0.5771, "step": 865 }, { "epoch": 0.03135474105308682, "grad_norm": 0.20800472795963287, "learning_rate": 1.0449195291856835e-05, "loss": 0.5642, "step": 870 }, { "epoch": 0.031534940714311455, "grad_norm": 0.12712302803993225, "learning_rate": 1.0509248138361758e-05, "loss": 0.5427, "step": 875 }, { "epoch": 0.03171514037553609, "grad_norm": 0.1484389752149582, "learning_rate": 1.0569300984866682e-05, "loss": 0.5385, "step": 880 }, { "epoch": 0.03189534003676073, "grad_norm": 0.12968121469020844, "learning_rate": 1.0629353831371608e-05, "loss": 0.5315, "step": 885 }, { "epoch": 0.03207553969798537, "grad_norm": 0.14982539415359497, "learning_rate": 1.0689406677876532e-05, "loss": 0.5571, "step": 890 }, { "epoch": 0.03225573935921001, "grad_norm": 0.16306400299072266, "learning_rate": 1.0749459524381456e-05, "loss": 0.5379, "step": 895 }, { "epoch": 0.032435939020434644, "grad_norm": 0.19171880185604095, "learning_rate": 1.0809512370886381e-05, "loss": 0.5652, "step": 900 }, { "epoch": 0.03261613868165928, "grad_norm": 0.1954376995563507, "learning_rate": 1.0869565217391305e-05, "loss": 0.5635, "step": 905 }, { "epoch": 0.03279633834288392, "grad_norm": 0.1810484379529953, "learning_rate": 1.0929618063896229e-05, "loss": 0.5046, "step": 910 }, { "epoch": 0.03297653800410855, "grad_norm": 0.18302740156650543, "learning_rate": 1.0989670910401153e-05, "loss": 0.5501, "step": 915 }, { "epoch": 0.03315673766533319, "grad_norm": 0.1687111258506775, "learning_rate": 1.1049723756906077e-05, "loss": 0.5069, "step": 920 }, { "epoch": 0.033336937326557826, "grad_norm": 0.16762186586856842, "learning_rate": 1.1109776603411002e-05, 
"loss": 0.5167, "step": 925 }, { "epoch": 0.03351713698778246, "grad_norm": 0.22583533823490143, "learning_rate": 1.1169829449915926e-05, "loss": 0.5251, "step": 930 }, { "epoch": 0.0336973366490071, "grad_norm": 0.19438394904136658, "learning_rate": 1.122988229642085e-05, "loss": 0.6164, "step": 935 }, { "epoch": 0.033877536310231736, "grad_norm": 0.1634751260280609, "learning_rate": 1.1289935142925776e-05, "loss": 0.5606, "step": 940 }, { "epoch": 0.03405773597145637, "grad_norm": 0.1744074523448944, "learning_rate": 1.13499879894307e-05, "loss": 0.5853, "step": 945 }, { "epoch": 0.03423793563268101, "grad_norm": 0.18406233191490173, "learning_rate": 1.1410040835935625e-05, "loss": 0.5132, "step": 950 }, { "epoch": 0.034418135293905645, "grad_norm": 0.14555774629116058, "learning_rate": 1.1470093682440549e-05, "loss": 0.546, "step": 955 }, { "epoch": 0.03459833495513028, "grad_norm": 0.14365147054195404, "learning_rate": 1.1530146528945473e-05, "loss": 0.5515, "step": 960 }, { "epoch": 0.03477853461635492, "grad_norm": 0.19130977988243103, "learning_rate": 1.1590199375450397e-05, "loss": 0.5576, "step": 965 }, { "epoch": 0.034958734277579555, "grad_norm": 0.1700170785188675, "learning_rate": 1.165025222195532e-05, "loss": 0.5344, "step": 970 }, { "epoch": 0.0351389339388042, "grad_norm": 0.14722155034542084, "learning_rate": 1.1710305068460244e-05, "loss": 0.5644, "step": 975 }, { "epoch": 0.035319133600028835, "grad_norm": 0.1315588653087616, "learning_rate": 1.177035791496517e-05, "loss": 0.5136, "step": 980 }, { "epoch": 0.03549933326125347, "grad_norm": 0.22789514064788818, "learning_rate": 1.1830410761470094e-05, "loss": 0.5709, "step": 985 }, { "epoch": 0.03567953292247811, "grad_norm": 0.19900259375572205, "learning_rate": 1.189046360797502e-05, "loss": 0.5815, "step": 990 }, { "epoch": 0.035859732583702744, "grad_norm": 0.20573367178440094, "learning_rate": 1.1950516454479943e-05, "loss": 0.5188, "step": 995 }, { "epoch": 0.03603993224492738, "grad_norm": 
0.17396429181098938, "learning_rate": 1.2010569300984867e-05, "loss": 0.5032, "step": 1000 }, { "epoch": 0.03603993224492738, "eval_loss": 0.5565588474273682, "eval_runtime": 3.5193, "eval_samples_per_second": 28.415, "eval_steps_per_second": 7.104, "step": 1000 }, { "epoch": 0.03622013190615202, "grad_norm": 0.1656351536512375, "learning_rate": 1.2070622147489793e-05, "loss": 0.5045, "step": 1005 }, { "epoch": 0.036400331567376654, "grad_norm": 0.15851475298404694, "learning_rate": 1.2130674993994717e-05, "loss": 0.5142, "step": 1010 }, { "epoch": 0.03658053122860129, "grad_norm": 0.18795153498649597, "learning_rate": 1.219072784049964e-05, "loss": 0.5408, "step": 1015 }, { "epoch": 0.03676073088982593, "grad_norm": 0.18724803626537323, "learning_rate": 1.2250780687004564e-05, "loss": 0.5132, "step": 1020 }, { "epoch": 0.03694093055105056, "grad_norm": 0.17133454978466034, "learning_rate": 1.2310833533509488e-05, "loss": 0.5231, "step": 1025 }, { "epoch": 0.0371211302122752, "grad_norm": 0.19401347637176514, "learning_rate": 1.2370886380014414e-05, "loss": 0.5672, "step": 1030 }, { "epoch": 0.037301329873499836, "grad_norm": 0.23305334150791168, "learning_rate": 1.2430939226519338e-05, "loss": 0.5496, "step": 1035 }, { "epoch": 0.03748152953472447, "grad_norm": 0.1804538369178772, "learning_rate": 1.2490992073024261e-05, "loss": 0.5241, "step": 1040 }, { "epoch": 0.03766172919594911, "grad_norm": 0.16487692296504974, "learning_rate": 1.2551044919529187e-05, "loss": 0.537, "step": 1045 }, { "epoch": 0.037841928857173746, "grad_norm": 0.17555435001850128, "learning_rate": 1.2611097766034111e-05, "loss": 0.5381, "step": 1050 }, { "epoch": 0.03802212851839838, "grad_norm": 0.19943825900554657, "learning_rate": 1.2671150612539035e-05, "loss": 0.5353, "step": 1055 }, { "epoch": 0.038202328179623025, "grad_norm": 0.19467459619045258, "learning_rate": 1.2731203459043959e-05, "loss": 0.5634, "step": 1060 }, { "epoch": 0.03838252784084766, "grad_norm": 0.1945466697216034, 
"learning_rate": 1.2791256305548884e-05, "loss": 0.5709, "step": 1065 }, { "epoch": 0.0385627275020723, "grad_norm": 0.17574776709079742, "learning_rate": 1.2851309152053808e-05, "loss": 0.496, "step": 1070 }, { "epoch": 0.038742927163296935, "grad_norm": 0.19129513204097748, "learning_rate": 1.2911361998558732e-05, "loss": 0.4818, "step": 1075 }, { "epoch": 0.03892312682452157, "grad_norm": 0.16488486528396606, "learning_rate": 1.2971414845063656e-05, "loss": 0.5433, "step": 1080 }, { "epoch": 0.03910332648574621, "grad_norm": 0.1613866090774536, "learning_rate": 1.303146769156858e-05, "loss": 0.5058, "step": 1085 }, { "epoch": 0.039283526146970844, "grad_norm": 0.21093931794166565, "learning_rate": 1.3091520538073504e-05, "loss": 0.5199, "step": 1090 }, { "epoch": 0.03946372580819548, "grad_norm": 0.1792885661125183, "learning_rate": 1.315157338457843e-05, "loss": 0.5298, "step": 1095 }, { "epoch": 0.03964392546942012, "grad_norm": 0.20177467167377472, "learning_rate": 1.3211626231083355e-05, "loss": 0.5183, "step": 1100 }, { "epoch": 0.039824125130644754, "grad_norm": 0.1867968887090683, "learning_rate": 1.3271679077588279e-05, "loss": 0.5268, "step": 1105 }, { "epoch": 0.04000432479186939, "grad_norm": 0.17943917214870453, "learning_rate": 1.3331731924093202e-05, "loss": 0.555, "step": 1110 }, { "epoch": 0.04018452445309403, "grad_norm": 0.24387885630130768, "learning_rate": 1.3391784770598126e-05, "loss": 0.5367, "step": 1115 }, { "epoch": 0.04036472411431866, "grad_norm": 0.27169400453567505, "learning_rate": 1.3451837617103052e-05, "loss": 0.5293, "step": 1120 }, { "epoch": 0.0405449237755433, "grad_norm": 0.22766202688217163, "learning_rate": 1.3511890463607976e-05, "loss": 0.5438, "step": 1125 }, { "epoch": 0.040725123436767936, "grad_norm": 0.17502576112747192, "learning_rate": 1.35719433101129e-05, "loss": 0.482, "step": 1130 }, { "epoch": 0.04090532309799257, "grad_norm": 0.1690770983695984, "learning_rate": 1.3631996156617823e-05, "loss": 0.5331, 
"step": 1135 }, { "epoch": 0.04108552275921721, "grad_norm": 0.18362893164157867, "learning_rate": 1.3692049003122747e-05, "loss": 0.4705, "step": 1140 }, { "epoch": 0.04126572242044185, "grad_norm": 0.2055690884590149, "learning_rate": 1.3752101849627675e-05, "loss": 0.4958, "step": 1145 }, { "epoch": 0.04144592208166649, "grad_norm": 0.20133014023303986, "learning_rate": 1.3812154696132598e-05, "loss": 0.5269, "step": 1150 }, { "epoch": 0.041626121742891126, "grad_norm": 0.18268916010856628, "learning_rate": 1.3872207542637522e-05, "loss": 0.5104, "step": 1155 }, { "epoch": 0.04180632140411576, "grad_norm": 0.22504407167434692, "learning_rate": 1.3932260389142446e-05, "loss": 0.5622, "step": 1160 }, { "epoch": 0.0419865210653404, "grad_norm": 0.2050849348306656, "learning_rate": 1.399231323564737e-05, "loss": 0.5176, "step": 1165 }, { "epoch": 0.042166720726565035, "grad_norm": 0.1802973598241806, "learning_rate": 1.4052366082152294e-05, "loss": 0.5322, "step": 1170 }, { "epoch": 0.04234692038778967, "grad_norm": 0.19005241990089417, "learning_rate": 1.411241892865722e-05, "loss": 0.5582, "step": 1175 }, { "epoch": 0.04252712004901431, "grad_norm": 0.2026328444480896, "learning_rate": 1.4172471775162143e-05, "loss": 0.5726, "step": 1180 }, { "epoch": 0.042707319710238945, "grad_norm": 0.22983752191066742, "learning_rate": 1.4232524621667067e-05, "loss": 0.5469, "step": 1185 }, { "epoch": 0.04288751937146358, "grad_norm": 0.15863169729709625, "learning_rate": 1.4292577468171991e-05, "loss": 0.5368, "step": 1190 }, { "epoch": 0.04306771903268822, "grad_norm": 0.19948942959308624, "learning_rate": 1.4352630314676915e-05, "loss": 0.518, "step": 1195 }, { "epoch": 0.043247918693912854, "grad_norm": 0.20229573547840118, "learning_rate": 1.4412683161181842e-05, "loss": 0.5379, "step": 1200 }, { "epoch": 0.04342811835513749, "grad_norm": 0.22024467587471008, "learning_rate": 1.4472736007686766e-05, "loss": 0.5648, "step": 1205 }, { "epoch": 0.04360831801636213, 
"grad_norm": 0.22408129274845123, "learning_rate": 1.453278885419169e-05, "loss": 0.5228, "step": 1210 }, { "epoch": 0.043788517677586763, "grad_norm": 0.21704581379890442, "learning_rate": 1.4592841700696614e-05, "loss": 0.5289, "step": 1215 }, { "epoch": 0.0439687173388114, "grad_norm": 0.1930111199617386, "learning_rate": 1.4652894547201538e-05, "loss": 0.5176, "step": 1220 }, { "epoch": 0.044148917000036036, "grad_norm": 0.23059917986392975, "learning_rate": 1.4712947393706462e-05, "loss": 0.5323, "step": 1225 }, { "epoch": 0.04432911666126068, "grad_norm": 0.21172671020030975, "learning_rate": 1.4773000240211387e-05, "loss": 0.544, "step": 1230 }, { "epoch": 0.044509316322485316, "grad_norm": 0.20656728744506836, "learning_rate": 1.4833053086716311e-05, "loss": 0.5019, "step": 1235 }, { "epoch": 0.04468951598370995, "grad_norm": 0.19450949132442474, "learning_rate": 1.4893105933221235e-05, "loss": 0.5262, "step": 1240 }, { "epoch": 0.04486971564493459, "grad_norm": 0.18495753407478333, "learning_rate": 1.4953158779726159e-05, "loss": 0.4986, "step": 1245 }, { "epoch": 0.045049915306159226, "grad_norm": 0.20766963064670563, "learning_rate": 1.5013211626231083e-05, "loss": 0.5081, "step": 1250 }, { "epoch": 0.04523011496738386, "grad_norm": 0.18297769129276276, "learning_rate": 1.507326447273601e-05, "loss": 0.5523, "step": 1255 }, { "epoch": 0.0454103146286085, "grad_norm": 0.18521635234355927, "learning_rate": 1.5133317319240934e-05, "loss": 0.5279, "step": 1260 }, { "epoch": 0.045590514289833135, "grad_norm": 0.22829121351242065, "learning_rate": 1.5193370165745858e-05, "loss": 0.5055, "step": 1265 }, { "epoch": 0.04577071395105777, "grad_norm": 0.21849414706230164, "learning_rate": 1.5253423012250781e-05, "loss": 0.539, "step": 1270 }, { "epoch": 0.04595091361228241, "grad_norm": 0.21421493589878082, "learning_rate": 1.5313475858755704e-05, "loss": 0.5275, "step": 1275 }, { "epoch": 0.046131113273507045, "grad_norm": 0.23160327970981598, "learning_rate": 
1.5373528705260633e-05, "loss": 0.5409, "step": 1280 }, { "epoch": 0.04631131293473168, "grad_norm": 0.16949445009231567, "learning_rate": 1.5433581551765555e-05, "loss": 0.5284, "step": 1285 }, { "epoch": 0.04649151259595632, "grad_norm": 0.18024949729442596, "learning_rate": 1.549363439827048e-05, "loss": 0.5375, "step": 1290 }, { "epoch": 0.046671712257180954, "grad_norm": 0.22767312824726105, "learning_rate": 1.5553687244775402e-05, "loss": 0.5214, "step": 1295 }, { "epoch": 0.04685191191840559, "grad_norm": 0.21435782313346863, "learning_rate": 1.5613740091280328e-05, "loss": 0.4854, "step": 1300 }, { "epoch": 0.04703211157963023, "grad_norm": 0.22171509265899658, "learning_rate": 1.567379293778525e-05, "loss": 0.5017, "step": 1305 }, { "epoch": 0.04721231124085487, "grad_norm": 0.23257885873317719, "learning_rate": 1.5733845784290176e-05, "loss": 0.5319, "step": 1310 }, { "epoch": 0.04739251090207951, "grad_norm": 0.24968785047531128, "learning_rate": 1.57938986307951e-05, "loss": 0.5511, "step": 1315 }, { "epoch": 0.047572710563304144, "grad_norm": 0.19016103446483612, "learning_rate": 1.5853951477300024e-05, "loss": 0.5075, "step": 1320 }, { "epoch": 0.04775291022452878, "grad_norm": 0.19356337189674377, "learning_rate": 1.591400432380495e-05, "loss": 0.5068, "step": 1325 }, { "epoch": 0.04793310988575342, "grad_norm": 0.28713881969451904, "learning_rate": 1.597405717030987e-05, "loss": 0.5403, "step": 1330 }, { "epoch": 0.04811330954697805, "grad_norm": 0.18049970269203186, "learning_rate": 1.60341100168148e-05, "loss": 0.5184, "step": 1335 }, { "epoch": 0.04829350920820269, "grad_norm": 0.22419008612632751, "learning_rate": 1.6094162863319722e-05, "loss": 0.5497, "step": 1340 }, { "epoch": 0.048473708869427326, "grad_norm": 0.24275891482830048, "learning_rate": 1.6154215709824648e-05, "loss": 0.5375, "step": 1345 }, { "epoch": 0.04865390853065196, "grad_norm": 0.21218125522136688, "learning_rate": 1.621426855632957e-05, "loss": 0.5069, "step": 1350 }, { 
"epoch": 0.0488341081918766, "grad_norm": 0.17719610035419464, "learning_rate": 1.6274321402834496e-05, "loss": 0.5194, "step": 1355 }, { "epoch": 0.049014307853101236, "grad_norm": 0.27654048800468445, "learning_rate": 1.633437424933942e-05, "loss": 0.5144, "step": 1360 }, { "epoch": 0.04919450751432587, "grad_norm": 0.20031145215034485, "learning_rate": 1.6394427095844343e-05, "loss": 0.5053, "step": 1365 }, { "epoch": 0.04937470717555051, "grad_norm": 0.21770700812339783, "learning_rate": 1.645447994234927e-05, "loss": 0.5087, "step": 1370 }, { "epoch": 0.049554906836775145, "grad_norm": 0.26108381152153015, "learning_rate": 1.651453278885419e-05, "loss": 0.5614, "step": 1375 }, { "epoch": 0.04973510649799978, "grad_norm": 0.21475346386432648, "learning_rate": 1.6574585635359117e-05, "loss": 0.533, "step": 1380 }, { "epoch": 0.04991530615922442, "grad_norm": 0.2189481258392334, "learning_rate": 1.663463848186404e-05, "loss": 0.5106, "step": 1385 }, { "epoch": 0.050095505820449054, "grad_norm": 0.22523830831050873, "learning_rate": 1.6694691328368968e-05, "loss": 0.5047, "step": 1390 }, { "epoch": 0.0502757054816737, "grad_norm": 0.2538076639175415, "learning_rate": 1.675474417487389e-05, "loss": 0.4805, "step": 1395 }, { "epoch": 0.050455905142898334, "grad_norm": 0.23015667498111725, "learning_rate": 1.6814797021378816e-05, "loss": 0.5236, "step": 1400 }, { "epoch": 0.05063610480412297, "grad_norm": 0.26876676082611084, "learning_rate": 1.6874849867883738e-05, "loss": 0.541, "step": 1405 }, { "epoch": 0.05081630446534761, "grad_norm": 0.25886017084121704, "learning_rate": 1.6934902714388663e-05, "loss": 0.5363, "step": 1410 }, { "epoch": 0.050996504126572244, "grad_norm": 0.22038790583610535, "learning_rate": 1.699495556089359e-05, "loss": 0.4836, "step": 1415 }, { "epoch": 0.05117670378779688, "grad_norm": 0.22495049238204956, "learning_rate": 1.705500840739851e-05, "loss": 0.5435, "step": 1420 }, { "epoch": 0.05135690344902152, "grad_norm": 
0.22736889123916626, "learning_rate": 1.7115061253903437e-05, "loss": 0.4876, "step": 1425 }, { "epoch": 0.05153710311024615, "grad_norm": 0.21810127794742584, "learning_rate": 1.717511410040836e-05, "loss": 0.5353, "step": 1430 }, { "epoch": 0.05171730277147079, "grad_norm": 0.19743861258029938, "learning_rate": 1.7235166946913284e-05, "loss": 0.5042, "step": 1435 }, { "epoch": 0.051897502432695426, "grad_norm": 0.22972045838832855, "learning_rate": 1.7295219793418207e-05, "loss": 0.4953, "step": 1440 }, { "epoch": 0.05207770209392006, "grad_norm": 0.19471551477909088, "learning_rate": 1.7355272639923136e-05, "loss": 0.5055, "step": 1445 }, { "epoch": 0.0522579017551447, "grad_norm": 0.29399409890174866, "learning_rate": 1.7415325486428058e-05, "loss": 0.547, "step": 1450 }, { "epoch": 0.052438101416369336, "grad_norm": 0.24069805443286896, "learning_rate": 1.7475378332932983e-05, "loss": 0.5267, "step": 1455 }, { "epoch": 0.05261830107759397, "grad_norm": 0.23223073780536652, "learning_rate": 1.7535431179437905e-05, "loss": 0.4945, "step": 1460 }, { "epoch": 0.05279850073881861, "grad_norm": 0.19976119697093964, "learning_rate": 1.759548402594283e-05, "loss": 0.5135, "step": 1465 }, { "epoch": 0.052978700400043245, "grad_norm": 0.2490244060754776, "learning_rate": 1.7655536872447757e-05, "loss": 0.5031, "step": 1470 }, { "epoch": 0.05315890006126788, "grad_norm": 0.23295678198337555, "learning_rate": 1.771558971895268e-05, "loss": 0.5652, "step": 1475 }, { "epoch": 0.053339099722492525, "grad_norm": 0.22554819285869598, "learning_rate": 1.7775642565457604e-05, "loss": 0.5279, "step": 1480 }, { "epoch": 0.05351929938371716, "grad_norm": 0.2634689509868622, "learning_rate": 1.7835695411962526e-05, "loss": 0.5486, "step": 1485 }, { "epoch": 0.0536994990449418, "grad_norm": 0.1896762251853943, "learning_rate": 1.7895748258467452e-05, "loss": 0.5419, "step": 1490 }, { "epoch": 0.053879698706166435, "grad_norm": 0.21280480921268463, "learning_rate": 
1.7955801104972378e-05, "loss": 0.5541, "step": 1495 }, { "epoch": 0.05405989836739107, "grad_norm": 0.19312447309494019, "learning_rate": 1.8015853951477303e-05, "loss": 0.5315, "step": 1500 }, { "epoch": 0.05405989836739107, "eval_loss": 0.5359941720962524, "eval_runtime": 3.5176, "eval_samples_per_second": 28.429, "eval_steps_per_second": 7.107, "step": 1500 }, { "epoch": 0.05424009802861571, "grad_norm": 0.22356046736240387, "learning_rate": 1.8075906797982225e-05, "loss": 0.5439, "step": 1505 }, { "epoch": 0.054420297689840344, "grad_norm": 0.24172358214855194, "learning_rate": 1.813595964448715e-05, "loss": 0.5436, "step": 1510 }, { "epoch": 0.05460049735106498, "grad_norm": 0.25496965646743774, "learning_rate": 1.8196012490992073e-05, "loss": 0.5427, "step": 1515 }, { "epoch": 0.05478069701228962, "grad_norm": 0.21974816918373108, "learning_rate": 1.8256065337497e-05, "loss": 0.5261, "step": 1520 }, { "epoch": 0.05496089667351425, "grad_norm": 0.2528958320617676, "learning_rate": 1.8316118184001924e-05, "loss": 0.5123, "step": 1525 }, { "epoch": 0.05514109633473889, "grad_norm": 0.2141297310590744, "learning_rate": 1.8376171030506846e-05, "loss": 0.4936, "step": 1530 }, { "epoch": 0.055321295995963526, "grad_norm": 0.2709108293056488, "learning_rate": 1.8436223877011772e-05, "loss": 0.5397, "step": 1535 }, { "epoch": 0.05550149565718816, "grad_norm": 0.2779485285282135, "learning_rate": 1.8496276723516694e-05, "loss": 0.5001, "step": 1540 }, { "epoch": 0.0556816953184128, "grad_norm": 0.22084900736808777, "learning_rate": 1.855632957002162e-05, "loss": 0.4713, "step": 1545 }, { "epoch": 0.055861894979637436, "grad_norm": 0.1823149174451828, "learning_rate": 1.8616382416526545e-05, "loss": 0.4974, "step": 1550 }, { "epoch": 0.05604209464086207, "grad_norm": 0.20994675159454346, "learning_rate": 1.867643526303147e-05, "loss": 0.5241, "step": 1555 }, { "epoch": 0.05622229430208671, "grad_norm": 0.24369290471076965, "learning_rate": 1.8736488109536393e-05, 
"loss": 0.5478, "step": 1560 }, { "epoch": 0.05640249396331135, "grad_norm": 0.259897381067276, "learning_rate": 1.879654095604132e-05, "loss": 0.4991, "step": 1565 }, { "epoch": 0.05658269362453599, "grad_norm": 0.25176161527633667, "learning_rate": 1.885659380254624e-05, "loss": 0.5066, "step": 1570 }, { "epoch": 0.056762893285760625, "grad_norm": 0.2606624364852905, "learning_rate": 1.8916646649051163e-05, "loss": 0.4857, "step": 1575 }, { "epoch": 0.05694309294698526, "grad_norm": 0.197222039103508, "learning_rate": 1.8976699495556092e-05, "loss": 0.5225, "step": 1580 }, { "epoch": 0.0571232926082099, "grad_norm": 0.24445274472236633, "learning_rate": 1.9036752342061014e-05, "loss": 0.4929, "step": 1585 }, { "epoch": 0.057303492269434535, "grad_norm": 0.21949337422847748, "learning_rate": 1.909680518856594e-05, "loss": 0.5149, "step": 1590 }, { "epoch": 0.05748369193065917, "grad_norm": 0.19175410270690918, "learning_rate": 1.9156858035070862e-05, "loss": 0.4939, "step": 1595 }, { "epoch": 0.05766389159188381, "grad_norm": 0.2463282346725464, "learning_rate": 1.9216910881575787e-05, "loss": 0.5435, "step": 1600 }, { "epoch": 0.057844091253108444, "grad_norm": 0.2281898409128189, "learning_rate": 1.9276963728080713e-05, "loss": 0.5538, "step": 1605 }, { "epoch": 0.05802429091433308, "grad_norm": 0.2389582097530365, "learning_rate": 1.933701657458564e-05, "loss": 0.5149, "step": 1610 }, { "epoch": 0.05820449057555772, "grad_norm": 0.23086562752723694, "learning_rate": 1.939706942109056e-05, "loss": 0.5109, "step": 1615 }, { "epoch": 0.058384690236782354, "grad_norm": 0.2583639621734619, "learning_rate": 1.9457122267595486e-05, "loss": 0.5174, "step": 1620 }, { "epoch": 0.05856488989800699, "grad_norm": 0.20627906918525696, "learning_rate": 1.951717511410041e-05, "loss": 0.5224, "step": 1625 }, { "epoch": 0.05874508955923163, "grad_norm": 0.30713996291160583, "learning_rate": 1.9577227960605334e-05, "loss": 0.5218, "step": 1630 }, { "epoch": 0.05892528922045626, 
"grad_norm": 0.26830917596817017, "learning_rate": 1.963728080711026e-05, "loss": 0.5101, "step": 1635 }, { "epoch": 0.0591054888816809, "grad_norm": 0.21486251056194305, "learning_rate": 1.969733365361518e-05, "loss": 0.4967, "step": 1640 }, { "epoch": 0.059285688542905536, "grad_norm": 0.26688727736473083, "learning_rate": 1.9757386500120107e-05, "loss": 0.5322, "step": 1645 }, { "epoch": 0.05946588820413018, "grad_norm": 0.23560921847820282, "learning_rate": 1.981743934662503e-05, "loss": 0.491, "step": 1650 }, { "epoch": 0.059646087865354816, "grad_norm": 0.27083316445350647, "learning_rate": 1.9877492193129955e-05, "loss": 0.5475, "step": 1655 }, { "epoch": 0.05982628752657945, "grad_norm": 0.24693989753723145, "learning_rate": 1.993754503963488e-05, "loss": 0.5349, "step": 1660 }, { "epoch": 0.06000648718780409, "grad_norm": 0.2298416942358017, "learning_rate": 1.9997597886139806e-05, "loss": 0.5048, "step": 1665 }, { "epoch": 0.060186686849028725, "grad_norm": 0.2370372712612152, "learning_rate": 2.0057650732644728e-05, "loss": 0.4959, "step": 1670 }, { "epoch": 0.06036688651025336, "grad_norm": 0.25343507528305054, "learning_rate": 2.011770357914965e-05, "loss": 0.5119, "step": 1675 }, { "epoch": 0.060547086171478, "grad_norm": 0.28533029556274414, "learning_rate": 2.0177756425654576e-05, "loss": 0.5539, "step": 1680 }, { "epoch": 0.060727285832702635, "grad_norm": 0.2781274914741516, "learning_rate": 2.02378092721595e-05, "loss": 0.4953, "step": 1685 }, { "epoch": 0.06090748549392727, "grad_norm": 0.33222508430480957, "learning_rate": 2.0297862118664427e-05, "loss": 0.5152, "step": 1690 }, { "epoch": 0.06108768515515191, "grad_norm": 0.2313944697380066, "learning_rate": 2.035791496516935e-05, "loss": 0.5293, "step": 1695 }, { "epoch": 0.061267884816376544, "grad_norm": 0.28105735778808594, "learning_rate": 2.0417967811674275e-05, "loss": 0.5141, "step": 1700 }, { "epoch": 0.06144808447760118, "grad_norm": 0.2256317436695099, "learning_rate": 
2.0478020658179197e-05, "loss": 0.4813, "step": 1705 }, { "epoch": 0.06162828413882582, "grad_norm": 0.23726746439933777, "learning_rate": 2.0538073504684126e-05, "loss": 0.5469, "step": 1710 }, { "epoch": 0.061808483800050454, "grad_norm": 0.27420201897621155, "learning_rate": 2.0598126351189048e-05, "loss": 0.4634, "step": 1715 }, { "epoch": 0.06198868346127509, "grad_norm": 0.2140614092350006, "learning_rate": 2.065817919769397e-05, "loss": 0.5011, "step": 1720 }, { "epoch": 0.06216888312249973, "grad_norm": 0.21357890963554382, "learning_rate": 2.0718232044198896e-05, "loss": 0.4775, "step": 1725 }, { "epoch": 0.06234908278372436, "grad_norm": 0.2980591058731079, "learning_rate": 2.0778284890703818e-05, "loss": 0.5457, "step": 1730 }, { "epoch": 0.062529282444949, "grad_norm": 0.2486668825149536, "learning_rate": 2.0838337737208744e-05, "loss": 0.4759, "step": 1735 }, { "epoch": 0.06270948210617364, "grad_norm": 0.24250219762325287, "learning_rate": 2.089839058371367e-05, "loss": 0.5026, "step": 1740 }, { "epoch": 0.06288968176739827, "grad_norm": 0.2787621319293976, "learning_rate": 2.0958443430218595e-05, "loss": 0.5464, "step": 1745 }, { "epoch": 0.06306988142862291, "grad_norm": 0.23946985602378845, "learning_rate": 2.1018496276723517e-05, "loss": 0.5052, "step": 1750 }, { "epoch": 0.06325008108984755, "grad_norm": 0.26933711767196655, "learning_rate": 2.1078549123228443e-05, "loss": 0.5287, "step": 1755 }, { "epoch": 0.06343028075107218, "grad_norm": 0.2348264902830124, "learning_rate": 2.1138601969733365e-05, "loss": 0.5193, "step": 1760 }, { "epoch": 0.06361048041229682, "grad_norm": 0.26587164402008057, "learning_rate": 2.1198654816238294e-05, "loss": 0.5169, "step": 1765 }, { "epoch": 0.06379068007352146, "grad_norm": 0.2152547687292099, "learning_rate": 2.1258707662743216e-05, "loss": 0.5034, "step": 1770 }, { "epoch": 0.0639708797347461, "grad_norm": 0.27491626143455505, "learning_rate": 2.1318760509248138e-05, "loss": 0.5476, "step": 1775 }, { 
"epoch": 0.06415107939597074, "grad_norm": 0.22118522226810455, "learning_rate": 2.1378813355753064e-05, "loss": 0.5291, "step": 1780 }, { "epoch": 0.06433127905719538, "grad_norm": 0.2910890281200409, "learning_rate": 2.1438866202257986e-05, "loss": 0.5183, "step": 1785 }, { "epoch": 0.06451147871842002, "grad_norm": 0.2631491720676422, "learning_rate": 2.149891904876291e-05, "loss": 0.4692, "step": 1790 }, { "epoch": 0.06469167837964465, "grad_norm": 0.2912589907646179, "learning_rate": 2.1558971895267837e-05, "loss": 0.5036, "step": 1795 }, { "epoch": 0.06487187804086929, "grad_norm": 0.2575485408306122, "learning_rate": 2.1619024741772762e-05, "loss": 0.4924, "step": 1800 }, { "epoch": 0.06505207770209392, "grad_norm": 0.24182820320129395, "learning_rate": 2.1679077588277685e-05, "loss": 0.4854, "step": 1805 }, { "epoch": 0.06523227736331856, "grad_norm": 0.23146407306194305, "learning_rate": 2.173913043478261e-05, "loss": 0.4882, "step": 1810 }, { "epoch": 0.0654124770245432, "grad_norm": 0.2757367789745331, "learning_rate": 2.1799183281287532e-05, "loss": 0.4886, "step": 1815 }, { "epoch": 0.06559267668576783, "grad_norm": 0.22752000391483307, "learning_rate": 2.1859236127792458e-05, "loss": 0.49, "step": 1820 }, { "epoch": 0.06577287634699247, "grad_norm": 0.2603028416633606, "learning_rate": 2.1919288974297383e-05, "loss": 0.4861, "step": 1825 }, { "epoch": 0.0659530760082171, "grad_norm": 0.226707324385643, "learning_rate": 2.1979341820802306e-05, "loss": 0.4598, "step": 1830 }, { "epoch": 0.06613327566944174, "grad_norm": 0.2270716428756714, "learning_rate": 2.203939466730723e-05, "loss": 0.5174, "step": 1835 }, { "epoch": 0.06631347533066638, "grad_norm": 0.2932356595993042, "learning_rate": 2.2099447513812153e-05, "loss": 0.5836, "step": 1840 }, { "epoch": 0.06649367499189102, "grad_norm": 0.24968333542346954, "learning_rate": 2.2159500360317082e-05, "loss": 0.5215, "step": 1845 }, { "epoch": 0.06667387465311565, "grad_norm": 0.3082146942615509, 
"learning_rate": 2.2219553206822005e-05, "loss": 0.5236, "step": 1850 }, { "epoch": 0.06685407431434029, "grad_norm": 0.2535383999347687, "learning_rate": 2.227960605332693e-05, "loss": 0.4725, "step": 1855 }, { "epoch": 0.06703427397556493, "grad_norm": 0.380136102437973, "learning_rate": 2.2339658899831852e-05, "loss": 0.5161, "step": 1860 }, { "epoch": 0.06721447363678956, "grad_norm": 0.24711187183856964, "learning_rate": 2.2399711746336778e-05, "loss": 0.5302, "step": 1865 }, { "epoch": 0.0673946732980142, "grad_norm": 0.285430908203125, "learning_rate": 2.24597645928417e-05, "loss": 0.5437, "step": 1870 }, { "epoch": 0.06757487295923884, "grad_norm": 0.2444837987422943, "learning_rate": 2.2519817439346626e-05, "loss": 0.4988, "step": 1875 }, { "epoch": 0.06775507262046347, "grad_norm": 0.3459201455116272, "learning_rate": 2.257987028585155e-05, "loss": 0.5126, "step": 1880 }, { "epoch": 0.06793527228168811, "grad_norm": 0.23254404962062836, "learning_rate": 2.2639923132356473e-05, "loss": 0.5229, "step": 1885 }, { "epoch": 0.06811547194291274, "grad_norm": 0.3149000406265259, "learning_rate": 2.26999759788614e-05, "loss": 0.5091, "step": 1890 }, { "epoch": 0.06829567160413738, "grad_norm": 0.31007108092308044, "learning_rate": 2.276002882536632e-05, "loss": 0.487, "step": 1895 }, { "epoch": 0.06847587126536202, "grad_norm": 0.25598689913749695, "learning_rate": 2.282008167187125e-05, "loss": 0.5061, "step": 1900 }, { "epoch": 0.06865607092658665, "grad_norm": 0.27555733919143677, "learning_rate": 2.2880134518376172e-05, "loss": 0.5177, "step": 1905 }, { "epoch": 0.06883627058781129, "grad_norm": 0.28343528509140015, "learning_rate": 2.2940187364881098e-05, "loss": 0.5267, "step": 1910 }, { "epoch": 0.06901647024903593, "grad_norm": 0.1938653290271759, "learning_rate": 2.300024021138602e-05, "loss": 0.4926, "step": 1915 }, { "epoch": 0.06919666991026056, "grad_norm": 0.2610265910625458, "learning_rate": 2.3060293057890945e-05, "loss": 0.5187, "step": 1920 }, { 
"epoch": 0.0693768695714852, "grad_norm": 0.20911858975887299, "learning_rate": 2.3120345904395868e-05, "loss": 0.4979, "step": 1925 }, { "epoch": 0.06955706923270984, "grad_norm": 0.2716982662677765, "learning_rate": 2.3180398750900793e-05, "loss": 0.4976, "step": 1930 }, { "epoch": 0.06973726889393447, "grad_norm": 0.3215219974517822, "learning_rate": 2.324045159740572e-05, "loss": 0.5193, "step": 1935 }, { "epoch": 0.06991746855515911, "grad_norm": 0.3229633867740631, "learning_rate": 2.330050444391064e-05, "loss": 0.5057, "step": 1940 }, { "epoch": 0.07009766821638376, "grad_norm": 0.20286689698696136, "learning_rate": 2.3360557290415567e-05, "loss": 0.4657, "step": 1945 }, { "epoch": 0.0702778678776084, "grad_norm": 0.30475685000419617, "learning_rate": 2.342061013692049e-05, "loss": 0.5316, "step": 1950 }, { "epoch": 0.07045806753883303, "grad_norm": 0.273539662361145, "learning_rate": 2.3480662983425418e-05, "loss": 0.5268, "step": 1955 }, { "epoch": 0.07063826720005767, "grad_norm": 0.2909047305583954, "learning_rate": 2.354071582993034e-05, "loss": 0.4885, "step": 1960 }, { "epoch": 0.0708184668612823, "grad_norm": 0.23149296641349792, "learning_rate": 2.3600768676435265e-05, "loss": 0.534, "step": 1965 }, { "epoch": 0.07099866652250694, "grad_norm": 0.32449668645858765, "learning_rate": 2.3660821522940188e-05, "loss": 0.508, "step": 1970 }, { "epoch": 0.07117886618373158, "grad_norm": 0.2767580449581146, "learning_rate": 2.3720874369445113e-05, "loss": 0.5106, "step": 1975 }, { "epoch": 0.07135906584495622, "grad_norm": 0.3347299098968506, "learning_rate": 2.378092721595004e-05, "loss": 0.4898, "step": 1980 }, { "epoch": 0.07153926550618085, "grad_norm": 0.2920180559158325, "learning_rate": 2.384098006245496e-05, "loss": 0.5102, "step": 1985 }, { "epoch": 0.07171946516740549, "grad_norm": 0.2103186845779419, "learning_rate": 2.3901032908959886e-05, "loss": 0.5, "step": 1990 }, { "epoch": 0.07189966482863012, "grad_norm": 0.276729941368103, 
"learning_rate": 2.396108575546481e-05, "loss": 0.4737, "step": 1995 }, { "epoch": 0.07207986448985476, "grad_norm": 0.25289759039878845, "learning_rate": 2.4021138601969734e-05, "loss": 0.4869, "step": 2000 }, { "epoch": 0.07207986448985476, "eval_loss": 0.5271518230438232, "eval_runtime": 3.5218, "eval_samples_per_second": 28.395, "eval_steps_per_second": 7.099, "step": 2000 }, { "epoch": 0.0722600641510794, "grad_norm": 0.2662065923213959, "learning_rate": 2.4081191448474656e-05, "loss": 0.5005, "step": 2005 }, { "epoch": 0.07244026381230403, "grad_norm": 0.2379211187362671, "learning_rate": 2.4141244294979585e-05, "loss": 0.4959, "step": 2010 }, { "epoch": 0.07262046347352867, "grad_norm": 0.25165653228759766, "learning_rate": 2.4201297141484507e-05, "loss": 0.4869, "step": 2015 }, { "epoch": 0.07280066313475331, "grad_norm": 0.21869604289531708, "learning_rate": 2.4261349987989433e-05, "loss": 0.4938, "step": 2020 }, { "epoch": 0.07298086279597794, "grad_norm": 0.21399500966072083, "learning_rate": 2.4321402834494355e-05, "loss": 0.5029, "step": 2025 }, { "epoch": 0.07316106245720258, "grad_norm": 0.24735891819000244, "learning_rate": 2.438145568099928e-05, "loss": 0.503, "step": 2030 }, { "epoch": 0.07334126211842722, "grad_norm": 0.27130669355392456, "learning_rate": 2.4441508527504206e-05, "loss": 0.4726, "step": 2035 }, { "epoch": 0.07352146177965185, "grad_norm": 0.33456870913505554, "learning_rate": 2.450156137400913e-05, "loss": 0.4906, "step": 2040 }, { "epoch": 0.07370166144087649, "grad_norm": 0.3058684766292572, "learning_rate": 2.4561614220514054e-05, "loss": 0.5345, "step": 2045 }, { "epoch": 0.07388186110210113, "grad_norm": 0.24727894365787506, "learning_rate": 2.4621667067018976e-05, "loss": 0.5108, "step": 2050 }, { "epoch": 0.07406206076332576, "grad_norm": 0.32694023847579956, "learning_rate": 2.4681719913523902e-05, "loss": 0.4917, "step": 2055 }, { "epoch": 0.0742422604245504, "grad_norm": 0.3927519917488098, "learning_rate": 
2.4741772760028827e-05, "loss": 0.4762, "step": 2060 }, { "epoch": 0.07442246008577504, "grad_norm": 0.22874149680137634, "learning_rate": 2.4801825606533753e-05, "loss": 0.4849, "step": 2065 }, { "epoch": 0.07460265974699967, "grad_norm": 0.24440160393714905, "learning_rate": 2.4861878453038675e-05, "loss": 0.4888, "step": 2070 }, { "epoch": 0.07478285940822431, "grad_norm": 0.2987866997718811, "learning_rate": 2.49219312995436e-05, "loss": 0.4926, "step": 2075 }, { "epoch": 0.07496305906944895, "grad_norm": 0.2661680281162262, "learning_rate": 2.4981984146048523e-05, "loss": 0.4854, "step": 2080 }, { "epoch": 0.07514325873067358, "grad_norm": 0.23432804644107819, "learning_rate": 2.504203699255345e-05, "loss": 0.4452, "step": 2085 }, { "epoch": 0.07532345839189822, "grad_norm": 0.28125226497650146, "learning_rate": 2.5102089839058374e-05, "loss": 0.4941, "step": 2090 }, { "epoch": 0.07550365805312285, "grad_norm": 0.2805047631263733, "learning_rate": 2.5162142685563296e-05, "loss": 0.4885, "step": 2095 }, { "epoch": 0.07568385771434749, "grad_norm": 0.24063357710838318, "learning_rate": 2.5222195532068222e-05, "loss": 0.4973, "step": 2100 }, { "epoch": 0.07586405737557213, "grad_norm": 0.27712422609329224, "learning_rate": 2.5282248378573147e-05, "loss": 0.5314, "step": 2105 }, { "epoch": 0.07604425703679676, "grad_norm": 0.2542542517185211, "learning_rate": 2.534230122507807e-05, "loss": 0.5142, "step": 2110 }, { "epoch": 0.07622445669802141, "grad_norm": 0.19768337905406952, "learning_rate": 2.5402354071582995e-05, "loss": 0.4794, "step": 2115 }, { "epoch": 0.07640465635924605, "grad_norm": 0.2631590664386749, "learning_rate": 2.5462406918087917e-05, "loss": 0.5131, "step": 2120 }, { "epoch": 0.07658485602047069, "grad_norm": 0.29528722167015076, "learning_rate": 2.5522459764592843e-05, "loss": 0.5186, "step": 2125 }, { "epoch": 0.07676505568169532, "grad_norm": 0.22340206801891327, "learning_rate": 2.558251261109777e-05, "loss": 0.4905, "step": 2130 }, { 
"epoch": 0.07694525534291996, "grad_norm": 0.21741612255573273, "learning_rate": 2.564256545760269e-05, "loss": 0.4528, "step": 2135 }, { "epoch": 0.0771254550041446, "grad_norm": 0.2538367807865143, "learning_rate": 2.5702618304107616e-05, "loss": 0.5053, "step": 2140 }, { "epoch": 0.07730565466536923, "grad_norm": 0.21461638808250427, "learning_rate": 2.5762671150612538e-05, "loss": 0.5069, "step": 2145 }, { "epoch": 0.07748585432659387, "grad_norm": 0.2098444551229477, "learning_rate": 2.5822723997117464e-05, "loss": 0.4934, "step": 2150 }, { "epoch": 0.0776660539878185, "grad_norm": 0.26539066433906555, "learning_rate": 2.5882776843622393e-05, "loss": 0.525, "step": 2155 }, { "epoch": 0.07784625364904314, "grad_norm": 0.23766759037971497, "learning_rate": 2.594282969012731e-05, "loss": 0.473, "step": 2160 }, { "epoch": 0.07802645331026778, "grad_norm": 0.35496985912323, "learning_rate": 2.600288253663224e-05, "loss": 0.5096, "step": 2165 }, { "epoch": 0.07820665297149242, "grad_norm": 0.23349033296108246, "learning_rate": 2.606293538313716e-05, "loss": 0.5294, "step": 2170 }, { "epoch": 0.07838685263271705, "grad_norm": 0.26091858744621277, "learning_rate": 2.6122988229642088e-05, "loss": 0.4993, "step": 2175 }, { "epoch": 0.07856705229394169, "grad_norm": 0.22804076969623566, "learning_rate": 2.6183041076147007e-05, "loss": 0.4726, "step": 2180 }, { "epoch": 0.07874725195516633, "grad_norm": 0.24472366273403168, "learning_rate": 2.6243093922651936e-05, "loss": 0.4801, "step": 2185 }, { "epoch": 0.07892745161639096, "grad_norm": 0.30233868956565857, "learning_rate": 2.630314676915686e-05, "loss": 0.5144, "step": 2190 }, { "epoch": 0.0791076512776156, "grad_norm": 0.2996428608894348, "learning_rate": 2.6363199615661784e-05, "loss": 0.518, "step": 2195 }, { "epoch": 0.07928785093884023, "grad_norm": 0.25679364800453186, "learning_rate": 2.642325246216671e-05, "loss": 0.4516, "step": 2200 }, { "epoch": 0.07946805060006487, "grad_norm": 0.311570942401886, 
"learning_rate": 2.648330530867163e-05, "loss": 0.5341, "step": 2205 }, { "epoch": 0.07964825026128951, "grad_norm": 0.2966238260269165, "learning_rate": 2.6543358155176557e-05, "loss": 0.4725, "step": 2210 }, { "epoch": 0.07982844992251414, "grad_norm": 0.254808247089386, "learning_rate": 2.6603411001681483e-05, "loss": 0.4943, "step": 2215 }, { "epoch": 0.08000864958373878, "grad_norm": 0.23695941269397736, "learning_rate": 2.6663463848186405e-05, "loss": 0.5048, "step": 2220 }, { "epoch": 0.08018884924496342, "grad_norm": 0.29264283180236816, "learning_rate": 2.672351669469133e-05, "loss": 0.4968, "step": 2225 }, { "epoch": 0.08036904890618805, "grad_norm": 0.24488191306591034, "learning_rate": 2.6783569541196252e-05, "loss": 0.4911, "step": 2230 }, { "epoch": 0.08054924856741269, "grad_norm": 0.281082421541214, "learning_rate": 2.6843622387701178e-05, "loss": 0.4946, "step": 2235 }, { "epoch": 0.08072944822863733, "grad_norm": 0.2860982418060303, "learning_rate": 2.6903675234206104e-05, "loss": 0.5168, "step": 2240 }, { "epoch": 0.08090964788986196, "grad_norm": 0.2372538298368454, "learning_rate": 2.6963728080711026e-05, "loss": 0.4603, "step": 2245 }, { "epoch": 0.0810898475510866, "grad_norm": 0.3054511845111847, "learning_rate": 2.702378092721595e-05, "loss": 0.4975, "step": 2250 }, { "epoch": 0.08127004721231124, "grad_norm": 0.3154446482658386, "learning_rate": 2.7083833773720874e-05, "loss": 0.5038, "step": 2255 }, { "epoch": 0.08145024687353587, "grad_norm": 0.2663957476615906, "learning_rate": 2.71438866202258e-05, "loss": 0.5158, "step": 2260 }, { "epoch": 0.08163044653476051, "grad_norm": 0.2612464427947998, "learning_rate": 2.7203939466730728e-05, "loss": 0.5184, "step": 2265 }, { "epoch": 0.08181064619598515, "grad_norm": 0.22396814823150635, "learning_rate": 2.7263992313235647e-05, "loss": 0.5009, "step": 2270 }, { "epoch": 0.08199084585720978, "grad_norm": 0.2750130295753479, "learning_rate": 2.7324045159740576e-05, "loss": 0.5154, "step": 2275 
}, { "epoch": 0.08217104551843442, "grad_norm": 0.29694536328315735, "learning_rate": 2.7384098006245495e-05, "loss": 0.4796, "step": 2280 }, { "epoch": 0.08235124517965907, "grad_norm": 0.20941223204135895, "learning_rate": 2.7444150852750424e-05, "loss": 0.4525, "step": 2285 }, { "epoch": 0.0825314448408837, "grad_norm": 0.3018248677253723, "learning_rate": 2.750420369925535e-05, "loss": 0.5211, "step": 2290 }, { "epoch": 0.08271164450210834, "grad_norm": 0.22867879271507263, "learning_rate": 2.756425654576027e-05, "loss": 0.4702, "step": 2295 }, { "epoch": 0.08289184416333298, "grad_norm": 0.26447781920433044, "learning_rate": 2.7624309392265197e-05, "loss": 0.4893, "step": 2300 }, { "epoch": 0.08307204382455761, "grad_norm": 0.22963860630989075, "learning_rate": 2.768436223877012e-05, "loss": 0.509, "step": 2305 }, { "epoch": 0.08325224348578225, "grad_norm": 0.34469008445739746, "learning_rate": 2.7744415085275045e-05, "loss": 0.4985, "step": 2310 }, { "epoch": 0.08343244314700689, "grad_norm": 0.23541411757469177, "learning_rate": 2.7804467931779963e-05, "loss": 0.4892, "step": 2315 }, { "epoch": 0.08361264280823152, "grad_norm": 0.2870751619338989, "learning_rate": 2.7864520778284892e-05, "loss": 0.4806, "step": 2320 }, { "epoch": 0.08379284246945616, "grad_norm": 0.24497903883457184, "learning_rate": 2.7924573624789818e-05, "loss": 0.4938, "step": 2325 }, { "epoch": 0.0839730421306808, "grad_norm": 0.29667189717292786, "learning_rate": 2.798462647129474e-05, "loss": 0.5338, "step": 2330 }, { "epoch": 0.08415324179190543, "grad_norm": 0.2696093022823334, "learning_rate": 2.8044679317799666e-05, "loss": 0.4821, "step": 2335 }, { "epoch": 0.08433344145313007, "grad_norm": 0.2720743715763092, "learning_rate": 2.8104732164304588e-05, "loss": 0.4867, "step": 2340 }, { "epoch": 0.0845136411143547, "grad_norm": 0.3151923418045044, "learning_rate": 2.8164785010809513e-05, "loss": 0.4833, "step": 2345 }, { "epoch": 0.08469384077557934, "grad_norm": 
0.2476382553577423, "learning_rate": 2.822483785731444e-05, "loss": 0.4882, "step": 2350 }, { "epoch": 0.08487404043680398, "grad_norm": 0.2863009572029114, "learning_rate": 2.828489070381936e-05, "loss": 0.4852, "step": 2355 }, { "epoch": 0.08505424009802862, "grad_norm": 0.26534658670425415, "learning_rate": 2.8344943550324287e-05, "loss": 0.5093, "step": 2360 }, { "epoch": 0.08523443975925325, "grad_norm": 0.2766381800174713, "learning_rate": 2.840499639682921e-05, "loss": 0.4981, "step": 2365 }, { "epoch": 0.08541463942047789, "grad_norm": 0.3110561966896057, "learning_rate": 2.8465049243334134e-05, "loss": 0.5176, "step": 2370 }, { "epoch": 0.08559483908170253, "grad_norm": 0.2818463444709778, "learning_rate": 2.8525102089839063e-05, "loss": 0.4864, "step": 2375 }, { "epoch": 0.08577503874292716, "grad_norm": 0.2455795407295227, "learning_rate": 2.8585154936343982e-05, "loss": 0.5048, "step": 2380 }, { "epoch": 0.0859552384041518, "grad_norm": 0.3062375783920288, "learning_rate": 2.864520778284891e-05, "loss": 0.5028, "step": 2385 }, { "epoch": 0.08613543806537644, "grad_norm": 0.29522883892059326, "learning_rate": 2.870526062935383e-05, "loss": 0.5034, "step": 2390 }, { "epoch": 0.08631563772660107, "grad_norm": 0.3509838283061981, "learning_rate": 2.876531347585876e-05, "loss": 0.461, "step": 2395 }, { "epoch": 0.08649583738782571, "grad_norm": 0.2348550707101822, "learning_rate": 2.8825366322363684e-05, "loss": 0.5092, "step": 2400 }, { "epoch": 0.08667603704905034, "grad_norm": 0.28098031878471375, "learning_rate": 2.8885419168868607e-05, "loss": 0.5046, "step": 2405 }, { "epoch": 0.08685623671027498, "grad_norm": 0.29102039337158203, "learning_rate": 2.8945472015373532e-05, "loss": 0.4669, "step": 2410 }, { "epoch": 0.08703643637149962, "grad_norm": 0.273709774017334, "learning_rate": 2.900552486187845e-05, "loss": 0.5121, "step": 2415 }, { "epoch": 0.08721663603272425, "grad_norm": 0.3082577586174011, "learning_rate": 2.906557770838338e-05, "loss": 
0.5158, "step": 2420 }, { "epoch": 0.08739683569394889, "grad_norm": 0.26482829451560974, "learning_rate": 2.9125630554888305e-05, "loss": 0.4774, "step": 2425 }, { "epoch": 0.08757703535517353, "grad_norm": 0.26536524295806885, "learning_rate": 2.9185683401393228e-05, "loss": 0.4929, "step": 2430 }, { "epoch": 0.08775723501639816, "grad_norm": 0.26633599400520325, "learning_rate": 2.9245736247898153e-05, "loss": 0.4826, "step": 2435 }, { "epoch": 0.0879374346776228, "grad_norm": 0.2031547874212265, "learning_rate": 2.9305789094403075e-05, "loss": 0.4906, "step": 2440 }, { "epoch": 0.08811763433884744, "grad_norm": 0.2550112307071686, "learning_rate": 2.9365841940908e-05, "loss": 0.4968, "step": 2445 }, { "epoch": 0.08829783400007207, "grad_norm": 0.28357723355293274, "learning_rate": 2.9425894787412923e-05, "loss": 0.4956, "step": 2450 }, { "epoch": 0.08847803366129672, "grad_norm": 0.32640430331230164, "learning_rate": 2.948594763391785e-05, "loss": 0.5074, "step": 2455 }, { "epoch": 0.08865823332252136, "grad_norm": 0.240611732006073, "learning_rate": 2.9546000480422774e-05, "loss": 0.4661, "step": 2460 }, { "epoch": 0.088838432983746, "grad_norm": 0.27230045199394226, "learning_rate": 2.9606053326927696e-05, "loss": 0.4959, "step": 2465 }, { "epoch": 0.08901863264497063, "grad_norm": 0.20566551387310028, "learning_rate": 2.9666106173432622e-05, "loss": 0.518, "step": 2470 }, { "epoch": 0.08919883230619527, "grad_norm": 0.2713625729084015, "learning_rate": 2.9726159019937544e-05, "loss": 0.5178, "step": 2475 }, { "epoch": 0.0893790319674199, "grad_norm": 0.2567339837551117, "learning_rate": 2.978621186644247e-05, "loss": 0.498, "step": 2480 }, { "epoch": 0.08955923162864454, "grad_norm": 0.27121180295944214, "learning_rate": 2.98462647129474e-05, "loss": 0.5194, "step": 2485 }, { "epoch": 0.08973943128986918, "grad_norm": 0.35053110122680664, "learning_rate": 2.9906317559452317e-05, "loss": 0.533, "step": 2490 }, { "epoch": 0.08991963095109382, "grad_norm": 
0.2476474940776825, "learning_rate": 2.9966370405957246e-05, "loss": 0.4895, "step": 2495 }, { "epoch": 0.09009983061231845, "grad_norm": 0.22306469082832336, "learning_rate": 3.0026423252462165e-05, "loss": 0.4847, "step": 2500 }, { "epoch": 0.09009983061231845, "eval_loss": 0.5190762877464294, "eval_runtime": 3.513, "eval_samples_per_second": 28.466, "eval_steps_per_second": 7.116, "step": 2500 }, { "epoch": 0.09028003027354309, "grad_norm": 0.2834235429763794, "learning_rate": 3.008647609896709e-05, "loss": 0.5589, "step": 2505 }, { "epoch": 0.09046022993476772, "grad_norm": 0.24811461567878723, "learning_rate": 3.014652894547202e-05, "loss": 0.4679, "step": 2510 }, { "epoch": 0.09064042959599236, "grad_norm": 0.25787869095802307, "learning_rate": 3.020658179197694e-05, "loss": 0.4961, "step": 2515 }, { "epoch": 0.090820629257217, "grad_norm": 0.1960582286119461, "learning_rate": 3.0266634638481867e-05, "loss": 0.5253, "step": 2520 }, { "epoch": 0.09100082891844163, "grad_norm": 0.28364789485931396, "learning_rate": 3.0326687484986786e-05, "loss": 0.4797, "step": 2525 }, { "epoch": 0.09118102857966627, "grad_norm": 0.30647027492523193, "learning_rate": 3.0386740331491715e-05, "loss": 0.5131, "step": 2530 }, { "epoch": 0.09136122824089091, "grad_norm": 0.2918342351913452, "learning_rate": 3.044679317799664e-05, "loss": 0.4649, "step": 2535 }, { "epoch": 0.09154142790211554, "grad_norm": 0.34838005900382996, "learning_rate": 3.0506846024501563e-05, "loss": 0.5218, "step": 2540 }, { "epoch": 0.09172162756334018, "grad_norm": 0.22675694525241852, "learning_rate": 3.056689887100649e-05, "loss": 0.5072, "step": 2545 }, { "epoch": 0.09190182722456482, "grad_norm": 0.3220086991786957, "learning_rate": 3.062695171751141e-05, "loss": 0.4664, "step": 2550 }, { "epoch": 0.09208202688578945, "grad_norm": 0.2494743913412094, "learning_rate": 3.068700456401633e-05, "loss": 0.5067, "step": 2555 }, { "epoch": 0.09226222654701409, "grad_norm": 0.35692298412323, "learning_rate": 
3.0747057410521265e-05, "loss": 0.5213, "step": 2560 }, { "epoch": 0.09244242620823873, "grad_norm": 0.352923721075058, "learning_rate": 3.0807110257026184e-05, "loss": 0.5052, "step": 2565 }, { "epoch": 0.09262262586946336, "grad_norm": 0.2893941402435303, "learning_rate": 3.086716310353111e-05, "loss": 0.4696, "step": 2570 }, { "epoch": 0.092802825530688, "grad_norm": 0.26177549362182617, "learning_rate": 3.092721595003603e-05, "loss": 0.5036, "step": 2575 }, { "epoch": 0.09298302519191264, "grad_norm": 0.22046709060668945, "learning_rate": 3.098726879654096e-05, "loss": 0.4604, "step": 2580 }, { "epoch": 0.09316322485313727, "grad_norm": 0.28608959913253784, "learning_rate": 3.104732164304588e-05, "loss": 0.5139, "step": 2585 }, { "epoch": 0.09334342451436191, "grad_norm": 0.31415843963623047, "learning_rate": 3.1107374489550805e-05, "loss": 0.4915, "step": 2590 }, { "epoch": 0.09352362417558654, "grad_norm": 0.24263805150985718, "learning_rate": 3.116742733605573e-05, "loss": 0.5423, "step": 2595 }, { "epoch": 0.09370382383681118, "grad_norm": 0.30432233214378357, "learning_rate": 3.1227480182560656e-05, "loss": 0.4955, "step": 2600 }, { "epoch": 0.09388402349803582, "grad_norm": 0.2743891775608063, "learning_rate": 3.128753302906558e-05, "loss": 0.5167, "step": 2605 }, { "epoch": 0.09406422315926045, "grad_norm": 0.2813490629196167, "learning_rate": 3.13475858755705e-05, "loss": 0.5374, "step": 2610 }, { "epoch": 0.09424442282048509, "grad_norm": 0.3482913076877594, "learning_rate": 3.1407638722075426e-05, "loss": 0.4621, "step": 2615 }, { "epoch": 0.09442462248170974, "grad_norm": 0.3260897696018219, "learning_rate": 3.146769156858035e-05, "loss": 0.5003, "step": 2620 }, { "epoch": 0.09460482214293438, "grad_norm": 0.27140331268310547, "learning_rate": 3.152774441508528e-05, "loss": 0.5161, "step": 2625 }, { "epoch": 0.09478502180415901, "grad_norm": 0.24953608214855194, "learning_rate": 3.15877972615902e-05, "loss": 0.5059, "step": 2630 }, { "epoch": 
0.09496522146538365, "grad_norm": 0.2428867667913437, "learning_rate": 3.164785010809512e-05, "loss": 0.5285, "step": 2635 }, { "epoch": 0.09514542112660829, "grad_norm": 0.2688390910625458, "learning_rate": 3.170790295460005e-05, "loss": 0.4971, "step": 2640 }, { "epoch": 0.09532562078783292, "grad_norm": 0.292272686958313, "learning_rate": 3.176795580110498e-05, "loss": 0.4781, "step": 2645 }, { "epoch": 0.09550582044905756, "grad_norm": 0.22563514113426208, "learning_rate": 3.18280086476099e-05, "loss": 0.4319, "step": 2650 }, { "epoch": 0.0956860201102822, "grad_norm": 0.31794285774230957, "learning_rate": 3.1888061494114824e-05, "loss": 0.5238, "step": 2655 }, { "epoch": 0.09586621977150683, "grad_norm": 0.36745190620422363, "learning_rate": 3.194811434061974e-05, "loss": 0.4985, "step": 2660 }, { "epoch": 0.09604641943273147, "grad_norm": 0.2950117588043213, "learning_rate": 3.200816718712467e-05, "loss": 0.5128, "step": 2665 }, { "epoch": 0.0962266190939561, "grad_norm": 0.30142202973365784, "learning_rate": 3.20682200336296e-05, "loss": 0.4884, "step": 2670 }, { "epoch": 0.09640681875518074, "grad_norm": 0.34995657205581665, "learning_rate": 3.212827288013452e-05, "loss": 0.5074, "step": 2675 }, { "epoch": 0.09658701841640538, "grad_norm": 0.2538526952266693, "learning_rate": 3.2188325726639445e-05, "loss": 0.4964, "step": 2680 }, { "epoch": 0.09676721807763002, "grad_norm": 0.3425311744213104, "learning_rate": 3.2248378573144364e-05, "loss": 0.4721, "step": 2685 }, { "epoch": 0.09694741773885465, "grad_norm": 0.3075348138809204, "learning_rate": 3.2308431419649296e-05, "loss": 0.5145, "step": 2690 }, { "epoch": 0.09712761740007929, "grad_norm": 0.24702075123786926, "learning_rate": 3.236848426615422e-05, "loss": 0.4793, "step": 2695 }, { "epoch": 0.09730781706130393, "grad_norm": 0.2921466827392578, "learning_rate": 3.242853711265914e-05, "loss": 0.5097, "step": 2700 }, { "epoch": 0.09748801672252856, "grad_norm": 0.22841855883598328, "learning_rate": 
3.2488589959164066e-05, "loss": 0.4905, "step": 2705 }, { "epoch": 0.0976682163837532, "grad_norm": 0.24366632103919983, "learning_rate": 3.254864280566899e-05, "loss": 0.4773, "step": 2710 }, { "epoch": 0.09784841604497783, "grad_norm": 0.2745325565338135, "learning_rate": 3.260869565217392e-05, "loss": 0.492, "step": 2715 }, { "epoch": 0.09802861570620247, "grad_norm": 0.22801122069358826, "learning_rate": 3.266874849867884e-05, "loss": 0.4883, "step": 2720 }, { "epoch": 0.09820881536742711, "grad_norm": 0.316967636346817, "learning_rate": 3.272880134518376e-05, "loss": 0.4811, "step": 2725 }, { "epoch": 0.09838901502865174, "grad_norm": 0.28945192694664, "learning_rate": 3.278885419168869e-05, "loss": 0.4782, "step": 2730 }, { "epoch": 0.09856921468987638, "grad_norm": 0.25695520639419556, "learning_rate": 3.284890703819361e-05, "loss": 0.5038, "step": 2735 }, { "epoch": 0.09874941435110102, "grad_norm": 0.22989587485790253, "learning_rate": 3.290895988469854e-05, "loss": 0.4745, "step": 2740 }, { "epoch": 0.09892961401232565, "grad_norm": 0.28844088315963745, "learning_rate": 3.296901273120346e-05, "loss": 0.4886, "step": 2745 }, { "epoch": 0.09910981367355029, "grad_norm": 0.21202966570854187, "learning_rate": 3.302906557770838e-05, "loss": 0.4926, "step": 2750 }, { "epoch": 0.09929001333477493, "grad_norm": 0.4138276278972626, "learning_rate": 3.308911842421331e-05, "loss": 0.5199, "step": 2755 }, { "epoch": 0.09947021299599956, "grad_norm": 0.22605939209461212, "learning_rate": 3.3149171270718233e-05, "loss": 0.4753, "step": 2760 }, { "epoch": 0.0996504126572242, "grad_norm": 0.25468626618385315, "learning_rate": 3.320922411722316e-05, "loss": 0.483, "step": 2765 }, { "epoch": 0.09983061231844884, "grad_norm": 0.25252848863601685, "learning_rate": 3.326927696372808e-05, "loss": 0.5189, "step": 2770 }, { "epoch": 0.10001081197967347, "grad_norm": 0.29405850172042847, "learning_rate": 3.3329329810233003e-05, "loss": 0.4849, "step": 2775 }, { "epoch": 
0.10019101164089811, "grad_norm": 0.37302690744400024, "learning_rate": 3.3389382656737936e-05, "loss": 0.4963, "step": 2780 }, { "epoch": 0.10037121130212275, "grad_norm": 0.23915189504623413, "learning_rate": 3.3449435503242855e-05, "loss": 0.5278, "step": 2785 }, { "epoch": 0.1005514109633474, "grad_norm": 0.3481108546257019, "learning_rate": 3.350948834974778e-05, "loss": 0.4938, "step": 2790 }, { "epoch": 0.10073161062457203, "grad_norm": 0.23707103729248047, "learning_rate": 3.35695411962527e-05, "loss": 0.4766, "step": 2795 }, { "epoch": 0.10091181028579667, "grad_norm": 0.25163257122039795, "learning_rate": 3.362959404275763e-05, "loss": 0.4613, "step": 2800 }, { "epoch": 0.1010920099470213, "grad_norm": 0.2651137113571167, "learning_rate": 3.368964688926256e-05, "loss": 0.4841, "step": 2805 }, { "epoch": 0.10127220960824594, "grad_norm": 0.32314032316207886, "learning_rate": 3.3749699735767476e-05, "loss": 0.4957, "step": 2810 }, { "epoch": 0.10145240926947058, "grad_norm": 0.3319461941719055, "learning_rate": 3.38097525822724e-05, "loss": 0.492, "step": 2815 }, { "epoch": 0.10163260893069521, "grad_norm": 0.24138695001602173, "learning_rate": 3.386980542877733e-05, "loss": 0.511, "step": 2820 }, { "epoch": 0.10181280859191985, "grad_norm": 0.3053799271583557, "learning_rate": 3.392985827528225e-05, "loss": 0.4911, "step": 2825 }, { "epoch": 0.10199300825314449, "grad_norm": 0.26419997215270996, "learning_rate": 3.398991112178718e-05, "loss": 0.4836, "step": 2830 }, { "epoch": 0.10217320791436912, "grad_norm": 0.2777937352657318, "learning_rate": 3.4049963968292097e-05, "loss": 0.5124, "step": 2835 }, { "epoch": 0.10235340757559376, "grad_norm": 0.2962092459201813, "learning_rate": 3.411001681479702e-05, "loss": 0.4948, "step": 2840 }, { "epoch": 0.1025336072368184, "grad_norm": 0.3595232665538788, "learning_rate": 3.417006966130195e-05, "loss": 0.5122, "step": 2845 }, { "epoch": 0.10271380689804303, "grad_norm": 0.20213566720485687, "learning_rate": 
3.423012250780687e-05, "loss": 0.4886, "step": 2850 }, { "epoch": 0.10289400655926767, "grad_norm": 0.3061508536338806, "learning_rate": 3.42901753543118e-05, "loss": 0.5068, "step": 2855 }, { "epoch": 0.1030742062204923, "grad_norm": 0.27155801653862, "learning_rate": 3.435022820081672e-05, "loss": 0.4583, "step": 2860 }, { "epoch": 0.10325440588171694, "grad_norm": 0.2824406623840332, "learning_rate": 3.441028104732164e-05, "loss": 0.4857, "step": 2865 }, { "epoch": 0.10343460554294158, "grad_norm": 0.2800888121128082, "learning_rate": 3.447033389382657e-05, "loss": 0.4928, "step": 2870 }, { "epoch": 0.10361480520416622, "grad_norm": 0.2530428469181061, "learning_rate": 3.4530386740331494e-05, "loss": 0.4841, "step": 2875 }, { "epoch": 0.10379500486539085, "grad_norm": 0.2559506595134735, "learning_rate": 3.459043958683641e-05, "loss": 0.4919, "step": 2880 }, { "epoch": 0.10397520452661549, "grad_norm": 0.35989275574684143, "learning_rate": 3.465049243334134e-05, "loss": 0.5059, "step": 2885 }, { "epoch": 0.10415540418784013, "grad_norm": 0.2567185163497925, "learning_rate": 3.471054527984627e-05, "loss": 0.4752, "step": 2890 }, { "epoch": 0.10433560384906476, "grad_norm": 0.3426368832588196, "learning_rate": 3.477059812635119e-05, "loss": 0.5043, "step": 2895 }, { "epoch": 0.1045158035102894, "grad_norm": 0.26572245359420776, "learning_rate": 3.4830650972856115e-05, "loss": 0.5032, "step": 2900 }, { "epoch": 0.10469600317151403, "grad_norm": 0.2190892994403839, "learning_rate": 3.4890703819361034e-05, "loss": 0.4637, "step": 2905 }, { "epoch": 0.10487620283273867, "grad_norm": 0.3017270267009735, "learning_rate": 3.4950756665865967e-05, "loss": 0.4957, "step": 2910 }, { "epoch": 0.10505640249396331, "grad_norm": 0.2628999650478363, "learning_rate": 3.501080951237089e-05, "loss": 0.469, "step": 2915 }, { "epoch": 0.10523660215518794, "grad_norm": 0.26654675602912903, "learning_rate": 3.507086235887581e-05, "loss": 0.4872, "step": 2920 }, { "epoch": 
0.10541680181641258, "grad_norm": 0.33271774649620056, "learning_rate": 3.5130915205380736e-05, "loss": 0.465, "step": 2925 }, { "epoch": 0.10559700147763722, "grad_norm": 0.2927808165550232, "learning_rate": 3.519096805188566e-05, "loss": 0.5051, "step": 2930 }, { "epoch": 0.10577720113886185, "grad_norm": 0.2696301341056824, "learning_rate": 3.525102089839059e-05, "loss": 0.459, "step": 2935 }, { "epoch": 0.10595740080008649, "grad_norm": 0.21873407065868378, "learning_rate": 3.531107374489551e-05, "loss": 0.4857, "step": 2940 }, { "epoch": 0.10613760046131113, "grad_norm": 0.22248321771621704, "learning_rate": 3.537112659140043e-05, "loss": 0.4911, "step": 2945 }, { "epoch": 0.10631780012253576, "grad_norm": 0.2554193139076233, "learning_rate": 3.543117943790536e-05, "loss": 0.5309, "step": 2950 }, { "epoch": 0.1064979997837604, "grad_norm": 0.27886924147605896, "learning_rate": 3.549123228441028e-05, "loss": 0.4637, "step": 2955 }, { "epoch": 0.10667819944498505, "grad_norm": 0.23546633124351501, "learning_rate": 3.555128513091521e-05, "loss": 0.4657, "step": 2960 }, { "epoch": 0.10685839910620969, "grad_norm": 0.2205863744020462, "learning_rate": 3.5611337977420134e-05, "loss": 0.493, "step": 2965 }, { "epoch": 0.10703859876743432, "grad_norm": 0.27504095435142517, "learning_rate": 3.567139082392505e-05, "loss": 0.4543, "step": 2970 }, { "epoch": 0.10721879842865896, "grad_norm": 0.27117300033569336, "learning_rate": 3.573144367042998e-05, "loss": 0.4604, "step": 2975 }, { "epoch": 0.1073989980898836, "grad_norm": 0.29087886214256287, "learning_rate": 3.5791496516934904e-05, "loss": 0.515, "step": 2980 }, { "epoch": 0.10757919775110823, "grad_norm": 0.2880057096481323, "learning_rate": 3.585154936343983e-05, "loss": 0.5116, "step": 2985 }, { "epoch": 0.10775939741233287, "grad_norm": 0.26126590371131897, "learning_rate": 3.5911602209944755e-05, "loss": 0.5085, "step": 2990 }, { "epoch": 0.1079395970735575, "grad_norm": 0.34922003746032715, "learning_rate": 
3.5971655056449674e-05, "loss": 0.5063, "step": 2995 }, { "epoch": 0.10811979673478214, "grad_norm": 0.34584909677505493, "learning_rate": 3.6031707902954606e-05, "loss": 0.5323, "step": 3000 }, { "epoch": 0.10811979673478214, "eval_loss": 0.515049397945404, "eval_runtime": 3.5163, "eval_samples_per_second": 28.439, "eval_steps_per_second": 7.11, "step": 3000 }, { "epoch": 0.10829999639600678, "grad_norm": 0.22614799439907074, "learning_rate": 3.6091760749459525e-05, "loss": 0.4944, "step": 3005 }, { "epoch": 0.10848019605723142, "grad_norm": 0.2877223789691925, "learning_rate": 3.615181359596445e-05, "loss": 0.5352, "step": 3010 }, { "epoch": 0.10866039571845605, "grad_norm": 0.2686571478843689, "learning_rate": 3.621186644246937e-05, "loss": 0.4605, "step": 3015 }, { "epoch": 0.10884059537968069, "grad_norm": 0.3369406759738922, "learning_rate": 3.62719192889743e-05, "loss": 0.477, "step": 3020 }, { "epoch": 0.10902079504090532, "grad_norm": 0.27363601326942444, "learning_rate": 3.633197213547923e-05, "loss": 0.503, "step": 3025 }, { "epoch": 0.10920099470212996, "grad_norm": 0.21706698834896088, "learning_rate": 3.6392024981984146e-05, "loss": 0.4783, "step": 3030 }, { "epoch": 0.1093811943633546, "grad_norm": 0.2553368806838989, "learning_rate": 3.645207782848907e-05, "loss": 0.514, "step": 3035 }, { "epoch": 0.10956139402457923, "grad_norm": 0.22971071302890778, "learning_rate": 3.6512130674994e-05, "loss": 0.4765, "step": 3040 }, { "epoch": 0.10974159368580387, "grad_norm": 0.2552284598350525, "learning_rate": 3.657218352149892e-05, "loss": 0.4497, "step": 3045 }, { "epoch": 0.1099217933470285, "grad_norm": 0.336775541305542, "learning_rate": 3.663223636800385e-05, "loss": 0.5162, "step": 3050 }, { "epoch": 0.11010199300825314, "grad_norm": 0.27510499954223633, "learning_rate": 3.669228921450877e-05, "loss": 0.4996, "step": 3055 }, { "epoch": 0.11028219266947778, "grad_norm": 0.2974676787853241, "learning_rate": 3.675234206101369e-05, "loss": 0.4634, "step": 
3060 }, { "epoch": 0.11046239233070242, "grad_norm": 0.2689826190471649, "learning_rate": 3.681239490751862e-05, "loss": 0.5105, "step": 3065 }, { "epoch": 0.11064259199192705, "grad_norm": 0.27652692794799805, "learning_rate": 3.6872447754023544e-05, "loss": 0.5259, "step": 3070 }, { "epoch": 0.11082279165315169, "grad_norm": 0.29832860827445984, "learning_rate": 3.693250060052847e-05, "loss": 0.5042, "step": 3075 }, { "epoch": 0.11100299131437633, "grad_norm": 0.28680893778800964, "learning_rate": 3.699255344703339e-05, "loss": 0.5209, "step": 3080 }, { "epoch": 0.11118319097560096, "grad_norm": 0.3082677125930786, "learning_rate": 3.7052606293538314e-05, "loss": 0.503, "step": 3085 }, { "epoch": 0.1113633906368256, "grad_norm": 0.2548987865447998, "learning_rate": 3.711265914004324e-05, "loss": 0.4714, "step": 3090 }, { "epoch": 0.11154359029805024, "grad_norm": 0.39584243297576904, "learning_rate": 3.7172711986548165e-05, "loss": 0.5219, "step": 3095 }, { "epoch": 0.11172378995927487, "grad_norm": 0.36074209213256836, "learning_rate": 3.723276483305309e-05, "loss": 0.5255, "step": 3100 }, { "epoch": 0.11190398962049951, "grad_norm": 0.22818799316883087, "learning_rate": 3.729281767955801e-05, "loss": 0.4651, "step": 3105 }, { "epoch": 0.11208418928172414, "grad_norm": 0.22764305770397186, "learning_rate": 3.735287052606294e-05, "loss": 0.4482, "step": 3110 }, { "epoch": 0.11226438894294878, "grad_norm": 0.2548796236515045, "learning_rate": 3.741292337256786e-05, "loss": 0.4823, "step": 3115 }, { "epoch": 0.11244458860417342, "grad_norm": 0.2939002811908722, "learning_rate": 3.7472976219072786e-05, "loss": 0.4909, "step": 3120 }, { "epoch": 0.11262478826539805, "grad_norm": 0.3162885308265686, "learning_rate": 3.753302906557771e-05, "loss": 0.4936, "step": 3125 }, { "epoch": 0.1128049879266227, "grad_norm": 0.2588067352771759, "learning_rate": 3.759308191208264e-05, "loss": 0.4921, "step": 3130 }, { "epoch": 0.11298518758784734, "grad_norm": 0.3024102747440338, 
"learning_rate": 3.765313475858756e-05, "loss": 0.4931, "step": 3135 }, { "epoch": 0.11316538724907198, "grad_norm": 0.40092089772224426, "learning_rate": 3.771318760509248e-05, "loss": 0.5415, "step": 3140 }, { "epoch": 0.11334558691029661, "grad_norm": 0.29862499237060547, "learning_rate": 3.777324045159741e-05, "loss": 0.4613, "step": 3145 }, { "epoch": 0.11352578657152125, "grad_norm": 0.3131415843963623, "learning_rate": 3.7833293298102326e-05, "loss": 0.5001, "step": 3150 }, { "epoch": 0.11370598623274589, "grad_norm": 0.2650774419307709, "learning_rate": 3.789334614460726e-05, "loss": 0.4916, "step": 3155 }, { "epoch": 0.11388618589397052, "grad_norm": 0.20322157442569733, "learning_rate": 3.7953398991112184e-05, "loss": 0.4615, "step": 3160 }, { "epoch": 0.11406638555519516, "grad_norm": 0.2679648995399475, "learning_rate": 3.80134518376171e-05, "loss": 0.5039, "step": 3165 }, { "epoch": 0.1142465852164198, "grad_norm": 0.3185202181339264, "learning_rate": 3.807350468412203e-05, "loss": 0.5249, "step": 3170 }, { "epoch": 0.11442678487764443, "grad_norm": 0.31188127398490906, "learning_rate": 3.8133557530626954e-05, "loss": 0.4923, "step": 3175 }, { "epoch": 0.11460698453886907, "grad_norm": 0.2713908553123474, "learning_rate": 3.819361037713188e-05, "loss": 0.4823, "step": 3180 }, { "epoch": 0.1147871842000937, "grad_norm": 0.2642289102077484, "learning_rate": 3.8253663223636805e-05, "loss": 0.4711, "step": 3185 }, { "epoch": 0.11496738386131834, "grad_norm": 0.30651018023490906, "learning_rate": 3.8313716070141724e-05, "loss": 0.5242, "step": 3190 }, { "epoch": 0.11514758352254298, "grad_norm": 0.31043529510498047, "learning_rate": 3.837376891664665e-05, "loss": 0.4922, "step": 3195 }, { "epoch": 0.11532778318376762, "grad_norm": 0.29356715083122253, "learning_rate": 3.8433821763151575e-05, "loss": 0.46, "step": 3200 }, { "epoch": 0.11550798284499225, "grad_norm": 0.3174915611743927, "learning_rate": 3.84938746096565e-05, "loss": 0.4916, "step": 3205 }, { 
"epoch": 0.11568818250621689, "grad_norm": 0.2561566233634949, "learning_rate": 3.8553927456161426e-05, "loss": 0.4666, "step": 3210 }, { "epoch": 0.11586838216744152, "grad_norm": 0.24369752407073975, "learning_rate": 3.8613980302666345e-05, "loss": 0.483, "step": 3215 }, { "epoch": 0.11604858182866616, "grad_norm": 0.24509058892726898, "learning_rate": 3.867403314917128e-05, "loss": 0.4709, "step": 3220 }, { "epoch": 0.1162287814898908, "grad_norm": 0.23607365787029266, "learning_rate": 3.8734085995676196e-05, "loss": 0.5009, "step": 3225 }, { "epoch": 0.11640898115111543, "grad_norm": 0.30435293912887573, "learning_rate": 3.879413884218112e-05, "loss": 0.4995, "step": 3230 }, { "epoch": 0.11658918081234007, "grad_norm": 0.20363330841064453, "learning_rate": 3.885419168868605e-05, "loss": 0.4809, "step": 3235 }, { "epoch": 0.11676938047356471, "grad_norm": 0.25914672017097473, "learning_rate": 3.891424453519097e-05, "loss": 0.4811, "step": 3240 }, { "epoch": 0.11694958013478934, "grad_norm": 0.2945806682109833, "learning_rate": 3.89742973816959e-05, "loss": 0.4874, "step": 3245 }, { "epoch": 0.11712977979601398, "grad_norm": 0.25838515162467957, "learning_rate": 3.903435022820082e-05, "loss": 0.493, "step": 3250 }, { "epoch": 0.11730997945723862, "grad_norm": 0.27623122930526733, "learning_rate": 3.909440307470574e-05, "loss": 0.4873, "step": 3255 }, { "epoch": 0.11749017911846325, "grad_norm": 0.24453672766685486, "learning_rate": 3.915445592121067e-05, "loss": 0.4419, "step": 3260 }, { "epoch": 0.11767037877968789, "grad_norm": 0.26414886116981506, "learning_rate": 3.9214508767715593e-05, "loss": 0.4597, "step": 3265 }, { "epoch": 0.11785057844091253, "grad_norm": 0.246024951338768, "learning_rate": 3.927456161422052e-05, "loss": 0.4923, "step": 3270 }, { "epoch": 0.11803077810213716, "grad_norm": 0.25583988428115845, "learning_rate": 3.933461446072544e-05, "loss": 0.4346, "step": 3275 }, { "epoch": 0.1182109777633618, "grad_norm": 0.2518952190876007, 
"learning_rate": 3.939466730723036e-05, "loss": 0.5041, "step": 3280 }, { "epoch": 0.11839117742458644, "grad_norm": 0.29746881127357483, "learning_rate": 3.945472015373529e-05, "loss": 0.5066, "step": 3285 }, { "epoch": 0.11857137708581107, "grad_norm": 0.2888292074203491, "learning_rate": 3.9514773000240214e-05, "loss": 0.4976, "step": 3290 }, { "epoch": 0.11875157674703571, "grad_norm": 0.2132648229598999, "learning_rate": 3.957482584674514e-05, "loss": 0.4901, "step": 3295 }, { "epoch": 0.11893177640826036, "grad_norm": 0.2614684998989105, "learning_rate": 3.963487869325006e-05, "loss": 0.5054, "step": 3300 }, { "epoch": 0.119111976069485, "grad_norm": 0.2751060128211975, "learning_rate": 3.9694931539754984e-05, "loss": 0.5719, "step": 3305 }, { "epoch": 0.11929217573070963, "grad_norm": 0.2760029435157776, "learning_rate": 3.975498438625991e-05, "loss": 0.4718, "step": 3310 }, { "epoch": 0.11947237539193427, "grad_norm": 0.3538672626018524, "learning_rate": 3.9815037232764836e-05, "loss": 0.5002, "step": 3315 }, { "epoch": 0.1196525750531589, "grad_norm": 0.34897580742836, "learning_rate": 3.987509007926976e-05, "loss": 0.4935, "step": 3320 }, { "epoch": 0.11983277471438354, "grad_norm": 0.3150857388973236, "learning_rate": 3.993514292577468e-05, "loss": 0.5232, "step": 3325 }, { "epoch": 0.12001297437560818, "grad_norm": 0.22704921662807465, "learning_rate": 3.999519577227961e-05, "loss": 0.4848, "step": 3330 }, { "epoch": 0.12019317403683281, "grad_norm": 0.25814276933670044, "learning_rate": 4.005524861878453e-05, "loss": 0.4773, "step": 3335 }, { "epoch": 0.12037337369805745, "grad_norm": 0.2505935728549957, "learning_rate": 4.0115301465289457e-05, "loss": 0.5039, "step": 3340 }, { "epoch": 0.12055357335928209, "grad_norm": 0.26671484112739563, "learning_rate": 4.017535431179438e-05, "loss": 0.486, "step": 3345 }, { "epoch": 0.12073377302050672, "grad_norm": 0.289005309343338, "learning_rate": 4.02354071582993e-05, "loss": 0.5201, "step": 3350 }, { 
"epoch": 0.12091397268173136, "grad_norm": 0.284462571144104, "learning_rate": 4.029546000480423e-05, "loss": 0.5013, "step": 3355 }, { "epoch": 0.121094172342956, "grad_norm": 0.29563307762145996, "learning_rate": 4.035551285130915e-05, "loss": 0.5042, "step": 3360 }, { "epoch": 0.12127437200418063, "grad_norm": 0.2727985680103302, "learning_rate": 4.041556569781408e-05, "loss": 0.46, "step": 3365 }, { "epoch": 0.12145457166540527, "grad_norm": 0.24392500519752502, "learning_rate": 4.0475618544319e-05, "loss": 0.4903, "step": 3370 }, { "epoch": 0.1216347713266299, "grad_norm": 0.261043518781662, "learning_rate": 4.053567139082393e-05, "loss": 0.4818, "step": 3375 }, { "epoch": 0.12181497098785454, "grad_norm": 0.294289767742157, "learning_rate": 4.0595724237328854e-05, "loss": 0.48, "step": 3380 }, { "epoch": 0.12199517064907918, "grad_norm": 0.300879567861557, "learning_rate": 4.065577708383377e-05, "loss": 0.4833, "step": 3385 }, { "epoch": 0.12217537031030382, "grad_norm": 0.23128901422023773, "learning_rate": 4.07158299303387e-05, "loss": 0.5047, "step": 3390 }, { "epoch": 0.12235556997152845, "grad_norm": 0.3030316233634949, "learning_rate": 4.0775882776843624e-05, "loss": 0.478, "step": 3395 }, { "epoch": 0.12253576963275309, "grad_norm": 0.25957420468330383, "learning_rate": 4.083593562334855e-05, "loss": 0.5159, "step": 3400 }, { "epoch": 0.12271596929397773, "grad_norm": 0.2559145390987396, "learning_rate": 4.0895988469853475e-05, "loss": 0.4766, "step": 3405 }, { "epoch": 0.12289616895520236, "grad_norm": 0.3006746768951416, "learning_rate": 4.0956041316358394e-05, "loss": 0.4874, "step": 3410 }, { "epoch": 0.123076368616427, "grad_norm": 0.3356633484363556, "learning_rate": 4.101609416286332e-05, "loss": 0.5086, "step": 3415 }, { "epoch": 0.12325656827765163, "grad_norm": 0.3298927843570709, "learning_rate": 4.107614700936825e-05, "loss": 0.5317, "step": 3420 }, { "epoch": 0.12343676793887627, "grad_norm": 0.30408477783203125, "learning_rate": 
4.113619985587317e-05, "loss": 0.5221, "step": 3425 }, { "epoch": 0.12361696760010091, "grad_norm": 0.300484299659729, "learning_rate": 4.1196252702378096e-05, "loss": 0.4728, "step": 3430 }, { "epoch": 0.12379716726132554, "grad_norm": 0.3350473642349243, "learning_rate": 4.1256305548883015e-05, "loss": 0.4809, "step": 3435 }, { "epoch": 0.12397736692255018, "grad_norm": 0.23786677420139313, "learning_rate": 4.131635839538794e-05, "loss": 0.4957, "step": 3440 }, { "epoch": 0.12415756658377482, "grad_norm": 0.29893916845321655, "learning_rate": 4.1376411241892866e-05, "loss": 0.5178, "step": 3445 }, { "epoch": 0.12433776624499945, "grad_norm": 0.29565131664276123, "learning_rate": 4.143646408839779e-05, "loss": 0.4833, "step": 3450 }, { "epoch": 0.12451796590622409, "grad_norm": 0.27151501178741455, "learning_rate": 4.149651693490272e-05, "loss": 0.4696, "step": 3455 }, { "epoch": 0.12469816556744873, "grad_norm": 0.23224970698356628, "learning_rate": 4.1556569781407636e-05, "loss": 0.4969, "step": 3460 }, { "epoch": 0.12487836522867338, "grad_norm": 0.20471958816051483, "learning_rate": 4.161662262791257e-05, "loss": 0.4783, "step": 3465 }, { "epoch": 0.125058564889898, "grad_norm": 0.24922746419906616, "learning_rate": 4.167667547441749e-05, "loss": 0.475, "step": 3470 }, { "epoch": 0.12523876455112265, "grad_norm": 0.29032933712005615, "learning_rate": 4.173672832092241e-05, "loss": 0.4964, "step": 3475 }, { "epoch": 0.12541896421234727, "grad_norm": 0.2781813144683838, "learning_rate": 4.179678116742734e-05, "loss": 0.5235, "step": 3480 }, { "epoch": 0.12559916387357192, "grad_norm": 0.20956522226333618, "learning_rate": 4.1856834013932264e-05, "loss": 0.4646, "step": 3485 }, { "epoch": 0.12577936353479655, "grad_norm": 0.2584211528301239, "learning_rate": 4.191688686043719e-05, "loss": 0.5034, "step": 3490 }, { "epoch": 0.1259595631960212, "grad_norm": 0.2762541174888611, "learning_rate": 4.197693970694211e-05, "loss": 0.4318, "step": 3495 }, { "epoch": 
0.12613976285724582, "grad_norm": 0.27284300327301025, "learning_rate": 4.2036992553447034e-05, "loss": 0.5069, "step": 3500 }, { "epoch": 0.12613976285724582, "eval_loss": 0.5114214420318604, "eval_runtime": 3.5175, "eval_samples_per_second": 28.429, "eval_steps_per_second": 7.107, "step": 3500 }, { "epoch": 0.12631996251847047, "grad_norm": 0.27490076422691345, "learning_rate": 4.209704539995196e-05, "loss": 0.5157, "step": 3505 }, { "epoch": 0.1265001621796951, "grad_norm": 0.27542826533317566, "learning_rate": 4.2157098246456885e-05, "loss": 0.4931, "step": 3510 }, { "epoch": 0.12668036184091974, "grad_norm": 0.25971895456314087, "learning_rate": 4.221715109296181e-05, "loss": 0.4658, "step": 3515 }, { "epoch": 0.12686056150214436, "grad_norm": 0.26690730452537537, "learning_rate": 4.227720393946673e-05, "loss": 0.4838, "step": 3520 }, { "epoch": 0.12704076116336901, "grad_norm": 0.3250514566898346, "learning_rate": 4.2337256785971655e-05, "loss": 0.5006, "step": 3525 }, { "epoch": 0.12722096082459364, "grad_norm": 0.23514829576015472, "learning_rate": 4.239730963247659e-05, "loss": 0.4743, "step": 3530 }, { "epoch": 0.1274011604858183, "grad_norm": 0.2410300076007843, "learning_rate": 4.2457362478981506e-05, "loss": 0.4643, "step": 3535 }, { "epoch": 0.1275813601470429, "grad_norm": 0.3281730115413666, "learning_rate": 4.251741532548643e-05, "loss": 0.4714, "step": 3540 }, { "epoch": 0.12776155980826756, "grad_norm": 0.28623196482658386, "learning_rate": 4.257746817199135e-05, "loss": 0.5064, "step": 3545 }, { "epoch": 0.1279417594694922, "grad_norm": 0.2539494037628174, "learning_rate": 4.2637521018496276e-05, "loss": 0.4777, "step": 3550 }, { "epoch": 0.12812195913071683, "grad_norm": 0.26674479246139526, "learning_rate": 4.269757386500121e-05, "loss": 0.4719, "step": 3555 }, { "epoch": 0.12830215879194148, "grad_norm": 0.23590758442878723, "learning_rate": 4.275762671150613e-05, "loss": 0.4984, "step": 3560 }, { "epoch": 0.1284823584531661, "grad_norm": 
0.26926812529563904, "learning_rate": 4.281767955801105e-05, "loss": 0.5151, "step": 3565 }, { "epoch": 0.12866255811439076, "grad_norm": 0.2809188663959503, "learning_rate": 4.287773240451597e-05, "loss": 0.5129, "step": 3570 }, { "epoch": 0.12884275777561538, "grad_norm": 0.2685711681842804, "learning_rate": 4.2937785251020904e-05, "loss": 0.4916, "step": 3575 }, { "epoch": 0.12902295743684003, "grad_norm": 0.2880760431289673, "learning_rate": 4.299783809752582e-05, "loss": 0.504, "step": 3580 }, { "epoch": 0.12920315709806465, "grad_norm": 0.28242960572242737, "learning_rate": 4.305789094403075e-05, "loss": 0.51, "step": 3585 }, { "epoch": 0.1293833567592893, "grad_norm": 0.22377252578735352, "learning_rate": 4.3117943790535674e-05, "loss": 0.471, "step": 3590 }, { "epoch": 0.12956355642051393, "grad_norm": 0.23554405570030212, "learning_rate": 4.31779966370406e-05, "loss": 0.497, "step": 3595 }, { "epoch": 0.12974375608173858, "grad_norm": 0.3754778802394867, "learning_rate": 4.3238049483545525e-05, "loss": 0.4746, "step": 3600 }, { "epoch": 0.1299239557429632, "grad_norm": 0.2620472013950348, "learning_rate": 4.3298102330050444e-05, "loss": 0.4682, "step": 3605 }, { "epoch": 0.13010415540418785, "grad_norm": 0.25988104939460754, "learning_rate": 4.335815517655537e-05, "loss": 0.4681, "step": 3610 }, { "epoch": 0.13028435506541247, "grad_norm": 0.2066163271665573, "learning_rate": 4.3418208023060295e-05, "loss": 0.4465, "step": 3615 }, { "epoch": 0.13046455472663712, "grad_norm": 0.28050124645233154, "learning_rate": 4.347826086956522e-05, "loss": 0.4737, "step": 3620 }, { "epoch": 0.13064475438786174, "grad_norm": 0.32841894030570984, "learning_rate": 4.3538313716070146e-05, "loss": 0.4578, "step": 3625 }, { "epoch": 0.1308249540490864, "grad_norm": 0.2937125861644745, "learning_rate": 4.3598366562575065e-05, "loss": 0.4655, "step": 3630 }, { "epoch": 0.13100515371031102, "grad_norm": 0.2880992889404297, "learning_rate": 4.365841940907999e-05, "loss": 0.48, 
"step": 3635 }, { "epoch": 0.13118535337153567, "grad_norm": 0.23047612607479095, "learning_rate": 4.3718472255584916e-05, "loss": 0.5327, "step": 3640 }, { "epoch": 0.1313655530327603, "grad_norm": 0.27166831493377686, "learning_rate": 4.377852510208984e-05, "loss": 0.4772, "step": 3645 }, { "epoch": 0.13154575269398494, "grad_norm": 0.3133472502231598, "learning_rate": 4.383857794859477e-05, "loss": 0.5161, "step": 3650 }, { "epoch": 0.13172595235520956, "grad_norm": 0.25573498010635376, "learning_rate": 4.3898630795099686e-05, "loss": 0.5252, "step": 3655 }, { "epoch": 0.1319061520164342, "grad_norm": 0.23422662913799286, "learning_rate": 4.395868364160461e-05, "loss": 0.4795, "step": 3660 }, { "epoch": 0.13208635167765884, "grad_norm": 0.19992312788963318, "learning_rate": 4.4018736488109544e-05, "loss": 0.5103, "step": 3665 }, { "epoch": 0.1322665513388835, "grad_norm": 0.22987040877342224, "learning_rate": 4.407878933461446e-05, "loss": 0.5037, "step": 3670 }, { "epoch": 0.1324467510001081, "grad_norm": 0.35055404901504517, "learning_rate": 4.413884218111939e-05, "loss": 0.5178, "step": 3675 }, { "epoch": 0.13262695066133276, "grad_norm": 0.2555431127548218, "learning_rate": 4.419889502762431e-05, "loss": 0.4925, "step": 3680 }, { "epoch": 0.13280715032255738, "grad_norm": 0.2736440598964691, "learning_rate": 4.425894787412924e-05, "loss": 0.5068, "step": 3685 }, { "epoch": 0.13298734998378203, "grad_norm": 0.20614711940288544, "learning_rate": 4.4319000720634165e-05, "loss": 0.4919, "step": 3690 }, { "epoch": 0.13316754964500666, "grad_norm": 0.29815688729286194, "learning_rate": 4.4379053567139083e-05, "loss": 0.5117, "step": 3695 }, { "epoch": 0.1333477493062313, "grad_norm": 0.23376838862895966, "learning_rate": 4.443910641364401e-05, "loss": 0.5105, "step": 3700 }, { "epoch": 0.13352794896745593, "grad_norm": 0.27070969343185425, "learning_rate": 4.4499159260148935e-05, "loss": 0.4692, "step": 3705 }, { "epoch": 0.13370814862868058, "grad_norm": 
0.2089190036058426, "learning_rate": 4.455921210665386e-05, "loss": 0.4596, "step": 3710 }, { "epoch": 0.1338883482899052, "grad_norm": 0.331286758184433, "learning_rate": 4.461926495315878e-05, "loss": 0.5144, "step": 3715 }, { "epoch": 0.13406854795112985, "grad_norm": 0.2310916930437088, "learning_rate": 4.4679317799663705e-05, "loss": 0.5188, "step": 3720 }, { "epoch": 0.1342487476123545, "grad_norm": 0.21956689655780792, "learning_rate": 4.473937064616863e-05, "loss": 0.4799, "step": 3725 }, { "epoch": 0.13442894727357912, "grad_norm": 0.2846265733242035, "learning_rate": 4.4799423492673556e-05, "loss": 0.525, "step": 3730 }, { "epoch": 0.13460914693480378, "grad_norm": 0.2660646140575409, "learning_rate": 4.485947633917848e-05, "loss": 0.451, "step": 3735 }, { "epoch": 0.1347893465960284, "grad_norm": 0.30038926005363464, "learning_rate": 4.49195291856834e-05, "loss": 0.4577, "step": 3740 }, { "epoch": 0.13496954625725305, "grad_norm": 0.23106199502944946, "learning_rate": 4.4979582032188326e-05, "loss": 0.5007, "step": 3745 }, { "epoch": 0.13514974591847767, "grad_norm": 0.24195370078086853, "learning_rate": 4.503963487869325e-05, "loss": 0.4571, "step": 3750 }, { "epoch": 0.13532994557970232, "grad_norm": 0.3315066993236542, "learning_rate": 4.509968772519818e-05, "loss": 0.5199, "step": 3755 }, { "epoch": 0.13551014524092694, "grad_norm": 0.23671256005764008, "learning_rate": 4.51597405717031e-05, "loss": 0.486, "step": 3760 }, { "epoch": 0.1356903449021516, "grad_norm": 0.2768378257751465, "learning_rate": 4.521979341820802e-05, "loss": 0.478, "step": 3765 }, { "epoch": 0.13587054456337622, "grad_norm": 0.20859333872795105, "learning_rate": 4.5279846264712947e-05, "loss": 0.4651, "step": 3770 }, { "epoch": 0.13605074422460087, "grad_norm": 0.28573447465896606, "learning_rate": 4.533989911121788e-05, "loss": 0.4645, "step": 3775 }, { "epoch": 0.1362309438858255, "grad_norm": 0.2684060335159302, "learning_rate": 4.53999519577228e-05, "loss": 0.4407, "step": 
3780 }, { "epoch": 0.13641114354705014, "grad_norm": 0.24884513020515442, "learning_rate": 4.546000480422772e-05, "loss": 0.4747, "step": 3785 }, { "epoch": 0.13659134320827476, "grad_norm": 0.25004905462265015, "learning_rate": 4.552005765073264e-05, "loss": 0.4725, "step": 3790 }, { "epoch": 0.1367715428694994, "grad_norm": 0.28418663144111633, "learning_rate": 4.5580110497237574e-05, "loss": 0.5091, "step": 3795 }, { "epoch": 0.13695174253072404, "grad_norm": 0.2636741101741791, "learning_rate": 4.56401633437425e-05, "loss": 0.4824, "step": 3800 }, { "epoch": 0.13713194219194869, "grad_norm": 0.22315765917301178, "learning_rate": 4.570021619024742e-05, "loss": 0.4479, "step": 3805 }, { "epoch": 0.1373121418531733, "grad_norm": 0.26647093892097473, "learning_rate": 4.5760269036752344e-05, "loss": 0.4664, "step": 3810 }, { "epoch": 0.13749234151439796, "grad_norm": 0.26308777928352356, "learning_rate": 4.582032188325727e-05, "loss": 0.5274, "step": 3815 }, { "epoch": 0.13767254117562258, "grad_norm": 0.2349170744419098, "learning_rate": 4.5880374729762195e-05, "loss": 0.4634, "step": 3820 }, { "epoch": 0.13785274083684723, "grad_norm": 0.21153903007507324, "learning_rate": 4.594042757626712e-05, "loss": 0.4549, "step": 3825 }, { "epoch": 0.13803294049807185, "grad_norm": 0.282583624124527, "learning_rate": 4.600048042277204e-05, "loss": 0.5048, "step": 3830 }, { "epoch": 0.1382131401592965, "grad_norm": 0.24750690162181854, "learning_rate": 4.6060533269276965e-05, "loss": 0.5016, "step": 3835 }, { "epoch": 0.13839333982052113, "grad_norm": 0.28650569915771484, "learning_rate": 4.612058611578189e-05, "loss": 0.503, "step": 3840 }, { "epoch": 0.13857353948174578, "grad_norm": 0.28856590390205383, "learning_rate": 4.6180638962286817e-05, "loss": 0.4748, "step": 3845 }, { "epoch": 0.1387537391429704, "grad_norm": 0.24171088635921478, "learning_rate": 4.6240691808791735e-05, "loss": 0.4835, "step": 3850 }, { "epoch": 0.13893393880419505, "grad_norm": 
0.3100346028804779, "learning_rate": 4.630074465529666e-05, "loss": 0.4756, "step": 3855 }, { "epoch": 0.13911413846541967, "grad_norm": 0.18996688723564148, "learning_rate": 4.6360797501801586e-05, "loss": 0.5116, "step": 3860 }, { "epoch": 0.13929433812664432, "grad_norm": 0.2496083527803421, "learning_rate": 4.642085034830651e-05, "loss": 0.4598, "step": 3865 }, { "epoch": 0.13947453778786895, "grad_norm": 0.27395179867744446, "learning_rate": 4.648090319481144e-05, "loss": 0.4652, "step": 3870 }, { "epoch": 0.1396547374490936, "grad_norm": 0.24019189178943634, "learning_rate": 4.6540956041316356e-05, "loss": 0.4794, "step": 3875 }, { "epoch": 0.13983493711031822, "grad_norm": 0.24794286489486694, "learning_rate": 4.660100888782128e-05, "loss": 0.4893, "step": 3880 }, { "epoch": 0.14001513677154287, "grad_norm": 0.2861761450767517, "learning_rate": 4.6661061734326214e-05, "loss": 0.5465, "step": 3885 }, { "epoch": 0.14019533643276752, "grad_norm": 0.24302253127098083, "learning_rate": 4.672111458083113e-05, "loss": 0.5152, "step": 3890 }, { "epoch": 0.14037553609399214, "grad_norm": 0.25176292657852173, "learning_rate": 4.678116742733606e-05, "loss": 0.5149, "step": 3895 }, { "epoch": 0.1405557357552168, "grad_norm": 0.3168286085128784, "learning_rate": 4.684122027384098e-05, "loss": 0.5241, "step": 3900 }, { "epoch": 0.14073593541644142, "grad_norm": 0.3177297115325928, "learning_rate": 4.690127312034591e-05, "loss": 0.5131, "step": 3905 }, { "epoch": 0.14091613507766607, "grad_norm": 0.30368152260780334, "learning_rate": 4.6961325966850835e-05, "loss": 0.5248, "step": 3910 }, { "epoch": 0.1410963347388907, "grad_norm": 0.2546997368335724, "learning_rate": 4.7021378813355754e-05, "loss": 0.4857, "step": 3915 }, { "epoch": 0.14127653440011534, "grad_norm": 0.23904703557491302, "learning_rate": 4.708143165986068e-05, "loss": 0.4906, "step": 3920 }, { "epoch": 0.14145673406133996, "grad_norm": 0.26533734798431396, "learning_rate": 4.7141484506365605e-05, "loss": 
0.4906, "step": 3925 }, { "epoch": 0.1416369337225646, "grad_norm": 0.3533400297164917, "learning_rate": 4.720153735287053e-05, "loss": 0.4932, "step": 3930 }, { "epoch": 0.14181713338378923, "grad_norm": 0.31380826234817505, "learning_rate": 4.7261590199375456e-05, "loss": 0.4983, "step": 3935 }, { "epoch": 0.14199733304501388, "grad_norm": 0.25796714425086975, "learning_rate": 4.7321643045880375e-05, "loss": 0.4602, "step": 3940 }, { "epoch": 0.1421775327062385, "grad_norm": 0.2745930850505829, "learning_rate": 4.73816958923853e-05, "loss": 0.4592, "step": 3945 }, { "epoch": 0.14235773236746316, "grad_norm": 0.24087023735046387, "learning_rate": 4.7441748738890226e-05, "loss": 0.4606, "step": 3950 }, { "epoch": 0.14253793202868778, "grad_norm": 0.2588431239128113, "learning_rate": 4.750180158539515e-05, "loss": 0.4881, "step": 3955 }, { "epoch": 0.14271813168991243, "grad_norm": 0.26731452345848083, "learning_rate": 4.756185443190008e-05, "loss": 0.4715, "step": 3960 }, { "epoch": 0.14289833135113705, "grad_norm": 0.2952512204647064, "learning_rate": 4.7621907278404996e-05, "loss": 0.4721, "step": 3965 }, { "epoch": 0.1430785310123617, "grad_norm": 0.19790342450141907, "learning_rate": 4.768196012490992e-05, "loss": 0.473, "step": 3970 }, { "epoch": 0.14325873067358633, "grad_norm": 0.23839952051639557, "learning_rate": 4.774201297141485e-05, "loss": 0.5295, "step": 3975 }, { "epoch": 0.14343893033481098, "grad_norm": 0.2507927715778351, "learning_rate": 4.780206581791977e-05, "loss": 0.4815, "step": 3980 }, { "epoch": 0.1436191299960356, "grad_norm": 0.24117940664291382, "learning_rate": 4.786211866442469e-05, "loss": 0.4546, "step": 3985 }, { "epoch": 0.14379932965726025, "grad_norm": 0.2980533838272095, "learning_rate": 4.792217151092962e-05, "loss": 0.4767, "step": 3990 }, { "epoch": 0.14397952931848487, "grad_norm": 0.2951893210411072, "learning_rate": 4.798222435743455e-05, "loss": 0.4729, "step": 3995 }, { "epoch": 0.14415972897970952, "grad_norm": 
0.19777187705039978, "learning_rate": 4.804227720393947e-05, "loss": 0.4814, "step": 4000 }, { "epoch": 0.14415972897970952, "eval_loss": 0.5076904296875, "eval_runtime": 3.5171, "eval_samples_per_second": 28.432, "eval_steps_per_second": 7.108, "step": 4000 }, { "epoch": 0.14433992864093415, "grad_norm": 0.21895861625671387, "learning_rate": 4.8102330050444394e-05, "loss": 0.495, "step": 4005 }, { "epoch": 0.1445201283021588, "grad_norm": 0.2850351631641388, "learning_rate": 4.816238289694931e-05, "loss": 0.4767, "step": 4010 }, { "epoch": 0.14470032796338342, "grad_norm": 0.26362937688827515, "learning_rate": 4.8222435743454245e-05, "loss": 0.5319, "step": 4015 }, { "epoch": 0.14488052762460807, "grad_norm": 0.25553131103515625, "learning_rate": 4.828248858995917e-05, "loss": 0.4739, "step": 4020 }, { "epoch": 0.1450607272858327, "grad_norm": 0.2785678505897522, "learning_rate": 4.834254143646409e-05, "loss": 0.5092, "step": 4025 }, { "epoch": 0.14524092694705734, "grad_norm": 0.25108572840690613, "learning_rate": 4.8402594282969015e-05, "loss": 0.496, "step": 4030 }, { "epoch": 0.14542112660828196, "grad_norm": 0.21198877692222595, "learning_rate": 4.8462647129473934e-05, "loss": 0.4941, "step": 4035 }, { "epoch": 0.14560132626950661, "grad_norm": 0.23192477226257324, "learning_rate": 4.8522699975978866e-05, "loss": 0.486, "step": 4040 }, { "epoch": 0.14578152593073124, "grad_norm": 0.25340479612350464, "learning_rate": 4.858275282248379e-05, "loss": 0.5056, "step": 4045 }, { "epoch": 0.1459617255919559, "grad_norm": 0.28752046823501587, "learning_rate": 4.864280566898871e-05, "loss": 0.4944, "step": 4050 }, { "epoch": 0.1461419252531805, "grad_norm": 0.2980968952178955, "learning_rate": 4.8702858515493636e-05, "loss": 0.4604, "step": 4055 }, { "epoch": 0.14632212491440516, "grad_norm": 0.2499280720949173, "learning_rate": 4.876291136199856e-05, "loss": 0.52, "step": 4060 }, { "epoch": 0.1465023245756298, "grad_norm": 0.2250458002090454, "learning_rate": 
4.882296420850349e-05, "loss": 0.475, "step": 4065 }, { "epoch": 0.14668252423685443, "grad_norm": 0.20067259669303894, "learning_rate": 4.888301705500841e-05, "loss": 0.4465, "step": 4070 }, { "epoch": 0.14686272389807908, "grad_norm": 0.2999953329563141, "learning_rate": 4.894306990151333e-05, "loss": 0.5123, "step": 4075 }, { "epoch": 0.1470429235593037, "grad_norm": 0.26574891805648804, "learning_rate": 4.900312274801826e-05, "loss": 0.4827, "step": 4080 }, { "epoch": 0.14722312322052836, "grad_norm": 0.2696743309497833, "learning_rate": 4.906317559452318e-05, "loss": 0.4896, "step": 4085 }, { "epoch": 0.14740332288175298, "grad_norm": 0.3257639706134796, "learning_rate": 4.912322844102811e-05, "loss": 0.5203, "step": 4090 }, { "epoch": 0.14758352254297763, "grad_norm": 0.2311761975288391, "learning_rate": 4.9183281287533034e-05, "loss": 0.502, "step": 4095 }, { "epoch": 0.14776372220420225, "grad_norm": 0.26127567887306213, "learning_rate": 4.924333413403795e-05, "loss": 0.4544, "step": 4100 }, { "epoch": 0.1479439218654269, "grad_norm": 0.25755074620246887, "learning_rate": 4.9303386980542885e-05, "loss": 0.4996, "step": 4105 }, { "epoch": 0.14812412152665153, "grad_norm": 0.23005364835262299, "learning_rate": 4.9363439827047804e-05, "loss": 0.5183, "step": 4110 }, { "epoch": 0.14830432118787618, "grad_norm": 0.20392531156539917, "learning_rate": 4.942349267355273e-05, "loss": 0.4814, "step": 4115 }, { "epoch": 0.1484845208491008, "grad_norm": 0.2658933401107788, "learning_rate": 4.9483545520057655e-05, "loss": 0.5149, "step": 4120 }, { "epoch": 0.14866472051032545, "grad_norm": 0.19606181979179382, "learning_rate": 4.9543598366562574e-05, "loss": 0.4621, "step": 4125 }, { "epoch": 0.14884492017155007, "grad_norm": 0.25544169545173645, "learning_rate": 4.9603651213067506e-05, "loss": 0.5054, "step": 4130 }, { "epoch": 0.14902511983277472, "grad_norm": 0.25996389985084534, "learning_rate": 4.9663704059572425e-05, "loss": 0.4607, "step": 4135 }, { "epoch": 
0.14920531949399934, "grad_norm": 0.22277650237083435, "learning_rate": 4.972375690607735e-05, "loss": 0.4594, "step": 4140 }, { "epoch": 0.149385519155224, "grad_norm": 0.24612875282764435, "learning_rate": 4.978380975258227e-05, "loss": 0.4942, "step": 4145 }, { "epoch": 0.14956571881644862, "grad_norm": 0.3200278878211975, "learning_rate": 4.98438625990872e-05, "loss": 0.497, "step": 4150 }, { "epoch": 0.14974591847767327, "grad_norm": 0.2536075711250305, "learning_rate": 4.990391544559213e-05, "loss": 0.4744, "step": 4155 }, { "epoch": 0.1499261181388979, "grad_norm": 0.22233828902244568, "learning_rate": 4.9963968292097046e-05, "loss": 0.4684, "step": 4160 }, { "epoch": 0.15010631780012254, "grad_norm": 0.2499028444290161, "learning_rate": 4.999999997275039e-05, "loss": 0.4484, "step": 4165 }, { "epoch": 0.15028651746134716, "grad_norm": 0.2228631228208542, "learning_rate": 4.9999999666192246e-05, "loss": 0.4891, "step": 4170 }, { "epoch": 0.1504667171225718, "grad_norm": 0.25544434785842896, "learning_rate": 4.9999999019013944e-05, "loss": 0.4893, "step": 4175 }, { "epoch": 0.15064691678379644, "grad_norm": 0.22523342072963715, "learning_rate": 4.99999980312155e-05, "loss": 0.4708, "step": 4180 }, { "epoch": 0.1508271164450211, "grad_norm": 0.2670489549636841, "learning_rate": 4.999999670279692e-05, "loss": 0.4553, "step": 4185 }, { "epoch": 0.1510073161062457, "grad_norm": 0.19969885051250458, "learning_rate": 4.999999503375823e-05, "loss": 0.4742, "step": 4190 }, { "epoch": 0.15118751576747036, "grad_norm": 0.26136794686317444, "learning_rate": 4.9999993024099446e-05, "loss": 0.5093, "step": 4195 }, { "epoch": 0.15136771542869498, "grad_norm": 0.265652596950531, "learning_rate": 4.999999067382059e-05, "loss": 0.4894, "step": 4200 }, { "epoch": 0.15154791508991963, "grad_norm": 0.22276762127876282, "learning_rate": 4.999998798292171e-05, "loss": 0.4643, "step": 4205 }, { "epoch": 0.15172811475114426, "grad_norm": 0.18586856126785278, "learning_rate": 
4.9999984951402834e-05, "loss": 0.4719, "step": 4210 }, { "epoch": 0.1519083144123689, "grad_norm": 0.2172873467206955, "learning_rate": 4.9999981579263997e-05, "loss": 0.4829, "step": 4215 }, { "epoch": 0.15208851407359353, "grad_norm": 0.24789679050445557, "learning_rate": 4.999997786650525e-05, "loss": 0.4808, "step": 4220 }, { "epoch": 0.15226871373481818, "grad_norm": 0.2855437397956848, "learning_rate": 4.9999973813126654e-05, "loss": 0.4822, "step": 4225 }, { "epoch": 0.15244891339604283, "grad_norm": 0.29781678318977356, "learning_rate": 4.999996941912825e-05, "loss": 0.5047, "step": 4230 }, { "epoch": 0.15262911305726745, "grad_norm": 0.25594308972358704, "learning_rate": 4.9999964684510104e-05, "loss": 0.5076, "step": 4235 }, { "epoch": 0.1528093127184921, "grad_norm": 0.21112053096294403, "learning_rate": 4.999995960927228e-05, "loss": 0.4764, "step": 4240 }, { "epoch": 0.15298951237971672, "grad_norm": 0.29285240173339844, "learning_rate": 4.999995419341485e-05, "loss": 0.4841, "step": 4245 }, { "epoch": 0.15316971204094137, "grad_norm": 0.25095129013061523, "learning_rate": 4.9999948436937873e-05, "loss": 0.4723, "step": 4250 }, { "epoch": 0.153349911702166, "grad_norm": 0.2660590410232544, "learning_rate": 4.999994233984145e-05, "loss": 0.4527, "step": 4255 }, { "epoch": 0.15353011136339065, "grad_norm": 0.19300931692123413, "learning_rate": 4.999993590212564e-05, "loss": 0.4346, "step": 4260 }, { "epoch": 0.15371031102461527, "grad_norm": 0.2573135197162628, "learning_rate": 4.999992912379055e-05, "loss": 0.4827, "step": 4265 }, { "epoch": 0.15389051068583992, "grad_norm": 0.2267155647277832, "learning_rate": 4.999992200483626e-05, "loss": 0.4743, "step": 4270 }, { "epoch": 0.15407071034706454, "grad_norm": 0.22252587974071503, "learning_rate": 4.999991454526289e-05, "loss": 0.4899, "step": 4275 }, { "epoch": 0.1542509100082892, "grad_norm": 0.24364939332008362, "learning_rate": 4.9999906745070515e-05, "loss": 0.4584, "step": 4280 }, { "epoch": 
0.15443110966951382, "grad_norm": 0.2882271409034729, "learning_rate": 4.999989860425924e-05, "loss": 0.4897, "step": 4285 }, { "epoch": 0.15461130933073847, "grad_norm": 0.22930704057216644, "learning_rate": 4.9999890122829205e-05, "loss": 0.482, "step": 4290 }, { "epoch": 0.1547915089919631, "grad_norm": 0.22965483367443085, "learning_rate": 4.9999881300780495e-05, "loss": 0.4587, "step": 4295 }, { "epoch": 0.15497170865318774, "grad_norm": 0.2293003350496292, "learning_rate": 4.999987213811325e-05, "loss": 0.532, "step": 4300 }, { "epoch": 0.15515190831441236, "grad_norm": 0.3403257727622986, "learning_rate": 4.999986263482758e-05, "loss": 0.4722, "step": 4305 }, { "epoch": 0.155332107975637, "grad_norm": 0.27816566824913025, "learning_rate": 4.9999852790923626e-05, "loss": 0.4603, "step": 4310 }, { "epoch": 0.15551230763686164, "grad_norm": 0.24829629063606262, "learning_rate": 4.9999842606401516e-05, "loss": 0.5108, "step": 4315 }, { "epoch": 0.15569250729808629, "grad_norm": 0.1773914247751236, "learning_rate": 4.999983208126139e-05, "loss": 0.4665, "step": 4320 }, { "epoch": 0.1558727069593109, "grad_norm": 0.22621670365333557, "learning_rate": 4.9999821215503396e-05, "loss": 0.4857, "step": 4325 }, { "epoch": 0.15605290662053556, "grad_norm": 0.232326477766037, "learning_rate": 4.999981000912767e-05, "loss": 0.4696, "step": 4330 }, { "epoch": 0.15623310628176018, "grad_norm": 0.30691394209861755, "learning_rate": 4.999979846213438e-05, "loss": 0.5046, "step": 4335 }, { "epoch": 0.15641330594298483, "grad_norm": 0.2561699151992798, "learning_rate": 4.9999786574523675e-05, "loss": 0.4833, "step": 4340 }, { "epoch": 0.15659350560420945, "grad_norm": 0.33284783363342285, "learning_rate": 4.9999774346295716e-05, "loss": 0.5305, "step": 4345 }, { "epoch": 0.1567737052654341, "grad_norm": 0.18784891068935394, "learning_rate": 4.999976177745067e-05, "loss": 0.4514, "step": 4350 }, { "epoch": 0.15695390492665873, "grad_norm": 0.23666001856327057, "learning_rate": 
4.999974886798872e-05, "loss": 0.5089, "step": 4355 }, { "epoch": 0.15713410458788338, "grad_norm": 0.2393905371427536, "learning_rate": 4.999973561791002e-05, "loss": 0.508, "step": 4360 }, { "epoch": 0.157314304249108, "grad_norm": 0.2510823905467987, "learning_rate": 4.999972202721477e-05, "loss": 0.4712, "step": 4365 }, { "epoch": 0.15749450391033265, "grad_norm": 0.21815423667430878, "learning_rate": 4.999970809590314e-05, "loss": 0.4702, "step": 4370 }, { "epoch": 0.15767470357155727, "grad_norm": 0.26132404804229736, "learning_rate": 4.999969382397534e-05, "loss": 0.4778, "step": 4375 }, { "epoch": 0.15785490323278192, "grad_norm": 0.21386702358722687, "learning_rate": 4.999967921143154e-05, "loss": 0.4859, "step": 4380 }, { "epoch": 0.15803510289400655, "grad_norm": 0.3148122727870941, "learning_rate": 4.999966425827195e-05, "loss": 0.5291, "step": 4385 }, { "epoch": 0.1582153025552312, "grad_norm": 0.26083052158355713, "learning_rate": 4.999964896449678e-05, "loss": 0.4753, "step": 4390 }, { "epoch": 0.15839550221645585, "grad_norm": 0.24049343168735504, "learning_rate": 4.9999633330106234e-05, "loss": 0.5006, "step": 4395 }, { "epoch": 0.15857570187768047, "grad_norm": 0.31498703360557556, "learning_rate": 4.999961735510052e-05, "loss": 0.4786, "step": 4400 }, { "epoch": 0.15875590153890512, "grad_norm": 0.29424259066581726, "learning_rate": 4.999960103947986e-05, "loss": 0.5437, "step": 4405 }, { "epoch": 0.15893610120012974, "grad_norm": 0.28579822182655334, "learning_rate": 4.999958438324448e-05, "loss": 0.4814, "step": 4410 }, { "epoch": 0.1591163008613544, "grad_norm": 0.2311839759349823, "learning_rate": 4.9999567386394595e-05, "loss": 0.4655, "step": 4415 }, { "epoch": 0.15929650052257902, "grad_norm": 0.3129260241985321, "learning_rate": 4.9999550048930455e-05, "loss": 0.4614, "step": 4420 }, { "epoch": 0.15947670018380367, "grad_norm": 0.2093297243118286, "learning_rate": 4.999953237085228e-05, "loss": 0.4925, "step": 4425 }, { "epoch": 
0.1596568998450283, "grad_norm": 0.2211044728755951, "learning_rate": 4.999951435216032e-05, "loss": 0.4632, "step": 4430 }, { "epoch": 0.15983709950625294, "grad_norm": 0.2064119130373001, "learning_rate": 4.9999495992854806e-05, "loss": 0.4794, "step": 4435 }, { "epoch": 0.16001729916747756, "grad_norm": 0.19835302233695984, "learning_rate": 4.999947729293601e-05, "loss": 0.4831, "step": 4440 }, { "epoch": 0.1601974988287022, "grad_norm": 0.2594761550426483, "learning_rate": 4.9999458252404176e-05, "loss": 0.4982, "step": 4445 }, { "epoch": 0.16037769848992683, "grad_norm": 0.22020448744297028, "learning_rate": 4.999943887125955e-05, "loss": 0.4638, "step": 4450 }, { "epoch": 0.16055789815115148, "grad_norm": 0.16716046631336212, "learning_rate": 4.9999419149502426e-05, "loss": 0.4716, "step": 4455 }, { "epoch": 0.1607380978123761, "grad_norm": 0.2861575782299042, "learning_rate": 4.9999399087133044e-05, "loss": 0.4905, "step": 4460 }, { "epoch": 0.16091829747360076, "grad_norm": 0.22994907200336456, "learning_rate": 4.99993786841517e-05, "loss": 0.486, "step": 4465 }, { "epoch": 0.16109849713482538, "grad_norm": 0.17052839696407318, "learning_rate": 4.9999357940558656e-05, "loss": 0.4374, "step": 4470 }, { "epoch": 0.16127869679605003, "grad_norm": 0.20029401779174805, "learning_rate": 4.99993368563542e-05, "loss": 0.4667, "step": 4475 }, { "epoch": 0.16145889645727465, "grad_norm": 0.26389387249946594, "learning_rate": 4.9999315431538616e-05, "loss": 0.4532, "step": 4480 }, { "epoch": 0.1616390961184993, "grad_norm": 0.27325350046157837, "learning_rate": 4.999929366611221e-05, "loss": 0.4707, "step": 4485 }, { "epoch": 0.16181929577972393, "grad_norm": 0.28567302227020264, "learning_rate": 4.9999271560075256e-05, "loss": 0.4888, "step": 4490 }, { "epoch": 0.16199949544094858, "grad_norm": 0.16461628675460815, "learning_rate": 4.999924911342807e-05, "loss": 0.4651, "step": 4495 }, { "epoch": 0.1621796951021732, "grad_norm": 0.1844627559185028, "learning_rate": 
4.999922632617096e-05, "loss": 0.4724, "step": 4500 }, { "epoch": 0.1621796951021732, "eval_loss": 0.5034152865409851, "eval_runtime": 3.5158, "eval_samples_per_second": 28.443, "eval_steps_per_second": 7.111, "step": 4500 }, { "epoch": 0.16235989476339785, "grad_norm": 0.21949437260627747, "learning_rate": 4.999920319830423e-05, "loss": 0.4663, "step": 4505 }, { "epoch": 0.16254009442462247, "grad_norm": 0.23475132882595062, "learning_rate": 4.9999179729828195e-05, "loss": 0.4849, "step": 4510 }, { "epoch": 0.16272029408584712, "grad_norm": 0.20308546721935272, "learning_rate": 4.999915592074318e-05, "loss": 0.4665, "step": 4515 }, { "epoch": 0.16290049374707175, "grad_norm": 0.18522381782531738, "learning_rate": 4.9999131771049496e-05, "loss": 0.4843, "step": 4520 }, { "epoch": 0.1630806934082964, "grad_norm": 0.2702092230319977, "learning_rate": 4.9999107280747484e-05, "loss": 0.5005, "step": 4525 }, { "epoch": 0.16326089306952102, "grad_norm": 0.20005741715431213, "learning_rate": 4.999908244983748e-05, "loss": 0.4915, "step": 4530 }, { "epoch": 0.16344109273074567, "grad_norm": 0.2156675010919571, "learning_rate": 4.9999057278319817e-05, "loss": 0.4701, "step": 4535 }, { "epoch": 0.1636212923919703, "grad_norm": 0.2057947814464569, "learning_rate": 4.999903176619484e-05, "loss": 0.4371, "step": 4540 }, { "epoch": 0.16380149205319494, "grad_norm": 0.20522654056549072, "learning_rate": 4.9999005913462896e-05, "loss": 0.474, "step": 4545 }, { "epoch": 0.16398169171441956, "grad_norm": 0.20008191466331482, "learning_rate": 4.999897972012433e-05, "loss": 0.4826, "step": 4550 }, { "epoch": 0.16416189137564421, "grad_norm": 0.2456914186477661, "learning_rate": 4.999895318617951e-05, "loss": 0.47, "step": 4555 }, { "epoch": 0.16434209103686884, "grad_norm": 0.21113859117031097, "learning_rate": 4.999892631162879e-05, "loss": 0.4505, "step": 4560 }, { "epoch": 0.1645222906980935, "grad_norm": 0.2364521026611328, "learning_rate": 4.999889909647254e-05, "loss": 0.4626, 
"step": 4565 }, { "epoch": 0.16470249035931814, "grad_norm": 0.29241201281547546, "learning_rate": 4.9998871540711126e-05, "loss": 0.5229, "step": 4570 }, { "epoch": 0.16488269002054276, "grad_norm": 0.22053126990795135, "learning_rate": 4.9998843644344926e-05, "loss": 0.5066, "step": 4575 }, { "epoch": 0.1650628896817674, "grad_norm": 0.2373570203781128, "learning_rate": 4.999881540737433e-05, "loss": 0.4642, "step": 4580 }, { "epoch": 0.16524308934299203, "grad_norm": 0.24440720677375793, "learning_rate": 4.999878682979972e-05, "loss": 0.4755, "step": 4585 }, { "epoch": 0.16542328900421668, "grad_norm": 0.29946649074554443, "learning_rate": 4.999875791162146e-05, "loss": 0.4903, "step": 4590 }, { "epoch": 0.1656034886654413, "grad_norm": 0.2605338394641876, "learning_rate": 4.9998728652839974e-05, "loss": 0.478, "step": 4595 }, { "epoch": 0.16578368832666596, "grad_norm": 0.20642036199569702, "learning_rate": 4.999869905345565e-05, "loss": 0.4739, "step": 4600 }, { "epoch": 0.16596388798789058, "grad_norm": 0.2667411267757416, "learning_rate": 4.999866911346889e-05, "loss": 0.4881, "step": 4605 }, { "epoch": 0.16614408764911523, "grad_norm": 0.20082050561904907, "learning_rate": 4.999863883288011e-05, "loss": 0.4852, "step": 4610 }, { "epoch": 0.16632428731033985, "grad_norm": 0.27822422981262207, "learning_rate": 4.99986082116897e-05, "loss": 0.4941, "step": 4615 }, { "epoch": 0.1665044869715645, "grad_norm": 0.28651654720306396, "learning_rate": 4.999857724989811e-05, "loss": 0.5, "step": 4620 }, { "epoch": 0.16668468663278913, "grad_norm": 0.2600594162940979, "learning_rate": 4.9998545947505734e-05, "loss": 0.48, "step": 4625 }, { "epoch": 0.16686488629401378, "grad_norm": 0.2715243697166443, "learning_rate": 4.999851430451301e-05, "loss": 0.472, "step": 4630 }, { "epoch": 0.1670450859552384, "grad_norm": 0.259139746427536, "learning_rate": 4.999848232092037e-05, "loss": 0.4986, "step": 4635 }, { "epoch": 0.16722528561646305, "grad_norm": 0.24041247367858887, 
"learning_rate": 4.999844999672825e-05, "loss": 0.4546, "step": 4640 }, { "epoch": 0.16740548527768767, "grad_norm": 0.2729252278804779, "learning_rate": 4.9998417331937086e-05, "loss": 0.4713, "step": 4645 }, { "epoch": 0.16758568493891232, "grad_norm": 0.25324806571006775, "learning_rate": 4.999838432654733e-05, "loss": 0.4919, "step": 4650 }, { "epoch": 0.16776588460013694, "grad_norm": 0.25361695885658264, "learning_rate": 4.9998350980559427e-05, "loss": 0.4811, "step": 4655 }, { "epoch": 0.1679460842613616, "grad_norm": 0.24803782999515533, "learning_rate": 4.999831729397383e-05, "loss": 0.4998, "step": 4660 }, { "epoch": 0.16812628392258622, "grad_norm": 0.28923875093460083, "learning_rate": 4.9998283266791e-05, "loss": 0.4713, "step": 4665 }, { "epoch": 0.16830648358381087, "grad_norm": 0.17523469030857086, "learning_rate": 4.9998248899011405e-05, "loss": 0.4995, "step": 4670 }, { "epoch": 0.1684866832450355, "grad_norm": 0.21582381427288055, "learning_rate": 4.9998214190635495e-05, "loss": 0.4943, "step": 4675 }, { "epoch": 0.16866688290626014, "grad_norm": 0.35510435700416565, "learning_rate": 4.999817914166377e-05, "loss": 0.5125, "step": 4680 }, { "epoch": 0.16884708256748476, "grad_norm": 0.255281001329422, "learning_rate": 4.9998143752096684e-05, "loss": 0.5037, "step": 4685 }, { "epoch": 0.1690272822287094, "grad_norm": 0.26469287276268005, "learning_rate": 4.999810802193473e-05, "loss": 0.4958, "step": 4690 }, { "epoch": 0.16920748188993404, "grad_norm": 0.23432576656341553, "learning_rate": 4.9998071951178405e-05, "loss": 0.4646, "step": 4695 }, { "epoch": 0.1693876815511587, "grad_norm": 0.19142989814281464, "learning_rate": 4.999803553982818e-05, "loss": 0.4931, "step": 4700 }, { "epoch": 0.1695678812123833, "grad_norm": 0.21128664910793304, "learning_rate": 4.999799878788456e-05, "loss": 0.4901, "step": 4705 }, { "epoch": 0.16974808087360796, "grad_norm": 0.22904910147190094, "learning_rate": 4.999796169534805e-05, "loss": 0.4975, "step": 4710 }, 
{ "epoch": 0.16992828053483258, "grad_norm": 0.27078938484191895, "learning_rate": 4.999792426221915e-05, "loss": 0.4434, "step": 4715 }, { "epoch": 0.17010848019605723, "grad_norm": 0.2444290965795517, "learning_rate": 4.9997886488498374e-05, "loss": 0.4923, "step": 4720 }, { "epoch": 0.17028867985728185, "grad_norm": 0.2548987865447998, "learning_rate": 4.999784837418623e-05, "loss": 0.4474, "step": 4725 }, { "epoch": 0.1704688795185065, "grad_norm": 0.2576419711112976, "learning_rate": 4.999780991928325e-05, "loss": 0.5217, "step": 4730 }, { "epoch": 0.17064907917973116, "grad_norm": 0.2640141546726227, "learning_rate": 4.999777112378994e-05, "loss": 0.4983, "step": 4735 }, { "epoch": 0.17082927884095578, "grad_norm": 0.2303144633769989, "learning_rate": 4.999773198770684e-05, "loss": 0.477, "step": 4740 }, { "epoch": 0.17100947850218043, "grad_norm": 0.22392189502716064, "learning_rate": 4.999769251103449e-05, "loss": 0.4617, "step": 4745 }, { "epoch": 0.17118967816340505, "grad_norm": 0.26157307624816895, "learning_rate": 4.9997652693773414e-05, "loss": 0.4668, "step": 4750 }, { "epoch": 0.1713698778246297, "grad_norm": 0.24970059096813202, "learning_rate": 4.999761253592415e-05, "loss": 0.4901, "step": 4755 }, { "epoch": 0.17155007748585432, "grad_norm": 0.25207483768463135, "learning_rate": 4.999757203748727e-05, "loss": 0.4811, "step": 4760 }, { "epoch": 0.17173027714707897, "grad_norm": 0.23375752568244934, "learning_rate": 4.99975311984633e-05, "loss": 0.4795, "step": 4765 }, { "epoch": 0.1719104768083036, "grad_norm": 0.2202087789773941, "learning_rate": 4.999749001885281e-05, "loss": 0.5194, "step": 4770 }, { "epoch": 0.17209067646952825, "grad_norm": 0.17962399125099182, "learning_rate": 4.999744849865636e-05, "loss": 0.4766, "step": 4775 }, { "epoch": 0.17227087613075287, "grad_norm": 0.18169254064559937, "learning_rate": 4.999740663787452e-05, "loss": 0.5046, "step": 4780 }, { "epoch": 0.17245107579197752, "grad_norm": 0.21856611967086792, 
"learning_rate": 4.999736443650784e-05, "loss": 0.4918, "step": 4785 }, { "epoch": 0.17263127545320214, "grad_norm": 0.24348682165145874, "learning_rate": 4.999732189455692e-05, "loss": 0.4751, "step": 4790 }, { "epoch": 0.1728114751144268, "grad_norm": 0.22899436950683594, "learning_rate": 4.9997279012022324e-05, "loss": 0.4198, "step": 4795 }, { "epoch": 0.17299167477565142, "grad_norm": 0.26833006739616394, "learning_rate": 4.9997235788904646e-05, "loss": 0.4782, "step": 4800 }, { "epoch": 0.17317187443687607, "grad_norm": 0.2631605565547943, "learning_rate": 4.9997192225204466e-05, "loss": 0.4366, "step": 4805 }, { "epoch": 0.1733520740981007, "grad_norm": 0.22278451919555664, "learning_rate": 4.999714832092238e-05, "loss": 0.4783, "step": 4810 }, { "epoch": 0.17353227375932534, "grad_norm": 0.22488220036029816, "learning_rate": 4.999710407605899e-05, "loss": 0.4497, "step": 4815 }, { "epoch": 0.17371247342054996, "grad_norm": 0.19599469006061554, "learning_rate": 4.99970594906149e-05, "loss": 0.5147, "step": 4820 }, { "epoch": 0.1738926730817746, "grad_norm": 0.22859926521778107, "learning_rate": 4.9997014564590706e-05, "loss": 0.4891, "step": 4825 }, { "epoch": 0.17407287274299924, "grad_norm": 0.29516199231147766, "learning_rate": 4.9996969297987036e-05, "loss": 0.454, "step": 4830 }, { "epoch": 0.17425307240422389, "grad_norm": 0.2656870484352112, "learning_rate": 4.9996923690804495e-05, "loss": 0.4491, "step": 4835 }, { "epoch": 0.1744332720654485, "grad_norm": 0.2088516801595688, "learning_rate": 4.999687774304371e-05, "loss": 0.489, "step": 4840 }, { "epoch": 0.17461347172667316, "grad_norm": 0.20423473417758942, "learning_rate": 4.99968314547053e-05, "loss": 0.5235, "step": 4845 }, { "epoch": 0.17479367138789778, "grad_norm": 0.2307443767786026, "learning_rate": 4.999678482578991e-05, "loss": 0.4527, "step": 4850 }, { "epoch": 0.17497387104912243, "grad_norm": 0.16717872023582458, "learning_rate": 4.9996737856298156e-05, "loss": 0.4769, "step": 4855 }, 
{ "epoch": 0.17515407071034705, "grad_norm": 0.23562130331993103, "learning_rate": 4.99966905462307e-05, "loss": 0.4787, "step": 4860 }, { "epoch": 0.1753342703715717, "grad_norm": 0.2515668272972107, "learning_rate": 4.9996642895588166e-05, "loss": 0.4721, "step": 4865 }, { "epoch": 0.17551447003279633, "grad_norm": 0.20563237369060516, "learning_rate": 4.9996594904371215e-05, "loss": 0.481, "step": 4870 }, { "epoch": 0.17569466969402098, "grad_norm": 0.24153757095336914, "learning_rate": 4.99965465725805e-05, "loss": 0.49, "step": 4875 }, { "epoch": 0.1758748693552456, "grad_norm": 0.19717536866664886, "learning_rate": 4.999649790021667e-05, "loss": 0.4529, "step": 4880 }, { "epoch": 0.17605506901647025, "grad_norm": 0.26905906200408936, "learning_rate": 4.99964488872804e-05, "loss": 0.4775, "step": 4885 }, { "epoch": 0.17623526867769487, "grad_norm": 0.18942050635814667, "learning_rate": 4.999639953377235e-05, "loss": 0.4558, "step": 4890 }, { "epoch": 0.17641546833891952, "grad_norm": 0.25289055705070496, "learning_rate": 4.99963498396932e-05, "loss": 0.4752, "step": 4895 }, { "epoch": 0.17659566800014415, "grad_norm": 0.28434181213378906, "learning_rate": 4.9996299805043625e-05, "loss": 0.498, "step": 4900 }, { "epoch": 0.1767758676613688, "grad_norm": 0.2828446328639984, "learning_rate": 4.99962494298243e-05, "loss": 0.498, "step": 4905 }, { "epoch": 0.17695606732259345, "grad_norm": 0.24182403087615967, "learning_rate": 4.999619871403592e-05, "loss": 0.4959, "step": 4910 }, { "epoch": 0.17713626698381807, "grad_norm": 0.2429310828447342, "learning_rate": 4.999614765767917e-05, "loss": 0.4828, "step": 4915 }, { "epoch": 0.17731646664504272, "grad_norm": 0.2273981124162674, "learning_rate": 4.999609626075475e-05, "loss": 0.4961, "step": 4920 }, { "epoch": 0.17749666630626734, "grad_norm": 0.24011553823947906, "learning_rate": 4.999604452326335e-05, "loss": 0.4782, "step": 4925 }, { "epoch": 0.177676865967492, "grad_norm": 0.22934234142303467, "learning_rate": 
4.999599244520569e-05, "loss": 0.5014, "step": 4930 }, { "epoch": 0.17785706562871662, "grad_norm": 0.19686634838581085, "learning_rate": 4.999594002658247e-05, "loss": 0.446, "step": 4935 }, { "epoch": 0.17803726528994127, "grad_norm": 0.2651787996292114, "learning_rate": 4.999588726739441e-05, "loss": 0.5142, "step": 4940 }, { "epoch": 0.1782174649511659, "grad_norm": 0.1980464607477188, "learning_rate": 4.999583416764222e-05, "loss": 0.4975, "step": 4945 }, { "epoch": 0.17839766461239054, "grad_norm": 0.24422787129878998, "learning_rate": 4.999578072732664e-05, "loss": 0.455, "step": 4950 }, { "epoch": 0.17857786427361516, "grad_norm": 0.31826359033584595, "learning_rate": 4.9995726946448375e-05, "loss": 0.5055, "step": 4955 }, { "epoch": 0.1787580639348398, "grad_norm": 0.26964086294174194, "learning_rate": 4.999567282500817e-05, "loss": 0.4871, "step": 4960 }, { "epoch": 0.17893826359606443, "grad_norm": 0.18171383440494537, "learning_rate": 4.999561836300676e-05, "loss": 0.4757, "step": 4965 }, { "epoch": 0.17911846325728908, "grad_norm": 0.26201415061950684, "learning_rate": 4.99955635604449e-05, "loss": 0.4891, "step": 4970 }, { "epoch": 0.1792986629185137, "grad_norm": 0.24988138675689697, "learning_rate": 4.999550841732331e-05, "loss": 0.4845, "step": 4975 }, { "epoch": 0.17947886257973836, "grad_norm": 0.21902771294116974, "learning_rate": 4.999545293364277e-05, "loss": 0.4645, "step": 4980 }, { "epoch": 0.17965906224096298, "grad_norm": 0.1887025535106659, "learning_rate": 4.999539710940402e-05, "loss": 0.4777, "step": 4985 }, { "epoch": 0.17983926190218763, "grad_norm": 0.21480071544647217, "learning_rate": 4.9995340944607824e-05, "loss": 0.499, "step": 4990 }, { "epoch": 0.18001946156341225, "grad_norm": 0.2067321240901947, "learning_rate": 4.9995284439254944e-05, "loss": 0.4782, "step": 4995 }, { "epoch": 0.1801996612246369, "grad_norm": 0.22930487990379333, "learning_rate": 4.999522759334616e-05, "loss": 0.5074, "step": 5000 }, { "epoch": 
0.1801996612246369, "eval_loss": 0.5006341338157654, "eval_runtime": 3.5194, "eval_samples_per_second": 28.414, "eval_steps_per_second": 7.104, "step": 5000 }, { "epoch": 0.18037986088586153, "grad_norm": 0.23178088665008545, "learning_rate": 4.999517040688223e-05, "loss": 0.4822, "step": 5005 }, { "epoch": 0.18056006054708618, "grad_norm": 0.21988582611083984, "learning_rate": 4.999511287986395e-05, "loss": 0.4559, "step": 5010 }, { "epoch": 0.1807402602083108, "grad_norm": 0.25729215145111084, "learning_rate": 4.999505501229209e-05, "loss": 0.4693, "step": 5015 }, { "epoch": 0.18092045986953545, "grad_norm": 0.23540961742401123, "learning_rate": 4.999499680416745e-05, "loss": 0.5181, "step": 5020 }, { "epoch": 0.18110065953076007, "grad_norm": 0.17820635437965393, "learning_rate": 4.9994938255490814e-05, "loss": 0.4853, "step": 5025 }, { "epoch": 0.18128085919198472, "grad_norm": 0.2419961541891098, "learning_rate": 4.999487936626299e-05, "loss": 0.4927, "step": 5030 }, { "epoch": 0.18146105885320934, "grad_norm": 0.2524636685848236, "learning_rate": 4.999482013648476e-05, "loss": 0.4986, "step": 5035 }, { "epoch": 0.181641258514434, "grad_norm": 0.18317490816116333, "learning_rate": 4.999476056615696e-05, "loss": 0.4806, "step": 5040 }, { "epoch": 0.18182145817565862, "grad_norm": 0.2592169940471649, "learning_rate": 4.999470065528038e-05, "loss": 0.4741, "step": 5045 }, { "epoch": 0.18200165783688327, "grad_norm": 0.24432708323001862, "learning_rate": 4.999464040385584e-05, "loss": 0.4726, "step": 5050 }, { "epoch": 0.1821818574981079, "grad_norm": 0.23248982429504395, "learning_rate": 4.999457981188417e-05, "loss": 0.4759, "step": 5055 }, { "epoch": 0.18236205715933254, "grad_norm": 0.23487761616706848, "learning_rate": 4.999451887936618e-05, "loss": 0.4953, "step": 5060 }, { "epoch": 0.18254225682055716, "grad_norm": 0.28242379426956177, "learning_rate": 4.9994457606302714e-05, "loss": 0.4772, "step": 5065 }, { "epoch": 0.18272245648178181, "grad_norm": 
0.1956668496131897, "learning_rate": 4.999439599269461e-05, "loss": 0.5024, "step": 5070 }, { "epoch": 0.18290265614300646, "grad_norm": 0.25298696756362915, "learning_rate": 4.999433403854269e-05, "loss": 0.5046, "step": 5075 }, { "epoch": 0.1830828558042311, "grad_norm": 0.2178865671157837, "learning_rate": 4.999427174384781e-05, "loss": 0.5063, "step": 5080 }, { "epoch": 0.18326305546545574, "grad_norm": 0.2657603621482849, "learning_rate": 4.9994209108610814e-05, "loss": 0.4846, "step": 5085 }, { "epoch": 0.18344325512668036, "grad_norm": 0.23558183014392853, "learning_rate": 4.999414613283256e-05, "loss": 0.4673, "step": 5090 }, { "epoch": 0.183623454787905, "grad_norm": 0.23601466417312622, "learning_rate": 4.999408281651391e-05, "loss": 0.5041, "step": 5095 }, { "epoch": 0.18380365444912963, "grad_norm": 0.22025693953037262, "learning_rate": 4.999401915965572e-05, "loss": 0.4699, "step": 5100 }, { "epoch": 0.18398385411035428, "grad_norm": 0.23485559225082397, "learning_rate": 4.9993955162258844e-05, "loss": 0.4927, "step": 5105 }, { "epoch": 0.1841640537715789, "grad_norm": 0.2333470582962036, "learning_rate": 4.999389082432417e-05, "loss": 0.472, "step": 5110 }, { "epoch": 0.18434425343280356, "grad_norm": 0.20295509696006775, "learning_rate": 4.999382614585258e-05, "loss": 0.4978, "step": 5115 }, { "epoch": 0.18452445309402818, "grad_norm": 0.23132069408893585, "learning_rate": 4.999376112684495e-05, "loss": 0.4796, "step": 5120 }, { "epoch": 0.18470465275525283, "grad_norm": 0.19237832725048065, "learning_rate": 4.9993695767302165e-05, "loss": 0.5032, "step": 5125 }, { "epoch": 0.18488485241647745, "grad_norm": 0.19899766147136688, "learning_rate": 4.999363006722511e-05, "loss": 0.4785, "step": 5130 }, { "epoch": 0.1850650520777021, "grad_norm": 0.22194600105285645, "learning_rate": 4.999356402661469e-05, "loss": 0.5207, "step": 5135 }, { "epoch": 0.18524525173892673, "grad_norm": 0.22985856235027313, "learning_rate": 4.999349764547179e-05, "loss": 
0.4558, "step": 5140 }, { "epoch": 0.18542545140015138, "grad_norm": 0.2950509488582611, "learning_rate": 4.9993430923797324e-05, "loss": 0.4956, "step": 5145 }, { "epoch": 0.185605651061376, "grad_norm": 0.2260800004005432, "learning_rate": 4.9993363861592204e-05, "loss": 0.435, "step": 5150 }, { "epoch": 0.18578585072260065, "grad_norm": 0.18863672018051147, "learning_rate": 4.999329645885734e-05, "loss": 0.5241, "step": 5155 }, { "epoch": 0.18596605038382527, "grad_norm": 0.24623405933380127, "learning_rate": 4.999322871559365e-05, "loss": 0.4892, "step": 5160 }, { "epoch": 0.18614625004504992, "grad_norm": 0.2146414965391159, "learning_rate": 4.999316063180206e-05, "loss": 0.4762, "step": 5165 }, { "epoch": 0.18632644970627454, "grad_norm": 0.24258659780025482, "learning_rate": 4.999309220748349e-05, "loss": 0.4825, "step": 5170 }, { "epoch": 0.1865066493674992, "grad_norm": 0.24239155650138855, "learning_rate": 4.999302344263889e-05, "loss": 0.4796, "step": 5175 }, { "epoch": 0.18668684902872382, "grad_norm": 0.25734949111938477, "learning_rate": 4.999295433726917e-05, "loss": 0.5216, "step": 5180 }, { "epoch": 0.18686704868994847, "grad_norm": 0.2938891649246216, "learning_rate": 4.999288489137529e-05, "loss": 0.4744, "step": 5185 }, { "epoch": 0.1870472483511731, "grad_norm": 0.20838968455791473, "learning_rate": 4.999281510495819e-05, "loss": 0.4786, "step": 5190 }, { "epoch": 0.18722744801239774, "grad_norm": 0.25164559483528137, "learning_rate": 4.999274497801883e-05, "loss": 0.523, "step": 5195 }, { "epoch": 0.18740764767362236, "grad_norm": 0.23008379340171814, "learning_rate": 4.999267451055815e-05, "loss": 0.4591, "step": 5200 }, { "epoch": 0.187587847334847, "grad_norm": 0.21487128734588623, "learning_rate": 4.9992603702577124e-05, "loss": 0.4903, "step": 5205 }, { "epoch": 0.18776804699607164, "grad_norm": 0.2193828970193863, "learning_rate": 4.999253255407671e-05, "loss": 0.4357, "step": 5210 }, { "epoch": 0.1879482466572963, "grad_norm": 
0.20415742695331573, "learning_rate": 4.999246106505788e-05, "loss": 0.4428, "step": 5215 }, { "epoch": 0.1881284463185209, "grad_norm": 0.21222014725208282, "learning_rate": 4.9992389235521606e-05, "loss": 0.4969, "step": 5220 }, { "epoch": 0.18830864597974556, "grad_norm": 0.2923332750797272, "learning_rate": 4.999231706546886e-05, "loss": 0.5121, "step": 5225 }, { "epoch": 0.18848884564097018, "grad_norm": 0.26699545979499817, "learning_rate": 4.999224455490065e-05, "loss": 0.4731, "step": 5230 }, { "epoch": 0.18866904530219483, "grad_norm": 0.27180203795433044, "learning_rate": 4.999217170381794e-05, "loss": 0.4727, "step": 5235 }, { "epoch": 0.18884924496341948, "grad_norm": 0.23332376778125763, "learning_rate": 4.999209851222172e-05, "loss": 0.4673, "step": 5240 }, { "epoch": 0.1890294446246441, "grad_norm": 0.19692248106002808, "learning_rate": 4.9992024980113e-05, "loss": 0.4728, "step": 5245 }, { "epoch": 0.18920964428586876, "grad_norm": 0.22370965778827667, "learning_rate": 4.999195110749278e-05, "loss": 0.4951, "step": 5250 }, { "epoch": 0.18938984394709338, "grad_norm": 0.20484954118728638, "learning_rate": 4.9991876894362064e-05, "loss": 0.4878, "step": 5255 }, { "epoch": 0.18957004360831803, "grad_norm": 0.18314266204833984, "learning_rate": 4.9991802340721866e-05, "loss": 0.5089, "step": 5260 }, { "epoch": 0.18975024326954265, "grad_norm": 0.1618998497724533, "learning_rate": 4.99917274465732e-05, "loss": 0.5125, "step": 5265 }, { "epoch": 0.1899304429307673, "grad_norm": 0.18222755193710327, "learning_rate": 4.9991652211917086e-05, "loss": 0.5117, "step": 5270 }, { "epoch": 0.19011064259199192, "grad_norm": 0.25865301489830017, "learning_rate": 4.999157663675454e-05, "loss": 0.4643, "step": 5275 }, { "epoch": 0.19029084225321657, "grad_norm": 0.21296486258506775, "learning_rate": 4.999150072108661e-05, "loss": 0.4639, "step": 5280 }, { "epoch": 0.1904710419144412, "grad_norm": 0.19290651381015778, "learning_rate": 4.9991424464914324e-05, "loss": 
0.4825, "step": 5285 }, { "epoch": 0.19065124157566585, "grad_norm": 0.28650933504104614, "learning_rate": 4.9991347868238714e-05, "loss": 0.5179, "step": 5290 }, { "epoch": 0.19083144123689047, "grad_norm": 0.1798352748155594, "learning_rate": 4.9991270931060826e-05, "loss": 0.4681, "step": 5295 }, { "epoch": 0.19101164089811512, "grad_norm": 0.238149955868721, "learning_rate": 4.9991193653381704e-05, "loss": 0.4844, "step": 5300 }, { "epoch": 0.19119184055933974, "grad_norm": 0.27041110396385193, "learning_rate": 4.9991116035202425e-05, "loss": 0.5273, "step": 5305 }, { "epoch": 0.1913720402205644, "grad_norm": 0.2148786038160324, "learning_rate": 4.999103807652401e-05, "loss": 0.4877, "step": 5310 }, { "epoch": 0.19155223988178902, "grad_norm": 0.20174206793308258, "learning_rate": 4.999095977734755e-05, "loss": 0.4858, "step": 5315 }, { "epoch": 0.19173243954301367, "grad_norm": 0.26444557309150696, "learning_rate": 4.9990881137674103e-05, "loss": 0.4992, "step": 5320 }, { "epoch": 0.1919126392042383, "grad_norm": 0.21838323771953583, "learning_rate": 4.9990802157504734e-05, "loss": 0.4098, "step": 5325 }, { "epoch": 0.19209283886546294, "grad_norm": 0.23885351419448853, "learning_rate": 4.999072283684052e-05, "loss": 0.4674, "step": 5330 }, { "epoch": 0.19227303852668756, "grad_norm": 0.24196289479732513, "learning_rate": 4.9990643175682554e-05, "loss": 0.5008, "step": 5335 }, { "epoch": 0.1924532381879122, "grad_norm": 0.18864691257476807, "learning_rate": 4.999056317403191e-05, "loss": 0.4603, "step": 5340 }, { "epoch": 0.19263343784913683, "grad_norm": 0.2206612378358841, "learning_rate": 4.9990482831889685e-05, "loss": 0.4875, "step": 5345 }, { "epoch": 0.19281363751036149, "grad_norm": 0.18795616924762726, "learning_rate": 4.9990402149256964e-05, "loss": 0.4718, "step": 5350 }, { "epoch": 0.1929938371715861, "grad_norm": 0.28613436222076416, "learning_rate": 4.999032112613485e-05, "loss": 0.4476, "step": 5355 }, { "epoch": 0.19317403683281076, 
"grad_norm": 0.22328707575798035, "learning_rate": 4.999023976252445e-05, "loss": 0.4631, "step": 5360 }, { "epoch": 0.19335423649403538, "grad_norm": 0.20430655777454376, "learning_rate": 4.9990158058426875e-05, "loss": 0.523, "step": 5365 }, { "epoch": 0.19353443615526003, "grad_norm": 0.19645529985427856, "learning_rate": 4.999007601384324e-05, "loss": 0.4338, "step": 5370 }, { "epoch": 0.19371463581648465, "grad_norm": 0.25969836115837097, "learning_rate": 4.998999362877464e-05, "loss": 0.4712, "step": 5375 }, { "epoch": 0.1938948354777093, "grad_norm": 0.19297480583190918, "learning_rate": 4.9989910903222234e-05, "loss": 0.471, "step": 5380 }, { "epoch": 0.19407503513893393, "grad_norm": 0.24897582828998566, "learning_rate": 4.998982783718712e-05, "loss": 0.4915, "step": 5385 }, { "epoch": 0.19425523480015858, "grad_norm": 0.2619524896144867, "learning_rate": 4.998974443067044e-05, "loss": 0.5119, "step": 5390 }, { "epoch": 0.1944354344613832, "grad_norm": 0.17749615013599396, "learning_rate": 4.998966068367334e-05, "loss": 0.4782, "step": 5395 }, { "epoch": 0.19461563412260785, "grad_norm": 0.20441953837871552, "learning_rate": 4.998957659619694e-05, "loss": 0.4346, "step": 5400 }, { "epoch": 0.19479583378383247, "grad_norm": 0.22351542115211487, "learning_rate": 4.9989492168242405e-05, "loss": 0.4833, "step": 5405 }, { "epoch": 0.19497603344505712, "grad_norm": 0.19759854674339294, "learning_rate": 4.998940739981087e-05, "loss": 0.5104, "step": 5410 }, { "epoch": 0.19515623310628177, "grad_norm": 0.2504650354385376, "learning_rate": 4.9989322290903504e-05, "loss": 0.4585, "step": 5415 }, { "epoch": 0.1953364327675064, "grad_norm": 0.2255609631538391, "learning_rate": 4.998923684152145e-05, "loss": 0.4881, "step": 5420 }, { "epoch": 0.19551663242873105, "grad_norm": 0.25729531049728394, "learning_rate": 4.99891510516659e-05, "loss": 0.4705, "step": 5425 }, { "epoch": 0.19569683208995567, "grad_norm": 0.24127353727817535, "learning_rate": 
4.998906492133798e-05, "loss": 0.4939, "step": 5430 }, { "epoch": 0.19587703175118032, "grad_norm": 0.2850863039493561, "learning_rate": 4.99889784505389e-05, "loss": 0.4461, "step": 5435 }, { "epoch": 0.19605723141240494, "grad_norm": 0.1722259372472763, "learning_rate": 4.998889163926983e-05, "loss": 0.4819, "step": 5440 }, { "epoch": 0.1962374310736296, "grad_norm": 0.21394529938697815, "learning_rate": 4.998880448753195e-05, "loss": 0.4985, "step": 5445 }, { "epoch": 0.19641763073485421, "grad_norm": 0.18665440380573273, "learning_rate": 4.998871699532644e-05, "loss": 0.5014, "step": 5450 }, { "epoch": 0.19659783039607887, "grad_norm": 0.226554274559021, "learning_rate": 4.998862916265451e-05, "loss": 0.4596, "step": 5455 }, { "epoch": 0.1967780300573035, "grad_norm": 0.2326064258813858, "learning_rate": 4.998854098951734e-05, "loss": 0.4634, "step": 5460 }, { "epoch": 0.19695822971852814, "grad_norm": 0.20283064246177673, "learning_rate": 4.9988452475916134e-05, "loss": 0.4553, "step": 5465 }, { "epoch": 0.19713842937975276, "grad_norm": 0.19399242103099823, "learning_rate": 4.998836362185211e-05, "loss": 0.472, "step": 5470 }, { "epoch": 0.1973186290409774, "grad_norm": 0.24312461912631989, "learning_rate": 4.998827442732646e-05, "loss": 0.4887, "step": 5475 }, { "epoch": 0.19749882870220203, "grad_norm": 0.23328492045402527, "learning_rate": 4.9988184892340414e-05, "loss": 0.4683, "step": 5480 }, { "epoch": 0.19767902836342668, "grad_norm": 0.1696726530790329, "learning_rate": 4.998809501689518e-05, "loss": 0.4872, "step": 5485 }, { "epoch": 0.1978592280246513, "grad_norm": 0.20958632230758667, "learning_rate": 4.998800480099199e-05, "loss": 0.4747, "step": 5490 }, { "epoch": 0.19803942768587596, "grad_norm": 0.23482240736484528, "learning_rate": 4.998791424463208e-05, "loss": 0.4519, "step": 5495 }, { "epoch": 0.19821962734710058, "grad_norm": 0.20664586126804352, "learning_rate": 4.998782334781668e-05, "loss": 0.4516, "step": 5500 }, { "epoch": 
0.19821962734710058, "eval_loss": 0.49830162525177, "eval_runtime": 3.5194, "eval_samples_per_second": 28.414, "eval_steps_per_second": 7.103, "step": 5500 }, { "epoch": 0.19839982700832523, "grad_norm": 0.2016601264476776, "learning_rate": 4.998773211054701e-05, "loss": 0.5032, "step": 5505 }, { "epoch": 0.19858002666954985, "grad_norm": 0.18409425020217896, "learning_rate": 4.998764053282433e-05, "loss": 0.4853, "step": 5510 }, { "epoch": 0.1987602263307745, "grad_norm": 0.18686065077781677, "learning_rate": 4.998754861464989e-05, "loss": 0.4274, "step": 5515 }, { "epoch": 0.19894042599199913, "grad_norm": 0.20691046118736267, "learning_rate": 4.9987456356024944e-05, "loss": 0.5276, "step": 5520 }, { "epoch": 0.19912062565322378, "grad_norm": 0.1962127685546875, "learning_rate": 4.9987363756950736e-05, "loss": 0.4697, "step": 5525 }, { "epoch": 0.1993008253144484, "grad_norm": 0.20173537731170654, "learning_rate": 4.9987270817428535e-05, "loss": 0.434, "step": 5530 }, { "epoch": 0.19948102497567305, "grad_norm": 0.21217285096645355, "learning_rate": 4.998717753745961e-05, "loss": 0.4603, "step": 5535 }, { "epoch": 0.19966122463689767, "grad_norm": 0.2284974455833435, "learning_rate": 4.998708391704522e-05, "loss": 0.476, "step": 5540 }, { "epoch": 0.19984142429812232, "grad_norm": 0.22764502465724945, "learning_rate": 4.998698995618666e-05, "loss": 0.5092, "step": 5545 }, { "epoch": 0.20002162395934694, "grad_norm": 0.19202855229377747, "learning_rate": 4.998689565488519e-05, "loss": 0.4573, "step": 5550 }, { "epoch": 0.2002018236205716, "grad_norm": 0.20149993896484375, "learning_rate": 4.998680101314211e-05, "loss": 0.4563, "step": 5555 }, { "epoch": 0.20038202328179622, "grad_norm": 0.1978190690279007, "learning_rate": 4.9986706030958705e-05, "loss": 0.4785, "step": 5560 }, { "epoch": 0.20056222294302087, "grad_norm": 0.186653271317482, "learning_rate": 4.998661070833627e-05, "loss": 0.4688, "step": 5565 }, { "epoch": 0.2007424226042455, "grad_norm": 
0.20869889855384827, "learning_rate": 4.9986515045276094e-05, "loss": 0.4916, "step": 5570 }, { "epoch": 0.20092262226547014, "grad_norm": 0.2684619128704071, "learning_rate": 4.9986419041779485e-05, "loss": 0.4965, "step": 5575 }, { "epoch": 0.2011028219266948, "grad_norm": 0.2171335518360138, "learning_rate": 4.9986322697847765e-05, "loss": 0.4418, "step": 5580 }, { "epoch": 0.20128302158791941, "grad_norm": 0.24311456084251404, "learning_rate": 4.998622601348223e-05, "loss": 0.4689, "step": 5585 }, { "epoch": 0.20146322124914406, "grad_norm": 0.19631347060203552, "learning_rate": 4.998612898868421e-05, "loss": 0.4882, "step": 5590 }, { "epoch": 0.2016434209103687, "grad_norm": 0.2597336173057556, "learning_rate": 4.9986031623455006e-05, "loss": 0.5101, "step": 5595 }, { "epoch": 0.20182362057159334, "grad_norm": 0.2971448600292206, "learning_rate": 4.9985933917795966e-05, "loss": 0.4788, "step": 5600 }, { "epoch": 0.20200382023281796, "grad_norm": 0.2276962250471115, "learning_rate": 4.998583587170842e-05, "loss": 0.4641, "step": 5605 }, { "epoch": 0.2021840198940426, "grad_norm": 0.16654905676841736, "learning_rate": 4.99857374851937e-05, "loss": 0.4612, "step": 5610 }, { "epoch": 0.20236421955526723, "grad_norm": 0.17817789316177368, "learning_rate": 4.998563875825313e-05, "loss": 0.4524, "step": 5615 }, { "epoch": 0.20254441921649188, "grad_norm": 0.1932527720928192, "learning_rate": 4.998553969088807e-05, "loss": 0.4654, "step": 5620 }, { "epoch": 0.2027246188777165, "grad_norm": 0.22446228563785553, "learning_rate": 4.9985440283099885e-05, "loss": 0.5021, "step": 5625 }, { "epoch": 0.20290481853894116, "grad_norm": 0.22577793896198273, "learning_rate": 4.99853405348899e-05, "loss": 0.5042, "step": 5630 }, { "epoch": 0.20308501820016578, "grad_norm": 0.19390153884887695, "learning_rate": 4.998524044625949e-05, "loss": 0.4851, "step": 5635 }, { "epoch": 0.20326521786139043, "grad_norm": 0.22741837799549103, "learning_rate": 4.998514001721002e-05, "loss": 
0.4954, "step": 5640 }, { "epoch": 0.20344541752261505, "grad_norm": 0.20795418322086334, "learning_rate": 4.998503924774285e-05, "loss": 0.4748, "step": 5645 }, { "epoch": 0.2036256171838397, "grad_norm": 0.20817185938358307, "learning_rate": 4.998493813785936e-05, "loss": 0.5023, "step": 5650 }, { "epoch": 0.20380581684506432, "grad_norm": 0.24967476725578308, "learning_rate": 4.9984836687560924e-05, "loss": 0.5128, "step": 5655 }, { "epoch": 0.20398601650628898, "grad_norm": 0.2160884588956833, "learning_rate": 4.998473489684892e-05, "loss": 0.4386, "step": 5660 }, { "epoch": 0.2041662161675136, "grad_norm": 0.19086240231990814, "learning_rate": 4.998463276572475e-05, "loss": 0.4722, "step": 5665 }, { "epoch": 0.20434641582873825, "grad_norm": 0.21120908856391907, "learning_rate": 4.9984530294189794e-05, "loss": 0.4662, "step": 5670 }, { "epoch": 0.20452661548996287, "grad_norm": 0.20728902518749237, "learning_rate": 4.9984427482245445e-05, "loss": 0.4166, "step": 5675 }, { "epoch": 0.20470681515118752, "grad_norm": 0.24865606427192688, "learning_rate": 4.998432432989311e-05, "loss": 0.49, "step": 5680 }, { "epoch": 0.20488701481241214, "grad_norm": 0.3118506968021393, "learning_rate": 4.99842208371342e-05, "loss": 0.5066, "step": 5685 }, { "epoch": 0.2050672144736368, "grad_norm": 0.21482154726982117, "learning_rate": 4.998411700397011e-05, "loss": 0.5173, "step": 5690 }, { "epoch": 0.20524741413486142, "grad_norm": 0.2417951226234436, "learning_rate": 4.998401283040226e-05, "loss": 0.4688, "step": 5695 }, { "epoch": 0.20542761379608607, "grad_norm": 0.22649191319942474, "learning_rate": 4.9983908316432084e-05, "loss": 0.4748, "step": 5700 }, { "epoch": 0.2056078134573107, "grad_norm": 0.28141918778419495, "learning_rate": 4.998380346206099e-05, "loss": 0.4986, "step": 5705 }, { "epoch": 0.20578801311853534, "grad_norm": 0.25580498576164246, "learning_rate": 4.99836982672904e-05, "loss": 0.5207, "step": 5710 }, { "epoch": 0.20596821277975996, "grad_norm": 
0.2193039357662201, "learning_rate": 4.998359273212177e-05, "loss": 0.5007, "step": 5715 }, { "epoch": 0.2061484124409846, "grad_norm": 0.22306658327579498, "learning_rate": 4.998348685655653e-05, "loss": 0.4909, "step": 5720 }, { "epoch": 0.20632861210220924, "grad_norm": 0.20013074576854706, "learning_rate": 4.998338064059611e-05, "loss": 0.472, "step": 5725 }, { "epoch": 0.20650881176343389, "grad_norm": 0.2772786319255829, "learning_rate": 4.998327408424196e-05, "loss": 0.4913, "step": 5730 }, { "epoch": 0.2066890114246585, "grad_norm": 0.23623016476631165, "learning_rate": 4.998316718749555e-05, "loss": 0.4423, "step": 5735 }, { "epoch": 0.20686921108588316, "grad_norm": 0.2390371859073639, "learning_rate": 4.9983059950358316e-05, "loss": 0.4747, "step": 5740 }, { "epoch": 0.20704941074710778, "grad_norm": 0.16959403455257416, "learning_rate": 4.9982952372831724e-05, "loss": 0.4875, "step": 5745 }, { "epoch": 0.20722961040833243, "grad_norm": 0.1819695085287094, "learning_rate": 4.998284445491726e-05, "loss": 0.4731, "step": 5750 }, { "epoch": 0.20740981006955708, "grad_norm": 0.21502423286437988, "learning_rate": 4.998273619661636e-05, "loss": 0.4587, "step": 5755 }, { "epoch": 0.2075900097307817, "grad_norm": 0.1912410408258438, "learning_rate": 4.998262759793052e-05, "loss": 0.4497, "step": 5760 }, { "epoch": 0.20777020939200636, "grad_norm": 0.21954499185085297, "learning_rate": 4.9982518658861224e-05, "loss": 0.4768, "step": 5765 }, { "epoch": 0.20795040905323098, "grad_norm": 0.23011371493339539, "learning_rate": 4.998240937940993e-05, "loss": 0.4931, "step": 5770 }, { "epoch": 0.20813060871445563, "grad_norm": 0.18826115131378174, "learning_rate": 4.998229975957816e-05, "loss": 0.4547, "step": 5775 }, { "epoch": 0.20831080837568025, "grad_norm": 0.23759086430072784, "learning_rate": 4.998218979936739e-05, "loss": 0.4687, "step": 5780 }, { "epoch": 0.2084910080369049, "grad_norm": 0.2513539493083954, "learning_rate": 4.9982079498779125e-05, "loss": 
0.4908, "step": 5785 }, { "epoch": 0.20867120769812952, "grad_norm": 0.23931877315044403, "learning_rate": 4.998196885781485e-05, "loss": 0.4848, "step": 5790 }, { "epoch": 0.20885140735935417, "grad_norm": 0.21470136940479279, "learning_rate": 4.99818578764761e-05, "loss": 0.4397, "step": 5795 }, { "epoch": 0.2090316070205788, "grad_norm": 0.181662455201149, "learning_rate": 4.9981746554764366e-05, "loss": 0.4663, "step": 5800 }, { "epoch": 0.20921180668180345, "grad_norm": 0.20043453574180603, "learning_rate": 4.9981634892681175e-05, "loss": 0.5032, "step": 5805 }, { "epoch": 0.20939200634302807, "grad_norm": 0.21683253347873688, "learning_rate": 4.998152289022804e-05, "loss": 0.4492, "step": 5810 }, { "epoch": 0.20957220600425272, "grad_norm": 0.24924011528491974, "learning_rate": 4.998141054740649e-05, "loss": 0.4736, "step": 5815 }, { "epoch": 0.20975240566547734, "grad_norm": 0.2400321364402771, "learning_rate": 4.998129786421807e-05, "loss": 0.4451, "step": 5820 }, { "epoch": 0.209932605326702, "grad_norm": 0.19398939609527588, "learning_rate": 4.9981184840664294e-05, "loss": 0.4288, "step": 5825 }, { "epoch": 0.21011280498792662, "grad_norm": 0.20897026360034943, "learning_rate": 4.9981071476746717e-05, "loss": 0.4522, "step": 5830 }, { "epoch": 0.21029300464915127, "grad_norm": 0.19112837314605713, "learning_rate": 4.998095777246687e-05, "loss": 0.4699, "step": 5835 }, { "epoch": 0.2104732043103759, "grad_norm": 0.27279478311538696, "learning_rate": 4.998084372782631e-05, "loss": 0.4821, "step": 5840 }, { "epoch": 0.21065340397160054, "grad_norm": 0.17838047444820404, "learning_rate": 4.99807293428266e-05, "loss": 0.4548, "step": 5845 }, { "epoch": 0.21083360363282516, "grad_norm": 0.23038989305496216, "learning_rate": 4.9980614617469284e-05, "loss": 0.4893, "step": 5850 }, { "epoch": 0.2110138032940498, "grad_norm": 0.21518467366695404, "learning_rate": 4.998049955175593e-05, "loss": 0.4707, "step": 5855 }, { "epoch": 0.21119400295527443, "grad_norm": 
0.2731831669807434, "learning_rate": 4.998038414568811e-05, "loss": 0.4649, "step": 5860 }, { "epoch": 0.21137420261649909, "grad_norm": 0.2282124161720276, "learning_rate": 4.998026839926738e-05, "loss": 0.4721, "step": 5865 }, { "epoch": 0.2115544022777237, "grad_norm": 0.24522550404071808, "learning_rate": 4.9980152312495345e-05, "loss": 0.4881, "step": 5870 }, { "epoch": 0.21173460193894836, "grad_norm": 0.24952565133571625, "learning_rate": 4.998003588537356e-05, "loss": 0.4917, "step": 5875 }, { "epoch": 0.21191480160017298, "grad_norm": 0.19938194751739502, "learning_rate": 4.997991911790363e-05, "loss": 0.4714, "step": 5880 }, { "epoch": 0.21209500126139763, "grad_norm": 0.25844866037368774, "learning_rate": 4.997980201008713e-05, "loss": 0.485, "step": 5885 }, { "epoch": 0.21227520092262225, "grad_norm": 0.19596533477306366, "learning_rate": 4.9979684561925663e-05, "loss": 0.4782, "step": 5890 }, { "epoch": 0.2124554005838469, "grad_norm": 0.17095403373241425, "learning_rate": 4.9979566773420836e-05, "loss": 0.4786, "step": 5895 }, { "epoch": 0.21263560024507153, "grad_norm": 0.20486395061016083, "learning_rate": 4.9979448644574254e-05, "loss": 0.4701, "step": 5900 }, { "epoch": 0.21281579990629618, "grad_norm": 0.19251783192157745, "learning_rate": 4.997933017538751e-05, "loss": 0.4164, "step": 5905 }, { "epoch": 0.2129959995675208, "grad_norm": 0.21283471584320068, "learning_rate": 4.9979211365862235e-05, "loss": 0.4842, "step": 5910 }, { "epoch": 0.21317619922874545, "grad_norm": 0.21387127041816711, "learning_rate": 4.997909221600003e-05, "loss": 0.4786, "step": 5915 }, { "epoch": 0.2133563988899701, "grad_norm": 0.22376859188079834, "learning_rate": 4.997897272580254e-05, "loss": 0.4504, "step": 5920 }, { "epoch": 0.21353659855119472, "grad_norm": 0.15793459117412567, "learning_rate": 4.997885289527139e-05, "loss": 0.4533, "step": 5925 }, { "epoch": 0.21371679821241937, "grad_norm": 0.2030140906572342, "learning_rate": 4.9978732724408195e-05, "loss": 
0.4771, "step": 5930 }, { "epoch": 0.213896997873644, "grad_norm": 0.19730666279792786, "learning_rate": 4.997861221321461e-05, "loss": 0.4598, "step": 5935 }, { "epoch": 0.21407719753486865, "grad_norm": 0.28604263067245483, "learning_rate": 4.9978491361692255e-05, "loss": 0.5014, "step": 5940 }, { "epoch": 0.21425739719609327, "grad_norm": 0.2171340137720108, "learning_rate": 4.99783701698428e-05, "loss": 0.4597, "step": 5945 }, { "epoch": 0.21443759685731792, "grad_norm": 0.24565933644771576, "learning_rate": 4.9978248637667883e-05, "loss": 0.4856, "step": 5950 }, { "epoch": 0.21461779651854254, "grad_norm": 0.20074883103370667, "learning_rate": 4.997812676516917e-05, "loss": 0.4681, "step": 5955 }, { "epoch": 0.2147979961797672, "grad_norm": 0.17820361256599426, "learning_rate": 4.9978004552348314e-05, "loss": 0.485, "step": 5960 }, { "epoch": 0.21497819584099181, "grad_norm": 0.19465869665145874, "learning_rate": 4.997788199920698e-05, "loss": 0.4693, "step": 5965 }, { "epoch": 0.21515839550221647, "grad_norm": 0.22670099139213562, "learning_rate": 4.997775910574685e-05, "loss": 0.5077, "step": 5970 }, { "epoch": 0.2153385951634411, "grad_norm": 0.14773029088974, "learning_rate": 4.997763587196957e-05, "loss": 0.4664, "step": 5975 }, { "epoch": 0.21551879482466574, "grad_norm": 0.26467835903167725, "learning_rate": 4.997751229787685e-05, "loss": 0.5078, "step": 5980 }, { "epoch": 0.21569899448589036, "grad_norm": 0.20489193499088287, "learning_rate": 4.9977388383470356e-05, "loss": 0.4256, "step": 5985 }, { "epoch": 0.215879194147115, "grad_norm": 0.2557709217071533, "learning_rate": 4.997726412875178e-05, "loss": 0.4677, "step": 5990 }, { "epoch": 0.21605939380833963, "grad_norm": 0.20980533957481384, "learning_rate": 4.997713953372282e-05, "loss": 0.4954, "step": 5995 }, { "epoch": 0.21623959346956428, "grad_norm": 0.20784328877925873, "learning_rate": 4.997701459838517e-05, "loss": 0.515, "step": 6000 }, { "epoch": 0.21623959346956428, "eval_loss": 
0.4947231411933899, "eval_runtime": 3.5181, "eval_samples_per_second": 28.424, "eval_steps_per_second": 7.106, "step": 6000 }, { "epoch": 0.2164197931307889, "grad_norm": 0.2340945601463318, "learning_rate": 4.997688932274053e-05, "loss": 0.4411, "step": 6005 }, { "epoch": 0.21659999279201356, "grad_norm": 0.1668865829706192, "learning_rate": 4.997676370679061e-05, "loss": 0.4891, "step": 6010 }, { "epoch": 0.21678019245323818, "grad_norm": 0.22756971418857574, "learning_rate": 4.997663775053712e-05, "loss": 0.4992, "step": 6015 }, { "epoch": 0.21696039211446283, "grad_norm": 0.21588781476020813, "learning_rate": 4.997651145398177e-05, "loss": 0.4388, "step": 6020 }, { "epoch": 0.21714059177568745, "grad_norm": 0.1796243041753769, "learning_rate": 4.9976384817126295e-05, "loss": 0.4314, "step": 6025 }, { "epoch": 0.2173207914369121, "grad_norm": 0.2105339765548706, "learning_rate": 4.9976257839972406e-05, "loss": 0.4898, "step": 6030 }, { "epoch": 0.21750099109813673, "grad_norm": 0.23153090476989746, "learning_rate": 4.9976130522521845e-05, "loss": 0.4549, "step": 6035 }, { "epoch": 0.21768119075936138, "grad_norm": 0.240932434797287, "learning_rate": 4.997600286477634e-05, "loss": 0.4623, "step": 6040 }, { "epoch": 0.217861390420586, "grad_norm": 0.19893620908260345, "learning_rate": 4.997587486673763e-05, "loss": 0.4703, "step": 6045 }, { "epoch": 0.21804159008181065, "grad_norm": 0.17426280677318573, "learning_rate": 4.9975746528407466e-05, "loss": 0.4738, "step": 6050 }, { "epoch": 0.21822178974303527, "grad_norm": 0.20304343104362488, "learning_rate": 4.997561784978758e-05, "loss": 0.4971, "step": 6055 }, { "epoch": 0.21840198940425992, "grad_norm": 0.20934487879276276, "learning_rate": 4.997548883087974e-05, "loss": 0.4552, "step": 6060 }, { "epoch": 0.21858218906548454, "grad_norm": 0.17177551984786987, "learning_rate": 4.99753594716857e-05, "loss": 0.4991, "step": 6065 }, { "epoch": 0.2187623887267092, "grad_norm": 0.2459387630224228, "learning_rate": 
4.9975229772207224e-05, "loss": 0.4845, "step": 6070 }, { "epoch": 0.21894258838793382, "grad_norm": 0.23840418457984924, "learning_rate": 4.9975099732446085e-05, "loss": 0.4877, "step": 6075 }, { "epoch": 0.21912278804915847, "grad_norm": 0.2193634808063507, "learning_rate": 4.9974969352404036e-05, "loss": 0.4465, "step": 6080 }, { "epoch": 0.21930298771038312, "grad_norm": 0.15216214954853058, "learning_rate": 4.997483863208288e-05, "loss": 0.4538, "step": 6085 }, { "epoch": 0.21948318737160774, "grad_norm": 0.1821216493844986, "learning_rate": 4.997470757148437e-05, "loss": 0.4328, "step": 6090 }, { "epoch": 0.2196633870328324, "grad_norm": 0.2372681200504303, "learning_rate": 4.99745761706103e-05, "loss": 0.4391, "step": 6095 }, { "epoch": 0.219843586694057, "grad_norm": 0.1972285658121109, "learning_rate": 4.9974444429462476e-05, "loss": 0.4586, "step": 6100 }, { "epoch": 0.22002378635528166, "grad_norm": 0.20691488683223724, "learning_rate": 4.997431234804267e-05, "loss": 0.5046, "step": 6105 }, { "epoch": 0.2202039860165063, "grad_norm": 0.20699431002140045, "learning_rate": 4.9974179926352706e-05, "loss": 0.4709, "step": 6110 }, { "epoch": 0.22038418567773094, "grad_norm": 0.24073725938796997, "learning_rate": 4.997404716439438e-05, "loss": 0.4774, "step": 6115 }, { "epoch": 0.22056438533895556, "grad_norm": 0.19937781989574432, "learning_rate": 4.997391406216948e-05, "loss": 0.4387, "step": 6120 }, { "epoch": 0.2207445850001802, "grad_norm": 0.203308567404747, "learning_rate": 4.997378061967984e-05, "loss": 0.4535, "step": 6125 }, { "epoch": 0.22092478466140483, "grad_norm": 0.1698613315820694, "learning_rate": 4.997364683692728e-05, "loss": 0.4852, "step": 6130 }, { "epoch": 0.22110498432262948, "grad_norm": 0.22055956721305847, "learning_rate": 4.997351271391362e-05, "loss": 0.5054, "step": 6135 }, { "epoch": 0.2212851839838541, "grad_norm": 0.25628286600112915, "learning_rate": 4.997337825064068e-05, "loss": 0.4846, "step": 6140 }, { "epoch": 
0.22146538364507876, "grad_norm": 0.17073556780815125, "learning_rate": 4.9973243447110294e-05, "loss": 0.4608, "step": 6145 }, { "epoch": 0.22164558330630338, "grad_norm": 0.13316601514816284, "learning_rate": 4.99731083033243e-05, "loss": 0.4658, "step": 6150 }, { "epoch": 0.22182578296752803, "grad_norm": 0.22222983837127686, "learning_rate": 4.997297281928455e-05, "loss": 0.4558, "step": 6155 }, { "epoch": 0.22200598262875265, "grad_norm": 0.21334873139858246, "learning_rate": 4.997283699499287e-05, "loss": 0.4884, "step": 6160 }, { "epoch": 0.2221861822899773, "grad_norm": 0.1857357621192932, "learning_rate": 4.997270083045112e-05, "loss": 0.4372, "step": 6165 }, { "epoch": 0.22236638195120192, "grad_norm": 0.18568672239780426, "learning_rate": 4.997256432566116e-05, "loss": 0.488, "step": 6170 }, { "epoch": 0.22254658161242657, "grad_norm": 0.19653025269508362, "learning_rate": 4.997242748062485e-05, "loss": 0.4907, "step": 6175 }, { "epoch": 0.2227267812736512, "grad_norm": 0.19124481081962585, "learning_rate": 4.9972290295344046e-05, "loss": 0.5021, "step": 6180 }, { "epoch": 0.22290698093487585, "grad_norm": 0.20403480529785156, "learning_rate": 4.997215276982062e-05, "loss": 0.4855, "step": 6185 }, { "epoch": 0.22308718059610047, "grad_norm": 0.18618778884410858, "learning_rate": 4.9972014904056446e-05, "loss": 0.4315, "step": 6190 }, { "epoch": 0.22326738025732512, "grad_norm": 0.24033483862876892, "learning_rate": 4.997187669805341e-05, "loss": 0.4607, "step": 6195 }, { "epoch": 0.22344757991854974, "grad_norm": 0.26656222343444824, "learning_rate": 4.997173815181339e-05, "loss": 0.5022, "step": 6200 }, { "epoch": 0.2236277795797744, "grad_norm": 0.22280102968215942, "learning_rate": 4.997159926533826e-05, "loss": 0.4928, "step": 6205 }, { "epoch": 0.22380797924099902, "grad_norm": 0.2294902205467224, "learning_rate": 4.997146003862994e-05, "loss": 0.5015, "step": 6210 }, { "epoch": 0.22398817890222367, "grad_norm": 0.22047486901283264, "learning_rate": 
4.9971320471690295e-05, "loss": 0.4545, "step": 6215 }, { "epoch": 0.2241683785634483, "grad_norm": 0.21225684881210327, "learning_rate": 4.9971180564521254e-05, "loss": 0.4758, "step": 6220 }, { "epoch": 0.22434857822467294, "grad_norm": 0.18276849389076233, "learning_rate": 4.9971040317124706e-05, "loss": 0.4767, "step": 6225 }, { "epoch": 0.22452877788589756, "grad_norm": 0.19218437373638153, "learning_rate": 4.9970899729502576e-05, "loss": 0.466, "step": 6230 }, { "epoch": 0.2247089775471222, "grad_norm": 0.22989146411418915, "learning_rate": 4.997075880165677e-05, "loss": 0.4543, "step": 6235 }, { "epoch": 0.22488917720834684, "grad_norm": 0.25233665108680725, "learning_rate": 4.997061753358921e-05, "loss": 0.501, "step": 6240 }, { "epoch": 0.22506937686957149, "grad_norm": 0.23017632961273193, "learning_rate": 4.997047592530182e-05, "loss": 0.463, "step": 6245 }, { "epoch": 0.2252495765307961, "grad_norm": 0.18937349319458008, "learning_rate": 4.9970333976796526e-05, "loss": 0.4387, "step": 6250 }, { "epoch": 0.22542977619202076, "grad_norm": 0.15060578286647797, "learning_rate": 4.997019168807527e-05, "loss": 0.4603, "step": 6255 }, { "epoch": 0.2256099758532454, "grad_norm": 0.2040947675704956, "learning_rate": 4.997004905913998e-05, "loss": 0.4982, "step": 6260 }, { "epoch": 0.22579017551447003, "grad_norm": 0.2734396159648895, "learning_rate": 4.9969906089992616e-05, "loss": 0.4854, "step": 6265 }, { "epoch": 0.22597037517569468, "grad_norm": 0.17175772786140442, "learning_rate": 4.996976278063511e-05, "loss": 0.4451, "step": 6270 }, { "epoch": 0.2261505748369193, "grad_norm": 0.21332839131355286, "learning_rate": 4.996961913106942e-05, "loss": 0.4869, "step": 6275 }, { "epoch": 0.22633077449814396, "grad_norm": 0.16605186462402344, "learning_rate": 4.9969475141297504e-05, "loss": 0.4448, "step": 6280 }, { "epoch": 0.22651097415936858, "grad_norm": 0.18335530161857605, "learning_rate": 4.9969330811321325e-05, "loss": 0.4778, "step": 6285 }, { "epoch": 
0.22669117382059323, "grad_norm": 0.20451930165290833, "learning_rate": 4.996918614114285e-05, "loss": 0.4643, "step": 6290 }, { "epoch": 0.22687137348181785, "grad_norm": 0.1627047061920166, "learning_rate": 4.9969041130764046e-05, "loss": 0.4595, "step": 6295 }, { "epoch": 0.2270515731430425, "grad_norm": 0.18496522307395935, "learning_rate": 4.9968895780186884e-05, "loss": 0.4552, "step": 6300 }, { "epoch": 0.22723177280426712, "grad_norm": 0.209730327129364, "learning_rate": 4.9968750089413365e-05, "loss": 0.4515, "step": 6305 }, { "epoch": 0.22741197246549177, "grad_norm": 0.21571126580238342, "learning_rate": 4.996860405844545e-05, "loss": 0.4874, "step": 6310 }, { "epoch": 0.2275921721267164, "grad_norm": 0.20048457384109497, "learning_rate": 4.996845768728514e-05, "loss": 0.4921, "step": 6315 }, { "epoch": 0.22777237178794105, "grad_norm": 0.211773082613945, "learning_rate": 4.996831097593443e-05, "loss": 0.4907, "step": 6320 }, { "epoch": 0.22795257144916567, "grad_norm": 0.22399649024009705, "learning_rate": 4.996816392439532e-05, "loss": 0.4853, "step": 6325 }, { "epoch": 0.22813277111039032, "grad_norm": 0.2886163294315338, "learning_rate": 4.9968016532669805e-05, "loss": 0.4592, "step": 6330 }, { "epoch": 0.22831297077161494, "grad_norm": 0.21131619811058044, "learning_rate": 4.9967868800759895e-05, "loss": 0.4624, "step": 6335 }, { "epoch": 0.2284931704328396, "grad_norm": 0.2376997023820877, "learning_rate": 4.996772072866762e-05, "loss": 0.4935, "step": 6340 }, { "epoch": 0.22867337009406422, "grad_norm": 0.24256786704063416, "learning_rate": 4.996757231639497e-05, "loss": 0.4595, "step": 6345 }, { "epoch": 0.22885356975528887, "grad_norm": 0.22123675048351288, "learning_rate": 4.9967423563943994e-05, "loss": 0.4759, "step": 6350 }, { "epoch": 0.2290337694165135, "grad_norm": 0.22762177884578705, "learning_rate": 4.996727447131669e-05, "loss": 0.4417, "step": 6355 }, { "epoch": 0.22921396907773814, "grad_norm": 0.1603131741285324, "learning_rate": 
4.9967125038515116e-05, "loss": 0.4618, "step": 6360 }, { "epoch": 0.22939416873896276, "grad_norm": 0.2037096470594406, "learning_rate": 4.99669752655413e-05, "loss": 0.4785, "step": 6365 }, { "epoch": 0.2295743684001874, "grad_norm": 0.25867146253585815, "learning_rate": 4.996682515239728e-05, "loss": 0.451, "step": 6370 }, { "epoch": 0.22975456806141203, "grad_norm": 0.15876412391662598, "learning_rate": 4.996667469908509e-05, "loss": 0.4362, "step": 6375 }, { "epoch": 0.22993476772263668, "grad_norm": 0.22342482209205627, "learning_rate": 4.99665239056068e-05, "loss": 0.4629, "step": 6380 }, { "epoch": 0.2301149673838613, "grad_norm": 0.1261013001203537, "learning_rate": 4.9966372771964456e-05, "loss": 0.4501, "step": 6385 }, { "epoch": 0.23029516704508596, "grad_norm": 0.20957724750041962, "learning_rate": 4.996622129816011e-05, "loss": 0.4736, "step": 6390 }, { "epoch": 0.23047536670631058, "grad_norm": 0.23294906318187714, "learning_rate": 4.996606948419583e-05, "loss": 0.4565, "step": 6395 }, { "epoch": 0.23065556636753523, "grad_norm": 0.18201923370361328, "learning_rate": 4.99659173300737e-05, "loss": 0.5131, "step": 6400 }, { "epoch": 0.23083576602875985, "grad_norm": 0.2046813815832138, "learning_rate": 4.996576483579577e-05, "loss": 0.4632, "step": 6405 }, { "epoch": 0.2310159656899845, "grad_norm": 0.17519819736480713, "learning_rate": 4.9965612001364124e-05, "loss": 0.5046, "step": 6410 }, { "epoch": 0.23119616535120913, "grad_norm": 0.19435188174247742, "learning_rate": 4.996545882678086e-05, "loss": 0.4378, "step": 6415 }, { "epoch": 0.23137636501243378, "grad_norm": 0.2328794300556183, "learning_rate": 4.9965305312048043e-05, "loss": 0.4884, "step": 6420 }, { "epoch": 0.23155656467365843, "grad_norm": 0.1869356334209442, "learning_rate": 4.996515145716778e-05, "loss": 0.4929, "step": 6425 }, { "epoch": 0.23173676433488305, "grad_norm": 0.18540406227111816, "learning_rate": 4.996499726214216e-05, "loss": 0.4421, "step": 6430 }, { "epoch": 
0.2319169639961077, "grad_norm": 0.23471662402153015, "learning_rate": 4.9964842726973286e-05, "loss": 0.5024, "step": 6435 }, { "epoch": 0.23209716365733232, "grad_norm": 0.1900070160627365, "learning_rate": 4.996468785166326e-05, "loss": 0.4652, "step": 6440 }, { "epoch": 0.23227736331855697, "grad_norm": 0.23058246076107025, "learning_rate": 4.99645326362142e-05, "loss": 0.5072, "step": 6445 }, { "epoch": 0.2324575629797816, "grad_norm": 0.20412348210811615, "learning_rate": 4.9964377080628215e-05, "loss": 0.4847, "step": 6450 }, { "epoch": 0.23263776264100625, "grad_norm": 0.20064546167850494, "learning_rate": 4.9964221184907424e-05, "loss": 0.4501, "step": 6455 }, { "epoch": 0.23281796230223087, "grad_norm": 0.16838903725147247, "learning_rate": 4.996406494905396e-05, "loss": 0.4665, "step": 6460 }, { "epoch": 0.23299816196345552, "grad_norm": 0.17399536073207855, "learning_rate": 4.9963908373069935e-05, "loss": 0.4616, "step": 6465 }, { "epoch": 0.23317836162468014, "grad_norm": 0.18575124442577362, "learning_rate": 4.9963751456957494e-05, "loss": 0.4336, "step": 6470 }, { "epoch": 0.2333585612859048, "grad_norm": 0.22949109971523285, "learning_rate": 4.996359420071877e-05, "loss": 0.5, "step": 6475 }, { "epoch": 0.23353876094712941, "grad_norm": 0.20516782999038696, "learning_rate": 4.9963436604355916e-05, "loss": 0.4611, "step": 6480 }, { "epoch": 0.23371896060835406, "grad_norm": 0.2293572872877121, "learning_rate": 4.996327866787106e-05, "loss": 0.501, "step": 6485 }, { "epoch": 0.2338991602695787, "grad_norm": 0.18813565373420715, "learning_rate": 4.9963120391266376e-05, "loss": 0.4347, "step": 6490 }, { "epoch": 0.23407935993080334, "grad_norm": 0.2553117275238037, "learning_rate": 4.9962961774544006e-05, "loss": 0.4794, "step": 6495 }, { "epoch": 0.23425955959202796, "grad_norm": 0.20878198742866516, "learning_rate": 4.996280281770611e-05, "loss": 0.4326, "step": 6500 }, { "epoch": 0.23425955959202796, "eval_loss": 0.4936707615852356, "eval_runtime": 
3.5146, "eval_samples_per_second": 28.453, "eval_steps_per_second": 7.113, "step": 6500 }, { "epoch": 0.2344397592532526, "grad_norm": 0.20102368295192719, "learning_rate": 4.996264352075487e-05, "loss": 0.4871, "step": 6505 }, { "epoch": 0.23461995891447723, "grad_norm": 0.20165607333183289, "learning_rate": 4.996248388369243e-05, "loss": 0.4737, "step": 6510 }, { "epoch": 0.23480015857570188, "grad_norm": 0.1898958534002304, "learning_rate": 4.996232390652099e-05, "loss": 0.4562, "step": 6515 }, { "epoch": 0.2349803582369265, "grad_norm": 0.2387424260377884, "learning_rate": 4.996216358924272e-05, "loss": 0.4935, "step": 6520 }, { "epoch": 0.23516055789815116, "grad_norm": 0.2199474722146988, "learning_rate": 4.99620029318598e-05, "loss": 0.4808, "step": 6525 }, { "epoch": 0.23534075755937578, "grad_norm": 0.20181065797805786, "learning_rate": 4.996184193437442e-05, "loss": 0.4586, "step": 6530 }, { "epoch": 0.23552095722060043, "grad_norm": 0.21171115338802338, "learning_rate": 4.9961680596788784e-05, "loss": 0.451, "step": 6535 }, { "epoch": 0.23570115688182505, "grad_norm": 0.22186800837516785, "learning_rate": 4.996151891910508e-05, "loss": 0.4912, "step": 6540 }, { "epoch": 0.2358813565430497, "grad_norm": 0.20023906230926514, "learning_rate": 4.9961356901325515e-05, "loss": 0.481, "step": 6545 }, { "epoch": 0.23606155620427433, "grad_norm": 0.2267458438873291, "learning_rate": 4.9961194543452296e-05, "loss": 0.4896, "step": 6550 }, { "epoch": 0.23624175586549898, "grad_norm": 0.1733904927968979, "learning_rate": 4.996103184548763e-05, "loss": 0.4545, "step": 6555 }, { "epoch": 0.2364219555267236, "grad_norm": 0.1978592723608017, "learning_rate": 4.9960868807433734e-05, "loss": 0.4491, "step": 6560 }, { "epoch": 0.23660215518794825, "grad_norm": 0.1597129851579666, "learning_rate": 4.9960705429292836e-05, "loss": 0.4196, "step": 6565 }, { "epoch": 0.23678235484917287, "grad_norm": 0.19397877156734467, "learning_rate": 4.996054171106716e-05, "loss": 0.4751, 
"step": 6570 }, { "epoch": 0.23696255451039752, "grad_norm": 0.20515479147434235, "learning_rate": 4.996037765275894e-05, "loss": 0.4897, "step": 6575 }, { "epoch": 0.23714275417162214, "grad_norm": 0.240402951836586, "learning_rate": 4.99602132543704e-05, "loss": 0.4842, "step": 6580 }, { "epoch": 0.2373229538328468, "grad_norm": 0.20333066582679749, "learning_rate": 4.996004851590379e-05, "loss": 0.4708, "step": 6585 }, { "epoch": 0.23750315349407142, "grad_norm": 0.2151239812374115, "learning_rate": 4.995988343736135e-05, "loss": 0.4748, "step": 6590 }, { "epoch": 0.23768335315529607, "grad_norm": 0.1680813729763031, "learning_rate": 4.995971801874533e-05, "loss": 0.4726, "step": 6595 }, { "epoch": 0.23786355281652072, "grad_norm": 0.18834345042705536, "learning_rate": 4.995955226005799e-05, "loss": 0.4443, "step": 6600 }, { "epoch": 0.23804375247774534, "grad_norm": 0.21085232496261597, "learning_rate": 4.995938616130158e-05, "loss": 0.4451, "step": 6605 }, { "epoch": 0.23822395213897, "grad_norm": 0.20523454248905182, "learning_rate": 4.9959219722478365e-05, "loss": 0.4476, "step": 6610 }, { "epoch": 0.2384041518001946, "grad_norm": 0.20655976235866547, "learning_rate": 4.995908632657321e-05, "loss": 0.445, "step": 6615 }, { "epoch": 0.23858435146141926, "grad_norm": 0.1544501781463623, "learning_rate": 4.995891927563548e-05, "loss": 0.454, "step": 6620 }, { "epoch": 0.2387645511226439, "grad_norm": 0.20439325273036957, "learning_rate": 4.9958751884637286e-05, "loss": 0.4784, "step": 6625 }, { "epoch": 0.23894475078386854, "grad_norm": 0.15891632437705994, "learning_rate": 4.9958584153580933e-05, "loss": 0.4433, "step": 6630 }, { "epoch": 0.23912495044509316, "grad_norm": 0.20959921181201935, "learning_rate": 4.995841608246871e-05, "loss": 0.4862, "step": 6635 }, { "epoch": 0.2393051501063178, "grad_norm": 0.17294225096702576, "learning_rate": 4.995824767130289e-05, "loss": 0.436, "step": 6640 }, { "epoch": 0.23948534976754243, "grad_norm": 
0.26996639370918274, "learning_rate": 4.995807892008578e-05, "loss": 0.4688, "step": 6645 }, { "epoch": 0.23966554942876708, "grad_norm": 0.20224200189113617, "learning_rate": 4.995790982881968e-05, "loss": 0.4688, "step": 6650 }, { "epoch": 0.2398457490899917, "grad_norm": 0.17374387383460999, "learning_rate": 4.995774039750689e-05, "loss": 0.4692, "step": 6655 }, { "epoch": 0.24002594875121636, "grad_norm": 0.14203490316867828, "learning_rate": 4.995757062614972e-05, "loss": 0.4516, "step": 6660 }, { "epoch": 0.24020614841244098, "grad_norm": 0.19781635701656342, "learning_rate": 4.9957400514750484e-05, "loss": 0.5077, "step": 6665 }, { "epoch": 0.24038634807366563, "grad_norm": 0.2605617344379425, "learning_rate": 4.99572300633115e-05, "loss": 0.4947, "step": 6670 }, { "epoch": 0.24056654773489025, "grad_norm": 0.22600892186164856, "learning_rate": 4.995705927183508e-05, "loss": 0.4973, "step": 6675 }, { "epoch": 0.2407467473961149, "grad_norm": 0.1905760020017624, "learning_rate": 4.995688814032357e-05, "loss": 0.5074, "step": 6680 }, { "epoch": 0.24092694705733952, "grad_norm": 0.19111768901348114, "learning_rate": 4.995671666877928e-05, "loss": 0.4408, "step": 6685 }, { "epoch": 0.24110714671856417, "grad_norm": 0.16061218082904816, "learning_rate": 4.995654485720456e-05, "loss": 0.4824, "step": 6690 }, { "epoch": 0.2412873463797888, "grad_norm": 0.21395018696784973, "learning_rate": 4.9956372705601754e-05, "loss": 0.4909, "step": 6695 }, { "epoch": 0.24146754604101345, "grad_norm": 0.18952535092830658, "learning_rate": 4.9956200213973195e-05, "loss": 0.476, "step": 6700 }, { "epoch": 0.24164774570223807, "grad_norm": 0.23430414497852325, "learning_rate": 4.9956027382321244e-05, "loss": 0.4684, "step": 6705 }, { "epoch": 0.24182794536346272, "grad_norm": 0.23189213871955872, "learning_rate": 4.9955854210648246e-05, "loss": 0.4627, "step": 6710 }, { "epoch": 0.24200814502468734, "grad_norm": 0.19272887706756592, "learning_rate": 4.995568069895657e-05, "loss": 
0.4993, "step": 6715 }, { "epoch": 0.242188344685912, "grad_norm": 0.24815817177295685, "learning_rate": 4.995550684724858e-05, "loss": 0.4526, "step": 6720 }, { "epoch": 0.24236854434713662, "grad_norm": 0.2101772576570511, "learning_rate": 4.995533265552663e-05, "loss": 0.4711, "step": 6725 }, { "epoch": 0.24254874400836127, "grad_norm": 0.18738162517547607, "learning_rate": 4.995515812379311e-05, "loss": 0.4603, "step": 6730 }, { "epoch": 0.2427289436695859, "grad_norm": 0.20007184147834778, "learning_rate": 4.9954983252050393e-05, "loss": 0.4957, "step": 6735 }, { "epoch": 0.24290914333081054, "grad_norm": 0.22411781549453735, "learning_rate": 4.995480804030086e-05, "loss": 0.4503, "step": 6740 }, { "epoch": 0.24308934299203516, "grad_norm": 0.19585049152374268, "learning_rate": 4.99546324885469e-05, "loss": 0.4572, "step": 6745 }, { "epoch": 0.2432695426532598, "grad_norm": 0.1979917585849762, "learning_rate": 4.99544565967909e-05, "loss": 0.436, "step": 6750 }, { "epoch": 0.24344974231448444, "grad_norm": 0.16787096858024597, "learning_rate": 4.995428036503527e-05, "loss": 0.4745, "step": 6755 }, { "epoch": 0.24362994197570909, "grad_norm": 0.1978481262922287, "learning_rate": 4.995410379328239e-05, "loss": 0.4553, "step": 6760 }, { "epoch": 0.24381014163693374, "grad_norm": 0.22523565590381622, "learning_rate": 4.995392688153468e-05, "loss": 0.4355, "step": 6765 }, { "epoch": 0.24399034129815836, "grad_norm": 0.16378116607666016, "learning_rate": 4.995374962979455e-05, "loss": 0.4136, "step": 6770 }, { "epoch": 0.244170540959383, "grad_norm": 0.17417633533477783, "learning_rate": 4.995357203806441e-05, "loss": 0.4231, "step": 6775 }, { "epoch": 0.24435074062060763, "grad_norm": 0.20466962456703186, "learning_rate": 4.9953394106346686e-05, "loss": 0.5051, "step": 6780 }, { "epoch": 0.24453094028183228, "grad_norm": 0.1599748432636261, "learning_rate": 4.99532158346438e-05, "loss": 0.4419, "step": 6785 }, { "epoch": 0.2447111399430569, "grad_norm": 
0.18782804906368256, "learning_rate": 4.995303722295816e-05, "loss": 0.4459, "step": 6790 }, { "epoch": 0.24489133960428155, "grad_norm": 0.19041045010089874, "learning_rate": 4.995285827129224e-05, "loss": 0.458, "step": 6795 }, { "epoch": 0.24507153926550618, "grad_norm": 0.2037457972764969, "learning_rate": 4.995267897964845e-05, "loss": 0.4666, "step": 6800 }, { "epoch": 0.24525173892673083, "grad_norm": 0.2316879779100418, "learning_rate": 4.995249934802925e-05, "loss": 0.4435, "step": 6805 }, { "epoch": 0.24543193858795545, "grad_norm": 0.19015929102897644, "learning_rate": 4.995231937643706e-05, "loss": 0.4936, "step": 6810 }, { "epoch": 0.2456121382491801, "grad_norm": 0.17666086554527283, "learning_rate": 4.995213906487436e-05, "loss": 0.443, "step": 6815 }, { "epoch": 0.24579233791040472, "grad_norm": 0.17423413693904877, "learning_rate": 4.995195841334359e-05, "loss": 0.4887, "step": 6820 }, { "epoch": 0.24597253757162937, "grad_norm": 0.19055670499801636, "learning_rate": 4.9951777421847225e-05, "loss": 0.4609, "step": 6825 }, { "epoch": 0.246152737232854, "grad_norm": 0.21230706572532654, "learning_rate": 4.995159609038772e-05, "loss": 0.4283, "step": 6830 }, { "epoch": 0.24633293689407865, "grad_norm": 0.20157566666603088, "learning_rate": 4.995141441896754e-05, "loss": 0.4772, "step": 6835 }, { "epoch": 0.24651313655530327, "grad_norm": 0.177265927195549, "learning_rate": 4.995123240758919e-05, "loss": 0.4816, "step": 6840 }, { "epoch": 0.24669333621652792, "grad_norm": 0.20768406987190247, "learning_rate": 4.995105005625511e-05, "loss": 0.482, "step": 6845 }, { "epoch": 0.24687353587775254, "grad_norm": 0.18862581253051758, "learning_rate": 4.9950867364967814e-05, "loss": 0.4796, "step": 6850 }, { "epoch": 0.2470537355389772, "grad_norm": 0.20289580523967743, "learning_rate": 4.995068433372978e-05, "loss": 0.509, "step": 6855 }, { "epoch": 0.24723393520020182, "grad_norm": 0.23247657716274261, "learning_rate": 4.9950500962543503e-05, "loss": 0.4609, 
"step": 6860 }, { "epoch": 0.24741413486142647, "grad_norm": 0.17218773066997528, "learning_rate": 4.995031725141147e-05, "loss": 0.4785, "step": 6865 }, { "epoch": 0.2475943345226511, "grad_norm": 0.21331791579723358, "learning_rate": 4.99501332003362e-05, "loss": 0.4784, "step": 6870 }, { "epoch": 0.24777453418387574, "grad_norm": 0.25006669759750366, "learning_rate": 4.99499488093202e-05, "loss": 0.4931, "step": 6875 }, { "epoch": 0.24795473384510036, "grad_norm": 0.18483485281467438, "learning_rate": 4.994976407836598e-05, "loss": 0.4738, "step": 6880 }, { "epoch": 0.248134933506325, "grad_norm": 0.18278944492340088, "learning_rate": 4.994957900747606e-05, "loss": 0.4416, "step": 6885 }, { "epoch": 0.24831513316754963, "grad_norm": 0.17712728679180145, "learning_rate": 4.9949393596652936e-05, "loss": 0.5227, "step": 6890 }, { "epoch": 0.24849533282877428, "grad_norm": 0.18737031519412994, "learning_rate": 4.994920784589917e-05, "loss": 0.4913, "step": 6895 }, { "epoch": 0.2486755324899989, "grad_norm": 0.2273300439119339, "learning_rate": 4.9949021755217276e-05, "loss": 0.4476, "step": 6900 }, { "epoch": 0.24885573215122356, "grad_norm": 0.238374263048172, "learning_rate": 4.9948835324609786e-05, "loss": 0.4756, "step": 6905 }, { "epoch": 0.24903593181244818, "grad_norm": 0.21403105556964874, "learning_rate": 4.9948648554079246e-05, "loss": 0.448, "step": 6910 }, { "epoch": 0.24921613147367283, "grad_norm": 0.23275822401046753, "learning_rate": 4.9948461443628205e-05, "loss": 0.48, "step": 6915 }, { "epoch": 0.24939633113489745, "grad_norm": 0.13692793250083923, "learning_rate": 4.99482739932592e-05, "loss": 0.4495, "step": 6920 }, { "epoch": 0.2495765307961221, "grad_norm": 0.2151869684457779, "learning_rate": 4.9948086202974795e-05, "loss": 0.4986, "step": 6925 }, { "epoch": 0.24975673045734675, "grad_norm": 0.19741404056549072, "learning_rate": 4.9947898072777557e-05, "loss": 0.4688, "step": 6930 }, { "epoch": 0.24993693011857138, "grad_norm": 
0.1841607540845871, "learning_rate": 4.994770960267002e-05, "loss": 0.4252, "step": 6935 }, { "epoch": 0.250117129779796, "grad_norm": 0.16864712536334991, "learning_rate": 4.994752079265478e-05, "loss": 0.4635, "step": 6940 }, { "epoch": 0.2502973294410207, "grad_norm": 0.1859603226184845, "learning_rate": 4.9947331642734394e-05, "loss": 0.4722, "step": 6945 }, { "epoch": 0.2504775291022453, "grad_norm": 0.19181598722934723, "learning_rate": 4.994714215291144e-05, "loss": 0.5074, "step": 6950 }, { "epoch": 0.2506577287634699, "grad_norm": 0.17568619549274445, "learning_rate": 4.9946952323188514e-05, "loss": 0.4396, "step": 6955 }, { "epoch": 0.25083792842469455, "grad_norm": 0.18716076016426086, "learning_rate": 4.9946762153568195e-05, "loss": 0.4776, "step": 6960 }, { "epoch": 0.2510181280859192, "grad_norm": 0.18289898335933685, "learning_rate": 4.994657164405306e-05, "loss": 0.4569, "step": 6965 }, { "epoch": 0.25119832774714385, "grad_norm": 0.1580553501844406, "learning_rate": 4.994638079464572e-05, "loss": 0.4865, "step": 6970 }, { "epoch": 0.25137852740836847, "grad_norm": 0.200215145945549, "learning_rate": 4.9946189605348775e-05, "loss": 0.4281, "step": 6975 }, { "epoch": 0.2515587270695931, "grad_norm": 0.16953638195991516, "learning_rate": 4.9945998076164824e-05, "loss": 0.4802, "step": 6980 }, { "epoch": 0.25173892673081777, "grad_norm": 0.22101880609989166, "learning_rate": 4.994580620709648e-05, "loss": 0.4825, "step": 6985 }, { "epoch": 0.2519191263920424, "grad_norm": 0.1716253012418747, "learning_rate": 4.9945613998146356e-05, "loss": 0.4562, "step": 6990 }, { "epoch": 0.252099326053267, "grad_norm": 0.19001296162605286, "learning_rate": 4.9945421449317065e-05, "loss": 0.4797, "step": 6995 }, { "epoch": 0.25227952571449164, "grad_norm": 0.1752406656742096, "learning_rate": 4.9945228560611244e-05, "loss": 0.4398, "step": 7000 }, { "epoch": 0.25227952571449164, "eval_loss": 0.49143746495246887, "eval_runtime": 3.5206, "eval_samples_per_second": 
28.404, "eval_steps_per_second": 7.101, "step": 7000 }, { "epoch": 0.2524597253757163, "grad_norm": 0.15203414857387543, "learning_rate": 4.994503533203151e-05, "loss": 0.4602, "step": 7005 }, { "epoch": 0.25263992503694094, "grad_norm": 0.1956150233745575, "learning_rate": 4.9944841763580505e-05, "loss": 0.4639, "step": 7010 }, { "epoch": 0.25282012469816556, "grad_norm": 0.18748140335083008, "learning_rate": 4.9944647855260854e-05, "loss": 0.4597, "step": 7015 }, { "epoch": 0.2530003243593902, "grad_norm": 0.19696061313152313, "learning_rate": 4.99444536070752e-05, "loss": 0.449, "step": 7020 }, { "epoch": 0.25318052402061486, "grad_norm": 0.1847335398197174, "learning_rate": 4.9944259019026207e-05, "loss": 0.4875, "step": 7025 }, { "epoch": 0.2533607236818395, "grad_norm": 0.17691460251808167, "learning_rate": 4.99440640911165e-05, "loss": 0.4793, "step": 7030 }, { "epoch": 0.2535409233430641, "grad_norm": 0.18585610389709473, "learning_rate": 4.994386882334877e-05, "loss": 0.4646, "step": 7035 }, { "epoch": 0.25372112300428873, "grad_norm": 0.2865478992462158, "learning_rate": 4.994367321572564e-05, "loss": 0.491, "step": 7040 }, { "epoch": 0.2539013226655134, "grad_norm": 0.20224061608314514, "learning_rate": 4.9943477268249796e-05, "loss": 0.5204, "step": 7045 }, { "epoch": 0.25408152232673803, "grad_norm": 0.181314155459404, "learning_rate": 4.99432809809239e-05, "loss": 0.4963, "step": 7050 }, { "epoch": 0.25426172198796265, "grad_norm": 0.22092019021511078, "learning_rate": 4.9943084353750635e-05, "loss": 0.4489, "step": 7055 }, { "epoch": 0.2544419216491873, "grad_norm": 0.19176073372364044, "learning_rate": 4.9942887386732676e-05, "loss": 0.4806, "step": 7060 }, { "epoch": 0.25462212131041195, "grad_norm": 0.2104840874671936, "learning_rate": 4.99426900798727e-05, "loss": 0.4741, "step": 7065 }, { "epoch": 0.2548023209716366, "grad_norm": 0.14947918057441711, "learning_rate": 4.9942492433173405e-05, "loss": 0.5013, "step": 7070 }, { "epoch": 
0.2549825206328612, "grad_norm": 0.1898050606250763, "learning_rate": 4.9942294446637486e-05, "loss": 0.4995, "step": 7075 }, { "epoch": 0.2551627202940858, "grad_norm": 0.18579892814159393, "learning_rate": 4.9942096120267626e-05, "loss": 0.4634, "step": 7080 }, { "epoch": 0.2553429199553105, "grad_norm": 0.2083805650472641, "learning_rate": 4.9941897454066535e-05, "loss": 0.4602, "step": 7085 }, { "epoch": 0.2555231196165351, "grad_norm": 0.22849465906620026, "learning_rate": 4.9941698448036916e-05, "loss": 0.4593, "step": 7090 }, { "epoch": 0.25570331927775974, "grad_norm": 0.2611923813819885, "learning_rate": 4.994149910218149e-05, "loss": 0.4715, "step": 7095 }, { "epoch": 0.2558835189389844, "grad_norm": 0.20796480774879456, "learning_rate": 4.994129941650296e-05, "loss": 0.4678, "step": 7100 }, { "epoch": 0.25606371860020904, "grad_norm": 0.16699600219726562, "learning_rate": 4.994109939100406e-05, "loss": 0.4886, "step": 7105 }, { "epoch": 0.25624391826143367, "grad_norm": 0.18829451501369476, "learning_rate": 4.994089902568751e-05, "loss": 0.4502, "step": 7110 }, { "epoch": 0.2564241179226583, "grad_norm": 0.20573166012763977, "learning_rate": 4.994069832055604e-05, "loss": 0.4985, "step": 7115 }, { "epoch": 0.25660431758388297, "grad_norm": 0.22231273353099823, "learning_rate": 4.994049727561239e-05, "loss": 0.4926, "step": 7120 }, { "epoch": 0.2567845172451076, "grad_norm": 0.19772711396217346, "learning_rate": 4.994029589085929e-05, "loss": 0.4781, "step": 7125 }, { "epoch": 0.2569647169063322, "grad_norm": 0.24097611010074615, "learning_rate": 4.9940094166299486e-05, "loss": 0.4474, "step": 7130 }, { "epoch": 0.25714491656755684, "grad_norm": 0.2717370390892029, "learning_rate": 4.9939892101935723e-05, "loss": 0.473, "step": 7135 }, { "epoch": 0.2573251162287815, "grad_norm": 0.16408710181713104, "learning_rate": 4.993968969777076e-05, "loss": 0.4656, "step": 7140 }, { "epoch": 0.25750531589000614, "grad_norm": 0.19900165498256683, "learning_rate": 
4.993948695380736e-05, "loss": 0.4309, "step": 7145 }, { "epoch": 0.25768551555123076, "grad_norm": 0.19811104238033295, "learning_rate": 4.993928387004827e-05, "loss": 0.4194, "step": 7150 }, { "epoch": 0.2578657152124554, "grad_norm": 0.20136548578739166, "learning_rate": 4.9939080446496264e-05, "loss": 0.4728, "step": 7155 }, { "epoch": 0.25804591487368006, "grad_norm": 0.20853722095489502, "learning_rate": 4.993887668315413e-05, "loss": 0.436, "step": 7160 }, { "epoch": 0.2582261145349047, "grad_norm": 0.18556027114391327, "learning_rate": 4.9938672580024615e-05, "loss": 0.4321, "step": 7165 }, { "epoch": 0.2584063141961293, "grad_norm": 0.24166804552078247, "learning_rate": 4.993846813711052e-05, "loss": 0.4749, "step": 7170 }, { "epoch": 0.25858651385735393, "grad_norm": 0.17454582452774048, "learning_rate": 4.9938263354414626e-05, "loss": 0.4782, "step": 7175 }, { "epoch": 0.2587667135185786, "grad_norm": 0.18085728585720062, "learning_rate": 4.993805823193972e-05, "loss": 0.4631, "step": 7180 }, { "epoch": 0.25894691317980323, "grad_norm": 0.168624609708786, "learning_rate": 4.99378527696886e-05, "loss": 0.4665, "step": 7185 }, { "epoch": 0.25912711284102785, "grad_norm": 0.17559649050235748, "learning_rate": 4.9937646967664066e-05, "loss": 0.4262, "step": 7190 }, { "epoch": 0.2593073125022525, "grad_norm": 0.1460302621126175, "learning_rate": 4.993744082586891e-05, "loss": 0.4484, "step": 7195 }, { "epoch": 0.25948751216347715, "grad_norm": 0.250219464302063, "learning_rate": 4.9937234344305964e-05, "loss": 0.477, "step": 7200 }, { "epoch": 0.2596677118247018, "grad_norm": 0.17748834192752838, "learning_rate": 4.993702752297802e-05, "loss": 0.4931, "step": 7205 }, { "epoch": 0.2598479114859264, "grad_norm": 0.21896113455295563, "learning_rate": 4.99368203618879e-05, "loss": 0.4621, "step": 7210 }, { "epoch": 0.260028111147151, "grad_norm": 0.25870126485824585, "learning_rate": 4.9936612861038446e-05, "loss": 0.4634, "step": 7215 }, { "epoch": 
0.2602083108083757, "grad_norm": 0.1914265900850296, "learning_rate": 4.993640502043246e-05, "loss": 0.4604, "step": 7220 }, { "epoch": 0.2603885104696003, "grad_norm": 0.16129115223884583, "learning_rate": 4.993619684007278e-05, "loss": 0.4598, "step": 7225 }, { "epoch": 0.26056871013082494, "grad_norm": 0.19293855130672455, "learning_rate": 4.993598831996225e-05, "loss": 0.464, "step": 7230 }, { "epoch": 0.26074890979204957, "grad_norm": 0.1789807379245758, "learning_rate": 4.993577946010371e-05, "loss": 0.4816, "step": 7235 }, { "epoch": 0.26092910945327424, "grad_norm": 0.19150225818157196, "learning_rate": 4.99355702605e-05, "loss": 0.4875, "step": 7240 }, { "epoch": 0.26110930911449887, "grad_norm": 0.2224695235490799, "learning_rate": 4.9935360721153965e-05, "loss": 0.4727, "step": 7245 }, { "epoch": 0.2612895087757235, "grad_norm": 0.17574471235275269, "learning_rate": 4.993515084206848e-05, "loss": 0.4247, "step": 7250 }, { "epoch": 0.2614697084369481, "grad_norm": 0.16948924958705902, "learning_rate": 4.9934940623246387e-05, "loss": 0.4704, "step": 7255 }, { "epoch": 0.2616499080981728, "grad_norm": 0.17520290613174438, "learning_rate": 4.993473006469055e-05, "loss": 0.4646, "step": 7260 }, { "epoch": 0.2618301077593974, "grad_norm": 0.20636965334415436, "learning_rate": 4.993451916640386e-05, "loss": 0.4463, "step": 7265 }, { "epoch": 0.26201030742062204, "grad_norm": 0.1967248171567917, "learning_rate": 4.9934307928389154e-05, "loss": 0.4631, "step": 7270 }, { "epoch": 0.2621905070818467, "grad_norm": 0.22940480709075928, "learning_rate": 4.993409635064934e-05, "loss": 0.4354, "step": 7275 }, { "epoch": 0.26237070674307134, "grad_norm": 0.2190198004245758, "learning_rate": 4.9933884433187295e-05, "loss": 0.4826, "step": 7280 }, { "epoch": 0.26255090640429596, "grad_norm": 0.16079477965831757, "learning_rate": 4.9933672176005894e-05, "loss": 0.4499, "step": 7285 }, { "epoch": 0.2627311060655206, "grad_norm": 0.24084745347499847, "learning_rate": 
4.993345957910804e-05, "loss": 0.5024, "step": 7290 }, { "epoch": 0.26291130572674526, "grad_norm": 0.16274696588516235, "learning_rate": 4.9933246642496626e-05, "loss": 0.4463, "step": 7295 }, { "epoch": 0.2630915053879699, "grad_norm": 0.21243201196193695, "learning_rate": 4.9933033366174554e-05, "loss": 0.4764, "step": 7300 }, { "epoch": 0.2632717050491945, "grad_norm": 0.20116925239562988, "learning_rate": 4.9932819750144734e-05, "loss": 0.484, "step": 7305 }, { "epoch": 0.2634519047104191, "grad_norm": 0.20278726518154144, "learning_rate": 4.993260579441006e-05, "loss": 0.4757, "step": 7310 }, { "epoch": 0.2636321043716438, "grad_norm": 0.17847001552581787, "learning_rate": 4.993239149897347e-05, "loss": 0.4309, "step": 7315 }, { "epoch": 0.2638123040328684, "grad_norm": 0.1934574991464615, "learning_rate": 4.993217686383787e-05, "loss": 0.4267, "step": 7320 }, { "epoch": 0.26399250369409305, "grad_norm": 0.1816255897283554, "learning_rate": 4.993196188900618e-05, "loss": 0.4676, "step": 7325 }, { "epoch": 0.2641727033553177, "grad_norm": 0.1440691500902176, "learning_rate": 4.993174657448135e-05, "loss": 0.4451, "step": 7330 }, { "epoch": 0.26435290301654235, "grad_norm": 0.18416348099708557, "learning_rate": 4.993153092026629e-05, "loss": 0.4692, "step": 7335 }, { "epoch": 0.264533102677767, "grad_norm": 0.1908058077096939, "learning_rate": 4.9931314926363945e-05, "loss": 0.438, "step": 7340 }, { "epoch": 0.2647133023389916, "grad_norm": 0.17665372788906097, "learning_rate": 4.993109859277727e-05, "loss": 0.4475, "step": 7345 }, { "epoch": 0.2648935020002162, "grad_norm": 0.20020587742328644, "learning_rate": 4.99308819195092e-05, "loss": 0.466, "step": 7350 }, { "epoch": 0.2650737016614409, "grad_norm": 0.20081447064876556, "learning_rate": 4.9930664906562695e-05, "loss": 0.4793, "step": 7355 }, { "epoch": 0.2652539013226655, "grad_norm": 0.1721320003271103, "learning_rate": 4.993049105163899e-05, "loss": 0.4658, "step": 7360 }, { "epoch": 
0.26543410098389014, "grad_norm": 0.24417546391487122, "learning_rate": 4.993027342727875e-05, "loss": 0.4484, "step": 7365 }, { "epoch": 0.26561430064511476, "grad_norm": 0.2776060700416565, "learning_rate": 4.993005546324836e-05, "loss": 0.4846, "step": 7370 }, { "epoch": 0.26579450030633944, "grad_norm": 0.19747385382652283, "learning_rate": 4.9929837159550784e-05, "loss": 0.4603, "step": 7375 }, { "epoch": 0.26597469996756407, "grad_norm": 0.1888158768415451, "learning_rate": 4.9929618516189e-05, "loss": 0.4405, "step": 7380 }, { "epoch": 0.2661548996287887, "grad_norm": 0.20889566838741302, "learning_rate": 4.9929399533166e-05, "loss": 0.4846, "step": 7385 }, { "epoch": 0.2663350992900133, "grad_norm": 0.19640065729618073, "learning_rate": 4.992918021048475e-05, "loss": 0.4725, "step": 7390 }, { "epoch": 0.266515298951238, "grad_norm": 0.20840995013713837, "learning_rate": 4.992896054814825e-05, "loss": 0.4594, "step": 7395 }, { "epoch": 0.2666954986124626, "grad_norm": 0.2129368633031845, "learning_rate": 4.992874054615949e-05, "loss": 0.4721, "step": 7400 }, { "epoch": 0.26687569827368723, "grad_norm": 0.13430319726467133, "learning_rate": 4.992852020452147e-05, "loss": 0.4479, "step": 7405 }, { "epoch": 0.26705589793491186, "grad_norm": 0.2261311113834381, "learning_rate": 4.992829952323718e-05, "loss": 0.4972, "step": 7410 }, { "epoch": 0.26723609759613653, "grad_norm": 0.21001654863357544, "learning_rate": 4.992807850230964e-05, "loss": 0.5047, "step": 7415 }, { "epoch": 0.26741629725736116, "grad_norm": 0.2056044489145279, "learning_rate": 4.992785714174185e-05, "loss": 0.4793, "step": 7420 }, { "epoch": 0.2675964969185858, "grad_norm": 0.156636044383049, "learning_rate": 4.9927635441536844e-05, "loss": 0.4914, "step": 7425 }, { "epoch": 0.2677766965798104, "grad_norm": 0.18658673763275146, "learning_rate": 4.9927413401697625e-05, "loss": 0.4451, "step": 7430 }, { "epoch": 0.2679568962410351, "grad_norm": 0.14643876254558563, "learning_rate": 
4.992719102222723e-05, "loss": 0.4108, "step": 7435 }, { "epoch": 0.2681370959022597, "grad_norm": 0.2012728452682495, "learning_rate": 4.9926968303128674e-05, "loss": 0.4497, "step": 7440 }, { "epoch": 0.2683172955634843, "grad_norm": 0.19251321256160736, "learning_rate": 4.9926745244405e-05, "loss": 0.4938, "step": 7445 }, { "epoch": 0.268497495224709, "grad_norm": 0.19038186967372894, "learning_rate": 4.992652184605926e-05, "loss": 0.4739, "step": 7450 }, { "epoch": 0.2686776948859336, "grad_norm": 0.21992941200733185, "learning_rate": 4.992629810809448e-05, "loss": 0.4839, "step": 7455 }, { "epoch": 0.26885789454715825, "grad_norm": 0.208626389503479, "learning_rate": 4.992607403051371e-05, "loss": 0.4498, "step": 7460 }, { "epoch": 0.26903809420838287, "grad_norm": 0.14760567247867584, "learning_rate": 4.9925849613320006e-05, "loss": 0.4185, "step": 7465 }, { "epoch": 0.26921829386960755, "grad_norm": 0.2208520770072937, "learning_rate": 4.992562485651644e-05, "loss": 0.4734, "step": 7470 }, { "epoch": 0.2693984935308322, "grad_norm": 0.18603351712226868, "learning_rate": 4.992539976010605e-05, "loss": 0.5101, "step": 7475 }, { "epoch": 0.2695786931920568, "grad_norm": 0.2119520753622055, "learning_rate": 4.992517432409192e-05, "loss": 0.4816, "step": 7480 }, { "epoch": 0.2697588928532814, "grad_norm": 0.17017479240894318, "learning_rate": 4.99249485484771e-05, "loss": 0.4189, "step": 7485 }, { "epoch": 0.2699390925145061, "grad_norm": 0.2112009972333908, "learning_rate": 4.99247224332647e-05, "loss": 0.4506, "step": 7490 }, { "epoch": 0.2701192921757307, "grad_norm": 0.22888222336769104, "learning_rate": 4.992449597845777e-05, "loss": 0.444, "step": 7495 }, { "epoch": 0.27029949183695534, "grad_norm": 0.19122794270515442, "learning_rate": 4.992426918405941e-05, "loss": 0.4645, "step": 7500 }, { "epoch": 0.27029949183695534, "eval_loss": 0.48843687772750854, "eval_runtime": 3.5217, "eval_samples_per_second": 28.395, "eval_steps_per_second": 7.099, "step": 7500 
}, { "epoch": 0.27047969149817996, "grad_norm": 0.18610242009162903, "learning_rate": 4.992404205007272e-05, "loss": 0.4546, "step": 7505 }, { "epoch": 0.27065989115940464, "grad_norm": 0.23450550436973572, "learning_rate": 4.992381457650077e-05, "loss": 0.4374, "step": 7510 }, { "epoch": 0.27084009082062926, "grad_norm": 0.2130429446697235, "learning_rate": 4.9923586763346674e-05, "loss": 0.4876, "step": 7515 }, { "epoch": 0.2710202904818539, "grad_norm": 0.18153907358646393, "learning_rate": 4.992335861061354e-05, "loss": 0.4945, "step": 7520 }, { "epoch": 0.2712004901430785, "grad_norm": 0.16226467490196228, "learning_rate": 4.992313011830446e-05, "loss": 0.4293, "step": 7525 }, { "epoch": 0.2713806898043032, "grad_norm": 0.18704953789710999, "learning_rate": 4.992290128642257e-05, "loss": 0.4861, "step": 7530 }, { "epoch": 0.2715608894655278, "grad_norm": 0.24006149172782898, "learning_rate": 4.992267211497097e-05, "loss": 0.5093, "step": 7535 }, { "epoch": 0.27174108912675243, "grad_norm": 0.17990513145923615, "learning_rate": 4.992244260395278e-05, "loss": 0.5006, "step": 7540 }, { "epoch": 0.27192128878797706, "grad_norm": 0.19324089586734772, "learning_rate": 4.992221275337115e-05, "loss": 0.4631, "step": 7545 }, { "epoch": 0.27210148844920173, "grad_norm": 0.1453637182712555, "learning_rate": 4.992198256322918e-05, "loss": 0.4758, "step": 7550 }, { "epoch": 0.27228168811042636, "grad_norm": 0.18295510113239288, "learning_rate": 4.992175203353003e-05, "loss": 0.4832, "step": 7555 }, { "epoch": 0.272461887771651, "grad_norm": 0.20734231173992157, "learning_rate": 4.992152116427683e-05, "loss": 0.4669, "step": 7560 }, { "epoch": 0.2726420874328756, "grad_norm": 0.17811109125614166, "learning_rate": 4.992128995547274e-05, "loss": 0.4197, "step": 7565 }, { "epoch": 0.2728222870941003, "grad_norm": 0.16642598807811737, "learning_rate": 4.992105840712089e-05, "loss": 0.4575, "step": 7570 }, { "epoch": 0.2730024867553249, "grad_norm": 0.2581879794597626, 
"learning_rate": 4.992082651922444e-05, "loss": 0.4631, "step": 7575 }, { "epoch": 0.2731826864165495, "grad_norm": 0.17311778664588928, "learning_rate": 4.992059429178656e-05, "loss": 0.4674, "step": 7580 }, { "epoch": 0.27336288607777415, "grad_norm": 0.17181158065795898, "learning_rate": 4.992036172481041e-05, "loss": 0.4439, "step": 7585 }, { "epoch": 0.2735430857389988, "grad_norm": 0.21754592657089233, "learning_rate": 4.992012881829915e-05, "loss": 0.4399, "step": 7590 }, { "epoch": 0.27372328540022345, "grad_norm": 0.18678173422813416, "learning_rate": 4.9919895572255956e-05, "loss": 0.4932, "step": 7595 }, { "epoch": 0.27390348506144807, "grad_norm": 0.16362018883228302, "learning_rate": 4.9919661986684024e-05, "loss": 0.4359, "step": 7600 }, { "epoch": 0.27408368472267275, "grad_norm": 0.2467060089111328, "learning_rate": 4.991942806158652e-05, "loss": 0.4648, "step": 7605 }, { "epoch": 0.27426388438389737, "grad_norm": 0.22692877054214478, "learning_rate": 4.991919379696662e-05, "loss": 0.5261, "step": 7610 }, { "epoch": 0.274444084045122, "grad_norm": 0.1850813329219818, "learning_rate": 4.9918959192827534e-05, "loss": 0.4305, "step": 7615 }, { "epoch": 0.2746242837063466, "grad_norm": 0.183277890086174, "learning_rate": 4.9918724249172454e-05, "loss": 0.4175, "step": 7620 }, { "epoch": 0.2748044833675713, "grad_norm": 0.18612994253635406, "learning_rate": 4.9918488966004587e-05, "loss": 0.4802, "step": 7625 }, { "epoch": 0.2749846830287959, "grad_norm": 0.1732574850320816, "learning_rate": 4.9918253343327123e-05, "loss": 0.4322, "step": 7630 }, { "epoch": 0.27516488269002054, "grad_norm": 0.20043861865997314, "learning_rate": 4.991801738114329e-05, "loss": 0.4696, "step": 7635 }, { "epoch": 0.27534508235124516, "grad_norm": 0.2102193683385849, "learning_rate": 4.991778107945629e-05, "loss": 0.4652, "step": 7640 }, { "epoch": 0.27552528201246984, "grad_norm": 0.20501478016376495, "learning_rate": 4.9917544438269346e-05, "loss": 0.4636, "step": 7645 }, { 
"epoch": 0.27570548167369446, "grad_norm": 0.17633172869682312, "learning_rate": 4.991730745758568e-05, "loss": 0.4995, "step": 7650 }, { "epoch": 0.2758856813349191, "grad_norm": 0.18572551012039185, "learning_rate": 4.991707013740853e-05, "loss": 0.4752, "step": 7655 }, { "epoch": 0.2760658809961437, "grad_norm": 0.14463745057582855, "learning_rate": 4.991683247774113e-05, "loss": 0.4448, "step": 7660 }, { "epoch": 0.2762460806573684, "grad_norm": 0.2664778232574463, "learning_rate": 4.99165944785867e-05, "loss": 0.4605, "step": 7665 }, { "epoch": 0.276426280318593, "grad_norm": 0.20322363078594208, "learning_rate": 4.991635613994849e-05, "loss": 0.4676, "step": 7670 }, { "epoch": 0.27660647997981763, "grad_norm": 0.21015766263008118, "learning_rate": 4.991611746182977e-05, "loss": 0.4449, "step": 7675 }, { "epoch": 0.27678667964104225, "grad_norm": 0.23488974571228027, "learning_rate": 4.991587844423376e-05, "loss": 0.4613, "step": 7680 }, { "epoch": 0.27696687930226693, "grad_norm": 0.2240959256887436, "learning_rate": 4.9915639087163736e-05, "loss": 0.4601, "step": 7685 }, { "epoch": 0.27714707896349156, "grad_norm": 0.18855224549770355, "learning_rate": 4.991539939062295e-05, "loss": 0.4584, "step": 7690 }, { "epoch": 0.2773272786247162, "grad_norm": 0.20485761761665344, "learning_rate": 4.9915159354614674e-05, "loss": 0.4444, "step": 7695 }, { "epoch": 0.2775074782859408, "grad_norm": 0.18977908790111542, "learning_rate": 4.9914918979142163e-05, "loss": 0.4626, "step": 7700 }, { "epoch": 0.2776876779471655, "grad_norm": 0.15290701389312744, "learning_rate": 4.991467826420872e-05, "loss": 0.4448, "step": 7705 }, { "epoch": 0.2778678776083901, "grad_norm": 0.22318267822265625, "learning_rate": 4.99144372098176e-05, "loss": 0.4937, "step": 7710 }, { "epoch": 0.2780480772696147, "grad_norm": 0.2257704883813858, "learning_rate": 4.9914195815972104e-05, "loss": 0.4445, "step": 7715 }, { "epoch": 0.27822827693083935, "grad_norm": 0.2357822060585022, 
"learning_rate": 4.991395408267551e-05, "loss": 0.4546, "step": 7720 }, { "epoch": 0.278408476592064, "grad_norm": 0.17252907156944275, "learning_rate": 4.991371200993111e-05, "loss": 0.445, "step": 7725 }, { "epoch": 0.27858867625328865, "grad_norm": 0.16278617084026337, "learning_rate": 4.991346959774221e-05, "loss": 0.4839, "step": 7730 }, { "epoch": 0.27876887591451327, "grad_norm": 0.1887577474117279, "learning_rate": 4.9913226846112114e-05, "loss": 0.4232, "step": 7735 }, { "epoch": 0.2789490755757379, "grad_norm": 0.17083100974559784, "learning_rate": 4.991298375504413e-05, "loss": 0.4863, "step": 7740 }, { "epoch": 0.27912927523696257, "grad_norm": 0.1833745837211609, "learning_rate": 4.991274032454156e-05, "loss": 0.4544, "step": 7745 }, { "epoch": 0.2793094748981872, "grad_norm": 0.15337827801704407, "learning_rate": 4.991249655460773e-05, "loss": 0.4621, "step": 7750 }, { "epoch": 0.2794896745594118, "grad_norm": 0.22126014530658722, "learning_rate": 4.991225244524595e-05, "loss": 0.4861, "step": 7755 }, { "epoch": 0.27966987422063644, "grad_norm": 0.203162282705307, "learning_rate": 4.991200799645955e-05, "loss": 0.4432, "step": 7760 }, { "epoch": 0.2798500738818611, "grad_norm": 0.1990215927362442, "learning_rate": 4.991176320825188e-05, "loss": 0.4583, "step": 7765 }, { "epoch": 0.28003027354308574, "grad_norm": 0.18323101103305817, "learning_rate": 4.991151808062625e-05, "loss": 0.4382, "step": 7770 }, { "epoch": 0.28021047320431036, "grad_norm": 0.1830572485923767, "learning_rate": 4.9911272613586006e-05, "loss": 0.4663, "step": 7775 }, { "epoch": 0.28039067286553504, "grad_norm": 0.1886363923549652, "learning_rate": 4.99110268071345e-05, "loss": 0.4462, "step": 7780 }, { "epoch": 0.28057087252675966, "grad_norm": 0.15756884217262268, "learning_rate": 4.991078066127508e-05, "loss": 0.4707, "step": 7785 }, { "epoch": 0.2807510721879843, "grad_norm": 0.17909304797649384, "learning_rate": 4.991053417601109e-05, "loss": 0.497, "step": 7790 }, { "epoch": 
0.2809312718492089, "grad_norm": 0.22559069097042084, "learning_rate": 4.99102873513459e-05, "loss": 0.5026, "step": 7795 }, { "epoch": 0.2811114715104336, "grad_norm": 0.19916412234306335, "learning_rate": 4.991004018728286e-05, "loss": 0.4825, "step": 7800 }, { "epoch": 0.2812916711716582, "grad_norm": 0.23998232185840607, "learning_rate": 4.990979268382535e-05, "loss": 0.5141, "step": 7805 }, { "epoch": 0.28147187083288283, "grad_norm": 0.17623983323574066, "learning_rate": 4.9909544840976744e-05, "loss": 0.4464, "step": 7810 }, { "epoch": 0.28165207049410745, "grad_norm": 0.2640053927898407, "learning_rate": 4.99092966587404e-05, "loss": 0.4738, "step": 7815 }, { "epoch": 0.28183227015533213, "grad_norm": 0.15781162679195404, "learning_rate": 4.990904813711972e-05, "loss": 0.4283, "step": 7820 }, { "epoch": 0.28201246981655675, "grad_norm": 0.15870793163776398, "learning_rate": 4.990879927611808e-05, "loss": 0.4477, "step": 7825 }, { "epoch": 0.2821926694777814, "grad_norm": 0.23389936983585358, "learning_rate": 4.990855007573887e-05, "loss": 0.4772, "step": 7830 }, { "epoch": 0.282372869139006, "grad_norm": 0.24188333749771118, "learning_rate": 4.9908300535985486e-05, "loss": 0.4548, "step": 7835 }, { "epoch": 0.2825530688002307, "grad_norm": 0.1669192612171173, "learning_rate": 4.990805065686133e-05, "loss": 0.4476, "step": 7840 }, { "epoch": 0.2827332684614553, "grad_norm": 0.15673232078552246, "learning_rate": 4.990780043836981e-05, "loss": 0.4181, "step": 7845 }, { "epoch": 0.2829134681226799, "grad_norm": 0.146693155169487, "learning_rate": 4.9907549880514334e-05, "loss": 0.4416, "step": 7850 }, { "epoch": 0.28309366778390455, "grad_norm": 0.24487614631652832, "learning_rate": 4.990729898329831e-05, "loss": 0.4169, "step": 7855 }, { "epoch": 0.2832738674451292, "grad_norm": 0.2468927651643753, "learning_rate": 4.9907047746725154e-05, "loss": 0.4804, "step": 7860 }, { "epoch": 0.28345406710635385, "grad_norm": 0.22746756672859192, "learning_rate": 
4.99067961707983e-05, "loss": 0.4499, "step": 7865 }, { "epoch": 0.28363426676757847, "grad_norm": 0.17610248923301697, "learning_rate": 4.990654425552117e-05, "loss": 0.4824, "step": 7870 }, { "epoch": 0.2838144664288031, "grad_norm": 0.20157018303871155, "learning_rate": 4.9906292000897196e-05, "loss": 0.4641, "step": 7875 }, { "epoch": 0.28399466609002777, "grad_norm": 0.23462137579917908, "learning_rate": 4.990603940692982e-05, "loss": 0.4813, "step": 7880 }, { "epoch": 0.2841748657512524, "grad_norm": 0.19886912405490875, "learning_rate": 4.990578647362247e-05, "loss": 0.4413, "step": 7885 }, { "epoch": 0.284355065412477, "grad_norm": 0.17550869286060333, "learning_rate": 4.9905533200978606e-05, "loss": 0.4518, "step": 7890 }, { "epoch": 0.28453526507370164, "grad_norm": 0.17420583963394165, "learning_rate": 4.9905279589001674e-05, "loss": 0.4741, "step": 7895 }, { "epoch": 0.2847154647349263, "grad_norm": 0.2519500255584717, "learning_rate": 4.990502563769514e-05, "loss": 0.4974, "step": 7900 }, { "epoch": 0.28489566439615094, "grad_norm": 0.21269458532333374, "learning_rate": 4.990477134706244e-05, "loss": 0.527, "step": 7905 }, { "epoch": 0.28507586405737556, "grad_norm": 0.2123083919286728, "learning_rate": 4.990451671710705e-05, "loss": 0.4385, "step": 7910 }, { "epoch": 0.2852560637186002, "grad_norm": 0.17294515669345856, "learning_rate": 4.990426174783245e-05, "loss": 0.4442, "step": 7915 }, { "epoch": 0.28543626337982486, "grad_norm": 0.14767326414585114, "learning_rate": 4.99040064392421e-05, "loss": 0.4443, "step": 7920 }, { "epoch": 0.2856164630410495, "grad_norm": 0.19468598067760468, "learning_rate": 4.9903750791339485e-05, "loss": 0.475, "step": 7925 }, { "epoch": 0.2857966627022741, "grad_norm": 0.20397444069385529, "learning_rate": 4.990349480412809e-05, "loss": 0.4989, "step": 7930 }, { "epoch": 0.28597686236349873, "grad_norm": 0.18751677870750427, "learning_rate": 4.990323847761139e-05, "loss": 0.46, "step": 7935 }, { "epoch": 
0.2861570620247234, "grad_norm": 0.20503003895282745, "learning_rate": 4.99029818117929e-05, "loss": 0.4274, "step": 7940 }, { "epoch": 0.28633726168594803, "grad_norm": 0.20516186952590942, "learning_rate": 4.9902724806676094e-05, "loss": 0.4503, "step": 7945 }, { "epoch": 0.28651746134717265, "grad_norm": 0.21163448691368103, "learning_rate": 4.990246746226449e-05, "loss": 0.5434, "step": 7950 }, { "epoch": 0.28669766100839733, "grad_norm": 0.21273358166217804, "learning_rate": 4.9902209778561585e-05, "loss": 0.4581, "step": 7955 }, { "epoch": 0.28687786066962195, "grad_norm": 0.2407582700252533, "learning_rate": 4.9901951755570896e-05, "loss": 0.4555, "step": 7960 }, { "epoch": 0.2870580603308466, "grad_norm": 0.19239749014377594, "learning_rate": 4.9901693393295935e-05, "loss": 0.4809, "step": 7965 }, { "epoch": 0.2872382599920712, "grad_norm": 0.20676837861537933, "learning_rate": 4.990143469174022e-05, "loss": 0.4315, "step": 7970 }, { "epoch": 0.2874184596532959, "grad_norm": 0.26478490233421326, "learning_rate": 4.990117565090728e-05, "loss": 0.4561, "step": 7975 }, { "epoch": 0.2875986593145205, "grad_norm": 0.19174893200397491, "learning_rate": 4.990091627080065e-05, "loss": 0.525, "step": 7980 }, { "epoch": 0.2877788589757451, "grad_norm": 0.18685384094715118, "learning_rate": 4.9900656551423844e-05, "loss": 0.4683, "step": 7985 }, { "epoch": 0.28795905863696974, "grad_norm": 0.17788560688495636, "learning_rate": 4.990039649278042e-05, "loss": 0.4609, "step": 7990 }, { "epoch": 0.2881392582981944, "grad_norm": 0.2256275713443756, "learning_rate": 4.990013609487391e-05, "loss": 0.4687, "step": 7995 }, { "epoch": 0.28831945795941905, "grad_norm": 0.2363923341035843, "learning_rate": 4.989987535770787e-05, "loss": 0.4783, "step": 8000 }, { "epoch": 0.28831945795941905, "eval_loss": 0.48639407753944397, "eval_runtime": 3.5338, "eval_samples_per_second": 28.298, "eval_steps_per_second": 7.075, "step": 8000 }, { "epoch": 0.28849965762064367, "grad_norm": 
0.20103095471858978, "learning_rate": 4.9899614281285856e-05, "loss": 0.4766, "step": 8005 }, { "epoch": 0.2886798572818683, "grad_norm": 0.19780664145946503, "learning_rate": 4.98993528656114e-05, "loss": 0.5028, "step": 8010 }, { "epoch": 0.28886005694309297, "grad_norm": 0.19557078182697296, "learning_rate": 4.9899091110688104e-05, "loss": 0.4693, "step": 8015 }, { "epoch": 0.2890402566043176, "grad_norm": 0.18514655530452728, "learning_rate": 4.98988290165195e-05, "loss": 0.4637, "step": 8020 }, { "epoch": 0.2892204562655422, "grad_norm": 0.14113621413707733, "learning_rate": 4.9898566583109174e-05, "loss": 0.4324, "step": 8025 }, { "epoch": 0.28940065592676684, "grad_norm": 0.20794130861759186, "learning_rate": 4.989830381046069e-05, "loss": 0.5302, "step": 8030 }, { "epoch": 0.2895808555879915, "grad_norm": 0.18446312844753265, "learning_rate": 4.9898040698577655e-05, "loss": 0.4497, "step": 8035 }, { "epoch": 0.28976105524921614, "grad_norm": 0.20031079649925232, "learning_rate": 4.9897777247463615e-05, "loss": 0.4522, "step": 8040 }, { "epoch": 0.28994125491044076, "grad_norm": 0.21210455894470215, "learning_rate": 4.989751345712219e-05, "loss": 0.4408, "step": 8045 }, { "epoch": 0.2901214545716654, "grad_norm": 0.18559224903583527, "learning_rate": 4.989724932755697e-05, "loss": 0.4555, "step": 8050 }, { "epoch": 0.29030165423289006, "grad_norm": 0.21420493721961975, "learning_rate": 4.9896984858771546e-05, "loss": 0.4763, "step": 8055 }, { "epoch": 0.2904818538941147, "grad_norm": 0.1991835981607437, "learning_rate": 4.9896720050769516e-05, "loss": 0.4475, "step": 8060 }, { "epoch": 0.2906620535553393, "grad_norm": 0.23717185854911804, "learning_rate": 4.98964549035545e-05, "loss": 0.4635, "step": 8065 }, { "epoch": 0.29084225321656393, "grad_norm": 0.2004196047782898, "learning_rate": 4.989618941713011e-05, "loss": 0.4411, "step": 8070 }, { "epoch": 0.2910224528777886, "grad_norm": 0.21253125369548798, "learning_rate": 4.9895923591499954e-05, "loss": 
0.4925, "step": 8075 }, { "epoch": 0.29120265253901323, "grad_norm": 0.253714382648468, "learning_rate": 4.9895657426667666e-05, "loss": 0.4723, "step": 8080 }, { "epoch": 0.29138285220023785, "grad_norm": 0.21997293829917908, "learning_rate": 4.9895390922636854e-05, "loss": 0.4838, "step": 8085 }, { "epoch": 0.2915630518614625, "grad_norm": 0.17473578453063965, "learning_rate": 4.989512407941117e-05, "loss": 0.4264, "step": 8090 }, { "epoch": 0.29174325152268715, "grad_norm": 0.1931363344192505, "learning_rate": 4.989485689699423e-05, "loss": 0.485, "step": 8095 }, { "epoch": 0.2919234511839118, "grad_norm": 0.16745588183403015, "learning_rate": 4.98945893753897e-05, "loss": 0.4865, "step": 8100 }, { "epoch": 0.2921036508451364, "grad_norm": 0.19875845313072205, "learning_rate": 4.98943215146012e-05, "loss": 0.4319, "step": 8105 }, { "epoch": 0.292283850506361, "grad_norm": 0.21367180347442627, "learning_rate": 4.989405331463239e-05, "loss": 0.4779, "step": 8110 }, { "epoch": 0.2924640501675857, "grad_norm": 0.16339702904224396, "learning_rate": 4.989378477548692e-05, "loss": 0.4386, "step": 8115 }, { "epoch": 0.2926442498288103, "grad_norm": 0.15709401667118073, "learning_rate": 4.9893515897168455e-05, "loss": 0.4246, "step": 8120 }, { "epoch": 0.29282444949003494, "grad_norm": 0.23660311102867126, "learning_rate": 4.989324667968066e-05, "loss": 0.4658, "step": 8125 }, { "epoch": 0.2930046491512596, "grad_norm": 0.17152658104896545, "learning_rate": 4.9892977123027194e-05, "loss": 0.4546, "step": 8130 }, { "epoch": 0.29318484881248424, "grad_norm": 0.25571581721305847, "learning_rate": 4.989270722721173e-05, "loss": 0.4507, "step": 8135 }, { "epoch": 0.29336504847370887, "grad_norm": 0.16954413056373596, "learning_rate": 4.989243699223796e-05, "loss": 0.4685, "step": 8140 }, { "epoch": 0.2935452481349335, "grad_norm": 0.19579976797103882, "learning_rate": 4.989216641810955e-05, "loss": 0.5074, "step": 8145 }, { "epoch": 0.29372544779615817, "grad_norm": 
0.16534093022346497, "learning_rate": 4.989189550483019e-05, "loss": 0.4594, "step": 8150 }, { "epoch": 0.2939056474573828, "grad_norm": 0.19394879043102264, "learning_rate": 4.9891624252403574e-05, "loss": 0.4696, "step": 8155 }, { "epoch": 0.2940858471186074, "grad_norm": 0.2109389305114746, "learning_rate": 4.98913526608334e-05, "loss": 0.4606, "step": 8160 }, { "epoch": 0.29426604677983204, "grad_norm": 0.19720369577407837, "learning_rate": 4.9891080730123365e-05, "loss": 0.4608, "step": 8165 }, { "epoch": 0.2944462464410567, "grad_norm": 0.16822995245456696, "learning_rate": 4.9890808460277163e-05, "loss": 0.4925, "step": 8170 }, { "epoch": 0.29462644610228134, "grad_norm": 0.1735120564699173, "learning_rate": 4.9890535851298526e-05, "loss": 0.4465, "step": 8175 }, { "epoch": 0.29480664576350596, "grad_norm": 0.16720019280910492, "learning_rate": 4.989026290319115e-05, "loss": 0.4391, "step": 8180 }, { "epoch": 0.2949868454247306, "grad_norm": 0.19565287232398987, "learning_rate": 4.988998961595876e-05, "loss": 0.4996, "step": 8185 }, { "epoch": 0.29516704508595526, "grad_norm": 0.17741116881370544, "learning_rate": 4.988971598960509e-05, "loss": 0.4652, "step": 8190 }, { "epoch": 0.2953472447471799, "grad_norm": 0.16898617148399353, "learning_rate": 4.988944202413386e-05, "loss": 0.478, "step": 8195 }, { "epoch": 0.2955274444084045, "grad_norm": 0.2120852917432785, "learning_rate": 4.988916771954879e-05, "loss": 0.4719, "step": 8200 }, { "epoch": 0.2957076440696291, "grad_norm": 0.16278114914894104, "learning_rate": 4.988889307585364e-05, "loss": 0.4799, "step": 8205 }, { "epoch": 0.2958878437308538, "grad_norm": 0.18301096558570862, "learning_rate": 4.988861809305213e-05, "loss": 0.4712, "step": 8210 }, { "epoch": 0.29606804339207843, "grad_norm": 0.15939167141914368, "learning_rate": 4.988834277114802e-05, "loss": 0.4389, "step": 8215 }, { "epoch": 0.29624824305330305, "grad_norm": 0.1933128535747528, "learning_rate": 4.988806711014505e-05, "loss": 0.4845, 
"step": 8220 }, { "epoch": 0.2964284427145277, "grad_norm": 0.1766856461763382, "learning_rate": 4.9887791110047e-05, "loss": 0.4751, "step": 8225 }, { "epoch": 0.29660864237575235, "grad_norm": 0.1650475561618805, "learning_rate": 4.9887514770857605e-05, "loss": 0.416, "step": 8230 }, { "epoch": 0.296788842036977, "grad_norm": 0.16024361550807953, "learning_rate": 4.988723809258064e-05, "loss": 0.4831, "step": 8235 }, { "epoch": 0.2969690416982016, "grad_norm": 0.1936086267232895, "learning_rate": 4.9886961075219885e-05, "loss": 0.4589, "step": 8240 }, { "epoch": 0.2971492413594262, "grad_norm": 0.20437081158161163, "learning_rate": 4.988668371877909e-05, "loss": 0.5085, "step": 8245 }, { "epoch": 0.2973294410206509, "grad_norm": 0.2507721185684204, "learning_rate": 4.988640602326205e-05, "loss": 0.4888, "step": 8250 }, { "epoch": 0.2975096406818755, "grad_norm": 0.2515529692173004, "learning_rate": 4.9886127988672554e-05, "loss": 0.4617, "step": 8255 }, { "epoch": 0.29768984034310014, "grad_norm": 0.18619805574417114, "learning_rate": 4.988584961501438e-05, "loss": 0.4843, "step": 8260 }, { "epoch": 0.29787004000432477, "grad_norm": 0.18921491503715515, "learning_rate": 4.988557090229132e-05, "loss": 0.4308, "step": 8265 }, { "epoch": 0.29805023966554944, "grad_norm": 0.17561018466949463, "learning_rate": 4.988529185050717e-05, "loss": 0.4686, "step": 8270 }, { "epoch": 0.29823043932677407, "grad_norm": 0.18587404489517212, "learning_rate": 4.988501245966574e-05, "loss": 0.4476, "step": 8275 }, { "epoch": 0.2984106389879987, "grad_norm": 0.22094377875328064, "learning_rate": 4.988473272977083e-05, "loss": 0.4372, "step": 8280 }, { "epoch": 0.29859083864922337, "grad_norm": 0.23441436886787415, "learning_rate": 4.988445266082626e-05, "loss": 0.4697, "step": 8285 }, { "epoch": 0.298771038310448, "grad_norm": 0.2000618875026703, "learning_rate": 4.988417225283585e-05, "loss": 0.4548, "step": 8290 }, { "epoch": 0.2989512379716726, "grad_norm": 0.16407370567321777, 
"learning_rate": 4.9883891505803394e-05, "loss": 0.4583, "step": 8295 }, { "epoch": 0.29913143763289723, "grad_norm": 0.1786169558763504, "learning_rate": 4.988361041973274e-05, "loss": 0.4534, "step": 8300 }, { "epoch": 0.2993116372941219, "grad_norm": 0.22652758657932281, "learning_rate": 4.988332899462771e-05, "loss": 0.4826, "step": 8305 }, { "epoch": 0.29949183695534654, "grad_norm": 0.23459400236606598, "learning_rate": 4.9883047230492144e-05, "loss": 0.4353, "step": 8310 }, { "epoch": 0.29967203661657116, "grad_norm": 0.19752709567546844, "learning_rate": 4.988276512732987e-05, "loss": 0.4551, "step": 8315 }, { "epoch": 0.2998522362777958, "grad_norm": 0.184610515832901, "learning_rate": 4.988248268514475e-05, "loss": 0.4713, "step": 8320 }, { "epoch": 0.30003243593902046, "grad_norm": 0.16262328624725342, "learning_rate": 4.988219990394061e-05, "loss": 0.472, "step": 8325 }, { "epoch": 0.3002126356002451, "grad_norm": 0.1739642322063446, "learning_rate": 4.988191678372132e-05, "loss": 0.4622, "step": 8330 }, { "epoch": 0.3003928352614697, "grad_norm": 0.15458573400974274, "learning_rate": 4.988163332449073e-05, "loss": 0.4829, "step": 8335 }, { "epoch": 0.3005730349226943, "grad_norm": 0.18645693361759186, "learning_rate": 4.9881349526252694e-05, "loss": 0.4414, "step": 8340 }, { "epoch": 0.300753234583919, "grad_norm": 0.17546844482421875, "learning_rate": 4.988106538901109e-05, "loss": 0.4612, "step": 8345 }, { "epoch": 0.3009334342451436, "grad_norm": 0.1612381488084793, "learning_rate": 4.9880780912769796e-05, "loss": 0.4759, "step": 8350 }, { "epoch": 0.30111363390636825, "grad_norm": 0.179169699549675, "learning_rate": 4.988049609753268e-05, "loss": 0.4535, "step": 8355 }, { "epoch": 0.3012938335675929, "grad_norm": 0.1993793547153473, "learning_rate": 4.988021094330362e-05, "loss": 0.4463, "step": 8360 }, { "epoch": 0.30147403322881755, "grad_norm": 0.22224834561347961, "learning_rate": 4.987992545008649e-05, "loss": 0.4579, "step": 8365 }, { 
"epoch": 0.3016542328900422, "grad_norm": 0.17759408056735992, "learning_rate": 4.987963961788521e-05, "loss": 0.4718, "step": 8370 }, { "epoch": 0.3018344325512668, "grad_norm": 0.1582093983888626, "learning_rate": 4.9879353446703655e-05, "loss": 0.4568, "step": 8375 }, { "epoch": 0.3020146322124914, "grad_norm": 0.20277364552021027, "learning_rate": 4.987906693654572e-05, "loss": 0.4607, "step": 8380 }, { "epoch": 0.3021948318737161, "grad_norm": 0.20033441483974457, "learning_rate": 4.987878008741531e-05, "loss": 0.4503, "step": 8385 }, { "epoch": 0.3023750315349407, "grad_norm": 0.2945716977119446, "learning_rate": 4.9878492899316346e-05, "loss": 0.5069, "step": 8390 }, { "epoch": 0.30255523119616534, "grad_norm": 0.16416752338409424, "learning_rate": 4.987820537225273e-05, "loss": 0.4901, "step": 8395 }, { "epoch": 0.30273543085738996, "grad_norm": 0.25771409273147583, "learning_rate": 4.9877917506228386e-05, "loss": 0.471, "step": 8400 }, { "epoch": 0.30291563051861464, "grad_norm": 0.18886184692382812, "learning_rate": 4.987762930124723e-05, "loss": 0.4543, "step": 8405 }, { "epoch": 0.30309583017983927, "grad_norm": 0.17397350072860718, "learning_rate": 4.987734075731319e-05, "loss": 0.4644, "step": 8410 }, { "epoch": 0.3032760298410639, "grad_norm": 0.19836938381195068, "learning_rate": 4.9877051874430204e-05, "loss": 0.4784, "step": 8415 }, { "epoch": 0.3034562295022885, "grad_norm": 0.1647557020187378, "learning_rate": 4.987676265260219e-05, "loss": 0.4518, "step": 8420 }, { "epoch": 0.3036364291635132, "grad_norm": 0.21178211271762848, "learning_rate": 4.987647309183311e-05, "loss": 0.4945, "step": 8425 }, { "epoch": 0.3038166288247378, "grad_norm": 0.19628405570983887, "learning_rate": 4.98761831921269e-05, "loss": 0.4792, "step": 8430 }, { "epoch": 0.30399682848596243, "grad_norm": 0.20497670769691467, "learning_rate": 4.987589295348751e-05, "loss": 0.4518, "step": 8435 }, { "epoch": 0.30417702814718706, "grad_norm": 0.2078750729560852, 
"learning_rate": 4.987560237591889e-05, "loss": 0.5137, "step": 8440 }, { "epoch": 0.30435722780841173, "grad_norm": 0.15747526288032532, "learning_rate": 4.987531145942501e-05, "loss": 0.4622, "step": 8445 }, { "epoch": 0.30453742746963636, "grad_norm": 0.19545340538024902, "learning_rate": 4.987502020400983e-05, "loss": 0.46, "step": 8450 }, { "epoch": 0.304717627130861, "grad_norm": 0.19771957397460938, "learning_rate": 4.9874728609677316e-05, "loss": 0.4655, "step": 8455 }, { "epoch": 0.30489782679208566, "grad_norm": 0.1792868971824646, "learning_rate": 4.9874436676431435e-05, "loss": 0.452, "step": 8460 }, { "epoch": 0.3050780264533103, "grad_norm": 0.2185121327638626, "learning_rate": 4.9874144404276165e-05, "loss": 0.4803, "step": 8465 }, { "epoch": 0.3052582261145349, "grad_norm": 0.18078531324863434, "learning_rate": 4.98738517932155e-05, "loss": 0.4395, "step": 8470 }, { "epoch": 0.3054384257757595, "grad_norm": 0.14936190843582153, "learning_rate": 4.987355884325342e-05, "loss": 0.4672, "step": 8475 }, { "epoch": 0.3056186254369842, "grad_norm": 0.19642381370067596, "learning_rate": 4.987326555439392e-05, "loss": 0.4507, "step": 8480 }, { "epoch": 0.3057988250982088, "grad_norm": 0.1511949896812439, "learning_rate": 4.987297192664099e-05, "loss": 0.426, "step": 8485 }, { "epoch": 0.30597902475943345, "grad_norm": 0.20524610579013824, "learning_rate": 4.9872677959998626e-05, "loss": 0.4387, "step": 8490 }, { "epoch": 0.30615922442065807, "grad_norm": 0.19021369516849518, "learning_rate": 4.987238365447084e-05, "loss": 0.49, "step": 8495 }, { "epoch": 0.30633942408188275, "grad_norm": 0.2122432291507721, "learning_rate": 4.987208901006165e-05, "loss": 0.4536, "step": 8500 }, { "epoch": 0.30633942408188275, "eval_loss": 0.4835846722126007, "eval_runtime": 3.5343, "eval_samples_per_second": 28.294, "eval_steps_per_second": 7.074, "step": 8500 }, { "epoch": 0.3065196237431074, "grad_norm": 0.2556203603744507, "learning_rate": 4.9871794026775067e-05, "loss": 
0.4393, "step": 8505 }, { "epoch": 0.306699823404332, "grad_norm": 0.16816875338554382, "learning_rate": 4.98714987046151e-05, "loss": 0.4726, "step": 8510 }, { "epoch": 0.3068800230655566, "grad_norm": 0.2908528745174408, "learning_rate": 4.9871203043585774e-05, "loss": 0.4778, "step": 8515 }, { "epoch": 0.3070602227267813, "grad_norm": 0.2010086327791214, "learning_rate": 4.987090704369112e-05, "loss": 0.4649, "step": 8520 }, { "epoch": 0.3072404223880059, "grad_norm": 0.19618774950504303, "learning_rate": 4.9870610704935185e-05, "loss": 0.5132, "step": 8525 }, { "epoch": 0.30742062204923054, "grad_norm": 0.15902240574359894, "learning_rate": 4.9870314027321984e-05, "loss": 0.4017, "step": 8530 }, { "epoch": 0.30760082171045516, "grad_norm": 0.22079305350780487, "learning_rate": 4.9870017010855576e-05, "loss": 0.4651, "step": 8535 }, { "epoch": 0.30778102137167984, "grad_norm": 0.19805465638637543, "learning_rate": 4.9869719655539994e-05, "loss": 0.4438, "step": 8540 }, { "epoch": 0.30796122103290446, "grad_norm": 0.17793111503124237, "learning_rate": 4.98694219613793e-05, "loss": 0.4655, "step": 8545 }, { "epoch": 0.3081414206941291, "grad_norm": 0.16948306560516357, "learning_rate": 4.986912392837755e-05, "loss": 0.4485, "step": 8550 }, { "epoch": 0.3083216203553537, "grad_norm": 0.16313959658145905, "learning_rate": 4.98688255565388e-05, "loss": 0.4846, "step": 8555 }, { "epoch": 0.3085018200165784, "grad_norm": 0.20121654868125916, "learning_rate": 4.9868526845867115e-05, "loss": 0.4565, "step": 8560 }, { "epoch": 0.308682019677803, "grad_norm": 0.17432835698127747, "learning_rate": 4.9868227796366566e-05, "loss": 0.4505, "step": 8565 }, { "epoch": 0.30886221933902763, "grad_norm": 0.2507334053516388, "learning_rate": 4.986792840804122e-05, "loss": 0.4353, "step": 8570 }, { "epoch": 0.30904241900025226, "grad_norm": 0.12277240306138992, "learning_rate": 4.986762868089517e-05, "loss": 0.4637, "step": 8575 }, { "epoch": 0.30922261866147693, "grad_norm": 
0.20205271244049072, "learning_rate": 4.98673286149325e-05, "loss": 0.4189, "step": 8580 }, { "epoch": 0.30940281832270156, "grad_norm": 0.16052600741386414, "learning_rate": 4.986702821015729e-05, "loss": 0.4499, "step": 8585 }, { "epoch": 0.3095830179839262, "grad_norm": 0.16914494335651398, "learning_rate": 4.9866727466573634e-05, "loss": 0.4447, "step": 8590 }, { "epoch": 0.3097632176451508, "grad_norm": 0.19733648002147675, "learning_rate": 4.986642638418563e-05, "loss": 0.4586, "step": 8595 }, { "epoch": 0.3099434173063755, "grad_norm": 0.19025246798992157, "learning_rate": 4.986612496299738e-05, "loss": 0.5077, "step": 8600 }, { "epoch": 0.3101236169676001, "grad_norm": 0.1786302626132965, "learning_rate": 4.986582320301299e-05, "loss": 0.4415, "step": 8605 }, { "epoch": 0.3103038166288247, "grad_norm": 0.18163448572158813, "learning_rate": 4.9865521104236575e-05, "loss": 0.465, "step": 8610 }, { "epoch": 0.31048401629004935, "grad_norm": 0.19647279381752014, "learning_rate": 4.986521866667225e-05, "loss": 0.4357, "step": 8615 }, { "epoch": 0.310664215951274, "grad_norm": 0.15463513135910034, "learning_rate": 4.9864915890324136e-05, "loss": 0.3975, "step": 8620 }, { "epoch": 0.31084441561249865, "grad_norm": 0.1965053528547287, "learning_rate": 4.986461277519635e-05, "loss": 0.463, "step": 8625 }, { "epoch": 0.31102461527372327, "grad_norm": 0.20269662141799927, "learning_rate": 4.9864309321293035e-05, "loss": 0.441, "step": 8630 }, { "epoch": 0.31120481493494795, "grad_norm": 0.2272135317325592, "learning_rate": 4.986400552861832e-05, "loss": 0.5056, "step": 8635 }, { "epoch": 0.31138501459617257, "grad_norm": 0.1746290922164917, "learning_rate": 4.986370139717634e-05, "loss": 0.4583, "step": 8640 }, { "epoch": 0.3115652142573972, "grad_norm": 0.17649775743484497, "learning_rate": 4.9863396926971245e-05, "loss": 0.4873, "step": 8645 }, { "epoch": 0.3117454139186218, "grad_norm": 0.185885488986969, "learning_rate": 4.9863092118007185e-05, "loss": 0.4692, 
"step": 8650 }, { "epoch": 0.3119256135798465, "grad_norm": 0.1990884244441986, "learning_rate": 4.98627869702883e-05, "loss": 0.4578, "step": 8655 }, { "epoch": 0.3121058132410711, "grad_norm": 0.22869646549224854, "learning_rate": 4.9862481483818755e-05, "loss": 0.4932, "step": 8660 }, { "epoch": 0.31228601290229574, "grad_norm": 0.1766314059495926, "learning_rate": 4.986217565860272e-05, "loss": 0.439, "step": 8665 }, { "epoch": 0.31246621256352036, "grad_norm": 0.14868828654289246, "learning_rate": 4.986186949464435e-05, "loss": 0.4357, "step": 8670 }, { "epoch": 0.31264641222474504, "grad_norm": 0.17688503861427307, "learning_rate": 4.986156299194783e-05, "loss": 0.4194, "step": 8675 }, { "epoch": 0.31282661188596966, "grad_norm": 0.215143084526062, "learning_rate": 4.9861256150517324e-05, "loss": 0.4795, "step": 8680 }, { "epoch": 0.3130068115471943, "grad_norm": 0.18795055150985718, "learning_rate": 4.9860948970357014e-05, "loss": 0.4408, "step": 8685 }, { "epoch": 0.3131870112084189, "grad_norm": 0.14594529569149017, "learning_rate": 4.986064145147108e-05, "loss": 0.4296, "step": 8690 }, { "epoch": 0.3133672108696436, "grad_norm": 0.19025376439094543, "learning_rate": 4.986033359386373e-05, "loss": 0.4736, "step": 8695 }, { "epoch": 0.3135474105308682, "grad_norm": 0.18264278769493103, "learning_rate": 4.986002539753915e-05, "loss": 0.4456, "step": 8700 }, { "epoch": 0.31372761019209283, "grad_norm": 0.14919918775558472, "learning_rate": 4.985971686250153e-05, "loss": 0.4644, "step": 8705 }, { "epoch": 0.31390780985331745, "grad_norm": 0.2042977660894394, "learning_rate": 4.985940798875508e-05, "loss": 0.4554, "step": 8710 }, { "epoch": 0.31408800951454213, "grad_norm": 0.1687832921743393, "learning_rate": 4.9859098776304015e-05, "loss": 0.4685, "step": 8715 }, { "epoch": 0.31426820917576676, "grad_norm": 0.19032564759254456, "learning_rate": 4.985878922515253e-05, "loss": 0.451, "step": 8720 }, { "epoch": 0.3144484088369914, "grad_norm": 
0.15537263453006744, "learning_rate": 4.985847933530486e-05, "loss": 0.4373, "step": 8725 }, { "epoch": 0.314628608498216, "grad_norm": 0.24856919050216675, "learning_rate": 4.985816910676523e-05, "loss": 0.5004, "step": 8730 }, { "epoch": 0.3148088081594407, "grad_norm": 0.18003836274147034, "learning_rate": 4.985785853953786e-05, "loss": 0.4703, "step": 8735 }, { "epoch": 0.3149890078206653, "grad_norm": 0.1739412099123001, "learning_rate": 4.9857547633626964e-05, "loss": 0.4538, "step": 8740 }, { "epoch": 0.3151692074818899, "grad_norm": 0.19691774249076843, "learning_rate": 4.985723638903681e-05, "loss": 0.4671, "step": 8745 }, { "epoch": 0.31534940714311455, "grad_norm": 0.2076934427022934, "learning_rate": 4.9856924805771614e-05, "loss": 0.4796, "step": 8750 }, { "epoch": 0.3155296068043392, "grad_norm": 0.16024084389209747, "learning_rate": 4.9856612883835633e-05, "loss": 0.4567, "step": 8755 }, { "epoch": 0.31570980646556385, "grad_norm": 0.23982550203800201, "learning_rate": 4.985630062323311e-05, "loss": 0.4944, "step": 8760 }, { "epoch": 0.31589000612678847, "grad_norm": 0.16979862749576569, "learning_rate": 4.9855988023968314e-05, "loss": 0.481, "step": 8765 }, { "epoch": 0.3160702057880131, "grad_norm": 0.19428572058677673, "learning_rate": 4.9855675086045486e-05, "loss": 0.4769, "step": 8770 }, { "epoch": 0.31625040544923777, "grad_norm": 0.21432949602603912, "learning_rate": 4.985536180946889e-05, "loss": 0.4535, "step": 8775 }, { "epoch": 0.3164306051104624, "grad_norm": 0.1915997415781021, "learning_rate": 4.9855048194242816e-05, "loss": 0.4451, "step": 8780 }, { "epoch": 0.316610804771687, "grad_norm": 0.18941251933574677, "learning_rate": 4.985473424037151e-05, "loss": 0.4671, "step": 8785 }, { "epoch": 0.3167910044329117, "grad_norm": 0.17575441300868988, "learning_rate": 4.985441994785927e-05, "loss": 0.4611, "step": 8790 }, { "epoch": 0.3169712040941363, "grad_norm": 0.1792481243610382, "learning_rate": 4.9854105316710364e-05, "loss": 0.4631, 
"step": 8795 }, { "epoch": 0.31715140375536094, "grad_norm": 0.20034508407115936, "learning_rate": 4.9853790346929096e-05, "loss": 0.4751, "step": 8800 }, { "epoch": 0.31733160341658556, "grad_norm": 0.21609006822109222, "learning_rate": 4.9853475038519736e-05, "loss": 0.516, "step": 8805 }, { "epoch": 0.31751180307781024, "grad_norm": 0.17463666200637817, "learning_rate": 4.9853159391486594e-05, "loss": 0.4715, "step": 8810 }, { "epoch": 0.31769200273903486, "grad_norm": 0.1856069266796112, "learning_rate": 4.9852843405833965e-05, "loss": 0.4599, "step": 8815 }, { "epoch": 0.3178722024002595, "grad_norm": 0.2578841745853424, "learning_rate": 4.985252708156616e-05, "loss": 0.4756, "step": 8820 }, { "epoch": 0.3180524020614841, "grad_norm": 0.19595186412334442, "learning_rate": 4.985221041868748e-05, "loss": 0.4905, "step": 8825 }, { "epoch": 0.3182326017227088, "grad_norm": 0.1569208949804306, "learning_rate": 4.9851893417202247e-05, "loss": 0.4481, "step": 8830 }, { "epoch": 0.3184128013839334, "grad_norm": 0.1994239240884781, "learning_rate": 4.9851576077114784e-05, "loss": 0.4628, "step": 8835 }, { "epoch": 0.31859300104515803, "grad_norm": 0.15360099077224731, "learning_rate": 4.98512583984294e-05, "loss": 0.4508, "step": 8840 }, { "epoch": 0.31877320070638265, "grad_norm": 0.16948221623897552, "learning_rate": 4.985094038115043e-05, "loss": 0.4559, "step": 8845 }, { "epoch": 0.31895340036760733, "grad_norm": 0.1563452035188675, "learning_rate": 4.985062202528221e-05, "loss": 0.4163, "step": 8850 }, { "epoch": 0.31913360002883195, "grad_norm": 0.20190612971782684, "learning_rate": 4.985030333082908e-05, "loss": 0.4037, "step": 8855 }, { "epoch": 0.3193137996900566, "grad_norm": 0.17092286050319672, "learning_rate": 4.984998429779538e-05, "loss": 0.4339, "step": 8860 }, { "epoch": 0.3194939993512812, "grad_norm": 0.1695345640182495, "learning_rate": 4.9849664926185445e-05, "loss": 0.4485, "step": 8865 }, { "epoch": 0.3196741990125059, "grad_norm": 
0.17635978758335114, "learning_rate": 4.9849345216003654e-05, "loss": 0.4437, "step": 8870 }, { "epoch": 0.3198543986737305, "grad_norm": 0.17225243151187897, "learning_rate": 4.9849025167254324e-05, "loss": 0.4684, "step": 8875 }, { "epoch": 0.3200345983349551, "grad_norm": 0.21078833937644958, "learning_rate": 4.984870477994186e-05, "loss": 0.473, "step": 8880 }, { "epoch": 0.32021479799617975, "grad_norm": 0.16367143392562866, "learning_rate": 4.9848384054070584e-05, "loss": 0.4629, "step": 8885 }, { "epoch": 0.3203949976574044, "grad_norm": 0.1702301949262619, "learning_rate": 4.9848062989644894e-05, "loss": 0.445, "step": 8890 }, { "epoch": 0.32057519731862905, "grad_norm": 0.1748560667037964, "learning_rate": 4.984774158666916e-05, "loss": 0.4815, "step": 8895 }, { "epoch": 0.32075539697985367, "grad_norm": 0.20928102731704712, "learning_rate": 4.9847419845147755e-05, "loss": 0.4564, "step": 8900 }, { "epoch": 0.3209355966410783, "grad_norm": 0.28013554215431213, "learning_rate": 4.984709776508506e-05, "loss": 0.473, "step": 8905 }, { "epoch": 0.32111579630230297, "grad_norm": 0.14769336581230164, "learning_rate": 4.984677534648548e-05, "loss": 0.4582, "step": 8910 }, { "epoch": 0.3212959959635276, "grad_norm": 0.2364906519651413, "learning_rate": 4.984645258935339e-05, "loss": 0.468, "step": 8915 }, { "epoch": 0.3214761956247522, "grad_norm": 0.13622739911079407, "learning_rate": 4.9846129493693183e-05, "loss": 0.4214, "step": 8920 }, { "epoch": 0.32165639528597684, "grad_norm": 0.2001640349626541, "learning_rate": 4.984580605950929e-05, "loss": 0.4832, "step": 8925 }, { "epoch": 0.3218365949472015, "grad_norm": 0.18895497918128967, "learning_rate": 4.984548228680609e-05, "loss": 0.498, "step": 8930 }, { "epoch": 0.32201679460842614, "grad_norm": 0.18323996663093567, "learning_rate": 4.9845158175588006e-05, "loss": 0.4827, "step": 8935 }, { "epoch": 0.32219699426965076, "grad_norm": 0.16754454374313354, "learning_rate": 4.9844833725859454e-05, "loss": 
0.4472, "step": 8940 }, { "epoch": 0.3223771939308754, "grad_norm": 0.1655125617980957, "learning_rate": 4.9844508937624844e-05, "loss": 0.4637, "step": 8945 }, { "epoch": 0.32255739359210006, "grad_norm": 0.20413149893283844, "learning_rate": 4.984418381088862e-05, "loss": 0.444, "step": 8950 }, { "epoch": 0.3227375932533247, "grad_norm": 0.16138513386249542, "learning_rate": 4.98438583456552e-05, "loss": 0.4388, "step": 8955 }, { "epoch": 0.3229177929145493, "grad_norm": 0.194508358836174, "learning_rate": 4.9843532541929016e-05, "loss": 0.4823, "step": 8960 }, { "epoch": 0.323097992575774, "grad_norm": 0.19578173756599426, "learning_rate": 4.9843206399714516e-05, "loss": 0.4623, "step": 8965 }, { "epoch": 0.3232781922369986, "grad_norm": 0.19344592094421387, "learning_rate": 4.984287991901613e-05, "loss": 0.4333, "step": 8970 }, { "epoch": 0.32345839189822323, "grad_norm": 0.20126448571681976, "learning_rate": 4.9842553099838324e-05, "loss": 0.4837, "step": 8975 }, { "epoch": 0.32363859155944785, "grad_norm": 0.1657496839761734, "learning_rate": 4.9842225942185536e-05, "loss": 0.4983, "step": 8980 }, { "epoch": 0.32381879122067253, "grad_norm": 0.14652882516384125, "learning_rate": 4.984189844606223e-05, "loss": 0.4724, "step": 8985 }, { "epoch": 0.32399899088189715, "grad_norm": 0.15776591002941132, "learning_rate": 4.984157061147287e-05, "loss": 0.4545, "step": 8990 }, { "epoch": 0.3241791905431218, "grad_norm": 0.1731184720993042, "learning_rate": 4.984124243842192e-05, "loss": 0.4698, "step": 8995 }, { "epoch": 0.3243593902043464, "grad_norm": 0.16583359241485596, "learning_rate": 4.984091392691385e-05, "loss": 0.4673, "step": 9000 }, { "epoch": 0.3243593902043464, "eval_loss": 0.48290419578552246, "eval_runtime": 3.5289, "eval_samples_per_second": 28.337, "eval_steps_per_second": 7.084, "step": 9000 }, { "epoch": 0.3245395898655711, "grad_norm": 0.22497062385082245, "learning_rate": 4.984058507695314e-05, "loss": 0.4875, "step": 9005 }, { "epoch": 
0.3247197895267957, "grad_norm": 0.1878102570772171, "learning_rate": 4.9840255888544265e-05, "loss": 0.4555, "step": 9010 }, { "epoch": 0.3248999891880203, "grad_norm": 0.2143104523420334, "learning_rate": 4.983992636169171e-05, "loss": 0.4425, "step": 9015 }, { "epoch": 0.32508018884924494, "grad_norm": 0.18537423014640808, "learning_rate": 4.9839596496399964e-05, "loss": 0.4809, "step": 9020 }, { "epoch": 0.3252603885104696, "grad_norm": 0.1459311693906784, "learning_rate": 4.983926629267353e-05, "loss": 0.4394, "step": 9025 }, { "epoch": 0.32544058817169425, "grad_norm": 0.16225488483905792, "learning_rate": 4.98389357505169e-05, "loss": 0.4756, "step": 9030 }, { "epoch": 0.32562078783291887, "grad_norm": 0.21932868659496307, "learning_rate": 4.983860486993458e-05, "loss": 0.4869, "step": 9035 }, { "epoch": 0.3258009874941435, "grad_norm": 0.17709140479564667, "learning_rate": 4.983827365093109e-05, "loss": 0.4538, "step": 9040 }, { "epoch": 0.32598118715536817, "grad_norm": 0.17748679220676422, "learning_rate": 4.9837942093510914e-05, "loss": 0.4759, "step": 9045 }, { "epoch": 0.3261613868165928, "grad_norm": 0.17787578701972961, "learning_rate": 4.983761019767859e-05, "loss": 0.4355, "step": 9050 }, { "epoch": 0.3263415864778174, "grad_norm": 0.21222223341464996, "learning_rate": 4.983727796343864e-05, "loss": 0.4351, "step": 9055 }, { "epoch": 0.32652178613904204, "grad_norm": 0.18843764066696167, "learning_rate": 4.983694539079558e-05, "loss": 0.4608, "step": 9060 }, { "epoch": 0.3267019858002667, "grad_norm": 0.2049497812986374, "learning_rate": 4.9836612479753955e-05, "loss": 0.468, "step": 9065 }, { "epoch": 0.32688218546149134, "grad_norm": 0.1821064054965973, "learning_rate": 4.9836279230318286e-05, "loss": 0.4842, "step": 9070 }, { "epoch": 0.32706238512271596, "grad_norm": 0.18487945199012756, "learning_rate": 4.983594564249312e-05, "loss": 0.4394, "step": 9075 }, { "epoch": 0.3272425847839406, "grad_norm": 0.19593387842178345, "learning_rate": 
4.9835611716283015e-05, "loss": 0.4812, "step": 9080 }, { "epoch": 0.32742278444516526, "grad_norm": 0.1479068547487259, "learning_rate": 4.98352774516925e-05, "loss": 0.4608, "step": 9085 }, { "epoch": 0.3276029841063899, "grad_norm": 0.21857762336730957, "learning_rate": 4.983494284872614e-05, "loss": 0.4715, "step": 9090 }, { "epoch": 0.3277831837676145, "grad_norm": 0.18269813060760498, "learning_rate": 4.983460790738849e-05, "loss": 0.4397, "step": 9095 }, { "epoch": 0.32796338342883913, "grad_norm": 0.1688721776008606, "learning_rate": 4.983427262768411e-05, "loss": 0.4727, "step": 9100 }, { "epoch": 0.3281435830900638, "grad_norm": 0.2021578699350357, "learning_rate": 4.983393700961758e-05, "loss": 0.4429, "step": 9105 }, { "epoch": 0.32832378275128843, "grad_norm": 0.16843554377555847, "learning_rate": 4.9833601053193465e-05, "loss": 0.4566, "step": 9110 }, { "epoch": 0.32850398241251305, "grad_norm": 0.19692714512348175, "learning_rate": 4.983326475841635e-05, "loss": 0.4815, "step": 9115 }, { "epoch": 0.3286841820737377, "grad_norm": 0.20175185799598694, "learning_rate": 4.983292812529081e-05, "loss": 0.4429, "step": 9120 }, { "epoch": 0.32886438173496235, "grad_norm": 0.16880235075950623, "learning_rate": 4.9832591153821424e-05, "loss": 0.4417, "step": 9125 }, { "epoch": 0.329044581396187, "grad_norm": 0.1737411469221115, "learning_rate": 4.983225384401279e-05, "loss": 0.4304, "step": 9130 }, { "epoch": 0.3292247810574116, "grad_norm": 0.20997703075408936, "learning_rate": 4.983191619586951e-05, "loss": 0.4775, "step": 9135 }, { "epoch": 0.3294049807186363, "grad_norm": 0.20304125547409058, "learning_rate": 4.9831578209396186e-05, "loss": 0.4416, "step": 9140 }, { "epoch": 0.3295851803798609, "grad_norm": 0.22506339848041534, "learning_rate": 4.9831239884597407e-05, "loss": 0.531, "step": 9145 }, { "epoch": 0.3297653800410855, "grad_norm": 0.16725526750087738, "learning_rate": 4.983090122147779e-05, "loss": 0.4536, "step": 9150 }, { "epoch": 
0.32994557970231014, "grad_norm": 0.1725703775882721, "learning_rate": 4.983056222004196e-05, "loss": 0.4525, "step": 9155 }, { "epoch": 0.3301257793635348, "grad_norm": 0.13673150539398193, "learning_rate": 4.9830222880294525e-05, "loss": 0.4515, "step": 9160 }, { "epoch": 0.33030597902475944, "grad_norm": 0.19130565226078033, "learning_rate": 4.982988320224011e-05, "loss": 0.4683, "step": 9165 }, { "epoch": 0.33048617868598407, "grad_norm": 0.2034883350133896, "learning_rate": 4.9829543185883344e-05, "loss": 0.4899, "step": 9170 }, { "epoch": 0.3306663783472087, "grad_norm": 0.22771266102790833, "learning_rate": 4.982920283122885e-05, "loss": 0.4736, "step": 9175 }, { "epoch": 0.33084657800843337, "grad_norm": 0.18065966665744781, "learning_rate": 4.982886213828128e-05, "loss": 0.4542, "step": 9180 }, { "epoch": 0.331026777669658, "grad_norm": 0.1368507593870163, "learning_rate": 4.9828521107045276e-05, "loss": 0.4624, "step": 9185 }, { "epoch": 0.3312069773308826, "grad_norm": 0.16015149652957916, "learning_rate": 4.982817973752548e-05, "loss": 0.4361, "step": 9190 }, { "epoch": 0.33138717699210724, "grad_norm": 0.19293028116226196, "learning_rate": 4.9827838029726535e-05, "loss": 0.4403, "step": 9195 }, { "epoch": 0.3315673766533319, "grad_norm": 0.18895958364009857, "learning_rate": 4.9827495983653104e-05, "loss": 0.4999, "step": 9200 }, { "epoch": 0.33174757631455654, "grad_norm": 0.16563822329044342, "learning_rate": 4.982715359930985e-05, "loss": 0.436, "step": 9205 }, { "epoch": 0.33192777597578116, "grad_norm": 0.21397612988948822, "learning_rate": 4.982681087670144e-05, "loss": 0.4835, "step": 9210 }, { "epoch": 0.3321079756370058, "grad_norm": 0.18013893067836761, "learning_rate": 4.982646781583252e-05, "loss": 0.4622, "step": 9215 }, { "epoch": 0.33228817529823046, "grad_norm": 0.19024062156677246, "learning_rate": 4.98261244167078e-05, "loss": 0.4419, "step": 9220 }, { "epoch": 0.3324683749594551, "grad_norm": 0.18336628377437592, "learning_rate": 
4.9825780679331935e-05, "loss": 0.4525, "step": 9225 }, { "epoch": 0.3326485746206797, "grad_norm": 0.17188718914985657, "learning_rate": 4.982543660370962e-05, "loss": 0.4737, "step": 9230 }, { "epoch": 0.3328287742819043, "grad_norm": 0.1877148300409317, "learning_rate": 4.982509218984553e-05, "loss": 0.4557, "step": 9235 }, { "epoch": 0.333008973943129, "grad_norm": 0.2199414074420929, "learning_rate": 4.982474743774437e-05, "loss": 0.4892, "step": 9240 }, { "epoch": 0.33318917360435363, "grad_norm": 0.17190028727054596, "learning_rate": 4.982440234741082e-05, "loss": 0.487, "step": 9245 }, { "epoch": 0.33336937326557825, "grad_norm": 0.2042684108018875, "learning_rate": 4.9824056918849614e-05, "loss": 0.4648, "step": 9250 }, { "epoch": 0.3335495729268029, "grad_norm": 0.15054477751255035, "learning_rate": 4.9823711152065425e-05, "loss": 0.4622, "step": 9255 }, { "epoch": 0.33372977258802755, "grad_norm": 0.20531079173088074, "learning_rate": 4.9823365047062986e-05, "loss": 0.466, "step": 9260 }, { "epoch": 0.3339099722492522, "grad_norm": 0.17524632811546326, "learning_rate": 4.9823018603847e-05, "loss": 0.4271, "step": 9265 }, { "epoch": 0.3340901719104768, "grad_norm": 0.21310240030288696, "learning_rate": 4.9822671822422195e-05, "loss": 0.4699, "step": 9270 }, { "epoch": 0.3342703715717014, "grad_norm": 0.19141936302185059, "learning_rate": 4.982232470279329e-05, "loss": 0.4508, "step": 9275 }, { "epoch": 0.3344505712329261, "grad_norm": 0.21620051562786102, "learning_rate": 4.9821977244965014e-05, "loss": 0.502, "step": 9280 }, { "epoch": 0.3346307708941507, "grad_norm": 0.16024553775787354, "learning_rate": 4.98216294489421e-05, "loss": 0.442, "step": 9285 }, { "epoch": 0.33481097055537534, "grad_norm": 0.21437714993953705, "learning_rate": 4.982128131472929e-05, "loss": 0.4775, "step": 9290 }, { "epoch": 0.3349911702166, "grad_norm": 0.18475022912025452, "learning_rate": 4.982093284233134e-05, "loss": 0.4903, "step": 9295 }, { "epoch": 
0.33517136987782464, "grad_norm": 0.13329213857650757, "learning_rate": 4.982058403175298e-05, "loss": 0.443, "step": 9300 }, { "epoch": 0.33535156953904927, "grad_norm": 0.16213421523571014, "learning_rate": 4.982023488299897e-05, "loss": 0.4502, "step": 9305 }, { "epoch": 0.3355317692002739, "grad_norm": 0.19696162641048431, "learning_rate": 4.981988539607406e-05, "loss": 0.4939, "step": 9310 }, { "epoch": 0.33571196886149857, "grad_norm": 0.153734490275383, "learning_rate": 4.981953557098302e-05, "loss": 0.4557, "step": 9315 }, { "epoch": 0.3358921685227232, "grad_norm": 0.16626779735088348, "learning_rate": 4.981918540773061e-05, "loss": 0.4774, "step": 9320 }, { "epoch": 0.3360723681839478, "grad_norm": 0.2404673844575882, "learning_rate": 4.981883490632161e-05, "loss": 0.4717, "step": 9325 }, { "epoch": 0.33625256784517243, "grad_norm": 0.19677095115184784, "learning_rate": 4.9818484066760786e-05, "loss": 0.4442, "step": 9330 }, { "epoch": 0.3364327675063971, "grad_norm": 0.17953583598136902, "learning_rate": 4.9818132889052914e-05, "loss": 0.4613, "step": 9335 }, { "epoch": 0.33661296716762173, "grad_norm": 0.19360299408435822, "learning_rate": 4.98177813732028e-05, "loss": 0.4416, "step": 9340 }, { "epoch": 0.33679316682884636, "grad_norm": 0.16744250059127808, "learning_rate": 4.9817429519215206e-05, "loss": 0.454, "step": 9345 }, { "epoch": 0.336973366490071, "grad_norm": 0.15772564709186554, "learning_rate": 4.981707732709495e-05, "loss": 0.4651, "step": 9350 }, { "epoch": 0.33715356615129566, "grad_norm": 0.16711197793483734, "learning_rate": 4.9816724796846814e-05, "loss": 0.4269, "step": 9355 }, { "epoch": 0.3373337658125203, "grad_norm": 0.14946970343589783, "learning_rate": 4.981637192847561e-05, "loss": 0.4646, "step": 9360 }, { "epoch": 0.3375139654737449, "grad_norm": 0.20291027426719666, "learning_rate": 4.9816018721986145e-05, "loss": 0.4598, "step": 9365 }, { "epoch": 0.3376941651349695, "grad_norm": 0.1772489696741104, "learning_rate": 
4.981566517738323e-05, "loss": 0.4794, "step": 9370 }, { "epoch": 0.3378743647961942, "grad_norm": 0.1733308732509613, "learning_rate": 4.981531129467168e-05, "loss": 0.4581, "step": 9375 }, { "epoch": 0.3380545644574188, "grad_norm": 0.13439474999904633, "learning_rate": 4.981495707385632e-05, "loss": 0.4519, "step": 9380 }, { "epoch": 0.33823476411864345, "grad_norm": 0.15566691756248474, "learning_rate": 4.9814602514941965e-05, "loss": 0.452, "step": 9385 }, { "epoch": 0.33841496377986807, "grad_norm": 0.1827496588230133, "learning_rate": 4.981424761793346e-05, "loss": 0.4634, "step": 9390 }, { "epoch": 0.33859516344109275, "grad_norm": 0.16756999492645264, "learning_rate": 4.9813892382835635e-05, "loss": 0.485, "step": 9395 }, { "epoch": 0.3387753631023174, "grad_norm": 0.17952053248882294, "learning_rate": 4.981353680965334e-05, "loss": 0.4769, "step": 9400 }, { "epoch": 0.338955562763542, "grad_norm": 0.14793772995471954, "learning_rate": 4.98131808983914e-05, "loss": 0.4435, "step": 9405 }, { "epoch": 0.3391357624247666, "grad_norm": 0.19632504880428314, "learning_rate": 4.9812824649054674e-05, "loss": 0.3992, "step": 9410 }, { "epoch": 0.3393159620859913, "grad_norm": 0.19574934244155884, "learning_rate": 4.9812468061648024e-05, "loss": 0.489, "step": 9415 }, { "epoch": 0.3394961617472159, "grad_norm": 0.22474491596221924, "learning_rate": 4.981211113617629e-05, "loss": 0.4526, "step": 9420 }, { "epoch": 0.33967636140844054, "grad_norm": 0.18208986520767212, "learning_rate": 4.981175387264435e-05, "loss": 0.4648, "step": 9425 }, { "epoch": 0.33985656106966516, "grad_norm": 0.16022849082946777, "learning_rate": 4.9811396271057067e-05, "loss": 0.4527, "step": 9430 }, { "epoch": 0.34003676073088984, "grad_norm": 0.21129754185676575, "learning_rate": 4.981103833141931e-05, "loss": 0.4382, "step": 9435 }, { "epoch": 0.34021696039211446, "grad_norm": 0.20417428016662598, "learning_rate": 4.981068005373597e-05, "loss": 0.4474, "step": 9440 }, { "epoch": 
0.3403971600533391, "grad_norm": 0.21465253829956055, "learning_rate": 4.981032143801191e-05, "loss": 0.4521, "step": 9445 }, { "epoch": 0.3405773597145637, "grad_norm": 0.19207169115543365, "learning_rate": 4.980996248425202e-05, "loss": 0.456, "step": 9450 }, { "epoch": 0.3407575593757884, "grad_norm": 0.1812460720539093, "learning_rate": 4.98096031924612e-05, "loss": 0.469, "step": 9455 }, { "epoch": 0.340937759037013, "grad_norm": 0.21918563544750214, "learning_rate": 4.9809243562644334e-05, "loss": 0.4829, "step": 9460 }, { "epoch": 0.34111795869823763, "grad_norm": 0.18760140240192413, "learning_rate": 4.980888359480634e-05, "loss": 0.4243, "step": 9465 }, { "epoch": 0.3412981583594623, "grad_norm": 0.1885528415441513, "learning_rate": 4.98085232889521e-05, "loss": 0.4846, "step": 9470 }, { "epoch": 0.34147835802068693, "grad_norm": 0.16368678212165833, "learning_rate": 4.980816264508654e-05, "loss": 0.4488, "step": 9475 }, { "epoch": 0.34165855768191156, "grad_norm": 0.21680162847042084, "learning_rate": 4.980780166321456e-05, "loss": 0.4932, "step": 9480 }, { "epoch": 0.3418387573431362, "grad_norm": 0.19433487951755524, "learning_rate": 4.9807440343341095e-05, "loss": 0.4472, "step": 9485 }, { "epoch": 0.34201895700436086, "grad_norm": 0.16287468373775482, "learning_rate": 4.980707868547105e-05, "loss": 0.4767, "step": 9490 }, { "epoch": 0.3421991566655855, "grad_norm": 0.15700995922088623, "learning_rate": 4.9806716689609356e-05, "loss": 0.4508, "step": 9495 }, { "epoch": 0.3423793563268101, "grad_norm": 0.2554052472114563, "learning_rate": 4.980635435576096e-05, "loss": 0.4737, "step": 9500 }, { "epoch": 0.3423793563268101, "eval_loss": 0.4815308451652527, "eval_runtime": 3.5226, "eval_samples_per_second": 28.388, "eval_steps_per_second": 7.097, "step": 9500 }, { "epoch": 0.3425595559880347, "grad_norm": 0.21748638153076172, "learning_rate": 4.980599168393079e-05, "loss": 0.4881, "step": 9505 }, { "epoch": 0.3427397556492594, "grad_norm": 
0.18842586874961853, "learning_rate": 4.9805628674123774e-05, "loss": 0.4897, "step": 9510 }, { "epoch": 0.342919955310484, "grad_norm": 0.1802000105381012, "learning_rate": 4.9805265326344874e-05, "loss": 0.4584, "step": 9515 }, { "epoch": 0.34310015497170865, "grad_norm": 0.1921650469303131, "learning_rate": 4.980490164059904e-05, "loss": 0.4693, "step": 9520 }, { "epoch": 0.34328035463293327, "grad_norm": 0.1768704205751419, "learning_rate": 4.9804537616891224e-05, "loss": 0.4649, "step": 9525 }, { "epoch": 0.34346055429415795, "grad_norm": 0.1596553921699524, "learning_rate": 4.980417325522638e-05, "loss": 0.4141, "step": 9530 }, { "epoch": 0.34364075395538257, "grad_norm": 0.18743056058883667, "learning_rate": 4.980380855560949e-05, "loss": 0.4305, "step": 9535 }, { "epoch": 0.3438209536166072, "grad_norm": 0.16719353199005127, "learning_rate": 4.98034435180455e-05, "loss": 0.4731, "step": 9540 }, { "epoch": 0.3440011532778318, "grad_norm": 0.17522072792053223, "learning_rate": 4.980307814253939e-05, "loss": 0.4487, "step": 9545 }, { "epoch": 0.3441813529390565, "grad_norm": 0.15065240859985352, "learning_rate": 4.9802712429096154e-05, "loss": 0.4602, "step": 9550 }, { "epoch": 0.3443615526002811, "grad_norm": 0.19820521771907806, "learning_rate": 4.980234637772075e-05, "loss": 0.4411, "step": 9555 }, { "epoch": 0.34454175226150574, "grad_norm": 0.20624442398548126, "learning_rate": 4.980197998841819e-05, "loss": 0.4873, "step": 9560 }, { "epoch": 0.34472195192273036, "grad_norm": 0.18093755841255188, "learning_rate": 4.9801613261193455e-05, "loss": 0.46, "step": 9565 }, { "epoch": 0.34490215158395504, "grad_norm": 0.17363637685775757, "learning_rate": 4.9801246196051535e-05, "loss": 0.4306, "step": 9570 }, { "epoch": 0.34508235124517966, "grad_norm": 0.1665421426296234, "learning_rate": 4.980087879299744e-05, "loss": 0.4478, "step": 9575 }, { "epoch": 0.3452625509064043, "grad_norm": 0.18756218254566193, "learning_rate": 4.980051105203617e-05, "loss": 0.5001, 
"step": 9580 }, { "epoch": 0.3454427505676289, "grad_norm": 0.16964589059352875, "learning_rate": 4.980014297317274e-05, "loss": 0.4586, "step": 9585 }, { "epoch": 0.3456229502288536, "grad_norm": 0.24527285993099213, "learning_rate": 4.979977455641217e-05, "loss": 0.456, "step": 9590 }, { "epoch": 0.3458031498900782, "grad_norm": 0.22246739268302917, "learning_rate": 4.9799405801759466e-05, "loss": 0.4951, "step": 9595 }, { "epoch": 0.34598334955130283, "grad_norm": 0.18261441588401794, "learning_rate": 4.979903670921966e-05, "loss": 0.4364, "step": 9600 }, { "epoch": 0.34616354921252745, "grad_norm": 0.15440581738948822, "learning_rate": 4.9798667278797775e-05, "loss": 0.4383, "step": 9605 }, { "epoch": 0.34634374887375213, "grad_norm": 0.15516167879104614, "learning_rate": 4.979829751049886e-05, "loss": 0.4524, "step": 9610 }, { "epoch": 0.34652394853497676, "grad_norm": 0.13671459257602692, "learning_rate": 4.979792740432794e-05, "loss": 0.4862, "step": 9615 }, { "epoch": 0.3467041481962014, "grad_norm": 0.155983105301857, "learning_rate": 4.9797556960290047e-05, "loss": 0.4566, "step": 9620 }, { "epoch": 0.346884347857426, "grad_norm": 0.14357610046863556, "learning_rate": 4.9797186178390255e-05, "loss": 0.4731, "step": 9625 }, { "epoch": 0.3470645475186507, "grad_norm": 0.19745934009552002, "learning_rate": 4.97968150586336e-05, "loss": 0.5, "step": 9630 }, { "epoch": 0.3472447471798753, "grad_norm": 0.19887088239192963, "learning_rate": 4.9796443601025144e-05, "loss": 0.43, "step": 9635 }, { "epoch": 0.3474249468410999, "grad_norm": 0.17473819851875305, "learning_rate": 4.9796071805569936e-05, "loss": 0.4736, "step": 9640 }, { "epoch": 0.3476051465023246, "grad_norm": 0.17900219559669495, "learning_rate": 4.9795699672273054e-05, "loss": 0.4633, "step": 9645 }, { "epoch": 0.3477853461635492, "grad_norm": 0.21898692846298218, "learning_rate": 4.979532720113956e-05, "loss": 0.4815, "step": 9650 }, { "epoch": 0.34796554582477385, "grad_norm": 
0.18762876093387604, "learning_rate": 4.979495439217454e-05, "loss": 0.4904, "step": 9655 }, { "epoch": 0.34814574548599847, "grad_norm": 0.2149989753961563, "learning_rate": 4.9794581245383074e-05, "loss": 0.468, "step": 9660 }, { "epoch": 0.34832594514722315, "grad_norm": 0.1882586032152176, "learning_rate": 4.979420776077023e-05, "loss": 0.4934, "step": 9665 }, { "epoch": 0.34850614480844777, "grad_norm": 0.19263622164726257, "learning_rate": 4.979383393834111e-05, "loss": 0.4851, "step": 9670 }, { "epoch": 0.3486863444696724, "grad_norm": 0.190067321062088, "learning_rate": 4.9793459778100794e-05, "loss": 0.4344, "step": 9675 }, { "epoch": 0.348866544130897, "grad_norm": 0.16212069988250732, "learning_rate": 4.979308528005439e-05, "loss": 0.4459, "step": 9680 }, { "epoch": 0.3490467437921217, "grad_norm": 0.20930610597133636, "learning_rate": 4.9792710444207004e-05, "loss": 0.4726, "step": 9685 }, { "epoch": 0.3492269434533463, "grad_norm": 0.20407001674175262, "learning_rate": 4.979233527056374e-05, "loss": 0.47, "step": 9690 }, { "epoch": 0.34940714311457094, "grad_norm": 0.16390258073806763, "learning_rate": 4.9791959759129706e-05, "loss": 0.4299, "step": 9695 }, { "epoch": 0.34958734277579556, "grad_norm": 0.18050611019134521, "learning_rate": 4.979158390991002e-05, "loss": 0.4971, "step": 9700 }, { "epoch": 0.34976754243702024, "grad_norm": 0.172181636095047, "learning_rate": 4.9791207722909794e-05, "loss": 0.4547, "step": 9705 }, { "epoch": 0.34994774209824486, "grad_norm": 0.12145093083381653, "learning_rate": 4.9790831198134175e-05, "loss": 0.471, "step": 9710 }, { "epoch": 0.3501279417594695, "grad_norm": 0.17209351062774658, "learning_rate": 4.979045433558828e-05, "loss": 0.4392, "step": 9715 }, { "epoch": 0.3503081414206941, "grad_norm": 0.21002522110939026, "learning_rate": 4.979007713527723e-05, "loss": 0.4976, "step": 9720 }, { "epoch": 0.3504883410819188, "grad_norm": 0.16209115087985992, "learning_rate": 4.9789699597206196e-05, "loss": 0.4635, 
"step": 9725 }, { "epoch": 0.3506685407431434, "grad_norm": 0.14901262521743774, "learning_rate": 4.97893217213803e-05, "loss": 0.4632, "step": 9730 }, { "epoch": 0.35084874040436803, "grad_norm": 0.16062456369400024, "learning_rate": 4.9788943507804686e-05, "loss": 0.4482, "step": 9735 }, { "epoch": 0.35102894006559265, "grad_norm": 0.14877210557460785, "learning_rate": 4.9788564956484527e-05, "loss": 0.4823, "step": 9740 }, { "epoch": 0.35120913972681733, "grad_norm": 0.1947886049747467, "learning_rate": 4.978818606742496e-05, "loss": 0.4606, "step": 9745 }, { "epoch": 0.35138933938804195, "grad_norm": 0.18804532289505005, "learning_rate": 4.978780684063116e-05, "loss": 0.4878, "step": 9750 }, { "epoch": 0.3515695390492666, "grad_norm": 0.179021954536438, "learning_rate": 4.97874272761083e-05, "loss": 0.4619, "step": 9755 }, { "epoch": 0.3517497387104912, "grad_norm": 0.1584351509809494, "learning_rate": 4.978704737386153e-05, "loss": 0.4884, "step": 9760 }, { "epoch": 0.3519299383717159, "grad_norm": 0.18059109151363373, "learning_rate": 4.9786667133896046e-05, "loss": 0.4296, "step": 9765 }, { "epoch": 0.3521101380329405, "grad_norm": 0.16413117945194244, "learning_rate": 4.978628655621702e-05, "loss": 0.4785, "step": 9770 }, { "epoch": 0.3522903376941651, "grad_norm": 0.17627866566181183, "learning_rate": 4.9785905640829635e-05, "loss": 0.4539, "step": 9775 }, { "epoch": 0.35247053735538975, "grad_norm": 0.19507911801338196, "learning_rate": 4.978552438773909e-05, "loss": 0.4753, "step": 9780 }, { "epoch": 0.3526507370166144, "grad_norm": 0.23072102665901184, "learning_rate": 4.9785142796950566e-05, "loss": 0.4645, "step": 9785 }, { "epoch": 0.35283093667783905, "grad_norm": 0.16471746563911438, "learning_rate": 4.978476086846928e-05, "loss": 0.4756, "step": 9790 }, { "epoch": 0.35301113633906367, "grad_norm": 0.17672182619571686, "learning_rate": 4.978437860230042e-05, "loss": 0.459, "step": 9795 }, { "epoch": 0.3531913360002883, "grad_norm": 
0.17026053369045258, "learning_rate": 4.97839959984492e-05, "loss": 0.4961, "step": 9800 }, { "epoch": 0.35337153566151297, "grad_norm": 0.19481323659420013, "learning_rate": 4.978361305692083e-05, "loss": 0.4534, "step": 9805 }, { "epoch": 0.3535517353227376, "grad_norm": 0.17143213748931885, "learning_rate": 4.978322977772053e-05, "loss": 0.4726, "step": 9810 }, { "epoch": 0.3537319349839622, "grad_norm": 0.20486843585968018, "learning_rate": 4.978284616085352e-05, "loss": 0.5067, "step": 9815 }, { "epoch": 0.3539121346451869, "grad_norm": 0.15131095051765442, "learning_rate": 4.9782462206325045e-05, "loss": 0.4922, "step": 9820 }, { "epoch": 0.3540923343064115, "grad_norm": 0.20476940274238586, "learning_rate": 4.978207791414031e-05, "loss": 0.47, "step": 9825 }, { "epoch": 0.35427253396763614, "grad_norm": 0.16714629530906677, "learning_rate": 4.978169328430456e-05, "loss": 0.454, "step": 9830 }, { "epoch": 0.35445273362886076, "grad_norm": 0.18224307894706726, "learning_rate": 4.978130831682304e-05, "loss": 0.4527, "step": 9835 }, { "epoch": 0.35463293329008544, "grad_norm": 0.1600581556558609, "learning_rate": 4.978092301170099e-05, "loss": 0.4877, "step": 9840 }, { "epoch": 0.35481313295131006, "grad_norm": 0.21684806048870087, "learning_rate": 4.9780537368943655e-05, "loss": 0.4997, "step": 9845 }, { "epoch": 0.3549933326125347, "grad_norm": 0.15424644947052002, "learning_rate": 4.978015138855631e-05, "loss": 0.4398, "step": 9850 }, { "epoch": 0.3551735322737593, "grad_norm": 0.16304361820220947, "learning_rate": 4.9779765070544195e-05, "loss": 0.4756, "step": 9855 }, { "epoch": 0.355353731934984, "grad_norm": 0.1486636996269226, "learning_rate": 4.977937841491257e-05, "loss": 0.4152, "step": 9860 }, { "epoch": 0.3555339315962086, "grad_norm": 0.1545153707265854, "learning_rate": 4.9778991421666724e-05, "loss": 0.447, "step": 9865 }, { "epoch": 0.35571413125743323, "grad_norm": 0.14595122635364532, "learning_rate": 4.977860409081191e-05, "loss": 0.4589, 
"step": 9870 }, { "epoch": 0.35589433091865785, "grad_norm": 0.17412835359573364, "learning_rate": 4.977821642235341e-05, "loss": 0.4367, "step": 9875 }, { "epoch": 0.35607453057988253, "grad_norm": 0.20736642181873322, "learning_rate": 4.9777828416296513e-05, "loss": 0.4574, "step": 9880 }, { "epoch": 0.35625473024110715, "grad_norm": 0.20509152114391327, "learning_rate": 4.9777440072646504e-05, "loss": 0.4694, "step": 9885 }, { "epoch": 0.3564349299023318, "grad_norm": 0.1846247762441635, "learning_rate": 4.977705139140867e-05, "loss": 0.4674, "step": 9890 }, { "epoch": 0.3566151295635564, "grad_norm": 0.2067510038614273, "learning_rate": 4.97766623725883e-05, "loss": 0.4276, "step": 9895 }, { "epoch": 0.3567953292247811, "grad_norm": 0.2130732536315918, "learning_rate": 4.977627301619071e-05, "loss": 0.4933, "step": 9900 }, { "epoch": 0.3569755288860057, "grad_norm": 0.2266158163547516, "learning_rate": 4.97758833222212e-05, "loss": 0.4332, "step": 9905 }, { "epoch": 0.3571557285472303, "grad_norm": 0.18190445005893707, "learning_rate": 4.977549329068506e-05, "loss": 0.4467, "step": 9910 }, { "epoch": 0.35733592820845494, "grad_norm": 0.14068225026130676, "learning_rate": 4.977510292158764e-05, "loss": 0.4436, "step": 9915 }, { "epoch": 0.3575161278696796, "grad_norm": 0.1720120906829834, "learning_rate": 4.977471221493423e-05, "loss": 0.4622, "step": 9920 }, { "epoch": 0.35769632753090425, "grad_norm": 0.1588047444820404, "learning_rate": 4.977432117073016e-05, "loss": 0.4633, "step": 9925 }, { "epoch": 0.35787652719212887, "grad_norm": 0.168239563703537, "learning_rate": 4.9773929788980766e-05, "loss": 0.4476, "step": 9930 }, { "epoch": 0.3580567268533535, "grad_norm": 0.16691339015960693, "learning_rate": 4.9773538069691375e-05, "loss": 0.4693, "step": 9935 }, { "epoch": 0.35823692651457817, "grad_norm": 0.15054184198379517, "learning_rate": 4.977314601286732e-05, "loss": 0.4347, "step": 9940 }, { "epoch": 0.3584171261758028, "grad_norm": 0.147248312830925, 
"learning_rate": 4.9772753618513945e-05, "loss": 0.444, "step": 9945 }, { "epoch": 0.3585973258370274, "grad_norm": 0.20914539694786072, "learning_rate": 4.9772360886636605e-05, "loss": 0.4573, "step": 9950 }, { "epoch": 0.35877752549825204, "grad_norm": 0.16650797426700592, "learning_rate": 4.977196781724064e-05, "loss": 0.4585, "step": 9955 }, { "epoch": 0.3589577251594767, "grad_norm": 0.2121131271123886, "learning_rate": 4.9771574410331415e-05, "loss": 0.4204, "step": 9960 }, { "epoch": 0.35913792482070134, "grad_norm": 0.1681807041168213, "learning_rate": 4.977118066591427e-05, "loss": 0.5008, "step": 9965 }, { "epoch": 0.35931812448192596, "grad_norm": 0.18898646533489227, "learning_rate": 4.97707865839946e-05, "loss": 0.5016, "step": 9970 }, { "epoch": 0.35949832414315064, "grad_norm": 0.1509980857372284, "learning_rate": 4.977039216457775e-05, "loss": 0.4251, "step": 9975 }, { "epoch": 0.35967852380437526, "grad_norm": 0.15001198649406433, "learning_rate": 4.976999740766911e-05, "loss": 0.4745, "step": 9980 }, { "epoch": 0.3598587234655999, "grad_norm": 0.19364404678344727, "learning_rate": 4.976960231327404e-05, "loss": 0.4624, "step": 9985 }, { "epoch": 0.3600389231268245, "grad_norm": 0.1906237006187439, "learning_rate": 4.976920688139794e-05, "loss": 0.4492, "step": 9990 }, { "epoch": 0.3602191227880492, "grad_norm": 0.1953149288892746, "learning_rate": 4.9768811112046196e-05, "loss": 0.4655, "step": 9995 }, { "epoch": 0.3603993224492738, "grad_norm": 0.1708211600780487, "learning_rate": 4.976841500522419e-05, "loss": 0.4667, "step": 10000 }, { "epoch": 0.3603993224492738, "eval_loss": 0.4796702265739441, "eval_runtime": 3.5164, "eval_samples_per_second": 28.438, "eval_steps_per_second": 7.11, "step": 10000 }, { "epoch": 0.36057952211049843, "grad_norm": 0.20447692275047302, "learning_rate": 4.976801856093732e-05, "loss": 0.4598, "step": 10005 }, { "epoch": 0.36075972177172305, "grad_norm": 0.16265089809894562, "learning_rate": 4.976762177919101e-05, 
"loss": 0.4668, "step": 10010 }, { "epoch": 0.36093992143294773, "grad_norm": 0.1456652581691742, "learning_rate": 4.976722465999063e-05, "loss": 0.4301, "step": 10015 }, { "epoch": 0.36112012109417235, "grad_norm": 0.17092099785804749, "learning_rate": 4.976682720334162e-05, "loss": 0.4681, "step": 10020 }, { "epoch": 0.361300320755397, "grad_norm": 0.1890965700149536, "learning_rate": 4.976642940924938e-05, "loss": 0.477, "step": 10025 }, { "epoch": 0.3614805204166216, "grad_norm": 0.16893772780895233, "learning_rate": 4.976603127771934e-05, "loss": 0.4796, "step": 10030 }, { "epoch": 0.3616607200778463, "grad_norm": 0.16961359977722168, "learning_rate": 4.9765632808756915e-05, "loss": 0.4314, "step": 10035 }, { "epoch": 0.3618409197390709, "grad_norm": 0.1630767434835434, "learning_rate": 4.9765234002367534e-05, "loss": 0.4594, "step": 10040 }, { "epoch": 0.3620211194002955, "grad_norm": 0.16475652158260345, "learning_rate": 4.9764834858556635e-05, "loss": 0.4489, "step": 10045 }, { "epoch": 0.36220131906152014, "grad_norm": 0.1670771986246109, "learning_rate": 4.9764435377329654e-05, "loss": 0.4278, "step": 10050 }, { "epoch": 0.3623815187227448, "grad_norm": 0.1597401350736618, "learning_rate": 4.9764035558692045e-05, "loss": 0.4035, "step": 10055 }, { "epoch": 0.36256171838396944, "grad_norm": 0.2012094110250473, "learning_rate": 4.9763635402649236e-05, "loss": 0.4649, "step": 10060 }, { "epoch": 0.36274191804519407, "grad_norm": 0.16659463942050934, "learning_rate": 4.9763234909206695e-05, "loss": 0.447, "step": 10065 }, { "epoch": 0.3629221177064187, "grad_norm": 0.15925155580043793, "learning_rate": 4.976283407836987e-05, "loss": 0.4575, "step": 10070 }, { "epoch": 0.36310231736764337, "grad_norm": 0.15115337073802948, "learning_rate": 4.976243291014423e-05, "loss": 0.4517, "step": 10075 }, { "epoch": 0.363282517028868, "grad_norm": 0.1577247679233551, "learning_rate": 4.976203140453523e-05, "loss": 0.4539, "step": 10080 }, { "epoch": 0.3634627166900926, 
"grad_norm": 0.16622968018054962, "learning_rate": 4.9761629561548354e-05, "loss": 0.4357, "step": 10085 }, { "epoch": 0.36364291635131724, "grad_norm": 0.1571248173713684, "learning_rate": 4.976122738118906e-05, "loss": 0.4368, "step": 10090 }, { "epoch": 0.3638231160125419, "grad_norm": 0.17340044677257538, "learning_rate": 4.976082486346284e-05, "loss": 0.4284, "step": 10095 }, { "epoch": 0.36400331567376654, "grad_norm": 0.20559607446193695, "learning_rate": 4.9760422008375176e-05, "loss": 0.4247, "step": 10100 }, { "epoch": 0.36418351533499116, "grad_norm": 0.159358948469162, "learning_rate": 4.976001881593155e-05, "loss": 0.4389, "step": 10105 }, { "epoch": 0.3643637149962158, "grad_norm": 0.19919893145561218, "learning_rate": 4.9759615286137465e-05, "loss": 0.4724, "step": 10110 }, { "epoch": 0.36454391465744046, "grad_norm": 0.1773001104593277, "learning_rate": 4.975921141899842e-05, "loss": 0.4847, "step": 10115 }, { "epoch": 0.3647241143186651, "grad_norm": 0.17267437279224396, "learning_rate": 4.975880721451991e-05, "loss": 0.4398, "step": 10120 }, { "epoch": 0.3649043139798897, "grad_norm": 0.19149023294448853, "learning_rate": 4.975840267270744e-05, "loss": 0.4703, "step": 10125 }, { "epoch": 0.3650845136411143, "grad_norm": 0.1706707626581192, "learning_rate": 4.975799779356653e-05, "loss": 0.5034, "step": 10130 }, { "epoch": 0.365264713302339, "grad_norm": 0.18125681579113007, "learning_rate": 4.97575925771027e-05, "loss": 0.473, "step": 10135 }, { "epoch": 0.36544491296356363, "grad_norm": 0.17481379210948944, "learning_rate": 4.975718702332146e-05, "loss": 0.4341, "step": 10140 }, { "epoch": 0.36562511262478825, "grad_norm": 0.19114582240581512, "learning_rate": 4.9756781132228334e-05, "loss": 0.4629, "step": 10145 }, { "epoch": 0.36580531228601293, "grad_norm": 0.1782168298959732, "learning_rate": 4.975637490382887e-05, "loss": 0.4375, "step": 10150 }, { "epoch": 0.36598551194723755, "grad_norm": 0.21205340325832367, "learning_rate": 
4.975596833812858e-05, "loss": 0.4514, "step": 10155 }, { "epoch": 0.3661657116084622, "grad_norm": 0.19303537905216217, "learning_rate": 4.9755561435133024e-05, "loss": 0.4596, "step": 10160 }, { "epoch": 0.3663459112696868, "grad_norm": 0.16598139703273773, "learning_rate": 4.9755154194847734e-05, "loss": 0.4418, "step": 10165 }, { "epoch": 0.3665261109309115, "grad_norm": 0.19510379433631897, "learning_rate": 4.9754746617278254e-05, "loss": 0.4699, "step": 10170 }, { "epoch": 0.3667063105921361, "grad_norm": 0.16283252835273743, "learning_rate": 4.975433870243015e-05, "loss": 0.4337, "step": 10175 }, { "epoch": 0.3668865102533607, "grad_norm": 0.20761866867542267, "learning_rate": 4.975393045030897e-05, "loss": 0.4831, "step": 10180 }, { "epoch": 0.36706670991458534, "grad_norm": 0.15632273256778717, "learning_rate": 4.9753521860920284e-05, "loss": 0.4706, "step": 10185 }, { "epoch": 0.36724690957581, "grad_norm": 0.19553816318511963, "learning_rate": 4.975311293426965e-05, "loss": 0.4547, "step": 10190 }, { "epoch": 0.36742710923703464, "grad_norm": 0.15499721467494965, "learning_rate": 4.975270367036264e-05, "loss": 0.4329, "step": 10195 }, { "epoch": 0.36760730889825927, "grad_norm": 0.19358859956264496, "learning_rate": 4.975229406920485e-05, "loss": 0.4449, "step": 10200 }, { "epoch": 0.3677875085594839, "grad_norm": 0.19693566858768463, "learning_rate": 4.975188413080184e-05, "loss": 0.4487, "step": 10205 }, { "epoch": 0.36796770822070857, "grad_norm": 0.16378581523895264, "learning_rate": 4.97514738551592e-05, "loss": 0.4419, "step": 10210 }, { "epoch": 0.3681479078819332, "grad_norm": 0.22160831093788147, "learning_rate": 4.975106324228252e-05, "loss": 0.4503, "step": 10215 }, { "epoch": 0.3683281075431578, "grad_norm": 0.18501290678977966, "learning_rate": 4.975065229217739e-05, "loss": 0.4692, "step": 10220 }, { "epoch": 0.36850830720438243, "grad_norm": 0.17586058378219604, "learning_rate": 4.9750241004849415e-05, "loss": 0.4593, "step": 10225 }, { 
"epoch": 0.3686885068656071, "grad_norm": 0.17529945075511932, "learning_rate": 4.974982938030421e-05, "loss": 0.4924, "step": 10230 }, { "epoch": 0.36886870652683174, "grad_norm": 0.22143632173538208, "learning_rate": 4.974941741854736e-05, "loss": 0.4816, "step": 10235 }, { "epoch": 0.36904890618805636, "grad_norm": 0.16022737324237823, "learning_rate": 4.974900511958449e-05, "loss": 0.469, "step": 10240 }, { "epoch": 0.369229105849281, "grad_norm": 0.1857171654701233, "learning_rate": 4.974859248342122e-05, "loss": 0.4561, "step": 10245 }, { "epoch": 0.36940930551050566, "grad_norm": 0.15506871044635773, "learning_rate": 4.974817951006318e-05, "loss": 0.484, "step": 10250 }, { "epoch": 0.3695895051717303, "grad_norm": 0.16695240139961243, "learning_rate": 4.9747766199515967e-05, "loss": 0.4345, "step": 10255 }, { "epoch": 0.3697697048329549, "grad_norm": 0.1858297437429428, "learning_rate": 4.9747352551785234e-05, "loss": 0.4694, "step": 10260 }, { "epoch": 0.3699499044941795, "grad_norm": 0.18636928498744965, "learning_rate": 4.9746938566876624e-05, "loss": 0.4836, "step": 10265 }, { "epoch": 0.3701301041554042, "grad_norm": 0.14627771079540253, "learning_rate": 4.9746524244795755e-05, "loss": 0.4755, "step": 10270 }, { "epoch": 0.3703103038166288, "grad_norm": 0.2531489431858063, "learning_rate": 4.974610958554829e-05, "loss": 0.4655, "step": 10275 }, { "epoch": 0.37049050347785345, "grad_norm": 0.14432822167873383, "learning_rate": 4.974569458913988e-05, "loss": 0.4514, "step": 10280 }, { "epoch": 0.3706707031390781, "grad_norm": 0.200893372297287, "learning_rate": 4.974527925557616e-05, "loss": 0.4709, "step": 10285 }, { "epoch": 0.37085090280030275, "grad_norm": 0.1737281084060669, "learning_rate": 4.974486358486281e-05, "loss": 0.4521, "step": 10290 }, { "epoch": 0.3710311024615274, "grad_norm": 0.18836097419261932, "learning_rate": 4.9744447577005484e-05, "loss": 0.4333, "step": 10295 }, { "epoch": 0.371211302122752, "grad_norm": 0.16161343455314636, 
"learning_rate": 4.974403123200984e-05, "loss": 0.4779, "step": 10300 }, { "epoch": 0.3713915017839766, "grad_norm": 0.1600758135318756, "learning_rate": 4.9743614549881566e-05, "loss": 0.4377, "step": 10305 }, { "epoch": 0.3715717014452013, "grad_norm": 0.2167365998029709, "learning_rate": 4.974319753062634e-05, "loss": 0.4531, "step": 10310 }, { "epoch": 0.3717519011064259, "grad_norm": 0.20788373053073883, "learning_rate": 4.9742780174249835e-05, "loss": 0.4682, "step": 10315 }, { "epoch": 0.37193210076765054, "grad_norm": 0.16195015609264374, "learning_rate": 4.974236248075774e-05, "loss": 0.4617, "step": 10320 }, { "epoch": 0.3721123004288752, "grad_norm": 0.21307627856731415, "learning_rate": 4.974194445015574e-05, "loss": 0.474, "step": 10325 }, { "epoch": 0.37229250009009984, "grad_norm": 0.3052423894405365, "learning_rate": 4.974152608244955e-05, "loss": 0.4861, "step": 10330 }, { "epoch": 0.37247269975132447, "grad_norm": 0.16430044174194336, "learning_rate": 4.9741107377644845e-05, "loss": 0.4265, "step": 10335 }, { "epoch": 0.3726528994125491, "grad_norm": 0.19519920647144318, "learning_rate": 4.974068833574736e-05, "loss": 0.447, "step": 10340 }, { "epoch": 0.37283309907377377, "grad_norm": 0.22070425748825073, "learning_rate": 4.974026895676277e-05, "loss": 0.51, "step": 10345 }, { "epoch": 0.3730132987349984, "grad_norm": 0.17484411597251892, "learning_rate": 4.973984924069681e-05, "loss": 0.4552, "step": 10350 }, { "epoch": 0.373193498396223, "grad_norm": 0.15123674273490906, "learning_rate": 4.9739429187555185e-05, "loss": 0.4425, "step": 10355 }, { "epoch": 0.37337369805744763, "grad_norm": 0.17658279836177826, "learning_rate": 4.973900879734364e-05, "loss": 0.4305, "step": 10360 }, { "epoch": 0.3735538977186723, "grad_norm": 0.1411803662776947, "learning_rate": 4.973858807006788e-05, "loss": 0.4365, "step": 10365 }, { "epoch": 0.37373409737989693, "grad_norm": 0.1503876894712448, "learning_rate": 4.973816700573366e-05, "loss": 0.4826, "step": 
10370 }, { "epoch": 0.37391429704112156, "grad_norm": 0.14822295308113098, "learning_rate": 4.97377456043467e-05, "loss": 0.4837, "step": 10375 }, { "epoch": 0.3740944967023462, "grad_norm": 0.1543811559677124, "learning_rate": 4.9737323865912734e-05, "loss": 0.4731, "step": 10380 }, { "epoch": 0.37427469636357086, "grad_norm": 0.18722733855247498, "learning_rate": 4.973690179043753e-05, "loss": 0.4697, "step": 10385 }, { "epoch": 0.3744548960247955, "grad_norm": 0.20646195113658905, "learning_rate": 4.9736479377926826e-05, "loss": 0.4538, "step": 10390 }, { "epoch": 0.3746350956860201, "grad_norm": 0.19029046595096588, "learning_rate": 4.9736056628386374e-05, "loss": 0.4714, "step": 10395 }, { "epoch": 0.3748152953472447, "grad_norm": 0.18136341869831085, "learning_rate": 4.973563354182195e-05, "loss": 0.4703, "step": 10400 }, { "epoch": 0.3749954950084694, "grad_norm": 0.20808909833431244, "learning_rate": 4.97352101182393e-05, "loss": 0.5067, "step": 10405 }, { "epoch": 0.375175694669694, "grad_norm": 0.18707624077796936, "learning_rate": 4.9734786357644204e-05, "loss": 0.4373, "step": 10410 }, { "epoch": 0.37535589433091865, "grad_norm": 0.19006437063217163, "learning_rate": 4.9734362260042434e-05, "loss": 0.4335, "step": 10415 }, { "epoch": 0.37553609399214327, "grad_norm": 0.22536343336105347, "learning_rate": 4.973393782543976e-05, "loss": 0.4656, "step": 10420 }, { "epoch": 0.37571629365336795, "grad_norm": 0.20031480491161346, "learning_rate": 4.9733513053841984e-05, "loss": 0.4998, "step": 10425 }, { "epoch": 0.3758964933145926, "grad_norm": 0.17395085096359253, "learning_rate": 4.973308794525487e-05, "loss": 0.4831, "step": 10430 }, { "epoch": 0.3760766929758172, "grad_norm": 0.20083628594875336, "learning_rate": 4.973266249968423e-05, "loss": 0.4584, "step": 10435 }, { "epoch": 0.3762568926370418, "grad_norm": 0.1963411122560501, "learning_rate": 4.973223671713585e-05, "loss": 0.4865, "step": 10440 }, { "epoch": 0.3764370922982665, "grad_norm": 
0.20614828169345856, "learning_rate": 4.973181059761552e-05, "loss": 0.4628, "step": 10445 }, { "epoch": 0.3766172919594911, "grad_norm": 0.2048252522945404, "learning_rate": 4.973138414112908e-05, "loss": 0.4685, "step": 10450 }, { "epoch": 0.37679749162071574, "grad_norm": 0.1850869357585907, "learning_rate": 4.97309573476823e-05, "loss": 0.4674, "step": 10455 }, { "epoch": 0.37697769128194036, "grad_norm": 0.19031380116939545, "learning_rate": 4.9730530217281023e-05, "loss": 0.4621, "step": 10460 }, { "epoch": 0.37715789094316504, "grad_norm": 0.15056711435317993, "learning_rate": 4.973010274993106e-05, "loss": 0.4718, "step": 10465 }, { "epoch": 0.37733809060438966, "grad_norm": 0.19555266201496124, "learning_rate": 4.9729674945638236e-05, "loss": 0.4505, "step": 10470 }, { "epoch": 0.3775182902656143, "grad_norm": 0.18490497767925262, "learning_rate": 4.972924680440838e-05, "loss": 0.5174, "step": 10475 }, { "epoch": 0.37769848992683897, "grad_norm": 0.19573722779750824, "learning_rate": 4.9728818326247316e-05, "loss": 0.4594, "step": 10480 }, { "epoch": 0.3778786895880636, "grad_norm": 0.17318245768547058, "learning_rate": 4.97283895111609e-05, "loss": 0.4886, "step": 10485 }, { "epoch": 0.3780588892492882, "grad_norm": 0.17160874605178833, "learning_rate": 4.972796035915496e-05, "loss": 0.4565, "step": 10490 }, { "epoch": 0.37823908891051283, "grad_norm": 0.16455820202827454, "learning_rate": 4.9727530870235345e-05, "loss": 0.4022, "step": 10495 }, { "epoch": 0.3784192885717375, "grad_norm": 0.21172361075878143, "learning_rate": 4.972710104440791e-05, "loss": 0.449, "step": 10500 }, { "epoch": 0.3784192885717375, "eval_loss": 0.47920936346054077, "eval_runtime": 3.5282, "eval_samples_per_second": 28.343, "eval_steps_per_second": 7.086, "step": 10500 }, { "epoch": 0.37859948823296213, "grad_norm": 0.15352605283260345, "learning_rate": 4.9726670881678517e-05, "loss": 0.446, "step": 10505 }, { "epoch": 0.37877968789418676, "grad_norm": 0.18436992168426514, 
"learning_rate": 4.972624038205301e-05, "loss": 0.4641, "step": 10510 }, { "epoch": 0.3789598875554114, "grad_norm": 0.1690305769443512, "learning_rate": 4.972580954553727e-05, "loss": 0.4275, "step": 10515 }, { "epoch": 0.37914008721663606, "grad_norm": 0.20340761542320251, "learning_rate": 4.9725378372137166e-05, "loss": 0.4284, "step": 10520 }, { "epoch": 0.3793202868778607, "grad_norm": 0.17345954477787018, "learning_rate": 4.9724946861858566e-05, "loss": 0.4662, "step": 10525 }, { "epoch": 0.3795004865390853, "grad_norm": 0.16937977075576782, "learning_rate": 4.9724515014707354e-05, "loss": 0.4486, "step": 10530 }, { "epoch": 0.3796806862003099, "grad_norm": 0.1675933450460434, "learning_rate": 4.9724082830689404e-05, "loss": 0.466, "step": 10535 }, { "epoch": 0.3798608858615346, "grad_norm": 0.14321444928646088, "learning_rate": 4.972365030981062e-05, "loss": 0.4187, "step": 10540 }, { "epoch": 0.3800410855227592, "grad_norm": 0.19194680452346802, "learning_rate": 4.972321745207688e-05, "loss": 0.4069, "step": 10545 }, { "epoch": 0.38022128518398385, "grad_norm": 0.16965104639530182, "learning_rate": 4.972278425749409e-05, "loss": 0.4158, "step": 10550 }, { "epoch": 0.38040148484520847, "grad_norm": 0.18481513857841492, "learning_rate": 4.972235072606816e-05, "loss": 0.472, "step": 10555 }, { "epoch": 0.38058168450643315, "grad_norm": 0.17054641246795654, "learning_rate": 4.972191685780498e-05, "loss": 0.4005, "step": 10560 }, { "epoch": 0.38076188416765777, "grad_norm": 0.2037338763475418, "learning_rate": 4.972148265271047e-05, "loss": 0.4741, "step": 10565 }, { "epoch": 0.3809420838288824, "grad_norm": 0.17392988502979279, "learning_rate": 4.9721048110790546e-05, "loss": 0.4823, "step": 10570 }, { "epoch": 0.381122283490107, "grad_norm": 0.1915632039308548, "learning_rate": 4.972061323205113e-05, "loss": 0.4084, "step": 10575 }, { "epoch": 0.3813024831513317, "grad_norm": 0.16601555049419403, "learning_rate": 4.972017801649814e-05, "loss": 0.4264, "step": 
10580 }, { "epoch": 0.3814826828125563, "grad_norm": 0.15968888998031616, "learning_rate": 4.971974246413752e-05, "loss": 0.4563, "step": 10585 }, { "epoch": 0.38166288247378094, "grad_norm": 0.17778979241847992, "learning_rate": 4.971930657497518e-05, "loss": 0.451, "step": 10590 }, { "epoch": 0.38184308213500556, "grad_norm": 0.16911092400550842, "learning_rate": 4.971887034901708e-05, "loss": 0.4147, "step": 10595 }, { "epoch": 0.38202328179623024, "grad_norm": 0.24606285989284515, "learning_rate": 4.971843378626916e-05, "loss": 0.4428, "step": 10600 }, { "epoch": 0.38220348145745486, "grad_norm": 0.1590912640094757, "learning_rate": 4.971799688673737e-05, "loss": 0.441, "step": 10605 }, { "epoch": 0.3823836811186795, "grad_norm": 0.1825900822877884, "learning_rate": 4.971755965042765e-05, "loss": 0.4634, "step": 10610 }, { "epoch": 0.3825638807799041, "grad_norm": 0.14857780933380127, "learning_rate": 4.9717122077345965e-05, "loss": 0.4584, "step": 10615 }, { "epoch": 0.3827440804411288, "grad_norm": 0.18574172258377075, "learning_rate": 4.971668416749828e-05, "loss": 0.4721, "step": 10620 }, { "epoch": 0.3829242801023534, "grad_norm": 0.21157976984977722, "learning_rate": 4.971624592089056e-05, "loss": 0.464, "step": 10625 }, { "epoch": 0.38310447976357803, "grad_norm": 0.15750320255756378, "learning_rate": 4.971580733752877e-05, "loss": 0.4572, "step": 10630 }, { "epoch": 0.38328467942480265, "grad_norm": 0.188838928937912, "learning_rate": 4.9715368417418894e-05, "loss": 0.4838, "step": 10635 }, { "epoch": 0.38346487908602733, "grad_norm": 0.19151824712753296, "learning_rate": 4.9714929160566906e-05, "loss": 0.4321, "step": 10640 }, { "epoch": 0.38364507874725196, "grad_norm": 0.17638704180717468, "learning_rate": 4.971448956697879e-05, "loss": 0.4894, "step": 10645 }, { "epoch": 0.3838252784084766, "grad_norm": 0.191755011677742, "learning_rate": 4.9714049636660544e-05, "loss": 0.4614, "step": 10650 }, { "epoch": 0.38400547806970126, "grad_norm": 
0.1991795152425766, "learning_rate": 4.971360936961815e-05, "loss": 0.4343, "step": 10655 }, { "epoch": 0.3841856777309259, "grad_norm": 0.1604321449995041, "learning_rate": 4.971316876585762e-05, "loss": 0.4477, "step": 10660 }, { "epoch": 0.3843658773921505, "grad_norm": 0.19753023982048035, "learning_rate": 4.971272782538495e-05, "loss": 0.4803, "step": 10665 }, { "epoch": 0.3845460770533751, "grad_norm": 0.1573752909898758, "learning_rate": 4.971228654820615e-05, "loss": 0.4378, "step": 10670 }, { "epoch": 0.3847262767145998, "grad_norm": 0.19612035155296326, "learning_rate": 4.971184493432722e-05, "loss": 0.4939, "step": 10675 }, { "epoch": 0.3849064763758244, "grad_norm": 0.1873956173658371, "learning_rate": 4.9711402983754194e-05, "loss": 0.4378, "step": 10680 }, { "epoch": 0.38508667603704905, "grad_norm": 0.1653887778520584, "learning_rate": 4.971096069649309e-05, "loss": 0.4403, "step": 10685 }, { "epoch": 0.38526687569827367, "grad_norm": 0.1948213279247284, "learning_rate": 4.971051807254993e-05, "loss": 0.3959, "step": 10690 }, { "epoch": 0.38544707535949835, "grad_norm": 0.17649932205677032, "learning_rate": 4.9710075111930744e-05, "loss": 0.4329, "step": 10695 }, { "epoch": 0.38562727502072297, "grad_norm": 0.16458600759506226, "learning_rate": 4.970963181464157e-05, "loss": 0.4233, "step": 10700 }, { "epoch": 0.3858074746819476, "grad_norm": 0.1625463217496872, "learning_rate": 4.970918818068844e-05, "loss": 0.4972, "step": 10705 }, { "epoch": 0.3859876743431722, "grad_norm": 0.12830519676208496, "learning_rate": 4.9708744210077406e-05, "loss": 0.4458, "step": 10710 }, { "epoch": 0.3861678740043969, "grad_norm": 0.20030345022678375, "learning_rate": 4.9708299902814516e-05, "loss": 0.4751, "step": 10715 }, { "epoch": 0.3863480736656215, "grad_norm": 0.15342171490192413, "learning_rate": 4.970785525890582e-05, "loss": 0.4467, "step": 10720 }, { "epoch": 0.38652827332684614, "grad_norm": 0.19483308494091034, "learning_rate": 4.9707410278357393e-05, 
"loss": 0.4594, "step": 10725 }, { "epoch": 0.38670847298807076, "grad_norm": 0.22155672311782837, "learning_rate": 4.970696496117527e-05, "loss": 0.4764, "step": 10730 }, { "epoch": 0.38688867264929544, "grad_norm": 0.22407083213329315, "learning_rate": 4.970651930736554e-05, "loss": 0.4337, "step": 10735 }, { "epoch": 0.38706887231052006, "grad_norm": 0.14635224640369415, "learning_rate": 4.970607331693427e-05, "loss": 0.4752, "step": 10740 }, { "epoch": 0.3872490719717447, "grad_norm": 0.18128085136413574, "learning_rate": 4.970562698988753e-05, "loss": 0.4927, "step": 10745 }, { "epoch": 0.3874292716329693, "grad_norm": 0.1800365000963211, "learning_rate": 4.970518032623141e-05, "loss": 0.4195, "step": 10750 }, { "epoch": 0.387609471294194, "grad_norm": 0.18861335515975952, "learning_rate": 4.9704733325971986e-05, "loss": 0.4563, "step": 10755 }, { "epoch": 0.3877896709554186, "grad_norm": 0.18732388317584991, "learning_rate": 4.9704285989115355e-05, "loss": 0.4643, "step": 10760 }, { "epoch": 0.38796987061664323, "grad_norm": 0.17132775485515594, "learning_rate": 4.970383831566762e-05, "loss": 0.4932, "step": 10765 }, { "epoch": 0.38815007027786785, "grad_norm": 0.17382220923900604, "learning_rate": 4.970339030563485e-05, "loss": 0.4402, "step": 10770 }, { "epoch": 0.38833026993909253, "grad_norm": 0.15054017305374146, "learning_rate": 4.9702941959023185e-05, "loss": 0.4593, "step": 10775 }, { "epoch": 0.38851046960031715, "grad_norm": 0.19927044212818146, "learning_rate": 4.9702493275838713e-05, "loss": 0.4606, "step": 10780 }, { "epoch": 0.3886906692615418, "grad_norm": 0.16357219219207764, "learning_rate": 4.970204425608756e-05, "loss": 0.4324, "step": 10785 }, { "epoch": 0.3888708689227664, "grad_norm": 0.1704314649105072, "learning_rate": 4.970159489977583e-05, "loss": 0.4257, "step": 10790 }, { "epoch": 0.3890510685839911, "grad_norm": 0.17188839614391327, "learning_rate": 4.970114520690965e-05, "loss": 0.4469, "step": 10795 }, { "epoch": 
0.3892312682452157, "grad_norm": 0.17841334640979767, "learning_rate": 4.9700695177495154e-05, "loss": 0.3993, "step": 10800 }, { "epoch": 0.3894114679064403, "grad_norm": 0.21081610023975372, "learning_rate": 4.970024481153847e-05, "loss": 0.4844, "step": 10805 }, { "epoch": 0.38959166756766495, "grad_norm": 0.18982069194316864, "learning_rate": 4.9699794109045726e-05, "loss": 0.4578, "step": 10810 }, { "epoch": 0.3897718672288896, "grad_norm": 0.18108953535556793, "learning_rate": 4.969934307002307e-05, "loss": 0.4661, "step": 10815 }, { "epoch": 0.38995206689011425, "grad_norm": 0.1565956175327301, "learning_rate": 4.969889169447664e-05, "loss": 0.4347, "step": 10820 }, { "epoch": 0.39013226655133887, "grad_norm": 0.16742676496505737, "learning_rate": 4.9698439982412616e-05, "loss": 0.459, "step": 10825 }, { "epoch": 0.39031246621256355, "grad_norm": 0.15715833008289337, "learning_rate": 4.969798793383711e-05, "loss": 0.4821, "step": 10830 }, { "epoch": 0.39049266587378817, "grad_norm": 0.21100378036499023, "learning_rate": 4.9697535548756304e-05, "loss": 0.4679, "step": 10835 }, { "epoch": 0.3906728655350128, "grad_norm": 0.17519085109233856, "learning_rate": 4.969708282717635e-05, "loss": 0.4637, "step": 10840 }, { "epoch": 0.3908530651962374, "grad_norm": 0.15852008759975433, "learning_rate": 4.969662976910344e-05, "loss": 0.4755, "step": 10845 }, { "epoch": 0.3910332648574621, "grad_norm": 0.22311712801456451, "learning_rate": 4.969617637454373e-05, "loss": 0.4252, "step": 10850 }, { "epoch": 0.3912134645186867, "grad_norm": 0.17925986647605896, "learning_rate": 4.9695722643503384e-05, "loss": 0.4975, "step": 10855 }, { "epoch": 0.39139366417991134, "grad_norm": 0.17491289973258972, "learning_rate": 4.969526857598861e-05, "loss": 0.4618, "step": 10860 }, { "epoch": 0.39157386384113596, "grad_norm": 0.2179514467716217, "learning_rate": 4.969481417200558e-05, "loss": 0.4999, "step": 10865 }, { "epoch": 0.39175406350236064, "grad_norm": 0.21115978062152863, 
"learning_rate": 4.969435943156048e-05, "loss": 0.4691, "step": 10870 }, { "epoch": 0.39193426316358526, "grad_norm": 0.16908550262451172, "learning_rate": 4.969390435465952e-05, "loss": 0.4589, "step": 10875 }, { "epoch": 0.3921144628248099, "grad_norm": 0.16399134695529938, "learning_rate": 4.969344894130889e-05, "loss": 0.4356, "step": 10880 }, { "epoch": 0.3922946624860345, "grad_norm": 0.15767602622509003, "learning_rate": 4.96929931915148e-05, "loss": 0.4667, "step": 10885 }, { "epoch": 0.3924748621472592, "grad_norm": 0.20866945385932922, "learning_rate": 4.9692537105283465e-05, "loss": 0.4452, "step": 10890 }, { "epoch": 0.3926550618084838, "grad_norm": 0.15087029337882996, "learning_rate": 4.969208068262109e-05, "loss": 0.442, "step": 10895 }, { "epoch": 0.39283526146970843, "grad_norm": 0.1792287528514862, "learning_rate": 4.969162392353389e-05, "loss": 0.4511, "step": 10900 }, { "epoch": 0.39301546113093305, "grad_norm": 0.16663479804992676, "learning_rate": 4.96911668280281e-05, "loss": 0.4726, "step": 10905 }, { "epoch": 0.39319566079215773, "grad_norm": 0.16242770850658417, "learning_rate": 4.969070939610995e-05, "loss": 0.4885, "step": 10910 }, { "epoch": 0.39337586045338235, "grad_norm": 0.14637549221515656, "learning_rate": 4.969025162778566e-05, "loss": 0.4273, "step": 10915 }, { "epoch": 0.393556060114607, "grad_norm": 0.1730150431394577, "learning_rate": 4.968979352306146e-05, "loss": 0.4698, "step": 10920 }, { "epoch": 0.3937362597758316, "grad_norm": 0.15149009227752686, "learning_rate": 4.968933508194361e-05, "loss": 0.4338, "step": 10925 }, { "epoch": 0.3939164594370563, "grad_norm": 0.16374550759792328, "learning_rate": 4.968887630443836e-05, "loss": 0.4735, "step": 10930 }, { "epoch": 0.3940966590982809, "grad_norm": 0.16097819805145264, "learning_rate": 4.968841719055194e-05, "loss": 0.434, "step": 10935 }, { "epoch": 0.3942768587595055, "grad_norm": 0.18901294469833374, "learning_rate": 4.968795774029061e-05, "loss": 0.4419, "step": 
10940 }, { "epoch": 0.39445705842073014, "grad_norm": 0.20963457226753235, "learning_rate": 4.9687497953660646e-05, "loss": 0.455, "step": 10945 }, { "epoch": 0.3946372580819548, "grad_norm": 0.1951969563961029, "learning_rate": 4.9687037830668306e-05, "loss": 0.4736, "step": 10950 }, { "epoch": 0.39481745774317945, "grad_norm": 0.17209535837173462, "learning_rate": 4.968657737131984e-05, "loss": 0.4568, "step": 10955 }, { "epoch": 0.39499765740440407, "grad_norm": 0.20058324933052063, "learning_rate": 4.968611657562154e-05, "loss": 0.4182, "step": 10960 }, { "epoch": 0.3951778570656287, "grad_norm": 0.15861481428146362, "learning_rate": 4.968565544357969e-05, "loss": 0.4993, "step": 10965 }, { "epoch": 0.39535805672685337, "grad_norm": 0.18526114523410797, "learning_rate": 4.968519397520056e-05, "loss": 0.4578, "step": 10970 }, { "epoch": 0.395538256388078, "grad_norm": 0.14980530738830566, "learning_rate": 4.968473217049044e-05, "loss": 0.4595, "step": 10975 }, { "epoch": 0.3957184560493026, "grad_norm": 0.186975359916687, "learning_rate": 4.9684270029455624e-05, "loss": 0.45, "step": 10980 }, { "epoch": 0.3958986557105273, "grad_norm": 0.15820086002349854, "learning_rate": 4.968380755210241e-05, "loss": 0.4645, "step": 10985 }, { "epoch": 0.3960788553717519, "grad_norm": 0.14868688583374023, "learning_rate": 4.9683344738437096e-05, "loss": 0.4714, "step": 10990 }, { "epoch": 0.39625905503297654, "grad_norm": 0.18641044199466705, "learning_rate": 4.968288158846599e-05, "loss": 0.4926, "step": 10995 }, { "epoch": 0.39643925469420116, "grad_norm": 0.17056040465831757, "learning_rate": 4.968241810219539e-05, "loss": 0.4738, "step": 11000 }, { "epoch": 0.39643925469420116, "eval_loss": 0.4763174057006836, "eval_runtime": 3.5834, "eval_samples_per_second": 27.907, "eval_steps_per_second": 6.977, "step": 11000 }, { "epoch": 0.39661945435542584, "grad_norm": 0.16105006635189056, "learning_rate": 4.9681954279631635e-05, "loss": 0.4637, "step": 11005 }, { "epoch": 
0.39679965401665046, "grad_norm": 0.17040333151817322, "learning_rate": 4.968149012078103e-05, "loss": 0.4618, "step": 11010 }, { "epoch": 0.3969798536778751, "grad_norm": 0.16445614397525787, "learning_rate": 4.9681025625649905e-05, "loss": 0.4516, "step": 11015 }, { "epoch": 0.3971600533390997, "grad_norm": 0.16184879839420319, "learning_rate": 4.968056079424457e-05, "loss": 0.4915, "step": 11020 }, { "epoch": 0.3973402530003244, "grad_norm": 0.15629048645496368, "learning_rate": 4.9680095626571384e-05, "loss": 0.4608, "step": 11025 }, { "epoch": 0.397520452661549, "grad_norm": 0.20805750787258148, "learning_rate": 4.967963012263667e-05, "loss": 0.4356, "step": 11030 }, { "epoch": 0.39770065232277363, "grad_norm": 0.16487817466259003, "learning_rate": 4.967916428244677e-05, "loss": 0.4734, "step": 11035 }, { "epoch": 0.39788085198399825, "grad_norm": 0.1743331402540207, "learning_rate": 4.9678698106008034e-05, "loss": 0.4458, "step": 11040 }, { "epoch": 0.39806105164522293, "grad_norm": 0.17199203372001648, "learning_rate": 4.967823159332682e-05, "loss": 0.4683, "step": 11045 }, { "epoch": 0.39824125130644755, "grad_norm": 0.17917218804359436, "learning_rate": 4.967776474440948e-05, "loss": 0.4884, "step": 11050 }, { "epoch": 0.3984214509676722, "grad_norm": 0.16735875606536865, "learning_rate": 4.967729755926237e-05, "loss": 0.4558, "step": 11055 }, { "epoch": 0.3986016506288968, "grad_norm": 0.23970329761505127, "learning_rate": 4.967683003789185e-05, "loss": 0.4259, "step": 11060 }, { "epoch": 0.3987818502901215, "grad_norm": 0.17850400507450104, "learning_rate": 4.967636218030431e-05, "loss": 0.4225, "step": 11065 }, { "epoch": 0.3989620499513461, "grad_norm": 0.14817573130130768, "learning_rate": 4.967589398650611e-05, "loss": 0.4513, "step": 11070 }, { "epoch": 0.3991422496125707, "grad_norm": 0.15426768362522125, "learning_rate": 4.9675425456503634e-05, "loss": 0.4577, "step": 11075 }, { "epoch": 0.39932244927379534, "grad_norm": 0.18102167546749115, 
"learning_rate": 4.967495659030326e-05, "loss": 0.4538, "step": 11080 }, { "epoch": 0.39950264893502, "grad_norm": 0.21321766078472137, "learning_rate": 4.9674487387911374e-05, "loss": 0.4502, "step": 11085 }, { "epoch": 0.39968284859624464, "grad_norm": 0.17416805028915405, "learning_rate": 4.967401784933439e-05, "loss": 0.4346, "step": 11090 }, { "epoch": 0.39986304825746927, "grad_norm": 0.20586751401424408, "learning_rate": 4.9673547974578674e-05, "loss": 0.4706, "step": 11095 }, { "epoch": 0.4000432479186939, "grad_norm": 0.1607694774866104, "learning_rate": 4.967307776365065e-05, "loss": 0.4478, "step": 11100 }, { "epoch": 0.40022344757991857, "grad_norm": 0.2245350033044815, "learning_rate": 4.967260721655672e-05, "loss": 0.4874, "step": 11105 }, { "epoch": 0.4004036472411432, "grad_norm": 0.18260863423347473, "learning_rate": 4.967213633330329e-05, "loss": 0.4176, "step": 11110 }, { "epoch": 0.4005838469023678, "grad_norm": 0.1575452834367752, "learning_rate": 4.967166511389678e-05, "loss": 0.4249, "step": 11115 }, { "epoch": 0.40076404656359244, "grad_norm": 0.18237917125225067, "learning_rate": 4.967119355834361e-05, "loss": 0.4794, "step": 11120 }, { "epoch": 0.4009442462248171, "grad_norm": 0.2220340073108673, "learning_rate": 4.96707216666502e-05, "loss": 0.4514, "step": 11125 }, { "epoch": 0.40112444588604174, "grad_norm": 0.17675887048244476, "learning_rate": 4.9670249438822994e-05, "loss": 0.4648, "step": 11130 }, { "epoch": 0.40130464554726636, "grad_norm": 0.19447670876979828, "learning_rate": 4.966977687486841e-05, "loss": 0.4771, "step": 11135 }, { "epoch": 0.401484845208491, "grad_norm": 0.1338052898645401, "learning_rate": 4.966930397479289e-05, "loss": 0.416, "step": 11140 }, { "epoch": 0.40166504486971566, "grad_norm": 0.19171032309532166, "learning_rate": 4.966883073860288e-05, "loss": 0.4714, "step": 11145 }, { "epoch": 0.4018452445309403, "grad_norm": 0.17477290332317352, "learning_rate": 4.966835716630483e-05, "loss": 0.4862, "step": 
11150 }, { "epoch": 0.4020254441921649, "grad_norm": 0.22172503173351288, "learning_rate": 4.966788325790519e-05, "loss": 0.4264, "step": 11155 }, { "epoch": 0.4022056438533896, "grad_norm": 0.1821010559797287, "learning_rate": 4.966740901341042e-05, "loss": 0.4472, "step": 11160 }, { "epoch": 0.4023858435146142, "grad_norm": 0.21025283634662628, "learning_rate": 4.9666934432826975e-05, "loss": 0.446, "step": 11165 }, { "epoch": 0.40256604317583883, "grad_norm": 0.16647890210151672, "learning_rate": 4.9666459516161316e-05, "loss": 0.4754, "step": 11170 }, { "epoch": 0.40274624283706345, "grad_norm": 0.15028271079063416, "learning_rate": 4.9665984263419926e-05, "loss": 0.4176, "step": 11175 }, { "epoch": 0.40292644249828813, "grad_norm": 0.1495635062456131, "learning_rate": 4.9665508674609277e-05, "loss": 0.4659, "step": 11180 }, { "epoch": 0.40310664215951275, "grad_norm": 0.17869426310062408, "learning_rate": 4.966503274973585e-05, "loss": 0.453, "step": 11185 }, { "epoch": 0.4032868418207374, "grad_norm": 0.1512574553489685, "learning_rate": 4.9664556488806124e-05, "loss": 0.4645, "step": 11190 }, { "epoch": 0.403467041481962, "grad_norm": 0.18554282188415527, "learning_rate": 4.96640798918266e-05, "loss": 0.4688, "step": 11195 }, { "epoch": 0.4036472411431867, "grad_norm": 0.16681110858917236, "learning_rate": 4.966360295880375e-05, "loss": 0.4333, "step": 11200 }, { "epoch": 0.4038274408044113, "grad_norm": 0.2254079133272171, "learning_rate": 4.966312568974409e-05, "loss": 0.4501, "step": 11205 }, { "epoch": 0.4040076404656359, "grad_norm": 0.14422693848609924, "learning_rate": 4.966264808465412e-05, "loss": 0.4039, "step": 11210 }, { "epoch": 0.40418784012686054, "grad_norm": 0.17305003106594086, "learning_rate": 4.9662170143540336e-05, "loss": 0.4803, "step": 11215 }, { "epoch": 0.4043680397880852, "grad_norm": 0.18754348158836365, "learning_rate": 4.966169186640927e-05, "loss": 0.4577, "step": 11220 }, { "epoch": 0.40454823944930984, "grad_norm": 
0.15006259083747864, "learning_rate": 4.966121325326742e-05, "loss": 0.4578, "step": 11225 }, { "epoch": 0.40472843911053447, "grad_norm": 0.1722080409526825, "learning_rate": 4.9660734304121315e-05, "loss": 0.4343, "step": 11230 }, { "epoch": 0.4049086387717591, "grad_norm": 0.13911651074886322, "learning_rate": 4.9660255018977475e-05, "loss": 0.4435, "step": 11235 }, { "epoch": 0.40508883843298377, "grad_norm": 0.20192518830299377, "learning_rate": 4.9659775397842444e-05, "loss": 0.4632, "step": 11240 }, { "epoch": 0.4052690380942084, "grad_norm": 0.17392924427986145, "learning_rate": 4.965929544072274e-05, "loss": 0.4377, "step": 11245 }, { "epoch": 0.405449237755433, "grad_norm": 0.17670120298862457, "learning_rate": 4.9658815147624914e-05, "loss": 0.4451, "step": 11250 }, { "epoch": 0.40562943741665763, "grad_norm": 0.22466880083084106, "learning_rate": 4.9658334518555507e-05, "loss": 0.5001, "step": 11255 }, { "epoch": 0.4058096370778823, "grad_norm": 0.18523770570755005, "learning_rate": 4.965785355352106e-05, "loss": 0.4358, "step": 11260 }, { "epoch": 0.40598983673910694, "grad_norm": 0.20358721911907196, "learning_rate": 4.965737225252814e-05, "loss": 0.4626, "step": 11265 }, { "epoch": 0.40617003640033156, "grad_norm": 0.2111283838748932, "learning_rate": 4.9656890615583297e-05, "loss": 0.4708, "step": 11270 }, { "epoch": 0.4063502360615562, "grad_norm": 0.200625941157341, "learning_rate": 4.965640864269309e-05, "loss": 0.4916, "step": 11275 }, { "epoch": 0.40653043572278086, "grad_norm": 0.17772261798381805, "learning_rate": 4.965592633386408e-05, "loss": 0.4205, "step": 11280 }, { "epoch": 0.4067106353840055, "grad_norm": 0.16529619693756104, "learning_rate": 4.965544368910285e-05, "loss": 0.4354, "step": 11285 }, { "epoch": 0.4068908350452301, "grad_norm": 0.12482757121324539, "learning_rate": 4.965496070841599e-05, "loss": 0.4189, "step": 11290 }, { "epoch": 0.4070710347064547, "grad_norm": 0.16134874522686005, "learning_rate": 4.965447739181005e-05, 
"loss": 0.4405, "step": 11295 }, { "epoch": 0.4072512343676794, "grad_norm": 0.1545429825782776, "learning_rate": 4.965399373929163e-05, "loss": 0.4762, "step": 11300 }, { "epoch": 0.407431434028904, "grad_norm": 0.16678392887115479, "learning_rate": 4.9653509750867324e-05, "loss": 0.4442, "step": 11305 }, { "epoch": 0.40761163369012865, "grad_norm": 0.19490495324134827, "learning_rate": 4.965302542654371e-05, "loss": 0.436, "step": 11310 }, { "epoch": 0.4077918333513533, "grad_norm": 0.19900862872600555, "learning_rate": 4.9652540766327406e-05, "loss": 0.5119, "step": 11315 }, { "epoch": 0.40797203301257795, "grad_norm": 0.1822587251663208, "learning_rate": 4.9652055770225005e-05, "loss": 0.4696, "step": 11320 }, { "epoch": 0.4081522326738026, "grad_norm": 0.1810266673564911, "learning_rate": 4.965157043824311e-05, "loss": 0.4568, "step": 11325 }, { "epoch": 0.4083324323350272, "grad_norm": 0.19422537088394165, "learning_rate": 4.965108477038835e-05, "loss": 0.4705, "step": 11330 }, { "epoch": 0.4085126319962519, "grad_norm": 0.15788643062114716, "learning_rate": 4.965059876666733e-05, "loss": 0.4606, "step": 11335 }, { "epoch": 0.4086928316574765, "grad_norm": 0.15164151787757874, "learning_rate": 4.965011242708667e-05, "loss": 0.465, "step": 11340 }, { "epoch": 0.4088730313187011, "grad_norm": 0.19079901278018951, "learning_rate": 4.964962575165301e-05, "loss": 0.4909, "step": 11345 }, { "epoch": 0.40905323097992574, "grad_norm": 0.1349342167377472, "learning_rate": 4.964913874037296e-05, "loss": 0.4571, "step": 11350 }, { "epoch": 0.4092334306411504, "grad_norm": 0.1663023829460144, "learning_rate": 4.9648651393253176e-05, "loss": 0.461, "step": 11355 }, { "epoch": 0.40941363030237504, "grad_norm": 0.16813883185386658, "learning_rate": 4.964816371030029e-05, "loss": 0.408, "step": 11360 }, { "epoch": 0.40959382996359966, "grad_norm": 0.15600088238716125, "learning_rate": 4.964767569152093e-05, "loss": 0.4358, "step": 11365 }, { "epoch": 0.4097740296248243, 
"grad_norm": 0.14908957481384277, "learning_rate": 4.964718733692178e-05, "loss": 0.4046, "step": 11370 }, { "epoch": 0.40995422928604897, "grad_norm": 0.1925799548625946, "learning_rate": 4.9646698646509465e-05, "loss": 0.4426, "step": 11375 }, { "epoch": 0.4101344289472736, "grad_norm": 0.15564826130867004, "learning_rate": 4.9646209620290654e-05, "loss": 0.4438, "step": 11380 }, { "epoch": 0.4103146286084982, "grad_norm": 0.19330483675003052, "learning_rate": 4.9645720258272014e-05, "loss": 0.4536, "step": 11385 }, { "epoch": 0.41049482826972283, "grad_norm": 0.18074195086956024, "learning_rate": 4.96452305604602e-05, "loss": 0.4674, "step": 11390 }, { "epoch": 0.4106750279309475, "grad_norm": 0.20103107392787933, "learning_rate": 4.964474052686189e-05, "loss": 0.4257, "step": 11395 }, { "epoch": 0.41085522759217213, "grad_norm": 0.14884856343269348, "learning_rate": 4.9644250157483765e-05, "loss": 0.4443, "step": 11400 }, { "epoch": 0.41103542725339676, "grad_norm": 0.1957724243402481, "learning_rate": 4.96437594523325e-05, "loss": 0.4462, "step": 11405 }, { "epoch": 0.4112156269146214, "grad_norm": 0.15725165605545044, "learning_rate": 4.964326841141479e-05, "loss": 0.4184, "step": 11410 }, { "epoch": 0.41139582657584606, "grad_norm": 0.16227923333644867, "learning_rate": 4.964277703473731e-05, "loss": 0.4157, "step": 11415 }, { "epoch": 0.4115760262370707, "grad_norm": 0.1838981658220291, "learning_rate": 4.9642285322306766e-05, "loss": 0.4706, "step": 11420 }, { "epoch": 0.4117562258982953, "grad_norm": 0.19162507355213165, "learning_rate": 4.9641793274129864e-05, "loss": 0.4535, "step": 11425 }, { "epoch": 0.4119364255595199, "grad_norm": 0.14456330239772797, "learning_rate": 4.964130089021329e-05, "loss": 0.4752, "step": 11430 }, { "epoch": 0.4121166252207446, "grad_norm": 0.17111894488334656, "learning_rate": 4.964080817056377e-05, "loss": 0.4575, "step": 11435 }, { "epoch": 0.4122968248819692, "grad_norm": 0.16951219737529755, "learning_rate": 
4.9640315115188004e-05, "loss": 0.4254, "step": 11440 }, { "epoch": 0.41247702454319385, "grad_norm": 0.14794260263442993, "learning_rate": 4.963982172409272e-05, "loss": 0.4429, "step": 11445 }, { "epoch": 0.41265722420441847, "grad_norm": 0.21253348886966705, "learning_rate": 4.963932799728462e-05, "loss": 0.4842, "step": 11450 }, { "epoch": 0.41283742386564315, "grad_norm": 0.19845059514045715, "learning_rate": 4.963883393477046e-05, "loss": 0.4506, "step": 11455 }, { "epoch": 0.41301762352686777, "grad_norm": 0.16377714276313782, "learning_rate": 4.963833953655696e-05, "loss": 0.43, "step": 11460 }, { "epoch": 0.4131978231880924, "grad_norm": 0.13513842225074768, "learning_rate": 4.963784480265085e-05, "loss": 0.4142, "step": 11465 }, { "epoch": 0.413378022849317, "grad_norm": 0.20631147921085358, "learning_rate": 4.963734973305887e-05, "loss": 0.4484, "step": 11470 }, { "epoch": 0.4135582225105417, "grad_norm": 0.14362797141075134, "learning_rate": 4.963685432778777e-05, "loss": 0.4563, "step": 11475 }, { "epoch": 0.4137384221717663, "grad_norm": 0.17292509973049164, "learning_rate": 4.963635858684431e-05, "loss": 0.4537, "step": 11480 }, { "epoch": 0.41391862183299094, "grad_norm": 0.19188092648983002, "learning_rate": 4.963586251023523e-05, "loss": 0.469, "step": 11485 }, { "epoch": 0.41409882149421556, "grad_norm": 0.17507772147655487, "learning_rate": 4.963536609796729e-05, "loss": 0.424, "step": 11490 }, { "epoch": 0.41427902115544024, "grad_norm": 0.13874538242816925, "learning_rate": 4.963486935004725e-05, "loss": 0.457, "step": 11495 }, { "epoch": 0.41445922081666486, "grad_norm": 0.1583535075187683, "learning_rate": 4.96343722664819e-05, "loss": 0.4558, "step": 11500 }, { "epoch": 0.41445922081666486, "eval_loss": 0.4750808775424957, "eval_runtime": 3.597, "eval_samples_per_second": 27.801, "eval_steps_per_second": 6.95, "step": 11500 }, { "epoch": 0.4146394204778895, "grad_norm": 0.1701897829771042, "learning_rate": 4.9633874847277985e-05, "loss": 
0.509, "step": 11505 }, { "epoch": 0.41481962013911416, "grad_norm": 0.20449738204479218, "learning_rate": 4.9633377092442305e-05, "loss": 0.4207, "step": 11510 }, { "epoch": 0.4149998198003388, "grad_norm": 0.21852335333824158, "learning_rate": 4.9632879001981616e-05, "loss": 0.4482, "step": 11515 }, { "epoch": 0.4151800194615634, "grad_norm": 0.1580123007297516, "learning_rate": 4.963238057590273e-05, "loss": 0.4244, "step": 11520 }, { "epoch": 0.41536021912278803, "grad_norm": 0.18125052750110626, "learning_rate": 4.963188181421243e-05, "loss": 0.4599, "step": 11525 }, { "epoch": 0.4155404187840127, "grad_norm": 0.18217013776302338, "learning_rate": 4.9631382716917504e-05, "loss": 0.4347, "step": 11530 }, { "epoch": 0.41572061844523733, "grad_norm": 0.1741897016763687, "learning_rate": 4.9630883284024756e-05, "loss": 0.428, "step": 11535 }, { "epoch": 0.41590081810646196, "grad_norm": 0.20398588478565216, "learning_rate": 4.9630383515541e-05, "loss": 0.4587, "step": 11540 }, { "epoch": 0.4160810177676866, "grad_norm": 0.1447395384311676, "learning_rate": 4.9629883411473025e-05, "loss": 0.4406, "step": 11545 }, { "epoch": 0.41626121742891126, "grad_norm": 0.21012569963932037, "learning_rate": 4.962938297182767e-05, "loss": 0.4901, "step": 11550 }, { "epoch": 0.4164414170901359, "grad_norm": 0.17350612580776215, "learning_rate": 4.962888219661173e-05, "loss": 0.4447, "step": 11555 }, { "epoch": 0.4166216167513605, "grad_norm": 0.167585551738739, "learning_rate": 4.9628381085832046e-05, "loss": 0.4391, "step": 11560 }, { "epoch": 0.4168018164125851, "grad_norm": 0.15864285826683044, "learning_rate": 4.962787963949543e-05, "loss": 0.4621, "step": 11565 }, { "epoch": 0.4169820160738098, "grad_norm": 0.20318114757537842, "learning_rate": 4.9627377857608725e-05, "loss": 0.4257, "step": 11570 }, { "epoch": 0.4171622157350344, "grad_norm": 0.1912168711423874, "learning_rate": 4.962687574017877e-05, "loss": 0.43, "step": 11575 }, { "epoch": 0.41734241539625905, 
"grad_norm": 0.1616230010986328, "learning_rate": 4.962637328721239e-05, "loss": 0.446, "step": 11580 }, { "epoch": 0.41752261505748367, "grad_norm": 0.21484871208667755, "learning_rate": 4.962587049871645e-05, "loss": 0.4384, "step": 11585 }, { "epoch": 0.41770281471870835, "grad_norm": 0.1611786037683487, "learning_rate": 4.9625367374697795e-05, "loss": 0.4294, "step": 11590 }, { "epoch": 0.41788301437993297, "grad_norm": 0.17779512703418732, "learning_rate": 4.9624863915163275e-05, "loss": 0.487, "step": 11595 }, { "epoch": 0.4180632140411576, "grad_norm": 0.16837961971759796, "learning_rate": 4.962436012011975e-05, "loss": 0.4363, "step": 11600 }, { "epoch": 0.4182434137023822, "grad_norm": 0.13357023894786835, "learning_rate": 4.9623855989574086e-05, "loss": 0.4507, "step": 11605 }, { "epoch": 0.4184236133636069, "grad_norm": 0.13329064846038818, "learning_rate": 4.9623351523533144e-05, "loss": 0.4462, "step": 11610 }, { "epoch": 0.4186038130248315, "grad_norm": 0.16751320660114288, "learning_rate": 4.962284672200381e-05, "loss": 0.4459, "step": 11615 }, { "epoch": 0.41878401268605614, "grad_norm": 0.14165711402893066, "learning_rate": 4.962234158499296e-05, "loss": 0.4635, "step": 11620 }, { "epoch": 0.41896421234728076, "grad_norm": 0.20206506550312042, "learning_rate": 4.9621836112507475e-05, "loss": 0.487, "step": 11625 }, { "epoch": 0.41914441200850544, "grad_norm": 0.20387201011180878, "learning_rate": 4.9621330304554234e-05, "loss": 0.4851, "step": 11630 }, { "epoch": 0.41932461166973006, "grad_norm": 0.154767245054245, "learning_rate": 4.962082416114014e-05, "loss": 0.4584, "step": 11635 }, { "epoch": 0.4195048113309547, "grad_norm": 0.1658494919538498, "learning_rate": 4.962031768227208e-05, "loss": 0.4511, "step": 11640 }, { "epoch": 0.4196850109921793, "grad_norm": 0.16036735475063324, "learning_rate": 4.9619810867956954e-05, "loss": 0.4439, "step": 11645 }, { "epoch": 0.419865210653404, "grad_norm": 0.18504783511161804, "learning_rate": 
4.9619303718201685e-05, "loss": 0.4651, "step": 11650 }, { "epoch": 0.4200454103146286, "grad_norm": 0.213784322142601, "learning_rate": 4.9618796233013155e-05, "loss": 0.4465, "step": 11655 }, { "epoch": 0.42022560997585323, "grad_norm": 0.15321993827819824, "learning_rate": 4.96182884123983e-05, "loss": 0.4583, "step": 11660 }, { "epoch": 0.4204058096370779, "grad_norm": 0.23537690937519073, "learning_rate": 4.961778025636402e-05, "loss": 0.5085, "step": 11665 }, { "epoch": 0.42058600929830253, "grad_norm": 0.17639757692813873, "learning_rate": 4.961727176491726e-05, "loss": 0.449, "step": 11670 }, { "epoch": 0.42076620895952715, "grad_norm": 0.14851510524749756, "learning_rate": 4.9616762938064945e-05, "loss": 0.4373, "step": 11675 }, { "epoch": 0.4209464086207518, "grad_norm": 0.19301354885101318, "learning_rate": 4.961625377581399e-05, "loss": 0.4564, "step": 11680 }, { "epoch": 0.42112660828197646, "grad_norm": 0.1511390209197998, "learning_rate": 4.961574427817135e-05, "loss": 0.4522, "step": 11685 }, { "epoch": 0.4213068079432011, "grad_norm": 0.16378375887870789, "learning_rate": 4.9615234445143954e-05, "loss": 0.4472, "step": 11690 }, { "epoch": 0.4214870076044257, "grad_norm": 0.14959801733493805, "learning_rate": 4.961472427673875e-05, "loss": 0.4596, "step": 11695 }, { "epoch": 0.4216672072656503, "grad_norm": 0.16105423867702484, "learning_rate": 4.961421377296271e-05, "loss": 0.4472, "step": 11700 }, { "epoch": 0.421847406926875, "grad_norm": 0.18409523367881775, "learning_rate": 4.9613702933822756e-05, "loss": 0.4599, "step": 11705 }, { "epoch": 0.4220276065880996, "grad_norm": 0.215390145778656, "learning_rate": 4.961319175932588e-05, "loss": 0.4891, "step": 11710 }, { "epoch": 0.42220780624932425, "grad_norm": 0.22909221053123474, "learning_rate": 4.961268024947902e-05, "loss": 0.4779, "step": 11715 }, { "epoch": 0.42238800591054887, "grad_norm": 0.1938232034444809, "learning_rate": 4.961216840428916e-05, "loss": 0.4564, "step": 11720 }, { 
"epoch": 0.42256820557177355, "grad_norm": 0.18986928462982178, "learning_rate": 4.961165622376327e-05, "loss": 0.485, "step": 11725 }, { "epoch": 0.42274840523299817, "grad_norm": 0.1635420173406601, "learning_rate": 4.9611143707908336e-05, "loss": 0.4641, "step": 11730 }, { "epoch": 0.4229286048942228, "grad_norm": 0.2149038463830948, "learning_rate": 4.961063085673132e-05, "loss": 0.5116, "step": 11735 }, { "epoch": 0.4231088045554474, "grad_norm": 0.15662051737308502, "learning_rate": 4.9610117670239235e-05, "loss": 0.4396, "step": 11740 }, { "epoch": 0.4232890042166721, "grad_norm": 0.1819683313369751, "learning_rate": 4.960960414843906e-05, "loss": 0.4357, "step": 11745 }, { "epoch": 0.4234692038778967, "grad_norm": 0.17882205545902252, "learning_rate": 4.960909029133779e-05, "loss": 0.4215, "step": 11750 }, { "epoch": 0.42364940353912134, "grad_norm": 0.1997404247522354, "learning_rate": 4.9608576098942426e-05, "loss": 0.4522, "step": 11755 }, { "epoch": 0.42382960320034596, "grad_norm": 0.18902002274990082, "learning_rate": 4.960806157125998e-05, "loss": 0.4362, "step": 11760 }, { "epoch": 0.42400980286157064, "grad_norm": 0.18085581064224243, "learning_rate": 4.960754670829746e-05, "loss": 0.4583, "step": 11765 }, { "epoch": 0.42419000252279526, "grad_norm": 0.2524605989456177, "learning_rate": 4.960703151006189e-05, "loss": 0.4756, "step": 11770 }, { "epoch": 0.4243702021840199, "grad_norm": 0.1703946441411972, "learning_rate": 4.9606515976560265e-05, "loss": 0.4533, "step": 11775 }, { "epoch": 0.4245504018452445, "grad_norm": 0.16210578382015228, "learning_rate": 4.960600010779963e-05, "loss": 0.4632, "step": 11780 }, { "epoch": 0.4247306015064692, "grad_norm": 0.17518621683120728, "learning_rate": 4.9605483903787006e-05, "loss": 0.4919, "step": 11785 }, { "epoch": 0.4249108011676938, "grad_norm": 0.15813525021076202, "learning_rate": 4.960496736452943e-05, "loss": 0.4522, "step": 11790 }, { "epoch": 0.42509100082891843, "grad_norm": 0.19452042877674103, 
"learning_rate": 4.9604450490033936e-05, "loss": 0.4526, "step": 11795 }, { "epoch": 0.42527120049014305, "grad_norm": 0.21348360180854797, "learning_rate": 4.960393328030757e-05, "loss": 0.4682, "step": 11800 }, { "epoch": 0.42545140015136773, "grad_norm": 0.1495480090379715, "learning_rate": 4.9603415735357374e-05, "loss": 0.4668, "step": 11805 }, { "epoch": 0.42563159981259235, "grad_norm": 0.1405077874660492, "learning_rate": 4.96028978551904e-05, "loss": 0.3944, "step": 11810 }, { "epoch": 0.425811799473817, "grad_norm": 0.18521526455879211, "learning_rate": 4.96023796398137e-05, "loss": 0.4723, "step": 11815 }, { "epoch": 0.4259919991350416, "grad_norm": 0.22655785083770752, "learning_rate": 4.9601861089234355e-05, "loss": 0.4484, "step": 11820 }, { "epoch": 0.4261721987962663, "grad_norm": 0.1640099287033081, "learning_rate": 4.9601342203459405e-05, "loss": 0.4423, "step": 11825 }, { "epoch": 0.4263523984574909, "grad_norm": 0.17364855110645294, "learning_rate": 4.960082298249593e-05, "loss": 0.4574, "step": 11830 }, { "epoch": 0.4265325981187155, "grad_norm": 0.18359048664569855, "learning_rate": 4.9600303426351013e-05, "loss": 0.4274, "step": 11835 }, { "epoch": 0.4267127977799402, "grad_norm": 0.1665671169757843, "learning_rate": 4.959978353503172e-05, "loss": 0.4399, "step": 11840 }, { "epoch": 0.4268929974411648, "grad_norm": 0.1284245401620865, "learning_rate": 4.959926330854514e-05, "loss": 0.4541, "step": 11845 }, { "epoch": 0.42707319710238945, "grad_norm": 0.19221022725105286, "learning_rate": 4.9598742746898364e-05, "loss": 0.4692, "step": 11850 }, { "epoch": 0.42725339676361407, "grad_norm": 0.21203385293483734, "learning_rate": 4.959822185009847e-05, "loss": 0.4709, "step": 11855 }, { "epoch": 0.42743359642483875, "grad_norm": 0.15997375547885895, "learning_rate": 4.959770061815258e-05, "loss": 0.4242, "step": 11860 }, { "epoch": 0.42761379608606337, "grad_norm": 0.19017674028873444, "learning_rate": 4.959717905106777e-05, "loss": 0.4596, 
"step": 11865 }, { "epoch": 0.427793995747288, "grad_norm": 0.19879359006881714, "learning_rate": 4.9596657148851154e-05, "loss": 0.4626, "step": 11870 }, { "epoch": 0.4279741954085126, "grad_norm": 0.14506494998931885, "learning_rate": 4.9596134911509865e-05, "loss": 0.4507, "step": 11875 }, { "epoch": 0.4281543950697373, "grad_norm": 0.16767846047878265, "learning_rate": 4.959561233905098e-05, "loss": 0.4191, "step": 11880 }, { "epoch": 0.4283345947309619, "grad_norm": 0.1973155289888382, "learning_rate": 4.9595089431481645e-05, "loss": 0.4705, "step": 11885 }, { "epoch": 0.42851479439218654, "grad_norm": 0.16755364835262299, "learning_rate": 4.9594566188808985e-05, "loss": 0.421, "step": 11890 }, { "epoch": 0.42869499405341116, "grad_norm": 0.18635892868041992, "learning_rate": 4.959404261104012e-05, "loss": 0.4883, "step": 11895 }, { "epoch": 0.42887519371463584, "grad_norm": 0.18180792033672333, "learning_rate": 4.959351869818218e-05, "loss": 0.4775, "step": 11900 }, { "epoch": 0.42905539337586046, "grad_norm": 0.1798887848854065, "learning_rate": 4.9592994450242316e-05, "loss": 0.4618, "step": 11905 }, { "epoch": 0.4292355930370851, "grad_norm": 0.23241207003593445, "learning_rate": 4.9592469867227655e-05, "loss": 0.4479, "step": 11910 }, { "epoch": 0.4294157926983097, "grad_norm": 0.18085531890392303, "learning_rate": 4.959194494914537e-05, "loss": 0.4617, "step": 11915 }, { "epoch": 0.4295959923595344, "grad_norm": 0.1726156622171402, "learning_rate": 4.9591419696002575e-05, "loss": 0.4501, "step": 11920 }, { "epoch": 0.429776192020759, "grad_norm": 0.16227740049362183, "learning_rate": 4.9590894107806454e-05, "loss": 0.4358, "step": 11925 }, { "epoch": 0.42995639168198363, "grad_norm": 0.15402673184871674, "learning_rate": 4.959036818456417e-05, "loss": 0.44, "step": 11930 }, { "epoch": 0.43013659134320825, "grad_norm": 0.1438482403755188, "learning_rate": 4.958984192628288e-05, "loss": 0.463, "step": 11935 }, { "epoch": 0.43031679100443293, "grad_norm": 
0.18575410544872284, "learning_rate": 4.958931533296975e-05, "loss": 0.4778, "step": 11940 }, { "epoch": 0.43049699066565755, "grad_norm": 0.17367704212665558, "learning_rate": 4.958878840463196e-05, "loss": 0.4159, "step": 11945 }, { "epoch": 0.4306771903268822, "grad_norm": 0.1562519371509552, "learning_rate": 4.958826114127668e-05, "loss": 0.4685, "step": 11950 }, { "epoch": 0.4308573899881068, "grad_norm": 0.13806965947151184, "learning_rate": 4.958773354291111e-05, "loss": 0.4313, "step": 11955 }, { "epoch": 0.4310375896493315, "grad_norm": 0.19402550160884857, "learning_rate": 4.958720560954243e-05, "loss": 0.4232, "step": 11960 }, { "epoch": 0.4312177893105561, "grad_norm": 0.18233174085617065, "learning_rate": 4.958667734117784e-05, "loss": 0.4742, "step": 11965 }, { "epoch": 0.4313979889717807, "grad_norm": 0.16293558478355408, "learning_rate": 4.958614873782452e-05, "loss": 0.4507, "step": 11970 }, { "epoch": 0.43157818863300534, "grad_norm": 0.23064178228378296, "learning_rate": 4.95856197994897e-05, "loss": 0.4566, "step": 11975 }, { "epoch": 0.43175838829423, "grad_norm": 0.14595326781272888, "learning_rate": 4.958509052618055e-05, "loss": 0.4589, "step": 11980 }, { "epoch": 0.43193858795545464, "grad_norm": 0.13431671261787415, "learning_rate": 4.958456091790431e-05, "loss": 0.4459, "step": 11985 }, { "epoch": 0.43211878761667927, "grad_norm": 0.17889688909053802, "learning_rate": 4.9584030974668195e-05, "loss": 0.441, "step": 11990 }, { "epoch": 0.4322989872779039, "grad_norm": 0.18637695908546448, "learning_rate": 4.958350069647941e-05, "loss": 0.4372, "step": 11995 }, { "epoch": 0.43247918693912857, "grad_norm": 0.20668061077594757, "learning_rate": 4.958297008334519e-05, "loss": 0.4777, "step": 12000 }, { "epoch": 0.43247918693912857, "eval_loss": 0.474940687417984, "eval_runtime": 3.5638, "eval_samples_per_second": 28.06, "eval_steps_per_second": 7.015, "step": 12000 }, { "epoch": 0.4326593866003532, "grad_norm": 0.2014397382736206, 
"learning_rate": 4.958243913527276e-05, "loss": 0.4991, "step": 12005 }, { "epoch": 0.4328395862615778, "grad_norm": 0.17615807056427002, "learning_rate": 4.958190785226936e-05, "loss": 0.47, "step": 12010 }, { "epoch": 0.4330197859228025, "grad_norm": 0.20073723793029785, "learning_rate": 4.958137623434222e-05, "loss": 0.4421, "step": 12015 }, { "epoch": 0.4331999855840271, "grad_norm": 0.15735125541687012, "learning_rate": 4.958084428149859e-05, "loss": 0.4472, "step": 12020 }, { "epoch": 0.43338018524525174, "grad_norm": 0.21121670305728912, "learning_rate": 4.9580311993745715e-05, "loss": 0.451, "step": 12025 }, { "epoch": 0.43356038490647636, "grad_norm": 0.17626595497131348, "learning_rate": 4.957977937109085e-05, "loss": 0.4332, "step": 12030 }, { "epoch": 0.43374058456770104, "grad_norm": 0.18630550801753998, "learning_rate": 4.9579246413541245e-05, "loss": 0.4434, "step": 12035 }, { "epoch": 0.43392078422892566, "grad_norm": 0.17722322046756744, "learning_rate": 4.957871312110417e-05, "loss": 0.4437, "step": 12040 }, { "epoch": 0.4341009838901503, "grad_norm": 0.1969458907842636, "learning_rate": 4.9578179493786884e-05, "loss": 0.4555, "step": 12045 }, { "epoch": 0.4342811835513749, "grad_norm": 0.19826200604438782, "learning_rate": 4.9577645531596666e-05, "loss": 0.4186, "step": 12050 }, { "epoch": 0.4344613832125996, "grad_norm": 0.2032472789287567, "learning_rate": 4.957711123454079e-05, "loss": 0.4377, "step": 12055 }, { "epoch": 0.4346415828738242, "grad_norm": 0.20564451813697815, "learning_rate": 4.957657660262652e-05, "loss": 0.4529, "step": 12060 }, { "epoch": 0.43482178253504883, "grad_norm": 0.2119935005903244, "learning_rate": 4.957604163586116e-05, "loss": 0.4644, "step": 12065 }, { "epoch": 0.43500198219627345, "grad_norm": 0.15134219825267792, "learning_rate": 4.9575506334251984e-05, "loss": 0.4497, "step": 12070 }, { "epoch": 0.43518218185749813, "grad_norm": 0.19318123161792755, "learning_rate": 4.95749706978063e-05, "loss": 0.4525, 
"step": 12075 }, { "epoch": 0.43536238151872275, "grad_norm": 0.19543355703353882, "learning_rate": 4.9574434726531395e-05, "loss": 0.4887, "step": 12080 }, { "epoch": 0.4355425811799474, "grad_norm": 0.17110168933868408, "learning_rate": 4.957389842043457e-05, "loss": 0.4431, "step": 12085 }, { "epoch": 0.435722780841172, "grad_norm": 0.18978413939476013, "learning_rate": 4.957336177952314e-05, "loss": 0.46, "step": 12090 }, { "epoch": 0.4359029805023967, "grad_norm": 0.22278402745723724, "learning_rate": 4.957282480380442e-05, "loss": 0.4595, "step": 12095 }, { "epoch": 0.4360831801636213, "grad_norm": 0.17783726751804352, "learning_rate": 4.957228749328571e-05, "loss": 0.4289, "step": 12100 }, { "epoch": 0.4362633798248459, "grad_norm": 0.2311362475156784, "learning_rate": 4.957174984797434e-05, "loss": 0.4326, "step": 12105 }, { "epoch": 0.43644357948607054, "grad_norm": 0.1933164894580841, "learning_rate": 4.957121186787764e-05, "loss": 0.4614, "step": 12110 }, { "epoch": 0.4366237791472952, "grad_norm": 0.1860789954662323, "learning_rate": 4.957067355300293e-05, "loss": 0.4762, "step": 12115 }, { "epoch": 0.43680397880851984, "grad_norm": 0.18945522606372833, "learning_rate": 4.9570134903357556e-05, "loss": 0.4207, "step": 12120 }, { "epoch": 0.43698417846974447, "grad_norm": 0.21275950968265533, "learning_rate": 4.956959591894885e-05, "loss": 0.4836, "step": 12125 }, { "epoch": 0.4371643781309691, "grad_norm": 0.18478412926197052, "learning_rate": 4.956905659978416e-05, "loss": 0.4406, "step": 12130 }, { "epoch": 0.43734457779219377, "grad_norm": 0.16761377453804016, "learning_rate": 4.9568516945870825e-05, "loss": 0.4556, "step": 12135 }, { "epoch": 0.4375247774534184, "grad_norm": 0.14763802289962769, "learning_rate": 4.9567976957216204e-05, "loss": 0.4445, "step": 12140 }, { "epoch": 0.437704977114643, "grad_norm": 0.1819683164358139, "learning_rate": 4.956743663382766e-05, "loss": 0.4411, "step": 12145 }, { "epoch": 0.43788517677586763, "grad_norm": 
0.1549748033285141, "learning_rate": 4.9566895975712533e-05, "loss": 0.444, "step": 12150 }, { "epoch": 0.4380653764370923, "grad_norm": 0.16789652407169342, "learning_rate": 4.9566354982878215e-05, "loss": 0.4636, "step": 12155 }, { "epoch": 0.43824557609831694, "grad_norm": 0.15570096671581268, "learning_rate": 4.956581365533207e-05, "loss": 0.4223, "step": 12160 }, { "epoch": 0.43842577575954156, "grad_norm": 0.20014168322086334, "learning_rate": 4.956527199308146e-05, "loss": 0.429, "step": 12165 }, { "epoch": 0.43860597542076624, "grad_norm": 0.2379717230796814, "learning_rate": 4.956472999613379e-05, "loss": 0.4862, "step": 12170 }, { "epoch": 0.43878617508199086, "grad_norm": 0.1970072239637375, "learning_rate": 4.956418766449642e-05, "loss": 0.4299, "step": 12175 }, { "epoch": 0.4389663747432155, "grad_norm": 0.19804427027702332, "learning_rate": 4.956364499817674e-05, "loss": 0.4662, "step": 12180 }, { "epoch": 0.4391465744044401, "grad_norm": 0.18238474428653717, "learning_rate": 4.956310199718217e-05, "loss": 0.4628, "step": 12185 }, { "epoch": 0.4393267740656648, "grad_norm": 0.1626834273338318, "learning_rate": 4.956255866152008e-05, "loss": 0.445, "step": 12190 }, { "epoch": 0.4395069737268894, "grad_norm": 0.18369080126285553, "learning_rate": 4.956201499119788e-05, "loss": 0.4526, "step": 12195 }, { "epoch": 0.439687173388114, "grad_norm": 0.1799646019935608, "learning_rate": 4.956147098622299e-05, "loss": 0.4295, "step": 12200 }, { "epoch": 0.43986737304933865, "grad_norm": 0.13832755386829376, "learning_rate": 4.9560926646602813e-05, "loss": 0.4452, "step": 12205 }, { "epoch": 0.44004757271056333, "grad_norm": 0.1626981645822525, "learning_rate": 4.9560381972344765e-05, "loss": 0.4387, "step": 12210 }, { "epoch": 0.44022777237178795, "grad_norm": 0.15181361138820648, "learning_rate": 4.955983696345626e-05, "loss": 0.4615, "step": 12215 }, { "epoch": 0.4404079720330126, "grad_norm": 0.1977403461933136, "learning_rate": 4.955929161994474e-05, 
"loss": 0.429, "step": 12220 }, { "epoch": 0.4405881716942372, "grad_norm": 0.1452832669019699, "learning_rate": 4.955874594181763e-05, "loss": 0.4529, "step": 12225 }, { "epoch": 0.4407683713554619, "grad_norm": 0.15825548768043518, "learning_rate": 4.955819992908235e-05, "loss": 0.4748, "step": 12230 }, { "epoch": 0.4409485710166865, "grad_norm": 0.17782393097877502, "learning_rate": 4.9557653581746355e-05, "loss": 0.4872, "step": 12235 }, { "epoch": 0.4411287706779111, "grad_norm": 0.18894019722938538, "learning_rate": 4.955710689981708e-05, "loss": 0.4215, "step": 12240 }, { "epoch": 0.44130897033913574, "grad_norm": 0.18643000721931458, "learning_rate": 4.955655988330199e-05, "loss": 0.4314, "step": 12245 }, { "epoch": 0.4414891700003604, "grad_norm": 0.17689816653728485, "learning_rate": 4.955601253220852e-05, "loss": 0.4323, "step": 12250 }, { "epoch": 0.44166936966158504, "grad_norm": 0.19838203489780426, "learning_rate": 4.955546484654413e-05, "loss": 0.47, "step": 12255 }, { "epoch": 0.44184956932280967, "grad_norm": 0.17451122403144836, "learning_rate": 4.955491682631629e-05, "loss": 0.4202, "step": 12260 }, { "epoch": 0.4420297689840343, "grad_norm": 0.16732750833034515, "learning_rate": 4.955436847153246e-05, "loss": 0.4592, "step": 12265 }, { "epoch": 0.44220996864525897, "grad_norm": 0.17255975306034088, "learning_rate": 4.955381978220011e-05, "loss": 0.4607, "step": 12270 }, { "epoch": 0.4423901683064836, "grad_norm": 0.20615766942501068, "learning_rate": 4.955327075832672e-05, "loss": 0.4488, "step": 12275 }, { "epoch": 0.4425703679677082, "grad_norm": 0.16766251623630524, "learning_rate": 4.955272139991978e-05, "loss": 0.4407, "step": 12280 }, { "epoch": 0.44275056762893283, "grad_norm": 0.16281658411026, "learning_rate": 4.955217170698675e-05, "loss": 0.4417, "step": 12285 }, { "epoch": 0.4429307672901575, "grad_norm": 0.1925809234380722, "learning_rate": 4.955162167953514e-05, "loss": 0.4254, "step": 12290 }, { "epoch": 0.44311096695138213, 
"grad_norm": 0.15888631343841553, "learning_rate": 4.955107131757244e-05, "loss": 0.4305, "step": 12295 }, { "epoch": 0.44329116661260676, "grad_norm": 0.153702974319458, "learning_rate": 4.955052062110615e-05, "loss": 0.4306, "step": 12300 }, { "epoch": 0.4434713662738314, "grad_norm": 0.190146803855896, "learning_rate": 4.9549969590143765e-05, "loss": 0.4711, "step": 12305 }, { "epoch": 0.44365156593505606, "grad_norm": 0.150401771068573, "learning_rate": 4.9549418224692795e-05, "loss": 0.4323, "step": 12310 }, { "epoch": 0.4438317655962807, "grad_norm": 0.17060506343841553, "learning_rate": 4.954886652476076e-05, "loss": 0.4777, "step": 12315 }, { "epoch": 0.4440119652575053, "grad_norm": 0.16213923692703247, "learning_rate": 4.9548314490355165e-05, "loss": 0.4736, "step": 12320 }, { "epoch": 0.4441921649187299, "grad_norm": 0.17394715547561646, "learning_rate": 4.954776212148354e-05, "loss": 0.4731, "step": 12325 }, { "epoch": 0.4443723645799546, "grad_norm": 0.16405607759952545, "learning_rate": 4.95472094181534e-05, "loss": 0.42, "step": 12330 }, { "epoch": 0.4445525642411792, "grad_norm": 0.13627347350120544, "learning_rate": 4.9546656380372306e-05, "loss": 0.4613, "step": 12335 }, { "epoch": 0.44473276390240385, "grad_norm": 0.19392867386341095, "learning_rate": 4.954610300814776e-05, "loss": 0.4315, "step": 12340 }, { "epoch": 0.4449129635636285, "grad_norm": 0.14533652365207672, "learning_rate": 4.9545549301487306e-05, "loss": 0.467, "step": 12345 }, { "epoch": 0.44509316322485315, "grad_norm": 0.17351950705051422, "learning_rate": 4.95449952603985e-05, "loss": 0.432, "step": 12350 }, { "epoch": 0.4452733628860778, "grad_norm": 0.1487894058227539, "learning_rate": 4.9544440884888885e-05, "loss": 0.4856, "step": 12355 }, { "epoch": 0.4454535625473024, "grad_norm": 0.16079506278038025, "learning_rate": 4.954388617496602e-05, "loss": 0.4793, "step": 12360 }, { "epoch": 0.4456337622085271, "grad_norm": 0.16144710779190063, "learning_rate": 
4.954333113063745e-05, "loss": 0.4313, "step": 12365 }, { "epoch": 0.4458139618697517, "grad_norm": 0.1664080023765564, "learning_rate": 4.9542775751910755e-05, "loss": 0.4207, "step": 12370 }, { "epoch": 0.4459941615309763, "grad_norm": 0.16213445365428925, "learning_rate": 4.954222003879349e-05, "loss": 0.4896, "step": 12375 }, { "epoch": 0.44617436119220094, "grad_norm": 0.15756379067897797, "learning_rate": 4.954166399129322e-05, "loss": 0.4438, "step": 12380 }, { "epoch": 0.4463545608534256, "grad_norm": 0.21012753248214722, "learning_rate": 4.954110760941754e-05, "loss": 0.4689, "step": 12385 }, { "epoch": 0.44653476051465024, "grad_norm": 0.1980627030134201, "learning_rate": 4.954055089317401e-05, "loss": 0.4605, "step": 12390 }, { "epoch": 0.44671496017587486, "grad_norm": 0.19075866043567657, "learning_rate": 4.9539993842570226e-05, "loss": 0.4795, "step": 12395 }, { "epoch": 0.4468951598370995, "grad_norm": 0.19187109172344208, "learning_rate": 4.953943645761378e-05, "loss": 0.4509, "step": 12400 }, { "epoch": 0.44707535949832417, "grad_norm": 0.16127610206604004, "learning_rate": 4.9538878738312265e-05, "loss": 0.4858, "step": 12405 }, { "epoch": 0.4472555591595488, "grad_norm": 0.1766381561756134, "learning_rate": 4.953832068467328e-05, "loss": 0.4753, "step": 12410 }, { "epoch": 0.4474357588207734, "grad_norm": 0.16287901997566223, "learning_rate": 4.953776229670442e-05, "loss": 0.4756, "step": 12415 }, { "epoch": 0.44761595848199803, "grad_norm": 0.15042343735694885, "learning_rate": 4.9537203574413305e-05, "loss": 0.408, "step": 12420 }, { "epoch": 0.4477961581432227, "grad_norm": 0.18103763461112976, "learning_rate": 4.953664451780754e-05, "loss": 0.4455, "step": 12425 }, { "epoch": 0.44797635780444733, "grad_norm": 0.23670852184295654, "learning_rate": 4.953608512689474e-05, "loss": 0.4823, "step": 12430 }, { "epoch": 0.44815655746567196, "grad_norm": 0.21633177995681763, "learning_rate": 4.9535525401682535e-05, "loss": 0.4432, "step": 12435 }, { 
"epoch": 0.4483367571268966, "grad_norm": 0.178372323513031, "learning_rate": 4.9534965342178546e-05, "loss": 0.4945, "step": 12440 }, { "epoch": 0.44851695678812126, "grad_norm": 0.2000657618045807, "learning_rate": 4.95344049483904e-05, "loss": 0.5071, "step": 12445 }, { "epoch": 0.4486971564493459, "grad_norm": 0.15917164087295532, "learning_rate": 4.953384422032574e-05, "loss": 0.4075, "step": 12450 }, { "epoch": 0.4488773561105705, "grad_norm": 0.14177782833576202, "learning_rate": 4.9533283157992206e-05, "loss": 0.4403, "step": 12455 }, { "epoch": 0.4490575557717951, "grad_norm": 0.17211756110191345, "learning_rate": 4.953272176139744e-05, "loss": 0.4176, "step": 12460 }, { "epoch": 0.4492377554330198, "grad_norm": 0.14136864244937897, "learning_rate": 4.953216003054908e-05, "loss": 0.4492, "step": 12465 }, { "epoch": 0.4494179550942444, "grad_norm": 0.19541148841381073, "learning_rate": 4.95315979654548e-05, "loss": 0.4768, "step": 12470 }, { "epoch": 0.44959815475546905, "grad_norm": 0.15044446289539337, "learning_rate": 4.953103556612224e-05, "loss": 0.4469, "step": 12475 }, { "epoch": 0.44977835441669367, "grad_norm": 0.20979291200637817, "learning_rate": 4.953047283255907e-05, "loss": 0.4432, "step": 12480 }, { "epoch": 0.44995855407791835, "grad_norm": 0.20431764423847198, "learning_rate": 4.9529909764772956e-05, "loss": 0.4589, "step": 12485 }, { "epoch": 0.45013875373914297, "grad_norm": 0.2022426575422287, "learning_rate": 4.952934636277158e-05, "loss": 0.4544, "step": 12490 }, { "epoch": 0.4503189534003676, "grad_norm": 0.22232398390769958, "learning_rate": 4.95287826265626e-05, "loss": 0.4909, "step": 12495 }, { "epoch": 0.4504991530615922, "grad_norm": 0.18882164359092712, "learning_rate": 4.95282185561537e-05, "loss": 0.4937, "step": 12500 }, { "epoch": 0.4504991530615922, "eval_loss": 0.47478610277175903, "eval_runtime": 3.606, "eval_samples_per_second": 27.732, "eval_steps_per_second": 6.933, "step": 12500 }, { "epoch": 0.4506793527228169, 
"grad_norm": 0.15758134424686432, "learning_rate": 4.952765415155258e-05, "loss": 0.4397, "step": 12505 }, { "epoch": 0.4508595523840415, "grad_norm": 0.15819260478019714, "learning_rate": 4.9527089412766926e-05, "loss": 0.4245, "step": 12510 }, { "epoch": 0.45103975204526614, "grad_norm": 0.20191067457199097, "learning_rate": 4.952652433980442e-05, "loss": 0.4507, "step": 12515 }, { "epoch": 0.4512199517064908, "grad_norm": 0.14131148159503937, "learning_rate": 4.952595893267277e-05, "loss": 0.4334, "step": 12520 }, { "epoch": 0.45140015136771544, "grad_norm": 0.1871136575937271, "learning_rate": 4.9525393191379674e-05, "loss": 0.4563, "step": 12525 }, { "epoch": 0.45158035102894006, "grad_norm": 0.1495848298072815, "learning_rate": 4.952482711593285e-05, "loss": 0.4611, "step": 12530 }, { "epoch": 0.4517605506901647, "grad_norm": 0.1738033890724182, "learning_rate": 4.9524260706339996e-05, "loss": 0.4509, "step": 12535 }, { "epoch": 0.45194075035138936, "grad_norm": 0.2945556938648224, "learning_rate": 4.952369396260884e-05, "loss": 0.4461, "step": 12540 }, { "epoch": 0.452120950012614, "grad_norm": 0.19618237018585205, "learning_rate": 4.952312688474711e-05, "loss": 0.4466, "step": 12545 }, { "epoch": 0.4523011496738386, "grad_norm": 0.1678617298603058, "learning_rate": 4.952255947276252e-05, "loss": 0.474, "step": 12550 }, { "epoch": 0.45248134933506323, "grad_norm": 0.1514994353055954, "learning_rate": 4.95219917266628e-05, "loss": 0.4271, "step": 12555 }, { "epoch": 0.4526615489962879, "grad_norm": 0.14985314011573792, "learning_rate": 4.95214236464557e-05, "loss": 0.4505, "step": 12560 }, { "epoch": 0.45284174865751253, "grad_norm": 0.19544857740402222, "learning_rate": 4.952085523214894e-05, "loss": 0.4893, "step": 12565 }, { "epoch": 0.45302194831873716, "grad_norm": 0.19242407381534576, "learning_rate": 4.9520286483750277e-05, "loss": 0.4616, "step": 12570 }, { "epoch": 0.4532021479799618, "grad_norm": 0.14265184104442596, "learning_rate": 
4.9519717401267465e-05, "loss": 0.4539, "step": 12575 }, { "epoch": 0.45338234764118646, "grad_norm": 0.181315615773201, "learning_rate": 4.9519147984708246e-05, "loss": 0.4585, "step": 12580 }, { "epoch": 0.4535625473024111, "grad_norm": 0.16452257335186005, "learning_rate": 4.9518578234080384e-05, "loss": 0.4469, "step": 12585 }, { "epoch": 0.4537427469636357, "grad_norm": 0.14965125918388367, "learning_rate": 4.951800814939164e-05, "loss": 0.4432, "step": 12590 }, { "epoch": 0.4539229466248603, "grad_norm": 0.16924121975898743, "learning_rate": 4.951743773064978e-05, "loss": 0.4248, "step": 12595 }, { "epoch": 0.454103146286085, "grad_norm": 0.21651718020439148, "learning_rate": 4.951686697786258e-05, "loss": 0.4606, "step": 12600 }, { "epoch": 0.4542833459473096, "grad_norm": 0.1570291370153427, "learning_rate": 4.951629589103781e-05, "loss": 0.4161, "step": 12605 }, { "epoch": 0.45446354560853425, "grad_norm": 0.18217509984970093, "learning_rate": 4.951572447018326e-05, "loss": 0.4096, "step": 12610 }, { "epoch": 0.45464374526975887, "grad_norm": 0.1948910504579544, "learning_rate": 4.951515271530671e-05, "loss": 0.4347, "step": 12615 }, { "epoch": 0.45482394493098355, "grad_norm": 0.1557484269142151, "learning_rate": 4.951458062641595e-05, "loss": 0.44, "step": 12620 }, { "epoch": 0.45500414459220817, "grad_norm": 0.21191979944705963, "learning_rate": 4.951400820351877e-05, "loss": 0.4676, "step": 12625 }, { "epoch": 0.4551843442534328, "grad_norm": 0.20376695692539215, "learning_rate": 4.951343544662298e-05, "loss": 0.4121, "step": 12630 }, { "epoch": 0.4553645439146574, "grad_norm": 0.18317648768424988, "learning_rate": 4.9512862355736376e-05, "loss": 0.4806, "step": 12635 }, { "epoch": 0.4555447435758821, "grad_norm": 0.15849465131759644, "learning_rate": 4.951228893086677e-05, "loss": 0.4107, "step": 12640 }, { "epoch": 0.4557249432371067, "grad_norm": 0.1734628528356552, "learning_rate": 4.951171517202197e-05, "loss": 0.4431, "step": 12645 }, { "epoch": 
0.45590514289833134, "grad_norm": 0.19202375411987305, "learning_rate": 4.95111410792098e-05, "loss": 0.4589, "step": 12650 }, { "epoch": 0.45608534255955596, "grad_norm": 0.1690249890089035, "learning_rate": 4.951056665243807e-05, "loss": 0.4878, "step": 12655 }, { "epoch": 0.45626554222078064, "grad_norm": 0.13854099810123444, "learning_rate": 4.950999189171463e-05, "loss": 0.4422, "step": 12660 }, { "epoch": 0.45644574188200526, "grad_norm": 0.16640843451023102, "learning_rate": 4.9509416797047284e-05, "loss": 0.4411, "step": 12665 }, { "epoch": 0.4566259415432299, "grad_norm": 0.19313612580299377, "learning_rate": 4.9508841368443884e-05, "loss": 0.4788, "step": 12670 }, { "epoch": 0.45680614120445456, "grad_norm": 0.19194231927394867, "learning_rate": 4.9508265605912265e-05, "loss": 0.4654, "step": 12675 }, { "epoch": 0.4569863408656792, "grad_norm": 0.2459125965833664, "learning_rate": 4.950768950946026e-05, "loss": 0.4259, "step": 12680 }, { "epoch": 0.4571665405269038, "grad_norm": 0.17043115198612213, "learning_rate": 4.950711307909575e-05, "loss": 0.4381, "step": 12685 }, { "epoch": 0.45734674018812843, "grad_norm": 0.1516115516424179, "learning_rate": 4.950653631482656e-05, "loss": 0.4153, "step": 12690 }, { "epoch": 0.4575269398493531, "grad_norm": 0.18203213810920715, "learning_rate": 4.9505959216660556e-05, "loss": 0.4587, "step": 12695 }, { "epoch": 0.45770713951057773, "grad_norm": 0.16077850759029388, "learning_rate": 4.9505381784605606e-05, "loss": 0.4684, "step": 12700 }, { "epoch": 0.45788733917180235, "grad_norm": 0.16512863337993622, "learning_rate": 4.9504804018669574e-05, "loss": 0.4445, "step": 12705 }, { "epoch": 0.458067538833027, "grad_norm": 0.16586919128894806, "learning_rate": 4.9504225918860326e-05, "loss": 0.4602, "step": 12710 }, { "epoch": 0.45824773849425166, "grad_norm": 0.1363137811422348, "learning_rate": 4.9503647485185744e-05, "loss": 0.4658, "step": 12715 }, { "epoch": 0.4584279381554763, "grad_norm": 0.18424583971500397, 
"learning_rate": 4.950306871765371e-05, "loss": 0.4875, "step": 12720 }, { "epoch": 0.4586081378167009, "grad_norm": 0.21040771901607513, "learning_rate": 4.9502489616272115e-05, "loss": 0.492, "step": 12725 }, { "epoch": 0.4587883374779255, "grad_norm": 0.15792599320411682, "learning_rate": 4.9501910181048836e-05, "loss": 0.4733, "step": 12730 }, { "epoch": 0.4589685371391502, "grad_norm": 0.1881541758775711, "learning_rate": 4.950133041199177e-05, "loss": 0.4476, "step": 12735 }, { "epoch": 0.4591487368003748, "grad_norm": 0.14302214980125427, "learning_rate": 4.950075030910883e-05, "loss": 0.457, "step": 12740 }, { "epoch": 0.45932893646159945, "grad_norm": 0.16461120545864105, "learning_rate": 4.9500169872407906e-05, "loss": 0.4546, "step": 12745 }, { "epoch": 0.45950913612282407, "grad_norm": 0.18678593635559082, "learning_rate": 4.949958910189692e-05, "loss": 0.4279, "step": 12750 }, { "epoch": 0.45968933578404875, "grad_norm": 0.18721038103103638, "learning_rate": 4.949900799758377e-05, "loss": 0.443, "step": 12755 }, { "epoch": 0.45986953544527337, "grad_norm": 0.1702054888010025, "learning_rate": 4.949842655947637e-05, "loss": 0.4364, "step": 12760 }, { "epoch": 0.460049735106498, "grad_norm": 0.18055669963359833, "learning_rate": 4.9497844787582665e-05, "loss": 0.4579, "step": 12765 }, { "epoch": 0.4602299347677226, "grad_norm": 0.18560905754566193, "learning_rate": 4.949726268191056e-05, "loss": 0.4471, "step": 12770 }, { "epoch": 0.4604101344289473, "grad_norm": 0.16065679490566254, "learning_rate": 4.9496680242467994e-05, "loss": 0.4403, "step": 12775 }, { "epoch": 0.4605903340901719, "grad_norm": 0.1695624440908432, "learning_rate": 4.94960974692629e-05, "loss": 0.4245, "step": 12780 }, { "epoch": 0.46077053375139654, "grad_norm": 0.1966797411441803, "learning_rate": 4.949551436230323e-05, "loss": 0.4664, "step": 12785 }, { "epoch": 0.46095073341262116, "grad_norm": 0.19263668358325958, "learning_rate": 4.9494930921596913e-05, "loss": 0.4823, "step": 
12790 }, { "epoch": 0.46113093307384584, "grad_norm": 0.16463421285152435, "learning_rate": 4.949434714715191e-05, "loss": 0.4511, "step": 12795 }, { "epoch": 0.46131113273507046, "grad_norm": 0.18872933089733124, "learning_rate": 4.949376303897616e-05, "loss": 0.4424, "step": 12800 }, { "epoch": 0.4614913323962951, "grad_norm": 0.14205656945705414, "learning_rate": 4.949317859707764e-05, "loss": 0.4761, "step": 12805 }, { "epoch": 0.4616715320575197, "grad_norm": 0.16842302680015564, "learning_rate": 4.94925938214643e-05, "loss": 0.4496, "step": 12810 }, { "epoch": 0.4618517317187444, "grad_norm": 0.15166863799095154, "learning_rate": 4.9492008712144104e-05, "loss": 0.4274, "step": 12815 }, { "epoch": 0.462031931379969, "grad_norm": 0.17034749686717987, "learning_rate": 4.9491423269125035e-05, "loss": 0.4313, "step": 12820 }, { "epoch": 0.46221213104119363, "grad_norm": 0.1309286206960678, "learning_rate": 4.9490837492415074e-05, "loss": 0.4346, "step": 12825 }, { "epoch": 0.46239233070241825, "grad_norm": 0.18976598978042603, "learning_rate": 4.949025138202218e-05, "loss": 0.4361, "step": 12830 }, { "epoch": 0.46257253036364293, "grad_norm": 0.1648472547531128, "learning_rate": 4.948966493795437e-05, "loss": 0.4638, "step": 12835 }, { "epoch": 0.46275273002486755, "grad_norm": 0.18952538073062897, "learning_rate": 4.94890781602196e-05, "loss": 0.4521, "step": 12840 }, { "epoch": 0.4629329296860922, "grad_norm": 0.15713319182395935, "learning_rate": 4.9488491048825894e-05, "loss": 0.4122, "step": 12845 }, { "epoch": 0.46311312934731685, "grad_norm": 0.16628234088420868, "learning_rate": 4.9487903603781225e-05, "loss": 0.4013, "step": 12850 }, { "epoch": 0.4632933290085415, "grad_norm": 0.17581500113010406, "learning_rate": 4.948731582509362e-05, "loss": 0.4685, "step": 12855 }, { "epoch": 0.4634735286697661, "grad_norm": 0.1562207192182541, "learning_rate": 4.948672771277107e-05, "loss": 0.4381, "step": 12860 }, { "epoch": 0.4636537283309907, "grad_norm": 
0.16530804336071014, "learning_rate": 4.9486139266821606e-05, "loss": 0.4846, "step": 12865 }, { "epoch": 0.4638339279922154, "grad_norm": 0.17569473385810852, "learning_rate": 4.948555048725323e-05, "loss": 0.4499, "step": 12870 }, { "epoch": 0.46401412765344, "grad_norm": 0.16137835383415222, "learning_rate": 4.948496137407397e-05, "loss": 0.4263, "step": 12875 }, { "epoch": 0.46419432731466465, "grad_norm": 0.18716813623905182, "learning_rate": 4.948437192729186e-05, "loss": 0.4634, "step": 12880 }, { "epoch": 0.46437452697588927, "grad_norm": 0.17155876755714417, "learning_rate": 4.948378214691491e-05, "loss": 0.4479, "step": 12885 }, { "epoch": 0.46455472663711395, "grad_norm": 0.16690602898597717, "learning_rate": 4.948319203295117e-05, "loss": 0.409, "step": 12890 }, { "epoch": 0.46473492629833857, "grad_norm": 0.18938426673412323, "learning_rate": 4.9482601585408684e-05, "loss": 0.4384, "step": 12895 }, { "epoch": 0.4649151259595632, "grad_norm": 0.1318012923002243, "learning_rate": 4.9482010804295485e-05, "loss": 0.4067, "step": 12900 }, { "epoch": 0.4650953256207878, "grad_norm": 0.15733405947685242, "learning_rate": 4.9481419689619635e-05, "loss": 0.4644, "step": 12905 }, { "epoch": 0.4652755252820125, "grad_norm": 0.18979695439338684, "learning_rate": 4.9480828241389176e-05, "loss": 0.469, "step": 12910 }, { "epoch": 0.4654557249432371, "grad_norm": 0.23890700936317444, "learning_rate": 4.948023645961218e-05, "loss": 0.468, "step": 12915 }, { "epoch": 0.46563592460446174, "grad_norm": 0.13900893926620483, "learning_rate": 4.9479644344296694e-05, "loss": 0.4231, "step": 12920 }, { "epoch": 0.46581612426568636, "grad_norm": 0.16857506334781647, "learning_rate": 4.94790518954508e-05, "loss": 0.3977, "step": 12925 }, { "epoch": 0.46599632392691104, "grad_norm": 0.2040061503648758, "learning_rate": 4.9478459113082556e-05, "loss": 0.4529, "step": 12930 }, { "epoch": 0.46617652358813566, "grad_norm": 0.1292639821767807, "learning_rate": 4.9477865997200044e-05, 
"loss": 0.4345, "step": 12935 }, { "epoch": 0.4663567232493603, "grad_norm": 0.1888824999332428, "learning_rate": 4.9477272547811354e-05, "loss": 0.436, "step": 12940 }, { "epoch": 0.4665369229105849, "grad_norm": 0.18811088800430298, "learning_rate": 4.947667876492457e-05, "loss": 0.4403, "step": 12945 }, { "epoch": 0.4667171225718096, "grad_norm": 0.17005407810211182, "learning_rate": 4.947608464854776e-05, "loss": 0.4412, "step": 12950 }, { "epoch": 0.4668973222330342, "grad_norm": 0.17521242797374725, "learning_rate": 4.9475490198689044e-05, "loss": 0.4371, "step": 12955 }, { "epoch": 0.46707752189425883, "grad_norm": 0.21031251549720764, "learning_rate": 4.947489541535651e-05, "loss": 0.4775, "step": 12960 }, { "epoch": 0.46725772155548345, "grad_norm": 0.19381965696811676, "learning_rate": 4.947430029855827e-05, "loss": 0.4625, "step": 12965 }, { "epoch": 0.46743792121670813, "grad_norm": 0.19049398601055145, "learning_rate": 4.9473704848302424e-05, "loss": 0.4648, "step": 12970 }, { "epoch": 0.46761812087793275, "grad_norm": 0.17350056767463684, "learning_rate": 4.947310906459709e-05, "loss": 0.4143, "step": 12975 }, { "epoch": 0.4677983205391574, "grad_norm": 0.20652742683887482, "learning_rate": 4.947251294745038e-05, "loss": 0.4555, "step": 12980 }, { "epoch": 0.467978520200382, "grad_norm": 0.1818598359823227, "learning_rate": 4.947191649687042e-05, "loss": 0.4312, "step": 12985 }, { "epoch": 0.4681587198616067, "grad_norm": 0.17591311037540436, "learning_rate": 4.947131971286534e-05, "loss": 0.4311, "step": 12990 }, { "epoch": 0.4683389195228313, "grad_norm": 0.14075951278209686, "learning_rate": 4.947072259544326e-05, "loss": 0.4396, "step": 12995 }, { "epoch": 0.4685191191840559, "grad_norm": 0.20111533999443054, "learning_rate": 4.947012514461232e-05, "loss": 0.4697, "step": 13000 }, { "epoch": 0.4685191191840559, "eval_loss": 0.4741581976413727, "eval_runtime": 3.5647, "eval_samples_per_second": 28.053, "eval_steps_per_second": 7.013, "step": 13000 
}, { "epoch": 0.46869931884528054, "grad_norm": 0.1940506100654602, "learning_rate": 4.9469527360380676e-05, "loss": 0.4683, "step": 13005 }, { "epoch": 0.4688795185065052, "grad_norm": 0.16118963062763214, "learning_rate": 4.946892924275645e-05, "loss": 0.4534, "step": 13010 }, { "epoch": 0.46905971816772984, "grad_norm": 0.1373254507780075, "learning_rate": 4.9468330791747795e-05, "loss": 0.4503, "step": 13015 }, { "epoch": 0.46923991782895447, "grad_norm": 0.1803598254919052, "learning_rate": 4.946773200736288e-05, "loss": 0.481, "step": 13020 }, { "epoch": 0.46942011749017915, "grad_norm": 0.18587857484817505, "learning_rate": 4.9467132889609845e-05, "loss": 0.4849, "step": 13025 }, { "epoch": 0.46960031715140377, "grad_norm": 0.17999742925167084, "learning_rate": 4.946653343849686e-05, "loss": 0.4621, "step": 13030 }, { "epoch": 0.4697805168126284, "grad_norm": 0.17007941007614136, "learning_rate": 4.9465933654032106e-05, "loss": 0.441, "step": 13035 }, { "epoch": 0.469960716473853, "grad_norm": 0.1974543035030365, "learning_rate": 4.9465333536223734e-05, "loss": 0.4479, "step": 13040 }, { "epoch": 0.4701409161350777, "grad_norm": 0.1567683070898056, "learning_rate": 4.946473308507993e-05, "loss": 0.4881, "step": 13045 }, { "epoch": 0.4703211157963023, "grad_norm": 0.17787274718284607, "learning_rate": 4.9464132300608876e-05, "loss": 0.4597, "step": 13050 }, { "epoch": 0.47050131545752694, "grad_norm": 0.2110513597726822, "learning_rate": 4.9463531182818756e-05, "loss": 0.4353, "step": 13055 }, { "epoch": 0.47068151511875156, "grad_norm": 0.19464978575706482, "learning_rate": 4.946292973171777e-05, "loss": 0.419, "step": 13060 }, { "epoch": 0.47086171477997624, "grad_norm": 0.18189744651317596, "learning_rate": 4.946232794731408e-05, "loss": 0.463, "step": 13065 }, { "epoch": 0.47104191444120086, "grad_norm": 0.21045507490634918, "learning_rate": 4.946172582961593e-05, "loss": 0.5024, "step": 13070 }, { "epoch": 0.4712221141024255, "grad_norm": 
0.16427326202392578, "learning_rate": 4.946112337863148e-05, "loss": 0.4796, "step": 13075 }, { "epoch": 0.4714023137636501, "grad_norm": 0.18981902301311493, "learning_rate": 4.9460520594368975e-05, "loss": 0.4395, "step": 13080 }, { "epoch": 0.4715825134248748, "grad_norm": 0.20156720280647278, "learning_rate": 4.945991747683661e-05, "loss": 0.4865, "step": 13085 }, { "epoch": 0.4717627130860994, "grad_norm": 0.14787843823432922, "learning_rate": 4.9459314026042605e-05, "loss": 0.4583, "step": 13090 }, { "epoch": 0.47194291274732403, "grad_norm": 0.1643127053976059, "learning_rate": 4.9458710241995174e-05, "loss": 0.4685, "step": 13095 }, { "epoch": 0.47212311240854865, "grad_norm": 0.19920234382152557, "learning_rate": 4.9458106124702565e-05, "loss": 0.453, "step": 13100 }, { "epoch": 0.47230331206977333, "grad_norm": 0.16437123715877533, "learning_rate": 4.945750167417299e-05, "loss": 0.4052, "step": 13105 }, { "epoch": 0.47248351173099795, "grad_norm": 0.1786852478981018, "learning_rate": 4.945689689041468e-05, "loss": 0.4609, "step": 13110 }, { "epoch": 0.4726637113922226, "grad_norm": null, "learning_rate": 4.9456412823488897e-05, "loss": 0.4437, "step": 13115 }, { "epoch": 0.4728439110534472, "grad_norm": 0.18749703466892242, "learning_rate": 4.9455807439939664e-05, "loss": 0.4415, "step": 13120 }, { "epoch": 0.4730241107146719, "grad_norm": 0.15940409898757935, "learning_rate": 4.9455201723184773e-05, "loss": 0.4648, "step": 13125 }, { "epoch": 0.4732043103758965, "grad_norm": 0.1593160629272461, "learning_rate": 4.9454595673232505e-05, "loss": 0.4266, "step": 13130 }, { "epoch": 0.4733845100371211, "grad_norm": 0.1828412562608719, "learning_rate": 4.9453989290091106e-05, "loss": 0.404, "step": 13135 }, { "epoch": 0.47356470969834574, "grad_norm": 0.16431830823421478, "learning_rate": 4.945338257376884e-05, "loss": 0.487, "step": 13140 }, { "epoch": 0.4737449093595704, "grad_norm": 0.17641369998455048, "learning_rate": 4.9452775524273963e-05, "loss": 
0.4485, "step": 13145 }, { "epoch": 0.47392510902079504, "grad_norm": 0.1741415560245514, "learning_rate": 4.9452168141614754e-05, "loss": 0.4163, "step": 13150 }, { "epoch": 0.47410530868201967, "grad_norm": 0.20057284832000732, "learning_rate": 4.9451560425799495e-05, "loss": 0.4494, "step": 13155 }, { "epoch": 0.4742855083432443, "grad_norm": 0.16417837142944336, "learning_rate": 4.9450952376836454e-05, "loss": 0.4283, "step": 13160 }, { "epoch": 0.47446570800446897, "grad_norm": 0.14967764914035797, "learning_rate": 4.945034399473392e-05, "loss": 0.441, "step": 13165 }, { "epoch": 0.4746459076656936, "grad_norm": 0.17966294288635254, "learning_rate": 4.944973527950019e-05, "loss": 0.4765, "step": 13170 }, { "epoch": 0.4748261073269182, "grad_norm": 0.18716269731521606, "learning_rate": 4.944912623114354e-05, "loss": 0.4723, "step": 13175 }, { "epoch": 0.47500630698814283, "grad_norm": 0.16850250959396362, "learning_rate": 4.9448516849672285e-05, "loss": 0.4771, "step": 13180 }, { "epoch": 0.4751865066493675, "grad_norm": 0.1683826893568039, "learning_rate": 4.944790713509472e-05, "loss": 0.4295, "step": 13185 }, { "epoch": 0.47536670631059214, "grad_norm": 0.1629522442817688, "learning_rate": 4.9447297087419155e-05, "loss": 0.4116, "step": 13190 }, { "epoch": 0.47554690597181676, "grad_norm": 0.16249153017997742, "learning_rate": 4.9446686706653896e-05, "loss": 0.431, "step": 13195 }, { "epoch": 0.47572710563304144, "grad_norm": 0.16128148138523102, "learning_rate": 4.944607599280726e-05, "loss": 0.4552, "step": 13200 }, { "epoch": 0.47590730529426606, "grad_norm": 0.1829308420419693, "learning_rate": 4.944546494588758e-05, "loss": 0.4313, "step": 13205 }, { "epoch": 0.4760875049554907, "grad_norm": 0.1804686188697815, "learning_rate": 4.944485356590317e-05, "loss": 0.4562, "step": 13210 }, { "epoch": 0.4762677046167153, "grad_norm": 0.17455480992794037, "learning_rate": 4.9444241852862366e-05, "loss": 0.4468, "step": 13215 }, { "epoch": 0.47644790427794, 
"grad_norm": 0.18920566141605377, "learning_rate": 4.944362980677349e-05, "loss": 0.4687, "step": 13220 }, { "epoch": 0.4766281039391646, "grad_norm": 0.19825154542922974, "learning_rate": 4.9443017427644906e-05, "loss": 0.4629, "step": 13225 }, { "epoch": 0.4768083036003892, "grad_norm": 0.1756822019815445, "learning_rate": 4.944240471548493e-05, "loss": 0.4885, "step": 13230 }, { "epoch": 0.47698850326161385, "grad_norm": 0.20122404396533966, "learning_rate": 4.944179167030193e-05, "loss": 0.4641, "step": 13235 }, { "epoch": 0.47716870292283853, "grad_norm": 0.17638641595840454, "learning_rate": 4.944117829210424e-05, "loss": 0.4641, "step": 13240 }, { "epoch": 0.47734890258406315, "grad_norm": 0.20242132246494293, "learning_rate": 4.944056458090024e-05, "loss": 0.4815, "step": 13245 }, { "epoch": 0.4775291022452878, "grad_norm": 0.18638722598552704, "learning_rate": 4.943995053669827e-05, "loss": 0.4504, "step": 13250 }, { "epoch": 0.4777093019065124, "grad_norm": 0.16893352568149567, "learning_rate": 4.943933615950671e-05, "loss": 0.4275, "step": 13255 }, { "epoch": 0.4778895015677371, "grad_norm": 0.16085629165172577, "learning_rate": 4.9438721449333925e-05, "loss": 0.4648, "step": 13260 }, { "epoch": 0.4780697012289617, "grad_norm": 0.1869862973690033, "learning_rate": 4.943810640618829e-05, "loss": 0.4415, "step": 13265 }, { "epoch": 0.4782499008901863, "grad_norm": 0.16547581553459167, "learning_rate": 4.9437491030078185e-05, "loss": 0.5078, "step": 13270 }, { "epoch": 0.47843010055141094, "grad_norm": 0.19443146884441376, "learning_rate": 4.9436875321012e-05, "loss": 0.4237, "step": 13275 }, { "epoch": 0.4786103002126356, "grad_norm": 0.1601037085056305, "learning_rate": 4.943625927899812e-05, "loss": 0.4692, "step": 13280 }, { "epoch": 0.47879049987386024, "grad_norm": 0.17408448457717896, "learning_rate": 4.943564290404494e-05, "loss": 0.4617, "step": 13285 }, { "epoch": 0.47897069953508487, "grad_norm": 0.17495988309383392, "learning_rate": 
4.943502619616085e-05, "loss": 0.4349, "step": 13290 }, { "epoch": 0.4791508991963095, "grad_norm": 0.18326228857040405, "learning_rate": 4.9434409155354266e-05, "loss": 0.4518, "step": 13295 }, { "epoch": 0.47933109885753417, "grad_norm": 0.14436647295951843, "learning_rate": 4.9433791781633584e-05, "loss": 0.4689, "step": 13300 }, { "epoch": 0.4795112985187588, "grad_norm": 0.1709493100643158, "learning_rate": 4.9433174075007216e-05, "loss": 0.4747, "step": 13305 }, { "epoch": 0.4796914981799834, "grad_norm": 0.19501015543937683, "learning_rate": 4.943255603548359e-05, "loss": 0.4446, "step": 13310 }, { "epoch": 0.47987169784120803, "grad_norm": 0.16329129040241241, "learning_rate": 4.943193766307111e-05, "loss": 0.4545, "step": 13315 }, { "epoch": 0.4800518975024327, "grad_norm": 0.16876693069934845, "learning_rate": 4.9431318957778214e-05, "loss": 0.4874, "step": 13320 }, { "epoch": 0.48023209716365733, "grad_norm": 0.18165095150470734, "learning_rate": 4.943069991961333e-05, "loss": 0.4349, "step": 13325 }, { "epoch": 0.48041229682488196, "grad_norm": 0.17460200190544128, "learning_rate": 4.9430080548584884e-05, "loss": 0.4678, "step": 13330 }, { "epoch": 0.4805924964861066, "grad_norm": 0.14548338949680328, "learning_rate": 4.9429460844701325e-05, "loss": 0.4717, "step": 13335 }, { "epoch": 0.48077269614733126, "grad_norm": 0.17974305152893066, "learning_rate": 4.942884080797109e-05, "loss": 0.4617, "step": 13340 }, { "epoch": 0.4809528958085559, "grad_norm": 0.194953054189682, "learning_rate": 4.942822043840262e-05, "loss": 0.4724, "step": 13345 }, { "epoch": 0.4811330954697805, "grad_norm": 0.16658879816532135, "learning_rate": 4.942759973600439e-05, "loss": 0.4236, "step": 13350 }, { "epoch": 0.4813132951310052, "grad_norm": 0.18458372354507446, "learning_rate": 4.9426978700784834e-05, "loss": 0.4801, "step": 13355 }, { "epoch": 0.4814934947922298, "grad_norm": 0.18043503165245056, "learning_rate": 4.942635733275243e-05, "loss": 0.4592, "step": 13360 }, { 
"epoch": 0.4816736944534544, "grad_norm": 0.2545734643936157, "learning_rate": 4.942573563191563e-05, "loss": 0.4656, "step": 13365 }, { "epoch": 0.48185389411467905, "grad_norm": 0.18358397483825684, "learning_rate": 4.9425113598282916e-05, "loss": 0.4401, "step": 13370 }, { "epoch": 0.4820340937759037, "grad_norm": 0.17365938425064087, "learning_rate": 4.942449123186274e-05, "loss": 0.4459, "step": 13375 }, { "epoch": 0.48221429343712835, "grad_norm": 0.15696145594120026, "learning_rate": 4.942386853266362e-05, "loss": 0.4618, "step": 13380 }, { "epoch": 0.48239449309835297, "grad_norm": 0.16444018483161926, "learning_rate": 4.942324550069402e-05, "loss": 0.4885, "step": 13385 }, { "epoch": 0.4825746927595776, "grad_norm": 0.14012210071086884, "learning_rate": 4.942262213596241e-05, "loss": 0.4398, "step": 13390 }, { "epoch": 0.4827548924208023, "grad_norm": 0.23050503432750702, "learning_rate": 4.942199843847732e-05, "loss": 0.4899, "step": 13395 }, { "epoch": 0.4829350920820269, "grad_norm": 0.15428675711154938, "learning_rate": 4.942137440824723e-05, "loss": 0.4707, "step": 13400 }, { "epoch": 0.4831152917432515, "grad_norm": 0.2107590287923813, "learning_rate": 4.9420750045280636e-05, "loss": 0.4507, "step": 13405 }, { "epoch": 0.48329549140447614, "grad_norm": 0.1626407951116562, "learning_rate": 4.942012534958605e-05, "loss": 0.4398, "step": 13410 }, { "epoch": 0.4834756910657008, "grad_norm": 0.14003750681877136, "learning_rate": 4.9419500321171987e-05, "loss": 0.4246, "step": 13415 }, { "epoch": 0.48365589072692544, "grad_norm": 0.16057373583316803, "learning_rate": 4.9418874960046954e-05, "loss": 0.4636, "step": 13420 }, { "epoch": 0.48383609038815006, "grad_norm": 0.1851695328950882, "learning_rate": 4.941824926621948e-05, "loss": 0.4292, "step": 13425 }, { "epoch": 0.4840162900493747, "grad_norm": 0.1717042773962021, "learning_rate": 4.941762323969809e-05, "loss": 0.4782, "step": 13430 }, { "epoch": 0.48419648971059936, "grad_norm": 
0.16300024092197418, "learning_rate": 4.9416996880491305e-05, "loss": 0.4498, "step": 13435 }, { "epoch": 0.484376689371824, "grad_norm": 0.1653520166873932, "learning_rate": 4.941637018860767e-05, "loss": 0.417, "step": 13440 }, { "epoch": 0.4845568890330486, "grad_norm": 0.14384515583515167, "learning_rate": 4.941574316405572e-05, "loss": 0.4253, "step": 13445 }, { "epoch": 0.48473708869427323, "grad_norm": 0.18884818255901337, "learning_rate": 4.9415115806843993e-05, "loss": 0.4521, "step": 13450 }, { "epoch": 0.4849172883554979, "grad_norm": 0.14921367168426514, "learning_rate": 4.941448811698104e-05, "loss": 0.4797, "step": 13455 }, { "epoch": 0.48509748801672253, "grad_norm": 0.16940531134605408, "learning_rate": 4.9413860094475414e-05, "loss": 0.4575, "step": 13460 }, { "epoch": 0.48527768767794716, "grad_norm": 0.23463140428066254, "learning_rate": 4.9413231739335664e-05, "loss": 0.4568, "step": 13465 }, { "epoch": 0.4854578873391718, "grad_norm": 0.14103110134601593, "learning_rate": 4.9412603051570364e-05, "loss": 0.4384, "step": 13470 }, { "epoch": 0.48563808700039646, "grad_norm": 0.1720830202102661, "learning_rate": 4.941197403118808e-05, "loss": 0.4834, "step": 13475 }, { "epoch": 0.4858182866616211, "grad_norm": 0.1815214455127716, "learning_rate": 4.9411344678197366e-05, "loss": 0.4557, "step": 13480 }, { "epoch": 0.4859984863228457, "grad_norm": 0.18314550817012787, "learning_rate": 4.941071499260681e-05, "loss": 0.4589, "step": 13485 }, { "epoch": 0.4861786859840703, "grad_norm": 0.1780148446559906, "learning_rate": 4.9410084974424994e-05, "loss": 0.4338, "step": 13490 }, { "epoch": 0.486358885645295, "grad_norm": 0.17320701479911804, "learning_rate": 4.940945462366049e-05, "loss": 0.4356, "step": 13495 }, { "epoch": 0.4865390853065196, "grad_norm": 0.1759449988603592, "learning_rate": 4.94088239403219e-05, "loss": 0.4147, "step": 13500 }, { "epoch": 0.4865390853065196, "eval_loss": 0.47342750430107117, "eval_runtime": 3.5547, 
"eval_samples_per_second": 28.131, "eval_steps_per_second": 7.033, "step": 13500 }, { "epoch": 0.48671928496774425, "grad_norm": 0.1460382640361786, "learning_rate": 4.94081929244178e-05, "loss": 0.4134, "step": 13505 }, { "epoch": 0.48689948462896887, "grad_norm": 0.20045733451843262, "learning_rate": 4.94075615759568e-05, "loss": 0.4641, "step": 13510 }, { "epoch": 0.48707968429019355, "grad_norm": 0.1686198115348816, "learning_rate": 4.94069298949475e-05, "loss": 0.4628, "step": 13515 }, { "epoch": 0.48725988395141817, "grad_norm": 0.16252323985099792, "learning_rate": 4.9406297881398504e-05, "loss": 0.4763, "step": 13520 }, { "epoch": 0.4874400836126428, "grad_norm": 0.2166275978088379, "learning_rate": 4.940566553531843e-05, "loss": 0.4497, "step": 13525 }, { "epoch": 0.48762028327386747, "grad_norm": 0.22283247113227844, "learning_rate": 4.940503285671588e-05, "loss": 0.465, "step": 13530 }, { "epoch": 0.4878004829350921, "grad_norm": 0.11601988226175308, "learning_rate": 4.940439984559949e-05, "loss": 0.3932, "step": 13535 }, { "epoch": 0.4879806825963167, "grad_norm": 0.17983366549015045, "learning_rate": 4.940376650197787e-05, "loss": 0.4426, "step": 13540 }, { "epoch": 0.48816088225754134, "grad_norm": 0.2088562697172165, "learning_rate": 4.940313282585967e-05, "loss": 0.4687, "step": 13545 }, { "epoch": 0.488341081918766, "grad_norm": 0.17913992702960968, "learning_rate": 4.940249881725349e-05, "loss": 0.4504, "step": 13550 }, { "epoch": 0.48852128157999064, "grad_norm": 0.18297719955444336, "learning_rate": 4.9401864476168e-05, "loss": 0.4756, "step": 13555 }, { "epoch": 0.48870148124121526, "grad_norm": 0.18946805596351624, "learning_rate": 4.9401229802611826e-05, "loss": 0.4461, "step": 13560 }, { "epoch": 0.4888816809024399, "grad_norm": 0.19642286002635956, "learning_rate": 4.9400594796593626e-05, "loss": 0.4813, "step": 13565 }, { "epoch": 0.48906188056366456, "grad_norm": 0.21718788146972656, "learning_rate": 4.940008655241221e-05, "loss": 0.4587, 
"step": 13570 }, { "epoch": 0.4892420802248892, "grad_norm": 0.2410500943660736, "learning_rate": 4.939945094798416e-05, "loss": 0.4804, "step": 13575 }, { "epoch": 0.4894222798861138, "grad_norm": 0.1772763729095459, "learning_rate": 4.93988150111183e-05, "loss": 0.4455, "step": 13580 }, { "epoch": 0.48960247954733843, "grad_norm": 0.16895850002765656, "learning_rate": 4.939817874182333e-05, "loss": 0.4365, "step": 13585 }, { "epoch": 0.4897826792085631, "grad_norm": 0.2072199136018753, "learning_rate": 4.939754214010788e-05, "loss": 0.4403, "step": 13590 }, { "epoch": 0.48996287886978773, "grad_norm": 0.16987258195877075, "learning_rate": 4.939690520598065e-05, "loss": 0.449, "step": 13595 }, { "epoch": 0.49014307853101236, "grad_norm": 0.16706013679504395, "learning_rate": 4.9396267939450316e-05, "loss": 0.4429, "step": 13600 }, { "epoch": 0.490323278192237, "grad_norm": 0.15251483023166656, "learning_rate": 4.939563034052555e-05, "loss": 0.4386, "step": 13605 }, { "epoch": 0.49050347785346166, "grad_norm": 0.17916177213191986, "learning_rate": 4.9394992409215036e-05, "loss": 0.4344, "step": 13610 }, { "epoch": 0.4906836775146863, "grad_norm": 0.13827458024024963, "learning_rate": 4.939435414552748e-05, "loss": 0.4667, "step": 13615 }, { "epoch": 0.4908638771759109, "grad_norm": 0.1721857637166977, "learning_rate": 4.939371554947156e-05, "loss": 0.4614, "step": 13620 }, { "epoch": 0.4910440768371355, "grad_norm": 0.19055204093456268, "learning_rate": 4.9393076621056e-05, "loss": 0.4408, "step": 13625 }, { "epoch": 0.4912242764983602, "grad_norm": 0.1874268501996994, "learning_rate": 4.93924373602895e-05, "loss": 0.4647, "step": 13630 }, { "epoch": 0.4914044761595848, "grad_norm": 0.1848086565732956, "learning_rate": 4.9391797767180755e-05, "loss": 0.4801, "step": 13635 }, { "epoch": 0.49158467582080945, "grad_norm": 0.1567952185869217, "learning_rate": 4.939115784173849e-05, "loss": 0.4194, "step": 13640 }, { "epoch": 0.49176487548203407, "grad_norm": 
0.16139036417007446, "learning_rate": 4.9390517583971416e-05, "loss": 0.473, "step": 13645 }, { "epoch": 0.49194507514325875, "grad_norm": 0.14114044606685638, "learning_rate": 4.938987699388827e-05, "loss": 0.4326, "step": 13650 }, { "epoch": 0.49212527480448337, "grad_norm": 0.20750269293785095, "learning_rate": 4.938923607149777e-05, "loss": 0.4792, "step": 13655 }, { "epoch": 0.492305474465708, "grad_norm": 0.1520867943763733, "learning_rate": 4.938859481680865e-05, "loss": 0.453, "step": 13660 }, { "epoch": 0.4924856741269326, "grad_norm": 0.17390841245651245, "learning_rate": 4.9387953229829644e-05, "loss": 0.4304, "step": 13665 }, { "epoch": 0.4926658737881573, "grad_norm": 0.14850212633609772, "learning_rate": 4.938731131056949e-05, "loss": 0.4328, "step": 13670 }, { "epoch": 0.4928460734493819, "grad_norm": 0.15505053102970123, "learning_rate": 4.938666905903696e-05, "loss": 0.4195, "step": 13675 }, { "epoch": 0.49302627311060654, "grad_norm": 0.1582787036895752, "learning_rate": 4.938602647524077e-05, "loss": 0.4119, "step": 13680 }, { "epoch": 0.49320647277183116, "grad_norm": 0.2123645544052124, "learning_rate": 4.93853835591897e-05, "loss": 0.4687, "step": 13685 }, { "epoch": 0.49338667243305584, "grad_norm": 0.19299419224262238, "learning_rate": 4.93847403108925e-05, "loss": 0.4948, "step": 13690 }, { "epoch": 0.49356687209428046, "grad_norm": 0.19016866385936737, "learning_rate": 4.938409673035793e-05, "loss": 0.4477, "step": 13695 }, { "epoch": 0.4937470717555051, "grad_norm": 0.1360589861869812, "learning_rate": 4.938345281759476e-05, "loss": 0.4734, "step": 13700 }, { "epoch": 0.49392727141672976, "grad_norm": 0.15430140495300293, "learning_rate": 4.9382808572611775e-05, "loss": 0.4432, "step": 13705 }, { "epoch": 0.4941074710779544, "grad_norm": 0.17401854693889618, "learning_rate": 4.938216399541773e-05, "loss": 0.4308, "step": 13710 }, { "epoch": 0.494287670739179, "grad_norm": 0.17819242179393768, "learning_rate": 4.9381519086021434e-05, 
"loss": 0.449, "step": 13715 }, { "epoch": 0.49446787040040363, "grad_norm": 0.1599666327238083, "learning_rate": 4.9380873844431654e-05, "loss": 0.4336, "step": 13720 }, { "epoch": 0.4946480700616283, "grad_norm": 0.13881821930408478, "learning_rate": 4.938022827065719e-05, "loss": 0.4488, "step": 13725 }, { "epoch": 0.49482826972285293, "grad_norm": 0.1781865805387497, "learning_rate": 4.937958236470684e-05, "loss": 0.4461, "step": 13730 }, { "epoch": 0.49500846938407755, "grad_norm": 0.1789911985397339, "learning_rate": 4.93789361265894e-05, "loss": 0.4537, "step": 13735 }, { "epoch": 0.4951886690453022, "grad_norm": 0.1587119698524475, "learning_rate": 4.9378289556313673e-05, "loss": 0.4544, "step": 13740 }, { "epoch": 0.49536886870652685, "grad_norm": 0.1573282778263092, "learning_rate": 4.9377642653888464e-05, "loss": 0.396, "step": 13745 }, { "epoch": 0.4955490683677515, "grad_norm": 0.13113892078399658, "learning_rate": 4.93769954193226e-05, "loss": 0.4652, "step": 13750 }, { "epoch": 0.4957292680289761, "grad_norm": 0.1477142572402954, "learning_rate": 4.9376347852624895e-05, "loss": 0.4209, "step": 13755 }, { "epoch": 0.4959094676902007, "grad_norm": 0.1767813116312027, "learning_rate": 4.937569995380417e-05, "loss": 0.4656, "step": 13760 }, { "epoch": 0.4960896673514254, "grad_norm": 0.14544841647148132, "learning_rate": 4.937505172286925e-05, "loss": 0.453, "step": 13765 }, { "epoch": 0.49626986701265, "grad_norm": 0.16864298284053802, "learning_rate": 4.9374403159828965e-05, "loss": 0.4242, "step": 13770 }, { "epoch": 0.49645006667387465, "grad_norm": 0.16903114318847656, "learning_rate": 4.9373754264692164e-05, "loss": 0.4519, "step": 13775 }, { "epoch": 0.49663026633509927, "grad_norm": 0.21901170909404755, "learning_rate": 4.937310503746767e-05, "loss": 0.4414, "step": 13780 }, { "epoch": 0.49681046599632395, "grad_norm": 0.18153007328510284, "learning_rate": 4.937245547816435e-05, "loss": 0.4262, "step": 13785 }, { "epoch": 0.49699066565754857, 
"grad_norm": 0.16006413102149963, "learning_rate": 4.937180558679104e-05, "loss": 0.4462, "step": 13790 }, { "epoch": 0.4971708653187732, "grad_norm": 0.17855258285999298, "learning_rate": 4.937115536335659e-05, "loss": 0.4327, "step": 13795 }, { "epoch": 0.4973510649799978, "grad_norm": 0.1678069531917572, "learning_rate": 4.937050480786987e-05, "loss": 0.4655, "step": 13800 }, { "epoch": 0.4975312646412225, "grad_norm": 0.13157707452774048, "learning_rate": 4.936985392033975e-05, "loss": 0.4951, "step": 13805 }, { "epoch": 0.4977114643024471, "grad_norm": 0.20059573650360107, "learning_rate": 4.936920270077508e-05, "loss": 0.456, "step": 13810 }, { "epoch": 0.49789166396367174, "grad_norm": 0.13859055936336517, "learning_rate": 4.936855114918474e-05, "loss": 0.4359, "step": 13815 }, { "epoch": 0.49807186362489636, "grad_norm": 0.165902242064476, "learning_rate": 4.936789926557761e-05, "loss": 0.4371, "step": 13820 }, { "epoch": 0.49825206328612104, "grad_norm": 0.1760404258966446, "learning_rate": 4.936724704996257e-05, "loss": 0.4465, "step": 13825 }, { "epoch": 0.49843226294734566, "grad_norm": 0.16966862976551056, "learning_rate": 4.936659450234851e-05, "loss": 0.4454, "step": 13830 }, { "epoch": 0.4986124626085703, "grad_norm": 0.16548436880111694, "learning_rate": 4.936594162274431e-05, "loss": 0.4679, "step": 13835 }, { "epoch": 0.4987926622697949, "grad_norm": 0.19853737950325012, "learning_rate": 4.936528841115887e-05, "loss": 0.4545, "step": 13840 }, { "epoch": 0.4989728619310196, "grad_norm": 0.19320517778396606, "learning_rate": 4.936463486760111e-05, "loss": 0.4758, "step": 13845 }, { "epoch": 0.4991530615922442, "grad_norm": 0.2133074700832367, "learning_rate": 4.936398099207991e-05, "loss": 0.4675, "step": 13850 }, { "epoch": 0.49933326125346883, "grad_norm": 0.16722871363162994, "learning_rate": 4.936332678460417e-05, "loss": 0.4416, "step": 13855 }, { "epoch": 0.4995134609146935, "grad_norm": 0.1773030161857605, "learning_rate": 
4.936267224518284e-05, "loss": 0.475, "step": 13860 }, { "epoch": 0.49969366057591813, "grad_norm": 0.13764159381389618, "learning_rate": 4.936201737382481e-05, "loss": 0.4597, "step": 13865 }, { "epoch": 0.49987386023714275, "grad_norm": 0.19111090898513794, "learning_rate": 4.9361362170539006e-05, "loss": 0.4601, "step": 13870 }, { "epoch": 0.5000540598983674, "grad_norm": 0.13824209570884705, "learning_rate": 4.936070663533436e-05, "loss": 0.4103, "step": 13875 }, { "epoch": 0.500234259559592, "grad_norm": 0.17304690182209015, "learning_rate": 4.936005076821981e-05, "loss": 0.4446, "step": 13880 }, { "epoch": 0.5004144592208166, "grad_norm": 0.1318497657775879, "learning_rate": 4.9359394569204274e-05, "loss": 0.4341, "step": 13885 }, { "epoch": 0.5005946588820414, "grad_norm": 0.1591554433107376, "learning_rate": 4.9358738038296714e-05, "loss": 0.4414, "step": 13890 }, { "epoch": 0.500774858543266, "grad_norm": 0.18022438883781433, "learning_rate": 4.935808117550605e-05, "loss": 0.4741, "step": 13895 }, { "epoch": 0.5009550582044906, "grad_norm": 0.16793961822986603, "learning_rate": 4.935742398084127e-05, "loss": 0.4567, "step": 13900 }, { "epoch": 0.5011352578657152, "grad_norm": 0.13464850187301636, "learning_rate": 4.935676645431128e-05, "loss": 0.447, "step": 13905 }, { "epoch": 0.5013154575269398, "grad_norm": 0.1593722254037857, "learning_rate": 4.935610859592508e-05, "loss": 0.425, "step": 13910 }, { "epoch": 0.5014956571881645, "grad_norm": 0.1893470287322998, "learning_rate": 4.93554504056916e-05, "loss": 0.4611, "step": 13915 }, { "epoch": 0.5016758568493891, "grad_norm": 0.17857231199741364, "learning_rate": 4.935479188361983e-05, "loss": 0.4438, "step": 13920 }, { "epoch": 0.5018560565106137, "grad_norm": 0.15780913829803467, "learning_rate": 4.935413302971874e-05, "loss": 0.4474, "step": 13925 }, { "epoch": 0.5020362561718384, "grad_norm": 0.17613886296749115, "learning_rate": 4.93534738439973e-05, "loss": 0.4686, "step": 13930 }, { "epoch": 
0.5022164558330631, "grad_norm": 0.16873963177204132, "learning_rate": 4.9352814326464493e-05, "loss": 0.4534, "step": 13935 }, { "epoch": 0.5023966554942877, "grad_norm": 0.1617756485939026, "learning_rate": 4.93521544771293e-05, "loss": 0.4431, "step": 13940 }, { "epoch": 0.5025768551555123, "grad_norm": 0.1517868936061859, "learning_rate": 4.9351494296000726e-05, "loss": 0.4691, "step": 13945 }, { "epoch": 0.5027570548167369, "grad_norm": 0.18268106877803802, "learning_rate": 4.935083378308776e-05, "loss": 0.4257, "step": 13950 }, { "epoch": 0.5029372544779616, "grad_norm": 0.16596758365631104, "learning_rate": 4.935017293839939e-05, "loss": 0.408, "step": 13955 }, { "epoch": 0.5031174541391862, "grad_norm": 0.2062714397907257, "learning_rate": 4.934951176194462e-05, "loss": 0.4706, "step": 13960 }, { "epoch": 0.5032976538004108, "grad_norm": 0.1877586394548416, "learning_rate": 4.934885025373248e-05, "loss": 0.4726, "step": 13965 }, { "epoch": 0.5034778534616355, "grad_norm": 0.17197106778621674, "learning_rate": 4.9348188413771966e-05, "loss": 0.4465, "step": 13970 }, { "epoch": 0.5036580531228602, "grad_norm": 0.16831296682357788, "learning_rate": 4.93475262420721e-05, "loss": 0.489, "step": 13975 }, { "epoch": 0.5038382527840848, "grad_norm": 0.18322400748729706, "learning_rate": 4.93468637386419e-05, "loss": 0.4343, "step": 13980 }, { "epoch": 0.5040184524453094, "grad_norm": 0.21650521457195282, "learning_rate": 4.93462009034904e-05, "loss": 0.4735, "step": 13985 }, { "epoch": 0.504198652106534, "grad_norm": 0.17821024358272552, "learning_rate": 4.9345537736626626e-05, "loss": 0.4375, "step": 13990 }, { "epoch": 0.5043788517677587, "grad_norm": 0.15985864400863647, "learning_rate": 4.934487423805961e-05, "loss": 0.4419, "step": 13995 }, { "epoch": 0.5045590514289833, "grad_norm": 0.1555582582950592, "learning_rate": 4.93442104077984e-05, "loss": 0.4606, "step": 14000 }, { "epoch": 0.5045590514289833, "eval_loss": 0.4720987379550934, "eval_runtime": 3.5871, 
"eval_samples_per_second": 27.878, "eval_steps_per_second": 6.969, "step": 14000 }, { "epoch": 0.5047392510902079, "grad_norm": 0.1948181539773941, "learning_rate": 4.934354624585202e-05, "loss": 0.4392, "step": 14005 }, { "epoch": 0.5049194507514326, "grad_norm": 0.1451338827610016, "learning_rate": 4.934288175222955e-05, "loss": 0.4184, "step": 14010 }, { "epoch": 0.5050996504126573, "grad_norm": 0.1611844301223755, "learning_rate": 4.934221692694003e-05, "loss": 0.4428, "step": 14015 }, { "epoch": 0.5052798500738819, "grad_norm": 0.19232018291950226, "learning_rate": 4.934155176999252e-05, "loss": 0.4512, "step": 14020 }, { "epoch": 0.5054600497351065, "grad_norm": 0.24862439930438995, "learning_rate": 4.934088628139607e-05, "loss": 0.4715, "step": 14025 }, { "epoch": 0.5056402493963311, "grad_norm": 0.15161451697349548, "learning_rate": 4.9340220461159757e-05, "loss": 0.4179, "step": 14030 }, { "epoch": 0.5058204490575557, "grad_norm": 0.19004757702350616, "learning_rate": 4.933955430929266e-05, "loss": 0.4474, "step": 14035 }, { "epoch": 0.5060006487187804, "grad_norm": 0.23055416345596313, "learning_rate": 4.933888782580385e-05, "loss": 0.4602, "step": 14040 }, { "epoch": 0.5061808483800051, "grad_norm": 0.18940792977809906, "learning_rate": 4.93382210107024e-05, "loss": 0.4346, "step": 14045 }, { "epoch": 0.5063610480412297, "grad_norm": 0.1852693110704422, "learning_rate": 4.9337553863997396e-05, "loss": 0.4499, "step": 14050 }, { "epoch": 0.5065412477024543, "grad_norm": 0.1449885368347168, "learning_rate": 4.933688638569794e-05, "loss": 0.4657, "step": 14055 }, { "epoch": 0.506721447363679, "grad_norm": 0.1560732126235962, "learning_rate": 4.933621857581312e-05, "loss": 0.4184, "step": 14060 }, { "epoch": 0.5069016470249036, "grad_norm": 0.1511814445257187, "learning_rate": 4.933555043435203e-05, "loss": 0.4409, "step": 14065 }, { "epoch": 0.5070818466861282, "grad_norm": 0.1766042411327362, "learning_rate": 4.9334881961323776e-05, "loss": 0.4559, "step": 
14070 }, { "epoch": 0.5072620463473528, "grad_norm": 0.14052477478981018, "learning_rate": 4.9334213156737465e-05, "loss": 0.4476, "step": 14075 }, { "epoch": 0.5074422460085775, "grad_norm": 0.1610243022441864, "learning_rate": 4.933354402060221e-05, "loss": 0.434, "step": 14080 }, { "epoch": 0.5076224456698022, "grad_norm": 0.1901833862066269, "learning_rate": 4.9332874552927135e-05, "loss": 0.4489, "step": 14085 }, { "epoch": 0.5078026453310268, "grad_norm": 0.16567949950695038, "learning_rate": 4.933220475372136e-05, "loss": 0.4473, "step": 14090 }, { "epoch": 0.5079828449922514, "grad_norm": 0.2085447609424591, "learning_rate": 4.933153462299399e-05, "loss": 0.4374, "step": 14095 }, { "epoch": 0.5081630446534761, "grad_norm": 0.1837865263223648, "learning_rate": 4.933086416075418e-05, "loss": 0.4615, "step": 14100 }, { "epoch": 0.5083432443147007, "grad_norm": 0.1936039924621582, "learning_rate": 4.933019336701106e-05, "loss": 0.4533, "step": 14105 }, { "epoch": 0.5085234439759253, "grad_norm": 0.38860994577407837, "learning_rate": 4.932952224177376e-05, "loss": 0.4412, "step": 14110 }, { "epoch": 0.5087036436371499, "grad_norm": 0.18047145009040833, "learning_rate": 4.932885078505143e-05, "loss": 0.4725, "step": 14115 }, { "epoch": 0.5088838432983745, "grad_norm": 0.16559572517871857, "learning_rate": 4.932817899685323e-05, "loss": 0.452, "step": 14120 }, { "epoch": 0.5090640429595993, "grad_norm": 0.1487295925617218, "learning_rate": 4.9327506877188284e-05, "loss": 0.4177, "step": 14125 }, { "epoch": 0.5092442426208239, "grad_norm": 0.14861665666103363, "learning_rate": 4.9326834426065775e-05, "loss": 0.4183, "step": 14130 }, { "epoch": 0.5094244422820485, "grad_norm": 0.13988757133483887, "learning_rate": 4.9326161643494856e-05, "loss": 0.4216, "step": 14135 }, { "epoch": 0.5096046419432732, "grad_norm": 0.1480373740196228, "learning_rate": 4.93254885294847e-05, "loss": 0.4165, "step": 14140 }, { "epoch": 0.5097848416044978, "grad_norm": 0.1930646002292633, 
"learning_rate": 4.932481508404446e-05, "loss": 0.426, "step": 14145 }, { "epoch": 0.5099650412657224, "grad_norm": 0.18709403276443481, "learning_rate": 4.932414130718334e-05, "loss": 0.4608, "step": 14150 }, { "epoch": 0.510145240926947, "grad_norm": 0.17628611624240875, "learning_rate": 4.932346719891049e-05, "loss": 0.4455, "step": 14155 }, { "epoch": 0.5103254405881716, "grad_norm": 0.1651298552751541, "learning_rate": 4.9322792759235115e-05, "loss": 0.4711, "step": 14160 }, { "epoch": 0.5105056402493964, "grad_norm": 0.16614636778831482, "learning_rate": 4.93221179881664e-05, "loss": 0.4232, "step": 14165 }, { "epoch": 0.510685839910621, "grad_norm": 0.16470062732696533, "learning_rate": 4.932144288571353e-05, "loss": 0.4558, "step": 14170 }, { "epoch": 0.5108660395718456, "grad_norm": 0.20364750921726227, "learning_rate": 4.932076745188571e-05, "loss": 0.4421, "step": 14175 }, { "epoch": 0.5110462392330702, "grad_norm": 0.19398504495620728, "learning_rate": 4.9320091686692136e-05, "loss": 0.4581, "step": 14180 }, { "epoch": 0.5112264388942949, "grad_norm": 0.1814093142747879, "learning_rate": 4.931941559014204e-05, "loss": 0.4678, "step": 14185 }, { "epoch": 0.5114066385555195, "grad_norm": 0.15303486585617065, "learning_rate": 4.93187391622446e-05, "loss": 0.436, "step": 14190 }, { "epoch": 0.5115868382167441, "grad_norm": 0.1580374389886856, "learning_rate": 4.931806240300905e-05, "loss": 0.4715, "step": 14195 }, { "epoch": 0.5117670378779688, "grad_norm": 0.18431012332439423, "learning_rate": 4.931738531244461e-05, "loss": 0.4241, "step": 14200 }, { "epoch": 0.5119472375391935, "grad_norm": 0.20473140478134155, "learning_rate": 4.93167078905605e-05, "loss": 0.4226, "step": 14205 }, { "epoch": 0.5121274372004181, "grad_norm": 0.1783171445131302, "learning_rate": 4.931603013736595e-05, "loss": 0.4515, "step": 14210 }, { "epoch": 0.5123076368616427, "grad_norm": 0.1747739315032959, "learning_rate": 4.931535205287021e-05, "loss": 0.4537, "step": 14215 }, { 
"epoch": 0.5124878365228673, "grad_norm": 0.17563074827194214, "learning_rate": 4.93146736370825e-05, "loss": 0.459, "step": 14220 }, { "epoch": 0.512668036184092, "grad_norm": 0.16567584872245789, "learning_rate": 4.931399489001206e-05, "loss": 0.4407, "step": 14225 }, { "epoch": 0.5128482358453166, "grad_norm": 0.20225879549980164, "learning_rate": 4.931331581166816e-05, "loss": 0.5078, "step": 14230 }, { "epoch": 0.5130284355065412, "grad_norm": 0.17894190549850464, "learning_rate": 4.931263640206003e-05, "loss": 0.429, "step": 14235 }, { "epoch": 0.5132086351677659, "grad_norm": 0.2424396276473999, "learning_rate": 4.9311956661196945e-05, "loss": 0.4652, "step": 14240 }, { "epoch": 0.5133888348289906, "grad_norm": 0.17801520228385925, "learning_rate": 4.931127658908815e-05, "loss": 0.4764, "step": 14245 }, { "epoch": 0.5135690344902152, "grad_norm": 0.16889409720897675, "learning_rate": 4.931059618574292e-05, "loss": 0.4521, "step": 14250 }, { "epoch": 0.5137492341514398, "grad_norm": 0.17756950855255127, "learning_rate": 4.930991545117052e-05, "loss": 0.4439, "step": 14255 }, { "epoch": 0.5139294338126644, "grad_norm": 0.13921865820884705, "learning_rate": 4.930923438538024e-05, "loss": 0.4176, "step": 14260 }, { "epoch": 0.514109633473889, "grad_norm": 0.21671532094478607, "learning_rate": 4.930855298838134e-05, "loss": 0.4944, "step": 14265 }, { "epoch": 0.5142898331351137, "grad_norm": 0.1547165960073471, "learning_rate": 4.930787126018311e-05, "loss": 0.4394, "step": 14270 }, { "epoch": 0.5144700327963383, "grad_norm": 0.16631481051445007, "learning_rate": 4.930718920079484e-05, "loss": 0.4397, "step": 14275 }, { "epoch": 0.514650232457563, "grad_norm": 0.14847290515899658, "learning_rate": 4.9306506810225824e-05, "loss": 0.4688, "step": 14280 }, { "epoch": 0.5148304321187877, "grad_norm": 0.1879221498966217, "learning_rate": 4.930582408848536e-05, "loss": 0.4581, "step": 14285 }, { "epoch": 0.5150106317800123, "grad_norm": 0.15472771227359772, 
"learning_rate": 4.930514103558275e-05, "loss": 0.4211, "step": 14290 }, { "epoch": 0.5151908314412369, "grad_norm": 0.22856763005256653, "learning_rate": 4.9304457651527305e-05, "loss": 0.4399, "step": 14295 }, { "epoch": 0.5153710311024615, "grad_norm": 0.13668227195739746, "learning_rate": 4.930377393632832e-05, "loss": 0.4319, "step": 14300 }, { "epoch": 0.5155512307636861, "grad_norm": 0.1626194417476654, "learning_rate": 4.9303089889995125e-05, "loss": 0.4161, "step": 14305 }, { "epoch": 0.5157314304249108, "grad_norm": 0.17859403789043427, "learning_rate": 4.930240551253703e-05, "loss": 0.4305, "step": 14310 }, { "epoch": 0.5159116300861354, "grad_norm": 0.16735269129276276, "learning_rate": 4.930172080396337e-05, "loss": 0.4726, "step": 14315 }, { "epoch": 0.5160918297473601, "grad_norm": 0.1954079419374466, "learning_rate": 4.930103576428346e-05, "loss": 0.4316, "step": 14320 }, { "epoch": 0.5162720294085847, "grad_norm": 0.17768363654613495, "learning_rate": 4.9300350393506655e-05, "loss": 0.4013, "step": 14325 }, { "epoch": 0.5164522290698094, "grad_norm": 0.1929953694343567, "learning_rate": 4.929966469164228e-05, "loss": 0.4537, "step": 14330 }, { "epoch": 0.516632428731034, "grad_norm": 0.17287170886993408, "learning_rate": 4.9298978658699674e-05, "loss": 0.4316, "step": 14335 }, { "epoch": 0.5168126283922586, "grad_norm": 0.18152733147144318, "learning_rate": 4.9298292294688183e-05, "loss": 0.4502, "step": 14340 }, { "epoch": 0.5169928280534832, "grad_norm": 0.19768409430980682, "learning_rate": 4.929760559961717e-05, "loss": 0.4375, "step": 14345 }, { "epoch": 0.5171730277147079, "grad_norm": 0.18449656665325165, "learning_rate": 4.9296918573495984e-05, "loss": 0.4628, "step": 14350 }, { "epoch": 0.5173532273759325, "grad_norm": 0.13669456541538239, "learning_rate": 4.9296231216333986e-05, "loss": 0.4446, "step": 14355 }, { "epoch": 0.5175334270371572, "grad_norm": 0.17907440662384033, "learning_rate": 4.929554352814055e-05, "loss": 0.4351, "step": 
14360 }, { "epoch": 0.5177136266983818, "grad_norm": 0.2036353498697281, "learning_rate": 4.9294855508925026e-05, "loss": 0.4293, "step": 14365 }, { "epoch": 0.5178938263596065, "grad_norm": 0.14478379487991333, "learning_rate": 4.92941671586968e-05, "loss": 0.4088, "step": 14370 }, { "epoch": 0.5180740260208311, "grad_norm": 0.19350174069404602, "learning_rate": 4.9293478477465254e-05, "loss": 0.4735, "step": 14375 }, { "epoch": 0.5182542256820557, "grad_norm": 0.15223993360996246, "learning_rate": 4.9292789465239765e-05, "loss": 0.42, "step": 14380 }, { "epoch": 0.5184344253432803, "grad_norm": 0.17773163318634033, "learning_rate": 4.929210012202973e-05, "loss": 0.4626, "step": 14385 }, { "epoch": 0.518614625004505, "grad_norm": 0.19823719561100006, "learning_rate": 4.929141044784452e-05, "loss": 0.4291, "step": 14390 }, { "epoch": 0.5187948246657297, "grad_norm": 0.163058340549469, "learning_rate": 4.929072044269356e-05, "loss": 0.4824, "step": 14395 }, { "epoch": 0.5189750243269543, "grad_norm": 0.17107297480106354, "learning_rate": 4.929003010658623e-05, "loss": 0.4953, "step": 14400 }, { "epoch": 0.5191552239881789, "grad_norm": 0.17323638498783112, "learning_rate": 4.928933943953193e-05, "loss": 0.4444, "step": 14405 }, { "epoch": 0.5193354236494035, "grad_norm": 0.19337867200374603, "learning_rate": 4.92886484415401e-05, "loss": 0.4295, "step": 14410 }, { "epoch": 0.5195156233106282, "grad_norm": 0.250715434551239, "learning_rate": 4.9287957112620134e-05, "loss": 0.5048, "step": 14415 }, { "epoch": 0.5196958229718528, "grad_norm": 0.19110535085201263, "learning_rate": 4.928726545278145e-05, "loss": 0.4508, "step": 14420 }, { "epoch": 0.5198760226330774, "grad_norm": 0.2401556819677353, "learning_rate": 4.9286573462033484e-05, "loss": 0.52, "step": 14425 }, { "epoch": 0.520056222294302, "grad_norm": 0.145725280046463, "learning_rate": 4.9285881140385645e-05, "loss": 0.4753, "step": 14430 }, { "epoch": 0.5202364219555268, "grad_norm": 0.17486672103405, 
"learning_rate": 4.928518848784739e-05, "loss": 0.4178, "step": 14435 }, { "epoch": 0.5204166216167514, "grad_norm": 0.1457482874393463, "learning_rate": 4.928449550442814e-05, "loss": 0.4379, "step": 14440 }, { "epoch": 0.520596821277976, "grad_norm": 0.21257434785366058, "learning_rate": 4.928380219013734e-05, "loss": 0.465, "step": 14445 }, { "epoch": 0.5207770209392006, "grad_norm": 0.17879636585712433, "learning_rate": 4.928310854498444e-05, "loss": 0.4134, "step": 14450 }, { "epoch": 0.5209572206004253, "grad_norm": 0.20684200525283813, "learning_rate": 4.928241456897887e-05, "loss": 0.4, "step": 14455 }, { "epoch": 0.5211374202616499, "grad_norm": 0.15960803627967834, "learning_rate": 4.928172026213012e-05, "loss": 0.4198, "step": 14460 }, { "epoch": 0.5213176199228745, "grad_norm": 0.1935255080461502, "learning_rate": 4.928102562444763e-05, "loss": 0.4503, "step": 14465 }, { "epoch": 0.5214978195840991, "grad_norm": 0.18149693310260773, "learning_rate": 4.928033065594086e-05, "loss": 0.4337, "step": 14470 }, { "epoch": 0.5216780192453239, "grad_norm": 0.21554191410541534, "learning_rate": 4.927963535661929e-05, "loss": 0.3948, "step": 14475 }, { "epoch": 0.5218582189065485, "grad_norm": 0.14424189925193787, "learning_rate": 4.927893972649239e-05, "loss": 0.4235, "step": 14480 }, { "epoch": 0.5220384185677731, "grad_norm": 0.1991516500711441, "learning_rate": 4.927824376556964e-05, "loss": 0.4594, "step": 14485 }, { "epoch": 0.5222186182289977, "grad_norm": 0.12362515181303024, "learning_rate": 4.927754747386051e-05, "loss": 0.4268, "step": 14490 }, { "epoch": 0.5223988178902224, "grad_norm": 0.219016894698143, "learning_rate": 4.92768508513745e-05, "loss": 0.4574, "step": 14495 }, { "epoch": 0.522579017551447, "grad_norm": 0.1444326788187027, "learning_rate": 4.92761538981211e-05, "loss": 0.4237, "step": 14500 }, { "epoch": 0.522579017551447, "eval_loss": 0.47057196497917175, "eval_runtime": 3.5636, "eval_samples_per_second": 28.062, 
"eval_steps_per_second": 7.015, "step": 14500 }, { "epoch": 0.5227592172126716, "grad_norm": 0.23088853061199188, "learning_rate": 4.92754566141098e-05, "loss": 0.4652, "step": 14505 }, { "epoch": 0.5229394168738962, "grad_norm": 0.20848578214645386, "learning_rate": 4.9274758999350115e-05, "loss": 0.4217, "step": 14510 }, { "epoch": 0.523119616535121, "grad_norm": 0.1422787457704544, "learning_rate": 4.9274061053851525e-05, "loss": 0.438, "step": 14515 }, { "epoch": 0.5232998161963456, "grad_norm": 0.21294035017490387, "learning_rate": 4.9273362777623555e-05, "loss": 0.4632, "step": 14520 }, { "epoch": 0.5234800158575702, "grad_norm": 0.1711554378271103, "learning_rate": 4.927266417067572e-05, "loss": 0.469, "step": 14525 }, { "epoch": 0.5236602155187948, "grad_norm": 0.17742256820201874, "learning_rate": 4.9271965233017527e-05, "loss": 0.4506, "step": 14530 }, { "epoch": 0.5238404151800194, "grad_norm": 0.17392541468143463, "learning_rate": 4.9271265964658517e-05, "loss": 0.4368, "step": 14535 }, { "epoch": 0.5240206148412441, "grad_norm": 0.15873579680919647, "learning_rate": 4.92705663656082e-05, "loss": 0.4176, "step": 14540 }, { "epoch": 0.5242008145024687, "grad_norm": 0.19420740008354187, "learning_rate": 4.926986643587612e-05, "loss": 0.4758, "step": 14545 }, { "epoch": 0.5243810141636934, "grad_norm": 0.16508692502975464, "learning_rate": 4.9269166175471806e-05, "loss": 0.4577, "step": 14550 }, { "epoch": 0.524561213824918, "grad_norm": 0.1393464207649231, "learning_rate": 4.92684655844048e-05, "loss": 0.4444, "step": 14555 }, { "epoch": 0.5247414134861427, "grad_norm": 0.16871263086795807, "learning_rate": 4.9267764662684654e-05, "loss": 0.4719, "step": 14560 }, { "epoch": 0.5249216131473673, "grad_norm": 0.16914774477481842, "learning_rate": 4.9267063410320907e-05, "loss": 0.4371, "step": 14565 }, { "epoch": 0.5251018128085919, "grad_norm": 0.15868036448955536, "learning_rate": 4.926636182732313e-05, "loss": 0.4464, "step": 14570 }, { "epoch": 
0.5252820124698165, "grad_norm": 0.20540837943553925, "learning_rate": 4.926565991370086e-05, "loss": 0.4362, "step": 14575 }, { "epoch": 0.5254622121310412, "grad_norm": 0.20324106514453888, "learning_rate": 4.926495766946368e-05, "loss": 0.4428, "step": 14580 }, { "epoch": 0.5256424117922658, "grad_norm": 0.190400630235672, "learning_rate": 4.9264255094621135e-05, "loss": 0.4453, "step": 14585 }, { "epoch": 0.5258226114534905, "grad_norm": 0.19358183443546295, "learning_rate": 4.9263552189182826e-05, "loss": 0.454, "step": 14590 }, { "epoch": 0.5260028111147151, "grad_norm": 0.18700343370437622, "learning_rate": 4.926284895315831e-05, "loss": 0.486, "step": 14595 }, { "epoch": 0.5261830107759398, "grad_norm": 0.1986667960882187, "learning_rate": 4.926214538655718e-05, "loss": 0.4768, "step": 14600 }, { "epoch": 0.5263632104371644, "grad_norm": 0.16327090561389923, "learning_rate": 4.926144148938901e-05, "loss": 0.4532, "step": 14605 }, { "epoch": 0.526543410098389, "grad_norm": 0.18385253846645355, "learning_rate": 4.92607372616634e-05, "loss": 0.4345, "step": 14610 }, { "epoch": 0.5267236097596136, "grad_norm": 0.16108223795890808, "learning_rate": 4.9260032703389936e-05, "loss": 0.4524, "step": 14615 }, { "epoch": 0.5269038094208383, "grad_norm": 0.22687973082065582, "learning_rate": 4.9259327814578234e-05, "loss": 0.4706, "step": 14620 }, { "epoch": 0.5270840090820629, "grad_norm": 0.20703905820846558, "learning_rate": 4.925862259523788e-05, "loss": 0.4411, "step": 14625 }, { "epoch": 0.5272642087432876, "grad_norm": 0.17245493829250336, "learning_rate": 4.925791704537849e-05, "loss": 0.4483, "step": 14630 }, { "epoch": 0.5274444084045122, "grad_norm": 0.21393609046936035, "learning_rate": 4.925721116500968e-05, "loss": 0.4409, "step": 14635 }, { "epoch": 0.5276246080657369, "grad_norm": 0.16517458856105804, "learning_rate": 4.9256504954141066e-05, "loss": 0.4424, "step": 14640 }, { "epoch": 0.5278048077269615, "grad_norm": 0.1912750005722046, "learning_rate": 
4.925579841278226e-05, "loss": 0.4668, "step": 14645 }, { "epoch": 0.5279850073881861, "grad_norm": 0.19278542697429657, "learning_rate": 4.9255091540942905e-05, "loss": 0.4311, "step": 14650 }, { "epoch": 0.5281652070494107, "grad_norm": 0.14870208501815796, "learning_rate": 4.925438433863262e-05, "loss": 0.4558, "step": 14655 }, { "epoch": 0.5283454067106353, "grad_norm": 0.15407264232635498, "learning_rate": 4.925367680586104e-05, "loss": 0.4101, "step": 14660 }, { "epoch": 0.52852560637186, "grad_norm": 0.183024600148201, "learning_rate": 4.925296894263782e-05, "loss": 0.4485, "step": 14665 }, { "epoch": 0.5287058060330847, "grad_norm": 0.15193411707878113, "learning_rate": 4.925226074897259e-05, "loss": 0.4471, "step": 14670 }, { "epoch": 0.5288860056943093, "grad_norm": 0.18354560434818268, "learning_rate": 4.9251552224875e-05, "loss": 0.4246, "step": 14675 }, { "epoch": 0.529066205355534, "grad_norm": 0.16226573288440704, "learning_rate": 4.9250843370354704e-05, "loss": 0.4549, "step": 14680 }, { "epoch": 0.5292464050167586, "grad_norm": 0.18447591364383698, "learning_rate": 4.925013418542136e-05, "loss": 0.4531, "step": 14685 }, { "epoch": 0.5294266046779832, "grad_norm": 0.12299656867980957, "learning_rate": 4.9249424670084636e-05, "loss": 0.4247, "step": 14690 }, { "epoch": 0.5296068043392078, "grad_norm": 0.19318439066410065, "learning_rate": 4.92487148243542e-05, "loss": 0.4277, "step": 14695 }, { "epoch": 0.5297870040004324, "grad_norm": 0.22538743913173676, "learning_rate": 4.924800464823971e-05, "loss": 0.4561, "step": 14700 }, { "epoch": 0.5299672036616572, "grad_norm": 0.15232376754283905, "learning_rate": 4.9247294141750864e-05, "loss": 0.4437, "step": 14705 }, { "epoch": 0.5301474033228818, "grad_norm": 0.2040311098098755, "learning_rate": 4.924658330489732e-05, "loss": 0.451, "step": 14710 }, { "epoch": 0.5303276029841064, "grad_norm": 0.1730954647064209, "learning_rate": 4.9245872137688776e-05, "loss": 0.4292, "step": 14715 }, { "epoch": 
0.530507802645331, "grad_norm": 0.16043753921985626, "learning_rate": 4.9245160640134916e-05, "loss": 0.4814, "step": 14720 }, { "epoch": 0.5306880023065557, "grad_norm": 0.17977766692638397, "learning_rate": 4.924444881224544e-05, "loss": 0.4409, "step": 14725 }, { "epoch": 0.5308682019677803, "grad_norm": 0.21359415352344513, "learning_rate": 4.924373665403004e-05, "loss": 0.4522, "step": 14730 }, { "epoch": 0.5310484016290049, "grad_norm": 0.1500881314277649, "learning_rate": 4.924302416549842e-05, "loss": 0.4555, "step": 14735 }, { "epoch": 0.5312286012902295, "grad_norm": 0.18042995035648346, "learning_rate": 4.92423113466603e-05, "loss": 0.467, "step": 14740 }, { "epoch": 0.5314088009514543, "grad_norm": 0.14702163636684418, "learning_rate": 4.9241598197525374e-05, "loss": 0.465, "step": 14745 }, { "epoch": 0.5315890006126789, "grad_norm": 0.14355307817459106, "learning_rate": 4.9240884718103366e-05, "loss": 0.4373, "step": 14750 }, { "epoch": 0.5317692002739035, "grad_norm": 0.17057915031909943, "learning_rate": 4.9240170908403996e-05, "loss": 0.4753, "step": 14755 }, { "epoch": 0.5319493999351281, "grad_norm": 0.16943103075027466, "learning_rate": 4.9239456768436985e-05, "loss": 0.442, "step": 14760 }, { "epoch": 0.5321295995963528, "grad_norm": 0.21389932930469513, "learning_rate": 4.923874229821208e-05, "loss": 0.4608, "step": 14765 }, { "epoch": 0.5323097992575774, "grad_norm": 0.17261211574077606, "learning_rate": 4.9238027497738995e-05, "loss": 0.4512, "step": 14770 }, { "epoch": 0.532489998918802, "grad_norm": 0.1695832759141922, "learning_rate": 4.9237312367027484e-05, "loss": 0.4398, "step": 14775 }, { "epoch": 0.5326701985800266, "grad_norm": 0.17145270109176636, "learning_rate": 4.923659690608728e-05, "loss": 0.4597, "step": 14780 }, { "epoch": 0.5328503982412514, "grad_norm": 0.193446546792984, "learning_rate": 4.923588111492814e-05, "loss": 0.4286, "step": 14785 }, { "epoch": 0.533030597902476, "grad_norm": 0.19881367683410645, "learning_rate": 
4.923516499355981e-05, "loss": 0.4713, "step": 14790 }, { "epoch": 0.5332107975637006, "grad_norm": 0.16294337809085846, "learning_rate": 4.9234448541992045e-05, "loss": 0.3998, "step": 14795 }, { "epoch": 0.5333909972249252, "grad_norm": 0.1420733481645584, "learning_rate": 4.9233731760234616e-05, "loss": 0.4301, "step": 14800 }, { "epoch": 0.5335711968861498, "grad_norm": 0.17047178745269775, "learning_rate": 4.923301464829728e-05, "loss": 0.4181, "step": 14805 }, { "epoch": 0.5337513965473745, "grad_norm": 0.13956919312477112, "learning_rate": 4.923229720618981e-05, "loss": 0.455, "step": 14810 }, { "epoch": 0.5339315962085991, "grad_norm": 0.17539288103580475, "learning_rate": 4.923157943392199e-05, "loss": 0.4681, "step": 14815 }, { "epoch": 0.5341117958698237, "grad_norm": 0.19295528531074524, "learning_rate": 4.9230861331503586e-05, "loss": 0.4691, "step": 14820 }, { "epoch": 0.5342919955310484, "grad_norm": 0.15335458517074585, "learning_rate": 4.923014289894439e-05, "loss": 0.4094, "step": 14825 }, { "epoch": 0.5344721951922731, "grad_norm": 0.17784667015075684, "learning_rate": 4.922942413625418e-05, "loss": 0.4263, "step": 14830 }, { "epoch": 0.5346523948534977, "grad_norm": 0.14942409098148346, "learning_rate": 4.922870504344276e-05, "loss": 0.432, "step": 14835 }, { "epoch": 0.5348325945147223, "grad_norm": 0.1628066897392273, "learning_rate": 4.9227985620519934e-05, "loss": 0.4321, "step": 14840 }, { "epoch": 0.5350127941759469, "grad_norm": 0.17270931601524353, "learning_rate": 4.922726586749549e-05, "loss": 0.4449, "step": 14845 }, { "epoch": 0.5351929938371716, "grad_norm": 0.1773119568824768, "learning_rate": 4.922654578437923e-05, "loss": 0.4791, "step": 14850 }, { "epoch": 0.5353731934983962, "grad_norm": 0.1501081883907318, "learning_rate": 4.9225825371180985e-05, "loss": 0.4242, "step": 14855 }, { "epoch": 0.5355533931596208, "grad_norm": 0.16684913635253906, "learning_rate": 4.9225104627910553e-05, "loss": 0.4395, "step": 14860 }, { "epoch": 
0.5357335928208455, "grad_norm": 0.2014274299144745, "learning_rate": 4.922438355457777e-05, "loss": 0.4403, "step": 14865 }, { "epoch": 0.5359137924820702, "grad_norm": 0.14999887347221375, "learning_rate": 4.922366215119244e-05, "loss": 0.4354, "step": 14870 }, { "epoch": 0.5360939921432948, "grad_norm": 0.17910043895244598, "learning_rate": 4.922294041776441e-05, "loss": 0.4549, "step": 14875 }, { "epoch": 0.5362741918045194, "grad_norm": 0.1802271157503128, "learning_rate": 4.922221835430351e-05, "loss": 0.4787, "step": 14880 }, { "epoch": 0.536454391465744, "grad_norm": 0.15048733353614807, "learning_rate": 4.922149596081956e-05, "loss": 0.4334, "step": 14885 }, { "epoch": 0.5366345911269687, "grad_norm": 0.17902320623397827, "learning_rate": 4.9220773237322424e-05, "loss": 0.4446, "step": 14890 }, { "epoch": 0.5368147907881933, "grad_norm": 0.2159227430820465, "learning_rate": 4.922005018382195e-05, "loss": 0.398, "step": 14895 }, { "epoch": 0.536994990449418, "grad_norm": 0.17634055018424988, "learning_rate": 4.9219326800327967e-05, "loss": 0.4409, "step": 14900 }, { "epoch": 0.5371751901106426, "grad_norm": 0.15450234711170197, "learning_rate": 4.921860308685036e-05, "loss": 0.482, "step": 14905 }, { "epoch": 0.5373553897718673, "grad_norm": 0.18871042132377625, "learning_rate": 4.921787904339897e-05, "loss": 0.4748, "step": 14910 }, { "epoch": 0.5375355894330919, "grad_norm": 0.15520833432674408, "learning_rate": 4.921715466998366e-05, "loss": 0.4209, "step": 14915 }, { "epoch": 0.5377157890943165, "grad_norm": 0.17904292047023773, "learning_rate": 4.921642996661431e-05, "loss": 0.4665, "step": 14920 }, { "epoch": 0.5378959887555411, "grad_norm": 0.23425260186195374, "learning_rate": 4.9215704933300795e-05, "loss": 0.4156, "step": 14925 }, { "epoch": 0.5380761884167657, "grad_norm": 0.18364645540714264, "learning_rate": 4.921497957005299e-05, "loss": 0.4648, "step": 14930 }, { "epoch": 0.5382563880779904, "grad_norm": 0.13957159221172333, "learning_rate": 
4.921425387688077e-05, "loss": 0.4577, "step": 14935 }, { "epoch": 0.5384365877392151, "grad_norm": 0.1416681855916977, "learning_rate": 4.9213527853794025e-05, "loss": 0.4146, "step": 14940 }, { "epoch": 0.5386167874004397, "grad_norm": 0.18616509437561035, "learning_rate": 4.921280150080266e-05, "loss": 0.4689, "step": 14945 }, { "epoch": 0.5387969870616643, "grad_norm": 0.2297711819410324, "learning_rate": 4.9212074817916554e-05, "loss": 0.4326, "step": 14950 }, { "epoch": 0.538977186722889, "grad_norm": 0.17574380338191986, "learning_rate": 4.9211347805145626e-05, "loss": 0.4521, "step": 14955 }, { "epoch": 0.5391573863841136, "grad_norm": 0.15670587122440338, "learning_rate": 4.921062046249976e-05, "loss": 0.4515, "step": 14960 }, { "epoch": 0.5393375860453382, "grad_norm": 0.17424379289150238, "learning_rate": 4.9209892789988886e-05, "loss": 0.4088, "step": 14965 }, { "epoch": 0.5395177857065628, "grad_norm": 0.16352404654026031, "learning_rate": 4.920916478762291e-05, "loss": 0.4369, "step": 14970 }, { "epoch": 0.5396979853677875, "grad_norm": 0.1390729397535324, "learning_rate": 4.920843645541174e-05, "loss": 0.442, "step": 14975 }, { "epoch": 0.5398781850290122, "grad_norm": 0.20522846281528473, "learning_rate": 4.9207707793365325e-05, "loss": 0.4439, "step": 14980 }, { "epoch": 0.5400583846902368, "grad_norm": 0.18843060731887817, "learning_rate": 4.9206978801493574e-05, "loss": 0.4427, "step": 14985 }, { "epoch": 0.5402385843514614, "grad_norm": 0.1719266027212143, "learning_rate": 4.920624947980642e-05, "loss": 0.4301, "step": 14990 }, { "epoch": 0.5404187840126861, "grad_norm": 0.21065543591976166, "learning_rate": 4.9205519828313804e-05, "loss": 0.4552, "step": 14995 }, { "epoch": 0.5405989836739107, "grad_norm": 0.16618047654628754, "learning_rate": 4.9204789847025666e-05, "loss": 0.4583, "step": 15000 }, { "epoch": 0.5405989836739107, "eval_loss": 0.4699234068393707, "eval_runtime": 3.5836, "eval_samples_per_second": 27.905, "eval_steps_per_second": 
6.976, "step": 15000 }, { "epoch": 0.5407791833351353, "grad_norm": 0.16865772008895874, "learning_rate": 4.920405953595196e-05, "loss": 0.4513, "step": 15005 }, { "epoch": 0.5409593829963599, "grad_norm": 0.18350325524806976, "learning_rate": 4.9203328895102616e-05, "loss": 0.4655, "step": 15010 }, { "epoch": 0.5411395826575846, "grad_norm": 0.1351919025182724, "learning_rate": 4.920259792448761e-05, "loss": 0.4242, "step": 15015 }, { "epoch": 0.5413197823188093, "grad_norm": 0.18003039062023163, "learning_rate": 4.9201866624116896e-05, "loss": 0.4616, "step": 15020 }, { "epoch": 0.5414999819800339, "grad_norm": 0.19233831763267517, "learning_rate": 4.920113499400043e-05, "loss": 0.4467, "step": 15025 }, { "epoch": 0.5416801816412585, "grad_norm": 0.16504104435443878, "learning_rate": 4.9200403034148186e-05, "loss": 0.4578, "step": 15030 }, { "epoch": 0.5418603813024832, "grad_norm": 0.2171589583158493, "learning_rate": 4.919967074457014e-05, "loss": 0.4737, "step": 15035 }, { "epoch": 0.5420405809637078, "grad_norm": 0.17700451612472534, "learning_rate": 4.919893812527626e-05, "loss": 0.4803, "step": 15040 }, { "epoch": 0.5422207806249324, "grad_norm": 0.15637724101543427, "learning_rate": 4.919820517627653e-05, "loss": 0.4535, "step": 15045 }, { "epoch": 0.542400980286157, "grad_norm": 0.18739740550518036, "learning_rate": 4.9197471897580945e-05, "loss": 0.4589, "step": 15050 }, { "epoch": 0.5425811799473818, "grad_norm": 0.1649918407201767, "learning_rate": 4.9196738289199484e-05, "loss": 0.4825, "step": 15055 }, { "epoch": 0.5427613796086064, "grad_norm": 0.15141309797763824, "learning_rate": 4.9196004351142156e-05, "loss": 0.4802, "step": 15060 }, { "epoch": 0.542941579269831, "grad_norm": 0.18119317293167114, "learning_rate": 4.9195270083418946e-05, "loss": 0.469, "step": 15065 }, { "epoch": 0.5431217789310556, "grad_norm": 0.145644873380661, "learning_rate": 4.919453548603987e-05, "loss": 0.4697, "step": 15070 }, { "epoch": 0.5433019785922802, "grad_norm": 
0.19345882534980774, "learning_rate": 4.9193800559014935e-05, "loss": 0.4733, "step": 15075 }, { "epoch": 0.5434821782535049, "grad_norm": 0.16904519498348236, "learning_rate": 4.919306530235415e-05, "loss": 0.4677, "step": 15080 }, { "epoch": 0.5436623779147295, "grad_norm": 0.15902064740657806, "learning_rate": 4.919232971606753e-05, "loss": 0.4743, "step": 15085 }, { "epoch": 0.5438425775759541, "grad_norm": 0.1665191799402237, "learning_rate": 4.91915938001651e-05, "loss": 0.4044, "step": 15090 }, { "epoch": 0.5440227772371788, "grad_norm": 0.19838020205497742, "learning_rate": 4.91908575546569e-05, "loss": 0.4449, "step": 15095 }, { "epoch": 0.5442029768984035, "grad_norm": 0.1867869794368744, "learning_rate": 4.919012097955294e-05, "loss": 0.4659, "step": 15100 }, { "epoch": 0.5443831765596281, "grad_norm": 0.1696668267250061, "learning_rate": 4.918938407486326e-05, "loss": 0.4349, "step": 15105 }, { "epoch": 0.5445633762208527, "grad_norm": 0.15441665053367615, "learning_rate": 4.918864684059792e-05, "loss": 0.4425, "step": 15110 }, { "epoch": 0.5447435758820773, "grad_norm": 0.1583663374185562, "learning_rate": 4.918790927676694e-05, "loss": 0.3933, "step": 15115 }, { "epoch": 0.544923775543302, "grad_norm": 0.16343651711940765, "learning_rate": 4.918717138338038e-05, "loss": 0.4006, "step": 15120 }, { "epoch": 0.5451039752045266, "grad_norm": 0.14582295715808868, "learning_rate": 4.918643316044829e-05, "loss": 0.4413, "step": 15125 }, { "epoch": 0.5452841748657512, "grad_norm": 0.15008191764354706, "learning_rate": 4.9185694607980737e-05, "loss": 0.4231, "step": 15130 }, { "epoch": 0.5454643745269759, "grad_norm": 0.16880400478839874, "learning_rate": 4.918495572598777e-05, "loss": 0.4398, "step": 15135 }, { "epoch": 0.5456445741882006, "grad_norm": 0.14454464614391327, "learning_rate": 4.918421651447948e-05, "loss": 0.4116, "step": 15140 }, { "epoch": 0.5458247738494252, "grad_norm": 0.22429820895195007, "learning_rate": 4.9183476973465905e-05, "loss": 
0.4625, "step": 15145 }, { "epoch": 0.5460049735106498, "grad_norm": 0.17825621366500854, "learning_rate": 4.9182737102957147e-05, "loss": 0.4437, "step": 15150 }, { "epoch": 0.5461851731718744, "grad_norm": 0.16203615069389343, "learning_rate": 4.9181996902963265e-05, "loss": 0.4262, "step": 15155 }, { "epoch": 0.546365372833099, "grad_norm": 0.2014383226633072, "learning_rate": 4.918125637349437e-05, "loss": 0.479, "step": 15160 }, { "epoch": 0.5465455724943237, "grad_norm": 0.16691508889198303, "learning_rate": 4.918051551456053e-05, "loss": 0.4402, "step": 15165 }, { "epoch": 0.5467257721555483, "grad_norm": 0.18141350150108337, "learning_rate": 4.917977432617186e-05, "loss": 0.4648, "step": 15170 }, { "epoch": 0.546905971816773, "grad_norm": 0.15297532081604004, "learning_rate": 4.9179032808338435e-05, "loss": 0.4106, "step": 15175 }, { "epoch": 0.5470861714779977, "grad_norm": 0.14727525413036346, "learning_rate": 4.917829096107037e-05, "loss": 0.4584, "step": 15180 }, { "epoch": 0.5472663711392223, "grad_norm": 0.18853066861629486, "learning_rate": 4.917754878437778e-05, "loss": 0.482, "step": 15185 }, { "epoch": 0.5474465708004469, "grad_norm": 0.196510449051857, "learning_rate": 4.9176806278270757e-05, "loss": 0.4466, "step": 15190 }, { "epoch": 0.5476267704616715, "grad_norm": 0.16554872691631317, "learning_rate": 4.917606344275944e-05, "loss": 0.4687, "step": 15195 }, { "epoch": 0.5478069701228961, "grad_norm": 0.15905487537384033, "learning_rate": 4.917532027785394e-05, "loss": 0.4855, "step": 15200 }, { "epoch": 0.5479871697841208, "grad_norm": 0.17025235295295715, "learning_rate": 4.917457678356437e-05, "loss": 0.4565, "step": 15205 }, { "epoch": 0.5481673694453455, "grad_norm": 0.18130546808242798, "learning_rate": 4.917383295990088e-05, "loss": 0.456, "step": 15210 }, { "epoch": 0.5483475691065701, "grad_norm": 0.13437901437282562, "learning_rate": 4.9173088806873596e-05, "loss": 0.4207, "step": 15215 }, { "epoch": 0.5485277687677947, "grad_norm": 
0.1458578109741211, "learning_rate": 4.917234432449266e-05, "loss": 0.4671, "step": 15220 }, { "epoch": 0.5487079684290194, "grad_norm": 0.16906622052192688, "learning_rate": 4.9171599512768206e-05, "loss": 0.4346, "step": 15225 }, { "epoch": 0.548888168090244, "grad_norm": 0.17966735363006592, "learning_rate": 4.917085437171038e-05, "loss": 0.4734, "step": 15230 }, { "epoch": 0.5490683677514686, "grad_norm": 0.135285422205925, "learning_rate": 4.917010890132936e-05, "loss": 0.4293, "step": 15235 }, { "epoch": 0.5492485674126932, "grad_norm": 0.15000662207603455, "learning_rate": 4.916936310163528e-05, "loss": 0.4076, "step": 15240 }, { "epoch": 0.5494287670739179, "grad_norm": 0.17892701923847198, "learning_rate": 4.9168616972638304e-05, "loss": 0.4461, "step": 15245 }, { "epoch": 0.5496089667351426, "grad_norm": 0.16764943301677704, "learning_rate": 4.91678705143486e-05, "loss": 0.4662, "step": 15250 }, { "epoch": 0.5497891663963672, "grad_norm": 0.19673018157482147, "learning_rate": 4.916712372677635e-05, "loss": 0.4367, "step": 15255 }, { "epoch": 0.5499693660575918, "grad_norm": 0.13246646523475647, "learning_rate": 4.916637660993171e-05, "loss": 0.4468, "step": 15260 }, { "epoch": 0.5501495657188165, "grad_norm": 0.1905840039253235, "learning_rate": 4.916562916382487e-05, "loss": 0.4495, "step": 15265 }, { "epoch": 0.5503297653800411, "grad_norm": 0.18271660804748535, "learning_rate": 4.916488138846601e-05, "loss": 0.4694, "step": 15270 }, { "epoch": 0.5505099650412657, "grad_norm": 0.15053506195545197, "learning_rate": 4.916413328386531e-05, "loss": 0.4371, "step": 15275 }, { "epoch": 0.5506901647024903, "grad_norm": 0.17273497581481934, "learning_rate": 4.916338485003298e-05, "loss": 0.4595, "step": 15280 }, { "epoch": 0.550870364363715, "grad_norm": 0.1739252358675003, "learning_rate": 4.916263608697921e-05, "loss": 0.4688, "step": 15285 }, { "epoch": 0.5510505640249397, "grad_norm": 0.16885848343372345, "learning_rate": 4.916188699471421e-05, "loss": 
0.4307, "step": 15290 }, { "epoch": 0.5512307636861643, "grad_norm": 0.15223874151706696, "learning_rate": 4.916113757324817e-05, "loss": 0.4597, "step": 15295 }, { "epoch": 0.5514109633473889, "grad_norm": 0.2001715898513794, "learning_rate": 4.9160387822591306e-05, "loss": 0.4417, "step": 15300 }, { "epoch": 0.5515911630086135, "grad_norm": 0.1790451556444168, "learning_rate": 4.915963774275384e-05, "loss": 0.447, "step": 15305 }, { "epoch": 0.5517713626698382, "grad_norm": 0.16498644649982452, "learning_rate": 4.915888733374598e-05, "loss": 0.4523, "step": 15310 }, { "epoch": 0.5519515623310628, "grad_norm": 0.13701501488685608, "learning_rate": 4.915813659557796e-05, "loss": 0.4299, "step": 15315 }, { "epoch": 0.5521317619922874, "grad_norm": 0.16090936958789825, "learning_rate": 4.9157385528260016e-05, "loss": 0.4038, "step": 15320 }, { "epoch": 0.552311961653512, "grad_norm": 0.156123086810112, "learning_rate": 4.915663413180236e-05, "loss": 0.4675, "step": 15325 }, { "epoch": 0.5524921613147368, "grad_norm": 0.17523333430290222, "learning_rate": 4.915588240621524e-05, "loss": 0.4965, "step": 15330 }, { "epoch": 0.5526723609759614, "grad_norm": 0.1823972910642624, "learning_rate": 4.9155130351508904e-05, "loss": 0.4191, "step": 15335 }, { "epoch": 0.552852560637186, "grad_norm": 0.19572138786315918, "learning_rate": 4.915437796769359e-05, "loss": 0.4305, "step": 15340 }, { "epoch": 0.5530327602984106, "grad_norm": 0.16662746667861938, "learning_rate": 4.915362525477955e-05, "loss": 0.4268, "step": 15345 }, { "epoch": 0.5532129599596353, "grad_norm": 0.22275352478027344, "learning_rate": 4.915287221277706e-05, "loss": 0.4884, "step": 15350 }, { "epoch": 0.5533931596208599, "grad_norm": 0.19144576787948608, "learning_rate": 4.915211884169635e-05, "loss": 0.4326, "step": 15355 }, { "epoch": 0.5535733592820845, "grad_norm": 0.19240762293338776, "learning_rate": 4.915136514154769e-05, "loss": 0.4484, "step": 15360 }, { "epoch": 0.5537535589433091, "grad_norm": 
0.13195356726646423, "learning_rate": 4.915061111234136e-05, "loss": 0.4294, "step": 15365 }, { "epoch": 0.5539337586045339, "grad_norm": 0.1802310198545456, "learning_rate": 4.914985675408763e-05, "loss": 0.4616, "step": 15370 }, { "epoch": 0.5541139582657585, "grad_norm": 0.12225214391946793, "learning_rate": 4.914910206679678e-05, "loss": 0.3886, "step": 15375 }, { "epoch": 0.5542941579269831, "grad_norm": 0.139492928981781, "learning_rate": 4.914834705047909e-05, "loss": 0.447, "step": 15380 }, { "epoch": 0.5544743575882077, "grad_norm": 0.14158743619918823, "learning_rate": 4.9147591705144844e-05, "loss": 0.4547, "step": 15385 }, { "epoch": 0.5546545572494324, "grad_norm": 0.164401575922966, "learning_rate": 4.9146836030804346e-05, "loss": 0.4442, "step": 15390 }, { "epoch": 0.554834756910657, "grad_norm": 0.17058567702770233, "learning_rate": 4.914608002746787e-05, "loss": 0.4267, "step": 15395 }, { "epoch": 0.5550149565718816, "grad_norm": 0.18380822241306305, "learning_rate": 4.914532369514573e-05, "loss": 0.4541, "step": 15400 }, { "epoch": 0.5551951562331063, "grad_norm": 0.1776258945465088, "learning_rate": 4.914456703384823e-05, "loss": 0.4566, "step": 15405 }, { "epoch": 0.555375355894331, "grad_norm": 0.13963723182678223, "learning_rate": 4.914381004358568e-05, "loss": 0.4172, "step": 15410 }, { "epoch": 0.5555555555555556, "grad_norm": 0.1411467045545578, "learning_rate": 4.9143052724368396e-05, "loss": 0.3885, "step": 15415 }, { "epoch": 0.5557357552167802, "grad_norm": 0.1662912219762802, "learning_rate": 4.914229507620669e-05, "loss": 0.4943, "step": 15420 }, { "epoch": 0.5559159548780048, "grad_norm": 0.2055153101682663, "learning_rate": 4.914153709911088e-05, "loss": 0.4344, "step": 15425 }, { "epoch": 0.5560961545392294, "grad_norm": 0.17844471335411072, "learning_rate": 4.9140778793091316e-05, "loss": 0.4778, "step": 15430 }, { "epoch": 0.5562763542004541, "grad_norm": 0.17044708132743835, "learning_rate": 4.9140020158158305e-05, "loss": 
0.4267, "step": 15435 }, { "epoch": 0.5564565538616787, "grad_norm": 0.1435849368572235, "learning_rate": 4.91392611943222e-05, "loss": 0.4465, "step": 15440 }, { "epoch": 0.5566367535229034, "grad_norm": 0.19326213002204895, "learning_rate": 4.913850190159333e-05, "loss": 0.4709, "step": 15445 }, { "epoch": 0.556816953184128, "grad_norm": 0.17136943340301514, "learning_rate": 4.9137742279982035e-05, "loss": 0.4668, "step": 15450 }, { "epoch": 0.5569971528453527, "grad_norm": 0.16831465065479279, "learning_rate": 4.913698232949868e-05, "loss": 0.471, "step": 15455 }, { "epoch": 0.5571773525065773, "grad_norm": 0.20569349825382233, "learning_rate": 4.9136222050153626e-05, "loss": 0.4573, "step": 15460 }, { "epoch": 0.5573575521678019, "grad_norm": 0.1882450133562088, "learning_rate": 4.913546144195721e-05, "loss": 0.4867, "step": 15465 }, { "epoch": 0.5575377518290265, "grad_norm": 0.15884269773960114, "learning_rate": 4.9134700504919805e-05, "loss": 0.4389, "step": 15470 }, { "epoch": 0.5577179514902512, "grad_norm": 0.16877703368663788, "learning_rate": 4.913393923905178e-05, "loss": 0.4443, "step": 15475 }, { "epoch": 0.5578981511514758, "grad_norm": 0.18422040343284607, "learning_rate": 4.9133177644363506e-05, "loss": 0.4407, "step": 15480 }, { "epoch": 0.5580783508127005, "grad_norm": 0.1388750970363617, "learning_rate": 4.9132415720865355e-05, "loss": 0.4309, "step": 15485 }, { "epoch": 0.5582585504739251, "grad_norm": 0.16202110052108765, "learning_rate": 4.913165346856772e-05, "loss": 0.4428, "step": 15490 }, { "epoch": 0.5584387501351498, "grad_norm": 0.17716795206069946, "learning_rate": 4.9130890887480966e-05, "loss": 0.4904, "step": 15495 }, { "epoch": 0.5586189497963744, "grad_norm": 0.14335161447525024, "learning_rate": 4.91301279776155e-05, "loss": 0.4346, "step": 15500 }, { "epoch": 0.5586189497963744, "eval_loss": 0.4687560200691223, "eval_runtime": 3.5676, "eval_samples_per_second": 28.03, "eval_steps_per_second": 7.008, "step": 15500 }, { "epoch": 
0.558799149457599, "grad_norm": 0.1963035613298416, "learning_rate": 4.912936473898172e-05, "loss": 0.4499, "step": 15505 }, { "epoch": 0.5589793491188236, "grad_norm": 0.1802736520767212, "learning_rate": 4.912860117159001e-05, "loss": 0.4784, "step": 15510 }, { "epoch": 0.5591595487800483, "grad_norm": 0.16385473310947418, "learning_rate": 4.9127837275450786e-05, "loss": 0.428, "step": 15515 }, { "epoch": 0.5593397484412729, "grad_norm": 0.1578623354434967, "learning_rate": 4.9127073050574445e-05, "loss": 0.4125, "step": 15520 }, { "epoch": 0.5595199481024976, "grad_norm": 0.2023286670446396, "learning_rate": 4.912630849697141e-05, "loss": 0.4491, "step": 15525 }, { "epoch": 0.5597001477637222, "grad_norm": 0.17793312668800354, "learning_rate": 4.912554361465208e-05, "loss": 0.435, "step": 15530 }, { "epoch": 0.5598803474249469, "grad_norm": 0.18119443953037262, "learning_rate": 4.91247784036269e-05, "loss": 0.4516, "step": 15535 }, { "epoch": 0.5600605470861715, "grad_norm": 0.15839527547359467, "learning_rate": 4.912401286390629e-05, "loss": 0.4549, "step": 15540 }, { "epoch": 0.5602407467473961, "grad_norm": 0.19357532262802124, "learning_rate": 4.912324699550066e-05, "loss": 0.4639, "step": 15545 }, { "epoch": 0.5604209464086207, "grad_norm": 0.16731002926826477, "learning_rate": 4.9122480798420474e-05, "loss": 0.4079, "step": 15550 }, { "epoch": 0.5606011460698453, "grad_norm": 0.16960617899894714, "learning_rate": 4.912171427267615e-05, "loss": 0.4516, "step": 15555 }, { "epoch": 0.5607813457310701, "grad_norm": 0.16323991119861603, "learning_rate": 4.912094741827814e-05, "loss": 0.4396, "step": 15560 }, { "epoch": 0.5609615453922947, "grad_norm": 0.17487618327140808, "learning_rate": 4.9120180235236895e-05, "loss": 0.4219, "step": 15565 }, { "epoch": 0.5611417450535193, "grad_norm": 0.15167950093746185, "learning_rate": 4.911941272356286e-05, "loss": 0.4486, "step": 15570 }, { "epoch": 0.561321944714744, "grad_norm": 0.17126460373401642, "learning_rate": 
4.91186448832665e-05, "loss": 0.4338, "step": 15575 }, { "epoch": 0.5615021443759686, "grad_norm": 0.18867506086826324, "learning_rate": 4.911787671435827e-05, "loss": 0.4422, "step": 15580 }, { "epoch": 0.5616823440371932, "grad_norm": 0.1628337949514389, "learning_rate": 4.911710821684864e-05, "loss": 0.4467, "step": 15585 }, { "epoch": 0.5618625436984178, "grad_norm": 0.1885543018579483, "learning_rate": 4.911633939074808e-05, "loss": 0.4332, "step": 15590 }, { "epoch": 0.5620427433596424, "grad_norm": 0.20939011871814728, "learning_rate": 4.9115570236067065e-05, "loss": 0.4422, "step": 15595 }, { "epoch": 0.5622229430208672, "grad_norm": 0.15987086296081543, "learning_rate": 4.911480075281607e-05, "loss": 0.4408, "step": 15600 }, { "epoch": 0.5624031426820918, "grad_norm": 0.13029427826404572, "learning_rate": 4.911403094100559e-05, "loss": 0.4403, "step": 15605 }, { "epoch": 0.5625833423433164, "grad_norm": 0.18694214522838593, "learning_rate": 4.91132608006461e-05, "loss": 0.4415, "step": 15610 }, { "epoch": 0.562763542004541, "grad_norm": 0.1758272796869278, "learning_rate": 4.9112490331748105e-05, "loss": 0.4124, "step": 15615 }, { "epoch": 0.5629437416657657, "grad_norm": 0.16514959931373596, "learning_rate": 4.91117195343221e-05, "loss": 0.4524, "step": 15620 }, { "epoch": 0.5631239413269903, "grad_norm": 0.14684879779815674, "learning_rate": 4.911094840837859e-05, "loss": 0.4564, "step": 15625 }, { "epoch": 0.5633041409882149, "grad_norm": 0.17186830937862396, "learning_rate": 4.911017695392807e-05, "loss": 0.4371, "step": 15630 }, { "epoch": 0.5634843406494395, "grad_norm": 0.1487584263086319, "learning_rate": 4.910940517098105e-05, "loss": 0.4323, "step": 15635 }, { "epoch": 0.5636645403106643, "grad_norm": 0.1303669810295105, "learning_rate": 4.910863305954805e-05, "loss": 0.4372, "step": 15640 }, { "epoch": 0.5638447399718889, "grad_norm": 0.16420136392116547, "learning_rate": 4.91078606196396e-05, "loss": 0.4775, "step": 15645 }, { "epoch": 
0.5640249396331135, "grad_norm": 0.18261446058750153, "learning_rate": 4.910708785126622e-05, "loss": 0.4135, "step": 15650 }, { "epoch": 0.5642051392943381, "grad_norm": 0.1571108102798462, "learning_rate": 4.910631475443843e-05, "loss": 0.4463, "step": 15655 }, { "epoch": 0.5643853389555628, "grad_norm": 0.18948879837989807, "learning_rate": 4.910554132916677e-05, "loss": 0.4544, "step": 15660 }, { "epoch": 0.5645655386167874, "grad_norm": 0.19807395339012146, "learning_rate": 4.910476757546177e-05, "loss": 0.4835, "step": 15665 }, { "epoch": 0.564745738278012, "grad_norm": 0.1863390952348709, "learning_rate": 4.910399349333399e-05, "loss": 0.4522, "step": 15670 }, { "epoch": 0.5649259379392366, "grad_norm": 0.17297495901584625, "learning_rate": 4.910321908279396e-05, "loss": 0.4493, "step": 15675 }, { "epoch": 0.5651061376004614, "grad_norm": 0.1477319598197937, "learning_rate": 4.910244434385224e-05, "loss": 0.4326, "step": 15680 }, { "epoch": 0.565286337261686, "grad_norm": 0.18947510421276093, "learning_rate": 4.9101669276519374e-05, "loss": 0.4266, "step": 15685 }, { "epoch": 0.5654665369229106, "grad_norm": 0.1564638763666153, "learning_rate": 4.910089388080593e-05, "loss": 0.4275, "step": 15690 }, { "epoch": 0.5656467365841352, "grad_norm": 0.1595679074525833, "learning_rate": 4.9100118156722485e-05, "loss": 0.4342, "step": 15695 }, { "epoch": 0.5658269362453598, "grad_norm": 0.16428592801094055, "learning_rate": 4.909934210427959e-05, "loss": 0.4652, "step": 15700 }, { "epoch": 0.5660071359065845, "grad_norm": 0.15228533744812012, "learning_rate": 4.909856572348782e-05, "loss": 0.3994, "step": 15705 }, { "epoch": 0.5661873355678091, "grad_norm": 0.18478292226791382, "learning_rate": 4.9097789014357763e-05, "loss": 0.4288, "step": 15710 }, { "epoch": 0.5663675352290337, "grad_norm": 0.1817883402109146, "learning_rate": 4.9097011976899996e-05, "loss": 0.4205, "step": 15715 }, { "epoch": 0.5665477348902584, "grad_norm": 0.16065534949302673, "learning_rate": 
4.90962346111251e-05, "loss": 0.4548, "step": 15720 }, { "epoch": 0.5667279345514831, "grad_norm": 0.1838037371635437, "learning_rate": 4.909545691704368e-05, "loss": 0.4084, "step": 15725 }, { "epoch": 0.5669081342127077, "grad_norm": 0.21379324793815613, "learning_rate": 4.909467889466632e-05, "loss": 0.4492, "step": 15730 }, { "epoch": 0.5670883338739323, "grad_norm": 0.18225577473640442, "learning_rate": 4.9093900544003625e-05, "loss": 0.4403, "step": 15735 }, { "epoch": 0.5672685335351569, "grad_norm": 0.16789789497852325, "learning_rate": 4.909327762711516e-05, "loss": 0.4592, "step": 15740 }, { "epoch": 0.5674487331963816, "grad_norm": 0.16580015420913696, "learning_rate": 4.909249868556559e-05, "loss": 0.4639, "step": 15745 }, { "epoch": 0.5676289328576062, "grad_norm": 0.16387851536273956, "learning_rate": 4.909171941576039e-05, "loss": 0.3932, "step": 15750 }, { "epoch": 0.5678091325188309, "grad_norm": 0.1637287735939026, "learning_rate": 4.909093981771018e-05, "loss": 0.4378, "step": 15755 }, { "epoch": 0.5679893321800555, "grad_norm": 0.1793089210987091, "learning_rate": 4.909015989142557e-05, "loss": 0.4265, "step": 15760 }, { "epoch": 0.5681695318412802, "grad_norm": 0.17664669454097748, "learning_rate": 4.908937963691721e-05, "loss": 0.4682, "step": 15765 }, { "epoch": 0.5683497315025048, "grad_norm": 0.17455190420150757, "learning_rate": 4.90885990541957e-05, "loss": 0.4658, "step": 15770 }, { "epoch": 0.5685299311637294, "grad_norm": 0.17027586698532104, "learning_rate": 4.9087818143271703e-05, "loss": 0.4062, "step": 15775 }, { "epoch": 0.568710130824954, "grad_norm": 0.23216940462589264, "learning_rate": 4.9087036904155844e-05, "loss": 0.4304, "step": 15780 }, { "epoch": 0.5688903304861787, "grad_norm": 0.14484082162380219, "learning_rate": 4.908625533685878e-05, "loss": 0.4446, "step": 15785 }, { "epoch": 0.5690705301474033, "grad_norm": 0.18710222840309143, "learning_rate": 4.908547344139114e-05, "loss": 0.4319, "step": 15790 }, { "epoch": 
0.569250729808628, "grad_norm": 0.16132237017154694, "learning_rate": 4.908469121776358e-05, "loss": 0.4643, "step": 15795 }, { "epoch": 0.5694309294698526, "grad_norm": 0.16530324518680573, "learning_rate": 4.908390866598678e-05, "loss": 0.4599, "step": 15800 }, { "epoch": 0.5696111291310773, "grad_norm": 0.16622117161750793, "learning_rate": 4.908312578607138e-05, "loss": 0.4115, "step": 15805 }, { "epoch": 0.5697913287923019, "grad_norm": 0.19069482386112213, "learning_rate": 4.9082342578028054e-05, "loss": 0.4046, "step": 15810 }, { "epoch": 0.5699715284535265, "grad_norm": 0.15032880008220673, "learning_rate": 4.908155904186747e-05, "loss": 0.4386, "step": 15815 }, { "epoch": 0.5701517281147511, "grad_norm": 0.18905720114707947, "learning_rate": 4.9080775177600316e-05, "loss": 0.4328, "step": 15820 }, { "epoch": 0.5703319277759757, "grad_norm": 0.1644611656665802, "learning_rate": 4.907999098523726e-05, "loss": 0.4288, "step": 15825 }, { "epoch": 0.5705121274372004, "grad_norm": 0.21615582704544067, "learning_rate": 4.9079206464788986e-05, "loss": 0.4575, "step": 15830 }, { "epoch": 0.5706923270984251, "grad_norm": 0.15707986056804657, "learning_rate": 4.907842161626618e-05, "loss": 0.4264, "step": 15835 }, { "epoch": 0.5708725267596497, "grad_norm": 0.22739115357398987, "learning_rate": 4.9077636439679554e-05, "loss": 0.4573, "step": 15840 }, { "epoch": 0.5710527264208743, "grad_norm": 0.16067542135715485, "learning_rate": 4.907685093503978e-05, "loss": 0.4617, "step": 15845 }, { "epoch": 0.571232926082099, "grad_norm": 0.17511925101280212, "learning_rate": 4.907606510235758e-05, "loss": 0.4553, "step": 15850 }, { "epoch": 0.5714131257433236, "grad_norm": 0.15019887685775757, "learning_rate": 4.9075278941643655e-05, "loss": 0.4247, "step": 15855 }, { "epoch": 0.5715933254045482, "grad_norm": 0.2024499922990799, "learning_rate": 4.907449245290872e-05, "loss": 0.4579, "step": 15860 }, { "epoch": 0.5717735250657728, "grad_norm": 0.1599128544330597, 
"learning_rate": 4.907370563616347e-05, "loss": 0.4592, "step": 15865 }, { "epoch": 0.5719537247269975, "grad_norm": 0.18762356042861938, "learning_rate": 4.907291849141865e-05, "loss": 0.4446, "step": 15870 }, { "epoch": 0.5721339243882222, "grad_norm": 0.20976999402046204, "learning_rate": 4.907213101868498e-05, "loss": 0.4023, "step": 15875 }, { "epoch": 0.5723141240494468, "grad_norm": 0.1850195974111557, "learning_rate": 4.9071343217973185e-05, "loss": 0.3954, "step": 15880 }, { "epoch": 0.5724943237106714, "grad_norm": 0.1581835001707077, "learning_rate": 4.9070555089294004e-05, "loss": 0.4613, "step": 15885 }, { "epoch": 0.5726745233718961, "grad_norm": 0.13934732973575592, "learning_rate": 4.906976663265816e-05, "loss": 0.3904, "step": 15890 }, { "epoch": 0.5728547230331207, "grad_norm": 0.18741364777088165, "learning_rate": 4.906897784807641e-05, "loss": 0.4099, "step": 15895 }, { "epoch": 0.5730349226943453, "grad_norm": 0.24076466262340546, "learning_rate": 4.9068188735559494e-05, "loss": 0.4394, "step": 15900 }, { "epoch": 0.5732151223555699, "grad_norm": 0.14442354440689087, "learning_rate": 4.9067399295118166e-05, "loss": 0.389, "step": 15905 }, { "epoch": 0.5733953220167947, "grad_norm": 0.15537168085575104, "learning_rate": 4.906660952676318e-05, "loss": 0.4864, "step": 15910 }, { "epoch": 0.5735755216780193, "grad_norm": 0.1750095635652542, "learning_rate": 4.906581943050531e-05, "loss": 0.453, "step": 15915 }, { "epoch": 0.5737557213392439, "grad_norm": 0.13674676418304443, "learning_rate": 4.90650290063553e-05, "loss": 0.4463, "step": 15920 }, { "epoch": 0.5739359210004685, "grad_norm": 0.21467788517475128, "learning_rate": 4.9064238254323934e-05, "loss": 0.4174, "step": 15925 }, { "epoch": 0.5741161206616932, "grad_norm": 0.19165554642677307, "learning_rate": 4.906344717442198e-05, "loss": 0.4429, "step": 15930 }, { "epoch": 0.5742963203229178, "grad_norm": 0.183024063706398, "learning_rate": 4.906265576666022e-05, "loss": 0.4571, "step": 15935 
}, { "epoch": 0.5744765199841424, "grad_norm": 0.18742796778678894, "learning_rate": 4.906186403104942e-05, "loss": 0.4394, "step": 15940 }, { "epoch": 0.574656719645367, "grad_norm": 0.17034278810024261, "learning_rate": 4.9061071967600394e-05, "loss": 0.4422, "step": 15945 }, { "epoch": 0.5748369193065918, "grad_norm": 0.19667524099349976, "learning_rate": 4.906027957632392e-05, "loss": 0.4346, "step": 15950 }, { "epoch": 0.5750171189678164, "grad_norm": 0.16463510692119598, "learning_rate": 4.9059486857230785e-05, "loss": 0.4951, "step": 15955 }, { "epoch": 0.575197318629041, "grad_norm": 0.1695079207420349, "learning_rate": 4.9058693810331815e-05, "loss": 0.4675, "step": 15960 }, { "epoch": 0.5753775182902656, "grad_norm": 0.13872882723808289, "learning_rate": 4.905790043563779e-05, "loss": 0.4046, "step": 15965 }, { "epoch": 0.5755577179514902, "grad_norm": 0.196579709649086, "learning_rate": 4.905710673315953e-05, "loss": 0.452, "step": 15970 }, { "epoch": 0.5757379176127149, "grad_norm": 0.195785254240036, "learning_rate": 4.9056312702907844e-05, "loss": 0.4594, "step": 15975 }, { "epoch": 0.5759181172739395, "grad_norm": 0.1784789115190506, "learning_rate": 4.905551834489356e-05, "loss": 0.4217, "step": 15980 }, { "epoch": 0.5760983169351641, "grad_norm": 0.1556771695613861, "learning_rate": 4.9054723659127496e-05, "loss": 0.4264, "step": 15985 }, { "epoch": 0.5762785165963888, "grad_norm": 0.1567842811346054, "learning_rate": 4.9053928645620484e-05, "loss": 0.449, "step": 15990 }, { "epoch": 0.5764587162576135, "grad_norm": 0.19066265225410461, "learning_rate": 4.9053133304383346e-05, "loss": 0.4512, "step": 15995 }, { "epoch": 0.5766389159188381, "grad_norm": 0.17999373376369476, "learning_rate": 4.9052337635426925e-05, "loss": 0.4467, "step": 16000 }, { "epoch": 0.5766389159188381, "eval_loss": 0.4678036570549011, "eval_runtime": 3.5673, "eval_samples_per_second": 28.032, "eval_steps_per_second": 7.008, "step": 16000 }, { "epoch": 0.5768191155800627, 
"grad_norm": 0.21432943642139435, "learning_rate": 4.9051541638762055e-05, "loss": 0.4813, "step": 16005 }, { "epoch": 0.5769993152412873, "grad_norm": 0.1877758800983429, "learning_rate": 4.905074531439959e-05, "loss": 0.4132, "step": 16010 }, { "epoch": 0.577179514902512, "grad_norm": 0.1863996535539627, "learning_rate": 4.904994866235038e-05, "loss": 0.4336, "step": 16015 }, { "epoch": 0.5773597145637366, "grad_norm": 0.16214637458324432, "learning_rate": 4.904915168262527e-05, "loss": 0.4463, "step": 16020 }, { "epoch": 0.5775399142249612, "grad_norm": 0.17068390548229218, "learning_rate": 4.9048354375235125e-05, "loss": 0.4509, "step": 16025 }, { "epoch": 0.5777201138861859, "grad_norm": 0.1428307294845581, "learning_rate": 4.9047556740190814e-05, "loss": 0.4417, "step": 16030 }, { "epoch": 0.5779003135474106, "grad_norm": 0.19168958067893982, "learning_rate": 4.90467587775032e-05, "loss": 0.4479, "step": 16035 }, { "epoch": 0.5780805132086352, "grad_norm": 0.15157431364059448, "learning_rate": 4.9045960487183144e-05, "loss": 0.4559, "step": 16040 }, { "epoch": 0.5782607128698598, "grad_norm": 0.17288236320018768, "learning_rate": 4.904516186924154e-05, "loss": 0.4378, "step": 16045 }, { "epoch": 0.5784409125310844, "grad_norm": 0.15723341703414917, "learning_rate": 4.904436292368925e-05, "loss": 0.4208, "step": 16050 }, { "epoch": 0.578621112192309, "grad_norm": 0.13947317004203796, "learning_rate": 4.9043563650537185e-05, "loss": 0.445, "step": 16055 }, { "epoch": 0.5788013118535337, "grad_norm": 0.18408885598182678, "learning_rate": 4.9042764049796205e-05, "loss": 0.4146, "step": 16060 }, { "epoch": 0.5789815115147584, "grad_norm": 0.17368938028812408, "learning_rate": 4.904196412147723e-05, "loss": 0.4689, "step": 16065 }, { "epoch": 0.579161711175983, "grad_norm": 0.14086973667144775, "learning_rate": 4.904116386559115e-05, "loss": 0.4167, "step": 16070 }, { "epoch": 0.5793419108372077, "grad_norm": 0.15786948800086975, "learning_rate": 
4.9040363282148854e-05, "loss": 0.4391, "step": 16075 }, { "epoch": 0.5795221104984323, "grad_norm": 0.15535566210746765, "learning_rate": 4.903956237116127e-05, "loss": 0.4552, "step": 16080 }, { "epoch": 0.5797023101596569, "grad_norm": 0.3356935679912567, "learning_rate": 4.9038761132639304e-05, "loss": 0.4424, "step": 16085 }, { "epoch": 0.5798825098208815, "grad_norm": 0.1919674128293991, "learning_rate": 4.903795956659387e-05, "loss": 0.4388, "step": 16090 }, { "epoch": 0.5800627094821061, "grad_norm": 0.18245883285999298, "learning_rate": 4.9037157673035894e-05, "loss": 0.452, "step": 16095 }, { "epoch": 0.5802429091433308, "grad_norm": 0.18774840235710144, "learning_rate": 4.903635545197629e-05, "loss": 0.4434, "step": 16100 }, { "epoch": 0.5804231088045555, "grad_norm": 0.1595427542924881, "learning_rate": 4.9035552903426006e-05, "loss": 0.4435, "step": 16105 }, { "epoch": 0.5806033084657801, "grad_norm": 0.1629880964756012, "learning_rate": 4.903475002739596e-05, "loss": 0.4895, "step": 16110 }, { "epoch": 0.5807835081270047, "grad_norm": 0.18076345324516296, "learning_rate": 4.903394682389711e-05, "loss": 0.4733, "step": 16115 }, { "epoch": 0.5809637077882294, "grad_norm": 0.16395287215709686, "learning_rate": 4.9033143292940376e-05, "loss": 0.4474, "step": 16120 }, { "epoch": 0.581143907449454, "grad_norm": 0.21613916754722595, "learning_rate": 4.903233943453672e-05, "loss": 0.4297, "step": 16125 }, { "epoch": 0.5813241071106786, "grad_norm": 0.15141765773296356, "learning_rate": 4.9031535248697095e-05, "loss": 0.4635, "step": 16130 }, { "epoch": 0.5815043067719032, "grad_norm": 0.18841727077960968, "learning_rate": 4.9030730735432453e-05, "loss": 0.4198, "step": 16135 }, { "epoch": 0.5816845064331279, "grad_norm": 0.1524806171655655, "learning_rate": 4.902992589475376e-05, "loss": 0.431, "step": 16140 }, { "epoch": 0.5818647060943526, "grad_norm": 0.21377435326576233, "learning_rate": 4.9029120726671974e-05, "loss": 0.4369, "step": 16145 }, { "epoch": 
0.5820449057555772, "grad_norm": 0.17817193269729614, "learning_rate": 4.902831523119808e-05, "loss": 0.4394, "step": 16150 }, { "epoch": 0.5822251054168018, "grad_norm": 0.14097528159618378, "learning_rate": 4.902750940834303e-05, "loss": 0.4648, "step": 16155 }, { "epoch": 0.5824053050780265, "grad_norm": 0.1693866103887558, "learning_rate": 4.9026703258117825e-05, "loss": 0.4576, "step": 16160 }, { "epoch": 0.5825855047392511, "grad_norm": 0.17576813697814941, "learning_rate": 4.902589678053343e-05, "loss": 0.4165, "step": 16165 }, { "epoch": 0.5827657044004757, "grad_norm": 0.16277313232421875, "learning_rate": 4.9025089975600855e-05, "loss": 0.4695, "step": 16170 }, { "epoch": 0.5829459040617003, "grad_norm": 0.17352886497974396, "learning_rate": 4.902428284333107e-05, "loss": 0.4715, "step": 16175 }, { "epoch": 0.583126103722925, "grad_norm": 0.16284751892089844, "learning_rate": 4.902347538373509e-05, "loss": 0.4683, "step": 16180 }, { "epoch": 0.5833063033841497, "grad_norm": 0.17194764316082, "learning_rate": 4.9022667596823904e-05, "loss": 0.4208, "step": 16185 }, { "epoch": 0.5834865030453743, "grad_norm": 0.1685447096824646, "learning_rate": 4.902185948260853e-05, "loss": 0.4543, "step": 16190 }, { "epoch": 0.5836667027065989, "grad_norm": 0.18297097086906433, "learning_rate": 4.9021051041099966e-05, "loss": 0.4427, "step": 16195 }, { "epoch": 0.5838469023678236, "grad_norm": 0.20284834504127502, "learning_rate": 4.902024227230924e-05, "loss": 0.4038, "step": 16200 }, { "epoch": 0.5840271020290482, "grad_norm": 0.1463736742734909, "learning_rate": 4.9019433176247353e-05, "loss": 0.4688, "step": 16205 }, { "epoch": 0.5842073016902728, "grad_norm": 0.14948871731758118, "learning_rate": 4.901862375292534e-05, "loss": 0.4649, "step": 16210 }, { "epoch": 0.5843875013514974, "grad_norm": 0.16728514432907104, "learning_rate": 4.901781400235423e-05, "loss": 0.4369, "step": 16215 }, { "epoch": 0.584567701012722, "grad_norm": 0.1717401146888733, "learning_rate": 
4.901700392454506e-05, "loss": 0.4683, "step": 16220 }, { "epoch": 0.5847479006739468, "grad_norm": 0.18547184765338898, "learning_rate": 4.9016193519508855e-05, "loss": 0.4614, "step": 16225 }, { "epoch": 0.5849281003351714, "grad_norm": 0.1905558854341507, "learning_rate": 4.901538278725666e-05, "loss": 0.4074, "step": 16230 }, { "epoch": 0.585108299996396, "grad_norm": 0.169610396027565, "learning_rate": 4.901457172779953e-05, "loss": 0.4232, "step": 16235 }, { "epoch": 0.5852884996576206, "grad_norm": 0.14899376034736633, "learning_rate": 4.9013760341148515e-05, "loss": 0.4495, "step": 16240 }, { "epoch": 0.5854686993188453, "grad_norm": 0.1737765222787857, "learning_rate": 4.901294862731466e-05, "loss": 0.4513, "step": 16245 }, { "epoch": 0.5856488989800699, "grad_norm": 0.18790043890476227, "learning_rate": 4.901213658630902e-05, "loss": 0.4812, "step": 16250 }, { "epoch": 0.5858290986412945, "grad_norm": 0.13040530681610107, "learning_rate": 4.901132421814267e-05, "loss": 0.4453, "step": 16255 }, { "epoch": 0.5860092983025192, "grad_norm": 0.1362471878528595, "learning_rate": 4.901051152282669e-05, "loss": 0.433, "step": 16260 }, { "epoch": 0.5861894979637439, "grad_norm": 0.1804387867450714, "learning_rate": 4.9009698500372124e-05, "loss": 0.4489, "step": 16265 }, { "epoch": 0.5863696976249685, "grad_norm": 0.1898290514945984, "learning_rate": 4.9008885150790076e-05, "loss": 0.4161, "step": 16270 }, { "epoch": 0.5865498972861931, "grad_norm": 0.16169433295726776, "learning_rate": 4.900807147409161e-05, "loss": 0.4563, "step": 16275 }, { "epoch": 0.5867300969474177, "grad_norm": 0.13549362123012543, "learning_rate": 4.900725747028782e-05, "loss": 0.4627, "step": 16280 }, { "epoch": 0.5869102966086424, "grad_norm": 0.15982681512832642, "learning_rate": 4.900644313938979e-05, "loss": 0.4483, "step": 16285 }, { "epoch": 0.587090496269867, "grad_norm": 0.14519517123699188, "learning_rate": 4.900562848140863e-05, "loss": 0.4483, "step": 16290 }, { "epoch": 
0.5872706959310916, "grad_norm": 0.21161200106143951, "learning_rate": 4.900481349635542e-05, "loss": 0.4173, "step": 16295 }, { "epoch": 0.5874508955923163, "grad_norm": 0.14361968636512756, "learning_rate": 4.9003998184241275e-05, "loss": 0.477, "step": 16300 }, { "epoch": 0.587631095253541, "grad_norm": 0.16247104108333588, "learning_rate": 4.9003182545077305e-05, "loss": 0.4436, "step": 16305 }, { "epoch": 0.5878112949147656, "grad_norm": 0.21194829046726227, "learning_rate": 4.9002366578874626e-05, "loss": 0.4429, "step": 16310 }, { "epoch": 0.5879914945759902, "grad_norm": 0.15204815566539764, "learning_rate": 4.9001550285644336e-05, "loss": 0.4554, "step": 16315 }, { "epoch": 0.5881716942372148, "grad_norm": 0.16609960794448853, "learning_rate": 4.900073366539758e-05, "loss": 0.4564, "step": 16320 }, { "epoch": 0.5883518938984394, "grad_norm": 0.1630159169435501, "learning_rate": 4.8999916718145475e-05, "loss": 0.4304, "step": 16325 }, { "epoch": 0.5885320935596641, "grad_norm": 0.1939852088689804, "learning_rate": 4.899909944389914e-05, "loss": 0.4236, "step": 16330 }, { "epoch": 0.5887122932208887, "grad_norm": 0.19106720387935638, "learning_rate": 4.8998281842669734e-05, "loss": 0.4602, "step": 16335 }, { "epoch": 0.5888924928821134, "grad_norm": 0.19278432428836823, "learning_rate": 4.8997463914468376e-05, "loss": 0.4576, "step": 16340 }, { "epoch": 0.589072692543338, "grad_norm": 0.20767638087272644, "learning_rate": 4.899664565930623e-05, "loss": 0.484, "step": 16345 }, { "epoch": 0.5892528922045627, "grad_norm": 0.1724158674478531, "learning_rate": 4.8995827077194425e-05, "loss": 0.4261, "step": 16350 }, { "epoch": 0.5894330918657873, "grad_norm": 0.1584085077047348, "learning_rate": 4.899500816814412e-05, "loss": 0.4627, "step": 16355 }, { "epoch": 0.5896132915270119, "grad_norm": 0.159664586186409, "learning_rate": 4.8994188932166473e-05, "loss": 0.4483, "step": 16360 }, { "epoch": 0.5897934911882365, "grad_norm": 0.14001238346099854, 
"learning_rate": 4.8993369369272646e-05, "loss": 0.4758, "step": 16365 }, { "epoch": 0.5899736908494612, "grad_norm": 0.1366659700870514, "learning_rate": 4.899254947947382e-05, "loss": 0.441, "step": 16370 }, { "epoch": 0.5901538905106858, "grad_norm": 0.13994751870632172, "learning_rate": 4.899172926278113e-05, "loss": 0.3618, "step": 16375 }, { "epoch": 0.5903340901719105, "grad_norm": 0.18011939525604248, "learning_rate": 4.899090871920579e-05, "loss": 0.444, "step": 16380 }, { "epoch": 0.5905142898331351, "grad_norm": 0.18300089240074158, "learning_rate": 4.899008784875896e-05, "loss": 0.4418, "step": 16385 }, { "epoch": 0.5906944894943598, "grad_norm": 0.16376063227653503, "learning_rate": 4.898926665145183e-05, "loss": 0.4708, "step": 16390 }, { "epoch": 0.5908746891555844, "grad_norm": 0.20874595642089844, "learning_rate": 4.898844512729558e-05, "loss": 0.4593, "step": 16395 }, { "epoch": 0.591054888816809, "grad_norm": 0.1489773839712143, "learning_rate": 4.898762327630142e-05, "loss": 0.4314, "step": 16400 }, { "epoch": 0.5912350884780336, "grad_norm": 0.1538877636194229, "learning_rate": 4.898680109848053e-05, "loss": 0.4327, "step": 16405 }, { "epoch": 0.5914152881392583, "grad_norm": 0.16070400178432465, "learning_rate": 4.8985978593844115e-05, "loss": 0.4837, "step": 16410 }, { "epoch": 0.591595487800483, "grad_norm": 0.15595030784606934, "learning_rate": 4.898515576240339e-05, "loss": 0.4543, "step": 16415 }, { "epoch": 0.5917756874617076, "grad_norm": 0.1852567046880722, "learning_rate": 4.898433260416956e-05, "loss": 0.4185, "step": 16420 }, { "epoch": 0.5919558871229322, "grad_norm": 0.18064150214195251, "learning_rate": 4.898350911915385e-05, "loss": 0.4524, "step": 16425 }, { "epoch": 0.5921360867841569, "grad_norm": 0.1406656950712204, "learning_rate": 4.898268530736746e-05, "loss": 0.4357, "step": 16430 }, { "epoch": 0.5923162864453815, "grad_norm": 0.1857532560825348, "learning_rate": 4.898186116882163e-05, "loss": 0.4287, "step": 16435 }, { 
"epoch": 0.5924964861066061, "grad_norm": 0.2215130776166916, "learning_rate": 4.8981036703527584e-05, "loss": 0.4508, "step": 16440 }, { "epoch": 0.5926766857678307, "grad_norm": 0.16718199849128723, "learning_rate": 4.898021191149655e-05, "loss": 0.4264, "step": 16445 }, { "epoch": 0.5928568854290553, "grad_norm": 0.16508464515209198, "learning_rate": 4.897938679273979e-05, "loss": 0.4389, "step": 16450 }, { "epoch": 0.5930370850902801, "grad_norm": 0.16538691520690918, "learning_rate": 4.897856134726851e-05, "loss": 0.4461, "step": 16455 }, { "epoch": 0.5932172847515047, "grad_norm": 0.16483493149280548, "learning_rate": 4.897773557509398e-05, "loss": 0.4803, "step": 16460 }, { "epoch": 0.5933974844127293, "grad_norm": 0.17354772984981537, "learning_rate": 4.897690947622745e-05, "loss": 0.4594, "step": 16465 }, { "epoch": 0.593577684073954, "grad_norm": 0.18836447596549988, "learning_rate": 4.8976083050680164e-05, "loss": 0.4594, "step": 16470 }, { "epoch": 0.5937578837351786, "grad_norm": 0.17023785412311554, "learning_rate": 4.897525629846339e-05, "loss": 0.4418, "step": 16475 }, { "epoch": 0.5939380833964032, "grad_norm": 0.13566334545612335, "learning_rate": 4.897442921958839e-05, "loss": 0.4338, "step": 16480 }, { "epoch": 0.5941182830576278, "grad_norm": 0.18862979114055634, "learning_rate": 4.8973601814066436e-05, "loss": 0.4653, "step": 16485 }, { "epoch": 0.5942984827188524, "grad_norm": 0.16662077605724335, "learning_rate": 4.897277408190879e-05, "loss": 0.4596, "step": 16490 }, { "epoch": 0.5944786823800772, "grad_norm": 0.20535314083099365, "learning_rate": 4.897194602312675e-05, "loss": 0.4829, "step": 16495 }, { "epoch": 0.5946588820413018, "grad_norm": 0.16183775663375854, "learning_rate": 4.8971117637731576e-05, "loss": 0.4483, "step": 16500 }, { "epoch": 0.5946588820413018, "eval_loss": 0.4669727683067322, "eval_runtime": 3.6173, "eval_samples_per_second": 27.645, "eval_steps_per_second": 6.911, "step": 16500 }, { "epoch": 0.5948390817025264, 
"grad_norm": 0.18323639035224915, "learning_rate": 4.8970288925734575e-05, "loss": 0.4502, "step": 16505 }, { "epoch": 0.595019281363751, "grad_norm": 0.1653471291065216, "learning_rate": 4.8969459887147025e-05, "loss": 0.4789, "step": 16510 }, { "epoch": 0.5951994810249757, "grad_norm": 0.14966969192028046, "learning_rate": 4.896863052198022e-05, "loss": 0.4468, "step": 16515 }, { "epoch": 0.5953796806862003, "grad_norm": 0.16616958379745483, "learning_rate": 4.896780083024547e-05, "loss": 0.49, "step": 16520 }, { "epoch": 0.5955598803474249, "grad_norm": 0.17226427793502808, "learning_rate": 4.896697081195407e-05, "loss": 0.4843, "step": 16525 }, { "epoch": 0.5957400800086495, "grad_norm": 0.21469104290008545, "learning_rate": 4.896614046711734e-05, "loss": 0.4526, "step": 16530 }, { "epoch": 0.5959202796698743, "grad_norm": 0.17571860551834106, "learning_rate": 4.896530979574658e-05, "loss": 0.4803, "step": 16535 }, { "epoch": 0.5961004793310989, "grad_norm": 0.16215255856513977, "learning_rate": 4.896447879785311e-05, "loss": 0.4276, "step": 16540 }, { "epoch": 0.5962806789923235, "grad_norm": 0.17751629650592804, "learning_rate": 4.896364747344827e-05, "loss": 0.4628, "step": 16545 }, { "epoch": 0.5964608786535481, "grad_norm": 0.1444987803697586, "learning_rate": 4.896281582254336e-05, "loss": 0.4643, "step": 16550 }, { "epoch": 0.5966410783147728, "grad_norm": 0.17875704169273376, "learning_rate": 4.8961983845149725e-05, "loss": 0.4847, "step": 16555 }, { "epoch": 0.5968212779759974, "grad_norm": 0.20280811190605164, "learning_rate": 4.8961151541278706e-05, "loss": 0.4574, "step": 16560 }, { "epoch": 0.597001477637222, "grad_norm": 0.1767544448375702, "learning_rate": 4.8960318910941626e-05, "loss": 0.4164, "step": 16565 }, { "epoch": 0.5971816772984467, "grad_norm": 0.17916817963123322, "learning_rate": 4.8959485954149855e-05, "loss": 0.4439, "step": 16570 }, { "epoch": 0.5973618769596714, "grad_norm": 0.16673676669597626, "learning_rate": 
4.895865267091471e-05, "loss": 0.4553, "step": 16575 }, { "epoch": 0.597542076620896, "grad_norm": 0.1650109440088272, "learning_rate": 4.895781906124757e-05, "loss": 0.4454, "step": 16580 }, { "epoch": 0.5977222762821206, "grad_norm": 0.1822521984577179, "learning_rate": 4.895698512515978e-05, "loss": 0.4683, "step": 16585 }, { "epoch": 0.5979024759433452, "grad_norm": 0.1935487687587738, "learning_rate": 4.895615086266271e-05, "loss": 0.4437, "step": 16590 }, { "epoch": 0.5980826756045698, "grad_norm": 0.15225858986377716, "learning_rate": 4.895531627376772e-05, "loss": 0.4567, "step": 16595 }, { "epoch": 0.5982628752657945, "grad_norm": 0.1286819726228714, "learning_rate": 4.8954481358486185e-05, "loss": 0.4368, "step": 16600 }, { "epoch": 0.5984430749270191, "grad_norm": 0.1508607119321823, "learning_rate": 4.8953646116829477e-05, "loss": 0.4622, "step": 16605 }, { "epoch": 0.5986232745882438, "grad_norm": 0.1493672877550125, "learning_rate": 4.895281054880898e-05, "loss": 0.4192, "step": 16610 }, { "epoch": 0.5988034742494684, "grad_norm": 0.17611226439476013, "learning_rate": 4.895197465443608e-05, "loss": 0.4706, "step": 16615 }, { "epoch": 0.5989836739106931, "grad_norm": 0.1974530965089798, "learning_rate": 4.8951138433722154e-05, "loss": 0.4066, "step": 16620 }, { "epoch": 0.5991638735719177, "grad_norm": 0.14464324712753296, "learning_rate": 4.895030188667861e-05, "loss": 0.4478, "step": 16625 }, { "epoch": 0.5993440732331423, "grad_norm": 0.13125242292881012, "learning_rate": 4.894946501331684e-05, "loss": 0.4564, "step": 16630 }, { "epoch": 0.5995242728943669, "grad_norm": 0.20426687598228455, "learning_rate": 4.894862781364824e-05, "loss": 0.4608, "step": 16635 }, { "epoch": 0.5997044725555916, "grad_norm": 0.17692770063877106, "learning_rate": 4.894779028768423e-05, "loss": 0.469, "step": 16640 }, { "epoch": 0.5998846722168162, "grad_norm": 0.18528582155704498, "learning_rate": 4.894695243543621e-05, "loss": 0.4498, "step": 16645 }, { "epoch": 
0.6000648718780409, "grad_norm": 0.19530199468135834, "learning_rate": 4.8946114256915597e-05, "loss": 0.4055, "step": 16650 }, { "epoch": 0.6002450715392655, "grad_norm": 0.1795613169670105, "learning_rate": 4.894527575213382e-05, "loss": 0.4876, "step": 16655 }, { "epoch": 0.6004252712004902, "grad_norm": 0.16493387520313263, "learning_rate": 4.8944436921102296e-05, "loss": 0.4382, "step": 16660 }, { "epoch": 0.6006054708617148, "grad_norm": 0.1708437204360962, "learning_rate": 4.894359776383245e-05, "loss": 0.4134, "step": 16665 }, { "epoch": 0.6007856705229394, "grad_norm": 0.14613789319992065, "learning_rate": 4.894275828033572e-05, "loss": 0.469, "step": 16670 }, { "epoch": 0.600965870184164, "grad_norm": 0.15820561349391937, "learning_rate": 4.894191847062355e-05, "loss": 0.435, "step": 16675 }, { "epoch": 0.6011460698453887, "grad_norm": 0.16094128787517548, "learning_rate": 4.894107833470737e-05, "loss": 0.4457, "step": 16680 }, { "epoch": 0.6013262695066133, "grad_norm": 0.21351011097431183, "learning_rate": 4.8940237872598635e-05, "loss": 0.4868, "step": 16685 }, { "epoch": 0.601506469167838, "grad_norm": 0.1446036994457245, "learning_rate": 4.8939397084308794e-05, "loss": 0.4357, "step": 16690 }, { "epoch": 0.6016866688290626, "grad_norm": 0.16345882415771484, "learning_rate": 4.8938555969849306e-05, "loss": 0.4722, "step": 16695 }, { "epoch": 0.6018668684902873, "grad_norm": 0.1572277694940567, "learning_rate": 4.893771452923162e-05, "loss": 0.4265, "step": 16700 }, { "epoch": 0.6020470681515119, "grad_norm": 0.188238263130188, "learning_rate": 4.893687276246721e-05, "loss": 0.4051, "step": 16705 }, { "epoch": 0.6022272678127365, "grad_norm": 0.15958626568317413, "learning_rate": 4.893603066956755e-05, "loss": 0.4454, "step": 16710 }, { "epoch": 0.6024074674739611, "grad_norm": 0.18399354815483093, "learning_rate": 4.89351882505441e-05, "loss": 0.4459, "step": 16715 }, { "epoch": 0.6025876671351857, "grad_norm": 0.16669590771198273, "learning_rate": 
4.8934345505408344e-05, "loss": 0.4402, "step": 16720 }, { "epoch": 0.6027678667964104, "grad_norm": 0.16864614188671112, "learning_rate": 4.893350243417177e-05, "loss": 0.4374, "step": 16725 }, { "epoch": 0.6029480664576351, "grad_norm": 0.1995852291584015, "learning_rate": 4.893265903684585e-05, "loss": 0.4212, "step": 16730 }, { "epoch": 0.6031282661188597, "grad_norm": 0.17713283002376556, "learning_rate": 4.8931815313442095e-05, "loss": 0.4731, "step": 16735 }, { "epoch": 0.6033084657800843, "grad_norm": 0.16761018335819244, "learning_rate": 4.893097126397198e-05, "loss": 0.4195, "step": 16740 }, { "epoch": 0.603488665441309, "grad_norm": 0.14120902121067047, "learning_rate": 4.893012688844702e-05, "loss": 0.4436, "step": 16745 }, { "epoch": 0.6036688651025336, "grad_norm": 0.1854945719242096, "learning_rate": 4.8929282186878714e-05, "loss": 0.4699, "step": 16750 }, { "epoch": 0.6038490647637582, "grad_norm": 0.15078695118427277, "learning_rate": 4.892843715927857e-05, "loss": 0.4322, "step": 16755 }, { "epoch": 0.6040292644249828, "grad_norm": 0.17069613933563232, "learning_rate": 4.89275918056581e-05, "loss": 0.4183, "step": 16760 }, { "epoch": 0.6042094640862076, "grad_norm": 0.1491325944662094, "learning_rate": 4.892674612602882e-05, "loss": 0.4358, "step": 16765 }, { "epoch": 0.6043896637474322, "grad_norm": 0.17146345973014832, "learning_rate": 4.892590012040227e-05, "loss": 0.4274, "step": 16770 }, { "epoch": 0.6045698634086568, "grad_norm": 0.1850728988647461, "learning_rate": 4.8925053788789954e-05, "loss": 0.4698, "step": 16775 }, { "epoch": 0.6047500630698814, "grad_norm": 0.1922415941953659, "learning_rate": 4.892420713120341e-05, "loss": 0.4253, "step": 16780 }, { "epoch": 0.6049302627311061, "grad_norm": 0.1879766434431076, "learning_rate": 4.8923360147654184e-05, "loss": 0.4515, "step": 16785 }, { "epoch": 0.6051104623923307, "grad_norm": 0.19672177731990814, "learning_rate": 4.89225128381538e-05, "loss": 0.4909, "step": 16790 }, { "epoch": 
0.6052906620535553, "grad_norm": 0.16113631427288055, "learning_rate": 4.8921665202713816e-05, "loss": 0.4287, "step": 16795 }, { "epoch": 0.6054708617147799, "grad_norm": 0.1752663552761078, "learning_rate": 4.8920817241345776e-05, "loss": 0.4615, "step": 16800 }, { "epoch": 0.6056510613760047, "grad_norm": 0.15758828818798065, "learning_rate": 4.8919968954061223e-05, "loss": 0.4314, "step": 16805 }, { "epoch": 0.6058312610372293, "grad_norm": 0.22554457187652588, "learning_rate": 4.891912034087173e-05, "loss": 0.48, "step": 16810 }, { "epoch": 0.6060114606984539, "grad_norm": 0.16206426918506622, "learning_rate": 4.8918271401788855e-05, "loss": 0.4626, "step": 16815 }, { "epoch": 0.6061916603596785, "grad_norm": 0.16757212579250336, "learning_rate": 4.8917422136824167e-05, "loss": 0.4692, "step": 16820 }, { "epoch": 0.6063718600209032, "grad_norm": 0.19474922120571136, "learning_rate": 4.891657254598922e-05, "loss": 0.4177, "step": 16825 }, { "epoch": 0.6065520596821278, "grad_norm": 0.15436606109142303, "learning_rate": 4.891572262929561e-05, "loss": 0.4621, "step": 16830 }, { "epoch": 0.6067322593433524, "grad_norm": 0.15269695222377777, "learning_rate": 4.89148723867549e-05, "loss": 0.4116, "step": 16835 }, { "epoch": 0.606912459004577, "grad_norm": 0.20142877101898193, "learning_rate": 4.89140218183787e-05, "loss": 0.4751, "step": 16840 }, { "epoch": 0.6070926586658018, "grad_norm": 0.14995922148227692, "learning_rate": 4.8913170924178576e-05, "loss": 0.3919, "step": 16845 }, { "epoch": 0.6072728583270264, "grad_norm": 0.179569274187088, "learning_rate": 4.8912319704166124e-05, "loss": 0.4542, "step": 16850 }, { "epoch": 0.607453057988251, "grad_norm": 0.15274690091609955, "learning_rate": 4.891146815835295e-05, "loss": 0.4364, "step": 16855 }, { "epoch": 0.6076332576494756, "grad_norm": 0.13948167860507965, "learning_rate": 4.8910616286750654e-05, "loss": 0.4454, "step": 16860 }, { "epoch": 0.6078134573107002, "grad_norm": 0.18718601763248444, 
"learning_rate": 4.890976408937084e-05, "loss": 0.4327, "step": 16865 }, { "epoch": 0.6079936569719249, "grad_norm": 0.2123367339372635, "learning_rate": 4.890891156622511e-05, "loss": 0.419, "step": 16870 }, { "epoch": 0.6081738566331495, "grad_norm": 0.18205073475837708, "learning_rate": 4.8908058717325097e-05, "loss": 0.4819, "step": 16875 }, { "epoch": 0.6083540562943741, "grad_norm": 0.1736924946308136, "learning_rate": 4.8907205542682414e-05, "loss": 0.4596, "step": 16880 }, { "epoch": 0.6085342559555988, "grad_norm": 0.18796013295650482, "learning_rate": 4.890635204230868e-05, "loss": 0.4273, "step": 16885 }, { "epoch": 0.6087144556168235, "grad_norm": 0.19030854105949402, "learning_rate": 4.890549821621553e-05, "loss": 0.4642, "step": 16890 }, { "epoch": 0.6088946552780481, "grad_norm": 0.16654013097286224, "learning_rate": 4.8904644064414585e-05, "loss": 0.4615, "step": 16895 }, { "epoch": 0.6090748549392727, "grad_norm": 0.20467707514762878, "learning_rate": 4.8903789586917505e-05, "loss": 0.4292, "step": 16900 }, { "epoch": 0.6092550546004973, "grad_norm": 0.14566195011138916, "learning_rate": 4.890293478373592e-05, "loss": 0.483, "step": 16905 }, { "epoch": 0.609435254261722, "grad_norm": 0.19296708703041077, "learning_rate": 4.8902079654881466e-05, "loss": 0.4536, "step": 16910 }, { "epoch": 0.6096154539229466, "grad_norm": 0.15972061455249786, "learning_rate": 4.890122420036581e-05, "loss": 0.4392, "step": 16915 }, { "epoch": 0.6097956535841713, "grad_norm": 0.18985244631767273, "learning_rate": 4.890036842020061e-05, "loss": 0.4542, "step": 16920 }, { "epoch": 0.6099758532453959, "grad_norm": 0.1495533585548401, "learning_rate": 4.88995123143975e-05, "loss": 0.4318, "step": 16925 }, { "epoch": 0.6101560529066206, "grad_norm": 0.21450436115264893, "learning_rate": 4.8898655882968175e-05, "loss": 0.4514, "step": 16930 }, { "epoch": 0.6103362525678452, "grad_norm": 0.2090601772069931, "learning_rate": 4.889779912592429e-05, "loss": 0.4552, "step": 16935 
}, { "epoch": 0.6105164522290698, "grad_norm": 0.1768369823694229, "learning_rate": 4.889694204327751e-05, "loss": 0.4526, "step": 16940 }, { "epoch": 0.6106966518902944, "grad_norm": 0.16384200751781464, "learning_rate": 4.889608463503953e-05, "loss": 0.4559, "step": 16945 }, { "epoch": 0.610876851551519, "grad_norm": 0.14642253518104553, "learning_rate": 4.8895226901222026e-05, "loss": 0.4559, "step": 16950 }, { "epoch": 0.6110570512127437, "grad_norm": 0.14469105005264282, "learning_rate": 4.889436884183667e-05, "loss": 0.4116, "step": 16955 }, { "epoch": 0.6112372508739684, "grad_norm": 0.15812519192695618, "learning_rate": 4.889351045689518e-05, "loss": 0.4722, "step": 16960 }, { "epoch": 0.611417450535193, "grad_norm": 0.18630902469158173, "learning_rate": 4.889265174640922e-05, "loss": 0.4555, "step": 16965 }, { "epoch": 0.6115976501964177, "grad_norm": 0.1799592524766922, "learning_rate": 4.889179271039052e-05, "loss": 0.4385, "step": 16970 }, { "epoch": 0.6117778498576423, "grad_norm": 0.17888915538787842, "learning_rate": 4.8890933348850757e-05, "loss": 0.4545, "step": 16975 }, { "epoch": 0.6119580495188669, "grad_norm": 0.16347283124923706, "learning_rate": 4.8890073661801655e-05, "loss": 0.3919, "step": 16980 }, { "epoch": 0.6121382491800915, "grad_norm": 0.22050714492797852, "learning_rate": 4.888921364925493e-05, "loss": 0.4469, "step": 16985 }, { "epoch": 0.6123184488413161, "grad_norm": 0.2015305459499359, "learning_rate": 4.888835331122229e-05, "loss": 0.4552, "step": 16990 }, { "epoch": 0.6124986485025408, "grad_norm": 0.1845419853925705, "learning_rate": 4.888749264771546e-05, "loss": 0.4543, "step": 16995 }, { "epoch": 0.6126788481637655, "grad_norm": 0.2000664323568344, "learning_rate": 4.8886631658746175e-05, "loss": 0.4648, "step": 17000 }, { "epoch": 0.6126788481637655, "eval_loss": 0.4671698808670044, "eval_runtime": 3.5317, "eval_samples_per_second": 28.315, "eval_steps_per_second": 7.079, "step": 17000 }, { "epoch": 0.6128590478249901, 
"grad_norm": 0.17209337651729584, "learning_rate": 4.8885770344326154e-05, "loss": 0.3971, "step": 17005 }, { "epoch": 0.6130392474862147, "grad_norm": 0.17581631243228912, "learning_rate": 4.8884908704467137e-05, "loss": 0.4352, "step": 17010 }, { "epoch": 0.6132194471474394, "grad_norm": 0.19064390659332275, "learning_rate": 4.888404673918085e-05, "loss": 0.4705, "step": 17015 }, { "epoch": 0.613399646808664, "grad_norm": 0.1969899833202362, "learning_rate": 4.8883184448479066e-05, "loss": 0.4157, "step": 17020 }, { "epoch": 0.6135798464698886, "grad_norm": 0.17131738364696503, "learning_rate": 4.888232183237352e-05, "loss": 0.436, "step": 17025 }, { "epoch": 0.6137600461311132, "grad_norm": 0.17256931960582733, "learning_rate": 4.888145889087595e-05, "loss": 0.4619, "step": 17030 }, { "epoch": 0.6139402457923379, "grad_norm": 0.17418308556079865, "learning_rate": 4.888059562399814e-05, "loss": 0.4318, "step": 17035 }, { "epoch": 0.6141204454535626, "grad_norm": 0.1743146926164627, "learning_rate": 4.887973203175183e-05, "loss": 0.4002, "step": 17040 }, { "epoch": 0.6143006451147872, "grad_norm": 0.168928861618042, "learning_rate": 4.88788681141488e-05, "loss": 0.4291, "step": 17045 }, { "epoch": 0.6144808447760118, "grad_norm": 0.1870613992214203, "learning_rate": 4.8878003871200807e-05, "loss": 0.4425, "step": 17050 }, { "epoch": 0.6146610444372365, "grad_norm": 0.1746305525302887, "learning_rate": 4.8877139302919636e-05, "loss": 0.4185, "step": 17055 }, { "epoch": 0.6148412440984611, "grad_norm": 0.16369090974330902, "learning_rate": 4.887627440931707e-05, "loss": 0.4495, "step": 17060 }, { "epoch": 0.6150214437596857, "grad_norm": 0.16876505315303802, "learning_rate": 4.887540919040488e-05, "loss": 0.4685, "step": 17065 }, { "epoch": 0.6152016434209103, "grad_norm": 0.17735637724399567, "learning_rate": 4.887454364619487e-05, "loss": 0.3699, "step": 17070 }, { "epoch": 0.6153818430821351, "grad_norm": 0.17825958132743835, "learning_rate": 
4.8873677776698824e-05, "loss": 0.4413, "step": 17075 }, { "epoch": 0.6155620427433597, "grad_norm": 0.21326866745948792, "learning_rate": 4.8872811581928536e-05, "loss": 0.4284, "step": 17080 }, { "epoch": 0.6157422424045843, "grad_norm": 0.16894589364528656, "learning_rate": 4.887194506189581e-05, "loss": 0.4346, "step": 17085 }, { "epoch": 0.6159224420658089, "grad_norm": 0.18406248092651367, "learning_rate": 4.887107821661247e-05, "loss": 0.4555, "step": 17090 }, { "epoch": 0.6161026417270336, "grad_norm": 0.17447605729103088, "learning_rate": 4.887021104609029e-05, "loss": 0.4917, "step": 17095 }, { "epoch": 0.6162828413882582, "grad_norm": 0.1875600963830948, "learning_rate": 4.886934355034112e-05, "loss": 0.4652, "step": 17100 }, { "epoch": 0.6164630410494828, "grad_norm": 0.1392199695110321, "learning_rate": 4.886847572937676e-05, "loss": 0.4185, "step": 17105 }, { "epoch": 0.6166432407107074, "grad_norm": 0.21595901250839233, "learning_rate": 4.886760758320904e-05, "loss": 0.445, "step": 17110 }, { "epoch": 0.6168234403719322, "grad_norm": 0.19852277636528015, "learning_rate": 4.886673911184978e-05, "loss": 0.4681, "step": 17115 }, { "epoch": 0.6170036400331568, "grad_norm": 0.15305131673812866, "learning_rate": 4.8865870315310834e-05, "loss": 0.4507, "step": 17120 }, { "epoch": 0.6171838396943814, "grad_norm": 0.1263405680656433, "learning_rate": 4.886500119360402e-05, "loss": 0.4257, "step": 17125 }, { "epoch": 0.617364039355606, "grad_norm": 0.17302808165550232, "learning_rate": 4.886413174674118e-05, "loss": 0.4358, "step": 17130 }, { "epoch": 0.6175442390168306, "grad_norm": 0.157601997256279, "learning_rate": 4.886326197473417e-05, "loss": 0.4449, "step": 17135 }, { "epoch": 0.6177244386780553, "grad_norm": 0.20116667449474335, "learning_rate": 4.8862391877594835e-05, "loss": 0.4696, "step": 17140 }, { "epoch": 0.6179046383392799, "grad_norm": 0.14023269712924957, "learning_rate": 4.886152145533503e-05, "loss": 0.4229, "step": 17145 }, { "epoch": 
0.6180848380005045, "grad_norm": 0.18752652406692505, "learning_rate": 4.886065070796662e-05, "loss": 0.4556, "step": 17150 }, { "epoch": 0.6182650376617292, "grad_norm": 0.1420794129371643, "learning_rate": 4.8859779635501456e-05, "loss": 0.4229, "step": 17155 }, { "epoch": 0.6184452373229539, "grad_norm": 0.13428914546966553, "learning_rate": 4.885890823795142e-05, "loss": 0.4079, "step": 17160 }, { "epoch": 0.6186254369841785, "grad_norm": 0.18215683102607727, "learning_rate": 4.885803651532838e-05, "loss": 0.4227, "step": 17165 }, { "epoch": 0.6188056366454031, "grad_norm": 0.14137986302375793, "learning_rate": 4.88571644676442e-05, "loss": 0.3939, "step": 17170 }, { "epoch": 0.6189858363066277, "grad_norm": 0.16938935220241547, "learning_rate": 4.885629209491078e-05, "loss": 0.4753, "step": 17175 }, { "epoch": 0.6191660359678524, "grad_norm": 0.16520722210407257, "learning_rate": 4.885541939714e-05, "loss": 0.4466, "step": 17180 }, { "epoch": 0.619346235629077, "grad_norm": 0.16338035464286804, "learning_rate": 4.885454637434375e-05, "loss": 0.4746, "step": 17185 }, { "epoch": 0.6195264352903016, "grad_norm": 0.1479274183511734, "learning_rate": 4.8853673026533926e-05, "loss": 0.4363, "step": 17190 }, { "epoch": 0.6197066349515263, "grad_norm": 0.1792302280664444, "learning_rate": 4.885279935372242e-05, "loss": 0.4159, "step": 17195 }, { "epoch": 0.619886834612751, "grad_norm": 0.1646362543106079, "learning_rate": 4.8851925355921144e-05, "loss": 0.4298, "step": 17200 }, { "epoch": 0.6200670342739756, "grad_norm": 0.20265613496303558, "learning_rate": 4.8851051033142004e-05, "loss": 0.4525, "step": 17205 }, { "epoch": 0.6202472339352002, "grad_norm": 0.18897970020771027, "learning_rate": 4.8850176385396904e-05, "loss": 0.4509, "step": 17210 }, { "epoch": 0.6204274335964248, "grad_norm": 0.2247178852558136, "learning_rate": 4.884930141269778e-05, "loss": 0.4515, "step": 17215 }, { "epoch": 0.6206076332576494, "grad_norm": 0.22851864993572235, "learning_rate": 
4.884842611505653e-05, "loss": 0.4337, "step": 17220 }, { "epoch": 0.6207878329188741, "grad_norm": 0.17176374793052673, "learning_rate": 4.8847550492485094e-05, "loss": 0.4607, "step": 17225 }, { "epoch": 0.6209680325800987, "grad_norm": 0.16532184183597565, "learning_rate": 4.884667454499541e-05, "loss": 0.4572, "step": 17230 }, { "epoch": 0.6211482322413234, "grad_norm": 0.16950088739395142, "learning_rate": 4.884579827259939e-05, "loss": 0.4596, "step": 17235 }, { "epoch": 0.621328431902548, "grad_norm": 0.13479334115982056, "learning_rate": 4.8844921675308985e-05, "loss": 0.4187, "step": 17240 }, { "epoch": 0.6215086315637727, "grad_norm": 0.1946098804473877, "learning_rate": 4.884404475313614e-05, "loss": 0.4806, "step": 17245 }, { "epoch": 0.6216888312249973, "grad_norm": 0.20148830115795135, "learning_rate": 4.884316750609281e-05, "loss": 0.4833, "step": 17250 }, { "epoch": 0.6218690308862219, "grad_norm": 0.18048211932182312, "learning_rate": 4.884228993419093e-05, "loss": 0.4797, "step": 17255 }, { "epoch": 0.6220492305474465, "grad_norm": 0.17456752061843872, "learning_rate": 4.884141203744248e-05, "loss": 0.4505, "step": 17260 }, { "epoch": 0.6222294302086712, "grad_norm": 0.19601954519748688, "learning_rate": 4.884053381585939e-05, "loss": 0.4685, "step": 17265 }, { "epoch": 0.6224096298698959, "grad_norm": 0.191552072763443, "learning_rate": 4.883965526945365e-05, "loss": 0.4564, "step": 17270 }, { "epoch": 0.6225898295311205, "grad_norm": 0.17397992312908173, "learning_rate": 4.883877639823722e-05, "loss": 0.4623, "step": 17275 }, { "epoch": 0.6227700291923451, "grad_norm": 0.1623370498418808, "learning_rate": 4.8837897202222076e-05, "loss": 0.4449, "step": 17280 }, { "epoch": 0.6229502288535698, "grad_norm": 0.15014895796775818, "learning_rate": 4.88370176814202e-05, "loss": 0.443, "step": 17285 }, { "epoch": 0.6231304285147944, "grad_norm": 0.17987573146820068, "learning_rate": 4.8836137835843574e-05, "loss": 0.4186, "step": 17290 }, { "epoch": 
0.623310628176019, "grad_norm": 0.16012683510780334, "learning_rate": 4.8835257665504184e-05, "loss": 0.4429, "step": 17295 }, { "epoch": 0.6234908278372436, "grad_norm": 0.17883889377117157, "learning_rate": 4.883437717041403e-05, "loss": 0.4408, "step": 17300 }, { "epoch": 0.6236710274984683, "grad_norm": 0.17255452275276184, "learning_rate": 4.883349635058508e-05, "loss": 0.4326, "step": 17305 }, { "epoch": 0.623851227159693, "grad_norm": 0.14830821752548218, "learning_rate": 4.883261520602937e-05, "loss": 0.4488, "step": 17310 }, { "epoch": 0.6240314268209176, "grad_norm": 0.16953101754188538, "learning_rate": 4.88317337367589e-05, "loss": 0.4381, "step": 17315 }, { "epoch": 0.6242116264821422, "grad_norm": 0.16917838156223297, "learning_rate": 4.883085194278566e-05, "loss": 0.4023, "step": 17320 }, { "epoch": 0.6243918261433669, "grad_norm": 0.14477120339870453, "learning_rate": 4.882996982412168e-05, "loss": 0.416, "step": 17325 }, { "epoch": 0.6245720258045915, "grad_norm": 0.1996786743402481, "learning_rate": 4.882908738077898e-05, "loss": 0.4539, "step": 17330 }, { "epoch": 0.6247522254658161, "grad_norm": 0.18695120513439178, "learning_rate": 4.8828204612769566e-05, "loss": 0.4737, "step": 17335 }, { "epoch": 0.6249324251270407, "grad_norm": 0.17920991778373718, "learning_rate": 4.882732152010549e-05, "loss": 0.4147, "step": 17340 }, { "epoch": 0.6251126247882653, "grad_norm": 0.1538057178258896, "learning_rate": 4.8826438102798766e-05, "loss": 0.4651, "step": 17345 }, { "epoch": 0.6252928244494901, "grad_norm": 0.17022547125816345, "learning_rate": 4.8825554360861436e-05, "loss": 0.4641, "step": 17350 }, { "epoch": 0.6254730241107147, "grad_norm": 0.17763908207416534, "learning_rate": 4.882467029430554e-05, "loss": 0.3932, "step": 17355 }, { "epoch": 0.6256532237719393, "grad_norm": 0.1638226956129074, "learning_rate": 4.8823785903143124e-05, "loss": 0.4439, "step": 17360 }, { "epoch": 0.625833423433164, "grad_norm": 0.1570916473865509, "learning_rate": 
4.882290118738624e-05, "loss": 0.4014, "step": 17365 }, { "epoch": 0.6260136230943886, "grad_norm": 0.16065819561481476, "learning_rate": 4.882201614704694e-05, "loss": 0.4374, "step": 17370 }, { "epoch": 0.6261938227556132, "grad_norm": 0.1514330506324768, "learning_rate": 4.8821130782137275e-05, "loss": 0.4195, "step": 17375 }, { "epoch": 0.6263740224168378, "grad_norm": 0.14630937576293945, "learning_rate": 4.882024509266932e-05, "loss": 0.4394, "step": 17380 }, { "epoch": 0.6265542220780624, "grad_norm": 0.15218989551067352, "learning_rate": 4.881935907865514e-05, "loss": 0.4132, "step": 17385 }, { "epoch": 0.6267344217392872, "grad_norm": 0.20841450989246368, "learning_rate": 4.88184727401068e-05, "loss": 0.4669, "step": 17390 }, { "epoch": 0.6269146214005118, "grad_norm": 0.18255776166915894, "learning_rate": 4.881758607703638e-05, "loss": 0.4354, "step": 17395 }, { "epoch": 0.6270948210617364, "grad_norm": 0.12769578397274017, "learning_rate": 4.881669908945596e-05, "loss": 0.4514, "step": 17400 }, { "epoch": 0.627275020722961, "grad_norm": 0.18235896527767181, "learning_rate": 4.881581177737763e-05, "loss": 0.4523, "step": 17405 }, { "epoch": 0.6274552203841857, "grad_norm": 0.13113798201084137, "learning_rate": 4.8814924140813466e-05, "loss": 0.4246, "step": 17410 }, { "epoch": 0.6276354200454103, "grad_norm": 0.18605348467826843, "learning_rate": 4.881403617977558e-05, "loss": 0.4335, "step": 17415 }, { "epoch": 0.6278156197066349, "grad_norm": 0.19018565118312836, "learning_rate": 4.8813147894276065e-05, "loss": 0.4198, "step": 17420 }, { "epoch": 0.6279958193678596, "grad_norm": 0.22579999268054962, "learning_rate": 4.8812259284327013e-05, "loss": 0.4842, "step": 17425 }, { "epoch": 0.6281760190290843, "grad_norm": 0.13994662463665009, "learning_rate": 4.8811370349940545e-05, "loss": 0.4695, "step": 17430 }, { "epoch": 0.6283562186903089, "grad_norm": 0.1737351268529892, "learning_rate": 4.8810481091128756e-05, "loss": 0.4521, "step": 17435 }, { 
"epoch": 0.6285364183515335, "grad_norm": 0.1526714414358139, "learning_rate": 4.880959150790379e-05, "loss": 0.428, "step": 17440 }, { "epoch": 0.6287166180127581, "grad_norm": 0.16444048285484314, "learning_rate": 4.880870160027773e-05, "loss": 0.4125, "step": 17445 }, { "epoch": 0.6288968176739828, "grad_norm": 0.2083633989095688, "learning_rate": 4.8807811368262726e-05, "loss": 0.4616, "step": 17450 }, { "epoch": 0.6290770173352074, "grad_norm": 0.2831677496433258, "learning_rate": 4.8806920811870905e-05, "loss": 0.4404, "step": 17455 }, { "epoch": 0.629257216996432, "grad_norm": 0.17356400191783905, "learning_rate": 4.88060299311144e-05, "loss": 0.4132, "step": 17460 }, { "epoch": 0.6294374166576567, "grad_norm": 0.1940339356660843, "learning_rate": 4.880513872600534e-05, "loss": 0.422, "step": 17465 }, { "epoch": 0.6296176163188814, "grad_norm": 0.15967166423797607, "learning_rate": 4.880424719655588e-05, "loss": 0.4363, "step": 17470 }, { "epoch": 0.629797815980106, "grad_norm": 0.16466853022575378, "learning_rate": 4.8803355342778145e-05, "loss": 0.4705, "step": 17475 }, { "epoch": 0.6299780156413306, "grad_norm": 0.17861898243427277, "learning_rate": 4.880246316468432e-05, "loss": 0.4117, "step": 17480 }, { "epoch": 0.6301582153025552, "grad_norm": 0.13602939248085022, "learning_rate": 4.880157066228653e-05, "loss": 0.4148, "step": 17485 }, { "epoch": 0.6303384149637798, "grad_norm": 0.1881345808506012, "learning_rate": 4.880067783559695e-05, "loss": 0.4365, "step": 17490 }, { "epoch": 0.6305186146250045, "grad_norm": 0.23560486733913422, "learning_rate": 4.8799784684627745e-05, "loss": 0.459, "step": 17495 }, { "epoch": 0.6306988142862291, "grad_norm": 0.1441272646188736, "learning_rate": 4.8798891209391085e-05, "loss": 0.4468, "step": 17500 }, { "epoch": 0.6306988142862291, "eval_loss": 0.4662560522556305, "eval_runtime": 3.533, "eval_samples_per_second": 28.305, "eval_steps_per_second": 7.076, "step": 17500 }, { "epoch": 0.6308790139474538, "grad_norm": 
0.18622781336307526, "learning_rate": 4.879799740989914e-05, "loss": 0.4294, "step": 17505 }, { "epoch": 0.6310592136086784, "grad_norm": 0.17122696340084076, "learning_rate": 4.879710328616408e-05, "loss": 0.463, "step": 17510 }, { "epoch": 0.6312394132699031, "grad_norm": 0.18917937576770782, "learning_rate": 4.87962088381981e-05, "loss": 0.4626, "step": 17515 }, { "epoch": 0.6314196129311277, "grad_norm": 0.1895674765110016, "learning_rate": 4.8795314066013386e-05, "loss": 0.4487, "step": 17520 }, { "epoch": 0.6315998125923523, "grad_norm": 0.17945843935012817, "learning_rate": 4.879441896962212e-05, "loss": 0.4573, "step": 17525 }, { "epoch": 0.6317800122535769, "grad_norm": 0.16244640946388245, "learning_rate": 4.87935235490365e-05, "loss": 0.415, "step": 17530 }, { "epoch": 0.6319602119148016, "grad_norm": 0.16950534284114838, "learning_rate": 4.879262780426873e-05, "loss": 0.4423, "step": 17535 }, { "epoch": 0.6321404115760262, "grad_norm": 0.15685759484767914, "learning_rate": 4.879173173533101e-05, "loss": 0.4512, "step": 17540 }, { "epoch": 0.6323206112372509, "grad_norm": 0.18144509196281433, "learning_rate": 4.879083534223556e-05, "loss": 0.4618, "step": 17545 }, { "epoch": 0.6325008108984755, "grad_norm": 0.14772038161754608, "learning_rate": 4.878993862499458e-05, "loss": 0.4673, "step": 17550 }, { "epoch": 0.6326810105597002, "grad_norm": 0.17589062452316284, "learning_rate": 4.8789041583620285e-05, "loss": 0.4252, "step": 17555 }, { "epoch": 0.6328612102209248, "grad_norm": 0.1934499889612198, "learning_rate": 4.878814421812492e-05, "loss": 0.4261, "step": 17560 }, { "epoch": 0.6330414098821494, "grad_norm": 0.16541697084903717, "learning_rate": 4.878724652852068e-05, "loss": 0.4186, "step": 17565 }, { "epoch": 0.633221609543374, "grad_norm": 0.18114767968654633, "learning_rate": 4.878634851481982e-05, "loss": 0.4684, "step": 17570 }, { "epoch": 0.6334018092045987, "grad_norm": 0.17742811143398285, "learning_rate": 4.878545017703457e-05, "loss": 
0.4497, "step": 17575 }, { "epoch": 0.6335820088658234, "grad_norm": 0.16837285459041595, "learning_rate": 4.8784551515177154e-05, "loss": 0.4176, "step": 17580 }, { "epoch": 0.633762208527048, "grad_norm": 0.14889617264270782, "learning_rate": 4.878365252925984e-05, "loss": 0.4337, "step": 17585 }, { "epoch": 0.6339424081882726, "grad_norm": 0.1742192506790161, "learning_rate": 4.878275321929485e-05, "loss": 0.4535, "step": 17590 }, { "epoch": 0.6341226078494973, "grad_norm": 0.20734204351902008, "learning_rate": 4.878185358529447e-05, "loss": 0.4681, "step": 17595 }, { "epoch": 0.6343028075107219, "grad_norm": 0.16586579382419586, "learning_rate": 4.8780953627270924e-05, "loss": 0.4205, "step": 17600 }, { "epoch": 0.6344830071719465, "grad_norm": 0.1419735550880432, "learning_rate": 4.8780053345236496e-05, "loss": 0.4481, "step": 17605 }, { "epoch": 0.6346632068331711, "grad_norm": 0.1975310742855072, "learning_rate": 4.877915273920345e-05, "loss": 0.4853, "step": 17610 }, { "epoch": 0.6348434064943957, "grad_norm": 0.19924567639827728, "learning_rate": 4.8778251809184045e-05, "loss": 0.4531, "step": 17615 }, { "epoch": 0.6350236061556205, "grad_norm": 0.17732134461402893, "learning_rate": 4.877735055519056e-05, "loss": 0.4414, "step": 17620 }, { "epoch": 0.6352038058168451, "grad_norm": 0.2488788664340973, "learning_rate": 4.877644897723528e-05, "loss": 0.4517, "step": 17625 }, { "epoch": 0.6353840054780697, "grad_norm": 0.1604602336883545, "learning_rate": 4.877554707533049e-05, "loss": 0.4141, "step": 17630 }, { "epoch": 0.6355642051392943, "grad_norm": 0.1665060818195343, "learning_rate": 4.877464484948847e-05, "loss": 0.4129, "step": 17635 }, { "epoch": 0.635744404800519, "grad_norm": 0.17682184278964996, "learning_rate": 4.877374229972152e-05, "loss": 0.4501, "step": 17640 }, { "epoch": 0.6359246044617436, "grad_norm": 0.17669498920440674, "learning_rate": 4.8772839426041935e-05, "loss": 0.4363, "step": 17645 }, { "epoch": 0.6361048041229682, "grad_norm": 
0.17283748090267181, "learning_rate": 4.877193622846201e-05, "loss": 0.4117, "step": 17650 }, { "epoch": 0.6362850037841928, "grad_norm": 0.18358486890792847, "learning_rate": 4.877103270699406e-05, "loss": 0.4261, "step": 17655 }, { "epoch": 0.6364652034454176, "grad_norm": 0.15538670122623444, "learning_rate": 4.877012886165039e-05, "loss": 0.4387, "step": 17660 }, { "epoch": 0.6366454031066422, "grad_norm": 0.17753304541110992, "learning_rate": 4.8769224692443315e-05, "loss": 0.4024, "step": 17665 }, { "epoch": 0.6368256027678668, "grad_norm": 0.20612354576587677, "learning_rate": 4.8768320199385166e-05, "loss": 0.465, "step": 17670 }, { "epoch": 0.6370058024290914, "grad_norm": 0.21157251298427582, "learning_rate": 4.876741538248825e-05, "loss": 0.4686, "step": 17675 }, { "epoch": 0.6371860020903161, "grad_norm": 0.22163233160972595, "learning_rate": 4.876651024176489e-05, "loss": 0.4685, "step": 17680 }, { "epoch": 0.6373662017515407, "grad_norm": 0.1729961782693863, "learning_rate": 4.8765604777227434e-05, "loss": 0.4498, "step": 17685 }, { "epoch": 0.6375464014127653, "grad_norm": 0.16511957347393036, "learning_rate": 4.8764698988888223e-05, "loss": 0.4789, "step": 17690 }, { "epoch": 0.6377266010739899, "grad_norm": 0.15577971935272217, "learning_rate": 4.876379287675958e-05, "loss": 0.4499, "step": 17695 }, { "epoch": 0.6379068007352147, "grad_norm": 0.23532330989837646, "learning_rate": 4.8762886440853865e-05, "loss": 0.458, "step": 17700 }, { "epoch": 0.6380870003964393, "grad_norm": 0.1516086459159851, "learning_rate": 4.876197968118342e-05, "loss": 0.4136, "step": 17705 }, { "epoch": 0.6382672000576639, "grad_norm": 0.16963382065296173, "learning_rate": 4.8761072597760604e-05, "loss": 0.4761, "step": 17710 }, { "epoch": 0.6384473997188885, "grad_norm": 0.17492809891700745, "learning_rate": 4.876016519059778e-05, "loss": 0.4299, "step": 17715 }, { "epoch": 0.6386275993801132, "grad_norm": 0.21216866374015808, "learning_rate": 4.87592574597073e-05, 
"loss": 0.4399, "step": 17720 }, { "epoch": 0.6388077990413378, "grad_norm": 0.1838618665933609, "learning_rate": 4.8758349405101535e-05, "loss": 0.4794, "step": 17725 }, { "epoch": 0.6389879987025624, "grad_norm": 0.20086225867271423, "learning_rate": 4.875744102679286e-05, "loss": 0.4475, "step": 17730 }, { "epoch": 0.639168198363787, "grad_norm": 0.18566478788852692, "learning_rate": 4.8756532324793646e-05, "loss": 0.4394, "step": 17735 }, { "epoch": 0.6393483980250118, "grad_norm": 0.1914960891008377, "learning_rate": 4.875562329911629e-05, "loss": 0.4574, "step": 17740 }, { "epoch": 0.6395285976862364, "grad_norm": 0.14925505220890045, "learning_rate": 4.8754713949773156e-05, "loss": 0.4712, "step": 17745 }, { "epoch": 0.639708797347461, "grad_norm": 0.1491568684577942, "learning_rate": 4.875380427677665e-05, "loss": 0.4545, "step": 17750 }, { "epoch": 0.6398889970086856, "grad_norm": 0.16431868076324463, "learning_rate": 4.8752894280139155e-05, "loss": 0.4363, "step": 17755 }, { "epoch": 0.6400691966699102, "grad_norm": 0.16594944894313812, "learning_rate": 4.8751983959873074e-05, "loss": 0.443, "step": 17760 }, { "epoch": 0.6402493963311349, "grad_norm": 0.17833413183689117, "learning_rate": 4.875107331599081e-05, "loss": 0.4451, "step": 17765 }, { "epoch": 0.6404295959923595, "grad_norm": 0.1914624124765396, "learning_rate": 4.875016234850478e-05, "loss": 0.4021, "step": 17770 }, { "epoch": 0.6406097956535842, "grad_norm": 0.16454005241394043, "learning_rate": 4.8749251057427374e-05, "loss": 0.4073, "step": 17775 }, { "epoch": 0.6407899953148088, "grad_norm": 0.1951947659254074, "learning_rate": 4.874833944277103e-05, "loss": 0.5004, "step": 17780 }, { "epoch": 0.6409701949760335, "grad_norm": 0.1762334704399109, "learning_rate": 4.8747427504548146e-05, "loss": 0.4096, "step": 17785 }, { "epoch": 0.6411503946372581, "grad_norm": 0.18907704949378967, "learning_rate": 4.8746515242771175e-05, "loss": 0.4241, "step": 17790 }, { "epoch": 0.6413305942984827, 
"grad_norm": 0.20052656531333923, "learning_rate": 4.874560265745252e-05, "loss": 0.4246, "step": 17795 }, { "epoch": 0.6415107939597073, "grad_norm": 0.19393053650856018, "learning_rate": 4.874468974860463e-05, "loss": 0.4644, "step": 17800 }, { "epoch": 0.641690993620932, "grad_norm": 0.2143370658159256, "learning_rate": 4.874377651623994e-05, "loss": 0.462, "step": 17805 }, { "epoch": 0.6418711932821566, "grad_norm": 0.20955999195575714, "learning_rate": 4.8742862960370895e-05, "loss": 0.4356, "step": 17810 }, { "epoch": 0.6420513929433813, "grad_norm": 0.1920044720172882, "learning_rate": 4.874194908100993e-05, "loss": 0.4294, "step": 17815 }, { "epoch": 0.6422315926046059, "grad_norm": 0.20551706850528717, "learning_rate": 4.874103487816951e-05, "loss": 0.4574, "step": 17820 }, { "epoch": 0.6424117922658306, "grad_norm": 0.1954600214958191, "learning_rate": 4.8740120351862087e-05, "loss": 0.439, "step": 17825 }, { "epoch": 0.6425919919270552, "grad_norm": 0.17379389703273773, "learning_rate": 4.873920550210012e-05, "loss": 0.4056, "step": 17830 }, { "epoch": 0.6427721915882798, "grad_norm": 0.17289118468761444, "learning_rate": 4.8738290328896076e-05, "loss": 0.4321, "step": 17835 }, { "epoch": 0.6429523912495044, "grad_norm": 0.181446373462677, "learning_rate": 4.8737374832262415e-05, "loss": 0.4323, "step": 17840 }, { "epoch": 0.643132590910729, "grad_norm": 0.25358590483665466, "learning_rate": 4.8736459012211624e-05, "loss": 0.4085, "step": 17845 }, { "epoch": 0.6433127905719537, "grad_norm": 0.23851445317268372, "learning_rate": 4.873554286875618e-05, "loss": 0.4697, "step": 17850 }, { "epoch": 0.6434929902331784, "grad_norm": 0.18585999310016632, "learning_rate": 4.873462640190855e-05, "loss": 0.457, "step": 17855 }, { "epoch": 0.643673189894403, "grad_norm": 0.16106140613555908, "learning_rate": 4.873370961168123e-05, "loss": 0.449, "step": 17860 }, { "epoch": 0.6438533895556277, "grad_norm": 0.18756996095180511, "learning_rate": 4.873279249808672e-05, 
"loss": 0.4517, "step": 17865 }, { "epoch": 0.6440335892168523, "grad_norm": 0.26352357864379883, "learning_rate": 4.87318750611375e-05, "loss": 0.4494, "step": 17870 }, { "epoch": 0.6442137888780769, "grad_norm": 0.18100088834762573, "learning_rate": 4.873095730084608e-05, "loss": 0.442, "step": 17875 }, { "epoch": 0.6443939885393015, "grad_norm": 0.1749761700630188, "learning_rate": 4.873003921722496e-05, "loss": 0.4248, "step": 17880 }, { "epoch": 0.6445741882005261, "grad_norm": 0.16620847582817078, "learning_rate": 4.8729120810286654e-05, "loss": 0.4626, "step": 17885 }, { "epoch": 0.6447543878617508, "grad_norm": 0.18539084494113922, "learning_rate": 4.872820208004367e-05, "loss": 0.42, "step": 17890 }, { "epoch": 0.6449345875229755, "grad_norm": 0.15285968780517578, "learning_rate": 4.8727283026508516e-05, "loss": 0.428, "step": 17895 }, { "epoch": 0.6451147871842001, "grad_norm": 0.16171573102474213, "learning_rate": 4.872636364969373e-05, "loss": 0.4222, "step": 17900 }, { "epoch": 0.6452949868454247, "grad_norm": 0.2132413238286972, "learning_rate": 4.872544394961184e-05, "loss": 0.4037, "step": 17905 }, { "epoch": 0.6454751865066494, "grad_norm": 0.16865713894367218, "learning_rate": 4.872452392627537e-05, "loss": 0.4025, "step": 17910 }, { "epoch": 0.645655386167874, "grad_norm": 0.18928374350070953, "learning_rate": 4.8723603579696844e-05, "loss": 0.4316, "step": 17915 }, { "epoch": 0.6458355858290986, "grad_norm": 0.14977779984474182, "learning_rate": 4.872268290988882e-05, "loss": 0.4344, "step": 17920 }, { "epoch": 0.6460157854903232, "grad_norm": 0.15273813903331757, "learning_rate": 4.8721761916863826e-05, "loss": 0.4632, "step": 17925 }, { "epoch": 0.646195985151548, "grad_norm": 0.16625703871250153, "learning_rate": 4.872084060063443e-05, "loss": 0.4079, "step": 17930 }, { "epoch": 0.6463761848127726, "grad_norm": 0.16598527133464813, "learning_rate": 4.871991896121317e-05, "loss": 0.4384, "step": 17935 }, { "epoch": 0.6465563844739972, 
"grad_norm": 0.14382204413414001, "learning_rate": 4.8718996998612605e-05, "loss": 0.4601, "step": 17940 }, { "epoch": 0.6467365841352218, "grad_norm": 0.169216588139534, "learning_rate": 4.8718074712845296e-05, "loss": 0.4329, "step": 17945 }, { "epoch": 0.6469167837964465, "grad_norm": 0.21951742470264435, "learning_rate": 4.871715210392381e-05, "loss": 0.4817, "step": 17950 }, { "epoch": 0.6470969834576711, "grad_norm": 0.14136020839214325, "learning_rate": 4.871622917186073e-05, "loss": 0.4133, "step": 17955 }, { "epoch": 0.6472771831188957, "grad_norm": 0.22290566563606262, "learning_rate": 4.871530591666861e-05, "loss": 0.4538, "step": 17960 }, { "epoch": 0.6474573827801203, "grad_norm": 0.1538752168416977, "learning_rate": 4.871438233836003e-05, "loss": 0.4566, "step": 17965 }, { "epoch": 0.6476375824413451, "grad_norm": 0.13945698738098145, "learning_rate": 4.871345843694759e-05, "loss": 0.4536, "step": 17970 }, { "epoch": 0.6478177821025697, "grad_norm": 0.19078098237514496, "learning_rate": 4.871253421244387e-05, "loss": 0.4743, "step": 17975 }, { "epoch": 0.6479979817637943, "grad_norm": 0.17093034088611603, "learning_rate": 4.871160966486147e-05, "loss": 0.4625, "step": 17980 }, { "epoch": 0.6481781814250189, "grad_norm": 0.20973354578018188, "learning_rate": 4.8710684794212966e-05, "loss": 0.4308, "step": 17985 }, { "epoch": 0.6483583810862436, "grad_norm": 0.19713053107261658, "learning_rate": 4.870975960051098e-05, "loss": 0.4244, "step": 17990 }, { "epoch": 0.6485385807474682, "grad_norm": 0.18588168919086456, "learning_rate": 4.8708834083768106e-05, "loss": 0.4199, "step": 17995 }, { "epoch": 0.6487187804086928, "grad_norm": 0.15173941850662231, "learning_rate": 4.870790824399696e-05, "loss": 0.3994, "step": 18000 }, { "epoch": 0.6487187804086928, "eval_loss": 0.4651120901107788, "eval_runtime": 3.5665, "eval_samples_per_second": 28.038, "eval_steps_per_second": 7.01, "step": 18000 }, { "epoch": 0.6488989800699174, "grad_norm": 0.1769859790802002, 
"learning_rate": 4.870698208121015e-05, "loss": 0.4507, "step": 18005 }, { "epoch": 0.6490791797311422, "grad_norm": 0.19280551373958588, "learning_rate": 4.870605559542031e-05, "loss": 0.4139, "step": 18010 }, { "epoch": 0.6492593793923668, "grad_norm": 0.1606101095676422, "learning_rate": 4.870512878664004e-05, "loss": 0.4447, "step": 18015 }, { "epoch": 0.6494395790535914, "grad_norm": 0.16047056019306183, "learning_rate": 4.870420165488199e-05, "loss": 0.4453, "step": 18020 }, { "epoch": 0.649619778714816, "grad_norm": 0.1686955988407135, "learning_rate": 4.8703274200158765e-05, "loss": 0.4609, "step": 18025 }, { "epoch": 0.6497999783760406, "grad_norm": 0.15635234117507935, "learning_rate": 4.8702346422483035e-05, "loss": 0.4296, "step": 18030 }, { "epoch": 0.6499801780372653, "grad_norm": 0.158394455909729, "learning_rate": 4.870141832186742e-05, "loss": 0.447, "step": 18035 }, { "epoch": 0.6501603776984899, "grad_norm": 0.16858071088790894, "learning_rate": 4.870048989832456e-05, "loss": 0.4233, "step": 18040 }, { "epoch": 0.6503405773597145, "grad_norm": 0.11584287881851196, "learning_rate": 4.869956115186712e-05, "loss": 0.3873, "step": 18045 }, { "epoch": 0.6505207770209392, "grad_norm": 0.15428020060062408, "learning_rate": 4.8698632082507754e-05, "loss": 0.4426, "step": 18050 }, { "epoch": 0.6507009766821639, "grad_norm": 0.2866957187652588, "learning_rate": 4.869770269025911e-05, "loss": 0.4639, "step": 18055 }, { "epoch": 0.6508811763433885, "grad_norm": 0.21071985363960266, "learning_rate": 4.8696772975133854e-05, "loss": 0.4842, "step": 18060 }, { "epoch": 0.6510613760046131, "grad_norm": 0.17984484136104584, "learning_rate": 4.869584293714465e-05, "loss": 0.4272, "step": 18065 }, { "epoch": 0.6512415756658377, "grad_norm": 0.20826445519924164, "learning_rate": 4.8694912576304175e-05, "loss": 0.4561, "step": 18070 }, { "epoch": 0.6514217753270624, "grad_norm": 0.14839668571949005, "learning_rate": 4.8693981892625105e-05, "loss": 0.445, "step": 18075 
}, { "epoch": 0.651601974988287, "grad_norm": 0.15876393020153046, "learning_rate": 4.869305088612012e-05, "loss": 0.4377, "step": 18080 }, { "epoch": 0.6517821746495117, "grad_norm": 0.20614424347877502, "learning_rate": 4.8692119556801905e-05, "loss": 0.4499, "step": 18085 }, { "epoch": 0.6519623743107363, "grad_norm": 0.17227613925933838, "learning_rate": 4.8691187904683145e-05, "loss": 0.4511, "step": 18090 }, { "epoch": 0.652142573971961, "grad_norm": 0.15382865071296692, "learning_rate": 4.869025592977654e-05, "loss": 0.4734, "step": 18095 }, { "epoch": 0.6523227736331856, "grad_norm": 0.1578940749168396, "learning_rate": 4.8689323632094784e-05, "loss": 0.4353, "step": 18100 }, { "epoch": 0.6525029732944102, "grad_norm": 0.17838072776794434, "learning_rate": 4.868839101165058e-05, "loss": 0.4476, "step": 18105 }, { "epoch": 0.6526831729556348, "grad_norm": 0.16446293890476227, "learning_rate": 4.868745806845663e-05, "loss": 0.4486, "step": 18110 }, { "epoch": 0.6528633726168594, "grad_norm": 0.16984452307224274, "learning_rate": 4.8686524802525656e-05, "loss": 0.4135, "step": 18115 }, { "epoch": 0.6530435722780841, "grad_norm": 0.18492572009563446, "learning_rate": 4.868559121387036e-05, "loss": 0.465, "step": 18120 }, { "epoch": 0.6532237719393088, "grad_norm": 0.18226665258407593, "learning_rate": 4.868465730250348e-05, "loss": 0.452, "step": 18125 }, { "epoch": 0.6534039716005334, "grad_norm": 0.18231280148029327, "learning_rate": 4.868372306843772e-05, "loss": 0.4465, "step": 18130 }, { "epoch": 0.653584171261758, "grad_norm": 0.22582538425922394, "learning_rate": 4.868278851168582e-05, "loss": 0.4061, "step": 18135 }, { "epoch": 0.6537643709229827, "grad_norm": 0.16542305052280426, "learning_rate": 4.8681853632260524e-05, "loss": 0.4386, "step": 18140 }, { "epoch": 0.6539445705842073, "grad_norm": 0.21034276485443115, "learning_rate": 4.868091843017454e-05, "loss": 0.4536, "step": 18145 }, { "epoch": 0.6541247702454319, "grad_norm": 0.20631127059459686, 
"learning_rate": 4.867998290544064e-05, "loss": 0.4516, "step": 18150 }, { "epoch": 0.6543049699066565, "grad_norm": 0.1725880652666092, "learning_rate": 4.867904705807155e-05, "loss": 0.4615, "step": 18155 }, { "epoch": 0.6544851695678812, "grad_norm": 0.14972399175167084, "learning_rate": 4.8678110888080026e-05, "loss": 0.4562, "step": 18160 }, { "epoch": 0.6546653692291059, "grad_norm": 0.1854199767112732, "learning_rate": 4.8677174395478834e-05, "loss": 0.4723, "step": 18165 }, { "epoch": 0.6548455688903305, "grad_norm": 0.16778244078159332, "learning_rate": 4.867623758028072e-05, "loss": 0.4472, "step": 18170 }, { "epoch": 0.6550257685515551, "grad_norm": 0.14149196445941925, "learning_rate": 4.867530044249845e-05, "loss": 0.4036, "step": 18175 }, { "epoch": 0.6552059682127798, "grad_norm": 0.15938983857631683, "learning_rate": 4.86743629821448e-05, "loss": 0.431, "step": 18180 }, { "epoch": 0.6553861678740044, "grad_norm": 0.16331376135349274, "learning_rate": 4.867342519923254e-05, "loss": 0.4418, "step": 18185 }, { "epoch": 0.655566367535229, "grad_norm": 0.15382960438728333, "learning_rate": 4.867248709377443e-05, "loss": 0.4471, "step": 18190 }, { "epoch": 0.6557465671964536, "grad_norm": 0.1844521015882492, "learning_rate": 4.8671548665783287e-05, "loss": 0.4734, "step": 18195 }, { "epoch": 0.6559267668576783, "grad_norm": 0.21769674122333527, "learning_rate": 4.867060991527186e-05, "loss": 0.4561, "step": 18200 }, { "epoch": 0.656106966518903, "grad_norm": 0.21248985826969147, "learning_rate": 4.866967084225297e-05, "loss": 0.4137, "step": 18205 }, { "epoch": 0.6562871661801276, "grad_norm": 0.18385855853557587, "learning_rate": 4.866873144673939e-05, "loss": 0.4094, "step": 18210 }, { "epoch": 0.6564673658413522, "grad_norm": 0.13242194056510925, "learning_rate": 4.866779172874392e-05, "loss": 0.4741, "step": 18215 }, { "epoch": 0.6566475655025769, "grad_norm": 0.17850540578365326, "learning_rate": 4.866685168827938e-05, "loss": 0.4539, "step": 18220 
}, { "epoch": 0.6568277651638015, "grad_norm": 0.1694021075963974, "learning_rate": 4.8665911325358556e-05, "loss": 0.4531, "step": 18225 }, { "epoch": 0.6570079648250261, "grad_norm": 0.13871118426322937, "learning_rate": 4.866497063999429e-05, "loss": 0.4149, "step": 18230 }, { "epoch": 0.6571881644862507, "grad_norm": 0.13802644610404968, "learning_rate": 4.866402963219937e-05, "loss": 0.4539, "step": 18235 }, { "epoch": 0.6573683641474753, "grad_norm": 0.17566382884979248, "learning_rate": 4.8663088301986625e-05, "loss": 0.4297, "step": 18240 }, { "epoch": 0.6575485638087001, "grad_norm": 0.21962518990039825, "learning_rate": 4.866214664936889e-05, "loss": 0.4451, "step": 18245 }, { "epoch": 0.6577287634699247, "grad_norm": 0.12450151145458221, "learning_rate": 4.8661204674358984e-05, "loss": 0.4547, "step": 18250 }, { "epoch": 0.6579089631311493, "grad_norm": 0.16591274738311768, "learning_rate": 4.8660262376969745e-05, "loss": 0.4615, "step": 18255 }, { "epoch": 0.658089162792374, "grad_norm": 0.1811162382364273, "learning_rate": 4.8659319757214016e-05, "loss": 0.443, "step": 18260 }, { "epoch": 0.6582693624535986, "grad_norm": 0.17516489326953888, "learning_rate": 4.865837681510463e-05, "loss": 0.4069, "step": 18265 }, { "epoch": 0.6584495621148232, "grad_norm": 0.16720084846019745, "learning_rate": 4.865743355065444e-05, "loss": 0.4127, "step": 18270 }, { "epoch": 0.6586297617760478, "grad_norm": 0.13735070824623108, "learning_rate": 4.8656489963876305e-05, "loss": 0.4367, "step": 18275 }, { "epoch": 0.6588099614372726, "grad_norm": 0.15320982038974762, "learning_rate": 4.865554605478308e-05, "loss": 0.4592, "step": 18280 }, { "epoch": 0.6589901610984972, "grad_norm": 0.2053997814655304, "learning_rate": 4.8654601823387605e-05, "loss": 0.4266, "step": 18285 }, { "epoch": 0.6591703607597218, "grad_norm": 0.1573738306760788, "learning_rate": 4.8653657269702765e-05, "loss": 0.4257, "step": 18290 }, { "epoch": 0.6593505604209464, "grad_norm": 
0.1637280136346817, "learning_rate": 4.865271239374142e-05, "loss": 0.4559, "step": 18295 }, { "epoch": 0.659530760082171, "grad_norm": 0.2082606703042984, "learning_rate": 4.865176719551645e-05, "loss": 0.4498, "step": 18300 }, { "epoch": 0.6597109597433957, "grad_norm": 0.1640523225069046, "learning_rate": 4.8650821675040735e-05, "loss": 0.4573, "step": 18305 }, { "epoch": 0.6598911594046203, "grad_norm": 0.130737766623497, "learning_rate": 4.864987583232715e-05, "loss": 0.4277, "step": 18310 }, { "epoch": 0.6600713590658449, "grad_norm": 0.17735107243061066, "learning_rate": 4.8648929667388596e-05, "loss": 0.4758, "step": 18315 }, { "epoch": 0.6602515587270696, "grad_norm": 0.180114284157753, "learning_rate": 4.8647983180237945e-05, "loss": 0.4557, "step": 18320 }, { "epoch": 0.6604317583882943, "grad_norm": 0.19766369462013245, "learning_rate": 4.8647036370888096e-05, "loss": 0.4227, "step": 18325 }, { "epoch": 0.6606119580495189, "grad_norm": 0.18029169738292694, "learning_rate": 4.8646089239351966e-05, "loss": 0.4243, "step": 18330 }, { "epoch": 0.6607921577107435, "grad_norm": 0.1884010136127472, "learning_rate": 4.8645141785642444e-05, "loss": 0.469, "step": 18335 }, { "epoch": 0.6609723573719681, "grad_norm": 0.17342260479927063, "learning_rate": 4.864419400977244e-05, "loss": 0.4331, "step": 18340 }, { "epoch": 0.6611525570331928, "grad_norm": 0.15091748535633087, "learning_rate": 4.864324591175487e-05, "loss": 0.4205, "step": 18345 }, { "epoch": 0.6613327566944174, "grad_norm": 0.20120559632778168, "learning_rate": 4.8642297491602654e-05, "loss": 0.441, "step": 18350 }, { "epoch": 0.661512956355642, "grad_norm": 0.2035011202096939, "learning_rate": 4.8641348749328716e-05, "loss": 0.4648, "step": 18355 }, { "epoch": 0.6616931560168667, "grad_norm": 0.1479252576828003, "learning_rate": 4.864039968494597e-05, "loss": 0.4378, "step": 18360 }, { "epoch": 0.6618733556780914, "grad_norm": 0.15773577988147736, "learning_rate": 4.863945029846736e-05, "loss": 
0.4031, "step": 18365 }, { "epoch": 0.662053555339316, "grad_norm": 0.20971988141536713, "learning_rate": 4.863850058990582e-05, "loss": 0.4773, "step": 18370 }, { "epoch": 0.6622337550005406, "grad_norm": 0.17060470581054688, "learning_rate": 4.863755055927428e-05, "loss": 0.4178, "step": 18375 }, { "epoch": 0.6624139546617652, "grad_norm": 0.22001157701015472, "learning_rate": 4.863660020658569e-05, "loss": 0.4637, "step": 18380 }, { "epoch": 0.6625941543229898, "grad_norm": 0.15214625000953674, "learning_rate": 4.8635649531853e-05, "loss": 0.4174, "step": 18385 }, { "epoch": 0.6627743539842145, "grad_norm": 0.15511523187160492, "learning_rate": 4.863469853508916e-05, "loss": 0.4156, "step": 18390 }, { "epoch": 0.6629545536454391, "grad_norm": 0.18479692935943604, "learning_rate": 4.863374721630713e-05, "loss": 0.4064, "step": 18395 }, { "epoch": 0.6631347533066638, "grad_norm": 0.3170281946659088, "learning_rate": 4.8632795575519876e-05, "loss": 0.4339, "step": 18400 }, { "epoch": 0.6633149529678884, "grad_norm": 0.1785048097372055, "learning_rate": 4.863184361274035e-05, "loss": 0.4805, "step": 18405 }, { "epoch": 0.6634951526291131, "grad_norm": 0.1605699211359024, "learning_rate": 4.863089132798153e-05, "loss": 0.4641, "step": 18410 }, { "epoch": 0.6636753522903377, "grad_norm": 0.20723554491996765, "learning_rate": 4.86299387212564e-05, "loss": 0.4315, "step": 18415 }, { "epoch": 0.6638555519515623, "grad_norm": 0.14896468818187714, "learning_rate": 4.862898579257792e-05, "loss": 0.4462, "step": 18420 }, { "epoch": 0.6640357516127869, "grad_norm": 0.16965442895889282, "learning_rate": 4.8628032541959086e-05, "loss": 0.4679, "step": 18425 }, { "epoch": 0.6642159512740116, "grad_norm": 0.1498372107744217, "learning_rate": 4.8627078969412885e-05, "loss": 0.43, "step": 18430 }, { "epoch": 0.6643961509352363, "grad_norm": 0.1531539112329483, "learning_rate": 4.86261250749523e-05, "loss": 0.4028, "step": 18435 }, { "epoch": 0.6645763505964609, "grad_norm": 
0.1347551792860031, "learning_rate": 4.862517085859034e-05, "loss": 0.3987, "step": 18440 }, { "epoch": 0.6647565502576855, "grad_norm": 0.2017221748828888, "learning_rate": 4.862421632034e-05, "loss": 0.4778, "step": 18445 }, { "epoch": 0.6649367499189102, "grad_norm": 0.16347235441207886, "learning_rate": 4.8623261460214284e-05, "loss": 0.4565, "step": 18450 }, { "epoch": 0.6651169495801348, "grad_norm": 0.18009866774082184, "learning_rate": 4.862230627822621e-05, "loss": 0.4521, "step": 18455 }, { "epoch": 0.6652971492413594, "grad_norm": 0.18370333313941956, "learning_rate": 4.8621350774388784e-05, "loss": 0.4353, "step": 18460 }, { "epoch": 0.665477348902584, "grad_norm": 0.17444929480552673, "learning_rate": 4.8620394948715034e-05, "loss": 0.4615, "step": 18465 }, { "epoch": 0.6656575485638087, "grad_norm": 0.17793411016464233, "learning_rate": 4.8619438801217966e-05, "loss": 0.4321, "step": 18470 }, { "epoch": 0.6658377482250334, "grad_norm": 0.1900653839111328, "learning_rate": 4.861848233191062e-05, "loss": 0.4412, "step": 18475 }, { "epoch": 0.666017947886258, "grad_norm": 0.191162571310997, "learning_rate": 4.861752554080602e-05, "loss": 0.433, "step": 18480 }, { "epoch": 0.6661981475474826, "grad_norm": 0.1730828732252121, "learning_rate": 4.861656842791722e-05, "loss": 0.457, "step": 18485 }, { "epoch": 0.6663783472087073, "grad_norm": 0.1662280112504959, "learning_rate": 4.861561099325723e-05, "loss": 0.4409, "step": 18490 }, { "epoch": 0.6665585468699319, "grad_norm": 0.16253672540187836, "learning_rate": 4.8614653236839125e-05, "loss": 0.4723, "step": 18495 }, { "epoch": 0.6667387465311565, "grad_norm": 0.17181946337223053, "learning_rate": 4.861369515867594e-05, "loss": 0.4322, "step": 18500 }, { "epoch": 0.6667387465311565, "eval_loss": 0.4643363654613495, "eval_runtime": 3.5289, "eval_samples_per_second": 28.337, "eval_steps_per_second": 7.084, "step": 18500 }, { "epoch": 0.6669189461923811, "grad_norm": 0.2049456685781479, "learning_rate": 
4.861273675878073e-05, "loss": 0.4311, "step": 18505 }, { "epoch": 0.6670991458536057, "grad_norm": 0.1761494129896164, "learning_rate": 4.861177803716655e-05, "loss": 0.4485, "step": 18510 }, { "epoch": 0.6672793455148305, "grad_norm": 0.17736728489398956, "learning_rate": 4.861081899384647e-05, "loss": 0.4739, "step": 18515 }, { "epoch": 0.6674595451760551, "grad_norm": 0.19387714564800262, "learning_rate": 4.860985962883355e-05, "loss": 0.4827, "step": 18520 }, { "epoch": 0.6676397448372797, "grad_norm": 0.16782864928245544, "learning_rate": 4.860889994214086e-05, "loss": 0.4753, "step": 18525 }, { "epoch": 0.6678199444985043, "grad_norm": 0.17974188923835754, "learning_rate": 4.860793993378149e-05, "loss": 0.4338, "step": 18530 }, { "epoch": 0.668000144159729, "grad_norm": 0.1873011589050293, "learning_rate": 4.86069796037685e-05, "loss": 0.4763, "step": 18535 }, { "epoch": 0.6681803438209536, "grad_norm": 0.1606231927871704, "learning_rate": 4.8606018952114985e-05, "loss": 0.4097, "step": 18540 }, { "epoch": 0.6683605434821782, "grad_norm": 0.19824917614459991, "learning_rate": 4.8605057978834034e-05, "loss": 0.4387, "step": 18545 }, { "epoch": 0.6685407431434028, "grad_norm": 0.16622477769851685, "learning_rate": 4.860409668393873e-05, "loss": 0.4453, "step": 18550 }, { "epoch": 0.6687209428046276, "grad_norm": 0.1473122388124466, "learning_rate": 4.8603135067442184e-05, "loss": 0.4171, "step": 18555 }, { "epoch": 0.6689011424658522, "grad_norm": 0.15606799721717834, "learning_rate": 4.8602173129357496e-05, "loss": 0.4392, "step": 18560 }, { "epoch": 0.6690813421270768, "grad_norm": 0.2004939466714859, "learning_rate": 4.8601210869697766e-05, "loss": 0.4659, "step": 18565 }, { "epoch": 0.6692615417883014, "grad_norm": 0.17140799760818481, "learning_rate": 4.8600248288476105e-05, "loss": 0.4405, "step": 18570 }, { "epoch": 0.6694417414495261, "grad_norm": 0.20640818774700165, "learning_rate": 4.859928538570564e-05, "loss": 0.4663, "step": 18575 }, { "epoch": 
0.6696219411107507, "grad_norm": 0.15983538329601288, "learning_rate": 4.859832216139947e-05, "loss": 0.4411, "step": 18580 }, { "epoch": 0.6698021407719753, "grad_norm": 0.18739761412143707, "learning_rate": 4.8597358615570734e-05, "loss": 0.4453, "step": 18585 }, { "epoch": 0.6699823404332, "grad_norm": 0.2296430617570877, "learning_rate": 4.8596394748232556e-05, "loss": 0.4441, "step": 18590 }, { "epoch": 0.6701625400944247, "grad_norm": 0.1832958459854126, "learning_rate": 4.8595430559398056e-05, "loss": 0.4621, "step": 18595 }, { "epoch": 0.6703427397556493, "grad_norm": 0.2158745378255844, "learning_rate": 4.85944660490804e-05, "loss": 0.4708, "step": 18600 }, { "epoch": 0.6705229394168739, "grad_norm": 0.16886022686958313, "learning_rate": 4.85935012172927e-05, "loss": 0.4215, "step": 18605 }, { "epoch": 0.6707031390780985, "grad_norm": 0.15665309131145477, "learning_rate": 4.859253606404812e-05, "loss": 0.4263, "step": 18610 }, { "epoch": 0.6708833387393232, "grad_norm": 0.1781836897134781, "learning_rate": 4.85915705893598e-05, "loss": 0.4506, "step": 18615 }, { "epoch": 0.6710635384005478, "grad_norm": 0.18428124487400055, "learning_rate": 4.85906047932409e-05, "loss": 0.4865, "step": 18620 }, { "epoch": 0.6712437380617724, "grad_norm": 0.1660621613264084, "learning_rate": 4.858963867570458e-05, "loss": 0.4609, "step": 18625 }, { "epoch": 0.6714239377229971, "grad_norm": 0.1916099488735199, "learning_rate": 4.8588672236764e-05, "loss": 0.4376, "step": 18630 }, { "epoch": 0.6716041373842218, "grad_norm": 0.1742095947265625, "learning_rate": 4.858770547643232e-05, "loss": 0.4813, "step": 18635 }, { "epoch": 0.6717843370454464, "grad_norm": 0.1986844837665558, "learning_rate": 4.858673839472273e-05, "loss": 0.447, "step": 18640 }, { "epoch": 0.671964536706671, "grad_norm": 0.168625608086586, "learning_rate": 4.858577099164839e-05, "loss": 0.4426, "step": 18645 }, { "epoch": 0.6721447363678956, "grad_norm": 0.18472012877464294, "learning_rate": 
4.858480326722249e-05, "loss": 0.4643, "step": 18650 }, { "epoch": 0.6723249360291202, "grad_norm": 0.1810387521982193, "learning_rate": 4.858383522145821e-05, "loss": 0.4517, "step": 18655 }, { "epoch": 0.6725051356903449, "grad_norm": 0.18452127277851105, "learning_rate": 4.858286685436873e-05, "loss": 0.4554, "step": 18660 }, { "epoch": 0.6726853353515695, "grad_norm": 0.15459772944450378, "learning_rate": 4.8581898165967274e-05, "loss": 0.4355, "step": 18665 }, { "epoch": 0.6728655350127942, "grad_norm": 0.1914420872926712, "learning_rate": 4.858092915626701e-05, "loss": 0.3937, "step": 18670 }, { "epoch": 0.6730457346740188, "grad_norm": 0.15395872294902802, "learning_rate": 4.8579959825281155e-05, "loss": 0.4635, "step": 18675 }, { "epoch": 0.6732259343352435, "grad_norm": 0.13936087489128113, "learning_rate": 4.857899017302291e-05, "loss": 0.4779, "step": 18680 }, { "epoch": 0.6734061339964681, "grad_norm": 0.14681056141853333, "learning_rate": 4.8578020199505495e-05, "loss": 0.4428, "step": 18685 }, { "epoch": 0.6735863336576927, "grad_norm": 0.13621117174625397, "learning_rate": 4.857704990474211e-05, "loss": 0.4088, "step": 18690 }, { "epoch": 0.6737665333189173, "grad_norm": 0.1684703528881073, "learning_rate": 4.857607928874599e-05, "loss": 0.4154, "step": 18695 }, { "epoch": 0.673946732980142, "grad_norm": 0.1323196440935135, "learning_rate": 4.857510835153035e-05, "loss": 0.4315, "step": 18700 }, { "epoch": 0.6741269326413666, "grad_norm": 0.16398736834526062, "learning_rate": 4.857413709310843e-05, "loss": 0.4505, "step": 18705 }, { "epoch": 0.6743071323025913, "grad_norm": 0.18038709461688995, "learning_rate": 4.857316551349345e-05, "loss": 0.4387, "step": 18710 }, { "epoch": 0.6744873319638159, "grad_norm": 0.169601172208786, "learning_rate": 4.8572193612698656e-05, "loss": 0.4301, "step": 18715 }, { "epoch": 0.6746675316250406, "grad_norm": 0.19574612379074097, "learning_rate": 4.857122139073729e-05, "loss": 0.4735, "step": 18720 }, { "epoch": 
0.6748477312862652, "grad_norm": 0.19240935146808624, "learning_rate": 4.857024884762259e-05, "loss": 0.4467, "step": 18725 }, { "epoch": 0.6750279309474898, "grad_norm": 0.16014917194843292, "learning_rate": 4.856927598336782e-05, "loss": 0.4609, "step": 18730 }, { "epoch": 0.6752081306087144, "grad_norm": 0.1911192089319229, "learning_rate": 4.856830279798623e-05, "loss": 0.5029, "step": 18735 }, { "epoch": 0.675388330269939, "grad_norm": 0.1548725664615631, "learning_rate": 4.856732929149107e-05, "loss": 0.4556, "step": 18740 }, { "epoch": 0.6755685299311637, "grad_norm": 0.16711898148059845, "learning_rate": 4.856635546389562e-05, "loss": 0.4449, "step": 18745 }, { "epoch": 0.6757487295923884, "grad_norm": 0.18358518183231354, "learning_rate": 4.856538131521313e-05, "loss": 0.4313, "step": 18750 }, { "epoch": 0.675928929253613, "grad_norm": 0.18038628995418549, "learning_rate": 4.8564406845456885e-05, "loss": 0.4427, "step": 18755 }, { "epoch": 0.6761091289148377, "grad_norm": 0.16218414902687073, "learning_rate": 4.8563432054640155e-05, "loss": 0.4352, "step": 18760 }, { "epoch": 0.6762893285760623, "grad_norm": 0.1689649075269699, "learning_rate": 4.856245694277623e-05, "loss": 0.4558, "step": 18765 }, { "epoch": 0.6764695282372869, "grad_norm": 0.2027473747730255, "learning_rate": 4.8561481509878395e-05, "loss": 0.4304, "step": 18770 }, { "epoch": 0.6766497278985115, "grad_norm": 0.18545962870121002, "learning_rate": 4.856050575595993e-05, "loss": 0.4456, "step": 18775 }, { "epoch": 0.6768299275597361, "grad_norm": 0.17125312983989716, "learning_rate": 4.8559529681034135e-05, "loss": 0.4816, "step": 18780 }, { "epoch": 0.6770101272209609, "grad_norm": 0.14758311212062836, "learning_rate": 4.855855328511432e-05, "loss": 0.458, "step": 18785 }, { "epoch": 0.6771903268821855, "grad_norm": 0.14200688898563385, "learning_rate": 4.8557576568213755e-05, "loss": 0.4096, "step": 18790 }, { "epoch": 0.6773705265434101, "grad_norm": 0.15297238528728485, 
"learning_rate": 4.855659953034579e-05, "loss": 0.3955, "step": 18795 }, { "epoch": 0.6775507262046347, "grad_norm": 0.17643597722053528, "learning_rate": 4.855562217152371e-05, "loss": 0.4581, "step": 18800 }, { "epoch": 0.6777309258658594, "grad_norm": 0.15438847243785858, "learning_rate": 4.8554644491760845e-05, "loss": 0.4511, "step": 18805 }, { "epoch": 0.677911125527084, "grad_norm": 0.17449019849300385, "learning_rate": 4.8553666491070505e-05, "loss": 0.4128, "step": 18810 }, { "epoch": 0.6780913251883086, "grad_norm": 0.180333212018013, "learning_rate": 4.855268816946601e-05, "loss": 0.4442, "step": 18815 }, { "epoch": 0.6782715248495332, "grad_norm": 0.1781253069639206, "learning_rate": 4.855170952696071e-05, "loss": 0.4484, "step": 18820 }, { "epoch": 0.678451724510758, "grad_norm": 0.15952306985855103, "learning_rate": 4.855073056356793e-05, "loss": 0.4409, "step": 18825 }, { "epoch": 0.6786319241719826, "grad_norm": 0.18420594930648804, "learning_rate": 4.854975127930099e-05, "loss": 0.4467, "step": 18830 }, { "epoch": 0.6788121238332072, "grad_norm": 0.15397235751152039, "learning_rate": 4.854877167417327e-05, "loss": 0.4439, "step": 18835 }, { "epoch": 0.6789923234944318, "grad_norm": 0.15009304881095886, "learning_rate": 4.854779174819807e-05, "loss": 0.4126, "step": 18840 }, { "epoch": 0.6791725231556565, "grad_norm": 0.17719674110412598, "learning_rate": 4.8546811501388784e-05, "loss": 0.4565, "step": 18845 }, { "epoch": 0.6793527228168811, "grad_norm": 0.1648426502943039, "learning_rate": 4.854583093375875e-05, "loss": 0.4347, "step": 18850 }, { "epoch": 0.6795329224781057, "grad_norm": 0.14466212689876556, "learning_rate": 4.8544850045321324e-05, "loss": 0.3926, "step": 18855 }, { "epoch": 0.6797131221393303, "grad_norm": 0.1948980838060379, "learning_rate": 4.8543868836089865e-05, "loss": 0.4354, "step": 18860 }, { "epoch": 0.6798933218005551, "grad_norm": 0.1797015517950058, "learning_rate": 4.8542887306077765e-05, "loss": 0.4419, "step": 18865 
}, { "epoch": 0.6800735214617797, "grad_norm": 0.12938088178634644, "learning_rate": 4.8541905455298373e-05, "loss": 0.4491, "step": 18870 }, { "epoch": 0.6802537211230043, "grad_norm": 0.21112160384655, "learning_rate": 4.854092328376509e-05, "loss": 0.4418, "step": 18875 }, { "epoch": 0.6804339207842289, "grad_norm": 0.18489904701709747, "learning_rate": 4.853994079149128e-05, "loss": 0.4395, "step": 18880 }, { "epoch": 0.6806141204454536, "grad_norm": 0.15210841596126556, "learning_rate": 4.853895797849033e-05, "loss": 0.4479, "step": 18885 }, { "epoch": 0.6807943201066782, "grad_norm": 0.15210148692131042, "learning_rate": 4.8537974844775636e-05, "loss": 0.3912, "step": 18890 }, { "epoch": 0.6809745197679028, "grad_norm": 0.1489754021167755, "learning_rate": 4.85369913903606e-05, "loss": 0.465, "step": 18895 }, { "epoch": 0.6811547194291274, "grad_norm": 0.20377914607524872, "learning_rate": 4.85360076152586e-05, "loss": 0.454, "step": 18900 }, { "epoch": 0.6813349190903522, "grad_norm": 0.20608647167682648, "learning_rate": 4.8535023519483055e-05, "loss": 0.4495, "step": 18905 }, { "epoch": 0.6815151187515768, "grad_norm": 0.1873638778924942, "learning_rate": 4.853403910304738e-05, "loss": 0.4436, "step": 18910 }, { "epoch": 0.6816953184128014, "grad_norm": 0.16861487925052643, "learning_rate": 4.853305436596497e-05, "loss": 0.4342, "step": 18915 }, { "epoch": 0.681875518074026, "grad_norm": 0.14870557188987732, "learning_rate": 4.853206930824925e-05, "loss": 0.4825, "step": 18920 }, { "epoch": 0.6820557177352506, "grad_norm": 0.16530665755271912, "learning_rate": 4.853108392991366e-05, "loss": 0.4248, "step": 18925 }, { "epoch": 0.6822359173964753, "grad_norm": 0.15075165033340454, "learning_rate": 4.8530098230971586e-05, "loss": 0.4616, "step": 18930 }, { "epoch": 0.6824161170576999, "grad_norm": 0.16559144854545593, "learning_rate": 4.852911221143649e-05, "loss": 0.4233, "step": 18935 }, { "epoch": 0.6825963167189246, "grad_norm": 0.17002665996551514, 
"learning_rate": 4.852812587132178e-05, "loss": 0.448, "step": 18940 }, { "epoch": 0.6827765163801492, "grad_norm": 0.14457158744335175, "learning_rate": 4.8527139210640924e-05, "loss": 0.4475, "step": 18945 }, { "epoch": 0.6829567160413739, "grad_norm": 0.18576878309249878, "learning_rate": 4.852615222940735e-05, "loss": 0.432, "step": 18950 }, { "epoch": 0.6831369157025985, "grad_norm": 0.20716910064220428, "learning_rate": 4.852516492763451e-05, "loss": 0.4773, "step": 18955 }, { "epoch": 0.6833171153638231, "grad_norm": 0.1403721570968628, "learning_rate": 4.852417730533585e-05, "loss": 0.4248, "step": 18960 }, { "epoch": 0.6834973150250477, "grad_norm": 0.1828588992357254, "learning_rate": 4.852318936252482e-05, "loss": 0.4335, "step": 18965 }, { "epoch": 0.6836775146862724, "grad_norm": 0.18035824596881866, "learning_rate": 4.8522201099214904e-05, "loss": 0.4083, "step": 18970 }, { "epoch": 0.683857714347497, "grad_norm": 0.16249680519104004, "learning_rate": 4.852121251541955e-05, "loss": 0.4238, "step": 18975 }, { "epoch": 0.6840379140087217, "grad_norm": 0.1821296066045761, "learning_rate": 4.8520223611152215e-05, "loss": 0.4195, "step": 18980 }, { "epoch": 0.6842181136699463, "grad_norm": 0.16044920682907104, "learning_rate": 4.851923438642639e-05, "loss": 0.4541, "step": 18985 }, { "epoch": 0.684398313331171, "grad_norm": 0.169897198677063, "learning_rate": 4.851824484125556e-05, "loss": 0.4616, "step": 18990 }, { "epoch": 0.6845785129923956, "grad_norm": 0.17817699909210205, "learning_rate": 4.851725497565319e-05, "loss": 0.4653, "step": 18995 }, { "epoch": 0.6847587126536202, "grad_norm": 0.19420120120048523, "learning_rate": 4.851626478963278e-05, "loss": 0.4647, "step": 19000 }, { "epoch": 0.6847587126536202, "eval_loss": 0.46408936381340027, "eval_runtime": 3.5332, "eval_samples_per_second": 28.303, "eval_steps_per_second": 7.076, "step": 19000 }, { "epoch": 0.6849389123148448, "grad_norm": 0.18936601281166077, "learning_rate": 
4.851527428320781e-05, "loss": 0.4437, "step": 19005 }, { "epoch": 0.6851191119760695, "grad_norm": 0.16841618716716766, "learning_rate": 4.8514283456391785e-05, "loss": 0.4336, "step": 19010 }, { "epoch": 0.6852993116372941, "grad_norm": 0.16009177267551422, "learning_rate": 4.8513292309198197e-05, "loss": 0.4376, "step": 19015 }, { "epoch": 0.6854795112985188, "grad_norm": 0.16848459839820862, "learning_rate": 4.851230084164056e-05, "loss": 0.4548, "step": 19020 }, { "epoch": 0.6856597109597434, "grad_norm": 0.1737171709537506, "learning_rate": 4.851130905373237e-05, "loss": 0.4012, "step": 19025 }, { "epoch": 0.685839910620968, "grad_norm": 0.1548008769750595, "learning_rate": 4.851031694548714e-05, "loss": 0.4344, "step": 19030 }, { "epoch": 0.6860201102821927, "grad_norm": 0.17847006022930145, "learning_rate": 4.85093245169184e-05, "loss": 0.4582, "step": 19035 }, { "epoch": 0.6862003099434173, "grad_norm": 0.17521251738071442, "learning_rate": 4.8508331768039674e-05, "loss": 0.436, "step": 19040 }, { "epoch": 0.6863805096046419, "grad_norm": 0.1900710016489029, "learning_rate": 4.850733869886446e-05, "loss": 0.4429, "step": 19045 }, { "epoch": 0.6865607092658665, "grad_norm": 0.22452399134635925, "learning_rate": 4.850634530940632e-05, "loss": 0.4393, "step": 19050 }, { "epoch": 0.6867409089270912, "grad_norm": 0.1778751015663147, "learning_rate": 4.850535159967877e-05, "loss": 0.4614, "step": 19055 }, { "epoch": 0.6869211085883159, "grad_norm": 0.1997576355934143, "learning_rate": 4.8504357569695365e-05, "loss": 0.4381, "step": 19060 }, { "epoch": 0.6871013082495405, "grad_norm": 0.17467275261878967, "learning_rate": 4.850336321946963e-05, "loss": 0.4544, "step": 19065 }, { "epoch": 0.6872815079107651, "grad_norm": 0.15613585710525513, "learning_rate": 4.850236854901513e-05, "loss": 0.42, "step": 19070 }, { "epoch": 0.6874617075719898, "grad_norm": 0.18194815516471863, "learning_rate": 4.85013735583454e-05, "loss": 0.475, "step": 19075 }, { "epoch": 
0.6876419072332144, "grad_norm": 0.17985866963863373, "learning_rate": 4.850037824747401e-05, "loss": 0.4177, "step": 19080 }, { "epoch": 0.687822106894439, "grad_norm": 0.19839969277381897, "learning_rate": 4.8499382616414515e-05, "loss": 0.4429, "step": 19085 }, { "epoch": 0.6880023065556636, "grad_norm": 0.1918293535709381, "learning_rate": 4.8498386665180474e-05, "loss": 0.4412, "step": 19090 }, { "epoch": 0.6881825062168883, "grad_norm": 0.20238642394542694, "learning_rate": 4.8497390393785475e-05, "loss": 0.4255, "step": 19095 }, { "epoch": 0.688362705878113, "grad_norm": 0.15424714982509613, "learning_rate": 4.849639380224308e-05, "loss": 0.4449, "step": 19100 }, { "epoch": 0.6885429055393376, "grad_norm": 0.15987904369831085, "learning_rate": 4.8495396890566855e-05, "loss": 0.4287, "step": 19105 }, { "epoch": 0.6887231052005622, "grad_norm": 0.19473698735237122, "learning_rate": 4.849439965877041e-05, "loss": 0.4664, "step": 19110 }, { "epoch": 0.6889033048617869, "grad_norm": 0.15366004407405853, "learning_rate": 4.849340210686732e-05, "loss": 0.4612, "step": 19115 }, { "epoch": 0.6890835045230115, "grad_norm": 0.20622040331363678, "learning_rate": 4.849240423487117e-05, "loss": 0.4397, "step": 19120 }, { "epoch": 0.6892637041842361, "grad_norm": 0.1534797102212906, "learning_rate": 4.8491406042795565e-05, "loss": 0.4492, "step": 19125 }, { "epoch": 0.6894439038454607, "grad_norm": 0.16360458731651306, "learning_rate": 4.849040753065409e-05, "loss": 0.4591, "step": 19130 }, { "epoch": 0.6896241035066855, "grad_norm": 0.15157437324523926, "learning_rate": 4.848940869846037e-05, "loss": 0.4604, "step": 19135 }, { "epoch": 0.6898043031679101, "grad_norm": 0.2104407548904419, "learning_rate": 4.8488409546228e-05, "loss": 0.4153, "step": 19140 }, { "epoch": 0.6899845028291347, "grad_norm": 0.2011989951133728, "learning_rate": 4.8487410073970594e-05, "loss": 0.4757, "step": 19145 }, { "epoch": 0.6901647024903593, "grad_norm": 0.18023835122585297, 
"learning_rate": 4.848641028170178e-05, "loss": 0.4305, "step": 19150 }, { "epoch": 0.690344902151584, "grad_norm": 0.1514541208744049, "learning_rate": 4.848541016943516e-05, "loss": 0.4403, "step": 19155 }, { "epoch": 0.6905251018128086, "grad_norm": 0.1530730277299881, "learning_rate": 4.8484409737184386e-05, "loss": 0.4532, "step": 19160 }, { "epoch": 0.6907053014740332, "grad_norm": 0.18207421898841858, "learning_rate": 4.848340898496308e-05, "loss": 0.4431, "step": 19165 }, { "epoch": 0.6908855011352578, "grad_norm": 0.17817628383636475, "learning_rate": 4.848240791278486e-05, "loss": 0.425, "step": 19170 }, { "epoch": 0.6910657007964826, "grad_norm": 0.13877823948860168, "learning_rate": 4.848140652066339e-05, "loss": 0.4346, "step": 19175 }, { "epoch": 0.6912459004577072, "grad_norm": 0.15629951655864716, "learning_rate": 4.84804048086123e-05, "loss": 0.4438, "step": 19180 }, { "epoch": 0.6914261001189318, "grad_norm": 0.15390846133232117, "learning_rate": 4.8479402776645235e-05, "loss": 0.4709, "step": 19185 }, { "epoch": 0.6916062997801564, "grad_norm": 0.17919474840164185, "learning_rate": 4.847840042477586e-05, "loss": 0.4731, "step": 19190 }, { "epoch": 0.691786499441381, "grad_norm": 0.170258030295372, "learning_rate": 4.8477397753017816e-05, "loss": 0.468, "step": 19195 }, { "epoch": 0.6919666991026057, "grad_norm": 0.19165678322315216, "learning_rate": 4.847639476138478e-05, "loss": 0.5018, "step": 19200 }, { "epoch": 0.6921468987638303, "grad_norm": 0.15710307657718658, "learning_rate": 4.8475391449890405e-05, "loss": 0.4447, "step": 19205 }, { "epoch": 0.6923270984250549, "grad_norm": 0.1523151695728302, "learning_rate": 4.847438781854837e-05, "loss": 0.4627, "step": 19210 }, { "epoch": 0.6925072980862796, "grad_norm": 0.14756174385547638, "learning_rate": 4.8473383867372345e-05, "loss": 0.4621, "step": 19215 }, { "epoch": 0.6926874977475043, "grad_norm": 0.2157120406627655, "learning_rate": 4.847237959637602e-05, "loss": 0.4199, "step": 19220 }, 
{ "epoch": 0.6928676974087289, "grad_norm": 0.14184360206127167, "learning_rate": 4.847137500557305e-05, "loss": 0.4347, "step": 19225 }, { "epoch": 0.6930478970699535, "grad_norm": 0.13508060574531555, "learning_rate": 4.847037009497715e-05, "loss": 0.4535, "step": 19230 }, { "epoch": 0.6932280967311781, "grad_norm": 0.14974738657474518, "learning_rate": 4.8469364864602e-05, "loss": 0.4337, "step": 19235 }, { "epoch": 0.6934082963924028, "grad_norm": 0.2109774798154831, "learning_rate": 4.846835931446129e-05, "loss": 0.4732, "step": 19240 }, { "epoch": 0.6935884960536274, "grad_norm": 0.1753171980381012, "learning_rate": 4.846735344456873e-05, "loss": 0.4438, "step": 19245 }, { "epoch": 0.693768695714852, "grad_norm": 0.20404259860515594, "learning_rate": 4.8466347254938034e-05, "loss": 0.4431, "step": 19250 }, { "epoch": 0.6939488953760767, "grad_norm": 0.1709374636411667, "learning_rate": 4.846534074558289e-05, "loss": 0.4206, "step": 19255 }, { "epoch": 0.6941290950373014, "grad_norm": 0.13313916325569153, "learning_rate": 4.8464333916517025e-05, "loss": 0.4372, "step": 19260 }, { "epoch": 0.694309294698526, "grad_norm": 0.1679733395576477, "learning_rate": 4.8463326767754145e-05, "loss": 0.4753, "step": 19265 }, { "epoch": 0.6944894943597506, "grad_norm": 0.14599260687828064, "learning_rate": 4.846231929930799e-05, "loss": 0.423, "step": 19270 }, { "epoch": 0.6946696940209752, "grad_norm": 0.17458441853523254, "learning_rate": 4.846131151119228e-05, "loss": 0.4576, "step": 19275 }, { "epoch": 0.6948498936821998, "grad_norm": 0.14298151433467865, "learning_rate": 4.8460303403420735e-05, "loss": 0.4628, "step": 19280 }, { "epoch": 0.6950300933434245, "grad_norm": 0.18267078697681427, "learning_rate": 4.84592949760071e-05, "loss": 0.4505, "step": 19285 }, { "epoch": 0.6952102930046492, "grad_norm": 0.18791311979293823, "learning_rate": 4.845828622896511e-05, "loss": 0.3925, "step": 19290 }, { "epoch": 0.6953904926658738, "grad_norm": 0.1504139006137848, 
"learning_rate": 4.8457277162308526e-05, "loss": 0.4281, "step": 19295 }, { "epoch": 0.6955706923270984, "grad_norm": 0.1674523800611496, "learning_rate": 4.8456267776051066e-05, "loss": 0.4462, "step": 19300 }, { "epoch": 0.6957508919883231, "grad_norm": 0.1913890391588211, "learning_rate": 4.84552580702065e-05, "loss": 0.4054, "step": 19305 }, { "epoch": 0.6959310916495477, "grad_norm": 0.18787144124507904, "learning_rate": 4.8454248044788594e-05, "loss": 0.4503, "step": 19310 }, { "epoch": 0.6961112913107723, "grad_norm": 0.15281574428081512, "learning_rate": 4.845323769981109e-05, "loss": 0.4506, "step": 19315 }, { "epoch": 0.6962914909719969, "grad_norm": 0.17581430077552795, "learning_rate": 4.845222703528777e-05, "loss": 0.4174, "step": 19320 }, { "epoch": 0.6964716906332216, "grad_norm": 0.14308330416679382, "learning_rate": 4.845121605123239e-05, "loss": 0.4106, "step": 19325 }, { "epoch": 0.6966518902944463, "grad_norm": 0.1626398116350174, "learning_rate": 4.8450204747658734e-05, "loss": 0.4736, "step": 19330 }, { "epoch": 0.6968320899556709, "grad_norm": 0.14919036626815796, "learning_rate": 4.844919312458058e-05, "loss": 0.4042, "step": 19335 }, { "epoch": 0.6970122896168955, "grad_norm": 0.15542307496070862, "learning_rate": 4.844818118201171e-05, "loss": 0.4378, "step": 19340 }, { "epoch": 0.6971924892781202, "grad_norm": 0.18490786850452423, "learning_rate": 4.844716891996591e-05, "loss": 0.4315, "step": 19345 }, { "epoch": 0.6973726889393448, "grad_norm": 0.20370669662952423, "learning_rate": 4.8446156338456975e-05, "loss": 0.4472, "step": 19350 }, { "epoch": 0.6975528886005694, "grad_norm": 0.1456826627254486, "learning_rate": 4.8445143437498696e-05, "loss": 0.4498, "step": 19355 }, { "epoch": 0.697733088261794, "grad_norm": 0.159530371427536, "learning_rate": 4.844413021710488e-05, "loss": 0.4576, "step": 19360 }, { "epoch": 0.6979132879230187, "grad_norm": 0.1378137171268463, "learning_rate": 4.844311667728932e-05, "loss": 0.4643, "step": 19365 
}, { "epoch": 0.6980934875842434, "grad_norm": 0.17406761646270752, "learning_rate": 4.844210281806585e-05, "loss": 0.4662, "step": 19370 }, { "epoch": 0.698273687245468, "grad_norm": 0.14517958462238312, "learning_rate": 4.844108863944826e-05, "loss": 0.4136, "step": 19375 }, { "epoch": 0.6984538869066926, "grad_norm": 0.16255003213882446, "learning_rate": 4.844007414145037e-05, "loss": 0.4686, "step": 19380 }, { "epoch": 0.6986340865679173, "grad_norm": 0.14890755712985992, "learning_rate": 4.843905932408601e-05, "loss": 0.4148, "step": 19385 }, { "epoch": 0.6988142862291419, "grad_norm": 0.1591944694519043, "learning_rate": 4.843804418736901e-05, "loss": 0.4243, "step": 19390 }, { "epoch": 0.6989944858903665, "grad_norm": 0.15910327434539795, "learning_rate": 4.84370287313132e-05, "loss": 0.4091, "step": 19395 }, { "epoch": 0.6991746855515911, "grad_norm": 0.19701680541038513, "learning_rate": 4.84360129559324e-05, "loss": 0.4521, "step": 19400 }, { "epoch": 0.6993548852128157, "grad_norm": 0.16682995855808258, "learning_rate": 4.8434996861240457e-05, "loss": 0.4676, "step": 19405 }, { "epoch": 0.6995350848740405, "grad_norm": 0.14017580449581146, "learning_rate": 4.843398044725123e-05, "loss": 0.4273, "step": 19410 }, { "epoch": 0.6997152845352651, "grad_norm": 0.19788876175880432, "learning_rate": 4.843296371397855e-05, "loss": 0.4129, "step": 19415 }, { "epoch": 0.6998954841964897, "grad_norm": 0.21277104318141937, "learning_rate": 4.843194666143628e-05, "loss": 0.5118, "step": 19420 }, { "epoch": 0.7000756838577143, "grad_norm": 0.19250738620758057, "learning_rate": 4.843092928963827e-05, "loss": 0.4856, "step": 19425 }, { "epoch": 0.700255883518939, "grad_norm": 0.15623074769973755, "learning_rate": 4.8429911598598386e-05, "loss": 0.4348, "step": 19430 }, { "epoch": 0.7004360831801636, "grad_norm": 0.13664281368255615, "learning_rate": 4.84288935883305e-05, "loss": 0.4232, "step": 19435 }, { "epoch": 0.7006162828413882, "grad_norm": 0.1700459122657776, 
"learning_rate": 4.842787525884847e-05, "loss": 0.4598, "step": 19440 }, { "epoch": 0.700796482502613, "grad_norm": 0.15901970863342285, "learning_rate": 4.842685661016617e-05, "loss": 0.4267, "step": 19445 }, { "epoch": 0.7009766821638376, "grad_norm": 0.20481829345226288, "learning_rate": 4.842583764229749e-05, "loss": 0.4309, "step": 19450 }, { "epoch": 0.7011568818250622, "grad_norm": 0.19733159244060516, "learning_rate": 4.8424818355256304e-05, "loss": 0.4754, "step": 19455 }, { "epoch": 0.7013370814862868, "grad_norm": 0.1566084325313568, "learning_rate": 4.84237987490565e-05, "loss": 0.4086, "step": 19460 }, { "epoch": 0.7015172811475114, "grad_norm": 0.16943992674350739, "learning_rate": 4.842277882371198e-05, "loss": 0.4623, "step": 19465 }, { "epoch": 0.7016974808087361, "grad_norm": 0.1365501880645752, "learning_rate": 4.842175857923663e-05, "loss": 0.4053, "step": 19470 }, { "epoch": 0.7018776804699607, "grad_norm": 0.1714967042207718, "learning_rate": 4.842073801564436e-05, "loss": 0.4687, "step": 19475 }, { "epoch": 0.7020578801311853, "grad_norm": 0.1788230538368225, "learning_rate": 4.841971713294906e-05, "loss": 0.4179, "step": 19480 }, { "epoch": 0.70223807979241, "grad_norm": 0.16317056119441986, "learning_rate": 4.841869593116466e-05, "loss": 0.4278, "step": 19485 }, { "epoch": 0.7024182794536347, "grad_norm": 0.149271622300148, "learning_rate": 4.841767441030505e-05, "loss": 0.4528, "step": 19490 }, { "epoch": 0.7025984791148593, "grad_norm": 0.17781810462474823, "learning_rate": 4.841665257038416e-05, "loss": 0.421, "step": 19495 }, { "epoch": 0.7027786787760839, "grad_norm": 0.16509835422039032, "learning_rate": 4.841563041141592e-05, "loss": 0.4252, "step": 19500 }, { "epoch": 0.7027786787760839, "eval_loss": 0.4628825783729553, "eval_runtime": 3.5462, "eval_samples_per_second": 28.199, "eval_steps_per_second": 7.05, "step": 19500 }, { "epoch": 0.7029588784373085, "grad_norm": 0.16837020218372345, "learning_rate": 4.841460793341425e-05, 
"loss": 0.457, "step": 19505 }, { "epoch": 0.7031390780985332, "grad_norm": 0.18554091453552246, "learning_rate": 4.8413585136393066e-05, "loss": 0.421, "step": 19510 }, { "epoch": 0.7033192777597578, "grad_norm": 0.20547594130039215, "learning_rate": 4.841256202036634e-05, "loss": 0.4686, "step": 19515 }, { "epoch": 0.7034994774209824, "grad_norm": 0.18383750319480896, "learning_rate": 4.841153858534797e-05, "loss": 0.4405, "step": 19520 }, { "epoch": 0.7036796770822071, "grad_norm": 0.18066047132015228, "learning_rate": 4.8410514831351926e-05, "loss": 0.3865, "step": 19525 }, { "epoch": 0.7038598767434318, "grad_norm": 0.144275963306427, "learning_rate": 4.840949075839215e-05, "loss": 0.4067, "step": 19530 }, { "epoch": 0.7040400764046564, "grad_norm": 0.19798412919044495, "learning_rate": 4.84084663664826e-05, "loss": 0.4514, "step": 19535 }, { "epoch": 0.704220276065881, "grad_norm": 0.16639620065689087, "learning_rate": 4.840744165563721e-05, "loss": 0.4901, "step": 19540 }, { "epoch": 0.7044004757271056, "grad_norm": 0.17380645871162415, "learning_rate": 4.8406416625869977e-05, "loss": 0.4401, "step": 19545 }, { "epoch": 0.7045806753883302, "grad_norm": 0.16596467792987823, "learning_rate": 4.840539127719484e-05, "loss": 0.4576, "step": 19550 }, { "epoch": 0.7047608750495549, "grad_norm": 0.14880597591400146, "learning_rate": 4.8404365609625785e-05, "loss": 0.4128, "step": 19555 }, { "epoch": 0.7049410747107795, "grad_norm": 0.21893320977687836, "learning_rate": 4.840333962317678e-05, "loss": 0.4075, "step": 19560 }, { "epoch": 0.7051212743720042, "grad_norm": 0.16774235665798187, "learning_rate": 4.8402313317861797e-05, "loss": 0.4624, "step": 19565 }, { "epoch": 0.7053014740332288, "grad_norm": 0.1787051409482956, "learning_rate": 4.840128669369483e-05, "loss": 0.441, "step": 19570 }, { "epoch": 0.7054816736944535, "grad_norm": 0.1392039656639099, "learning_rate": 4.840025975068986e-05, "loss": 0.4667, "step": 19575 }, { "epoch": 0.7056618733556781, 
"grad_norm": 0.17485755681991577, "learning_rate": 4.839923248886089e-05, "loss": 0.4378, "step": 19580 }, { "epoch": 0.7058420730169027, "grad_norm": 0.14245180785655975, "learning_rate": 4.8398204908221894e-05, "loss": 0.445, "step": 19585 }, { "epoch": 0.7060222726781273, "grad_norm": 0.19247664511203766, "learning_rate": 4.839717700878689e-05, "loss": 0.4163, "step": 19590 }, { "epoch": 0.706202472339352, "grad_norm": 0.14661888778209686, "learning_rate": 4.839614879056989e-05, "loss": 0.4428, "step": 19595 }, { "epoch": 0.7063826720005766, "grad_norm": 0.2213519662618637, "learning_rate": 4.839512025358488e-05, "loss": 0.4248, "step": 19600 }, { "epoch": 0.7065628716618013, "grad_norm": 0.13426418602466583, "learning_rate": 4.839409139784589e-05, "loss": 0.4201, "step": 19605 }, { "epoch": 0.7067430713230259, "grad_norm": 0.17198221385478973, "learning_rate": 4.839306222336694e-05, "loss": 0.4788, "step": 19610 }, { "epoch": 0.7069232709842506, "grad_norm": 0.1814018040895462, "learning_rate": 4.839203273016203e-05, "loss": 0.4323, "step": 19615 }, { "epoch": 0.7071034706454752, "grad_norm": 0.20377859473228455, "learning_rate": 4.839100291824522e-05, "loss": 0.4059, "step": 19620 }, { "epoch": 0.7072836703066998, "grad_norm": 0.16725854575634003, "learning_rate": 4.838997278763051e-05, "loss": 0.4669, "step": 19625 }, { "epoch": 0.7074638699679244, "grad_norm": 0.15145759284496307, "learning_rate": 4.838894233833196e-05, "loss": 0.4384, "step": 19630 }, { "epoch": 0.707644069629149, "grad_norm": 0.16167926788330078, "learning_rate": 4.8387911570363596e-05, "loss": 0.4536, "step": 19635 }, { "epoch": 0.7078242692903738, "grad_norm": 0.15461744368076324, "learning_rate": 4.838688048373946e-05, "loss": 0.4833, "step": 19640 }, { "epoch": 0.7080044689515984, "grad_norm": 0.18322111666202545, "learning_rate": 4.838584907847361e-05, "loss": 0.4641, "step": 19645 }, { "epoch": 0.708184668612823, "grad_norm": 0.15840373933315277, "learning_rate": 
4.83848173545801e-05, "loss": 0.4341, "step": 19650 }, { "epoch": 0.7083648682740477, "grad_norm": 0.1756211519241333, "learning_rate": 4.8383785312072974e-05, "loss": 0.4358, "step": 19655 }, { "epoch": 0.7085450679352723, "grad_norm": 0.17935514450073242, "learning_rate": 4.83827529509663e-05, "loss": 0.4682, "step": 19660 }, { "epoch": 0.7087252675964969, "grad_norm": 0.20242036879062653, "learning_rate": 4.8381720271274146e-05, "loss": 0.4391, "step": 19665 }, { "epoch": 0.7089054672577215, "grad_norm": 0.15790332853794098, "learning_rate": 4.838068727301057e-05, "loss": 0.4066, "step": 19670 }, { "epoch": 0.7090856669189461, "grad_norm": 0.14953632652759552, "learning_rate": 4.8379653956189675e-05, "loss": 0.4043, "step": 19675 }, { "epoch": 0.7092658665801709, "grad_norm": 0.20323163270950317, "learning_rate": 4.8378620320825505e-05, "loss": 0.4517, "step": 19680 }, { "epoch": 0.7094460662413955, "grad_norm": 0.17088757455348969, "learning_rate": 4.837758636693217e-05, "loss": 0.4426, "step": 19685 }, { "epoch": 0.7096262659026201, "grad_norm": 0.15249846875667572, "learning_rate": 4.837655209452374e-05, "loss": 0.4364, "step": 19690 }, { "epoch": 0.7098064655638447, "grad_norm": 0.17006030678749084, "learning_rate": 4.837551750361432e-05, "loss": 0.4031, "step": 19695 }, { "epoch": 0.7099866652250694, "grad_norm": 0.17397870123386383, "learning_rate": 4.837448259421799e-05, "loss": 0.4184, "step": 19700 }, { "epoch": 0.710166864886294, "grad_norm": 0.16801589727401733, "learning_rate": 4.837344736634887e-05, "loss": 0.4629, "step": 19705 }, { "epoch": 0.7103470645475186, "grad_norm": 0.18342332541942596, "learning_rate": 4.8372411820021054e-05, "loss": 0.4433, "step": 19710 }, { "epoch": 0.7105272642087432, "grad_norm": 0.15994971990585327, "learning_rate": 4.8371375955248644e-05, "loss": 0.444, "step": 19715 }, { "epoch": 0.710707463869968, "grad_norm": 0.1518271565437317, "learning_rate": 4.837033977204577e-05, "loss": 0.4278, "step": 19720 }, { "epoch": 
0.7108876635311926, "grad_norm": 0.18965473771095276, "learning_rate": 4.836930327042654e-05, "loss": 0.4386, "step": 19725 }, { "epoch": 0.7110678631924172, "grad_norm": 0.20504818856716156, "learning_rate": 4.8368266450405077e-05, "loss": 0.4627, "step": 19730 }, { "epoch": 0.7112480628536418, "grad_norm": 0.1648981124162674, "learning_rate": 4.83672293119955e-05, "loss": 0.4572, "step": 19735 }, { "epoch": 0.7114282625148665, "grad_norm": 0.2660510241985321, "learning_rate": 4.836619185521196e-05, "loss": 0.4748, "step": 19740 }, { "epoch": 0.7116084621760911, "grad_norm": 0.14745326340198517, "learning_rate": 4.836515408006857e-05, "loss": 0.4311, "step": 19745 }, { "epoch": 0.7117886618373157, "grad_norm": 0.13908466696739197, "learning_rate": 4.8364115986579485e-05, "loss": 0.436, "step": 19750 }, { "epoch": 0.7119688614985403, "grad_norm": 0.16212470829486847, "learning_rate": 4.8363077574758836e-05, "loss": 0.4103, "step": 19755 }, { "epoch": 0.7121490611597651, "grad_norm": 0.17893613874912262, "learning_rate": 4.836203884462078e-05, "loss": 0.4971, "step": 19760 }, { "epoch": 0.7123292608209897, "grad_norm": 0.18436241149902344, "learning_rate": 4.836099979617947e-05, "loss": 0.4828, "step": 19765 }, { "epoch": 0.7125094604822143, "grad_norm": 0.16563914716243744, "learning_rate": 4.835996042944907e-05, "loss": 0.4026, "step": 19770 }, { "epoch": 0.7126896601434389, "grad_norm": 0.1609710305929184, "learning_rate": 4.835892074444372e-05, "loss": 0.4159, "step": 19775 }, { "epoch": 0.7128698598046636, "grad_norm": 0.14441636204719543, "learning_rate": 4.8357880741177605e-05, "loss": 0.4171, "step": 19780 }, { "epoch": 0.7130500594658882, "grad_norm": 0.14850714802742004, "learning_rate": 4.835684041966488e-05, "loss": 0.4341, "step": 19785 }, { "epoch": 0.7132302591271128, "grad_norm": 0.17348629236221313, "learning_rate": 4.835579977991973e-05, "loss": 0.4328, "step": 19790 }, { "epoch": 0.7134104587883375, "grad_norm": 0.18963472545146942, 
"learning_rate": 4.8354758821956325e-05, "loss": 0.418, "step": 19795 }, { "epoch": 0.7135906584495622, "grad_norm": 0.15153972804546356, "learning_rate": 4.8353717545788855e-05, "loss": 0.4423, "step": 19800 }, { "epoch": 0.7137708581107868, "grad_norm": 0.17662297189235687, "learning_rate": 4.835267595143151e-05, "loss": 0.4625, "step": 19805 }, { "epoch": 0.7139510577720114, "grad_norm": 0.1881992071866989, "learning_rate": 4.8351842446858454e-05, "loss": 0.4305, "step": 19810 }, { "epoch": 0.714131257433236, "grad_norm": 0.1803097277879715, "learning_rate": 4.8350800279795086e-05, "loss": 0.4069, "step": 19815 }, { "epoch": 0.7143114570944606, "grad_norm": 0.14879196882247925, "learning_rate": 4.834975779458158e-05, "loss": 0.447, "step": 19820 }, { "epoch": 0.7144916567556853, "grad_norm": 0.1836402863264084, "learning_rate": 4.834871499123216e-05, "loss": 0.4518, "step": 19825 }, { "epoch": 0.7146718564169099, "grad_norm": 0.16930605471134186, "learning_rate": 4.8347671869761e-05, "loss": 0.4471, "step": 19830 }, { "epoch": 0.7148520560781346, "grad_norm": 0.14788581430912018, "learning_rate": 4.8346837143546e-05, "loss": 0.4639, "step": 19835 }, { "epoch": 0.7150322557393592, "grad_norm": 0.16612641513347626, "learning_rate": 4.834579344949157e-05, "loss": 0.4454, "step": 19840 }, { "epoch": 0.7152124554005839, "grad_norm": 0.17854836583137512, "learning_rate": 4.834474943735522e-05, "loss": 0.3823, "step": 19845 }, { "epoch": 0.7153926550618085, "grad_norm": 0.1602153182029724, "learning_rate": 4.834370510715118e-05, "loss": 0.4561, "step": 19850 }, { "epoch": 0.7155728547230331, "grad_norm": 0.1520301103591919, "learning_rate": 4.8342660458893677e-05, "loss": 0.4599, "step": 19855 }, { "epoch": 0.7157530543842577, "grad_norm": 0.16628147661685944, "learning_rate": 4.834161549259695e-05, "loss": 0.4178, "step": 19860 }, { "epoch": 0.7159332540454824, "grad_norm": 0.19706813991069794, "learning_rate": 4.8340570208275224e-05, "loss": 0.4677, "step": 19865 }, 
{ "epoch": 0.716113453706707, "grad_norm": 0.1859341561794281, "learning_rate": 4.833952460594275e-05, "loss": 0.4665, "step": 19870 }, { "epoch": 0.7162936533679317, "grad_norm": 0.16585981845855713, "learning_rate": 4.8338478685613775e-05, "loss": 0.4378, "step": 19875 }, { "epoch": 0.7164738530291563, "grad_norm": 0.15817083418369293, "learning_rate": 4.8337432447302544e-05, "loss": 0.453, "step": 19880 }, { "epoch": 0.716654052690381, "grad_norm": 0.1971365064382553, "learning_rate": 4.833638589102332e-05, "loss": 0.4405, "step": 19885 }, { "epoch": 0.7168342523516056, "grad_norm": 0.16757121682167053, "learning_rate": 4.8335339016790346e-05, "loss": 0.4392, "step": 19890 }, { "epoch": 0.7170144520128302, "grad_norm": 0.20326150953769684, "learning_rate": 4.8334291824617905e-05, "loss": 0.4335, "step": 19895 }, { "epoch": 0.7171946516740548, "grad_norm": 0.16170883178710938, "learning_rate": 4.8333244314520254e-05, "loss": 0.4238, "step": 19900 }, { "epoch": 0.7173748513352795, "grad_norm": 0.17413537204265594, "learning_rate": 4.833219648651166e-05, "loss": 0.479, "step": 19905 }, { "epoch": 0.7175550509965041, "grad_norm": 0.17065051198005676, "learning_rate": 4.8331148340606416e-05, "loss": 0.4408, "step": 19910 }, { "epoch": 0.7177352506577288, "grad_norm": 0.22111043334007263, "learning_rate": 4.833009987681878e-05, "loss": 0.4456, "step": 19915 }, { "epoch": 0.7179154503189534, "grad_norm": 0.14916540682315826, "learning_rate": 4.832905109516306e-05, "loss": 0.4503, "step": 19920 }, { "epoch": 0.718095649980178, "grad_norm": 0.16950784623622894, "learning_rate": 4.832800199565353e-05, "loss": 0.4433, "step": 19925 }, { "epoch": 0.7182758496414027, "grad_norm": 0.15380850434303284, "learning_rate": 4.8326952578304496e-05, "loss": 0.4298, "step": 19930 }, { "epoch": 0.7184560493026273, "grad_norm": 0.16172832250595093, "learning_rate": 4.832590284313024e-05, "loss": 0.4582, "step": 19935 }, { "epoch": 0.7186362489638519, "grad_norm": 0.17244853079319, 
"learning_rate": 4.832485279014508e-05, "loss": 0.471, "step": 19940 }, { "epoch": 0.7188164486250765, "grad_norm": 0.14485011994838715, "learning_rate": 4.832380241936332e-05, "loss": 0.4233, "step": 19945 }, { "epoch": 0.7189966482863013, "grad_norm": 0.1884828507900238, "learning_rate": 4.832275173079926e-05, "loss": 0.4718, "step": 19950 }, { "epoch": 0.7191768479475259, "grad_norm": 0.17681987583637238, "learning_rate": 4.832170072446723e-05, "loss": 0.4592, "step": 19955 }, { "epoch": 0.7193570476087505, "grad_norm": 0.1624554544687271, "learning_rate": 4.832064940038154e-05, "loss": 0.4429, "step": 19960 }, { "epoch": 0.7195372472699751, "grad_norm": 0.19783754646778107, "learning_rate": 4.831959775855651e-05, "loss": 0.4366, "step": 19965 }, { "epoch": 0.7197174469311998, "grad_norm": 0.15661893784999847, "learning_rate": 4.831854579900649e-05, "loss": 0.4336, "step": 19970 }, { "epoch": 0.7198976465924244, "grad_norm": 0.1397693157196045, "learning_rate": 4.831749352174578e-05, "loss": 0.427, "step": 19975 }, { "epoch": 0.720077846253649, "grad_norm": 0.175705686211586, "learning_rate": 4.831644092678875e-05, "loss": 0.4108, "step": 19980 }, { "epoch": 0.7202580459148736, "grad_norm": 0.1558927595615387, "learning_rate": 4.831538801414972e-05, "loss": 0.4495, "step": 19985 }, { "epoch": 0.7204382455760984, "grad_norm": 0.17159906029701233, "learning_rate": 4.831433478384304e-05, "loss": 0.4014, "step": 19990 }, { "epoch": 0.720618445237323, "grad_norm": 0.13353735208511353, "learning_rate": 4.831328123588307e-05, "loss": 0.3997, "step": 19995 }, { "epoch": 0.7207986448985476, "grad_norm": 0.1744467169046402, "learning_rate": 4.8312227370284155e-05, "loss": 0.4517, "step": 20000 }, { "epoch": 0.7207986448985476, "eval_loss": 0.46302729845046997, "eval_runtime": 3.5385, "eval_samples_per_second": 28.261, "eval_steps_per_second": 7.065, "step": 20000 }, { "epoch": 0.7209788445597722, "grad_norm": 0.14821864664554596, "learning_rate": 4.831117318706065e-05, 
"loss": 0.4504, "step": 20005 }, { "epoch": 0.7211590442209969, "grad_norm": 0.16435876488685608, "learning_rate": 4.8310118686226926e-05, "loss": 0.4398, "step": 20010 }, { "epoch": 0.7213392438822215, "grad_norm": 0.17198477685451508, "learning_rate": 4.830906386779735e-05, "loss": 0.4684, "step": 20015 }, { "epoch": 0.7215194435434461, "grad_norm": 0.1611550748348236, "learning_rate": 4.830800873178629e-05, "loss": 0.4502, "step": 20020 }, { "epoch": 0.7216996432046707, "grad_norm": 0.18277272582054138, "learning_rate": 4.830695327820812e-05, "loss": 0.4442, "step": 20025 }, { "epoch": 0.7218798428658955, "grad_norm": 0.21301540732383728, "learning_rate": 4.8305897507077236e-05, "loss": 0.4286, "step": 20030 }, { "epoch": 0.7220600425271201, "grad_norm": 0.19387184083461761, "learning_rate": 4.8304841418407995e-05, "loss": 0.4271, "step": 20035 }, { "epoch": 0.7222402421883447, "grad_norm": 0.18802416324615479, "learning_rate": 4.8303785012214814e-05, "loss": 0.4276, "step": 20040 }, { "epoch": 0.7224204418495693, "grad_norm": 0.18947114050388336, "learning_rate": 4.8302728288512064e-05, "loss": 0.4585, "step": 20045 }, { "epoch": 0.722600641510794, "grad_norm": 0.18048980832099915, "learning_rate": 4.8301671247314165e-05, "loss": 0.4562, "step": 20050 }, { "epoch": 0.7227808411720186, "grad_norm": 0.16718988120555878, "learning_rate": 4.830061388863549e-05, "loss": 0.4233, "step": 20055 }, { "epoch": 0.7229610408332432, "grad_norm": 0.17260010540485382, "learning_rate": 4.8299556212490474e-05, "loss": 0.3985, "step": 20060 }, { "epoch": 0.7231412404944678, "grad_norm": 0.1411719173192978, "learning_rate": 4.829849821889352e-05, "loss": 0.4143, "step": 20065 }, { "epoch": 0.7233214401556926, "grad_norm": 0.19284233450889587, "learning_rate": 4.829743990785903e-05, "loss": 0.4585, "step": 20070 }, { "epoch": 0.7235016398169172, "grad_norm": 0.13794685900211334, "learning_rate": 4.829638127940143e-05, "loss": 0.4115, "step": 20075 }, { "epoch": 0.7236818394781418, 
"grad_norm": 0.17087949812412262, "learning_rate": 4.8295322333535146e-05, "loss": 0.4631, "step": 20080 }, { "epoch": 0.7238620391393664, "grad_norm": 0.18118272721767426, "learning_rate": 4.829426307027461e-05, "loss": 0.4688, "step": 20085 }, { "epoch": 0.724042238800591, "grad_norm": 0.1501915603876114, "learning_rate": 4.829320348963425e-05, "loss": 0.4322, "step": 20090 }, { "epoch": 0.7242224384618157, "grad_norm": 0.1526506394147873, "learning_rate": 4.8292143591628494e-05, "loss": 0.4439, "step": 20095 }, { "epoch": 0.7244026381230403, "grad_norm": 0.16467134654521942, "learning_rate": 4.82910833762718e-05, "loss": 0.4476, "step": 20100 }, { "epoch": 0.7245828377842649, "grad_norm": 0.199048712849617, "learning_rate": 4.82900228435786e-05, "loss": 0.4174, "step": 20105 }, { "epoch": 0.7247630374454896, "grad_norm": 0.22329410910606384, "learning_rate": 4.828896199356335e-05, "loss": 0.4558, "step": 20110 }, { "epoch": 0.7249432371067143, "grad_norm": 0.15609164535999298, "learning_rate": 4.82879008262405e-05, "loss": 0.4342, "step": 20115 }, { "epoch": 0.7251234367679389, "grad_norm": 0.15675166249275208, "learning_rate": 4.8286839341624515e-05, "loss": 0.4524, "step": 20120 }, { "epoch": 0.7253036364291635, "grad_norm": 0.16207316517829895, "learning_rate": 4.828577753972984e-05, "loss": 0.4452, "step": 20125 }, { "epoch": 0.7254838360903881, "grad_norm": 0.15296733379364014, "learning_rate": 4.8284715420570964e-05, "loss": 0.3956, "step": 20130 }, { "epoch": 0.7256640357516128, "grad_norm": 0.16610366106033325, "learning_rate": 4.8283652984162345e-05, "loss": 0.4279, "step": 20135 }, { "epoch": 0.7258442354128374, "grad_norm": 0.1927533745765686, "learning_rate": 4.828259023051847e-05, "loss": 0.4534, "step": 20140 }, { "epoch": 0.7260244350740621, "grad_norm": 0.13150177896022797, "learning_rate": 4.82815271596538e-05, "loss": 0.4348, "step": 20145 }, { "epoch": 0.7262046347352867, "grad_norm": 0.15792779624462128, "learning_rate": 
4.8280463771582835e-05, "loss": 0.4647, "step": 20150 }, { "epoch": 0.7263848343965114, "grad_norm": 0.16151747107505798, "learning_rate": 4.8279400066320055e-05, "loss": 0.4718, "step": 20155 }, { "epoch": 0.726565034057736, "grad_norm": 0.14716650545597076, "learning_rate": 4.827833604387996e-05, "loss": 0.4125, "step": 20160 }, { "epoch": 0.7267452337189606, "grad_norm": 0.16372278332710266, "learning_rate": 4.827727170427704e-05, "loss": 0.4419, "step": 20165 }, { "epoch": 0.7269254333801852, "grad_norm": 0.15306620299816132, "learning_rate": 4.82762070475258e-05, "loss": 0.4368, "step": 20170 }, { "epoch": 0.7271056330414098, "grad_norm": 0.13666532933712006, "learning_rate": 4.827514207364075e-05, "loss": 0.4406, "step": 20175 }, { "epoch": 0.7272858327026345, "grad_norm": 0.19249454140663147, "learning_rate": 4.8274076782636393e-05, "loss": 0.4606, "step": 20180 }, { "epoch": 0.7274660323638592, "grad_norm": 0.1536799520254135, "learning_rate": 4.827301117452725e-05, "loss": 0.4276, "step": 20185 }, { "epoch": 0.7276462320250838, "grad_norm": 0.15559114515781403, "learning_rate": 4.8271945249327825e-05, "loss": 0.4438, "step": 20190 }, { "epoch": 0.7278264316863085, "grad_norm": 0.15832901000976562, "learning_rate": 4.827087900705266e-05, "loss": 0.4691, "step": 20195 }, { "epoch": 0.7280066313475331, "grad_norm": 0.20250186324119568, "learning_rate": 4.826981244771627e-05, "loss": 0.4298, "step": 20200 }, { "epoch": 0.7281868310087577, "grad_norm": 0.17052537202835083, "learning_rate": 4.826874557133319e-05, "loss": 0.4224, "step": 20205 }, { "epoch": 0.7283670306699823, "grad_norm": 0.15635935962200165, "learning_rate": 4.826767837791796e-05, "loss": 0.4561, "step": 20210 }, { "epoch": 0.7285472303312069, "grad_norm": 0.15289193391799927, "learning_rate": 4.826661086748512e-05, "loss": 0.4283, "step": 20215 }, { "epoch": 0.7287274299924316, "grad_norm": 0.16655376553535461, "learning_rate": 4.82655430400492e-05, "loss": 0.4535, "step": 20220 }, { "epoch": 
0.7289076296536563, "grad_norm": 0.18925489485263824, "learning_rate": 4.826447489562477e-05, "loss": 0.4096, "step": 20225 }, { "epoch": 0.7290878293148809, "grad_norm": 0.15604831278324127, "learning_rate": 4.826340643422637e-05, "loss": 0.4379, "step": 20230 }, { "epoch": 0.7292680289761055, "grad_norm": 0.16162985563278198, "learning_rate": 4.826233765586856e-05, "loss": 0.4017, "step": 20235 }, { "epoch": 0.7294482286373302, "grad_norm": 0.17815397679805756, "learning_rate": 4.826126856056591e-05, "loss": 0.4229, "step": 20240 }, { "epoch": 0.7296284282985548, "grad_norm": 0.17258299887180328, "learning_rate": 4.826019914833297e-05, "loss": 0.4471, "step": 20245 }, { "epoch": 0.7298086279597794, "grad_norm": 0.1543063521385193, "learning_rate": 4.8259129419184326e-05, "loss": 0.4246, "step": 20250 }, { "epoch": 0.729988827621004, "grad_norm": 0.2001371681690216, "learning_rate": 4.8258059373134546e-05, "loss": 0.437, "step": 20255 }, { "epoch": 0.7301690272822287, "grad_norm": 0.16070033609867096, "learning_rate": 4.8256989010198215e-05, "loss": 0.4449, "step": 20260 }, { "epoch": 0.7303492269434534, "grad_norm": 0.16648970544338226, "learning_rate": 4.8255918330389906e-05, "loss": 0.4399, "step": 20265 }, { "epoch": 0.730529426604678, "grad_norm": 0.2026122361421585, "learning_rate": 4.8254847333724204e-05, "loss": 0.446, "step": 20270 }, { "epoch": 0.7307096262659026, "grad_norm": 0.1755245178937912, "learning_rate": 4.8253776020215725e-05, "loss": 0.4188, "step": 20275 }, { "epoch": 0.7308898259271273, "grad_norm": 0.15573327243328094, "learning_rate": 4.825270438987904e-05, "loss": 0.4301, "step": 20280 }, { "epoch": 0.7310700255883519, "grad_norm": 0.17587211728096008, "learning_rate": 4.825163244272876e-05, "loss": 0.4094, "step": 20285 }, { "epoch": 0.7312502252495765, "grad_norm": 0.15777091681957245, "learning_rate": 4.825056017877949e-05, "loss": 0.446, "step": 20290 }, { "epoch": 0.7314304249108011, "grad_norm": 0.1694624423980713, "learning_rate": 
4.824948759804584e-05, "loss": 0.4262, "step": 20295 }, { "epoch": 0.7316106245720259, "grad_norm": 0.15097464621067047, "learning_rate": 4.824841470054242e-05, "loss": 0.4455, "step": 20300 }, { "epoch": 0.7317908242332505, "grad_norm": 0.1695692390203476, "learning_rate": 4.824734148628386e-05, "loss": 0.4678, "step": 20305 }, { "epoch": 0.7319710238944751, "grad_norm": 0.18447819352149963, "learning_rate": 4.824626795528476e-05, "loss": 0.4814, "step": 20310 }, { "epoch": 0.7321512235556997, "grad_norm": 0.15823550522327423, "learning_rate": 4.8245194107559774e-05, "loss": 0.4266, "step": 20315 }, { "epoch": 0.7323314232169243, "grad_norm": 0.17037831246852875, "learning_rate": 4.824411994312351e-05, "loss": 0.4264, "step": 20320 }, { "epoch": 0.732511622878149, "grad_norm": 0.17536459863185883, "learning_rate": 4.824304546199061e-05, "loss": 0.4311, "step": 20325 }, { "epoch": 0.7326918225393736, "grad_norm": 0.1618126928806305, "learning_rate": 4.824197066417572e-05, "loss": 0.4535, "step": 20330 }, { "epoch": 0.7328720222005982, "grad_norm": 0.19258461892604828, "learning_rate": 4.824089554969348e-05, "loss": 0.4267, "step": 20335 }, { "epoch": 0.733052221861823, "grad_norm": 0.19731229543685913, "learning_rate": 4.823982011855854e-05, "loss": 0.4058, "step": 20340 }, { "epoch": 0.7332324215230476, "grad_norm": 0.1781267374753952, "learning_rate": 4.8238744370785545e-05, "loss": 0.4357, "step": 20345 }, { "epoch": 0.7334126211842722, "grad_norm": 0.11069954186677933, "learning_rate": 4.823766830638916e-05, "loss": 0.3899, "step": 20350 }, { "epoch": 0.7335928208454968, "grad_norm": 0.19684797525405884, "learning_rate": 4.823659192538404e-05, "loss": 0.4537, "step": 20355 }, { "epoch": 0.7337730205067214, "grad_norm": 0.13966168463230133, "learning_rate": 4.8235515227784856e-05, "loss": 0.407, "step": 20360 }, { "epoch": 0.7339532201679461, "grad_norm": 0.17704619467258453, "learning_rate": 4.823443821360627e-05, "loss": 0.4499, "step": 20365 }, { "epoch": 
0.7341334198291707, "grad_norm": 0.16276319324970245, "learning_rate": 4.8233360882862965e-05, "loss": 0.4285, "step": 20370 }, { "epoch": 0.7343136194903953, "grad_norm": 0.1888384073972702, "learning_rate": 4.823228323556962e-05, "loss": 0.4563, "step": 20375 }, { "epoch": 0.73449381915162, "grad_norm": 0.17120252549648285, "learning_rate": 4.8231205271740916e-05, "loss": 0.4775, "step": 20380 }, { "epoch": 0.7346740188128447, "grad_norm": 0.1270987093448639, "learning_rate": 4.8230126991391534e-05, "loss": 0.4342, "step": 20385 }, { "epoch": 0.7348542184740693, "grad_norm": 0.16976775228977203, "learning_rate": 4.822904839453617e-05, "loss": 0.4159, "step": 20390 }, { "epoch": 0.7350344181352939, "grad_norm": 0.1889120191335678, "learning_rate": 4.822796948118952e-05, "loss": 0.4339, "step": 20395 }, { "epoch": 0.7352146177965185, "grad_norm": 0.18324951827526093, "learning_rate": 4.822689025136627e-05, "loss": 0.4625, "step": 20400 }, { "epoch": 0.7353948174577432, "grad_norm": 0.19597896933555603, "learning_rate": 4.822581070508115e-05, "loss": 0.4828, "step": 20405 }, { "epoch": 0.7355750171189678, "grad_norm": 0.20112694799900055, "learning_rate": 4.8224730842348856e-05, "loss": 0.4774, "step": 20410 }, { "epoch": 0.7357552167801924, "grad_norm": 0.16758687794208527, "learning_rate": 4.8223650663184094e-05, "loss": 0.4098, "step": 20415 }, { "epoch": 0.7359354164414171, "grad_norm": 0.16736017167568207, "learning_rate": 4.82225701676016e-05, "loss": 0.452, "step": 20420 }, { "epoch": 0.7361156161026418, "grad_norm": 0.20359498262405396, "learning_rate": 4.822148935561607e-05, "loss": 0.419, "step": 20425 }, { "epoch": 0.7362958157638664, "grad_norm": 0.17675939202308655, "learning_rate": 4.8220408227242255e-05, "loss": 0.4267, "step": 20430 }, { "epoch": 0.736476015425091, "grad_norm": 0.18879984319210052, "learning_rate": 4.821932678249487e-05, "loss": 0.4493, "step": 20435 }, { "epoch": 0.7366562150863156, "grad_norm": 0.17970100045204163, "learning_rate": 
4.821824502138864e-05, "loss": 0.4387, "step": 20440 }, { "epoch": 0.7368364147475402, "grad_norm": 0.16193163394927979, "learning_rate": 4.8217162943938333e-05, "loss": 0.418, "step": 20445 }, { "epoch": 0.7370166144087649, "grad_norm": 0.14100541174411774, "learning_rate": 4.821608055015867e-05, "loss": 0.4138, "step": 20450 }, { "epoch": 0.7371968140699896, "grad_norm": 0.16195495426654816, "learning_rate": 4.8214997840064404e-05, "loss": 0.4636, "step": 20455 }, { "epoch": 0.7373770137312142, "grad_norm": 0.15169329941272736, "learning_rate": 4.821391481367029e-05, "loss": 0.4275, "step": 20460 }, { "epoch": 0.7375572133924388, "grad_norm": 0.14933131635189056, "learning_rate": 4.8212831470991076e-05, "loss": 0.4403, "step": 20465 }, { "epoch": 0.7377374130536635, "grad_norm": 0.19773240387439728, "learning_rate": 4.821174781204153e-05, "loss": 0.4501, "step": 20470 }, { "epoch": 0.7379176127148881, "grad_norm": 0.18996679782867432, "learning_rate": 4.821066383683641e-05, "loss": 0.4403, "step": 20475 }, { "epoch": 0.7380978123761127, "grad_norm": 0.17520780861377716, "learning_rate": 4.82095795453905e-05, "loss": 0.4348, "step": 20480 }, { "epoch": 0.7382780120373373, "grad_norm": 0.22540989518165588, "learning_rate": 4.820849493771855e-05, "loss": 0.4144, "step": 20485 }, { "epoch": 0.738458211698562, "grad_norm": 0.1550588756799698, "learning_rate": 4.820741001383536e-05, "loss": 0.3918, "step": 20490 }, { "epoch": 0.7386384113597867, "grad_norm": 0.16206443309783936, "learning_rate": 4.82063247737557e-05, "loss": 0.4357, "step": 20495 }, { "epoch": 0.7388186110210113, "grad_norm": 0.20156899094581604, "learning_rate": 4.820523921749435e-05, "loss": 0.4141, "step": 20500 }, { "epoch": 0.7388186110210113, "eval_loss": 0.4630433917045593, "eval_runtime": 3.5384, "eval_samples_per_second": 28.261, "eval_steps_per_second": 7.065, "step": 20500 }, { "epoch": 0.7389988106822359, "grad_norm": 0.15963950753211975, "learning_rate": 4.820415334506611e-05, "loss": 
0.433, "step": 20505 }, { "epoch": 0.7391790103434606, "grad_norm": 0.14632678031921387, "learning_rate": 4.8203067156485775e-05, "loss": 0.401, "step": 20510 }, { "epoch": 0.7393592100046852, "grad_norm": 0.13777978718280792, "learning_rate": 4.820198065176814e-05, "loss": 0.4334, "step": 20515 }, { "epoch": 0.7395394096659098, "grad_norm": 0.1814904361963272, "learning_rate": 4.820089383092802e-05, "loss": 0.4499, "step": 20520 }, { "epoch": 0.7397196093271344, "grad_norm": 0.17688485980033875, "learning_rate": 4.819980669398021e-05, "loss": 0.4625, "step": 20525 }, { "epoch": 0.739899808988359, "grad_norm": 0.17957401275634766, "learning_rate": 4.819871924093951e-05, "loss": 0.4324, "step": 20530 }, { "epoch": 0.7400800086495838, "grad_norm": 0.17410768568515778, "learning_rate": 4.819763147182077e-05, "loss": 0.4289, "step": 20535 }, { "epoch": 0.7402602083108084, "grad_norm": 0.15775148570537567, "learning_rate": 4.819654338663879e-05, "loss": 0.423, "step": 20540 }, { "epoch": 0.740440407972033, "grad_norm": 0.2120039314031601, "learning_rate": 4.8195454985408394e-05, "loss": 0.4377, "step": 20545 }, { "epoch": 0.7406206076332577, "grad_norm": 0.15215463936328888, "learning_rate": 4.8194366268144415e-05, "loss": 0.4067, "step": 20550 }, { "epoch": 0.7408008072944823, "grad_norm": 0.14415352046489716, "learning_rate": 4.8193277234861686e-05, "loss": 0.4525, "step": 20555 }, { "epoch": 0.7409810069557069, "grad_norm": 0.18399272859096527, "learning_rate": 4.819218788557505e-05, "loss": 0.4216, "step": 20560 }, { "epoch": 0.7411612066169315, "grad_norm": 0.20752954483032227, "learning_rate": 4.819109822029933e-05, "loss": 0.4236, "step": 20565 }, { "epoch": 0.7413414062781561, "grad_norm": 0.1981305330991745, "learning_rate": 4.81900082390494e-05, "loss": 0.4087, "step": 20570 }, { "epoch": 0.7415216059393809, "grad_norm": 0.22662696242332458, "learning_rate": 4.818891794184009e-05, "loss": 0.4559, "step": 20575 }, { "epoch": 0.7417018056006055, "grad_norm": 
0.16101478040218353, "learning_rate": 4.818782732868627e-05, "loss": 0.4777, "step": 20580 }, { "epoch": 0.7418820052618301, "grad_norm": 0.1838495433330536, "learning_rate": 4.8186736399602784e-05, "loss": 0.4384, "step": 20585 }, { "epoch": 0.7420622049230547, "grad_norm": 0.13098390400409698, "learning_rate": 4.818564515460451e-05, "loss": 0.4176, "step": 20590 }, { "epoch": 0.7422424045842794, "grad_norm": 0.18415838479995728, "learning_rate": 4.818455359370631e-05, "loss": 0.4396, "step": 20595 }, { "epoch": 0.742422604245504, "grad_norm": 0.19082091748714447, "learning_rate": 4.818346171692305e-05, "loss": 0.4251, "step": 20600 }, { "epoch": 0.7426028039067286, "grad_norm": 0.1550193876028061, "learning_rate": 4.8182369524269616e-05, "loss": 0.4532, "step": 20605 }, { "epoch": 0.7427830035679532, "grad_norm": 0.18546633422374725, "learning_rate": 4.818127701576089e-05, "loss": 0.4659, "step": 20610 }, { "epoch": 0.742963203229178, "grad_norm": 0.1380166858434677, "learning_rate": 4.818018419141174e-05, "loss": 0.4328, "step": 20615 }, { "epoch": 0.7431434028904026, "grad_norm": 0.15850889682769775, "learning_rate": 4.817909105123708e-05, "loss": 0.4383, "step": 20620 }, { "epoch": 0.7433236025516272, "grad_norm": 0.14858169853687286, "learning_rate": 4.817799759525179e-05, "loss": 0.4636, "step": 20625 }, { "epoch": 0.7435038022128518, "grad_norm": 0.15085402131080627, "learning_rate": 4.8176903823470765e-05, "loss": 0.4968, "step": 20630 }, { "epoch": 0.7436840018740765, "grad_norm": 0.16185277700424194, "learning_rate": 4.817580973590892e-05, "loss": 0.4329, "step": 20635 }, { "epoch": 0.7438642015353011, "grad_norm": 0.13464045524597168, "learning_rate": 4.817471533258114e-05, "loss": 0.4278, "step": 20640 }, { "epoch": 0.7440444011965257, "grad_norm": 0.18652474880218506, "learning_rate": 4.817362061350237e-05, "loss": 0.4305, "step": 20645 }, { "epoch": 0.7442246008577504, "grad_norm": 0.17949911952018738, "learning_rate": 4.81725255786875e-05, "loss": 
0.4353, "step": 20650 }, { "epoch": 0.7444048005189751, "grad_norm": 0.1657877266407013, "learning_rate": 4.817143022815145e-05, "loss": 0.4464, "step": 20655 }, { "epoch": 0.7445850001801997, "grad_norm": 0.14313843846321106, "learning_rate": 4.817033456190915e-05, "loss": 0.4019, "step": 20660 }, { "epoch": 0.7447651998414243, "grad_norm": 0.2136252373456955, "learning_rate": 4.816923857997553e-05, "loss": 0.4717, "step": 20665 }, { "epoch": 0.7449453995026489, "grad_norm": 0.19905641674995422, "learning_rate": 4.816814228236551e-05, "loss": 0.4624, "step": 20670 }, { "epoch": 0.7451255991638736, "grad_norm": 0.18663300573825836, "learning_rate": 4.8167045669094044e-05, "loss": 0.4294, "step": 20675 }, { "epoch": 0.7453057988250982, "grad_norm": 0.1515270322561264, "learning_rate": 4.816594874017607e-05, "loss": 0.4471, "step": 20680 }, { "epoch": 0.7454859984863228, "grad_norm": 0.15805648267269135, "learning_rate": 4.8164851495626526e-05, "loss": 0.4288, "step": 20685 }, { "epoch": 0.7456661981475475, "grad_norm": 0.15768414735794067, "learning_rate": 4.816375393546037e-05, "loss": 0.4443, "step": 20690 }, { "epoch": 0.7458463978087722, "grad_norm": 0.15465152263641357, "learning_rate": 4.8162656059692545e-05, "loss": 0.4479, "step": 20695 }, { "epoch": 0.7460265974699968, "grad_norm": 0.17679835855960846, "learning_rate": 4.816155786833802e-05, "loss": 0.4345, "step": 20700 }, { "epoch": 0.7462067971312214, "grad_norm": 0.1588921993970871, "learning_rate": 4.816045936141175e-05, "loss": 0.4298, "step": 20705 }, { "epoch": 0.746386996792446, "grad_norm": 0.18818655610084534, "learning_rate": 4.815936053892871e-05, "loss": 0.4402, "step": 20710 }, { "epoch": 0.7465671964536706, "grad_norm": 0.11379419267177582, "learning_rate": 4.815826140090386e-05, "loss": 0.4041, "step": 20715 }, { "epoch": 0.7467473961148953, "grad_norm": 0.22116567194461823, "learning_rate": 4.815716194735218e-05, "loss": 0.4516, "step": 20720 }, { "epoch": 0.7469275957761199, "grad_norm": 
0.20635400712490082, "learning_rate": 4.8156062178288666e-05, "loss": 0.4372, "step": 20725 }, { "epoch": 0.7471077954373446, "grad_norm": 0.19008758664131165, "learning_rate": 4.815496209372827e-05, "loss": 0.4649, "step": 20730 }, { "epoch": 0.7472879950985692, "grad_norm": 0.1746254712343216, "learning_rate": 4.815386169368601e-05, "loss": 0.454, "step": 20735 }, { "epoch": 0.7474681947597939, "grad_norm": 0.16542162001132965, "learning_rate": 4.8152760978176864e-05, "loss": 0.4185, "step": 20740 }, { "epoch": 0.7476483944210185, "grad_norm": 0.17639301717281342, "learning_rate": 4.815165994721583e-05, "loss": 0.4053, "step": 20745 }, { "epoch": 0.7478285940822431, "grad_norm": 0.13610634207725525, "learning_rate": 4.8150558600817916e-05, "loss": 0.4214, "step": 20750 }, { "epoch": 0.7480087937434677, "grad_norm": 0.17135217785835266, "learning_rate": 4.814945693899812e-05, "loss": 0.4546, "step": 20755 }, { "epoch": 0.7481889934046924, "grad_norm": 0.18286027014255524, "learning_rate": 4.8148354961771457e-05, "loss": 0.4598, "step": 20760 }, { "epoch": 0.748369193065917, "grad_norm": 0.17390398681163788, "learning_rate": 4.814725266915294e-05, "loss": 0.4174, "step": 20765 }, { "epoch": 0.7485493927271417, "grad_norm": 0.18580150604248047, "learning_rate": 4.814615006115759e-05, "loss": 0.4566, "step": 20770 }, { "epoch": 0.7487295923883663, "grad_norm": 0.1448039710521698, "learning_rate": 4.814504713780041e-05, "loss": 0.4102, "step": 20775 }, { "epoch": 0.748909792049591, "grad_norm": 0.14498092234134674, "learning_rate": 4.814394389909647e-05, "loss": 0.4198, "step": 20780 }, { "epoch": 0.7490899917108156, "grad_norm": 0.14252233505249023, "learning_rate": 4.814284034506076e-05, "loss": 0.4443, "step": 20785 }, { "epoch": 0.7492701913720402, "grad_norm": 0.15442687273025513, "learning_rate": 4.8141736475708325e-05, "loss": 0.4487, "step": 20790 }, { "epoch": 0.7494503910332648, "grad_norm": 0.15849895775318146, "learning_rate": 4.814063229105422e-05, 
"loss": 0.418, "step": 20795 }, { "epoch": 0.7496305906944895, "grad_norm": 0.16628096997737885, "learning_rate": 4.813952779111348e-05, "loss": 0.4183, "step": 20800 }, { "epoch": 0.7498107903557142, "grad_norm": 0.16264590620994568, "learning_rate": 4.813842297590115e-05, "loss": 0.4466, "step": 20805 }, { "epoch": 0.7499909900169388, "grad_norm": 0.16959542036056519, "learning_rate": 4.813731784543229e-05, "loss": 0.4476, "step": 20810 }, { "epoch": 0.7501711896781634, "grad_norm": 0.14073093235492706, "learning_rate": 4.813621239972195e-05, "loss": 0.4246, "step": 20815 }, { "epoch": 0.750351389339388, "grad_norm": 0.21107293665409088, "learning_rate": 4.81351066387852e-05, "loss": 0.4279, "step": 20820 }, { "epoch": 0.7505315890006127, "grad_norm": 0.1864260733127594, "learning_rate": 4.813400056263709e-05, "loss": 0.4295, "step": 20825 }, { "epoch": 0.7507117886618373, "grad_norm": 0.13779747486114502, "learning_rate": 4.813289417129272e-05, "loss": 0.4166, "step": 20830 }, { "epoch": 0.7508919883230619, "grad_norm": 0.17728812992572784, "learning_rate": 4.813178746476713e-05, "loss": 0.4764, "step": 20835 }, { "epoch": 0.7510721879842865, "grad_norm": 0.1685779094696045, "learning_rate": 4.813068044307543e-05, "loss": 0.4457, "step": 20840 }, { "epoch": 0.7512523876455113, "grad_norm": 0.17559683322906494, "learning_rate": 4.812957310623267e-05, "loss": 0.4377, "step": 20845 }, { "epoch": 0.7514325873067359, "grad_norm": 0.18856866657733917, "learning_rate": 4.812846545425396e-05, "loss": 0.4306, "step": 20850 }, { "epoch": 0.7516127869679605, "grad_norm": 0.1679704487323761, "learning_rate": 4.812735748715439e-05, "loss": 0.4218, "step": 20855 }, { "epoch": 0.7517929866291851, "grad_norm": 0.20436346530914307, "learning_rate": 4.812624920494905e-05, "loss": 0.4475, "step": 20860 }, { "epoch": 0.7519731862904098, "grad_norm": 0.16926033794879913, "learning_rate": 4.812514060765304e-05, "loss": 0.4159, "step": 20865 }, { "epoch": 0.7521533859516344, 
"grad_norm": 0.18585766851902008, "learning_rate": 4.8124031695281465e-05, "loss": 0.4935, "step": 20870 }, { "epoch": 0.752333585612859, "grad_norm": 0.15015818178653717, "learning_rate": 4.812292246784944e-05, "loss": 0.4683, "step": 20875 }, { "epoch": 0.7525137852740836, "grad_norm": 0.18261437118053436, "learning_rate": 4.8121812925372074e-05, "loss": 0.4706, "step": 20880 }, { "epoch": 0.7526939849353084, "grad_norm": 0.15235644578933716, "learning_rate": 4.812070306786448e-05, "loss": 0.493, "step": 20885 }, { "epoch": 0.752874184596533, "grad_norm": 0.1405205875635147, "learning_rate": 4.811959289534178e-05, "loss": 0.4798, "step": 20890 }, { "epoch": 0.7530543842577576, "grad_norm": 0.16182038187980652, "learning_rate": 4.811848240781911e-05, "loss": 0.4287, "step": 20895 }, { "epoch": 0.7532345839189822, "grad_norm": 0.17305776476860046, "learning_rate": 4.811737160531159e-05, "loss": 0.4401, "step": 20900 }, { "epoch": 0.7534147835802069, "grad_norm": 0.1699703484773636, "learning_rate": 4.811626048783435e-05, "loss": 0.4514, "step": 20905 }, { "epoch": 0.7535949832414315, "grad_norm": 0.1501130610704422, "learning_rate": 4.8115149055402545e-05, "loss": 0.4463, "step": 20910 }, { "epoch": 0.7537751829026561, "grad_norm": 0.18883809447288513, "learning_rate": 4.811403730803131e-05, "loss": 0.4432, "step": 20915 }, { "epoch": 0.7539553825638807, "grad_norm": 0.1885019838809967, "learning_rate": 4.811292524573579e-05, "loss": 0.4145, "step": 20920 }, { "epoch": 0.7541355822251055, "grad_norm": 0.2025119662284851, "learning_rate": 4.811181286853113e-05, "loss": 0.4805, "step": 20925 }, { "epoch": 0.7543157818863301, "grad_norm": 0.22509577870368958, "learning_rate": 4.811070017643251e-05, "loss": 0.4674, "step": 20930 }, { "epoch": 0.7544959815475547, "grad_norm": 0.18206484615802765, "learning_rate": 4.810958716945507e-05, "loss": 0.4384, "step": 20935 }, { "epoch": 0.7546761812087793, "grad_norm": 0.1454673856496811, "learning_rate": 4.810847384761397e-05, 
"loss": 0.4224, "step": 20940 }, { "epoch": 0.754856380870004, "grad_norm": 0.15570025146007538, "learning_rate": 4.810736021092439e-05, "loss": 0.471, "step": 20945 }, { "epoch": 0.7550365805312286, "grad_norm": 0.18441855907440186, "learning_rate": 4.810624625940151e-05, "loss": 0.4365, "step": 20950 }, { "epoch": 0.7552167801924532, "grad_norm": 0.20488105714321136, "learning_rate": 4.810513199306049e-05, "loss": 0.458, "step": 20955 }, { "epoch": 0.7553969798536779, "grad_norm": 0.16301538050174713, "learning_rate": 4.8104017411916526e-05, "loss": 0.4031, "step": 20960 }, { "epoch": 0.7555771795149026, "grad_norm": 0.1542273312807083, "learning_rate": 4.810290251598479e-05, "loss": 0.4097, "step": 20965 }, { "epoch": 0.7557573791761272, "grad_norm": 0.1671588271856308, "learning_rate": 4.8101787305280485e-05, "loss": 0.3957, "step": 20970 }, { "epoch": 0.7559375788373518, "grad_norm": 0.1824352741241455, "learning_rate": 4.8100671779818795e-05, "loss": 0.4619, "step": 20975 }, { "epoch": 0.7561177784985764, "grad_norm": 0.18826597929000854, "learning_rate": 4.809955593961493e-05, "loss": 0.4472, "step": 20980 }, { "epoch": 0.756297978159801, "grad_norm": 0.14293283224105835, "learning_rate": 4.809843978468409e-05, "loss": 0.4693, "step": 20985 }, { "epoch": 0.7564781778210257, "grad_norm": 0.13552211225032806, "learning_rate": 4.809732331504148e-05, "loss": 0.39, "step": 20990 }, { "epoch": 0.7566583774822503, "grad_norm": 0.1683761477470398, "learning_rate": 4.8096206530702305e-05, "loss": 0.4059, "step": 20995 }, { "epoch": 0.756838577143475, "grad_norm": 0.17060311138629913, "learning_rate": 4.809508943168179e-05, "loss": 0.4352, "step": 21000 }, { "epoch": 0.756838577143475, "eval_loss": 0.462345689535141, "eval_runtime": 3.5382, "eval_samples_per_second": 28.263, "eval_steps_per_second": 7.066, "step": 21000 }, { "epoch": 0.7570187768046996, "grad_norm": 0.179361492395401, "learning_rate": 4.8093972017995155e-05, "loss": 0.4466, "step": 21005 }, { "epoch": 
0.7571989764659243, "grad_norm": 0.19304408133029938, "learning_rate": 4.8092854289657617e-05, "loss": 0.4786, "step": 21010 }, { "epoch": 0.7573791761271489, "grad_norm": 0.16757160425186157, "learning_rate": 4.809173624668442e-05, "loss": 0.4614, "step": 21015 }, { "epoch": 0.7575593757883735, "grad_norm": 0.16519469022750854, "learning_rate": 4.8090617889090786e-05, "loss": 0.4297, "step": 21020 }, { "epoch": 0.7577395754495981, "grad_norm": 0.1686154156923294, "learning_rate": 4.808949921689194e-05, "loss": 0.387, "step": 21025 }, { "epoch": 0.7579197751108228, "grad_norm": 0.1524842232465744, "learning_rate": 4.8088380230103145e-05, "loss": 0.4291, "step": 21030 }, { "epoch": 0.7580999747720474, "grad_norm": 0.18697072565555573, "learning_rate": 4.808726092873964e-05, "loss": 0.4262, "step": 21035 }, { "epoch": 0.7582801744332721, "grad_norm": 0.21223722398281097, "learning_rate": 4.808614131281668e-05, "loss": 0.4522, "step": 21040 }, { "epoch": 0.7584603740944967, "grad_norm": 0.16958850622177124, "learning_rate": 4.808502138234951e-05, "loss": 0.4667, "step": 21045 }, { "epoch": 0.7586405737557214, "grad_norm": 0.1471187025308609, "learning_rate": 4.808390113735339e-05, "loss": 0.4847, "step": 21050 }, { "epoch": 0.758820773416946, "grad_norm": 0.156825989484787, "learning_rate": 4.808278057784359e-05, "loss": 0.4648, "step": 21055 }, { "epoch": 0.7590009730781706, "grad_norm": 0.15920226275920868, "learning_rate": 4.808165970383538e-05, "loss": 0.4215, "step": 21060 }, { "epoch": 0.7591811727393952, "grad_norm": 0.1420614868402481, "learning_rate": 4.808053851534401e-05, "loss": 0.411, "step": 21065 }, { "epoch": 0.7593613724006198, "grad_norm": 0.21698161959648132, "learning_rate": 4.8079417012384786e-05, "loss": 0.4415, "step": 21070 }, { "epoch": 0.7595415720618445, "grad_norm": 0.18251734972000122, "learning_rate": 4.807829519497297e-05, "loss": 0.4644, "step": 21075 }, { "epoch": 0.7597217717230692, "grad_norm": 0.18656039237976074, "learning_rate": 
4.8077173063123843e-05, "loss": 0.456, "step": 21080 }, { "epoch": 0.7599019713842938, "grad_norm": 0.17298544943332672, "learning_rate": 4.807605061685271e-05, "loss": 0.4445, "step": 21085 }, { "epoch": 0.7600821710455185, "grad_norm": 0.1439000368118286, "learning_rate": 4.807492785617484e-05, "loss": 0.4311, "step": 21090 }, { "epoch": 0.7602623707067431, "grad_norm": 0.14586910605430603, "learning_rate": 4.807380478110556e-05, "loss": 0.4383, "step": 21095 }, { "epoch": 0.7604425703679677, "grad_norm": 0.18215352296829224, "learning_rate": 4.8072681391660153e-05, "loss": 0.4222, "step": 21100 }, { "epoch": 0.7606227700291923, "grad_norm": 0.1855933964252472, "learning_rate": 4.807155768785393e-05, "loss": 0.4237, "step": 21105 }, { "epoch": 0.7608029696904169, "grad_norm": 0.22114041447639465, "learning_rate": 4.80704336697022e-05, "loss": 0.4643, "step": 21110 }, { "epoch": 0.7609831693516416, "grad_norm": 0.1789657175540924, "learning_rate": 4.806930933722027e-05, "loss": 0.4599, "step": 21115 }, { "epoch": 0.7611633690128663, "grad_norm": 0.17131780087947845, "learning_rate": 4.806818469042348e-05, "loss": 0.4171, "step": 21120 }, { "epoch": 0.7613435686740909, "grad_norm": 0.20241133868694305, "learning_rate": 4.806705972932713e-05, "loss": 0.4611, "step": 21125 }, { "epoch": 0.7615237683353155, "grad_norm": 0.19503600895404816, "learning_rate": 4.806593445394656e-05, "loss": 0.4813, "step": 21130 }, { "epoch": 0.7617039679965402, "grad_norm": 0.1657884567975998, "learning_rate": 4.8064808864297094e-05, "loss": 0.4784, "step": 21135 }, { "epoch": 0.7618841676577648, "grad_norm": 0.18670715391635895, "learning_rate": 4.806368296039409e-05, "loss": 0.42, "step": 21140 }, { "epoch": 0.7620643673189894, "grad_norm": 0.16101418435573578, "learning_rate": 4.806255674225285e-05, "loss": 0.4258, "step": 21145 }, { "epoch": 0.762244566980214, "grad_norm": 0.16799142956733704, "learning_rate": 4.806143020988875e-05, "loss": 0.4105, "step": 21150 }, { "epoch": 
0.7624247666414388, "grad_norm": 0.15857410430908203, "learning_rate": 4.806030336331713e-05, "loss": 0.4251, "step": 21155 }, { "epoch": 0.7626049663026634, "grad_norm": 0.16540922224521637, "learning_rate": 4.8059176202553337e-05, "loss": 0.4593, "step": 21160 }, { "epoch": 0.762785165963888, "grad_norm": 0.1826573759317398, "learning_rate": 4.8058048727612724e-05, "loss": 0.4513, "step": 21165 }, { "epoch": 0.7629653656251126, "grad_norm": 0.1935107260942459, "learning_rate": 4.8056920938510675e-05, "loss": 0.4293, "step": 21170 }, { "epoch": 0.7631455652863373, "grad_norm": 0.18235011398792267, "learning_rate": 4.8055792835262536e-05, "loss": 0.4956, "step": 21175 }, { "epoch": 0.7633257649475619, "grad_norm": 0.168792262673378, "learning_rate": 4.8054664417883685e-05, "loss": 0.4623, "step": 21180 }, { "epoch": 0.7635059646087865, "grad_norm": 0.17810115218162537, "learning_rate": 4.8053535686389495e-05, "loss": 0.4265, "step": 21185 }, { "epoch": 0.7636861642700111, "grad_norm": 0.20032259821891785, "learning_rate": 4.805240664079534e-05, "loss": 0.4372, "step": 21190 }, { "epoch": 0.7638663639312359, "grad_norm": 0.147112175822258, "learning_rate": 4.805127728111662e-05, "loss": 0.4288, "step": 21195 }, { "epoch": 0.7640465635924605, "grad_norm": 0.1480092704296112, "learning_rate": 4.80501476073687e-05, "loss": 0.4137, "step": 21200 }, { "epoch": 0.7642267632536851, "grad_norm": 0.17761540412902832, "learning_rate": 4.8049017619566986e-05, "loss": 0.4727, "step": 21205 }, { "epoch": 0.7644069629149097, "grad_norm": 0.14854562282562256, "learning_rate": 4.8047887317726865e-05, "loss": 0.4549, "step": 21210 }, { "epoch": 0.7645871625761343, "grad_norm": 0.1848391890525818, "learning_rate": 4.804675670186374e-05, "loss": 0.4468, "step": 21215 }, { "epoch": 0.764767362237359, "grad_norm": 0.18001249432563782, "learning_rate": 4.804562577199302e-05, "loss": 0.4149, "step": 21220 }, { "epoch": 0.7649475618985836, "grad_norm": 0.16468872129917145, "learning_rate": 
4.804449452813011e-05, "loss": 0.4224, "step": 21225 }, { "epoch": 0.7651277615598082, "grad_norm": 0.17478923499584198, "learning_rate": 4.804336297029043e-05, "loss": 0.4528, "step": 21230 }, { "epoch": 0.765307961221033, "grad_norm": 0.15894725918769836, "learning_rate": 4.804223109848939e-05, "loss": 0.4523, "step": 21235 }, { "epoch": 0.7654881608822576, "grad_norm": 0.14751605689525604, "learning_rate": 4.80410989127424e-05, "loss": 0.4273, "step": 21240 }, { "epoch": 0.7656683605434822, "grad_norm": 0.2025453895330429, "learning_rate": 4.803996641306491e-05, "loss": 0.44, "step": 21245 }, { "epoch": 0.7658485602047068, "grad_norm": 0.14769452810287476, "learning_rate": 4.803883359947233e-05, "loss": 0.4416, "step": 21250 }, { "epoch": 0.7660287598659314, "grad_norm": 0.15946455299854279, "learning_rate": 4.803770047198011e-05, "loss": 0.4366, "step": 21255 }, { "epoch": 0.7662089595271561, "grad_norm": 0.1485140323638916, "learning_rate": 4.8036567030603676e-05, "loss": 0.4624, "step": 21260 }, { "epoch": 0.7663891591883807, "grad_norm": 0.15414132177829742, "learning_rate": 4.803543327535848e-05, "loss": 0.4321, "step": 21265 }, { "epoch": 0.7665693588496053, "grad_norm": 0.17065320909023285, "learning_rate": 4.803429920625996e-05, "loss": 0.4396, "step": 21270 }, { "epoch": 0.76674955851083, "grad_norm": 0.15544413030147552, "learning_rate": 4.803316482332358e-05, "loss": 0.4714, "step": 21275 }, { "epoch": 0.7669297581720547, "grad_norm": 0.22066541016101837, "learning_rate": 4.803203012656479e-05, "loss": 0.4651, "step": 21280 }, { "epoch": 0.7671099578332793, "grad_norm": 0.21565388143062592, "learning_rate": 4.803089511599904e-05, "loss": 0.4593, "step": 21285 }, { "epoch": 0.7672901574945039, "grad_norm": 0.15955856442451477, "learning_rate": 4.8029759791641804e-05, "loss": 0.4406, "step": 21290 }, { "epoch": 0.7674703571557285, "grad_norm": 0.19206349551677704, "learning_rate": 4.8028624153508555e-05, "loss": 0.4127, "step": 21295 }, { "epoch": 
0.7676505568169532, "grad_norm": 0.15551474690437317, "learning_rate": 4.8027488201614754e-05, "loss": 0.4212, "step": 21300 }, { "epoch": 0.7678307564781778, "grad_norm": 0.16938930749893188, "learning_rate": 4.802635193597589e-05, "loss": 0.4515, "step": 21305 }, { "epoch": 0.7680109561394025, "grad_norm": 0.16218577325344086, "learning_rate": 4.802521535660744e-05, "loss": 0.4228, "step": 21310 }, { "epoch": 0.7681911558006271, "grad_norm": 0.14813528954982758, "learning_rate": 4.802407846352488e-05, "loss": 0.4586, "step": 21315 }, { "epoch": 0.7683713554618518, "grad_norm": 0.17613081634044647, "learning_rate": 4.802294125674372e-05, "loss": 0.4574, "step": 21320 }, { "epoch": 0.7685515551230764, "grad_norm": 0.1591581255197525, "learning_rate": 4.8021803736279435e-05, "loss": 0.4451, "step": 21325 }, { "epoch": 0.768731754784301, "grad_norm": 0.16665023565292358, "learning_rate": 4.8020665902147535e-05, "loss": 0.4057, "step": 21330 }, { "epoch": 0.7689119544455256, "grad_norm": 0.17413721978664398, "learning_rate": 4.801952775436352e-05, "loss": 0.4655, "step": 21335 }, { "epoch": 0.7690921541067502, "grad_norm": 0.1840430349111557, "learning_rate": 4.8018389292942886e-05, "loss": 0.4568, "step": 21340 }, { "epoch": 0.7692723537679749, "grad_norm": 0.158147931098938, "learning_rate": 4.801725051790117e-05, "loss": 0.4459, "step": 21345 }, { "epoch": 0.7694525534291996, "grad_norm": 0.19126906991004944, "learning_rate": 4.801611142925386e-05, "loss": 0.4429, "step": 21350 }, { "epoch": 0.7696327530904242, "grad_norm": 0.22057095170021057, "learning_rate": 4.801497202701649e-05, "loss": 0.4369, "step": 21355 }, { "epoch": 0.7698129527516488, "grad_norm": 0.16785407066345215, "learning_rate": 4.8013832311204586e-05, "loss": 0.4594, "step": 21360 }, { "epoch": 0.7699931524128735, "grad_norm": 0.17427514493465424, "learning_rate": 4.801269228183367e-05, "loss": 0.4408, "step": 21365 }, { "epoch": 0.7701733520740981, "grad_norm": 0.14857777953147888, 
"learning_rate": 4.8011551938919283e-05, "loss": 0.4515, "step": 21370 }, { "epoch": 0.7703535517353227, "grad_norm": 0.19615520536899567, "learning_rate": 4.801041128247695e-05, "loss": 0.4395, "step": 21375 }, { "epoch": 0.7705337513965473, "grad_norm": 0.46812736988067627, "learning_rate": 4.800927031252222e-05, "loss": 0.3949, "step": 21380 }, { "epoch": 0.770713951057772, "grad_norm": 0.18027018010616302, "learning_rate": 4.800812902907063e-05, "loss": 0.446, "step": 21385 }, { "epoch": 0.7708941507189967, "grad_norm": 0.1679840236902237, "learning_rate": 4.800698743213774e-05, "loss": 0.4554, "step": 21390 }, { "epoch": 0.7710743503802213, "grad_norm": 0.19741559028625488, "learning_rate": 4.80058455217391e-05, "loss": 0.4695, "step": 21395 }, { "epoch": 0.7712545500414459, "grad_norm": 0.19206707179546356, "learning_rate": 4.800470329789027e-05, "loss": 0.4572, "step": 21400 }, { "epoch": 0.7714347497026706, "grad_norm": 0.17079663276672363, "learning_rate": 4.800356076060682e-05, "loss": 0.4186, "step": 21405 }, { "epoch": 0.7716149493638952, "grad_norm": 0.20327655971050262, "learning_rate": 4.800241790990429e-05, "loss": 0.4195, "step": 21410 }, { "epoch": 0.7717951490251198, "grad_norm": 0.1791381984949112, "learning_rate": 4.8001274745798286e-05, "loss": 0.457, "step": 21415 }, { "epoch": 0.7719753486863444, "grad_norm": 0.1447453647851944, "learning_rate": 4.800013126830437e-05, "loss": 0.4155, "step": 21420 }, { "epoch": 0.772155548347569, "grad_norm": 0.17693381011486053, "learning_rate": 4.799898747743811e-05, "loss": 0.3966, "step": 21425 }, { "epoch": 0.7723357480087938, "grad_norm": 0.2063083052635193, "learning_rate": 4.799784337321509e-05, "loss": 0.4712, "step": 21430 }, { "epoch": 0.7725159476700184, "grad_norm": 0.217716246843338, "learning_rate": 4.799669895565092e-05, "loss": 0.4261, "step": 21435 }, { "epoch": 0.772696147331243, "grad_norm": 0.13754284381866455, "learning_rate": 4.799555422476117e-05, "loss": 0.4144, "step": 21440 }, { 
"epoch": 0.7728763469924677, "grad_norm": 0.1717095524072647, "learning_rate": 4.799440918056145e-05, "loss": 0.4505, "step": 21445 }, { "epoch": 0.7730565466536923, "grad_norm": 0.16796359419822693, "learning_rate": 4.7993263823067355e-05, "loss": 0.4523, "step": 21450 }, { "epoch": 0.7732367463149169, "grad_norm": 0.1903134435415268, "learning_rate": 4.79921181522945e-05, "loss": 0.4197, "step": 21455 }, { "epoch": 0.7734169459761415, "grad_norm": 0.1834080070257187, "learning_rate": 4.799097216825847e-05, "loss": 0.4232, "step": 21460 }, { "epoch": 0.7735971456373663, "grad_norm": 0.17639869451522827, "learning_rate": 4.7989825870974904e-05, "loss": 0.4085, "step": 21465 }, { "epoch": 0.7737773452985909, "grad_norm": 0.18790148198604584, "learning_rate": 4.798867926045941e-05, "loss": 0.4522, "step": 21470 }, { "epoch": 0.7739575449598155, "grad_norm": 0.17099972069263458, "learning_rate": 4.798753233672762e-05, "loss": 0.4409, "step": 21475 }, { "epoch": 0.7741377446210401, "grad_norm": 0.15639808773994446, "learning_rate": 4.798638509979514e-05, "loss": 0.4576, "step": 21480 }, { "epoch": 0.7743179442822647, "grad_norm": 0.19810642302036285, "learning_rate": 4.7985237549677624e-05, "loss": 0.4336, "step": 21485 }, { "epoch": 0.7744981439434894, "grad_norm": 0.192165344953537, "learning_rate": 4.79840896863907e-05, "loss": 0.4481, "step": 21490 }, { "epoch": 0.774678343604714, "grad_norm": 0.17463374137878418, "learning_rate": 4.798294150994999e-05, "loss": 0.4444, "step": 21495 }, { "epoch": 0.7748585432659386, "grad_norm": 0.1958521455526352, "learning_rate": 4.798179302037116e-05, "loss": 0.4435, "step": 21500 }, { "epoch": 0.7748585432659386, "eval_loss": 0.4604308009147644, "eval_runtime": 3.5333, "eval_samples_per_second": 28.302, "eval_steps_per_second": 7.075, "step": 21500 }, { "epoch": 0.7750387429271633, "grad_norm": 0.18340334296226501, "learning_rate": 4.798064421766985e-05, "loss": 0.4139, "step": 21505 }, { "epoch": 0.775218942588388, 
"grad_norm": 0.18603767454624176, "learning_rate": 4.7979495101861705e-05, "loss": 0.418, "step": 21510 }, { "epoch": 0.7753991422496126, "grad_norm": 0.21015454828739166, "learning_rate": 4.7978345672962395e-05, "loss": 0.4884, "step": 21515 }, { "epoch": 0.7755793419108372, "grad_norm": 0.16696617007255554, "learning_rate": 4.797719593098757e-05, "loss": 0.4452, "step": 21520 }, { "epoch": 0.7757595415720618, "grad_norm": 0.19136182963848114, "learning_rate": 4.79760458759529e-05, "loss": 0.449, "step": 21525 }, { "epoch": 0.7759397412332865, "grad_norm": 0.14051967859268188, "learning_rate": 4.797489550787405e-05, "loss": 0.4396, "step": 21530 }, { "epoch": 0.7761199408945111, "grad_norm": 0.15521185100078583, "learning_rate": 4.7973744826766706e-05, "loss": 0.479, "step": 21535 }, { "epoch": 0.7763001405557357, "grad_norm": 0.1503034085035324, "learning_rate": 4.797259383264653e-05, "loss": 0.4187, "step": 21540 }, { "epoch": 0.7764803402169604, "grad_norm": 0.18658557534217834, "learning_rate": 4.7971442525529206e-05, "loss": 0.4604, "step": 21545 }, { "epoch": 0.7766605398781851, "grad_norm": 0.19798524677753448, "learning_rate": 4.797029090543044e-05, "loss": 0.447, "step": 21550 }, { "epoch": 0.7768407395394097, "grad_norm": 0.21234340965747833, "learning_rate": 4.796913897236589e-05, "loss": 0.4392, "step": 21555 }, { "epoch": 0.7770209392006343, "grad_norm": 0.16527117788791656, "learning_rate": 4.796798672635128e-05, "loss": 0.4516, "step": 21560 }, { "epoch": 0.7772011388618589, "grad_norm": 0.20785808563232422, "learning_rate": 4.7966834167402295e-05, "loss": 0.4954, "step": 21565 }, { "epoch": 0.7773813385230836, "grad_norm": 0.15618237853050232, "learning_rate": 4.7965681295534635e-05, "loss": 0.399, "step": 21570 }, { "epoch": 0.7775615381843082, "grad_norm": 0.1682867556810379, "learning_rate": 4.7964528110764026e-05, "loss": 0.4513, "step": 21575 }, { "epoch": 0.7777417378455328, "grad_norm": 0.1370386779308319, "learning_rate": 
4.796337461310616e-05, "loss": 0.4479, "step": 21580 }, { "epoch": 0.7779219375067575, "grad_norm": 0.41439640522003174, "learning_rate": 4.796222080257676e-05, "loss": 0.4443, "step": 21585 }, { "epoch": 0.7781021371679822, "grad_norm": 0.1332816630601883, "learning_rate": 4.7961066679191544e-05, "loss": 0.429, "step": 21590 }, { "epoch": 0.7782823368292068, "grad_norm": 0.20065148174762726, "learning_rate": 4.7959912242966245e-05, "loss": 0.4144, "step": 21595 }, { "epoch": 0.7784625364904314, "grad_norm": 0.18139199912548065, "learning_rate": 4.795875749391659e-05, "loss": 0.4245, "step": 21600 }, { "epoch": 0.778642736151656, "grad_norm": 0.1368846297264099, "learning_rate": 4.79576024320583e-05, "loss": 0.4475, "step": 21605 }, { "epoch": 0.7788229358128806, "grad_norm": 0.21975240111351013, "learning_rate": 4.7956447057407125e-05, "loss": 0.4563, "step": 21610 }, { "epoch": 0.7790031354741053, "grad_norm": 0.16079792380332947, "learning_rate": 4.795529136997881e-05, "loss": 0.4176, "step": 21615 }, { "epoch": 0.7791833351353299, "grad_norm": 0.16716989874839783, "learning_rate": 4.795413536978909e-05, "loss": 0.4083, "step": 21620 }, { "epoch": 0.7793635347965546, "grad_norm": 0.21081286668777466, "learning_rate": 4.795297905685372e-05, "loss": 0.4439, "step": 21625 }, { "epoch": 0.7795437344577792, "grad_norm": 0.17637106776237488, "learning_rate": 4.7951822431188455e-05, "loss": 0.4057, "step": 21630 }, { "epoch": 0.7797239341190039, "grad_norm": 0.1675841063261032, "learning_rate": 4.7950665492809047e-05, "loss": 0.467, "step": 21635 }, { "epoch": 0.7799041337802285, "grad_norm": 0.1935756951570511, "learning_rate": 4.794950824173127e-05, "loss": 0.3958, "step": 21640 }, { "epoch": 0.7800843334414531, "grad_norm": 0.18767641484737396, "learning_rate": 4.794835067797089e-05, "loss": 0.4528, "step": 21645 }, { "epoch": 0.7802645331026777, "grad_norm": 0.1749449372291565, "learning_rate": 4.794719280154367e-05, "loss": 0.4452, "step": 21650 }, { "epoch": 
0.7804447327639024, "grad_norm": 0.13821455836296082, "learning_rate": 4.794603461246539e-05, "loss": 0.4487, "step": 21655 }, { "epoch": 0.7806249324251271, "grad_norm": 0.15566319227218628, "learning_rate": 4.794487611075184e-05, "loss": 0.4393, "step": 21660 }, { "epoch": 0.7808051320863517, "grad_norm": 0.17537739872932434, "learning_rate": 4.794371729641878e-05, "loss": 0.4475, "step": 21665 }, { "epoch": 0.7809853317475763, "grad_norm": 0.1799578070640564, "learning_rate": 4.794255816948202e-05, "loss": 0.4405, "step": 21670 }, { "epoch": 0.781165531408801, "grad_norm": 0.1784065216779709, "learning_rate": 4.794139872995736e-05, "loss": 0.4588, "step": 21675 }, { "epoch": 0.7813457310700256, "grad_norm": 0.18928910791873932, "learning_rate": 4.7940238977860563e-05, "loss": 0.4349, "step": 21680 }, { "epoch": 0.7815259307312502, "grad_norm": 0.15001779794692993, "learning_rate": 4.793907891320746e-05, "loss": 0.3882, "step": 21685 }, { "epoch": 0.7817061303924748, "grad_norm": 0.21316440403461456, "learning_rate": 4.793791853601385e-05, "loss": 0.4141, "step": 21690 }, { "epoch": 0.7818863300536995, "grad_norm": 0.1485886126756668, "learning_rate": 4.793675784629554e-05, "loss": 0.444, "step": 21695 }, { "epoch": 0.7820665297149242, "grad_norm": 0.17598822712898254, "learning_rate": 4.7935596844068343e-05, "loss": 0.4348, "step": 21700 }, { "epoch": 0.7822467293761488, "grad_norm": 0.1468224823474884, "learning_rate": 4.793443552934808e-05, "loss": 0.4099, "step": 21705 }, { "epoch": 0.7824269290373734, "grad_norm": 0.21252159774303436, "learning_rate": 4.793327390215058e-05, "loss": 0.4587, "step": 21710 }, { "epoch": 0.782607128698598, "grad_norm": 0.1317516267299652, "learning_rate": 4.7932111962491654e-05, "loss": 0.4466, "step": 21715 }, { "epoch": 0.7827873283598227, "grad_norm": 0.17863036692142487, "learning_rate": 4.7930949710387145e-05, "loss": 0.436, "step": 21720 }, { "epoch": 0.7829675280210473, "grad_norm": 0.1505396068096161, "learning_rate": 
4.792978714585289e-05, "loss": 0.4427, "step": 21725 }, { "epoch": 0.7831477276822719, "grad_norm": 0.17207299172878265, "learning_rate": 4.7928624268904724e-05, "loss": 0.3839, "step": 21730 }, { "epoch": 0.7833279273434965, "grad_norm": 0.1621370017528534, "learning_rate": 4.7927461079558476e-05, "loss": 0.4446, "step": 21735 }, { "epoch": 0.7835081270047213, "grad_norm": 0.1606713980436325, "learning_rate": 4.792629757783003e-05, "loss": 0.4394, "step": 21740 }, { "epoch": 0.7836883266659459, "grad_norm": 0.15808479487895966, "learning_rate": 4.792513376373521e-05, "loss": 0.4333, "step": 21745 }, { "epoch": 0.7838685263271705, "grad_norm": 0.19757013022899628, "learning_rate": 4.7923969637289875e-05, "loss": 0.4498, "step": 21750 }, { "epoch": 0.7840487259883951, "grad_norm": 0.1744808852672577, "learning_rate": 4.7922805198509905e-05, "loss": 0.4234, "step": 21755 }, { "epoch": 0.7842289256496198, "grad_norm": 0.14034539461135864, "learning_rate": 4.7921640447411146e-05, "loss": 0.428, "step": 21760 }, { "epoch": 0.7844091253108444, "grad_norm": 0.12732501327991486, "learning_rate": 4.792047538400947e-05, "loss": 0.415, "step": 21765 }, { "epoch": 0.784589324972069, "grad_norm": 0.13818371295928955, "learning_rate": 4.791931000832076e-05, "loss": 0.4057, "step": 21770 }, { "epoch": 0.7847695246332936, "grad_norm": 0.2030201405286789, "learning_rate": 4.791814432036088e-05, "loss": 0.4127, "step": 21775 }, { "epoch": 0.7849497242945184, "grad_norm": 0.13716308772563934, "learning_rate": 4.791697832014573e-05, "loss": 0.4666, "step": 21780 }, { "epoch": 0.785129923955743, "grad_norm": 0.21387408673763275, "learning_rate": 4.791581200769118e-05, "loss": 0.4192, "step": 21785 }, { "epoch": 0.7853101236169676, "grad_norm": 0.21067731082439423, "learning_rate": 4.7914645383013134e-05, "loss": 0.4534, "step": 21790 }, { "epoch": 0.7854903232781922, "grad_norm": 0.18593548238277435, "learning_rate": 4.791347844612748e-05, "loss": 0.4512, "step": 21795 }, { "epoch": 
0.7856705229394169, "grad_norm": 0.18970215320587158, "learning_rate": 4.7912311197050115e-05, "loss": 0.4188, "step": 21800 }, { "epoch": 0.7858507226006415, "grad_norm": 0.15977688133716583, "learning_rate": 4.791114363579695e-05, "loss": 0.4136, "step": 21805 }, { "epoch": 0.7860309222618661, "grad_norm": 0.17266367375850677, "learning_rate": 4.790997576238389e-05, "loss": 0.4518, "step": 21810 }, { "epoch": 0.7862111219230908, "grad_norm": 0.20080499351024628, "learning_rate": 4.790880757682684e-05, "loss": 0.427, "step": 21815 }, { "epoch": 0.7863913215843155, "grad_norm": 0.18386758863925934, "learning_rate": 4.790763907914172e-05, "loss": 0.4013, "step": 21820 }, { "epoch": 0.7865715212455401, "grad_norm": 0.17184041440486908, "learning_rate": 4.790647026934446e-05, "loss": 0.4037, "step": 21825 }, { "epoch": 0.7867517209067647, "grad_norm": 0.15970267355442047, "learning_rate": 4.790530114745097e-05, "loss": 0.4333, "step": 21830 }, { "epoch": 0.7869319205679893, "grad_norm": 0.17542044818401337, "learning_rate": 4.7904131713477196e-05, "loss": 0.4588, "step": 21835 }, { "epoch": 0.787112120229214, "grad_norm": 0.16546791791915894, "learning_rate": 4.790296196743905e-05, "loss": 0.4421, "step": 21840 }, { "epoch": 0.7872923198904386, "grad_norm": 0.1802026927471161, "learning_rate": 4.790179190935249e-05, "loss": 0.4335, "step": 21845 }, { "epoch": 0.7874725195516632, "grad_norm": 0.19843994081020355, "learning_rate": 4.790062153923345e-05, "loss": 0.4394, "step": 21850 }, { "epoch": 0.7876527192128879, "grad_norm": 0.1890823394060135, "learning_rate": 4.7899450857097875e-05, "loss": 0.4763, "step": 21855 }, { "epoch": 0.7878329188741126, "grad_norm": 0.15386788547039032, "learning_rate": 4.789827986296172e-05, "loss": 0.4183, "step": 21860 }, { "epoch": 0.7880131185353372, "grad_norm": 0.17523469030857086, "learning_rate": 4.789710855684092e-05, "loss": 0.4289, "step": 21865 }, { "epoch": 0.7881933181965618, "grad_norm": 0.1903219223022461, 
"learning_rate": 4.789593693875146e-05, "loss": 0.4049, "step": 21870 }, { "epoch": 0.7883735178577864, "grad_norm": 0.21866121888160706, "learning_rate": 4.7894765008709286e-05, "loss": 0.4868, "step": 21875 }, { "epoch": 0.788553717519011, "grad_norm": 0.13972169160842896, "learning_rate": 4.789359276673038e-05, "loss": 0.4237, "step": 21880 }, { "epoch": 0.7887339171802357, "grad_norm": 0.18218587338924408, "learning_rate": 4.789242021283069e-05, "loss": 0.4031, "step": 21885 }, { "epoch": 0.7889141168414603, "grad_norm": 0.1976306140422821, "learning_rate": 4.789124734702622e-05, "loss": 0.4757, "step": 21890 }, { "epoch": 0.789094316502685, "grad_norm": 0.15920376777648926, "learning_rate": 4.789007416933293e-05, "loss": 0.4281, "step": 21895 }, { "epoch": 0.7892745161639096, "grad_norm": 0.1940070241689682, "learning_rate": 4.788890067976682e-05, "loss": 0.4296, "step": 21900 }, { "epoch": 0.7894547158251343, "grad_norm": 0.1800207644701004, "learning_rate": 4.788772687834386e-05, "loss": 0.4521, "step": 21905 }, { "epoch": 0.7896349154863589, "grad_norm": 0.166525736451149, "learning_rate": 4.7886552765080055e-05, "loss": 0.4653, "step": 21910 }, { "epoch": 0.7898151151475835, "grad_norm": 0.19525845348834991, "learning_rate": 4.78853783399914e-05, "loss": 0.4835, "step": 21915 }, { "epoch": 0.7899953148088081, "grad_norm": 0.19068776071071625, "learning_rate": 4.78842036030939e-05, "loss": 0.427, "step": 21920 }, { "epoch": 0.7901755144700328, "grad_norm": 0.1702924221754074, "learning_rate": 4.7883028554403554e-05, "loss": 0.4455, "step": 21925 }, { "epoch": 0.7903557141312574, "grad_norm": 0.14877758920192719, "learning_rate": 4.788185319393637e-05, "loss": 0.4352, "step": 21930 }, { "epoch": 0.7905359137924821, "grad_norm": 0.1357695311307907, "learning_rate": 4.788067752170837e-05, "loss": 0.4631, "step": 21935 }, { "epoch": 0.7907161134537067, "grad_norm": 0.194273442029953, "learning_rate": 4.787950153773557e-05, "loss": 0.4193, "step": 21940 }, { 
"epoch": 0.7908963131149314, "grad_norm": 0.15530972182750702, "learning_rate": 4.7878325242033987e-05, "loss": 0.4188, "step": 21945 }, { "epoch": 0.791076512776156, "grad_norm": 0.14824257791042328, "learning_rate": 4.7877148634619657e-05, "loss": 0.4546, "step": 21950 }, { "epoch": 0.7912567124373806, "grad_norm": 0.19110989570617676, "learning_rate": 4.7875971715508606e-05, "loss": 0.426, "step": 21955 }, { "epoch": 0.7914369120986052, "grad_norm": 0.1911250203847885, "learning_rate": 4.787479448471686e-05, "loss": 0.4591, "step": 21960 }, { "epoch": 0.7916171117598299, "grad_norm": 0.16899849474430084, "learning_rate": 4.787361694226048e-05, "loss": 0.4502, "step": 21965 }, { "epoch": 0.7917973114210546, "grad_norm": 0.15524151921272278, "learning_rate": 4.787243908815548e-05, "loss": 0.4257, "step": 21970 }, { "epoch": 0.7919775110822792, "grad_norm": 0.16246455907821655, "learning_rate": 4.787126092241795e-05, "loss": 0.4214, "step": 21975 }, { "epoch": 0.7921577107435038, "grad_norm": 0.1904502511024475, "learning_rate": 4.78700824450639e-05, "loss": 0.4643, "step": 21980 }, { "epoch": 0.7923379104047285, "grad_norm": 0.17854608595371246, "learning_rate": 4.786890365610941e-05, "loss": 0.4244, "step": 21985 }, { "epoch": 0.7925181100659531, "grad_norm": 0.1559765338897705, "learning_rate": 4.786772455557054e-05, "loss": 0.4281, "step": 21990 }, { "epoch": 0.7926983097271777, "grad_norm": 0.16417820751667023, "learning_rate": 4.786654514346335e-05, "loss": 0.4106, "step": 21995 }, { "epoch": 0.7928785093884023, "grad_norm": 0.20214970409870148, "learning_rate": 4.7865365419803896e-05, "loss": 0.4704, "step": 22000 }, { "epoch": 0.7928785093884023, "eval_loss": 0.4604956805706024, "eval_runtime": 3.5345, "eval_samples_per_second": 28.293, "eval_steps_per_second": 7.073, "step": 22000 }, { "epoch": 0.7930587090496269, "grad_norm": 0.14078626036643982, "learning_rate": 4.786418538460828e-05, "loss": 0.4562, "step": 22005 }, { "epoch": 0.7932389087108517, 
"grad_norm": 0.15193304419517517, "learning_rate": 4.7863005037892554e-05, "loss": 0.482, "step": 22010 }, { "epoch": 0.7934191083720763, "grad_norm": 0.1598198413848877, "learning_rate": 4.786182437967282e-05, "loss": 0.4144, "step": 22015 }, { "epoch": 0.7935993080333009, "grad_norm": 0.1988956481218338, "learning_rate": 4.786064340996515e-05, "loss": 0.4673, "step": 22020 }, { "epoch": 0.7937795076945255, "grad_norm": 0.19150066375732422, "learning_rate": 4.7859462128785635e-05, "loss": 0.4278, "step": 22025 }, { "epoch": 0.7939597073557502, "grad_norm": 0.18495355546474457, "learning_rate": 4.785828053615038e-05, "loss": 0.4411, "step": 22030 }, { "epoch": 0.7941399070169748, "grad_norm": 0.1614546775817871, "learning_rate": 4.785709863207548e-05, "loss": 0.3946, "step": 22035 }, { "epoch": 0.7943201066781994, "grad_norm": 0.1417195051908493, "learning_rate": 4.785591641657704e-05, "loss": 0.4045, "step": 22040 }, { "epoch": 0.794500306339424, "grad_norm": 0.16670504212379456, "learning_rate": 4.7854733889671154e-05, "loss": 0.4343, "step": 22045 }, { "epoch": 0.7946805060006488, "grad_norm": 0.17932060360908508, "learning_rate": 4.785355105137395e-05, "loss": 0.4841, "step": 22050 }, { "epoch": 0.7948607056618734, "grad_norm": 0.1873452365398407, "learning_rate": 4.785236790170153e-05, "loss": 0.432, "step": 22055 }, { "epoch": 0.795040905323098, "grad_norm": 0.16498543322086334, "learning_rate": 4.7851184440670026e-05, "loss": 0.4354, "step": 22060 }, { "epoch": 0.7952211049843226, "grad_norm": 0.15727977454662323, "learning_rate": 4.785000066829556e-05, "loss": 0.4322, "step": 22065 }, { "epoch": 0.7954013046455473, "grad_norm": 0.17471718788146973, "learning_rate": 4.784881658459426e-05, "loss": 0.422, "step": 22070 }, { "epoch": 0.7955815043067719, "grad_norm": 0.19372014701366425, "learning_rate": 4.784763218958226e-05, "loss": 0.4249, "step": 22075 }, { "epoch": 0.7957617039679965, "grad_norm": 0.1373492181301117, "learning_rate": 4.784644748327568e-05, 
"loss": 0.4094, "step": 22080 }, { "epoch": 0.7959419036292211, "grad_norm": 0.16347211599349976, "learning_rate": 4.7845262465690695e-05, "loss": 0.4199, "step": 22085 }, { "epoch": 0.7961221032904459, "grad_norm": 0.16035209596157074, "learning_rate": 4.7844077136843426e-05, "loss": 0.442, "step": 22090 }, { "epoch": 0.7963023029516705, "grad_norm": 0.19094820320606232, "learning_rate": 4.784289149675002e-05, "loss": 0.4381, "step": 22095 }, { "epoch": 0.7964825026128951, "grad_norm": 0.17128083109855652, "learning_rate": 4.784170554542665e-05, "loss": 0.4571, "step": 22100 }, { "epoch": 0.7966627022741197, "grad_norm": 0.16196797788143158, "learning_rate": 4.784051928288946e-05, "loss": 0.4337, "step": 22105 }, { "epoch": 0.7968429019353443, "grad_norm": 0.16744011640548706, "learning_rate": 4.7839332709154613e-05, "loss": 0.4011, "step": 22110 }, { "epoch": 0.797023101596569, "grad_norm": 0.1657259315252304, "learning_rate": 4.783814582423829e-05, "loss": 0.4232, "step": 22115 }, { "epoch": 0.7972033012577936, "grad_norm": 0.17152729630470276, "learning_rate": 4.783695862815664e-05, "loss": 0.4448, "step": 22120 }, { "epoch": 0.7973835009190182, "grad_norm": 0.17332147061824799, "learning_rate": 4.783577112092585e-05, "loss": 0.4088, "step": 22125 }, { "epoch": 0.797563700580243, "grad_norm": 0.1492212414741516, "learning_rate": 4.783458330256211e-05, "loss": 0.4846, "step": 22130 }, { "epoch": 0.7977439002414676, "grad_norm": 0.21426644921302795, "learning_rate": 4.783339517308159e-05, "loss": 0.3948, "step": 22135 }, { "epoch": 0.7979240999026922, "grad_norm": 0.18977008759975433, "learning_rate": 4.783220673250048e-05, "loss": 0.4355, "step": 22140 }, { "epoch": 0.7981042995639168, "grad_norm": 0.1697508692741394, "learning_rate": 4.783101798083498e-05, "loss": 0.3963, "step": 22145 }, { "epoch": 0.7982844992251414, "grad_norm": 0.1844054013490677, "learning_rate": 4.782982891810127e-05, "loss": 0.4502, "step": 22150 }, { "epoch": 0.7984646988863661, 
"grad_norm": 0.19432871043682098, "learning_rate": 4.782863954431557e-05, "loss": 0.4602, "step": 22155 }, { "epoch": 0.7986448985475907, "grad_norm": 0.16126291453838348, "learning_rate": 4.7827449859494065e-05, "loss": 0.4409, "step": 22160 }, { "epoch": 0.7988250982088154, "grad_norm": 0.17771978676319122, "learning_rate": 4.7826259863652975e-05, "loss": 0.4657, "step": 22165 }, { "epoch": 0.79900529787004, "grad_norm": 0.18326117098331451, "learning_rate": 4.7825069556808525e-05, "loss": 0.4722, "step": 22170 }, { "epoch": 0.7991854975312647, "grad_norm": 0.14180731773376465, "learning_rate": 4.782387893897692e-05, "loss": 0.4298, "step": 22175 }, { "epoch": 0.7993656971924893, "grad_norm": 0.1825062483549118, "learning_rate": 4.7822688010174376e-05, "loss": 0.4577, "step": 22180 }, { "epoch": 0.7995458968537139, "grad_norm": 0.19557367265224457, "learning_rate": 4.782149677041713e-05, "loss": 0.4641, "step": 22185 }, { "epoch": 0.7997260965149385, "grad_norm": 0.17543824017047882, "learning_rate": 4.782030521972141e-05, "loss": 0.4703, "step": 22190 }, { "epoch": 0.7999062961761632, "grad_norm": 0.15802043676376343, "learning_rate": 4.781911335810345e-05, "loss": 0.4311, "step": 22195 }, { "epoch": 0.8000864958373878, "grad_norm": 0.14436450600624084, "learning_rate": 4.781792118557948e-05, "loss": 0.4534, "step": 22200 }, { "epoch": 0.8002666954986125, "grad_norm": 0.14026832580566406, "learning_rate": 4.7816728702165765e-05, "loss": 0.3809, "step": 22205 }, { "epoch": 0.8004468951598371, "grad_norm": 0.1877908557653427, "learning_rate": 4.781553590787853e-05, "loss": 0.4493, "step": 22210 }, { "epoch": 0.8006270948210618, "grad_norm": 0.18697671592235565, "learning_rate": 4.7814342802734034e-05, "loss": 0.4576, "step": 22215 }, { "epoch": 0.8008072944822864, "grad_norm": 0.1507420688867569, "learning_rate": 4.781314938674855e-05, "loss": 0.472, "step": 22220 }, { "epoch": 0.800987494143511, "grad_norm": 0.11861146241426468, "learning_rate": 
4.78119556599383e-05, "loss": 0.4527, "step": 22225 }, { "epoch": 0.8011676938047356, "grad_norm": 0.24812163412570953, "learning_rate": 4.781076162231959e-05, "loss": 0.4276, "step": 22230 }, { "epoch": 0.8013478934659602, "grad_norm": 0.13593071699142456, "learning_rate": 4.7809567273908656e-05, "loss": 0.4523, "step": 22235 }, { "epoch": 0.8015280931271849, "grad_norm": 0.16076035797595978, "learning_rate": 4.7808372614721786e-05, "loss": 0.401, "step": 22240 }, { "epoch": 0.8017082927884096, "grad_norm": 0.2315647453069687, "learning_rate": 4.780717764477526e-05, "loss": 0.3886, "step": 22245 }, { "epoch": 0.8018884924496342, "grad_norm": 0.15930241346359253, "learning_rate": 4.780598236408535e-05, "loss": 0.4124, "step": 22250 }, { "epoch": 0.8020686921108588, "grad_norm": 0.18420235812664032, "learning_rate": 4.780478677266835e-05, "loss": 0.4208, "step": 22255 }, { "epoch": 0.8022488917720835, "grad_norm": 0.17045623064041138, "learning_rate": 4.780359087054054e-05, "loss": 0.4138, "step": 22260 }, { "epoch": 0.8024290914333081, "grad_norm": 0.165228009223938, "learning_rate": 4.780239465771822e-05, "loss": 0.4348, "step": 22265 }, { "epoch": 0.8026092910945327, "grad_norm": 0.20021255314350128, "learning_rate": 4.78011981342177e-05, "loss": 0.4712, "step": 22270 }, { "epoch": 0.8027894907557573, "grad_norm": 0.18207870423793793, "learning_rate": 4.7800001300055254e-05, "loss": 0.4717, "step": 22275 }, { "epoch": 0.802969690416982, "grad_norm": 0.16366621851921082, "learning_rate": 4.7798804155247205e-05, "loss": 0.4682, "step": 22280 }, { "epoch": 0.8031498900782067, "grad_norm": 0.15080103278160095, "learning_rate": 4.7797606699809874e-05, "loss": 0.409, "step": 22285 }, { "epoch": 0.8033300897394313, "grad_norm": 0.1815640777349472, "learning_rate": 4.779640893375956e-05, "loss": 0.456, "step": 22290 }, { "epoch": 0.8035102894006559, "grad_norm": 0.17141170799732208, "learning_rate": 4.7795210857112585e-05, "loss": 0.4179, "step": 22295 }, { "epoch": 
0.8036904890618806, "grad_norm": 0.1825985610485077, "learning_rate": 4.7794012469885276e-05, "loss": 0.4053, "step": 22300 }, { "epoch": 0.8038706887231052, "grad_norm": 0.17858922481536865, "learning_rate": 4.779281377209396e-05, "loss": 0.4419, "step": 22305 }, { "epoch": 0.8040508883843298, "grad_norm": 0.18266747891902924, "learning_rate": 4.779161476375497e-05, "loss": 0.4313, "step": 22310 }, { "epoch": 0.8042310880455544, "grad_norm": 0.15619586408138275, "learning_rate": 4.7790415444884645e-05, "loss": 0.4456, "step": 22315 }, { "epoch": 0.8044112877067792, "grad_norm": 0.14940191805362701, "learning_rate": 4.778921581549932e-05, "loss": 0.4155, "step": 22320 }, { "epoch": 0.8045914873680038, "grad_norm": 0.18177048861980438, "learning_rate": 4.778801587561535e-05, "loss": 0.4538, "step": 22325 }, { "epoch": 0.8047716870292284, "grad_norm": 0.19796302914619446, "learning_rate": 4.778681562524906e-05, "loss": 0.4435, "step": 22330 }, { "epoch": 0.804951886690453, "grad_norm": 0.17838706076145172, "learning_rate": 4.778561506441682e-05, "loss": 0.4309, "step": 22335 }, { "epoch": 0.8051320863516777, "grad_norm": 0.14537879824638367, "learning_rate": 4.7784414193135e-05, "loss": 0.4494, "step": 22340 }, { "epoch": 0.8053122860129023, "grad_norm": 0.2153496891260147, "learning_rate": 4.778321301141994e-05, "loss": 0.4376, "step": 22345 }, { "epoch": 0.8054924856741269, "grad_norm": 0.17818701267242432, "learning_rate": 4.7782011519288e-05, "loss": 0.4245, "step": 22350 }, { "epoch": 0.8056726853353515, "grad_norm": 0.15808773040771484, "learning_rate": 4.778080971675558e-05, "loss": 0.42, "step": 22355 }, { "epoch": 0.8058528849965763, "grad_norm": 0.16724058985710144, "learning_rate": 4.777960760383904e-05, "loss": 0.4454, "step": 22360 }, { "epoch": 0.8060330846578009, "grad_norm": 0.14002850651741028, "learning_rate": 4.777840518055475e-05, "loss": 0.4447, "step": 22365 }, { "epoch": 0.8062132843190255, "grad_norm": 0.1951669454574585, "learning_rate": 
4.77772024469191e-05, "loss": 0.4589, "step": 22370 }, { "epoch": 0.8063934839802501, "grad_norm": 0.17650310695171356, "learning_rate": 4.7775999402948476e-05, "loss": 0.3997, "step": 22375 }, { "epoch": 0.8065736836414747, "grad_norm": 0.17684021592140198, "learning_rate": 4.7774796048659276e-05, "loss": 0.4246, "step": 22380 }, { "epoch": 0.8067538833026994, "grad_norm": 0.18748098611831665, "learning_rate": 4.7773592384067884e-05, "loss": 0.4371, "step": 22385 }, { "epoch": 0.806934082963924, "grad_norm": 0.19001294672489166, "learning_rate": 4.7772388409190704e-05, "loss": 0.4281, "step": 22390 }, { "epoch": 0.8071142826251486, "grad_norm": 0.1857748180627823, "learning_rate": 4.7771184124044144e-05, "loss": 0.4078, "step": 22395 }, { "epoch": 0.8072944822863733, "grad_norm": 0.1612103283405304, "learning_rate": 4.776997952864461e-05, "loss": 0.4418, "step": 22400 }, { "epoch": 0.807474681947598, "grad_norm": 0.1528964787721634, "learning_rate": 4.7768774623008506e-05, "loss": 0.4683, "step": 22405 }, { "epoch": 0.8076548816088226, "grad_norm": 0.15798471868038177, "learning_rate": 4.776756940715226e-05, "loss": 0.4182, "step": 22410 }, { "epoch": 0.8078350812700472, "grad_norm": 0.16511400043964386, "learning_rate": 4.7766363881092294e-05, "loss": 0.4249, "step": 22415 }, { "epoch": 0.8080152809312718, "grad_norm": 0.17201100289821625, "learning_rate": 4.776515804484502e-05, "loss": 0.4506, "step": 22420 }, { "epoch": 0.8081954805924965, "grad_norm": 0.2050049602985382, "learning_rate": 4.776395189842688e-05, "loss": 0.4595, "step": 22425 }, { "epoch": 0.8083756802537211, "grad_norm": 0.16215232014656067, "learning_rate": 4.7762745441854296e-05, "loss": 0.4377, "step": 22430 }, { "epoch": 0.8085558799149457, "grad_norm": 0.1810922622680664, "learning_rate": 4.776153867514372e-05, "loss": 0.4346, "step": 22435 }, { "epoch": 0.8087360795761704, "grad_norm": 0.1510697603225708, "learning_rate": 4.7760331598311584e-05, "loss": 0.4279, "step": 22440 }, { "epoch": 
0.8089162792373951, "grad_norm": 0.21630224585533142, "learning_rate": 4.7759124211374335e-05, "loss": 0.4654, "step": 22445 }, { "epoch": 0.8090964788986197, "grad_norm": 0.1879904568195343, "learning_rate": 4.775791651434843e-05, "loss": 0.3944, "step": 22450 }, { "epoch": 0.8092766785598443, "grad_norm": 0.20264166593551636, "learning_rate": 4.7756708507250314e-05, "loss": 0.4777, "step": 22455 }, { "epoch": 0.8094568782210689, "grad_norm": 0.1782803237438202, "learning_rate": 4.775550019009645e-05, "loss": 0.4346, "step": 22460 }, { "epoch": 0.8096370778822936, "grad_norm": 0.16567277908325195, "learning_rate": 4.77542915629033e-05, "loss": 0.4615, "step": 22465 }, { "epoch": 0.8098172775435182, "grad_norm": 0.17552456259727478, "learning_rate": 4.7753082625687334e-05, "loss": 0.4801, "step": 22470 }, { "epoch": 0.8099974772047429, "grad_norm": 0.21018646657466888, "learning_rate": 4.7751873378465026e-05, "loss": 0.4294, "step": 22475 }, { "epoch": 0.8101776768659675, "grad_norm": 0.16765153408050537, "learning_rate": 4.7750663821252844e-05, "loss": 0.4349, "step": 22480 }, { "epoch": 0.8103578765271922, "grad_norm": 0.18359611928462982, "learning_rate": 4.7749453954067275e-05, "loss": 0.4012, "step": 22485 }, { "epoch": 0.8105380761884168, "grad_norm": 0.19209226965904236, "learning_rate": 4.774824377692479e-05, "loss": 0.4422, "step": 22490 }, { "epoch": 0.8107182758496414, "grad_norm": 0.18595997989177704, "learning_rate": 4.77470332898419e-05, "loss": 0.4376, "step": 22495 }, { "epoch": 0.810898475510866, "grad_norm": 0.12643958628177643, "learning_rate": 4.774582249283509e-05, "loss": 0.3845, "step": 22500 }, { "epoch": 0.810898475510866, "eval_loss": 0.46091198921203613, "eval_runtime": 3.5434, "eval_samples_per_second": 28.221, "eval_steps_per_second": 7.055, "step": 22500 }, { "epoch": 0.8110786751720906, "grad_norm": 0.18900783360004425, "learning_rate": 4.774461138592085e-05, "loss": 0.4837, "step": 22505 }, { "epoch": 0.8112588748333153, "grad_norm": 
0.1918240487575531, "learning_rate": 4.774339996911567e-05, "loss": 0.4452, "step": 22510 }, { "epoch": 0.81143907449454, "grad_norm": 0.18408475816249847, "learning_rate": 4.7742188242436075e-05, "loss": 0.4227, "step": 22515 }, { "epoch": 0.8116192741557646, "grad_norm": 0.1483888477087021, "learning_rate": 4.774097620589857e-05, "loss": 0.465, "step": 22520 }, { "epoch": 0.8117994738169892, "grad_norm": 0.1650637537240982, "learning_rate": 4.773976385951967e-05, "loss": 0.4325, "step": 22525 }, { "epoch": 0.8119796734782139, "grad_norm": 0.19324971735477448, "learning_rate": 4.773855120331588e-05, "loss": 0.4326, "step": 22530 }, { "epoch": 0.8121598731394385, "grad_norm": 0.16592490673065186, "learning_rate": 4.773733823730374e-05, "loss": 0.4163, "step": 22535 }, { "epoch": 0.8123400728006631, "grad_norm": 0.17151491343975067, "learning_rate": 4.773612496149977e-05, "loss": 0.4414, "step": 22540 }, { "epoch": 0.8125202724618877, "grad_norm": 0.1465132236480713, "learning_rate": 4.77349113759205e-05, "loss": 0.4609, "step": 22545 }, { "epoch": 0.8127004721231124, "grad_norm": 0.1558566689491272, "learning_rate": 4.7733697480582464e-05, "loss": 0.4365, "step": 22550 }, { "epoch": 0.8128806717843371, "grad_norm": 0.16560520231723785, "learning_rate": 4.7732483275502194e-05, "loss": 0.438, "step": 22555 }, { "epoch": 0.8130608714455617, "grad_norm": 0.14548031985759735, "learning_rate": 4.773126876069625e-05, "loss": 0.4234, "step": 22560 }, { "epoch": 0.8132410711067863, "grad_norm": 0.16290007531642914, "learning_rate": 4.773005393618116e-05, "loss": 0.407, "step": 22565 }, { "epoch": 0.813421270768011, "grad_norm": 0.16611815989017487, "learning_rate": 4.7728838801973485e-05, "loss": 0.3838, "step": 22570 }, { "epoch": 0.8136014704292356, "grad_norm": 0.1640740931034088, "learning_rate": 4.772762335808979e-05, "loss": 0.4413, "step": 22575 }, { "epoch": 0.8137816700904602, "grad_norm": 0.1402037888765335, "learning_rate": 4.772640760454663e-05, "loss": 0.4131, 
"step": 22580 }, { "epoch": 0.8139618697516848, "grad_norm": 0.15909187495708466, "learning_rate": 4.772519154136056e-05, "loss": 0.4276, "step": 22585 }, { "epoch": 0.8141420694129095, "grad_norm": 0.16949698328971863, "learning_rate": 4.772397516854815e-05, "loss": 0.4531, "step": 22590 }, { "epoch": 0.8143222690741342, "grad_norm": 0.19324524700641632, "learning_rate": 4.7722758486125986e-05, "loss": 0.4291, "step": 22595 }, { "epoch": 0.8145024687353588, "grad_norm": 0.17862281203269958, "learning_rate": 4.772154149411063e-05, "loss": 0.469, "step": 22600 }, { "epoch": 0.8146826683965834, "grad_norm": 0.19284473359584808, "learning_rate": 4.772032419251868e-05, "loss": 0.4235, "step": 22605 }, { "epoch": 0.814862868057808, "grad_norm": 0.14753711223602295, "learning_rate": 4.77191065813667e-05, "loss": 0.4528, "step": 22610 }, { "epoch": 0.8150430677190327, "grad_norm": 0.1892748773097992, "learning_rate": 4.7717888660671306e-05, "loss": 0.4374, "step": 22615 }, { "epoch": 0.8152232673802573, "grad_norm": 0.15627089142799377, "learning_rate": 4.771667043044906e-05, "loss": 0.4639, "step": 22620 }, { "epoch": 0.8154034670414819, "grad_norm": 0.1614178568124771, "learning_rate": 4.771545189071659e-05, "loss": 0.449, "step": 22625 }, { "epoch": 0.8155836667027065, "grad_norm": 0.18917955458164215, "learning_rate": 4.771423304149049e-05, "loss": 0.4695, "step": 22630 }, { "epoch": 0.8157638663639313, "grad_norm": 0.17164795100688934, "learning_rate": 4.771301388278735e-05, "loss": 0.4405, "step": 22635 }, { "epoch": 0.8159440660251559, "grad_norm": 0.18476401269435883, "learning_rate": 4.7711794414623796e-05, "loss": 0.4584, "step": 22640 }, { "epoch": 0.8161242656863805, "grad_norm": 0.1459241509437561, "learning_rate": 4.771057463701644e-05, "loss": 0.4258, "step": 22645 }, { "epoch": 0.8163044653476051, "grad_norm": 0.16828380525112152, "learning_rate": 4.77093545499819e-05, "loss": 0.4332, "step": 22650 }, { "epoch": 0.8164846650088298, "grad_norm": 
0.14585858583450317, "learning_rate": 4.770813415353681e-05, "loss": 0.4004, "step": 22655 }, { "epoch": 0.8166648646700544, "grad_norm": 0.16096548736095428, "learning_rate": 4.7706913447697785e-05, "loss": 0.4693, "step": 22660 }, { "epoch": 0.816845064331279, "grad_norm": 0.19337119162082672, "learning_rate": 4.7705692432481455e-05, "loss": 0.4197, "step": 22665 }, { "epoch": 0.8170252639925037, "grad_norm": 0.1468539834022522, "learning_rate": 4.770447110790447e-05, "loss": 0.4135, "step": 22670 }, { "epoch": 0.8172054636537284, "grad_norm": 0.13654807209968567, "learning_rate": 4.770324947398346e-05, "loss": 0.4459, "step": 22675 }, { "epoch": 0.817385663314953, "grad_norm": 0.16397765278816223, "learning_rate": 4.770202753073506e-05, "loss": 0.4188, "step": 22680 }, { "epoch": 0.8175658629761776, "grad_norm": 0.20366117358207703, "learning_rate": 4.770080527817594e-05, "loss": 0.4387, "step": 22685 }, { "epoch": 0.8177460626374022, "grad_norm": 0.1833733767271042, "learning_rate": 4.7699582716322743e-05, "loss": 0.4509, "step": 22690 }, { "epoch": 0.8179262622986269, "grad_norm": 0.18486203253269196, "learning_rate": 4.7698359845192126e-05, "loss": 0.4042, "step": 22695 }, { "epoch": 0.8181064619598515, "grad_norm": 0.17110316455364227, "learning_rate": 4.769713666480075e-05, "loss": 0.406, "step": 22700 }, { "epoch": 0.8182866616210761, "grad_norm": 0.15279895067214966, "learning_rate": 4.769591317516528e-05, "loss": 0.4436, "step": 22705 }, { "epoch": 0.8184668612823008, "grad_norm": 0.15406733751296997, "learning_rate": 4.769468937630239e-05, "loss": 0.4464, "step": 22710 }, { "epoch": 0.8186470609435255, "grad_norm": 0.16378438472747803, "learning_rate": 4.769346526822874e-05, "loss": 0.4166, "step": 22715 }, { "epoch": 0.8188272606047501, "grad_norm": 0.16179268062114716, "learning_rate": 4.769224085096103e-05, "loss": 0.3881, "step": 22720 }, { "epoch": 0.8190074602659747, "grad_norm": 0.18712985515594482, "learning_rate": 4.769101612451593e-05, "loss": 
0.4326, "step": 22725 }, { "epoch": 0.8191876599271993, "grad_norm": 0.1865232139825821, "learning_rate": 4.768979108891013e-05, "loss": 0.4499, "step": 22730 }, { "epoch": 0.819367859588424, "grad_norm": 0.1458199918270111, "learning_rate": 4.7688565744160315e-05, "loss": 0.4193, "step": 22735 }, { "epoch": 0.8195480592496486, "grad_norm": 0.17637084424495697, "learning_rate": 4.768734009028319e-05, "loss": 0.4239, "step": 22740 }, { "epoch": 0.8197282589108732, "grad_norm": 0.14654532074928284, "learning_rate": 4.768611412729545e-05, "loss": 0.4284, "step": 22745 }, { "epoch": 0.8199084585720979, "grad_norm": 0.19782017171382904, "learning_rate": 4.768488785521379e-05, "loss": 0.4591, "step": 22750 }, { "epoch": 0.8200886582333226, "grad_norm": 0.126174658536911, "learning_rate": 4.768366127405493e-05, "loss": 0.4302, "step": 22755 }, { "epoch": 0.8202688578945472, "grad_norm": 0.17687900364398956, "learning_rate": 4.768243438383557e-05, "loss": 0.446, "step": 22760 }, { "epoch": 0.8204490575557718, "grad_norm": 0.17083390057086945, "learning_rate": 4.768120718457244e-05, "loss": 0.4627, "step": 22765 }, { "epoch": 0.8206292572169964, "grad_norm": 0.16993774473667145, "learning_rate": 4.767997967628225e-05, "loss": 0.4264, "step": 22770 }, { "epoch": 0.820809456878221, "grad_norm": 0.19042931497097015, "learning_rate": 4.767875185898173e-05, "loss": 0.4299, "step": 22775 }, { "epoch": 0.8209896565394457, "grad_norm": 0.19879989326000214, "learning_rate": 4.767752373268761e-05, "loss": 0.4124, "step": 22780 }, { "epoch": 0.8211698562006703, "grad_norm": 0.17709743976593018, "learning_rate": 4.7676295297416615e-05, "loss": 0.4415, "step": 22785 }, { "epoch": 0.821350055861895, "grad_norm": 0.1532343327999115, "learning_rate": 4.767506655318549e-05, "loss": 0.3988, "step": 22790 }, { "epoch": 0.8215302555231196, "grad_norm": 0.20457062125205994, "learning_rate": 4.767383750001097e-05, "loss": 0.4313, "step": 22795 }, { "epoch": 0.8217104551843443, "grad_norm": 
0.1625969558954239, "learning_rate": 4.76726081379098e-05, "loss": 0.4156, "step": 22800 }, { "epoch": 0.8218906548455689, "grad_norm": 0.17059491574764252, "learning_rate": 4.7671378466898735e-05, "loss": 0.4303, "step": 22805 }, { "epoch": 0.8220708545067935, "grad_norm": 0.18344855308532715, "learning_rate": 4.767014848699453e-05, "loss": 0.4356, "step": 22810 }, { "epoch": 0.8222510541680181, "grad_norm": 0.21067775785923004, "learning_rate": 4.766891819821394e-05, "loss": 0.4577, "step": 22815 }, { "epoch": 0.8224312538292428, "grad_norm": 0.1624084860086441, "learning_rate": 4.766768760057374e-05, "loss": 0.441, "step": 22820 }, { "epoch": 0.8226114534904675, "grad_norm": 0.16547471284866333, "learning_rate": 4.766645669409067e-05, "loss": 0.4524, "step": 22825 }, { "epoch": 0.8227916531516921, "grad_norm": 0.15320567786693573, "learning_rate": 4.766522547878152e-05, "loss": 0.3913, "step": 22830 }, { "epoch": 0.8229718528129167, "grad_norm": 0.19396758079528809, "learning_rate": 4.766399395466307e-05, "loss": 0.4019, "step": 22835 }, { "epoch": 0.8231520524741414, "grad_norm": 0.16974592208862305, "learning_rate": 4.766276212175207e-05, "loss": 0.4173, "step": 22840 }, { "epoch": 0.823332252135366, "grad_norm": 0.17741717398166656, "learning_rate": 4.766152998006534e-05, "loss": 0.4183, "step": 22845 }, { "epoch": 0.8235124517965906, "grad_norm": 0.20168620347976685, "learning_rate": 4.766029752961965e-05, "loss": 0.4499, "step": 22850 }, { "epoch": 0.8236926514578152, "grad_norm": 0.1789730191230774, "learning_rate": 4.765906477043179e-05, "loss": 0.4358, "step": 22855 }, { "epoch": 0.8238728511190399, "grad_norm": 0.1608288586139679, "learning_rate": 4.765783170251856e-05, "loss": 0.4312, "step": 22860 }, { "epoch": 0.8240530507802646, "grad_norm": 0.16104714572429657, "learning_rate": 4.7656598325896755e-05, "loss": 0.4676, "step": 22865 }, { "epoch": 0.8242332504414892, "grad_norm": 0.17927196621894836, "learning_rate": 4.765536464058319e-05, "loss": 
0.4367, "step": 22870 }, { "epoch": 0.8244134501027138, "grad_norm": 0.1429985910654068, "learning_rate": 4.7654130646594666e-05, "loss": 0.4697, "step": 22875 }, { "epoch": 0.8245936497639385, "grad_norm": 0.1574939787387848, "learning_rate": 4.7652896343948e-05, "loss": 0.4621, "step": 22880 }, { "epoch": 0.8247738494251631, "grad_norm": 0.15647585690021515, "learning_rate": 4.765166173266001e-05, "loss": 0.4169, "step": 22885 }, { "epoch": 0.8249540490863877, "grad_norm": 0.15062108635902405, "learning_rate": 4.7650426812747505e-05, "loss": 0.4242, "step": 22890 }, { "epoch": 0.8251342487476123, "grad_norm": 0.15938465297222137, "learning_rate": 4.764919158422733e-05, "loss": 0.44, "step": 22895 }, { "epoch": 0.8253144484088369, "grad_norm": 0.19385161995887756, "learning_rate": 4.76479560471163e-05, "loss": 0.4743, "step": 22900 }, { "epoch": 0.8254946480700617, "grad_norm": 0.1362161636352539, "learning_rate": 4.764672020143125e-05, "loss": 0.417, "step": 22905 }, { "epoch": 0.8256748477312863, "grad_norm": 0.1929071843624115, "learning_rate": 4.7645484047189025e-05, "loss": 0.4504, "step": 22910 }, { "epoch": 0.8258550473925109, "grad_norm": 0.17340517044067383, "learning_rate": 4.764424758440647e-05, "loss": 0.4313, "step": 22915 }, { "epoch": 0.8260352470537355, "grad_norm": 0.13786520063877106, "learning_rate": 4.764301081310042e-05, "loss": 0.4284, "step": 22920 }, { "epoch": 0.8262154467149602, "grad_norm": 0.22200287878513336, "learning_rate": 4.764177373328773e-05, "loss": 0.4638, "step": 22925 }, { "epoch": 0.8263956463761848, "grad_norm": 0.1627381592988968, "learning_rate": 4.764053634498526e-05, "loss": 0.473, "step": 22930 }, { "epoch": 0.8265758460374094, "grad_norm": 0.18759234249591827, "learning_rate": 4.763929864820986e-05, "loss": 0.4233, "step": 22935 }, { "epoch": 0.826756045698634, "grad_norm": 0.16739268600940704, "learning_rate": 4.7638060642978405e-05, "loss": 0.4639, "step": 22940 }, { "epoch": 0.8269362453598588, "grad_norm": 
0.13698381185531616, "learning_rate": 4.763682232930776e-05, "loss": 0.4376, "step": 22945 }, { "epoch": 0.8271164450210834, "grad_norm": 0.16585668921470642, "learning_rate": 4.763558370721478e-05, "loss": 0.3934, "step": 22950 }, { "epoch": 0.827296644682308, "grad_norm": 0.1562475860118866, "learning_rate": 4.7634344776716364e-05, "loss": 0.4305, "step": 22955 }, { "epoch": 0.8274768443435326, "grad_norm": 0.18153679370880127, "learning_rate": 4.763310553782938e-05, "loss": 0.4395, "step": 22960 }, { "epoch": 0.8276570440047573, "grad_norm": 0.21862560510635376, "learning_rate": 4.7631865990570715e-05, "loss": 0.4454, "step": 22965 }, { "epoch": 0.8278372436659819, "grad_norm": 0.16682131588459015, "learning_rate": 4.763062613495726e-05, "loss": 0.4646, "step": 22970 }, { "epoch": 0.8280174433272065, "grad_norm": 0.19076871871948242, "learning_rate": 4.76293859710059e-05, "loss": 0.4251, "step": 22975 }, { "epoch": 0.8281976429884311, "grad_norm": 0.14495553076267242, "learning_rate": 4.7628145498733543e-05, "loss": 0.4469, "step": 22980 }, { "epoch": 0.8283778426496559, "grad_norm": 0.15680228173732758, "learning_rate": 4.762690471815708e-05, "loss": 0.4153, "step": 22985 }, { "epoch": 0.8285580423108805, "grad_norm": 0.15533293783664703, "learning_rate": 4.762566362929343e-05, "loss": 0.426, "step": 22990 }, { "epoch": 0.8287382419721051, "grad_norm": 0.16373470425605774, "learning_rate": 4.762442223215949e-05, "loss": 0.4247, "step": 22995 }, { "epoch": 0.8289184416333297, "grad_norm": 0.15683810412883759, "learning_rate": 4.762318052677217e-05, "loss": 0.4058, "step": 23000 }, { "epoch": 0.8289184416333297, "eval_loss": 0.4584188461303711, "eval_runtime": 3.5371, "eval_samples_per_second": 28.271, "eval_steps_per_second": 7.068, "step": 23000 }, { "epoch": 0.8290986412945544, "grad_norm": 0.20195883512496948, "learning_rate": 4.76219385131484e-05, "loss": 0.4855, "step": 23005 }, { "epoch": 0.829278840955779, "grad_norm": 0.1833004504442215, "learning_rate": 
4.76206961913051e-05, "loss": 0.4323, "step": 23010 }, { "epoch": 0.8294590406170036, "grad_norm": 0.19199278950691223, "learning_rate": 4.7619453561259195e-05, "loss": 0.4518, "step": 23015 }, { "epoch": 0.8296392402782283, "grad_norm": 0.16530929505825043, "learning_rate": 4.76182106230276e-05, "loss": 0.4471, "step": 23020 }, { "epoch": 0.829819439939453, "grad_norm": 0.1725630909204483, "learning_rate": 4.761696737662728e-05, "loss": 0.4395, "step": 23025 }, { "epoch": 0.8299996396006776, "grad_norm": 0.17936821281909943, "learning_rate": 4.761572382207515e-05, "loss": 0.4192, "step": 23030 }, { "epoch": 0.8301798392619022, "grad_norm": 0.18170322477817535, "learning_rate": 4.761447995938817e-05, "loss": 0.4352, "step": 23035 }, { "epoch": 0.8303600389231268, "grad_norm": 0.18376685678958893, "learning_rate": 4.761323578858327e-05, "loss": 0.4149, "step": 23040 }, { "epoch": 0.8305402385843514, "grad_norm": 0.16382743418216705, "learning_rate": 4.761199130967742e-05, "loss": 0.4194, "step": 23045 }, { "epoch": 0.8307204382455761, "grad_norm": 0.2041010707616806, "learning_rate": 4.761074652268755e-05, "loss": 0.457, "step": 23050 }, { "epoch": 0.8309006379068007, "grad_norm": 0.15971267223358154, "learning_rate": 4.7609501427630657e-05, "loss": 0.41, "step": 23055 }, { "epoch": 0.8310808375680254, "grad_norm": 0.16985180974006653, "learning_rate": 4.7608256024523666e-05, "loss": 0.4303, "step": 23060 }, { "epoch": 0.83126103722925, "grad_norm": 0.1820903718471527, "learning_rate": 4.760701031338358e-05, "loss": 0.4765, "step": 23065 }, { "epoch": 0.8314412368904747, "grad_norm": 0.14193592965602875, "learning_rate": 4.760576429422734e-05, "loss": 0.4375, "step": 23070 }, { "epoch": 0.8316214365516993, "grad_norm": 0.1694120466709137, "learning_rate": 4.760451796707195e-05, "loss": 0.4215, "step": 23075 }, { "epoch": 0.8318016362129239, "grad_norm": 0.172800675034523, "learning_rate": 4.7603271331934376e-05, "loss": 0.4102, "step": 23080 }, { "epoch": 
0.8319818358741485, "grad_norm": 0.15721376240253448, "learning_rate": 4.76020243888316e-05, "loss": 0.4472, "step": 23085 }, { "epoch": 0.8321620355353732, "grad_norm": 0.15566657483577728, "learning_rate": 4.760077713778062e-05, "loss": 0.3864, "step": 23090 }, { "epoch": 0.8323422351965978, "grad_norm": 0.18366749584674835, "learning_rate": 4.759952957879843e-05, "loss": 0.4561, "step": 23095 }, { "epoch": 0.8325224348578225, "grad_norm": 0.15724977850914001, "learning_rate": 4.759828171190202e-05, "loss": 0.428, "step": 23100 }, { "epoch": 0.8327026345190471, "grad_norm": 0.2003716081380844, "learning_rate": 4.7597033537108405e-05, "loss": 0.4906, "step": 23105 }, { "epoch": 0.8328828341802718, "grad_norm": 0.1818516105413437, "learning_rate": 4.759578505443458e-05, "loss": 0.4225, "step": 23110 }, { "epoch": 0.8330630338414964, "grad_norm": 0.21417291462421417, "learning_rate": 4.759453626389756e-05, "loss": 0.4341, "step": 23115 }, { "epoch": 0.833243233502721, "grad_norm": 0.16403871774673462, "learning_rate": 4.759328716551435e-05, "loss": 0.4445, "step": 23120 }, { "epoch": 0.8334234331639456, "grad_norm": 0.16569961607456207, "learning_rate": 4.759203775930198e-05, "loss": 0.4387, "step": 23125 }, { "epoch": 0.8336036328251702, "grad_norm": 0.16702339053153992, "learning_rate": 4.7590788045277474e-05, "loss": 0.3888, "step": 23130 }, { "epoch": 0.8337838324863949, "grad_norm": 0.16976197063922882, "learning_rate": 4.758953802345785e-05, "loss": 0.4482, "step": 23135 }, { "epoch": 0.8339640321476196, "grad_norm": 0.16811056435108185, "learning_rate": 4.758828769386015e-05, "loss": 0.4274, "step": 23140 }, { "epoch": 0.8341442318088442, "grad_norm": 0.13746735453605652, "learning_rate": 4.758703705650139e-05, "loss": 0.4351, "step": 23145 }, { "epoch": 0.8343244314700689, "grad_norm": 0.16167333722114563, "learning_rate": 4.758578611139864e-05, "loss": 0.4553, "step": 23150 }, { "epoch": 0.8345046311312935, "grad_norm": 0.16944856941699982, "learning_rate": 
4.758453485856892e-05, "loss": 0.3807, "step": 23155 }, { "epoch": 0.8346848307925181, "grad_norm": 0.17565567791461945, "learning_rate": 4.758328329802928e-05, "loss": 0.4513, "step": 23160 }, { "epoch": 0.8348650304537427, "grad_norm": 0.1966121941804886, "learning_rate": 4.758203142979678e-05, "loss": 0.4304, "step": 23165 }, { "epoch": 0.8350452301149673, "grad_norm": 0.15772442519664764, "learning_rate": 4.7580779253888476e-05, "loss": 0.4289, "step": 23170 }, { "epoch": 0.8352254297761921, "grad_norm": 0.16442222893238068, "learning_rate": 4.757952677032143e-05, "loss": 0.4614, "step": 23175 }, { "epoch": 0.8354056294374167, "grad_norm": 0.15727081894874573, "learning_rate": 4.7578273979112696e-05, "loss": 0.4287, "step": 23180 }, { "epoch": 0.8355858290986413, "grad_norm": 0.15362627804279327, "learning_rate": 4.757702088027935e-05, "loss": 0.4199, "step": 23185 }, { "epoch": 0.8357660287598659, "grad_norm": 0.1784539520740509, "learning_rate": 4.757576747383847e-05, "loss": 0.4247, "step": 23190 }, { "epoch": 0.8359462284210906, "grad_norm": 0.1273956149816513, "learning_rate": 4.757451375980713e-05, "loss": 0.4282, "step": 23195 }, { "epoch": 0.8361264280823152, "grad_norm": 0.10995710641145706, "learning_rate": 4.757325973820241e-05, "loss": 0.393, "step": 23200 }, { "epoch": 0.8363066277435398, "grad_norm": 0.1851365864276886, "learning_rate": 4.75720054090414e-05, "loss": 0.4583, "step": 23205 }, { "epoch": 0.8364868274047644, "grad_norm": 0.16413268446922302, "learning_rate": 4.7570750772341174e-05, "loss": 0.4797, "step": 23210 }, { "epoch": 0.8366670270659892, "grad_norm": 0.1683937907218933, "learning_rate": 4.756949582811885e-05, "loss": 0.4372, "step": 23215 }, { "epoch": 0.8368472267272138, "grad_norm": 0.1728678047657013, "learning_rate": 4.7568240576391507e-05, "loss": 0.4376, "step": 23220 }, { "epoch": 0.8370274263884384, "grad_norm": 0.20877417922019958, "learning_rate": 4.7566985017176255e-05, "loss": 0.4624, "step": 23225 }, { "epoch": 
0.837207626049663, "grad_norm": 0.18232528865337372, "learning_rate": 4.756572915049021e-05, "loss": 0.4714, "step": 23230 }, { "epoch": 0.8373878257108877, "grad_norm": 0.1842896044254303, "learning_rate": 4.756447297635047e-05, "loss": 0.4341, "step": 23235 }, { "epoch": 0.8375680253721123, "grad_norm": 0.17772360146045685, "learning_rate": 4.756321649477415e-05, "loss": 0.4225, "step": 23240 }, { "epoch": 0.8377482250333369, "grad_norm": 0.16995792090892792, "learning_rate": 4.756195970577838e-05, "loss": 0.4395, "step": 23245 }, { "epoch": 0.8379284246945615, "grad_norm": 0.1724376529455185, "learning_rate": 4.7560702609380275e-05, "loss": 0.428, "step": 23250 }, { "epoch": 0.8381086243557863, "grad_norm": 0.1635865867137909, "learning_rate": 4.755944520559697e-05, "loss": 0.424, "step": 23255 }, { "epoch": 0.8382888240170109, "grad_norm": 0.14121320843696594, "learning_rate": 4.755818749444558e-05, "loss": 0.4165, "step": 23260 }, { "epoch": 0.8384690236782355, "grad_norm": 0.19829101860523224, "learning_rate": 4.755692947594326e-05, "loss": 0.4237, "step": 23265 }, { "epoch": 0.8386492233394601, "grad_norm": 0.17292529344558716, "learning_rate": 4.755567115010714e-05, "loss": 0.4166, "step": 23270 }, { "epoch": 0.8388294230006847, "grad_norm": 0.174628347158432, "learning_rate": 4.755441251695437e-05, "loss": 0.4377, "step": 23275 }, { "epoch": 0.8390096226619094, "grad_norm": 0.1492447406053543, "learning_rate": 4.75531535765021e-05, "loss": 0.4591, "step": 23280 }, { "epoch": 0.839189822323134, "grad_norm": 0.20081254839897156, "learning_rate": 4.755189432876747e-05, "loss": 0.4029, "step": 23285 }, { "epoch": 0.8393700219843586, "grad_norm": 0.19274207949638367, "learning_rate": 4.755063477376766e-05, "loss": 0.475, "step": 23290 }, { "epoch": 0.8395502216455833, "grad_norm": 0.16275402903556824, "learning_rate": 4.75493749115198e-05, "loss": 0.456, "step": 23295 }, { "epoch": 0.839730421306808, "grad_norm": 0.2030659317970276, "learning_rate": 
4.7548114742041084e-05, "loss": 0.4322, "step": 23300 }, { "epoch": 0.8399106209680326, "grad_norm": 0.15633316338062286, "learning_rate": 4.754685426534866e-05, "loss": 0.4611, "step": 23305 }, { "epoch": 0.8400908206292572, "grad_norm": 0.17400497198104858, "learning_rate": 4.754559348145972e-05, "loss": 0.4066, "step": 23310 }, { "epoch": 0.8402710202904818, "grad_norm": 0.157339945435524, "learning_rate": 4.754433239039143e-05, "loss": 0.4289, "step": 23315 }, { "epoch": 0.8404512199517065, "grad_norm": 0.1500500738620758, "learning_rate": 4.7543070992160984e-05, "loss": 0.4215, "step": 23320 }, { "epoch": 0.8406314196129311, "grad_norm": 0.21953436732292175, "learning_rate": 4.754180928678555e-05, "loss": 0.4825, "step": 23325 }, { "epoch": 0.8408116192741558, "grad_norm": 0.1740504950284958, "learning_rate": 4.754054727428233e-05, "loss": 0.4395, "step": 23330 }, { "epoch": 0.8409918189353804, "grad_norm": 0.17277638614177704, "learning_rate": 4.753928495466853e-05, "loss": 0.431, "step": 23335 }, { "epoch": 0.8411720185966051, "grad_norm": 0.196901336312294, "learning_rate": 4.7538022327961316e-05, "loss": 0.4826, "step": 23340 }, { "epoch": 0.8413522182578297, "grad_norm": 0.2405674159526825, "learning_rate": 4.7536759394177925e-05, "loss": 0.4524, "step": 23345 }, { "epoch": 0.8415324179190543, "grad_norm": 0.1807735562324524, "learning_rate": 4.7535496153335544e-05, "loss": 0.4144, "step": 23350 }, { "epoch": 0.8417126175802789, "grad_norm": 0.1897483915090561, "learning_rate": 4.75342326054514e-05, "loss": 0.431, "step": 23355 }, { "epoch": 0.8418928172415036, "grad_norm": 0.17141766846179962, "learning_rate": 4.7532968750542694e-05, "loss": 0.4671, "step": 23360 }, { "epoch": 0.8420730169027282, "grad_norm": 0.17842607200145721, "learning_rate": 4.753170458862665e-05, "loss": 0.471, "step": 23365 }, { "epoch": 0.8422532165639529, "grad_norm": 0.14743296802043915, "learning_rate": 4.75304401197205e-05, "loss": 0.4735, "step": 23370 }, { "epoch": 
0.8424334162251775, "grad_norm": 0.1760016530752182, "learning_rate": 4.7529175343841455e-05, "loss": 0.431, "step": 23375 }, { "epoch": 0.8426136158864022, "grad_norm": 0.16929085552692413, "learning_rate": 4.7527910261006755e-05, "loss": 0.4272, "step": 23380 }, { "epoch": 0.8427938155476268, "grad_norm": 0.167909175157547, "learning_rate": 4.752664487123365e-05, "loss": 0.4715, "step": 23385 }, { "epoch": 0.8429740152088514, "grad_norm": 0.17611196637153625, "learning_rate": 4.752537917453937e-05, "loss": 0.4361, "step": 23390 }, { "epoch": 0.843154214870076, "grad_norm": 0.1789659559726715, "learning_rate": 4.752411317094115e-05, "loss": 0.4534, "step": 23395 }, { "epoch": 0.8433344145313006, "grad_norm": 0.17830581963062286, "learning_rate": 4.752284686045626e-05, "loss": 0.4091, "step": 23400 }, { "epoch": 0.8435146141925253, "grad_norm": 0.18294012546539307, "learning_rate": 4.752158024310194e-05, "loss": 0.4673, "step": 23405 }, { "epoch": 0.84369481385375, "grad_norm": 0.15136177837848663, "learning_rate": 4.752031331889545e-05, "loss": 0.4319, "step": 23410 }, { "epoch": 0.8438750135149746, "grad_norm": 0.20712289214134216, "learning_rate": 4.751904608785405e-05, "loss": 0.4255, "step": 23415 }, { "epoch": 0.8440552131761992, "grad_norm": 0.1756359040737152, "learning_rate": 4.7517778549994994e-05, "loss": 0.4445, "step": 23420 }, { "epoch": 0.8442354128374239, "grad_norm": 0.15145882964134216, "learning_rate": 4.751651070533558e-05, "loss": 0.4436, "step": 23425 }, { "epoch": 0.8444156124986485, "grad_norm": 0.18007196485996246, "learning_rate": 4.7515242553893056e-05, "loss": 0.4498, "step": 23430 }, { "epoch": 0.8445958121598731, "grad_norm": 0.17577087879180908, "learning_rate": 4.751397409568472e-05, "loss": 0.4107, "step": 23435 }, { "epoch": 0.8447760118210977, "grad_norm": 0.1596294343471527, "learning_rate": 4.7512705330727847e-05, "loss": 0.4042, "step": 23440 }, { "epoch": 0.8449562114823224, "grad_norm": 0.14652547240257263, "learning_rate": 
4.751143625903972e-05, "loss": 0.4424, "step": 23445 }, { "epoch": 0.8451364111435471, "grad_norm": 0.14940057694911957, "learning_rate": 4.751016688063763e-05, "loss": 0.4387, "step": 23450 }, { "epoch": 0.8453166108047717, "grad_norm": 0.16962915658950806, "learning_rate": 4.750889719553888e-05, "loss": 0.447, "step": 23455 }, { "epoch": 0.8454968104659963, "grad_norm": 0.193365216255188, "learning_rate": 4.7507627203760754e-05, "loss": 0.4235, "step": 23460 }, { "epoch": 0.845677010127221, "grad_norm": 0.24096348881721497, "learning_rate": 4.7506356905320574e-05, "loss": 0.4828, "step": 23465 }, { "epoch": 0.8458572097884456, "grad_norm": 0.17045122385025024, "learning_rate": 4.7505086300235635e-05, "loss": 0.3911, "step": 23470 }, { "epoch": 0.8460374094496702, "grad_norm": 0.18029987812042236, "learning_rate": 4.750381538852325e-05, "loss": 0.4495, "step": 23475 }, { "epoch": 0.8462176091108948, "grad_norm": 0.1832016408443451, "learning_rate": 4.7502544170200735e-05, "loss": 0.4109, "step": 23480 }, { "epoch": 0.8463978087721195, "grad_norm": 0.14569438993930817, "learning_rate": 4.750127264528542e-05, "loss": 0.4221, "step": 23485 }, { "epoch": 0.8465780084333442, "grad_norm": 0.16590553522109985, "learning_rate": 4.750000081379462e-05, "loss": 0.4384, "step": 23490 }, { "epoch": 0.8467582080945688, "grad_norm": 0.201650008559227, "learning_rate": 4.749872867574566e-05, "loss": 0.4363, "step": 23495 }, { "epoch": 0.8469384077557934, "grad_norm": 0.20361419022083282, "learning_rate": 4.7497456231155884e-05, "loss": 0.407, "step": 23500 }, { "epoch": 0.8469384077557934, "eval_loss": 0.4585105776786804, "eval_runtime": 3.5434, "eval_samples_per_second": 28.221, "eval_steps_per_second": 7.055, "step": 23500 }, { "epoch": 0.847118607417018, "grad_norm": 0.1582178771495819, "learning_rate": 4.749618348004262e-05, "loss": 0.441, "step": 23505 }, { "epoch": 0.8472988070782427, "grad_norm": 0.1565631479024887, "learning_rate": 4.749491042242321e-05, "loss": 0.4264, 
"step": 23510 }, { "epoch": 0.8474790067394673, "grad_norm": 0.15611840784549713, "learning_rate": 4.7493637058314996e-05, "loss": 0.4848, "step": 23515 }, { "epoch": 0.8476592064006919, "grad_norm": 0.17525018751621246, "learning_rate": 4.749236338773535e-05, "loss": 0.4368, "step": 23520 }, { "epoch": 0.8478394060619167, "grad_norm": 0.18957500159740448, "learning_rate": 4.749108941070159e-05, "loss": 0.4456, "step": 23525 }, { "epoch": 0.8480196057231413, "grad_norm": 0.1911652833223343, "learning_rate": 4.74898151272311e-05, "loss": 0.4294, "step": 23530 }, { "epoch": 0.8481998053843659, "grad_norm": 0.17784404754638672, "learning_rate": 4.748854053734122e-05, "loss": 0.4079, "step": 23535 }, { "epoch": 0.8483800050455905, "grad_norm": 0.16650652885437012, "learning_rate": 4.748726564104935e-05, "loss": 0.4114, "step": 23540 }, { "epoch": 0.8485602047068151, "grad_norm": 0.15656359493732452, "learning_rate": 4.748599043837282e-05, "loss": 0.4926, "step": 23545 }, { "epoch": 0.8487404043680398, "grad_norm": 0.1699918955564499, "learning_rate": 4.748471492932903e-05, "loss": 0.4273, "step": 23550 }, { "epoch": 0.8489206040292644, "grad_norm": 0.15680480003356934, "learning_rate": 4.748343911393536e-05, "loss": 0.3945, "step": 23555 }, { "epoch": 0.849100803690489, "grad_norm": 0.14087727665901184, "learning_rate": 4.748216299220918e-05, "loss": 0.4801, "step": 23560 }, { "epoch": 0.8492810033517137, "grad_norm": 0.1624569147825241, "learning_rate": 4.7480886564167877e-05, "loss": 0.3957, "step": 23565 }, { "epoch": 0.8494612030129384, "grad_norm": 0.15580429136753082, "learning_rate": 4.747960982982885e-05, "loss": 0.425, "step": 23570 }, { "epoch": 0.849641402674163, "grad_norm": 0.15668196976184845, "learning_rate": 4.747833278920949e-05, "loss": 0.4421, "step": 23575 }, { "epoch": 0.8498216023353876, "grad_norm": 0.14758959412574768, "learning_rate": 4.74770554423272e-05, "loss": 0.4473, "step": 23580 }, { "epoch": 0.8500018019966122, "grad_norm": 
0.15379658341407776, "learning_rate": 4.747577778919938e-05, "loss": 0.4246, "step": 23585 }, { "epoch": 0.8501820016578369, "grad_norm": 0.14333048462867737, "learning_rate": 4.747449982984344e-05, "loss": 0.4508, "step": 23590 }, { "epoch": 0.8503622013190615, "grad_norm": 0.18294481933116913, "learning_rate": 4.747322156427679e-05, "loss": 0.4115, "step": 23595 }, { "epoch": 0.8505424009802861, "grad_norm": 0.18898597359657288, "learning_rate": 4.747194299251685e-05, "loss": 0.4568, "step": 23600 }, { "epoch": 0.8507226006415108, "grad_norm": 0.1791786253452301, "learning_rate": 4.7470664114581034e-05, "loss": 0.4312, "step": 23605 }, { "epoch": 0.8509028003027355, "grad_norm": 0.2092021107673645, "learning_rate": 4.746938493048677e-05, "loss": 0.4392, "step": 23610 }, { "epoch": 0.8510829999639601, "grad_norm": 0.17262564599514008, "learning_rate": 4.7468105440251494e-05, "loss": 0.4335, "step": 23615 }, { "epoch": 0.8512631996251847, "grad_norm": 0.1536947339773178, "learning_rate": 4.746682564389262e-05, "loss": 0.3972, "step": 23620 }, { "epoch": 0.8514433992864093, "grad_norm": 0.16581547260284424, "learning_rate": 4.7465545541427605e-05, "loss": 0.4377, "step": 23625 }, { "epoch": 0.851623598947634, "grad_norm": 0.1816360354423523, "learning_rate": 4.746426513287387e-05, "loss": 0.4616, "step": 23630 }, { "epoch": 0.8518037986088586, "grad_norm": 0.1770544946193695, "learning_rate": 4.746298441824889e-05, "loss": 0.4234, "step": 23635 }, { "epoch": 0.8519839982700832, "grad_norm": 0.15067480504512787, "learning_rate": 4.746170339757009e-05, "loss": 0.4579, "step": 23640 }, { "epoch": 0.8521641979313079, "grad_norm": 0.12504181265830994, "learning_rate": 4.746042207085492e-05, "loss": 0.4412, "step": 23645 }, { "epoch": 0.8523443975925326, "grad_norm": 0.1552511602640152, "learning_rate": 4.745914043812085e-05, "loss": 0.4427, "step": 23650 }, { "epoch": 0.8525245972537572, "grad_norm": 0.21108126640319824, "learning_rate": 4.745785849938535e-05, "loss": 
0.4025, "step": 23655 }, { "epoch": 0.8527047969149818, "grad_norm": 0.17185276746749878, "learning_rate": 4.7456576254665866e-05, "loss": 0.4341, "step": 23660 }, { "epoch": 0.8528849965762064, "grad_norm": 0.21345165371894836, "learning_rate": 4.7455293703979884e-05, "loss": 0.4513, "step": 23665 }, { "epoch": 0.853065196237431, "grad_norm": 0.16104549169540405, "learning_rate": 4.745401084734487e-05, "loss": 0.4178, "step": 23670 }, { "epoch": 0.8532453958986557, "grad_norm": 0.1738523244857788, "learning_rate": 4.7452727684778305e-05, "loss": 0.4372, "step": 23675 }, { "epoch": 0.8534255955598804, "grad_norm": 0.19414255023002625, "learning_rate": 4.7451444216297674e-05, "loss": 0.4444, "step": 23680 }, { "epoch": 0.853605795221105, "grad_norm": 0.16047124564647675, "learning_rate": 4.7450160441920466e-05, "loss": 0.4439, "step": 23685 }, { "epoch": 0.8537859948823296, "grad_norm": 0.1373186558485031, "learning_rate": 4.744887636166416e-05, "loss": 0.396, "step": 23690 }, { "epoch": 0.8539661945435543, "grad_norm": 0.15237957239151, "learning_rate": 4.744759197554626e-05, "loss": 0.4106, "step": 23695 }, { "epoch": 0.8541463942047789, "grad_norm": 0.18528686463832855, "learning_rate": 4.744630728358427e-05, "loss": 0.4458, "step": 23700 }, { "epoch": 0.8543265938660035, "grad_norm": 0.16544073820114136, "learning_rate": 4.744502228579569e-05, "loss": 0.4257, "step": 23705 }, { "epoch": 0.8545067935272281, "grad_norm": 0.16749577224254608, "learning_rate": 4.7443736982198026e-05, "loss": 0.4441, "step": 23710 }, { "epoch": 0.8546869931884528, "grad_norm": 0.19285382330417633, "learning_rate": 4.7442451372808797e-05, "loss": 0.4527, "step": 23715 }, { "epoch": 0.8548671928496775, "grad_norm": 0.17987944185733795, "learning_rate": 4.74411654576455e-05, "loss": 0.4205, "step": 23720 }, { "epoch": 0.8550473925109021, "grad_norm": 0.14557349681854248, "learning_rate": 4.7439879236725674e-05, "loss": 0.4193, "step": 23725 }, { "epoch": 0.8552275921721267, "grad_norm": 
0.184067040681839, "learning_rate": 4.743859271006684e-05, "loss": 0.4426, "step": 23730 }, { "epoch": 0.8554077918333514, "grad_norm": 0.14480531215667725, "learning_rate": 4.743730587768652e-05, "loss": 0.4561, "step": 23735 }, { "epoch": 0.855587991494576, "grad_norm": 0.16141416132450104, "learning_rate": 4.7436018739602255e-05, "loss": 0.4452, "step": 23740 }, { "epoch": 0.8557681911558006, "grad_norm": 0.1394343227148056, "learning_rate": 4.743473129583158e-05, "loss": 0.4205, "step": 23745 }, { "epoch": 0.8559483908170252, "grad_norm": 0.18532565236091614, "learning_rate": 4.743344354639203e-05, "loss": 0.4093, "step": 23750 }, { "epoch": 0.8561285904782499, "grad_norm": 0.17660491168498993, "learning_rate": 4.743215549130115e-05, "loss": 0.4328, "step": 23755 }, { "epoch": 0.8563087901394746, "grad_norm": 0.20785756409168243, "learning_rate": 4.743086713057651e-05, "loss": 0.427, "step": 23760 }, { "epoch": 0.8564889898006992, "grad_norm": 0.19306261837482452, "learning_rate": 4.7429578464235635e-05, "loss": 0.4162, "step": 23765 }, { "epoch": 0.8566691894619238, "grad_norm": 0.1779967099428177, "learning_rate": 4.742828949229611e-05, "loss": 0.4251, "step": 23770 }, { "epoch": 0.8568493891231485, "grad_norm": 0.22586210072040558, "learning_rate": 4.742700021477547e-05, "loss": 0.4546, "step": 23775 }, { "epoch": 0.8570295887843731, "grad_norm": 0.1746557056903839, "learning_rate": 4.74257106316913e-05, "loss": 0.3891, "step": 23780 }, { "epoch": 0.8572097884455977, "grad_norm": 0.1889764368534088, "learning_rate": 4.742442074306116e-05, "loss": 0.4304, "step": 23785 }, { "epoch": 0.8573899881068223, "grad_norm": 0.16799265146255493, "learning_rate": 4.742313054890263e-05, "loss": 0.3993, "step": 23790 }, { "epoch": 0.8575701877680469, "grad_norm": 0.17759232223033905, "learning_rate": 4.742184004923329e-05, "loss": 0.4243, "step": 23795 }, { "epoch": 0.8577503874292717, "grad_norm": 0.16500405967235565, "learning_rate": 4.742054924407072e-05, "loss": 
0.4649, "step": 23800 }, { "epoch": 0.8579305870904963, "grad_norm": 0.20370623469352722, "learning_rate": 4.7419258133432504e-05, "loss": 0.4699, "step": 23805 }, { "epoch": 0.8581107867517209, "grad_norm": 0.18750417232513428, "learning_rate": 4.741796671733624e-05, "loss": 0.4242, "step": 23810 }, { "epoch": 0.8582909864129455, "grad_norm": 0.1584675908088684, "learning_rate": 4.741667499579952e-05, "loss": 0.4649, "step": 23815 }, { "epoch": 0.8584711860741702, "grad_norm": 0.2016519159078598, "learning_rate": 4.741538296883994e-05, "loss": 0.4401, "step": 23820 }, { "epoch": 0.8586513857353948, "grad_norm": 0.14827871322631836, "learning_rate": 4.741409063647511e-05, "loss": 0.4315, "step": 23825 }, { "epoch": 0.8588315853966194, "grad_norm": 0.22020429372787476, "learning_rate": 4.741279799872263e-05, "loss": 0.4144, "step": 23830 }, { "epoch": 0.8590117850578441, "grad_norm": 0.15583039820194244, "learning_rate": 4.7411505055600115e-05, "loss": 0.4558, "step": 23835 }, { "epoch": 0.8591919847190688, "grad_norm": 0.17358680069446564, "learning_rate": 4.741021180712519e-05, "loss": 0.441, "step": 23840 }, { "epoch": 0.8593721843802934, "grad_norm": 0.2088773101568222, "learning_rate": 4.7408918253315464e-05, "loss": 0.4209, "step": 23845 }, { "epoch": 0.859552384041518, "grad_norm": 0.1951671689748764, "learning_rate": 4.740762439418856e-05, "loss": 0.4292, "step": 23850 }, { "epoch": 0.8597325837027426, "grad_norm": 0.14672303199768066, "learning_rate": 4.740633022976213e-05, "loss": 0.4192, "step": 23855 }, { "epoch": 0.8599127833639673, "grad_norm": 0.21486572921276093, "learning_rate": 4.740503576005377e-05, "loss": 0.4734, "step": 23860 }, { "epoch": 0.8600929830251919, "grad_norm": 0.15998758375644684, "learning_rate": 4.740374098508115e-05, "loss": 0.4318, "step": 23865 }, { "epoch": 0.8602731826864165, "grad_norm": 0.17849770188331604, "learning_rate": 4.740244590486188e-05, "loss": 0.4296, "step": 23870 }, { "epoch": 0.8604533823476412, "grad_norm": 
0.18155914545059204, "learning_rate": 4.740115051941363e-05, "loss": 0.4598, "step": 23875 }, { "epoch": 0.8606335820088659, "grad_norm": 0.15962116420269012, "learning_rate": 4.7399854828754045e-05, "loss": 0.417, "step": 23880 }, { "epoch": 0.8608137816700905, "grad_norm": 0.15650032460689545, "learning_rate": 4.7398558832900774e-05, "loss": 0.4187, "step": 23885 }, { "epoch": 0.8609939813313151, "grad_norm": 0.1548149287700653, "learning_rate": 4.7397262531871466e-05, "loss": 0.4128, "step": 23890 }, { "epoch": 0.8611741809925397, "grad_norm": 0.1666049212217331, "learning_rate": 4.739596592568381e-05, "loss": 0.4565, "step": 23895 }, { "epoch": 0.8613543806537644, "grad_norm": 0.14310970902442932, "learning_rate": 4.7394669014355444e-05, "loss": 0.4583, "step": 23900 }, { "epoch": 0.861534580314989, "grad_norm": 0.15406273305416107, "learning_rate": 4.739337179790404e-05, "loss": 0.4391, "step": 23905 }, { "epoch": 0.8617147799762136, "grad_norm": 0.15325191617012024, "learning_rate": 4.739207427634729e-05, "loss": 0.4413, "step": 23910 }, { "epoch": 0.8618949796374383, "grad_norm": 0.14947649836540222, "learning_rate": 4.7390776449702864e-05, "loss": 0.439, "step": 23915 }, { "epoch": 0.862075179298663, "grad_norm": 0.16361840069293976, "learning_rate": 4.738947831798844e-05, "loss": 0.4578, "step": 23920 }, { "epoch": 0.8622553789598876, "grad_norm": 0.20598135888576508, "learning_rate": 4.738817988122171e-05, "loss": 0.4678, "step": 23925 }, { "epoch": 0.8624355786211122, "grad_norm": 0.17498494684696198, "learning_rate": 4.738688113942036e-05, "loss": 0.4866, "step": 23930 }, { "epoch": 0.8626157782823368, "grad_norm": 0.1920240968465805, "learning_rate": 4.73855820926021e-05, "loss": 0.4274, "step": 23935 }, { "epoch": 0.8627959779435614, "grad_norm": 0.14575999975204468, "learning_rate": 4.738428274078461e-05, "loss": 0.4185, "step": 23940 }, { "epoch": 0.8629761776047861, "grad_norm": 0.14960631728172302, "learning_rate": 4.73829830839856e-05, "loss": 
0.4148, "step": 23945 }, { "epoch": 0.8631563772660107, "grad_norm": 0.15745064616203308, "learning_rate": 4.738168312222278e-05, "loss": 0.4097, "step": 23950 }, { "epoch": 0.8633365769272354, "grad_norm": 0.1760377436876297, "learning_rate": 4.738038285551386e-05, "loss": 0.4678, "step": 23955 }, { "epoch": 0.86351677658846, "grad_norm": 0.15912814438343048, "learning_rate": 4.7379082283876566e-05, "loss": 0.4585, "step": 23960 }, { "epoch": 0.8636969762496847, "grad_norm": 0.13206642866134644, "learning_rate": 4.737778140732859e-05, "loss": 0.4157, "step": 23965 }, { "epoch": 0.8638771759109093, "grad_norm": 0.21865518391132355, "learning_rate": 4.737648022588769e-05, "loss": 0.4373, "step": 23970 }, { "epoch": 0.8640573755721339, "grad_norm": 0.15692706406116486, "learning_rate": 4.737517873957158e-05, "loss": 0.3932, "step": 23975 }, { "epoch": 0.8642375752333585, "grad_norm": 0.19991809129714966, "learning_rate": 4.737387694839798e-05, "loss": 0.4778, "step": 23980 }, { "epoch": 0.8644177748945832, "grad_norm": 0.1880597621202469, "learning_rate": 4.737257485238465e-05, "loss": 0.3943, "step": 23985 }, { "epoch": 0.8645979745558078, "grad_norm": 0.21935416758060455, "learning_rate": 4.737127245154931e-05, "loss": 0.4207, "step": 23990 }, { "epoch": 0.8647781742170325, "grad_norm": 0.16751424968242645, "learning_rate": 4.736996974590972e-05, "loss": 0.4191, "step": 23995 }, { "epoch": 0.8649583738782571, "grad_norm": 0.18504785001277924, "learning_rate": 4.736866673548362e-05, "loss": 0.4088, "step": 24000 }, { "epoch": 0.8649583738782571, "eval_loss": 0.45828837156295776, "eval_runtime": 3.5369, "eval_samples_per_second": 28.273, "eval_steps_per_second": 7.068, "step": 24000 }, { "epoch": 0.8651385735394818, "grad_norm": 0.18134254217147827, "learning_rate": 4.7367363420288765e-05, "loss": 0.4821, "step": 24005 }, { "epoch": 0.8653187732007064, "grad_norm": 0.15396596491336823, "learning_rate": 4.736605980034292e-05, "loss": 0.433, "step": 24010 }, { "epoch": 
0.865498972861931, "grad_norm": 0.1897013932466507, "learning_rate": 4.7364755875663834e-05, "loss": 0.4416, "step": 24015 }, { "epoch": 0.8656791725231556, "grad_norm": 0.15314961969852448, "learning_rate": 4.736345164626929e-05, "loss": 0.4014, "step": 24020 }, { "epoch": 0.8658593721843802, "grad_norm": 0.17612630128860474, "learning_rate": 4.736214711217703e-05, "loss": 0.4435, "step": 24025 }, { "epoch": 0.866039571845605, "grad_norm": 0.15653395652770996, "learning_rate": 4.736084227340486e-05, "loss": 0.4473, "step": 24030 }, { "epoch": 0.8662197715068296, "grad_norm": 0.14282876253128052, "learning_rate": 4.735953712997053e-05, "loss": 0.4484, "step": 24035 }, { "epoch": 0.8663999711680542, "grad_norm": 0.184425950050354, "learning_rate": 4.7358231681891855e-05, "loss": 0.4453, "step": 24040 }, { "epoch": 0.8665801708292789, "grad_norm": 0.1802368015050888, "learning_rate": 4.735692592918658e-05, "loss": 0.4524, "step": 24045 }, { "epoch": 0.8667603704905035, "grad_norm": 0.15805204212665558, "learning_rate": 4.735561987187253e-05, "loss": 0.4139, "step": 24050 }, { "epoch": 0.8669405701517281, "grad_norm": 0.16449551284313202, "learning_rate": 4.7354313509967486e-05, "loss": 0.4236, "step": 24055 }, { "epoch": 0.8671207698129527, "grad_norm": 0.1771911382675171, "learning_rate": 4.735300684348925e-05, "loss": 0.4665, "step": 24060 }, { "epoch": 0.8673009694741773, "grad_norm": 0.1740954965353012, "learning_rate": 4.735169987245561e-05, "loss": 0.4394, "step": 24065 }, { "epoch": 0.8674811691354021, "grad_norm": 0.18734583258628845, "learning_rate": 4.735039259688441e-05, "loss": 0.4227, "step": 24070 }, { "epoch": 0.8676613687966267, "grad_norm": 0.1404682993888855, "learning_rate": 4.734908501679342e-05, "loss": 0.4358, "step": 24075 }, { "epoch": 0.8678415684578513, "grad_norm": 0.16821545362472534, "learning_rate": 4.7347777132200475e-05, "loss": 0.4389, "step": 24080 }, { "epoch": 0.8680217681190759, "grad_norm": 0.17715495824813843, "learning_rate": 
4.73464689431234e-05, "loss": 0.4148, "step": 24085 }, { "epoch": 0.8682019677803006, "grad_norm": 0.16599716246128082, "learning_rate": 4.734516044958001e-05, "loss": 0.4243, "step": 24090 }, { "epoch": 0.8683821674415252, "grad_norm": 0.15897536277770996, "learning_rate": 4.7343851651588137e-05, "loss": 0.431, "step": 24095 }, { "epoch": 0.8685623671027498, "grad_norm": 0.14617212116718292, "learning_rate": 4.73425425491656e-05, "loss": 0.4395, "step": 24100 }, { "epoch": 0.8687425667639744, "grad_norm": 0.1834208071231842, "learning_rate": 4.734123314233026e-05, "loss": 0.4274, "step": 24105 }, { "epoch": 0.8689227664251992, "grad_norm": 0.22381344437599182, "learning_rate": 4.733992343109994e-05, "loss": 0.4279, "step": 24110 }, { "epoch": 0.8691029660864238, "grad_norm": 0.14666973054409027, "learning_rate": 4.7338613415492486e-05, "loss": 0.4255, "step": 24115 }, { "epoch": 0.8692831657476484, "grad_norm": 0.17257143557071686, "learning_rate": 4.733730309552575e-05, "loss": 0.4548, "step": 24120 }, { "epoch": 0.869463365408873, "grad_norm": 0.17522400617599487, "learning_rate": 4.733599247121758e-05, "loss": 0.4354, "step": 24125 }, { "epoch": 0.8696435650700977, "grad_norm": 0.19729764759540558, "learning_rate": 4.733468154258585e-05, "loss": 0.46, "step": 24130 }, { "epoch": 0.8698237647313223, "grad_norm": 0.15954478085041046, "learning_rate": 4.733337030964839e-05, "loss": 0.4451, "step": 24135 }, { "epoch": 0.8700039643925469, "grad_norm": 0.16600678861141205, "learning_rate": 4.73320587724231e-05, "loss": 0.4397, "step": 24140 }, { "epoch": 0.8701841640537715, "grad_norm": 0.16135752201080322, "learning_rate": 4.733074693092783e-05, "loss": 0.4045, "step": 24145 }, { "epoch": 0.8703643637149963, "grad_norm": 0.16556213796138763, "learning_rate": 4.732943478518045e-05, "loss": 0.4408, "step": 24150 }, { "epoch": 0.8705445633762209, "grad_norm": 0.15862923860549927, "learning_rate": 4.732812233519884e-05, "loss": 0.4239, "step": 24155 }, { "epoch": 
0.8707247630374455, "grad_norm": 0.19938628375530243, "learning_rate": 4.73268095810009e-05, "loss": 0.4735, "step": 24160 }, { "epoch": 0.8709049626986701, "grad_norm": 0.19490042328834534, "learning_rate": 4.732549652260449e-05, "loss": 0.4184, "step": 24165 }, { "epoch": 0.8710851623598947, "grad_norm": 0.19054916501045227, "learning_rate": 4.732418316002751e-05, "loss": 0.4391, "step": 24170 }, { "epoch": 0.8712653620211194, "grad_norm": 0.15895889699459076, "learning_rate": 4.732286949328787e-05, "loss": 0.448, "step": 24175 }, { "epoch": 0.871445561682344, "grad_norm": 0.16004562377929688, "learning_rate": 4.732155552240345e-05, "loss": 0.3978, "step": 24180 }, { "epoch": 0.8716257613435687, "grad_norm": 0.2268204391002655, "learning_rate": 4.732024124739215e-05, "loss": 0.4128, "step": 24185 }, { "epoch": 0.8718059610047934, "grad_norm": 0.18399815261363983, "learning_rate": 4.731892666827189e-05, "loss": 0.4377, "step": 24190 }, { "epoch": 0.871986160666018, "grad_norm": 0.13044115900993347, "learning_rate": 4.731761178506058e-05, "loss": 0.4663, "step": 24195 }, { "epoch": 0.8721663603272426, "grad_norm": 0.1856122612953186, "learning_rate": 4.7316296597776123e-05, "loss": 0.4289, "step": 24200 }, { "epoch": 0.8723465599884672, "grad_norm": 0.17390163242816925, "learning_rate": 4.731498110643645e-05, "loss": 0.4633, "step": 24205 }, { "epoch": 0.8725267596496918, "grad_norm": 0.15870732069015503, "learning_rate": 4.731366531105947e-05, "loss": 0.436, "step": 24210 }, { "epoch": 0.8727069593109165, "grad_norm": 0.18578395247459412, "learning_rate": 4.731234921166313e-05, "loss": 0.4246, "step": 24215 }, { "epoch": 0.8728871589721411, "grad_norm": 0.19250373542308807, "learning_rate": 4.7311032808265356e-05, "loss": 0.4511, "step": 24220 }, { "epoch": 0.8730673586333658, "grad_norm": 0.18087449669837952, "learning_rate": 4.730971610088407e-05, "loss": 0.4365, "step": 24225 }, { "epoch": 0.8732475582945904, "grad_norm": 0.15141281485557556, "learning_rate": 
4.7308399089537224e-05, "loss": 0.4196, "step": 24230 }, { "epoch": 0.8734277579558151, "grad_norm": 0.16130012273788452, "learning_rate": 4.730708177424276e-05, "loss": 0.4339, "step": 24235 }, { "epoch": 0.8736079576170397, "grad_norm": 0.21587517857551575, "learning_rate": 4.730576415501863e-05, "loss": 0.4417, "step": 24240 }, { "epoch": 0.8737881572782643, "grad_norm": 0.1740473508834839, "learning_rate": 4.730444623188278e-05, "loss": 0.4073, "step": 24245 }, { "epoch": 0.8739683569394889, "grad_norm": 0.1535811573266983, "learning_rate": 4.730312800485316e-05, "loss": 0.4328, "step": 24250 }, { "epoch": 0.8741485566007136, "grad_norm": 0.20786762237548828, "learning_rate": 4.7301809473947744e-05, "loss": 0.4224, "step": 24255 }, { "epoch": 0.8743287562619382, "grad_norm": 0.19472654163837433, "learning_rate": 4.730049063918449e-05, "loss": 0.4272, "step": 24260 }, { "epoch": 0.8745089559231629, "grad_norm": 0.2173738032579422, "learning_rate": 4.729917150058137e-05, "loss": 0.4626, "step": 24265 }, { "epoch": 0.8746891555843875, "grad_norm": 0.2047669142484665, "learning_rate": 4.729785205815637e-05, "loss": 0.4195, "step": 24270 }, { "epoch": 0.8748693552456122, "grad_norm": 0.19127419590950012, "learning_rate": 4.7296532311927436e-05, "loss": 0.4646, "step": 24275 }, { "epoch": 0.8750495549068368, "grad_norm": 0.22150792181491852, "learning_rate": 4.729521226191257e-05, "loss": 0.426, "step": 24280 }, { "epoch": 0.8752297545680614, "grad_norm": 0.2369442582130432, "learning_rate": 4.729389190812975e-05, "loss": 0.4241, "step": 24285 }, { "epoch": 0.875409954229286, "grad_norm": 0.17270110547542572, "learning_rate": 4.729257125059697e-05, "loss": 0.4575, "step": 24290 }, { "epoch": 0.8755901538905106, "grad_norm": 0.16564151644706726, "learning_rate": 4.729125028933222e-05, "loss": 0.4041, "step": 24295 }, { "epoch": 0.8757703535517353, "grad_norm": 0.1850225180387497, "learning_rate": 4.728992902435351e-05, "loss": 0.4068, "step": 24300 }, { "epoch": 
0.87595055321296, "grad_norm": 0.17433954775333405, "learning_rate": 4.728860745567883e-05, "loss": 0.4035, "step": 24305 }, { "epoch": 0.8761307528741846, "grad_norm": 0.1604575216770172, "learning_rate": 4.728728558332618e-05, "loss": 0.4102, "step": 24310 }, { "epoch": 0.8763109525354092, "grad_norm": 0.18941442668437958, "learning_rate": 4.7285963407313594e-05, "loss": 0.4188, "step": 24315 }, { "epoch": 0.8764911521966339, "grad_norm": 0.15453235805034637, "learning_rate": 4.728464092765906e-05, "loss": 0.4138, "step": 24320 }, { "epoch": 0.8766713518578585, "grad_norm": 0.16290608048439026, "learning_rate": 4.7283318144380606e-05, "loss": 0.4676, "step": 24325 }, { "epoch": 0.8768515515190831, "grad_norm": 0.15486080944538116, "learning_rate": 4.728199505749626e-05, "loss": 0.4694, "step": 24330 }, { "epoch": 0.8770317511803077, "grad_norm": 0.14686860144138336, "learning_rate": 4.728067166702404e-05, "loss": 0.4268, "step": 24335 }, { "epoch": 0.8772119508415325, "grad_norm": 0.18616507947444916, "learning_rate": 4.7279347972982e-05, "loss": 0.4561, "step": 24340 }, { "epoch": 0.8773921505027571, "grad_norm": 0.17040805518627167, "learning_rate": 4.727802397538814e-05, "loss": 0.4399, "step": 24345 }, { "epoch": 0.8775723501639817, "grad_norm": 0.18918024003505707, "learning_rate": 4.7276699674260525e-05, "loss": 0.4592, "step": 24350 }, { "epoch": 0.8777525498252063, "grad_norm": 0.1710633933544159, "learning_rate": 4.727537506961719e-05, "loss": 0.4546, "step": 24355 }, { "epoch": 0.877932749486431, "grad_norm": 0.16070812940597534, "learning_rate": 4.727405016147618e-05, "loss": 0.443, "step": 24360 }, { "epoch": 0.8781129491476556, "grad_norm": 0.20280031859874725, "learning_rate": 4.727272494985554e-05, "loss": 0.4375, "step": 24365 }, { "epoch": 0.8782931488088802, "grad_norm": 0.1605139523744583, "learning_rate": 4.7271399434773345e-05, "loss": 0.44, "step": 24370 }, { "epoch": 0.8784733484701048, "grad_norm": 0.19736407697200775, "learning_rate": 
4.7270073616247646e-05, "loss": 0.4579, "step": 24375 }, { "epoch": 0.8786535481313296, "grad_norm": 0.19877927005290985, "learning_rate": 4.72687474942965e-05, "loss": 0.4602, "step": 24380 }, { "epoch": 0.8788337477925542, "grad_norm": 0.16131383180618286, "learning_rate": 4.7267421068937984e-05, "loss": 0.4178, "step": 24385 }, { "epoch": 0.8790139474537788, "grad_norm": 0.19206862151622772, "learning_rate": 4.7266094340190165e-05, "loss": 0.4558, "step": 24390 }, { "epoch": 0.8791941471150034, "grad_norm": 0.20475110411643982, "learning_rate": 4.7264767308071126e-05, "loss": 0.417, "step": 24395 }, { "epoch": 0.879374346776228, "grad_norm": 0.21035943925380707, "learning_rate": 4.726343997259893e-05, "loss": 0.4433, "step": 24400 }, { "epoch": 0.8795545464374527, "grad_norm": 0.18233457207679749, "learning_rate": 4.7262112333791685e-05, "loss": 0.3937, "step": 24405 }, { "epoch": 0.8797347460986773, "grad_norm": 0.18523730337619781, "learning_rate": 4.7260784391667475e-05, "loss": 0.4456, "step": 24410 }, { "epoch": 0.8799149457599019, "grad_norm": 0.19751974940299988, "learning_rate": 4.725945614624438e-05, "loss": 0.4533, "step": 24415 }, { "epoch": 0.8800951454211267, "grad_norm": 0.15248170495033264, "learning_rate": 4.7258127597540505e-05, "loss": 0.4397, "step": 24420 }, { "epoch": 0.8802753450823513, "grad_norm": 0.1679372489452362, "learning_rate": 4.725679874557395e-05, "loss": 0.4419, "step": 24425 }, { "epoch": 0.8804555447435759, "grad_norm": 0.256967157125473, "learning_rate": 4.7255469590362825e-05, "loss": 0.432, "step": 24430 }, { "epoch": 0.8806357444048005, "grad_norm": 0.18858453631401062, "learning_rate": 4.725414013192523e-05, "loss": 0.4169, "step": 24435 }, { "epoch": 0.8808159440660251, "grad_norm": 0.18355509638786316, "learning_rate": 4.725281037027929e-05, "loss": 0.4668, "step": 24440 }, { "epoch": 0.8809961437272498, "grad_norm": 0.16345731914043427, "learning_rate": 4.725148030544311e-05, "loss": 0.4466, "step": 24445 }, { "epoch": 
0.8811763433884744, "grad_norm": 0.19114868342876434, "learning_rate": 4.7250149937434826e-05, "loss": 0.3905, "step": 24450 }, { "epoch": 0.881356543049699, "grad_norm": 0.19460555911064148, "learning_rate": 4.724881926627255e-05, "loss": 0.437, "step": 24455 }, { "epoch": 0.8815367427109237, "grad_norm": null, "learning_rate": 4.7247754511084054e-05, "loss": 0.4409, "step": 24460 }, { "epoch": 0.8817169423721484, "grad_norm": 0.14812225103378296, "learning_rate": 4.72464232942903e-05, "loss": 0.3929, "step": 24465 }, { "epoch": 0.881897142033373, "grad_norm": 0.1727660894393921, "learning_rate": 4.724509177439333e-05, "loss": 0.4262, "step": 24470 }, { "epoch": 0.8820773416945976, "grad_norm": 0.16415224969387054, "learning_rate": 4.724375995141129e-05, "loss": 0.4162, "step": 24475 }, { "epoch": 0.8822575413558222, "grad_norm": 0.16160134971141815, "learning_rate": 4.724242782536234e-05, "loss": 0.4698, "step": 24480 }, { "epoch": 0.8824377410170469, "grad_norm": 0.15631291270256042, "learning_rate": 4.724109539626461e-05, "loss": 0.4455, "step": 24485 }, { "epoch": 0.8826179406782715, "grad_norm": 0.16617485880851746, "learning_rate": 4.7239762664136264e-05, "loss": 0.4638, "step": 24490 }, { "epoch": 0.8827981403394961, "grad_norm": 0.1969350427389145, "learning_rate": 4.7238429628995456e-05, "loss": 0.4458, "step": 24495 }, { "epoch": 0.8829783400007208, "grad_norm": 0.12761616706848145, "learning_rate": 4.723709629086035e-05, "loss": 0.4276, "step": 24500 }, { "epoch": 0.8829783400007208, "eval_loss": 0.4570537507534027, "eval_runtime": 3.5263, "eval_samples_per_second": 28.359, "eval_steps_per_second": 7.09, "step": 24500 }, { "epoch": 0.8831585396619455, "grad_norm": 0.20359250903129578, "learning_rate": 4.723576264974911e-05, "loss": 0.4593, "step": 24505 }, { "epoch": 0.8833387393231701, "grad_norm": 0.1633104681968689, "learning_rate": 4.723442870567991e-05, "loss": 0.4538, "step": 24510 }, { "epoch": 0.8835189389843947, "grad_norm": 
0.19287440180778503, "learning_rate": 4.7233094458670926e-05, "loss": 0.4838, "step": 24515 }, { "epoch": 0.8836991386456193, "grad_norm": 0.16194848716259003, "learning_rate": 4.723175990874034e-05, "loss": 0.4247, "step": 24520 }, { "epoch": 0.883879338306844, "grad_norm": 0.15081371366977692, "learning_rate": 4.723042505590631e-05, "loss": 0.4416, "step": 24525 }, { "epoch": 0.8840595379680686, "grad_norm": 0.15278756618499756, "learning_rate": 4.7229089900187065e-05, "loss": 0.4031, "step": 24530 }, { "epoch": 0.8842397376292933, "grad_norm": 0.19165587425231934, "learning_rate": 4.722775444160076e-05, "loss": 0.4159, "step": 24535 }, { "epoch": 0.8844199372905179, "grad_norm": 0.17740246653556824, "learning_rate": 4.722641868016561e-05, "loss": 0.4277, "step": 24540 }, { "epoch": 0.8846001369517426, "grad_norm": 0.14334651827812195, "learning_rate": 4.72250826158998e-05, "loss": 0.4596, "step": 24545 }, { "epoch": 0.8847803366129672, "grad_norm": 0.1924460381269455, "learning_rate": 4.722374624882155e-05, "loss": 0.4452, "step": 24550 }, { "epoch": 0.8849605362741918, "grad_norm": 0.18541020154953003, "learning_rate": 4.7222409578949054e-05, "loss": 0.4299, "step": 24555 }, { "epoch": 0.8851407359354164, "grad_norm": 0.14057400822639465, "learning_rate": 4.7221072606300543e-05, "loss": 0.4235, "step": 24560 }, { "epoch": 0.885320935596641, "grad_norm": 0.17242777347564697, "learning_rate": 4.721973533089421e-05, "loss": 0.4452, "step": 24565 }, { "epoch": 0.8855011352578657, "grad_norm": 0.14792689681053162, "learning_rate": 4.721866529259576e-05, "loss": 0.4112, "step": 24570 }, { "epoch": 0.8856813349190904, "grad_norm": 0.20297929644584656, "learning_rate": 4.7217327472271283e-05, "loss": 0.4249, "step": 24575 }, { "epoch": 0.885861534580315, "grad_norm": 0.19619841873645782, "learning_rate": 4.7215989349240026e-05, "loss": 0.4288, "step": 24580 }, { "epoch": 0.8860417342415396, "grad_norm": 0.15129470825195312, "learning_rate": 4.721465092352021e-05, 
"loss": 0.4335, "step": 24585 }, { "epoch": 0.8862219339027643, "grad_norm": 0.18301598727703094, "learning_rate": 4.7213312195130076e-05, "loss": 0.4433, "step": 24590 }, { "epoch": 0.8864021335639889, "grad_norm": 0.16560180485248566, "learning_rate": 4.721197316408787e-05, "loss": 0.4086, "step": 24595 }, { "epoch": 0.8865823332252135, "grad_norm": 0.1662699282169342, "learning_rate": 4.721063383041182e-05, "loss": 0.4438, "step": 24600 }, { "epoch": 0.8867625328864381, "grad_norm": 0.2002149224281311, "learning_rate": 4.720929419412019e-05, "loss": 0.4215, "step": 24605 }, { "epoch": 0.8869427325476628, "grad_norm": 0.15055419504642487, "learning_rate": 4.720795425523122e-05, "loss": 0.4198, "step": 24610 }, { "epoch": 0.8871229322088875, "grad_norm": 0.18256331980228424, "learning_rate": 4.720661401376318e-05, "loss": 0.4618, "step": 24615 }, { "epoch": 0.8873031318701121, "grad_norm": 0.15107060968875885, "learning_rate": 4.7205273469734325e-05, "loss": 0.4164, "step": 24620 }, { "epoch": 0.8874833315313367, "grad_norm": 0.16118554770946503, "learning_rate": 4.7203932623162917e-05, "loss": 0.4657, "step": 24625 }, { "epoch": 0.8876635311925614, "grad_norm": 0.1474160999059677, "learning_rate": 4.720259147406722e-05, "loss": 0.4628, "step": 24630 }, { "epoch": 0.887843730853786, "grad_norm": 0.18264853954315186, "learning_rate": 4.720125002246552e-05, "loss": 0.4558, "step": 24635 }, { "epoch": 0.8880239305150106, "grad_norm": 0.2043534368276596, "learning_rate": 4.719990826837608e-05, "loss": 0.4473, "step": 24640 }, { "epoch": 0.8882041301762352, "grad_norm": 0.1613204926252365, "learning_rate": 4.719856621181719e-05, "loss": 0.4262, "step": 24645 }, { "epoch": 0.8883843298374599, "grad_norm": 0.13441753387451172, "learning_rate": 4.7197223852807136e-05, "loss": 0.4354, "step": 24650 }, { "epoch": 0.8885645294986846, "grad_norm": 0.221405029296875, "learning_rate": 4.71958811913642e-05, "loss": 0.4739, "step": 24655 }, { "epoch": 0.8887447291599092, 
"grad_norm": 0.18559223413467407, "learning_rate": 4.719453822750669e-05, "loss": 0.4838, "step": 24660 }, { "epoch": 0.8889249288211338, "grad_norm": 0.16596554219722748, "learning_rate": 4.7193194961252885e-05, "loss": 0.4106, "step": 24665 }, { "epoch": 0.8891051284823585, "grad_norm": 0.18130573630332947, "learning_rate": 4.7191851392621086e-05, "loss": 0.4411, "step": 24670 }, { "epoch": 0.8892853281435831, "grad_norm": 0.19347991049289703, "learning_rate": 4.719050752162962e-05, "loss": 0.4076, "step": 24675 }, { "epoch": 0.8894655278048077, "grad_norm": 0.18345235288143158, "learning_rate": 4.7189163348296794e-05, "loss": 0.3907, "step": 24680 }, { "epoch": 0.8896457274660323, "grad_norm": 0.18901300430297852, "learning_rate": 4.71878188726409e-05, "loss": 0.4677, "step": 24685 }, { "epoch": 0.889825927127257, "grad_norm": 0.17101964354515076, "learning_rate": 4.718647409468028e-05, "loss": 0.4077, "step": 24690 }, { "epoch": 0.8900061267884817, "grad_norm": 0.2040674090385437, "learning_rate": 4.7185129014433234e-05, "loss": 0.4288, "step": 24695 }, { "epoch": 0.8901863264497063, "grad_norm": 0.13120241463184357, "learning_rate": 4.71837836319181e-05, "loss": 0.4134, "step": 24700 }, { "epoch": 0.8903665261109309, "grad_norm": 0.18854455649852753, "learning_rate": 4.7182437947153216e-05, "loss": 0.4561, "step": 24705 }, { "epoch": 0.8905467257721555, "grad_norm": 0.14998573064804077, "learning_rate": 4.718109196015691e-05, "loss": 0.4087, "step": 24710 }, { "epoch": 0.8907269254333802, "grad_norm": 0.1941816210746765, "learning_rate": 4.717974567094752e-05, "loss": 0.4604, "step": 24715 }, { "epoch": 0.8909071250946048, "grad_norm": 0.16004863381385803, "learning_rate": 4.7178399079543386e-05, "loss": 0.4399, "step": 24720 }, { "epoch": 0.8910873247558294, "grad_norm": 0.192369744181633, "learning_rate": 4.717705218596286e-05, "loss": 0.4439, "step": 24725 }, { "epoch": 0.8912675244170541, "grad_norm": 0.2026439905166626, "learning_rate": 
4.717570499022429e-05, "loss": 0.4401, "step": 24730 }, { "epoch": 0.8914477240782788, "grad_norm": 0.1843568980693817, "learning_rate": 4.7174357492346035e-05, "loss": 0.4492, "step": 24735 }, { "epoch": 0.8916279237395034, "grad_norm": 0.17085200548171997, "learning_rate": 4.717300969234645e-05, "loss": 0.4401, "step": 24740 }, { "epoch": 0.891808123400728, "grad_norm": 0.15291930735111237, "learning_rate": 4.717166159024391e-05, "loss": 0.3833, "step": 24745 }, { "epoch": 0.8919883230619526, "grad_norm": 0.21157534420490265, "learning_rate": 4.717031318605676e-05, "loss": 0.4876, "step": 24750 }, { "epoch": 0.8921685227231773, "grad_norm": 0.1500784456729889, "learning_rate": 4.716896447980339e-05, "loss": 0.4665, "step": 24755 }, { "epoch": 0.8923487223844019, "grad_norm": 0.20307470858097076, "learning_rate": 4.716761547150218e-05, "loss": 0.4211, "step": 24760 }, { "epoch": 0.8925289220456265, "grad_norm": 0.21808555722236633, "learning_rate": 4.716626616117149e-05, "loss": 0.4722, "step": 24765 }, { "epoch": 0.8927091217068512, "grad_norm": 0.16036725044250488, "learning_rate": 4.7164916548829716e-05, "loss": 0.4207, "step": 24770 }, { "epoch": 0.8928893213680759, "grad_norm": 0.1857818365097046, "learning_rate": 4.716356663449525e-05, "loss": 0.4423, "step": 24775 }, { "epoch": 0.8930695210293005, "grad_norm": 0.16408823430538177, "learning_rate": 4.716221641818648e-05, "loss": 0.3968, "step": 24780 }, { "epoch": 0.8932497206905251, "grad_norm": 0.1698572039604187, "learning_rate": 4.716086589992179e-05, "loss": 0.4458, "step": 24785 }, { "epoch": 0.8934299203517497, "grad_norm": 0.17390035092830658, "learning_rate": 4.7159515079719606e-05, "loss": 0.4407, "step": 24790 }, { "epoch": 0.8936101200129744, "grad_norm": 0.321679025888443, "learning_rate": 4.715816395759832e-05, "loss": 0.4106, "step": 24795 }, { "epoch": 0.893790319674199, "grad_norm": 0.14596271514892578, "learning_rate": 4.715681253357633e-05, "loss": 0.4319, "step": 24800 }, { "epoch": 
0.8939705193354236, "grad_norm": 0.1981964260339737, "learning_rate": 4.715546080767207e-05, "loss": 0.4163, "step": 24805 }, { "epoch": 0.8941507189966483, "grad_norm": 0.13960348069667816, "learning_rate": 4.715410877990394e-05, "loss": 0.4762, "step": 24810 }, { "epoch": 0.894330918657873, "grad_norm": 0.13228678703308105, "learning_rate": 4.7152756450290365e-05, "loss": 0.4419, "step": 24815 }, { "epoch": 0.8945111183190976, "grad_norm": 0.13322246074676514, "learning_rate": 4.715140381884977e-05, "loss": 0.4458, "step": 24820 }, { "epoch": 0.8946913179803222, "grad_norm": 0.190599724650383, "learning_rate": 4.715005088560059e-05, "loss": 0.4297, "step": 24825 }, { "epoch": 0.8948715176415468, "grad_norm": 0.2033384144306183, "learning_rate": 4.714869765056126e-05, "loss": 0.4869, "step": 24830 }, { "epoch": 0.8950517173027714, "grad_norm": 0.1728903204202652, "learning_rate": 4.714734411375021e-05, "loss": 0.4397, "step": 24835 }, { "epoch": 0.8952319169639961, "grad_norm": 0.1565089374780655, "learning_rate": 4.714599027518588e-05, "loss": 0.4149, "step": 24840 }, { "epoch": 0.8954121166252208, "grad_norm": 0.14530473947525024, "learning_rate": 4.7144636134886725e-05, "loss": 0.4479, "step": 24845 }, { "epoch": 0.8955923162864454, "grad_norm": 0.18022358417510986, "learning_rate": 4.714328169287119e-05, "loss": 0.4322, "step": 24850 }, { "epoch": 0.89577251594767, "grad_norm": 0.20143041014671326, "learning_rate": 4.714192694915772e-05, "loss": 0.4065, "step": 24855 }, { "epoch": 0.8959527156088947, "grad_norm": 0.1598571240901947, "learning_rate": 4.7140571903764796e-05, "loss": 0.4355, "step": 24860 }, { "epoch": 0.8961329152701193, "grad_norm": 0.14883951842784882, "learning_rate": 4.713921655671086e-05, "loss": 0.4303, "step": 24865 }, { "epoch": 0.8963131149313439, "grad_norm": 0.22746653854846954, "learning_rate": 4.713786090801438e-05, "loss": 0.414, "step": 24870 }, { "epoch": 0.8964933145925685, "grad_norm": 0.1794048696756363, "learning_rate": 
4.713650495769384e-05, "loss": 0.441, "step": 24875 }, { "epoch": 0.8966735142537932, "grad_norm": 0.1713542640209198, "learning_rate": 4.713514870576769e-05, "loss": 0.4191, "step": 24880 }, { "epoch": 0.8968537139150179, "grad_norm": 0.189683198928833, "learning_rate": 4.713379215225444e-05, "loss": 0.4236, "step": 24885 }, { "epoch": 0.8970339135762425, "grad_norm": 0.13822448253631592, "learning_rate": 4.713243529717256e-05, "loss": 0.4481, "step": 24890 }, { "epoch": 0.8972141132374671, "grad_norm": 0.1470467448234558, "learning_rate": 4.713107814054052e-05, "loss": 0.4471, "step": 24895 }, { "epoch": 0.8973943128986918, "grad_norm": 0.23228789865970612, "learning_rate": 4.7129720682376835e-05, "loss": 0.4273, "step": 24900 }, { "epoch": 0.8975745125599164, "grad_norm": 0.16581259667873383, "learning_rate": 4.712836292269999e-05, "loss": 0.4401, "step": 24905 }, { "epoch": 0.897754712221141, "grad_norm": 0.20045721530914307, "learning_rate": 4.712700486152848e-05, "loss": 0.4381, "step": 24910 }, { "epoch": 0.8979349118823656, "grad_norm": 0.1778937131166458, "learning_rate": 4.712564649888081e-05, "loss": 0.4466, "step": 24915 }, { "epoch": 0.8981151115435902, "grad_norm": 0.22372958064079285, "learning_rate": 4.7124287834775496e-05, "loss": 0.4296, "step": 24920 }, { "epoch": 0.898295311204815, "grad_norm": 0.15579721331596375, "learning_rate": 4.7122928869231044e-05, "loss": 0.4306, "step": 24925 }, { "epoch": 0.8984755108660396, "grad_norm": 0.1593790054321289, "learning_rate": 4.712156960226597e-05, "loss": 0.4518, "step": 24930 }, { "epoch": 0.8986557105272642, "grad_norm": 0.18624834716320038, "learning_rate": 4.7120210033898784e-05, "loss": 0.4181, "step": 24935 }, { "epoch": 0.8988359101884889, "grad_norm": 0.19123771786689758, "learning_rate": 4.711885016414802e-05, "loss": 0.4106, "step": 24940 }, { "epoch": 0.8990161098497135, "grad_norm": 0.1681036353111267, "learning_rate": 4.7117489993032216e-05, "loss": 0.4661, "step": 24945 }, { "epoch": 
0.8991963095109381, "grad_norm": 0.16487166285514832, "learning_rate": 4.711612952056988e-05, "loss": 0.4357, "step": 24950 }, { "epoch": 0.8993765091721627, "grad_norm": 0.1486726552248001, "learning_rate": 4.711476874677957e-05, "loss": 0.4281, "step": 24955 }, { "epoch": 0.8995567088333873, "grad_norm": 0.1914074718952179, "learning_rate": 4.711340767167982e-05, "loss": 0.4557, "step": 24960 }, { "epoch": 0.8997369084946121, "grad_norm": 0.17185872793197632, "learning_rate": 4.711204629528917e-05, "loss": 0.4334, "step": 24965 }, { "epoch": 0.8999171081558367, "grad_norm": 0.15413062274456024, "learning_rate": 4.711068461762617e-05, "loss": 0.4027, "step": 24970 }, { "epoch": 0.9000973078170613, "grad_norm": 0.2285022735595703, "learning_rate": 4.710932263870936e-05, "loss": 0.423, "step": 24975 }, { "epoch": 0.9002775074782859, "grad_norm": 0.18323256075382233, "learning_rate": 4.710796035855732e-05, "loss": 0.4656, "step": 24980 }, { "epoch": 0.9004577071395106, "grad_norm": 0.19098562002182007, "learning_rate": 4.71065977771886e-05, "loss": 0.4367, "step": 24985 }, { "epoch": 0.9006379068007352, "grad_norm": 0.14779239892959595, "learning_rate": 4.710523489462177e-05, "loss": 0.4426, "step": 24990 }, { "epoch": 0.9008181064619598, "grad_norm": 0.1570071429014206, "learning_rate": 4.710387171087539e-05, "loss": 0.4674, "step": 24995 }, { "epoch": 0.9009983061231844, "grad_norm": 0.1645687371492386, "learning_rate": 4.7102508225968035e-05, "loss": 0.4766, "step": 25000 }, { "epoch": 0.9009983061231844, "eval_loss": 0.4570350646972656, "eval_runtime": 3.5365, "eval_samples_per_second": 28.276, "eval_steps_per_second": 7.069, "step": 25000 }, { "epoch": 0.9011785057844092, "grad_norm": 0.15871667861938477, "learning_rate": 4.7101144439918287e-05, "loss": 0.4376, "step": 25005 }, { "epoch": 0.9013587054456338, "grad_norm": 0.1973850131034851, "learning_rate": 4.709978035274473e-05, "loss": 0.446, "step": 25010 }, { "epoch": 0.9015389051068584, "grad_norm": 
0.15039071440696716, "learning_rate": 4.709841596446594e-05, "loss": 0.4431, "step": 25015 }, { "epoch": 0.901719104768083, "grad_norm": 0.16307663917541504, "learning_rate": 4.70970512751005e-05, "loss": 0.4178, "step": 25020 }, { "epoch": 0.9018993044293077, "grad_norm": 0.17655940353870392, "learning_rate": 4.709568628466703e-05, "loss": 0.4112, "step": 25025 }, { "epoch": 0.9020795040905323, "grad_norm": 0.21620622277259827, "learning_rate": 4.709432099318411e-05, "loss": 0.4354, "step": 25030 }, { "epoch": 0.9022597037517569, "grad_norm": 0.17951133847236633, "learning_rate": 4.7092955400670336e-05, "loss": 0.4332, "step": 25035 }, { "epoch": 0.9024399034129816, "grad_norm": 0.15933279693126678, "learning_rate": 4.7091589507144326e-05, "loss": 0.3898, "step": 25040 }, { "epoch": 0.9026201030742063, "grad_norm": 0.1578451693058014, "learning_rate": 4.7090223312624683e-05, "loss": 0.4215, "step": 25045 }, { "epoch": 0.9028003027354309, "grad_norm": 0.1939324289560318, "learning_rate": 4.708885681713003e-05, "loss": 0.4807, "step": 25050 }, { "epoch": 0.9029805023966555, "grad_norm": 0.1886306256055832, "learning_rate": 4.708749002067897e-05, "loss": 0.4057, "step": 25055 }, { "epoch": 0.9031607020578801, "grad_norm": 0.1769237220287323, "learning_rate": 4.708612292329015e-05, "loss": 0.4619, "step": 25060 }, { "epoch": 0.9033409017191047, "grad_norm": 0.17207685112953186, "learning_rate": 4.7084755524982175e-05, "loss": 0.4632, "step": 25065 }, { "epoch": 0.9035211013803294, "grad_norm": 0.17112012207508087, "learning_rate": 4.7083387825773676e-05, "loss": 0.4322, "step": 25070 }, { "epoch": 0.903701301041554, "grad_norm": 0.1867535561323166, "learning_rate": 4.70820198256833e-05, "loss": 0.4359, "step": 25075 }, { "epoch": 0.9038815007027787, "grad_norm": 0.21555501222610474, "learning_rate": 4.708065152472967e-05, "loss": 0.4375, "step": 25080 }, { "epoch": 0.9040617003640034, "grad_norm": 0.16034695506095886, "learning_rate": 4.707928292293144e-05, "loss": 
0.4587, "step": 25085 }, { "epoch": 0.904241900025228, "grad_norm": 0.15322132408618927, "learning_rate": 4.7077914020307266e-05, "loss": 0.4422, "step": 25090 }, { "epoch": 0.9044220996864526, "grad_norm": 0.16674070060253143, "learning_rate": 4.707654481687578e-05, "loss": 0.4216, "step": 25095 }, { "epoch": 0.9046022993476772, "grad_norm": 0.16037312150001526, "learning_rate": 4.707517531265565e-05, "loss": 0.4055, "step": 25100 }, { "epoch": 0.9047824990089018, "grad_norm": 0.17139527201652527, "learning_rate": 4.707380550766553e-05, "loss": 0.4565, "step": 25105 }, { "epoch": 0.9049626986701265, "grad_norm": 0.1893669068813324, "learning_rate": 4.7072435401924075e-05, "loss": 0.4536, "step": 25110 }, { "epoch": 0.9051428983313511, "grad_norm": 0.17662879824638367, "learning_rate": 4.7071064995449964e-05, "loss": 0.4653, "step": 25115 }, { "epoch": 0.9053230979925758, "grad_norm": 0.16549894213676453, "learning_rate": 4.7069694288261864e-05, "loss": 0.402, "step": 25120 }, { "epoch": 0.9055032976538004, "grad_norm": 0.1662409007549286, "learning_rate": 4.706832328037846e-05, "loss": 0.4282, "step": 25125 }, { "epoch": 0.9056834973150251, "grad_norm": 0.21818654239177704, "learning_rate": 4.706695197181842e-05, "loss": 0.4141, "step": 25130 }, { "epoch": 0.9058636969762497, "grad_norm": 0.14861366152763367, "learning_rate": 4.706558036260042e-05, "loss": 0.4126, "step": 25135 }, { "epoch": 0.9060438966374743, "grad_norm": 0.17982593178749084, "learning_rate": 4.7064208452743174e-05, "loss": 0.4186, "step": 25140 }, { "epoch": 0.9062240962986989, "grad_norm": 0.19204726815223694, "learning_rate": 4.706283624226536e-05, "loss": 0.4466, "step": 25145 }, { "epoch": 0.9064042959599236, "grad_norm": 0.20537865161895752, "learning_rate": 4.7061463731185676e-05, "loss": 0.4208, "step": 25150 }, { "epoch": 0.9065844956211482, "grad_norm": 0.12418222427368164, "learning_rate": 4.7060090919522806e-05, "loss": 0.4412, "step": 25155 }, { "epoch": 0.9067646952823729, 
"grad_norm": 0.15775543451309204, "learning_rate": 4.705871780729548e-05, "loss": 0.4425, "step": 25160 }, { "epoch": 0.9069448949435975, "grad_norm": 0.15570229291915894, "learning_rate": 4.705734439452239e-05, "loss": 0.445, "step": 25165 }, { "epoch": 0.9071250946048222, "grad_norm": 0.18232616782188416, "learning_rate": 4.705597068122225e-05, "loss": 0.4503, "step": 25170 }, { "epoch": 0.9073052942660468, "grad_norm": 0.20230647921562195, "learning_rate": 4.705459666741379e-05, "loss": 0.4519, "step": 25175 }, { "epoch": 0.9074854939272714, "grad_norm": 0.1548745334148407, "learning_rate": 4.705322235311571e-05, "loss": 0.3896, "step": 25180 }, { "epoch": 0.907665693588496, "grad_norm": 0.19131234288215637, "learning_rate": 4.705184773834675e-05, "loss": 0.4694, "step": 25185 }, { "epoch": 0.9078458932497206, "grad_norm": 0.17528343200683594, "learning_rate": 4.705047282312563e-05, "loss": 0.4463, "step": 25190 }, { "epoch": 0.9080260929109454, "grad_norm": 0.16300806403160095, "learning_rate": 4.704909760747109e-05, "loss": 0.4469, "step": 25195 }, { "epoch": 0.90820629257217, "grad_norm": 0.14464612305164337, "learning_rate": 4.704772209140186e-05, "loss": 0.4079, "step": 25200 }, { "epoch": 0.9083864922333946, "grad_norm": 0.19416652619838715, "learning_rate": 4.704634627493669e-05, "loss": 0.4484, "step": 25205 }, { "epoch": 0.9085666918946192, "grad_norm": 0.18099582195281982, "learning_rate": 4.704497015809432e-05, "loss": 0.4251, "step": 25210 }, { "epoch": 0.9087468915558439, "grad_norm": 0.19250603020191193, "learning_rate": 4.70435937408935e-05, "loss": 0.434, "step": 25215 }, { "epoch": 0.9089270912170685, "grad_norm": 0.17500755190849304, "learning_rate": 4.704221702335298e-05, "loss": 0.3891, "step": 25220 }, { "epoch": 0.9091072908782931, "grad_norm": 0.1807602047920227, "learning_rate": 4.7040840005491526e-05, "loss": 0.4505, "step": 25225 }, { "epoch": 0.9092874905395177, "grad_norm": 0.1530625820159912, "learning_rate": 4.7039462687327885e-05, 
"loss": 0.4264, "step": 25230 }, { "epoch": 0.9094676902007425, "grad_norm": 0.1661711484193802, "learning_rate": 4.703808506888084e-05, "loss": 0.4535, "step": 25235 }, { "epoch": 0.9096478898619671, "grad_norm": 0.14613687992095947, "learning_rate": 4.7036707150169145e-05, "loss": 0.4152, "step": 25240 }, { "epoch": 0.9098280895231917, "grad_norm": 0.18025970458984375, "learning_rate": 4.703532893121159e-05, "loss": 0.4294, "step": 25245 }, { "epoch": 0.9100082891844163, "grad_norm": 0.14726756513118744, "learning_rate": 4.703395041202694e-05, "loss": 0.4612, "step": 25250 }, { "epoch": 0.910188488845641, "grad_norm": 0.19874265789985657, "learning_rate": 4.703257159263398e-05, "loss": 0.4415, "step": 25255 }, { "epoch": 0.9103686885068656, "grad_norm": 0.17537064850330353, "learning_rate": 4.70311924730515e-05, "loss": 0.4454, "step": 25260 }, { "epoch": 0.9105488881680902, "grad_norm": 0.1721314936876297, "learning_rate": 4.702981305329829e-05, "loss": 0.4314, "step": 25265 }, { "epoch": 0.9107290878293148, "grad_norm": 0.16811582446098328, "learning_rate": 4.702843333339314e-05, "loss": 0.4317, "step": 25270 }, { "epoch": 0.9109092874905396, "grad_norm": 0.19957655668258667, "learning_rate": 4.702705331335485e-05, "loss": 0.4278, "step": 25275 }, { "epoch": 0.9110894871517642, "grad_norm": 0.17485888302326202, "learning_rate": 4.702567299320223e-05, "loss": 0.4501, "step": 25280 }, { "epoch": 0.9112696868129888, "grad_norm": 0.13707765936851501, "learning_rate": 4.702429237295407e-05, "loss": 0.4126, "step": 25285 }, { "epoch": 0.9114498864742134, "grad_norm": 0.182311549782753, "learning_rate": 4.702291145262919e-05, "loss": 0.4308, "step": 25290 }, { "epoch": 0.9116300861354381, "grad_norm": 0.16993586719036102, "learning_rate": 4.702153023224641e-05, "loss": 0.4155, "step": 25295 }, { "epoch": 0.9118102857966627, "grad_norm": 0.16655287146568298, "learning_rate": 4.7020148711824546e-05, "loss": 0.4481, "step": 25300 }, { "epoch": 0.9119904854578873, 
"grad_norm": 0.1623249053955078, "learning_rate": 4.701876689138242e-05, "loss": 0.4569, "step": 25305 }, { "epoch": 0.9121706851191119, "grad_norm": 0.15807580947875977, "learning_rate": 4.701738477093885e-05, "loss": 0.4459, "step": 25310 }, { "epoch": 0.9123508847803367, "grad_norm": 0.20741410553455353, "learning_rate": 4.701600235051268e-05, "loss": 0.4314, "step": 25315 }, { "epoch": 0.9125310844415613, "grad_norm": 0.1682579219341278, "learning_rate": 4.701461963012274e-05, "loss": 0.4152, "step": 25320 }, { "epoch": 0.9127112841027859, "grad_norm": 0.15219363570213318, "learning_rate": 4.701323660978787e-05, "loss": 0.415, "step": 25325 }, { "epoch": 0.9128914837640105, "grad_norm": 0.1439961940050125, "learning_rate": 4.701185328952692e-05, "loss": 0.4282, "step": 25330 }, { "epoch": 0.9130716834252351, "grad_norm": 0.18314993381500244, "learning_rate": 4.701046966935872e-05, "loss": 0.4325, "step": 25335 }, { "epoch": 0.9132518830864598, "grad_norm": 0.19595153629779816, "learning_rate": 4.700908574930213e-05, "loss": 0.4474, "step": 25340 }, { "epoch": 0.9134320827476844, "grad_norm": 0.14837300777435303, "learning_rate": 4.700770152937601e-05, "loss": 0.4244, "step": 25345 }, { "epoch": 0.9136122824089091, "grad_norm": 0.19343934953212738, "learning_rate": 4.700631700959923e-05, "loss": 0.4336, "step": 25350 }, { "epoch": 0.9137924820701337, "grad_norm": 0.15232184529304504, "learning_rate": 4.700493218999063e-05, "loss": 0.4504, "step": 25355 }, { "epoch": 0.9139726817313584, "grad_norm": 0.15530715882778168, "learning_rate": 4.700354707056909e-05, "loss": 0.4485, "step": 25360 }, { "epoch": 0.914152881392583, "grad_norm": 0.19562314450740814, "learning_rate": 4.7002161651353485e-05, "loss": 0.46, "step": 25365 }, { "epoch": 0.9143330810538076, "grad_norm": 0.17155522108078003, "learning_rate": 4.7000775932362684e-05, "loss": 0.4414, "step": 25370 }, { "epoch": 0.9145132807150322, "grad_norm": 0.17853830754756927, "learning_rate": 
4.699938991361558e-05, "loss": 0.4158, "step": 25375 }, { "epoch": 0.9146934803762569, "grad_norm": 0.17986997961997986, "learning_rate": 4.6998003595131035e-05, "loss": 0.4345, "step": 25380 }, { "epoch": 0.9148736800374815, "grad_norm": 0.16720250248908997, "learning_rate": 4.699661697692796e-05, "loss": 0.4374, "step": 25385 }, { "epoch": 0.9150538796987062, "grad_norm": 0.18035994470119476, "learning_rate": 4.699523005902522e-05, "loss": 0.423, "step": 25390 }, { "epoch": 0.9152340793599308, "grad_norm": 0.17745894193649292, "learning_rate": 4.699384284144174e-05, "loss": 0.4273, "step": 25395 }, { "epoch": 0.9154142790211555, "grad_norm": 0.1389733850955963, "learning_rate": 4.699245532419642e-05, "loss": 0.4206, "step": 25400 }, { "epoch": 0.9155944786823801, "grad_norm": 0.16255025565624237, "learning_rate": 4.699106750730814e-05, "loss": 0.4705, "step": 25405 }, { "epoch": 0.9157746783436047, "grad_norm": 0.16465483605861664, "learning_rate": 4.6989679390795826e-05, "loss": 0.4509, "step": 25410 }, { "epoch": 0.9159548780048293, "grad_norm": 0.16897955536842346, "learning_rate": 4.6988290974678384e-05, "loss": 0.4106, "step": 25415 }, { "epoch": 0.916135077666054, "grad_norm": 0.17163941264152527, "learning_rate": 4.698690225897474e-05, "loss": 0.4134, "step": 25420 }, { "epoch": 0.9163152773272786, "grad_norm": 0.16389816999435425, "learning_rate": 4.698551324370381e-05, "loss": 0.4388, "step": 25425 }, { "epoch": 0.9164954769885033, "grad_norm": 0.17674943804740906, "learning_rate": 4.698412392888452e-05, "loss": 0.4547, "step": 25430 }, { "epoch": 0.9166756766497279, "grad_norm": 0.1594432145357132, "learning_rate": 4.698273431453579e-05, "loss": 0.4396, "step": 25435 }, { "epoch": 0.9168558763109526, "grad_norm": 0.17538687586784363, "learning_rate": 4.6981344400676566e-05, "loss": 0.4724, "step": 25440 }, { "epoch": 0.9170360759721772, "grad_norm": 0.1810724139213562, "learning_rate": 4.697995418732578e-05, "loss": 0.4075, "step": 25445 }, { "epoch": 
0.9172162756334018, "grad_norm": 0.15371228754520416, "learning_rate": 4.6978563674502375e-05, "loss": 0.4578, "step": 25450 }, { "epoch": 0.9173964752946264, "grad_norm": 0.14616931974887848, "learning_rate": 4.6977172862225294e-05, "loss": 0.4127, "step": 25455 }, { "epoch": 0.917576674955851, "grad_norm": 0.19712692499160767, "learning_rate": 4.697578175051348e-05, "loss": 0.4611, "step": 25460 }, { "epoch": 0.9177568746170757, "grad_norm": 0.16714803874492645, "learning_rate": 4.69743903393859e-05, "loss": 0.4397, "step": 25465 }, { "epoch": 0.9179370742783004, "grad_norm": 0.15617181360721588, "learning_rate": 4.6972998628861506e-05, "loss": 0.3992, "step": 25470 }, { "epoch": 0.918117273939525, "grad_norm": 0.17703229188919067, "learning_rate": 4.697160661895927e-05, "loss": 0.4722, "step": 25475 }, { "epoch": 0.9182974736007496, "grad_norm": 0.17935994267463684, "learning_rate": 4.6970214309698134e-05, "loss": 0.4434, "step": 25480 }, { "epoch": 0.9184776732619743, "grad_norm": 0.22057127952575684, "learning_rate": 4.6968821701097086e-05, "loss": 0.4237, "step": 25485 }, { "epoch": 0.9186578729231989, "grad_norm": 0.17813293635845184, "learning_rate": 4.69674287931751e-05, "loss": 0.4612, "step": 25490 }, { "epoch": 0.9188380725844235, "grad_norm": 0.20391541719436646, "learning_rate": 4.696603558595115e-05, "loss": 0.4252, "step": 25495 }, { "epoch": 0.9190182722456481, "grad_norm": 0.1561935395002365, "learning_rate": 4.696464207944421e-05, "loss": 0.4011, "step": 25500 }, { "epoch": 0.9190182722456481, "eval_loss": 0.4568355083465576, "eval_runtime": 3.5356, "eval_samples_per_second": 28.284, "eval_steps_per_second": 7.071, "step": 25500 }, { "epoch": 0.9191984719068728, "grad_norm": 0.18729767203330994, "learning_rate": 4.696324827367328e-05, "loss": 0.4199, "step": 25505 }, { "epoch": 0.9193786715680975, "grad_norm": 0.19635900855064392, "learning_rate": 4.696185416865734e-05, "loss": 0.4135, "step": 25510 }, { "epoch": 0.9195588712293221, "grad_norm": 
0.1535876840353012, "learning_rate": 4.6960459764415386e-05, "loss": 0.4719, "step": 25515 }, { "epoch": 0.9197390708905467, "grad_norm": 0.16010642051696777, "learning_rate": 4.695906506096643e-05, "loss": 0.432, "step": 25520 }, { "epoch": 0.9199192705517714, "grad_norm": 0.1595637947320938, "learning_rate": 4.6957670058329464e-05, "loss": 0.4195, "step": 25525 }, { "epoch": 0.920099470212996, "grad_norm": 0.15828733146190643, "learning_rate": 4.6956274756523484e-05, "loss": 0.4074, "step": 25530 }, { "epoch": 0.9202796698742206, "grad_norm": 0.16714178025722504, "learning_rate": 4.695487915556752e-05, "loss": 0.4126, "step": 25535 }, { "epoch": 0.9204598695354452, "grad_norm": 0.1568571776151657, "learning_rate": 4.695348325548057e-05, "loss": 0.451, "step": 25540 }, { "epoch": 0.92064006919667, "grad_norm": 0.1514507383108139, "learning_rate": 4.695208705628167e-05, "loss": 0.4052, "step": 25545 }, { "epoch": 0.9208202688578946, "grad_norm": 0.15713444352149963, "learning_rate": 4.695069055798983e-05, "loss": 0.4466, "step": 25550 }, { "epoch": 0.9210004685191192, "grad_norm": 0.1931026428937912, "learning_rate": 4.694929376062408e-05, "loss": 0.4462, "step": 25555 }, { "epoch": 0.9211806681803438, "grad_norm": 0.1435370147228241, "learning_rate": 4.694789666420345e-05, "loss": 0.4197, "step": 25560 }, { "epoch": 0.9213608678415685, "grad_norm": 0.15287569165229797, "learning_rate": 4.694649926874698e-05, "loss": 0.4207, "step": 25565 }, { "epoch": 0.9215410675027931, "grad_norm": 0.17821337282657623, "learning_rate": 4.694510157427371e-05, "loss": 0.4436, "step": 25570 }, { "epoch": 0.9217212671640177, "grad_norm": 0.1588643342256546, "learning_rate": 4.694370358080267e-05, "loss": 0.4427, "step": 25575 }, { "epoch": 0.9219014668252423, "grad_norm": 0.2273692488670349, "learning_rate": 4.6942305288352926e-05, "loss": 0.4394, "step": 25580 }, { "epoch": 0.922081666486467, "grad_norm": 0.1667856127023697, "learning_rate": 4.694090669694351e-05, "loss": 0.4425, 
"step": 25585 }, { "epoch": 0.9222618661476917, "grad_norm": 0.17408248782157898, "learning_rate": 4.69395078065935e-05, "loss": 0.4393, "step": 25590 }, { "epoch": 0.9224420658089163, "grad_norm": 0.17394454777240753, "learning_rate": 4.693810861732194e-05, "loss": 0.4212, "step": 25595 }, { "epoch": 0.9226222654701409, "grad_norm": 0.21937377750873566, "learning_rate": 4.69367091291479e-05, "loss": 0.4295, "step": 25600 }, { "epoch": 0.9228024651313655, "grad_norm": 0.17998327314853668, "learning_rate": 4.693530934209044e-05, "loss": 0.4028, "step": 25605 }, { "epoch": 0.9229826647925902, "grad_norm": 0.14169585704803467, "learning_rate": 4.693390925616864e-05, "loss": 0.4253, "step": 25610 }, { "epoch": 0.9231628644538148, "grad_norm": 0.16097652912139893, "learning_rate": 4.693250887140157e-05, "loss": 0.4052, "step": 25615 }, { "epoch": 0.9233430641150394, "grad_norm": 0.14775985479354858, "learning_rate": 4.6931108187808316e-05, "loss": 0.4093, "step": 25620 }, { "epoch": 0.9235232637762641, "grad_norm": 0.1724456250667572, "learning_rate": 4.692970720540796e-05, "loss": 0.4383, "step": 25625 }, { "epoch": 0.9237034634374888, "grad_norm": 0.14798258244991302, "learning_rate": 4.6928305924219587e-05, "loss": 0.441, "step": 25630 }, { "epoch": 0.9238836630987134, "grad_norm": 0.16991879045963287, "learning_rate": 4.692690434426229e-05, "loss": 0.4178, "step": 25635 }, { "epoch": 0.924063862759938, "grad_norm": 0.1819402128458023, "learning_rate": 4.692550246555517e-05, "loss": 0.4375, "step": 25640 }, { "epoch": 0.9242440624211626, "grad_norm": 0.17619052529335022, "learning_rate": 4.692410028811732e-05, "loss": 0.4423, "step": 25645 }, { "epoch": 0.9244242620823873, "grad_norm": 0.17652717232704163, "learning_rate": 4.692269781196785e-05, "loss": 0.4498, "step": 25650 }, { "epoch": 0.9246044617436119, "grad_norm": 0.1433330625295639, "learning_rate": 4.692129503712587e-05, "loss": 0.4225, "step": 25655 }, { "epoch": 0.9247846614048365, "grad_norm": 
0.18287613987922668, "learning_rate": 4.6919891963610485e-05, "loss": 0.4516, "step": 25660 }, { "epoch": 0.9249648610660612, "grad_norm": 0.17915308475494385, "learning_rate": 4.691848859144081e-05, "loss": 0.4276, "step": 25665 }, { "epoch": 0.9251450607272859, "grad_norm": 0.16471678018569946, "learning_rate": 4.691708492063598e-05, "loss": 0.4445, "step": 25670 }, { "epoch": 0.9253252603885105, "grad_norm": 0.18665508925914764, "learning_rate": 4.6915680951215114e-05, "loss": 0.4602, "step": 25675 }, { "epoch": 0.9255054600497351, "grad_norm": 0.1585237830877304, "learning_rate": 4.6914276683197334e-05, "loss": 0.4243, "step": 25680 }, { "epoch": 0.9256856597109597, "grad_norm": 0.1394348442554474, "learning_rate": 4.6912872116601776e-05, "loss": 0.3881, "step": 25685 }, { "epoch": 0.9258658593721844, "grad_norm": 0.14394313097000122, "learning_rate": 4.6911467251447574e-05, "loss": 0.424, "step": 25690 }, { "epoch": 0.926046059033409, "grad_norm": 0.20186100900173187, "learning_rate": 4.691006208775388e-05, "loss": 0.4345, "step": 25695 }, { "epoch": 0.9262262586946337, "grad_norm": 0.16559197008609772, "learning_rate": 4.690865662553983e-05, "loss": 0.4314, "step": 25700 }, { "epoch": 0.9264064583558583, "grad_norm": 0.2052149623632431, "learning_rate": 4.690725086482457e-05, "loss": 0.4128, "step": 25705 }, { "epoch": 0.926586658017083, "grad_norm": 0.1739431619644165, "learning_rate": 4.690584480562726e-05, "loss": 0.3859, "step": 25710 }, { "epoch": 0.9267668576783076, "grad_norm": 0.17957325279712677, "learning_rate": 4.6904438447967064e-05, "loss": 0.429, "step": 25715 }, { "epoch": 0.9269470573395322, "grad_norm": 0.16392971575260162, "learning_rate": 4.690303179186313e-05, "loss": 0.4385, "step": 25720 }, { "epoch": 0.9271272570007568, "grad_norm": 0.20912306010723114, "learning_rate": 4.690162483733462e-05, "loss": 0.4398, "step": 25725 }, { "epoch": 0.9273074566619814, "grad_norm": 0.18170976638793945, "learning_rate": 4.690021758440072e-05, "loss": 
0.4031, "step": 25730 }, { "epoch": 0.9274876563232061, "grad_norm": 0.24062508344650269, "learning_rate": 4.68988100330806e-05, "loss": 0.4034, "step": 25735 }, { "epoch": 0.9276678559844308, "grad_norm": 0.14883898198604584, "learning_rate": 4.689740218339342e-05, "loss": 0.4644, "step": 25740 }, { "epoch": 0.9278480556456554, "grad_norm": 0.19792114198207855, "learning_rate": 4.689599403535839e-05, "loss": 0.4129, "step": 25745 }, { "epoch": 0.92802825530688, "grad_norm": 0.16648323833942413, "learning_rate": 4.6894585588994676e-05, "loss": 0.4379, "step": 25750 }, { "epoch": 0.9282084549681047, "grad_norm": 0.18078407645225525, "learning_rate": 4.689317684432147e-05, "loss": 0.4586, "step": 25755 }, { "epoch": 0.9283886546293293, "grad_norm": 0.14701730012893677, "learning_rate": 4.689176780135797e-05, "loss": 0.4344, "step": 25760 }, { "epoch": 0.9285688542905539, "grad_norm": 0.1665094792842865, "learning_rate": 4.689035846012336e-05, "loss": 0.4549, "step": 25765 }, { "epoch": 0.9287490539517785, "grad_norm": 0.19282084703445435, "learning_rate": 4.688894882063687e-05, "loss": 0.4313, "step": 25770 }, { "epoch": 0.9289292536130032, "grad_norm": 0.18455705046653748, "learning_rate": 4.688753888291768e-05, "loss": 0.4461, "step": 25775 }, { "epoch": 0.9291094532742279, "grad_norm": 0.1790475845336914, "learning_rate": 4.688612864698502e-05, "loss": 0.4302, "step": 25780 }, { "epoch": 0.9292896529354525, "grad_norm": 0.16020038723945618, "learning_rate": 4.6884718112858085e-05, "loss": 0.4316, "step": 25785 }, { "epoch": 0.9294698525966771, "grad_norm": 0.22309835255146027, "learning_rate": 4.688330728055611e-05, "loss": 0.4452, "step": 25790 }, { "epoch": 0.9296500522579018, "grad_norm": 0.1724500060081482, "learning_rate": 4.68818961500983e-05, "loss": 0.4117, "step": 25795 }, { "epoch": 0.9298302519191264, "grad_norm": 0.1488490104675293, "learning_rate": 4.68804847215039e-05, "loss": 0.4567, "step": 25800 }, { "epoch": 0.930010451580351, "grad_norm": 
0.18638138473033905, "learning_rate": 4.6879072994792126e-05, "loss": 0.4115, "step": 25805 }, { "epoch": 0.9301906512415756, "grad_norm": 0.1752612441778183, "learning_rate": 4.687766096998223e-05, "loss": 0.4133, "step": 25810 }, { "epoch": 0.9303708509028003, "grad_norm": 0.17065556347370148, "learning_rate": 4.6876248647093424e-05, "loss": 0.4351, "step": 25815 }, { "epoch": 0.930551050564025, "grad_norm": 0.15603967010974884, "learning_rate": 4.687483602614497e-05, "loss": 0.4381, "step": 25820 }, { "epoch": 0.9307312502252496, "grad_norm": 0.1808996945619583, "learning_rate": 4.687342310715612e-05, "loss": 0.4155, "step": 25825 }, { "epoch": 0.9309114498864742, "grad_norm": 0.17924173176288605, "learning_rate": 4.687200989014611e-05, "loss": 0.4348, "step": 25830 }, { "epoch": 0.9310916495476989, "grad_norm": 0.17122434079647064, "learning_rate": 4.687059637513419e-05, "loss": 0.4378, "step": 25835 }, { "epoch": 0.9312718492089235, "grad_norm": 0.1833307445049286, "learning_rate": 4.686918256213964e-05, "loss": 0.4553, "step": 25840 }, { "epoch": 0.9314520488701481, "grad_norm": 0.14123891294002533, "learning_rate": 4.6867768451181706e-05, "loss": 0.4414, "step": 25845 }, { "epoch": 0.9316322485313727, "grad_norm": 0.18901309370994568, "learning_rate": 4.6866354042279666e-05, "loss": 0.4157, "step": 25850 }, { "epoch": 0.9318124481925975, "grad_norm": 0.1698739230632782, "learning_rate": 4.686493933545278e-05, "loss": 0.4427, "step": 25855 }, { "epoch": 0.9319926478538221, "grad_norm": 0.14457178115844727, "learning_rate": 4.686352433072033e-05, "loss": 0.4243, "step": 25860 }, { "epoch": 0.9321728475150467, "grad_norm": 0.18088513612747192, "learning_rate": 4.6862109028101596e-05, "loss": 0.4398, "step": 25865 }, { "epoch": 0.9323530471762713, "grad_norm": 0.17031700909137726, "learning_rate": 4.686069342761585e-05, "loss": 0.4415, "step": 25870 }, { "epoch": 0.9325332468374959, "grad_norm": 0.16327226161956787, "learning_rate": 4.6859277529282406e-05, 
"loss": 0.4316, "step": 25875 }, { "epoch": 0.9327134464987206, "grad_norm": 0.15043266117572784, "learning_rate": 4.6857861333120525e-05, "loss": 0.4595, "step": 25880 }, { "epoch": 0.9328936461599452, "grad_norm": 0.1847275346517563, "learning_rate": 4.685644483914952e-05, "loss": 0.4269, "step": 25885 }, { "epoch": 0.9330738458211698, "grad_norm": 0.1813531517982483, "learning_rate": 4.685502804738868e-05, "loss": 0.4171, "step": 25890 }, { "epoch": 0.9332540454823945, "grad_norm": 0.16259461641311646, "learning_rate": 4.685361095785732e-05, "loss": 0.4319, "step": 25895 }, { "epoch": 0.9334342451436192, "grad_norm": 0.17456987500190735, "learning_rate": 4.685219357057474e-05, "loss": 0.4362, "step": 25900 }, { "epoch": 0.9336144448048438, "grad_norm": 0.1771409511566162, "learning_rate": 4.6850775885560255e-05, "loss": 0.4237, "step": 25905 }, { "epoch": 0.9337946444660684, "grad_norm": 0.14823564887046814, "learning_rate": 4.684935790283318e-05, "loss": 0.4253, "step": 25910 }, { "epoch": 0.933974844127293, "grad_norm": 0.17045463621616364, "learning_rate": 4.684793962241283e-05, "loss": 0.4249, "step": 25915 }, { "epoch": 0.9341550437885177, "grad_norm": 0.2057085633277893, "learning_rate": 4.684652104431852e-05, "loss": 0.4473, "step": 25920 }, { "epoch": 0.9343352434497423, "grad_norm": 0.1884680688381195, "learning_rate": 4.684510216856961e-05, "loss": 0.4832, "step": 25925 }, { "epoch": 0.9345154431109669, "grad_norm": 0.15199509263038635, "learning_rate": 4.684368299518541e-05, "loss": 0.4648, "step": 25930 }, { "epoch": 0.9346956427721916, "grad_norm": 0.2007426768541336, "learning_rate": 4.684226352418525e-05, "loss": 0.465, "step": 25935 }, { "epoch": 0.9348758424334163, "grad_norm": 0.19974127411842346, "learning_rate": 4.684084375558848e-05, "loss": 0.4523, "step": 25940 }, { "epoch": 0.9350560420946409, "grad_norm": 0.1712539941072464, "learning_rate": 4.6839423689414455e-05, "loss": 0.4529, "step": 25945 }, { "epoch": 0.9352362417558655, 
"grad_norm": 0.19021165370941162, "learning_rate": 4.68380033256825e-05, "loss": 0.409, "step": 25950 }, { "epoch": 0.9354164414170901, "grad_norm": 0.1501091569662094, "learning_rate": 4.6836582664411975e-05, "loss": 0.4259, "step": 25955 }, { "epoch": 0.9355966410783148, "grad_norm": 0.16499082744121552, "learning_rate": 4.683516170562224e-05, "loss": 0.4451, "step": 25960 }, { "epoch": 0.9357768407395394, "grad_norm": 0.12097856402397156, "learning_rate": 4.683374044933266e-05, "loss": 0.4216, "step": 25965 }, { "epoch": 0.935957040400764, "grad_norm": 0.17843008041381836, "learning_rate": 4.683231889556259e-05, "loss": 0.4188, "step": 25970 }, { "epoch": 0.9361372400619887, "grad_norm": 0.18024946749210358, "learning_rate": 4.68308970443314e-05, "loss": 0.4292, "step": 25975 }, { "epoch": 0.9363174397232134, "grad_norm": 0.18790417909622192, "learning_rate": 4.6829474895658464e-05, "loss": 0.4282, "step": 25980 }, { "epoch": 0.936497639384438, "grad_norm": 0.16022486984729767, "learning_rate": 4.682805244956316e-05, "loss": 0.4387, "step": 25985 }, { "epoch": 0.9366778390456626, "grad_norm": 0.15988902747631073, "learning_rate": 4.682662970606487e-05, "loss": 0.4369, "step": 25990 }, { "epoch": 0.9368580387068872, "grad_norm": 0.1928686499595642, "learning_rate": 4.682520666518297e-05, "loss": 0.4324, "step": 25995 }, { "epoch": 0.9370382383681118, "grad_norm": 0.18302272260189056, "learning_rate": 4.682378332693686e-05, "loss": 0.4272, "step": 26000 }, { "epoch": 0.9370382383681118, "eval_loss": 0.45559266209602356, "eval_runtime": 3.5347, "eval_samples_per_second": 28.291, "eval_steps_per_second": 7.073, "step": 26000 }, { "epoch": 0.9372184380293365, "grad_norm": 0.15167242288589478, "learning_rate": 4.682235969134593e-05, "loss": 0.4238, "step": 26005 }, { "epoch": 0.9373986376905611, "grad_norm": 0.16070915758609772, "learning_rate": 4.682093575842957e-05, "loss": 0.4412, "step": 26010 }, { "epoch": 0.9375788373517858, "grad_norm": 0.15172840654850006, 
"learning_rate": 4.681951152820718e-05, "loss": 0.4573, "step": 26015 }, { "epoch": 0.9377590370130104, "grad_norm": 0.1980069875717163, "learning_rate": 4.6818087000698175e-05, "loss": 0.4407, "step": 26020 }, { "epoch": 0.9379392366742351, "grad_norm": 0.17154526710510254, "learning_rate": 4.6816662175921965e-05, "loss": 0.4435, "step": 26025 }, { "epoch": 0.9381194363354597, "grad_norm": 0.21349559724330902, "learning_rate": 4.6815237053897946e-05, "loss": 0.4316, "step": 26030 }, { "epoch": 0.9382996359966843, "grad_norm": 0.2062828242778778, "learning_rate": 4.6813811634645554e-05, "loss": 0.4443, "step": 26035 }, { "epoch": 0.9384798356579089, "grad_norm": 0.16467832028865814, "learning_rate": 4.68123859181842e-05, "loss": 0.4363, "step": 26040 }, { "epoch": 0.9386600353191336, "grad_norm": 0.16752782464027405, "learning_rate": 4.68109599045333e-05, "loss": 0.4136, "step": 26045 }, { "epoch": 0.9388402349803583, "grad_norm": 0.2054414600133896, "learning_rate": 4.6809533593712305e-05, "loss": 0.4687, "step": 26050 }, { "epoch": 0.9390204346415829, "grad_norm": 0.20668792724609375, "learning_rate": 4.680810698574064e-05, "loss": 0.4421, "step": 26055 }, { "epoch": 0.9392006343028075, "grad_norm": 0.1917664259672165, "learning_rate": 4.680668008063773e-05, "loss": 0.4454, "step": 26060 }, { "epoch": 0.9393808339640322, "grad_norm": 0.13623978197574615, "learning_rate": 4.680525287842303e-05, "loss": 0.4358, "step": 26065 }, { "epoch": 0.9395610336252568, "grad_norm": 0.1284467577934265, "learning_rate": 4.6803825379115985e-05, "loss": 0.4677, "step": 26070 }, { "epoch": 0.9397412332864814, "grad_norm": 0.16711989045143127, "learning_rate": 4.680239758273604e-05, "loss": 0.4238, "step": 26075 }, { "epoch": 0.939921432947706, "grad_norm": 0.1910637617111206, "learning_rate": 4.6800969489302646e-05, "loss": 0.4641, "step": 26080 }, { "epoch": 0.9401016326089306, "grad_norm": 0.16136784851551056, "learning_rate": 4.6799541098835264e-05, "loss": 0.4006, "step": 
26085 }, { "epoch": 0.9402818322701554, "grad_norm": 0.1435597538948059, "learning_rate": 4.679811241135335e-05, "loss": 0.4049, "step": 26090 }, { "epoch": 0.94046203193138, "grad_norm": 0.16800041496753693, "learning_rate": 4.679668342687638e-05, "loss": 0.411, "step": 26095 }, { "epoch": 0.9406422315926046, "grad_norm": 0.19089345633983612, "learning_rate": 4.679525414542382e-05, "loss": 0.435, "step": 26100 }, { "epoch": 0.9408224312538292, "grad_norm": 0.22108210623264313, "learning_rate": 4.6793824567015135e-05, "loss": 0.438, "step": 26105 }, { "epoch": 0.9410026309150539, "grad_norm": 0.15207885205745697, "learning_rate": 4.679239469166982e-05, "loss": 0.4272, "step": 26110 }, { "epoch": 0.9411828305762785, "grad_norm": 0.19638319313526154, "learning_rate": 4.679096451940734e-05, "loss": 0.4027, "step": 26115 }, { "epoch": 0.9413630302375031, "grad_norm": 0.22127874195575714, "learning_rate": 4.678953405024718e-05, "loss": 0.4261, "step": 26120 }, { "epoch": 0.9415432298987277, "grad_norm": 0.17142963409423828, "learning_rate": 4.678810328420885e-05, "loss": 0.4106, "step": 26125 }, { "epoch": 0.9417234295599525, "grad_norm": 0.15688422322273254, "learning_rate": 4.678667222131183e-05, "loss": 0.4312, "step": 26130 }, { "epoch": 0.9419036292211771, "grad_norm": 0.1551523059606552, "learning_rate": 4.678524086157561e-05, "loss": 0.4178, "step": 26135 }, { "epoch": 0.9420838288824017, "grad_norm": 0.1784355640411377, "learning_rate": 4.678380920501971e-05, "loss": 0.4415, "step": 26140 }, { "epoch": 0.9422640285436263, "grad_norm": 0.14627361297607422, "learning_rate": 4.6782377251663624e-05, "loss": 0.4048, "step": 26145 }, { "epoch": 0.942444228204851, "grad_norm": 0.14835497736930847, "learning_rate": 4.678094500152686e-05, "loss": 0.4471, "step": 26150 }, { "epoch": 0.9426244278660756, "grad_norm": 0.15386229753494263, "learning_rate": 4.677951245462895e-05, "loss": 0.4297, "step": 26155 }, { "epoch": 0.9428046275273002, "grad_norm": 0.16428400576114655, 
"learning_rate": 4.677807961098939e-05, "loss": 0.4265, "step": 26160 }, { "epoch": 0.9429848271885248, "grad_norm": 0.18662281334400177, "learning_rate": 4.677664647062771e-05, "loss": 0.4396, "step": 26165 }, { "epoch": 0.9431650268497496, "grad_norm": 0.17339280247688293, "learning_rate": 4.6775213033563445e-05, "loss": 0.4346, "step": 26170 }, { "epoch": 0.9433452265109742, "grad_norm": 0.1801465004682541, "learning_rate": 4.677377929981611e-05, "loss": 0.4492, "step": 26175 }, { "epoch": 0.9435254261721988, "grad_norm": 0.14782361686229706, "learning_rate": 4.6772345269405255e-05, "loss": 0.4003, "step": 26180 }, { "epoch": 0.9437056258334234, "grad_norm": 0.20001257956027985, "learning_rate": 4.677091094235041e-05, "loss": 0.4631, "step": 26185 }, { "epoch": 0.9438858254946481, "grad_norm": 0.16880610585212708, "learning_rate": 4.6769476318671116e-05, "loss": 0.4457, "step": 26190 }, { "epoch": 0.9440660251558727, "grad_norm": 0.18512383103370667, "learning_rate": 4.676804139838692e-05, "loss": 0.4106, "step": 26195 }, { "epoch": 0.9442462248170973, "grad_norm": 0.19107241928577423, "learning_rate": 4.6766606181517375e-05, "loss": 0.417, "step": 26200 }, { "epoch": 0.944426424478322, "grad_norm": 0.18682074546813965, "learning_rate": 4.676517066808204e-05, "loss": 0.4276, "step": 26205 }, { "epoch": 0.9446066241395467, "grad_norm": 0.19449475407600403, "learning_rate": 4.676373485810046e-05, "loss": 0.4347, "step": 26210 }, { "epoch": 0.9447868238007713, "grad_norm": 0.16231991350650787, "learning_rate": 4.6762298751592215e-05, "loss": 0.4527, "step": 26215 }, { "epoch": 0.9449670234619959, "grad_norm": 0.19462081789970398, "learning_rate": 4.676086234857686e-05, "loss": 0.4579, "step": 26220 }, { "epoch": 0.9451472231232205, "grad_norm": 0.14375783503055573, "learning_rate": 4.675942564907396e-05, "loss": 0.4156, "step": 26225 }, { "epoch": 0.9453274227844451, "grad_norm": 0.19837602972984314, "learning_rate": 4.675798865310311e-05, "loss": 0.4444, "step": 
26230 }, { "epoch": 0.9455076224456698, "grad_norm": 0.2307501882314682, "learning_rate": 4.675655136068387e-05, "loss": 0.4416, "step": 26235 }, { "epoch": 0.9456878221068944, "grad_norm": 0.19053228199481964, "learning_rate": 4.675511377183583e-05, "loss": 0.4442, "step": 26240 }, { "epoch": 0.9458680217681191, "grad_norm": 0.1901927888393402, "learning_rate": 4.675367588657858e-05, "loss": 0.4622, "step": 26245 }, { "epoch": 0.9460482214293437, "grad_norm": 0.14448778331279755, "learning_rate": 4.675223770493171e-05, "loss": 0.4176, "step": 26250 }, { "epoch": 0.9462284210905684, "grad_norm": 0.1635839194059372, "learning_rate": 4.675079922691481e-05, "loss": 0.4253, "step": 26255 }, { "epoch": 0.946408620751793, "grad_norm": 0.18405769765377045, "learning_rate": 4.6749360452547485e-05, "loss": 0.4426, "step": 26260 }, { "epoch": 0.9465888204130176, "grad_norm": 0.15434348583221436, "learning_rate": 4.674792138184933e-05, "loss": 0.4832, "step": 26265 }, { "epoch": 0.9467690200742422, "grad_norm": 0.14686670899391174, "learning_rate": 4.674648201483995e-05, "loss": 0.4396, "step": 26270 }, { "epoch": 0.9469492197354669, "grad_norm": 0.16869889199733734, "learning_rate": 4.674504235153898e-05, "loss": 0.4205, "step": 26275 }, { "epoch": 0.9471294193966915, "grad_norm": 0.14260682463645935, "learning_rate": 4.6743602391966004e-05, "loss": 0.4266, "step": 26280 }, { "epoch": 0.9473096190579162, "grad_norm": 0.19560997188091278, "learning_rate": 4.674216213614066e-05, "loss": 0.4319, "step": 26285 }, { "epoch": 0.9474898187191408, "grad_norm": 0.19733212888240814, "learning_rate": 4.674072158408257e-05, "loss": 0.4331, "step": 26290 }, { "epoch": 0.9476700183803655, "grad_norm": 0.1900247186422348, "learning_rate": 4.6739280735811355e-05, "loss": 0.4258, "step": 26295 }, { "epoch": 0.9478502180415901, "grad_norm": 0.2038991004228592, "learning_rate": 4.6737839591346645e-05, "loss": 0.446, "step": 26300 }, { "epoch": 0.9480304177028147, "grad_norm": 
0.19306516647338867, "learning_rate": 4.6736398150708076e-05, "loss": 0.4093, "step": 26305 }, { "epoch": 0.9482106173640393, "grad_norm": 0.15221892297267914, "learning_rate": 4.67349564139153e-05, "loss": 0.403, "step": 26310 }, { "epoch": 0.948390817025264, "grad_norm": 0.14861221611499786, "learning_rate": 4.673351438098794e-05, "loss": 0.4573, "step": 26315 }, { "epoch": 0.9485710166864886, "grad_norm": 0.15452301502227783, "learning_rate": 4.673207205194566e-05, "loss": 0.4291, "step": 26320 }, { "epoch": 0.9487512163477133, "grad_norm": 0.1777305006980896, "learning_rate": 4.6730629426808114e-05, "loss": 0.4437, "step": 26325 }, { "epoch": 0.9489314160089379, "grad_norm": 0.17141470313072205, "learning_rate": 4.6729186505594943e-05, "loss": 0.4247, "step": 26330 }, { "epoch": 0.9491116156701626, "grad_norm": 0.21854913234710693, "learning_rate": 4.672774328832581e-05, "loss": 0.4053, "step": 26335 }, { "epoch": 0.9492918153313872, "grad_norm": 0.2022477239370346, "learning_rate": 4.6726299775020385e-05, "loss": 0.4414, "step": 26340 }, { "epoch": 0.9494720149926118, "grad_norm": 0.12977053225040436, "learning_rate": 4.672485596569833e-05, "loss": 0.4409, "step": 26345 }, { "epoch": 0.9496522146538364, "grad_norm": 0.15741246938705444, "learning_rate": 4.672341186037932e-05, "loss": 0.4438, "step": 26350 }, { "epoch": 0.949832414315061, "grad_norm": 0.1705687791109085, "learning_rate": 4.672196745908303e-05, "loss": 0.407, "step": 26355 }, { "epoch": 0.9500126139762857, "grad_norm": 0.17730578780174255, "learning_rate": 4.672052276182913e-05, "loss": 0.4412, "step": 26360 }, { "epoch": 0.9501928136375104, "grad_norm": 0.14052340388298035, "learning_rate": 4.671907776863732e-05, "loss": 0.408, "step": 26365 }, { "epoch": 0.950373013298735, "grad_norm": 0.15693028271198273, "learning_rate": 4.671763247952728e-05, "loss": 0.3939, "step": 26370 }, { "epoch": 0.9505532129599596, "grad_norm": 0.19423072040081024, "learning_rate": 4.67161868945187e-05, "loss": 
0.4108, "step": 26375 }, { "epoch": 0.9507334126211843, "grad_norm": 0.15230786800384521, "learning_rate": 4.671474101363128e-05, "loss": 0.4263, "step": 26380 }, { "epoch": 0.9509136122824089, "grad_norm": 0.19921550154685974, "learning_rate": 4.6713294836884716e-05, "loss": 0.4178, "step": 26385 }, { "epoch": 0.9510938119436335, "grad_norm": 0.16791215538978577, "learning_rate": 4.671184836429871e-05, "loss": 0.4573, "step": 26390 }, { "epoch": 0.9512740116048581, "grad_norm": 0.17958621680736542, "learning_rate": 4.6710401595892986e-05, "loss": 0.4352, "step": 26395 }, { "epoch": 0.9514542112660829, "grad_norm": 0.16500048339366913, "learning_rate": 4.670895453168724e-05, "loss": 0.4216, "step": 26400 }, { "epoch": 0.9516344109273075, "grad_norm": 0.1619076132774353, "learning_rate": 4.670750717170119e-05, "loss": 0.4489, "step": 26405 }, { "epoch": 0.9518146105885321, "grad_norm": 0.14866675436496735, "learning_rate": 4.6706059515954546e-05, "loss": 0.4267, "step": 26410 }, { "epoch": 0.9519948102497567, "grad_norm": 0.17370857298374176, "learning_rate": 4.670461156446706e-05, "loss": 0.4023, "step": 26415 }, { "epoch": 0.9521750099109814, "grad_norm": 0.23588383197784424, "learning_rate": 4.6703163317258436e-05, "loss": 0.4328, "step": 26420 }, { "epoch": 0.952355209572206, "grad_norm": 0.16146144270896912, "learning_rate": 4.670171477434841e-05, "loss": 0.402, "step": 26425 }, { "epoch": 0.9525354092334306, "grad_norm": 0.1492733508348465, "learning_rate": 4.670026593575673e-05, "loss": 0.4807, "step": 26430 }, { "epoch": 0.9527156088946552, "grad_norm": 0.15443341434001923, "learning_rate": 4.669881680150312e-05, "loss": 0.4296, "step": 26435 }, { "epoch": 0.95289580855588, "grad_norm": 0.14959372580051422, "learning_rate": 4.6697367371607334e-05, "loss": 0.4148, "step": 26440 }, { "epoch": 0.9530760082171046, "grad_norm": 0.18775780498981476, "learning_rate": 4.669591764608913e-05, "loss": 0.4036, "step": 26445 }, { "epoch": 0.9532562078783292, "grad_norm": 
0.20374846458435059, "learning_rate": 4.669446762496823e-05, "loss": 0.4418, "step": 26450 }, { "epoch": 0.9534364075395538, "grad_norm": 0.17194941639900208, "learning_rate": 4.669301730826442e-05, "loss": 0.4616, "step": 26455 }, { "epoch": 0.9536166072007785, "grad_norm": null, "learning_rate": 4.669185684209495e-05, "loss": 0.4184, "step": 26460 }, { "epoch": 0.9537968068620031, "grad_norm": 0.26384392380714417, "learning_rate": 4.669040599339167e-05, "loss": 0.4399, "step": 26465 }, { "epoch": 0.9539770065232277, "grad_norm": 0.16087359189987183, "learning_rate": 4.6688954849160817e-05, "loss": 0.4367, "step": 26470 }, { "epoch": 0.9541572061844523, "grad_norm": 0.16282179951667786, "learning_rate": 4.668750340942215e-05, "loss": 0.4427, "step": 26475 }, { "epoch": 0.9543374058456771, "grad_norm": 0.14763139188289642, "learning_rate": 4.6686051674195454e-05, "loss": 0.4148, "step": 26480 }, { "epoch": 0.9545176055069017, "grad_norm": 0.17169588804244995, "learning_rate": 4.66845996435005e-05, "loss": 0.3915, "step": 26485 }, { "epoch": 0.9546978051681263, "grad_norm": 0.1894696205854416, "learning_rate": 4.668314731735707e-05, "loss": 0.4249, "step": 26490 }, { "epoch": 0.9548780048293509, "grad_norm": 0.14295339584350586, "learning_rate": 4.668169469578496e-05, "loss": 0.4194, "step": 26495 }, { "epoch": 0.9550582044905755, "grad_norm": 0.1540336161851883, "learning_rate": 4.6680241778803955e-05, "loss": 0.4442, "step": 26500 }, { "epoch": 0.9550582044905755, "eval_loss": 0.45503300428390503, "eval_runtime": 3.5368, "eval_samples_per_second": 28.274, "eval_steps_per_second": 7.069, "step": 26500 }, { "epoch": 0.9552384041518002, "grad_norm": 0.12401163578033447, "learning_rate": 4.6678788566433854e-05, "loss": 0.4263, "step": 26505 }, { "epoch": 0.9554186038130248, "grad_norm": 0.1782900094985962, "learning_rate": 4.6677335058694454e-05, "loss": 0.4176, "step": 26510 }, { "epoch": 0.9555988034742494, "grad_norm": 0.18796321749687195, "learning_rate": 
4.667588125560556e-05, "loss": 0.4209, "step": 26515 }, { "epoch": 0.9557790031354741, "grad_norm": 0.17212247848510742, "learning_rate": 4.667442715718698e-05, "loss": 0.4083, "step": 26520 }, { "epoch": 0.9559592027966988, "grad_norm": 0.1399925798177719, "learning_rate": 4.667297276345853e-05, "loss": 0.4253, "step": 26525 }, { "epoch": 0.9561394024579234, "grad_norm": 0.16920849680900574, "learning_rate": 4.6671518074440025e-05, "loss": 0.4156, "step": 26530 }, { "epoch": 0.956319602119148, "grad_norm": 0.22345153987407684, "learning_rate": 4.6670063090151286e-05, "loss": 0.4336, "step": 26535 }, { "epoch": 0.9564998017803726, "grad_norm": 0.1682891845703125, "learning_rate": 4.666860781061212e-05, "loss": 0.446, "step": 26540 }, { "epoch": 0.9566800014415973, "grad_norm": 0.15830697119235992, "learning_rate": 4.666715223584237e-05, "loss": 0.4206, "step": 26545 }, { "epoch": 0.9568602011028219, "grad_norm": 0.16852952539920807, "learning_rate": 4.6665696365861874e-05, "loss": 0.4387, "step": 26550 }, { "epoch": 0.9570404007640466, "grad_norm": 0.13287803530693054, "learning_rate": 4.666424020069045e-05, "loss": 0.4461, "step": 26555 }, { "epoch": 0.9572206004252712, "grad_norm": 0.15766523778438568, "learning_rate": 4.666278374034795e-05, "loss": 0.422, "step": 26560 }, { "epoch": 0.9574008000864959, "grad_norm": 0.1761545091867447, "learning_rate": 4.6661326984854225e-05, "loss": 0.4452, "step": 26565 }, { "epoch": 0.9575809997477205, "grad_norm": 0.17396709322929382, "learning_rate": 4.6659869934229106e-05, "loss": 0.4474, "step": 26570 }, { "epoch": 0.9577611994089451, "grad_norm": 0.16844242811203003, "learning_rate": 4.665841258849245e-05, "loss": 0.4374, "step": 26575 }, { "epoch": 0.9579413990701697, "grad_norm": 0.1774928718805313, "learning_rate": 4.6656954947664125e-05, "loss": 0.404, "step": 26580 }, { "epoch": 0.9581215987313944, "grad_norm": 0.16202402114868164, "learning_rate": 4.665549701176397e-05, "loss": 0.4328, "step": 26585 }, { "epoch": 
0.958301798392619, "grad_norm": 0.2063705176115036, "learning_rate": 4.6654038780811866e-05, "loss": 0.458, "step": 26590 }, { "epoch": 0.9584819980538437, "grad_norm": 0.14139318466186523, "learning_rate": 4.665258025482767e-05, "loss": 0.3822, "step": 26595 }, { "epoch": 0.9586621977150683, "grad_norm": 0.1882011443376541, "learning_rate": 4.665112143383127e-05, "loss": 0.424, "step": 26600 }, { "epoch": 0.958842397376293, "grad_norm": 0.1827850192785263, "learning_rate": 4.664966231784253e-05, "loss": 0.4365, "step": 26605 }, { "epoch": 0.9590225970375176, "grad_norm": 0.20430363714694977, "learning_rate": 4.664820290688133e-05, "loss": 0.4152, "step": 26610 }, { "epoch": 0.9592027966987422, "grad_norm": 0.21347634494304657, "learning_rate": 4.664674320096756e-05, "loss": 0.4451, "step": 26615 }, { "epoch": 0.9593829963599668, "grad_norm": 0.17805293202400208, "learning_rate": 4.66452832001211e-05, "loss": 0.4693, "step": 26620 }, { "epoch": 0.9595631960211914, "grad_norm": 0.17523930966854095, "learning_rate": 4.664382290436185e-05, "loss": 0.4101, "step": 26625 }, { "epoch": 0.9597433956824161, "grad_norm": 0.17870113253593445, "learning_rate": 4.6642362313709706e-05, "loss": 0.4164, "step": 26630 }, { "epoch": 0.9599235953436408, "grad_norm": 0.20278626680374146, "learning_rate": 4.664090142818456e-05, "loss": 0.457, "step": 26635 }, { "epoch": 0.9601037950048654, "grad_norm": 0.2184595763683319, "learning_rate": 4.663944024780632e-05, "loss": 0.4481, "step": 26640 }, { "epoch": 0.96028399466609, "grad_norm": 0.174354687333107, "learning_rate": 4.66379787725949e-05, "loss": 0.443, "step": 26645 }, { "epoch": 0.9604641943273147, "grad_norm": 0.20273347198963165, "learning_rate": 4.663651700257021e-05, "loss": 0.4258, "step": 26650 }, { "epoch": 0.9606443939885393, "grad_norm": 0.1632155179977417, "learning_rate": 4.6635054937752166e-05, "loss": 0.4303, "step": 26655 }, { "epoch": 0.9608245936497639, "grad_norm": 0.16507910192012787, "learning_rate": 
4.6633592578160687e-05, "loss": 0.4055, "step": 26660 }, { "epoch": 0.9610047933109885, "grad_norm": 0.1875597983598709, "learning_rate": 4.6632129923815694e-05, "loss": 0.4617, "step": 26665 }, { "epoch": 0.9611849929722132, "grad_norm": 0.16827437281608582, "learning_rate": 4.663066697473711e-05, "loss": 0.4083, "step": 26670 }, { "epoch": 0.9613651926334379, "grad_norm": 0.1666988730430603, "learning_rate": 4.662920373094489e-05, "loss": 0.4235, "step": 26675 }, { "epoch": 0.9615453922946625, "grad_norm": 0.18340694904327393, "learning_rate": 4.662774019245896e-05, "loss": 0.4255, "step": 26680 }, { "epoch": 0.9617255919558871, "grad_norm": 0.18287798762321472, "learning_rate": 4.6626276359299245e-05, "loss": 0.3855, "step": 26685 }, { "epoch": 0.9619057916171118, "grad_norm": 0.1787051558494568, "learning_rate": 4.662481223148571e-05, "loss": 0.4594, "step": 26690 }, { "epoch": 0.9620859912783364, "grad_norm": 0.1771918535232544, "learning_rate": 4.662334780903829e-05, "loss": 0.415, "step": 26695 }, { "epoch": 0.962266190939561, "grad_norm": 0.15356236696243286, "learning_rate": 4.6621883091976945e-05, "loss": 0.4285, "step": 26700 }, { "epoch": 0.9624463906007856, "grad_norm": 0.1645512878894806, "learning_rate": 4.662041808032163e-05, "loss": 0.465, "step": 26705 }, { "epoch": 0.9626265902620104, "grad_norm": 0.20833955705165863, "learning_rate": 4.661895277409231e-05, "loss": 0.4383, "step": 26710 }, { "epoch": 0.962806789923235, "grad_norm": 0.15828882157802582, "learning_rate": 4.661748717330893e-05, "loss": 0.4052, "step": 26715 }, { "epoch": 0.9629869895844596, "grad_norm": 0.15166039764881134, "learning_rate": 4.6616021277991476e-05, "loss": 0.4495, "step": 26720 }, { "epoch": 0.9631671892456842, "grad_norm": 0.18399517238140106, "learning_rate": 4.6614555088159924e-05, "loss": 0.4607, "step": 26725 }, { "epoch": 0.9633473889069089, "grad_norm": 0.16991674900054932, "learning_rate": 4.661308860383424e-05, "loss": 0.4267, "step": 26730 }, { "epoch": 
0.9635275885681335, "grad_norm": 0.18746553361415863, "learning_rate": 4.661162182503441e-05, "loss": 0.456, "step": 26735 }, { "epoch": 0.9637077882293581, "grad_norm": 0.18917517364025116, "learning_rate": 4.661015475178041e-05, "loss": 0.4372, "step": 26740 }, { "epoch": 0.9638879878905827, "grad_norm": 0.16291776299476624, "learning_rate": 4.6608687384092244e-05, "loss": 0.4211, "step": 26745 }, { "epoch": 0.9640681875518075, "grad_norm": 0.17410635948181152, "learning_rate": 4.660721972198989e-05, "loss": 0.4218, "step": 26750 }, { "epoch": 0.9642483872130321, "grad_norm": 0.17787551879882812, "learning_rate": 4.6605751765493354e-05, "loss": 0.4457, "step": 26755 }, { "epoch": 0.9644285868742567, "grad_norm": 0.17140106856822968, "learning_rate": 4.660428351462263e-05, "loss": 0.4006, "step": 26760 }, { "epoch": 0.9646087865354813, "grad_norm": 0.17782960832118988, "learning_rate": 4.660281496939773e-05, "loss": 0.4501, "step": 26765 }, { "epoch": 0.9647889861967059, "grad_norm": 0.15600888431072235, "learning_rate": 4.6601346129838655e-05, "loss": 0.4476, "step": 26770 }, { "epoch": 0.9649691858579306, "grad_norm": 0.18162627518177032, "learning_rate": 4.6599876995965424e-05, "loss": 0.418, "step": 26775 }, { "epoch": 0.9651493855191552, "grad_norm": 0.20125152170658112, "learning_rate": 4.659840756779805e-05, "loss": 0.4183, "step": 26780 }, { "epoch": 0.9653295851803798, "grad_norm": 0.16319221258163452, "learning_rate": 4.6596937845356556e-05, "loss": 0.4437, "step": 26785 }, { "epoch": 0.9655097848416045, "grad_norm": 0.2335626482963562, "learning_rate": 4.659546782866096e-05, "loss": 0.4571, "step": 26790 }, { "epoch": 0.9656899845028292, "grad_norm": 0.17783339321613312, "learning_rate": 4.6593997517731305e-05, "loss": 0.4296, "step": 26795 }, { "epoch": 0.9658701841640538, "grad_norm": 0.16893276572227478, "learning_rate": 4.65925269125876e-05, "loss": 0.4179, "step": 26800 }, { "epoch": 0.9660503838252784, "grad_norm": 0.160037562251091, 
"learning_rate": 4.6591056013249914e-05, "loss": 0.4114, "step": 26805 }, { "epoch": 0.966230583486503, "grad_norm": 0.18261387944221497, "learning_rate": 4.6589584819738254e-05, "loss": 0.4149, "step": 26810 }, { "epoch": 0.9664107831477277, "grad_norm": 0.18064436316490173, "learning_rate": 4.658811333207269e-05, "loss": 0.4456, "step": 26815 }, { "epoch": 0.9665909828089523, "grad_norm": 0.16004979610443115, "learning_rate": 4.658664155027326e-05, "loss": 0.4362, "step": 26820 }, { "epoch": 0.9667711824701769, "grad_norm": 0.18018919229507446, "learning_rate": 4.658516947436001e-05, "loss": 0.4137, "step": 26825 }, { "epoch": 0.9669513821314016, "grad_norm": 0.1572667360305786, "learning_rate": 4.658369710435302e-05, "loss": 0.4265, "step": 26830 }, { "epoch": 0.9671315817926263, "grad_norm": 0.14630809426307678, "learning_rate": 4.6582224440272325e-05, "loss": 0.4567, "step": 26835 }, { "epoch": 0.9673117814538509, "grad_norm": 0.15697656571865082, "learning_rate": 4.6580751482138e-05, "loss": 0.4598, "step": 26840 }, { "epoch": 0.9674919811150755, "grad_norm": 0.1883528083562851, "learning_rate": 4.657927822997012e-05, "loss": 0.4144, "step": 26845 }, { "epoch": 0.9676721807763001, "grad_norm": 0.1584809124469757, "learning_rate": 4.657780468378875e-05, "loss": 0.421, "step": 26850 }, { "epoch": 0.9678523804375248, "grad_norm": 0.1841040551662445, "learning_rate": 4.657633084361397e-05, "loss": 0.4483, "step": 26855 }, { "epoch": 0.9680325800987494, "grad_norm": 0.1718447059392929, "learning_rate": 4.657485670946585e-05, "loss": 0.4267, "step": 26860 }, { "epoch": 0.968212779759974, "grad_norm": 0.16161082684993744, "learning_rate": 4.65733822813645e-05, "loss": 0.4503, "step": 26865 }, { "epoch": 0.9683929794211987, "grad_norm": 0.175176739692688, "learning_rate": 4.657190755932999e-05, "loss": 0.4539, "step": 26870 }, { "epoch": 0.9685731790824234, "grad_norm": 0.16012953221797943, "learning_rate": 4.65704325433824e-05, "loss": 0.4453, "step": 26875 }, { 
"epoch": 0.968753378743648, "grad_norm": 0.16824325919151306, "learning_rate": 4.6568957233541854e-05, "loss": 0.4654, "step": 26880 }, { "epoch": 0.9689335784048726, "grad_norm": 0.15294811129570007, "learning_rate": 4.6567481629828443e-05, "loss": 0.4517, "step": 26885 }, { "epoch": 0.9691137780660972, "grad_norm": 0.19897432625293732, "learning_rate": 4.6566005732262275e-05, "loss": 0.4646, "step": 26890 }, { "epoch": 0.9692939777273218, "grad_norm": 0.16633781790733337, "learning_rate": 4.6564529540863446e-05, "loss": 0.3853, "step": 26895 }, { "epoch": 0.9694741773885465, "grad_norm": 0.13114029169082642, "learning_rate": 4.656305305565208e-05, "loss": 0.4149, "step": 26900 }, { "epoch": 0.9696543770497712, "grad_norm": 0.17593474686145782, "learning_rate": 4.656157627664829e-05, "loss": 0.4126, "step": 26905 }, { "epoch": 0.9698345767109958, "grad_norm": 0.1551685929298401, "learning_rate": 4.6560099203872196e-05, "loss": 0.4258, "step": 26910 }, { "epoch": 0.9700147763722204, "grad_norm": 0.18880245089530945, "learning_rate": 4.655862183734392e-05, "loss": 0.4148, "step": 26915 }, { "epoch": 0.9701949760334451, "grad_norm": 0.1571785807609558, "learning_rate": 4.6557144177083604e-05, "loss": 0.4471, "step": 26920 }, { "epoch": 0.9703751756946697, "grad_norm": 0.18303348124027252, "learning_rate": 4.655566622311137e-05, "loss": 0.4398, "step": 26925 }, { "epoch": 0.9705553753558943, "grad_norm": 0.19541716575622559, "learning_rate": 4.6554187975447364e-05, "loss": 0.4099, "step": 26930 }, { "epoch": 0.9707355750171189, "grad_norm": 0.17658253014087677, "learning_rate": 4.655270943411171e-05, "loss": 0.4512, "step": 26935 }, { "epoch": 0.9709157746783436, "grad_norm": 0.18457411229610443, "learning_rate": 4.655123059912456e-05, "loss": 0.4632, "step": 26940 }, { "epoch": 0.9710959743395683, "grad_norm": 0.15944646298885345, "learning_rate": 4.654975147050607e-05, "loss": 0.4436, "step": 26945 }, { "epoch": 0.9712761740007929, "grad_norm": 0.17047551274299622, 
"learning_rate": 4.654827204827639e-05, "loss": 0.4476, "step": 26950 }, { "epoch": 0.9714563736620175, "grad_norm": 0.18987303972244263, "learning_rate": 4.654679233245568e-05, "loss": 0.4751, "step": 26955 }, { "epoch": 0.9716365733232422, "grad_norm": 0.18949106335639954, "learning_rate": 4.654531232306409e-05, "loss": 0.432, "step": 26960 }, { "epoch": 0.9718167729844668, "grad_norm": 0.15808527171611786, "learning_rate": 4.654383202012179e-05, "loss": 0.3919, "step": 26965 }, { "epoch": 0.9719969726456914, "grad_norm": 0.1749500036239624, "learning_rate": 4.654235142364895e-05, "loss": 0.4509, "step": 26970 }, { "epoch": 0.972177172306916, "grad_norm": 0.1732529252767563, "learning_rate": 4.654087053366575e-05, "loss": 0.4012, "step": 26975 }, { "epoch": 0.9723573719681406, "grad_norm": 0.17838901281356812, "learning_rate": 4.653938935019235e-05, "loss": 0.4284, "step": 26980 }, { "epoch": 0.9725375716293654, "grad_norm": 0.16471531987190247, "learning_rate": 4.653790787324894e-05, "loss": 0.4306, "step": 26985 }, { "epoch": 0.97271777129059, "grad_norm": 0.19036082923412323, "learning_rate": 4.6536426102855714e-05, "loss": 0.4657, "step": 26990 }, { "epoch": 0.9728979709518146, "grad_norm": 0.15139730274677277, "learning_rate": 4.6534944039032845e-05, "loss": 0.4456, "step": 26995 }, { "epoch": 0.9730781706130393, "grad_norm": 0.17167389392852783, "learning_rate": 4.6533461681800534e-05, "loss": 0.4286, "step": 27000 }, { "epoch": 0.9730781706130393, "eval_loss": 0.45476090908050537, "eval_runtime": 3.5374, "eval_samples_per_second": 28.269, "eval_steps_per_second": 7.067, "step": 27000 }, { "epoch": 0.9732583702742639, "grad_norm": 0.17848706245422363, "learning_rate": 4.6531979031178975e-05, "loss": 0.3916, "step": 27005 }, { "epoch": 0.9734385699354885, "grad_norm": 0.15405860543251038, "learning_rate": 4.6530496087188374e-05, "loss": 0.4158, "step": 27010 }, { "epoch": 0.9736187695967131, "grad_norm": 0.17654937505722046, "learning_rate": 
4.652901284984893e-05, "loss": 0.4398, "step": 27015 }, { "epoch": 0.9737989692579377, "grad_norm": 0.17176496982574463, "learning_rate": 4.652752931918085e-05, "loss": 0.4457, "step": 27020 }, { "epoch": 0.9739791689191625, "grad_norm": 0.1502595841884613, "learning_rate": 4.652604549520436e-05, "loss": 0.445, "step": 27025 }, { "epoch": 0.9741593685803871, "grad_norm": 0.14152762293815613, "learning_rate": 4.652456137793966e-05, "loss": 0.3946, "step": 27030 }, { "epoch": 0.9743395682416117, "grad_norm": 0.18833976984024048, "learning_rate": 4.6523076967406984e-05, "loss": 0.4336, "step": 27035 }, { "epoch": 0.9745197679028363, "grad_norm": 0.14718258380889893, "learning_rate": 4.652159226362655e-05, "loss": 0.4079, "step": 27040 }, { "epoch": 0.974699967564061, "grad_norm": 0.16922341287136078, "learning_rate": 4.652010726661858e-05, "loss": 0.4368, "step": 27045 }, { "epoch": 0.9748801672252856, "grad_norm": 0.18971046805381775, "learning_rate": 4.6518621976403335e-05, "loss": 0.4514, "step": 27050 }, { "epoch": 0.9750603668865102, "grad_norm": 0.15908949077129364, "learning_rate": 4.6517136393001015e-05, "loss": 0.4215, "step": 27055 }, { "epoch": 0.9752405665477349, "grad_norm": 0.15814034640789032, "learning_rate": 4.651565051643188e-05, "loss": 0.4227, "step": 27060 }, { "epoch": 0.9754207662089596, "grad_norm": 0.19080659747123718, "learning_rate": 4.651416434671617e-05, "loss": 0.4194, "step": 27065 }, { "epoch": 0.9756009658701842, "grad_norm": 0.17519068717956543, "learning_rate": 4.651267788387415e-05, "loss": 0.423, "step": 27070 }, { "epoch": 0.9757811655314088, "grad_norm": 0.18204259872436523, "learning_rate": 4.651119112792604e-05, "loss": 0.4161, "step": 27075 }, { "epoch": 0.9759613651926334, "grad_norm": 0.17226997017860413, "learning_rate": 4.6509704078892124e-05, "loss": 0.4583, "step": 27080 }, { "epoch": 0.9761415648538581, "grad_norm": 0.15808525681495667, "learning_rate": 4.650821673679265e-05, "loss": 0.4533, "step": 27085 }, { "epoch": 
0.9763217645150827, "grad_norm": 0.15862226486206055, "learning_rate": 4.6506729101647897e-05, "loss": 0.3829, "step": 27090 }, { "epoch": 0.9765019641763073, "grad_norm": 0.18071158230304718, "learning_rate": 4.650524117347812e-05, "loss": 0.4365, "step": 27095 }, { "epoch": 0.976682163837532, "grad_norm": 0.2250252366065979, "learning_rate": 4.650375295230359e-05, "loss": 0.453, "step": 27100 }, { "epoch": 0.9768623634987567, "grad_norm": 0.182828888297081, "learning_rate": 4.6502264438144596e-05, "loss": 0.4589, "step": 27105 }, { "epoch": 0.9770425631599813, "grad_norm": 0.18622487783432007, "learning_rate": 4.650077563102141e-05, "loss": 0.4099, "step": 27110 }, { "epoch": 0.9772227628212059, "grad_norm": 0.16842947900295258, "learning_rate": 4.6499286530954314e-05, "loss": 0.4102, "step": 27115 }, { "epoch": 0.9774029624824305, "grad_norm": 0.17270709574222565, "learning_rate": 4.649779713796361e-05, "loss": 0.4312, "step": 27120 }, { "epoch": 0.9775831621436551, "grad_norm": 0.1607503890991211, "learning_rate": 4.649630745206958e-05, "loss": 0.4738, "step": 27125 }, { "epoch": 0.9777633618048798, "grad_norm": 0.13477887213230133, "learning_rate": 4.649481747329252e-05, "loss": 0.4357, "step": 27130 }, { "epoch": 0.9779435614661044, "grad_norm": 0.16214247047901154, "learning_rate": 4.6493327201652725e-05, "loss": 0.3851, "step": 27135 }, { "epoch": 0.9781237611273291, "grad_norm": 0.19764074683189392, "learning_rate": 4.649183663717052e-05, "loss": 0.4212, "step": 27140 }, { "epoch": 0.9783039607885538, "grad_norm": 0.169803187251091, "learning_rate": 4.6490345779866197e-05, "loss": 0.4267, "step": 27145 }, { "epoch": 0.9784841604497784, "grad_norm": 0.16970309615135193, "learning_rate": 4.6488854629760074e-05, "loss": 0.4518, "step": 27150 }, { "epoch": 0.978664360111003, "grad_norm": 0.15592272579669952, "learning_rate": 4.648736318687247e-05, "loss": 0.4588, "step": 27155 }, { "epoch": 0.9788445597722276, "grad_norm": 0.17874795198440552, "learning_rate": 
4.64858714512237e-05, "loss": 0.4404, "step": 27160 }, { "epoch": 0.9790247594334522, "grad_norm": 0.15960319340229034, "learning_rate": 4.648437942283409e-05, "loss": 0.408, "step": 27165 }, { "epoch": 0.9792049590946769, "grad_norm": 0.17572569847106934, "learning_rate": 4.6482887101723974e-05, "loss": 0.4118, "step": 27170 }, { "epoch": 0.9793851587559015, "grad_norm": 0.20097064971923828, "learning_rate": 4.6481394487913673e-05, "loss": 0.4195, "step": 27175 }, { "epoch": 0.9795653584171262, "grad_norm": 0.18549709022045135, "learning_rate": 4.647990158142354e-05, "loss": 0.4298, "step": 27180 }, { "epoch": 0.9797455580783508, "grad_norm": 0.17674687504768372, "learning_rate": 4.6478408382273905e-05, "loss": 0.441, "step": 27185 }, { "epoch": 0.9799257577395755, "grad_norm": 0.14485035836696625, "learning_rate": 4.6476914890485114e-05, "loss": 0.4422, "step": 27190 }, { "epoch": 0.9801059574008001, "grad_norm": 0.1638946533203125, "learning_rate": 4.647542110607751e-05, "loss": 0.4341, "step": 27195 }, { "epoch": 0.9802861570620247, "grad_norm": 0.1771542876958847, "learning_rate": 4.6473927029071454e-05, "loss": 0.4298, "step": 27200 }, { "epoch": 0.9804663567232493, "grad_norm": 0.16256387531757355, "learning_rate": 4.6472432659487296e-05, "loss": 0.4271, "step": 27205 }, { "epoch": 0.980646556384474, "grad_norm": 0.15982799232006073, "learning_rate": 4.647093799734541e-05, "loss": 0.4554, "step": 27210 }, { "epoch": 0.9808267560456987, "grad_norm": 0.19146406650543213, "learning_rate": 4.646944304266615e-05, "loss": 0.4195, "step": 27215 }, { "epoch": 0.9810069557069233, "grad_norm": 0.17436739802360535, "learning_rate": 4.646794779546988e-05, "loss": 0.4558, "step": 27220 }, { "epoch": 0.9811871553681479, "grad_norm": 0.17052249610424042, "learning_rate": 4.6466452255776976e-05, "loss": 0.4448, "step": 27225 }, { "epoch": 0.9813673550293726, "grad_norm": 0.1632729321718216, "learning_rate": 4.646495642360782e-05, "loss": 0.4211, "step": 27230 }, { "epoch": 
0.9815475546905972, "grad_norm": 0.2103988230228424, "learning_rate": 4.6463460298982787e-05, "loss": 0.431, "step": 27235 }, { "epoch": 0.9817277543518218, "grad_norm": 0.19348739087581635, "learning_rate": 4.646196388192226e-05, "loss": 0.4097, "step": 27240 }, { "epoch": 0.9819079540130464, "grad_norm": 0.1999654471874237, "learning_rate": 4.646046717244663e-05, "loss": 0.435, "step": 27245 }, { "epoch": 0.982088153674271, "grad_norm": 0.16381704807281494, "learning_rate": 4.6458970170576296e-05, "loss": 0.3906, "step": 27250 }, { "epoch": 0.9822683533354958, "grad_norm": 0.15966589748859406, "learning_rate": 4.6457472876331644e-05, "loss": 0.4372, "step": 27255 }, { "epoch": 0.9824485529967204, "grad_norm": 0.17224369943141937, "learning_rate": 4.6455975289733077e-05, "loss": 0.4433, "step": 27260 }, { "epoch": 0.982628752657945, "grad_norm": 0.1269349455833435, "learning_rate": 4.6454477410801e-05, "loss": 0.3989, "step": 27265 }, { "epoch": 0.9828089523191696, "grad_norm": 0.1905910223722458, "learning_rate": 4.6452979239555825e-05, "loss": 0.4544, "step": 27270 }, { "epoch": 0.9829891519803943, "grad_norm": 0.17192591726779938, "learning_rate": 4.645148077601796e-05, "loss": 0.4112, "step": 27275 }, { "epoch": 0.9831693516416189, "grad_norm": 0.1754426509141922, "learning_rate": 4.644998202020783e-05, "loss": 0.471, "step": 27280 }, { "epoch": 0.9833495513028435, "grad_norm": 0.16824094951152802, "learning_rate": 4.644848297214584e-05, "loss": 0.4322, "step": 27285 }, { "epoch": 0.9835297509640681, "grad_norm": 0.15957225859165192, "learning_rate": 4.6446983631852424e-05, "loss": 0.4364, "step": 27290 }, { "epoch": 0.9837099506252929, "grad_norm": 0.16426332294940948, "learning_rate": 4.6445483999348006e-05, "loss": 0.4243, "step": 27295 }, { "epoch": 0.9838901502865175, "grad_norm": 0.1607593595981598, "learning_rate": 4.6443984074653026e-05, "loss": 0.4289, "step": 27300 }, { "epoch": 0.9840703499477421, "grad_norm": 0.15784458816051483, "learning_rate": 
4.644248385778791e-05, "loss": 0.396, "step": 27305 }, { "epoch": 0.9842505496089667, "grad_norm": 0.21137282252311707, "learning_rate": 4.6440983348773105e-05, "loss": 0.4444, "step": 27310 }, { "epoch": 0.9844307492701914, "grad_norm": 0.21864286065101624, "learning_rate": 4.6439482547629046e-05, "loss": 0.4633, "step": 27315 }, { "epoch": 0.984610948931416, "grad_norm": 0.1778162717819214, "learning_rate": 4.6437981454376194e-05, "loss": 0.3835, "step": 27320 }, { "epoch": 0.9847911485926406, "grad_norm": 0.1485341191291809, "learning_rate": 4.6436480069034995e-05, "loss": 0.4085, "step": 27325 }, { "epoch": 0.9849713482538652, "grad_norm": 0.14085252583026886, "learning_rate": 4.6434978391625905e-05, "loss": 0.4097, "step": 27330 }, { "epoch": 0.98515154791509, "grad_norm": 0.12692780792713165, "learning_rate": 4.643347642216939e-05, "loss": 0.4053, "step": 27335 }, { "epoch": 0.9853317475763146, "grad_norm": 0.18438071012496948, "learning_rate": 4.64319741606859e-05, "loss": 0.43, "step": 27340 }, { "epoch": 0.9855119472375392, "grad_norm": 0.21135936677455902, "learning_rate": 4.6430471607195917e-05, "loss": 0.4677, "step": 27345 }, { "epoch": 0.9856921468987638, "grad_norm": 0.16982705891132355, "learning_rate": 4.64289687617199e-05, "loss": 0.4531, "step": 27350 }, { "epoch": 0.9858723465599885, "grad_norm": 0.16164273023605347, "learning_rate": 4.642746562427834e-05, "loss": 0.4122, "step": 27355 }, { "epoch": 0.9860525462212131, "grad_norm": 0.20969519019126892, "learning_rate": 4.6425962194891705e-05, "loss": 0.4288, "step": 27360 }, { "epoch": 0.9862327458824377, "grad_norm": 0.14550209045410156, "learning_rate": 4.6424458473580486e-05, "loss": 0.4714, "step": 27365 }, { "epoch": 0.9864129455436623, "grad_norm": 0.18479301035404205, "learning_rate": 4.6422954460365165e-05, "loss": 0.4166, "step": 27370 }, { "epoch": 0.9865931452048871, "grad_norm": 0.15923672914505005, "learning_rate": 4.642145015526624e-05, "loss": 0.4384, "step": 27375 }, { "epoch": 
0.9867733448661117, "grad_norm": 0.1702156960964203, "learning_rate": 4.64199455583042e-05, "loss": 0.427, "step": 27380 }, { "epoch": 0.9869535445273363, "grad_norm": 0.19491025805473328, "learning_rate": 4.641844066949955e-05, "loss": 0.4257, "step": 27385 }, { "epoch": 0.9871337441885609, "grad_norm": 0.21191620826721191, "learning_rate": 4.6416935488872806e-05, "loss": 0.4846, "step": 27390 }, { "epoch": 0.9873139438497855, "grad_norm": 0.1783130019903183, "learning_rate": 4.6415430016444445e-05, "loss": 0.4066, "step": 27395 }, { "epoch": 0.9874941435110102, "grad_norm": 0.16882510483264923, "learning_rate": 4.6413924252235006e-05, "loss": 0.4118, "step": 27400 }, { "epoch": 0.9876743431722348, "grad_norm": 0.18911772966384888, "learning_rate": 4.641241819626499e-05, "loss": 0.4378, "step": 27405 }, { "epoch": 0.9878545428334595, "grad_norm": 0.15727469325065613, "learning_rate": 4.641091184855492e-05, "loss": 0.4031, "step": 27410 }, { "epoch": 0.9880347424946841, "grad_norm": 0.1802516132593155, "learning_rate": 4.640940520912532e-05, "loss": 0.4541, "step": 27415 }, { "epoch": 0.9882149421559088, "grad_norm": 0.18460440635681152, "learning_rate": 4.640789827799673e-05, "loss": 0.409, "step": 27420 }, { "epoch": 0.9883951418171334, "grad_norm": 0.18090972304344177, "learning_rate": 4.640639105518966e-05, "loss": 0.4232, "step": 27425 }, { "epoch": 0.988575341478358, "grad_norm": 0.15177130699157715, "learning_rate": 4.6404883540724665e-05, "loss": 0.3694, "step": 27430 }, { "epoch": 0.9887555411395826, "grad_norm": 0.19808749854564667, "learning_rate": 4.6403375734622265e-05, "loss": 0.416, "step": 27435 }, { "epoch": 0.9889357408008073, "grad_norm": 0.18107527494430542, "learning_rate": 4.640186763690302e-05, "loss": 0.4578, "step": 27440 }, { "epoch": 0.9891159404620319, "grad_norm": 0.18067267537117004, "learning_rate": 4.640035924758748e-05, "loss": 0.4572, "step": 27445 }, { "epoch": 0.9892961401232566, "grad_norm": 0.14522598683834076, "learning_rate": 
4.6398850566696176e-05, "loss": 0.4252, "step": 27450 }, { "epoch": 0.9894763397844812, "grad_norm": 0.1613176465034485, "learning_rate": 4.6397341594249675e-05, "loss": 0.3907, "step": 27455 }, { "epoch": 0.9896565394457059, "grad_norm": 0.1852157860994339, "learning_rate": 4.639583233026855e-05, "loss": 0.4324, "step": 27460 }, { "epoch": 0.9898367391069305, "grad_norm": 0.16514049470424652, "learning_rate": 4.639432277477335e-05, "loss": 0.4413, "step": 27465 }, { "epoch": 0.9900169387681551, "grad_norm": 0.14694753289222717, "learning_rate": 4.639281292778464e-05, "loss": 0.4236, "step": 27470 }, { "epoch": 0.9901971384293797, "grad_norm": 0.18619796633720398, "learning_rate": 4.639130278932299e-05, "loss": 0.4442, "step": 27475 }, { "epoch": 0.9903773380906044, "grad_norm": 0.15744751691818237, "learning_rate": 4.638979235940899e-05, "loss": 0.4074, "step": 27480 }, { "epoch": 0.990557537751829, "grad_norm": 0.17762751877307892, "learning_rate": 4.63882816380632e-05, "loss": 0.4212, "step": 27485 }, { "epoch": 0.9907377374130537, "grad_norm": 0.18236143887043, "learning_rate": 4.638677062530622e-05, "loss": 0.4089, "step": 27490 }, { "epoch": 0.9909179370742783, "grad_norm": 0.1682094931602478, "learning_rate": 4.638525932115863e-05, "loss": 0.4288, "step": 27495 }, { "epoch": 0.991098136735503, "grad_norm": 0.17078998684883118, "learning_rate": 4.6383747725641027e-05, "loss": 0.4328, "step": 27500 }, { "epoch": 0.991098136735503, "eval_loss": 0.45491889119148254, "eval_runtime": 3.529, "eval_samples_per_second": 28.337, "eval_steps_per_second": 7.084, "step": 27500 }, { "epoch": 0.9912783363967276, "grad_norm": 0.185177743434906, "learning_rate": 4.6382235838774e-05, "loss": 0.4294, "step": 27505 }, { "epoch": 0.9914585360579522, "grad_norm": 0.1422509402036667, "learning_rate": 4.6380723660578144e-05, "loss": 0.3911, "step": 27510 }, { "epoch": 0.9916387357191768, "grad_norm": 0.1539120376110077, "learning_rate": 4.6379211191074066e-05, "loss": 0.4519, 
"step": 27515 }, { "epoch": 0.9918189353804014, "grad_norm": 0.19037242233753204, "learning_rate": 4.637769843028238e-05, "loss": 0.4822, "step": 27520 }, { "epoch": 0.9919991350416261, "grad_norm": 0.2115742713212967, "learning_rate": 4.637618537822369e-05, "loss": 0.4498, "step": 27525 }, { "epoch": 0.9921793347028508, "grad_norm": 0.17919567227363586, "learning_rate": 4.6374672034918606e-05, "loss": 0.4453, "step": 27530 }, { "epoch": 0.9923595343640754, "grad_norm": 0.13095134496688843, "learning_rate": 4.6373158400387775e-05, "loss": 0.4289, "step": 27535 }, { "epoch": 0.9925397340253, "grad_norm": 0.18032823503017426, "learning_rate": 4.6371644474651773e-05, "loss": 0.4262, "step": 27540 }, { "epoch": 0.9927199336865247, "grad_norm": 0.14844289422035217, "learning_rate": 4.637013025773127e-05, "loss": 0.4254, "step": 27545 }, { "epoch": 0.9929001333477493, "grad_norm": 0.182834193110466, "learning_rate": 4.636861574964687e-05, "loss": 0.4526, "step": 27550 }, { "epoch": 0.9930803330089739, "grad_norm": 0.18387030065059662, "learning_rate": 4.6367100950419226e-05, "loss": 0.4377, "step": 27555 }, { "epoch": 0.9932605326701985, "grad_norm": 0.200217604637146, "learning_rate": 4.636558586006896e-05, "loss": 0.4289, "step": 27560 }, { "epoch": 0.9934407323314233, "grad_norm": 0.16323430836200714, "learning_rate": 4.636407047861673e-05, "loss": 0.41, "step": 27565 }, { "epoch": 0.9936209319926479, "grad_norm": 0.14790096879005432, "learning_rate": 4.6362554806083176e-05, "loss": 0.4418, "step": 27570 }, { "epoch": 0.9938011316538725, "grad_norm": 0.18364425003528595, "learning_rate": 4.6361038842488944e-05, "loss": 0.4132, "step": 27575 }, { "epoch": 0.9939813313150971, "grad_norm": 0.15414080023765564, "learning_rate": 4.63595225878547e-05, "loss": 0.4203, "step": 27580 }, { "epoch": 0.9941615309763218, "grad_norm": 0.17159409821033478, "learning_rate": 4.635800604220109e-05, "loss": 0.4208, "step": 27585 }, { "epoch": 0.9943417306375464, "grad_norm": 
0.13989026844501495, "learning_rate": 4.635648920554878e-05, "loss": 0.4302, "step": 27590 }, { "epoch": 0.994521930298771, "grad_norm": 0.19220641255378723, "learning_rate": 4.635497207791845e-05, "loss": 0.4385, "step": 27595 }, { "epoch": 0.9947021299599956, "grad_norm": 0.20542806386947632, "learning_rate": 4.635345465933075e-05, "loss": 0.4185, "step": 27600 }, { "epoch": 0.9948823296212204, "grad_norm": 0.15193575620651245, "learning_rate": 4.635193694980636e-05, "loss": 0.4152, "step": 27605 }, { "epoch": 0.995062529282445, "grad_norm": 0.174140065908432, "learning_rate": 4.635041894936598e-05, "loss": 0.4468, "step": 27610 }, { "epoch": 0.9952427289436696, "grad_norm": 0.18024513125419617, "learning_rate": 4.6348900658030263e-05, "loss": 0.4326, "step": 27615 }, { "epoch": 0.9954229286048942, "grad_norm": 0.18026669323444366, "learning_rate": 4.634738207581991e-05, "loss": 0.4329, "step": 27620 }, { "epoch": 0.9956031282661189, "grad_norm": 0.15884456038475037, "learning_rate": 4.634586320275561e-05, "loss": 0.4202, "step": 27625 }, { "epoch": 0.9957833279273435, "grad_norm": 0.1757356971502304, "learning_rate": 4.634434403885805e-05, "loss": 0.4291, "step": 27630 }, { "epoch": 0.9959635275885681, "grad_norm": 0.16361142694950104, "learning_rate": 4.634282458414795e-05, "loss": 0.4216, "step": 27635 }, { "epoch": 0.9961437272497927, "grad_norm": 0.21342548727989197, "learning_rate": 4.634130483864598e-05, "loss": 0.4669, "step": 27640 }, { "epoch": 0.9963239269110175, "grad_norm": 0.2188446968793869, "learning_rate": 4.6339784802372874e-05, "loss": 0.4772, "step": 27645 }, { "epoch": 0.9965041265722421, "grad_norm": 0.1779574155807495, "learning_rate": 4.633826447534934e-05, "loss": 0.4633, "step": 27650 }, { "epoch": 0.9966843262334667, "grad_norm": 0.19599558413028717, "learning_rate": 4.633674385759607e-05, "loss": 0.4405, "step": 27655 }, { "epoch": 0.9968645258946913, "grad_norm": 0.15029726922512054, "learning_rate": 4.6335222949133794e-05, "loss": 
0.4461, "step": 27660 }, { "epoch": 0.997044725555916, "grad_norm": 0.17829090356826782, "learning_rate": 4.633370174998324e-05, "loss": 0.4567, "step": 27665 }, { "epoch": 0.9972249252171406, "grad_norm": 0.1724914163351059, "learning_rate": 4.6332180260165135e-05, "loss": 0.4121, "step": 27670 }, { "epoch": 0.9974051248783652, "grad_norm": 0.18331369757652283, "learning_rate": 4.63306584797002e-05, "loss": 0.4185, "step": 27675 }, { "epoch": 0.9975853245395898, "grad_norm": 0.2127685546875, "learning_rate": 4.632913640860918e-05, "loss": 0.465, "step": 27680 }, { "epoch": 0.9977655242008145, "grad_norm": 0.18392300605773926, "learning_rate": 4.6327614046912796e-05, "loss": 0.4276, "step": 27685 }, { "epoch": 0.9979457238620392, "grad_norm": 0.18069952726364136, "learning_rate": 4.63260913946318e-05, "loss": 0.4089, "step": 27690 }, { "epoch": 0.9981259235232638, "grad_norm": 0.17359404265880585, "learning_rate": 4.632456845178694e-05, "loss": 0.4294, "step": 27695 }, { "epoch": 0.9983061231844884, "grad_norm": 0.1843539923429489, "learning_rate": 4.632304521839896e-05, "loss": 0.4795, "step": 27700 }, { "epoch": 0.998486322845713, "grad_norm": 0.17044410109519958, "learning_rate": 4.6321521694488627e-05, "loss": 0.4345, "step": 27705 }, { "epoch": 0.9986665225069377, "grad_norm": 0.24153609573841095, "learning_rate": 4.631999788007668e-05, "loss": 0.437, "step": 27710 }, { "epoch": 0.9988467221681623, "grad_norm": 0.16202819347381592, "learning_rate": 4.631847377518389e-05, "loss": 0.4272, "step": 27715 }, { "epoch": 0.999026921829387, "grad_norm": 0.16928540170192719, "learning_rate": 4.6316949379831025e-05, "loss": 0.4584, "step": 27720 }, { "epoch": 0.9992071214906116, "grad_norm": 0.20613226294517517, "learning_rate": 4.6315424694038854e-05, "loss": 0.4415, "step": 27725 }, { "epoch": 0.9993873211518363, "grad_norm": 0.17505665123462677, "learning_rate": 4.631389971782815e-05, "loss": 0.4521, "step": 27730 }, { "epoch": 0.9995675208130609, "grad_norm": 
0.18743610382080078, "learning_rate": 4.631237445121968e-05, "loss": 0.4555, "step": 27735 }, { "epoch": 0.9997477204742855, "grad_norm": 0.17451363801956177, "learning_rate": 4.631084889423424e-05, "loss": 0.4042, "step": 27740 }, { "epoch": 0.9999279201355101, "grad_norm": 0.17138254642486572, "learning_rate": 4.630932304689261e-05, "loss": 0.411, "step": 27745 }, { "epoch": 1.0001081197967348, "grad_norm": 0.18573379516601562, "learning_rate": 4.6307796909215574e-05, "loss": 0.4382, "step": 27750 }, { "epoch": 1.0002883194579595, "grad_norm": 0.18925435841083527, "learning_rate": 4.630627048122393e-05, "loss": 0.4086, "step": 27755 }, { "epoch": 1.000468519119184, "grad_norm": 0.15313346683979034, "learning_rate": 4.630474376293849e-05, "loss": 0.435, "step": 27760 }, { "epoch": 1.0006487187804087, "grad_norm": 0.16638581454753876, "learning_rate": 4.630321675438002e-05, "loss": 0.3956, "step": 27765 }, { "epoch": 1.0008289184416332, "grad_norm": 0.23420801758766174, "learning_rate": 4.630168945556937e-05, "loss": 0.4247, "step": 27770 }, { "epoch": 1.001009118102858, "grad_norm": 0.19458545744419098, "learning_rate": 4.63001618665273e-05, "loss": 0.4022, "step": 27775 }, { "epoch": 1.0011893177640827, "grad_norm": 0.15574969351291656, "learning_rate": 4.629863398727466e-05, "loss": 0.4313, "step": 27780 }, { "epoch": 1.0013695174253072, "grad_norm": 0.14644834399223328, "learning_rate": 4.629710581783226e-05, "loss": 0.4278, "step": 27785 }, { "epoch": 1.001549717086532, "grad_norm": 0.17037297785282135, "learning_rate": 4.6295577358220914e-05, "loss": 0.4216, "step": 27790 }, { "epoch": 1.0017299167477565, "grad_norm": 0.18147750198841095, "learning_rate": 4.629404860846145e-05, "loss": 0.4044, "step": 27795 }, { "epoch": 1.0019101164089812, "grad_norm": 0.17213241755962372, "learning_rate": 4.629251956857469e-05, "loss": 0.4361, "step": 27800 }, { "epoch": 1.0020903160702057, "grad_norm": 0.16381776332855225, "learning_rate": 4.629099023858148e-05, "loss": 
0.4295, "step": 27805 }, { "epoch": 1.0022705157314304, "grad_norm": 0.19637686014175415, "learning_rate": 4.628946061850265e-05, "loss": 0.4377, "step": 27810 }, { "epoch": 1.002450715392655, "grad_norm": 0.19968008995056152, "learning_rate": 4.628793070835904e-05, "loss": 0.406, "step": 27815 }, { "epoch": 1.0026309150538797, "grad_norm": 0.19794724881649017, "learning_rate": 4.628640050817149e-05, "loss": 0.4257, "step": 27820 }, { "epoch": 1.0028111147151044, "grad_norm": 0.14951010048389435, "learning_rate": 4.628487001796086e-05, "loss": 0.4467, "step": 27825 }, { "epoch": 1.002991314376329, "grad_norm": 0.169041708111763, "learning_rate": 4.628333923774799e-05, "loss": 0.4444, "step": 27830 }, { "epoch": 1.0031715140375537, "grad_norm": 0.1682281494140625, "learning_rate": 4.628180816755375e-05, "loss": 0.4206, "step": 27835 }, { "epoch": 1.0033517136987782, "grad_norm": 0.18344461917877197, "learning_rate": 4.6280276807398994e-05, "loss": 0.4229, "step": 27840 }, { "epoch": 1.003531913360003, "grad_norm": 0.14406928420066833, "learning_rate": 4.627874515730459e-05, "loss": 0.4192, "step": 27845 }, { "epoch": 1.0037121130212274, "grad_norm": 0.1358763873577118, "learning_rate": 4.6277213217291395e-05, "loss": 0.4126, "step": 27850 }, { "epoch": 1.0038923126824522, "grad_norm": 0.15834762156009674, "learning_rate": 4.6275680987380296e-05, "loss": 0.4679, "step": 27855 }, { "epoch": 1.004072512343677, "grad_norm": 0.1732066124677658, "learning_rate": 4.627414846759216e-05, "loss": 0.3903, "step": 27860 }, { "epoch": 1.0042527120049014, "grad_norm": 0.17274266481399536, "learning_rate": 4.627261565794787e-05, "loss": 0.4179, "step": 27865 }, { "epoch": 1.0044329116661261, "grad_norm": 0.1775139421224594, "learning_rate": 4.6271082558468306e-05, "loss": 0.4045, "step": 27870 }, { "epoch": 1.0046131113273506, "grad_norm": 0.17623652517795563, "learning_rate": 4.626954916917436e-05, "loss": 0.3981, "step": 27875 }, { "epoch": 1.0047933109885754, "grad_norm": 
0.18118661642074585, "learning_rate": 4.626801549008693e-05, "loss": 0.432, "step": 27880 }, { "epoch": 1.0049735106498, "grad_norm": 0.1708979606628418, "learning_rate": 4.6266481521226904e-05, "loss": 0.4477, "step": 27885 }, { "epoch": 1.0051537103110246, "grad_norm": 0.17798757553100586, "learning_rate": 4.6264947262615186e-05, "loss": 0.4128, "step": 27890 }, { "epoch": 1.0053339099722494, "grad_norm": 0.16669988632202148, "learning_rate": 4.626341271427268e-05, "loss": 0.4202, "step": 27895 }, { "epoch": 1.0055141096334739, "grad_norm": 0.1673901528120041, "learning_rate": 4.626187787622029e-05, "loss": 0.4245, "step": 27900 }, { "epoch": 1.0056943092946986, "grad_norm": 0.1705399453639984, "learning_rate": 4.626034274847893e-05, "loss": 0.4076, "step": 27905 }, { "epoch": 1.0058745089559231, "grad_norm": 0.17363837361335754, "learning_rate": 4.625880733106951e-05, "loss": 0.4292, "step": 27910 }, { "epoch": 1.0060547086171479, "grad_norm": 0.15469084680080414, "learning_rate": 4.625727162401296e-05, "loss": 0.3902, "step": 27915 }, { "epoch": 1.0062349082783724, "grad_norm": 0.1932782232761383, "learning_rate": 4.625573562733021e-05, "loss": 0.4187, "step": 27920 }, { "epoch": 1.006415107939597, "grad_norm": 0.2232416868209839, "learning_rate": 4.625419934104217e-05, "loss": 0.459, "step": 27925 }, { "epoch": 1.0065953076008216, "grad_norm": 0.1562723070383072, "learning_rate": 4.625266276516978e-05, "loss": 0.42, "step": 27930 }, { "epoch": 1.0067755072620463, "grad_norm": 0.16613322496414185, "learning_rate": 4.625112589973397e-05, "loss": 0.4205, "step": 27935 }, { "epoch": 1.006955706923271, "grad_norm": 0.15974131226539612, "learning_rate": 4.624958874475569e-05, "loss": 0.4098, "step": 27940 }, { "epoch": 1.0071359065844956, "grad_norm": 0.12717698514461517, "learning_rate": 4.624805130025588e-05, "loss": 0.4076, "step": 27945 }, { "epoch": 1.0073161062457203, "grad_norm": 0.20461279153823853, "learning_rate": 4.624651356625548e-05, "loss": 0.438, 
"step": 27950 }, { "epoch": 1.0074963059069448, "grad_norm": 0.16563838720321655, "learning_rate": 4.624497554277544e-05, "loss": 0.3795, "step": 27955 }, { "epoch": 1.0076765055681696, "grad_norm": 0.1689777672290802, "learning_rate": 4.6243437229836725e-05, "loss": 0.425, "step": 27960 }, { "epoch": 1.007856705229394, "grad_norm": 0.1565127670764923, "learning_rate": 4.624189862746029e-05, "loss": 0.3809, "step": 27965 }, { "epoch": 1.0080369048906188, "grad_norm": 0.17497403919696808, "learning_rate": 4.62403597356671e-05, "loss": 0.4502, "step": 27970 }, { "epoch": 1.0082171045518435, "grad_norm": 0.1647554188966751, "learning_rate": 4.623882055447813e-05, "loss": 0.4453, "step": 27975 }, { "epoch": 1.008397304213068, "grad_norm": 0.18609805405139923, "learning_rate": 4.623728108391433e-05, "loss": 0.4263, "step": 27980 }, { "epoch": 1.0085775038742928, "grad_norm": 0.15984831750392914, "learning_rate": 4.6235741323996696e-05, "loss": 0.4564, "step": 27985 }, { "epoch": 1.0087577035355173, "grad_norm": 0.1694299876689911, "learning_rate": 4.623420127474619e-05, "loss": 0.4075, "step": 27990 }, { "epoch": 1.008937903196742, "grad_norm": 0.18183918297290802, "learning_rate": 4.62326609361838e-05, "loss": 0.3977, "step": 27995 }, { "epoch": 1.0091181028579665, "grad_norm": 0.18939772248268127, "learning_rate": 4.623112030833052e-05, "loss": 0.4101, "step": 28000 }, { "epoch": 1.0091181028579665, "eval_loss": 0.4545483887195587, "eval_runtime": 3.5395, "eval_samples_per_second": 28.252, "eval_steps_per_second": 7.063, "step": 28000 }, { "epoch": 1.0092983025191913, "grad_norm": 0.15189491212368011, "learning_rate": 4.622957939120734e-05, "loss": 0.416, "step": 28005 }, { "epoch": 1.0094785021804158, "grad_norm": 0.1771238148212433, "learning_rate": 4.6228038184835256e-05, "loss": 0.4021, "step": 28010 }, { "epoch": 1.0096587018416405, "grad_norm": 0.18271222710609436, "learning_rate": 4.622649668923525e-05, "loss": 0.4491, "step": 28015 }, { "epoch": 
1.0098389015028653, "grad_norm": 0.19635730981826782, "learning_rate": 4.622495490442834e-05, "loss": 0.408, "step": 28020 }, { "epoch": 1.0100191011640898, "grad_norm": 0.1971067190170288, "learning_rate": 4.622341283043553e-05, "loss": 0.4363, "step": 28025 }, { "epoch": 1.0101993008253145, "grad_norm": 0.1596841812133789, "learning_rate": 4.622187046727783e-05, "loss": 0.4135, "step": 28030 }, { "epoch": 1.010379500486539, "grad_norm": 0.20057372748851776, "learning_rate": 4.622032781497625e-05, "loss": 0.4142, "step": 28035 }, { "epoch": 1.0105597001477638, "grad_norm": 0.21350523829460144, "learning_rate": 4.6218784873551816e-05, "loss": 0.4545, "step": 28040 }, { "epoch": 1.0107398998089883, "grad_norm": 0.21572305262088776, "learning_rate": 4.621724164302554e-05, "loss": 0.4428, "step": 28045 }, { "epoch": 1.010920099470213, "grad_norm": 0.15721938014030457, "learning_rate": 4.621569812341846e-05, "loss": 0.4368, "step": 28050 }, { "epoch": 1.0111002991314377, "grad_norm": 0.18772733211517334, "learning_rate": 4.62141543147516e-05, "loss": 0.4255, "step": 28055 }, { "epoch": 1.0112804987926622, "grad_norm": 0.15941715240478516, "learning_rate": 4.6212610217045985e-05, "loss": 0.4191, "step": 28060 }, { "epoch": 1.011460698453887, "grad_norm": 0.191365584731102, "learning_rate": 4.6211065830322674e-05, "loss": 0.4096, "step": 28065 }, { "epoch": 1.0116408981151115, "grad_norm": 0.15204831957817078, "learning_rate": 4.62095211546027e-05, "loss": 0.4178, "step": 28070 }, { "epoch": 1.0118210977763362, "grad_norm": 0.1380978375673294, "learning_rate": 4.62079761899071e-05, "loss": 0.3991, "step": 28075 }, { "epoch": 1.0120012974375607, "grad_norm": 0.1999700665473938, "learning_rate": 4.6206430936256925e-05, "loss": 0.4427, "step": 28080 }, { "epoch": 1.0121814970987855, "grad_norm": 0.16516053676605225, "learning_rate": 4.620488539367325e-05, "loss": 0.4186, "step": 28085 }, { "epoch": 1.0123616967600102, "grad_norm": 0.1769554764032364, "learning_rate": 
4.62033395621771e-05, "loss": 0.3821, "step": 28090 }, { "epoch": 1.0125418964212347, "grad_norm": 0.15776276588439941, "learning_rate": 4.6201793441789566e-05, "loss": 0.4336, "step": 28095 }, { "epoch": 1.0127220960824594, "grad_norm": 0.16874942183494568, "learning_rate": 4.620024703253169e-05, "loss": 0.4315, "step": 28100 }, { "epoch": 1.012902295743684, "grad_norm": 0.18879170715808868, "learning_rate": 4.619870033442456e-05, "loss": 0.4295, "step": 28105 }, { "epoch": 1.0130824954049087, "grad_norm": 0.17838184535503387, "learning_rate": 4.619715334748924e-05, "loss": 0.3818, "step": 28110 }, { "epoch": 1.0132626950661332, "grad_norm": 0.1531950831413269, "learning_rate": 4.619560607174681e-05, "loss": 0.4307, "step": 28115 }, { "epoch": 1.013442894727358, "grad_norm": 0.15358266234397888, "learning_rate": 4.619405850721835e-05, "loss": 0.3952, "step": 28120 }, { "epoch": 1.0136230943885824, "grad_norm": 0.16065557301044464, "learning_rate": 4.6192510653924956e-05, "loss": 0.4233, "step": 28125 }, { "epoch": 1.0138032940498072, "grad_norm": 0.2141285538673401, "learning_rate": 4.6190962511887694e-05, "loss": 0.3981, "step": 28130 }, { "epoch": 1.013983493711032, "grad_norm": 0.23794370889663696, "learning_rate": 4.618941408112768e-05, "loss": 0.3997, "step": 28135 }, { "epoch": 1.0141636933722564, "grad_norm": 0.17404253780841827, "learning_rate": 4.6187865361665995e-05, "loss": 0.4309, "step": 28140 }, { "epoch": 1.0143438930334812, "grad_norm": 0.1899445503950119, "learning_rate": 4.618631635352375e-05, "loss": 0.4306, "step": 28145 }, { "epoch": 1.0145240926947057, "grad_norm": 0.14172546565532684, "learning_rate": 4.6184767056722044e-05, "loss": 0.4384, "step": 28150 }, { "epoch": 1.0147042923559304, "grad_norm": 0.18066340684890747, "learning_rate": 4.618321747128199e-05, "loss": 0.4248, "step": 28155 }, { "epoch": 1.014884492017155, "grad_norm": 0.18051232397556305, "learning_rate": 4.61816675972247e-05, "loss": 0.4058, "step": 28160 }, { "epoch": 
1.0150646916783796, "grad_norm": 0.21809528768062592, "learning_rate": 4.6180117434571285e-05, "loss": 0.3864, "step": 28165 }, { "epoch": 1.0152448913396044, "grad_norm": 0.1755502074956894, "learning_rate": 4.6178566983342875e-05, "loss": 0.411, "step": 28170 }, { "epoch": 1.015425091000829, "grad_norm": 0.2000172734260559, "learning_rate": 4.617701624356059e-05, "loss": 0.4297, "step": 28175 }, { "epoch": 1.0156052906620536, "grad_norm": 0.15490785241127014, "learning_rate": 4.6175465215245556e-05, "loss": 0.4408, "step": 28180 }, { "epoch": 1.0157854903232781, "grad_norm": 0.1819140464067459, "learning_rate": 4.617391389841891e-05, "loss": 0.3964, "step": 28185 }, { "epoch": 1.0159656899845029, "grad_norm": 0.1457093507051468, "learning_rate": 4.617236229310179e-05, "loss": 0.3747, "step": 28190 }, { "epoch": 1.0161458896457274, "grad_norm": 0.18323257565498352, "learning_rate": 4.617081039931532e-05, "loss": 0.4372, "step": 28195 }, { "epoch": 1.0163260893069521, "grad_norm": 0.16842052340507507, "learning_rate": 4.616925821708067e-05, "loss": 0.4285, "step": 28200 }, { "epoch": 1.0165062889681766, "grad_norm": 0.1632702648639679, "learning_rate": 4.6167705746418974e-05, "loss": 0.395, "step": 28205 }, { "epoch": 1.0166864886294014, "grad_norm": 0.15500997006893158, "learning_rate": 4.616615298735138e-05, "loss": 0.4169, "step": 28210 }, { "epoch": 1.016866688290626, "grad_norm": 0.18930770456790924, "learning_rate": 4.616459993989906e-05, "loss": 0.4304, "step": 28215 }, { "epoch": 1.0170468879518506, "grad_norm": 0.19972793757915497, "learning_rate": 4.616304660408315e-05, "loss": 0.4025, "step": 28220 }, { "epoch": 1.0172270876130753, "grad_norm": 0.1533392369747162, "learning_rate": 4.6161492979924834e-05, "loss": 0.4091, "step": 28225 }, { "epoch": 1.0174072872742999, "grad_norm": 0.17162777483463287, "learning_rate": 4.615993906744528e-05, "loss": 0.4054, "step": 28230 }, { "epoch": 1.0175874869355246, "grad_norm": 0.2027086317539215, "learning_rate": 
4.615838486666564e-05, "loss": 0.3837, "step": 28235 }, { "epoch": 1.017767686596749, "grad_norm": 0.1900579333305359, "learning_rate": 4.6156830377607105e-05, "loss": 0.4096, "step": 28240 }, { "epoch": 1.0179478862579738, "grad_norm": 0.15131549537181854, "learning_rate": 4.615527560029086e-05, "loss": 0.4474, "step": 28245 }, { "epoch": 1.0181280859191986, "grad_norm": 0.1787458062171936, "learning_rate": 4.615372053473808e-05, "loss": 0.3827, "step": 28250 }, { "epoch": 1.018308285580423, "grad_norm": 0.18287619948387146, "learning_rate": 4.6152165180969944e-05, "loss": 0.4215, "step": 28255 }, { "epoch": 1.0184884852416478, "grad_norm": 0.15783603489398956, "learning_rate": 4.6150609539007664e-05, "loss": 0.3793, "step": 28260 }, { "epoch": 1.0186686849028723, "grad_norm": 0.2039651870727539, "learning_rate": 4.614905360887241e-05, "loss": 0.4043, "step": 28265 }, { "epoch": 1.018848884564097, "grad_norm": 0.18400070071220398, "learning_rate": 4.61474973905854e-05, "loss": 0.4042, "step": 28270 }, { "epoch": 1.0190290842253216, "grad_norm": 0.19877344369888306, "learning_rate": 4.614594088416784e-05, "loss": 0.4057, "step": 28275 }, { "epoch": 1.0192092838865463, "grad_norm": 0.19915084540843964, "learning_rate": 4.614438408964092e-05, "loss": 0.4553, "step": 28280 }, { "epoch": 1.019389483547771, "grad_norm": 0.19474834203720093, "learning_rate": 4.614282700702587e-05, "loss": 0.4374, "step": 28285 }, { "epoch": 1.0195696832089955, "grad_norm": 0.1851280927658081, "learning_rate": 4.614126963634389e-05, "loss": 0.4065, "step": 28290 }, { "epoch": 1.0197498828702203, "grad_norm": 0.1995769590139389, "learning_rate": 4.6139711977616207e-05, "loss": 0.4195, "step": 28295 }, { "epoch": 1.0199300825314448, "grad_norm": 0.17774897813796997, "learning_rate": 4.6138154030864036e-05, "loss": 0.4494, "step": 28300 }, { "epoch": 1.0201102821926695, "grad_norm": 0.1672256588935852, "learning_rate": 4.613659579610861e-05, "loss": 0.413, "step": 28305 }, { "epoch": 
1.020290481853894, "grad_norm": 0.17119145393371582, "learning_rate": 4.613503727337116e-05, "loss": 0.3927, "step": 28310 }, { "epoch": 1.0204706815151188, "grad_norm": 0.16596178710460663, "learning_rate": 4.613347846267292e-05, "loss": 0.4003, "step": 28315 }, { "epoch": 1.0206508811763433, "grad_norm": 0.1888241469860077, "learning_rate": 4.6131919364035126e-05, "loss": 0.4401, "step": 28320 }, { "epoch": 1.020831080837568, "grad_norm": 0.2028852254152298, "learning_rate": 4.613035997747902e-05, "loss": 0.4529, "step": 28325 }, { "epoch": 1.0210112804987928, "grad_norm": 0.18602454662322998, "learning_rate": 4.612880030302585e-05, "loss": 0.4408, "step": 28330 }, { "epoch": 1.0211914801600173, "grad_norm": 0.2020941823720932, "learning_rate": 4.6127240340696876e-05, "loss": 0.4332, "step": 28335 }, { "epoch": 1.021371679821242, "grad_norm": 0.17948292195796967, "learning_rate": 4.6125680090513334e-05, "loss": 0.4235, "step": 28340 }, { "epoch": 1.0215518794824665, "grad_norm": 0.17429165542125702, "learning_rate": 4.612411955249649e-05, "loss": 0.4238, "step": 28345 }, { "epoch": 1.0217320791436912, "grad_norm": 0.18674267828464508, "learning_rate": 4.612255872666762e-05, "loss": 0.3849, "step": 28350 }, { "epoch": 1.0219122788049158, "grad_norm": 0.13337041437625885, "learning_rate": 4.612099761304796e-05, "loss": 0.4294, "step": 28355 }, { "epoch": 1.0220924784661405, "grad_norm": 0.15359793603420258, "learning_rate": 4.6119436211658805e-05, "loss": 0.4145, "step": 28360 }, { "epoch": 1.0222726781273652, "grad_norm": 0.17645424604415894, "learning_rate": 4.611787452252142e-05, "loss": 0.4163, "step": 28365 }, { "epoch": 1.0224528777885897, "grad_norm": 0.14870162308216095, "learning_rate": 4.6116312545657083e-05, "loss": 0.382, "step": 28370 }, { "epoch": 1.0226330774498145, "grad_norm": 0.18462905287742615, "learning_rate": 4.611475028108707e-05, "loss": 0.4238, "step": 28375 }, { "epoch": 1.022813277111039, "grad_norm": 0.1805817037820816, "learning_rate": 
4.611318772883268e-05, "loss": 0.4203, "step": 28380 }, { "epoch": 1.0229934767722637, "grad_norm": 0.18119722604751587, "learning_rate": 4.6111624888915196e-05, "loss": 0.4058, "step": 28385 }, { "epoch": 1.0231736764334882, "grad_norm": 0.14671674370765686, "learning_rate": 4.611006176135591e-05, "loss": 0.4227, "step": 28390 }, { "epoch": 1.023353876094713, "grad_norm": 0.13989455997943878, "learning_rate": 4.610849834617611e-05, "loss": 0.3985, "step": 28395 }, { "epoch": 1.0235340757559377, "grad_norm": 0.17615246772766113, "learning_rate": 4.610693464339711e-05, "loss": 0.3773, "step": 28400 }, { "epoch": 1.0237142754171622, "grad_norm": 0.17603693902492523, "learning_rate": 4.6105370653040216e-05, "loss": 0.3988, "step": 28405 }, { "epoch": 1.023894475078387, "grad_norm": 0.15467150509357452, "learning_rate": 4.6103806375126735e-05, "loss": 0.4082, "step": 28410 }, { "epoch": 1.0240746747396114, "grad_norm": 0.19032159447669983, "learning_rate": 4.6102241809677974e-05, "loss": 0.4251, "step": 28415 }, { "epoch": 1.0242548744008362, "grad_norm": 0.19951966404914856, "learning_rate": 4.610067695671525e-05, "loss": 0.4239, "step": 28420 }, { "epoch": 1.0244350740620607, "grad_norm": 0.1830962896347046, "learning_rate": 4.609911181625989e-05, "loss": 0.4016, "step": 28425 }, { "epoch": 1.0246152737232854, "grad_norm": 0.18551193177700043, "learning_rate": 4.609754638833322e-05, "loss": 0.3889, "step": 28430 }, { "epoch": 1.02479547338451, "grad_norm": 0.18302911520004272, "learning_rate": 4.609598067295656e-05, "loss": 0.4453, "step": 28435 }, { "epoch": 1.0249756730457347, "grad_norm": 0.15674737095832825, "learning_rate": 4.6094414670151253e-05, "loss": 0.4165, "step": 28440 }, { "epoch": 1.0251558727069594, "grad_norm": 0.18824201822280884, "learning_rate": 4.609284837993863e-05, "loss": 0.4216, "step": 28445 }, { "epoch": 1.025336072368184, "grad_norm": 0.13510337471961975, "learning_rate": 4.609128180234003e-05, "loss": 0.4233, "step": 28450 }, { "epoch": 
1.0255162720294086, "grad_norm": 0.17444869875907898, "learning_rate": 4.60897149373768e-05, "loss": 0.4555, "step": 28455 }, { "epoch": 1.0256964716906332, "grad_norm": 0.14340052008628845, "learning_rate": 4.6088147785070284e-05, "loss": 0.3925, "step": 28460 }, { "epoch": 1.025876671351858, "grad_norm": 0.15294180810451508, "learning_rate": 4.608658034544184e-05, "loss": 0.4104, "step": 28465 }, { "epoch": 1.0260568710130824, "grad_norm": 0.21026860177516937, "learning_rate": 4.608501261851282e-05, "loss": 0.4463, "step": 28470 }, { "epoch": 1.0262370706743071, "grad_norm": 0.18391993641853333, "learning_rate": 4.6083444604304584e-05, "loss": 0.4285, "step": 28475 }, { "epoch": 1.0264172703355319, "grad_norm": 0.16915670037269592, "learning_rate": 4.60818763028385e-05, "loss": 0.3984, "step": 28480 }, { "epoch": 1.0265974699967564, "grad_norm": 0.15066103637218475, "learning_rate": 4.608030771413593e-05, "loss": 0.4118, "step": 28485 }, { "epoch": 1.0267776696579811, "grad_norm": 0.1730443239212036, "learning_rate": 4.607873883821825e-05, "loss": 0.4169, "step": 28490 }, { "epoch": 1.0269578693192056, "grad_norm": 0.2241186797618866, "learning_rate": 4.6077169675106836e-05, "loss": 0.4354, "step": 28495 }, { "epoch": 1.0271380689804304, "grad_norm": 0.20758524537086487, "learning_rate": 4.6075600224823066e-05, "loss": 0.4531, "step": 28500 }, { "epoch": 1.0271380689804304, "eval_loss": 0.45461568236351013, "eval_runtime": 3.545, "eval_samples_per_second": 28.209, "eval_steps_per_second": 7.052, "step": 28500 }, { "epoch": 1.0273182686416549, "grad_norm": 0.143527090549469, "learning_rate": 4.607403048738832e-05, "loss": 0.3849, "step": 28505 }, { "epoch": 1.0274984683028796, "grad_norm": 0.17763735353946686, "learning_rate": 4.607246046282399e-05, "loss": 0.4174, "step": 28510 }, { "epoch": 1.0276786679641041, "grad_norm": 0.2017643004655838, "learning_rate": 4.607089015115147e-05, "loss": 0.4237, "step": 28515 }, { "epoch": 1.0278588676253289, "grad_norm": 
0.16140924394130707, "learning_rate": 4.6069319552392145e-05, "loss": 0.4097, "step": 28520 }, { "epoch": 1.0280390672865536, "grad_norm": 0.16550087928771973, "learning_rate": 4.6067748666567425e-05, "loss": 0.3789, "step": 28525 }, { "epoch": 1.028219266947778, "grad_norm": 0.1847558319568634, "learning_rate": 4.60661774936987e-05, "loss": 0.4227, "step": 28530 }, { "epoch": 1.0283994666090028, "grad_norm": 0.18655303120613098, "learning_rate": 4.606460603380739e-05, "loss": 0.4278, "step": 28535 }, { "epoch": 1.0285796662702273, "grad_norm": 0.17594356834888458, "learning_rate": 4.606303428691491e-05, "loss": 0.4417, "step": 28540 }, { "epoch": 1.028759865931452, "grad_norm": 0.18802598118782043, "learning_rate": 4.606146225304265e-05, "loss": 0.4576, "step": 28545 }, { "epoch": 1.0289400655926766, "grad_norm": 0.190033420920372, "learning_rate": 4.605988993221206e-05, "loss": 0.3783, "step": 28550 }, { "epoch": 1.0291202652539013, "grad_norm": 0.16919276118278503, "learning_rate": 4.605831732444453e-05, "loss": 0.4453, "step": 28555 }, { "epoch": 1.029300464915126, "grad_norm": 0.17263488471508026, "learning_rate": 4.605674442976152e-05, "loss": 0.3981, "step": 28560 }, { "epoch": 1.0294806645763506, "grad_norm": 0.18015895783901215, "learning_rate": 4.6055171248184434e-05, "loss": 0.3846, "step": 28565 }, { "epoch": 1.0296608642375753, "grad_norm": 0.18624891340732574, "learning_rate": 4.605359777973472e-05, "loss": 0.4022, "step": 28570 }, { "epoch": 1.0298410638987998, "grad_norm": 0.18329830467700958, "learning_rate": 4.6052024024433815e-05, "loss": 0.4198, "step": 28575 }, { "epoch": 1.0300212635600245, "grad_norm": 0.17667533457279205, "learning_rate": 4.605044998230315e-05, "loss": 0.4046, "step": 28580 }, { "epoch": 1.030201463221249, "grad_norm": 0.1662374883890152, "learning_rate": 4.604887565336419e-05, "loss": 0.4322, "step": 28585 }, { "epoch": 1.0303816628824738, "grad_norm": 0.20289361476898193, "learning_rate": 4.604730103763837e-05, "loss": 
0.4173, "step": 28590 }, { "epoch": 1.0305618625436985, "grad_norm": 0.17283061146736145, "learning_rate": 4.604572613514714e-05, "loss": 0.4174, "step": 28595 }, { "epoch": 1.030742062204923, "grad_norm": 0.21077826619148254, "learning_rate": 4.6044150945911974e-05, "loss": 0.4365, "step": 28600 }, { "epoch": 1.0309222618661478, "grad_norm": 0.19256427884101868, "learning_rate": 4.604257546995433e-05, "loss": 0.4491, "step": 28605 }, { "epoch": 1.0311024615273723, "grad_norm": 0.15783824026584625, "learning_rate": 4.6040999707295665e-05, "loss": 0.4072, "step": 28610 }, { "epoch": 1.031282661188597, "grad_norm": 0.15280930697917938, "learning_rate": 4.603942365795745e-05, "loss": 0.4217, "step": 28615 }, { "epoch": 1.0314628608498215, "grad_norm": 0.16231854259967804, "learning_rate": 4.603784732196116e-05, "loss": 0.3995, "step": 28620 }, { "epoch": 1.0316430605110463, "grad_norm": 0.1602470427751541, "learning_rate": 4.603627069932827e-05, "loss": 0.4217, "step": 28625 }, { "epoch": 1.0318232601722708, "grad_norm": 0.2087296098470688, "learning_rate": 4.603469379008028e-05, "loss": 0.4469, "step": 28630 }, { "epoch": 1.0320034598334955, "grad_norm": 0.18025125563144684, "learning_rate": 4.603311659423864e-05, "loss": 0.4198, "step": 28635 }, { "epoch": 1.0321836594947202, "grad_norm": 0.19022351503372192, "learning_rate": 4.603153911182487e-05, "loss": 0.4439, "step": 28640 }, { "epoch": 1.0323638591559448, "grad_norm": 0.17020079493522644, "learning_rate": 4.602996134286045e-05, "loss": 0.3931, "step": 28645 }, { "epoch": 1.0325440588171695, "grad_norm": 0.17506608366966248, "learning_rate": 4.602838328736688e-05, "loss": 0.4457, "step": 28650 }, { "epoch": 1.032724258478394, "grad_norm": 0.22321075201034546, "learning_rate": 4.6026804945365655e-05, "loss": 0.4347, "step": 28655 }, { "epoch": 1.0329044581396187, "grad_norm": 0.16402897238731384, "learning_rate": 4.602522631687829e-05, "loss": 0.4257, "step": 28660 }, { "epoch": 1.0330846578008432, "grad_norm": 
0.18853814899921417, "learning_rate": 4.602364740192628e-05, "loss": 0.4312, "step": 28665 }, { "epoch": 1.033264857462068, "grad_norm": 0.1805652529001236, "learning_rate": 4.602206820053114e-05, "loss": 0.4294, "step": 28670 }, { "epoch": 1.0334450571232927, "grad_norm": 0.15662455558776855, "learning_rate": 4.6020488712714404e-05, "loss": 0.4338, "step": 28675 }, { "epoch": 1.0336252567845172, "grad_norm": 0.157413512468338, "learning_rate": 4.6018908938497573e-05, "loss": 0.4187, "step": 28680 }, { "epoch": 1.033805456445742, "grad_norm": 0.1799081712961197, "learning_rate": 4.6017328877902176e-05, "loss": 0.4412, "step": 28685 }, { "epoch": 1.0339856561069665, "grad_norm": 0.14759212732315063, "learning_rate": 4.601574853094974e-05, "loss": 0.407, "step": 28690 }, { "epoch": 1.0341658557681912, "grad_norm": 0.1873723268508911, "learning_rate": 4.60141678976618e-05, "loss": 0.4277, "step": 28695 }, { "epoch": 1.0343460554294157, "grad_norm": 0.16125832498073578, "learning_rate": 4.6012586978059893e-05, "loss": 0.4602, "step": 28700 }, { "epoch": 1.0345262550906404, "grad_norm": 0.1734544336795807, "learning_rate": 4.601100577216556e-05, "loss": 0.4233, "step": 28705 }, { "epoch": 1.034706454751865, "grad_norm": 0.1755981296300888, "learning_rate": 4.600942428000033e-05, "loss": 0.4266, "step": 28710 }, { "epoch": 1.0348866544130897, "grad_norm": 0.16786262392997742, "learning_rate": 4.600784250158577e-05, "loss": 0.4204, "step": 28715 }, { "epoch": 1.0350668540743144, "grad_norm": 0.14706341922283173, "learning_rate": 4.600626043694343e-05, "loss": 0.4241, "step": 28720 }, { "epoch": 1.035247053735539, "grad_norm": 0.1602163165807724, "learning_rate": 4.600467808609485e-05, "loss": 0.4105, "step": 28725 }, { "epoch": 1.0354272533967637, "grad_norm": 0.1791827380657196, "learning_rate": 4.60030954490616e-05, "loss": 0.4406, "step": 28730 }, { "epoch": 1.0356074530579882, "grad_norm": 0.17273133993148804, "learning_rate": 4.600151252586524e-05, "loss": 0.4023, 
"step": 28735 }, { "epoch": 1.035787652719213, "grad_norm": 0.16618365049362183, "learning_rate": 4.5999929316527335e-05, "loss": 0.4238, "step": 28740 }, { "epoch": 1.0359678523804374, "grad_norm": 0.147443950176239, "learning_rate": 4.599834582106946e-05, "loss": 0.4491, "step": 28745 }, { "epoch": 1.0361480520416622, "grad_norm": 0.1795797199010849, "learning_rate": 4.599676203951319e-05, "loss": 0.4178, "step": 28750 }, { "epoch": 1.036328251702887, "grad_norm": 0.15541589260101318, "learning_rate": 4.59951779718801e-05, "loss": 0.4204, "step": 28755 }, { "epoch": 1.0365084513641114, "grad_norm": 0.17285758256912231, "learning_rate": 4.599359361819178e-05, "loss": 0.3788, "step": 28760 }, { "epoch": 1.0366886510253361, "grad_norm": 0.13503800332546234, "learning_rate": 4.5992008978469806e-05, "loss": 0.3833, "step": 28765 }, { "epoch": 1.0368688506865607, "grad_norm": 0.19433645904064178, "learning_rate": 4.599042405273578e-05, "loss": 0.4648, "step": 28770 }, { "epoch": 1.0370490503477854, "grad_norm": 0.18631264567375183, "learning_rate": 4.598883884101128e-05, "loss": 0.4023, "step": 28775 }, { "epoch": 1.03722925000901, "grad_norm": 0.1467059701681137, "learning_rate": 4.598725334331793e-05, "loss": 0.4018, "step": 28780 }, { "epoch": 1.0374094496702346, "grad_norm": 0.160639688372612, "learning_rate": 4.5985667559677303e-05, "loss": 0.3925, "step": 28785 }, { "epoch": 1.0375896493314594, "grad_norm": 0.1615612506866455, "learning_rate": 4.598408149011102e-05, "loss": 0.4166, "step": 28790 }, { "epoch": 1.0377698489926839, "grad_norm": 0.24002741277217865, "learning_rate": 4.598249513464069e-05, "loss": 0.3697, "step": 28795 }, { "epoch": 1.0379500486539086, "grad_norm": 0.18659855425357819, "learning_rate": 4.5980908493287933e-05, "loss": 0.4156, "step": 28800 }, { "epoch": 1.0381302483151331, "grad_norm": 0.18351595103740692, "learning_rate": 4.5979321566074354e-05, "loss": 0.4355, "step": 28805 }, { "epoch": 1.0383104479763579, "grad_norm": 
0.20558714866638184, "learning_rate": 4.5977734353021586e-05, "loss": 0.4162, "step": 28810 }, { "epoch": 1.0384906476375824, "grad_norm": 0.15841884911060333, "learning_rate": 4.5976146854151244e-05, "loss": 0.3912, "step": 28815 }, { "epoch": 1.038670847298807, "grad_norm": 0.17524218559265137, "learning_rate": 4.597455906948496e-05, "loss": 0.419, "step": 28820 }, { "epoch": 1.0388510469600316, "grad_norm": 0.1817321926355362, "learning_rate": 4.597297099904437e-05, "loss": 0.4137, "step": 28825 }, { "epoch": 1.0390312466212563, "grad_norm": 0.18538501858711243, "learning_rate": 4.597138264285111e-05, "loss": 0.3848, "step": 28830 }, { "epoch": 1.039211446282481, "grad_norm": 0.16245108842849731, "learning_rate": 4.596979400092683e-05, "loss": 0.4461, "step": 28835 }, { "epoch": 1.0393916459437056, "grad_norm": 0.2564604878425598, "learning_rate": 4.5968205073293156e-05, "loss": 0.4233, "step": 28840 }, { "epoch": 1.0395718456049303, "grad_norm": 0.1610010415315628, "learning_rate": 4.596661585997176e-05, "loss": 0.4268, "step": 28845 }, { "epoch": 1.0397520452661548, "grad_norm": 0.19828878343105316, "learning_rate": 4.596502636098427e-05, "loss": 0.43, "step": 28850 }, { "epoch": 1.0399322449273796, "grad_norm": 0.17699094116687775, "learning_rate": 4.596343657635236e-05, "loss": 0.4678, "step": 28855 }, { "epoch": 1.040112444588604, "grad_norm": 0.1853751391172409, "learning_rate": 4.596184650609768e-05, "loss": 0.4363, "step": 28860 }, { "epoch": 1.0402926442498288, "grad_norm": 0.17727990448474884, "learning_rate": 4.596025615024191e-05, "loss": 0.4541, "step": 28865 }, { "epoch": 1.0404728439110535, "grad_norm": 0.16852405667304993, "learning_rate": 4.5958665508806696e-05, "loss": 0.3916, "step": 28870 }, { "epoch": 1.040653043572278, "grad_norm": 0.1764410436153412, "learning_rate": 4.595707458181373e-05, "loss": 0.4453, "step": 28875 }, { "epoch": 1.0408332432335028, "grad_norm": 0.19933126866817474, "learning_rate": 4.595548336928468e-05, "loss": 
0.4561, "step": 28880 }, { "epoch": 1.0410134428947273, "grad_norm": 0.2222566306591034, "learning_rate": 4.5953891871241226e-05, "loss": 0.418, "step": 28885 }, { "epoch": 1.041193642555952, "grad_norm": 0.16706469655036926, "learning_rate": 4.5952300087705055e-05, "loss": 0.453, "step": 28890 }, { "epoch": 1.0413738422171765, "grad_norm": 0.15461257100105286, "learning_rate": 4.595070801869784e-05, "loss": 0.4037, "step": 28895 }, { "epoch": 1.0415540418784013, "grad_norm": 0.16241170465946198, "learning_rate": 4.594911566424129e-05, "loss": 0.4157, "step": 28900 }, { "epoch": 1.041734241539626, "grad_norm": 0.18162140250205994, "learning_rate": 4.5947523024357106e-05, "loss": 0.3942, "step": 28905 }, { "epoch": 1.0419144412008505, "grad_norm": 0.21681730449199677, "learning_rate": 4.5945930099066966e-05, "loss": 0.4247, "step": 28910 }, { "epoch": 1.0420946408620753, "grad_norm": 0.16923941671848297, "learning_rate": 4.594433688839258e-05, "loss": 0.4093, "step": 28915 }, { "epoch": 1.0422748405232998, "grad_norm": 0.17253339290618896, "learning_rate": 4.594274339235567e-05, "loss": 0.4216, "step": 28920 }, { "epoch": 1.0424550401845245, "grad_norm": 0.17402751743793488, "learning_rate": 4.594114961097793e-05, "loss": 0.421, "step": 28925 }, { "epoch": 1.042635239845749, "grad_norm": 0.17757736146450043, "learning_rate": 4.593955554428108e-05, "loss": 0.416, "step": 28930 }, { "epoch": 1.0428154395069738, "grad_norm": 0.19702796638011932, "learning_rate": 4.593796119228684e-05, "loss": 0.4186, "step": 28935 }, { "epoch": 1.0429956391681983, "grad_norm": 0.17790283262729645, "learning_rate": 4.593636655501693e-05, "loss": 0.4218, "step": 28940 }, { "epoch": 1.043175838829423, "grad_norm": 0.18331415951251984, "learning_rate": 4.593477163249308e-05, "loss": 0.4208, "step": 28945 }, { "epoch": 1.0433560384906477, "grad_norm": 0.19453075528144836, "learning_rate": 4.5933176424737025e-05, "loss": 0.4193, "step": 28950 }, { "epoch": 1.0435362381518722, "grad_norm": 
0.2180250734090805, "learning_rate": 4.5931580931770494e-05, "loss": 0.406, "step": 28955 }, { "epoch": 1.043716437813097, "grad_norm": 0.16775591671466827, "learning_rate": 4.592998515361522e-05, "loss": 0.4246, "step": 28960 }, { "epoch": 1.0438966374743215, "grad_norm": 0.1662595570087433, "learning_rate": 4.5928389090292955e-05, "loss": 0.4263, "step": 28965 }, { "epoch": 1.0440768371355462, "grad_norm": 0.17564332485198975, "learning_rate": 4.592679274182544e-05, "loss": 0.4001, "step": 28970 }, { "epoch": 1.0442570367967707, "grad_norm": 0.18523907661437988, "learning_rate": 4.592519610823442e-05, "loss": 0.4231, "step": 28975 }, { "epoch": 1.0444372364579955, "grad_norm": 0.17822639644145966, "learning_rate": 4.592359918954165e-05, "loss": 0.4239, "step": 28980 }, { "epoch": 1.0446174361192202, "grad_norm": 0.19693171977996826, "learning_rate": 4.59220019857689e-05, "loss": 0.4082, "step": 28985 }, { "epoch": 1.0447976357804447, "grad_norm": 0.16676361858844757, "learning_rate": 4.592040449693793e-05, "loss": 0.4109, "step": 28990 }, { "epoch": 1.0449778354416694, "grad_norm": 0.16993066668510437, "learning_rate": 4.59188067230705e-05, "loss": 0.4093, "step": 28995 }, { "epoch": 1.045158035102894, "grad_norm": 0.20470210909843445, "learning_rate": 4.591720866418836e-05, "loss": 0.4041, "step": 29000 }, { "epoch": 1.045158035102894, "eval_loss": 0.4535163640975952, "eval_runtime": 3.5354, "eval_samples_per_second": 28.285, "eval_steps_per_second": 7.071, "step": 29000 }, { "epoch": 1.0453382347641187, "grad_norm": 0.1358274519443512, "learning_rate": 4.591561032031332e-05, "loss": 0.366, "step": 29005 }, { "epoch": 1.0455184344253432, "grad_norm": 0.1652381271123886, "learning_rate": 4.591401169146713e-05, "loss": 0.4371, "step": 29010 }, { "epoch": 1.045698634086568, "grad_norm": 0.2014123499393463, "learning_rate": 4.591241277767158e-05, "loss": 0.4396, "step": 29015 }, { "epoch": 1.0458788337477924, "grad_norm": 0.23837004601955414, "learning_rate": 
4.591081357894845e-05, "loss": 0.4038, "step": 29020 }, { "epoch": 1.0460590334090172, "grad_norm": 0.167801171541214, "learning_rate": 4.590921409531954e-05, "loss": 0.4502, "step": 29025 }, { "epoch": 1.046239233070242, "grad_norm": 0.14784109592437744, "learning_rate": 4.5907614326806635e-05, "loss": 0.4006, "step": 29030 }, { "epoch": 1.0464194327314664, "grad_norm": 0.17408788204193115, "learning_rate": 4.5906014273431535e-05, "loss": 0.3958, "step": 29035 }, { "epoch": 1.0465996323926912, "grad_norm": 0.1565713882446289, "learning_rate": 4.590441393521604e-05, "loss": 0.4062, "step": 29040 }, { "epoch": 1.0467798320539157, "grad_norm": 0.20593391358852386, "learning_rate": 4.590281331218195e-05, "loss": 0.4573, "step": 29045 }, { "epoch": 1.0469600317151404, "grad_norm": 0.1602008193731308, "learning_rate": 4.5901212404351065e-05, "loss": 0.4379, "step": 29050 }, { "epoch": 1.047140231376365, "grad_norm": 0.19146746397018433, "learning_rate": 4.589961121174522e-05, "loss": 0.3882, "step": 29055 }, { "epoch": 1.0473204310375896, "grad_norm": 0.18531233072280884, "learning_rate": 4.5898009734386216e-05, "loss": 0.4533, "step": 29060 }, { "epoch": 1.0475006306988144, "grad_norm": 0.15970437228679657, "learning_rate": 4.5896407972295875e-05, "loss": 0.3773, "step": 29065 }, { "epoch": 1.047680830360039, "grad_norm": 0.17048677802085876, "learning_rate": 4.589480592549602e-05, "loss": 0.4305, "step": 29070 }, { "epoch": 1.0478610300212636, "grad_norm": 0.1856071949005127, "learning_rate": 4.5893203594008476e-05, "loss": 0.4541, "step": 29075 }, { "epoch": 1.0480412296824881, "grad_norm": 0.19591088593006134, "learning_rate": 4.5891600977855085e-05, "loss": 0.3656, "step": 29080 }, { "epoch": 1.0482214293437129, "grad_norm": 0.15485340356826782, "learning_rate": 4.5889998077057674e-05, "loss": 0.4005, "step": 29085 }, { "epoch": 1.0484016290049374, "grad_norm": 0.17994803190231323, "learning_rate": 4.588839489163808e-05, "loss": 0.406, "step": 29090 }, { "epoch": 
1.0485818286661621, "grad_norm": 0.1988825649023056, "learning_rate": 4.5886791421618155e-05, "loss": 0.402, "step": 29095 }, { "epoch": 1.0487620283273866, "grad_norm": 0.18159984052181244, "learning_rate": 4.5885187667019733e-05, "loss": 0.4012, "step": 29100 }, { "epoch": 1.0489422279886114, "grad_norm": 0.17193853855133057, "learning_rate": 4.588358362786468e-05, "loss": 0.4378, "step": 29105 }, { "epoch": 1.049122427649836, "grad_norm": 0.1589495688676834, "learning_rate": 4.588197930417484e-05, "loss": 0.4416, "step": 29110 }, { "epoch": 1.0493026273110606, "grad_norm": 0.1768224835395813, "learning_rate": 4.588037469597207e-05, "loss": 0.4335, "step": 29115 }, { "epoch": 1.0494828269722853, "grad_norm": 0.207745760679245, "learning_rate": 4.587876980327824e-05, "loss": 0.4003, "step": 29120 }, { "epoch": 1.0496630266335099, "grad_norm": 0.20602521300315857, "learning_rate": 4.587716462611522e-05, "loss": 0.453, "step": 29125 }, { "epoch": 1.0498432262947346, "grad_norm": 0.1503795087337494, "learning_rate": 4.587555916450487e-05, "loss": 0.3976, "step": 29130 }, { "epoch": 1.050023425955959, "grad_norm": 0.21797312796115875, "learning_rate": 4.587395341846907e-05, "loss": 0.4621, "step": 29135 }, { "epoch": 1.0502036256171838, "grad_norm": 0.17475320398807526, "learning_rate": 4.587234738802969e-05, "loss": 0.4362, "step": 29140 }, { "epoch": 1.0503838252784086, "grad_norm": 0.15241824090480804, "learning_rate": 4.5870741073208624e-05, "loss": 0.4161, "step": 29145 }, { "epoch": 1.050564024939633, "grad_norm": 0.17844238877296448, "learning_rate": 4.5869134474027756e-05, "loss": 0.4955, "step": 29150 }, { "epoch": 1.0507442246008578, "grad_norm": 0.1921539604663849, "learning_rate": 4.586752759050896e-05, "loss": 0.4587, "step": 29155 }, { "epoch": 1.0509244242620823, "grad_norm": 0.20518042147159576, "learning_rate": 4.586592042267415e-05, "loss": 0.4486, "step": 29160 }, { "epoch": 1.051104623923307, "grad_norm": 0.15805669128894806, "learning_rate": 
4.586431297054521e-05, "loss": 0.4031, "step": 29165 }, { "epoch": 1.0512848235845316, "grad_norm": 0.20354914665222168, "learning_rate": 4.586270523414404e-05, "loss": 0.4089, "step": 29170 }, { "epoch": 1.0514650232457563, "grad_norm": 0.18959026038646698, "learning_rate": 4.586109721349255e-05, "loss": 0.4437, "step": 29175 }, { "epoch": 1.051645222906981, "grad_norm": 0.2117196023464203, "learning_rate": 4.585948890861266e-05, "loss": 0.4373, "step": 29180 }, { "epoch": 1.0518254225682055, "grad_norm": 0.18057450652122498, "learning_rate": 4.585788031952627e-05, "loss": 0.4321, "step": 29185 }, { "epoch": 1.0520056222294303, "grad_norm": 0.1702229529619217, "learning_rate": 4.5856271446255294e-05, "loss": 0.3866, "step": 29190 }, { "epoch": 1.0521858218906548, "grad_norm": 0.17912250757217407, "learning_rate": 4.5854662288821656e-05, "loss": 0.3849, "step": 29195 }, { "epoch": 1.0523660215518795, "grad_norm": 0.19952471554279327, "learning_rate": 4.5853052847247286e-05, "loss": 0.4252, "step": 29200 }, { "epoch": 1.052546221213104, "grad_norm": 0.17590820789337158, "learning_rate": 4.5851443121554104e-05, "loss": 0.4243, "step": 29205 }, { "epoch": 1.0527264208743288, "grad_norm": 0.18067772686481476, "learning_rate": 4.584983311176405e-05, "loss": 0.4142, "step": 29210 }, { "epoch": 1.0529066205355533, "grad_norm": 0.17660415172576904, "learning_rate": 4.584822281789906e-05, "loss": 0.4109, "step": 29215 }, { "epoch": 1.053086820196778, "grad_norm": 0.20561589300632477, "learning_rate": 4.5846612239981065e-05, "loss": 0.4339, "step": 29220 }, { "epoch": 1.0532670198580028, "grad_norm": 0.14123573899269104, "learning_rate": 4.584500137803201e-05, "loss": 0.4289, "step": 29225 }, { "epoch": 1.0534472195192273, "grad_norm": 0.17180036008358002, "learning_rate": 4.5843390232073856e-05, "loss": 0.4204, "step": 29230 }, { "epoch": 1.053627419180452, "grad_norm": 0.18303899466991425, "learning_rate": 4.5841778802128544e-05, "loss": 0.425, "step": 29235 }, { "epoch": 
1.0538076188416765, "grad_norm": 0.15086086094379425, "learning_rate": 4.5840167088218026e-05, "loss": 0.3899, "step": 29240 }, { "epoch": 1.0539878185029012, "grad_norm": 0.15532901883125305, "learning_rate": 4.583855509036427e-05, "loss": 0.4025, "step": 29245 }, { "epoch": 1.0541680181641258, "grad_norm": 0.15308701992034912, "learning_rate": 4.5836942808589235e-05, "loss": 0.4168, "step": 29250 }, { "epoch": 1.0543482178253505, "grad_norm": 0.1958785504102707, "learning_rate": 4.583533024291489e-05, "loss": 0.4647, "step": 29255 }, { "epoch": 1.0545284174865752, "grad_norm": 0.15900172293186188, "learning_rate": 4.583371739336319e-05, "loss": 0.4076, "step": 29260 }, { "epoch": 1.0547086171477997, "grad_norm": 0.19627580046653748, "learning_rate": 4.583210425995614e-05, "loss": 0.4619, "step": 29265 }, { "epoch": 1.0548888168090245, "grad_norm": 0.17516663670539856, "learning_rate": 4.58304908427157e-05, "loss": 0.4433, "step": 29270 }, { "epoch": 1.055069016470249, "grad_norm": 0.1998327523469925, "learning_rate": 4.582887714166386e-05, "loss": 0.4309, "step": 29275 }, { "epoch": 1.0552492161314737, "grad_norm": 0.15834058821201324, "learning_rate": 4.582726315682259e-05, "loss": 0.4016, "step": 29280 }, { "epoch": 1.0554294157926982, "grad_norm": 0.16241154074668884, "learning_rate": 4.58256488882139e-05, "loss": 0.4014, "step": 29285 }, { "epoch": 1.055609615453923, "grad_norm": 0.18402625620365143, "learning_rate": 4.582403433585978e-05, "loss": 0.4168, "step": 29290 }, { "epoch": 1.0557898151151477, "grad_norm": 0.17826689779758453, "learning_rate": 4.582241949978221e-05, "loss": 0.4416, "step": 29295 }, { "epoch": 1.0559700147763722, "grad_norm": 0.18548059463500977, "learning_rate": 4.582080438000321e-05, "loss": 0.422, "step": 29300 }, { "epoch": 1.056150214437597, "grad_norm": 0.1519528478384018, "learning_rate": 4.581918897654479e-05, "loss": 0.4051, "step": 29305 }, { "epoch": 1.0563304140988214, "grad_norm": 0.14724068343639374, "learning_rate": 
4.581757328942894e-05, "loss": 0.3999, "step": 29310 }, { "epoch": 1.0565106137600462, "grad_norm": 0.18171782791614532, "learning_rate": 4.5815957318677693e-05, "loss": 0.4232, "step": 29315 }, { "epoch": 1.0566908134212707, "grad_norm": 0.20490474998950958, "learning_rate": 4.5814341064313055e-05, "loss": 0.4333, "step": 29320 }, { "epoch": 1.0568710130824954, "grad_norm": 0.18439970910549164, "learning_rate": 4.581272452635705e-05, "loss": 0.3899, "step": 29325 }, { "epoch": 1.05705121274372, "grad_norm": 0.14571373164653778, "learning_rate": 4.58111077048317e-05, "loss": 0.4089, "step": 29330 }, { "epoch": 1.0572314124049447, "grad_norm": 0.15669065713882446, "learning_rate": 4.5809490599759034e-05, "loss": 0.3999, "step": 29335 }, { "epoch": 1.0574116120661694, "grad_norm": 0.16960056126117706, "learning_rate": 4.58078732111611e-05, "loss": 0.4173, "step": 29340 }, { "epoch": 1.057591811727394, "grad_norm": 0.17213228344917297, "learning_rate": 4.5806255539059905e-05, "loss": 0.4012, "step": 29345 }, { "epoch": 1.0577720113886186, "grad_norm": 0.15806017816066742, "learning_rate": 4.5804637583477514e-05, "loss": 0.4421, "step": 29350 }, { "epoch": 1.0579522110498432, "grad_norm": 0.19483284652233124, "learning_rate": 4.5803019344435974e-05, "loss": 0.4188, "step": 29355 }, { "epoch": 1.058132410711068, "grad_norm": 0.18508678674697876, "learning_rate": 4.580140082195731e-05, "loss": 0.4134, "step": 29360 }, { "epoch": 1.0583126103722924, "grad_norm": 0.1570422649383545, "learning_rate": 4.579978201606359e-05, "loss": 0.4138, "step": 29365 }, { "epoch": 1.0584928100335171, "grad_norm": 0.14425136148929596, "learning_rate": 4.5798162926776865e-05, "loss": 0.4375, "step": 29370 }, { "epoch": 1.0586730096947419, "grad_norm": 0.17451147735118866, "learning_rate": 4.57965435541192e-05, "loss": 0.4292, "step": 29375 }, { "epoch": 1.0588532093559664, "grad_norm": 0.16206900775432587, "learning_rate": 4.579492389811266e-05, "loss": 0.4548, "step": 29380 }, { "epoch": 
1.0590334090171911, "grad_norm": 0.1823441982269287, "learning_rate": 4.57933039587793e-05, "loss": 0.409, "step": 29385 }, { "epoch": 1.0592136086784156, "grad_norm": 0.18232353031635284, "learning_rate": 4.579168373614121e-05, "loss": 0.3947, "step": 29390 }, { "epoch": 1.0593938083396404, "grad_norm": 0.16864867508411407, "learning_rate": 4.5790063230220444e-05, "loss": 0.4212, "step": 29395 }, { "epoch": 1.0595740080008649, "grad_norm": 0.19608835875988007, "learning_rate": 4.578844244103909e-05, "loss": 0.42, "step": 29400 }, { "epoch": 1.0597542076620896, "grad_norm": 0.1760438084602356, "learning_rate": 4.5786821368619236e-05, "loss": 0.4085, "step": 29405 }, { "epoch": 1.0599344073233143, "grad_norm": 0.17386280000209808, "learning_rate": 4.578520001298297e-05, "loss": 0.4454, "step": 29410 }, { "epoch": 1.0601146069845389, "grad_norm": 0.14320212602615356, "learning_rate": 4.5783578374152376e-05, "loss": 0.389, "step": 29415 }, { "epoch": 1.0602948066457636, "grad_norm": 0.20935241878032684, "learning_rate": 4.578195645214955e-05, "loss": 0.4252, "step": 29420 }, { "epoch": 1.060475006306988, "grad_norm": 0.17992691695690155, "learning_rate": 4.578033424699659e-05, "loss": 0.4362, "step": 29425 }, { "epoch": 1.0606552059682128, "grad_norm": 0.18552446365356445, "learning_rate": 4.57787117587156e-05, "loss": 0.4269, "step": 29430 }, { "epoch": 1.0608354056294373, "grad_norm": 0.18173004686832428, "learning_rate": 4.577708898732868e-05, "loss": 0.4358, "step": 29435 }, { "epoch": 1.061015605290662, "grad_norm": 0.19340266287326813, "learning_rate": 4.577546593285795e-05, "loss": 0.4268, "step": 29440 }, { "epoch": 1.0611958049518866, "grad_norm": 0.17074435949325562, "learning_rate": 4.577384259532552e-05, "loss": 0.4342, "step": 29445 }, { "epoch": 1.0613760046131113, "grad_norm": 0.17281962931156158, "learning_rate": 4.57722189747535e-05, "loss": 0.3953, "step": 29450 }, { "epoch": 1.061556204274336, "grad_norm": 0.16818857192993164, "learning_rate": 
4.577059507116403e-05, "loss": 0.3934, "step": 29455 }, { "epoch": 1.0617364039355606, "grad_norm": 0.15899808704853058, "learning_rate": 4.576897088457921e-05, "loss": 0.4223, "step": 29460 }, { "epoch": 1.0619166035967853, "grad_norm": 0.17966797947883606, "learning_rate": 4.576734641502119e-05, "loss": 0.4265, "step": 29465 }, { "epoch": 1.0620968032580098, "grad_norm": 0.198250874876976, "learning_rate": 4.57657216625121e-05, "loss": 0.4491, "step": 29470 }, { "epoch": 1.0622770029192345, "grad_norm": 0.15183819830417633, "learning_rate": 4.576409662707406e-05, "loss": 0.4185, "step": 29475 }, { "epoch": 1.062457202580459, "grad_norm": 0.1668313890695572, "learning_rate": 4.576247130872924e-05, "loss": 0.3849, "step": 29480 }, { "epoch": 1.0626374022416838, "grad_norm": 0.15694022178649902, "learning_rate": 4.5760845707499753e-05, "loss": 0.4084, "step": 29485 }, { "epoch": 1.0628176019029085, "grad_norm": 0.17181478440761566, "learning_rate": 4.575921982340777e-05, "loss": 0.4529, "step": 29490 }, { "epoch": 1.062997801564133, "grad_norm": 0.18744981288909912, "learning_rate": 4.575759365647543e-05, "loss": 0.4149, "step": 29495 }, { "epoch": 1.0631780012253578, "grad_norm": 0.1948675960302353, "learning_rate": 4.5755967206724906e-05, "loss": 0.4541, "step": 29500 }, { "epoch": 1.0631780012253578, "eval_loss": 0.45399028062820435, "eval_runtime": 3.5401, "eval_samples_per_second": 28.248, "eval_steps_per_second": 7.062, "step": 29500 }, { "epoch": 1.0633582008865823, "grad_norm": 0.2056550681591034, "learning_rate": 4.5754340474178334e-05, "loss": 0.4456, "step": 29505 }, { "epoch": 1.063538400547807, "grad_norm": 0.14791448414325714, "learning_rate": 4.57527134588579e-05, "loss": 0.429, "step": 29510 }, { "epoch": 1.0637186002090315, "grad_norm": 0.16541562974452972, "learning_rate": 4.5751086160785764e-05, "loss": 0.4213, "step": 29515 }, { "epoch": 1.0638987998702563, "grad_norm": 0.18588918447494507, "learning_rate": 4.57494585799841e-05, "loss": 0.4371, 
"step": 29520 }, { "epoch": 1.064078999531481, "grad_norm": 0.20339952409267426, "learning_rate": 4.574783071647507e-05, "loss": 0.4157, "step": 29525 }, { "epoch": 1.0642591991927055, "grad_norm": 0.1556108593940735, "learning_rate": 4.574620257028087e-05, "loss": 0.4185, "step": 29530 }, { "epoch": 1.0644393988539302, "grad_norm": 0.2084941267967224, "learning_rate": 4.574457414142367e-05, "loss": 0.4225, "step": 29535 }, { "epoch": 1.0646195985151548, "grad_norm": 0.1711416095495224, "learning_rate": 4.574294542992566e-05, "loss": 0.4142, "step": 29540 }, { "epoch": 1.0647997981763795, "grad_norm": 0.17444539070129395, "learning_rate": 4.574131643580905e-05, "loss": 0.4169, "step": 29545 }, { "epoch": 1.064979997837604, "grad_norm": 0.2214493304491043, "learning_rate": 4.573968715909601e-05, "loss": 0.4479, "step": 29550 }, { "epoch": 1.0651601974988287, "grad_norm": 0.16091366112232208, "learning_rate": 4.573805759980875e-05, "loss": 0.4392, "step": 29555 }, { "epoch": 1.0653403971600532, "grad_norm": 0.1976114958524704, "learning_rate": 4.573642775796947e-05, "loss": 0.4436, "step": 29560 }, { "epoch": 1.065520596821278, "grad_norm": 0.14705707132816315, "learning_rate": 4.573479763360038e-05, "loss": 0.4187, "step": 29565 }, { "epoch": 1.0657007964825027, "grad_norm": 0.20177073776721954, "learning_rate": 4.573316722672368e-05, "loss": 0.4562, "step": 29570 }, { "epoch": 1.0658809961437272, "grad_norm": 0.16343575716018677, "learning_rate": 4.573153653736159e-05, "loss": 0.4348, "step": 29575 }, { "epoch": 1.066061195804952, "grad_norm": 0.19736631214618683, "learning_rate": 4.572990556553634e-05, "loss": 0.4196, "step": 29580 }, { "epoch": 1.0662413954661765, "grad_norm": 0.1972283273935318, "learning_rate": 4.572827431127012e-05, "loss": 0.4352, "step": 29585 }, { "epoch": 1.0664215951274012, "grad_norm": 0.17068789899349213, "learning_rate": 4.5726642774585195e-05, "loss": 0.4277, "step": 29590 }, { "epoch": 1.0666017947886257, "grad_norm": 
0.22462329268455505, "learning_rate": 4.5725010955503764e-05, "loss": 0.4148, "step": 29595 }, { "epoch": 1.0667819944498504, "grad_norm": 0.14778359234333038, "learning_rate": 4.572337885404807e-05, "loss": 0.4104, "step": 29600 }, { "epoch": 1.066962194111075, "grad_norm": 0.1986793428659439, "learning_rate": 4.5721746470240355e-05, "loss": 0.4351, "step": 29605 }, { "epoch": 1.0671423937722997, "grad_norm": 0.20412606000900269, "learning_rate": 4.572011380410286e-05, "loss": 0.3812, "step": 29610 }, { "epoch": 1.0673225934335244, "grad_norm": 0.17064638435840607, "learning_rate": 4.571848085565783e-05, "loss": 0.4169, "step": 29615 }, { "epoch": 1.067502793094749, "grad_norm": 0.20599807798862457, "learning_rate": 4.5716847624927496e-05, "loss": 0.4217, "step": 29620 }, { "epoch": 1.0676829927559737, "grad_norm": 0.1645248532295227, "learning_rate": 4.5715214111934134e-05, "loss": 0.3981, "step": 29625 }, { "epoch": 1.0678631924171982, "grad_norm": 0.16410000622272491, "learning_rate": 4.571358031669999e-05, "loss": 0.4021, "step": 29630 }, { "epoch": 1.068043392078423, "grad_norm": 0.19438453018665314, "learning_rate": 4.571194623924731e-05, "loss": 0.4029, "step": 29635 }, { "epoch": 1.0682235917396474, "grad_norm": 0.18021252751350403, "learning_rate": 4.571031187959839e-05, "loss": 0.3966, "step": 29640 }, { "epoch": 1.0684037914008722, "grad_norm": 0.17498481273651123, "learning_rate": 4.570867723777548e-05, "loss": 0.3946, "step": 29645 }, { "epoch": 1.068583991062097, "grad_norm": 0.16093195974826813, "learning_rate": 4.570704231380084e-05, "loss": 0.4181, "step": 29650 }, { "epoch": 1.0687641907233214, "grad_norm": 0.20035532116889954, "learning_rate": 4.570540710769676e-05, "loss": 0.4173, "step": 29655 }, { "epoch": 1.0689443903845461, "grad_norm": 0.16321523487567902, "learning_rate": 4.5703771619485523e-05, "loss": 0.4154, "step": 29660 }, { "epoch": 1.0691245900457707, "grad_norm": 0.17585666477680206, "learning_rate": 4.570213584918941e-05, "loss": 
0.4138, "step": 29665 }, { "epoch": 1.0693047897069954, "grad_norm": 0.14717882871627808, "learning_rate": 4.570049979683069e-05, "loss": 0.4144, "step": 29670 }, { "epoch": 1.06948498936822, "grad_norm": 0.16287022829055786, "learning_rate": 4.569886346243167e-05, "loss": 0.4082, "step": 29675 }, { "epoch": 1.0696651890294446, "grad_norm": 0.1774132400751114, "learning_rate": 4.569722684601465e-05, "loss": 0.443, "step": 29680 }, { "epoch": 1.0698453886906694, "grad_norm": 0.17134885489940643, "learning_rate": 4.569558994760192e-05, "loss": 0.423, "step": 29685 }, { "epoch": 1.0700255883518939, "grad_norm": 0.20845064520835876, "learning_rate": 4.569395276721579e-05, "loss": 0.4343, "step": 29690 }, { "epoch": 1.0702057880131186, "grad_norm": 0.19249121844768524, "learning_rate": 4.569231530487855e-05, "loss": 0.4168, "step": 29695 }, { "epoch": 1.0703859876743431, "grad_norm": 0.21180908381938934, "learning_rate": 4.569067756061253e-05, "loss": 0.4245, "step": 29700 }, { "epoch": 1.0705661873355679, "grad_norm": 0.199473574757576, "learning_rate": 4.568903953444003e-05, "loss": 0.4199, "step": 29705 }, { "epoch": 1.0707463869967924, "grad_norm": 0.19441378116607666, "learning_rate": 4.568740122638337e-05, "loss": 0.4198, "step": 29710 }, { "epoch": 1.070926586658017, "grad_norm": 0.17675237357616425, "learning_rate": 4.568576263646487e-05, "loss": 0.3893, "step": 29715 }, { "epoch": 1.0711067863192416, "grad_norm": 0.2046016901731491, "learning_rate": 4.5684123764706855e-05, "loss": 0.4285, "step": 29720 }, { "epoch": 1.0712869859804663, "grad_norm": 0.1835736781358719, "learning_rate": 4.568248461113167e-05, "loss": 0.4359, "step": 29725 }, { "epoch": 1.071467185641691, "grad_norm": 0.1686883121728897, "learning_rate": 4.5680845175761635e-05, "loss": 0.4405, "step": 29730 }, { "epoch": 1.0716473853029156, "grad_norm": 0.15105289220809937, "learning_rate": 4.567920545861908e-05, "loss": 0.4067, "step": 29735 }, { "epoch": 1.0718275849641403, "grad_norm": 
0.21068331599235535, "learning_rate": 4.5677565459726355e-05, "loss": 0.4408, "step": 29740 }, { "epoch": 1.0720077846253648, "grad_norm": 0.16096103191375732, "learning_rate": 4.567592517910582e-05, "loss": 0.4391, "step": 29745 }, { "epoch": 1.0721879842865896, "grad_norm": 0.18630751967430115, "learning_rate": 4.567428461677979e-05, "loss": 0.4219, "step": 29750 }, { "epoch": 1.072368183947814, "grad_norm": 0.18894068896770477, "learning_rate": 4.567264377277064e-05, "loss": 0.3977, "step": 29755 }, { "epoch": 1.0725483836090388, "grad_norm": 0.2151520699262619, "learning_rate": 4.5671002647100716e-05, "loss": 0.4539, "step": 29760 }, { "epoch": 1.0727285832702635, "grad_norm": 0.1710183173418045, "learning_rate": 4.566936123979239e-05, "loss": 0.4098, "step": 29765 }, { "epoch": 1.072908782931488, "grad_norm": 0.17231152951717377, "learning_rate": 4.566771955086802e-05, "loss": 0.4076, "step": 29770 }, { "epoch": 1.0730889825927128, "grad_norm": 0.18413393199443817, "learning_rate": 4.566607758034997e-05, "loss": 0.4398, "step": 29775 }, { "epoch": 1.0732691822539373, "grad_norm": 0.18336208164691925, "learning_rate": 4.566443532826061e-05, "loss": 0.4552, "step": 29780 }, { "epoch": 1.073449381915162, "grad_norm": 0.18550068140029907, "learning_rate": 4.566279279462232e-05, "loss": 0.4303, "step": 29785 }, { "epoch": 1.0736295815763865, "grad_norm": 0.1502627283334732, "learning_rate": 4.5661149979457485e-05, "loss": 0.4258, "step": 29790 }, { "epoch": 1.0738097812376113, "grad_norm": 0.17316554486751556, "learning_rate": 4.565950688278848e-05, "loss": 0.414, "step": 29795 }, { "epoch": 1.073989980898836, "grad_norm": 0.19836434721946716, "learning_rate": 4.5657863504637694e-05, "loss": 0.3891, "step": 29800 }, { "epoch": 1.0741701805600605, "grad_norm": 0.17455318570137024, "learning_rate": 4.5656219845027516e-05, "loss": 0.3748, "step": 29805 }, { "epoch": 1.0743503802212853, "grad_norm": 0.2048964947462082, "learning_rate": 4.565457590398034e-05, "loss": 
0.431, "step": 29810 }, { "epoch": 1.0745305798825098, "grad_norm": 0.1792224943637848, "learning_rate": 4.5652931681518565e-05, "loss": 0.4157, "step": 29815 }, { "epoch": 1.0747107795437345, "grad_norm": 0.22106477618217468, "learning_rate": 4.56512871776646e-05, "loss": 0.4332, "step": 29820 }, { "epoch": 1.074890979204959, "grad_norm": 0.19627970457077026, "learning_rate": 4.564964239244084e-05, "loss": 0.4105, "step": 29825 }, { "epoch": 1.0750711788661838, "grad_norm": 0.1940174698829651, "learning_rate": 4.564799732586971e-05, "loss": 0.4369, "step": 29830 }, { "epoch": 1.0752513785274083, "grad_norm": 0.19587305188179016, "learning_rate": 4.56463519779736e-05, "loss": 0.3994, "step": 29835 }, { "epoch": 1.075431578188633, "grad_norm": 0.14254744350910187, "learning_rate": 4.564470634877495e-05, "loss": 0.4357, "step": 29840 }, { "epoch": 1.0756117778498577, "grad_norm": 0.2016969919204712, "learning_rate": 4.564306043829617e-05, "loss": 0.4505, "step": 29845 }, { "epoch": 1.0757919775110822, "grad_norm": 0.17493173480033875, "learning_rate": 4.564141424655969e-05, "loss": 0.3733, "step": 29850 }, { "epoch": 1.075972177172307, "grad_norm": 0.16929864883422852, "learning_rate": 4.563976777358793e-05, "loss": 0.3971, "step": 29855 }, { "epoch": 1.0761523768335315, "grad_norm": 0.18552954494953156, "learning_rate": 4.563812101940334e-05, "loss": 0.4258, "step": 29860 }, { "epoch": 1.0763325764947562, "grad_norm": 0.1586046814918518, "learning_rate": 4.563647398402834e-05, "loss": 0.4375, "step": 29865 }, { "epoch": 1.0765127761559807, "grad_norm": 0.1995263397693634, "learning_rate": 4.5634826667485385e-05, "loss": 0.4137, "step": 29870 }, { "epoch": 1.0766929758172055, "grad_norm": 0.16926749050617218, "learning_rate": 4.56331790697969e-05, "loss": 0.4065, "step": 29875 }, { "epoch": 1.0768731754784302, "grad_norm": 0.18349143862724304, "learning_rate": 4.563153119098535e-05, "loss": 0.4106, "step": 29880 }, { "epoch": 1.0770533751396547, "grad_norm": 
0.18791143596172333, "learning_rate": 4.562988303107319e-05, "loss": 0.4174, "step": 29885 }, { "epoch": 1.0772335748008794, "grad_norm": 0.1980997622013092, "learning_rate": 4.5628234590082864e-05, "loss": 0.4179, "step": 29890 }, { "epoch": 1.077413774462104, "grad_norm": 0.19258050620555878, "learning_rate": 4.562658586803683e-05, "loss": 0.4205, "step": 29895 }, { "epoch": 1.0775939741233287, "grad_norm": 0.1779131144285202, "learning_rate": 4.5624936864957556e-05, "loss": 0.3996, "step": 29900 }, { "epoch": 1.0777741737845532, "grad_norm": 0.14219936728477478, "learning_rate": 4.562328758086752e-05, "loss": 0.4039, "step": 29905 }, { "epoch": 1.077954373445778, "grad_norm": 0.18294230103492737, "learning_rate": 4.562163801578918e-05, "loss": 0.4341, "step": 29910 }, { "epoch": 1.0781345731070027, "grad_norm": 0.18481683731079102, "learning_rate": 4.561998816974501e-05, "loss": 0.4289, "step": 29915 }, { "epoch": 1.0783147727682272, "grad_norm": 0.17565883696079254, "learning_rate": 4.561833804275749e-05, "loss": 0.4318, "step": 29920 }, { "epoch": 1.078494972429452, "grad_norm": 0.17877116799354553, "learning_rate": 4.561668763484911e-05, "loss": 0.4196, "step": 29925 }, { "epoch": 1.0786751720906764, "grad_norm": 0.1558140218257904, "learning_rate": 4.561503694604236e-05, "loss": 0.3991, "step": 29930 }, { "epoch": 1.0788553717519012, "grad_norm": 0.19004541635513306, "learning_rate": 4.561338597635972e-05, "loss": 0.468, "step": 29935 }, { "epoch": 1.0790355714131257, "grad_norm": 0.14542394876480103, "learning_rate": 4.561173472582368e-05, "loss": 0.448, "step": 29940 }, { "epoch": 1.0792157710743504, "grad_norm": 0.19577011466026306, "learning_rate": 4.5610083194456754e-05, "loss": 0.4157, "step": 29945 }, { "epoch": 1.079395970735575, "grad_norm": 0.21589916944503784, "learning_rate": 4.5608431382281426e-05, "loss": 0.4265, "step": 29950 }, { "epoch": 1.0795761703967997, "grad_norm": 0.14693371951580048, "learning_rate": 4.560677928932021e-05, "loss": 
0.411, "step": 29955 }, { "epoch": 1.0797563700580244, "grad_norm": 0.19670480489730835, "learning_rate": 4.560512691559562e-05, "loss": 0.4504, "step": 29960 }, { "epoch": 1.079936569719249, "grad_norm": 0.18953409790992737, "learning_rate": 4.560347426113017e-05, "loss": 0.4325, "step": 29965 }, { "epoch": 1.0801167693804736, "grad_norm": 0.18052424490451813, "learning_rate": 4.560182132594637e-05, "loss": 0.4092, "step": 29970 }, { "epoch": 1.0802969690416981, "grad_norm": 0.19230318069458008, "learning_rate": 4.560016811006673e-05, "loss": 0.4343, "step": 29975 }, { "epoch": 1.0804771687029229, "grad_norm": 0.2135818749666214, "learning_rate": 4.5598514613513796e-05, "loss": 0.4111, "step": 29980 }, { "epoch": 1.0806573683641474, "grad_norm": 0.1807836890220642, "learning_rate": 4.5596860836310094e-05, "loss": 0.435, "step": 29985 }, { "epoch": 1.0808375680253721, "grad_norm": 0.18161781132221222, "learning_rate": 4.559520677847815e-05, "loss": 0.4094, "step": 29990 }, { "epoch": 1.0810177676865969, "grad_norm": 0.19649210572242737, "learning_rate": 4.55935524400405e-05, "loss": 0.3864, "step": 29995 }, { "epoch": 1.0811979673478214, "grad_norm": 0.15059545636177063, "learning_rate": 4.559189782101968e-05, "loss": 0.3919, "step": 30000 }, { "epoch": 1.0811979673478214, "eval_loss": 0.4527240991592407, "eval_runtime": 3.5376, "eval_samples_per_second": 28.268, "eval_steps_per_second": 7.067, "step": 30000 }, { "epoch": 1.081378167009046, "grad_norm": 0.16200894117355347, "learning_rate": 4.559024292143824e-05, "loss": 0.4298, "step": 30005 }, { "epoch": 1.0815583666702706, "grad_norm": 0.19894878566265106, "learning_rate": 4.558858774131873e-05, "loss": 0.4217, "step": 30010 }, { "epoch": 1.0817385663314953, "grad_norm": 0.18201377987861633, "learning_rate": 4.5586932280683696e-05, "loss": 0.4006, "step": 30015 }, { "epoch": 1.0819187659927199, "grad_norm": 0.22016799449920654, "learning_rate": 4.55852765395557e-05, "loss": 0.4411, "step": 30020 }, { "epoch": 
1.0820989656539446, "grad_norm": 0.21885946393013, "learning_rate": 4.5583620517957296e-05, "loss": 0.4293, "step": 30025 }, { "epoch": 1.0822791653151693, "grad_norm": 0.21789535880088806, "learning_rate": 4.5581964215911036e-05, "loss": 0.4038, "step": 30030 }, { "epoch": 1.0824593649763938, "grad_norm": 0.14212708175182343, "learning_rate": 4.558030763343951e-05, "loss": 0.4043, "step": 30035 }, { "epoch": 1.0826395646376186, "grad_norm": 0.1961928755044937, "learning_rate": 4.5578650770565274e-05, "loss": 0.4439, "step": 30040 }, { "epoch": 1.082819764298843, "grad_norm": 0.18222855031490326, "learning_rate": 4.55769936273109e-05, "loss": 0.4137, "step": 30045 }, { "epoch": 1.0829999639600678, "grad_norm": 0.18094827234745026, "learning_rate": 4.5575336203698985e-05, "loss": 0.3873, "step": 30050 }, { "epoch": 1.0831801636212923, "grad_norm": 0.14984826743602753, "learning_rate": 4.557367849975208e-05, "loss": 0.4094, "step": 30055 }, { "epoch": 1.083360363282517, "grad_norm": 0.18728286027908325, "learning_rate": 4.5572020515492805e-05, "loss": 0.4556, "step": 30060 }, { "epoch": 1.0835405629437416, "grad_norm": 0.1479881852865219, "learning_rate": 4.5570362250943725e-05, "loss": 0.3622, "step": 30065 }, { "epoch": 1.0837207626049663, "grad_norm": 0.2368391901254654, "learning_rate": 4.556870370612744e-05, "loss": 0.4298, "step": 30070 }, { "epoch": 1.083900962266191, "grad_norm": 0.1753835380077362, "learning_rate": 4.5567044881066555e-05, "loss": 0.4145, "step": 30075 }, { "epoch": 1.0840811619274155, "grad_norm": 0.17474301159381866, "learning_rate": 4.556538577578366e-05, "loss": 0.4013, "step": 30080 }, { "epoch": 1.0842613615886403, "grad_norm": 0.2075207531452179, "learning_rate": 4.5563726390301374e-05, "loss": 0.3922, "step": 30085 }, { "epoch": 1.0844415612498648, "grad_norm": 0.1771383434534073, "learning_rate": 4.5562066724642286e-05, "loss": 0.3737, "step": 30090 }, { "epoch": 1.0846217609110895, "grad_norm": 0.1907252073287964, "learning_rate": 
4.556040677882903e-05, "loss": 0.4369, "step": 30095 }, { "epoch": 1.084801960572314, "grad_norm": 0.1812266856431961, "learning_rate": 4.55587465528842e-05, "loss": 0.4365, "step": 30100 }, { "epoch": 1.0849821602335388, "grad_norm": 0.17140717804431915, "learning_rate": 4.5557086046830446e-05, "loss": 0.4023, "step": 30105 }, { "epoch": 1.0851623598947633, "grad_norm": 0.20850670337677002, "learning_rate": 4.555542526069036e-05, "loss": 0.4173, "step": 30110 }, { "epoch": 1.085342559555988, "grad_norm": 0.21336951851844788, "learning_rate": 4.5553764194486583e-05, "loss": 0.431, "step": 30115 }, { "epoch": 1.0855227592172128, "grad_norm": 0.16418558359146118, "learning_rate": 4.5552102848241764e-05, "loss": 0.4199, "step": 30120 }, { "epoch": 1.0857029588784373, "grad_norm": 0.15260203182697296, "learning_rate": 4.5550441221978505e-05, "loss": 0.4031, "step": 30125 }, { "epoch": 1.085883158539662, "grad_norm": 0.17375853657722473, "learning_rate": 4.554877931571947e-05, "loss": 0.4301, "step": 30130 }, { "epoch": 1.0860633582008865, "grad_norm": 0.1743849366903305, "learning_rate": 4.5547117129487305e-05, "loss": 0.4394, "step": 30135 }, { "epoch": 1.0862435578621112, "grad_norm": 0.1563817262649536, "learning_rate": 4.554545466330463e-05, "loss": 0.423, "step": 30140 }, { "epoch": 1.0864237575233358, "grad_norm": 0.22052206099033356, "learning_rate": 4.554379191719412e-05, "loss": 0.4591, "step": 30145 }, { "epoch": 1.0866039571845605, "grad_norm": 0.23430973291397095, "learning_rate": 4.554212889117842e-05, "loss": 0.4231, "step": 30150 }, { "epoch": 1.0867841568457852, "grad_norm": 0.1482902467250824, "learning_rate": 4.55404655852802e-05, "loss": 0.4042, "step": 30155 }, { "epoch": 1.0869643565070097, "grad_norm": 0.19089116156101227, "learning_rate": 4.5538801999522115e-05, "loss": 0.3929, "step": 30160 }, { "epoch": 1.0871445561682345, "grad_norm": 0.1974368840456009, "learning_rate": 4.553713813392682e-05, "loss": 0.393, "step": 30165 }, { "epoch": 
1.087324755829459, "grad_norm": 0.20312896370887756, "learning_rate": 4.5535473988517e-05, "loss": 0.3933, "step": 30170 }, { "epoch": 1.0875049554906837, "grad_norm": 0.16208785772323608, "learning_rate": 4.553380956331531e-05, "loss": 0.4074, "step": 30175 }, { "epoch": 1.0876851551519082, "grad_norm": 0.16057203710079193, "learning_rate": 4.5532144858344446e-05, "loss": 0.4167, "step": 30180 }, { "epoch": 1.087865354813133, "grad_norm": 0.19308283925056458, "learning_rate": 4.5530479873627095e-05, "loss": 0.4052, "step": 30185 }, { "epoch": 1.0880455544743577, "grad_norm": 0.15638864040374756, "learning_rate": 4.552881460918592e-05, "loss": 0.4148, "step": 30190 }, { "epoch": 1.0882257541355822, "grad_norm": 0.1577041745185852, "learning_rate": 4.552714906504362e-05, "loss": 0.4295, "step": 30195 }, { "epoch": 1.088405953796807, "grad_norm": 0.18273350596427917, "learning_rate": 4.552548324122289e-05, "loss": 0.4369, "step": 30200 }, { "epoch": 1.0885861534580314, "grad_norm": 0.18305066227912903, "learning_rate": 4.552381713774643e-05, "loss": 0.3818, "step": 30205 }, { "epoch": 1.0887663531192562, "grad_norm": 0.1907605528831482, "learning_rate": 4.5522150754636926e-05, "loss": 0.4491, "step": 30210 }, { "epoch": 1.0889465527804807, "grad_norm": 0.18072152137756348, "learning_rate": 4.55204840919171e-05, "loss": 0.3953, "step": 30215 }, { "epoch": 1.0891267524417054, "grad_norm": 0.15544374287128448, "learning_rate": 4.551881714960965e-05, "loss": 0.4226, "step": 30220 }, { "epoch": 1.08930695210293, "grad_norm": 0.19354958832263947, "learning_rate": 4.5517149927737276e-05, "loss": 0.3948, "step": 30225 }, { "epoch": 1.0894871517641547, "grad_norm": 0.21312229335308075, "learning_rate": 4.551548242632272e-05, "loss": 0.4241, "step": 30230 }, { "epoch": 1.0896673514253794, "grad_norm": 0.19396165013313293, "learning_rate": 4.5513814645388686e-05, "loss": 0.435, "step": 30235 }, { "epoch": 1.089847551086604, "grad_norm": 0.18126662075519562, "learning_rate": 
4.55121465849579e-05, "loss": 0.4313, "step": 30240 }, { "epoch": 1.0900277507478286, "grad_norm": 0.1702282428741455, "learning_rate": 4.551047824505308e-05, "loss": 0.4408, "step": 30245 }, { "epoch": 1.0902079504090532, "grad_norm": 0.17638468742370605, "learning_rate": 4.550880962569697e-05, "loss": 0.4091, "step": 30250 }, { "epoch": 1.090388150070278, "grad_norm": 0.20289641618728638, "learning_rate": 4.55071407269123e-05, "loss": 0.4385, "step": 30255 }, { "epoch": 1.0905683497315024, "grad_norm": 0.16934141516685486, "learning_rate": 4.5505471548721815e-05, "loss": 0.4698, "step": 30260 }, { "epoch": 1.0907485493927271, "grad_norm": 0.1499088704586029, "learning_rate": 4.550380209114824e-05, "loss": 0.4283, "step": 30265 }, { "epoch": 1.0909287490539519, "grad_norm": 0.1858983188867569, "learning_rate": 4.550213235421433e-05, "loss": 0.4332, "step": 30270 }, { "epoch": 1.0911089487151764, "grad_norm": 0.1682644486427307, "learning_rate": 4.550046233794284e-05, "loss": 0.4435, "step": 30275 }, { "epoch": 1.0912891483764011, "grad_norm": 0.21229910850524902, "learning_rate": 4.5498792042356516e-05, "loss": 0.4552, "step": 30280 }, { "epoch": 1.0914693480376256, "grad_norm": 0.18798302114009857, "learning_rate": 4.549712146747812e-05, "loss": 0.4752, "step": 30285 }, { "epoch": 1.0916495476988504, "grad_norm": 0.192665234208107, "learning_rate": 4.549545061333042e-05, "loss": 0.4404, "step": 30290 }, { "epoch": 1.0918297473600749, "grad_norm": 0.18173715472221375, "learning_rate": 4.549377947993617e-05, "loss": 0.4051, "step": 30295 }, { "epoch": 1.0920099470212996, "grad_norm": 0.17702946066856384, "learning_rate": 4.549210806731814e-05, "loss": 0.4092, "step": 30300 }, { "epoch": 1.0921901466825243, "grad_norm": 0.19283123314380646, "learning_rate": 4.54904363754991e-05, "loss": 0.4396, "step": 30305 }, { "epoch": 1.0923703463437489, "grad_norm": 0.16534939408302307, "learning_rate": 4.5488764404501836e-05, "loss": 0.4085, "step": 30310 }, { "epoch": 
1.0925505460049736, "grad_norm": 0.1978139728307724, "learning_rate": 4.5487092154349134e-05, "loss": 0.4261, "step": 30315 }, { "epoch": 1.092730745666198, "grad_norm": 0.17723916471004486, "learning_rate": 4.548541962506375e-05, "loss": 0.4394, "step": 30320 }, { "epoch": 1.0929109453274228, "grad_norm": 0.15324194729328156, "learning_rate": 4.54837468166685e-05, "loss": 0.4128, "step": 30325 }, { "epoch": 1.0930911449886473, "grad_norm": 0.14871060848236084, "learning_rate": 4.548207372918617e-05, "loss": 0.4189, "step": 30330 }, { "epoch": 1.093271344649872, "grad_norm": 0.1577349752187729, "learning_rate": 4.5480400362639544e-05, "loss": 0.3968, "step": 30335 }, { "epoch": 1.0934515443110966, "grad_norm": 0.18540742993354797, "learning_rate": 4.5478726717051425e-05, "loss": 0.4243, "step": 30340 }, { "epoch": 1.0936317439723213, "grad_norm": 0.17788349092006683, "learning_rate": 4.547705279244462e-05, "loss": 0.4177, "step": 30345 }, { "epoch": 1.093811943633546, "grad_norm": 0.17169490456581116, "learning_rate": 4.5475378588841945e-05, "loss": 0.4525, "step": 30350 }, { "epoch": 1.0939921432947706, "grad_norm": 0.17133040726184845, "learning_rate": 4.547370410626619e-05, "loss": 0.4083, "step": 30355 }, { "epoch": 1.0941723429559953, "grad_norm": 0.2028420865535736, "learning_rate": 4.547202934474019e-05, "loss": 0.448, "step": 30360 }, { "epoch": 1.0943525426172198, "grad_norm": 0.2202211320400238, "learning_rate": 4.5470354304286746e-05, "loss": 0.4242, "step": 30365 }, { "epoch": 1.0945327422784445, "grad_norm": 0.19225327670574188, "learning_rate": 4.546867898492869e-05, "loss": 0.4575, "step": 30370 }, { "epoch": 1.094712941939669, "grad_norm": 0.18303290009498596, "learning_rate": 4.546700338668884e-05, "loss": 0.4284, "step": 30375 }, { "epoch": 1.0948931416008938, "grad_norm": 0.19182315468788147, "learning_rate": 4.546532750959004e-05, "loss": 0.4241, "step": 30380 }, { "epoch": 1.0950733412621185, "grad_norm": 0.15474554896354675, "learning_rate": 
4.546365135365511e-05, "loss": 0.4373, "step": 30385 }, { "epoch": 1.095253540923343, "grad_norm": 0.17440365254878998, "learning_rate": 4.54619749189069e-05, "loss": 0.4173, "step": 30390 }, { "epoch": 1.0954337405845678, "grad_norm": 0.18450212478637695, "learning_rate": 4.546029820536824e-05, "loss": 0.4307, "step": 30395 }, { "epoch": 1.0956139402457923, "grad_norm": 0.15440581738948822, "learning_rate": 4.545862121306197e-05, "loss": 0.4153, "step": 30400 }, { "epoch": 1.095794139907017, "grad_norm": 0.19222205877304077, "learning_rate": 4.5456943942010954e-05, "loss": 0.4095, "step": 30405 }, { "epoch": 1.0959743395682415, "grad_norm": 0.24602346122264862, "learning_rate": 4.545526639223804e-05, "loss": 0.4373, "step": 30410 }, { "epoch": 1.0961545392294663, "grad_norm": 0.12822362780570984, "learning_rate": 4.545358856376608e-05, "loss": 0.3651, "step": 30415 }, { "epoch": 1.096334738890691, "grad_norm": 0.1669246107339859, "learning_rate": 4.545191045661793e-05, "loss": 0.383, "step": 30420 }, { "epoch": 1.0965149385519155, "grad_norm": 0.18753668665885925, "learning_rate": 4.5450232070816455e-05, "loss": 0.4247, "step": 30425 }, { "epoch": 1.0966951382131402, "grad_norm": 0.19645550847053528, "learning_rate": 4.544855340638454e-05, "loss": 0.3919, "step": 30430 }, { "epoch": 1.0968753378743648, "grad_norm": 0.16774851083755493, "learning_rate": 4.544687446334504e-05, "loss": 0.4473, "step": 30435 }, { "epoch": 1.0970555375355895, "grad_norm": 0.16702622175216675, "learning_rate": 4.544519524172083e-05, "loss": 0.4206, "step": 30440 }, { "epoch": 1.097235737196814, "grad_norm": 0.18590982258319855, "learning_rate": 4.5443515741534805e-05, "loss": 0.374, "step": 30445 }, { "epoch": 1.0974159368580387, "grad_norm": 0.18357446789741516, "learning_rate": 4.544183596280982e-05, "loss": 0.4525, "step": 30450 }, { "epoch": 1.0975961365192632, "grad_norm": 0.196320578455925, "learning_rate": 4.544015590556879e-05, "loss": 0.4196, "step": 30455 }, { "epoch": 
1.097776336180488, "grad_norm": 0.21362504363059998, "learning_rate": 4.5438475569834585e-05, "loss": 0.4115, "step": 30460 }, { "epoch": 1.0979565358417127, "grad_norm": 0.18946920335292816, "learning_rate": 4.5436794955630115e-05, "loss": 0.4346, "step": 30465 }, { "epoch": 1.0981367355029372, "grad_norm": 0.1708582490682602, "learning_rate": 4.5435114062978255e-05, "loss": 0.4377, "step": 30470 }, { "epoch": 1.098316935164162, "grad_norm": 0.1605166643857956, "learning_rate": 4.543343289190194e-05, "loss": 0.4377, "step": 30475 }, { "epoch": 1.0984971348253865, "grad_norm": 0.18001148104667664, "learning_rate": 4.543175144242405e-05, "loss": 0.4053, "step": 30480 }, { "epoch": 1.0986773344866112, "grad_norm": 0.16023088991641998, "learning_rate": 4.5430069714567503e-05, "loss": 0.4372, "step": 30485 }, { "epoch": 1.0988575341478357, "grad_norm": 0.1586974859237671, "learning_rate": 4.5428387708355214e-05, "loss": 0.3954, "step": 30490 }, { "epoch": 1.0990377338090604, "grad_norm": 0.1917564570903778, "learning_rate": 4.542670542381009e-05, "loss": 0.4374, "step": 30495 }, { "epoch": 1.099217933470285, "grad_norm": 0.16604667901992798, "learning_rate": 4.542502286095507e-05, "loss": 0.4139, "step": 30500 }, { "epoch": 1.099217933470285, "eval_loss": 0.4521316885948181, "eval_runtime": 3.5361, "eval_samples_per_second": 28.28, "eval_steps_per_second": 7.07, "step": 30500 }, { "epoch": 1.0993981331315097, "grad_norm": 0.16811847686767578, "learning_rate": 4.542334001981307e-05, "loss": 0.4095, "step": 30505 }, { "epoch": 1.0995783327927344, "grad_norm": 0.1503792107105255, "learning_rate": 4.5421656900407e-05, "loss": 0.442, "step": 30510 }, { "epoch": 1.099758532453959, "grad_norm": 0.18034709990024567, "learning_rate": 4.5419973502759816e-05, "loss": 0.4725, "step": 30515 }, { "epoch": 1.0999387321151837, "grad_norm": 0.16529808938503265, "learning_rate": 4.541828982689445e-05, "loss": 0.3991, "step": 30520 }, { "epoch": 1.1001189317764082, "grad_norm": 
0.17672580480575562, "learning_rate": 4.541660587283384e-05, "loss": 0.4159, "step": 30525 }, { "epoch": 1.100299131437633, "grad_norm": 0.17020630836486816, "learning_rate": 4.541492164060092e-05, "loss": 0.4095, "step": 30530 }, { "epoch": 1.1004793310988576, "grad_norm": 0.19656726717948914, "learning_rate": 4.541323713021865e-05, "loss": 0.4345, "step": 30535 }, { "epoch": 1.1006595307600822, "grad_norm": 0.15002837777137756, "learning_rate": 4.541155234170997e-05, "loss": 0.4065, "step": 30540 }, { "epoch": 1.100839730421307, "grad_norm": 0.15486060082912445, "learning_rate": 4.540986727509785e-05, "loss": 0.3887, "step": 30545 }, { "epoch": 1.1010199300825314, "grad_norm": 0.16535469889640808, "learning_rate": 4.540818193040523e-05, "loss": 0.395, "step": 30550 }, { "epoch": 1.1012001297437561, "grad_norm": 0.14959602057933807, "learning_rate": 4.54064963076551e-05, "loss": 0.4287, "step": 30555 }, { "epoch": 1.1013803294049807, "grad_norm": 0.19328902661800385, "learning_rate": 4.5404810406870396e-05, "loss": 0.4119, "step": 30560 }, { "epoch": 1.1015605290662054, "grad_norm": 0.18112468719482422, "learning_rate": 4.54031242280741e-05, "loss": 0.4082, "step": 30565 }, { "epoch": 1.10174072872743, "grad_norm": 0.2009975016117096, "learning_rate": 4.540143777128919e-05, "loss": 0.4164, "step": 30570 }, { "epoch": 1.1019209283886546, "grad_norm": 0.15006959438323975, "learning_rate": 4.539975103653864e-05, "loss": 0.37, "step": 30575 }, { "epoch": 1.1021011280498794, "grad_norm": 0.15584491193294525, "learning_rate": 4.5398064023845424e-05, "loss": 0.4106, "step": 30580 }, { "epoch": 1.1022813277111039, "grad_norm": 0.17378970980644226, "learning_rate": 4.539637673323255e-05, "loss": 0.4463, "step": 30585 }, { "epoch": 1.1024615273723286, "grad_norm": 0.21250377595424652, "learning_rate": 4.539468916472298e-05, "loss": 0.4063, "step": 30590 }, { "epoch": 1.1026417270335531, "grad_norm": 0.19015663862228394, "learning_rate": 4.539300131833972e-05, "loss": 
0.4417, "step": 30595 }, { "epoch": 1.1028219266947779, "grad_norm": 0.15822429955005646, "learning_rate": 4.539131319410577e-05, "loss": 0.4316, "step": 30600 }, { "epoch": 1.1030021263560024, "grad_norm": 0.17139464616775513, "learning_rate": 4.538962479204412e-05, "loss": 0.4495, "step": 30605 }, { "epoch": 1.103182326017227, "grad_norm": 0.14621873199939728, "learning_rate": 4.538793611217778e-05, "loss": 0.4219, "step": 30610 }, { "epoch": 1.1033625256784516, "grad_norm": 0.15971679985523224, "learning_rate": 4.538624715452976e-05, "loss": 0.4193, "step": 30615 }, { "epoch": 1.1035427253396763, "grad_norm": 0.16001123189926147, "learning_rate": 4.538455791912307e-05, "loss": 0.4119, "step": 30620 }, { "epoch": 1.103722925000901, "grad_norm": 0.17780780792236328, "learning_rate": 4.5382868405980724e-05, "loss": 0.4474, "step": 30625 }, { "epoch": 1.1039031246621256, "grad_norm": 0.17984482645988464, "learning_rate": 4.5381178615125746e-05, "loss": 0.4399, "step": 30630 }, { "epoch": 1.1040833243233503, "grad_norm": 0.1892779916524887, "learning_rate": 4.537948854658115e-05, "loss": 0.4361, "step": 30635 }, { "epoch": 1.1042635239845748, "grad_norm": 0.13633422553539276, "learning_rate": 4.537779820036997e-05, "loss": 0.4032, "step": 30640 }, { "epoch": 1.1044437236457996, "grad_norm": 0.1995527446269989, "learning_rate": 4.5376107576515235e-05, "loss": 0.4478, "step": 30645 }, { "epoch": 1.104623923307024, "grad_norm": 0.21352799236774445, "learning_rate": 4.537441667503998e-05, "loss": 0.4282, "step": 30650 }, { "epoch": 1.1048041229682488, "grad_norm": 0.2059534788131714, "learning_rate": 4.537272549596724e-05, "loss": 0.4291, "step": 30655 }, { "epoch": 1.1049843226294735, "grad_norm": 0.17112842202186584, "learning_rate": 4.5371034039320065e-05, "loss": 0.3752, "step": 30660 }, { "epoch": 1.105164522290698, "grad_norm": 0.18487350642681122, "learning_rate": 4.536934230512149e-05, "loss": 0.4198, "step": 30665 }, { "epoch": 1.1053447219519228, "grad_norm": 
0.20532426238059998, "learning_rate": 4.5367650293394574e-05, "loss": 0.4549, "step": 30670 }, { "epoch": 1.1055249216131473, "grad_norm": 0.1958533376455307, "learning_rate": 4.536595800416236e-05, "loss": 0.424, "step": 30675 }, { "epoch": 1.105705121274372, "grad_norm": 0.18324759602546692, "learning_rate": 4.5364265437447915e-05, "loss": 0.4424, "step": 30680 }, { "epoch": 1.1058853209355965, "grad_norm": 0.16815263032913208, "learning_rate": 4.536257259327429e-05, "loss": 0.3943, "step": 30685 }, { "epoch": 1.1060655205968213, "grad_norm": 0.16847245395183563, "learning_rate": 4.536087947166456e-05, "loss": 0.4235, "step": 30690 }, { "epoch": 1.106245720258046, "grad_norm": 0.18886929750442505, "learning_rate": 4.5359186072641796e-05, "loss": 0.431, "step": 30695 }, { "epoch": 1.1064259199192705, "grad_norm": 0.19722402095794678, "learning_rate": 4.535749239622906e-05, "loss": 0.4551, "step": 30700 }, { "epoch": 1.1066061195804953, "grad_norm": 0.1684960126876831, "learning_rate": 4.535579844244943e-05, "loss": 0.4292, "step": 30705 }, { "epoch": 1.1067863192417198, "grad_norm": 0.16538777947425842, "learning_rate": 4.535410421132598e-05, "loss": 0.4574, "step": 30710 }, { "epoch": 1.1069665189029445, "grad_norm": 0.21061739325523376, "learning_rate": 4.535240970288181e-05, "loss": 0.402, "step": 30715 }, { "epoch": 1.107146718564169, "grad_norm": 0.18841953575611115, "learning_rate": 4.535071491713999e-05, "loss": 0.4477, "step": 30720 }, { "epoch": 1.1073269182253938, "grad_norm": 0.15210363268852234, "learning_rate": 4.534901985412363e-05, "loss": 0.4299, "step": 30725 }, { "epoch": 1.1075071178866183, "grad_norm": 0.18912887573242188, "learning_rate": 4.53473245138558e-05, "loss": 0.4201, "step": 30730 }, { "epoch": 1.107687317547843, "grad_norm": 0.1371183693408966, "learning_rate": 4.5345628896359625e-05, "loss": 0.4116, "step": 30735 }, { "epoch": 1.1078675172090677, "grad_norm": 0.15921291708946228, "learning_rate": 4.5343933001658194e-05, "loss": 
0.4222, "step": 30740 }, { "epoch": 1.1080477168702922, "grad_norm": 0.23919863998889923, "learning_rate": 4.5342236829774617e-05, "loss": 0.453, "step": 30745 }, { "epoch": 1.108227916531517, "grad_norm": 0.2175007313489914, "learning_rate": 4.534054038073199e-05, "loss": 0.4146, "step": 30750 }, { "epoch": 1.1084081161927415, "grad_norm": 0.16294798254966736, "learning_rate": 4.533884365455345e-05, "loss": 0.3633, "step": 30755 }, { "epoch": 1.1085883158539662, "grad_norm": 0.15248903632164001, "learning_rate": 4.5337146651262094e-05, "loss": 0.4316, "step": 30760 }, { "epoch": 1.1087685155151907, "grad_norm": 0.18344025313854218, "learning_rate": 4.533544937088106e-05, "loss": 0.3941, "step": 30765 }, { "epoch": 1.1089487151764155, "grad_norm": 0.2048654705286026, "learning_rate": 4.533375181343346e-05, "loss": 0.4528, "step": 30770 }, { "epoch": 1.1091289148376402, "grad_norm": 0.20084497332572937, "learning_rate": 4.5332053978942436e-05, "loss": 0.4366, "step": 30775 }, { "epoch": 1.1093091144988647, "grad_norm": 0.13244780898094177, "learning_rate": 4.5330355867431106e-05, "loss": 0.4239, "step": 30780 }, { "epoch": 1.1094893141600894, "grad_norm": 0.21777132153511047, "learning_rate": 4.532865747892261e-05, "loss": 0.4454, "step": 30785 }, { "epoch": 1.109669513821314, "grad_norm": 0.19449090957641602, "learning_rate": 4.53269588134401e-05, "loss": 0.4324, "step": 30790 }, { "epoch": 1.1098497134825387, "grad_norm": 0.16358676552772522, "learning_rate": 4.532525987100671e-05, "loss": 0.4083, "step": 30795 }, { "epoch": 1.1100299131437632, "grad_norm": 0.1827852874994278, "learning_rate": 4.532356065164558e-05, "loss": 0.411, "step": 30800 }, { "epoch": 1.110210112804988, "grad_norm": 0.16371290385723114, "learning_rate": 4.5321861155379884e-05, "loss": 0.4314, "step": 30805 }, { "epoch": 1.1103903124662127, "grad_norm": 0.18602146208286285, "learning_rate": 4.532016138223276e-05, "loss": 0.4158, "step": 30810 }, { "epoch": 1.1105705121274372, "grad_norm": 
0.14630118012428284, "learning_rate": 4.5318461332227365e-05, "loss": 0.3876, "step": 30815 }, { "epoch": 1.110750711788662, "grad_norm": 0.1606942117214203, "learning_rate": 4.531676100538688e-05, "loss": 0.4428, "step": 30820 }, { "epoch": 1.1109309114498864, "grad_norm": 0.16617447137832642, "learning_rate": 4.5315060401734445e-05, "loss": 0.4363, "step": 30825 }, { "epoch": 1.1111111111111112, "grad_norm": 0.1629336029291153, "learning_rate": 4.5313359521293254e-05, "loss": 0.403, "step": 30830 }, { "epoch": 1.1112913107723357, "grad_norm": 0.1905163824558258, "learning_rate": 4.5311658364086474e-05, "loss": 0.4358, "step": 30835 }, { "epoch": 1.1114715104335604, "grad_norm": 0.17347170412540436, "learning_rate": 4.531029723906539e-05, "loss": 0.4344, "step": 30840 }, { "epoch": 1.111651710094785, "grad_norm": 0.1710389107465744, "learning_rate": 4.530859558373896e-05, "loss": 0.4036, "step": 30845 }, { "epoch": 1.1118319097560097, "grad_norm": 0.15823177993297577, "learning_rate": 4.530689365171184e-05, "loss": 0.3619, "step": 30850 }, { "epoch": 1.1120121094172344, "grad_norm": 0.17477105557918549, "learning_rate": 4.530519144300722e-05, "loss": 0.4299, "step": 30855 }, { "epoch": 1.112192309078459, "grad_norm": 0.15027157962322235, "learning_rate": 4.530348895764831e-05, "loss": 0.4332, "step": 30860 }, { "epoch": 1.1123725087396836, "grad_norm": 0.1768927425146103, "learning_rate": 4.530178619565829e-05, "loss": 0.434, "step": 30865 }, { "epoch": 1.1125527084009081, "grad_norm": 0.17332357168197632, "learning_rate": 4.5300083157060356e-05, "loss": 0.4292, "step": 30870 }, { "epoch": 1.1127329080621329, "grad_norm": 0.22046640515327454, "learning_rate": 4.529837984187773e-05, "loss": 0.4375, "step": 30875 }, { "epoch": 1.1129131077233574, "grad_norm": 0.15392176806926727, "learning_rate": 4.529667625013361e-05, "loss": 0.4253, "step": 30880 }, { "epoch": 1.1130933073845821, "grad_norm": 0.1741206794977188, "learning_rate": 4.52949723818512e-05, "loss": 
0.3929, "step": 30885 }, { "epoch": 1.1132735070458069, "grad_norm": 0.1908791959285736, "learning_rate": 4.529326823705372e-05, "loss": 0.4403, "step": 30890 }, { "epoch": 1.1134537067070314, "grad_norm": 0.20315563678741455, "learning_rate": 4.5291563815764384e-05, "loss": 0.4401, "step": 30895 }, { "epoch": 1.113633906368256, "grad_norm": 0.16982267796993256, "learning_rate": 4.528985911800643e-05, "loss": 0.4372, "step": 30900 }, { "epoch": 1.1138141060294806, "grad_norm": 0.1527600884437561, "learning_rate": 4.5288154143803066e-05, "loss": 0.3923, "step": 30905 }, { "epoch": 1.1139943056907053, "grad_norm": 0.17226542532444, "learning_rate": 4.528644889317753e-05, "loss": 0.4158, "step": 30910 }, { "epoch": 1.1141745053519299, "grad_norm": 0.178422212600708, "learning_rate": 4.528474336615306e-05, "loss": 0.4268, "step": 30915 }, { "epoch": 1.1143547050131546, "grad_norm": 0.1679055094718933, "learning_rate": 4.528303756275288e-05, "loss": 0.4216, "step": 30920 }, { "epoch": 1.1145349046743793, "grad_norm": 0.21220412850379944, "learning_rate": 4.528133148300026e-05, "loss": 0.4276, "step": 30925 }, { "epoch": 1.1147151043356038, "grad_norm": 0.16649261116981506, "learning_rate": 4.52796251269184e-05, "loss": 0.4486, "step": 30930 }, { "epoch": 1.1148953039968286, "grad_norm": 0.19539640843868256, "learning_rate": 4.527791849453059e-05, "loss": 0.4124, "step": 30935 }, { "epoch": 1.115075503658053, "grad_norm": 0.17714886367321014, "learning_rate": 4.5276211585860064e-05, "loss": 0.4119, "step": 30940 }, { "epoch": 1.1152557033192778, "grad_norm": 0.17066708207130432, "learning_rate": 4.527450440093008e-05, "loss": 0.4194, "step": 30945 }, { "epoch": 1.1154359029805023, "grad_norm": 0.1813109666109085, "learning_rate": 4.527279693976389e-05, "loss": 0.4032, "step": 30950 }, { "epoch": 1.115616102641727, "grad_norm": 0.1931227445602417, "learning_rate": 4.527108920238478e-05, "loss": 0.4249, "step": 30955 }, { "epoch": 1.1157963023029516, "grad_norm": 
0.16914038360118866, "learning_rate": 4.5269381188815996e-05, "loss": 0.381, "step": 30960 }, { "epoch": 1.1159765019641763, "grad_norm": 0.1841052621603012, "learning_rate": 4.526767289908083e-05, "loss": 0.4565, "step": 30965 }, { "epoch": 1.116156701625401, "grad_norm": 0.1776377409696579, "learning_rate": 4.5265964333202526e-05, "loss": 0.4371, "step": 30970 }, { "epoch": 1.1163369012866255, "grad_norm": 0.17121796309947968, "learning_rate": 4.526425549120439e-05, "loss": 0.4383, "step": 30975 }, { "epoch": 1.1165171009478503, "grad_norm": 0.16264458000659943, "learning_rate": 4.526254637310971e-05, "loss": 0.4006, "step": 30980 }, { "epoch": 1.1166973006090748, "grad_norm": 0.19844268262386322, "learning_rate": 4.526083697894173e-05, "loss": 0.4223, "step": 30985 }, { "epoch": 1.1168775002702995, "grad_norm": 0.19541612267494202, "learning_rate": 4.525912730872379e-05, "loss": 0.42, "step": 30990 }, { "epoch": 1.117057699931524, "grad_norm": 0.19237811863422394, "learning_rate": 4.525741736247916e-05, "loss": 0.3994, "step": 30995 }, { "epoch": 1.1172378995927488, "grad_norm": 0.15686160326004028, "learning_rate": 4.5255707140231136e-05, "loss": 0.4275, "step": 31000 }, { "epoch": 1.1172378995927488, "eval_loss": 0.4531842768192291, "eval_runtime": 3.5417, "eval_samples_per_second": 28.235, "eval_steps_per_second": 7.059, "step": 31000 }, { "epoch": 1.1174180992539733, "grad_norm": 0.16874189674854279, "learning_rate": 4.5253996642003025e-05, "loss": 0.4781, "step": 31005 }, { "epoch": 1.117598298915198, "grad_norm": 0.17031274735927582, "learning_rate": 4.525228586781813e-05, "loss": 0.4237, "step": 31010 }, { "epoch": 1.1177784985764228, "grad_norm": 0.19716662168502808, "learning_rate": 4.525057481769976e-05, "loss": 0.415, "step": 31015 }, { "epoch": 1.1179586982376473, "grad_norm": 0.15895715355873108, "learning_rate": 4.524886349167124e-05, "loss": 0.3885, "step": 31020 }, { "epoch": 1.118138897898872, "grad_norm": 0.1836315244436264, "learning_rate": 
4.5247151889755855e-05, "loss": 0.4201, "step": 31025 }, { "epoch": 1.1183190975600965, "grad_norm": 0.17792055010795593, "learning_rate": 4.5245440011976966e-05, "loss": 0.4348, "step": 31030 }, { "epoch": 1.1184992972213212, "grad_norm": 0.17918603122234344, "learning_rate": 4.524372785835787e-05, "loss": 0.4039, "step": 31035 }, { "epoch": 1.118679496882546, "grad_norm": 0.1892392635345459, "learning_rate": 4.524201542892189e-05, "loss": 0.4318, "step": 31040 }, { "epoch": 1.1188596965437705, "grad_norm": 0.17238296568393707, "learning_rate": 4.524030272369238e-05, "loss": 0.4143, "step": 31045 }, { "epoch": 1.1190398962049952, "grad_norm": 0.1613350510597229, "learning_rate": 4.523858974269266e-05, "loss": 0.4531, "step": 31050 }, { "epoch": 1.1192200958662197, "grad_norm": 0.20245467126369476, "learning_rate": 4.5236876485946076e-05, "loss": 0.4655, "step": 31055 }, { "epoch": 1.1194002955274445, "grad_norm": 0.16436737775802612, "learning_rate": 4.5235162953475966e-05, "loss": 0.4167, "step": 31060 }, { "epoch": 1.119580495188669, "grad_norm": 0.15972478687763214, "learning_rate": 4.523344914530568e-05, "loss": 0.4051, "step": 31065 }, { "epoch": 1.1197606948498937, "grad_norm": 0.15999290347099304, "learning_rate": 4.523173506145856e-05, "loss": 0.3884, "step": 31070 }, { "epoch": 1.1199408945111182, "grad_norm": 0.15831471979618073, "learning_rate": 4.5230020701957976e-05, "loss": 0.3952, "step": 31075 }, { "epoch": 1.120121094172343, "grad_norm": 0.17444223165512085, "learning_rate": 4.522830606682727e-05, "loss": 0.4452, "step": 31080 }, { "epoch": 1.1203012938335677, "grad_norm": 0.16463379561901093, "learning_rate": 4.5226591156089816e-05, "loss": 0.4349, "step": 31085 }, { "epoch": 1.1204814934947922, "grad_norm": 0.16084179282188416, "learning_rate": 4.522487596976897e-05, "loss": 0.4415, "step": 31090 }, { "epoch": 1.120661693156017, "grad_norm": 0.16912023723125458, "learning_rate": 4.5223160507888106e-05, "loss": 0.4269, "step": 31095 }, { "epoch": 
1.1208418928172414, "grad_norm": 0.1586826741695404, "learning_rate": 4.5221444770470595e-05, "loss": 0.4002, "step": 31100 }, { "epoch": 1.1210220924784662, "grad_norm": 0.1658790409564972, "learning_rate": 4.521972875753981e-05, "loss": 0.41, "step": 31105 }, { "epoch": 1.1212022921396907, "grad_norm": 0.18843096494674683, "learning_rate": 4.521801246911914e-05, "loss": 0.4006, "step": 31110 }, { "epoch": 1.1213824918009154, "grad_norm": 0.1666111797094345, "learning_rate": 4.521629590523197e-05, "loss": 0.3906, "step": 31115 }, { "epoch": 1.12156269146214, "grad_norm": 0.15296362340450287, "learning_rate": 4.521457906590167e-05, "loss": 0.4152, "step": 31120 }, { "epoch": 1.1217428911233647, "grad_norm": 0.20949307084083557, "learning_rate": 4.521286195115165e-05, "loss": 0.4181, "step": 31125 }, { "epoch": 1.1219230907845894, "grad_norm": 0.1738746613264084, "learning_rate": 4.5211144561005295e-05, "loss": 0.4134, "step": 31130 }, { "epoch": 1.122103290445814, "grad_norm": 0.199616476893425, "learning_rate": 4.520942689548601e-05, "loss": 0.4121, "step": 31135 }, { "epoch": 1.1222834901070387, "grad_norm": 0.13446038961410522, "learning_rate": 4.52077089546172e-05, "loss": 0.4055, "step": 31140 }, { "epoch": 1.1224636897682632, "grad_norm": 0.15235783159732819, "learning_rate": 4.520599073842226e-05, "loss": 0.416, "step": 31145 }, { "epoch": 1.122643889429488, "grad_norm": 0.1988137811422348, "learning_rate": 4.520427224692462e-05, "loss": 0.4166, "step": 31150 }, { "epoch": 1.1228240890907124, "grad_norm": 0.19993172585964203, "learning_rate": 4.520255348014768e-05, "loss": 0.4663, "step": 31155 }, { "epoch": 1.1230042887519371, "grad_norm": 0.2015010416507721, "learning_rate": 4.520083443811485e-05, "loss": 0.3944, "step": 31160 }, { "epoch": 1.1231844884131619, "grad_norm": 0.17933201789855957, "learning_rate": 4.519911512084957e-05, "loss": 0.4343, "step": 31165 }, { "epoch": 1.1233646880743864, "grad_norm": 0.15731388330459595, "learning_rate": 
4.519739552837526e-05, "loss": 0.4049, "step": 31170 }, { "epoch": 1.1235448877356111, "grad_norm": 0.17282412946224213, "learning_rate": 4.519567566071534e-05, "loss": 0.4227, "step": 31175 }, { "epoch": 1.1237250873968356, "grad_norm": 0.165944442152977, "learning_rate": 4.519395551789325e-05, "loss": 0.4523, "step": 31180 }, { "epoch": 1.1239052870580604, "grad_norm": 0.1765991449356079, "learning_rate": 4.5192235099932425e-05, "loss": 0.4304, "step": 31185 }, { "epoch": 1.1240854867192849, "grad_norm": 0.17275585234165192, "learning_rate": 4.519051440685631e-05, "loss": 0.4245, "step": 31190 }, { "epoch": 1.1242656863805096, "grad_norm": 0.18365871906280518, "learning_rate": 4.518879343868834e-05, "loss": 0.4363, "step": 31195 }, { "epoch": 1.1244458860417343, "grad_norm": 0.17454713582992554, "learning_rate": 4.5187072195451975e-05, "loss": 0.4329, "step": 31200 }, { "epoch": 1.1246260857029589, "grad_norm": 0.1820690780878067, "learning_rate": 4.518535067717066e-05, "loss": 0.4217, "step": 31205 }, { "epoch": 1.1248062853641836, "grad_norm": 0.16349612176418304, "learning_rate": 4.518362888386784e-05, "loss": 0.4456, "step": 31210 }, { "epoch": 1.124986485025408, "grad_norm": 0.17479030787944794, "learning_rate": 4.5181906815566986e-05, "loss": 0.4347, "step": 31215 }, { "epoch": 1.1251666846866328, "grad_norm": 0.14878250658512115, "learning_rate": 4.5180184472291566e-05, "loss": 0.4315, "step": 31220 }, { "epoch": 1.1253468843478573, "grad_norm": 0.18707720935344696, "learning_rate": 4.5178461854065036e-05, "loss": 0.3981, "step": 31225 }, { "epoch": 1.125527084009082, "grad_norm": 0.19858436286449432, "learning_rate": 4.517673896091087e-05, "loss": 0.405, "step": 31230 }, { "epoch": 1.1257072836703066, "grad_norm": 0.19132393598556519, "learning_rate": 4.517501579285255e-05, "loss": 0.4229, "step": 31235 }, { "epoch": 1.1258874833315313, "grad_norm": 0.1902177333831787, "learning_rate": 4.5173292349913534e-05, "loss": 0.3822, "step": 31240 }, { "epoch": 
1.126067682992756, "grad_norm": 0.1548781394958496, "learning_rate": 4.517156863211732e-05, "loss": 0.4134, "step": 31245 }, { "epoch": 1.1262478826539806, "grad_norm": 0.20979823172092438, "learning_rate": 4.517018945999895e-05, "loss": 0.4261, "step": 31250 }, { "epoch": 1.1264280823152053, "grad_norm": 0.17488163709640503, "learning_rate": 4.5168465247518955e-05, "loss": 0.4078, "step": 31255 }, { "epoch": 1.1266082819764298, "grad_norm": 0.16054439544677734, "learning_rate": 4.516674076024752e-05, "loss": 0.4304, "step": 31260 }, { "epoch": 1.1267884816376545, "grad_norm": 0.16727547347545624, "learning_rate": 4.516501599820816e-05, "loss": 0.4086, "step": 31265 }, { "epoch": 1.1269686812988793, "grad_norm": 0.17022760212421417, "learning_rate": 4.5163290961424355e-05, "loss": 0.4378, "step": 31270 }, { "epoch": 1.1271488809601038, "grad_norm": 0.1838621199131012, "learning_rate": 4.5161565649919614e-05, "loss": 0.4176, "step": 31275 }, { "epoch": 1.1273290806213285, "grad_norm": 0.16698120534420013, "learning_rate": 4.515984006371744e-05, "loss": 0.4196, "step": 31280 }, { "epoch": 1.127509280282553, "grad_norm": 0.19196081161499023, "learning_rate": 4.5158114202841354e-05, "loss": 0.3924, "step": 31285 }, { "epoch": 1.1276894799437778, "grad_norm": 0.17519588768482208, "learning_rate": 4.515638806731486e-05, "loss": 0.4042, "step": 31290 }, { "epoch": 1.1278696796050023, "grad_norm": 0.1803540289402008, "learning_rate": 4.515466165716149e-05, "loss": 0.3852, "step": 31295 }, { "epoch": 1.128049879266227, "grad_norm": 0.19045265018939972, "learning_rate": 4.5152934972404746e-05, "loss": 0.4543, "step": 31300 }, { "epoch": 1.1282300789274515, "grad_norm": 0.2028571367263794, "learning_rate": 4.515120801306818e-05, "loss": 0.4327, "step": 31305 }, { "epoch": 1.1284102785886763, "grad_norm": 0.18844293057918549, "learning_rate": 4.514948077917529e-05, "loss": 0.4408, "step": 31310 }, { "epoch": 1.128590478249901, "grad_norm": 0.17702354490756989, "learning_rate": 
4.514775327074963e-05, "loss": 0.4077, "step": 31315 }, { "epoch": 1.1287706779111255, "grad_norm": 0.1727331429719925, "learning_rate": 4.514602548781474e-05, "loss": 0.4235, "step": 31320 }, { "epoch": 1.1289508775723502, "grad_norm": 0.18904700875282288, "learning_rate": 4.514429743039414e-05, "loss": 0.4351, "step": 31325 }, { "epoch": 1.1291310772335748, "grad_norm": 0.15556100010871887, "learning_rate": 4.51425690985114e-05, "loss": 0.4397, "step": 31330 }, { "epoch": 1.1293112768947995, "grad_norm": 0.16988936066627502, "learning_rate": 4.514084049219005e-05, "loss": 0.3974, "step": 31335 }, { "epoch": 1.129491476556024, "grad_norm": 0.16999125480651855, "learning_rate": 4.513911161145365e-05, "loss": 0.3972, "step": 31340 }, { "epoch": 1.1296716762172487, "grad_norm": 0.20566906034946442, "learning_rate": 4.513738245632575e-05, "loss": 0.4368, "step": 31345 }, { "epoch": 1.1298518758784732, "grad_norm": 0.2229662835597992, "learning_rate": 4.513565302682992e-05, "loss": 0.4017, "step": 31350 }, { "epoch": 1.130032075539698, "grad_norm": 0.1942766159772873, "learning_rate": 4.513392332298971e-05, "loss": 0.4144, "step": 31355 }, { "epoch": 1.1302122752009227, "grad_norm": 0.1725579798221588, "learning_rate": 4.513219334482869e-05, "loss": 0.386, "step": 31360 }, { "epoch": 1.1303924748621472, "grad_norm": 0.1760469377040863, "learning_rate": 4.513046309237044e-05, "loss": 0.4181, "step": 31365 }, { "epoch": 1.130572674523372, "grad_norm": 0.1780589520931244, "learning_rate": 4.512873256563852e-05, "loss": 0.4206, "step": 31370 }, { "epoch": 1.1307528741845965, "grad_norm": 0.16992972791194916, "learning_rate": 4.5127001764656526e-05, "loss": 0.4126, "step": 31375 }, { "epoch": 1.1309330738458212, "grad_norm": 0.1983700394630432, "learning_rate": 4.5125270689448015e-05, "loss": 0.4072, "step": 31380 }, { "epoch": 1.1311132735070457, "grad_norm": 0.17569278180599213, "learning_rate": 4.51235393400366e-05, "loss": 0.4082, "step": 31385 }, { "epoch": 
1.1312934731682704, "grad_norm": 0.22225306928157806, "learning_rate": 4.512180771644585e-05, "loss": 0.4641, "step": 31390 }, { "epoch": 1.131473672829495, "grad_norm": 0.1481614112854004, "learning_rate": 4.512007581869937e-05, "loss": 0.3931, "step": 31395 }, { "epoch": 1.1316538724907197, "grad_norm": 0.21973121166229248, "learning_rate": 4.511834364682076e-05, "loss": 0.4273, "step": 31400 }, { "epoch": 1.1318340721519444, "grad_norm": 0.21641644835472107, "learning_rate": 4.51166112008336e-05, "loss": 0.4151, "step": 31405 }, { "epoch": 1.132014271813169, "grad_norm": 0.18511059880256653, "learning_rate": 4.511487848076151e-05, "loss": 0.4423, "step": 31410 }, { "epoch": 1.1321944714743937, "grad_norm": 0.2063097059726715, "learning_rate": 4.5113145486628095e-05, "loss": 0.4257, "step": 31415 }, { "epoch": 1.1323746711356182, "grad_norm": 0.17430323362350464, "learning_rate": 4.511141221845697e-05, "loss": 0.4076, "step": 31420 }, { "epoch": 1.132554870796843, "grad_norm": 0.1422608643770218, "learning_rate": 4.510967867627175e-05, "loss": 0.3761, "step": 31425 }, { "epoch": 1.1327350704580676, "grad_norm": 0.2072693109512329, "learning_rate": 4.510794486009604e-05, "loss": 0.4133, "step": 31430 }, { "epoch": 1.1329152701192922, "grad_norm": 0.1433665156364441, "learning_rate": 4.5106210769953484e-05, "loss": 0.3992, "step": 31435 }, { "epoch": 1.133095469780517, "grad_norm": 0.20627710223197937, "learning_rate": 4.5104476405867705e-05, "loss": 0.4233, "step": 31440 }, { "epoch": 1.1332756694417414, "grad_norm": 0.1617184430360794, "learning_rate": 4.510274176786231e-05, "loss": 0.4564, "step": 31445 }, { "epoch": 1.1334558691029661, "grad_norm": 0.1667255014181137, "learning_rate": 4.5101006855960956e-05, "loss": 0.4173, "step": 31450 }, { "epoch": 1.1336360687641907, "grad_norm": 0.17955441772937775, "learning_rate": 4.5099271670187285e-05, "loss": 0.4203, "step": 31455 }, { "epoch": 1.1338162684254154, "grad_norm": 0.1967443972826004, "learning_rate": 
4.5097536210564915e-05, "loss": 0.4261, "step": 31460 }, { "epoch": 1.13399646808664, "grad_norm": 0.1767064779996872, "learning_rate": 4.5095800477117514e-05, "loss": 0.423, "step": 31465 }, { "epoch": 1.1341766677478646, "grad_norm": 0.15333469212055206, "learning_rate": 4.5094064469868726e-05, "loss": 0.4139, "step": 31470 }, { "epoch": 1.1343568674090894, "grad_norm": 0.1856624186038971, "learning_rate": 4.509232818884219e-05, "loss": 0.4326, "step": 31475 }, { "epoch": 1.1345370670703139, "grad_norm": 0.20386086404323578, "learning_rate": 4.5090591634061577e-05, "loss": 0.4424, "step": 31480 }, { "epoch": 1.1347172667315386, "grad_norm": 0.2295321226119995, "learning_rate": 4.508885480555055e-05, "loss": 0.4369, "step": 31485 }, { "epoch": 1.1348974663927631, "grad_norm": 0.2000851184129715, "learning_rate": 4.5087117703332755e-05, "loss": 0.3865, "step": 31490 }, { "epoch": 1.1350776660539879, "grad_norm": 0.19415894150733948, "learning_rate": 4.508538032743187e-05, "loss": 0.3817, "step": 31495 }, { "epoch": 1.1352578657152124, "grad_norm": 0.23722659051418304, "learning_rate": 4.5083642677871575e-05, "loss": 0.4131, "step": 31500 }, { "epoch": 1.1352578657152124, "eval_loss": 0.45174121856689453, "eval_runtime": 3.5299, "eval_samples_per_second": 28.33, "eval_steps_per_second": 7.082, "step": 31500 }, { "epoch": 1.135438065376437, "grad_norm": 0.1844346821308136, "learning_rate": 4.508190475467553e-05, "loss": 0.4107, "step": 31505 }, { "epoch": 1.1356182650376616, "grad_norm": 0.16183345019817352, "learning_rate": 4.508016655786742e-05, "loss": 0.4014, "step": 31510 }, { "epoch": 1.1357984646988863, "grad_norm": 0.20634831488132477, "learning_rate": 4.507842808747093e-05, "loss": 0.4184, "step": 31515 }, { "epoch": 1.135978664360111, "grad_norm": 0.15508055686950684, "learning_rate": 4.507668934350975e-05, "loss": 0.4209, "step": 31520 }, { "epoch": 1.1361588640213356, "grad_norm": 0.1634722650051117, "learning_rate": 4.507495032600756e-05, "loss": 0.3867, 
"step": 31525 }, { "epoch": 1.1363390636825603, "grad_norm": 0.1492082178592682, "learning_rate": 4.5073211034988055e-05, "loss": 0.3896, "step": 31530 }, { "epoch": 1.1365192633437848, "grad_norm": 0.17373375594615936, "learning_rate": 4.507147147047494e-05, "loss": 0.4057, "step": 31535 }, { "epoch": 1.1366994630050096, "grad_norm": 0.178725004196167, "learning_rate": 4.5069731632491914e-05, "loss": 0.4372, "step": 31540 }, { "epoch": 1.1368796626662343, "grad_norm": 0.17013704776763916, "learning_rate": 4.506799152106268e-05, "loss": 0.441, "step": 31545 }, { "epoch": 1.1370598623274588, "grad_norm": 0.1601504683494568, "learning_rate": 4.506625113621094e-05, "loss": 0.4496, "step": 31550 }, { "epoch": 1.1372400619886835, "grad_norm": 0.16774295270442963, "learning_rate": 4.506451047796042e-05, "loss": 0.4396, "step": 31555 }, { "epoch": 1.137420261649908, "grad_norm": 0.18739162385463715, "learning_rate": 4.506276954633483e-05, "loss": 0.4404, "step": 31560 }, { "epoch": 1.1376004613111328, "grad_norm": 0.2417900562286377, "learning_rate": 4.506102834135788e-05, "loss": 0.4549, "step": 31565 }, { "epoch": 1.1377806609723573, "grad_norm": 0.182041198015213, "learning_rate": 4.5059286863053314e-05, "loss": 0.4239, "step": 31570 }, { "epoch": 1.137960860633582, "grad_norm": 0.18919362127780914, "learning_rate": 4.5057545111444846e-05, "loss": 0.4025, "step": 31575 }, { "epoch": 1.1381410602948066, "grad_norm": 0.17599248886108398, "learning_rate": 4.50558030865562e-05, "loss": 0.4041, "step": 31580 }, { "epoch": 1.1383212599560313, "grad_norm": 0.15774129331111908, "learning_rate": 4.505406078841113e-05, "loss": 0.4701, "step": 31585 }, { "epoch": 1.138501459617256, "grad_norm": 0.17122262716293335, "learning_rate": 4.505231821703336e-05, "loss": 0.4054, "step": 31590 }, { "epoch": 1.1386816592784805, "grad_norm": 0.15675261616706848, "learning_rate": 4.505057537244664e-05, "loss": 0.4538, "step": 31595 }, { "epoch": 1.1388618589397053, "grad_norm": 
0.1774090677499771, "learning_rate": 4.504883225467471e-05, "loss": 0.4258, "step": 31600 }, { "epoch": 1.1390420586009298, "grad_norm": 0.13083837926387787, "learning_rate": 4.5047088863741314e-05, "loss": 0.3618, "step": 31605 }, { "epoch": 1.1392222582621545, "grad_norm": 0.1870373785495758, "learning_rate": 4.5045345199670227e-05, "loss": 0.4286, "step": 31610 }, { "epoch": 1.139402457923379, "grad_norm": 0.1769174039363861, "learning_rate": 4.504360126248518e-05, "loss": 0.4424, "step": 31615 }, { "epoch": 1.1395826575846038, "grad_norm": 0.19420921802520752, "learning_rate": 4.5041857052209954e-05, "loss": 0.4086, "step": 31620 }, { "epoch": 1.1397628572458283, "grad_norm": 0.19101811945438385, "learning_rate": 4.504011256886831e-05, "loss": 0.4177, "step": 31625 }, { "epoch": 1.139943056907053, "grad_norm": 0.20886962115764618, "learning_rate": 4.503836781248401e-05, "loss": 0.4463, "step": 31630 }, { "epoch": 1.1401232565682777, "grad_norm": 0.20521090924739838, "learning_rate": 4.503662278308082e-05, "loss": 0.408, "step": 31635 }, { "epoch": 1.1403034562295022, "grad_norm": 0.16079290211200714, "learning_rate": 4.503487748068253e-05, "loss": 0.437, "step": 31640 }, { "epoch": 1.140483655890727, "grad_norm": 0.18102915585041046, "learning_rate": 4.5033131905312906e-05, "loss": 0.4308, "step": 31645 }, { "epoch": 1.1406638555519515, "grad_norm": 0.1715448796749115, "learning_rate": 4.503138605699575e-05, "loss": 0.3977, "step": 31650 }, { "epoch": 1.1408440552131762, "grad_norm": 0.18790487945079803, "learning_rate": 4.502963993575483e-05, "loss": 0.4169, "step": 31655 }, { "epoch": 1.141024254874401, "grad_norm": 0.15706005692481995, "learning_rate": 4.502789354161395e-05, "loss": 0.416, "step": 31660 }, { "epoch": 1.1412044545356255, "grad_norm": 0.1588856726884842, "learning_rate": 4.5026146874596895e-05, "loss": 0.4035, "step": 31665 }, { "epoch": 1.1413846541968502, "grad_norm": 0.2316998541355133, "learning_rate": 4.502439993472746e-05, "loss": 
0.4545, "step": 31670 }, { "epoch": 1.1415648538580747, "grad_norm": 0.20132675766944885, "learning_rate": 4.502265272202946e-05, "loss": 0.4461, "step": 31675 }, { "epoch": 1.1417450535192994, "grad_norm": 0.1755920946598053, "learning_rate": 4.502090523652669e-05, "loss": 0.4417, "step": 31680 }, { "epoch": 1.141925253180524, "grad_norm": 0.19191905856132507, "learning_rate": 4.501915747824296e-05, "loss": 0.4183, "step": 31685 }, { "epoch": 1.1421054528417487, "grad_norm": 0.16491484642028809, "learning_rate": 4.501740944720209e-05, "loss": 0.4373, "step": 31690 }, { "epoch": 1.1422856525029732, "grad_norm": 0.16868291795253754, "learning_rate": 4.501566114342789e-05, "loss": 0.4038, "step": 31695 }, { "epoch": 1.142465852164198, "grad_norm": 0.1392037272453308, "learning_rate": 4.501391256694418e-05, "loss": 0.3929, "step": 31700 }, { "epoch": 1.1426460518254227, "grad_norm": 0.1932985484600067, "learning_rate": 4.501216371777479e-05, "loss": 0.4323, "step": 31705 }, { "epoch": 1.1428262514866472, "grad_norm": 0.17676390707492828, "learning_rate": 4.501041459594354e-05, "loss": 0.419, "step": 31710 }, { "epoch": 1.143006451147872, "grad_norm": 0.20211122930049896, "learning_rate": 4.5008665201474273e-05, "loss": 0.4147, "step": 31715 }, { "epoch": 1.1431866508090964, "grad_norm": 0.16196738183498383, "learning_rate": 4.50069155343908e-05, "loss": 0.4052, "step": 31720 }, { "epoch": 1.1433668504703212, "grad_norm": 0.16365759074687958, "learning_rate": 4.500516559471699e-05, "loss": 0.424, "step": 31725 }, { "epoch": 1.1435470501315457, "grad_norm": 0.16799362003803253, "learning_rate": 4.500341538247667e-05, "loss": 0.4237, "step": 31730 }, { "epoch": 1.1437272497927704, "grad_norm": 0.1993078589439392, "learning_rate": 4.500166489769369e-05, "loss": 0.4194, "step": 31735 }, { "epoch": 1.143907449453995, "grad_norm": 0.15421169996261597, "learning_rate": 4.49999141403919e-05, "loss": 0.3786, "step": 31740 }, { "epoch": 1.1440876491152197, "grad_norm": 
0.17669521272182465, "learning_rate": 4.499816311059514e-05, "loss": 0.3992, "step": 31745 }, { "epoch": 1.1442678487764444, "grad_norm": 0.18035532534122467, "learning_rate": 4.499641180832729e-05, "loss": 0.433, "step": 31750 }, { "epoch": 1.144448048437669, "grad_norm": 0.15400253236293793, "learning_rate": 4.49946602336122e-05, "loss": 0.4367, "step": 31755 }, { "epoch": 1.1446282480988936, "grad_norm": 0.16569051146507263, "learning_rate": 4.4992908386473727e-05, "loss": 0.3999, "step": 31760 }, { "epoch": 1.1448084477601181, "grad_norm": 0.16333115100860596, "learning_rate": 4.4991156266935755e-05, "loss": 0.4458, "step": 31765 }, { "epoch": 1.1449886474213429, "grad_norm": 0.17189821600914001, "learning_rate": 4.498940387502214e-05, "loss": 0.399, "step": 31770 }, { "epoch": 1.1451688470825676, "grad_norm": 0.2206844836473465, "learning_rate": 4.498765121075678e-05, "loss": 0.4515, "step": 31775 }, { "epoch": 1.1453490467437921, "grad_norm": 0.18912701308727264, "learning_rate": 4.498589827416354e-05, "loss": 0.4375, "step": 31780 }, { "epoch": 1.1455292464050169, "grad_norm": 0.17603172361850739, "learning_rate": 4.49841450652663e-05, "loss": 0.407, "step": 31785 }, { "epoch": 1.1457094460662414, "grad_norm": 0.1851511150598526, "learning_rate": 4.498239158408896e-05, "loss": 0.4273, "step": 31790 }, { "epoch": 1.145889645727466, "grad_norm": 0.13465797901153564, "learning_rate": 4.498063783065539e-05, "loss": 0.4008, "step": 31795 }, { "epoch": 1.1460698453886906, "grad_norm": 0.16686324775218964, "learning_rate": 4.4978883804989516e-05, "loss": 0.4083, "step": 31800 }, { "epoch": 1.1462500450499153, "grad_norm": 0.16204862296581268, "learning_rate": 4.49771295071152e-05, "loss": 0.4026, "step": 31805 }, { "epoch": 1.1464302447111399, "grad_norm": 0.19073139131069183, "learning_rate": 4.4975374937056366e-05, "loss": 0.4004, "step": 31810 }, { "epoch": 1.1466104443723646, "grad_norm": 0.20284846425056458, "learning_rate": 4.4973620094836926e-05, "loss": 
0.4195, "step": 31815 }, { "epoch": 1.1467906440335893, "grad_norm": 0.18009348213672638, "learning_rate": 4.497186498048077e-05, "loss": 0.4421, "step": 31820 }, { "epoch": 1.1469708436948138, "grad_norm": 0.18369971215724945, "learning_rate": 4.497010959401183e-05, "loss": 0.417, "step": 31825 }, { "epoch": 1.1471510433560386, "grad_norm": 0.17240361869335175, "learning_rate": 4.4968353935454004e-05, "loss": 0.3953, "step": 31830 }, { "epoch": 1.147331243017263, "grad_norm": 0.182241752743721, "learning_rate": 4.496659800483123e-05, "loss": 0.4441, "step": 31835 }, { "epoch": 1.1475114426784878, "grad_norm": 0.20979800820350647, "learning_rate": 4.4964841802167414e-05, "loss": 0.4349, "step": 31840 }, { "epoch": 1.1476916423397123, "grad_norm": 0.1706288456916809, "learning_rate": 4.49630853274865e-05, "loss": 0.3891, "step": 31845 }, { "epoch": 1.147871842000937, "grad_norm": 0.17474350333213806, "learning_rate": 4.496132858081241e-05, "loss": 0.4184, "step": 31850 }, { "epoch": 1.1480520416621616, "grad_norm": 0.16004018485546112, "learning_rate": 4.495957156216908e-05, "loss": 0.4254, "step": 31855 }, { "epoch": 1.1482322413233863, "grad_norm": 0.20258988440036774, "learning_rate": 4.495781427158046e-05, "loss": 0.4425, "step": 31860 }, { "epoch": 1.148412440984611, "grad_norm": 0.18658919632434845, "learning_rate": 4.4956056709070485e-05, "loss": 0.4369, "step": 31865 }, { "epoch": 1.1485926406458355, "grad_norm": 0.16538068652153015, "learning_rate": 4.495429887466309e-05, "loss": 0.4099, "step": 31870 }, { "epoch": 1.1487728403070603, "grad_norm": 0.16032880544662476, "learning_rate": 4.495254076838225e-05, "loss": 0.3942, "step": 31875 }, { "epoch": 1.1489530399682848, "grad_norm": 0.14562112092971802, "learning_rate": 4.495078239025189e-05, "loss": 0.4076, "step": 31880 }, { "epoch": 1.1491332396295095, "grad_norm": 0.20045393705368042, "learning_rate": 4.494902374029599e-05, "loss": 0.4033, "step": 31885 }, { "epoch": 1.149313439290734, "grad_norm": 
0.1704903542995453, "learning_rate": 4.4947264818538505e-05, "loss": 0.4255, "step": 31890 }, { "epoch": 1.1494936389519588, "grad_norm": 0.18456216156482697, "learning_rate": 4.49455056250034e-05, "loss": 0.4368, "step": 31895 }, { "epoch": 1.1496738386131833, "grad_norm": 0.18716961145401, "learning_rate": 4.494374615971464e-05, "loss": 0.4268, "step": 31900 }, { "epoch": 1.149854038274408, "grad_norm": 0.1741458624601364, "learning_rate": 4.49419864226962e-05, "loss": 0.3761, "step": 31905 }, { "epoch": 1.1500342379356328, "grad_norm": 0.17461341619491577, "learning_rate": 4.494022641397205e-05, "loss": 0.4028, "step": 31910 }, { "epoch": 1.1502144375968573, "grad_norm": 0.16203176975250244, "learning_rate": 4.493846613356619e-05, "loss": 0.4028, "step": 31915 }, { "epoch": 1.150394637258082, "grad_norm": 0.16421377658843994, "learning_rate": 4.493670558150258e-05, "loss": 0.4365, "step": 31920 }, { "epoch": 1.1505748369193065, "grad_norm": 0.17700999975204468, "learning_rate": 4.493494475780521e-05, "loss": 0.4262, "step": 31925 }, { "epoch": 1.1507550365805312, "grad_norm": 0.1917300820350647, "learning_rate": 4.493318366249809e-05, "loss": 0.4477, "step": 31930 }, { "epoch": 1.150935236241756, "grad_norm": 0.24866171181201935, "learning_rate": 4.4931422295605196e-05, "loss": 0.3896, "step": 31935 }, { "epoch": 1.1511154359029805, "grad_norm": 0.16452571749687195, "learning_rate": 4.492966065715053e-05, "loss": 0.4155, "step": 31940 }, { "epoch": 1.1512956355642052, "grad_norm": 0.15788938105106354, "learning_rate": 4.4927898747158095e-05, "loss": 0.424, "step": 31945 }, { "epoch": 1.1514758352254297, "grad_norm": 0.16340243816375732, "learning_rate": 4.4926136565651904e-05, "loss": 0.4527, "step": 31950 }, { "epoch": 1.1516560348866545, "grad_norm": 0.16550558805465698, "learning_rate": 4.492437411265596e-05, "loss": 0.4677, "step": 31955 }, { "epoch": 1.151836234547879, "grad_norm": 0.14602138102054596, "learning_rate": 4.492261138819427e-05, "loss": 0.4225, 
"step": 31960 }, { "epoch": 1.1520164342091037, "grad_norm": 0.19070053100585938, "learning_rate": 4.492084839229086e-05, "loss": 0.4458, "step": 31965 }, { "epoch": 1.1521966338703282, "grad_norm": 0.18122611939907074, "learning_rate": 4.491908512496975e-05, "loss": 0.4202, "step": 31970 }, { "epoch": 1.152376833531553, "grad_norm": 0.1686059683561325, "learning_rate": 4.491732158625496e-05, "loss": 0.4264, "step": 31975 }, { "epoch": 1.1525570331927777, "grad_norm": 0.15016207098960876, "learning_rate": 4.491555777617051e-05, "loss": 0.4277, "step": 31980 }, { "epoch": 1.1527372328540022, "grad_norm": 0.15645290911197662, "learning_rate": 4.491379369474046e-05, "loss": 0.4294, "step": 31985 }, { "epoch": 1.152917432515227, "grad_norm": 0.14731116592884064, "learning_rate": 4.491202934198881e-05, "loss": 0.3946, "step": 31990 }, { "epoch": 1.1530976321764514, "grad_norm": 0.1814972460269928, "learning_rate": 4.491026471793962e-05, "loss": 0.4306, "step": 31995 }, { "epoch": 1.1532778318376762, "grad_norm": 0.2180231809616089, "learning_rate": 4.4908499822616934e-05, "loss": 0.421, "step": 32000 }, { "epoch": 1.1532778318376762, "eval_loss": 0.4514605402946472, "eval_runtime": 3.5405, "eval_samples_per_second": 28.244, "eval_steps_per_second": 7.061, "step": 32000 }, { "epoch": 1.1534580314989007, "grad_norm": 0.18036217987537384, "learning_rate": 4.4906734656044786e-05, "loss": 0.4261, "step": 32005 }, { "epoch": 1.1536382311601254, "grad_norm": 0.2106885462999344, "learning_rate": 4.4904969218247234e-05, "loss": 0.3771, "step": 32010 }, { "epoch": 1.15381843082135, "grad_norm": 0.15591959655284882, "learning_rate": 4.490320350924833e-05, "loss": 0.4009, "step": 32015 }, { "epoch": 1.1539986304825747, "grad_norm": 0.14869220554828644, "learning_rate": 4.490143752907213e-05, "loss": 0.4101, "step": 32020 }, { "epoch": 1.1541788301437994, "grad_norm": 0.1626279205083847, "learning_rate": 4.489967127774271e-05, "loss": 0.4174, "step": 32025 }, { "epoch": 
1.154359029805024, "grad_norm": 0.1796058565378189, "learning_rate": 4.48979047552841e-05, "loss": 0.4132, "step": 32030 }, { "epoch": 1.1545392294662487, "grad_norm": 0.16110989451408386, "learning_rate": 4.48961379617204e-05, "loss": 0.4238, "step": 32035 }, { "epoch": 1.1547194291274732, "grad_norm": 0.17553108930587769, "learning_rate": 4.489437089707568e-05, "loss": 0.3808, "step": 32040 }, { "epoch": 1.154899628788698, "grad_norm": 0.289981484413147, "learning_rate": 4.489260356137399e-05, "loss": 0.4202, "step": 32045 }, { "epoch": 1.1550798284499226, "grad_norm": 0.162216454744339, "learning_rate": 4.489083595463944e-05, "loss": 0.409, "step": 32050 }, { "epoch": 1.1552600281111471, "grad_norm": 0.18123595416545868, "learning_rate": 4.48890680768961e-05, "loss": 0.4316, "step": 32055 }, { "epoch": 1.1554402277723719, "grad_norm": 0.22130350768566132, "learning_rate": 4.488729992816806e-05, "loss": 0.4034, "step": 32060 }, { "epoch": 1.1556204274335964, "grad_norm": 0.15505443513393402, "learning_rate": 4.488553150847941e-05, "loss": 0.3919, "step": 32065 }, { "epoch": 1.1558006270948211, "grad_norm": 0.17959943413734436, "learning_rate": 4.4883762817854236e-05, "loss": 0.4241, "step": 32070 }, { "epoch": 1.1559808267560456, "grad_norm": 0.18637138605117798, "learning_rate": 4.488199385631665e-05, "loss": 0.4389, "step": 32075 }, { "epoch": 1.1561610264172704, "grad_norm": 0.16619151830673218, "learning_rate": 4.488022462389074e-05, "loss": 0.4582, "step": 32080 }, { "epoch": 1.1563412260784949, "grad_norm": 0.22853337228298187, "learning_rate": 4.487845512060063e-05, "loss": 0.4317, "step": 32085 }, { "epoch": 1.1565214257397196, "grad_norm": 0.15449535846710205, "learning_rate": 4.487668534647041e-05, "loss": 0.4514, "step": 32090 }, { "epoch": 1.1567016254009443, "grad_norm": 0.18602608144283295, "learning_rate": 4.4874915301524194e-05, "loss": 0.4577, "step": 32095 }, { "epoch": 1.1568818250621689, "grad_norm": 0.178580179810524, "learning_rate": 
4.487314498578611e-05, "loss": 0.4175, "step": 32100 }, { "epoch": 1.1570620247233936, "grad_norm": 0.17809760570526123, "learning_rate": 4.4871374399280273e-05, "loss": 0.4256, "step": 32105 }, { "epoch": 1.157242224384618, "grad_norm": 0.16492916643619537, "learning_rate": 4.4869603542030806e-05, "loss": 0.4098, "step": 32110 }, { "epoch": 1.1574224240458428, "grad_norm": 0.1905374526977539, "learning_rate": 4.486783241406184e-05, "loss": 0.4009, "step": 32115 }, { "epoch": 1.1576026237070673, "grad_norm": 0.16066482663154602, "learning_rate": 4.486606101539751e-05, "loss": 0.398, "step": 32120 }, { "epoch": 1.157782823368292, "grad_norm": 0.1904597133398056, "learning_rate": 4.486428934606194e-05, "loss": 0.405, "step": 32125 }, { "epoch": 1.1579630230295166, "grad_norm": 0.16945073008537292, "learning_rate": 4.486251740607927e-05, "loss": 0.4205, "step": 32130 }, { "epoch": 1.1581432226907413, "grad_norm": 0.19391489028930664, "learning_rate": 4.486074519547365e-05, "loss": 0.4073, "step": 32135 }, { "epoch": 1.158323422351966, "grad_norm": 0.13818098604679108, "learning_rate": 4.4858972714269215e-05, "loss": 0.3963, "step": 32140 }, { "epoch": 1.1585036220131906, "grad_norm": 0.1501578539609909, "learning_rate": 4.485719996249012e-05, "loss": 0.4193, "step": 32145 }, { "epoch": 1.1586838216744153, "grad_norm": 0.19974437355995178, "learning_rate": 4.4855426940160536e-05, "loss": 0.4491, "step": 32150 }, { "epoch": 1.1588640213356398, "grad_norm": 0.1437712460756302, "learning_rate": 4.4853653647304596e-05, "loss": 0.4116, "step": 32155 }, { "epoch": 1.1590442209968645, "grad_norm": 0.22510062158107758, "learning_rate": 4.485188008394646e-05, "loss": 0.419, "step": 32160 }, { "epoch": 1.1592244206580893, "grad_norm": 0.16433167457580566, "learning_rate": 4.485010625011031e-05, "loss": 0.4199, "step": 32165 }, { "epoch": 1.1594046203193138, "grad_norm": 0.18741507828235626, "learning_rate": 4.48483321458203e-05, "loss": 0.4125, "step": 32170 }, { "epoch": 
1.1595848199805385, "grad_norm": 0.171631321310997, "learning_rate": 4.484655777110062e-05, "loss": 0.4034, "step": 32175 }, { "epoch": 1.159765019641763, "grad_norm": 0.20612791180610657, "learning_rate": 4.484478312597542e-05, "loss": 0.4072, "step": 32180 }, { "epoch": 1.1599452193029878, "grad_norm": 0.162516251206398, "learning_rate": 4.4843008210468896e-05, "loss": 0.4092, "step": 32185 }, { "epoch": 1.1601254189642123, "grad_norm": 0.17401380836963654, "learning_rate": 4.484123302460523e-05, "loss": 0.3899, "step": 32190 }, { "epoch": 1.160305618625437, "grad_norm": 0.1766970008611679, "learning_rate": 4.48394575684086e-05, "loss": 0.4243, "step": 32195 }, { "epoch": 1.1604858182866615, "grad_norm": 0.17945876717567444, "learning_rate": 4.483768184190321e-05, "loss": 0.4148, "step": 32200 }, { "epoch": 1.1606660179478863, "grad_norm": 0.17272137105464935, "learning_rate": 4.4835905845113234e-05, "loss": 0.4174, "step": 32205 }, { "epoch": 1.160846217609111, "grad_norm": 0.14284390211105347, "learning_rate": 4.483412957806289e-05, "loss": 0.4218, "step": 32210 }, { "epoch": 1.1610264172703355, "grad_norm": 0.18068622052669525, "learning_rate": 4.483235304077636e-05, "loss": 0.4128, "step": 32215 }, { "epoch": 1.1612066169315602, "grad_norm": 0.17758698761463165, "learning_rate": 4.483057623327787e-05, "loss": 0.4533, "step": 32220 }, { "epoch": 1.1613868165927848, "grad_norm": 0.17569084465503693, "learning_rate": 4.4828799155591615e-05, "loss": 0.448, "step": 32225 }, { "epoch": 1.1615670162540095, "grad_norm": 0.18562377989292145, "learning_rate": 4.4827021807741806e-05, "loss": 0.4072, "step": 32230 }, { "epoch": 1.161747215915234, "grad_norm": 0.19902653992176056, "learning_rate": 4.482524418975267e-05, "loss": 0.4286, "step": 32235 }, { "epoch": 1.1619274155764587, "grad_norm": 0.17017509043216705, "learning_rate": 4.482346630164842e-05, "loss": 0.4172, "step": 32240 }, { "epoch": 1.1621076152376832, "grad_norm": 0.14589948952198029, "learning_rate": 
4.4821688143453275e-05, "loss": 0.4425, "step": 32245 }, { "epoch": 1.162287814898908, "grad_norm": 0.17183731496334076, "learning_rate": 4.4819909715191475e-05, "loss": 0.4243, "step": 32250 }, { "epoch": 1.1624680145601327, "grad_norm": 0.14236903190612793, "learning_rate": 4.481813101688723e-05, "loss": 0.4096, "step": 32255 }, { "epoch": 1.1626482142213572, "grad_norm": 0.21945489943027496, "learning_rate": 4.4816352048564806e-05, "loss": 0.4207, "step": 32260 }, { "epoch": 1.162828413882582, "grad_norm": 0.21350398659706116, "learning_rate": 4.48145728102484e-05, "loss": 0.4225, "step": 32265 }, { "epoch": 1.1630086135438065, "grad_norm": 0.11839442700147629, "learning_rate": 4.481279330196229e-05, "loss": 0.3818, "step": 32270 }, { "epoch": 1.1631888132050312, "grad_norm": 0.16868416965007782, "learning_rate": 4.481101352373071e-05, "loss": 0.4231, "step": 32275 }, { "epoch": 1.163369012866256, "grad_norm": 0.1809322088956833, "learning_rate": 4.480923347557789e-05, "loss": 0.4284, "step": 32280 }, { "epoch": 1.1635492125274804, "grad_norm": 0.2081749439239502, "learning_rate": 4.48074531575281e-05, "loss": 0.4275, "step": 32285 }, { "epoch": 1.1637294121887052, "grad_norm": 0.17321893572807312, "learning_rate": 4.480567256960561e-05, "loss": 0.4338, "step": 32290 }, { "epoch": 1.1639096118499297, "grad_norm": 0.19908635318279266, "learning_rate": 4.480389171183466e-05, "loss": 0.3936, "step": 32295 }, { "epoch": 1.1640898115111544, "grad_norm": 0.19130414724349976, "learning_rate": 4.480211058423952e-05, "loss": 0.4122, "step": 32300 }, { "epoch": 1.164270011172379, "grad_norm": 0.20319342613220215, "learning_rate": 4.480032918684446e-05, "loss": 0.3795, "step": 32305 }, { "epoch": 1.1644502108336037, "grad_norm": 0.19457338750362396, "learning_rate": 4.479854751967374e-05, "loss": 0.4287, "step": 32310 }, { "epoch": 1.1646304104948282, "grad_norm": 0.14478561282157898, "learning_rate": 4.479676558275164e-05, "loss": 0.413, "step": 32315 }, { "epoch": 
1.164810610156053, "grad_norm": 0.19652974605560303, "learning_rate": 4.479498337610246e-05, "loss": 0.4123, "step": 32320 }, { "epoch": 1.1649908098172777, "grad_norm": 0.14270542562007904, "learning_rate": 4.4793200899750445e-05, "loss": 0.4011, "step": 32325 }, { "epoch": 1.1651710094785022, "grad_norm": 0.16907188296318054, "learning_rate": 4.4791418153719914e-05, "loss": 0.4427, "step": 32330 }, { "epoch": 1.165351209139727, "grad_norm": 0.19798724353313446, "learning_rate": 4.4789635138035137e-05, "loss": 0.4178, "step": 32335 }, { "epoch": 1.1655314088009514, "grad_norm": 0.1947140246629715, "learning_rate": 4.4787851852720405e-05, "loss": 0.405, "step": 32340 }, { "epoch": 1.1657116084621761, "grad_norm": 0.19974881410598755, "learning_rate": 4.4786068297800034e-05, "loss": 0.4248, "step": 32345 }, { "epoch": 1.1658918081234007, "grad_norm": 0.23895248770713806, "learning_rate": 4.478428447329831e-05, "loss": 0.4351, "step": 32350 }, { "epoch": 1.1660720077846254, "grad_norm": 0.1716042160987854, "learning_rate": 4.4782500379239534e-05, "loss": 0.4104, "step": 32355 }, { "epoch": 1.16625220744585, "grad_norm": 0.14665000140666962, "learning_rate": 4.4780716015648026e-05, "loss": 0.4237, "step": 32360 }, { "epoch": 1.1664324071070746, "grad_norm": 0.1704597920179367, "learning_rate": 4.4778931382548096e-05, "loss": 0.4007, "step": 32365 }, { "epoch": 1.1666126067682994, "grad_norm": 0.17646756768226624, "learning_rate": 4.477714647996405e-05, "loss": 0.4092, "step": 32370 }, { "epoch": 1.1667928064295239, "grad_norm": 0.13408124446868896, "learning_rate": 4.4775361307920205e-05, "loss": 0.4013, "step": 32375 }, { "epoch": 1.1669730060907486, "grad_norm": 0.1914110779762268, "learning_rate": 4.47735758664409e-05, "loss": 0.4323, "step": 32380 }, { "epoch": 1.1671532057519731, "grad_norm": 0.19718259572982788, "learning_rate": 4.4771790155550455e-05, "loss": 0.4199, "step": 32385 }, { "epoch": 1.1673334054131979, "grad_norm": 0.19468161463737488, 
"learning_rate": 4.477000417527319e-05, "loss": 0.4024, "step": 32390 }, { "epoch": 1.1675136050744224, "grad_norm": 0.16953888535499573, "learning_rate": 4.476821792563345e-05, "loss": 0.4586, "step": 32395 }, { "epoch": 1.167693804735647, "grad_norm": 0.18917879462242126, "learning_rate": 4.4766788731997023e-05, "loss": 0.4336, "step": 32400 }, { "epoch": 1.1678740043968716, "grad_norm": 0.16631363332271576, "learning_rate": 4.476500199756615e-05, "loss": 0.3989, "step": 32405 }, { "epoch": 1.1680542040580963, "grad_norm": 0.17123185098171234, "learning_rate": 4.476321499384096e-05, "loss": 0.4394, "step": 32410 }, { "epoch": 1.168234403719321, "grad_norm": 0.14406786859035492, "learning_rate": 4.476142772084578e-05, "loss": 0.4602, "step": 32415 }, { "epoch": 1.1684146033805456, "grad_norm": 0.17469120025634766, "learning_rate": 4.475964017860498e-05, "loss": 0.4058, "step": 32420 }, { "epoch": 1.1685948030417703, "grad_norm": 0.15574227273464203, "learning_rate": 4.47578523671429e-05, "loss": 0.4586, "step": 32425 }, { "epoch": 1.1687750027029948, "grad_norm": 0.22920864820480347, "learning_rate": 4.4756064286483915e-05, "loss": 0.4453, "step": 32430 }, { "epoch": 1.1689552023642196, "grad_norm": 0.19061683118343353, "learning_rate": 4.475427593665238e-05, "loss": 0.4672, "step": 32435 }, { "epoch": 1.1691354020254443, "grad_norm": 0.20378006994724274, "learning_rate": 4.475248731767265e-05, "loss": 0.4179, "step": 32440 }, { "epoch": 1.1693156016866688, "grad_norm": 0.16971811652183533, "learning_rate": 4.475069842956911e-05, "loss": 0.397, "step": 32445 }, { "epoch": 1.1694958013478935, "grad_norm": 0.17052094638347626, "learning_rate": 4.4748909272366133e-05, "loss": 0.3946, "step": 32450 }, { "epoch": 1.169676001009118, "grad_norm": 0.14758871495723724, "learning_rate": 4.474711984608809e-05, "loss": 0.3939, "step": 32455 }, { "epoch": 1.1698562006703428, "grad_norm": 0.18905098736286163, "learning_rate": 4.474533015075936e-05, "loss": 0.4125, "step": 32460 
}, { "epoch": 1.1700364003315673, "grad_norm": 0.1888333261013031, "learning_rate": 4.474354018640432e-05, "loss": 0.4127, "step": 32465 }, { "epoch": 1.170216599992792, "grad_norm": 0.15173350274562836, "learning_rate": 4.4741749953047374e-05, "loss": 0.4341, "step": 32470 }, { "epoch": 1.1703967996540166, "grad_norm": 0.18733574450016022, "learning_rate": 4.473995945071291e-05, "loss": 0.4164, "step": 32475 }, { "epoch": 1.1705769993152413, "grad_norm": 0.18692295253276825, "learning_rate": 4.473816867942532e-05, "loss": 0.4247, "step": 32480 }, { "epoch": 1.170757198976466, "grad_norm": 0.22081652283668518, "learning_rate": 4.473637763920901e-05, "loss": 0.4228, "step": 32485 }, { "epoch": 1.1709373986376905, "grad_norm": 0.15377368032932281, "learning_rate": 4.4734586330088365e-05, "loss": 0.381, "step": 32490 }, { "epoch": 1.1711175982989153, "grad_norm": 0.17810076475143433, "learning_rate": 4.47327947520878e-05, "loss": 0.413, "step": 32495 }, { "epoch": 1.1712977979601398, "grad_norm": 0.17749875783920288, "learning_rate": 4.4731002905231735e-05, "loss": 0.4012, "step": 32500 }, { "epoch": 1.1712977979601398, "eval_loss": 0.4519113600254059, "eval_runtime": 3.5356, "eval_samples_per_second": 28.284, "eval_steps_per_second": 7.071, "step": 32500 }, { "epoch": 1.1714779976213645, "grad_norm": 0.20969074964523315, "learning_rate": 4.4729210789544576e-05, "loss": 0.4399, "step": 32505 }, { "epoch": 1.171658197282589, "grad_norm": 0.17028845846652985, "learning_rate": 4.472741840505073e-05, "loss": 0.4074, "step": 32510 }, { "epoch": 1.1718383969438138, "grad_norm": 0.19405877590179443, "learning_rate": 4.472562575177464e-05, "loss": 0.4364, "step": 32515 }, { "epoch": 1.1720185966050383, "grad_norm": 0.20871876180171967, "learning_rate": 4.472383282974071e-05, "loss": 0.4526, "step": 32520 }, { "epoch": 1.172198796266263, "grad_norm": 0.16674499213695526, "learning_rate": 4.4722039638973374e-05, "loss": 0.4057, "step": 32525 }, { "epoch": 1.1723789959274877, 
"grad_norm": 0.17715777456760406, "learning_rate": 4.4720246179497074e-05, "loss": 0.4047, "step": 32530 }, { "epoch": 1.1725591955887122, "grad_norm": 0.15576201677322388, "learning_rate": 4.471845245133623e-05, "loss": 0.4307, "step": 32535 }, { "epoch": 1.172739395249937, "grad_norm": 0.19848684966564178, "learning_rate": 4.4716658454515294e-05, "loss": 0.4309, "step": 32540 }, { "epoch": 1.1729195949111615, "grad_norm": 0.21592764556407928, "learning_rate": 4.4714864189058705e-05, "loss": 0.4146, "step": 32545 }, { "epoch": 1.1730997945723862, "grad_norm": 0.17954400181770325, "learning_rate": 4.47130696549909e-05, "loss": 0.4367, "step": 32550 }, { "epoch": 1.173279994233611, "grad_norm": 0.17298442125320435, "learning_rate": 4.471127485233635e-05, "loss": 0.3967, "step": 32555 }, { "epoch": 1.1734601938948355, "grad_norm": 0.17591506242752075, "learning_rate": 4.4709479781119485e-05, "loss": 0.4144, "step": 32560 }, { "epoch": 1.1736403935560602, "grad_norm": 0.1792951375246048, "learning_rate": 4.470768444136478e-05, "loss": 0.429, "step": 32565 }, { "epoch": 1.1738205932172847, "grad_norm": 0.19630871713161469, "learning_rate": 4.470588883309669e-05, "loss": 0.4472, "step": 32570 }, { "epoch": 1.1740007928785094, "grad_norm": 0.1824900358915329, "learning_rate": 4.470409295633968e-05, "loss": 0.4047, "step": 32575 }, { "epoch": 1.174180992539734, "grad_norm": 0.21130244433879852, "learning_rate": 4.470229681111822e-05, "loss": 0.4292, "step": 32580 }, { "epoch": 1.1743611922009587, "grad_norm": 0.22852078080177307, "learning_rate": 4.470050039745677e-05, "loss": 0.4134, "step": 32585 }, { "epoch": 1.1745413918621832, "grad_norm": 0.1561303436756134, "learning_rate": 4.4698703715379823e-05, "loss": 0.4225, "step": 32590 }, { "epoch": 1.174721591523408, "grad_norm": 0.19721920788288116, "learning_rate": 4.4696906764911855e-05, "loss": 0.4508, "step": 32595 }, { "epoch": 1.1749017911846327, "grad_norm": 0.19223082065582275, "learning_rate": 
4.469510954607734e-05, "loss": 0.4363, "step": 32600 }, { "epoch": 1.1750819908458572, "grad_norm": 0.15646933019161224, "learning_rate": 4.4693312058900774e-05, "loss": 0.4086, "step": 32605 }, { "epoch": 1.175262190507082, "grad_norm": 0.19095897674560547, "learning_rate": 4.469151430340665e-05, "loss": 0.4511, "step": 32610 }, { "epoch": 1.1754423901683064, "grad_norm": 0.1620524823665619, "learning_rate": 4.4689716279619444e-05, "loss": 0.4414, "step": 32615 }, { "epoch": 1.1756225898295312, "grad_norm": 0.14250415563583374, "learning_rate": 4.468791798756367e-05, "loss": 0.4214, "step": 32620 }, { "epoch": 1.1758027894907557, "grad_norm": 0.16954278945922852, "learning_rate": 4.4686119427263826e-05, "loss": 0.4095, "step": 32625 }, { "epoch": 1.1759829891519804, "grad_norm": 0.19082613289356232, "learning_rate": 4.4684320598744426e-05, "loss": 0.4455, "step": 32630 }, { "epoch": 1.176163188813205, "grad_norm": 0.1630188673734665, "learning_rate": 4.468252150202995e-05, "loss": 0.4229, "step": 32635 }, { "epoch": 1.1763433884744297, "grad_norm": 0.20702870190143585, "learning_rate": 4.4680722137144935e-05, "loss": 0.4234, "step": 32640 }, { "epoch": 1.1765235881356544, "grad_norm": 0.1635637730360031, "learning_rate": 4.46789225041139e-05, "loss": 0.3663, "step": 32645 }, { "epoch": 1.176703787796879, "grad_norm": 0.18059010803699493, "learning_rate": 4.467712260296135e-05, "loss": 0.3952, "step": 32650 }, { "epoch": 1.1768839874581036, "grad_norm": 0.18861469626426697, "learning_rate": 4.467532243371181e-05, "loss": 0.436, "step": 32655 }, { "epoch": 1.1770641871193281, "grad_norm": 0.1615755259990692, "learning_rate": 4.467352199638982e-05, "loss": 0.4543, "step": 32660 }, { "epoch": 1.1772443867805529, "grad_norm": 0.19088678061962128, "learning_rate": 4.46717212910199e-05, "loss": 0.4356, "step": 32665 }, { "epoch": 1.1774245864417776, "grad_norm": 0.1913667619228363, "learning_rate": 4.466992031762658e-05, "loss": 0.4021, "step": 32670 }, { "epoch": 
1.1776047861030021, "grad_norm": 0.1861899197101593, "learning_rate": 4.466811907623441e-05, "loss": 0.4698, "step": 32675 }, { "epoch": 1.1777849857642269, "grad_norm": 0.21820510923862457, "learning_rate": 4.466631756686792e-05, "loss": 0.4119, "step": 32680 }, { "epoch": 1.1779651854254514, "grad_norm": 0.20619237422943115, "learning_rate": 4.4664515789551665e-05, "loss": 0.4625, "step": 32685 }, { "epoch": 1.178145385086676, "grad_norm": 0.19361612200737, "learning_rate": 4.466271374431019e-05, "loss": 0.4151, "step": 32690 }, { "epoch": 1.1783255847479006, "grad_norm": 0.2008073925971985, "learning_rate": 4.466091143116804e-05, "loss": 0.4312, "step": 32695 }, { "epoch": 1.1785057844091253, "grad_norm": 0.15440912544727325, "learning_rate": 4.465910885014979e-05, "loss": 0.3915, "step": 32700 }, { "epoch": 1.1786859840703499, "grad_norm": 0.37958988547325134, "learning_rate": 4.4657306001279975e-05, "loss": 0.4194, "step": 32705 }, { "epoch": 1.1788661837315746, "grad_norm": 0.19730152189731598, "learning_rate": 4.4655502884583177e-05, "loss": 0.4029, "step": 32710 }, { "epoch": 1.1790463833927993, "grad_norm": 0.19884982705116272, "learning_rate": 4.465369950008396e-05, "loss": 0.4183, "step": 32715 }, { "epoch": 1.1792265830540238, "grad_norm": 0.22939956188201904, "learning_rate": 4.465189584780689e-05, "loss": 0.4118, "step": 32720 }, { "epoch": 1.1794067827152486, "grad_norm": 0.19102820754051208, "learning_rate": 4.465009192777655e-05, "loss": 0.4277, "step": 32725 }, { "epoch": 1.179586982376473, "grad_norm": 0.19602443277835846, "learning_rate": 4.4648287740017506e-05, "loss": 0.4216, "step": 32730 }, { "epoch": 1.1797671820376978, "grad_norm": 0.1469677835702896, "learning_rate": 4.464648328455434e-05, "loss": 0.4232, "step": 32735 }, { "epoch": 1.1799473816989223, "grad_norm": 0.15973035991191864, "learning_rate": 4.464467856141166e-05, "loss": 0.4254, "step": 32740 }, { "epoch": 1.180127581360147, "grad_norm": 0.18778270483016968, "learning_rate": 
4.4642873570614016e-05, "loss": 0.4131, "step": 32745 }, { "epoch": 1.1803077810213716, "grad_norm": 0.16484954953193665, "learning_rate": 4.464106831218604e-05, "loss": 0.4222, "step": 32750 }, { "epoch": 1.1804879806825963, "grad_norm": 0.17252211272716522, "learning_rate": 4.4639262786152306e-05, "loss": 0.4331, "step": 32755 }, { "epoch": 1.180668180343821, "grad_norm": 0.16965141892433167, "learning_rate": 4.463745699253742e-05, "loss": 0.4342, "step": 32760 }, { "epoch": 1.1808483800050456, "grad_norm": 0.1783820539712906, "learning_rate": 4.463565093136598e-05, "loss": 0.4189, "step": 32765 }, { "epoch": 1.1810285796662703, "grad_norm": 0.2075047343969345, "learning_rate": 4.46338446026626e-05, "loss": 0.4277, "step": 32770 }, { "epoch": 1.1812087793274948, "grad_norm": 0.21429748833179474, "learning_rate": 4.463203800645188e-05, "loss": 0.4372, "step": 32775 }, { "epoch": 1.1813889789887195, "grad_norm": 0.1777476817369461, "learning_rate": 4.4630231142758455e-05, "loss": 0.4113, "step": 32780 }, { "epoch": 1.1815691786499443, "grad_norm": 0.1785525679588318, "learning_rate": 4.4628424011606927e-05, "loss": 0.4065, "step": 32785 }, { "epoch": 1.1817493783111688, "grad_norm": 0.18620218336582184, "learning_rate": 4.462661661302192e-05, "loss": 0.4236, "step": 32790 }, { "epoch": 1.1819295779723935, "grad_norm": 0.16326984763145447, "learning_rate": 4.462480894702806e-05, "loss": 0.3865, "step": 32795 }, { "epoch": 1.182109777633618, "grad_norm": 0.16543501615524292, "learning_rate": 4.462300101364999e-05, "loss": 0.4208, "step": 32800 }, { "epoch": 1.1822899772948428, "grad_norm": 0.1949724704027176, "learning_rate": 4.4621192812912316e-05, "loss": 0.4216, "step": 32805 }, { "epoch": 1.1824701769560673, "grad_norm": 0.19553053379058838, "learning_rate": 4.46193843448397e-05, "loss": 0.4616, "step": 32810 }, { "epoch": 1.182650376617292, "grad_norm": 0.17806103825569153, "learning_rate": 4.461757560945676e-05, "loss": 0.4171, "step": 32815 }, { "epoch": 
1.1828305762785165, "grad_norm": 0.21131014823913574, "learning_rate": 4.461576660678815e-05, "loss": 0.4127, "step": 32820 }, { "epoch": 1.1830107759397412, "grad_norm": 0.16493719816207886, "learning_rate": 4.461395733685853e-05, "loss": 0.4213, "step": 32825 }, { "epoch": 1.183190975600966, "grad_norm": 0.21300533413887024, "learning_rate": 4.461214779969253e-05, "loss": 0.4309, "step": 32830 }, { "epoch": 1.1833711752621905, "grad_norm": 0.17048995196819305, "learning_rate": 4.4610337995314816e-05, "loss": 0.4518, "step": 32835 }, { "epoch": 1.1835513749234152, "grad_norm": 0.17380398511886597, "learning_rate": 4.460852792375004e-05, "loss": 0.4386, "step": 32840 }, { "epoch": 1.1837315745846397, "grad_norm": 0.16541922092437744, "learning_rate": 4.4606717585022876e-05, "loss": 0.4223, "step": 32845 }, { "epoch": 1.1839117742458645, "grad_norm": 0.17973947525024414, "learning_rate": 4.460490697915797e-05, "loss": 0.4302, "step": 32850 }, { "epoch": 1.184091973907089, "grad_norm": 0.24981901049613953, "learning_rate": 4.460309610618001e-05, "loss": 0.4425, "step": 32855 }, { "epoch": 1.1842721735683137, "grad_norm": 0.23169773817062378, "learning_rate": 4.4601284966113656e-05, "loss": 0.4564, "step": 32860 }, { "epoch": 1.1844523732295382, "grad_norm": 0.15463878214359283, "learning_rate": 4.4599473558983596e-05, "loss": 0.3629, "step": 32865 }, { "epoch": 1.184632572890763, "grad_norm": 0.17254050076007843, "learning_rate": 4.459766188481449e-05, "loss": 0.414, "step": 32870 }, { "epoch": 1.1848127725519877, "grad_norm": 0.1448955237865448, "learning_rate": 4.459584994363105e-05, "loss": 0.4048, "step": 32875 }, { "epoch": 1.1849929722132122, "grad_norm": 0.15114130079746246, "learning_rate": 4.459403773545794e-05, "loss": 0.4154, "step": 32880 }, { "epoch": 1.185173171874437, "grad_norm": 0.16808383166790009, "learning_rate": 4.459222526031987e-05, "loss": 0.4226, "step": 32885 }, { "epoch": 1.1853533715356614, "grad_norm": 0.18770961463451385, 
"learning_rate": 4.45904125182415e-05, "loss": 0.4326, "step": 32890 }, { "epoch": 1.1855335711968862, "grad_norm": 0.18759319186210632, "learning_rate": 4.458859950924758e-05, "loss": 0.4325, "step": 32895 }, { "epoch": 1.1857137708581107, "grad_norm": 0.19363155961036682, "learning_rate": 4.458678623336277e-05, "loss": 0.4474, "step": 32900 }, { "epoch": 1.1858939705193354, "grad_norm": 0.13609229028224945, "learning_rate": 4.4584972690611784e-05, "loss": 0.3956, "step": 32905 }, { "epoch": 1.18607417018056, "grad_norm": 0.20877060294151306, "learning_rate": 4.458315888101935e-05, "loss": 0.4139, "step": 32910 }, { "epoch": 1.1862543698417847, "grad_norm": 0.1942521333694458, "learning_rate": 4.4581344804610156e-05, "loss": 0.4022, "step": 32915 }, { "epoch": 1.1864345695030094, "grad_norm": 0.19765913486480713, "learning_rate": 4.457953046140894e-05, "loss": 0.4402, "step": 32920 }, { "epoch": 1.186614769164234, "grad_norm": 0.15261493623256683, "learning_rate": 4.4577715851440405e-05, "loss": 0.4131, "step": 32925 }, { "epoch": 1.1867949688254587, "grad_norm": 0.17721693217754364, "learning_rate": 4.4575900974729284e-05, "loss": 0.434, "step": 32930 }, { "epoch": 1.1869751684866832, "grad_norm": 0.1746007800102234, "learning_rate": 4.45740858313003e-05, "loss": 0.4198, "step": 32935 }, { "epoch": 1.187155368147908, "grad_norm": 0.18980930745601654, "learning_rate": 4.45722704211782e-05, "loss": 0.4456, "step": 32940 }, { "epoch": 1.1873355678091326, "grad_norm": 0.15787626802921295, "learning_rate": 4.457045474438769e-05, "loss": 0.3863, "step": 32945 }, { "epoch": 1.1875157674703571, "grad_norm": 0.2203458696603775, "learning_rate": 4.4568638800953524e-05, "loss": 0.3924, "step": 32950 }, { "epoch": 1.1876959671315819, "grad_norm": 0.18546192348003387, "learning_rate": 4.4566822590900445e-05, "loss": 0.4048, "step": 32955 }, { "epoch": 1.1878761667928064, "grad_norm": 0.19496969878673553, "learning_rate": 4.45650061142532e-05, "loss": 0.3955, "step": 32960 }, 
{ "epoch": 1.1880563664540311, "grad_norm": 0.1746780127286911, "learning_rate": 4.456318937103653e-05, "loss": 0.4654, "step": 32965 }, { "epoch": 1.1882365661152556, "grad_norm": 0.17759990692138672, "learning_rate": 4.45613723612752e-05, "loss": 0.4105, "step": 32970 }, { "epoch": 1.1884167657764804, "grad_norm": 0.21956616640090942, "learning_rate": 4.455955508499395e-05, "loss": 0.4118, "step": 32975 }, { "epoch": 1.1885969654377049, "grad_norm": 0.1860971599817276, "learning_rate": 4.455773754221755e-05, "loss": 0.4431, "step": 32980 }, { "epoch": 1.1887771650989296, "grad_norm": 0.17252309620380402, "learning_rate": 4.455591973297077e-05, "loss": 0.4097, "step": 32985 }, { "epoch": 1.1889573647601543, "grad_norm": 0.16842429339885712, "learning_rate": 4.455410165727836e-05, "loss": 0.4194, "step": 32990 }, { "epoch": 1.1891375644213789, "grad_norm": 0.16480675339698792, "learning_rate": 4.4552283315165114e-05, "loss": 0.4398, "step": 32995 }, { "epoch": 1.1893177640826036, "grad_norm": 0.1965269148349762, "learning_rate": 4.455046470665578e-05, "loss": 0.435, "step": 33000 }, { "epoch": 1.1893177640826036, "eval_loss": 0.4501653015613556, "eval_runtime": 3.5279, "eval_samples_per_second": 28.345, "eval_steps_per_second": 7.086, "step": 33000 }, { "epoch": 1.189497963743828, "grad_norm": 0.16842001676559448, "learning_rate": 4.454864583177515e-05, "loss": 0.4186, "step": 33005 }, { "epoch": 1.1896781634050528, "grad_norm": 0.17782749235630035, "learning_rate": 4.454682669054801e-05, "loss": 0.4134, "step": 33010 }, { "epoch": 1.1898583630662773, "grad_norm": 0.20939677953720093, "learning_rate": 4.454500728299914e-05, "loss": 0.4094, "step": 33015 }, { "epoch": 1.190038562727502, "grad_norm": 0.22223712503910065, "learning_rate": 4.454318760915333e-05, "loss": 0.4279, "step": 33020 }, { "epoch": 1.1902187623887266, "grad_norm": 0.19512085616588593, "learning_rate": 4.4541367669035373e-05, "loss": 0.4125, "step": 33025 }, { "epoch": 1.1903989620499513, 
"grad_norm": 0.17766328155994415, "learning_rate": 4.453954746267006e-05, "loss": 0.4323, "step": 33030 }, { "epoch": 1.190579161711176, "grad_norm": 0.17056851089000702, "learning_rate": 4.45377269900822e-05, "loss": 0.4184, "step": 33035 }, { "epoch": 1.1907593613724006, "grad_norm": 0.18727493286132812, "learning_rate": 4.45359062512966e-05, "loss": 0.42, "step": 33040 }, { "epoch": 1.1909395610336253, "grad_norm": 0.16119909286499023, "learning_rate": 4.453408524633805e-05, "loss": 0.3888, "step": 33045 }, { "epoch": 1.1911197606948498, "grad_norm": 0.16698874533176422, "learning_rate": 4.453226397523137e-05, "loss": 0.4427, "step": 33050 }, { "epoch": 1.1912999603560745, "grad_norm": 0.1766866147518158, "learning_rate": 4.453044243800137e-05, "loss": 0.4434, "step": 33055 }, { "epoch": 1.1914801600172993, "grad_norm": 0.19949309527873993, "learning_rate": 4.452862063467289e-05, "loss": 0.4333, "step": 33060 }, { "epoch": 1.1916603596785238, "grad_norm": 0.18705400824546814, "learning_rate": 4.452679856527072e-05, "loss": 0.4109, "step": 33065 }, { "epoch": 1.1918405593397485, "grad_norm": 0.19949448108673096, "learning_rate": 4.45249762298197e-05, "loss": 0.3773, "step": 33070 }, { "epoch": 1.192020759000973, "grad_norm": 0.1769210696220398, "learning_rate": 4.452315362834467e-05, "loss": 0.4424, "step": 33075 }, { "epoch": 1.1922009586621978, "grad_norm": 0.19094079732894897, "learning_rate": 4.452133076087045e-05, "loss": 0.3962, "step": 33080 }, { "epoch": 1.1923811583234223, "grad_norm": 0.2079572230577469, "learning_rate": 4.4519507627421873e-05, "loss": 0.4552, "step": 33085 }, { "epoch": 1.192561357984647, "grad_norm": 0.24924804270267487, "learning_rate": 4.451768422802378e-05, "loss": 0.3995, "step": 33090 }, { "epoch": 1.1927415576458715, "grad_norm": 0.1824015974998474, "learning_rate": 4.451586056270103e-05, "loss": 0.4448, "step": 33095 }, { "epoch": 1.1929217573070963, "grad_norm": 0.1912459284067154, "learning_rate": 4.4514036631478444e-05, 
"loss": 0.4076, "step": 33100 }, { "epoch": 1.193101956968321, "grad_norm": 0.1705092191696167, "learning_rate": 4.4512212434380894e-05, "loss": 0.4206, "step": 33105 }, { "epoch": 1.1932821566295455, "grad_norm": 0.17675426602363586, "learning_rate": 4.4510387971433225e-05, "loss": 0.4243, "step": 33110 }, { "epoch": 1.1934623562907702, "grad_norm": 0.21491315960884094, "learning_rate": 4.45085632426603e-05, "loss": 0.4422, "step": 33115 }, { "epoch": 1.1936425559519948, "grad_norm": 0.2086547613143921, "learning_rate": 4.4506738248086974e-05, "loss": 0.4128, "step": 33120 }, { "epoch": 1.1938227556132195, "grad_norm": 0.1773192584514618, "learning_rate": 4.4504912987738124e-05, "loss": 0.4244, "step": 33125 }, { "epoch": 1.194002955274444, "grad_norm": 0.19399385154247284, "learning_rate": 4.45030874616386e-05, "loss": 0.4028, "step": 33130 }, { "epoch": 1.1941831549356687, "grad_norm": 0.22621554136276245, "learning_rate": 4.45012616698133e-05, "loss": 0.4292, "step": 33135 }, { "epoch": 1.1943633545968932, "grad_norm": 0.17529787123203278, "learning_rate": 4.449943561228707e-05, "loss": 0.3879, "step": 33140 }, { "epoch": 1.194543554258118, "grad_norm": 0.18595170974731445, "learning_rate": 4.449760928908481e-05, "loss": 0.4049, "step": 33145 }, { "epoch": 1.1947237539193427, "grad_norm": 0.17124305665493011, "learning_rate": 4.44957827002314e-05, "loss": 0.3847, "step": 33150 }, { "epoch": 1.1949039535805672, "grad_norm": 0.1618146151304245, "learning_rate": 4.449395584575172e-05, "loss": 0.4069, "step": 33155 }, { "epoch": 1.195084153241792, "grad_norm": 0.15872132778167725, "learning_rate": 4.449212872567068e-05, "loss": 0.4154, "step": 33160 }, { "epoch": 1.1952643529030165, "grad_norm": 0.18358559906482697, "learning_rate": 4.4490301340013146e-05, "loss": 0.4185, "step": 33165 }, { "epoch": 1.1954445525642412, "grad_norm": 0.139307901263237, "learning_rate": 4.4488473688804034e-05, "loss": 0.3846, "step": 33170 }, { "epoch": 1.195624752225466, "grad_norm": 
0.16078798472881317, "learning_rate": 4.448664577206824e-05, "loss": 0.4072, "step": 33175 }, { "epoch": 1.1958049518866904, "grad_norm": 0.20551976561546326, "learning_rate": 4.448481758983067e-05, "loss": 0.4151, "step": 33180 }, { "epoch": 1.1959851515479152, "grad_norm": 0.17032408714294434, "learning_rate": 4.4482989142116236e-05, "loss": 0.4053, "step": 33185 }, { "epoch": 1.1961653512091397, "grad_norm": 0.19917094707489014, "learning_rate": 4.4481160428949845e-05, "loss": 0.3863, "step": 33190 }, { "epoch": 1.1963455508703644, "grad_norm": 0.17138609290122986, "learning_rate": 4.447933145035642e-05, "loss": 0.4144, "step": 33195 }, { "epoch": 1.196525750531589, "grad_norm": 0.16464346647262573, "learning_rate": 4.447750220636086e-05, "loss": 0.4573, "step": 33200 }, { "epoch": 1.1967059501928137, "grad_norm": 0.18972603976726532, "learning_rate": 4.4475672696988117e-05, "loss": 0.4093, "step": 33205 }, { "epoch": 1.1968861498540382, "grad_norm": 0.1480892151594162, "learning_rate": 4.44738429222631e-05, "loss": 0.3913, "step": 33210 }, { "epoch": 1.197066349515263, "grad_norm": 0.1947953850030899, "learning_rate": 4.4472012882210744e-05, "loss": 0.4281, "step": 33215 }, { "epoch": 1.1972465491764877, "grad_norm": 0.1418490707874298, "learning_rate": 4.4470182576855984e-05, "loss": 0.4228, "step": 33220 }, { "epoch": 1.1974267488377122, "grad_norm": 0.21313992142677307, "learning_rate": 4.446835200622376e-05, "loss": 0.4354, "step": 33225 }, { "epoch": 1.197606948498937, "grad_norm": 0.16371306777000427, "learning_rate": 4.4466521170339e-05, "loss": 0.4339, "step": 33230 }, { "epoch": 1.1977871481601614, "grad_norm": 0.17602042853832245, "learning_rate": 4.446469006922666e-05, "loss": 0.4083, "step": 33235 }, { "epoch": 1.1979673478213861, "grad_norm": 0.2149427831172943, "learning_rate": 4.446285870291168e-05, "loss": 0.4426, "step": 33240 }, { "epoch": 1.1981475474826107, "grad_norm": 0.19372370839118958, "learning_rate": 4.4461027071419034e-05, "loss": 
0.4215, "step": 33245 }, { "epoch": 1.1983277471438354, "grad_norm": 0.1823178380727768, "learning_rate": 4.445919517477365e-05, "loss": 0.4066, "step": 33250 }, { "epoch": 1.19850794680506, "grad_norm": 0.2029699683189392, "learning_rate": 4.445736301300051e-05, "loss": 0.4223, "step": 33255 }, { "epoch": 1.1986881464662846, "grad_norm": 0.16662827134132385, "learning_rate": 4.445553058612455e-05, "loss": 0.4364, "step": 33260 }, { "epoch": 1.1988683461275094, "grad_norm": 0.1548469513654709, "learning_rate": 4.445369789417077e-05, "loss": 0.4124, "step": 33265 }, { "epoch": 1.1990485457887339, "grad_norm": 0.15706267952919006, "learning_rate": 4.445186493716411e-05, "loss": 0.3692, "step": 33270 }, { "epoch": 1.1992287454499586, "grad_norm": 0.19710016250610352, "learning_rate": 4.4450031715129556e-05, "loss": 0.4461, "step": 33275 }, { "epoch": 1.1994089451111831, "grad_norm": 0.16916310787200928, "learning_rate": 4.4448198228092095e-05, "loss": 0.3842, "step": 33280 }, { "epoch": 1.1995891447724079, "grad_norm": 0.24299342930316925, "learning_rate": 4.444636447607669e-05, "loss": 0.4048, "step": 33285 }, { "epoch": 1.1997693444336326, "grad_norm": 0.1836657077074051, "learning_rate": 4.444453045910834e-05, "loss": 0.4123, "step": 33290 }, { "epoch": 1.199949544094857, "grad_norm": 0.2232903391122818, "learning_rate": 4.444269617721202e-05, "loss": 0.4128, "step": 33295 }, { "epoch": 1.2001297437560818, "grad_norm": 0.150344580411911, "learning_rate": 4.444086163041273e-05, "loss": 0.4184, "step": 33300 }, { "epoch": 1.2003099434173063, "grad_norm": 0.16514082252979279, "learning_rate": 4.443902681873547e-05, "loss": 0.4122, "step": 33305 }, { "epoch": 1.200490143078531, "grad_norm": 0.16925464570522308, "learning_rate": 4.443719174220523e-05, "loss": 0.4041, "step": 33310 }, { "epoch": 1.2006703427397556, "grad_norm": 0.2036067545413971, "learning_rate": 4.443535640084702e-05, "loss": 0.4335, "step": 33315 }, { "epoch": 1.2008505424009803, "grad_norm": 
0.19253499805927277, "learning_rate": 4.443352079468583e-05, "loss": 0.4581, "step": 33320 }, { "epoch": 1.2010307420622048, "grad_norm": 0.16766758263111115, "learning_rate": 4.4431684923746695e-05, "loss": 0.4244, "step": 33325 }, { "epoch": 1.2012109417234296, "grad_norm": 0.14643500745296478, "learning_rate": 4.442984878805461e-05, "loss": 0.4007, "step": 33330 }, { "epoch": 1.2013911413846543, "grad_norm": 0.19802068173885345, "learning_rate": 4.44280123876346e-05, "loss": 0.4579, "step": 33335 }, { "epoch": 1.2015713410458788, "grad_norm": 0.2025717943906784, "learning_rate": 4.4426175722511674e-05, "loss": 0.4056, "step": 33340 }, { "epoch": 1.2017515407071035, "grad_norm": 0.181138813495636, "learning_rate": 4.442433879271087e-05, "loss": 0.4282, "step": 33345 }, { "epoch": 1.201931740368328, "grad_norm": 0.1720162183046341, "learning_rate": 4.4422501598257216e-05, "loss": 0.4089, "step": 33350 }, { "epoch": 1.2021119400295528, "grad_norm": 0.23699156939983368, "learning_rate": 4.442066413917573e-05, "loss": 0.4118, "step": 33355 }, { "epoch": 1.2022921396907773, "grad_norm": 0.16807523369789124, "learning_rate": 4.441882641549145e-05, "loss": 0.4238, "step": 33360 }, { "epoch": 1.202472339352002, "grad_norm": 0.16875354945659637, "learning_rate": 4.441698842722943e-05, "loss": 0.41, "step": 33365 }, { "epoch": 1.2026525390132266, "grad_norm": 0.15263767540454865, "learning_rate": 4.44151501744147e-05, "loss": 0.3937, "step": 33370 }, { "epoch": 1.2028327386744513, "grad_norm": 0.19248110055923462, "learning_rate": 4.441331165707231e-05, "loss": 0.4074, "step": 33375 }, { "epoch": 1.203012938335676, "grad_norm": 0.1765611320734024, "learning_rate": 4.44114728752273e-05, "loss": 0.4095, "step": 33380 }, { "epoch": 1.2031931379969005, "grad_norm": 0.1671658307313919, "learning_rate": 4.440963382890474e-05, "loss": 0.4064, "step": 33385 }, { "epoch": 1.2033733376581253, "grad_norm": 0.1801038384437561, "learning_rate": 4.440779451812967e-05, "loss": 0.391, 
"step": 33390 }, { "epoch": 1.2035535373193498, "grad_norm": 0.18195895850658417, "learning_rate": 4.4405954942927155e-05, "loss": 0.4165, "step": 33395 }, { "epoch": 1.2037337369805745, "grad_norm": 0.22114919126033783, "learning_rate": 4.440411510332226e-05, "loss": 0.4255, "step": 33400 }, { "epoch": 1.203913936641799, "grad_norm": 0.18893185257911682, "learning_rate": 4.4402274999340065e-05, "loss": 0.4319, "step": 33405 }, { "epoch": 1.2040941363030238, "grad_norm": 0.19673962891101837, "learning_rate": 4.4400434631005626e-05, "loss": 0.4201, "step": 33410 }, { "epoch": 1.2042743359642483, "grad_norm": 0.21751564741134644, "learning_rate": 4.439859399834402e-05, "loss": 0.4748, "step": 33415 }, { "epoch": 1.204454535625473, "grad_norm": 0.17174293100833893, "learning_rate": 4.4396753101380316e-05, "loss": 0.3964, "step": 33420 }, { "epoch": 1.2046347352866977, "grad_norm": 0.18520551919937134, "learning_rate": 4.4394911940139616e-05, "loss": 0.4367, "step": 33425 }, { "epoch": 1.2048149349479222, "grad_norm": 0.16532555222511292, "learning_rate": 4.4393070514647e-05, "loss": 0.4135, "step": 33430 }, { "epoch": 1.204995134609147, "grad_norm": 0.1470910906791687, "learning_rate": 4.439122882492754e-05, "loss": 0.4098, "step": 33435 }, { "epoch": 1.2051753342703715, "grad_norm": 0.18562817573547363, "learning_rate": 4.438938687100636e-05, "loss": 0.4498, "step": 33440 }, { "epoch": 1.2053555339315962, "grad_norm": 0.14606645703315735, "learning_rate": 4.438754465290852e-05, "loss": 0.4095, "step": 33445 }, { "epoch": 1.205535733592821, "grad_norm": 0.15203700959682465, "learning_rate": 4.4385702170659144e-05, "loss": 0.3668, "step": 33450 }, { "epoch": 1.2057159332540455, "grad_norm": 0.17846938967704773, "learning_rate": 4.4383859424283325e-05, "loss": 0.4517, "step": 33455 }, { "epoch": 1.2058961329152702, "grad_norm": 0.16596698760986328, "learning_rate": 4.438201641380618e-05, "loss": 0.3891, "step": 33460 }, { "epoch": 1.2060763325764947, "grad_norm": 
0.14986424148082733, "learning_rate": 4.438017313925281e-05, "loss": 0.3901, "step": 33465 }, { "epoch": 1.2062565322377194, "grad_norm": 0.18405571579933167, "learning_rate": 4.4378329600648337e-05, "loss": 0.4215, "step": 33470 }, { "epoch": 1.206436731898944, "grad_norm": 0.16309815645217896, "learning_rate": 4.437648579801788e-05, "loss": 0.4015, "step": 33475 }, { "epoch": 1.2066169315601687, "grad_norm": 0.16883867979049683, "learning_rate": 4.437464173138655e-05, "loss": 0.4407, "step": 33480 }, { "epoch": 1.2067971312213932, "grad_norm": 0.18659423291683197, "learning_rate": 4.437279740077947e-05, "loss": 0.4177, "step": 33485 }, { "epoch": 1.206977330882618, "grad_norm": 0.2079888880252838, "learning_rate": 4.437095280622178e-05, "loss": 0.4201, "step": 33490 }, { "epoch": 1.2071575305438427, "grad_norm": 0.15758785605430603, "learning_rate": 4.4369107947738606e-05, "loss": 0.4037, "step": 33495 }, { "epoch": 1.2073377302050672, "grad_norm": 0.18078656494617462, "learning_rate": 4.436726282535509e-05, "loss": 0.4328, "step": 33500 }, { "epoch": 1.2073377302050672, "eval_loss": 0.45027703046798706, "eval_runtime": 3.5418, "eval_samples_per_second": 28.234, "eval_steps_per_second": 7.058, "step": 33500 }, { "epoch": 1.207517929866292, "grad_norm": 0.18480472266674042, "learning_rate": 4.436541743909637e-05, "loss": 0.4078, "step": 33505 }, { "epoch": 1.2076981295275164, "grad_norm": 0.24424877762794495, "learning_rate": 4.436357178898759e-05, "loss": 0.4427, "step": 33510 }, { "epoch": 1.2078783291887412, "grad_norm": 0.15822868049144745, "learning_rate": 4.436172587505389e-05, "loss": 0.4074, "step": 33515 }, { "epoch": 1.2080585288499657, "grad_norm": 0.1882435828447342, "learning_rate": 4.435987969732042e-05, "loss": 0.4272, "step": 33520 }, { "epoch": 1.2082387285111904, "grad_norm": 0.18066862225532532, "learning_rate": 4.4358033255812334e-05, "loss": 0.3978, "step": 33525 }, { "epoch": 1.208418928172415, "grad_norm": 0.18334537744522095, 
"learning_rate": 4.43561865505548e-05, "loss": 0.4683, "step": 33530 }, { "epoch": 1.2085991278336397, "grad_norm": 0.17291270196437836, "learning_rate": 4.4354339581572974e-05, "loss": 0.4201, "step": 33535 }, { "epoch": 1.2087793274948644, "grad_norm": 0.19278649985790253, "learning_rate": 4.4352492348892015e-05, "loss": 0.4244, "step": 33540 }, { "epoch": 1.208959527156089, "grad_norm": 0.17571572959423065, "learning_rate": 4.435064485253709e-05, "loss": 0.4039, "step": 33545 }, { "epoch": 1.2091397268173136, "grad_norm": 0.21595270931720734, "learning_rate": 4.434879709253338e-05, "loss": 0.4619, "step": 33550 }, { "epoch": 1.2093199264785381, "grad_norm": 0.16592943668365479, "learning_rate": 4.434694906890605e-05, "loss": 0.4259, "step": 33555 }, { "epoch": 1.2095001261397629, "grad_norm": 0.1708192229270935, "learning_rate": 4.434510078168029e-05, "loss": 0.4389, "step": 33560 }, { "epoch": 1.2096803258009876, "grad_norm": 0.13730975985527039, "learning_rate": 4.434325223088128e-05, "loss": 0.3832, "step": 33565 }, { "epoch": 1.2098605254622121, "grad_norm": 0.1926315575838089, "learning_rate": 4.43414034165342e-05, "loss": 0.3929, "step": 33570 }, { "epoch": 1.2100407251234369, "grad_norm": 0.16301114857196808, "learning_rate": 4.433955433866424e-05, "loss": 0.3981, "step": 33575 }, { "epoch": 1.2102209247846614, "grad_norm": 0.16956037282943726, "learning_rate": 4.4337704997296604e-05, "loss": 0.4255, "step": 33580 }, { "epoch": 1.210401124445886, "grad_norm": 0.23975764214992523, "learning_rate": 4.4335855392456474e-05, "loss": 0.4243, "step": 33585 }, { "epoch": 1.2105813241071106, "grad_norm": 0.1944325715303421, "learning_rate": 4.4334005524169066e-05, "loss": 0.4094, "step": 33590 }, { "epoch": 1.2107615237683353, "grad_norm": 0.17085929214954376, "learning_rate": 4.433215539245956e-05, "loss": 0.4084, "step": 33595 }, { "epoch": 1.2109417234295599, "grad_norm": 0.21070963144302368, "learning_rate": 4.43303049973532e-05, "loss": 0.4491, "step": 33600 
}, { "epoch": 1.2111219230907846, "grad_norm": 0.158106729388237, "learning_rate": 4.4328454338875156e-05, "loss": 0.3992, "step": 33605 }, { "epoch": 1.2113021227520093, "grad_norm": 0.19057874381542206, "learning_rate": 4.4326603417050676e-05, "loss": 0.3716, "step": 33610 }, { "epoch": 1.2114823224132338, "grad_norm": 0.1798359900712967, "learning_rate": 4.4324752231904965e-05, "loss": 0.4243, "step": 33615 }, { "epoch": 1.2116625220744586, "grad_norm": 0.20672789216041565, "learning_rate": 4.432290078346324e-05, "loss": 0.4384, "step": 33620 }, { "epoch": 1.211842721735683, "grad_norm": 0.18144282698631287, "learning_rate": 4.4321049071750743e-05, "loss": 0.3966, "step": 33625 }, { "epoch": 1.2120229213969078, "grad_norm": 0.2156475931406021, "learning_rate": 4.431919709679269e-05, "loss": 0.4488, "step": 33630 }, { "epoch": 1.2122031210581323, "grad_norm": 0.1874171942472458, "learning_rate": 4.431734485861432e-05, "loss": 0.4277, "step": 33635 }, { "epoch": 1.212383320719357, "grad_norm": 0.20030780136585236, "learning_rate": 4.431549235724086e-05, "loss": 0.3958, "step": 33640 }, { "epoch": 1.2125635203805816, "grad_norm": 0.16703949868679047, "learning_rate": 4.431363959269755e-05, "loss": 0.4372, "step": 33645 }, { "epoch": 1.2127437200418063, "grad_norm": 0.1703837811946869, "learning_rate": 4.431178656500965e-05, "loss": 0.404, "step": 33650 }, { "epoch": 1.212923919703031, "grad_norm": 0.1517026126384735, "learning_rate": 4.43099332742024e-05, "loss": 0.4263, "step": 33655 }, { "epoch": 1.2131041193642556, "grad_norm": 0.21069294214248657, "learning_rate": 4.430807972030104e-05, "loss": 0.3994, "step": 33660 }, { "epoch": 1.2132843190254803, "grad_norm": 0.1706804782152176, "learning_rate": 4.430622590333083e-05, "loss": 0.4113, "step": 33665 }, { "epoch": 1.2134645186867048, "grad_norm": 0.20271262526512146, "learning_rate": 4.430437182331704e-05, "loss": 0.4189, "step": 33670 }, { "epoch": 1.2136447183479295, "grad_norm": 0.1703236848115921, 
"learning_rate": 4.430251748028492e-05, "loss": 0.4197, "step": 33675 }, { "epoch": 1.2138249180091543, "grad_norm": 0.16944654285907745, "learning_rate": 4.430066287425973e-05, "loss": 0.4716, "step": 33680 }, { "epoch": 1.2140051176703788, "grad_norm": 0.17633916437625885, "learning_rate": 4.429880800526675e-05, "loss": 0.3921, "step": 33685 }, { "epoch": 1.2141853173316035, "grad_norm": 0.2126268744468689, "learning_rate": 4.4296952873331235e-05, "loss": 0.4292, "step": 33690 }, { "epoch": 1.214365516992828, "grad_norm": 0.1921420842409134, "learning_rate": 4.4295097478478484e-05, "loss": 0.4163, "step": 33695 }, { "epoch": 1.2145457166540528, "grad_norm": 0.19828033447265625, "learning_rate": 4.4293241820733764e-05, "loss": 0.4112, "step": 33700 }, { "epoch": 1.2147259163152773, "grad_norm": 0.171315535902977, "learning_rate": 4.429138590012236e-05, "loss": 0.4352, "step": 33705 }, { "epoch": 1.214906115976502, "grad_norm": 0.15419135987758636, "learning_rate": 4.428952971666956e-05, "loss": 0.4077, "step": 33710 }, { "epoch": 1.2150863156377265, "grad_norm": 0.16436229646205902, "learning_rate": 4.428767327040065e-05, "loss": 0.4068, "step": 33715 }, { "epoch": 1.2152665152989512, "grad_norm": 0.17633068561553955, "learning_rate": 4.428581656134092e-05, "loss": 0.4457, "step": 33720 }, { "epoch": 1.215446714960176, "grad_norm": 0.1803673654794693, "learning_rate": 4.4283959589515686e-05, "loss": 0.4286, "step": 33725 }, { "epoch": 1.2156269146214005, "grad_norm": 0.14655368030071259, "learning_rate": 4.428210235495023e-05, "loss": 0.4373, "step": 33730 }, { "epoch": 1.2158071142826252, "grad_norm": 0.18454967439174652, "learning_rate": 4.428024485766986e-05, "loss": 0.4165, "step": 33735 }, { "epoch": 1.2159873139438497, "grad_norm": 0.1603035181760788, "learning_rate": 4.427838709769989e-05, "loss": 0.3995, "step": 33740 }, { "epoch": 1.2161675136050745, "grad_norm": 0.1812167763710022, "learning_rate": 4.427652907506562e-05, "loss": 0.4046, "step": 33745 }, 
{ "epoch": 1.216347713266299, "grad_norm": 0.16783830523490906, "learning_rate": 4.427467078979238e-05, "loss": 0.4103, "step": 33750 }, { "epoch": 1.2165279129275237, "grad_norm": 0.20408573746681213, "learning_rate": 4.427281224190548e-05, "loss": 0.4424, "step": 33755 }, { "epoch": 1.2167081125887482, "grad_norm": 0.13193145394325256, "learning_rate": 4.427095343143025e-05, "loss": 0.4065, "step": 33760 }, { "epoch": 1.216888312249973, "grad_norm": 0.1420491337776184, "learning_rate": 4.4269094358392e-05, "loss": 0.4043, "step": 33765 }, { "epoch": 1.2170685119111977, "grad_norm": 0.19709770381450653, "learning_rate": 4.4267235022816084e-05, "loss": 0.4288, "step": 33770 }, { "epoch": 1.2172487115724222, "grad_norm": 0.17064298689365387, "learning_rate": 4.4265375424727815e-05, "loss": 0.4346, "step": 33775 }, { "epoch": 1.217428911233647, "grad_norm": 0.16574378311634064, "learning_rate": 4.4263515564152534e-05, "loss": 0.4228, "step": 33780 }, { "epoch": 1.2176091108948714, "grad_norm": 0.20119209587574005, "learning_rate": 4.426165544111558e-05, "loss": 0.4042, "step": 33785 }, { "epoch": 1.2177893105560962, "grad_norm": 0.16486892104148865, "learning_rate": 4.4259795055642305e-05, "loss": 0.4075, "step": 33790 }, { "epoch": 1.2179695102173207, "grad_norm": 0.17339356243610382, "learning_rate": 4.425793440775805e-05, "loss": 0.4147, "step": 33795 }, { "epoch": 1.2181497098785454, "grad_norm": 0.16329234838485718, "learning_rate": 4.425607349748816e-05, "loss": 0.411, "step": 33800 }, { "epoch": 1.2183299095397702, "grad_norm": 0.17299103736877441, "learning_rate": 4.425421232485801e-05, "loss": 0.4047, "step": 33805 }, { "epoch": 1.2185101092009947, "grad_norm": 0.18187008798122406, "learning_rate": 4.4252350889892936e-05, "loss": 0.4287, "step": 33810 }, { "epoch": 1.2186903088622194, "grad_norm": 0.19990107417106628, "learning_rate": 4.425048919261832e-05, "loss": 0.4089, "step": 33815 }, { "epoch": 1.218870508523444, "grad_norm": 0.1538524031639099, 
"learning_rate": 4.4248627233059505e-05, "loss": 0.4271, "step": 33820 }, { "epoch": 1.2190507081846687, "grad_norm": 0.18129165470600128, "learning_rate": 4.4246765011241864e-05, "loss": 0.3741, "step": 33825 }, { "epoch": 1.2192309078458932, "grad_norm": 0.20447011291980743, "learning_rate": 4.4244902527190785e-05, "loss": 0.4431, "step": 33830 }, { "epoch": 1.219411107507118, "grad_norm": 0.148980513215065, "learning_rate": 4.424303978093163e-05, "loss": 0.3938, "step": 33835 }, { "epoch": 1.2195913071683426, "grad_norm": 0.14329323172569275, "learning_rate": 4.424117677248979e-05, "loss": 0.4257, "step": 33840 }, { "epoch": 1.2197715068295671, "grad_norm": 0.22545427083969116, "learning_rate": 4.423931350189065e-05, "loss": 0.4461, "step": 33845 }, { "epoch": 1.2199517064907919, "grad_norm": 0.17364872992038727, "learning_rate": 4.423744996915957e-05, "loss": 0.445, "step": 33850 }, { "epoch": 1.2201319061520164, "grad_norm": 0.2247709333896637, "learning_rate": 4.4235586174321964e-05, "loss": 0.4471, "step": 33855 }, { "epoch": 1.2203121058132411, "grad_norm": 0.18981455266475677, "learning_rate": 4.423372211740323e-05, "loss": 0.4089, "step": 33860 }, { "epoch": 1.2204923054744656, "grad_norm": 0.15154671669006348, "learning_rate": 4.423185779842874e-05, "loss": 0.4149, "step": 33865 }, { "epoch": 1.2206725051356904, "grad_norm": 0.1997554749250412, "learning_rate": 4.422999321742393e-05, "loss": 0.3822, "step": 33870 }, { "epoch": 1.2208527047969149, "grad_norm": 0.21992482244968414, "learning_rate": 4.422812837441417e-05, "loss": 0.4158, "step": 33875 }, { "epoch": 1.2210329044581396, "grad_norm": 0.19677501916885376, "learning_rate": 4.422626326942489e-05, "loss": 0.4037, "step": 33880 }, { "epoch": 1.2212131041193643, "grad_norm": 0.1691318303346634, "learning_rate": 4.422439790248149e-05, "loss": 0.4369, "step": 33885 }, { "epoch": 1.2213933037805889, "grad_norm": 0.15583454072475433, "learning_rate": 4.4222532273609396e-05, "loss": 0.3988, "step": 33890 
}, { "epoch": 1.2215735034418136, "grad_norm": 0.1846490055322647, "learning_rate": 4.422066638283402e-05, "loss": 0.4234, "step": 33895 }, { "epoch": 1.221753703103038, "grad_norm": 0.18129578232765198, "learning_rate": 4.421880023018079e-05, "loss": 0.4312, "step": 33900 }, { "epoch": 1.2219339027642628, "grad_norm": 0.15360191464424133, "learning_rate": 4.421693381567512e-05, "loss": 0.4302, "step": 33905 }, { "epoch": 1.2221141024254873, "grad_norm": 0.2045409232378006, "learning_rate": 4.421506713934245e-05, "loss": 0.4176, "step": 33910 }, { "epoch": 1.222294302086712, "grad_norm": 0.15887264907360077, "learning_rate": 4.421320020120821e-05, "loss": 0.4114, "step": 33915 }, { "epoch": 1.2224745017479366, "grad_norm": 0.18806084990501404, "learning_rate": 4.4211333001297836e-05, "loss": 0.4254, "step": 33920 }, { "epoch": 1.2226547014091613, "grad_norm": 0.21067681908607483, "learning_rate": 4.420946553963677e-05, "loss": 0.4186, "step": 33925 }, { "epoch": 1.222834901070386, "grad_norm": 0.15859293937683105, "learning_rate": 4.4207597816250454e-05, "loss": 0.4144, "step": 33930 }, { "epoch": 1.2230151007316106, "grad_norm": 0.1685830056667328, "learning_rate": 4.420572983116434e-05, "loss": 0.4313, "step": 33935 }, { "epoch": 1.2231953003928353, "grad_norm": 0.17942924797534943, "learning_rate": 4.420386158440388e-05, "loss": 0.4175, "step": 33940 }, { "epoch": 1.2233755000540598, "grad_norm": 0.19803176820278168, "learning_rate": 4.420199307599452e-05, "loss": 0.4151, "step": 33945 }, { "epoch": 1.2235556997152846, "grad_norm": 0.18446727097034454, "learning_rate": 4.420012430596172e-05, "loss": 0.4478, "step": 33950 }, { "epoch": 1.2237358993765093, "grad_norm": 0.15331429243087769, "learning_rate": 4.419825527433095e-05, "loss": 0.424, "step": 33955 }, { "epoch": 1.2239160990377338, "grad_norm": 0.21690736711025238, "learning_rate": 4.419638598112765e-05, "loss": 0.4068, "step": 33960 }, { "epoch": 1.2240962986989585, "grad_norm": 0.18475008010864258, 
"learning_rate": 4.4194516426377326e-05, "loss": 0.4087, "step": 33965 }, { "epoch": 1.224276498360183, "grad_norm": 0.19182796776294708, "learning_rate": 4.4192646610105425e-05, "loss": 0.4304, "step": 33970 }, { "epoch": 1.2244566980214078, "grad_norm": 0.15352657437324524, "learning_rate": 4.419077653233743e-05, "loss": 0.4087, "step": 33975 }, { "epoch": 1.2246368976826323, "grad_norm": 0.18293914198875427, "learning_rate": 4.418890619309882e-05, "loss": 0.4337, "step": 33980 }, { "epoch": 1.224817097343857, "grad_norm": 0.22879280149936676, "learning_rate": 4.4187035592415085e-05, "loss": 0.4208, "step": 33985 }, { "epoch": 1.2249972970050815, "grad_norm": 0.18083889782428741, "learning_rate": 4.41851647303117e-05, "loss": 0.4357, "step": 33990 }, { "epoch": 1.2251774966663063, "grad_norm": 0.17108914256095886, "learning_rate": 4.4183293606814155e-05, "loss": 0.3936, "step": 33995 }, { "epoch": 1.225357696327531, "grad_norm": 0.15355034172534943, "learning_rate": 4.418142222194795e-05, "loss": 0.3946, "step": 34000 }, { "epoch": 1.225357696327531, "eval_loss": 0.45026490092277527, "eval_runtime": 3.5278, "eval_samples_per_second": 28.346, "eval_steps_per_second": 7.086, "step": 34000 }, { "epoch": 1.2255378959887555, "grad_norm": 0.18301257491111755, "learning_rate": 4.4179550575738584e-05, "loss": 0.4336, "step": 34005 }, { "epoch": 1.2257180956499802, "grad_norm": 0.16814136505126953, "learning_rate": 4.4177678668211555e-05, "loss": 0.4443, "step": 34010 }, { "epoch": 1.2258982953112048, "grad_norm": 0.22709350287914276, "learning_rate": 4.417580649939237e-05, "loss": 0.4527, "step": 34015 }, { "epoch": 1.2260784949724295, "grad_norm": 0.15572187304496765, "learning_rate": 4.417393406930652e-05, "loss": 0.3646, "step": 34020 }, { "epoch": 1.226258694633654, "grad_norm": 0.17748330533504486, "learning_rate": 4.4172061377979545e-05, "loss": 0.4317, "step": 34025 }, { "epoch": 1.2264388942948787, "grad_norm": 0.18140548467636108, "learning_rate": 
4.417018842543694e-05, "loss": 0.4283, "step": 34030 }, { "epoch": 1.2266190939561032, "grad_norm": 0.1755266636610031, "learning_rate": 4.416831521170424e-05, "loss": 0.4276, "step": 34035 }, { "epoch": 1.226799293617328, "grad_norm": 0.1404469907283783, "learning_rate": 4.416644173680694e-05, "loss": 0.3802, "step": 34040 }, { "epoch": 1.2269794932785527, "grad_norm": 0.18569141626358032, "learning_rate": 4.416456800077059e-05, "loss": 0.4175, "step": 34045 }, { "epoch": 1.2271596929397772, "grad_norm": 0.20885035395622253, "learning_rate": 4.416269400362071e-05, "loss": 0.4017, "step": 34050 }, { "epoch": 1.227339892601002, "grad_norm": 0.18960420787334442, "learning_rate": 4.416081974538283e-05, "loss": 0.4186, "step": 34055 }, { "epoch": 1.2275200922622265, "grad_norm": 0.15756478905677795, "learning_rate": 4.41589452260825e-05, "loss": 0.3757, "step": 34060 }, { "epoch": 1.2277002919234512, "grad_norm": 0.16769862174987793, "learning_rate": 4.415707044574524e-05, "loss": 0.3859, "step": 34065 }, { "epoch": 1.227880491584676, "grad_norm": 0.21524082124233246, "learning_rate": 4.415519540439661e-05, "loss": 0.46, "step": 34070 }, { "epoch": 1.2280606912459004, "grad_norm": 0.1766262948513031, "learning_rate": 4.4153320102062155e-05, "loss": 0.3977, "step": 34075 }, { "epoch": 1.2282408909071252, "grad_norm": 0.15509352087974548, "learning_rate": 4.4151444538767414e-05, "loss": 0.4288, "step": 34080 }, { "epoch": 1.2284210905683497, "grad_norm": 0.14794203639030457, "learning_rate": 4.414956871453796e-05, "loss": 0.4274, "step": 34085 }, { "epoch": 1.2286012902295744, "grad_norm": 0.1935451328754425, "learning_rate": 4.4147692629399326e-05, "loss": 0.3819, "step": 34090 }, { "epoch": 1.228781489890799, "grad_norm": 0.14869855344295502, "learning_rate": 4.414581628337709e-05, "loss": 0.4268, "step": 34095 }, { "epoch": 1.2289616895520237, "grad_norm": 0.17884673178195953, "learning_rate": 4.4143939676496825e-05, "loss": 0.4162, "step": 34100 }, { "epoch": 
1.2291418892132482, "grad_norm": 0.19956587255001068, "learning_rate": 4.414206280878408e-05, "loss": 0.4309, "step": 34105 }, { "epoch": 1.229322088874473, "grad_norm": 0.1776251643896103, "learning_rate": 4.414018568026443e-05, "loss": 0.4051, "step": 34110 }, { "epoch": 1.2295022885356977, "grad_norm": 0.16299593448638916, "learning_rate": 4.413830829096347e-05, "loss": 0.4617, "step": 34115 }, { "epoch": 1.2296824881969222, "grad_norm": 0.17670361697673798, "learning_rate": 4.413643064090675e-05, "loss": 0.399, "step": 34120 }, { "epoch": 1.229862687858147, "grad_norm": 0.16211190819740295, "learning_rate": 4.4134552730119874e-05, "loss": 0.4074, "step": 34125 }, { "epoch": 1.2300428875193714, "grad_norm": 0.1310056746006012, "learning_rate": 4.413267455862842e-05, "loss": 0.4109, "step": 34130 }, { "epoch": 1.2302230871805961, "grad_norm": 0.1611911952495575, "learning_rate": 4.4130796126457984e-05, "loss": 0.3891, "step": 34135 }, { "epoch": 1.2304032868418207, "grad_norm": 0.17254267632961273, "learning_rate": 4.412891743363416e-05, "loss": 0.3842, "step": 34140 }, { "epoch": 1.2305834865030454, "grad_norm": 0.2114710956811905, "learning_rate": 4.412703848018253e-05, "loss": 0.421, "step": 34145 }, { "epoch": 1.23076368616427, "grad_norm": 0.1724889725446701, "learning_rate": 4.4125159266128696e-05, "loss": 0.4537, "step": 34150 }, { "epoch": 1.2309438858254946, "grad_norm": 0.17160102725028992, "learning_rate": 4.412327979149828e-05, "loss": 0.4127, "step": 34155 }, { "epoch": 1.2311240854867194, "grad_norm": 0.15789945423603058, "learning_rate": 4.412140005631688e-05, "loss": 0.3848, "step": 34160 }, { "epoch": 1.2313042851479439, "grad_norm": 0.1946706771850586, "learning_rate": 4.4119520060610105e-05, "loss": 0.4288, "step": 34165 }, { "epoch": 1.2314844848091686, "grad_norm": 0.1840618997812271, "learning_rate": 4.411763980440357e-05, "loss": 0.3903, "step": 34170 }, { "epoch": 1.2316646844703931, "grad_norm": 0.16430874168872833, "learning_rate": 
4.411575928772289e-05, "loss": 0.4252, "step": 34175 }, { "epoch": 1.2318448841316179, "grad_norm": 0.16133743524551392, "learning_rate": 4.41138785105937e-05, "loss": 0.4184, "step": 34180 }, { "epoch": 1.2320250837928426, "grad_norm": 0.18589156866073608, "learning_rate": 4.411199747304161e-05, "loss": 0.3987, "step": 34185 }, { "epoch": 1.232205283454067, "grad_norm": 0.19146478176116943, "learning_rate": 4.4110116175092254e-05, "loss": 0.437, "step": 34190 }, { "epoch": 1.2323854831152918, "grad_norm": 0.1762928068637848, "learning_rate": 4.410823461677126e-05, "loss": 0.3935, "step": 34195 }, { "epoch": 1.2325656827765163, "grad_norm": 0.17464253306388855, "learning_rate": 4.4106352798104276e-05, "loss": 0.4299, "step": 34200 }, { "epoch": 1.232745882437741, "grad_norm": 0.15182888507843018, "learning_rate": 4.410447071911693e-05, "loss": 0.4296, "step": 34205 }, { "epoch": 1.2329260820989656, "grad_norm": 0.20655085146427155, "learning_rate": 4.410258837983488e-05, "loss": 0.4122, "step": 34210 }, { "epoch": 1.2331062817601903, "grad_norm": 0.15774188935756683, "learning_rate": 4.4100705780283746e-05, "loss": 0.3955, "step": 34215 }, { "epoch": 1.2332864814214148, "grad_norm": 0.16499753296375275, "learning_rate": 4.40988229204892e-05, "loss": 0.4257, "step": 34220 }, { "epoch": 1.2334666810826396, "grad_norm": 0.21794357895851135, "learning_rate": 4.4096939800476894e-05, "loss": 0.4236, "step": 34225 }, { "epoch": 1.2336468807438643, "grad_norm": 0.16437794268131256, "learning_rate": 4.409505642027248e-05, "loss": 0.4395, "step": 34230 }, { "epoch": 1.2338270804050888, "grad_norm": 0.152201309800148, "learning_rate": 4.409317277990161e-05, "loss": 0.4321, "step": 34235 }, { "epoch": 1.2340072800663135, "grad_norm": 0.1949222981929779, "learning_rate": 4.409128887938997e-05, "loss": 0.4265, "step": 34240 }, { "epoch": 1.234187479727538, "grad_norm": 0.18767017126083374, "learning_rate": 4.408940471876321e-05, "loss": 0.4111, "step": 34245 }, { "epoch": 
1.2343676793887628, "grad_norm": 0.22580619156360626, "learning_rate": 4.4087520298047003e-05, "loss": 0.416, "step": 34250 }, { "epoch": 1.2345478790499873, "grad_norm": 0.14515608549118042, "learning_rate": 4.4085635617267026e-05, "loss": 0.4118, "step": 34255 }, { "epoch": 1.234728078711212, "grad_norm": 0.17137964069843292, "learning_rate": 4.408375067644897e-05, "loss": 0.3908, "step": 34260 }, { "epoch": 1.2349082783724366, "grad_norm": 0.17997115850448608, "learning_rate": 4.40818654756185e-05, "loss": 0.4645, "step": 34265 }, { "epoch": 1.2350884780336613, "grad_norm": 0.16378553211688995, "learning_rate": 4.40799800148013e-05, "loss": 0.4137, "step": 34270 }, { "epoch": 1.235268677694886, "grad_norm": 0.1839524507522583, "learning_rate": 4.407809429402308e-05, "loss": 0.4448, "step": 34275 }, { "epoch": 1.2354488773561105, "grad_norm": 0.1332423835992813, "learning_rate": 4.407620831330951e-05, "loss": 0.4172, "step": 34280 }, { "epoch": 1.2356290770173353, "grad_norm": 0.18333803117275238, "learning_rate": 4.407432207268629e-05, "loss": 0.4224, "step": 34285 }, { "epoch": 1.2358092766785598, "grad_norm": 0.2039324939250946, "learning_rate": 4.4072435572179136e-05, "loss": 0.4289, "step": 34290 }, { "epoch": 1.2359894763397845, "grad_norm": 0.17370297014713287, "learning_rate": 4.407054881181373e-05, "loss": 0.4226, "step": 34295 }, { "epoch": 1.236169676001009, "grad_norm": 0.16956277191638947, "learning_rate": 4.40686617916158e-05, "loss": 0.4354, "step": 34300 }, { "epoch": 1.2363498756622338, "grad_norm": 0.16114924848079681, "learning_rate": 4.406677451161103e-05, "loss": 0.4433, "step": 34305 }, { "epoch": 1.2365300753234585, "grad_norm": 0.15897580981254578, "learning_rate": 4.406488697182516e-05, "loss": 0.3884, "step": 34310 }, { "epoch": 1.236710274984683, "grad_norm": 0.17210477590560913, "learning_rate": 4.406299917228389e-05, "loss": 0.4553, "step": 34315 }, { "epoch": 1.2368904746459077, "grad_norm": 0.17768850922584534, "learning_rate": 
4.406111111301295e-05, "loss": 0.4372, "step": 34320 }, { "epoch": 1.2370706743071322, "grad_norm": 0.20289890468120575, "learning_rate": 4.405922279403807e-05, "loss": 0.4266, "step": 34325 }, { "epoch": 1.237250873968357, "grad_norm": 0.18634875118732452, "learning_rate": 4.405733421538496e-05, "loss": 0.39, "step": 34330 }, { "epoch": 1.2374310736295815, "grad_norm": 0.16916252672672272, "learning_rate": 4.4055445377079364e-05, "loss": 0.3812, "step": 34335 }, { "epoch": 1.2376112732908062, "grad_norm": 0.17390431463718414, "learning_rate": 4.405355627914701e-05, "loss": 0.4682, "step": 34340 }, { "epoch": 1.237791472952031, "grad_norm": 0.17190834879875183, "learning_rate": 4.405166692161365e-05, "loss": 0.4097, "step": 34345 }, { "epoch": 1.2379716726132555, "grad_norm": 0.15604445338249207, "learning_rate": 4.4049777304505e-05, "loss": 0.4193, "step": 34350 }, { "epoch": 1.2381518722744802, "grad_norm": 0.19317157566547394, "learning_rate": 4.404788742784683e-05, "loss": 0.4087, "step": 34355 }, { "epoch": 1.2383320719357047, "grad_norm": 0.15867052972316742, "learning_rate": 4.404599729166489e-05, "loss": 0.4123, "step": 34360 }, { "epoch": 1.2385122715969294, "grad_norm": 0.18117307126522064, "learning_rate": 4.404410689598491e-05, "loss": 0.4088, "step": 34365 }, { "epoch": 1.238692471258154, "grad_norm": 0.20402644574642181, "learning_rate": 4.404221624083267e-05, "loss": 0.4284, "step": 34370 }, { "epoch": 1.2388726709193787, "grad_norm": 0.20701655745506287, "learning_rate": 4.4040325326233914e-05, "loss": 0.386, "step": 34375 }, { "epoch": 1.2390528705806032, "grad_norm": 0.20911046862602234, "learning_rate": 4.403843415221442e-05, "loss": 0.3965, "step": 34380 }, { "epoch": 1.239233070241828, "grad_norm": 0.22463898360729218, "learning_rate": 4.4036542718799944e-05, "loss": 0.4292, "step": 34385 }, { "epoch": 1.2394132699030527, "grad_norm": 0.1606704443693161, "learning_rate": 4.403465102601626e-05, "loss": 0.4427, "step": 34390 }, { "epoch": 
1.2395934695642772, "grad_norm": 0.18783807754516602, "learning_rate": 4.4032759073889134e-05, "loss": 0.4545, "step": 34395 }, { "epoch": 1.239773669225502, "grad_norm": 0.20419709384441376, "learning_rate": 4.403086686244435e-05, "loss": 0.4332, "step": 34400 }, { "epoch": 1.2399538688867264, "grad_norm": 0.16795575618743896, "learning_rate": 4.40289743917077e-05, "loss": 0.4359, "step": 34405 }, { "epoch": 1.2401340685479512, "grad_norm": 0.157429039478302, "learning_rate": 4.402708166170495e-05, "loss": 0.4176, "step": 34410 }, { "epoch": 1.2403142682091757, "grad_norm": 0.1853724867105484, "learning_rate": 4.4025188672461903e-05, "loss": 0.4474, "step": 34415 }, { "epoch": 1.2404944678704004, "grad_norm": 0.1714290827512741, "learning_rate": 4.402329542400434e-05, "loss": 0.4237, "step": 34420 }, { "epoch": 1.240674667531625, "grad_norm": 0.2036142498254776, "learning_rate": 4.402140191635806e-05, "loss": 0.4049, "step": 34425 }, { "epoch": 1.2408548671928497, "grad_norm": 0.1990082859992981, "learning_rate": 4.401950814954886e-05, "loss": 0.4252, "step": 34430 }, { "epoch": 1.2410350668540744, "grad_norm": 0.17865248024463654, "learning_rate": 4.4017614123602546e-05, "loss": 0.4072, "step": 34435 }, { "epoch": 1.241215266515299, "grad_norm": 0.1914636194705963, "learning_rate": 4.401571983854492e-05, "loss": 0.4452, "step": 34440 }, { "epoch": 1.2413954661765236, "grad_norm": 0.1847231239080429, "learning_rate": 4.401382529440179e-05, "loss": 0.3943, "step": 34445 }, { "epoch": 1.2415756658377481, "grad_norm": 0.20364885032176971, "learning_rate": 4.401193049119898e-05, "loss": 0.4116, "step": 34450 }, { "epoch": 1.2417558654989729, "grad_norm": 0.1959732621908188, "learning_rate": 4.4010035428962295e-05, "loss": 0.4072, "step": 34455 }, { "epoch": 1.2419360651601976, "grad_norm": 0.18121816217899323, "learning_rate": 4.400814010771755e-05, "loss": 0.4325, "step": 34460 }, { "epoch": 1.2421162648214221, "grad_norm": 0.23917900025844574, "learning_rate": 
4.400624452749058e-05, "loss": 0.4233, "step": 34465 }, { "epoch": 1.2422964644826469, "grad_norm": 0.21127377450466156, "learning_rate": 4.400434868830721e-05, "loss": 0.4266, "step": 34470 }, { "epoch": 1.2424766641438714, "grad_norm": 0.21649089455604553, "learning_rate": 4.4002452590193265e-05, "loss": 0.4478, "step": 34475 }, { "epoch": 1.242656863805096, "grad_norm": 0.18373538553714752, "learning_rate": 4.400055623317459e-05, "loss": 0.3895, "step": 34480 }, { "epoch": 1.2428370634663206, "grad_norm": 0.1825505495071411, "learning_rate": 4.399865961727701e-05, "loss": 0.3957, "step": 34485 }, { "epoch": 1.2430172631275453, "grad_norm": 0.15860360860824585, "learning_rate": 4.399676274252637e-05, "loss": 0.4106, "step": 34490 }, { "epoch": 1.2431974627887699, "grad_norm": 0.14994730055332184, "learning_rate": 4.399486560894852e-05, "loss": 0.3668, "step": 34495 }, { "epoch": 1.2433776624499946, "grad_norm": 0.18823125958442688, "learning_rate": 4.39929682165693e-05, "loss": 0.419, "step": 34500 }, { "epoch": 1.2433776624499946, "eval_loss": 0.45045050978660583, "eval_runtime": 3.5444, "eval_samples_per_second": 28.213, "eval_steps_per_second": 7.053, "step": 34500 }, { "epoch": 1.2435578621112193, "grad_norm": 0.20211850106716156, "learning_rate": 4.399107056541456e-05, "loss": 0.3954, "step": 34505 }, { "epoch": 1.2437380617724438, "grad_norm": 0.19048206508159637, "learning_rate": 4.398917265551017e-05, "loss": 0.4312, "step": 34510 }, { "epoch": 1.2439182614336686, "grad_norm": 0.17858850955963135, "learning_rate": 4.398727448688198e-05, "loss": 0.4522, "step": 34515 }, { "epoch": 1.244098461094893, "grad_norm": 0.14461615681648254, "learning_rate": 4.398537605955584e-05, "loss": 0.4295, "step": 34520 }, { "epoch": 1.2442786607561178, "grad_norm": 0.16220664978027344, "learning_rate": 4.398347737355764e-05, "loss": 0.4194, "step": 34525 }, { "epoch": 1.2444588604173423, "grad_norm": 0.17861701548099518, "learning_rate": 4.398157842891323e-05, "loss": 
0.4393, "step": 34530 }, { "epoch": 1.244639060078567, "grad_norm": 0.2174644023180008, "learning_rate": 4.3979679225648484e-05, "loss": 0.4721, "step": 34535 }, { "epoch": 1.2448192597397916, "grad_norm": 0.19384269416332245, "learning_rate": 4.397777976378929e-05, "loss": 0.4112, "step": 34540 }, { "epoch": 1.2449994594010163, "grad_norm": 0.16836018860340118, "learning_rate": 4.397588004336152e-05, "loss": 0.4547, "step": 34545 }, { "epoch": 1.245179659062241, "grad_norm": 0.163166344165802, "learning_rate": 4.397398006439105e-05, "loss": 0.4291, "step": 34550 }, { "epoch": 1.2453598587234656, "grad_norm": 0.17786313593387604, "learning_rate": 4.397207982690378e-05, "loss": 0.4256, "step": 34555 }, { "epoch": 1.2455400583846903, "grad_norm": 0.1747133880853653, "learning_rate": 4.39701793309256e-05, "loss": 0.4184, "step": 34560 }, { "epoch": 1.2457202580459148, "grad_norm": 0.1763535887002945, "learning_rate": 4.3968278576482394e-05, "loss": 0.4077, "step": 34565 }, { "epoch": 1.2459004577071395, "grad_norm": 0.22853651642799377, "learning_rate": 4.396637756360007e-05, "loss": 0.4367, "step": 34570 }, { "epoch": 1.2460806573683643, "grad_norm": 0.21163353323936462, "learning_rate": 4.396447629230452e-05, "loss": 0.4155, "step": 34575 }, { "epoch": 1.2462608570295888, "grad_norm": 0.16000016033649445, "learning_rate": 4.396257476262165e-05, "loss": 0.3771, "step": 34580 }, { "epoch": 1.2464410566908135, "grad_norm": 0.23140566051006317, "learning_rate": 4.396067297457738e-05, "loss": 0.4196, "step": 34585 }, { "epoch": 1.246621256352038, "grad_norm": 0.16207090020179749, "learning_rate": 4.3958770928197604e-05, "loss": 0.4025, "step": 34590 }, { "epoch": 1.2468014560132628, "grad_norm": 0.18664728105068207, "learning_rate": 4.395686862350824e-05, "loss": 0.4154, "step": 34595 }, { "epoch": 1.2469816556744873, "grad_norm": 0.1926645040512085, "learning_rate": 4.395496606053522e-05, "loss": 0.4093, "step": 34600 }, { "epoch": 1.247161855335712, "grad_norm": 
0.16075952351093292, "learning_rate": 4.395306323930445e-05, "loss": 0.4158, "step": 34605 }, { "epoch": 1.2473420549969365, "grad_norm": 0.1828758716583252, "learning_rate": 4.3951160159841864e-05, "loss": 0.3779, "step": 34610 }, { "epoch": 1.2475222546581612, "grad_norm": 0.2084207683801651, "learning_rate": 4.394925682217339e-05, "loss": 0.4011, "step": 34615 }, { "epoch": 1.247702454319386, "grad_norm": 0.17692120373249054, "learning_rate": 4.3947353226324964e-05, "loss": 0.4318, "step": 34620 }, { "epoch": 1.2478826539806105, "grad_norm": 0.16040922701358795, "learning_rate": 4.394544937232252e-05, "loss": 0.396, "step": 34625 }, { "epoch": 1.2480628536418352, "grad_norm": 0.19275468587875366, "learning_rate": 4.3943545260192e-05, "loss": 0.4325, "step": 34630 }, { "epoch": 1.2482430533030597, "grad_norm": 0.1728275716304779, "learning_rate": 4.394164088995933e-05, "loss": 0.4156, "step": 34635 }, { "epoch": 1.2484232529642845, "grad_norm": 0.2221703678369522, "learning_rate": 4.393973626165048e-05, "loss": 0.3882, "step": 34640 }, { "epoch": 1.248603452625509, "grad_norm": 0.18804728984832764, "learning_rate": 4.393783137529139e-05, "loss": 0.4215, "step": 34645 }, { "epoch": 1.2487836522867337, "grad_norm": 0.15801158547401428, "learning_rate": 4.393592623090801e-05, "loss": 0.364, "step": 34650 }, { "epoch": 1.2489638519479582, "grad_norm": 0.1544542908668518, "learning_rate": 4.39340208285263e-05, "loss": 0.4171, "step": 34655 }, { "epoch": 1.249144051609183, "grad_norm": 0.19875742495059967, "learning_rate": 4.3932115168172225e-05, "loss": 0.4258, "step": 34660 }, { "epoch": 1.2493242512704077, "grad_norm": 0.16528360545635223, "learning_rate": 4.3930209249871744e-05, "loss": 0.4562, "step": 34665 }, { "epoch": 1.2495044509316322, "grad_norm": 0.18268528580665588, "learning_rate": 4.3928303073650835e-05, "loss": 0.407, "step": 34670 }, { "epoch": 1.249684650592857, "grad_norm": 0.1825907826423645, "learning_rate": 4.392639663953545e-05, "loss": 0.4098, 
"step": 34675 }, { "epoch": 1.2498648502540814, "grad_norm": 0.17392845451831818, "learning_rate": 4.3924489947551586e-05, "loss": 0.4377, "step": 34680 }, { "epoch": 1.2500450499153062, "grad_norm": 0.1909123808145523, "learning_rate": 4.39225829977252e-05, "loss": 0.4298, "step": 34685 }, { "epoch": 1.250225249576531, "grad_norm": 0.13296571373939514, "learning_rate": 4.392067579008229e-05, "loss": 0.4111, "step": 34690 }, { "epoch": 1.2504054492377554, "grad_norm": 0.21893556416034698, "learning_rate": 4.391876832464883e-05, "loss": 0.4422, "step": 34695 }, { "epoch": 1.25058564889898, "grad_norm": 0.18834765255451202, "learning_rate": 4.3916860601450825e-05, "loss": 0.4363, "step": 34700 }, { "epoch": 1.2507658485602047, "grad_norm": 0.18092414736747742, "learning_rate": 4.391495262051425e-05, "loss": 0.4227, "step": 34705 }, { "epoch": 1.2509460482214294, "grad_norm": 0.18126097321510315, "learning_rate": 4.391304438186511e-05, "loss": 0.3999, "step": 34710 }, { "epoch": 1.251126247882654, "grad_norm": 0.19100238382816315, "learning_rate": 4.39111358855294e-05, "loss": 0.3802, "step": 34715 }, { "epoch": 1.2513064475438787, "grad_norm": 0.18125472962856293, "learning_rate": 4.390922713153312e-05, "loss": 0.4184, "step": 34720 }, { "epoch": 1.2514866472051032, "grad_norm": 0.2104109823703766, "learning_rate": 4.3907318119902286e-05, "loss": 0.3942, "step": 34725 }, { "epoch": 1.251666846866328, "grad_norm": 0.18633076548576355, "learning_rate": 4.390540885066291e-05, "loss": 0.409, "step": 34730 }, { "epoch": 1.2518470465275526, "grad_norm": 0.18093355000019073, "learning_rate": 4.3903499323840985e-05, "loss": 0.411, "step": 34735 }, { "epoch": 1.2520272461887771, "grad_norm": 0.18245679140090942, "learning_rate": 4.390158953946255e-05, "loss": 0.4236, "step": 34740 }, { "epoch": 1.2522074458500019, "grad_norm": 0.16724418103694916, "learning_rate": 4.3899679497553616e-05, "loss": 0.4244, "step": 34745 }, { "epoch": 1.2523876455112264, "grad_norm": 
0.1758909821510315, "learning_rate": 4.3897769198140204e-05, "loss": 0.438, "step": 34750 }, { "epoch": 1.2525678451724511, "grad_norm": 0.17863698303699493, "learning_rate": 4.389585864124835e-05, "loss": 0.4163, "step": 34755 }, { "epoch": 1.2527480448336756, "grad_norm": 0.19687694311141968, "learning_rate": 4.389394782690408e-05, "loss": 0.4004, "step": 34760 }, { "epoch": 1.2529282444949004, "grad_norm": 0.17366833984851837, "learning_rate": 4.389203675513343e-05, "loss": 0.4307, "step": 34765 }, { "epoch": 1.2531084441561249, "grad_norm": 0.14887477457523346, "learning_rate": 4.389012542596244e-05, "loss": 0.435, "step": 34770 }, { "epoch": 1.2532886438173496, "grad_norm": 0.17449168860912323, "learning_rate": 4.388821383941714e-05, "loss": 0.4406, "step": 34775 }, { "epoch": 1.2534688434785743, "grad_norm": 0.1995227187871933, "learning_rate": 4.388630199552358e-05, "loss": 0.4066, "step": 34780 }, { "epoch": 1.2536490431397989, "grad_norm": 0.19903253018856049, "learning_rate": 4.388438989430782e-05, "loss": 0.459, "step": 34785 }, { "epoch": 1.2538292428010236, "grad_norm": 0.23275376856327057, "learning_rate": 4.3882477535795904e-05, "loss": 0.4503, "step": 34790 }, { "epoch": 1.254009442462248, "grad_norm": 0.17660613358020782, "learning_rate": 4.3880564920013885e-05, "loss": 0.3901, "step": 34795 }, { "epoch": 1.2541896421234728, "grad_norm": 0.1641213297843933, "learning_rate": 4.3878652046987824e-05, "loss": 0.4275, "step": 34800 }, { "epoch": 1.2543698417846976, "grad_norm": 0.19240440428256989, "learning_rate": 4.387673891674379e-05, "loss": 0.4089, "step": 34805 }, { "epoch": 1.254550041445922, "grad_norm": 0.18231160938739777, "learning_rate": 4.387482552930783e-05, "loss": 0.4007, "step": 34810 }, { "epoch": 1.2547302411071466, "grad_norm": 0.15601201355457306, "learning_rate": 4.387291188470603e-05, "loss": 0.4185, "step": 34815 }, { "epoch": 1.2549104407683713, "grad_norm": 0.1767847090959549, "learning_rate": 4.387099798296447e-05, "loss": 
0.4317, "step": 34820 }, { "epoch": 1.255090640429596, "grad_norm": 0.1789582371711731, "learning_rate": 4.386908382410922e-05, "loss": 0.4234, "step": 34825 }, { "epoch": 1.2552708400908206, "grad_norm": 0.1394336223602295, "learning_rate": 4.3867169408166333e-05, "loss": 0.3885, "step": 34830 }, { "epoch": 1.2554510397520453, "grad_norm": 0.1963426172733307, "learning_rate": 4.3865254735161934e-05, "loss": 0.3865, "step": 34835 }, { "epoch": 1.2556312394132698, "grad_norm": 0.19364525377750397, "learning_rate": 4.3863339805122086e-05, "loss": 0.4117, "step": 34840 }, { "epoch": 1.2558114390744946, "grad_norm": 0.19295062124729156, "learning_rate": 4.386142461807288e-05, "loss": 0.4395, "step": 34845 }, { "epoch": 1.2559916387357193, "grad_norm": 0.16617421805858612, "learning_rate": 4.385950917404042e-05, "loss": 0.3931, "step": 34850 }, { "epoch": 1.2561718383969438, "grad_norm": 0.16993173956871033, "learning_rate": 4.3857593473050804e-05, "loss": 0.4174, "step": 34855 }, { "epoch": 1.2563520380581685, "grad_norm": 0.1860906183719635, "learning_rate": 4.3855677515130125e-05, "loss": 0.4128, "step": 34860 }, { "epoch": 1.256532237719393, "grad_norm": 0.17956295609474182, "learning_rate": 4.385376130030448e-05, "loss": 0.4174, "step": 34865 }, { "epoch": 1.2567124373806178, "grad_norm": 0.17807215452194214, "learning_rate": 4.38518448286e-05, "loss": 0.4009, "step": 34870 }, { "epoch": 1.2568926370418423, "grad_norm": 0.15387162566184998, "learning_rate": 4.384992810004278e-05, "loss": 0.4004, "step": 34875 }, { "epoch": 1.257072836703067, "grad_norm": 0.1546095311641693, "learning_rate": 4.3848011114658934e-05, "loss": 0.3967, "step": 34880 }, { "epoch": 1.2572530363642915, "grad_norm": 0.18044285476207733, "learning_rate": 4.384609387247459e-05, "loss": 0.4163, "step": 34885 }, { "epoch": 1.2574332360255163, "grad_norm": 0.2026679515838623, "learning_rate": 4.384417637351587e-05, "loss": 0.4258, "step": 34890 }, { "epoch": 1.257613435686741, "grad_norm": 
0.15723782777786255, "learning_rate": 4.3842258617808895e-05, "loss": 0.395, "step": 34895 }, { "epoch": 1.2577936353479655, "grad_norm": 0.19395868480205536, "learning_rate": 4.384034060537979e-05, "loss": 0.425, "step": 34900 }, { "epoch": 1.2579738350091902, "grad_norm": 0.1924101859331131, "learning_rate": 4.383842233625469e-05, "loss": 0.432, "step": 34905 }, { "epoch": 1.2581540346704148, "grad_norm": 0.13849791884422302, "learning_rate": 4.383650381045974e-05, "loss": 0.4305, "step": 34910 }, { "epoch": 1.2583342343316395, "grad_norm": 0.16291098296642303, "learning_rate": 4.383458502802107e-05, "loss": 0.4011, "step": 34915 }, { "epoch": 1.2585144339928642, "grad_norm": 0.19101671874523163, "learning_rate": 4.383266598896482e-05, "loss": 0.4103, "step": 34920 }, { "epoch": 1.2586946336540887, "grad_norm": 0.21897098422050476, "learning_rate": 4.383074669331715e-05, "loss": 0.4145, "step": 34925 }, { "epoch": 1.2588748333153132, "grad_norm": 0.22563257813453674, "learning_rate": 4.3828827141104186e-05, "loss": 0.414, "step": 34930 }, { "epoch": 1.259055032976538, "grad_norm": 0.16169485449790955, "learning_rate": 4.382690733235212e-05, "loss": 0.4148, "step": 34935 }, { "epoch": 1.2592352326377627, "grad_norm": 0.20008553564548492, "learning_rate": 4.382498726708707e-05, "loss": 0.4406, "step": 34940 }, { "epoch": 1.2594154322989872, "grad_norm": 0.21007391810417175, "learning_rate": 4.3823066945335225e-05, "loss": 0.4308, "step": 34945 }, { "epoch": 1.259595631960212, "grad_norm": 0.17790304124355316, "learning_rate": 4.3821146367122726e-05, "loss": 0.3854, "step": 34950 }, { "epoch": 1.2597758316214365, "grad_norm": 0.221872016787529, "learning_rate": 4.381922553247576e-05, "loss": 0.3961, "step": 34955 }, { "epoch": 1.2599560312826612, "grad_norm": 0.16703324019908905, "learning_rate": 4.381730444142048e-05, "loss": 0.4415, "step": 34960 }, { "epoch": 1.260136230943886, "grad_norm": 0.21276666224002838, "learning_rate": 4.3815383093983084e-05, "loss": 
0.4316, "step": 34965 }, { "epoch": 1.2603164306051104, "grad_norm": 0.2033817023038864, "learning_rate": 4.381346149018973e-05, "loss": 0.409, "step": 34970 }, { "epoch": 1.260496630266335, "grad_norm": 0.15015016496181488, "learning_rate": 4.38115396300666e-05, "loss": 0.4541, "step": 34975 }, { "epoch": 1.2606768299275597, "grad_norm": 0.19646312296390533, "learning_rate": 4.3809617513639886e-05, "loss": 0.4066, "step": 34980 }, { "epoch": 1.2608570295887844, "grad_norm": 0.17837940156459808, "learning_rate": 4.380769514093578e-05, "loss": 0.4212, "step": 34985 }, { "epoch": 1.261037229250009, "grad_norm": 0.20663444697856903, "learning_rate": 4.380577251198047e-05, "loss": 0.4459, "step": 34990 }, { "epoch": 1.2612174289112337, "grad_norm": 0.20016391575336456, "learning_rate": 4.380384962680015e-05, "loss": 0.4125, "step": 34995 }, { "epoch": 1.2613976285724582, "grad_norm": 0.20151177048683167, "learning_rate": 4.380192648542101e-05, "loss": 0.4607, "step": 35000 }, { "epoch": 1.2613976285724582, "eval_loss": 0.44922810792922974, "eval_runtime": 3.5387, "eval_samples_per_second": 28.259, "eval_steps_per_second": 7.065, "step": 35000 }, { "epoch": 1.261577828233683, "grad_norm": 0.18609340488910675, "learning_rate": 4.380000308786927e-05, "loss": 0.4066, "step": 35005 }, { "epoch": 1.2617580278949077, "grad_norm": 0.18463589251041412, "learning_rate": 4.3798079434171124e-05, "loss": 0.4624, "step": 35010 }, { "epoch": 1.2619382275561322, "grad_norm": 0.21645288169384003, "learning_rate": 4.379615552435279e-05, "loss": 0.4172, "step": 35015 }, { "epoch": 1.262118427217357, "grad_norm": 0.18142879009246826, "learning_rate": 4.379423135844048e-05, "loss": 0.445, "step": 35020 }, { "epoch": 1.2622986268785814, "grad_norm": 0.17872460186481476, "learning_rate": 4.379230693646039e-05, "loss": 0.4329, "step": 35025 }, { "epoch": 1.2624788265398061, "grad_norm": 0.16483642160892487, "learning_rate": 4.3790382258438776e-05, "loss": 0.436, "step": 35030 }, { "epoch": 
1.2626590262010309, "grad_norm": 0.2143619805574417, "learning_rate": 4.3788457324401826e-05, "loss": 0.4184, "step": 35035 }, { "epoch": 1.2628392258622554, "grad_norm": 0.16062313318252563, "learning_rate": 4.378653213437579e-05, "loss": 0.4251, "step": 35040 }, { "epoch": 1.26301942552348, "grad_norm": 0.206187441945076, "learning_rate": 4.3784606688386885e-05, "loss": 0.4143, "step": 35045 }, { "epoch": 1.2631996251847046, "grad_norm": 0.15511402487754822, "learning_rate": 4.3782680986461356e-05, "loss": 0.3726, "step": 35050 }, { "epoch": 1.2633798248459294, "grad_norm": 0.15216435492038727, "learning_rate": 4.3780755028625434e-05, "loss": 0.407, "step": 35055 }, { "epoch": 1.2635600245071539, "grad_norm": 0.1635829508304596, "learning_rate": 4.377882881490536e-05, "loss": 0.4166, "step": 35060 }, { "epoch": 1.2637402241683786, "grad_norm": 0.2377825379371643, "learning_rate": 4.377690234532739e-05, "loss": 0.4311, "step": 35065 }, { "epoch": 1.2639204238296031, "grad_norm": 0.1933947205543518, "learning_rate": 4.3774975619917744e-05, "loss": 0.4266, "step": 35070 }, { "epoch": 1.2641006234908279, "grad_norm": 0.1821034997701645, "learning_rate": 4.3773048638702694e-05, "loss": 0.4347, "step": 35075 }, { "epoch": 1.2642808231520526, "grad_norm": 0.21266570687294006, "learning_rate": 4.3771121401708495e-05, "loss": 0.4509, "step": 35080 }, { "epoch": 1.264461022813277, "grad_norm": 0.16414852440357208, "learning_rate": 4.3769193908961405e-05, "loss": 0.44, "step": 35085 }, { "epoch": 1.2646412224745016, "grad_norm": 0.15532633662223816, "learning_rate": 4.3767266160487675e-05, "loss": 0.4466, "step": 35090 }, { "epoch": 1.2648214221357263, "grad_norm": 0.1961180716753006, "learning_rate": 4.376533815631357e-05, "loss": 0.4613, "step": 35095 }, { "epoch": 1.265001621796951, "grad_norm": 0.18854406476020813, "learning_rate": 4.3763409896465376e-05, "loss": 0.4098, "step": 35100 }, { "epoch": 1.2651818214581756, "grad_norm": 0.181373730301857, "learning_rate": 
4.376148138096936e-05, "loss": 0.4146, "step": 35105 }, { "epoch": 1.2653620211194003, "grad_norm": 0.16534672677516937, "learning_rate": 4.3759552609851785e-05, "loss": 0.4341, "step": 35110 }, { "epoch": 1.2655422207806248, "grad_norm": 0.22548224031925201, "learning_rate": 4.375762358313894e-05, "loss": 0.3947, "step": 35115 }, { "epoch": 1.2657224204418496, "grad_norm": 0.18645992875099182, "learning_rate": 4.37556943008571e-05, "loss": 0.4356, "step": 35120 }, { "epoch": 1.2659026201030743, "grad_norm": 0.1680716574192047, "learning_rate": 4.375376476303256e-05, "loss": 0.3941, "step": 35125 }, { "epoch": 1.2660828197642988, "grad_norm": 0.18578827381134033, "learning_rate": 4.375183496969161e-05, "loss": 0.4242, "step": 35130 }, { "epoch": 1.2662630194255236, "grad_norm": 0.1809617131948471, "learning_rate": 4.374990492086053e-05, "loss": 0.4275, "step": 35135 }, { "epoch": 1.266443219086748, "grad_norm": 0.21433116495609283, "learning_rate": 4.3747974616565634e-05, "loss": 0.4564, "step": 35140 }, { "epoch": 1.2666234187479728, "grad_norm": 0.18926386535167694, "learning_rate": 4.3746044056833205e-05, "loss": 0.4426, "step": 35145 }, { "epoch": 1.2668036184091975, "grad_norm": 0.19316570460796356, "learning_rate": 4.3744113241689565e-05, "loss": 0.4352, "step": 35150 }, { "epoch": 1.266983818070422, "grad_norm": 0.20651282370090485, "learning_rate": 4.3742182171161005e-05, "loss": 0.4239, "step": 35155 }, { "epoch": 1.2671640177316466, "grad_norm": 0.15775299072265625, "learning_rate": 4.3740250845273845e-05, "loss": 0.3809, "step": 35160 }, { "epoch": 1.2673442173928713, "grad_norm": 0.1828775703907013, "learning_rate": 4.373831926405439e-05, "loss": 0.4036, "step": 35165 }, { "epoch": 1.267524417054096, "grad_norm": 0.17841170728206635, "learning_rate": 4.373638742752897e-05, "loss": 0.4488, "step": 35170 }, { "epoch": 1.2677046167153205, "grad_norm": 0.1723710149526596, "learning_rate": 4.373445533572389e-05, "loss": 0.4132, "step": 35175 }, { "epoch": 
1.2678848163765453, "grad_norm": 0.2052830010652542, "learning_rate": 4.373252298866549e-05, "loss": 0.4367, "step": 35180 }, { "epoch": 1.2680650160377698, "grad_norm": 0.17017756402492523, "learning_rate": 4.3730590386380086e-05, "loss": 0.4235, "step": 35185 }, { "epoch": 1.2682452156989945, "grad_norm": 0.1878291219472885, "learning_rate": 4.372865752889402e-05, "loss": 0.4508, "step": 35190 }, { "epoch": 1.2684254153602192, "grad_norm": 0.1998129040002823, "learning_rate": 4.3726724416233625e-05, "loss": 0.4286, "step": 35195 }, { "epoch": 1.2686056150214438, "grad_norm": 0.1536918580532074, "learning_rate": 4.372479104842522e-05, "loss": 0.4135, "step": 35200 }, { "epoch": 1.2687858146826683, "grad_norm": 0.2284386157989502, "learning_rate": 4.372285742549517e-05, "loss": 0.4026, "step": 35205 }, { "epoch": 1.268966014343893, "grad_norm": 0.1657029688358307, "learning_rate": 4.372092354746982e-05, "loss": 0.4326, "step": 35210 }, { "epoch": 1.2691462140051177, "grad_norm": 0.1616412103176117, "learning_rate": 4.37189894143755e-05, "loss": 0.424, "step": 35215 }, { "epoch": 1.2693264136663422, "grad_norm": 0.18977968394756317, "learning_rate": 4.371705502623858e-05, "loss": 0.4001, "step": 35220 }, { "epoch": 1.269506613327567, "grad_norm": 0.20277686417102814, "learning_rate": 4.371512038308541e-05, "loss": 0.4667, "step": 35225 }, { "epoch": 1.2696868129887915, "grad_norm": 0.16292032599449158, "learning_rate": 4.371318548494234e-05, "loss": 0.4241, "step": 35230 }, { "epoch": 1.2698670126500162, "grad_norm": 0.18846076726913452, "learning_rate": 4.3711250331835754e-05, "loss": 0.3928, "step": 35235 }, { "epoch": 1.270047212311241, "grad_norm": 0.17085112631320953, "learning_rate": 4.370931492379199e-05, "loss": 0.4118, "step": 35240 }, { "epoch": 1.2702274119724655, "grad_norm": 0.18065665662288666, "learning_rate": 4.3707379260837444e-05, "loss": 0.4184, "step": 35245 }, { "epoch": 1.2704076116336902, "grad_norm": 0.278125137090683, "learning_rate": 
4.370544334299847e-05, "loss": 0.4567, "step": 35250 }, { "epoch": 1.2705878112949147, "grad_norm": 0.1816800832748413, "learning_rate": 4.3703507170301454e-05, "loss": 0.4395, "step": 35255 }, { "epoch": 1.2707680109561394, "grad_norm": 0.16282232105731964, "learning_rate": 4.370157074277278e-05, "loss": 0.3854, "step": 35260 }, { "epoch": 1.270948210617364, "grad_norm": 0.1948549896478653, "learning_rate": 4.369963406043881e-05, "loss": 0.4173, "step": 35265 }, { "epoch": 1.2711284102785887, "grad_norm": 0.18401391804218292, "learning_rate": 4.3697697123325956e-05, "loss": 0.3982, "step": 35270 }, { "epoch": 1.2713086099398132, "grad_norm": 0.17478926479816437, "learning_rate": 4.36957599314606e-05, "loss": 0.4542, "step": 35275 }, { "epoch": 1.271488809601038, "grad_norm": 0.20537783205509186, "learning_rate": 4.369382248486914e-05, "loss": 0.4193, "step": 35280 }, { "epoch": 1.2716690092622627, "grad_norm": 0.1872226595878601, "learning_rate": 4.3691884783577966e-05, "loss": 0.4263, "step": 35285 }, { "epoch": 1.2718492089234872, "grad_norm": 0.18654055893421173, "learning_rate": 4.368994682761347e-05, "loss": 0.4464, "step": 35290 }, { "epoch": 1.272029408584712, "grad_norm": 0.17078270018100739, "learning_rate": 4.3688008617002076e-05, "loss": 0.4367, "step": 35295 }, { "epoch": 1.2722096082459364, "grad_norm": 0.1947806477546692, "learning_rate": 4.368607015177018e-05, "loss": 0.4056, "step": 35300 }, { "epoch": 1.2723898079071612, "grad_norm": 0.17518731951713562, "learning_rate": 4.368413143194419e-05, "loss": 0.4531, "step": 35305 }, { "epoch": 1.272570007568386, "grad_norm": 0.19480538368225098, "learning_rate": 4.368219245755053e-05, "loss": 0.448, "step": 35310 }, { "epoch": 1.2727502072296104, "grad_norm": 0.15128688514232635, "learning_rate": 4.368025322861562e-05, "loss": 0.4321, "step": 35315 }, { "epoch": 1.272930406890835, "grad_norm": 0.16583947837352753, "learning_rate": 4.367831374516588e-05, "loss": 0.4399, "step": 35320 }, { "epoch": 
1.2731106065520597, "grad_norm": 0.22142437100410461, "learning_rate": 4.3676374007227715e-05, "loss": 0.4275, "step": 35325 }, { "epoch": 1.2732908062132844, "grad_norm": 0.1897389143705368, "learning_rate": 4.367443401482758e-05, "loss": 0.4079, "step": 35330 }, { "epoch": 1.273471005874509, "grad_norm": 0.18949352204799652, "learning_rate": 4.36724937679919e-05, "loss": 0.4659, "step": 35335 }, { "epoch": 1.2736512055357336, "grad_norm": 0.1753978729248047, "learning_rate": 4.367055326674711e-05, "loss": 0.4273, "step": 35340 }, { "epoch": 1.2738314051969581, "grad_norm": 0.20065420866012573, "learning_rate": 4.366861251111963e-05, "loss": 0.4463, "step": 35345 }, { "epoch": 1.2740116048581829, "grad_norm": 0.19253739714622498, "learning_rate": 4.366667150113594e-05, "loss": 0.4188, "step": 35350 }, { "epoch": 1.2741918045194076, "grad_norm": 0.1715114414691925, "learning_rate": 4.366473023682245e-05, "loss": 0.4355, "step": 35355 }, { "epoch": 1.2743720041806321, "grad_norm": 0.17242814600467682, "learning_rate": 4.3662788718205625e-05, "loss": 0.4026, "step": 35360 }, { "epoch": 1.2745522038418569, "grad_norm": 0.20738811790943146, "learning_rate": 4.366084694531192e-05, "loss": 0.4046, "step": 35365 }, { "epoch": 1.2747324035030814, "grad_norm": 0.19424347579479218, "learning_rate": 4.365890491816779e-05, "loss": 0.4434, "step": 35370 }, { "epoch": 1.274912603164306, "grad_norm": 0.21286608278751373, "learning_rate": 4.365696263679969e-05, "loss": 0.4375, "step": 35375 }, { "epoch": 1.2750928028255306, "grad_norm": 0.20882990956306458, "learning_rate": 4.365502010123409e-05, "loss": 0.4316, "step": 35380 }, { "epoch": 1.2752730024867553, "grad_norm": 0.21594053506851196, "learning_rate": 4.365307731149745e-05, "loss": 0.3852, "step": 35385 }, { "epoch": 1.2754532021479799, "grad_norm": 0.1830471307039261, "learning_rate": 4.3651134267616244e-05, "loss": 0.4404, "step": 35390 }, { "epoch": 1.2756334018092046, "grad_norm": 0.17027118802070618, "learning_rate": 
4.3649190969616946e-05, "loss": 0.4043, "step": 35395 }, { "epoch": 1.2758136014704293, "grad_norm": 0.16624897718429565, "learning_rate": 4.364724741752603e-05, "loss": 0.4311, "step": 35400 }, { "epoch": 1.2759938011316538, "grad_norm": 0.17142795026302338, "learning_rate": 4.364530361136998e-05, "loss": 0.4524, "step": 35405 }, { "epoch": 1.2761740007928786, "grad_norm": 0.197305366396904, "learning_rate": 4.364335955117528e-05, "loss": 0.4254, "step": 35410 }, { "epoch": 1.276354200454103, "grad_norm": 0.1855136901140213, "learning_rate": 4.364141523696841e-05, "loss": 0.4182, "step": 35415 }, { "epoch": 1.2765344001153278, "grad_norm": 0.1479266881942749, "learning_rate": 4.3639470668775865e-05, "loss": 0.3851, "step": 35420 }, { "epoch": 1.2767145997765526, "grad_norm": 0.18044587969779968, "learning_rate": 4.363752584662415e-05, "loss": 0.4205, "step": 35425 }, { "epoch": 1.276894799437777, "grad_norm": 0.2061435431241989, "learning_rate": 4.3635580770539744e-05, "loss": 0.4146, "step": 35430 }, { "epoch": 1.2770749990990016, "grad_norm": 0.2000425010919571, "learning_rate": 4.363363544054916e-05, "loss": 0.4167, "step": 35435 }, { "epoch": 1.2772551987602263, "grad_norm": 0.18255402147769928, "learning_rate": 4.3631689856678905e-05, "loss": 0.393, "step": 35440 }, { "epoch": 1.277435398421451, "grad_norm": 0.20933406054973602, "learning_rate": 4.362974401895547e-05, "loss": 0.4177, "step": 35445 }, { "epoch": 1.2776155980826756, "grad_norm": 0.1986079066991806, "learning_rate": 4.36277979274054e-05, "loss": 0.4176, "step": 35450 }, { "epoch": 1.2777957977439003, "grad_norm": 0.21557162702083588, "learning_rate": 4.3625851582055174e-05, "loss": 0.3766, "step": 35455 }, { "epoch": 1.2779759974051248, "grad_norm": 0.17857031524181366, "learning_rate": 4.362390498293134e-05, "loss": 0.4071, "step": 35460 }, { "epoch": 1.2781561970663495, "grad_norm": 0.19413699209690094, "learning_rate": 4.362195813006039e-05, "loss": 0.4288, "step": 35465 }, { "epoch": 
1.2783363967275743, "grad_norm": 0.16761010885238647, "learning_rate": 4.362001102346888e-05, "loss": 0.4273, "step": 35470 }, { "epoch": 1.2785165963887988, "grad_norm": 0.20636701583862305, "learning_rate": 4.3618063663183315e-05, "loss": 0.4188, "step": 35475 }, { "epoch": 1.2786967960500233, "grad_norm": 0.22335347533226013, "learning_rate": 4.361611604923025e-05, "loss": 0.4317, "step": 35480 }, { "epoch": 1.278876995711248, "grad_norm": 0.23554205894470215, "learning_rate": 4.361416818163619e-05, "loss": 0.4478, "step": 35485 }, { "epoch": 1.2790571953724728, "grad_norm": 0.22054627537727356, "learning_rate": 4.361222006042771e-05, "loss": 0.3942, "step": 35490 }, { "epoch": 1.2792373950336973, "grad_norm": 0.19049589335918427, "learning_rate": 4.361027168563132e-05, "loss": 0.4183, "step": 35495 }, { "epoch": 1.279417594694922, "grad_norm": 0.1891845464706421, "learning_rate": 4.360832305727359e-05, "loss": 0.4341, "step": 35500 }, { "epoch": 1.279417594694922, "eval_loss": 0.44991669058799744, "eval_runtime": 3.8804, "eval_samples_per_second": 25.771, "eval_steps_per_second": 6.443, "step": 35500 }, { "epoch": 1.2795977943561465, "grad_norm": 0.16972802579402924, "learning_rate": 4.360637417538106e-05, "loss": 0.3922, "step": 35505 }, { "epoch": 1.2797779940173712, "grad_norm": 0.15279355645179749, "learning_rate": 4.360442503998028e-05, "loss": 0.387, "step": 35510 }, { "epoch": 1.279958193678596, "grad_norm": 0.19364698231220245, "learning_rate": 4.360247565109782e-05, "loss": 0.4193, "step": 35515 }, { "epoch": 1.2801383933398205, "grad_norm": 0.13126322627067566, "learning_rate": 4.3600526008760226e-05, "loss": 0.3859, "step": 35520 }, { "epoch": 1.2803185930010452, "grad_norm": 0.18221980333328247, "learning_rate": 4.359857611299406e-05, "loss": 0.4055, "step": 35525 }, { "epoch": 1.2804987926622697, "grad_norm": 0.13313446938991547, "learning_rate": 4.35966259638259e-05, "loss": 0.4021, "step": 35530 }, { "epoch": 1.2806789923234945, "grad_norm": 
0.20920483767986298, "learning_rate": 4.359467556128232e-05, "loss": 0.4436, "step": 35535 }, { "epoch": 1.2808591919847192, "grad_norm": 0.15556688606739044, "learning_rate": 4.359272490538987e-05, "loss": 0.3783, "step": 35540 }, { "epoch": 1.2810393916459437, "grad_norm": 0.17486773431301117, "learning_rate": 4.359077399617515e-05, "loss": 0.4021, "step": 35545 }, { "epoch": 1.2812195913071682, "grad_norm": 0.1680872142314911, "learning_rate": 4.358882283366473e-05, "loss": 0.3872, "step": 35550 }, { "epoch": 1.281399790968393, "grad_norm": 0.17405681312084198, "learning_rate": 4.3586871417885204e-05, "loss": 0.4278, "step": 35555 }, { "epoch": 1.2815799906296177, "grad_norm": 0.1495596170425415, "learning_rate": 4.358491974886315e-05, "loss": 0.4176, "step": 35560 }, { "epoch": 1.2817601902908422, "grad_norm": 0.188494473695755, "learning_rate": 4.358296782662517e-05, "loss": 0.4321, "step": 35565 }, { "epoch": 1.281940389952067, "grad_norm": 0.20558683574199677, "learning_rate": 4.358101565119784e-05, "loss": 0.4239, "step": 35570 }, { "epoch": 1.2821205896132915, "grad_norm": 0.16552236676216125, "learning_rate": 4.3579063222607776e-05, "loss": 0.4232, "step": 35575 }, { "epoch": 1.2823007892745162, "grad_norm": 0.17347842454910278, "learning_rate": 4.357711054088157e-05, "loss": 0.4082, "step": 35580 }, { "epoch": 1.282480988935741, "grad_norm": 0.17773760855197906, "learning_rate": 4.357515760604583e-05, "loss": 0.3855, "step": 35585 }, { "epoch": 1.2826611885969654, "grad_norm": 0.16329875588417053, "learning_rate": 4.3573204418127165e-05, "loss": 0.3963, "step": 35590 }, { "epoch": 1.28284138825819, "grad_norm": 0.195121169090271, "learning_rate": 4.357125097715218e-05, "loss": 0.4459, "step": 35595 }, { "epoch": 1.2830215879194147, "grad_norm": 0.22871240973472595, "learning_rate": 4.35692972831475e-05, "loss": 0.4106, "step": 35600 }, { "epoch": 1.2832017875806394, "grad_norm": 0.13964316248893738, "learning_rate": 4.356734333613974e-05, "loss": 0.4066, 
"step": 35605 }, { "epoch": 1.283381987241864, "grad_norm": 0.16373074054718018, "learning_rate": 4.356538913615553e-05, "loss": 0.4039, "step": 35610 }, { "epoch": 1.2835621869030887, "grad_norm": 0.18498462438583374, "learning_rate": 4.3563434683221475e-05, "loss": 0.4177, "step": 35615 }, { "epoch": 1.2837423865643132, "grad_norm": 0.2500152289867401, "learning_rate": 4.356147997736422e-05, "loss": 0.4178, "step": 35620 }, { "epoch": 1.283922586225538, "grad_norm": 0.17622318863868713, "learning_rate": 4.3559525018610395e-05, "loss": 0.3932, "step": 35625 }, { "epoch": 1.2841027858867626, "grad_norm": 0.16591259837150574, "learning_rate": 4.355756980698664e-05, "loss": 0.402, "step": 35630 }, { "epoch": 1.2842829855479871, "grad_norm": 0.22070670127868652, "learning_rate": 4.355561434251958e-05, "loss": 0.4183, "step": 35635 }, { "epoch": 1.2844631852092119, "grad_norm": 0.205961674451828, "learning_rate": 4.3553658625235874e-05, "loss": 0.4136, "step": 35640 }, { "epoch": 1.2846433848704364, "grad_norm": 0.1596502810716629, "learning_rate": 4.355170265516216e-05, "loss": 0.4191, "step": 35645 }, { "epoch": 1.2848235845316611, "grad_norm": 0.15234126150608063, "learning_rate": 4.354974643232508e-05, "loss": 0.4161, "step": 35650 }, { "epoch": 1.2850037841928856, "grad_norm": 0.181291401386261, "learning_rate": 4.354778995675131e-05, "loss": 0.4216, "step": 35655 }, { "epoch": 1.2851839838541104, "grad_norm": 0.16666920483112335, "learning_rate": 4.354583322846748e-05, "loss": 0.4508, "step": 35660 }, { "epoch": 1.2853641835153349, "grad_norm": 0.22122518718242645, "learning_rate": 4.354387624750027e-05, "loss": 0.413, "step": 35665 }, { "epoch": 1.2855443831765596, "grad_norm": 0.18632946908473969, "learning_rate": 4.354191901387634e-05, "loss": 0.42, "step": 35670 }, { "epoch": 1.2857245828377843, "grad_norm": 0.16238003969192505, "learning_rate": 4.3539961527622345e-05, "loss": 0.3928, "step": 35675 }, { "epoch": 1.2859047824990089, "grad_norm": 
0.21126984059810638, "learning_rate": 4.353800378876497e-05, "loss": 0.3968, "step": 35680 }, { "epoch": 1.2860849821602336, "grad_norm": 0.17129042744636536, "learning_rate": 4.3536045797330885e-05, "loss": 0.4172, "step": 35685 }, { "epoch": 1.286265181821458, "grad_norm": 0.18168915808200836, "learning_rate": 4.353408755334676e-05, "loss": 0.4172, "step": 35690 }, { "epoch": 1.2864453814826828, "grad_norm": 0.15571817755699158, "learning_rate": 4.3532129056839274e-05, "loss": 0.3766, "step": 35695 }, { "epoch": 1.2866255811439076, "grad_norm": 0.2171332985162735, "learning_rate": 4.353017030783513e-05, "loss": 0.4286, "step": 35700 }, { "epoch": 1.286805780805132, "grad_norm": 0.1743398904800415, "learning_rate": 4.3528211306360986e-05, "loss": 0.4258, "step": 35705 }, { "epoch": 1.2869859804663566, "grad_norm": 0.1896362006664276, "learning_rate": 4.352625205244357e-05, "loss": 0.4056, "step": 35710 }, { "epoch": 1.2871661801275813, "grad_norm": 0.1765444278717041, "learning_rate": 4.352429254610955e-05, "loss": 0.4137, "step": 35715 }, { "epoch": 1.287346379788806, "grad_norm": 0.20674960315227509, "learning_rate": 4.352233278738562e-05, "loss": 0.4382, "step": 35720 }, { "epoch": 1.2875265794500306, "grad_norm": 0.1722467541694641, "learning_rate": 4.35203727762985e-05, "loss": 0.429, "step": 35725 }, { "epoch": 1.2877067791112553, "grad_norm": 0.1646967977285385, "learning_rate": 4.3518412512874885e-05, "loss": 0.3841, "step": 35730 }, { "epoch": 1.2878869787724798, "grad_norm": 0.19040453433990479, "learning_rate": 4.3516451997141485e-05, "loss": 0.4182, "step": 35735 }, { "epoch": 1.2880671784337046, "grad_norm": 0.1459205448627472, "learning_rate": 4.3514491229125015e-05, "loss": 0.3975, "step": 35740 }, { "epoch": 1.2882473780949293, "grad_norm": 0.1318344622850418, "learning_rate": 4.3512530208852185e-05, "loss": 0.4007, "step": 35745 }, { "epoch": 1.2884275777561538, "grad_norm": 0.21362341940402985, "learning_rate": 4.3510568936349714e-05, "loss": 
0.4328, "step": 35750 }, { "epoch": 1.2886077774173785, "grad_norm": 0.2101558893918991, "learning_rate": 4.350860741164432e-05, "loss": 0.468, "step": 35755 }, { "epoch": 1.288787977078603, "grad_norm": 0.1813066005706787, "learning_rate": 4.350664563476274e-05, "loss": 0.4452, "step": 35760 }, { "epoch": 1.2889681767398278, "grad_norm": 0.1613730937242508, "learning_rate": 4.35046836057317e-05, "loss": 0.4398, "step": 35765 }, { "epoch": 1.2891483764010523, "grad_norm": 0.17162039875984192, "learning_rate": 4.350272132457792e-05, "loss": 0.4311, "step": 35770 }, { "epoch": 1.289328576062277, "grad_norm": 0.16017718613147736, "learning_rate": 4.350075879132815e-05, "loss": 0.413, "step": 35775 }, { "epoch": 1.2895087757235015, "grad_norm": 0.16685138642787933, "learning_rate": 4.349879600600912e-05, "loss": 0.4324, "step": 35780 }, { "epoch": 1.2896889753847263, "grad_norm": 0.18246233463287354, "learning_rate": 4.349683296864758e-05, "loss": 0.4127, "step": 35785 }, { "epoch": 1.289869175045951, "grad_norm": 0.17236393690109253, "learning_rate": 4.349486967927027e-05, "loss": 0.4159, "step": 35790 }, { "epoch": 1.2900493747071755, "grad_norm": 0.17806027829647064, "learning_rate": 4.349290613790393e-05, "loss": 0.4077, "step": 35795 }, { "epoch": 1.2902295743684002, "grad_norm": 0.17833392322063446, "learning_rate": 4.3490942344575336e-05, "loss": 0.4335, "step": 35800 }, { "epoch": 1.2904097740296248, "grad_norm": 0.2036380022764206, "learning_rate": 4.348897829931123e-05, "loss": 0.4366, "step": 35805 }, { "epoch": 1.2905899736908495, "grad_norm": 0.1541140377521515, "learning_rate": 4.348701400213838e-05, "loss": 0.4208, "step": 35810 }, { "epoch": 1.2907701733520742, "grad_norm": 0.19483250379562378, "learning_rate": 4.3485049453083536e-05, "loss": 0.3919, "step": 35815 }, { "epoch": 1.2909503730132987, "grad_norm": 0.1808329075574875, "learning_rate": 4.348308465217348e-05, "loss": 0.4261, "step": 35820 }, { "epoch": 1.2911305726745232, "grad_norm": 
0.208456888794899, "learning_rate": 4.348111959943496e-05, "loss": 0.39, "step": 35825 }, { "epoch": 1.291310772335748, "grad_norm": 0.19897425174713135, "learning_rate": 4.3479154294894774e-05, "loss": 0.4752, "step": 35830 }, { "epoch": 1.2914909719969727, "grad_norm": 0.16709640622138977, "learning_rate": 4.347718873857969e-05, "loss": 0.4283, "step": 35835 }, { "epoch": 1.2916711716581972, "grad_norm": 0.19628377258777618, "learning_rate": 4.347522293051648e-05, "loss": 0.4476, "step": 35840 }, { "epoch": 1.291851371319422, "grad_norm": 0.19053253531455994, "learning_rate": 4.3473256870731935e-05, "loss": 0.4143, "step": 35845 }, { "epoch": 1.2920315709806465, "grad_norm": 0.16357672214508057, "learning_rate": 4.347129055925285e-05, "loss": 0.4099, "step": 35850 }, { "epoch": 1.2922117706418712, "grad_norm": 0.17562684416770935, "learning_rate": 4.3469323996106e-05, "loss": 0.3779, "step": 35855 }, { "epoch": 1.292391970303096, "grad_norm": 0.21830067038536072, "learning_rate": 4.346735718131819e-05, "loss": 0.4124, "step": 35860 }, { "epoch": 1.2925721699643204, "grad_norm": 0.21300473809242249, "learning_rate": 4.3465390114916206e-05, "loss": 0.3939, "step": 35865 }, { "epoch": 1.2927523696255452, "grad_norm": 0.15682797133922577, "learning_rate": 4.3463422796926864e-05, "loss": 0.3946, "step": 35870 }, { "epoch": 1.2929325692867697, "grad_norm": 0.1903832107782364, "learning_rate": 4.3461455227376956e-05, "loss": 0.4147, "step": 35875 }, { "epoch": 1.2931127689479944, "grad_norm": 0.1733843833208084, "learning_rate": 4.3459487406293296e-05, "loss": 0.4034, "step": 35880 }, { "epoch": 1.293292968609219, "grad_norm": 0.193365678191185, "learning_rate": 4.34575193337027e-05, "loss": 0.4185, "step": 35885 }, { "epoch": 1.2934731682704437, "grad_norm": 0.19296406209468842, "learning_rate": 4.345555100963198e-05, "loss": 0.4011, "step": 35890 }, { "epoch": 1.2936533679316682, "grad_norm": 0.16157175600528717, "learning_rate": 4.3453582434107934e-05, "loss": 
0.4164, "step": 35895 }, { "epoch": 1.293833567592893, "grad_norm": 0.17810019850730896, "learning_rate": 4.3451613607157416e-05, "loss": 0.394, "step": 35900 }, { "epoch": 1.2940137672541177, "grad_norm": 0.1655871868133545, "learning_rate": 4.344964452880723e-05, "loss": 0.3975, "step": 35905 }, { "epoch": 1.2941939669153422, "grad_norm": 0.24324651062488556, "learning_rate": 4.3447675199084204e-05, "loss": 0.4328, "step": 35910 }, { "epoch": 1.294374166576567, "grad_norm": 0.15802602469921112, "learning_rate": 4.344570561801518e-05, "loss": 0.4295, "step": 35915 }, { "epoch": 1.2945543662377914, "grad_norm": 0.1700577437877655, "learning_rate": 4.344373578562698e-05, "loss": 0.3671, "step": 35920 }, { "epoch": 1.2947345658990161, "grad_norm": 0.1449105143547058, "learning_rate": 4.3441765701946455e-05, "loss": 0.4034, "step": 35925 }, { "epoch": 1.2949147655602409, "grad_norm": 0.18655553460121155, "learning_rate": 4.343979536700045e-05, "loss": 0.3888, "step": 35930 }, { "epoch": 1.2950949652214654, "grad_norm": 0.1947435736656189, "learning_rate": 4.34378247808158e-05, "loss": 0.4335, "step": 35935 }, { "epoch": 1.29527516488269, "grad_norm": 0.1621570587158203, "learning_rate": 4.343585394341936e-05, "loss": 0.4024, "step": 35940 }, { "epoch": 1.2954553645439146, "grad_norm": 0.15728051960468292, "learning_rate": 4.343388285483797e-05, "loss": 0.3716, "step": 35945 }, { "epoch": 1.2956355642051394, "grad_norm": 0.17615413665771484, "learning_rate": 4.34319115150985e-05, "loss": 0.4256, "step": 35950 }, { "epoch": 1.2958157638663639, "grad_norm": 0.17523658275604248, "learning_rate": 4.3429939924227806e-05, "loss": 0.4552, "step": 35955 }, { "epoch": 1.2959959635275886, "grad_norm": 0.15205228328704834, "learning_rate": 4.3427968082252744e-05, "loss": 0.4318, "step": 35960 }, { "epoch": 1.2961761631888131, "grad_norm": 0.18132284283638, "learning_rate": 4.3425995989200184e-05, "loss": 0.4419, "step": 35965 }, { "epoch": 1.2963563628500379, "grad_norm": 
0.16100190579891205, "learning_rate": 4.3424023645097e-05, "loss": 0.4099, "step": 35970 }, { "epoch": 1.2965365625112626, "grad_norm": 0.22151976823806763, "learning_rate": 4.342205104997006e-05, "loss": 0.456, "step": 35975 }, { "epoch": 1.296716762172487, "grad_norm": 0.19415588676929474, "learning_rate": 4.3420078203846245e-05, "loss": 0.4543, "step": 35980 }, { "epoch": 1.2968969618337116, "grad_norm": 0.18258939683437347, "learning_rate": 4.341810510675243e-05, "loss": 0.429, "step": 35985 }, { "epoch": 1.2970771614949363, "grad_norm": 0.20076502859592438, "learning_rate": 4.3416131758715496e-05, "loss": 0.4105, "step": 35990 }, { "epoch": 1.297257361156161, "grad_norm": 0.1802350878715515, "learning_rate": 4.3414158159762334e-05, "loss": 0.4423, "step": 35995 }, { "epoch": 1.2974375608173856, "grad_norm": 0.2174658179283142, "learning_rate": 4.341218430991982e-05, "loss": 0.4242, "step": 36000 }, { "epoch": 1.2974375608173856, "eval_loss": 0.4492260217666626, "eval_runtime": 3.5485, "eval_samples_per_second": 28.181, "eval_steps_per_second": 7.045, "step": 36000 }, { "epoch": 1.2976177604786103, "grad_norm": 0.1733621507883072, "learning_rate": 4.3410210209214875e-05, "loss": 0.4091, "step": 36005 }, { "epoch": 1.2977979601398348, "grad_norm": 0.20660637319087982, "learning_rate": 4.3408235857674376e-05, "loss": 0.4355, "step": 36010 }, { "epoch": 1.2979781598010596, "grad_norm": 0.16919872164726257, "learning_rate": 4.340626125532522e-05, "loss": 0.4069, "step": 36015 }, { "epoch": 1.2981583594622843, "grad_norm": 0.19883739948272705, "learning_rate": 4.3404286402194326e-05, "loss": 0.4177, "step": 36020 }, { "epoch": 1.2983385591235088, "grad_norm": 0.13305337727069855, "learning_rate": 4.340231129830859e-05, "loss": 0.4168, "step": 36025 }, { "epoch": 1.2985187587847336, "grad_norm": 0.19353942573070526, "learning_rate": 4.3400335943694925e-05, "loss": 0.3978, "step": 36030 }, { "epoch": 1.298698958445958, "grad_norm": 0.2054118812084198, "learning_rate": 
4.339836033838025e-05, "loss": 0.4347, "step": 36035 }, { "epoch": 1.2988791581071828, "grad_norm": 0.16488704085350037, "learning_rate": 4.339638448239147e-05, "loss": 0.3811, "step": 36040 }, { "epoch": 1.2990593577684075, "grad_norm": 0.1690049171447754, "learning_rate": 4.3394408375755526e-05, "loss": 0.4393, "step": 36045 }, { "epoch": 1.299239557429632, "grad_norm": 0.21596601605415344, "learning_rate": 4.339243201849932e-05, "loss": 0.4089, "step": 36050 }, { "epoch": 1.2994197570908566, "grad_norm": 0.17720189690589905, "learning_rate": 4.339045541064978e-05, "loss": 0.4117, "step": 36055 }, { "epoch": 1.2995999567520813, "grad_norm": 0.16456130146980286, "learning_rate": 4.3388478552233856e-05, "loss": 0.3757, "step": 36060 }, { "epoch": 1.299780156413306, "grad_norm": 0.1337253451347351, "learning_rate": 4.338650144327847e-05, "loss": 0.4465, "step": 36065 }, { "epoch": 1.2999603560745305, "grad_norm": 0.17448823153972626, "learning_rate": 4.338452408381056e-05, "loss": 0.4445, "step": 36070 }, { "epoch": 1.3001405557357553, "grad_norm": 0.1743369847536087, "learning_rate": 4.338254647385708e-05, "loss": 0.4016, "step": 36075 }, { "epoch": 1.3003207553969798, "grad_norm": 0.2010771483182907, "learning_rate": 4.338056861344495e-05, "loss": 0.4215, "step": 36080 }, { "epoch": 1.3005009550582045, "grad_norm": 0.21414510905742645, "learning_rate": 4.337859050260113e-05, "loss": 0.4478, "step": 36085 }, { "epoch": 1.3006811547194292, "grad_norm": 0.20338131487369537, "learning_rate": 4.337661214135258e-05, "loss": 0.4265, "step": 36090 }, { "epoch": 1.3008613543806538, "grad_norm": 0.17728644609451294, "learning_rate": 4.3374633529726247e-05, "loss": 0.4039, "step": 36095 }, { "epoch": 1.3010415540418783, "grad_norm": 0.13623587787151337, "learning_rate": 4.3372654667749086e-05, "loss": 0.3892, "step": 36100 }, { "epoch": 1.301221753703103, "grad_norm": 0.20998680591583252, "learning_rate": 4.337067555544806e-05, "loss": 0.4134, "step": 36105 }, { "epoch": 
1.3014019533643277, "grad_norm": 0.1900775134563446, "learning_rate": 4.336869619285014e-05, "loss": 0.4382, "step": 36110 }, { "epoch": 1.3015821530255522, "grad_norm": 0.17663796246051788, "learning_rate": 4.33667165799823e-05, "loss": 0.4058, "step": 36115 }, { "epoch": 1.301762352686777, "grad_norm": 0.17739155888557434, "learning_rate": 4.336473671687149e-05, "loss": 0.3889, "step": 36120 }, { "epoch": 1.3019425523480015, "grad_norm": 0.204684317111969, "learning_rate": 4.33627566035447e-05, "loss": 0.435, "step": 36125 }, { "epoch": 1.3021227520092262, "grad_norm": 0.17078737914562225, "learning_rate": 4.336077624002891e-05, "loss": 0.3845, "step": 36130 }, { "epoch": 1.302302951670451, "grad_norm": 0.20431089401245117, "learning_rate": 4.33587956263511e-05, "loss": 0.3966, "step": 36135 }, { "epoch": 1.3024831513316755, "grad_norm": 0.18561053276062012, "learning_rate": 4.335681476253824e-05, "loss": 0.4092, "step": 36140 }, { "epoch": 1.3026633509929002, "grad_norm": 0.1851317286491394, "learning_rate": 4.335483364861734e-05, "loss": 0.4035, "step": 36145 }, { "epoch": 1.3028435506541247, "grad_norm": 0.18310926854610443, "learning_rate": 4.3352852284615395e-05, "loss": 0.4261, "step": 36150 }, { "epoch": 1.3030237503153494, "grad_norm": 0.19984884560108185, "learning_rate": 4.335087067055938e-05, "loss": 0.4395, "step": 36155 }, { "epoch": 1.303203949976574, "grad_norm": 0.17451319098472595, "learning_rate": 4.334888880647631e-05, "loss": 0.4466, "step": 36160 }, { "epoch": 1.3033841496377987, "grad_norm": 0.18329133093357086, "learning_rate": 4.3346906692393184e-05, "loss": 0.4218, "step": 36165 }, { "epoch": 1.3035643492990232, "grad_norm": 0.19069811701774597, "learning_rate": 4.3344924328337e-05, "loss": 0.4008, "step": 36170 }, { "epoch": 1.303744548960248, "grad_norm": 0.18040628731250763, "learning_rate": 4.334294171433478e-05, "loss": 0.3912, "step": 36175 }, { "epoch": 1.3039247486214727, "grad_norm": 0.20285740494728088, "learning_rate": 
4.3340958850413526e-05, "loss": 0.4664, "step": 36180 }, { "epoch": 1.3041049482826972, "grad_norm": 0.2134329378604889, "learning_rate": 4.3338975736600266e-05, "loss": 0.4236, "step": 36185 }, { "epoch": 1.304285147943922, "grad_norm": 0.18839861452579498, "learning_rate": 4.3336992372922e-05, "loss": 0.4059, "step": 36190 }, { "epoch": 1.3044653476051464, "grad_norm": 0.22417490184307098, "learning_rate": 4.333500875940577e-05, "loss": 0.4422, "step": 36195 }, { "epoch": 1.3046455472663712, "grad_norm": 0.1869698464870453, "learning_rate": 4.33330248960786e-05, "loss": 0.4479, "step": 36200 }, { "epoch": 1.304825746927596, "grad_norm": 0.14332622289657593, "learning_rate": 4.33310407829675e-05, "loss": 0.3966, "step": 36205 }, { "epoch": 1.3050059465888204, "grad_norm": 0.23910625278949738, "learning_rate": 4.3329056420099534e-05, "loss": 0.409, "step": 36210 }, { "epoch": 1.305186146250045, "grad_norm": 0.178507000207901, "learning_rate": 4.332707180750172e-05, "loss": 0.4216, "step": 36215 }, { "epoch": 1.3053663459112697, "grad_norm": 0.1948593556880951, "learning_rate": 4.3325086945201096e-05, "loss": 0.3968, "step": 36220 }, { "epoch": 1.3055465455724944, "grad_norm": 0.2139938473701477, "learning_rate": 4.332310183322471e-05, "loss": 0.4525, "step": 36225 }, { "epoch": 1.305726745233719, "grad_norm": 0.1733957827091217, "learning_rate": 4.332111647159962e-05, "loss": 0.4319, "step": 36230 }, { "epoch": 1.3059069448949436, "grad_norm": 0.18427705764770508, "learning_rate": 4.331913086035285e-05, "loss": 0.4355, "step": 36235 }, { "epoch": 1.3060871445561681, "grad_norm": 0.17030684649944305, "learning_rate": 4.3317144999511474e-05, "loss": 0.4461, "step": 36240 }, { "epoch": 1.3062673442173929, "grad_norm": 0.17609108984470367, "learning_rate": 4.3315158889102546e-05, "loss": 0.4154, "step": 36245 }, { "epoch": 1.3064475438786176, "grad_norm": 0.22373346984386444, "learning_rate": 4.3313172529153124e-05, "loss": 0.394, "step": 36250 }, { "epoch": 
1.3066277435398421, "grad_norm": 0.18238912522792816, "learning_rate": 4.331118591969027e-05, "loss": 0.4212, "step": 36255 }, { "epoch": 1.3068079432010669, "grad_norm": 0.17490154504776, "learning_rate": 4.330919906074106e-05, "loss": 0.3987, "step": 36260 }, { "epoch": 1.3069881428622914, "grad_norm": 0.17882512509822845, "learning_rate": 4.330721195233255e-05, "loss": 0.4248, "step": 36265 }, { "epoch": 1.307168342523516, "grad_norm": 0.21549130976200104, "learning_rate": 4.330522459449182e-05, "loss": 0.4354, "step": 36270 }, { "epoch": 1.3073485421847406, "grad_norm": 0.19337189197540283, "learning_rate": 4.330323698724596e-05, "loss": 0.4282, "step": 36275 }, { "epoch": 1.3075287418459653, "grad_norm": 0.16519051790237427, "learning_rate": 4.330124913062203e-05, "loss": 0.4204, "step": 36280 }, { "epoch": 1.3077089415071899, "grad_norm": 0.18474717438220978, "learning_rate": 4.329926102464712e-05, "loss": 0.4295, "step": 36285 }, { "epoch": 1.3078891411684146, "grad_norm": 0.21671119332313538, "learning_rate": 4.3297272669348325e-05, "loss": 0.4407, "step": 36290 }, { "epoch": 1.3080693408296393, "grad_norm": null, "learning_rate": 4.32956818056143e-05, "loss": 0.4212, "step": 36295 }, { "epoch": 1.3082495404908638, "grad_norm": 0.16523273289203644, "learning_rate": 4.329369300160078e-05, "loss": 0.4094, "step": 36300 }, { "epoch": 1.3084297401520886, "grad_norm": 0.18672731518745422, "learning_rate": 4.329170394833923e-05, "loss": 0.4582, "step": 36305 }, { "epoch": 1.308609939813313, "grad_norm": 0.19107048213481903, "learning_rate": 4.328971464585676e-05, "loss": 0.3931, "step": 36310 }, { "epoch": 1.3087901394745378, "grad_norm": 0.17582692205905914, "learning_rate": 4.3287725094180466e-05, "loss": 0.3489, "step": 36315 }, { "epoch": 1.3089703391357626, "grad_norm": 0.20227184891700745, "learning_rate": 4.328573529333746e-05, "loss": 0.4136, "step": 36320 }, { "epoch": 1.309150538796987, "grad_norm": 0.17152424156665802, "learning_rate": 
4.328374524335485e-05, "loss": 0.4413, "step": 36325 }, { "epoch": 1.3093307384582116, "grad_norm": 0.20434246957302094, "learning_rate": 4.328175494425975e-05, "loss": 0.4357, "step": 36330 }, { "epoch": 1.3095109381194363, "grad_norm": 0.18232357501983643, "learning_rate": 4.327976439607928e-05, "loss": 0.4028, "step": 36335 }, { "epoch": 1.309691137780661, "grad_norm": 0.1636376529932022, "learning_rate": 4.327777359884056e-05, "loss": 0.3889, "step": 36340 }, { "epoch": 1.3098713374418856, "grad_norm": 0.21091686189174652, "learning_rate": 4.327578255257071e-05, "loss": 0.4193, "step": 36345 }, { "epoch": 1.3100515371031103, "grad_norm": 0.2029031366109848, "learning_rate": 4.327379125729687e-05, "loss": 0.4471, "step": 36350 }, { "epoch": 1.3102317367643348, "grad_norm": 0.17957919836044312, "learning_rate": 4.327179971304615e-05, "loss": 0.384, "step": 36355 }, { "epoch": 1.3104119364255595, "grad_norm": 0.2107733190059662, "learning_rate": 4.3269807919845705e-05, "loss": 0.3978, "step": 36360 }, { "epoch": 1.3105921360867843, "grad_norm": 0.16910696029663086, "learning_rate": 4.326781587772266e-05, "loss": 0.4386, "step": 36365 }, { "epoch": 1.3107723357480088, "grad_norm": 0.18517418205738068, "learning_rate": 4.326582358670416e-05, "loss": 0.3967, "step": 36370 }, { "epoch": 1.3109525354092335, "grad_norm": 0.21739526093006134, "learning_rate": 4.326383104681735e-05, "loss": 0.4325, "step": 36375 }, { "epoch": 1.311132735070458, "grad_norm": 0.18001195788383484, "learning_rate": 4.3261838258089384e-05, "loss": 0.4216, "step": 36380 }, { "epoch": 1.3113129347316828, "grad_norm": 0.2097054123878479, "learning_rate": 4.32598452205474e-05, "loss": 0.4309, "step": 36385 }, { "epoch": 1.3114931343929073, "grad_norm": 0.15929162502288818, "learning_rate": 4.325785193421856e-05, "loss": 0.4344, "step": 36390 }, { "epoch": 1.311673334054132, "grad_norm": 0.17544053494930267, "learning_rate": 4.325585839913003e-05, "loss": 0.4344, "step": 36395 }, { "epoch": 
1.3118535337153565, "grad_norm": 0.2082267850637436, "learning_rate": 4.3253864615308956e-05, "loss": 0.4096, "step": 36400 }, { "epoch": 1.3120337333765812, "grad_norm": 0.1683303415775299, "learning_rate": 4.3251870582782516e-05, "loss": 0.4234, "step": 36405 }, { "epoch": 1.312213933037806, "grad_norm": 0.15545429289340973, "learning_rate": 4.3249876301577877e-05, "loss": 0.3982, "step": 36410 }, { "epoch": 1.3123941326990305, "grad_norm": 0.22306688129901886, "learning_rate": 4.3247881771722195e-05, "loss": 0.4617, "step": 36415 }, { "epoch": 1.3125743323602552, "grad_norm": 0.19142894446849823, "learning_rate": 4.3245886993242666e-05, "loss": 0.4333, "step": 36420 }, { "epoch": 1.3127545320214797, "grad_norm": 0.14523069560527802, "learning_rate": 4.324389196616645e-05, "loss": 0.394, "step": 36425 }, { "epoch": 1.3129347316827045, "grad_norm": 0.1527920961380005, "learning_rate": 4.3241896690520746e-05, "loss": 0.448, "step": 36430 }, { "epoch": 1.3131149313439292, "grad_norm": 0.1837022453546524, "learning_rate": 4.323990116633273e-05, "loss": 0.3912, "step": 36435 }, { "epoch": 1.3132951310051537, "grad_norm": 0.192708358168602, "learning_rate": 4.323790539362958e-05, "loss": 0.4349, "step": 36440 }, { "epoch": 1.3134753306663782, "grad_norm": 0.16803932189941406, "learning_rate": 4.323590937243852e-05, "loss": 0.4473, "step": 36445 }, { "epoch": 1.313655530327603, "grad_norm": 0.18764148652553558, "learning_rate": 4.323391310278672e-05, "loss": 0.4169, "step": 36450 }, { "epoch": 1.3138357299888277, "grad_norm": 0.14096736907958984, "learning_rate": 4.3231916584701374e-05, "loss": 0.389, "step": 36455 }, { "epoch": 1.3140159296500522, "grad_norm": 0.16323328018188477, "learning_rate": 4.32299198182097e-05, "loss": 0.3844, "step": 36460 }, { "epoch": 1.314196129311277, "grad_norm": 0.2195003479719162, "learning_rate": 4.32279228033389e-05, "loss": 0.4448, "step": 36465 }, { "epoch": 1.3143763289725015, "grad_norm": 0.1823636144399643, "learning_rate": 
4.3225925540116174e-05, "loss": 0.3907, "step": 36470 }, { "epoch": 1.3145565286337262, "grad_norm": 0.1974102109670639, "learning_rate": 4.322392802856875e-05, "loss": 0.4337, "step": 36475 }, { "epoch": 1.314736728294951, "grad_norm": 0.14405225217342377, "learning_rate": 4.3221930268723834e-05, "loss": 0.3845, "step": 36480 }, { "epoch": 1.3149169279561754, "grad_norm": 0.15730611979961395, "learning_rate": 4.321993226060864e-05, "loss": 0.4251, "step": 36485 }, { "epoch": 1.3150971276174, "grad_norm": 0.17731639742851257, "learning_rate": 4.3217934004250396e-05, "loss": 0.4128, "step": 36490 }, { "epoch": 1.3152773272786247, "grad_norm": 0.17964981496334076, "learning_rate": 4.321593549967634e-05, "loss": 0.4303, "step": 36495 }, { "epoch": 1.3154575269398494, "grad_norm": 0.17006689310073853, "learning_rate": 4.3213936746913675e-05, "loss": 0.418, "step": 36500 }, { "epoch": 1.3154575269398494, "eval_loss": 0.4494481384754181, "eval_runtime": 3.5496, "eval_samples_per_second": 28.172, "eval_steps_per_second": 7.043, "step": 36500 }, { "epoch": 1.315637726601074, "grad_norm": 0.18565762042999268, "learning_rate": 4.321193774598966e-05, "loss": 0.4313, "step": 36505 }, { "epoch": 1.3158179262622987, "grad_norm": 0.18812930583953857, "learning_rate": 4.3209938496931514e-05, "loss": 0.4287, "step": 36510 }, { "epoch": 1.3159981259235232, "grad_norm": 0.21864257752895355, "learning_rate": 4.320793899976648e-05, "loss": 0.4444, "step": 36515 }, { "epoch": 1.316178325584748, "grad_norm": 0.18846680223941803, "learning_rate": 4.32059392545218e-05, "loss": 0.4351, "step": 36520 }, { "epoch": 1.3163585252459726, "grad_norm": 0.2109052538871765, "learning_rate": 4.320393926122473e-05, "loss": 0.3791, "step": 36525 }, { "epoch": 1.3165387249071971, "grad_norm": 0.16090500354766846, "learning_rate": 4.320193901990251e-05, "loss": 0.4113, "step": 36530 }, { "epoch": 1.3167189245684219, "grad_norm": 0.15768031775951385, "learning_rate": 4.31999385305824e-05, "loss": 0.3952, 
"step": 36535 }, { "epoch": 1.3168991242296464, "grad_norm": 0.18743303418159485, "learning_rate": 4.319793779329163e-05, "loss": 0.4256, "step": 36540 }, { "epoch": 1.3170793238908711, "grad_norm": 0.2007930725812912, "learning_rate": 4.31959368080575e-05, "loss": 0.4352, "step": 36545 }, { "epoch": 1.3172595235520959, "grad_norm": 0.21530933678150177, "learning_rate": 4.319393557490725e-05, "loss": 0.4444, "step": 36550 }, { "epoch": 1.3174397232133204, "grad_norm": 0.173873171210289, "learning_rate": 4.3191934093868146e-05, "loss": 0.4345, "step": 36555 }, { "epoch": 1.3176199228745449, "grad_norm": 0.17242655158042908, "learning_rate": 4.318993236496747e-05, "loss": 0.3883, "step": 36560 }, { "epoch": 1.3178001225357696, "grad_norm": 0.14753524959087372, "learning_rate": 4.318793038823248e-05, "loss": 0.4319, "step": 36565 }, { "epoch": 1.3179803221969943, "grad_norm": 0.18761220574378967, "learning_rate": 4.318592816369046e-05, "loss": 0.4511, "step": 36570 }, { "epoch": 1.3181605218582189, "grad_norm": 0.15937355160713196, "learning_rate": 4.3183925691368695e-05, "loss": 0.3816, "step": 36575 }, { "epoch": 1.3183407215194436, "grad_norm": 0.18989646434783936, "learning_rate": 4.318192297129446e-05, "loss": 0.3873, "step": 36580 }, { "epoch": 1.318520921180668, "grad_norm": 0.1784542053937912, "learning_rate": 4.3179920003495045e-05, "loss": 0.4507, "step": 36585 }, { "epoch": 1.3187011208418928, "grad_norm": 0.162387415766716, "learning_rate": 4.3177916787997735e-05, "loss": 0.4457, "step": 36590 }, { "epoch": 1.3188813205031176, "grad_norm": 0.1939140409231186, "learning_rate": 4.3175913324829834e-05, "loss": 0.377, "step": 36595 }, { "epoch": 1.319061520164342, "grad_norm": 0.19154348969459534, "learning_rate": 4.317390961401862e-05, "loss": 0.3741, "step": 36600 }, { "epoch": 1.3192417198255666, "grad_norm": 0.18536292016506195, "learning_rate": 4.3171905655591425e-05, "loss": 0.3934, "step": 36605 }, { "epoch": 1.3194219194867913, "grad_norm": 
0.1465887874364853, "learning_rate": 4.316990144957553e-05, "loss": 0.4129, "step": 36610 }, { "epoch": 1.319602119148016, "grad_norm": 0.21748924255371094, "learning_rate": 4.316789699599824e-05, "loss": 0.4058, "step": 36615 }, { "epoch": 1.3197823188092406, "grad_norm": 0.18273966014385223, "learning_rate": 4.316589229488687e-05, "loss": 0.429, "step": 36620 }, { "epoch": 1.3199625184704653, "grad_norm": 0.21654346585273743, "learning_rate": 4.3163887346268735e-05, "loss": 0.4469, "step": 36625 }, { "epoch": 1.3201427181316898, "grad_norm": 0.18244554102420807, "learning_rate": 4.316188215017116e-05, "loss": 0.4285, "step": 36630 }, { "epoch": 1.3203229177929146, "grad_norm": 0.20105114579200745, "learning_rate": 4.315987670662145e-05, "loss": 0.4308, "step": 36635 }, { "epoch": 1.3205031174541393, "grad_norm": 0.20013387501239777, "learning_rate": 4.315787101564693e-05, "loss": 0.4312, "step": 36640 }, { "epoch": 1.3206833171153638, "grad_norm": 0.20948466658592224, "learning_rate": 4.315586507727494e-05, "loss": 0.4199, "step": 36645 }, { "epoch": 1.3208635167765885, "grad_norm": 0.19690115749835968, "learning_rate": 4.3153858891532804e-05, "loss": 0.4365, "step": 36650 }, { "epoch": 1.321043716437813, "grad_norm": 0.17856040596961975, "learning_rate": 4.315185245844785e-05, "loss": 0.4259, "step": 36655 }, { "epoch": 1.3212239160990378, "grad_norm": 0.16999214887619019, "learning_rate": 4.314984577804743e-05, "loss": 0.4338, "step": 36660 }, { "epoch": 1.3214041157602623, "grad_norm": 0.1723853200674057, "learning_rate": 4.314783885035887e-05, "loss": 0.4113, "step": 36665 }, { "epoch": 1.321584315421487, "grad_norm": 0.19773799180984497, "learning_rate": 4.314583167540952e-05, "loss": 0.436, "step": 36670 }, { "epoch": 1.3217645150827115, "grad_norm": 0.15514503419399261, "learning_rate": 4.314382425322672e-05, "loss": 0.4356, "step": 36675 }, { "epoch": 1.3219447147439363, "grad_norm": 0.20653900504112244, "learning_rate": 4.314181658383783e-05, "loss": 
0.3912, "step": 36680 }, { "epoch": 1.322124914405161, "grad_norm": 0.15066716074943542, "learning_rate": 4.313980866727021e-05, "loss": 0.4433, "step": 36685 }, { "epoch": 1.3223051140663855, "grad_norm": 0.1296568661928177, "learning_rate": 4.313780050355119e-05, "loss": 0.3966, "step": 36690 }, { "epoch": 1.3224853137276102, "grad_norm": 0.2106575220823288, "learning_rate": 4.313579209270817e-05, "loss": 0.3952, "step": 36695 }, { "epoch": 1.3226655133888348, "grad_norm": 0.1847856044769287, "learning_rate": 4.313378343476849e-05, "loss": 0.4303, "step": 36700 }, { "epoch": 1.3228457130500595, "grad_norm": 0.20785640180110931, "learning_rate": 4.313177452975952e-05, "loss": 0.4227, "step": 36705 }, { "epoch": 1.3230259127112842, "grad_norm": 0.14709509909152985, "learning_rate": 4.312976537770863e-05, "loss": 0.4353, "step": 36710 }, { "epoch": 1.3232061123725087, "grad_norm": 0.19536839425563812, "learning_rate": 4.312775597864319e-05, "loss": 0.4323, "step": 36715 }, { "epoch": 1.3233863120337332, "grad_norm": 0.15768684446811676, "learning_rate": 4.31257463325906e-05, "loss": 0.416, "step": 36720 }, { "epoch": 1.323566511694958, "grad_norm": 0.20642277598381042, "learning_rate": 4.312373643957821e-05, "loss": 0.4052, "step": 36725 }, { "epoch": 1.3237467113561827, "grad_norm": 0.1643398553133011, "learning_rate": 4.312172629963343e-05, "loss": 0.4166, "step": 36730 }, { "epoch": 1.3239269110174072, "grad_norm": 0.1844344139099121, "learning_rate": 4.311971591278363e-05, "loss": 0.4033, "step": 36735 }, { "epoch": 1.324107110678632, "grad_norm": 0.1502874493598938, "learning_rate": 4.311770527905622e-05, "loss": 0.3871, "step": 36740 }, { "epoch": 1.3242873103398565, "grad_norm": 0.16832222044467926, "learning_rate": 4.3115694398478574e-05, "loss": 0.4425, "step": 36745 }, { "epoch": 1.3244675100010812, "grad_norm": 0.14828276634216309, "learning_rate": 4.31136832710781e-05, "loss": 0.3985, "step": 36750 }, { "epoch": 1.324647709662306, "grad_norm": 
0.20640629529953003, "learning_rate": 4.31116718968822e-05, "loss": 0.4414, "step": 36755 }, { "epoch": 1.3248279093235305, "grad_norm": 0.1458745151758194, "learning_rate": 4.310966027591828e-05, "loss": 0.4191, "step": 36760 }, { "epoch": 1.3250081089847552, "grad_norm": 0.2120623141527176, "learning_rate": 4.3107648408213744e-05, "loss": 0.4122, "step": 36765 }, { "epoch": 1.3251883086459797, "grad_norm": 0.17051535844802856, "learning_rate": 4.3105636293795995e-05, "loss": 0.4658, "step": 36770 }, { "epoch": 1.3253685083072044, "grad_norm": 0.19851194322109222, "learning_rate": 4.310362393269247e-05, "loss": 0.4223, "step": 36775 }, { "epoch": 1.325548707968429, "grad_norm": 0.2429044246673584, "learning_rate": 4.310161132493057e-05, "loss": 0.4051, "step": 36780 }, { "epoch": 1.3257289076296537, "grad_norm": 0.21794357895851135, "learning_rate": 4.3099598470537716e-05, "loss": 0.4102, "step": 36785 }, { "epoch": 1.3259091072908782, "grad_norm": 0.17098356783390045, "learning_rate": 4.3097585369541336e-05, "loss": 0.4288, "step": 36790 }, { "epoch": 1.326089306952103, "grad_norm": 0.17869386076927185, "learning_rate": 4.309557202196887e-05, "loss": 0.4128, "step": 36795 }, { "epoch": 1.3262695066133277, "grad_norm": 0.17495180666446686, "learning_rate": 4.309355842784773e-05, "loss": 0.4169, "step": 36800 }, { "epoch": 1.3264497062745522, "grad_norm": 0.15498143434524536, "learning_rate": 4.309154458720536e-05, "loss": 0.4238, "step": 36805 }, { "epoch": 1.326629905935777, "grad_norm": 0.17504778504371643, "learning_rate": 4.3089530500069194e-05, "loss": 0.433, "step": 36810 }, { "epoch": 1.3268101055970014, "grad_norm": 0.1534842997789383, "learning_rate": 4.308751616646668e-05, "loss": 0.4162, "step": 36815 }, { "epoch": 1.3269903052582261, "grad_norm": 0.14600083231925964, "learning_rate": 4.308550158642526e-05, "loss": 0.4286, "step": 36820 }, { "epoch": 1.3271705049194509, "grad_norm": 0.15580976009368896, "learning_rate": 4.3083486759972384e-05, "loss": 
0.4622, "step": 36825 }, { "epoch": 1.3273507045806754, "grad_norm": 0.14176791906356812, "learning_rate": 4.30814716871355e-05, "loss": 0.4127, "step": 36830 }, { "epoch": 1.3275309042419, "grad_norm": 0.1750148981809616, "learning_rate": 4.3079456367942065e-05, "loss": 0.4201, "step": 36835 }, { "epoch": 1.3277111039031246, "grad_norm": 0.18358172476291656, "learning_rate": 4.3077440802419544e-05, "loss": 0.4328, "step": 36840 }, { "epoch": 1.3278913035643494, "grad_norm": 0.20503877103328705, "learning_rate": 4.307542499059538e-05, "loss": 0.4361, "step": 36845 }, { "epoch": 1.3280715032255739, "grad_norm": 0.20601756870746613, "learning_rate": 4.307340893249706e-05, "loss": 0.4039, "step": 36850 }, { "epoch": 1.3282517028867986, "grad_norm": 0.1701328456401825, "learning_rate": 4.307139262815204e-05, "loss": 0.3766, "step": 36855 }, { "epoch": 1.3284319025480231, "grad_norm": 0.1465604156255722, "learning_rate": 4.30693760775878e-05, "loss": 0.4086, "step": 36860 }, { "epoch": 1.3286121022092479, "grad_norm": 0.15594859421253204, "learning_rate": 4.3067359280831797e-05, "loss": 0.3908, "step": 36865 }, { "epoch": 1.3287923018704726, "grad_norm": 0.15714693069458008, "learning_rate": 4.306534223791153e-05, "loss": 0.4307, "step": 36870 }, { "epoch": 1.328972501531697, "grad_norm": 0.17976239323616028, "learning_rate": 4.306332494885446e-05, "loss": 0.4154, "step": 36875 }, { "epoch": 1.3291527011929218, "grad_norm": 0.17832331359386444, "learning_rate": 4.30613074136881e-05, "loss": 0.4376, "step": 36880 }, { "epoch": 1.3293329008541463, "grad_norm": 0.15694265067577362, "learning_rate": 4.305928963243992e-05, "loss": 0.4172, "step": 36885 }, { "epoch": 1.329513100515371, "grad_norm": 0.1645699441432953, "learning_rate": 4.305727160513741e-05, "loss": 0.459, "step": 36890 }, { "epoch": 1.3296933001765956, "grad_norm": 0.16790169477462769, "learning_rate": 4.305525333180807e-05, "loss": 0.3762, "step": 36895 }, { "epoch": 1.3298734998378203, "grad_norm": 
0.1775948405265808, "learning_rate": 4.3053234812479406e-05, "loss": 0.3997, "step": 36900 }, { "epoch": 1.3300536994990448, "grad_norm": 0.19268429279327393, "learning_rate": 4.305121604717891e-05, "loss": 0.4283, "step": 36905 }, { "epoch": 1.3302338991602696, "grad_norm": 0.1967247873544693, "learning_rate": 4.304919703593409e-05, "loss": 0.4265, "step": 36910 }, { "epoch": 1.3304140988214943, "grad_norm": 0.1895369440317154, "learning_rate": 4.304717777877246e-05, "loss": 0.3815, "step": 36915 }, { "epoch": 1.3305942984827188, "grad_norm": 0.1666320413351059, "learning_rate": 4.304515827572152e-05, "loss": 0.4227, "step": 36920 }, { "epoch": 1.3307744981439436, "grad_norm": 0.19722457230091095, "learning_rate": 4.304313852680879e-05, "loss": 0.4094, "step": 36925 }, { "epoch": 1.330954697805168, "grad_norm": 0.20568078756332397, "learning_rate": 4.3041118532061794e-05, "loss": 0.415, "step": 36930 }, { "epoch": 1.3311348974663928, "grad_norm": 0.16985160112380981, "learning_rate": 4.303909829150805e-05, "loss": 0.4198, "step": 36935 }, { "epoch": 1.3313150971276175, "grad_norm": 0.1945793330669403, "learning_rate": 4.3037077805175085e-05, "loss": 0.4086, "step": 36940 }, { "epoch": 1.331495296788842, "grad_norm": 0.19523586332798004, "learning_rate": 4.303505707309043e-05, "loss": 0.4424, "step": 36945 }, { "epoch": 1.3316754964500666, "grad_norm": 0.18730002641677856, "learning_rate": 4.303303609528161e-05, "loss": 0.4097, "step": 36950 }, { "epoch": 1.3318556961112913, "grad_norm": 0.22210468351840973, "learning_rate": 4.303101487177616e-05, "loss": 0.4432, "step": 36955 }, { "epoch": 1.332035895772516, "grad_norm": 0.16588768362998962, "learning_rate": 4.3028993402601636e-05, "loss": 0.4628, "step": 36960 }, { "epoch": 1.3322160954337405, "grad_norm": 0.19048090279102325, "learning_rate": 4.302697168778556e-05, "loss": 0.4337, "step": 36965 }, { "epoch": 1.3323962950949653, "grad_norm": 0.18421435356140137, "learning_rate": 4.302494972735549e-05, "loss": 
0.4059, "step": 36970 }, { "epoch": 1.3325764947561898, "grad_norm": 0.15311434864997864, "learning_rate": 4.3022927521338965e-05, "loss": 0.3721, "step": 36975 }, { "epoch": 1.3327566944174145, "grad_norm": 0.1969848871231079, "learning_rate": 4.302090506976354e-05, "loss": 0.4358, "step": 36980 }, { "epoch": 1.3329368940786392, "grad_norm": 0.19193395972251892, "learning_rate": 4.301888237265678e-05, "loss": 0.4397, "step": 36985 }, { "epoch": 1.3331170937398638, "grad_norm": 0.19378533959388733, "learning_rate": 4.301685943004622e-05, "loss": 0.4522, "step": 36990 }, { "epoch": 1.3332972934010883, "grad_norm": 0.19879907369613647, "learning_rate": 4.301483624195945e-05, "loss": 0.4517, "step": 36995 }, { "epoch": 1.333477493062313, "grad_norm": 0.16110192239284515, "learning_rate": 4.301281280842403e-05, "loss": 0.4311, "step": 37000 }, { "epoch": 1.333477493062313, "eval_loss": 0.4481649100780487, "eval_runtime": 3.5338, "eval_samples_per_second": 28.298, "eval_steps_per_second": 7.075, "step": 37000 }, { "epoch": 1.3336576927235377, "grad_norm": 0.1890387237071991, "learning_rate": 4.301078912946751e-05, "loss": 0.4437, "step": 37005 }, { "epoch": 1.3338378923847622, "grad_norm": 0.2009790688753128, "learning_rate": 4.300876520511748e-05, "loss": 0.4314, "step": 37010 }, { "epoch": 1.334018092045987, "grad_norm": 0.23660525679588318, "learning_rate": 4.300674103540151e-05, "loss": 0.4406, "step": 37015 }, { "epoch": 1.3341982917072115, "grad_norm": 0.20665594935417175, "learning_rate": 4.300471662034719e-05, "loss": 0.3956, "step": 37020 }, { "epoch": 1.3343784913684362, "grad_norm": 0.19279490411281586, "learning_rate": 4.3002691959982076e-05, "loss": 0.4062, "step": 37025 }, { "epoch": 1.334558691029661, "grad_norm": 0.20935142040252686, "learning_rate": 4.3000667054333775e-05, "loss": 0.4324, "step": 37030 }, { "epoch": 1.3347388906908855, "grad_norm": 0.1611267477273941, "learning_rate": 4.2998641903429875e-05, "loss": 0.4183, "step": 37035 }, { "epoch": 
1.3349190903521102, "grad_norm": 0.2519780993461609, "learning_rate": 4.299661650729796e-05, "loss": 0.447, "step": 37040 }, { "epoch": 1.3350992900133347, "grad_norm": 0.1985885351896286, "learning_rate": 4.2994590865965634e-05, "loss": 0.4577, "step": 37045 }, { "epoch": 1.3352794896745594, "grad_norm": 0.14682763814926147, "learning_rate": 4.2992564979460484e-05, "loss": 0.397, "step": 37050 }, { "epoch": 1.3354596893357842, "grad_norm": 0.17361211776733398, "learning_rate": 4.299053884781012e-05, "loss": 0.3837, "step": 37055 }, { "epoch": 1.3356398889970087, "grad_norm": 0.17351488769054413, "learning_rate": 4.2988512471042154e-05, "loss": 0.4613, "step": 37060 }, { "epoch": 1.3358200886582332, "grad_norm": 0.16963542997837067, "learning_rate": 4.2986485849184185e-05, "loss": 0.4384, "step": 37065 }, { "epoch": 1.336000288319458, "grad_norm": 0.16560989618301392, "learning_rate": 4.298445898226383e-05, "loss": 0.4036, "step": 37070 }, { "epoch": 1.3361804879806827, "grad_norm": 0.1843060851097107, "learning_rate": 4.29824318703087e-05, "loss": 0.384, "step": 37075 }, { "epoch": 1.3363606876419072, "grad_norm": 0.20342443883419037, "learning_rate": 4.298040451334642e-05, "loss": 0.4224, "step": 37080 }, { "epoch": 1.336540887303132, "grad_norm": 0.17561762034893036, "learning_rate": 4.297837691140461e-05, "loss": 0.4238, "step": 37085 }, { "epoch": 1.3367210869643564, "grad_norm": 0.20807789266109467, "learning_rate": 4.297634906451089e-05, "loss": 0.4155, "step": 37090 }, { "epoch": 1.3369012866255812, "grad_norm": 0.19749189913272858, "learning_rate": 4.29743209726929e-05, "loss": 0.4417, "step": 37095 }, { "epoch": 1.337081486286806, "grad_norm": 0.15271851420402527, "learning_rate": 4.297229263597827e-05, "loss": 0.3862, "step": 37100 }, { "epoch": 1.3372616859480304, "grad_norm": 0.19541774690151215, "learning_rate": 4.2970264054394625e-05, "loss": 0.3878, "step": 37105 }, { "epoch": 1.337441885609255, "grad_norm": 0.20491370558738708, "learning_rate": 
4.296823522796961e-05, "loss": 0.4285, "step": 37110 }, { "epoch": 1.3376220852704797, "grad_norm": 0.19413931667804718, "learning_rate": 4.2966206156730875e-05, "loss": 0.4268, "step": 37115 }, { "epoch": 1.3378022849317044, "grad_norm": 0.17734622955322266, "learning_rate": 4.296417684070606e-05, "loss": 0.3678, "step": 37120 }, { "epoch": 1.337982484592929, "grad_norm": 0.1772347390651703, "learning_rate": 4.29621472799228e-05, "loss": 0.4416, "step": 37125 }, { "epoch": 1.3381626842541536, "grad_norm": 0.2039223164319992, "learning_rate": 4.296011747440878e-05, "loss": 0.4219, "step": 37130 }, { "epoch": 1.3383428839153781, "grad_norm": 0.17975406348705292, "learning_rate": 4.295808742419163e-05, "loss": 0.4512, "step": 37135 }, { "epoch": 1.3385230835766029, "grad_norm": 0.21606260538101196, "learning_rate": 4.295605712929901e-05, "loss": 0.4352, "step": 37140 }, { "epoch": 1.3387032832378276, "grad_norm": 0.16076348721981049, "learning_rate": 4.295402658975859e-05, "loss": 0.4171, "step": 37145 }, { "epoch": 1.3388834828990521, "grad_norm": 0.14485345780849457, "learning_rate": 4.295199580559804e-05, "loss": 0.4193, "step": 37150 }, { "epoch": 1.3390636825602769, "grad_norm": 0.17786848545074463, "learning_rate": 4.2949964776845014e-05, "loss": 0.3925, "step": 37155 }, { "epoch": 1.3392438822215014, "grad_norm": 0.2276924103498459, "learning_rate": 4.29479335035272e-05, "loss": 0.4485, "step": 37160 }, { "epoch": 1.339424081882726, "grad_norm": 0.17060185968875885, "learning_rate": 4.294590198567226e-05, "loss": 0.4437, "step": 37165 }, { "epoch": 1.3396042815439506, "grad_norm": 0.18180356919765472, "learning_rate": 4.294387022330789e-05, "loss": 0.3794, "step": 37170 }, { "epoch": 1.3397844812051753, "grad_norm": 0.17206217348575592, "learning_rate": 4.294183821646175e-05, "loss": 0.4358, "step": 37175 }, { "epoch": 1.3399646808663999, "grad_norm": 0.19352799654006958, "learning_rate": 4.293980596516155e-05, "loss": 0.4043, "step": 37180 }, { "epoch": 
1.3401448805276246, "grad_norm": 0.2113627791404724, "learning_rate": 4.2937773469434963e-05, "loss": 0.3979, "step": 37185 }, { "epoch": 1.3403250801888493, "grad_norm": 0.1969025582075119, "learning_rate": 4.293574072930968e-05, "loss": 0.4085, "step": 37190 }, { "epoch": 1.3405052798500738, "grad_norm": 0.1911691427230835, "learning_rate": 4.29337077448134e-05, "loss": 0.4124, "step": 37195 }, { "epoch": 1.3406854795112986, "grad_norm": 0.16924938559532166, "learning_rate": 4.293167451597383e-05, "loss": 0.4439, "step": 37200 }, { "epoch": 1.340865679172523, "grad_norm": 0.15736305713653564, "learning_rate": 4.292964104281867e-05, "loss": 0.3968, "step": 37205 }, { "epoch": 1.3410458788337478, "grad_norm": 0.22912681102752686, "learning_rate": 4.2927607325375616e-05, "loss": 0.4219, "step": 37210 }, { "epoch": 1.3412260784949726, "grad_norm": 0.1928148865699768, "learning_rate": 4.292557336367239e-05, "loss": 0.3937, "step": 37215 }, { "epoch": 1.341406278156197, "grad_norm": 0.19849920272827148, "learning_rate": 4.2923539157736695e-05, "loss": 0.4182, "step": 37220 }, { "epoch": 1.3415864778174216, "grad_norm": 0.14143873751163483, "learning_rate": 4.292150470759624e-05, "loss": 0.3873, "step": 37225 }, { "epoch": 1.3417666774786463, "grad_norm": 0.17856481671333313, "learning_rate": 4.291947001327876e-05, "loss": 0.435, "step": 37230 }, { "epoch": 1.341946877139871, "grad_norm": 0.15934741497039795, "learning_rate": 4.291743507481197e-05, "loss": 0.4061, "step": 37235 }, { "epoch": 1.3421270768010956, "grad_norm": 0.193971648812294, "learning_rate": 4.2915399892223595e-05, "loss": 0.3881, "step": 37240 }, { "epoch": 1.3423072764623203, "grad_norm": 0.22746896743774414, "learning_rate": 4.2913364465541366e-05, "loss": 0.411, "step": 37245 }, { "epoch": 1.3424874761235448, "grad_norm": 0.21635814011096954, "learning_rate": 4.291132879479302e-05, "loss": 0.4091, "step": 37250 }, { "epoch": 1.3426676757847695, "grad_norm": 0.19370247423648834, "learning_rate": 
4.290929288000628e-05, "loss": 0.4045, "step": 37255 }, { "epoch": 1.3428478754459943, "grad_norm": 0.14422830939292908, "learning_rate": 4.290725672120889e-05, "loss": 0.3829, "step": 37260 }, { "epoch": 1.3430280751072188, "grad_norm": 0.16739609837532043, "learning_rate": 4.29052203184286e-05, "loss": 0.4541, "step": 37265 }, { "epoch": 1.3432082747684435, "grad_norm": 0.17968153953552246, "learning_rate": 4.290318367169314e-05, "loss": 0.4112, "step": 37270 }, { "epoch": 1.343388474429668, "grad_norm": 0.20018011331558228, "learning_rate": 4.290114678103028e-05, "loss": 0.4536, "step": 37275 }, { "epoch": 1.3435686740908928, "grad_norm": 0.1976526975631714, "learning_rate": 4.289910964646776e-05, "loss": 0.4011, "step": 37280 }, { "epoch": 1.3437488737521173, "grad_norm": 0.19108478724956512, "learning_rate": 4.289707226803333e-05, "loss": 0.4303, "step": 37285 }, { "epoch": 1.343929073413342, "grad_norm": 0.1997978538274765, "learning_rate": 4.289503464575476e-05, "loss": 0.4182, "step": 37290 }, { "epoch": 1.3441092730745665, "grad_norm": 0.20176932215690613, "learning_rate": 4.28929967796598e-05, "loss": 0.4345, "step": 37295 }, { "epoch": 1.3442894727357912, "grad_norm": 0.1834762841463089, "learning_rate": 4.289095866977623e-05, "loss": 0.4147, "step": 37300 }, { "epoch": 1.344469672397016, "grad_norm": 0.18414629995822906, "learning_rate": 4.288892031613181e-05, "loss": 0.4264, "step": 37305 }, { "epoch": 1.3446498720582405, "grad_norm": 0.175818532705307, "learning_rate": 4.288688171875431e-05, "loss": 0.379, "step": 37310 }, { "epoch": 1.3448300717194652, "grad_norm": 0.16372618079185486, "learning_rate": 4.288484287767152e-05, "loss": 0.3926, "step": 37315 }, { "epoch": 1.3450102713806897, "grad_norm": 0.17011037468910217, "learning_rate": 4.2882803792911205e-05, "loss": 0.4143, "step": 37320 }, { "epoch": 1.3451904710419145, "grad_norm": 0.13715289533138275, "learning_rate": 4.288076446450115e-05, "loss": 0.3912, "step": 37325 }, { "epoch": 
1.3453706707031392, "grad_norm": 0.17241427302360535, "learning_rate": 4.2878724892469135e-05, "loss": 0.3965, "step": 37330 }, { "epoch": 1.3455508703643637, "grad_norm": 0.17453129589557648, "learning_rate": 4.287668507684296e-05, "loss": 0.4192, "step": 37335 }, { "epoch": 1.3457310700255882, "grad_norm": 0.22554276883602142, "learning_rate": 4.287464501765041e-05, "loss": 0.3842, "step": 37340 }, { "epoch": 1.345911269686813, "grad_norm": 0.1914464235305786, "learning_rate": 4.2872604714919285e-05, "loss": 0.4156, "step": 37345 }, { "epoch": 1.3460914693480377, "grad_norm": 0.16608940064907074, "learning_rate": 4.287056416867738e-05, "loss": 0.4284, "step": 37350 }, { "epoch": 1.3462716690092622, "grad_norm": 0.20731467008590698, "learning_rate": 4.28685233789525e-05, "loss": 0.4522, "step": 37355 }, { "epoch": 1.346451868670487, "grad_norm": 0.1691913902759552, "learning_rate": 4.286648234577244e-05, "loss": 0.4299, "step": 37360 }, { "epoch": 1.3466320683317115, "grad_norm": 0.19073499739170074, "learning_rate": 4.286444106916503e-05, "loss": 0.4108, "step": 37365 }, { "epoch": 1.3468122679929362, "grad_norm": 0.1724918931722641, "learning_rate": 4.286239954915806e-05, "loss": 0.4408, "step": 37370 }, { "epoch": 1.346992467654161, "grad_norm": 0.19788911938667297, "learning_rate": 4.2860357785779356e-05, "loss": 0.4212, "step": 37375 }, { "epoch": 1.3471726673153854, "grad_norm": 0.17659629881381989, "learning_rate": 4.2858315779056734e-05, "loss": 0.4535, "step": 37380 }, { "epoch": 1.3473528669766102, "grad_norm": 0.1844049096107483, "learning_rate": 4.285627352901802e-05, "loss": 0.388, "step": 37385 }, { "epoch": 1.3475330666378347, "grad_norm": 0.19511617720127106, "learning_rate": 4.285423103569103e-05, "loss": 0.4225, "step": 37390 }, { "epoch": 1.3477132662990594, "grad_norm": 0.1395874172449112, "learning_rate": 4.2852188299103614e-05, "loss": 0.3867, "step": 37395 }, { "epoch": 1.347893465960284, "grad_norm": 0.1697840690612793, "learning_rate": 
4.2850145319283575e-05, "loss": 0.3939, "step": 37400 }, { "epoch": 1.3480736656215087, "grad_norm": 0.20716294646263123, "learning_rate": 4.284810209625876e-05, "loss": 0.4563, "step": 37405 }, { "epoch": 1.3482538652827332, "grad_norm": 0.19542010128498077, "learning_rate": 4.2846058630057016e-05, "loss": 0.4322, "step": 37410 }, { "epoch": 1.348434064943958, "grad_norm": 0.15762588381767273, "learning_rate": 4.2844014920706176e-05, "loss": 0.4279, "step": 37415 }, { "epoch": 1.3486142646051826, "grad_norm": 0.20713409781455994, "learning_rate": 4.284197096823409e-05, "loss": 0.4345, "step": 37420 }, { "epoch": 1.3487944642664071, "grad_norm": 0.18533405661582947, "learning_rate": 4.2839926772668605e-05, "loss": 0.4219, "step": 37425 }, { "epoch": 1.3489746639276319, "grad_norm": 0.19821499288082123, "learning_rate": 4.283788233403757e-05, "loss": 0.4249, "step": 37430 }, { "epoch": 1.3491548635888564, "grad_norm": 0.20067061483860016, "learning_rate": 4.283583765236884e-05, "loss": 0.4566, "step": 37435 }, { "epoch": 1.3493350632500811, "grad_norm": 0.16984498500823975, "learning_rate": 4.2833792727690275e-05, "loss": 0.38, "step": 37440 }, { "epoch": 1.3495152629113059, "grad_norm": 0.1640443056821823, "learning_rate": 4.283174756002973e-05, "loss": 0.4147, "step": 37445 }, { "epoch": 1.3496954625725304, "grad_norm": 0.16982309520244598, "learning_rate": 4.2829702149415094e-05, "loss": 0.4274, "step": 37450 }, { "epoch": 1.3498756622337549, "grad_norm": 0.19028659164905548, "learning_rate": 4.2827656495874205e-05, "loss": 0.4304, "step": 37455 }, { "epoch": 1.3500558618949796, "grad_norm": 0.20685800909996033, "learning_rate": 4.282561059943495e-05, "loss": 0.4145, "step": 37460 }, { "epoch": 1.3502360615562043, "grad_norm": 0.18476605415344238, "learning_rate": 4.2823564460125206e-05, "loss": 0.4054, "step": 37465 }, { "epoch": 1.3504162612174289, "grad_norm": 0.17075756192207336, "learning_rate": 4.2821518077972845e-05, "loss": 0.3853, "step": 37470 }, { 
"epoch": 1.3505964608786536, "grad_norm": 0.1620296984910965, "learning_rate": 4.281947145300574e-05, "loss": 0.3927, "step": 37475 }, { "epoch": 1.350776660539878, "grad_norm": 0.1557290405035019, "learning_rate": 4.2817424585251804e-05, "loss": 0.4178, "step": 37480 }, { "epoch": 1.3509568602011028, "grad_norm": 0.17383769154548645, "learning_rate": 4.2815377474738894e-05, "loss": 0.4483, "step": 37485 }, { "epoch": 1.3511370598623276, "grad_norm": 0.19885095953941345, "learning_rate": 4.2813330121494924e-05, "loss": 0.4472, "step": 37490 }, { "epoch": 1.351317259523552, "grad_norm": 0.18634334206581116, "learning_rate": 4.281128252554778e-05, "loss": 0.4156, "step": 37495 }, { "epoch": 1.3514974591847766, "grad_norm": 0.21078574657440186, "learning_rate": 4.280923468692535e-05, "loss": 0.4509, "step": 37500 }, { "epoch": 1.3514974591847766, "eval_loss": 0.44750291109085083, "eval_runtime": 3.5455, "eval_samples_per_second": 28.205, "eval_steps_per_second": 7.051, "step": 37500 }, { "epoch": 1.3516776588460013, "grad_norm": 0.21261395514011383, "learning_rate": 4.280718660565556e-05, "loss": 0.4191, "step": 37505 }, { "epoch": 1.351857858507226, "grad_norm": 0.1816295087337494, "learning_rate": 4.280513828176629e-05, "loss": 0.4367, "step": 37510 }, { "epoch": 1.3520380581684506, "grad_norm": 0.16183680295944214, "learning_rate": 4.280308971528546e-05, "loss": 0.4168, "step": 37515 }, { "epoch": 1.3522182578296753, "grad_norm": 0.19457785785198212, "learning_rate": 4.280104090624097e-05, "loss": 0.4259, "step": 37520 }, { "epoch": 1.3523984574908998, "grad_norm": 0.16114507615566254, "learning_rate": 4.279899185466077e-05, "loss": 0.4355, "step": 37525 }, { "epoch": 1.3525786571521246, "grad_norm": 0.17467471957206726, "learning_rate": 4.2796942560572725e-05, "loss": 0.4127, "step": 37530 }, { "epoch": 1.3527588568133493, "grad_norm": 0.18271581828594208, "learning_rate": 4.279489302400479e-05, "loss": 0.4033, "step": 37535 }, { "epoch": 1.3529390564745738, 
"grad_norm": 0.15047767758369446, "learning_rate": 4.279284324498489e-05, "loss": 0.3937, "step": 37540 }, { "epoch": 1.3531192561357985, "grad_norm": 0.1880033016204834, "learning_rate": 4.2790793223540944e-05, "loss": 0.4112, "step": 37545 }, { "epoch": 1.353299455797023, "grad_norm": 0.2034742534160614, "learning_rate": 4.278874295970088e-05, "loss": 0.4222, "step": 37550 }, { "epoch": 1.3534796554582478, "grad_norm": 0.14780563116073608, "learning_rate": 4.278669245349264e-05, "loss": 0.4239, "step": 37555 }, { "epoch": 1.3536598551194725, "grad_norm": 0.15537936985492706, "learning_rate": 4.278464170494416e-05, "loss": 0.3862, "step": 37560 }, { "epoch": 1.353840054780697, "grad_norm": 0.14298701286315918, "learning_rate": 4.278259071408338e-05, "loss": 0.4007, "step": 37565 }, { "epoch": 1.3540202544419215, "grad_norm": 0.1611703783273697, "learning_rate": 4.278053948093824e-05, "loss": 0.4007, "step": 37570 }, { "epoch": 1.3542004541031463, "grad_norm": 0.2151387333869934, "learning_rate": 4.27784880055367e-05, "loss": 0.4169, "step": 37575 }, { "epoch": 1.354380653764371, "grad_norm": 0.17721928656101227, "learning_rate": 4.277643628790669e-05, "loss": 0.4298, "step": 37580 }, { "epoch": 1.3545608534255955, "grad_norm": 0.17061589658260345, "learning_rate": 4.277438432807619e-05, "loss": 0.4005, "step": 37585 }, { "epoch": 1.3547410530868202, "grad_norm": 0.159605473279953, "learning_rate": 4.277233212607315e-05, "loss": 0.4158, "step": 37590 }, { "epoch": 1.3549212527480448, "grad_norm": 0.15221120417118073, "learning_rate": 4.2770279681925506e-05, "loss": 0.4054, "step": 37595 }, { "epoch": 1.3551014524092695, "grad_norm": 0.1982593834400177, "learning_rate": 4.2768226995661255e-05, "loss": 0.3861, "step": 37600 }, { "epoch": 1.3552816520704942, "grad_norm": 0.1528361439704895, "learning_rate": 4.276617406730835e-05, "loss": 0.4002, "step": 37605 }, { "epoch": 1.3554618517317187, "grad_norm": 0.17486098408699036, "learning_rate": 4.2764120896894755e-05, 
"loss": 0.3899, "step": 37610 }, { "epoch": 1.3556420513929432, "grad_norm": 0.16037683188915253, "learning_rate": 4.276206748444846e-05, "loss": 0.4031, "step": 37615 }, { "epoch": 1.355822251054168, "grad_norm": 0.1568463146686554, "learning_rate": 4.2760013829997434e-05, "loss": 0.4136, "step": 37620 }, { "epoch": 1.3560024507153927, "grad_norm": 0.18203102052211761, "learning_rate": 4.2757959933569656e-05, "loss": 0.3986, "step": 37625 }, { "epoch": 1.3561826503766172, "grad_norm": 0.16312682628631592, "learning_rate": 4.275590579519311e-05, "loss": 0.4104, "step": 37630 }, { "epoch": 1.356362850037842, "grad_norm": 0.18882383406162262, "learning_rate": 4.275385141489578e-05, "loss": 0.4471, "step": 37635 }, { "epoch": 1.3565430496990665, "grad_norm": 0.21008935570716858, "learning_rate": 4.275179679270568e-05, "loss": 0.4276, "step": 37640 }, { "epoch": 1.3567232493602912, "grad_norm": 0.21513846516609192, "learning_rate": 4.274974192865077e-05, "loss": 0.4229, "step": 37645 }, { "epoch": 1.356903449021516, "grad_norm": 0.2062099725008011, "learning_rate": 4.274768682275907e-05, "loss": 0.4215, "step": 37650 }, { "epoch": 1.3570836486827405, "grad_norm": 0.1856852024793625, "learning_rate": 4.274563147505857e-05, "loss": 0.4214, "step": 37655 }, { "epoch": 1.3572638483439652, "grad_norm": 0.2162654846906662, "learning_rate": 4.2743575885577277e-05, "loss": 0.415, "step": 37660 }, { "epoch": 1.3574440480051897, "grad_norm": 0.14745785295963287, "learning_rate": 4.27415200543432e-05, "loss": 0.4311, "step": 37665 }, { "epoch": 1.3576242476664144, "grad_norm": 0.16586333513259888, "learning_rate": 4.2739463981384345e-05, "loss": 0.4549, "step": 37670 }, { "epoch": 1.357804447327639, "grad_norm": 0.19415488839149475, "learning_rate": 4.2737407666728724e-05, "loss": 0.4612, "step": 37675 }, { "epoch": 1.3579846469888637, "grad_norm": 0.20370012521743774, "learning_rate": 4.2735351110404365e-05, "loss": 0.3613, "step": 37680 }, { "epoch": 1.3581648466500882, 
"grad_norm": 0.16060635447502136, "learning_rate": 4.273329431243927e-05, "loss": 0.3881, "step": 37685 }, { "epoch": 1.358345046311313, "grad_norm": 0.1804780513048172, "learning_rate": 4.273123727286148e-05, "loss": 0.3816, "step": 37690 }, { "epoch": 1.3585252459725377, "grad_norm": 0.21636547148227692, "learning_rate": 4.272917999169902e-05, "loss": 0.4187, "step": 37695 }, { "epoch": 1.3587054456337622, "grad_norm": 0.20415888726711273, "learning_rate": 4.272712246897991e-05, "loss": 0.4345, "step": 37700 }, { "epoch": 1.358885645294987, "grad_norm": 0.17378808557987213, "learning_rate": 4.272506470473219e-05, "loss": 0.4309, "step": 37705 }, { "epoch": 1.3590658449562114, "grad_norm": 0.1880928874015808, "learning_rate": 4.2723006698983894e-05, "loss": 0.421, "step": 37710 }, { "epoch": 1.3592460446174361, "grad_norm": 0.18104448914527893, "learning_rate": 4.272094845176307e-05, "loss": 0.4287, "step": 37715 }, { "epoch": 1.3594262442786609, "grad_norm": 0.17074403166770935, "learning_rate": 4.2718889963097744e-05, "loss": 0.4521, "step": 37720 }, { "epoch": 1.3596064439398854, "grad_norm": 0.2121085524559021, "learning_rate": 4.2716831233015974e-05, "loss": 0.4466, "step": 37725 }, { "epoch": 1.35978664360111, "grad_norm": 0.14753827452659607, "learning_rate": 4.2714772261545813e-05, "loss": 0.4041, "step": 37730 }, { "epoch": 1.3599668432623346, "grad_norm": 0.18550336360931396, "learning_rate": 4.27127130487153e-05, "loss": 0.4358, "step": 37735 }, { "epoch": 1.3601470429235594, "grad_norm": 0.1991180032491684, "learning_rate": 4.2710653594552506e-05, "loss": 0.4421, "step": 37740 }, { "epoch": 1.3603272425847839, "grad_norm": 0.16525886952877045, "learning_rate": 4.2708593899085494e-05, "loss": 0.4052, "step": 37745 }, { "epoch": 1.3605074422460086, "grad_norm": 0.17231962084770203, "learning_rate": 4.270653396234231e-05, "loss": 0.4103, "step": 37750 }, { "epoch": 1.3606876419072331, "grad_norm": 0.1659233570098877, "learning_rate": 
4.2704473784351036e-05, "loss": 0.3799, "step": 37755 }, { "epoch": 1.3608678415684579, "grad_norm": 0.15923145413398743, "learning_rate": 4.2702413365139724e-05, "loss": 0.3827, "step": 37760 }, { "epoch": 1.3610480412296826, "grad_norm": 0.13835641741752625, "learning_rate": 4.270035270473647e-05, "loss": 0.3854, "step": 37765 }, { "epoch": 1.361228240890907, "grad_norm": 0.24770118296146393, "learning_rate": 4.269829180316932e-05, "loss": 0.4457, "step": 37770 }, { "epoch": 1.3614084405521318, "grad_norm": 0.15679314732551575, "learning_rate": 4.269623066046639e-05, "loss": 0.421, "step": 37775 }, { "epoch": 1.3615886402133563, "grad_norm": 0.20325350761413574, "learning_rate": 4.269416927665573e-05, "loss": 0.3852, "step": 37780 }, { "epoch": 1.361768839874581, "grad_norm": 0.18375974893569946, "learning_rate": 4.269210765176544e-05, "loss": 0.4165, "step": 37785 }, { "epoch": 1.3619490395358056, "grad_norm": 0.1880822330713272, "learning_rate": 4.269004578582362e-05, "loss": 0.4473, "step": 37790 }, { "epoch": 1.3621292391970303, "grad_norm": 0.16802115738391876, "learning_rate": 4.2687983678858346e-05, "loss": 0.4006, "step": 37795 }, { "epoch": 1.3623094388582548, "grad_norm": 0.18482767045497894, "learning_rate": 4.268592133089771e-05, "loss": 0.4042, "step": 37800 }, { "epoch": 1.3624896385194796, "grad_norm": 0.18209148943424225, "learning_rate": 4.268385874196983e-05, "loss": 0.419, "step": 37805 }, { "epoch": 1.3626698381807043, "grad_norm": 0.20778799057006836, "learning_rate": 4.268179591210279e-05, "loss": 0.4544, "step": 37810 }, { "epoch": 1.3628500378419288, "grad_norm": 0.19366760551929474, "learning_rate": 4.267973284132471e-05, "loss": 0.4157, "step": 37815 }, { "epoch": 1.3630302375031536, "grad_norm": 0.15186309814453125, "learning_rate": 4.267766952966369e-05, "loss": 0.3843, "step": 37820 }, { "epoch": 1.363210437164378, "grad_norm": 0.19888971745967865, "learning_rate": 4.267560597714785e-05, "loss": 0.4345, "step": 37825 }, { "epoch": 
1.3633906368256028, "grad_norm": 0.205244779586792, "learning_rate": 4.2673542183805295e-05, "loss": 0.437, "step": 37830 }, { "epoch": 1.3635708364868275, "grad_norm": 0.21422690153121948, "learning_rate": 4.267147814966415e-05, "loss": 0.4205, "step": 37835 }, { "epoch": 1.363751036148052, "grad_norm": 0.15978442132472992, "learning_rate": 4.266941387475254e-05, "loss": 0.4362, "step": 37840 }, { "epoch": 1.3639312358092766, "grad_norm": 0.17365151643753052, "learning_rate": 4.2667349359098586e-05, "loss": 0.4162, "step": 37845 }, { "epoch": 1.3641114354705013, "grad_norm": 0.17353810369968414, "learning_rate": 4.266528460273041e-05, "loss": 0.3909, "step": 37850 }, { "epoch": 1.364291635131726, "grad_norm": 0.16570428013801575, "learning_rate": 4.266321960567616e-05, "loss": 0.3856, "step": 37855 }, { "epoch": 1.3644718347929505, "grad_norm": 0.17112329602241516, "learning_rate": 4.2661154367963965e-05, "loss": 0.4471, "step": 37860 }, { "epoch": 1.3646520344541753, "grad_norm": 0.14121274650096893, "learning_rate": 4.2659088889621954e-05, "loss": 0.4194, "step": 37865 }, { "epoch": 1.3648322341153998, "grad_norm": 0.17881150543689728, "learning_rate": 4.265702317067828e-05, "loss": 0.4413, "step": 37870 }, { "epoch": 1.3650124337766245, "grad_norm": 0.22597812116146088, "learning_rate": 4.2654957211161085e-05, "loss": 0.4434, "step": 37875 }, { "epoch": 1.3651926334378492, "grad_norm": 0.16727763414382935, "learning_rate": 4.2652891011098505e-05, "loss": 0.4055, "step": 37880 }, { "epoch": 1.3653728330990738, "grad_norm": 0.19390372931957245, "learning_rate": 4.265082457051872e-05, "loss": 0.3988, "step": 37885 }, { "epoch": 1.3655530327602985, "grad_norm": 0.177637979388237, "learning_rate": 4.264875788944985e-05, "loss": 0.411, "step": 37890 }, { "epoch": 1.365733232421523, "grad_norm": 0.17487314343452454, "learning_rate": 4.2646690967920086e-05, "loss": 0.4127, "step": 37895 }, { "epoch": 1.3659134320827477, "grad_norm": 0.203982412815094, "learning_rate": 
4.264462380595756e-05, "loss": 0.4603, "step": 37900 }, { "epoch": 1.3660936317439722, "grad_norm": 0.23240825533866882, "learning_rate": 4.264255640359046e-05, "loss": 0.418, "step": 37905 }, { "epoch": 1.366273831405197, "grad_norm": 0.15722376108169556, "learning_rate": 4.2640488760846945e-05, "loss": 0.4449, "step": 37910 }, { "epoch": 1.3664540310664215, "grad_norm": 0.14210271835327148, "learning_rate": 4.263842087775518e-05, "loss": 0.3789, "step": 37915 }, { "epoch": 1.3666342307276462, "grad_norm": 0.15911325812339783, "learning_rate": 4.263635275434336e-05, "loss": 0.4281, "step": 37920 }, { "epoch": 1.366814430388871, "grad_norm": 0.17531517148017883, "learning_rate": 4.263428439063963e-05, "loss": 0.4233, "step": 37925 }, { "epoch": 1.3669946300500955, "grad_norm": 0.16672587394714355, "learning_rate": 4.263221578667219e-05, "loss": 0.4255, "step": 37930 }, { "epoch": 1.3671748297113202, "grad_norm": 0.21964746713638306, "learning_rate": 4.263014694246924e-05, "loss": 0.4298, "step": 37935 }, { "epoch": 1.3673550293725447, "grad_norm": 0.161033034324646, "learning_rate": 4.262807785805894e-05, "loss": 0.3999, "step": 37940 }, { "epoch": 1.3675352290337695, "grad_norm": 0.20498016476631165, "learning_rate": 4.262600853346949e-05, "loss": 0.4068, "step": 37945 }, { "epoch": 1.3677154286949942, "grad_norm": 0.20285950601100922, "learning_rate": 4.262393896872909e-05, "loss": 0.4023, "step": 37950 }, { "epoch": 1.3678956283562187, "grad_norm": 0.16656413674354553, "learning_rate": 4.262186916386594e-05, "loss": 0.4094, "step": 37955 }, { "epoch": 1.3680758280174432, "grad_norm": 0.1727702021598816, "learning_rate": 4.261979911890822e-05, "loss": 0.3972, "step": 37960 }, { "epoch": 1.368256027678668, "grad_norm": 0.14835266768932343, "learning_rate": 4.261772883388416e-05, "loss": 0.4069, "step": 37965 }, { "epoch": 1.3684362273398927, "grad_norm": 0.22121186554431915, "learning_rate": 4.261565830882195e-05, "loss": 0.4128, "step": 37970 }, { "epoch": 
1.3686164270011172, "grad_norm": 0.15944576263427734, "learning_rate": 4.261358754374981e-05, "loss": 0.4228, "step": 37975 }, { "epoch": 1.368796626662342, "grad_norm": 0.19683369994163513, "learning_rate": 4.261151653869595e-05, "loss": 0.436, "step": 37980 }, { "epoch": 1.3689768263235664, "grad_norm": 0.16129469871520996, "learning_rate": 4.260944529368858e-05, "loss": 0.4114, "step": 37985 }, { "epoch": 1.3691570259847912, "grad_norm": 0.18280497193336487, "learning_rate": 4.260737380875593e-05, "loss": 0.4122, "step": 37990 }, { "epoch": 1.369337225646016, "grad_norm": 0.20879638195037842, "learning_rate": 4.260530208392622e-05, "loss": 0.3891, "step": 37995 }, { "epoch": 1.3695174253072404, "grad_norm": 0.21062108874320984, "learning_rate": 4.260323011922768e-05, "loss": 0.3986, "step": 38000 }, { "epoch": 1.3695174253072404, "eval_loss": 0.4474603235721588, "eval_runtime": 3.5425, "eval_samples_per_second": 28.229, "eval_steps_per_second": 7.057, "step": 38000 }, { "epoch": 1.369697624968465, "grad_norm": 0.22914621233940125, "learning_rate": 4.2601157914688535e-05, "loss": 0.418, "step": 38005 }, { "epoch": 1.3698778246296897, "grad_norm": 0.2014915496110916, "learning_rate": 4.259908547033703e-05, "loss": 0.4176, "step": 38010 }, { "epoch": 1.3700580242909144, "grad_norm": 0.18956822156906128, "learning_rate": 4.2597012786201384e-05, "loss": 0.414, "step": 38015 }, { "epoch": 1.370238223952139, "grad_norm": 0.18128299713134766, "learning_rate": 4.2594939862309845e-05, "loss": 0.4348, "step": 38020 }, { "epoch": 1.3704184236133636, "grad_norm": 0.22281454503536224, "learning_rate": 4.259286669869066e-05, "loss": 0.4026, "step": 38025 }, { "epoch": 1.3705986232745881, "grad_norm": 0.21276813745498657, "learning_rate": 4.259079329537208e-05, "loss": 0.4402, "step": 38030 }, { "epoch": 1.3707788229358129, "grad_norm": 0.14526169002056122, "learning_rate": 4.2588719652382336e-05, "loss": 0.4397, "step": 38035 }, { "epoch": 1.3709590225970376, "grad_norm": 
0.18278780579566956, "learning_rate": 4.25866457697497e-05, "loss": 0.393, "step": 38040 }, { "epoch": 1.3711392222582621, "grad_norm": 0.15444017946720123, "learning_rate": 4.2584571647502417e-05, "loss": 0.427, "step": 38045 }, { "epoch": 1.3713194219194869, "grad_norm": 0.1620875895023346, "learning_rate": 4.2582497285668746e-05, "loss": 0.405, "step": 38050 }, { "epoch": 1.3714996215807114, "grad_norm": 0.15654486417770386, "learning_rate": 4.258042268427695e-05, "loss": 0.3709, "step": 38055 }, { "epoch": 1.371679821241936, "grad_norm": 0.14211991429328918, "learning_rate": 4.257834784335531e-05, "loss": 0.4132, "step": 38060 }, { "epoch": 1.3718600209031608, "grad_norm": 0.17219579219818115, "learning_rate": 4.257627276293208e-05, "loss": 0.4233, "step": 38065 }, { "epoch": 1.3720402205643853, "grad_norm": 0.19686567783355713, "learning_rate": 4.257419744303553e-05, "loss": 0.4242, "step": 38070 }, { "epoch": 1.3722204202256099, "grad_norm": 0.1756589412689209, "learning_rate": 4.257212188369395e-05, "loss": 0.4201, "step": 38075 }, { "epoch": 1.3724006198868346, "grad_norm": 0.17567414045333862, "learning_rate": 4.257004608493561e-05, "loss": 0.4198, "step": 38080 }, { "epoch": 1.3725808195480593, "grad_norm": 0.2001543641090393, "learning_rate": 4.256797004678879e-05, "loss": 0.425, "step": 38085 }, { "epoch": 1.3727610192092838, "grad_norm": 0.20645958185195923, "learning_rate": 4.2565893769281787e-05, "loss": 0.4048, "step": 38090 }, { "epoch": 1.3729412188705086, "grad_norm": 0.18848280608654022, "learning_rate": 4.256381725244287e-05, "loss": 0.4125, "step": 38095 }, { "epoch": 1.373121418531733, "grad_norm": 0.16142655909061432, "learning_rate": 4.2561740496300353e-05, "loss": 0.3701, "step": 38100 }, { "epoch": 1.3733016181929578, "grad_norm": 0.17975199222564697, "learning_rate": 4.2559663500882515e-05, "loss": 0.4448, "step": 38105 }, { "epoch": 1.3734818178541826, "grad_norm": 0.16210007667541504, "learning_rate": 4.255758626621767e-05, "loss": 
0.4238, "step": 38110 }, { "epoch": 1.373662017515407, "grad_norm": 0.1924877017736435, "learning_rate": 4.2555508792334105e-05, "loss": 0.42, "step": 38115 }, { "epoch": 1.3738422171766316, "grad_norm": 0.18068957328796387, "learning_rate": 4.255343107926013e-05, "loss": 0.378, "step": 38120 }, { "epoch": 1.3740224168378563, "grad_norm": 0.21288029849529266, "learning_rate": 4.255135312702406e-05, "loss": 0.427, "step": 38125 }, { "epoch": 1.374202616499081, "grad_norm": 0.18913322687149048, "learning_rate": 4.25492749356542e-05, "loss": 0.4031, "step": 38130 }, { "epoch": 1.3743828161603056, "grad_norm": 0.1928529441356659, "learning_rate": 4.2547196505178866e-05, "loss": 0.3974, "step": 38135 }, { "epoch": 1.3745630158215303, "grad_norm": 0.186807319521904, "learning_rate": 4.254511783562638e-05, "loss": 0.4134, "step": 38140 }, { "epoch": 1.3747432154827548, "grad_norm": 0.19627200067043304, "learning_rate": 4.254303892702506e-05, "loss": 0.4457, "step": 38145 }, { "epoch": 1.3749234151439795, "grad_norm": 0.17085368931293488, "learning_rate": 4.254095977940323e-05, "loss": 0.4452, "step": 38150 }, { "epoch": 1.3751036148052043, "grad_norm": 0.15709124505519867, "learning_rate": 4.2538880392789214e-05, "loss": 0.4101, "step": 38155 }, { "epoch": 1.3752838144664288, "grad_norm": 0.2019108682870865, "learning_rate": 4.253680076721136e-05, "loss": 0.403, "step": 38160 }, { "epoch": 1.3754640141276535, "grad_norm": 0.17493942379951477, "learning_rate": 4.253472090269798e-05, "loss": 0.4093, "step": 38165 }, { "epoch": 1.375644213788878, "grad_norm": 0.18124589323997498, "learning_rate": 4.253264079927742e-05, "loss": 0.4191, "step": 38170 }, { "epoch": 1.3758244134501028, "grad_norm": 0.20440387725830078, "learning_rate": 4.253056045697803e-05, "loss": 0.4026, "step": 38175 }, { "epoch": 1.3760046131113273, "grad_norm": 0.2788972854614258, "learning_rate": 4.252847987582815e-05, "loss": 0.4271, "step": 38180 }, { "epoch": 1.376184812772552, "grad_norm": 
0.20534437894821167, "learning_rate": 4.252639905585613e-05, "loss": 0.4404, "step": 38185 }, { "epoch": 1.3763650124337765, "grad_norm": 0.18792025744915009, "learning_rate": 4.2524317997090304e-05, "loss": 0.427, "step": 38190 }, { "epoch": 1.3765452120950012, "grad_norm": 0.2014329433441162, "learning_rate": 4.2522236699559045e-05, "loss": 0.4211, "step": 38195 }, { "epoch": 1.376725411756226, "grad_norm": 0.1882011443376541, "learning_rate": 4.25201551632907e-05, "loss": 0.431, "step": 38200 }, { "epoch": 1.3769056114174505, "grad_norm": 0.1633293330669403, "learning_rate": 4.251807338831363e-05, "loss": 0.4322, "step": 38205 }, { "epoch": 1.3770858110786752, "grad_norm": 0.1637231707572937, "learning_rate": 4.2515991374656204e-05, "loss": 0.3997, "step": 38210 }, { "epoch": 1.3772660107398997, "grad_norm": 0.19281378388404846, "learning_rate": 4.251390912234679e-05, "loss": 0.4273, "step": 38215 }, { "epoch": 1.3774462104011245, "grad_norm": 0.18837355077266693, "learning_rate": 4.251182663141375e-05, "loss": 0.4359, "step": 38220 }, { "epoch": 1.3776264100623492, "grad_norm": 0.17957091331481934, "learning_rate": 4.2509743901885474e-05, "loss": 0.4057, "step": 38225 }, { "epoch": 1.3778066097235737, "grad_norm": 0.1847553253173828, "learning_rate": 4.2507660933790314e-05, "loss": 0.44, "step": 38230 }, { "epoch": 1.3779868093847982, "grad_norm": 0.16359250247478485, "learning_rate": 4.250557772715667e-05, "loss": 0.3645, "step": 38235 }, { "epoch": 1.378167009046023, "grad_norm": 0.18574312329292297, "learning_rate": 4.250349428201292e-05, "loss": 0.4006, "step": 38240 }, { "epoch": 1.3783472087072477, "grad_norm": 0.16163067519664764, "learning_rate": 4.250141059838745e-05, "loss": 0.4349, "step": 38245 }, { "epoch": 1.3785274083684722, "grad_norm": 0.15968047082424164, "learning_rate": 4.249932667630864e-05, "loss": 0.4231, "step": 38250 }, { "epoch": 1.378707608029697, "grad_norm": 0.17597581446170807, "learning_rate": 4.24972425158049e-05, "loss": 0.402, 
"step": 38255 }, { "epoch": 1.3788878076909215, "grad_norm": 0.19579049944877625, "learning_rate": 4.249515811690461e-05, "loss": 0.3995, "step": 38260 }, { "epoch": 1.3790680073521462, "grad_norm": 0.1438038945198059, "learning_rate": 4.249307347963619e-05, "loss": 0.4186, "step": 38265 }, { "epoch": 1.379248207013371, "grad_norm": 0.19650869071483612, "learning_rate": 4.249098860402802e-05, "loss": 0.4032, "step": 38270 }, { "epoch": 1.3794284066745954, "grad_norm": 0.16172637045383453, "learning_rate": 4.2488903490108524e-05, "loss": 0.4299, "step": 38275 }, { "epoch": 1.3796086063358202, "grad_norm": 0.21487955749034882, "learning_rate": 4.2486818137906095e-05, "loss": 0.4782, "step": 38280 }, { "epoch": 1.3797888059970447, "grad_norm": 0.20923011004924774, "learning_rate": 4.248473254744917e-05, "loss": 0.442, "step": 38285 }, { "epoch": 1.3799690056582694, "grad_norm": 0.15938104689121246, "learning_rate": 4.2482646718766136e-05, "loss": 0.4118, "step": 38290 }, { "epoch": 1.380149205319494, "grad_norm": 0.1880597025156021, "learning_rate": 4.2480560651885425e-05, "loss": 0.4324, "step": 38295 }, { "epoch": 1.3803294049807187, "grad_norm": 0.1785939335823059, "learning_rate": 4.247847434683546e-05, "loss": 0.4128, "step": 38300 }, { "epoch": 1.3805096046419432, "grad_norm": 0.2147594839334488, "learning_rate": 4.247638780364468e-05, "loss": 0.3871, "step": 38305 }, { "epoch": 1.380689804303168, "grad_norm": 0.13881224393844604, "learning_rate": 4.247430102234149e-05, "loss": 0.4052, "step": 38310 }, { "epoch": 1.3808700039643926, "grad_norm": 0.18304836750030518, "learning_rate": 4.2472214002954324e-05, "loss": 0.3941, "step": 38315 }, { "epoch": 1.3810502036256171, "grad_norm": 0.1778329461812973, "learning_rate": 4.247012674551163e-05, "loss": 0.4741, "step": 38320 }, { "epoch": 1.3812304032868419, "grad_norm": 0.18518294394016266, "learning_rate": 4.246803925004185e-05, "loss": 0.443, "step": 38325 }, { "epoch": 1.3814106029480664, "grad_norm": 
0.21796804666519165, "learning_rate": 4.2465951516573406e-05, "loss": 0.4183, "step": 38330 }, { "epoch": 1.3815908026092911, "grad_norm": 0.2137499749660492, "learning_rate": 4.246386354513475e-05, "loss": 0.4216, "step": 38335 }, { "epoch": 1.3817710022705159, "grad_norm": 0.18902204930782318, "learning_rate": 4.246177533575435e-05, "loss": 0.4524, "step": 38340 }, { "epoch": 1.3819512019317404, "grad_norm": 0.171752467751503, "learning_rate": 4.2459686888460635e-05, "loss": 0.4111, "step": 38345 }, { "epoch": 1.3821314015929649, "grad_norm": 0.15336576104164124, "learning_rate": 4.245759820328206e-05, "loss": 0.3991, "step": 38350 }, { "epoch": 1.3823116012541896, "grad_norm": 0.19525669515132904, "learning_rate": 4.2455509280247097e-05, "loss": 0.4658, "step": 38355 }, { "epoch": 1.3824918009154143, "grad_norm": 0.15848879516124725, "learning_rate": 4.2453420119384195e-05, "loss": 0.427, "step": 38360 }, { "epoch": 1.3826720005766389, "grad_norm": 0.16411955654621124, "learning_rate": 4.245133072072183e-05, "loss": 0.4179, "step": 38365 }, { "epoch": 1.3828522002378636, "grad_norm": 0.21341267228126526, "learning_rate": 4.244924108428846e-05, "loss": 0.4251, "step": 38370 }, { "epoch": 1.383032399899088, "grad_norm": 0.16351541876792908, "learning_rate": 4.2447151210112555e-05, "loss": 0.4354, "step": 38375 }, { "epoch": 1.3832125995603128, "grad_norm": 0.176330104470253, "learning_rate": 4.2445061098222596e-05, "loss": 0.4136, "step": 38380 }, { "epoch": 1.3833927992215376, "grad_norm": 0.16498027741909027, "learning_rate": 4.244297074864705e-05, "loss": 0.4119, "step": 38385 }, { "epoch": 1.383572998882762, "grad_norm": 0.20347841084003448, "learning_rate": 4.244088016141441e-05, "loss": 0.4565, "step": 38390 }, { "epoch": 1.3837531985439868, "grad_norm": 0.20916853845119476, "learning_rate": 4.2438789336553154e-05, "loss": 0.4209, "step": 38395 }, { "epoch": 1.3839333982052113, "grad_norm": 0.20273546874523163, "learning_rate": 4.2436698274091765e-05, 
"loss": 0.4065, "step": 38400 }, { "epoch": 1.384113597866436, "grad_norm": 0.15745113790035248, "learning_rate": 4.2434606974058756e-05, "loss": 0.3952, "step": 38405 }, { "epoch": 1.3842937975276606, "grad_norm": 0.20275966823101044, "learning_rate": 4.243251543648258e-05, "loss": 0.3806, "step": 38410 }, { "epoch": 1.3844739971888853, "grad_norm": 0.15785571932792664, "learning_rate": 4.243042366139177e-05, "loss": 0.4033, "step": 38415 }, { "epoch": 1.3846541968501098, "grad_norm": 0.19757528603076935, "learning_rate": 4.24283316488148e-05, "loss": 0.3708, "step": 38420 }, { "epoch": 1.3848343965113346, "grad_norm": 0.2014017403125763, "learning_rate": 4.24262393987802e-05, "loss": 0.4544, "step": 38425 }, { "epoch": 1.3850145961725593, "grad_norm": 0.20693495869636536, "learning_rate": 4.242414691131645e-05, "loss": 0.4509, "step": 38430 }, { "epoch": 1.3851947958337838, "grad_norm": 0.18149085342884064, "learning_rate": 4.242205418645208e-05, "loss": 0.4171, "step": 38435 }, { "epoch": 1.3853749954950085, "grad_norm": 0.2000938504934311, "learning_rate": 4.2419961224215595e-05, "loss": 0.3862, "step": 38440 }, { "epoch": 1.385555195156233, "grad_norm": 0.19279134273529053, "learning_rate": 4.2417868024635504e-05, "loss": 0.4, "step": 38445 }, { "epoch": 1.3857353948174578, "grad_norm": 0.18606650829315186, "learning_rate": 4.241577458774034e-05, "loss": 0.4044, "step": 38450 }, { "epoch": 1.3859155944786825, "grad_norm": 0.1808106005191803, "learning_rate": 4.241368091355862e-05, "loss": 0.4383, "step": 38455 }, { "epoch": 1.386095794139907, "grad_norm": 0.15027718245983124, "learning_rate": 4.241158700211886e-05, "loss": 0.4218, "step": 38460 }, { "epoch": 1.3862759938011315, "grad_norm": 0.15103080868721008, "learning_rate": 4.24094928534496e-05, "loss": 0.416, "step": 38465 }, { "epoch": 1.3864561934623563, "grad_norm": 0.1560763716697693, "learning_rate": 4.2407398467579376e-05, "loss": 0.4568, "step": 38470 }, { "epoch": 1.386636393123581, "grad_norm": 
0.17564137279987335, "learning_rate": 4.2405303844536714e-05, "loss": 0.3982, "step": 38475 }, { "epoch": 1.3868165927848055, "grad_norm": 0.1863342970609665, "learning_rate": 4.2403208984350164e-05, "loss": 0.4215, "step": 38480 }, { "epoch": 1.3869967924460302, "grad_norm": 0.2165493667125702, "learning_rate": 4.240111388704825e-05, "loss": 0.4043, "step": 38485 }, { "epoch": 1.3871769921072548, "grad_norm": 0.17703358829021454, "learning_rate": 4.2399018552659536e-05, "loss": 0.4201, "step": 38490 }, { "epoch": 1.3873571917684795, "grad_norm": 0.12427543848752975, "learning_rate": 4.239692298121256e-05, "loss": 0.4103, "step": 38495 }, { "epoch": 1.3875373914297042, "grad_norm": 0.161130890250206, "learning_rate": 4.239482717273587e-05, "loss": 0.4157, "step": 38500 }, { "epoch": 1.3875373914297042, "eval_loss": 0.44684794545173645, "eval_runtime": 3.5231, "eval_samples_per_second": 28.384, "eval_steps_per_second": 7.096, "step": 38500 }, { "epoch": 1.3877175910909287, "grad_norm": 0.20817585289478302, "learning_rate": 4.2392731127258037e-05, "loss": 0.4488, "step": 38505 }, { "epoch": 1.3878977907521532, "grad_norm": 0.17625221610069275, "learning_rate": 4.239063484480761e-05, "loss": 0.4266, "step": 38510 }, { "epoch": 1.388077990413378, "grad_norm": 0.2073509842157364, "learning_rate": 4.238853832541315e-05, "loss": 0.4415, "step": 38515 }, { "epoch": 1.3882581900746027, "grad_norm": 0.2082330286502838, "learning_rate": 4.238644156910322e-05, "loss": 0.388, "step": 38520 }, { "epoch": 1.3884383897358272, "grad_norm": 0.19567419588565826, "learning_rate": 4.238434457590639e-05, "loss": 0.4268, "step": 38525 }, { "epoch": 1.388618589397052, "grad_norm": 0.19679030776023865, "learning_rate": 4.238224734585123e-05, "loss": 0.4052, "step": 38530 }, { "epoch": 1.3887987890582765, "grad_norm": 0.219960555434227, "learning_rate": 4.238014987896631e-05, "loss": 0.4552, "step": 38535 }, { "epoch": 1.3889789887195012, "grad_norm": 0.16919073462486267, "learning_rate": 
4.2378052175280216e-05, "loss": 0.3934, "step": 38540 }, { "epoch": 1.389159188380726, "grad_norm": 0.15269456803798676, "learning_rate": 4.237595423482153e-05, "loss": 0.3468, "step": 38545 }, { "epoch": 1.3893393880419505, "grad_norm": 0.18729466199874878, "learning_rate": 4.237385605761883e-05, "loss": 0.4197, "step": 38550 }, { "epoch": 1.3895195877031752, "grad_norm": 0.19090360403060913, "learning_rate": 4.2371757643700705e-05, "loss": 0.4401, "step": 38555 }, { "epoch": 1.3896997873643997, "grad_norm": 0.17647488415241241, "learning_rate": 4.236965899309574e-05, "loss": 0.4282, "step": 38560 }, { "epoch": 1.3898799870256244, "grad_norm": 0.2209496647119522, "learning_rate": 4.236756010583254e-05, "loss": 0.4473, "step": 38565 }, { "epoch": 1.3900601866868492, "grad_norm": 0.16806313395500183, "learning_rate": 4.23654609819397e-05, "loss": 0.4122, "step": 38570 }, { "epoch": 1.3902403863480737, "grad_norm": 0.20954453945159912, "learning_rate": 4.236336162144581e-05, "loss": 0.4201, "step": 38575 }, { "epoch": 1.3904205860092982, "grad_norm": 0.14711293578147888, "learning_rate": 4.236126202437948e-05, "loss": 0.3762, "step": 38580 }, { "epoch": 1.390600785670523, "grad_norm": 0.23657748103141785, "learning_rate": 4.235916219076931e-05, "loss": 0.4146, "step": 38585 }, { "epoch": 1.3907809853317477, "grad_norm": 0.16697244346141815, "learning_rate": 4.235706212064392e-05, "loss": 0.4094, "step": 38590 }, { "epoch": 1.3909611849929722, "grad_norm": 0.18580500781536102, "learning_rate": 4.2354961814031924e-05, "loss": 0.402, "step": 38595 }, { "epoch": 1.391141384654197, "grad_norm": 0.1857820749282837, "learning_rate": 4.235286127096193e-05, "loss": 0.3989, "step": 38600 }, { "epoch": 1.3913215843154214, "grad_norm": 0.1920892894268036, "learning_rate": 4.235076049146257e-05, "loss": 0.4196, "step": 38605 }, { "epoch": 1.3915017839766461, "grad_norm": 0.18875955045223236, "learning_rate": 4.2348659475562445e-05, "loss": 0.4203, "step": 38610 }, { "epoch": 
1.3916819836378709, "grad_norm": 0.15751686692237854, "learning_rate": 4.23465582232902e-05, "loss": 0.4026, "step": 38615 }, { "epoch": 1.3918621832990954, "grad_norm": 0.1680629700422287, "learning_rate": 4.2344456734674454e-05, "loss": 0.3999, "step": 38620 }, { "epoch": 1.39204238296032, "grad_norm": 0.14273864030838013, "learning_rate": 4.234235500974384e-05, "loss": 0.427, "step": 38625 }, { "epoch": 1.3922225826215446, "grad_norm": 0.17425033450126648, "learning_rate": 4.2340253048527e-05, "loss": 0.4206, "step": 38630 }, { "epoch": 1.3924027822827694, "grad_norm": 0.14705787599086761, "learning_rate": 4.233815085105257e-05, "loss": 0.4026, "step": 38635 }, { "epoch": 1.3925829819439939, "grad_norm": 0.19145555794239044, "learning_rate": 4.233604841734919e-05, "loss": 0.3902, "step": 38640 }, { "epoch": 1.3927631816052186, "grad_norm": 0.20708268880844116, "learning_rate": 4.2333945747445516e-05, "loss": 0.4036, "step": 38645 }, { "epoch": 1.3929433812664431, "grad_norm": 0.17952348291873932, "learning_rate": 4.2331842841370175e-05, "loss": 0.4346, "step": 38650 }, { "epoch": 1.3931235809276679, "grad_norm": 0.21833153069019318, "learning_rate": 4.232973969915184e-05, "loss": 0.4475, "step": 38655 }, { "epoch": 1.3933037805888926, "grad_norm": 0.18720868229866028, "learning_rate": 4.232763632081915e-05, "loss": 0.3696, "step": 38660 }, { "epoch": 1.393483980250117, "grad_norm": 0.1900511085987091, "learning_rate": 4.232553270640077e-05, "loss": 0.3998, "step": 38665 }, { "epoch": 1.3936641799113418, "grad_norm": 0.19259464740753174, "learning_rate": 4.232342885592536e-05, "loss": 0.3862, "step": 38670 }, { "epoch": 1.3938443795725663, "grad_norm": 0.19600766897201538, "learning_rate": 4.232132476942159e-05, "loss": 0.3997, "step": 38675 }, { "epoch": 1.394024579233791, "grad_norm": 0.17231407761573792, "learning_rate": 4.231922044691813e-05, "loss": 0.406, "step": 38680 }, { "epoch": 1.3942047788950156, "grad_norm": 0.19340550899505615, "learning_rate": 
4.231711588844363e-05, "loss": 0.4058, "step": 38685 }, { "epoch": 1.3943849785562403, "grad_norm": 0.19323815405368805, "learning_rate": 4.2315011094026784e-05, "loss": 0.4092, "step": 38690 }, { "epoch": 1.3945651782174648, "grad_norm": 0.19978420436382294, "learning_rate": 4.231290606369627e-05, "loss": 0.442, "step": 38695 }, { "epoch": 1.3947453778786896, "grad_norm": 0.1691822111606598, "learning_rate": 4.2310800797480756e-05, "loss": 0.4045, "step": 38700 }, { "epoch": 1.3949255775399143, "grad_norm": 0.2136184573173523, "learning_rate": 4.230869529540894e-05, "loss": 0.3986, "step": 38705 }, { "epoch": 1.3951057772011388, "grad_norm": 0.1695156693458557, "learning_rate": 4.230658955750949e-05, "loss": 0.4319, "step": 38710 }, { "epoch": 1.3952859768623636, "grad_norm": 0.17145298421382904, "learning_rate": 4.230448358381112e-05, "loss": 0.3985, "step": 38715 }, { "epoch": 1.395466176523588, "grad_norm": 0.18987350165843964, "learning_rate": 4.2302377374342505e-05, "loss": 0.4378, "step": 38720 }, { "epoch": 1.3956463761848128, "grad_norm": 0.17236654460430145, "learning_rate": 4.2300270929132344e-05, "loss": 0.4128, "step": 38725 }, { "epoch": 1.3958265758460375, "grad_norm": 0.18958859145641327, "learning_rate": 4.229816424820935e-05, "loss": 0.4303, "step": 38730 }, { "epoch": 1.396006775507262, "grad_norm": 0.19442634284496307, "learning_rate": 4.229605733160221e-05, "loss": 0.4405, "step": 38735 }, { "epoch": 1.3961869751684866, "grad_norm": 0.16832728683948517, "learning_rate": 4.2293950179339645e-05, "loss": 0.4066, "step": 38740 }, { "epoch": 1.3963671748297113, "grad_norm": 0.20120170712471008, "learning_rate": 4.2291842791450356e-05, "loss": 0.4036, "step": 38745 }, { "epoch": 1.396547374490936, "grad_norm": 0.17381571233272552, "learning_rate": 4.2289735167963054e-05, "loss": 0.4033, "step": 38750 }, { "epoch": 1.3967275741521605, "grad_norm": 0.17229481041431427, "learning_rate": 4.228762730890645e-05, "loss": 0.4159, "step": 38755 }, { "epoch": 
1.3969077738133853, "grad_norm": 0.1683155596256256, "learning_rate": 4.228551921430928e-05, "loss": 0.3986, "step": 38760 }, { "epoch": 1.3970879734746098, "grad_norm": 0.1502419412136078, "learning_rate": 4.228341088420026e-05, "loss": 0.4202, "step": 38765 }, { "epoch": 1.3972681731358345, "grad_norm": 0.16682939231395721, "learning_rate": 4.2281302318608106e-05, "loss": 0.4485, "step": 38770 }, { "epoch": 1.3974483727970592, "grad_norm": 0.20583415031433105, "learning_rate": 4.227919351756155e-05, "loss": 0.4224, "step": 38775 }, { "epoch": 1.3976285724582838, "grad_norm": 0.1680290699005127, "learning_rate": 4.227708448108934e-05, "loss": 0.4034, "step": 38780 }, { "epoch": 1.3978087721195085, "grad_norm": 0.1613532453775406, "learning_rate": 4.227497520922019e-05, "loss": 0.4187, "step": 38785 }, { "epoch": 1.397988971780733, "grad_norm": 0.19061127305030823, "learning_rate": 4.2272865701982855e-05, "loss": 0.4305, "step": 38790 }, { "epoch": 1.3981691714419577, "grad_norm": 0.1994105577468872, "learning_rate": 4.227075595940606e-05, "loss": 0.4196, "step": 38795 }, { "epoch": 1.3983493711031822, "grad_norm": 0.16450746357440948, "learning_rate": 4.226864598151857e-05, "loss": 0.4488, "step": 38800 }, { "epoch": 1.398529570764407, "grad_norm": 0.15390661358833313, "learning_rate": 4.226653576834911e-05, "loss": 0.4042, "step": 38805 }, { "epoch": 1.3987097704256315, "grad_norm": 0.22952650487422943, "learning_rate": 4.226442531992645e-05, "loss": 0.4069, "step": 38810 }, { "epoch": 1.3988899700868562, "grad_norm": 0.18804705142974854, "learning_rate": 4.2262314636279334e-05, "loss": 0.3962, "step": 38815 }, { "epoch": 1.399070169748081, "grad_norm": 0.15264515578746796, "learning_rate": 4.226020371743653e-05, "loss": 0.4237, "step": 38820 }, { "epoch": 1.3992503694093055, "grad_norm": 0.18798717856407166, "learning_rate": 4.225809256342678e-05, "loss": 0.4252, "step": 38825 }, { "epoch": 1.3994305690705302, "grad_norm": 0.21027231216430664, "learning_rate": 
4.225598117427887e-05, "loss": 0.4366, "step": 38830 }, { "epoch": 1.3996107687317547, "grad_norm": 0.17129452526569366, "learning_rate": 4.225386955002155e-05, "loss": 0.4157, "step": 38835 }, { "epoch": 1.3997909683929795, "grad_norm": 0.19141530990600586, "learning_rate": 4.2251757690683604e-05, "loss": 0.3856, "step": 38840 }, { "epoch": 1.3999711680542042, "grad_norm": 0.1746709644794464, "learning_rate": 4.22496455962938e-05, "loss": 0.395, "step": 38845 }, { "epoch": 1.4001513677154287, "grad_norm": 0.20320837199687958, "learning_rate": 4.22475332668809e-05, "loss": 0.4106, "step": 38850 }, { "epoch": 1.4003315673766532, "grad_norm": 0.17430174350738525, "learning_rate": 4.224542070247371e-05, "loss": 0.4004, "step": 38855 }, { "epoch": 1.400511767037878, "grad_norm": 0.1601121872663498, "learning_rate": 4.224330790310101e-05, "loss": 0.4212, "step": 38860 }, { "epoch": 1.4006919666991027, "grad_norm": 0.16500531136989594, "learning_rate": 4.2241194868791565e-05, "loss": 0.3749, "step": 38865 }, { "epoch": 1.4008721663603272, "grad_norm": 0.18439029157161713, "learning_rate": 4.2239081599574184e-05, "loss": 0.4193, "step": 38870 }, { "epoch": 1.401052366021552, "grad_norm": 0.20299525558948517, "learning_rate": 4.223696809547766e-05, "loss": 0.4482, "step": 38875 }, { "epoch": 1.4012325656827764, "grad_norm": 0.17272146046161652, "learning_rate": 4.2234854356530776e-05, "loss": 0.4225, "step": 38880 }, { "epoch": 1.4014127653440012, "grad_norm": 0.1982017606496811, "learning_rate": 4.223274038276233e-05, "loss": 0.4437, "step": 38885 }, { "epoch": 1.401592965005226, "grad_norm": 0.16172081232070923, "learning_rate": 4.223062617420114e-05, "loss": 0.4323, "step": 38890 }, { "epoch": 1.4017731646664504, "grad_norm": 0.1717664897441864, "learning_rate": 4.2228511730876006e-05, "loss": 0.3999, "step": 38895 }, { "epoch": 1.4019533643276751, "grad_norm": 0.18911094963550568, "learning_rate": 4.2226397052815734e-05, "loss": 0.409, "step": 38900 }, { "epoch": 
1.4021335639888997, "grad_norm": 0.19404666125774384, "learning_rate": 4.2224282140049145e-05, "loss": 0.411, "step": 38905 }, { "epoch": 1.4023137636501244, "grad_norm": 0.19572977721691132, "learning_rate": 4.2222166992605037e-05, "loss": 0.4489, "step": 38910 }, { "epoch": 1.402493963311349, "grad_norm": 0.20973151922225952, "learning_rate": 4.2220051610512236e-05, "loss": 0.4269, "step": 38915 }, { "epoch": 1.4026741629725736, "grad_norm": 0.17148537933826447, "learning_rate": 4.2217935993799576e-05, "loss": 0.3622, "step": 38920 }, { "epoch": 1.4028543626337981, "grad_norm": 0.1789887696504593, "learning_rate": 4.221582014249586e-05, "loss": 0.436, "step": 38925 }, { "epoch": 1.4030345622950229, "grad_norm": 0.17625224590301514, "learning_rate": 4.2213704056629936e-05, "loss": 0.4053, "step": 38930 }, { "epoch": 1.4032147619562476, "grad_norm": 0.1674962043762207, "learning_rate": 4.2211587736230614e-05, "loss": 0.4552, "step": 38935 }, { "epoch": 1.4033949616174721, "grad_norm": 0.20774921774864197, "learning_rate": 4.220947118132676e-05, "loss": 0.4201, "step": 38940 }, { "epoch": 1.4035751612786969, "grad_norm": 0.15896476805210114, "learning_rate": 4.220735439194718e-05, "loss": 0.4324, "step": 38945 }, { "epoch": 1.4037553609399214, "grad_norm": 0.21336419880390167, "learning_rate": 4.220523736812073e-05, "loss": 0.4731, "step": 38950 }, { "epoch": 1.403935560601146, "grad_norm": 0.23092104494571686, "learning_rate": 4.220312010987626e-05, "loss": 0.422, "step": 38955 }, { "epoch": 1.4041157602623708, "grad_norm": 0.20718926191329956, "learning_rate": 4.22010026172426e-05, "loss": 0.394, "step": 38960 }, { "epoch": 1.4042959599235953, "grad_norm": 0.18056684732437134, "learning_rate": 4.219888489024861e-05, "loss": 0.421, "step": 38965 }, { "epoch": 1.4044761595848199, "grad_norm": 0.19858917593955994, "learning_rate": 4.2196766928923147e-05, "loss": 0.4036, "step": 38970 }, { "epoch": 1.4046563592460446, "grad_norm": 0.23689039051532745, "learning_rate": 
4.219464873329506e-05, "loss": 0.4278, "step": 38975 }, { "epoch": 1.4048365589072693, "grad_norm": 0.1857483685016632, "learning_rate": 4.219253030339322e-05, "loss": 0.4156, "step": 38980 }, { "epoch": 1.4050167585684938, "grad_norm": 0.1789952665567398, "learning_rate": 4.2190411639246474e-05, "loss": 0.4147, "step": 38985 }, { "epoch": 1.4051969582297186, "grad_norm": 0.21986453235149384, "learning_rate": 4.218829274088371e-05, "loss": 0.454, "step": 38990 }, { "epoch": 1.405377157890943, "grad_norm": 0.16250832378864288, "learning_rate": 4.2186173608333776e-05, "loss": 0.3945, "step": 38995 }, { "epoch": 1.4055573575521678, "grad_norm": 0.16607147455215454, "learning_rate": 4.2184054241625556e-05, "loss": 0.429, "step": 39000 }, { "epoch": 1.4055573575521678, "eval_loss": 0.4471230208873749, "eval_runtime": 3.5385, "eval_samples_per_second": 28.261, "eval_steps_per_second": 7.065, "step": 39000 }, { "epoch": 1.4057375572133926, "grad_norm": 0.18567611277103424, "learning_rate": 4.218193464078792e-05, "loss": 0.4589, "step": 39005 }, { "epoch": 1.405917756874617, "grad_norm": 0.14248013496398926, "learning_rate": 4.217981480584976e-05, "loss": 0.4067, "step": 39010 }, { "epoch": 1.4060979565358416, "grad_norm": 0.18082767724990845, "learning_rate": 4.217769473683994e-05, "loss": 0.4069, "step": 39015 }, { "epoch": 1.4062781561970663, "grad_norm": 0.2055099606513977, "learning_rate": 4.217557443378736e-05, "loss": 0.4208, "step": 39020 }, { "epoch": 1.406458355858291, "grad_norm": 0.20879048109054565, "learning_rate": 4.2173453896720906e-05, "loss": 0.3962, "step": 39025 }, { "epoch": 1.4066385555195156, "grad_norm": 0.18732579052448273, "learning_rate": 4.217133312566946e-05, "loss": 0.4423, "step": 39030 }, { "epoch": 1.4068187551807403, "grad_norm": 0.199151873588562, "learning_rate": 4.2169212120661926e-05, "loss": 0.4394, "step": 39035 }, { "epoch": 1.4069989548419648, "grad_norm": 0.15201795101165771, "learning_rate": 4.21670908817272e-05, "loss": 0.4055, 
"step": 39040 }, { "epoch": 1.4071791545031895, "grad_norm": 0.15921539068222046, "learning_rate": 4.2164969408894194e-05, "loss": 0.4533, "step": 39045 }, { "epoch": 1.4073593541644143, "grad_norm": 0.16528940200805664, "learning_rate": 4.21628477021918e-05, "loss": 0.4136, "step": 39050 }, { "epoch": 1.4075395538256388, "grad_norm": 0.20184743404388428, "learning_rate": 4.216072576164892e-05, "loss": 0.453, "step": 39055 }, { "epoch": 1.4077197534868635, "grad_norm": 0.2002110779285431, "learning_rate": 4.215860358729448e-05, "loss": 0.4098, "step": 39060 }, { "epoch": 1.407899953148088, "grad_norm": 0.16042189300060272, "learning_rate": 4.215648117915739e-05, "loss": 0.3906, "step": 39065 }, { "epoch": 1.4080801528093128, "grad_norm": 0.21840621531009674, "learning_rate": 4.2154358537266564e-05, "loss": 0.4053, "step": 39070 }, { "epoch": 1.4082603524705375, "grad_norm": 0.15878549218177795, "learning_rate": 4.2152235661650925e-05, "loss": 0.4222, "step": 39075 }, { "epoch": 1.408440552131762, "grad_norm": 0.17753714323043823, "learning_rate": 4.215011255233939e-05, "loss": 0.4452, "step": 39080 }, { "epoch": 1.4086207517929865, "grad_norm": 0.1962030678987503, "learning_rate": 4.2147989209360903e-05, "loss": 0.4588, "step": 39085 }, { "epoch": 1.4088009514542112, "grad_norm": 0.17643392086029053, "learning_rate": 4.2145865632744376e-05, "loss": 0.4102, "step": 39090 }, { "epoch": 1.408981151115436, "grad_norm": 0.2033703774213791, "learning_rate": 4.214374182251874e-05, "loss": 0.3898, "step": 39095 }, { "epoch": 1.4091613507766605, "grad_norm": 0.2141045182943344, "learning_rate": 4.214161777871296e-05, "loss": 0.4343, "step": 39100 }, { "epoch": 1.4093415504378852, "grad_norm": 0.19499428570270538, "learning_rate": 4.213949350135594e-05, "loss": 0.4263, "step": 39105 }, { "epoch": 1.4095217500991097, "grad_norm": 0.1868494302034378, "learning_rate": 4.2137368990476655e-05, "loss": 0.4148, "step": 39110 }, { "epoch": 1.4097019497603345, "grad_norm": 
0.16455568373203278, "learning_rate": 4.213524424610402e-05, "loss": 0.3985, "step": 39115 }, { "epoch": 1.4098821494215592, "grad_norm": 0.23864524066448212, "learning_rate": 4.213311926826701e-05, "loss": 0.4044, "step": 39120 }, { "epoch": 1.4100623490827837, "grad_norm": 0.20035329461097717, "learning_rate": 4.213099405699457e-05, "loss": 0.3905, "step": 39125 }, { "epoch": 1.4102425487440082, "grad_norm": 0.18260158598423004, "learning_rate": 4.212886861231564e-05, "loss": 0.4173, "step": 39130 }, { "epoch": 1.410422748405233, "grad_norm": 0.14662878215312958, "learning_rate": 4.212674293425919e-05, "loss": 0.3954, "step": 39135 }, { "epoch": 1.4106029480664577, "grad_norm": 0.2082986682653427, "learning_rate": 4.2124617022854195e-05, "loss": 0.4102, "step": 39140 }, { "epoch": 1.4107831477276822, "grad_norm": 0.19178947806358337, "learning_rate": 4.212249087812961e-05, "loss": 0.4261, "step": 39145 }, { "epoch": 1.410963347388907, "grad_norm": 0.18366897106170654, "learning_rate": 4.2120364500114394e-05, "loss": 0.3988, "step": 39150 }, { "epoch": 1.4111435470501315, "grad_norm": 0.20114761590957642, "learning_rate": 4.2118237888837534e-05, "loss": 0.4235, "step": 39155 }, { "epoch": 1.4113237467113562, "grad_norm": 0.17763690650463104, "learning_rate": 4.2116111044327984e-05, "loss": 0.4264, "step": 39160 }, { "epoch": 1.411503946372581, "grad_norm": 0.2246585637331009, "learning_rate": 4.2113983966614745e-05, "loss": 0.4089, "step": 39165 }, { "epoch": 1.4116841460338054, "grad_norm": 0.19135521352291107, "learning_rate": 4.211185665572678e-05, "loss": 0.4371, "step": 39170 }, { "epoch": 1.4118643456950302, "grad_norm": 0.19484780728816986, "learning_rate": 4.2109729111693085e-05, "loss": 0.3799, "step": 39175 }, { "epoch": 1.4120445453562547, "grad_norm": 0.16387687623500824, "learning_rate": 4.210760133454265e-05, "loss": 0.3739, "step": 39180 }, { "epoch": 1.4122247450174794, "grad_norm": 0.169037863612175, "learning_rate": 4.210547332430446e-05, "loss": 
0.4141, "step": 39185 }, { "epoch": 1.412404944678704, "grad_norm": 0.2004702091217041, "learning_rate": 4.21033450810075e-05, "loss": 0.4108, "step": 39190 }, { "epoch": 1.4125851443399287, "grad_norm": 0.16014528274536133, "learning_rate": 4.210121660468077e-05, "loss": 0.3964, "step": 39195 }, { "epoch": 1.4127653440011532, "grad_norm": 0.16173921525478363, "learning_rate": 4.209908789535328e-05, "loss": 0.4055, "step": 39200 }, { "epoch": 1.412945543662378, "grad_norm": 0.1848604828119278, "learning_rate": 4.209695895305403e-05, "loss": 0.4002, "step": 39205 }, { "epoch": 1.4131257433236026, "grad_norm": 0.14384835958480835, "learning_rate": 4.209482977781202e-05, "loss": 0.4028, "step": 39210 }, { "epoch": 1.4133059429848271, "grad_norm": 0.15797820687294006, "learning_rate": 4.209270036965627e-05, "loss": 0.4436, "step": 39215 }, { "epoch": 1.4134861426460519, "grad_norm": 0.17530421912670135, "learning_rate": 4.2090570728615774e-05, "loss": 0.4092, "step": 39220 }, { "epoch": 1.4136663423072764, "grad_norm": 0.17297030985355377, "learning_rate": 4.208844085471957e-05, "loss": 0.4609, "step": 39225 }, { "epoch": 1.4138465419685011, "grad_norm": 0.1550433486700058, "learning_rate": 4.2086310747996674e-05, "loss": 0.4124, "step": 39230 }, { "epoch": 1.4140267416297259, "grad_norm": 0.19032645225524902, "learning_rate": 4.2084180408476094e-05, "loss": 0.4298, "step": 39235 }, { "epoch": 1.4142069412909504, "grad_norm": 0.18603280186653137, "learning_rate": 4.208204983618687e-05, "loss": 0.4064, "step": 39240 }, { "epoch": 1.4143871409521749, "grad_norm": 0.21045638620853424, "learning_rate": 4.2079919031158014e-05, "loss": 0.435, "step": 39245 }, { "epoch": 1.4145673406133996, "grad_norm": 0.22773757576942444, "learning_rate": 4.2077787993418574e-05, "loss": 0.4148, "step": 39250 }, { "epoch": 1.4147475402746243, "grad_norm": 0.22381030023097992, "learning_rate": 4.2075656722997583e-05, "loss": 0.3881, "step": 39255 }, { "epoch": 1.4149277399358489, "grad_norm": 
0.2066371738910675, "learning_rate": 4.207352521992407e-05, "loss": 0.4107, "step": 39260 }, { "epoch": 1.4151079395970736, "grad_norm": 0.1946861892938614, "learning_rate": 4.207139348422708e-05, "loss": 0.4056, "step": 39265 }, { "epoch": 1.415288139258298, "grad_norm": 0.16008664667606354, "learning_rate": 4.2069261515935656e-05, "loss": 0.4325, "step": 39270 }, { "epoch": 1.4154683389195228, "grad_norm": 0.20454731583595276, "learning_rate": 4.206712931507886e-05, "loss": 0.3848, "step": 39275 }, { "epoch": 1.4156485385807476, "grad_norm": 0.19149497151374817, "learning_rate": 4.206499688168572e-05, "loss": 0.4211, "step": 39280 }, { "epoch": 1.415828738241972, "grad_norm": 0.18504855036735535, "learning_rate": 4.2062864215785304e-05, "loss": 0.4501, "step": 39285 }, { "epoch": 1.4160089379031968, "grad_norm": 0.18371212482452393, "learning_rate": 4.206073131740668e-05, "loss": 0.4172, "step": 39290 }, { "epoch": 1.4161891375644213, "grad_norm": 0.17143423855304718, "learning_rate": 4.205859818657888e-05, "loss": 0.3977, "step": 39295 }, { "epoch": 1.416369337225646, "grad_norm": 0.18957088887691498, "learning_rate": 4.205646482333098e-05, "loss": 0.4387, "step": 39300 }, { "epoch": 1.4165495368868706, "grad_norm": 0.18221431970596313, "learning_rate": 4.205433122769206e-05, "loss": 0.3961, "step": 39305 }, { "epoch": 1.4167297365480953, "grad_norm": 0.1832798719406128, "learning_rate": 4.2052197399691174e-05, "loss": 0.4188, "step": 39310 }, { "epoch": 1.4169099362093198, "grad_norm": 0.16603875160217285, "learning_rate": 4.205006333935739e-05, "loss": 0.3877, "step": 39315 }, { "epoch": 1.4170901358705446, "grad_norm": 0.17114681005477905, "learning_rate": 4.204792904671981e-05, "loss": 0.4252, "step": 39320 }, { "epoch": 1.4172703355317693, "grad_norm": 0.16899357736110687, "learning_rate": 4.204579452180749e-05, "loss": 0.392, "step": 39325 }, { "epoch": 1.4174505351929938, "grad_norm": 0.20568254590034485, "learning_rate": 4.2043659764649527e-05, "loss": 
0.4218, "step": 39330 }, { "epoch": 1.4176307348542185, "grad_norm": 0.217452272772789, "learning_rate": 4.2041524775274985e-05, "loss": 0.3834, "step": 39335 }, { "epoch": 1.417810934515443, "grad_norm": 0.18286210298538208, "learning_rate": 4.2039389553712986e-05, "loss": 0.4243, "step": 39340 }, { "epoch": 1.4179911341766678, "grad_norm": 0.176735058426857, "learning_rate": 4.2037254099992584e-05, "loss": 0.392, "step": 39345 }, { "epoch": 1.4181713338378925, "grad_norm": 0.18584541976451874, "learning_rate": 4.2035118414142905e-05, "loss": 0.415, "step": 39350 }, { "epoch": 1.418351533499117, "grad_norm": 0.20618844032287598, "learning_rate": 4.203298249619303e-05, "loss": 0.4303, "step": 39355 }, { "epoch": 1.4185317331603415, "grad_norm": 0.1963784545660019, "learning_rate": 4.203084634617207e-05, "loss": 0.418, "step": 39360 }, { "epoch": 1.4187119328215663, "grad_norm": 0.19487901031970978, "learning_rate": 4.202870996410913e-05, "loss": 0.4129, "step": 39365 }, { "epoch": 1.418892132482791, "grad_norm": 0.19040003418922424, "learning_rate": 4.2026573350033304e-05, "loss": 0.4042, "step": 39370 }, { "epoch": 1.4190723321440155, "grad_norm": 0.21092675626277924, "learning_rate": 4.2024436503973716e-05, "loss": 0.4216, "step": 39375 }, { "epoch": 1.4192525318052402, "grad_norm": 0.17777122557163239, "learning_rate": 4.2022299425959476e-05, "loss": 0.4004, "step": 39380 }, { "epoch": 1.4194327314664648, "grad_norm": 0.1416267305612564, "learning_rate": 4.2020162116019706e-05, "loss": 0.3728, "step": 39385 }, { "epoch": 1.4196129311276895, "grad_norm": 0.1654488891363144, "learning_rate": 4.2018024574183525e-05, "loss": 0.412, "step": 39390 }, { "epoch": 1.4197931307889142, "grad_norm": 0.19146573543548584, "learning_rate": 4.2015886800480044e-05, "loss": 0.393, "step": 39395 }, { "epoch": 1.4199733304501387, "grad_norm": 0.17153365910053253, "learning_rate": 4.20137487949384e-05, "loss": 0.4341, "step": 39400 }, { "epoch": 1.4201535301113635, "grad_norm": 
0.24907280504703522, "learning_rate": 4.201161055758773e-05, "loss": 0.4512, "step": 39405 }, { "epoch": 1.420333729772588, "grad_norm": 0.1950104534626007, "learning_rate": 4.200947208845716e-05, "loss": 0.438, "step": 39410 }, { "epoch": 1.4205139294338127, "grad_norm": 0.16665388643741608, "learning_rate": 4.200733338757582e-05, "loss": 0.4124, "step": 39415 }, { "epoch": 1.4206941290950372, "grad_norm": 0.18930195271968842, "learning_rate": 4.2005194454972864e-05, "loss": 0.4393, "step": 39420 }, { "epoch": 1.420874328756262, "grad_norm": 0.1515640914440155, "learning_rate": 4.2003055290677416e-05, "loss": 0.3813, "step": 39425 }, { "epoch": 1.4210545284174865, "grad_norm": 0.14799198508262634, "learning_rate": 4.200091589471863e-05, "loss": 0.4036, "step": 39430 }, { "epoch": 1.4212347280787112, "grad_norm": 0.18082498013973236, "learning_rate": 4.199877626712567e-05, "loss": 0.4354, "step": 39435 }, { "epoch": 1.421414927739936, "grad_norm": 0.1811361163854599, "learning_rate": 4.199663640792767e-05, "loss": 0.4331, "step": 39440 }, { "epoch": 1.4215951274011605, "grad_norm": 0.1772868037223816, "learning_rate": 4.199449631715378e-05, "loss": 0.3759, "step": 39445 }, { "epoch": 1.4217753270623852, "grad_norm": 0.17968173325061798, "learning_rate": 4.1992355994833175e-05, "loss": 0.3889, "step": 39450 }, { "epoch": 1.4219555267236097, "grad_norm": 0.2115190178155899, "learning_rate": 4.199021544099501e-05, "loss": 0.3891, "step": 39455 }, { "epoch": 1.4221357263848344, "grad_norm": 0.2003733217716217, "learning_rate": 4.1988074655668446e-05, "loss": 0.4274, "step": 39460 }, { "epoch": 1.4223159260460592, "grad_norm": 0.1704542636871338, "learning_rate": 4.1985933638882655e-05, "loss": 0.4479, "step": 39465 }, { "epoch": 1.4224961257072837, "grad_norm": 0.18098214268684387, "learning_rate": 4.198379239066681e-05, "loss": 0.4312, "step": 39470 }, { "epoch": 1.4226763253685082, "grad_norm": 0.17561987042427063, "learning_rate": 4.198165091105007e-05, "loss": 
0.396, "step": 39475 }, { "epoch": 1.422856525029733, "grad_norm": 0.14922846853733063, "learning_rate": 4.197950920006164e-05, "loss": 0.4461, "step": 39480 }, { "epoch": 1.4230367246909577, "grad_norm": 0.189297616481781, "learning_rate": 4.197736725773068e-05, "loss": 0.3796, "step": 39485 }, { "epoch": 1.4232169243521822, "grad_norm": 0.18948666751384735, "learning_rate": 4.197522508408637e-05, "loss": 0.4309, "step": 39490 }, { "epoch": 1.423397124013407, "grad_norm": 0.21630705893039703, "learning_rate": 4.197308267915791e-05, "loss": 0.4242, "step": 39495 }, { "epoch": 1.4235773236746314, "grad_norm": 0.2401585429906845, "learning_rate": 4.1970940042974485e-05, "loss": 0.4483, "step": 39500 }, { "epoch": 1.4235773236746314, "eval_loss": 0.44728443026542664, "eval_runtime": 3.5187, "eval_samples_per_second": 28.419, "eval_steps_per_second": 7.105, "step": 39500 }, { "epoch": 1.4237575233358561, "grad_norm": 0.2015465348958969, "learning_rate": 4.196879717556529e-05, "loss": 0.4352, "step": 39505 }, { "epoch": 1.4239377229970809, "grad_norm": 0.17503204941749573, "learning_rate": 4.1966654076959516e-05, "loss": 0.3889, "step": 39510 }, { "epoch": 1.4241179226583054, "grad_norm": 0.15715694427490234, "learning_rate": 4.196451074718637e-05, "loss": 0.4131, "step": 39515 }, { "epoch": 1.42429812231953, "grad_norm": 0.20377200841903687, "learning_rate": 4.196236718627504e-05, "loss": 0.4144, "step": 39520 }, { "epoch": 1.4244783219807546, "grad_norm": 0.18509119749069214, "learning_rate": 4.196022339425475e-05, "loss": 0.4244, "step": 39525 }, { "epoch": 1.4246585216419794, "grad_norm": 0.1515013426542282, "learning_rate": 4.195807937115469e-05, "loss": 0.4429, "step": 39530 }, { "epoch": 1.4248387213032039, "grad_norm": 0.2172863632440567, "learning_rate": 4.1955935117004095e-05, "loss": 0.3957, "step": 39535 }, { "epoch": 1.4250189209644286, "grad_norm": 0.16076506674289703, "learning_rate": 4.1953790631832156e-05, "loss": 0.426, "step": 39540 }, { "epoch": 
1.4251991206256531, "grad_norm": 0.24314238131046295, "learning_rate": 4.1951645915668105e-05, "loss": 0.4287, "step": 39545 }, { "epoch": 1.4253793202868779, "grad_norm": 0.22566382586956024, "learning_rate": 4.1949500968541154e-05, "loss": 0.4242, "step": 39550 }, { "epoch": 1.4255595199481026, "grad_norm": 0.1787545531988144, "learning_rate": 4.194735579048055e-05, "loss": 0.4171, "step": 39555 }, { "epoch": 1.425739719609327, "grad_norm": 0.142437145113945, "learning_rate": 4.1945210381515485e-05, "loss": 0.4205, "step": 39560 }, { "epoch": 1.4259199192705518, "grad_norm": 0.16321620345115662, "learning_rate": 4.194306474167522e-05, "loss": 0.4164, "step": 39565 }, { "epoch": 1.4261001189317764, "grad_norm": 0.24486680328845978, "learning_rate": 4.1940918870988976e-05, "loss": 0.4363, "step": 39570 }, { "epoch": 1.426280318593001, "grad_norm": 0.23074302077293396, "learning_rate": 4.1938772769486e-05, "loss": 0.4505, "step": 39575 }, { "epoch": 1.4264605182542258, "grad_norm": 0.1574547439813614, "learning_rate": 4.193662643719552e-05, "loss": 0.3959, "step": 39580 }, { "epoch": 1.4266407179154503, "grad_norm": 0.18195997178554535, "learning_rate": 4.193447987414678e-05, "loss": 0.414, "step": 39585 }, { "epoch": 1.4268209175766748, "grad_norm": 0.19354818761348724, "learning_rate": 4.1932333080369036e-05, "loss": 0.4372, "step": 39590 }, { "epoch": 1.4270011172378996, "grad_norm": 0.1999470740556717, "learning_rate": 4.1930186055891525e-05, "loss": 0.4069, "step": 39595 }, { "epoch": 1.4271813168991243, "grad_norm": 0.18640463054180145, "learning_rate": 4.1928038800743504e-05, "loss": 0.4398, "step": 39600 }, { "epoch": 1.4273615165603488, "grad_norm": 0.18737342953681946, "learning_rate": 4.192589131495424e-05, "loss": 0.4102, "step": 39605 }, { "epoch": 1.4275417162215736, "grad_norm": 0.16932520270347595, "learning_rate": 4.1923743598552984e-05, "loss": 0.4147, "step": 39610 }, { "epoch": 1.427721915882798, "grad_norm": 0.14350475370883942, "learning_rate": 
4.192159565156899e-05, "loss": 0.4086, "step": 39615 }, { "epoch": 1.4279021155440228, "grad_norm": 0.19839464128017426, "learning_rate": 4.1919447474031546e-05, "loss": 0.4067, "step": 39620 }, { "epoch": 1.4280823152052475, "grad_norm": 0.17800691723823547, "learning_rate": 4.19172990659699e-05, "loss": 0.4202, "step": 39625 }, { "epoch": 1.428262514866472, "grad_norm": 0.19023603200912476, "learning_rate": 4.191515042741332e-05, "loss": 0.394, "step": 39630 }, { "epoch": 1.4284427145276966, "grad_norm": 0.14899520576000214, "learning_rate": 4.1913001558391095e-05, "loss": 0.4398, "step": 39635 }, { "epoch": 1.4286229141889213, "grad_norm": 0.1971236616373062, "learning_rate": 4.19108524589325e-05, "loss": 0.407, "step": 39640 }, { "epoch": 1.428803113850146, "grad_norm": 0.16257238388061523, "learning_rate": 4.190870312906682e-05, "loss": 0.4173, "step": 39645 }, { "epoch": 1.4289833135113705, "grad_norm": 0.19401347637176514, "learning_rate": 4.190655356882332e-05, "loss": 0.4268, "step": 39650 }, { "epoch": 1.4291635131725953, "grad_norm": 0.16988852620124817, "learning_rate": 4.1904403778231313e-05, "loss": 0.4242, "step": 39655 }, { "epoch": 1.4293437128338198, "grad_norm": 0.16938932240009308, "learning_rate": 4.1902253757320074e-05, "loss": 0.4149, "step": 39660 }, { "epoch": 1.4295239124950445, "grad_norm": 0.2068401277065277, "learning_rate": 4.190010350611889e-05, "loss": 0.4358, "step": 39665 }, { "epoch": 1.4297041121562692, "grad_norm": 0.1826406568288803, "learning_rate": 4.1897953024657084e-05, "loss": 0.4312, "step": 39670 }, { "epoch": 1.4298843118174938, "grad_norm": 0.17122584581375122, "learning_rate": 4.189580231296393e-05, "loss": 0.44, "step": 39675 }, { "epoch": 1.4300645114787185, "grad_norm": 0.18926846981048584, "learning_rate": 4.1893651371068743e-05, "loss": 0.4346, "step": 39680 }, { "epoch": 1.430244711139943, "grad_norm": 0.16383080184459686, "learning_rate": 4.1891500199000827e-05, "loss": 0.3993, "step": 39685 }, { "epoch": 
1.4304249108011677, "grad_norm": 0.1707577407360077, "learning_rate": 4.1889348796789484e-05, "loss": 0.3975, "step": 39690 }, { "epoch": 1.4306051104623922, "grad_norm": 0.20824764668941498, "learning_rate": 4.188719716446404e-05, "loss": 0.417, "step": 39695 }, { "epoch": 1.430785310123617, "grad_norm": 0.15967229008674622, "learning_rate": 4.188504530205381e-05, "loss": 0.404, "step": 39700 }, { "epoch": 1.4309655097848415, "grad_norm": 0.18324309587478638, "learning_rate": 4.1882893209588104e-05, "loss": 0.4453, "step": 39705 }, { "epoch": 1.4311457094460662, "grad_norm": 0.1739649921655655, "learning_rate": 4.188074088709624e-05, "loss": 0.4023, "step": 39710 }, { "epoch": 1.431325909107291, "grad_norm": 0.17934979498386383, "learning_rate": 4.187858833460755e-05, "loss": 0.4329, "step": 39715 }, { "epoch": 1.4315061087685155, "grad_norm": 0.19043073058128357, "learning_rate": 4.187643555215137e-05, "loss": 0.3626, "step": 39720 }, { "epoch": 1.4316863084297402, "grad_norm": 0.1956600397825241, "learning_rate": 4.187428253975702e-05, "loss": 0.4218, "step": 39725 }, { "epoch": 1.4318665080909647, "grad_norm": 0.16748665273189545, "learning_rate": 4.1872129297453835e-05, "loss": 0.4228, "step": 39730 }, { "epoch": 1.4320467077521895, "grad_norm": 0.24490876495838165, "learning_rate": 4.186997582527115e-05, "loss": 0.4301, "step": 39735 }, { "epoch": 1.4322269074134142, "grad_norm": 0.19111377000808716, "learning_rate": 4.186782212323832e-05, "loss": 0.4255, "step": 39740 }, { "epoch": 1.4324071070746387, "grad_norm": 0.236825630068779, "learning_rate": 4.186566819138467e-05, "loss": 0.4159, "step": 39745 }, { "epoch": 1.4325873067358632, "grad_norm": 0.17333528399467468, "learning_rate": 4.186351402973956e-05, "loss": 0.4423, "step": 39750 }, { "epoch": 1.432767506397088, "grad_norm": 0.15992626547813416, "learning_rate": 4.1861359638332343e-05, "loss": 0.4119, "step": 39755 }, { "epoch": 1.4329477060583127, "grad_norm": 0.18964506685733795, "learning_rate": 
4.185920501719236e-05, "loss": 0.4394, "step": 39760 }, { "epoch": 1.4331279057195372, "grad_norm": 0.1828775703907013, "learning_rate": 4.185705016634897e-05, "loss": 0.4115, "step": 39765 }, { "epoch": 1.433308105380762, "grad_norm": 0.17985029518604279, "learning_rate": 4.1854895085831533e-05, "loss": 0.4361, "step": 39770 }, { "epoch": 1.4334883050419864, "grad_norm": 0.19158676266670227, "learning_rate": 4.1852739775669425e-05, "loss": 0.4207, "step": 39775 }, { "epoch": 1.4336685047032112, "grad_norm": 0.1590370237827301, "learning_rate": 4.185058423589199e-05, "loss": 0.4402, "step": 39780 }, { "epoch": 1.433848704364436, "grad_norm": 0.17683321237564087, "learning_rate": 4.184842846652861e-05, "loss": 0.4283, "step": 39785 }, { "epoch": 1.4340289040256604, "grad_norm": 0.16492830216884613, "learning_rate": 4.1846272467608655e-05, "loss": 0.3904, "step": 39790 }, { "epoch": 1.4342091036868851, "grad_norm": 0.22784380614757538, "learning_rate": 4.18441162391615e-05, "loss": 0.4632, "step": 39795 }, { "epoch": 1.4343893033481097, "grad_norm": 0.17077568173408508, "learning_rate": 4.184195978121652e-05, "loss": 0.4248, "step": 39800 }, { "epoch": 1.4345695030093344, "grad_norm": 0.17400763928890228, "learning_rate": 4.1839803093803106e-05, "loss": 0.4172, "step": 39805 }, { "epoch": 1.434749702670559, "grad_norm": 0.1851196140050888, "learning_rate": 4.183764617695063e-05, "loss": 0.4743, "step": 39810 }, { "epoch": 1.4349299023317836, "grad_norm": 0.16248086094856262, "learning_rate": 4.183548903068848e-05, "loss": 0.4334, "step": 39815 }, { "epoch": 1.4351101019930081, "grad_norm": 0.19981388747692108, "learning_rate": 4.1833331655046055e-05, "loss": 0.4594, "step": 39820 }, { "epoch": 1.4352903016542329, "grad_norm": 0.21425208449363708, "learning_rate": 4.1831174050052745e-05, "loss": 0.4465, "step": 39825 }, { "epoch": 1.4354705013154576, "grad_norm": 0.16509120166301727, "learning_rate": 4.182901621573795e-05, "loss": 0.4481, "step": 39830 }, { "epoch": 
1.4356507009766821, "grad_norm": 0.18470360338687897, "learning_rate": 4.1826858152131064e-05, "loss": 0.4352, "step": 39835 }, { "epoch": 1.4358309006379069, "grad_norm": 0.1776283085346222, "learning_rate": 4.182469985926149e-05, "loss": 0.3918, "step": 39840 }, { "epoch": 1.4360111002991314, "grad_norm": 0.16285589337348938, "learning_rate": 4.1822541337158646e-05, "loss": 0.4151, "step": 39845 }, { "epoch": 1.436191299960356, "grad_norm": 0.15581516921520233, "learning_rate": 4.1820382585851925e-05, "loss": 0.4004, "step": 39850 }, { "epoch": 1.4363714996215808, "grad_norm": 0.15109944343566895, "learning_rate": 4.1818223605370756e-05, "loss": 0.4049, "step": 39855 }, { "epoch": 1.4365516992828053, "grad_norm": 0.1799648404121399, "learning_rate": 4.1816064395744536e-05, "loss": 0.4167, "step": 39860 }, { "epoch": 1.4367318989440299, "grad_norm": 0.19493626058101654, "learning_rate": 4.181390495700271e-05, "loss": 0.4467, "step": 39865 }, { "epoch": 1.4369120986052546, "grad_norm": 0.21801850199699402, "learning_rate": 4.181174528917468e-05, "loss": 0.4298, "step": 39870 }, { "epoch": 1.4370922982664793, "grad_norm": 0.24591484665870667, "learning_rate": 4.1809585392289865e-05, "loss": 0.4069, "step": 39875 }, { "epoch": 1.4372724979277038, "grad_norm": 0.16199952363967896, "learning_rate": 4.180742526637771e-05, "loss": 0.4273, "step": 39880 }, { "epoch": 1.4374526975889286, "grad_norm": 0.16908341646194458, "learning_rate": 4.180526491146764e-05, "loss": 0.4104, "step": 39885 }, { "epoch": 1.437632897250153, "grad_norm": 0.18602709472179413, "learning_rate": 4.180310432758908e-05, "loss": 0.4457, "step": 39890 }, { "epoch": 1.4378130969113778, "grad_norm": 0.21784497797489166, "learning_rate": 4.1800943514771486e-05, "loss": 0.4454, "step": 39895 }, { "epoch": 1.4379932965726026, "grad_norm": 0.166326642036438, "learning_rate": 4.179878247304429e-05, "loss": 0.4221, "step": 39900 }, { "epoch": 1.438173496233827, "grad_norm": 0.17365773022174835, 
"learning_rate": 4.1796621202436934e-05, "loss": 0.3897, "step": 39905 }, { "epoch": 1.4383536958950518, "grad_norm": 0.231038898229599, "learning_rate": 4.179445970297887e-05, "loss": 0.4301, "step": 39910 }, { "epoch": 1.4385338955562763, "grad_norm": 0.19160155951976776, "learning_rate": 4.179229797469954e-05, "loss": 0.3991, "step": 39915 }, { "epoch": 1.438714095217501, "grad_norm": 0.1742529571056366, "learning_rate": 4.179013601762839e-05, "loss": 0.4099, "step": 39920 }, { "epoch": 1.4388942948787256, "grad_norm": 0.17680296301841736, "learning_rate": 4.17879738317949e-05, "loss": 0.4349, "step": 39925 }, { "epoch": 1.4390744945399503, "grad_norm": 0.21508027613162994, "learning_rate": 4.1785811417228513e-05, "loss": 0.4284, "step": 39930 }, { "epoch": 1.4392546942011748, "grad_norm": 0.19093069434165955, "learning_rate": 4.1783648773958706e-05, "loss": 0.4541, "step": 39935 }, { "epoch": 1.4394348938623995, "grad_norm": 0.18556737899780273, "learning_rate": 4.178148590201492e-05, "loss": 0.416, "step": 39940 }, { "epoch": 1.4396150935236243, "grad_norm": 0.17068710923194885, "learning_rate": 4.177932280142665e-05, "loss": 0.4588, "step": 39945 }, { "epoch": 1.4397952931848488, "grad_norm": 0.195718452334404, "learning_rate": 4.177715947222334e-05, "loss": 0.4256, "step": 39950 }, { "epoch": 1.4399754928460735, "grad_norm": 0.21339944005012512, "learning_rate": 4.177499591443449e-05, "loss": 0.4219, "step": 39955 }, { "epoch": 1.440155692507298, "grad_norm": 0.21166032552719116, "learning_rate": 4.1772832128089564e-05, "loss": 0.4104, "step": 39960 }, { "epoch": 1.4403358921685228, "grad_norm": 0.1573396623134613, "learning_rate": 4.177066811321805e-05, "loss": 0.4435, "step": 39965 }, { "epoch": 1.4405160918297475, "grad_norm": 0.2001795619726181, "learning_rate": 4.176850386984943e-05, "loss": 0.3743, "step": 39970 }, { "epoch": 1.440696291490972, "grad_norm": 0.19709444046020508, "learning_rate": 4.176633939801319e-05, "loss": 0.4321, "step": 39975 }, { 
"epoch": 1.4408764911521965, "grad_norm": 0.1635955274105072, "learning_rate": 4.176417469773882e-05, "loss": 0.4251, "step": 39980 }, { "epoch": 1.4410566908134212, "grad_norm": 0.15509602427482605, "learning_rate": 4.1762009769055835e-05, "loss": 0.4027, "step": 39985 }, { "epoch": 1.441236890474646, "grad_norm": 0.1558351367712021, "learning_rate": 4.1759844611993685e-05, "loss": 0.4319, "step": 39990 }, { "epoch": 1.4414170901358705, "grad_norm": 0.19999557733535767, "learning_rate": 4.175767922658191e-05, "loss": 0.3745, "step": 39995 }, { "epoch": 1.4415972897970952, "grad_norm": 0.16591276228427887, "learning_rate": 4.1755513612849993e-05, "loss": 0.4251, "step": 40000 }, { "epoch": 1.4415972897970952, "eval_loss": 0.4461284279823303, "eval_runtime": 3.5457, "eval_samples_per_second": 28.203, "eval_steps_per_second": 7.051, "step": 40000 }, { "epoch": 1.4417774894583197, "grad_norm": 0.1718062311410904, "learning_rate": 4.1753347770827454e-05, "loss": 0.4254, "step": 40005 }, { "epoch": 1.4419576891195445, "grad_norm": 0.19657757878303528, "learning_rate": 4.17511817005438e-05, "loss": 0.4246, "step": 40010 }, { "epoch": 1.4421378887807692, "grad_norm": 0.18001805245876312, "learning_rate": 4.1749015402028526e-05, "loss": 0.409, "step": 40015 }, { "epoch": 1.4423180884419937, "grad_norm": 0.158418208360672, "learning_rate": 4.174684887531116e-05, "loss": 0.3819, "step": 40020 }, { "epoch": 1.4424982881032182, "grad_norm": 0.23636119067668915, "learning_rate": 4.174468212042123e-05, "loss": 0.399, "step": 40025 }, { "epoch": 1.442678487764443, "grad_norm": 0.1882403939962387, "learning_rate": 4.1742515137388246e-05, "loss": 0.4072, "step": 40030 }, { "epoch": 1.4428586874256677, "grad_norm": 0.17548276484012604, "learning_rate": 4.174034792624173e-05, "loss": 0.4078, "step": 40035 }, { "epoch": 1.4430388870868922, "grad_norm": 0.15663708746433258, "learning_rate": 4.1738180487011214e-05, "loss": 0.4162, "step": 40040 }, { "epoch": 1.443219086748117, 
"grad_norm": 0.19377632439136505, "learning_rate": 4.173601281972623e-05, "loss": 0.4313, "step": 40045 }, { "epoch": 1.4433992864093415, "grad_norm": 0.22673170268535614, "learning_rate": 4.173384492441632e-05, "loss": 0.4314, "step": 40050 }, { "epoch": 1.4435794860705662, "grad_norm": 0.15245167911052704, "learning_rate": 4.173167680111101e-05, "loss": 0.3985, "step": 40055 }, { "epoch": 1.443759685731791, "grad_norm": 0.1588246375322342, "learning_rate": 4.1729508449839834e-05, "loss": 0.42, "step": 40060 }, { "epoch": 1.4439398853930154, "grad_norm": 0.20271897315979004, "learning_rate": 4.1727339870632345e-05, "loss": 0.4485, "step": 40065 }, { "epoch": 1.4441200850542402, "grad_norm": 0.1787983775138855, "learning_rate": 4.17251710635181e-05, "loss": 0.3992, "step": 40070 }, { "epoch": 1.4443002847154647, "grad_norm": 0.17043855786323547, "learning_rate": 4.1723002028526625e-05, "loss": 0.4319, "step": 40075 }, { "epoch": 1.4444804843766894, "grad_norm": 0.1581939458847046, "learning_rate": 4.172083276568749e-05, "loss": 0.4217, "step": 40080 }, { "epoch": 1.4446606840379141, "grad_norm": 0.19269004464149475, "learning_rate": 4.1718663275030246e-05, "loss": 0.3794, "step": 40085 }, { "epoch": 1.4448408836991387, "grad_norm": 0.1823306381702423, "learning_rate": 4.1716493556584455e-05, "loss": 0.3788, "step": 40090 }, { "epoch": 1.4450210833603632, "grad_norm": 0.22795365750789642, "learning_rate": 4.171432361037968e-05, "loss": 0.4046, "step": 40095 }, { "epoch": 1.445201283021588, "grad_norm": 0.1868722289800644, "learning_rate": 4.1712153436445464e-05, "loss": 0.4358, "step": 40100 }, { "epoch": 1.4453814826828126, "grad_norm": 0.2065182775259018, "learning_rate": 4.1709983034811406e-05, "loss": 0.4051, "step": 40105 }, { "epoch": 1.4455616823440371, "grad_norm": 0.14879098534584045, "learning_rate": 4.170781240550706e-05, "loss": 0.3798, "step": 40110 }, { "epoch": 1.4457418820052619, "grad_norm": 0.15872523188591003, "learning_rate": 
4.170564154856201e-05, "loss": 0.406, "step": 40115 }, { "epoch": 1.4459220816664864, "grad_norm": 0.15561115741729736, "learning_rate": 4.170347046400583e-05, "loss": 0.3787, "step": 40120 }, { "epoch": 1.4461022813277111, "grad_norm": 0.22237730026245117, "learning_rate": 4.170129915186809e-05, "loss": 0.4211, "step": 40125 }, { "epoch": 1.4462824809889359, "grad_norm": 0.19073696434497833, "learning_rate": 4.169912761217839e-05, "loss": 0.4023, "step": 40130 }, { "epoch": 1.4464626806501604, "grad_norm": 0.18226531147956848, "learning_rate": 4.16969558449663e-05, "loss": 0.4144, "step": 40135 }, { "epoch": 1.4466428803113849, "grad_norm": 0.19188474118709564, "learning_rate": 4.169478385026142e-05, "loss": 0.4288, "step": 40140 }, { "epoch": 1.4468230799726096, "grad_norm": 0.2194652110338211, "learning_rate": 4.169261162809335e-05, "loss": 0.4719, "step": 40145 }, { "epoch": 1.4470032796338343, "grad_norm": 0.192906454205513, "learning_rate": 4.169043917849168e-05, "loss": 0.4207, "step": 40150 }, { "epoch": 1.4471834792950589, "grad_norm": 0.17282317578792572, "learning_rate": 4.1688266501486004e-05, "loss": 0.388, "step": 40155 }, { "epoch": 1.4473636789562836, "grad_norm": 0.2374262660741806, "learning_rate": 4.168609359710592e-05, "loss": 0.4231, "step": 40160 }, { "epoch": 1.447543878617508, "grad_norm": 0.14937429130077362, "learning_rate": 4.1683920465381054e-05, "loss": 0.4245, "step": 40165 }, { "epoch": 1.4477240782787328, "grad_norm": 0.15380804240703583, "learning_rate": 4.1681747106340995e-05, "loss": 0.4201, "step": 40170 }, { "epoch": 1.4479042779399576, "grad_norm": 0.22721770405769348, "learning_rate": 4.167957352001537e-05, "loss": 0.3973, "step": 40175 }, { "epoch": 1.448084477601182, "grad_norm": 0.21499106287956238, "learning_rate": 4.167739970643377e-05, "loss": 0.4453, "step": 40180 }, { "epoch": 1.4482646772624068, "grad_norm": 0.19283875823020935, "learning_rate": 4.167522566562584e-05, "loss": 0.4062, "step": 40185 }, { "epoch": 
1.4484448769236313, "grad_norm": 0.1777193248271942, "learning_rate": 4.167305139762119e-05, "loss": 0.4055, "step": 40190 }, { "epoch": 1.448625076584856, "grad_norm": 0.2283918261528015, "learning_rate": 4.167087690244943e-05, "loss": 0.4338, "step": 40195 }, { "epoch": 1.4488052762460806, "grad_norm": 0.17080770432949066, "learning_rate": 4.1668702180140206e-05, "loss": 0.4003, "step": 40200 }, { "epoch": 1.4489854759073053, "grad_norm": 0.17411410808563232, "learning_rate": 4.166652723072314e-05, "loss": 0.4115, "step": 40205 }, { "epoch": 1.4491656755685298, "grad_norm": 0.18166224658489227, "learning_rate": 4.166435205422787e-05, "loss": 0.4184, "step": 40210 }, { "epoch": 1.4493458752297546, "grad_norm": 0.17343877255916595, "learning_rate": 4.166217665068403e-05, "loss": 0.4373, "step": 40215 }, { "epoch": 1.4495260748909793, "grad_norm": 0.1936635822057724, "learning_rate": 4.166000102012126e-05, "loss": 0.387, "step": 40220 }, { "epoch": 1.4497062745522038, "grad_norm": 0.1682083159685135, "learning_rate": 4.16578251625692e-05, "loss": 0.422, "step": 40225 }, { "epoch": 1.4498864742134285, "grad_norm": 0.20389603078365326, "learning_rate": 4.16556490780575e-05, "loss": 0.4434, "step": 40230 }, { "epoch": 1.450066673874653, "grad_norm": 0.16445006430149078, "learning_rate": 4.16534727666158e-05, "loss": 0.4393, "step": 40235 }, { "epoch": 1.4502468735358778, "grad_norm": 0.1689777970314026, "learning_rate": 4.165129622827376e-05, "loss": 0.3975, "step": 40240 }, { "epoch": 1.4504270731971025, "grad_norm": 0.17561562359333038, "learning_rate": 4.164911946306104e-05, "loss": 0.4218, "step": 40245 }, { "epoch": 1.450607272858327, "grad_norm": 0.18199895322322845, "learning_rate": 4.164694247100728e-05, "loss": 0.4344, "step": 40250 }, { "epoch": 1.4507874725195515, "grad_norm": 0.17951489984989166, "learning_rate": 4.164476525214216e-05, "loss": 0.4282, "step": 40255 }, { "epoch": 1.4509676721807763, "grad_norm": 0.1799135059118271, "learning_rate": 
4.1642587806495324e-05, "loss": 0.4252, "step": 40260 }, { "epoch": 1.451147871842001, "grad_norm": 0.172772616147995, "learning_rate": 4.1640410134096465e-05, "loss": 0.413, "step": 40265 }, { "epoch": 1.4513280715032255, "grad_norm": 0.15084902942180634, "learning_rate": 4.1638232234975225e-05, "loss": 0.4336, "step": 40270 }, { "epoch": 1.4515082711644502, "grad_norm": 0.17858850955963135, "learning_rate": 4.163605410916131e-05, "loss": 0.443, "step": 40275 }, { "epoch": 1.4516884708256748, "grad_norm": 0.1724681556224823, "learning_rate": 4.163387575668437e-05, "loss": 0.3995, "step": 40280 }, { "epoch": 1.4518686704868995, "grad_norm": 0.15848155319690704, "learning_rate": 4.163169717757409e-05, "loss": 0.3968, "step": 40285 }, { "epoch": 1.4520488701481242, "grad_norm": 0.17804880440235138, "learning_rate": 4.162951837186016e-05, "loss": 0.4272, "step": 40290 }, { "epoch": 1.4522290698093487, "grad_norm": 0.19191712141036987, "learning_rate": 4.1627339339572256e-05, "loss": 0.4096, "step": 40295 }, { "epoch": 1.4524092694705735, "grad_norm": 0.19533273577690125, "learning_rate": 4.1625160080740075e-05, "loss": 0.4129, "step": 40300 }, { "epoch": 1.452589469131798, "grad_norm": 0.28732794523239136, "learning_rate": 4.162298059539331e-05, "loss": 0.4749, "step": 40305 }, { "epoch": 1.4527696687930227, "grad_norm": 0.16327139735221863, "learning_rate": 4.1620800883561656e-05, "loss": 0.4015, "step": 40310 }, { "epoch": 1.4529498684542472, "grad_norm": 0.1888490468263626, "learning_rate": 4.16186209452748e-05, "loss": 0.4243, "step": 40315 }, { "epoch": 1.453130068115472, "grad_norm": 0.20011357963085175, "learning_rate": 4.161644078056246e-05, "loss": 0.4201, "step": 40320 }, { "epoch": 1.4533102677766965, "grad_norm": 0.1774764508008957, "learning_rate": 4.161426038945432e-05, "loss": 0.3993, "step": 40325 }, { "epoch": 1.4534904674379212, "grad_norm": 0.18601520359516144, "learning_rate": 4.1612079771980106e-05, "loss": 0.382, "step": 40330 }, { "epoch": 
1.453670667099146, "grad_norm": 0.18623565137386322, "learning_rate": 4.160989892816952e-05, "loss": 0.4032, "step": 40335 }, { "epoch": 1.4538508667603705, "grad_norm": 0.16086627542972565, "learning_rate": 4.160771785805228e-05, "loss": 0.4054, "step": 40340 }, { "epoch": 1.4540310664215952, "grad_norm": 0.1630929857492447, "learning_rate": 4.16055365616581e-05, "loss": 0.4016, "step": 40345 }, { "epoch": 1.4542112660828197, "grad_norm": 0.16197270154953003, "learning_rate": 4.160335503901669e-05, "loss": 0.4461, "step": 40350 }, { "epoch": 1.4543914657440444, "grad_norm": 0.160113126039505, "learning_rate": 4.1601173290157794e-05, "loss": 0.429, "step": 40355 }, { "epoch": 1.4545716654052692, "grad_norm": 0.20948779582977295, "learning_rate": 4.159899131511111e-05, "loss": 0.4058, "step": 40360 }, { "epoch": 1.4547518650664937, "grad_norm": 0.19424784183502197, "learning_rate": 4.15968091139064e-05, "loss": 0.4113, "step": 40365 }, { "epoch": 1.4549320647277182, "grad_norm": 0.19249606132507324, "learning_rate": 4.159462668657337e-05, "loss": 0.4407, "step": 40370 }, { "epoch": 1.455112264388943, "grad_norm": 0.13863718509674072, "learning_rate": 4.159244403314176e-05, "loss": 0.4143, "step": 40375 }, { "epoch": 1.4552924640501677, "grad_norm": 0.16577591001987457, "learning_rate": 4.159026115364132e-05, "loss": 0.4468, "step": 40380 }, { "epoch": 1.4554726637113922, "grad_norm": 0.16115480661392212, "learning_rate": 4.1588078048101784e-05, "loss": 0.4213, "step": 40385 }, { "epoch": 1.455652863372617, "grad_norm": 0.17219962179660797, "learning_rate": 4.15858947165529e-05, "loss": 0.4411, "step": 40390 }, { "epoch": 1.4558330630338414, "grad_norm": 0.17890863120555878, "learning_rate": 4.15837111590244e-05, "loss": 0.4052, "step": 40395 }, { "epoch": 1.4560132626950661, "grad_norm": 0.19050833582878113, "learning_rate": 4.158152737554606e-05, "loss": 0.4257, "step": 40400 }, { "epoch": 1.4561934623562909, "grad_norm": 0.1742664873600006, "learning_rate": 
4.1579343366147604e-05, "loss": 0.4023, "step": 40405 }, { "epoch": 1.4563736620175154, "grad_norm": 0.19294539093971252, "learning_rate": 4.157715913085881e-05, "loss": 0.4072, "step": 40410 }, { "epoch": 1.4565538616787401, "grad_norm": 0.1636057049036026, "learning_rate": 4.1574974669709435e-05, "loss": 0.4023, "step": 40415 }, { "epoch": 1.4567340613399646, "grad_norm": 0.15918225049972534, "learning_rate": 4.1572789982729244e-05, "loss": 0.3778, "step": 40420 }, { "epoch": 1.4569142610011894, "grad_norm": 0.17120924592018127, "learning_rate": 4.1570605069947986e-05, "loss": 0.4285, "step": 40425 }, { "epoch": 1.4570944606624139, "grad_norm": 0.16076339781284332, "learning_rate": 4.1568419931395456e-05, "loss": 0.4201, "step": 40430 }, { "epoch": 1.4572746603236386, "grad_norm": 0.14771609008312225, "learning_rate": 4.15662345671014e-05, "loss": 0.3823, "step": 40435 }, { "epoch": 1.4574548599848631, "grad_norm": 0.19097311794757843, "learning_rate": 4.156404897709562e-05, "loss": 0.4274, "step": 40440 }, { "epoch": 1.4576350596460879, "grad_norm": 0.16327013075351715, "learning_rate": 4.1561863161407866e-05, "loss": 0.4176, "step": 40445 }, { "epoch": 1.4578152593073126, "grad_norm": 0.15996558964252472, "learning_rate": 4.1559677120067935e-05, "loss": 0.4001, "step": 40450 }, { "epoch": 1.457995458968537, "grad_norm": 0.1909141093492508, "learning_rate": 4.1557490853105614e-05, "loss": 0.3835, "step": 40455 }, { "epoch": 1.4581756586297618, "grad_norm": 0.21793286502361298, "learning_rate": 4.155530436055068e-05, "loss": 0.3851, "step": 40460 }, { "epoch": 1.4583558582909864, "grad_norm": 0.14985129237174988, "learning_rate": 4.155311764243294e-05, "loss": 0.385, "step": 40465 }, { "epoch": 1.458536057952211, "grad_norm": 0.16198906302452087, "learning_rate": 4.155093069878216e-05, "loss": 0.3893, "step": 40470 }, { "epoch": 1.4587162576134358, "grad_norm": 0.20363600552082062, "learning_rate": 4.154874352962816e-05, "loss": 0.4146, "step": 40475 }, { 
"epoch": 1.4588964572746603, "grad_norm": 0.1638229936361313, "learning_rate": 4.154655613500075e-05, "loss": 0.4149, "step": 40480 }, { "epoch": 1.4590766569358848, "grad_norm": 0.18865369260311127, "learning_rate": 4.15443685149297e-05, "loss": 0.3961, "step": 40485 }, { "epoch": 1.4592568565971096, "grad_norm": 0.17428448796272278, "learning_rate": 4.154218066944483e-05, "loss": 0.4314, "step": 40490 }, { "epoch": 1.4594370562583343, "grad_norm": 0.18061710894107819, "learning_rate": 4.1539992598575954e-05, "loss": 0.4091, "step": 40495 }, { "epoch": 1.4596172559195588, "grad_norm": 0.14302043616771698, "learning_rate": 4.153780430235289e-05, "loss": 0.3872, "step": 40500 }, { "epoch": 1.4596172559195588, "eval_loss": 0.44571349024772644, "eval_runtime": 3.5421, "eval_samples_per_second": 28.232, "eval_steps_per_second": 7.058, "step": 40500 }, { "epoch": 1.4597974555807836, "grad_norm": 0.1947942078113556, "learning_rate": 4.153561578080543e-05, "loss": 0.413, "step": 40505 }, { "epoch": 1.459977655242008, "grad_norm": 0.1369856297969818, "learning_rate": 4.153342703396341e-05, "loss": 0.3949, "step": 40510 }, { "epoch": 1.4601578549032328, "grad_norm": 0.17609436810016632, "learning_rate": 4.153123806185666e-05, "loss": 0.4322, "step": 40515 }, { "epoch": 1.4603380545644575, "grad_norm": 0.2042810618877411, "learning_rate": 4.152904886451498e-05, "loss": 0.4339, "step": 40520 }, { "epoch": 1.460518254225682, "grad_norm": 0.22225399315357208, "learning_rate": 4.152685944196821e-05, "loss": 0.4344, "step": 40525 }, { "epoch": 1.4606984538869066, "grad_norm": 0.47528132796287537, "learning_rate": 4.152466979424619e-05, "loss": 0.4446, "step": 40530 }, { "epoch": 1.4608786535481313, "grad_norm": 0.16672192513942719, "learning_rate": 4.1522479921378733e-05, "loss": 0.4329, "step": 40535 }, { "epoch": 1.461058853209356, "grad_norm": 0.19320343434810638, "learning_rate": 4.1520289823395686e-05, "loss": 0.4102, "step": 40540 }, { "epoch": 1.4612390528705805, 
"grad_norm": 0.17399749159812927, "learning_rate": 4.151809950032689e-05, "loss": 0.4194, "step": 40545 }, { "epoch": 1.4614192525318053, "grad_norm": 0.1759127378463745, "learning_rate": 4.1515908952202186e-05, "loss": 0.3828, "step": 40550 }, { "epoch": 1.4615994521930298, "grad_norm": 0.17152760922908783, "learning_rate": 4.151371817905142e-05, "loss": 0.4291, "step": 40555 }, { "epoch": 1.4617796518542545, "grad_norm": 0.18539777398109436, "learning_rate": 4.151152718090445e-05, "loss": 0.376, "step": 40560 }, { "epoch": 1.4619598515154792, "grad_norm": 0.1786716729402542, "learning_rate": 4.1509335957791106e-05, "loss": 0.3922, "step": 40565 }, { "epoch": 1.4621400511767038, "grad_norm": 0.15850621461868286, "learning_rate": 4.150714450974126e-05, "loss": 0.4005, "step": 40570 }, { "epoch": 1.4623202508379285, "grad_norm": 0.19711154699325562, "learning_rate": 4.150495283678477e-05, "loss": 0.4157, "step": 40575 }, { "epoch": 1.462500450499153, "grad_norm": 0.2046595960855484, "learning_rate": 4.150276093895149e-05, "loss": 0.4197, "step": 40580 }, { "epoch": 1.4626806501603777, "grad_norm": 0.23600243031978607, "learning_rate": 4.1500568816271285e-05, "loss": 0.4453, "step": 40585 }, { "epoch": 1.4628608498216025, "grad_norm": 0.18028078973293304, "learning_rate": 4.149837646877402e-05, "loss": 0.449, "step": 40590 }, { "epoch": 1.463041049482827, "grad_norm": 0.21281985938549042, "learning_rate": 4.149618389648958e-05, "loss": 0.434, "step": 40595 }, { "epoch": 1.4632212491440515, "grad_norm": 0.18634112179279327, "learning_rate": 4.149399109944783e-05, "loss": 0.3694, "step": 40600 }, { "epoch": 1.4634014488052762, "grad_norm": 0.2347174882888794, "learning_rate": 4.1491798077678636e-05, "loss": 0.4023, "step": 40605 }, { "epoch": 1.463581648466501, "grad_norm": 0.2317020446062088, "learning_rate": 4.148960483121189e-05, "loss": 0.4539, "step": 40610 }, { "epoch": 1.4637618481277255, "grad_norm": 0.16629308462142944, "learning_rate": 4.148741136007747e-05, 
"loss": 0.4403, "step": 40615 }, { "epoch": 1.4639420477889502, "grad_norm": 0.19857972860336304, "learning_rate": 4.148521766430527e-05, "loss": 0.3936, "step": 40620 }, { "epoch": 1.4641222474501747, "grad_norm": 0.18918615579605103, "learning_rate": 4.148302374392516e-05, "loss": 0.4059, "step": 40625 }, { "epoch": 1.4643024471113995, "grad_norm": 0.1877117156982422, "learning_rate": 4.148082959896704e-05, "loss": 0.3964, "step": 40630 }, { "epoch": 1.4644826467726242, "grad_norm": 0.1804433912038803, "learning_rate": 4.1478635229460814e-05, "loss": 0.4587, "step": 40635 }, { "epoch": 1.4646628464338487, "grad_norm": 0.16584636270999908, "learning_rate": 4.1476440635436376e-05, "loss": 0.3789, "step": 40640 }, { "epoch": 1.4648430460950732, "grad_norm": 0.16475588083267212, "learning_rate": 4.1474245816923616e-05, "loss": 0.3847, "step": 40645 }, { "epoch": 1.465023245756298, "grad_norm": 0.18502764403820038, "learning_rate": 4.147205077395245e-05, "loss": 0.4038, "step": 40650 }, { "epoch": 1.4652034454175227, "grad_norm": 0.14497093856334686, "learning_rate": 4.146985550655279e-05, "loss": 0.3932, "step": 40655 }, { "epoch": 1.4653836450787472, "grad_norm": 0.21168319880962372, "learning_rate": 4.146766001475453e-05, "loss": 0.411, "step": 40660 }, { "epoch": 1.465563844739972, "grad_norm": 0.17997999489307404, "learning_rate": 4.146546429858759e-05, "loss": 0.3992, "step": 40665 }, { "epoch": 1.4657440444011964, "grad_norm": 0.1749783307313919, "learning_rate": 4.146326835808188e-05, "loss": 0.463, "step": 40670 }, { "epoch": 1.4659242440624212, "grad_norm": 0.19459916651248932, "learning_rate": 4.1461072193267344e-05, "loss": 0.4353, "step": 40675 }, { "epoch": 1.466104443723646, "grad_norm": 0.18047486245632172, "learning_rate": 4.145887580417387e-05, "loss": 0.414, "step": 40680 }, { "epoch": 1.4662846433848704, "grad_norm": 0.21394000947475433, "learning_rate": 4.145667919083141e-05, "loss": 0.3916, "step": 40685 }, { "epoch": 1.4664648430460951, 
"grad_norm": 0.2331513613462448, "learning_rate": 4.1454482353269875e-05, "loss": 0.4231, "step": 40690 }, { "epoch": 1.4666450427073197, "grad_norm": 0.18889982998371124, "learning_rate": 4.145228529151921e-05, "loss": 0.4509, "step": 40695 }, { "epoch": 1.4668252423685444, "grad_norm": 0.1711355745792389, "learning_rate": 4.1450088005609335e-05, "loss": 0.4098, "step": 40700 }, { "epoch": 1.467005442029769, "grad_norm": 0.22893080115318298, "learning_rate": 4.1447890495570205e-05, "loss": 0.406, "step": 40705 }, { "epoch": 1.4671856416909936, "grad_norm": 0.1730494350194931, "learning_rate": 4.1445692761431743e-05, "loss": 0.4144, "step": 40710 }, { "epoch": 1.4673658413522181, "grad_norm": 0.22228698432445526, "learning_rate": 4.14434948032239e-05, "loss": 0.4327, "step": 40715 }, { "epoch": 1.4675460410134429, "grad_norm": 0.18846584856510162, "learning_rate": 4.144129662097663e-05, "loss": 0.4167, "step": 40720 }, { "epoch": 1.4677262406746676, "grad_norm": 0.18671122193336487, "learning_rate": 4.143909821471988e-05, "loss": 0.4348, "step": 40725 }, { "epoch": 1.4679064403358921, "grad_norm": 0.15065552294254303, "learning_rate": 4.143689958448359e-05, "loss": 0.4215, "step": 40730 }, { "epoch": 1.4680866399971169, "grad_norm": 0.19978941977024078, "learning_rate": 4.143470073029774e-05, "loss": 0.4065, "step": 40735 }, { "epoch": 1.4682668396583414, "grad_norm": 0.1559489220380783, "learning_rate": 4.143250165219226e-05, "loss": 0.423, "step": 40740 }, { "epoch": 1.468447039319566, "grad_norm": 0.1872124969959259, "learning_rate": 4.143030235019713e-05, "loss": 0.4196, "step": 40745 }, { "epoch": 1.4686272389807908, "grad_norm": 0.20608900487422943, "learning_rate": 4.142810282434231e-05, "loss": 0.419, "step": 40750 }, { "epoch": 1.4688074386420154, "grad_norm": 0.16788601875305176, "learning_rate": 4.1425903074657776e-05, "loss": 0.4231, "step": 40755 }, { "epoch": 1.4689876383032399, "grad_norm": 0.305698037147522, "learning_rate": 4.142370310117348e-05, 
"loss": 0.3935, "step": 40760 }, { "epoch": 1.4691678379644646, "grad_norm": 0.13668397068977356, "learning_rate": 4.142150290391943e-05, "loss": 0.387, "step": 40765 }, { "epoch": 1.4693480376256893, "grad_norm": 0.15131601691246033, "learning_rate": 4.141930248292557e-05, "loss": 0.4168, "step": 40770 }, { "epoch": 1.4695282372869138, "grad_norm": 0.19707903265953064, "learning_rate": 4.141710183822189e-05, "loss": 0.4628, "step": 40775 }, { "epoch": 1.4697084369481386, "grad_norm": 0.19335195422172546, "learning_rate": 4.1414900969838375e-05, "loss": 0.4157, "step": 40780 }, { "epoch": 1.469888636609363, "grad_norm": 0.20567668974399567, "learning_rate": 4.141269987780502e-05, "loss": 0.4535, "step": 40785 }, { "epoch": 1.4700688362705878, "grad_norm": 0.17595280706882477, "learning_rate": 4.14104985621518e-05, "loss": 0.4263, "step": 40790 }, { "epoch": 1.4702490359318126, "grad_norm": 0.18598760664463043, "learning_rate": 4.140829702290872e-05, "loss": 0.4139, "step": 40795 }, { "epoch": 1.470429235593037, "grad_norm": 0.19767579436302185, "learning_rate": 4.140609526010576e-05, "loss": 0.3817, "step": 40800 }, { "epoch": 1.4706094352542618, "grad_norm": 0.18627217411994934, "learning_rate": 4.140389327377294e-05, "loss": 0.3926, "step": 40805 }, { "epoch": 1.4707896349154863, "grad_norm": 0.20653992891311646, "learning_rate": 4.140169106394024e-05, "loss": 0.4532, "step": 40810 }, { "epoch": 1.470969834576711, "grad_norm": 0.16096976399421692, "learning_rate": 4.139948863063768e-05, "loss": 0.3833, "step": 40815 }, { "epoch": 1.4711500342379356, "grad_norm": 0.14477817714214325, "learning_rate": 4.1397285973895264e-05, "loss": 0.4148, "step": 40820 }, { "epoch": 1.4713302338991603, "grad_norm": 0.17661945521831512, "learning_rate": 4.1395083093743006e-05, "loss": 0.4228, "step": 40825 }, { "epoch": 1.4715104335603848, "grad_norm": 0.16946406662464142, "learning_rate": 4.139287999021091e-05, "loss": 0.4413, "step": 40830 }, { "epoch": 1.4716906332216095, 
"grad_norm": 0.2282014936208725, "learning_rate": 4.1390676663328995e-05, "loss": 0.4072, "step": 40835 }, { "epoch": 1.4718708328828343, "grad_norm": 0.1720079779624939, "learning_rate": 4.138847311312728e-05, "loss": 0.4107, "step": 40840 }, { "epoch": 1.4720510325440588, "grad_norm": 0.1870976686477661, "learning_rate": 4.13862693396358e-05, "loss": 0.4437, "step": 40845 }, { "epoch": 1.4722312322052835, "grad_norm": 0.14838680624961853, "learning_rate": 4.138406534288457e-05, "loss": 0.4164, "step": 40850 }, { "epoch": 1.472411431866508, "grad_norm": 0.146432027220726, "learning_rate": 4.138186112290362e-05, "loss": 0.3791, "step": 40855 }, { "epoch": 1.4725916315277328, "grad_norm": 0.1663684993982315, "learning_rate": 4.137965667972298e-05, "loss": 0.3969, "step": 40860 }, { "epoch": 1.4727718311889575, "grad_norm": 0.24084417521953583, "learning_rate": 4.1377452013372695e-05, "loss": 0.4105, "step": 40865 }, { "epoch": 1.472952030850182, "grad_norm": 0.21911956369876862, "learning_rate": 4.13752471238828e-05, "loss": 0.45, "step": 40870 }, { "epoch": 1.4731322305114065, "grad_norm": 0.20108048617839813, "learning_rate": 4.137304201128334e-05, "loss": 0.432, "step": 40875 }, { "epoch": 1.4733124301726312, "grad_norm": 0.16961334645748138, "learning_rate": 4.1370836675604326e-05, "loss": 0.401, "step": 40880 }, { "epoch": 1.473492629833856, "grad_norm": 0.16937093436717987, "learning_rate": 4.1368631116875856e-05, "loss": 0.4159, "step": 40885 }, { "epoch": 1.4736728294950805, "grad_norm": 0.19566501677036285, "learning_rate": 4.136642533512795e-05, "loss": 0.4196, "step": 40890 }, { "epoch": 1.4738530291563052, "grad_norm": 0.17469613254070282, "learning_rate": 4.136421933039066e-05, "loss": 0.3932, "step": 40895 }, { "epoch": 1.4740332288175297, "grad_norm": 0.18725554645061493, "learning_rate": 4.136201310269406e-05, "loss": 0.4323, "step": 40900 }, { "epoch": 1.4742134284787545, "grad_norm": 0.21251484751701355, "learning_rate": 4.135980665206819e-05, 
"loss": 0.4335, "step": 40905 }, { "epoch": 1.4743936281399792, "grad_norm": 0.21791920065879822, "learning_rate": 4.135759997854313e-05, "loss": 0.4479, "step": 40910 }, { "epoch": 1.4745738278012037, "grad_norm": 0.19280044734477997, "learning_rate": 4.1355393082148936e-05, "loss": 0.387, "step": 40915 }, { "epoch": 1.4747540274624285, "grad_norm": 0.14178399741649628, "learning_rate": 4.1353185962915675e-05, "loss": 0.4095, "step": 40920 }, { "epoch": 1.474934227123653, "grad_norm": 0.24102537333965302, "learning_rate": 4.135097862087342e-05, "loss": 0.4249, "step": 40925 }, { "epoch": 1.4751144267848777, "grad_norm": 0.2044147253036499, "learning_rate": 4.134877105605225e-05, "loss": 0.3982, "step": 40930 }, { "epoch": 1.4752946264461022, "grad_norm": 0.14155633747577667, "learning_rate": 4.1346563268482245e-05, "loss": 0.4237, "step": 40935 }, { "epoch": 1.475474826107327, "grad_norm": 0.20636247098445892, "learning_rate": 4.134435525819347e-05, "loss": 0.414, "step": 40940 }, { "epoch": 1.4756550257685515, "grad_norm": 0.16069145500659943, "learning_rate": 4.1342147025216015e-05, "loss": 0.4105, "step": 40945 }, { "epoch": 1.4758352254297762, "grad_norm": 0.2055576741695404, "learning_rate": 4.1339938569579985e-05, "loss": 0.4265, "step": 40950 }, { "epoch": 1.476015425091001, "grad_norm": 0.1672385036945343, "learning_rate": 4.1337729891315445e-05, "loss": 0.4226, "step": 40955 }, { "epoch": 1.4761956247522254, "grad_norm": 0.1980341076850891, "learning_rate": 4.1335520990452504e-05, "loss": 0.4252, "step": 40960 }, { "epoch": 1.4763758244134502, "grad_norm": 0.19401240348815918, "learning_rate": 4.1333311867021254e-05, "loss": 0.4046, "step": 40965 }, { "epoch": 1.4765560240746747, "grad_norm": 0.2943877875804901, "learning_rate": 4.133110252105178e-05, "loss": 0.4149, "step": 40970 }, { "epoch": 1.4767362237358994, "grad_norm": 0.15814274549484253, "learning_rate": 4.13288929525742e-05, "loss": 0.4066, "step": 40975 }, { "epoch": 1.4769164233971241, 
"grad_norm": 0.1717575490474701, "learning_rate": 4.132668316161863e-05, "loss": 0.4235, "step": 40980 }, { "epoch": 1.4770966230583487, "grad_norm": 0.1958007961511612, "learning_rate": 4.1324473148215146e-05, "loss": 0.4493, "step": 40985 }, { "epoch": 1.4772768227195732, "grad_norm": 0.17059576511383057, "learning_rate": 4.1322262912393884e-05, "loss": 0.3968, "step": 40990 }, { "epoch": 1.477457022380798, "grad_norm": 0.18559856712818146, "learning_rate": 4.132005245418495e-05, "loss": 0.4099, "step": 40995 }, { "epoch": 1.4776372220420226, "grad_norm": 0.206581249833107, "learning_rate": 4.131784177361845e-05, "loss": 0.4063, "step": 41000 }, { "epoch": 1.4776372220420226, "eval_loss": 0.44472381472587585, "eval_runtime": 3.5272, "eval_samples_per_second": 28.351, "eval_steps_per_second": 7.088, "step": 41000 }, { "epoch": 1.4778174217032471, "grad_norm": 0.2029978483915329, "learning_rate": 4.131563087072453e-05, "loss": 0.4428, "step": 41005 }, { "epoch": 1.4779976213644719, "grad_norm": 0.1657671481370926, "learning_rate": 4.131341974553329e-05, "loss": 0.4282, "step": 41010 }, { "epoch": 1.4781778210256964, "grad_norm": 0.22293435037136078, "learning_rate": 4.131120839807487e-05, "loss": 0.4178, "step": 41015 }, { "epoch": 1.4783580206869211, "grad_norm": 0.18376168608665466, "learning_rate": 4.130899682837939e-05, "loss": 0.4239, "step": 41020 }, { "epoch": 1.4785382203481459, "grad_norm": 0.20025895535945892, "learning_rate": 4.130678503647698e-05, "loss": 0.4262, "step": 41025 }, { "epoch": 1.4787184200093704, "grad_norm": 0.16391725838184357, "learning_rate": 4.1304573022397784e-05, "loss": 0.3756, "step": 41030 }, { "epoch": 1.4788986196705949, "grad_norm": 0.16140343248844147, "learning_rate": 4.1302360786171946e-05, "loss": 0.4036, "step": 41035 }, { "epoch": 1.4790788193318196, "grad_norm": 0.23130172491073608, "learning_rate": 4.1300148327829593e-05, "loss": 0.4511, "step": 41040 }, { "epoch": 1.4792590189930444, "grad_norm": 0.17555512487888336, 
"learning_rate": 4.1297935647400874e-05, "loss": 0.4236, "step": 41045 }, { "epoch": 1.4794392186542689, "grad_norm": 0.18214865028858185, "learning_rate": 4.1295722744915934e-05, "loss": 0.4806, "step": 41050 }, { "epoch": 1.4796194183154936, "grad_norm": 0.17145216464996338, "learning_rate": 4.129350962040494e-05, "loss": 0.4215, "step": 41055 }, { "epoch": 1.479799617976718, "grad_norm": 0.1968265026807785, "learning_rate": 4.1291296273898015e-05, "loss": 0.4054, "step": 41060 }, { "epoch": 1.4799798176379428, "grad_norm": 0.14597874879837036, "learning_rate": 4.1289082705425344e-05, "loss": 0.4229, "step": 41065 }, { "epoch": 1.4801600172991676, "grad_norm": 0.1973959356546402, "learning_rate": 4.1286868915017064e-05, "loss": 0.3867, "step": 41070 }, { "epoch": 1.480340216960392, "grad_norm": 0.18178540468215942, "learning_rate": 4.1284654902703356e-05, "loss": 0.4134, "step": 41075 }, { "epoch": 1.4805204166216168, "grad_norm": 0.2181033045053482, "learning_rate": 4.1282440668514376e-05, "loss": 0.4203, "step": 41080 }, { "epoch": 1.4807006162828413, "grad_norm": 0.18515872955322266, "learning_rate": 4.128022621248029e-05, "loss": 0.4207, "step": 41085 }, { "epoch": 1.480880815944066, "grad_norm": 0.16878053545951843, "learning_rate": 4.1278011534631276e-05, "loss": 0.4058, "step": 41090 }, { "epoch": 1.4810610156052908, "grad_norm": 0.1830851435661316, "learning_rate": 4.127579663499752e-05, "loss": 0.3987, "step": 41095 }, { "epoch": 1.4812412152665153, "grad_norm": 0.1524893045425415, "learning_rate": 4.1273581513609173e-05, "loss": 0.4154, "step": 41100 }, { "epoch": 1.4814214149277398, "grad_norm": 0.18265849351882935, "learning_rate": 4.127136617049643e-05, "loss": 0.4195, "step": 41105 }, { "epoch": 1.4816016145889646, "grad_norm": 0.18311630189418793, "learning_rate": 4.126915060568947e-05, "loss": 0.4548, "step": 41110 }, { "epoch": 1.4817818142501893, "grad_norm": 0.1863049566745758, "learning_rate": 4.126693481921848e-05, "loss": 0.4483, "step": 
41115 }, { "epoch": 1.4819620139114138, "grad_norm": 0.15636593103408813, "learning_rate": 4.126471881111367e-05, "loss": 0.4159, "step": 41120 }, { "epoch": 1.4821422135726385, "grad_norm": 0.1574103981256485, "learning_rate": 4.1262502581405196e-05, "loss": 0.4247, "step": 41125 }, { "epoch": 1.482322413233863, "grad_norm": 0.1982761174440384, "learning_rate": 4.1260286130123285e-05, "loss": 0.4114, "step": 41130 }, { "epoch": 1.4825026128950878, "grad_norm": 0.17163503170013428, "learning_rate": 4.125806945729812e-05, "loss": 0.4174, "step": 41135 }, { "epoch": 1.4826828125563125, "grad_norm": 0.1855458766222, "learning_rate": 4.1255852562959904e-05, "loss": 0.4327, "step": 41140 }, { "epoch": 1.482863012217537, "grad_norm": 0.21407297253608704, "learning_rate": 4.125363544713884e-05, "loss": 0.4342, "step": 41145 }, { "epoch": 1.4830432118787615, "grad_norm": 0.20228859782218933, "learning_rate": 4.1251418109865146e-05, "loss": 0.4116, "step": 41150 }, { "epoch": 1.4832234115399863, "grad_norm": 0.19245915114879608, "learning_rate": 4.124920055116903e-05, "loss": 0.3937, "step": 41155 }, { "epoch": 1.483403611201211, "grad_norm": 0.1870017945766449, "learning_rate": 4.12469827710807e-05, "loss": 0.426, "step": 41160 }, { "epoch": 1.4835838108624355, "grad_norm": 0.1986088752746582, "learning_rate": 4.1244764769630375e-05, "loss": 0.4125, "step": 41165 }, { "epoch": 1.4837640105236602, "grad_norm": 0.18899187445640564, "learning_rate": 4.124254654684827e-05, "loss": 0.4205, "step": 41170 }, { "epoch": 1.4839442101848848, "grad_norm": 0.1863652914762497, "learning_rate": 4.1240328102764614e-05, "loss": 0.4055, "step": 41175 }, { "epoch": 1.4841244098461095, "grad_norm": 0.2102532535791397, "learning_rate": 4.1238109437409635e-05, "loss": 0.4175, "step": 41180 }, { "epoch": 1.4843046095073342, "grad_norm": 0.17602580785751343, "learning_rate": 4.123589055081356e-05, "loss": 0.3868, "step": 41185 }, { "epoch": 1.4844848091685587, "grad_norm": 0.1657838225364685, 
"learning_rate": 4.123367144300662e-05, "loss": 0.3956, "step": 41190 }, { "epoch": 1.4846650088297835, "grad_norm": 0.15461941063404083, "learning_rate": 4.123145211401904e-05, "loss": 0.432, "step": 41195 }, { "epoch": 1.484845208491008, "grad_norm": 0.17926329374313354, "learning_rate": 4.1229232563881084e-05, "loss": 0.4127, "step": 41200 }, { "epoch": 1.4850254081522327, "grad_norm": 0.15599684417247772, "learning_rate": 4.122701279262296e-05, "loss": 0.4382, "step": 41205 }, { "epoch": 1.4852056078134572, "grad_norm": 0.17323412001132965, "learning_rate": 4.1224792800274936e-05, "loss": 0.39, "step": 41210 }, { "epoch": 1.485385807474682, "grad_norm": 0.19088657200336456, "learning_rate": 4.122257258686725e-05, "loss": 0.4044, "step": 41215 }, { "epoch": 1.4855660071359065, "grad_norm": 0.1666693538427353, "learning_rate": 4.122035215243015e-05, "loss": 0.422, "step": 41220 }, { "epoch": 1.4857462067971312, "grad_norm": 0.23202933371067047, "learning_rate": 4.12181314969939e-05, "loss": 0.4344, "step": 41225 }, { "epoch": 1.485926406458356, "grad_norm": 0.19096602499485016, "learning_rate": 4.1215910620588745e-05, "loss": 0.4192, "step": 41230 }, { "epoch": 1.4861066061195805, "grad_norm": 0.21714234352111816, "learning_rate": 4.1213689523244945e-05, "loss": 0.397, "step": 41235 }, { "epoch": 1.4862868057808052, "grad_norm": 0.22744521498680115, "learning_rate": 4.121146820499277e-05, "loss": 0.439, "step": 41240 }, { "epoch": 1.4864670054420297, "grad_norm": 0.1769517958164215, "learning_rate": 4.120924666586248e-05, "loss": 0.4124, "step": 41245 }, { "epoch": 1.4866472051032544, "grad_norm": 0.16635587811470032, "learning_rate": 4.1207024905884335e-05, "loss": 0.4265, "step": 41250 }, { "epoch": 1.4868274047644792, "grad_norm": 0.1759326308965683, "learning_rate": 4.120480292508861e-05, "loss": 0.4143, "step": 41255 }, { "epoch": 1.4870076044257037, "grad_norm": 0.15628643333911896, "learning_rate": 4.120258072350559e-05, "loss": 0.4084, "step": 41260 }, { 
"epoch": 1.4871878040869282, "grad_norm": 0.1729431450366974, "learning_rate": 4.1200358301165544e-05, "loss": 0.4391, "step": 41265 }, { "epoch": 1.487368003748153, "grad_norm": 0.20765365660190582, "learning_rate": 4.1198135658098755e-05, "loss": 0.4579, "step": 41270 }, { "epoch": 1.4875482034093777, "grad_norm": 0.18058431148529053, "learning_rate": 4.11959127943355e-05, "loss": 0.4406, "step": 41275 }, { "epoch": 1.4877284030706022, "grad_norm": 0.16490638256072998, "learning_rate": 4.119368970990607e-05, "loss": 0.4603, "step": 41280 }, { "epoch": 1.487908602731827, "grad_norm": 0.20363958179950714, "learning_rate": 4.119146640484075e-05, "loss": 0.3987, "step": 41285 }, { "epoch": 1.4880888023930514, "grad_norm": 0.17337477207183838, "learning_rate": 4.118924287916984e-05, "loss": 0.4447, "step": 41290 }, { "epoch": 1.4882690020542761, "grad_norm": 0.1706027388572693, "learning_rate": 4.118701913292363e-05, "loss": 0.4281, "step": 41295 }, { "epoch": 1.4884492017155009, "grad_norm": 0.22588284313678741, "learning_rate": 4.118479516613242e-05, "loss": 0.4214, "step": 41300 }, { "epoch": 1.4886294013767254, "grad_norm": 0.17973540723323822, "learning_rate": 4.1182570978826496e-05, "loss": 0.4239, "step": 41305 }, { "epoch": 1.4888096010379501, "grad_norm": 0.1527833491563797, "learning_rate": 4.118034657103619e-05, "loss": 0.3976, "step": 41310 }, { "epoch": 1.4889898006991746, "grad_norm": 0.15691448748111725, "learning_rate": 4.1178121942791786e-05, "loss": 0.4011, "step": 41315 }, { "epoch": 1.4891700003603994, "grad_norm": 0.20908258855342865, "learning_rate": 4.117589709412361e-05, "loss": 0.4172, "step": 41320 }, { "epoch": 1.4893502000216239, "grad_norm": 0.18858399987220764, "learning_rate": 4.117367202506196e-05, "loss": 0.4134, "step": 41325 }, { "epoch": 1.4895303996828486, "grad_norm": 0.2324933558702469, "learning_rate": 4.117144673563717e-05, "loss": 0.4321, "step": 41330 }, { "epoch": 1.4897105993440731, "grad_norm": 0.1979554146528244, 
"learning_rate": 4.116922122587954e-05, "loss": 0.3956, "step": 41335 }, { "epoch": 1.4898907990052979, "grad_norm": 0.17453639209270477, "learning_rate": 4.11669954958194e-05, "loss": 0.4303, "step": 41340 }, { "epoch": 1.4900709986665226, "grad_norm": 0.17398759722709656, "learning_rate": 4.116476954548708e-05, "loss": 0.4485, "step": 41345 }, { "epoch": 1.490251198327747, "grad_norm": 0.1593339741230011, "learning_rate": 4.116254337491291e-05, "loss": 0.4429, "step": 41350 }, { "epoch": 1.4904313979889718, "grad_norm": 0.15613697469234467, "learning_rate": 4.1160316984127205e-05, "loss": 0.3826, "step": 41355 }, { "epoch": 1.4906115976501964, "grad_norm": 0.1693560630083084, "learning_rate": 4.115809037316032e-05, "loss": 0.4813, "step": 41360 }, { "epoch": 1.490791797311421, "grad_norm": 0.15100261569023132, "learning_rate": 4.1155863542042575e-05, "loss": 0.4032, "step": 41365 }, { "epoch": 1.4909719969726458, "grad_norm": 0.2202003002166748, "learning_rate": 4.115363649080432e-05, "loss": 0.3974, "step": 41370 }, { "epoch": 1.4911521966338703, "grad_norm": 0.1720680594444275, "learning_rate": 4.11514092194759e-05, "loss": 0.3987, "step": 41375 }, { "epoch": 1.4913323962950948, "grad_norm": 0.16511781513690948, "learning_rate": 4.114918172808765e-05, "loss": 0.3955, "step": 41380 }, { "epoch": 1.4915125959563196, "grad_norm": 0.18977214395999908, "learning_rate": 4.1146954016669925e-05, "loss": 0.3936, "step": 41385 }, { "epoch": 1.4916927956175443, "grad_norm": 0.18784664571285248, "learning_rate": 4.1144726085253084e-05, "loss": 0.4507, "step": 41390 }, { "epoch": 1.4918729952787688, "grad_norm": 0.1858154982328415, "learning_rate": 4.1142497933867465e-05, "loss": 0.4643, "step": 41395 }, { "epoch": 1.4920531949399936, "grad_norm": 0.19693292677402496, "learning_rate": 4.1140269562543445e-05, "loss": 0.411, "step": 41400 }, { "epoch": 1.492233394601218, "grad_norm": 0.14796632528305054, "learning_rate": 4.113804097131138e-05, "loss": 0.4257, "step": 41405 }, 
{ "epoch": 1.4924135942624428, "grad_norm": 0.2132243663072586, "learning_rate": 4.1135812160201624e-05, "loss": 0.4285, "step": 41410 }, { "epoch": 1.4925937939236675, "grad_norm": 0.151437446475029, "learning_rate": 4.113358312924455e-05, "loss": 0.3778, "step": 41415 }, { "epoch": 1.492773993584892, "grad_norm": 0.1567225605249405, "learning_rate": 4.1131353878470536e-05, "loss": 0.365, "step": 41420 }, { "epoch": 1.4929541932461168, "grad_norm": 0.19728878140449524, "learning_rate": 4.1129124407909944e-05, "loss": 0.4408, "step": 41425 }, { "epoch": 1.4931343929073413, "grad_norm": 0.1711881160736084, "learning_rate": 4.112689471759316e-05, "loss": 0.3945, "step": 41430 }, { "epoch": 1.493314592568566, "grad_norm": 0.18180076777935028, "learning_rate": 4.112466480755055e-05, "loss": 0.4029, "step": 41435 }, { "epoch": 1.4934947922297905, "grad_norm": 0.17946167290210724, "learning_rate": 4.1122434677812506e-05, "loss": 0.4141, "step": 41440 }, { "epoch": 1.4936749918910153, "grad_norm": 0.17549100518226624, "learning_rate": 4.1120204328409416e-05, "loss": 0.4119, "step": 41445 }, { "epoch": 1.4938551915522398, "grad_norm": 0.18163146078586578, "learning_rate": 4.1117973759371666e-05, "loss": 0.4345, "step": 41450 }, { "epoch": 1.4940353912134645, "grad_norm": 0.1497826874256134, "learning_rate": 4.111574297072963e-05, "loss": 0.3586, "step": 41455 }, { "epoch": 1.4942155908746892, "grad_norm": 0.17524641752243042, "learning_rate": 4.111351196251373e-05, "loss": 0.3819, "step": 41460 }, { "epoch": 1.4943957905359138, "grad_norm": 0.19001320004463196, "learning_rate": 4.1111280734754345e-05, "loss": 0.4198, "step": 41465 }, { "epoch": 1.4945759901971385, "grad_norm": 0.15616640448570251, "learning_rate": 4.1109049287481874e-05, "loss": 0.4113, "step": 41470 }, { "epoch": 1.494756189858363, "grad_norm": 0.1617770940065384, "learning_rate": 4.110681762072672e-05, "loss": 0.4383, "step": 41475 }, { "epoch": 1.4949363895195877, "grad_norm": 0.20571884512901306, 
"learning_rate": 4.110458573451931e-05, "loss": 0.4229, "step": 41480 }, { "epoch": 1.4951165891808125, "grad_norm": 0.18498611450195312, "learning_rate": 4.110235362889003e-05, "loss": 0.4011, "step": 41485 }, { "epoch": 1.495296788842037, "grad_norm": 0.19509319961071014, "learning_rate": 4.1100121303869296e-05, "loss": 0.41, "step": 41490 }, { "epoch": 1.4954769885032615, "grad_norm": 0.21900251507759094, "learning_rate": 4.109788875948754e-05, "loss": 0.4099, "step": 41495 }, { "epoch": 1.4956571881644862, "grad_norm": 0.1650031954050064, "learning_rate": 4.109565599577515e-05, "loss": 0.386, "step": 41500 }, { "epoch": 1.4956571881644862, "eval_loss": 0.44493234157562256, "eval_runtime": 3.5367, "eval_samples_per_second": 28.275, "eval_steps_per_second": 7.069, "step": 41500 }, { "epoch": 1.495837387825711, "grad_norm": 0.19879759848117828, "learning_rate": 4.109342301276257e-05, "loss": 0.4323, "step": 41505 }, { "epoch": 1.4960175874869355, "grad_norm": 0.22263413667678833, "learning_rate": 4.109118981048022e-05, "loss": 0.4187, "step": 41510 }, { "epoch": 1.4961977871481602, "grad_norm": 0.18593116104602814, "learning_rate": 4.1088956388958524e-05, "loss": 0.4388, "step": 41515 }, { "epoch": 1.4963779868093847, "grad_norm": 0.16802819073200226, "learning_rate": 4.1086722748227903e-05, "loss": 0.4141, "step": 41520 }, { "epoch": 1.4965581864706095, "grad_norm": 0.19732625782489777, "learning_rate": 4.108448888831881e-05, "loss": 0.4034, "step": 41525 }, { "epoch": 1.4967383861318342, "grad_norm": 0.15761670470237732, "learning_rate": 4.108225480926167e-05, "loss": 0.4235, "step": 41530 }, { "epoch": 1.4969185857930587, "grad_norm": 0.195955291390419, "learning_rate": 4.108002051108691e-05, "loss": 0.4061, "step": 41535 }, { "epoch": 1.4970987854542832, "grad_norm": 0.19495412707328796, "learning_rate": 4.107778599382499e-05, "loss": 0.4036, "step": 41540 }, { "epoch": 1.497278985115508, "grad_norm": 0.2070777267217636, "learning_rate": 
4.1075551257506354e-05, "loss": 0.4182, "step": 41545 }, { "epoch": 1.4974591847767327, "grad_norm": 0.1688157469034195, "learning_rate": 4.1073316302161435e-05, "loss": 0.4271, "step": 41550 }, { "epoch": 1.4976393844379572, "grad_norm": 0.15762291848659515, "learning_rate": 4.1071081127820696e-05, "loss": 0.4158, "step": 41555 }, { "epoch": 1.497819584099182, "grad_norm": 0.19532495737075806, "learning_rate": 4.1068845734514593e-05, "loss": 0.4508, "step": 41560 }, { "epoch": 1.4979997837604064, "grad_norm": 0.15557561814785004, "learning_rate": 4.1066610122273575e-05, "loss": 0.4036, "step": 41565 }, { "epoch": 1.4981799834216312, "grad_norm": 0.18481487035751343, "learning_rate": 4.1064374291128106e-05, "loss": 0.4475, "step": 41570 }, { "epoch": 1.498360183082856, "grad_norm": 0.17046202719211578, "learning_rate": 4.1062138241108645e-05, "loss": 0.4168, "step": 41575 }, { "epoch": 1.4985403827440804, "grad_norm": 0.18078990280628204, "learning_rate": 4.105990197224566e-05, "loss": 0.4325, "step": 41580 }, { "epoch": 1.4987205824053051, "grad_norm": 0.1715507060289383, "learning_rate": 4.105766548456962e-05, "loss": 0.422, "step": 41585 }, { "epoch": 1.4989007820665297, "grad_norm": 0.17720800638198853, "learning_rate": 4.1055428778111004e-05, "loss": 0.4115, "step": 41590 }, { "epoch": 1.4990809817277544, "grad_norm": 0.16280311346054077, "learning_rate": 4.105319185290027e-05, "loss": 0.4068, "step": 41595 }, { "epoch": 1.4992611813889791, "grad_norm": 0.1896113157272339, "learning_rate": 4.10509547089679e-05, "loss": 0.4245, "step": 41600 }, { "epoch": 1.4994413810502036, "grad_norm": 0.17678354680538177, "learning_rate": 4.10487173463444e-05, "loss": 0.4236, "step": 41605 }, { "epoch": 1.4996215807114281, "grad_norm": 0.15229156613349915, "learning_rate": 4.104647976506022e-05, "loss": 0.3917, "step": 41610 }, { "epoch": 1.4998017803726529, "grad_norm": 0.2196597158908844, "learning_rate": 4.104424196514586e-05, "loss": 0.4439, "step": 41615 }, { "epoch": 
1.4999819800338776, "grad_norm": 0.18850156664848328, "learning_rate": 4.104200394663181e-05, "loss": 0.4067, "step": 41620 }, { "epoch": 1.5001621796951021, "grad_norm": 0.19405673444271088, "learning_rate": 4.103976570954856e-05, "loss": 0.4496, "step": 41625 }, { "epoch": 1.5003423793563269, "grad_norm": 0.20671923458576202, "learning_rate": 4.103752725392661e-05, "loss": 0.4071, "step": 41630 }, { "epoch": 1.5005225790175514, "grad_norm": 0.2034417986869812, "learning_rate": 4.103528857979646e-05, "loss": 0.3851, "step": 41635 }, { "epoch": 1.500702778678776, "grad_norm": 0.1623309999704361, "learning_rate": 4.10330496871886e-05, "loss": 0.4161, "step": 41640 }, { "epoch": 1.5008829783400008, "grad_norm": 0.1519644856452942, "learning_rate": 4.103081057613355e-05, "loss": 0.4051, "step": 41645 }, { "epoch": 1.5010631780012254, "grad_norm": 0.14681312441825867, "learning_rate": 4.1028571246661804e-05, "loss": 0.4235, "step": 41650 }, { "epoch": 1.5012433776624499, "grad_norm": 0.189476877450943, "learning_rate": 4.102633169880388e-05, "loss": 0.3896, "step": 41655 }, { "epoch": 1.5014235773236746, "grad_norm": 0.17130400240421295, "learning_rate": 4.102409193259029e-05, "loss": 0.4145, "step": 41660 }, { "epoch": 1.5016037769848993, "grad_norm": 0.16844402253627777, "learning_rate": 4.102185194805155e-05, "loss": 0.445, "step": 41665 }, { "epoch": 1.501783976646124, "grad_norm": 0.17054221034049988, "learning_rate": 4.101961174521818e-05, "loss": 0.4269, "step": 41670 }, { "epoch": 1.5019641763073486, "grad_norm": 0.1802477240562439, "learning_rate": 4.101737132412069e-05, "loss": 0.4183, "step": 41675 }, { "epoch": 1.502144375968573, "grad_norm": 0.20035143196582794, "learning_rate": 4.101513068478963e-05, "loss": 0.4227, "step": 41680 }, { "epoch": 1.5023245756297978, "grad_norm": 0.165787011384964, "learning_rate": 4.101288982725551e-05, "loss": 0.4357, "step": 41685 }, { "epoch": 1.5025047752910226, "grad_norm": 0.1805458515882492, "learning_rate": 
4.1010648751548876e-05, "loss": 0.4115, "step": 41690 }, { "epoch": 1.502684974952247, "grad_norm": 0.17907939851284027, "learning_rate": 4.1008407457700234e-05, "loss": 0.3908, "step": 41695 }, { "epoch": 1.5028651746134716, "grad_norm": 0.24330779910087585, "learning_rate": 4.1006165945740154e-05, "loss": 0.4361, "step": 41700 }, { "epoch": 1.5030453742746963, "grad_norm": 0.21545155346393585, "learning_rate": 4.100392421569916e-05, "loss": 0.4375, "step": 41705 }, { "epoch": 1.503225573935921, "grad_norm": 0.18745043873786926, "learning_rate": 4.1001682267607796e-05, "loss": 0.3854, "step": 41710 }, { "epoch": 1.5034057735971458, "grad_norm": 0.16889239847660065, "learning_rate": 4.0999440101496606e-05, "loss": 0.3463, "step": 41715 }, { "epoch": 1.5035859732583703, "grad_norm": 0.18077971041202545, "learning_rate": 4.099719771739614e-05, "loss": 0.3698, "step": 41720 }, { "epoch": 1.5037661729195948, "grad_norm": 0.20077045261859894, "learning_rate": 4.099495511533696e-05, "loss": 0.4117, "step": 41725 }, { "epoch": 1.5039463725808195, "grad_norm": 0.1883951872587204, "learning_rate": 4.099271229534961e-05, "loss": 0.4191, "step": 41730 }, { "epoch": 1.5041265722420443, "grad_norm": 0.18416355550289154, "learning_rate": 4.0990469257464645e-05, "loss": 0.3896, "step": 41735 }, { "epoch": 1.5043067719032688, "grad_norm": 0.20206254720687866, "learning_rate": 4.0988226001712646e-05, "loss": 0.4027, "step": 41740 }, { "epoch": 1.5044869715644935, "grad_norm": 0.18239402770996094, "learning_rate": 4.0985982528124156e-05, "loss": 0.3764, "step": 41745 }, { "epoch": 1.504667171225718, "grad_norm": 0.1996680647134781, "learning_rate": 4.098373883672974e-05, "loss": 0.404, "step": 41750 }, { "epoch": 1.5048473708869428, "grad_norm": 0.1679215282201767, "learning_rate": 4.0981494927559994e-05, "loss": 0.4324, "step": 41755 }, { "epoch": 1.5050275705481675, "grad_norm": 0.19576683640480042, "learning_rate": 4.0979250800645465e-05, "loss": 0.4211, "step": 41760 }, { 
"epoch": 1.505207770209392, "grad_norm": 0.20221199095249176, "learning_rate": 4.097700645601673e-05, "loss": 0.3923, "step": 41765 }, { "epoch": 1.5053879698706165, "grad_norm": 0.14687031507492065, "learning_rate": 4.097476189370439e-05, "loss": 0.3818, "step": 41770 }, { "epoch": 1.5055681695318412, "grad_norm": 0.19657109677791595, "learning_rate": 4.097251711373901e-05, "loss": 0.417, "step": 41775 }, { "epoch": 1.505748369193066, "grad_norm": 0.2037065178155899, "learning_rate": 4.097027211615117e-05, "loss": 0.3996, "step": 41780 }, { "epoch": 1.5059285688542907, "grad_norm": 0.22538067400455475, "learning_rate": 4.096802690097146e-05, "loss": 0.4108, "step": 41785 }, { "epoch": 1.5061087685155152, "grad_norm": 0.2207726240158081, "learning_rate": 4.096578146823048e-05, "loss": 0.4288, "step": 41790 }, { "epoch": 1.5062889681767397, "grad_norm": 0.16942410171031952, "learning_rate": 4.096353581795882e-05, "loss": 0.4209, "step": 41795 }, { "epoch": 1.5064691678379645, "grad_norm": 0.22281381487846375, "learning_rate": 4.096128995018707e-05, "loss": 0.4037, "step": 41800 }, { "epoch": 1.5066493674991892, "grad_norm": 0.19026844203472137, "learning_rate": 4.095904386494585e-05, "loss": 0.407, "step": 41805 }, { "epoch": 1.5068295671604137, "grad_norm": 0.23003491759300232, "learning_rate": 4.095679756226573e-05, "loss": 0.3969, "step": 41810 }, { "epoch": 1.5070097668216382, "grad_norm": 0.19548535346984863, "learning_rate": 4.0954551042177337e-05, "loss": 0.403, "step": 41815 }, { "epoch": 1.507189966482863, "grad_norm": 0.19245415925979614, "learning_rate": 4.0952304304711275e-05, "loss": 0.398, "step": 41820 }, { "epoch": 1.5073701661440877, "grad_norm": 0.18857969343662262, "learning_rate": 4.0950057349898165e-05, "loss": 0.4223, "step": 41825 }, { "epoch": 1.5075503658053124, "grad_norm": 0.19236567616462708, "learning_rate": 4.0947810177768595e-05, "loss": 0.4026, "step": 41830 }, { "epoch": 1.507730565466537, "grad_norm": 0.14722810685634613, 
"learning_rate": 4.094556278835321e-05, "loss": 0.3917, "step": 41835 }, { "epoch": 1.5079107651277615, "grad_norm": 0.18496693670749664, "learning_rate": 4.094331518168262e-05, "loss": 0.4228, "step": 41840 }, { "epoch": 1.5080909647889862, "grad_norm": 0.14098331332206726, "learning_rate": 4.0941067357787445e-05, "loss": 0.4242, "step": 41845 }, { "epoch": 1.508271164450211, "grad_norm": 0.16862405836582184, "learning_rate": 4.093881931669831e-05, "loss": 0.4082, "step": 41850 }, { "epoch": 1.5084513641114354, "grad_norm": 0.18131491541862488, "learning_rate": 4.093657105844585e-05, "loss": 0.4356, "step": 41855 }, { "epoch": 1.50863156377266, "grad_norm": 0.17331118881702423, "learning_rate": 4.093432258306069e-05, "loss": 0.4143, "step": 41860 }, { "epoch": 1.5088117634338847, "grad_norm": 0.17720840871334076, "learning_rate": 4.093207389057348e-05, "loss": 0.3942, "step": 41865 }, { "epoch": 1.5089919630951094, "grad_norm": 0.18100488185882568, "learning_rate": 4.0929824981014845e-05, "loss": 0.4303, "step": 41870 }, { "epoch": 1.5091721627563341, "grad_norm": 0.1756538450717926, "learning_rate": 4.0927575854415425e-05, "loss": 0.418, "step": 41875 }, { "epoch": 1.5093523624175587, "grad_norm": 0.2303672730922699, "learning_rate": 4.092532651080587e-05, "loss": 0.4329, "step": 41880 }, { "epoch": 1.5095325620787832, "grad_norm": 0.15763244032859802, "learning_rate": 4.092307695021682e-05, "loss": 0.4029, "step": 41885 }, { "epoch": 1.509712761740008, "grad_norm": 0.16717827320098877, "learning_rate": 4.092082717267893e-05, "loss": 0.4265, "step": 41890 }, { "epoch": 1.5098929614012326, "grad_norm": 0.2047724723815918, "learning_rate": 4.091857717822286e-05, "loss": 0.4463, "step": 41895 }, { "epoch": 1.5100731610624571, "grad_norm": 0.18995271623134613, "learning_rate": 4.091632696687925e-05, "loss": 0.4023, "step": 41900 }, { "epoch": 1.5102533607236819, "grad_norm": 0.19766134023666382, "learning_rate": 4.091407653867877e-05, "loss": 0.4182, "step": 41905 }, 
{ "epoch": 1.5104335603849064, "grad_norm": 0.21319149434566498, "learning_rate": 4.091182589365208e-05, "loss": 0.4199, "step": 41910 }, { "epoch": 1.5106137600461311, "grad_norm": 0.17943358421325684, "learning_rate": 4.090957503182984e-05, "loss": 0.3911, "step": 41915 }, { "epoch": 1.5107939597073559, "grad_norm": 0.196610227227211, "learning_rate": 4.0907323953242724e-05, "loss": 0.4463, "step": 41920 }, { "epoch": 1.5109741593685804, "grad_norm": 0.18429423868656158, "learning_rate": 4.0905072657921396e-05, "loss": 0.4321, "step": 41925 }, { "epoch": 1.5111543590298049, "grad_norm": 0.16434848308563232, "learning_rate": 4.090282114589653e-05, "loss": 0.4202, "step": 41930 }, { "epoch": 1.5113345586910296, "grad_norm": 0.15789394080638885, "learning_rate": 4.090056941719881e-05, "loss": 0.3869, "step": 41935 }, { "epoch": 1.5115147583522544, "grad_norm": 0.16550783812999725, "learning_rate": 4.08983174718589e-05, "loss": 0.4141, "step": 41940 }, { "epoch": 1.511694958013479, "grad_norm": 0.159341499209404, "learning_rate": 4.08960653099075e-05, "loss": 0.4122, "step": 41945 }, { "epoch": 1.5118751576747036, "grad_norm": 0.187036395072937, "learning_rate": 4.089381293137529e-05, "loss": 0.4435, "step": 41950 }, { "epoch": 1.512055357335928, "grad_norm": 0.18756411969661713, "learning_rate": 4.0891560336292955e-05, "loss": 0.4126, "step": 41955 }, { "epoch": 1.5122355569971528, "grad_norm": 0.18429414927959442, "learning_rate": 4.088930752469119e-05, "loss": 0.4426, "step": 41960 }, { "epoch": 1.5124157566583776, "grad_norm": 0.16013193130493164, "learning_rate": 4.0887054496600676e-05, "loss": 0.3959, "step": 41965 }, { "epoch": 1.512595956319602, "grad_norm": 0.20077770948410034, "learning_rate": 4.088480125205213e-05, "loss": 0.3921, "step": 41970 }, { "epoch": 1.5127761559808266, "grad_norm": 0.20495128631591797, "learning_rate": 4.088254779107624e-05, "loss": 0.4392, "step": 41975 }, { "epoch": 1.5129563556420513, "grad_norm": 0.16244658827781677, 
"learning_rate": 4.088074486648847e-05, "loss": 0.4346, "step": 41980 }, { "epoch": 1.513136555303276, "grad_norm": 0.2103343904018402, "learning_rate": 4.0878491016020734e-05, "loss": 0.4033, "step": 41985 }, { "epoch": 1.5133167549645008, "grad_norm": 0.17164449393749237, "learning_rate": 4.087623694921164e-05, "loss": 0.4303, "step": 41990 }, { "epoch": 1.5134969546257253, "grad_norm": 0.1736312210559845, "learning_rate": 4.087398266609188e-05, "loss": 0.4023, "step": 41995 }, { "epoch": 1.5136771542869498, "grad_norm": 0.17446556687355042, "learning_rate": 4.0871728166692195e-05, "loss": 0.4104, "step": 42000 }, { "epoch": 1.5136771542869498, "eval_loss": 0.4447964131832123, "eval_runtime": 3.5328, "eval_samples_per_second": 28.306, "eval_steps_per_second": 7.076, "step": 42000 }, { "epoch": 1.5138573539481746, "grad_norm": 0.16829872131347656, "learning_rate": 4.0869473451043274e-05, "loss": 0.4072, "step": 42005 }, { "epoch": 1.5140375536093993, "grad_norm": 0.1769193410873413, "learning_rate": 4.086721851917585e-05, "loss": 0.3981, "step": 42010 }, { "epoch": 1.5142177532706238, "grad_norm": 0.13675101101398468, "learning_rate": 4.086496337112064e-05, "loss": 0.3936, "step": 42015 }, { "epoch": 1.5143979529318485, "grad_norm": 0.16778729856014252, "learning_rate": 4.0862708006908393e-05, "loss": 0.3619, "step": 42020 }, { "epoch": 1.514578152593073, "grad_norm": 0.21226206421852112, "learning_rate": 4.086045242656982e-05, "loss": 0.4095, "step": 42025 }, { "epoch": 1.5147583522542978, "grad_norm": 0.16903939843177795, "learning_rate": 4.0858196630135636e-05, "loss": 0.4046, "step": 42030 }, { "epoch": 1.5149385519155225, "grad_norm": 0.18711115419864655, "learning_rate": 4.0855940617636604e-05, "loss": 0.4402, "step": 42035 }, { "epoch": 1.515118751576747, "grad_norm": 0.22342517971992493, "learning_rate": 4.0853684389103444e-05, "loss": 0.4173, "step": 42040 }, { "epoch": 1.5152989512379715, "grad_norm": 0.2237049639225006, "learning_rate": 
4.0851427944566915e-05, "loss": 0.441, "step": 42045 }, { "epoch": 1.5154791508991963, "grad_norm": 0.1923510730266571, "learning_rate": 4.084917128405774e-05, "loss": 0.4073, "step": 42050 }, { "epoch": 1.515659350560421, "grad_norm": 0.14314790070056915, "learning_rate": 4.0846914407606676e-05, "loss": 0.3933, "step": 42055 }, { "epoch": 1.5158395502216457, "grad_norm": 0.18931898474693298, "learning_rate": 4.0844657315244475e-05, "loss": 0.3878, "step": 42060 }, { "epoch": 1.5160197498828702, "grad_norm": 0.1530705988407135, "learning_rate": 4.084240000700188e-05, "loss": 0.4434, "step": 42065 }, { "epoch": 1.5161999495440948, "grad_norm": 0.1672196239233017, "learning_rate": 4.084014248290966e-05, "loss": 0.423, "step": 42070 }, { "epoch": 1.5163801492053195, "grad_norm": 0.17072351276874542, "learning_rate": 4.083788474299856e-05, "loss": 0.4333, "step": 42075 }, { "epoch": 1.5165603488665442, "grad_norm": 0.18741382658481598, "learning_rate": 4.083562678729935e-05, "loss": 0.4173, "step": 42080 }, { "epoch": 1.5167405485277687, "grad_norm": 0.19706690311431885, "learning_rate": 4.083336861584278e-05, "loss": 0.4282, "step": 42085 }, { "epoch": 1.5169207481889933, "grad_norm": 0.1541031002998352, "learning_rate": 4.0831110228659644e-05, "loss": 0.4078, "step": 42090 }, { "epoch": 1.517100947850218, "grad_norm": 0.16391873359680176, "learning_rate": 4.082885162578069e-05, "loss": 0.4351, "step": 42095 }, { "epoch": 1.5172811475114427, "grad_norm": 0.1912384182214737, "learning_rate": 4.0826592807236694e-05, "loss": 0.3714, "step": 42100 }, { "epoch": 1.5174613471726675, "grad_norm": 0.19222666323184967, "learning_rate": 4.0824333773058434e-05, "loss": 0.4148, "step": 42105 }, { "epoch": 1.517641546833892, "grad_norm": 0.18687984347343445, "learning_rate": 4.08220745232767e-05, "loss": 0.4279, "step": 42110 }, { "epoch": 1.5178217464951165, "grad_norm": 0.17974166572093964, "learning_rate": 4.0819815057922254e-05, "loss": 0.4572, "step": 42115 }, { "epoch": 
1.5180019461563412, "grad_norm": 0.18375541269779205, "learning_rate": 4.0817555377025895e-05, "loss": 0.4348, "step": 42120 }, { "epoch": 1.518182145817566, "grad_norm": 0.21198663115501404, "learning_rate": 4.0815295480618395e-05, "loss": 0.4156, "step": 42125 }, { "epoch": 1.5183623454787905, "grad_norm": 0.18676543235778809, "learning_rate": 4.081303536873058e-05, "loss": 0.4357, "step": 42130 }, { "epoch": 1.5185425451400152, "grad_norm": 0.19135519862174988, "learning_rate": 4.08107750413932e-05, "loss": 0.3997, "step": 42135 }, { "epoch": 1.5187227448012397, "grad_norm": 0.16404955089092255, "learning_rate": 4.080851449863708e-05, "loss": 0.3918, "step": 42140 }, { "epoch": 1.5189029444624644, "grad_norm": 0.17126797139644623, "learning_rate": 4.080625374049301e-05, "loss": 0.4189, "step": 42145 }, { "epoch": 1.5190831441236892, "grad_norm": 0.1819930374622345, "learning_rate": 4.0803992766991785e-05, "loss": 0.4446, "step": 42150 }, { "epoch": 1.5192633437849137, "grad_norm": 0.18088293075561523, "learning_rate": 4.080173157816422e-05, "loss": 0.4154, "step": 42155 }, { "epoch": 1.5194435434461382, "grad_norm": 0.18512316048145294, "learning_rate": 4.079947017404113e-05, "loss": 0.43, "step": 42160 }, { "epoch": 1.519623743107363, "grad_norm": 0.17444075644016266, "learning_rate": 4.079720855465331e-05, "loss": 0.4072, "step": 42165 }, { "epoch": 1.5198039427685877, "grad_norm": 0.16421203315258026, "learning_rate": 4.079494672003159e-05, "loss": 0.4049, "step": 42170 }, { "epoch": 1.5199841424298124, "grad_norm": 0.24568875133991241, "learning_rate": 4.079268467020677e-05, "loss": 0.4327, "step": 42175 }, { "epoch": 1.520164342091037, "grad_norm": 0.2001422941684723, "learning_rate": 4.079042240520968e-05, "loss": 0.4125, "step": 42180 }, { "epoch": 1.5203445417522614, "grad_norm": 0.16765598952770233, "learning_rate": 4.078815992507115e-05, "loss": 0.3885, "step": 42185 }, { "epoch": 1.5205247414134861, "grad_norm": 0.20924493670463562, "learning_rate": 
4.078589722982199e-05, "loss": 0.4214, "step": 42190 }, { "epoch": 1.5207049410747109, "grad_norm": 0.183701753616333, "learning_rate": 4.078363431949304e-05, "loss": 0.4176, "step": 42195 }, { "epoch": 1.5208851407359354, "grad_norm": 0.1759697049856186, "learning_rate": 4.0781371194115126e-05, "loss": 0.4188, "step": 42200 }, { "epoch": 1.52106534039716, "grad_norm": 0.19455984234809875, "learning_rate": 4.077910785371909e-05, "loss": 0.4576, "step": 42205 }, { "epoch": 1.5212455400583846, "grad_norm": 0.21152974665164948, "learning_rate": 4.077684429833576e-05, "loss": 0.4207, "step": 42210 }, { "epoch": 1.5214257397196094, "grad_norm": 0.18577638268470764, "learning_rate": 4.0774580527995975e-05, "loss": 0.4593, "step": 42215 }, { "epoch": 1.521605939380834, "grad_norm": 0.18051621317863464, "learning_rate": 4.0772316542730594e-05, "loss": 0.4016, "step": 42220 }, { "epoch": 1.5217861390420586, "grad_norm": 0.1438736468553543, "learning_rate": 4.077005234257045e-05, "loss": 0.4366, "step": 42225 }, { "epoch": 1.5219663387032831, "grad_norm": 0.20258064568042755, "learning_rate": 4.0767787927546394e-05, "loss": 0.4339, "step": 42230 }, { "epoch": 1.5221465383645079, "grad_norm": 0.19496627151966095, "learning_rate": 4.076552329768929e-05, "loss": 0.4543, "step": 42235 }, { "epoch": 1.5223267380257326, "grad_norm": 0.1588614284992218, "learning_rate": 4.0763258453029976e-05, "loss": 0.4298, "step": 42240 }, { "epoch": 1.522506937686957, "grad_norm": 0.1556130349636078, "learning_rate": 4.076099339359931e-05, "loss": 0.4272, "step": 42245 }, { "epoch": 1.5226871373481818, "grad_norm": 0.20325186848640442, "learning_rate": 4.0758728119428166e-05, "loss": 0.4189, "step": 42250 }, { "epoch": 1.5228673370094064, "grad_norm": 0.20584464073181152, "learning_rate": 4.075646263054741e-05, "loss": 0.4432, "step": 42255 }, { "epoch": 1.523047536670631, "grad_norm": 0.15573151409626007, "learning_rate": 4.0754196926987897e-05, "loss": 0.3929, "step": 42260 }, { "epoch": 
1.5232277363318558, "grad_norm": 0.17175668478012085, "learning_rate": 4.0751931008780496e-05, "loss": 0.4094, "step": 42265 }, { "epoch": 1.5234079359930803, "grad_norm": 0.16942541301250458, "learning_rate": 4.074966487595608e-05, "loss": 0.4101, "step": 42270 }, { "epoch": 1.5235881356543048, "grad_norm": 0.1620819866657257, "learning_rate": 4.074739852854555e-05, "loss": 0.4122, "step": 42275 }, { "epoch": 1.5237683353155296, "grad_norm": 0.22527901828289032, "learning_rate": 4.0745131966579744e-05, "loss": 0.3755, "step": 42280 }, { "epoch": 1.5239485349767543, "grad_norm": 0.16794419288635254, "learning_rate": 4.074286519008957e-05, "loss": 0.4646, "step": 42285 }, { "epoch": 1.524128734637979, "grad_norm": 0.20727737247943878, "learning_rate": 4.0740598199105904e-05, "loss": 0.3832, "step": 42290 }, { "epoch": 1.5243089342992036, "grad_norm": 0.1709243506193161, "learning_rate": 4.073833099365965e-05, "loss": 0.4257, "step": 42295 }, { "epoch": 1.524489133960428, "grad_norm": 0.16771294176578522, "learning_rate": 4.073606357378166e-05, "loss": 0.4417, "step": 42300 }, { "epoch": 1.5246693336216528, "grad_norm": 0.2179926335811615, "learning_rate": 4.073379593950286e-05, "loss": 0.3852, "step": 42305 }, { "epoch": 1.5248495332828775, "grad_norm": 0.19868656992912292, "learning_rate": 4.073152809085414e-05, "loss": 0.4342, "step": 42310 }, { "epoch": 1.525029732944102, "grad_norm": 0.14885884523391724, "learning_rate": 4.0729260027866395e-05, "loss": 0.4217, "step": 42315 }, { "epoch": 1.5252099326053266, "grad_norm": 0.21947623789310455, "learning_rate": 4.0726991750570525e-05, "loss": 0.4223, "step": 42320 }, { "epoch": 1.5253901322665513, "grad_norm": 0.2015712708234787, "learning_rate": 4.072472325899743e-05, "loss": 0.4617, "step": 42325 }, { "epoch": 1.525570331927776, "grad_norm": 0.1568281650543213, "learning_rate": 4.072245455317804e-05, "loss": 0.3907, "step": 42330 }, { "epoch": 1.5257505315890008, "grad_norm": 0.16805681586265564, "learning_rate": 
4.0720185633143246e-05, "loss": 0.421, "step": 42335 }, { "epoch": 1.5259307312502253, "grad_norm": 0.16904395818710327, "learning_rate": 4.071791649892396e-05, "loss": 0.4295, "step": 42340 }, { "epoch": 1.5261109309114498, "grad_norm": 0.15109315514564514, "learning_rate": 4.07156471505511e-05, "loss": 0.3572, "step": 42345 }, { "epoch": 1.5262911305726745, "grad_norm": 0.1983298510313034, "learning_rate": 4.0713377588055604e-05, "loss": 0.4146, "step": 42350 }, { "epoch": 1.5264713302338992, "grad_norm": 0.22432877123355865, "learning_rate": 4.0711107811468375e-05, "loss": 0.4239, "step": 42355 }, { "epoch": 1.5266515298951238, "grad_norm": 0.18590804934501648, "learning_rate": 4.070883782082035e-05, "loss": 0.4095, "step": 42360 }, { "epoch": 1.5268317295563483, "grad_norm": 0.22011986374855042, "learning_rate": 4.070656761614244e-05, "loss": 0.4078, "step": 42365 }, { "epoch": 1.527011929217573, "grad_norm": 0.19090569019317627, "learning_rate": 4.0704297197465594e-05, "loss": 0.4148, "step": 42370 }, { "epoch": 1.5271921288787977, "grad_norm": 0.14910888671875, "learning_rate": 4.070202656482074e-05, "loss": 0.41, "step": 42375 }, { "epoch": 1.5273723285400225, "grad_norm": 0.13588306307792664, "learning_rate": 4.0699755718238806e-05, "loss": 0.416, "step": 42380 }, { "epoch": 1.527552528201247, "grad_norm": 0.1951780468225479, "learning_rate": 4.069748465775075e-05, "loss": 0.4077, "step": 42385 }, { "epoch": 1.5277327278624715, "grad_norm": 0.1684001386165619, "learning_rate": 4.06952133833875e-05, "loss": 0.3969, "step": 42390 }, { "epoch": 1.5279129275236962, "grad_norm": 0.15427611768245697, "learning_rate": 4.0692941895180004e-05, "loss": 0.4256, "step": 42395 }, { "epoch": 1.528093127184921, "grad_norm": 0.19290746748447418, "learning_rate": 4.0690670193159214e-05, "loss": 0.4083, "step": 42400 }, { "epoch": 1.5282733268461455, "grad_norm": 0.23477697372436523, "learning_rate": 4.0688398277356076e-05, "loss": 0.443, "step": 42405 }, { "epoch": 
1.5284535265073702, "grad_norm": 0.1695445477962494, "learning_rate": 4.068612614780156e-05, "loss": 0.4119, "step": 42410 }, { "epoch": 1.5286337261685947, "grad_norm": 0.1924455612897873, "learning_rate": 4.068385380452661e-05, "loss": 0.3882, "step": 42415 }, { "epoch": 1.5288139258298195, "grad_norm": 0.17386910319328308, "learning_rate": 4.0681581247562186e-05, "loss": 0.4252, "step": 42420 }, { "epoch": 1.5289941254910442, "grad_norm": 0.15284450352191925, "learning_rate": 4.0679308476939245e-05, "loss": 0.3943, "step": 42425 }, { "epoch": 1.5291743251522687, "grad_norm": 0.21759410202503204, "learning_rate": 4.067703549268877e-05, "loss": 0.4231, "step": 42430 }, { "epoch": 1.5293545248134932, "grad_norm": 0.20288415253162384, "learning_rate": 4.0674762294841715e-05, "loss": 0.4339, "step": 42435 }, { "epoch": 1.529534724474718, "grad_norm": 0.229291170835495, "learning_rate": 4.067248888342907e-05, "loss": 0.4271, "step": 42440 }, { "epoch": 1.5297149241359427, "grad_norm": 0.1930796056985855, "learning_rate": 4.067021525848179e-05, "loss": 0.4097, "step": 42445 }, { "epoch": 1.5298951237971674, "grad_norm": 0.17433331906795502, "learning_rate": 4.066794142003086e-05, "loss": 0.403, "step": 42450 }, { "epoch": 1.530075323458392, "grad_norm": 0.15750248730182648, "learning_rate": 4.066566736810727e-05, "loss": 0.3983, "step": 42455 }, { "epoch": 1.5302555231196164, "grad_norm": 0.22471831738948822, "learning_rate": 4.0663393102741995e-05, "loss": 0.4662, "step": 42460 }, { "epoch": 1.5304357227808412, "grad_norm": 0.17949113249778748, "learning_rate": 4.066111862396601e-05, "loss": 0.4209, "step": 42465 }, { "epoch": 1.530615922442066, "grad_norm": 0.17478035390377045, "learning_rate": 4.065884393181032e-05, "loss": 0.3824, "step": 42470 }, { "epoch": 1.5307961221032904, "grad_norm": 0.14620402455329895, "learning_rate": 4.065656902630592e-05, "loss": 0.4105, "step": 42475 }, { "epoch": 1.530976321764515, "grad_norm": 0.2226269692182541, "learning_rate": 
4.06542939074838e-05, "loss": 0.4242, "step": 42480 }, { "epoch": 1.5311565214257397, "grad_norm": 0.1945657581090927, "learning_rate": 4.0652018575374945e-05, "loss": 0.4298, "step": 42485 }, { "epoch": 1.5313367210869644, "grad_norm": 0.18082071840763092, "learning_rate": 4.0649743030010366e-05, "loss": 0.4119, "step": 42490 }, { "epoch": 1.5315169207481891, "grad_norm": 0.1490999460220337, "learning_rate": 4.064746727142108e-05, "loss": 0.3611, "step": 42495 }, { "epoch": 1.5316971204094136, "grad_norm": 0.20667734742164612, "learning_rate": 4.064519129963807e-05, "loss": 0.3959, "step": 42500 }, { "epoch": 1.5316971204094136, "eval_loss": 0.4439639151096344, "eval_runtime": 3.531, "eval_samples_per_second": 28.32, "eval_steps_per_second": 7.08, "step": 42500 }, { "epoch": 1.5318773200706381, "grad_norm": 0.18733155727386475, "learning_rate": 4.064291511469237e-05, "loss": 0.4323, "step": 42505 }, { "epoch": 1.5320575197318629, "grad_norm": 0.17432504892349243, "learning_rate": 4.064063871661497e-05, "loss": 0.3858, "step": 42510 }, { "epoch": 1.5322377193930876, "grad_norm": 0.16605962812900543, "learning_rate": 4.06383621054369e-05, "loss": 0.4115, "step": 42515 }, { "epoch": 1.5324179190543121, "grad_norm": 0.16073060035705566, "learning_rate": 4.0636085281189176e-05, "loss": 0.428, "step": 42520 }, { "epoch": 1.5325981187155369, "grad_norm": 0.18236148357391357, "learning_rate": 4.063380824390282e-05, "loss": 0.4489, "step": 42525 }, { "epoch": 1.5327783183767614, "grad_norm": 0.16601453721523285, "learning_rate": 4.063153099360884e-05, "loss": 0.4184, "step": 42530 }, { "epoch": 1.532958518037986, "grad_norm": 0.15551328659057617, "learning_rate": 4.0629253530338284e-05, "loss": 0.3619, "step": 42535 }, { "epoch": 1.5331387176992108, "grad_norm": 0.2021326869726181, "learning_rate": 4.062697585412218e-05, "loss": 0.4062, "step": 42540 }, { "epoch": 1.5333189173604354, "grad_norm": 0.1801024228334427, "learning_rate": 4.062469796499155e-05, "loss": 0.398, 
"step": 42545 }, { "epoch": 1.5334991170216599, "grad_norm": 0.1799817532300949, "learning_rate": 4.062241986297743e-05, "loss": 0.4576, "step": 42550 }, { "epoch": 1.5336793166828846, "grad_norm": 0.22685213387012482, "learning_rate": 4.062014154811087e-05, "loss": 0.4469, "step": 42555 }, { "epoch": 1.5338595163441093, "grad_norm": 0.19450637698173523, "learning_rate": 4.061786302042291e-05, "loss": 0.4152, "step": 42560 }, { "epoch": 1.534039716005334, "grad_norm": 0.17256684601306915, "learning_rate": 4.0615584279944586e-05, "loss": 0.3805, "step": 42565 }, { "epoch": 1.5342199156665586, "grad_norm": 0.17024463415145874, "learning_rate": 4.061330532670695e-05, "loss": 0.4086, "step": 42570 }, { "epoch": 1.534400115327783, "grad_norm": 0.16583757102489471, "learning_rate": 4.061102616074105e-05, "loss": 0.4145, "step": 42575 }, { "epoch": 1.5345803149890078, "grad_norm": 0.1984453946352005, "learning_rate": 4.060874678207794e-05, "loss": 0.426, "step": 42580 }, { "epoch": 1.5347605146502326, "grad_norm": 0.192495658993721, "learning_rate": 4.060646719074868e-05, "loss": 0.4444, "step": 42585 }, { "epoch": 1.534940714311457, "grad_norm": 0.1842796951532364, "learning_rate": 4.060418738678432e-05, "loss": 0.4038, "step": 42590 }, { "epoch": 1.5351209139726816, "grad_norm": 0.17399680614471436, "learning_rate": 4.060190737021594e-05, "loss": 0.4175, "step": 42595 }, { "epoch": 1.5353011136339063, "grad_norm": 0.20878851413726807, "learning_rate": 4.059962714107458e-05, "loss": 0.4576, "step": 42600 }, { "epoch": 1.535481313295131, "grad_norm": 0.19469203054904938, "learning_rate": 4.059734669939133e-05, "loss": 0.4215, "step": 42605 }, { "epoch": 1.5356615129563558, "grad_norm": 0.1677183359861374, "learning_rate": 4.0595066045197245e-05, "loss": 0.4248, "step": 42610 }, { "epoch": 1.5358417126175803, "grad_norm": 0.18572258949279785, "learning_rate": 4.059278517852341e-05, "loss": 0.395, "step": 42615 }, { "epoch": 1.5360219122788048, "grad_norm": 
0.2249453216791153, "learning_rate": 4.059050409940089e-05, "loss": 0.4284, "step": 42620 }, { "epoch": 1.5362021119400295, "grad_norm": 0.17428001761436462, "learning_rate": 4.058822280786077e-05, "loss": 0.3935, "step": 42625 }, { "epoch": 1.5363823116012543, "grad_norm": 0.1760479211807251, "learning_rate": 4.058594130393414e-05, "loss": 0.4689, "step": 42630 }, { "epoch": 1.5365625112624788, "grad_norm": 0.2455560564994812, "learning_rate": 4.058365958765207e-05, "loss": 0.4436, "step": 42635 }, { "epoch": 1.5367427109237035, "grad_norm": 0.18493212759494781, "learning_rate": 4.058137765904565e-05, "loss": 0.4125, "step": 42640 }, { "epoch": 1.536922910584928, "grad_norm": 0.16667693853378296, "learning_rate": 4.057909551814599e-05, "loss": 0.3845, "step": 42645 }, { "epoch": 1.5371031102461528, "grad_norm": 0.1610030233860016, "learning_rate": 4.057681316498416e-05, "loss": 0.4306, "step": 42650 }, { "epoch": 1.5372833099073775, "grad_norm": 0.19051054120063782, "learning_rate": 4.0574530599591264e-05, "loss": 0.3968, "step": 42655 }, { "epoch": 1.537463509568602, "grad_norm": 0.19822818040847778, "learning_rate": 4.0572247821998414e-05, "loss": 0.4072, "step": 42660 }, { "epoch": 1.5376437092298265, "grad_norm": 0.16644228994846344, "learning_rate": 4.056996483223669e-05, "loss": 0.4174, "step": 42665 }, { "epoch": 1.5378239088910512, "grad_norm": 0.14218945801258087, "learning_rate": 4.056768163033722e-05, "loss": 0.3884, "step": 42670 }, { "epoch": 1.538004108552276, "grad_norm": 0.21810731291770935, "learning_rate": 4.05653982163311e-05, "loss": 0.3874, "step": 42675 }, { "epoch": 1.5381843082135007, "grad_norm": 0.184495210647583, "learning_rate": 4.056311459024944e-05, "loss": 0.4172, "step": 42680 }, { "epoch": 1.5383645078747252, "grad_norm": 0.23121629655361176, "learning_rate": 4.0560830752123355e-05, "loss": 0.4142, "step": 42685 }, { "epoch": 1.5385447075359497, "grad_norm": 0.1597655862569809, "learning_rate": 4.0558546701983977e-05, "loss": 
0.4112, "step": 42690 }, { "epoch": 1.5387249071971745, "grad_norm": 0.17811793088912964, "learning_rate": 4.05562624398624e-05, "loss": 0.3963, "step": 42695 }, { "epoch": 1.5389051068583992, "grad_norm": 0.20575061440467834, "learning_rate": 4.055397796578976e-05, "loss": 0.4076, "step": 42700 }, { "epoch": 1.5390853065196237, "grad_norm": 0.1952335387468338, "learning_rate": 4.055169327979719e-05, "loss": 0.4558, "step": 42705 }, { "epoch": 1.5392655061808482, "grad_norm": 0.1855412870645523, "learning_rate": 4.0549408381915796e-05, "loss": 0.4063, "step": 42710 }, { "epoch": 1.539445705842073, "grad_norm": 0.16599588096141815, "learning_rate": 4.054712327217673e-05, "loss": 0.4303, "step": 42715 }, { "epoch": 1.5396259055032977, "grad_norm": 0.15664242208003998, "learning_rate": 4.054483795061112e-05, "loss": 0.3982, "step": 42720 }, { "epoch": 1.5398061051645224, "grad_norm": 0.2023800015449524, "learning_rate": 4.05425524172501e-05, "loss": 0.4052, "step": 42725 }, { "epoch": 1.539986304825747, "grad_norm": 0.1691524088382721, "learning_rate": 4.0540266672124814e-05, "loss": 0.4235, "step": 42730 }, { "epoch": 1.5401665044869715, "grad_norm": 0.2135685384273529, "learning_rate": 4.0537980715266404e-05, "loss": 0.4041, "step": 42735 }, { "epoch": 1.5403467041481962, "grad_norm": 0.16403359174728394, "learning_rate": 4.0535694546706014e-05, "loss": 0.3921, "step": 42740 }, { "epoch": 1.540526903809421, "grad_norm": 0.1725141555070877, "learning_rate": 4.05334081664748e-05, "loss": 0.4409, "step": 42745 }, { "epoch": 1.5407071034706454, "grad_norm": 0.16590328514575958, "learning_rate": 4.05311215746039e-05, "loss": 0.4197, "step": 42750 }, { "epoch": 1.5408873031318702, "grad_norm": 0.20750023424625397, "learning_rate": 4.0528834771124474e-05, "loss": 0.4164, "step": 42755 }, { "epoch": 1.5410675027930947, "grad_norm": 0.18303443491458893, "learning_rate": 4.052654775606768e-05, "loss": 0.4091, "step": 42760 }, { "epoch": 1.5412477024543194, "grad_norm": 
0.16997192800045013, "learning_rate": 4.0524260529464695e-05, "loss": 0.4316, "step": 42765 }, { "epoch": 1.5414279021155441, "grad_norm": 0.17433921992778778, "learning_rate": 4.052197309134665e-05, "loss": 0.3874, "step": 42770 }, { "epoch": 1.5416081017767687, "grad_norm": 0.1725977659225464, "learning_rate": 4.051968544174473e-05, "loss": 0.4378, "step": 42775 }, { "epoch": 1.5417883014379932, "grad_norm": 0.23443584144115448, "learning_rate": 4.0517397580690096e-05, "loss": 0.4625, "step": 42780 }, { "epoch": 1.541968501099218, "grad_norm": 0.22204741835594177, "learning_rate": 4.051510950821393e-05, "loss": 0.4225, "step": 42785 }, { "epoch": 1.5421487007604426, "grad_norm": 0.160122349858284, "learning_rate": 4.05128212243474e-05, "loss": 0.3939, "step": 42790 }, { "epoch": 1.5423289004216674, "grad_norm": 0.21652203798294067, "learning_rate": 4.0510532729121684e-05, "loss": 0.4134, "step": 42795 }, { "epoch": 1.5425091000828919, "grad_norm": 0.2179877758026123, "learning_rate": 4.0508244022567966e-05, "loss": 0.4251, "step": 42800 }, { "epoch": 1.5426892997441164, "grad_norm": 0.16457681357860565, "learning_rate": 4.050595510471742e-05, "loss": 0.4323, "step": 42805 }, { "epoch": 1.5428694994053411, "grad_norm": 0.1653529703617096, "learning_rate": 4.050366597560124e-05, "loss": 0.41, "step": 42810 }, { "epoch": 1.5430496990665659, "grad_norm": 0.16849032044410706, "learning_rate": 4.0501376635250606e-05, "loss": 0.4431, "step": 42815 }, { "epoch": 1.5432298987277904, "grad_norm": 0.17099571228027344, "learning_rate": 4.049908708369673e-05, "loss": 0.3836, "step": 42820 }, { "epoch": 1.5434100983890149, "grad_norm": 0.198577880859375, "learning_rate": 4.049679732097079e-05, "loss": 0.3952, "step": 42825 }, { "epoch": 1.5435902980502396, "grad_norm": 0.1876094788312912, "learning_rate": 4.049450734710398e-05, "loss": 0.4149, "step": 42830 }, { "epoch": 1.5437704977114644, "grad_norm": 0.14950236678123474, "learning_rate": 4.049221716212751e-05, "loss": 
0.3671, "step": 42835 }, { "epoch": 1.543950697372689, "grad_norm": 0.19585910439491272, "learning_rate": 4.048992676607258e-05, "loss": 0.4084, "step": 42840 }, { "epoch": 1.5441308970339136, "grad_norm": 0.16137710213661194, "learning_rate": 4.04876361589704e-05, "loss": 0.4119, "step": 42845 }, { "epoch": 1.544311096695138, "grad_norm": 0.20326700806617737, "learning_rate": 4.048534534085218e-05, "loss": 0.4127, "step": 42850 }, { "epoch": 1.5444912963563628, "grad_norm": 0.16420388221740723, "learning_rate": 4.0483054311749114e-05, "loss": 0.3519, "step": 42855 }, { "epoch": 1.5446714960175876, "grad_norm": 0.17619933187961578, "learning_rate": 4.048076307169244e-05, "loss": 0.4249, "step": 42860 }, { "epoch": 1.544851695678812, "grad_norm": 0.1833731085062027, "learning_rate": 4.047847162071336e-05, "loss": 0.4124, "step": 42865 }, { "epoch": 1.5450318953400366, "grad_norm": 0.18540017306804657, "learning_rate": 4.0476179958843105e-05, "loss": 0.3995, "step": 42870 }, { "epoch": 1.5452120950012613, "grad_norm": 0.19920703768730164, "learning_rate": 4.04738880861129e-05, "loss": 0.4219, "step": 42875 }, { "epoch": 1.545392294662486, "grad_norm": 0.19860808551311493, "learning_rate": 4.0471596002553956e-05, "loss": 0.4153, "step": 42880 }, { "epoch": 1.5455724943237108, "grad_norm": 0.1732819527387619, "learning_rate": 4.0469303708197515e-05, "loss": 0.4229, "step": 42885 }, { "epoch": 1.5457526939849353, "grad_norm": 0.19622696936130524, "learning_rate": 4.0467011203074815e-05, "loss": 0.4149, "step": 42890 }, { "epoch": 1.5459328936461598, "grad_norm": 0.1721510887145996, "learning_rate": 4.0464718487217066e-05, "loss": 0.4413, "step": 42895 }, { "epoch": 1.5461130933073846, "grad_norm": 0.1426592469215393, "learning_rate": 4.046242556065553e-05, "loss": 0.4244, "step": 42900 }, { "epoch": 1.5462932929686093, "grad_norm": 0.20957881212234497, "learning_rate": 4.046013242342144e-05, "loss": 0.4354, "step": 42905 }, { "epoch": 1.5464734926298338, "grad_norm": 
0.15983013808727264, "learning_rate": 4.045783907554604e-05, "loss": 0.3761, "step": 42910 }, { "epoch": 1.5466536922910585, "grad_norm": 0.1735716015100479, "learning_rate": 4.045554551706057e-05, "loss": 0.3935, "step": 42915 }, { "epoch": 1.546833891952283, "grad_norm": 0.17203421890735626, "learning_rate": 4.045325174799629e-05, "loss": 0.4176, "step": 42920 }, { "epoch": 1.5470140916135078, "grad_norm": 0.17394539713859558, "learning_rate": 4.0450957768384446e-05, "loss": 0.3769, "step": 42925 }, { "epoch": 1.5471942912747325, "grad_norm": 0.18221496045589447, "learning_rate": 4.044866357825629e-05, "loss": 0.3931, "step": 42930 }, { "epoch": 1.547374490935957, "grad_norm": 0.24642367660999298, "learning_rate": 4.0446369177643085e-05, "loss": 0.4427, "step": 42935 }, { "epoch": 1.5475546905971815, "grad_norm": 0.2149946391582489, "learning_rate": 4.044407456657609e-05, "loss": 0.4279, "step": 42940 }, { "epoch": 1.5477348902584063, "grad_norm": 0.147264301776886, "learning_rate": 4.0441779745086575e-05, "loss": 0.3768, "step": 42945 }, { "epoch": 1.547915089919631, "grad_norm": 0.17553412914276123, "learning_rate": 4.0439484713205795e-05, "loss": 0.3712, "step": 42950 }, { "epoch": 1.5480952895808557, "grad_norm": 0.1730952113866806, "learning_rate": 4.0437189470965026e-05, "loss": 0.4062, "step": 42955 }, { "epoch": 1.5482754892420802, "grad_norm": 0.21400214731693268, "learning_rate": 4.043489401839554e-05, "loss": 0.4106, "step": 42960 }, { "epoch": 1.5484556889033048, "grad_norm": 0.1784030795097351, "learning_rate": 4.0432598355528606e-05, "loss": 0.4188, "step": 42965 }, { "epoch": 1.5486358885645295, "grad_norm": 0.2129783183336258, "learning_rate": 4.043030248239551e-05, "loss": 0.4148, "step": 42970 }, { "epoch": 1.5488160882257542, "grad_norm": 0.23545123636722565, "learning_rate": 4.042800639902754e-05, "loss": 0.422, "step": 42975 }, { "epoch": 1.5489962878869787, "grad_norm": 0.1820184886455536, "learning_rate": 4.042571010545596e-05, "loss": 
0.371, "step": 42980 }, { "epoch": 1.5491764875482033, "grad_norm": 0.19500315189361572, "learning_rate": 4.0423413601712065e-05, "loss": 0.4332, "step": 42985 }, { "epoch": 1.549356687209428, "grad_norm": 0.18859097361564636, "learning_rate": 4.042111688782715e-05, "loss": 0.4262, "step": 42990 }, { "epoch": 1.5495368868706527, "grad_norm": 0.16476643085479736, "learning_rate": 4.04188199638325e-05, "loss": 0.4073, "step": 42995 }, { "epoch": 1.5497170865318775, "grad_norm": 0.1720295399427414, "learning_rate": 4.041652282975942e-05, "loss": 0.4289, "step": 43000 }, { "epoch": 1.5497170865318775, "eval_loss": 0.4438794255256653, "eval_runtime": 3.5307, "eval_samples_per_second": 28.323, "eval_steps_per_second": 7.081, "step": 43000 }, { "epoch": 1.549897286193102, "grad_norm": 0.17833761870861053, "learning_rate": 4.041422548563919e-05, "loss": 0.4653, "step": 43005 }, { "epoch": 1.5500774858543265, "grad_norm": 0.1873827427625656, "learning_rate": 4.041192793150314e-05, "loss": 0.4042, "step": 43010 }, { "epoch": 1.5502576855155512, "grad_norm": 0.20229150354862213, "learning_rate": 4.040963016738254e-05, "loss": 0.446, "step": 43015 }, { "epoch": 1.550437885176776, "grad_norm": 0.17710404098033905, "learning_rate": 4.040733219330871e-05, "loss": 0.4281, "step": 43020 }, { "epoch": 1.5506180848380005, "grad_norm": 0.15827186405658722, "learning_rate": 4.040503400931297e-05, "loss": 0.3829, "step": 43025 }, { "epoch": 1.5507982844992252, "grad_norm": 0.18000894784927368, "learning_rate": 4.040273561542662e-05, "loss": 0.4088, "step": 43030 }, { "epoch": 1.5509784841604497, "grad_norm": 0.18159720301628113, "learning_rate": 4.0400437011680986e-05, "loss": 0.4461, "step": 43035 }, { "epoch": 1.5511586838216744, "grad_norm": 0.19357514381408691, "learning_rate": 4.039813819810737e-05, "loss": 0.4084, "step": 43040 }, { "epoch": 1.5513388834828992, "grad_norm": 0.16021384298801422, "learning_rate": 4.039583917473711e-05, "loss": 0.4259, "step": 43045 }, { "epoch": 
1.5515190831441237, "grad_norm": 0.22351530194282532, "learning_rate": 4.039353994160152e-05, "loss": 0.4508, "step": 43050 }, { "epoch": 1.5516992828053482, "grad_norm": 0.1790456771850586, "learning_rate": 4.039124049873193e-05, "loss": 0.435, "step": 43055 }, { "epoch": 1.551879482466573, "grad_norm": 0.1445944756269455, "learning_rate": 4.038894084615966e-05, "loss": 0.4111, "step": 43060 }, { "epoch": 1.5520596821277977, "grad_norm": 0.1887555867433548, "learning_rate": 4.038664098391606e-05, "loss": 0.4236, "step": 43065 }, { "epoch": 1.5522398817890224, "grad_norm": 0.14658775925636292, "learning_rate": 4.038434091203245e-05, "loss": 0.4447, "step": 43070 }, { "epoch": 1.552420081450247, "grad_norm": 0.192291259765625, "learning_rate": 4.038204063054017e-05, "loss": 0.468, "step": 43075 }, { "epoch": 1.5526002811114714, "grad_norm": 0.15899904072284698, "learning_rate": 4.037974013947058e-05, "loss": 0.377, "step": 43080 }, { "epoch": 1.5527804807726961, "grad_norm": 0.182462677359581, "learning_rate": 4.037743943885499e-05, "loss": 0.4033, "step": 43085 }, { "epoch": 1.5529606804339209, "grad_norm": 0.17787639796733856, "learning_rate": 4.037513852872478e-05, "loss": 0.416, "step": 43090 }, { "epoch": 1.5531408800951454, "grad_norm": 0.1610783487558365, "learning_rate": 4.037283740911128e-05, "loss": 0.4039, "step": 43095 }, { "epoch": 1.55332107975637, "grad_norm": 0.15985815227031708, "learning_rate": 4.037053608004584e-05, "loss": 0.4483, "step": 43100 }, { "epoch": 1.5535012794175946, "grad_norm": 0.21218977868556976, "learning_rate": 4.036823454155982e-05, "loss": 0.4047, "step": 43105 }, { "epoch": 1.5536814790788194, "grad_norm": 0.1868336945772171, "learning_rate": 4.036593279368458e-05, "loss": 0.4103, "step": 43110 }, { "epoch": 1.553861678740044, "grad_norm": 0.24280469119548798, "learning_rate": 4.0363630836451496e-05, "loss": 0.4188, "step": 43115 }, { "epoch": 1.5540418784012686, "grad_norm": 0.1968899369239807, "learning_rate": 
4.036132866989191e-05, "loss": 0.419, "step": 43120 }, { "epoch": 1.5542220780624931, "grad_norm": 0.19591078162193298, "learning_rate": 4.035902629403718e-05, "loss": 0.4057, "step": 43125 }, { "epoch": 1.5544022777237179, "grad_norm": 0.1963011920452118, "learning_rate": 4.0356723708918705e-05, "loss": 0.394, "step": 43130 }, { "epoch": 1.5545824773849426, "grad_norm": 0.1776949167251587, "learning_rate": 4.035442091456784e-05, "loss": 0.3855, "step": 43135 }, { "epoch": 1.554762677046167, "grad_norm": 0.21400536596775055, "learning_rate": 4.035211791101596e-05, "loss": 0.4177, "step": 43140 }, { "epoch": 1.5549428767073918, "grad_norm": 0.22430282831192017, "learning_rate": 4.034981469829445e-05, "loss": 0.444, "step": 43145 }, { "epoch": 1.5551230763686164, "grad_norm": 0.1932390183210373, "learning_rate": 4.034751127643468e-05, "loss": 0.3992, "step": 43150 }, { "epoch": 1.555303276029841, "grad_norm": 0.20954085886478424, "learning_rate": 4.034520764546805e-05, "loss": 0.4377, "step": 43155 }, { "epoch": 1.5554834756910658, "grad_norm": 0.21870118379592896, "learning_rate": 4.034290380542593e-05, "loss": 0.4406, "step": 43160 }, { "epoch": 1.5556636753522903, "grad_norm": 0.19834262132644653, "learning_rate": 4.0340599756339715e-05, "loss": 0.4218, "step": 43165 }, { "epoch": 1.5558438750135148, "grad_norm": 0.1412007212638855, "learning_rate": 4.03382954982408e-05, "loss": 0.3994, "step": 43170 }, { "epoch": 1.5560240746747396, "grad_norm": 0.15708720684051514, "learning_rate": 4.0335991031160584e-05, "loss": 0.3901, "step": 43175 }, { "epoch": 1.5562042743359643, "grad_norm": 0.13734932243824005, "learning_rate": 4.0333686355130454e-05, "loss": 0.4384, "step": 43180 }, { "epoch": 1.556384473997189, "grad_norm": 0.2183203250169754, "learning_rate": 4.033138147018181e-05, "loss": 0.4417, "step": 43185 }, { "epoch": 1.5565646736584136, "grad_norm": 0.20702525973320007, "learning_rate": 4.0329076376346063e-05, "loss": 0.4323, "step": 43190 }, { "epoch": 
1.556744873319638, "grad_norm": 0.20246562361717224, "learning_rate": 4.032677107365463e-05, "loss": 0.4611, "step": 43195 }, { "epoch": 1.5569250729808628, "grad_norm": 0.15538166463375092, "learning_rate": 4.0324465562138905e-05, "loss": 0.4066, "step": 43200 }, { "epoch": 1.5571052726420875, "grad_norm": 0.1648663729429245, "learning_rate": 4.03221598418303e-05, "loss": 0.4347, "step": 43205 }, { "epoch": 1.557285472303312, "grad_norm": 0.2193061113357544, "learning_rate": 4.031985391276023e-05, "loss": 0.412, "step": 43210 }, { "epoch": 1.5574656719645366, "grad_norm": 0.1736992746591568, "learning_rate": 4.031754777496012e-05, "loss": 0.4344, "step": 43215 }, { "epoch": 1.5576458716257613, "grad_norm": 0.18772049248218536, "learning_rate": 4.031524142846139e-05, "loss": 0.4455, "step": 43220 }, { "epoch": 1.557826071286986, "grad_norm": 0.185786172747612, "learning_rate": 4.031293487329546e-05, "loss": 0.4187, "step": 43225 }, { "epoch": 1.5580062709482108, "grad_norm": 0.21457712352275848, "learning_rate": 4.031062810949375e-05, "loss": 0.4008, "step": 43230 }, { "epoch": 1.5581864706094353, "grad_norm": 0.16875572502613068, "learning_rate": 4.03083211370877e-05, "loss": 0.4041, "step": 43235 }, { "epoch": 1.5583666702706598, "grad_norm": 0.1875239759683609, "learning_rate": 4.0306013956108747e-05, "loss": 0.4421, "step": 43240 }, { "epoch": 1.5585468699318845, "grad_norm": 0.1823228895664215, "learning_rate": 4.030370656658831e-05, "loss": 0.3946, "step": 43245 }, { "epoch": 1.5587270695931092, "grad_norm": 0.1981915831565857, "learning_rate": 4.030139896855783e-05, "loss": 0.4127, "step": 43250 }, { "epoch": 1.5589072692543338, "grad_norm": 0.17064636945724487, "learning_rate": 4.029909116204875e-05, "loss": 0.3918, "step": 43255 }, { "epoch": 1.5590874689155585, "grad_norm": 0.21734385192394257, "learning_rate": 4.0296783147092527e-05, "loss": 0.4109, "step": 43260 }, { "epoch": 1.559267668576783, "grad_norm": 0.18934614956378937, "learning_rate": 
4.029447492372059e-05, "loss": 0.407, "step": 43265 }, { "epoch": 1.5594478682380077, "grad_norm": 0.22457720339298248, "learning_rate": 4.029216649196439e-05, "loss": 0.428, "step": 43270 }, { "epoch": 1.5596280678992325, "grad_norm": 0.16613884270191193, "learning_rate": 4.028985785185538e-05, "loss": 0.4337, "step": 43275 }, { "epoch": 1.559808267560457, "grad_norm": 0.1653304249048233, "learning_rate": 4.0287549003425026e-05, "loss": 0.3973, "step": 43280 }, { "epoch": 1.5599884672216815, "grad_norm": 0.18372784554958344, "learning_rate": 4.028523994670477e-05, "loss": 0.4102, "step": 43285 }, { "epoch": 1.5601686668829062, "grad_norm": 0.1557554453611374, "learning_rate": 4.028293068172608e-05, "loss": 0.417, "step": 43290 }, { "epoch": 1.560348866544131, "grad_norm": 0.17378360033035278, "learning_rate": 4.028062120852042e-05, "loss": 0.4113, "step": 43295 }, { "epoch": 1.5605290662053557, "grad_norm": 0.17926354706287384, "learning_rate": 4.027831152711925e-05, "loss": 0.402, "step": 43300 }, { "epoch": 1.5607092658665802, "grad_norm": 0.17073003947734833, "learning_rate": 4.027600163755405e-05, "loss": 0.4631, "step": 43305 }, { "epoch": 1.5608894655278047, "grad_norm": 0.17934848368167877, "learning_rate": 4.027369153985628e-05, "loss": 0.4058, "step": 43310 }, { "epoch": 1.5610696651890295, "grad_norm": 0.16843603551387787, "learning_rate": 4.0271381234057426e-05, "loss": 0.4345, "step": 43315 }, { "epoch": 1.5612498648502542, "grad_norm": 0.1804237961769104, "learning_rate": 4.026907072018896e-05, "loss": 0.4171, "step": 43320 }, { "epoch": 1.5614300645114787, "grad_norm": 0.18647703528404236, "learning_rate": 4.0266759998282355e-05, "loss": 0.4483, "step": 43325 }, { "epoch": 1.5616102641727032, "grad_norm": 0.15589216351509094, "learning_rate": 4.026444906836909e-05, "loss": 0.3917, "step": 43330 }, { "epoch": 1.561790463833928, "grad_norm": 0.2177332192659378, "learning_rate": 4.026213793048068e-05, "loss": 0.4652, "step": 43335 }, { "epoch": 
1.5619706634951527, "grad_norm": 0.22540467977523804, "learning_rate": 4.0259826584648596e-05, "loss": 0.4222, "step": 43340 }, { "epoch": 1.5621508631563774, "grad_norm": 0.1936839520931244, "learning_rate": 4.025751503090432e-05, "loss": 0.4248, "step": 43345 }, { "epoch": 1.562331062817602, "grad_norm": 0.20749831199645996, "learning_rate": 4.025520326927936e-05, "loss": 0.4026, "step": 43350 }, { "epoch": 1.5625112624788264, "grad_norm": 0.1811009794473648, "learning_rate": 4.025289129980521e-05, "loss": 0.3812, "step": 43355 }, { "epoch": 1.5626914621400512, "grad_norm": 0.1572161763906479, "learning_rate": 4.025057912251337e-05, "loss": 0.4306, "step": 43360 }, { "epoch": 1.562871661801276, "grad_norm": 0.18943502008914948, "learning_rate": 4.024826673743533e-05, "loss": 0.3895, "step": 43365 }, { "epoch": 1.5630518614625004, "grad_norm": 0.2089279443025589, "learning_rate": 4.024595414460261e-05, "loss": 0.4108, "step": 43370 }, { "epoch": 1.563232061123725, "grad_norm": 0.16804815828800201, "learning_rate": 4.0243641344046725e-05, "loss": 0.3943, "step": 43375 }, { "epoch": 1.5634122607849497, "grad_norm": 0.19758044183254242, "learning_rate": 4.0241328335799185e-05, "loss": 0.4298, "step": 43380 }, { "epoch": 1.5635924604461744, "grad_norm": 0.18327586352825165, "learning_rate": 4.023901511989149e-05, "loss": 0.4356, "step": 43385 }, { "epoch": 1.5637726601073991, "grad_norm": 0.18740832805633545, "learning_rate": 4.023670169635516e-05, "loss": 0.4263, "step": 43390 }, { "epoch": 1.5639528597686236, "grad_norm": 0.18278846144676208, "learning_rate": 4.0234388065221716e-05, "loss": 0.3991, "step": 43395 }, { "epoch": 1.5641330594298481, "grad_norm": 0.137952983379364, "learning_rate": 4.02320742265227e-05, "loss": 0.4308, "step": 43400 }, { "epoch": 1.5643132590910729, "grad_norm": 0.16237494349479675, "learning_rate": 4.0229760180289604e-05, "loss": 0.3995, "step": 43405 }, { "epoch": 1.5644934587522976, "grad_norm": 0.1925591230392456, "learning_rate": 
4.022744592655398e-05, "loss": 0.4095, "step": 43410 }, { "epoch": 1.5646736584135221, "grad_norm": 0.17975488305091858, "learning_rate": 4.022513146534735e-05, "loss": 0.397, "step": 43415 }, { "epoch": 1.5648538580747469, "grad_norm": 0.19634199142456055, "learning_rate": 4.022281679670127e-05, "loss": 0.4361, "step": 43420 }, { "epoch": 1.5650340577359714, "grad_norm": 0.22516696155071259, "learning_rate": 4.022050192064724e-05, "loss": 0.4091, "step": 43425 }, { "epoch": 1.565214257397196, "grad_norm": 0.19511666893959045, "learning_rate": 4.021818683721682e-05, "loss": 0.4301, "step": 43430 }, { "epoch": 1.5653944570584208, "grad_norm": 0.19139276444911957, "learning_rate": 4.021587154644156e-05, "loss": 0.4132, "step": 43435 }, { "epoch": 1.5655746567196454, "grad_norm": 0.18717695772647858, "learning_rate": 4.021355604835299e-05, "loss": 0.4059, "step": 43440 }, { "epoch": 1.5657548563808699, "grad_norm": 0.16392220556735992, "learning_rate": 4.0211240342982656e-05, "loss": 0.4349, "step": 43445 }, { "epoch": 1.5659350560420946, "grad_norm": 0.23648183047771454, "learning_rate": 4.0208924430362126e-05, "loss": 0.4023, "step": 43450 }, { "epoch": 1.5661152557033193, "grad_norm": 0.23210176825523376, "learning_rate": 4.020660831052295e-05, "loss": 0.4303, "step": 43455 }, { "epoch": 1.566295455364544, "grad_norm": 0.18653087317943573, "learning_rate": 4.020429198349667e-05, "loss": 0.4452, "step": 43460 }, { "epoch": 1.5664756550257686, "grad_norm": 0.17263540625572205, "learning_rate": 4.0201975449314865e-05, "loss": 0.4231, "step": 43465 }, { "epoch": 1.566655854686993, "grad_norm": 0.2012946456670761, "learning_rate": 4.019965870800908e-05, "loss": 0.4403, "step": 43470 }, { "epoch": 1.5668360543482178, "grad_norm": 0.2045126110315323, "learning_rate": 4.019734175961089e-05, "loss": 0.4306, "step": 43475 }, { "epoch": 1.5670162540094426, "grad_norm": 0.15547120571136475, "learning_rate": 4.019502460415186e-05, "loss": 0.4273, "step": 43480 }, { "epoch": 
1.567196453670667, "grad_norm": 0.17727886140346527, "learning_rate": 4.0192707241663567e-05, "loss": 0.4196, "step": 43485 }, { "epoch": 1.5673766533318916, "grad_norm": 0.16581672430038452, "learning_rate": 4.0190389672177575e-05, "loss": 0.3979, "step": 43490 }, { "epoch": 1.5675568529931163, "grad_norm": 0.1664031445980072, "learning_rate": 4.0188071895725466e-05, "loss": 0.4289, "step": 43495 }, { "epoch": 1.567737052654341, "grad_norm": 0.17638015747070312, "learning_rate": 4.018575391233882e-05, "loss": 0.4044, "step": 43500 }, { "epoch": 1.567737052654341, "eval_loss": 0.44283291697502136, "eval_runtime": 3.524, "eval_samples_per_second": 28.377, "eval_steps_per_second": 7.094, "step": 43500 }, { "epoch": 1.5679172523155658, "grad_norm": 0.19836536049842834, "learning_rate": 4.018343572204921e-05, "loss": 0.4234, "step": 43505 }, { "epoch": 1.5680974519767903, "grad_norm": 0.1390867680311203, "learning_rate": 4.018111732488823e-05, "loss": 0.386, "step": 43510 }, { "epoch": 1.5682776516380148, "grad_norm": 0.2248731255531311, "learning_rate": 4.0178798720887465e-05, "loss": 0.4143, "step": 43515 }, { "epoch": 1.5684578512992395, "grad_norm": 0.149456188082695, "learning_rate": 4.017647991007851e-05, "loss": 0.3873, "step": 43520 }, { "epoch": 1.5686380509604643, "grad_norm": 0.20427654683589935, "learning_rate": 4.017416089249296e-05, "loss": 0.4319, "step": 43525 }, { "epoch": 1.5688182506216888, "grad_norm": 0.18678858876228333, "learning_rate": 4.017184166816239e-05, "loss": 0.3849, "step": 43530 }, { "epoch": 1.5689984502829135, "grad_norm": 0.14671590924263, "learning_rate": 4.0169522237118426e-05, "loss": 0.3873, "step": 43535 }, { "epoch": 1.569178649944138, "grad_norm": 0.1654651165008545, "learning_rate": 4.0167202599392656e-05, "loss": 0.4051, "step": 43540 }, { "epoch": 1.5693588496053628, "grad_norm": 0.17216157913208008, "learning_rate": 4.0164882755016685e-05, "loss": 0.4207, "step": 43545 }, { "epoch": 1.5695390492665875, "grad_norm": 
0.1685381382703781, "learning_rate": 4.0162562704022124e-05, "loss": 0.3873, "step": 43550 }, { "epoch": 1.569719248927812, "grad_norm": 0.16307340562343597, "learning_rate": 4.0160242446440584e-05, "loss": 0.3893, "step": 43555 }, { "epoch": 1.5698994485890365, "grad_norm": 0.2021232396364212, "learning_rate": 4.015792198230367e-05, "loss": 0.4492, "step": 43560 }, { "epoch": 1.5700796482502613, "grad_norm": 0.1811605542898178, "learning_rate": 4.0155601311643006e-05, "loss": 0.3913, "step": 43565 }, { "epoch": 1.570259847911486, "grad_norm": 0.14910484850406647, "learning_rate": 4.015328043449021e-05, "loss": 0.4193, "step": 43570 }, { "epoch": 1.5704400475727107, "grad_norm": 0.24436582624912262, "learning_rate": 4.0150959350876903e-05, "loss": 0.432, "step": 43575 }, { "epoch": 1.5706202472339352, "grad_norm": 0.1627466231584549, "learning_rate": 4.014863806083471e-05, "loss": 0.4316, "step": 43580 }, { "epoch": 1.5708004468951597, "grad_norm": 0.18843139708042145, "learning_rate": 4.0146316564395254e-05, "loss": 0.4146, "step": 43585 }, { "epoch": 1.5709806465563845, "grad_norm": 0.17627915740013123, "learning_rate": 4.014399486159016e-05, "loss": 0.444, "step": 43590 }, { "epoch": 1.5711608462176092, "grad_norm": 0.2867834270000458, "learning_rate": 4.014167295245108e-05, "loss": 0.4179, "step": 43595 }, { "epoch": 1.5713410458788337, "grad_norm": 0.16591203212738037, "learning_rate": 4.013935083700963e-05, "loss": 0.4026, "step": 43600 }, { "epoch": 1.5715212455400582, "grad_norm": 0.1895548552274704, "learning_rate": 4.0137028515297456e-05, "loss": 0.4287, "step": 43605 }, { "epoch": 1.571701445201283, "grad_norm": 0.20505109429359436, "learning_rate": 4.0134705987346206e-05, "loss": 0.4474, "step": 43610 }, { "epoch": 1.5718816448625077, "grad_norm": 0.18428920209407806, "learning_rate": 4.013238325318751e-05, "loss": 0.4338, "step": 43615 }, { "epoch": 1.5720618445237324, "grad_norm": 0.21533146500587463, "learning_rate": 4.013006031285302e-05, "loss": 
0.4064, "step": 43620 }, { "epoch": 1.572242044184957, "grad_norm": 0.17549021542072296, "learning_rate": 4.012773716637439e-05, "loss": 0.4381, "step": 43625 }, { "epoch": 1.5724222438461815, "grad_norm": 0.21111688017845154, "learning_rate": 4.0125413813783275e-05, "loss": 0.4279, "step": 43630 }, { "epoch": 1.5726024435074062, "grad_norm": 0.23240859806537628, "learning_rate": 4.0123090255111316e-05, "loss": 0.435, "step": 43635 }, { "epoch": 1.572782643168631, "grad_norm": 0.1591193825006485, "learning_rate": 4.0120766490390197e-05, "loss": 0.4326, "step": 43640 }, { "epoch": 1.5729628428298554, "grad_norm": 0.19279992580413818, "learning_rate": 4.011844251965154e-05, "loss": 0.4427, "step": 43645 }, { "epoch": 1.5731430424910802, "grad_norm": 0.19141902029514313, "learning_rate": 4.0116118342927045e-05, "loss": 0.3833, "step": 43650 }, { "epoch": 1.5733232421523047, "grad_norm": 0.1679369956254959, "learning_rate": 4.0113793960248356e-05, "loss": 0.4242, "step": 43655 }, { "epoch": 1.5735034418135294, "grad_norm": 0.21762417256832123, "learning_rate": 4.0111469371647156e-05, "loss": 0.4086, "step": 43660 }, { "epoch": 1.5736836414747541, "grad_norm": 0.1647619605064392, "learning_rate": 4.010914457715511e-05, "loss": 0.4345, "step": 43665 }, { "epoch": 1.5738638411359787, "grad_norm": 0.27651816606521606, "learning_rate": 4.01068195768039e-05, "loss": 0.4154, "step": 43670 }, { "epoch": 1.5740440407972032, "grad_norm": 0.20368202030658722, "learning_rate": 4.010449437062519e-05, "loss": 0.4557, "step": 43675 }, { "epoch": 1.574224240458428, "grad_norm": 0.17364609241485596, "learning_rate": 4.0102168958650676e-05, "loss": 0.3769, "step": 43680 }, { "epoch": 1.5744044401196526, "grad_norm": 0.17485357820987701, "learning_rate": 4.009984334091203e-05, "loss": 0.3792, "step": 43685 }, { "epoch": 1.5745846397808774, "grad_norm": 0.20240798592567444, "learning_rate": 4.009751751744094e-05, "loss": 0.3928, "step": 43690 }, { "epoch": 1.5747648394421019, "grad_norm": 
0.21913383901119232, "learning_rate": 4.009519148826909e-05, "loss": 0.4397, "step": 43695 }, { "epoch": 1.5749450391033264, "grad_norm": 0.1732109934091568, "learning_rate": 4.009286525342819e-05, "loss": 0.3939, "step": 43700 }, { "epoch": 1.5751252387645511, "grad_norm": 0.20331084728240967, "learning_rate": 4.0090538812949916e-05, "loss": 0.4074, "step": 43705 }, { "epoch": 1.5753054384257759, "grad_norm": 0.18988221883773804, "learning_rate": 4.008821216686598e-05, "loss": 0.3991, "step": 43710 }, { "epoch": 1.5754856380870004, "grad_norm": 0.17192235589027405, "learning_rate": 4.008588531520807e-05, "loss": 0.4486, "step": 43715 }, { "epoch": 1.5756658377482249, "grad_norm": 0.19935615360736847, "learning_rate": 4.008355825800789e-05, "loss": 0.4195, "step": 43720 }, { "epoch": 1.5758460374094496, "grad_norm": 0.18798862397670746, "learning_rate": 4.0081230995297154e-05, "loss": 0.397, "step": 43725 }, { "epoch": 1.5760262370706744, "grad_norm": 0.16383147239685059, "learning_rate": 4.007890352710757e-05, "loss": 0.4065, "step": 43730 }, { "epoch": 1.576206436731899, "grad_norm": 0.18491680920124054, "learning_rate": 4.007657585347083e-05, "loss": 0.397, "step": 43735 }, { "epoch": 1.5763866363931236, "grad_norm": 0.1732751429080963, "learning_rate": 4.007424797441868e-05, "loss": 0.3996, "step": 43740 }, { "epoch": 1.576566836054348, "grad_norm": 0.17933018505573273, "learning_rate": 4.00719198899828e-05, "loss": 0.427, "step": 43745 }, { "epoch": 1.5767470357155728, "grad_norm": 0.17597095668315887, "learning_rate": 4.006959160019495e-05, "loss": 0.4075, "step": 43750 }, { "epoch": 1.5769272353767976, "grad_norm": 0.16884708404541016, "learning_rate": 4.0067263105086825e-05, "loss": 0.4244, "step": 43755 }, { "epoch": 1.577107435038022, "grad_norm": 0.21198032796382904, "learning_rate": 4.0064934404690146e-05, "loss": 0.4058, "step": 43760 }, { "epoch": 1.5772876346992466, "grad_norm": 0.16649512946605682, "learning_rate": 4.006260549903666e-05, "loss": 
0.4174, "step": 43765 }, { "epoch": 1.5774678343604713, "grad_norm": 0.16471701860427856, "learning_rate": 4.00602763881581e-05, "loss": 0.3756, "step": 43770 }, { "epoch": 1.577648034021696, "grad_norm": 0.1989121437072754, "learning_rate": 4.005794707208618e-05, "loss": 0.4276, "step": 43775 }, { "epoch": 1.5778282336829208, "grad_norm": 0.14594782888889313, "learning_rate": 4.005561755085265e-05, "loss": 0.4355, "step": 43780 }, { "epoch": 1.5780084333441453, "grad_norm": 0.19597959518432617, "learning_rate": 4.0053287824489236e-05, "loss": 0.4134, "step": 43785 }, { "epoch": 1.5781886330053698, "grad_norm": 0.17923668026924133, "learning_rate": 4.0050957893027706e-05, "loss": 0.4061, "step": 43790 }, { "epoch": 1.5783688326665946, "grad_norm": 0.1501871943473816, "learning_rate": 4.004862775649978e-05, "loss": 0.3867, "step": 43795 }, { "epoch": 1.5785490323278193, "grad_norm": 0.18202674388885498, "learning_rate": 4.004629741493721e-05, "loss": 0.4072, "step": 43800 }, { "epoch": 1.578729231989044, "grad_norm": 0.13299371302127838, "learning_rate": 4.004396686837176e-05, "loss": 0.4034, "step": 43805 }, { "epoch": 1.5789094316502685, "grad_norm": 0.17386196553707123, "learning_rate": 4.004163611683517e-05, "loss": 0.4056, "step": 43810 }, { "epoch": 1.579089631311493, "grad_norm": 0.18044842779636383, "learning_rate": 4.0039305160359195e-05, "loss": 0.4121, "step": 43815 }, { "epoch": 1.5792698309727178, "grad_norm": 0.15430901944637299, "learning_rate": 4.0036973998975604e-05, "loss": 0.3994, "step": 43820 }, { "epoch": 1.5794500306339425, "grad_norm": 0.17539086937904358, "learning_rate": 4.0034642632716155e-05, "loss": 0.4177, "step": 43825 }, { "epoch": 1.579630230295167, "grad_norm": 0.19086940586566925, "learning_rate": 4.0032311061612604e-05, "loss": 0.4033, "step": 43830 }, { "epoch": 1.5798104299563915, "grad_norm": 0.19829294085502625, "learning_rate": 4.0029979285696736e-05, "loss": 0.431, "step": 43835 }, { "epoch": 1.5799906296176163, "grad_norm": 
0.21436335146427155, "learning_rate": 4.0027647305000306e-05, "loss": 0.4564, "step": 43840 }, { "epoch": 1.580170829278841, "grad_norm": 0.21377906203269958, "learning_rate": 4.002531511955509e-05, "loss": 0.441, "step": 43845 }, { "epoch": 1.5803510289400657, "grad_norm": 0.1993681788444519, "learning_rate": 4.0022982729392855e-05, "loss": 0.4371, "step": 43850 }, { "epoch": 1.5805312286012903, "grad_norm": 0.16495195031166077, "learning_rate": 4.00206501345454e-05, "loss": 0.436, "step": 43855 }, { "epoch": 1.5807114282625148, "grad_norm": 0.16763421893119812, "learning_rate": 4.0018317335044495e-05, "loss": 0.4238, "step": 43860 }, { "epoch": 1.5808916279237395, "grad_norm": 0.16144713759422302, "learning_rate": 4.0015984330921916e-05, "loss": 0.4115, "step": 43865 }, { "epoch": 1.5810718275849642, "grad_norm": 0.20617428421974182, "learning_rate": 4.0013651122209465e-05, "loss": 0.4347, "step": 43870 }, { "epoch": 1.5812520272461887, "grad_norm": 0.1693793535232544, "learning_rate": 4.0011317708938924e-05, "loss": 0.3648, "step": 43875 }, { "epoch": 1.5814322269074133, "grad_norm": 0.15378816425800323, "learning_rate": 4.0008984091142086e-05, "loss": 0.4536, "step": 43880 }, { "epoch": 1.581612426568638, "grad_norm": 0.20745548605918884, "learning_rate": 4.0006650268850745e-05, "loss": 0.4071, "step": 43885 }, { "epoch": 1.5817926262298627, "grad_norm": 0.2162124514579773, "learning_rate": 4.00043162420967e-05, "loss": 0.4081, "step": 43890 }, { "epoch": 1.5819728258910875, "grad_norm": 0.16745862364768982, "learning_rate": 4.000198201091175e-05, "loss": 0.4493, "step": 43895 }, { "epoch": 1.582153025552312, "grad_norm": 0.16501225531101227, "learning_rate": 3.999964757532769e-05, "loss": 0.3873, "step": 43900 }, { "epoch": 1.5823332252135365, "grad_norm": 0.22815780341625214, "learning_rate": 3.9997312935376346e-05, "loss": 0.3807, "step": 43905 }, { "epoch": 1.5825134248747612, "grad_norm": 0.17533724009990692, "learning_rate": 3.9994978091089515e-05, 
"loss": 0.4033, "step": 43910 }, { "epoch": 1.582693624535986, "grad_norm": 0.1684589833021164, "learning_rate": 3.999264304249901e-05, "loss": 0.4166, "step": 43915 }, { "epoch": 1.5828738241972105, "grad_norm": 0.16925841569900513, "learning_rate": 3.999030778963665e-05, "loss": 0.394, "step": 43920 }, { "epoch": 1.5830540238584352, "grad_norm": 0.1653861552476883, "learning_rate": 3.9987972332534246e-05, "loss": 0.4332, "step": 43925 }, { "epoch": 1.5832342235196597, "grad_norm": 0.1570800542831421, "learning_rate": 3.998563667122362e-05, "loss": 0.3974, "step": 43930 }, { "epoch": 1.5834144231808844, "grad_norm": 0.18440444767475128, "learning_rate": 3.9983300805736595e-05, "loss": 0.4235, "step": 43935 }, { "epoch": 1.5835946228421092, "grad_norm": 0.15225686132907867, "learning_rate": 3.9980964736104995e-05, "loss": 0.4064, "step": 43940 }, { "epoch": 1.5837748225033337, "grad_norm": 0.17495448887348175, "learning_rate": 3.997862846236066e-05, "loss": 0.4167, "step": 43945 }, { "epoch": 1.5839550221645582, "grad_norm": 0.1724550575017929, "learning_rate": 3.9976291984535405e-05, "loss": 0.4093, "step": 43950 }, { "epoch": 1.584135221825783, "grad_norm": 0.19340504705905914, "learning_rate": 3.997395530266108e-05, "loss": 0.4351, "step": 43955 }, { "epoch": 1.5843154214870077, "grad_norm": 0.18444287776947021, "learning_rate": 3.9971618416769495e-05, "loss": 0.3998, "step": 43960 }, { "epoch": 1.5844956211482324, "grad_norm": 0.13938504457473755, "learning_rate": 3.9969281326892523e-05, "loss": 0.3954, "step": 43965 }, { "epoch": 1.584675820809457, "grad_norm": 0.15549679100513458, "learning_rate": 3.996694403306198e-05, "loss": 0.4072, "step": 43970 }, { "epoch": 1.5848560204706814, "grad_norm": 0.16593880951404572, "learning_rate": 3.9964606535309735e-05, "loss": 0.4273, "step": 43975 }, { "epoch": 1.5850362201319061, "grad_norm": 0.23627829551696777, "learning_rate": 3.9962268833667615e-05, "loss": 0.4176, "step": 43980 }, { "epoch": 1.5852164197931309, 
"grad_norm": 0.1789269745349884, "learning_rate": 3.9959930928167474e-05, "loss": 0.4231, "step": 43985 }, { "epoch": 1.5853966194543554, "grad_norm": 0.20784516632556915, "learning_rate": 3.995759281884118e-05, "loss": 0.4042, "step": 43990 }, { "epoch": 1.58557681911558, "grad_norm": 0.18998631834983826, "learning_rate": 3.995525450572059e-05, "loss": 0.4483, "step": 43995 }, { "epoch": 1.5857570187768046, "grad_norm": 0.2027607262134552, "learning_rate": 3.9952915988837534e-05, "loss": 0.3861, "step": 44000 }, { "epoch": 1.5857570187768046, "eval_loss": 0.44330334663391113, "eval_runtime": 3.5296, "eval_samples_per_second": 28.332, "eval_steps_per_second": 7.083, "step": 44000 }, { "epoch": 1.5859372184380294, "grad_norm": 0.18675218522548676, "learning_rate": 3.99505772682239e-05, "loss": 0.399, "step": 44005 }, { "epoch": 1.586117418099254, "grad_norm": 0.18582110106945038, "learning_rate": 3.994823834391154e-05, "loss": 0.3755, "step": 44010 }, { "epoch": 1.5862976177604786, "grad_norm": 0.18552209436893463, "learning_rate": 3.994589921593233e-05, "loss": 0.4288, "step": 44015 }, { "epoch": 1.5864778174217031, "grad_norm": 0.17207638919353485, "learning_rate": 3.994355988431814e-05, "loss": 0.3947, "step": 44020 }, { "epoch": 1.5866580170829279, "grad_norm": 0.22943609952926636, "learning_rate": 3.994122034910083e-05, "loss": 0.4271, "step": 44025 }, { "epoch": 1.5868382167441526, "grad_norm": 0.21029749512672424, "learning_rate": 3.9938880610312294e-05, "loss": 0.4071, "step": 44030 }, { "epoch": 1.587018416405377, "grad_norm": 0.1644810289144516, "learning_rate": 3.99365406679844e-05, "loss": 0.3875, "step": 44035 }, { "epoch": 1.5871986160666018, "grad_norm": 0.1785242259502411, "learning_rate": 3.993420052214904e-05, "loss": 0.3676, "step": 44040 }, { "epoch": 1.5873788157278264, "grad_norm": 0.17535260319709778, "learning_rate": 3.9931860172838076e-05, "loss": 0.3939, "step": 44045 }, { "epoch": 1.587559015389051, "grad_norm": 0.17083501815795898, 
"learning_rate": 3.992951962008341e-05, "loss": 0.4247, "step": 44050 }, { "epoch": 1.5877392150502758, "grad_norm": 0.230384960770607, "learning_rate": 3.992717886391693e-05, "loss": 0.4225, "step": 44055 }, { "epoch": 1.5879194147115003, "grad_norm": 0.1916215866804123, "learning_rate": 3.992483790437054e-05, "loss": 0.4315, "step": 44060 }, { "epoch": 1.5880996143727248, "grad_norm": 0.17122019827365875, "learning_rate": 3.992249674147611e-05, "loss": 0.409, "step": 44065 }, { "epoch": 1.5882798140339496, "grad_norm": 0.17510074377059937, "learning_rate": 3.9920155375265555e-05, "loss": 0.4144, "step": 44070 }, { "epoch": 1.5884600136951743, "grad_norm": 0.1663195937871933, "learning_rate": 3.991781380577076e-05, "loss": 0.4265, "step": 44075 }, { "epoch": 1.588640213356399, "grad_norm": 0.18738073110580444, "learning_rate": 3.991547203302366e-05, "loss": 0.4012, "step": 44080 }, { "epoch": 1.5888204130176236, "grad_norm": 0.1917044073343277, "learning_rate": 3.991313005705613e-05, "loss": 0.4378, "step": 44085 }, { "epoch": 1.589000612678848, "grad_norm": 0.2828647792339325, "learning_rate": 3.991078787790009e-05, "loss": 0.3974, "step": 44090 }, { "epoch": 1.5891808123400728, "grad_norm": 0.17092807590961456, "learning_rate": 3.990844549558745e-05, "loss": 0.4047, "step": 44095 }, { "epoch": 1.5893610120012975, "grad_norm": 0.1954643726348877, "learning_rate": 3.990610291015014e-05, "loss": 0.4167, "step": 44100 }, { "epoch": 1.589541211662522, "grad_norm": 0.2312636822462082, "learning_rate": 3.990376012162006e-05, "loss": 0.4022, "step": 44105 }, { "epoch": 1.5897214113237466, "grad_norm": 0.1981632262468338, "learning_rate": 3.990141713002912e-05, "loss": 0.4631, "step": 44110 }, { "epoch": 1.5899016109849713, "grad_norm": 0.16944488883018494, "learning_rate": 3.989907393540927e-05, "loss": 0.3809, "step": 44115 }, { "epoch": 1.590081810646196, "grad_norm": 0.1728655993938446, "learning_rate": 3.9896730537792415e-05, "loss": 0.3891, "step": 44120 }, { 
"epoch": 1.5902620103074208, "grad_norm": 0.19275811314582825, "learning_rate": 3.989438693721049e-05, "loss": 0.4316, "step": 44125 }, { "epoch": 1.5904422099686453, "grad_norm": 0.16822832822799683, "learning_rate": 3.989204313369543e-05, "loss": 0.4205, "step": 44130 }, { "epoch": 1.5906224096298698, "grad_norm": 0.16480205953121185, "learning_rate": 3.9889699127279164e-05, "loss": 0.435, "step": 44135 }, { "epoch": 1.5908026092910945, "grad_norm": 0.1605161726474762, "learning_rate": 3.9887354917993635e-05, "loss": 0.4301, "step": 44140 }, { "epoch": 1.5909828089523192, "grad_norm": 0.16853894293308258, "learning_rate": 3.988501050587078e-05, "loss": 0.4255, "step": 44145 }, { "epoch": 1.5911630086135438, "grad_norm": 0.20577529072761536, "learning_rate": 3.9882665890942526e-05, "loss": 0.3954, "step": 44150 }, { "epoch": 1.5913432082747685, "grad_norm": 0.15664778649806976, "learning_rate": 3.988032107324084e-05, "loss": 0.4088, "step": 44155 }, { "epoch": 1.591523407935993, "grad_norm": 0.22527268528938293, "learning_rate": 3.987797605279766e-05, "loss": 0.4338, "step": 44160 }, { "epoch": 1.5917036075972177, "grad_norm": 0.15542291104793549, "learning_rate": 3.987563082964493e-05, "loss": 0.4333, "step": 44165 }, { "epoch": 1.5918838072584425, "grad_norm": 0.18337464332580566, "learning_rate": 3.987328540381461e-05, "loss": 0.4064, "step": 44170 }, { "epoch": 1.592064006919667, "grad_norm": 0.21346335113048553, "learning_rate": 3.987093977533867e-05, "loss": 0.4588, "step": 44175 }, { "epoch": 1.5922442065808915, "grad_norm": 0.1897256225347519, "learning_rate": 3.986859394424905e-05, "loss": 0.44, "step": 44180 }, { "epoch": 1.5924244062421162, "grad_norm": 0.1959022730588913, "learning_rate": 3.98662479105777e-05, "loss": 0.4221, "step": 44185 }, { "epoch": 1.592604605903341, "grad_norm": 0.20452988147735596, "learning_rate": 3.986390167435661e-05, "loss": 0.4131, "step": 44190 }, { "epoch": 1.5927848055645657, "grad_norm": 0.18788976967334747, 
"learning_rate": 3.9861555235617734e-05, "loss": 0.4157, "step": 44195 }, { "epoch": 1.5929650052257902, "grad_norm": 0.14749525487422943, "learning_rate": 3.985920859439306e-05, "loss": 0.371, "step": 44200 }, { "epoch": 1.5931452048870147, "grad_norm": 0.19985322654247284, "learning_rate": 3.9856861750714535e-05, "loss": 0.4088, "step": 44205 }, { "epoch": 1.5933254045482395, "grad_norm": 0.18450570106506348, "learning_rate": 3.985451470461414e-05, "loss": 0.3813, "step": 44210 }, { "epoch": 1.5935056042094642, "grad_norm": 0.26536038517951965, "learning_rate": 3.985216745612387e-05, "loss": 0.3779, "step": 44215 }, { "epoch": 1.5936858038706887, "grad_norm": 0.17288127541542053, "learning_rate": 3.984982000527568e-05, "loss": 0.4086, "step": 44220 }, { "epoch": 1.5938660035319132, "grad_norm": 0.19189390540122986, "learning_rate": 3.984747235210158e-05, "loss": 0.4554, "step": 44225 }, { "epoch": 1.594046203193138, "grad_norm": 0.17613013088703156, "learning_rate": 3.984512449663353e-05, "loss": 0.3713, "step": 44230 }, { "epoch": 1.5942264028543627, "grad_norm": 0.15496566891670227, "learning_rate": 3.984277643890355e-05, "loss": 0.3943, "step": 44235 }, { "epoch": 1.5944066025155874, "grad_norm": 0.16410352289676666, "learning_rate": 3.98404281789436e-05, "loss": 0.426, "step": 44240 }, { "epoch": 1.594586802176812, "grad_norm": 0.19269990921020508, "learning_rate": 3.9838079716785704e-05, "loss": 0.3917, "step": 44245 }, { "epoch": 1.5947670018380364, "grad_norm": 0.19066692888736725, "learning_rate": 3.983573105246183e-05, "loss": 0.4285, "step": 44250 }, { "epoch": 1.5949472014992612, "grad_norm": 0.17595848441123962, "learning_rate": 3.9833382186004005e-05, "loss": 0.4524, "step": 44255 }, { "epoch": 1.595127401160486, "grad_norm": 0.15249063074588776, "learning_rate": 3.983103311744421e-05, "loss": 0.4218, "step": 44260 }, { "epoch": 1.5953076008217104, "grad_norm": 0.16133977472782135, "learning_rate": 3.982868384681446e-05, "loss": 0.4011, "step": 44265 
}, { "epoch": 1.595487800482935, "grad_norm": 0.16677658259868622, "learning_rate": 3.982633437414677e-05, "loss": 0.389, "step": 44270 }, { "epoch": 1.5956680001441597, "grad_norm": 0.18336910009384155, "learning_rate": 3.9823984699473147e-05, "loss": 0.3929, "step": 44275 }, { "epoch": 1.5958481998053844, "grad_norm": 0.2092888355255127, "learning_rate": 3.982163482282559e-05, "loss": 0.4169, "step": 44280 }, { "epoch": 1.5960283994666091, "grad_norm": 0.17717908322811127, "learning_rate": 3.981928474423614e-05, "loss": 0.4569, "step": 44285 }, { "epoch": 1.5962085991278336, "grad_norm": 0.15611782670021057, "learning_rate": 3.98169344637368e-05, "loss": 0.4195, "step": 44290 }, { "epoch": 1.5963887987890581, "grad_norm": 0.18297843635082245, "learning_rate": 3.9814583981359596e-05, "loss": 0.3639, "step": 44295 }, { "epoch": 1.5965689984502829, "grad_norm": 0.15773242712020874, "learning_rate": 3.981223329713655e-05, "loss": 0.3866, "step": 44300 }, { "epoch": 1.5967491981115076, "grad_norm": 0.20912377536296844, "learning_rate": 3.98098824110997e-05, "loss": 0.3917, "step": 44305 }, { "epoch": 1.5969293977727324, "grad_norm": 0.18566472828388214, "learning_rate": 3.980753132328107e-05, "loss": 0.4331, "step": 44310 }, { "epoch": 1.5971095974339569, "grad_norm": 0.17410017549991608, "learning_rate": 3.9805180033712685e-05, "loss": 0.4333, "step": 44315 }, { "epoch": 1.5972897970951814, "grad_norm": 0.17320142686367035, "learning_rate": 3.980282854242659e-05, "loss": 0.4056, "step": 44320 }, { "epoch": 1.597469996756406, "grad_norm": 0.19781510531902313, "learning_rate": 3.9800476849454825e-05, "loss": 0.4087, "step": 44325 }, { "epoch": 1.5976501964176308, "grad_norm": 0.18340526521205902, "learning_rate": 3.979812495482943e-05, "loss": 0.4255, "step": 44330 }, { "epoch": 1.5978303960788554, "grad_norm": 0.19130225479602814, "learning_rate": 3.9795772858582444e-05, "loss": 0.4059, "step": 44335 }, { "epoch": 1.5980105957400799, "grad_norm": 0.18067646026611328, 
"learning_rate": 3.979342056074592e-05, "loss": 0.4199, "step": 44340 }, { "epoch": 1.5981907954013046, "grad_norm": 0.20790977776050568, "learning_rate": 3.97910680613519e-05, "loss": 0.3784, "step": 44345 }, { "epoch": 1.5983709950625293, "grad_norm": 0.22750818729400635, "learning_rate": 3.978871536043245e-05, "loss": 0.4147, "step": 44350 }, { "epoch": 1.598551194723754, "grad_norm": 0.19239923357963562, "learning_rate": 3.978636245801961e-05, "loss": 0.4602, "step": 44355 }, { "epoch": 1.5987313943849786, "grad_norm": 0.13409259915351868, "learning_rate": 3.9784009354145446e-05, "loss": 0.4135, "step": 44360 }, { "epoch": 1.598911594046203, "grad_norm": 0.18741188943386078, "learning_rate": 3.978165604884201e-05, "loss": 0.4095, "step": 44365 }, { "epoch": 1.5990917937074278, "grad_norm": 0.1525307148694992, "learning_rate": 3.9779302542141384e-05, "loss": 0.4054, "step": 44370 }, { "epoch": 1.5992719933686526, "grad_norm": 0.16644379496574402, "learning_rate": 3.977694883407561e-05, "loss": 0.4299, "step": 44375 }, { "epoch": 1.599452193029877, "grad_norm": 0.234503835439682, "learning_rate": 3.977459492467678e-05, "loss": 0.4382, "step": 44380 }, { "epoch": 1.5996323926911016, "grad_norm": 0.1683819741010666, "learning_rate": 3.977224081397696e-05, "loss": 0.4252, "step": 44385 }, { "epoch": 1.5998125923523263, "grad_norm": 0.15491560101509094, "learning_rate": 3.97698865020082e-05, "loss": 0.4087, "step": 44390 }, { "epoch": 1.599992792013551, "grad_norm": 0.18362046778202057, "learning_rate": 3.9767531988802606e-05, "loss": 0.3969, "step": 44395 }, { "epoch": 1.6001729916747758, "grad_norm": 0.178349107503891, "learning_rate": 3.9765177274392244e-05, "loss": 0.451, "step": 44400 }, { "epoch": 1.6003531913360003, "grad_norm": 0.1806674599647522, "learning_rate": 3.9762822358809206e-05, "loss": 0.4001, "step": 44405 }, { "epoch": 1.6005333909972248, "grad_norm": 0.16210544109344482, "learning_rate": 3.976046724208557e-05, "loss": 0.3909, "step": 44410 }, { 
"epoch": 1.6007135906584495, "grad_norm": 0.18187157809734344, "learning_rate": 3.975811192425342e-05, "loss": 0.4267, "step": 44415 }, { "epoch": 1.6008937903196743, "grad_norm": 0.20049238204956055, "learning_rate": 3.9755756405344855e-05, "loss": 0.3962, "step": 44420 }, { "epoch": 1.6010739899808988, "grad_norm": 0.1605527549982071, "learning_rate": 3.9753400685391974e-05, "loss": 0.4174, "step": 44425 }, { "epoch": 1.6012541896421235, "grad_norm": 0.16398051381111145, "learning_rate": 3.975104476442686e-05, "loss": 0.4539, "step": 44430 }, { "epoch": 1.601434389303348, "grad_norm": 0.18974505364894867, "learning_rate": 3.9748688642481614e-05, "loss": 0.3968, "step": 44435 }, { "epoch": 1.6016145889645728, "grad_norm": 0.22180098295211792, "learning_rate": 3.974633231958834e-05, "loss": 0.4166, "step": 44440 }, { "epoch": 1.6017947886257975, "grad_norm": 0.15120428800582886, "learning_rate": 3.974397579577914e-05, "loss": 0.4171, "step": 44445 }, { "epoch": 1.601974988287022, "grad_norm": 0.19408200681209564, "learning_rate": 3.974161907108613e-05, "loss": 0.4058, "step": 44450 }, { "epoch": 1.6021551879482465, "grad_norm": 0.1284191906452179, "learning_rate": 3.973926214554142e-05, "loss": 0.3842, "step": 44455 }, { "epoch": 1.6023353876094713, "grad_norm": 0.12077841907739639, "learning_rate": 3.9736905019177106e-05, "loss": 0.363, "step": 44460 }, { "epoch": 1.602515587270696, "grad_norm": 0.19127848744392395, "learning_rate": 3.973454769202532e-05, "loss": 0.3965, "step": 44465 }, { "epoch": 1.6026957869319207, "grad_norm": 0.16007506847381592, "learning_rate": 3.9732190164118175e-05, "loss": 0.3915, "step": 44470 }, { "epoch": 1.6028759865931452, "grad_norm": 0.16943296790122986, "learning_rate": 3.972983243548779e-05, "loss": 0.4289, "step": 44475 }, { "epoch": 1.6030561862543697, "grad_norm": 0.15759488940238953, "learning_rate": 3.972747450616629e-05, "loss": 0.405, "step": 44480 }, { "epoch": 1.6032363859155945, "grad_norm": 0.178999662399292, 
"learning_rate": 3.97251163761858e-05, "loss": 0.3919, "step": 44485 }, { "epoch": 1.6034165855768192, "grad_norm": 0.26717543601989746, "learning_rate": 3.9722758045578454e-05, "loss": 0.4161, "step": 44490 }, { "epoch": 1.6035967852380437, "grad_norm": 0.24966903030872345, "learning_rate": 3.9720399514376374e-05, "loss": 0.4346, "step": 44495 }, { "epoch": 1.6037769848992682, "grad_norm": 0.17381806671619415, "learning_rate": 3.97180407826117e-05, "loss": 0.4227, "step": 44500 }, { "epoch": 1.6037769848992682, "eval_loss": 0.44284766912460327, "eval_runtime": 3.5457, "eval_samples_per_second": 28.203, "eval_steps_per_second": 7.051, "step": 44500 }, { "epoch": 1.603957184560493, "grad_norm": 0.15787242352962494, "learning_rate": 3.971568185031658e-05, "loss": 0.3882, "step": 44505 }, { "epoch": 1.6041373842217177, "grad_norm": 0.168611541390419, "learning_rate": 3.971332271752313e-05, "loss": 0.3956, "step": 44510 }, { "epoch": 1.6043175838829424, "grad_norm": 0.17954233288764954, "learning_rate": 3.9710963384263515e-05, "loss": 0.4084, "step": 44515 }, { "epoch": 1.604497783544167, "grad_norm": 0.16351082921028137, "learning_rate": 3.970860385056987e-05, "loss": 0.4356, "step": 44520 }, { "epoch": 1.6046779832053915, "grad_norm": 0.19120116531848907, "learning_rate": 3.9706244116474345e-05, "loss": 0.418, "step": 44525 }, { "epoch": 1.6048581828666162, "grad_norm": 0.15512365102767944, "learning_rate": 3.97038841820091e-05, "loss": 0.4185, "step": 44530 }, { "epoch": 1.605038382527841, "grad_norm": 0.1973215490579605, "learning_rate": 3.970152404720627e-05, "loss": 0.4046, "step": 44535 }, { "epoch": 1.6052185821890654, "grad_norm": 0.18126599490642548, "learning_rate": 3.969916371209802e-05, "loss": 0.4358, "step": 44540 }, { "epoch": 1.6053987818502902, "grad_norm": 0.21188373863697052, "learning_rate": 3.9696803176716515e-05, "loss": 0.472, "step": 44545 }, { "epoch": 1.6055789815115147, "grad_norm": 0.16535495221614838, "learning_rate": 
3.969444244109391e-05, "loss": 0.4183, "step": 44550 }, { "epoch": 1.6057591811727394, "grad_norm": 0.21638596057891846, "learning_rate": 3.969208150526237e-05, "loss": 0.4231, "step": 44555 }, { "epoch": 1.6059393808339641, "grad_norm": 0.21015167236328125, "learning_rate": 3.968972036925407e-05, "loss": 0.4411, "step": 44560 }, { "epoch": 1.6061195804951887, "grad_norm": 0.16333642601966858, "learning_rate": 3.968735903310117e-05, "loss": 0.4082, "step": 44565 }, { "epoch": 1.6062997801564132, "grad_norm": 0.1700402796268463, "learning_rate": 3.968499749683584e-05, "loss": 0.406, "step": 44570 }, { "epoch": 1.606479979817638, "grad_norm": 0.16421562433242798, "learning_rate": 3.968263576049027e-05, "loss": 0.3949, "step": 44575 }, { "epoch": 1.6066601794788626, "grad_norm": 0.18624955415725708, "learning_rate": 3.968027382409663e-05, "loss": 0.3847, "step": 44580 }, { "epoch": 1.6068403791400874, "grad_norm": 0.23237396776676178, "learning_rate": 3.9677911687687095e-05, "loss": 0.4233, "step": 44585 }, { "epoch": 1.6070205788013119, "grad_norm": 0.17745272815227509, "learning_rate": 3.967554935129387e-05, "loss": 0.405, "step": 44590 }, { "epoch": 1.6072007784625364, "grad_norm": 0.1729012280702591, "learning_rate": 3.9673186814949115e-05, "loss": 0.3969, "step": 44595 }, { "epoch": 1.6073809781237611, "grad_norm": 0.2897031605243683, "learning_rate": 3.967082407868503e-05, "loss": 0.3892, "step": 44600 }, { "epoch": 1.6075611777849859, "grad_norm": 0.17418605089187622, "learning_rate": 3.9668461142533807e-05, "loss": 0.4135, "step": 44605 }, { "epoch": 1.6077413774462104, "grad_norm": 0.19740115106105804, "learning_rate": 3.9666098006527653e-05, "loss": 0.3922, "step": 44610 }, { "epoch": 1.6079215771074349, "grad_norm": 0.18826255202293396, "learning_rate": 3.966373467069874e-05, "loss": 0.4254, "step": 44615 }, { "epoch": 1.6081017767686596, "grad_norm": 0.18724419176578522, "learning_rate": 3.9661371135079285e-05, "loss": 0.4175, "step": 44620 }, { "epoch": 
1.6082819764298844, "grad_norm": 0.20004336535930634, "learning_rate": 3.9659007399701485e-05, "loss": 0.4015, "step": 44625 }, { "epoch": 1.608462176091109, "grad_norm": 0.2576258182525635, "learning_rate": 3.9656643464597545e-05, "loss": 0.4176, "step": 44630 }, { "epoch": 1.6086423757523336, "grad_norm": 0.1877816915512085, "learning_rate": 3.9654279329799684e-05, "loss": 0.4314, "step": 44635 }, { "epoch": 1.608822575413558, "grad_norm": 0.20604005455970764, "learning_rate": 3.96519149953401e-05, "loss": 0.4033, "step": 44640 }, { "epoch": 1.6090027750747828, "grad_norm": 0.17732234299182892, "learning_rate": 3.964955046125101e-05, "loss": 0.4149, "step": 44645 }, { "epoch": 1.6091829747360076, "grad_norm": 0.22829070687294006, "learning_rate": 3.964718572756463e-05, "loss": 0.3822, "step": 44650 }, { "epoch": 1.609363174397232, "grad_norm": 0.20384085178375244, "learning_rate": 3.964482079431319e-05, "loss": 0.411, "step": 44655 }, { "epoch": 1.6095433740584568, "grad_norm": 0.21670548617839813, "learning_rate": 3.9642455661528885e-05, "loss": 0.4065, "step": 44660 }, { "epoch": 1.6097235737196813, "grad_norm": 0.18137463927268982, "learning_rate": 3.964009032924396e-05, "loss": 0.3918, "step": 44665 }, { "epoch": 1.609903773380906, "grad_norm": 0.16431452333927155, "learning_rate": 3.963772479749065e-05, "loss": 0.3946, "step": 44670 }, { "epoch": 1.6100839730421308, "grad_norm": 0.17995165288448334, "learning_rate": 3.963535906630117e-05, "loss": 0.4308, "step": 44675 }, { "epoch": 1.6102641727033553, "grad_norm": 0.17555540800094604, "learning_rate": 3.9632993135707755e-05, "loss": 0.4113, "step": 44680 }, { "epoch": 1.6104443723645798, "grad_norm": 0.21081358194351196, "learning_rate": 3.963062700574264e-05, "loss": 0.4261, "step": 44685 }, { "epoch": 1.6106245720258046, "grad_norm": 0.18575575947761536, "learning_rate": 3.9628260676438064e-05, "loss": 0.3961, "step": 44690 }, { "epoch": 1.6108047716870293, "grad_norm": 0.16622988879680634, 
"learning_rate": 3.962589414782627e-05, "loss": 0.4321, "step": 44695 }, { "epoch": 1.610984971348254, "grad_norm": 0.19155137240886688, "learning_rate": 3.96235274199395e-05, "loss": 0.4032, "step": 44700 }, { "epoch": 1.6111651710094785, "grad_norm": 0.15774855017662048, "learning_rate": 3.962116049280999e-05, "loss": 0.4115, "step": 44705 }, { "epoch": 1.611345370670703, "grad_norm": 0.17993386089801788, "learning_rate": 3.961879336647001e-05, "loss": 0.4065, "step": 44710 }, { "epoch": 1.6115255703319278, "grad_norm": 0.18789418041706085, "learning_rate": 3.961642604095181e-05, "loss": 0.391, "step": 44715 }, { "epoch": 1.6117057699931525, "grad_norm": 0.1855124831199646, "learning_rate": 3.9614058516287624e-05, "loss": 0.4414, "step": 44720 }, { "epoch": 1.611885969654377, "grad_norm": 0.17999990284442902, "learning_rate": 3.961169079250971e-05, "loss": 0.385, "step": 44725 }, { "epoch": 1.6120661693156015, "grad_norm": 0.1700844168663025, "learning_rate": 3.9609322869650354e-05, "loss": 0.4109, "step": 44730 }, { "epoch": 1.6122463689768263, "grad_norm": 0.1721271425485611, "learning_rate": 3.9606954747741797e-05, "loss": 0.4331, "step": 44735 }, { "epoch": 1.612426568638051, "grad_norm": 0.17084349691867828, "learning_rate": 3.960458642681631e-05, "loss": 0.3995, "step": 44740 }, { "epoch": 1.6126067682992757, "grad_norm": 0.14765715599060059, "learning_rate": 3.960221790690616e-05, "loss": 0.4097, "step": 44745 }, { "epoch": 1.6127869679605003, "grad_norm": 0.16474126279354095, "learning_rate": 3.959984918804361e-05, "loss": 0.4378, "step": 44750 }, { "epoch": 1.6129671676217248, "grad_norm": 0.2174513041973114, "learning_rate": 3.959748027026095e-05, "loss": 0.3976, "step": 44755 }, { "epoch": 1.6131473672829495, "grad_norm": 0.15356197953224182, "learning_rate": 3.959511115359045e-05, "loss": 0.4151, "step": 44760 }, { "epoch": 1.6133275669441742, "grad_norm": 0.18707029521465302, "learning_rate": 3.959274183806438e-05, "loss": 0.4231, "step": 44765 }, { 
"epoch": 1.6135077666053987, "grad_norm": 0.20073364675045013, "learning_rate": 3.959037232371503e-05, "loss": 0.4256, "step": 44770 }, { "epoch": 1.6136879662666233, "grad_norm": 0.20367778837680817, "learning_rate": 3.9588002610574694e-05, "loss": 0.4231, "step": 44775 }, { "epoch": 1.613868165927848, "grad_norm": 0.15489740669727325, "learning_rate": 3.958563269867563e-05, "loss": 0.3753, "step": 44780 }, { "epoch": 1.6140483655890727, "grad_norm": 0.18717733025550842, "learning_rate": 3.958326258805015e-05, "loss": 0.4145, "step": 44785 }, { "epoch": 1.6142285652502975, "grad_norm": 0.1864037811756134, "learning_rate": 3.9580892278730534e-05, "loss": 0.3988, "step": 44790 }, { "epoch": 1.614408764911522, "grad_norm": 0.19516822695732117, "learning_rate": 3.95785217707491e-05, "loss": 0.4386, "step": 44795 }, { "epoch": 1.6145889645727465, "grad_norm": 0.17213517427444458, "learning_rate": 3.957615106413811e-05, "loss": 0.4205, "step": 44800 }, { "epoch": 1.6147691642339712, "grad_norm": 0.16478149592876434, "learning_rate": 3.95737801589299e-05, "loss": 0.3819, "step": 44805 }, { "epoch": 1.614949363895196, "grad_norm": 0.15579113364219666, "learning_rate": 3.957140905515674e-05, "loss": 0.4181, "step": 44810 }, { "epoch": 1.6151295635564207, "grad_norm": 0.21928997337818146, "learning_rate": 3.956903775285097e-05, "loss": 0.4157, "step": 44815 }, { "epoch": 1.6153097632176452, "grad_norm": 0.14107581973075867, "learning_rate": 3.956666625204487e-05, "loss": 0.3837, "step": 44820 }, { "epoch": 1.6154899628788697, "grad_norm": 0.19918832182884216, "learning_rate": 3.956429455277077e-05, "loss": 0.4018, "step": 44825 }, { "epoch": 1.6156701625400944, "grad_norm": 0.19882018864154816, "learning_rate": 3.9561922655060965e-05, "loss": 0.4323, "step": 44830 }, { "epoch": 1.6158503622013192, "grad_norm": 0.17899902164936066, "learning_rate": 3.95595505589478e-05, "loss": 0.4351, "step": 44835 }, { "epoch": 1.6160305618625437, "grad_norm": 0.14967146515846252, 
"learning_rate": 3.955717826446357e-05, "loss": 0.4039, "step": 44840 }, { "epoch": 1.6162107615237682, "grad_norm": 0.17827114462852478, "learning_rate": 3.95548057716406e-05, "loss": 0.3759, "step": 44845 }, { "epoch": 1.616390961184993, "grad_norm": 0.19635485112667084, "learning_rate": 3.955243308051122e-05, "loss": 0.4571, "step": 44850 }, { "epoch": 1.6165711608462177, "grad_norm": 0.20336943864822388, "learning_rate": 3.955006019110776e-05, "loss": 0.4104, "step": 44855 }, { "epoch": 1.6167513605074424, "grad_norm": 0.16588455438613892, "learning_rate": 3.954768710346255e-05, "loss": 0.3808, "step": 44860 }, { "epoch": 1.616931560168667, "grad_norm": 0.18041035532951355, "learning_rate": 3.954531381760791e-05, "loss": 0.3845, "step": 44865 }, { "epoch": 1.6171117598298914, "grad_norm": 0.19246827065944672, "learning_rate": 3.95429403335762e-05, "loss": 0.4226, "step": 44870 }, { "epoch": 1.6172919594911161, "grad_norm": 0.1802048683166504, "learning_rate": 3.954056665139972e-05, "loss": 0.4507, "step": 44875 }, { "epoch": 1.6174721591523409, "grad_norm": 0.1771692931652069, "learning_rate": 3.9538192771110855e-05, "loss": 0.3853, "step": 44880 }, { "epoch": 1.6176523588135654, "grad_norm": 0.19818015396595, "learning_rate": 3.953581869274192e-05, "loss": 0.4143, "step": 44885 }, { "epoch": 1.61783255847479, "grad_norm": 0.19741560518741608, "learning_rate": 3.953344441632527e-05, "loss": 0.4189, "step": 44890 }, { "epoch": 1.6180127581360146, "grad_norm": 0.15774419903755188, "learning_rate": 3.953106994189326e-05, "loss": 0.4124, "step": 44895 }, { "epoch": 1.6181929577972394, "grad_norm": 0.1899198293685913, "learning_rate": 3.952869526947823e-05, "loss": 0.4436, "step": 44900 }, { "epoch": 1.618373157458464, "grad_norm": 0.19726921617984772, "learning_rate": 3.9526320399112536e-05, "loss": 0.3846, "step": 44905 }, { "epoch": 1.6185533571196886, "grad_norm": 0.18593871593475342, "learning_rate": 3.952394533082855e-05, "loss": 0.4035, "step": 44910 }, { 
"epoch": 1.6187335567809131, "grad_norm": 0.1598072648048401, "learning_rate": 3.952157006465861e-05, "loss": 0.4077, "step": 44915 }, { "epoch": 1.6189137564421379, "grad_norm": 0.2124381959438324, "learning_rate": 3.95191946006351e-05, "loss": 0.4266, "step": 44920 }, { "epoch": 1.6190939561033626, "grad_norm": 0.16838979721069336, "learning_rate": 3.951681893879036e-05, "loss": 0.4073, "step": 44925 }, { "epoch": 1.619274155764587, "grad_norm": 0.16557417809963226, "learning_rate": 3.951444307915678e-05, "loss": 0.4183, "step": 44930 }, { "epoch": 1.6194543554258118, "grad_norm": 0.17037592828273773, "learning_rate": 3.951206702176672e-05, "loss": 0.3463, "step": 44935 }, { "epoch": 1.6196345550870364, "grad_norm": 0.16905367374420166, "learning_rate": 3.950969076665256e-05, "loss": 0.3978, "step": 44940 }, { "epoch": 1.619814754748261, "grad_norm": 0.16748353838920593, "learning_rate": 3.950731431384668e-05, "loss": 0.461, "step": 44945 }, { "epoch": 1.6199949544094858, "grad_norm": 0.1732574999332428, "learning_rate": 3.950493766338144e-05, "loss": 0.407, "step": 44950 }, { "epoch": 1.6201751540707103, "grad_norm": 0.12833158671855927, "learning_rate": 3.9502560815289236e-05, "loss": 0.3877, "step": 44955 }, { "epoch": 1.6203553537319348, "grad_norm": 0.17978282272815704, "learning_rate": 3.950018376960245e-05, "loss": 0.4099, "step": 44960 }, { "epoch": 1.6205355533931596, "grad_norm": 0.19126909971237183, "learning_rate": 3.949780652635347e-05, "loss": 0.4621, "step": 44965 }, { "epoch": 1.6207157530543843, "grad_norm": 0.1761917769908905, "learning_rate": 3.9495429085574675e-05, "loss": 0.4437, "step": 44970 }, { "epoch": 1.620895952715609, "grad_norm": 0.21770428121089935, "learning_rate": 3.949305144729847e-05, "loss": 0.4273, "step": 44975 }, { "epoch": 1.6210761523768336, "grad_norm": 0.21799218654632568, "learning_rate": 3.949067361155725e-05, "loss": 0.4328, "step": 44980 }, { "epoch": 1.621256352038058, "grad_norm": 0.21713624894618988, 
"learning_rate": 3.94882955783834e-05, "loss": 0.4416, "step": 44985 }, { "epoch": 1.6214365516992828, "grad_norm": 0.1928412914276123, "learning_rate": 3.9485917347809334e-05, "loss": 0.4139, "step": 44990 }, { "epoch": 1.6216167513605075, "grad_norm": 0.24302361905574799, "learning_rate": 3.948353891986743e-05, "loss": 0.4207, "step": 44995 }, { "epoch": 1.621796951021732, "grad_norm": 0.17859376966953278, "learning_rate": 3.948116029459014e-05, "loss": 0.4249, "step": 45000 }, { "epoch": 1.621796951021732, "eval_loss": 0.44367220997810364, "eval_runtime": 3.5481, "eval_samples_per_second": 28.184, "eval_steps_per_second": 7.046, "step": 45000 }, { "epoch": 1.6219771506829566, "grad_norm": 0.18802441656589508, "learning_rate": 3.947878147200983e-05, "loss": 0.4004, "step": 45005 }, { "epoch": 1.6221573503441813, "grad_norm": 0.2054462730884552, "learning_rate": 3.9476402452158926e-05, "loss": 0.4113, "step": 45010 }, { "epoch": 1.622337550005406, "grad_norm": 0.17358975112438202, "learning_rate": 3.947402323506984e-05, "loss": 0.4125, "step": 45015 }, { "epoch": 1.6225177496666308, "grad_norm": 0.2103101909160614, "learning_rate": 3.9471643820774995e-05, "loss": 0.4065, "step": 45020 }, { "epoch": 1.6226979493278553, "grad_norm": 0.18095333874225616, "learning_rate": 3.94692642093068e-05, "loss": 0.4137, "step": 45025 }, { "epoch": 1.6228781489890798, "grad_norm": 0.17532798647880554, "learning_rate": 3.946688440069768e-05, "loss": 0.4145, "step": 45030 }, { "epoch": 1.6230583486503045, "grad_norm": 0.21436169743537903, "learning_rate": 3.946450439498006e-05, "loss": 0.432, "step": 45035 }, { "epoch": 1.6232385483115293, "grad_norm": 0.15949729084968567, "learning_rate": 3.946212419218638e-05, "loss": 0.4074, "step": 45040 }, { "epoch": 1.6234187479727538, "grad_norm": 0.17256999015808105, "learning_rate": 3.945974379234905e-05, "loss": 0.4371, "step": 45045 }, { "epoch": 1.6235989476339785, "grad_norm": 0.2011367380619049, "learning_rate": 3.945736319550051e-05, 
"loss": 0.443, "step": 45050 }, { "epoch": 1.623779147295203, "grad_norm": 0.14212122559547424, "learning_rate": 3.945498240167319e-05, "loss": 0.4378, "step": 45055 }, { "epoch": 1.6239593469564277, "grad_norm": 0.21600797772407532, "learning_rate": 3.9452601410899544e-05, "loss": 0.4043, "step": 45060 }, { "epoch": 1.6241395466176525, "grad_norm": 0.16741763055324554, "learning_rate": 3.9450220223212e-05, "loss": 0.4199, "step": 45065 }, { "epoch": 1.624319746278877, "grad_norm": 0.1476602405309677, "learning_rate": 3.9447838838643e-05, "loss": 0.3941, "step": 45070 }, { "epoch": 1.6244999459401015, "grad_norm": 0.17911425232887268, "learning_rate": 3.9445457257224996e-05, "loss": 0.4306, "step": 45075 }, { "epoch": 1.6246801456013262, "grad_norm": 0.17017336189746857, "learning_rate": 3.944307547899042e-05, "loss": 0.4172, "step": 45080 }, { "epoch": 1.624860345262551, "grad_norm": 0.17422279715538025, "learning_rate": 3.944069350397175e-05, "loss": 0.3886, "step": 45085 }, { "epoch": 1.6250405449237757, "grad_norm": 0.1593707799911499, "learning_rate": 3.9438311332201424e-05, "loss": 0.3895, "step": 45090 }, { "epoch": 1.6252207445850002, "grad_norm": 0.1855667531490326, "learning_rate": 3.94359289637119e-05, "loss": 0.4233, "step": 45095 }, { "epoch": 1.6254009442462247, "grad_norm": 0.16529178619384766, "learning_rate": 3.943354639853565e-05, "loss": 0.3903, "step": 45100 }, { "epoch": 1.6255811439074495, "grad_norm": 0.15914855897426605, "learning_rate": 3.9431163636705114e-05, "loss": 0.4092, "step": 45105 }, { "epoch": 1.6257613435686742, "grad_norm": 0.21924299001693726, "learning_rate": 3.942878067825277e-05, "loss": 0.4527, "step": 45110 }, { "epoch": 1.6259415432298987, "grad_norm": 0.16359423100948334, "learning_rate": 3.942639752321108e-05, "loss": 0.3986, "step": 45115 }, { "epoch": 1.6261217428911232, "grad_norm": 0.23440222442150116, "learning_rate": 3.942401417161252e-05, "loss": 0.4406, "step": 45120 }, { "epoch": 1.626301942552348, "grad_norm": 
0.2375732809305191, "learning_rate": 3.942163062348956e-05, "loss": 0.4347, "step": 45125 }, { "epoch": 1.6264821422135727, "grad_norm": 0.19271384179592133, "learning_rate": 3.9419246878874674e-05, "loss": 0.4476, "step": 45130 }, { "epoch": 1.6266623418747974, "grad_norm": 0.1916663497686386, "learning_rate": 3.9416862937800335e-05, "loss": 0.4026, "step": 45135 }, { "epoch": 1.626842541536022, "grad_norm": 0.18741334974765778, "learning_rate": 3.941447880029903e-05, "loss": 0.3952, "step": 45140 }, { "epoch": 1.6270227411972464, "grad_norm": 0.16537748277187347, "learning_rate": 3.941209446640325e-05, "loss": 0.4506, "step": 45145 }, { "epoch": 1.6272029408584712, "grad_norm": 0.17888925969600677, "learning_rate": 3.940970993614547e-05, "loss": 0.4331, "step": 45150 }, { "epoch": 1.627383140519696, "grad_norm": 0.1680968850851059, "learning_rate": 3.940732520955818e-05, "loss": 0.4103, "step": 45155 }, { "epoch": 1.6275633401809204, "grad_norm": 0.22438734769821167, "learning_rate": 3.940494028667387e-05, "loss": 0.4185, "step": 45160 }, { "epoch": 1.6277435398421451, "grad_norm": 0.1568988710641861, "learning_rate": 3.940255516752504e-05, "loss": 0.404, "step": 45165 }, { "epoch": 1.6279237395033697, "grad_norm": 0.18171727657318115, "learning_rate": 3.94001698521442e-05, "loss": 0.3919, "step": 45170 }, { "epoch": 1.6281039391645944, "grad_norm": 0.20402833819389343, "learning_rate": 3.9397784340563813e-05, "loss": 0.3906, "step": 45175 }, { "epoch": 1.6282841388258191, "grad_norm": 0.2102474421262741, "learning_rate": 3.939539863281641e-05, "loss": 0.4252, "step": 45180 }, { "epoch": 1.6284643384870436, "grad_norm": 0.15504828095436096, "learning_rate": 3.939301272893449e-05, "loss": 0.394, "step": 45185 }, { "epoch": 1.6286445381482682, "grad_norm": 0.19128242135047913, "learning_rate": 3.939062662895055e-05, "loss": 0.3909, "step": 45190 }, { "epoch": 1.6288247378094929, "grad_norm": 0.18570300936698914, "learning_rate": 3.938824033289712e-05, "loss": 
0.4437, "step": 45195 }, { "epoch": 1.6290049374707176, "grad_norm": 0.1933826506137848, "learning_rate": 3.938585384080668e-05, "loss": 0.4038, "step": 45200 }, { "epoch": 1.6291851371319424, "grad_norm": 0.17036275565624237, "learning_rate": 3.93834671527118e-05, "loss": 0.4106, "step": 45205 }, { "epoch": 1.6293653367931669, "grad_norm": 0.2243850827217102, "learning_rate": 3.9381080268644936e-05, "loss": 0.4018, "step": 45210 }, { "epoch": 1.6295455364543914, "grad_norm": 0.2178872525691986, "learning_rate": 3.9378693188638646e-05, "loss": 0.4064, "step": 45215 }, { "epoch": 1.629725736115616, "grad_norm": 0.2029925286769867, "learning_rate": 3.937630591272545e-05, "loss": 0.4244, "step": 45220 }, { "epoch": 1.6299059357768408, "grad_norm": 0.20879967510700226, "learning_rate": 3.937391844093786e-05, "loss": 0.4523, "step": 45225 }, { "epoch": 1.6300861354380654, "grad_norm": 0.1707211285829544, "learning_rate": 3.937153077330843e-05, "loss": 0.4215, "step": 45230 }, { "epoch": 1.6302663350992899, "grad_norm": 0.17152579128742218, "learning_rate": 3.936914290986966e-05, "loss": 0.4481, "step": 45235 }, { "epoch": 1.6304465347605146, "grad_norm": 0.17980226874351501, "learning_rate": 3.9366754850654106e-05, "loss": 0.3495, "step": 45240 }, { "epoch": 1.6306267344217393, "grad_norm": 0.18112671375274658, "learning_rate": 3.9364366595694296e-05, "loss": 0.3669, "step": 45245 }, { "epoch": 1.630806934082964, "grad_norm": 0.20279546082019806, "learning_rate": 3.936197814502278e-05, "loss": 0.3874, "step": 45250 }, { "epoch": 1.6309871337441886, "grad_norm": 0.15267014503479004, "learning_rate": 3.9359589498672086e-05, "loss": 0.4019, "step": 45255 }, { "epoch": 1.631167333405413, "grad_norm": 0.16568580269813538, "learning_rate": 3.9357200656674764e-05, "loss": 0.4233, "step": 45260 }, { "epoch": 1.6313475330666378, "grad_norm": 0.17788587510585785, "learning_rate": 3.935481161906336e-05, "loss": 0.3938, "step": 45265 }, { "epoch": 1.6315277327278626, "grad_norm": 
0.19289278984069824, "learning_rate": 3.935242238587043e-05, "loss": 0.4054, "step": 45270 }, { "epoch": 1.631707932389087, "grad_norm": 0.17799817025661469, "learning_rate": 3.935003295712853e-05, "loss": 0.4, "step": 45275 }, { "epoch": 1.6318881320503116, "grad_norm": 0.1564566195011139, "learning_rate": 3.9347643332870206e-05, "loss": 0.4249, "step": 45280 }, { "epoch": 1.6320683317115363, "grad_norm": 0.22281388938426971, "learning_rate": 3.934525351312801e-05, "loss": 0.396, "step": 45285 }, { "epoch": 1.632248531372761, "grad_norm": 0.17600180208683014, "learning_rate": 3.934286349793452e-05, "loss": 0.423, "step": 45290 }, { "epoch": 1.6324287310339858, "grad_norm": 0.19052568078041077, "learning_rate": 3.93404732873223e-05, "loss": 0.4046, "step": 45295 }, { "epoch": 1.6326089306952103, "grad_norm": 0.18254795670509338, "learning_rate": 3.9338082881323896e-05, "loss": 0.3924, "step": 45300 }, { "epoch": 1.6327891303564348, "grad_norm": 0.20248796045780182, "learning_rate": 3.933569227997189e-05, "loss": 0.4133, "step": 45305 }, { "epoch": 1.6329693300176595, "grad_norm": 0.1688736379146576, "learning_rate": 3.9333301483298854e-05, "loss": 0.3878, "step": 45310 }, { "epoch": 1.6331495296788843, "grad_norm": 0.17724154889583588, "learning_rate": 3.9330910491337365e-05, "loss": 0.3865, "step": 45315 }, { "epoch": 1.633329729340109, "grad_norm": 0.1554093062877655, "learning_rate": 3.932851930411999e-05, "loss": 0.4509, "step": 45320 }, { "epoch": 1.6335099290013335, "grad_norm": 0.18391548097133636, "learning_rate": 3.9326127921679315e-05, "loss": 0.4208, "step": 45325 }, { "epoch": 1.633690128662558, "grad_norm": 0.18900564312934875, "learning_rate": 3.932373634404793e-05, "loss": 0.412, "step": 45330 }, { "epoch": 1.6338703283237828, "grad_norm": 0.17100130021572113, "learning_rate": 3.932134457125839e-05, "loss": 0.4065, "step": 45335 }, { "epoch": 1.6340505279850075, "grad_norm": 0.17979390919208527, "learning_rate": 3.931895260334331e-05, "loss": 0.3887, 
"step": 45340 }, { "epoch": 1.634230727646232, "grad_norm": 0.1904669851064682, "learning_rate": 3.9316560440335275e-05, "loss": 0.4119, "step": 45345 }, { "epoch": 1.6344109273074565, "grad_norm": 0.16896291077136993, "learning_rate": 3.931416808226688e-05, "loss": 0.4423, "step": 45350 }, { "epoch": 1.6345911269686813, "grad_norm": 0.22345204651355743, "learning_rate": 3.931177552917071e-05, "loss": 0.4178, "step": 45355 }, { "epoch": 1.634771326629906, "grad_norm": 0.16224905848503113, "learning_rate": 3.9309382781079375e-05, "loss": 0.4169, "step": 45360 }, { "epoch": 1.6349515262911307, "grad_norm": 0.20584708452224731, "learning_rate": 3.930698983802547e-05, "loss": 0.4187, "step": 45365 }, { "epoch": 1.6351317259523552, "grad_norm": 0.17952153086662292, "learning_rate": 3.930459670004159e-05, "loss": 0.4389, "step": 45370 }, { "epoch": 1.6353119256135797, "grad_norm": 0.2221333533525467, "learning_rate": 3.930220336716036e-05, "loss": 0.4312, "step": 45375 }, { "epoch": 1.6354921252748045, "grad_norm": 0.16802071034908295, "learning_rate": 3.929980983941437e-05, "loss": 0.4034, "step": 45380 }, { "epoch": 1.6356723249360292, "grad_norm": 0.20047937333583832, "learning_rate": 3.9297416116836246e-05, "loss": 0.3942, "step": 45385 }, { "epoch": 1.6358525245972537, "grad_norm": 0.19346770644187927, "learning_rate": 3.9295022199458596e-05, "loss": 0.4662, "step": 45390 }, { "epoch": 1.6360327242584782, "grad_norm": 0.18764521181583405, "learning_rate": 3.9292628087314034e-05, "loss": 0.4294, "step": 45395 }, { "epoch": 1.636212923919703, "grad_norm": 0.1746283918619156, "learning_rate": 3.9290233780435174e-05, "loss": 0.4132, "step": 45400 }, { "epoch": 1.6363931235809277, "grad_norm": 0.18788184225559235, "learning_rate": 3.928783927885466e-05, "loss": 0.4366, "step": 45405 }, { "epoch": 1.6365733232421524, "grad_norm": 0.16621163487434387, "learning_rate": 3.9285444582605086e-05, "loss": 0.4393, "step": 45410 }, { "epoch": 1.636753522903377, "grad_norm": 
0.16439075767993927, "learning_rate": 3.9283049691719106e-05, "loss": 0.3832, "step": 45415 }, { "epoch": 1.6369337225646015, "grad_norm": 0.15328091382980347, "learning_rate": 3.928065460622933e-05, "loss": 0.4351, "step": 45420 }, { "epoch": 1.6371139222258262, "grad_norm": 0.20914670825004578, "learning_rate": 3.927825932616841e-05, "loss": 0.3928, "step": 45425 }, { "epoch": 1.637294121887051, "grad_norm": 0.21218664944171906, "learning_rate": 3.9275863851568964e-05, "loss": 0.4199, "step": 45430 }, { "epoch": 1.6374743215482754, "grad_norm": 0.19584229588508606, "learning_rate": 3.9273468182463645e-05, "loss": 0.4132, "step": 45435 }, { "epoch": 1.6376545212095002, "grad_norm": 0.20541848242282867, "learning_rate": 3.9271072318885076e-05, "loss": 0.4287, "step": 45440 }, { "epoch": 1.6378347208707247, "grad_norm": 0.20829689502716064, "learning_rate": 3.9268676260865914e-05, "loss": 0.4313, "step": 45445 }, { "epoch": 1.6380149205319494, "grad_norm": 0.1655767560005188, "learning_rate": 3.92662800084388e-05, "loss": 0.4409, "step": 45450 }, { "epoch": 1.6381951201931741, "grad_norm": 0.17300541698932648, "learning_rate": 3.9263883561636385e-05, "loss": 0.4154, "step": 45455 }, { "epoch": 1.6383753198543987, "grad_norm": 0.220417782664299, "learning_rate": 3.926148692049132e-05, "loss": 0.418, "step": 45460 }, { "epoch": 1.6385555195156232, "grad_norm": 0.18571282923221588, "learning_rate": 3.925909008503625e-05, "loss": 0.4301, "step": 45465 }, { "epoch": 1.638735719176848, "grad_norm": 0.18570081889629364, "learning_rate": 3.925669305530384e-05, "loss": 0.4454, "step": 45470 }, { "epoch": 1.6389159188380726, "grad_norm": 0.14735782146453857, "learning_rate": 3.925429583132675e-05, "loss": 0.4146, "step": 45475 }, { "epoch": 1.6390961184992974, "grad_norm": 0.17413799464702606, "learning_rate": 3.925189841313764e-05, "loss": 0.3904, "step": 45480 }, { "epoch": 1.6392763181605219, "grad_norm": 0.27123942971229553, "learning_rate": 3.924950080076917e-05, "loss": 
0.4315, "step": 45485 }, { "epoch": 1.6394565178217464, "grad_norm": 0.1668565422296524, "learning_rate": 3.924710299425401e-05, "loss": 0.4216, "step": 45490 }, { "epoch": 1.6396367174829711, "grad_norm": 0.17070479691028595, "learning_rate": 3.924470499362483e-05, "loss": 0.423, "step": 45495 }, { "epoch": 1.6398169171441959, "grad_norm": 0.1955229640007019, "learning_rate": 3.9242306798914305e-05, "loss": 0.4145, "step": 45500 }, { "epoch": 1.6398169171441959, "eval_loss": 0.4432607591152191, "eval_runtime": 3.5416, "eval_samples_per_second": 28.236, "eval_steps_per_second": 7.059, "step": 45500 }, { "epoch": 1.6399971168054204, "grad_norm": 0.1931176483631134, "learning_rate": 3.9239908410155104e-05, "loss": 0.412, "step": 45505 }, { "epoch": 1.6401773164666449, "grad_norm": 0.216093048453331, "learning_rate": 3.923750982737991e-05, "loss": 0.4194, "step": 45510 }, { "epoch": 1.6403575161278696, "grad_norm": 0.17901061475276947, "learning_rate": 3.92351110506214e-05, "loss": 0.4208, "step": 45515 }, { "epoch": 1.6405377157890944, "grad_norm": 0.16819880902767181, "learning_rate": 3.923271207991226e-05, "loss": 0.3989, "step": 45520 }, { "epoch": 1.640717915450319, "grad_norm": 0.20534752309322357, "learning_rate": 3.923031291528517e-05, "loss": 0.4011, "step": 45525 }, { "epoch": 1.6408981151115436, "grad_norm": 0.16648423671722412, "learning_rate": 3.922791355677282e-05, "loss": 0.4435, "step": 45530 }, { "epoch": 1.641078314772768, "grad_norm": 0.20146596431732178, "learning_rate": 3.9225514004407916e-05, "loss": 0.4431, "step": 45535 }, { "epoch": 1.6412585144339928, "grad_norm": 0.18600507080554962, "learning_rate": 3.922311425822313e-05, "loss": 0.4459, "step": 45540 }, { "epoch": 1.6414387140952176, "grad_norm": 0.19291111826896667, "learning_rate": 3.922071431825116e-05, "loss": 0.3978, "step": 45545 }, { "epoch": 1.641618913756442, "grad_norm": 0.18025104701519012, "learning_rate": 3.921831418452472e-05, "loss": 0.4321, "step": 45550 }, { "epoch": 
1.6417991134176668, "grad_norm": 0.18391737341880798, "learning_rate": 3.921591385707649e-05, "loss": 0.4303, "step": 45555 }, { "epoch": 1.6419793130788913, "grad_norm": 0.23778654634952545, "learning_rate": 3.921351333593919e-05, "loss": 0.4226, "step": 45560 }, { "epoch": 1.642159512740116, "grad_norm": 0.1602877527475357, "learning_rate": 3.921111262114553e-05, "loss": 0.3877, "step": 45565 }, { "epoch": 1.6423397124013408, "grad_norm": 0.17459672689437866, "learning_rate": 3.920871171272821e-05, "loss": 0.4258, "step": 45570 }, { "epoch": 1.6425199120625653, "grad_norm": 0.16200929880142212, "learning_rate": 3.920631061071994e-05, "loss": 0.379, "step": 45575 }, { "epoch": 1.6427001117237898, "grad_norm": 0.2194150686264038, "learning_rate": 3.9203909315153445e-05, "loss": 0.4539, "step": 45580 }, { "epoch": 1.6428803113850146, "grad_norm": 0.15917415916919708, "learning_rate": 3.920150782606142e-05, "loss": 0.4171, "step": 45585 }, { "epoch": 1.6430605110462393, "grad_norm": 0.16585630178451538, "learning_rate": 3.919910614347662e-05, "loss": 0.4325, "step": 45590 }, { "epoch": 1.643240710707464, "grad_norm": 0.1744535267353058, "learning_rate": 3.9196704267431735e-05, "loss": 0.4188, "step": 45595 }, { "epoch": 1.6434209103686885, "grad_norm": 0.17278815805912018, "learning_rate": 3.91943021979595e-05, "loss": 0.4041, "step": 45600 }, { "epoch": 1.643601110029913, "grad_norm": 0.19840632379055023, "learning_rate": 3.919189993509265e-05, "loss": 0.4229, "step": 45605 }, { "epoch": 1.6437813096911378, "grad_norm": 0.178060382604599, "learning_rate": 3.918949747886391e-05, "loss": 0.4469, "step": 45610 }, { "epoch": 1.6439615093523625, "grad_norm": 0.1763332337141037, "learning_rate": 3.918709482930602e-05, "loss": 0.3798, "step": 45615 }, { "epoch": 1.644141709013587, "grad_norm": 0.17714865505695343, "learning_rate": 3.918469198645171e-05, "loss": 0.4069, "step": 45620 }, { "epoch": 1.6443219086748115, "grad_norm": 0.18300695717334747, "learning_rate": 
3.918228895033371e-05, "loss": 0.3987, "step": 45625 }, { "epoch": 1.6445021083360363, "grad_norm": 0.1943272352218628, "learning_rate": 3.9179885720984775e-05, "loss": 0.4106, "step": 45630 }, { "epoch": 1.644682307997261, "grad_norm": 0.20393919944763184, "learning_rate": 3.917748229843764e-05, "loss": 0.4568, "step": 45635 }, { "epoch": 1.6448625076584857, "grad_norm": 0.18924960494041443, "learning_rate": 3.917507868272506e-05, "loss": 0.4561, "step": 45640 }, { "epoch": 1.6450427073197103, "grad_norm": 0.1751289963722229, "learning_rate": 3.9172674873879766e-05, "loss": 0.4143, "step": 45645 }, { "epoch": 1.6452229069809348, "grad_norm": 0.16475655138492584, "learning_rate": 3.917027087193453e-05, "loss": 0.3721, "step": 45650 }, { "epoch": 1.6454031066421595, "grad_norm": 0.2124844789505005, "learning_rate": 3.9167866676922096e-05, "loss": 0.4227, "step": 45655 }, { "epoch": 1.6455833063033842, "grad_norm": 0.17801690101623535, "learning_rate": 3.916546228887521e-05, "loss": 0.4318, "step": 45660 }, { "epoch": 1.6457635059646087, "grad_norm": 0.17565682530403137, "learning_rate": 3.9163057707826654e-05, "loss": 0.3954, "step": 45665 }, { "epoch": 1.6459437056258335, "grad_norm": 0.17757514119148254, "learning_rate": 3.916065293380917e-05, "loss": 0.4042, "step": 45670 }, { "epoch": 1.646123905287058, "grad_norm": 0.20188312232494354, "learning_rate": 3.9158247966855545e-05, "loss": 0.4265, "step": 45675 }, { "epoch": 1.6463041049482827, "grad_norm": 0.19370879232883453, "learning_rate": 3.915584280699853e-05, "loss": 0.4094, "step": 45680 }, { "epoch": 1.6464843046095075, "grad_norm": 0.15388374030590057, "learning_rate": 3.915343745427089e-05, "loss": 0.3697, "step": 45685 }, { "epoch": 1.646664504270732, "grad_norm": 0.19175679981708527, "learning_rate": 3.9151031908705406e-05, "loss": 0.4483, "step": 45690 }, { "epoch": 1.6468447039319565, "grad_norm": 0.19182166457176208, "learning_rate": 3.9148626170334854e-05, "loss": 0.3927, "step": 45695 }, { "epoch": 
1.6470249035931812, "grad_norm": 0.1689678132534027, "learning_rate": 3.914622023919201e-05, "loss": 0.4264, "step": 45700 }, { "epoch": 1.647205103254406, "grad_norm": 0.17200689017772675, "learning_rate": 3.914381411530965e-05, "loss": 0.4504, "step": 45705 }, { "epoch": 1.6473853029156307, "grad_norm": 0.17576947808265686, "learning_rate": 3.914140779872056e-05, "loss": 0.4333, "step": 45710 }, { "epoch": 1.6475655025768552, "grad_norm": 0.18409965932369232, "learning_rate": 3.913900128945753e-05, "loss": 0.3983, "step": 45715 }, { "epoch": 1.6477457022380797, "grad_norm": 0.1899709552526474, "learning_rate": 3.913659458755335e-05, "loss": 0.3916, "step": 45720 }, { "epoch": 1.6479259018993044, "grad_norm": 0.1969635933637619, "learning_rate": 3.9134187693040806e-05, "loss": 0.4051, "step": 45725 }, { "epoch": 1.6481061015605292, "grad_norm": 0.15576127171516418, "learning_rate": 3.913178060595268e-05, "loss": 0.4128, "step": 45730 }, { "epoch": 1.6482863012217537, "grad_norm": 0.17115731537342072, "learning_rate": 3.9129373326321785e-05, "loss": 0.4159, "step": 45735 }, { "epoch": 1.6484665008829782, "grad_norm": 0.1695828139781952, "learning_rate": 3.912696585418092e-05, "loss": 0.4314, "step": 45740 }, { "epoch": 1.648646700544203, "grad_norm": 0.18110521137714386, "learning_rate": 3.912455818956288e-05, "loss": 0.4301, "step": 45745 }, { "epoch": 1.6488269002054277, "grad_norm": 0.15694189071655273, "learning_rate": 3.912215033250046e-05, "loss": 0.3813, "step": 45750 }, { "epoch": 1.6490070998666524, "grad_norm": 0.1641474962234497, "learning_rate": 3.911974228302647e-05, "loss": 0.3951, "step": 45755 }, { "epoch": 1.649187299527877, "grad_norm": 0.20309121906757355, "learning_rate": 3.911733404117375e-05, "loss": 0.4113, "step": 45760 }, { "epoch": 1.6493674991891014, "grad_norm": 0.219468891620636, "learning_rate": 3.9114925606975064e-05, "loss": 0.3907, "step": 45765 }, { "epoch": 1.6495476988503261, "grad_norm": 0.17919921875, "learning_rate": 
3.9112516980463255e-05, "loss": 0.418, "step": 45770 }, { "epoch": 1.6497278985115509, "grad_norm": 0.15530207753181458, "learning_rate": 3.911010816167113e-05, "loss": 0.3502, "step": 45775 }, { "epoch": 1.6499080981727754, "grad_norm": 0.15395016968250275, "learning_rate": 3.910769915063153e-05, "loss": 0.4206, "step": 45780 }, { "epoch": 1.650088297834, "grad_norm": 0.17081965506076813, "learning_rate": 3.910528994737725e-05, "loss": 0.4347, "step": 45785 }, { "epoch": 1.6502684974952246, "grad_norm": 0.2129909098148346, "learning_rate": 3.910288055194112e-05, "loss": 0.4404, "step": 45790 }, { "epoch": 1.6504486971564494, "grad_norm": 0.203140988945961, "learning_rate": 3.910047096435598e-05, "loss": 0.4398, "step": 45795 }, { "epoch": 1.650628896817674, "grad_norm": 0.19998563826084137, "learning_rate": 3.909806118465466e-05, "loss": 0.4061, "step": 45800 }, { "epoch": 1.6508090964788986, "grad_norm": 0.1874360889196396, "learning_rate": 3.909565121286997e-05, "loss": 0.4194, "step": 45805 }, { "epoch": 1.6509892961401231, "grad_norm": 0.19713318347930908, "learning_rate": 3.909324104903477e-05, "loss": 0.4072, "step": 45810 }, { "epoch": 1.6511694958013479, "grad_norm": 0.17209462821483612, "learning_rate": 3.9090830693181885e-05, "loss": 0.417, "step": 45815 }, { "epoch": 1.6513496954625726, "grad_norm": 0.17033915221691132, "learning_rate": 3.9088420145344164e-05, "loss": 0.3954, "step": 45820 }, { "epoch": 1.6515298951237973, "grad_norm": 0.19625675678253174, "learning_rate": 3.9086009405554445e-05, "loss": 0.3812, "step": 45825 }, { "epoch": 1.6517100947850218, "grad_norm": 0.16199907660484314, "learning_rate": 3.908359847384557e-05, "loss": 0.429, "step": 45830 }, { "epoch": 1.6518902944462464, "grad_norm": 0.2132118195295334, "learning_rate": 3.9081187350250406e-05, "loss": 0.4157, "step": 45835 }, { "epoch": 1.652070494107471, "grad_norm": 0.17869247496128082, "learning_rate": 3.9078776034801775e-05, "loss": 0.399, "step": 45840 }, { "epoch": 
1.6522506937686958, "grad_norm": 0.15512903034687042, "learning_rate": 3.907636452753256e-05, "loss": 0.4001, "step": 45845 }, { "epoch": 1.6524308934299203, "grad_norm": 0.15080289542675018, "learning_rate": 3.907395282847559e-05, "loss": 0.4167, "step": 45850 }, { "epoch": 1.6526110930911448, "grad_norm": 0.19971080124378204, "learning_rate": 3.907154093766375e-05, "loss": 0.4297, "step": 45855 }, { "epoch": 1.6527912927523696, "grad_norm": 0.19148339331150055, "learning_rate": 3.9069128855129876e-05, "loss": 0.411, "step": 45860 }, { "epoch": 1.6529714924135943, "grad_norm": 0.15302042663097382, "learning_rate": 3.906671658090686e-05, "loss": 0.4408, "step": 45865 }, { "epoch": 1.653151692074819, "grad_norm": 0.1940101683139801, "learning_rate": 3.9064304115027544e-05, "loss": 0.3945, "step": 45870 }, { "epoch": 1.6533318917360436, "grad_norm": 0.19384680688381195, "learning_rate": 3.9061891457524814e-05, "loss": 0.4195, "step": 45875 }, { "epoch": 1.653512091397268, "grad_norm": 0.2088027447462082, "learning_rate": 3.9059478608431526e-05, "loss": 0.3886, "step": 45880 }, { "epoch": 1.6536922910584928, "grad_norm": 0.16105620563030243, "learning_rate": 3.905706556778057e-05, "loss": 0.4008, "step": 45885 }, { "epoch": 1.6538724907197175, "grad_norm": 0.1838972270488739, "learning_rate": 3.9054652335604814e-05, "loss": 0.428, "step": 45890 }, { "epoch": 1.654052690380942, "grad_norm": 0.18796832859516144, "learning_rate": 3.905223891193715e-05, "loss": 0.4167, "step": 45895 }, { "epoch": 1.6542328900421666, "grad_norm": 0.18928878009319305, "learning_rate": 3.9049825296810436e-05, "loss": 0.3747, "step": 45900 }, { "epoch": 1.6544130897033913, "grad_norm": 0.1923985332250595, "learning_rate": 3.904741149025759e-05, "loss": 0.3859, "step": 45905 }, { "epoch": 1.654593289364616, "grad_norm": 0.2407243549823761, "learning_rate": 3.904499749231147e-05, "loss": 0.3979, "step": 45910 }, { "epoch": 1.6547734890258408, "grad_norm": 0.1961093693971634, "learning_rate": 
3.904258330300498e-05, "loss": 0.4073, "step": 45915 }, { "epoch": 1.6549536886870653, "grad_norm": 0.23786601424217224, "learning_rate": 3.904016892237101e-05, "loss": 0.4582, "step": 45920 }, { "epoch": 1.6551338883482898, "grad_norm": 0.1749258190393448, "learning_rate": 3.903775435044246e-05, "loss": 0.4478, "step": 45925 }, { "epoch": 1.6553140880095145, "grad_norm": 0.16315124928951263, "learning_rate": 3.903533958725223e-05, "loss": 0.3567, "step": 45930 }, { "epoch": 1.6554942876707393, "grad_norm": 0.14763472974300385, "learning_rate": 3.9032924632833205e-05, "loss": 0.3758, "step": 45935 }, { "epoch": 1.6556744873319638, "grad_norm": 0.22486995160579681, "learning_rate": 3.90305094872183e-05, "loss": 0.4554, "step": 45940 }, { "epoch": 1.6558546869931885, "grad_norm": 0.19883042573928833, "learning_rate": 3.902809415044043e-05, "loss": 0.4338, "step": 45945 }, { "epoch": 1.656034886654413, "grad_norm": 0.18573078513145447, "learning_rate": 3.902567862253248e-05, "loss": 0.4199, "step": 45950 }, { "epoch": 1.6562150863156377, "grad_norm": 0.18856869637966156, "learning_rate": 3.902326290352738e-05, "loss": 0.3974, "step": 45955 }, { "epoch": 1.6563952859768625, "grad_norm": 0.18239809572696686, "learning_rate": 3.902084699345804e-05, "loss": 0.433, "step": 45960 }, { "epoch": 1.656575485638087, "grad_norm": 0.23285090923309326, "learning_rate": 3.9018430892357376e-05, "loss": 0.4054, "step": 45965 }, { "epoch": 1.6567556852993115, "grad_norm": 0.16723239421844482, "learning_rate": 3.90160146002583e-05, "loss": 0.4016, "step": 45970 }, { "epoch": 1.6569358849605362, "grad_norm": 0.14857235550880432, "learning_rate": 3.901359811719374e-05, "loss": 0.4149, "step": 45975 }, { "epoch": 1.657116084621761, "grad_norm": 0.1981445550918579, "learning_rate": 3.901118144319662e-05, "loss": 0.403, "step": 45980 }, { "epoch": 1.6572962842829857, "grad_norm": 0.1867840737104416, "learning_rate": 3.9008764578299866e-05, "loss": 0.4106, "step": 45985 }, { "epoch": 
1.6574764839442102, "grad_norm": 0.21596737205982208, "learning_rate": 3.900634752253641e-05, "loss": 0.4186, "step": 45990 }, { "epoch": 1.6576566836054347, "grad_norm": 0.20319004356861115, "learning_rate": 3.900393027593917e-05, "loss": 0.4474, "step": 45995 }, { "epoch": 1.6578368832666595, "grad_norm": 0.16582396626472473, "learning_rate": 3.90015128385411e-05, "loss": 0.4064, "step": 46000 }, { "epoch": 1.6578368832666595, "eval_loss": 0.4423971474170685, "eval_runtime": 3.531, "eval_samples_per_second": 28.32, "eval_steps_per_second": 7.08, "step": 46000 }, { "epoch": 1.6580170829278842, "grad_norm": 0.15383246541023254, "learning_rate": 3.8999095210375124e-05, "loss": 0.3594, "step": 46005 }, { "epoch": 1.6581972825891087, "grad_norm": 0.18127766251564026, "learning_rate": 3.899667739147419e-05, "loss": 0.431, "step": 46010 }, { "epoch": 1.6583774822503332, "grad_norm": 0.19410793483257294, "learning_rate": 3.899425938187123e-05, "loss": 0.3563, "step": 46015 }, { "epoch": 1.658557681911558, "grad_norm": 0.23459197580814362, "learning_rate": 3.89918411815992e-05, "loss": 0.4236, "step": 46020 }, { "epoch": 1.6587378815727827, "grad_norm": 0.1808062195777893, "learning_rate": 3.898942279069104e-05, "loss": 0.3991, "step": 46025 }, { "epoch": 1.6589180812340074, "grad_norm": 0.14679357409477234, "learning_rate": 3.8987004209179715e-05, "loss": 0.3997, "step": 46030 }, { "epoch": 1.659098280895232, "grad_norm": 0.18178507685661316, "learning_rate": 3.898458543709815e-05, "loss": 0.4321, "step": 46035 }, { "epoch": 1.6592784805564564, "grad_norm": 0.20688877999782562, "learning_rate": 3.8982166474479323e-05, "loss": 0.4243, "step": 46040 }, { "epoch": 1.6594586802176812, "grad_norm": 0.21331140398979187, "learning_rate": 3.8979747321356186e-05, "loss": 0.4117, "step": 46045 }, { "epoch": 1.659638879878906, "grad_norm": 0.20621632039546967, "learning_rate": 3.89773279777617e-05, "loss": 0.4367, "step": 46050 }, { "epoch": 1.6598190795401304, "grad_norm": 
0.20351921021938324, "learning_rate": 3.897490844372882e-05, "loss": 0.4259, "step": 46055 }, { "epoch": 1.6599992792013551, "grad_norm": 0.17361485958099365, "learning_rate": 3.897248871929052e-05, "loss": 0.4149, "step": 46060 }, { "epoch": 1.6601794788625797, "grad_norm": 0.1759531944990158, "learning_rate": 3.897006880447977e-05, "loss": 0.3962, "step": 46065 }, { "epoch": 1.6603596785238044, "grad_norm": 0.17056173086166382, "learning_rate": 3.896764869932953e-05, "loss": 0.3861, "step": 46070 }, { "epoch": 1.6605398781850291, "grad_norm": 0.16253900527954102, "learning_rate": 3.8965228403872784e-05, "loss": 0.4117, "step": 46075 }, { "epoch": 1.6607200778462536, "grad_norm": 0.17955611646175385, "learning_rate": 3.8962807918142507e-05, "loss": 0.4292, "step": 46080 }, { "epoch": 1.6609002775074782, "grad_norm": 0.15855169296264648, "learning_rate": 3.896038724217167e-05, "loss": 0.3961, "step": 46085 }, { "epoch": 1.6610804771687029, "grad_norm": 0.192918598651886, "learning_rate": 3.8957966375993266e-05, "loss": 0.3705, "step": 46090 }, { "epoch": 1.6612606768299276, "grad_norm": null, "learning_rate": 3.895602954612325e-05, "loss": 0.399, "step": 46095 }, { "epoch": 1.6614408764911524, "grad_norm": 0.15826788544654846, "learning_rate": 3.895360833765434e-05, "loss": 0.422, "step": 46100 }, { "epoch": 1.6616210761523769, "grad_norm": 0.1578192263841629, "learning_rate": 3.895118693907021e-05, "loss": 0.4362, "step": 46105 }, { "epoch": 1.6618012758136014, "grad_norm": 0.18749912083148956, "learning_rate": 3.8948765350403856e-05, "loss": 0.4229, "step": 46110 }, { "epoch": 1.661981475474826, "grad_norm": 0.2128848135471344, "learning_rate": 3.8946343571688273e-05, "loss": 0.3845, "step": 46115 }, { "epoch": 1.6621616751360508, "grad_norm": 0.19334283471107483, "learning_rate": 3.894392160295647e-05, "loss": 0.4214, "step": 46120 }, { "epoch": 1.6623418747972754, "grad_norm": 0.17643995583057404, "learning_rate": 3.894149944424142e-05, "loss": 0.3705,
"step": 46125 }, { "epoch": 1.6625220744584999, "grad_norm": 0.17659886181354523, "learning_rate": 3.8939077095576154e-05, "loss": 0.3882, "step": 46130 }, { "epoch": 1.6627022741197246, "grad_norm": 0.1546994149684906, "learning_rate": 3.893665455699366e-05, "loss": 0.3932, "step": 46135 }, { "epoch": 1.6628824737809493, "grad_norm": 0.18664954602718353, "learning_rate": 3.893423182852694e-05, "loss": 0.43, "step": 46140 }, { "epoch": 1.663062673442174, "grad_norm": 0.2018563598394394, "learning_rate": 3.893180891020901e-05, "loss": 0.4061, "step": 46145 }, { "epoch": 1.6632428731033986, "grad_norm": 0.17747381329536438, "learning_rate": 3.8929385802072885e-05, "loss": 0.3716, "step": 46150 }, { "epoch": 1.663423072764623, "grad_norm": 0.1724482625722885, "learning_rate": 3.892696250415156e-05, "loss": 0.4176, "step": 46155 }, { "epoch": 1.6636032724258478, "grad_norm": 0.18025627732276917, "learning_rate": 3.892453901647809e-05, "loss": 0.4606, "step": 46160 }, { "epoch": 1.6637834720870726, "grad_norm": 0.18692156672477722, "learning_rate": 3.8922115339085455e-05, "loss": 0.3861, "step": 46165 }, { "epoch": 1.663963671748297, "grad_norm": 0.16012603044509888, "learning_rate": 3.89196914720067e-05, "loss": 0.3892, "step": 46170 }, { "epoch": 1.6641438714095218, "grad_norm": 0.20383629202842712, "learning_rate": 3.891726741527484e-05, "loss": 0.4128, "step": 46175 }, { "epoch": 1.6643240710707463, "grad_norm": 0.2038664072751999, "learning_rate": 3.891484316892291e-05, "loss": 0.4766, "step": 46180 }, { "epoch": 1.664504270731971, "grad_norm": 0.18376675248146057, "learning_rate": 3.891241873298394e-05, "loss": 0.3883, "step": 46185 }, { "epoch": 1.6646844703931958, "grad_norm": 0.17446856200695038, "learning_rate": 3.890999410749095e-05, "loss": 0.4284, "step": 46190 }, { "epoch": 1.6648646700544203, "grad_norm": 0.20632441341876984, "learning_rate": 3.890756929247699e-05, "loss": 0.4319, "step": 46195 }, { "epoch": 1.6650448697156448, "grad_norm": 
0.1419346034526825, "learning_rate": 3.890514428797508e-05, "loss": 0.384, "step": 46200 }, { "epoch": 1.6652250693768695, "grad_norm": 0.17198680341243744, "learning_rate": 3.890271909401828e-05, "loss": 0.4073, "step": 46205 }, { "epoch": 1.6654052690380943, "grad_norm": 0.17511259019374847, "learning_rate": 3.890029371063962e-05, "loss": 0.4082, "step": 46210 }, { "epoch": 1.665585468699319, "grad_norm": 0.16610261797904968, "learning_rate": 3.8897868137872154e-05, "loss": 0.4148, "step": 46215 }, { "epoch": 1.6657656683605435, "grad_norm": 0.15712276101112366, "learning_rate": 3.889544237574893e-05, "loss": 0.3822, "step": 46220 }, { "epoch": 1.665945868021768, "grad_norm": 0.15209558606147766, "learning_rate": 3.889301642430299e-05, "loss": 0.4053, "step": 46225 }, { "epoch": 1.6661260676829928, "grad_norm": 0.1979793757200241, "learning_rate": 3.889059028356738e-05, "loss": 0.4097, "step": 46230 }, { "epoch": 1.6663062673442175, "grad_norm": 0.17566531896591187, "learning_rate": 3.8888163953575174e-05, "loss": 0.3878, "step": 46235 }, { "epoch": 1.666486467005442, "grad_norm": 0.23031200468540192, "learning_rate": 3.8885737434359424e-05, "loss": 0.4245, "step": 46240 }, { "epoch": 1.6666666666666665, "grad_norm": 0.16964511573314667, "learning_rate": 3.8883310725953194e-05, "loss": 0.3682, "step": 46245 }, { "epoch": 1.6668468663278913, "grad_norm": 0.20693713426589966, "learning_rate": 3.888088382838954e-05, "loss": 0.4083, "step": 46250 }, { "epoch": 1.667027065989116, "grad_norm": 0.18024256825447083, "learning_rate": 3.8878456741701524e-05, "loss": 0.4068, "step": 46255 }, { "epoch": 1.6672072656503407, "grad_norm": 0.1766664981842041, "learning_rate": 3.887602946592223e-05, "loss": 0.3993, "step": 46260 }, { "epoch": 1.6673874653115652, "grad_norm": 0.18678483366966248, "learning_rate": 3.8873602001084716e-05, "loss": 0.4381, "step": 46265 }, { "epoch": 1.6675676649727897, "grad_norm": 0.13964805006980896, "learning_rate": 3.887117434722206e-05, "loss": 
0.4004, "step": 46270 }, { "epoch": 1.6677478646340145, "grad_norm": 0.15428251028060913, "learning_rate": 3.886874650436735e-05, "loss": 0.3547, "step": 46275 }, { "epoch": 1.6679280642952392, "grad_norm": 0.1429775059223175, "learning_rate": 3.8866318472553644e-05, "loss": 0.394, "step": 46280 }, { "epoch": 1.6681082639564637, "grad_norm": 0.16724984347820282, "learning_rate": 3.886389025181404e-05, "loss": 0.3977, "step": 46285 }, { "epoch": 1.6682884636176882, "grad_norm": 0.1835627257823944, "learning_rate": 3.886146184218161e-05, "loss": 0.4373, "step": 46290 }, { "epoch": 1.668468663278913, "grad_norm": 0.17956386506557465, "learning_rate": 3.8859033243689446e-05, "loss": 0.4227, "step": 46295 }, { "epoch": 1.6686488629401377, "grad_norm": 0.19547246396541595, "learning_rate": 3.885660445637064e-05, "loss": 0.383, "step": 46300 }, { "epoch": 1.6688290626013624, "grad_norm": 0.2311650663614273, "learning_rate": 3.885417548025828e-05, "loss": 0.3988, "step": 46305 }, { "epoch": 1.669009262262587, "grad_norm": 0.15516655147075653, "learning_rate": 3.885174631538546e-05, "loss": 0.4264, "step": 46310 }, { "epoch": 1.6691894619238115, "grad_norm": 0.19897285103797913, "learning_rate": 3.8849316961785276e-05, "loss": 0.4051, "step": 46315 }, { "epoch": 1.6693696615850362, "grad_norm": 0.22610118985176086, "learning_rate": 3.884688741949084e-05, "loss": 0.4286, "step": 46320 }, { "epoch": 1.669549861246261, "grad_norm": 0.16517026722431183, "learning_rate": 3.884445768853524e-05, "loss": 0.4167, "step": 46325 }, { "epoch": 1.6697300609074857, "grad_norm": 0.21267640590667725, "learning_rate": 3.884202776895158e-05, "loss": 0.3828, "step": 46330 }, { "epoch": 1.6699102605687102, "grad_norm": 0.20657142996788025, "learning_rate": 3.883959766077297e-05, "loss": 0.4069, "step": 46335 }, { "epoch": 1.6700904602299347, "grad_norm": 0.16115951538085938, "learning_rate": 3.8837167364032526e-05, "loss": 0.4248, "step": 46340 }, { "epoch": 1.6702706598911594, "grad_norm": 
0.2205209583044052, "learning_rate": 3.883473687876336e-05, "loss": 0.4162, "step": 46345 }, { "epoch": 1.6704508595523841, "grad_norm": 0.18244054913520813, "learning_rate": 3.883230620499857e-05, "loss": 0.4127, "step": 46350 }, { "epoch": 1.6706310592136087, "grad_norm": 0.14126257598400116, "learning_rate": 3.8829875342771287e-05, "loss": 0.4314, "step": 46355 }, { "epoch": 1.6708112588748332, "grad_norm": 0.1783919334411621, "learning_rate": 3.8827444292114634e-05, "loss": 0.4063, "step": 46360 }, { "epoch": 1.670991458536058, "grad_norm": 0.1652853786945343, "learning_rate": 3.882501305306174e-05, "loss": 0.4104, "step": 46365 }, { "epoch": 1.6711716581972826, "grad_norm": 0.16857433319091797, "learning_rate": 3.8822581625645706e-05, "loss": 0.3828, "step": 46370 }, { "epoch": 1.6713518578585074, "grad_norm": 0.2388414889574051, "learning_rate": 3.882015000989968e-05, "loss": 0.4411, "step": 46375 }, { "epoch": 1.6715320575197319, "grad_norm": 0.16864396631717682, "learning_rate": 3.881771820585678e-05, "loss": 0.4467, "step": 46380 }, { "epoch": 1.6717122571809564, "grad_norm": 0.1579165756702423, "learning_rate": 3.881528621355015e-05, "loss": 0.4215, "step": 46385 }, { "epoch": 1.6718924568421811, "grad_norm": 0.17890191078186035, "learning_rate": 3.8812854033012916e-05, "loss": 0.4252, "step": 46390 }, { "epoch": 1.6720726565034059, "grad_norm": 0.1696099191904068, "learning_rate": 3.881042166427823e-05, "loss": 0.3783, "step": 46395 }, { "epoch": 1.6722528561646304, "grad_norm": 0.13848702609539032, "learning_rate": 3.880798910737921e-05, "loss": 0.3871, "step": 46400 }, { "epoch": 1.6724330558258549, "grad_norm": 0.17733387649059296, "learning_rate": 3.880555636234902e-05, "loss": 0.4035, "step": 46405 }, { "epoch": 1.6726132554870796, "grad_norm": 0.21084022521972656, "learning_rate": 3.880312342922079e-05, "loss": 0.4408, "step": 46410 }, { "epoch": 1.6727934551483044, "grad_norm": 0.21797522902488708, "learning_rate": 3.880069030802768e-05, "loss": 
0.4113, "step": 46415 }, { "epoch": 1.672973654809529, "grad_norm": 0.19593912363052368, "learning_rate": 3.879825699880284e-05, "loss": 0.4092, "step": 46420 }, { "epoch": 1.6731538544707536, "grad_norm": 0.2192002683877945, "learning_rate": 3.879582350157942e-05, "loss": 0.4616, "step": 46425 }, { "epoch": 1.673334054131978, "grad_norm": 0.1574559360742569, "learning_rate": 3.879338981639057e-05, "loss": 0.3955, "step": 46430 }, { "epoch": 1.6735142537932028, "grad_norm": 0.20337316393852234, "learning_rate": 3.8790955943269455e-05, "loss": 0.4303, "step": 46435 }, { "epoch": 1.6736944534544276, "grad_norm": 0.15265347063541412, "learning_rate": 3.878852188224924e-05, "loss": 0.4054, "step": 46440 }, { "epoch": 1.673874653115652, "grad_norm": 0.1718795895576477, "learning_rate": 3.8786087633363075e-05, "loss": 0.3766, "step": 46445 }, { "epoch": 1.6740548527768768, "grad_norm": 0.18566109240055084, "learning_rate": 3.8783653196644144e-05, "loss": 0.3856, "step": 46450 }, { "epoch": 1.6742350524381013, "grad_norm": 0.18366526067256927, "learning_rate": 3.87812185721256e-05, "loss": 0.4811, "step": 46455 }, { "epoch": 1.674415252099326, "grad_norm": 0.1583663374185562, "learning_rate": 3.8778783759840625e-05, "loss": 0.437, "step": 46460 }, { "epoch": 1.6745954517605508, "grad_norm": 0.16400283575057983, "learning_rate": 3.877634875982239e-05, "loss": 0.397, "step": 46465 }, { "epoch": 1.6747756514217753, "grad_norm": 0.18252485990524292, "learning_rate": 3.877391357210407e-05, "loss": 0.431, "step": 46470 }, { "epoch": 1.6749558510829998, "grad_norm": 0.22396881878376007, "learning_rate": 3.877147819671884e-05, "loss": 0.424, "step": 46475 }, { "epoch": 1.6751360507442246, "grad_norm": 0.16650210320949554, "learning_rate": 3.876904263369989e-05, "loss": 0.399, "step": 46480 }, { "epoch": 1.6753162504054493, "grad_norm": 0.17203296720981598, "learning_rate": 3.87666068830804e-05, "loss": 0.4241, "step": 46485 }, { "epoch": 1.675496450066674, "grad_norm": 
0.21028588712215424, "learning_rate": 3.876417094489355e-05, "loss": 0.3976, "step": 46490 }, { "epoch": 1.6756766497278985, "grad_norm": 0.20283226668834686, "learning_rate": 3.876173481917255e-05, "loss": 0.4196, "step": 46495 }, { "epoch": 1.675856849389123, "grad_norm": 0.1598893254995346, "learning_rate": 3.875929850595056e-05, "loss": 0.3846, "step": 46500 }, { "epoch": 1.675856849389123, "eval_loss": 0.44327858090400696, "eval_runtime": 3.5351, "eval_samples_per_second": 28.288, "eval_steps_per_second": 7.072, "step": 46500 }, { "epoch": 1.6760370490503478, "grad_norm": 0.18217889964580536, "learning_rate": 3.8756862005260804e-05, "loss": 0.4492, "step": 46505 }, { "epoch": 1.6762172487115725, "grad_norm": 0.19530221819877625, "learning_rate": 3.8754425317136465e-05, "loss": 0.4021, "step": 46510 }, { "epoch": 1.676397448372797, "grad_norm": 0.21097613871097565, "learning_rate": 3.875198844161074e-05, "loss": 0.4219, "step": 46515 }, { "epoch": 1.6765776480340215, "grad_norm": 0.14871633052825928, "learning_rate": 3.874955137871684e-05, "loss": 0.3881, "step": 46520 }, { "epoch": 1.6767578476952463, "grad_norm": 0.1719123125076294, "learning_rate": 3.874711412848796e-05, "loss": 0.4257, "step": 46525 }, { "epoch": 1.676938047356471, "grad_norm": 0.2002047300338745, "learning_rate": 3.874467669095731e-05, "loss": 0.3852, "step": 46530 }, { "epoch": 1.6771182470176957, "grad_norm": 0.18990401923656464, "learning_rate": 3.87422390661581e-05, "loss": 0.4062, "step": 46535 }, { "epoch": 1.6772984466789203, "grad_norm": 0.2010718584060669, "learning_rate": 3.873980125412355e-05, "loss": 0.4468, "step": 46540 }, { "epoch": 1.6774786463401448, "grad_norm": 0.20043112337589264, "learning_rate": 3.873736325488687e-05, "loss": 0.4265, "step": 46545 }, { "epoch": 1.6776588460013695, "grad_norm": 0.208219975233078, "learning_rate": 3.873492506848127e-05, "loss": 0.4229, "step": 46550 }, { "epoch": 1.6778390456625942, "grad_norm": 0.1497962474822998, "learning_rate": 
3.873248669493997e-05, "loss": 0.3987, "step": 46555 }, { "epoch": 1.6780192453238187, "grad_norm": 0.1851639449596405, "learning_rate": 3.87300481342962e-05, "loss": 0.4378, "step": 46560 }, { "epoch": 1.6781994449850435, "grad_norm": 0.18497171998023987, "learning_rate": 3.8727609386583184e-05, "loss": 0.3855, "step": 46565 }, { "epoch": 1.678379644646268, "grad_norm": 0.19032762944698334, "learning_rate": 3.872517045183416e-05, "loss": 0.4086, "step": 46570 }, { "epoch": 1.6785598443074927, "grad_norm": 0.1807955503463745, "learning_rate": 3.872273133008233e-05, "loss": 0.4104, "step": 46575 }, { "epoch": 1.6787400439687175, "grad_norm": 0.18649344146251678, "learning_rate": 3.872029202136095e-05, "loss": 0.4241, "step": 46580 }, { "epoch": 1.678920243629942, "grad_norm": 0.17632882297039032, "learning_rate": 3.8717852525703246e-05, "loss": 0.4656, "step": 46585 }, { "epoch": 1.6791004432911665, "grad_norm": 0.1870754510164261, "learning_rate": 3.871541284314245e-05, "loss": 0.3766, "step": 46590 }, { "epoch": 1.6792806429523912, "grad_norm": 0.18941162526607513, "learning_rate": 3.871297297371182e-05, "loss": 0.4175, "step": 46595 }, { "epoch": 1.679460842613616, "grad_norm": 0.17592564225196838, "learning_rate": 3.871053291744459e-05, "loss": 0.3781, "step": 46600 }, { "epoch": 1.6796410422748407, "grad_norm": 0.17591671645641327, "learning_rate": 3.870809267437398e-05, "loss": 0.3759, "step": 46605 }, { "epoch": 1.6798212419360652, "grad_norm": 0.17050381004810333, "learning_rate": 3.870565224453329e-05, "loss": 0.3951, "step": 46610 }, { "epoch": 1.6800014415972897, "grad_norm": 0.20477713644504547, "learning_rate": 3.870321162795573e-05, "loss": 0.4295, "step": 46615 }, { "epoch": 1.6801816412585144, "grad_norm": 0.20674197375774384, "learning_rate": 3.870077082467456e-05, "loss": 0.4307, "step": 46620 }, { "epoch": 1.6803618409197392, "grad_norm": 0.1541663408279419, "learning_rate": 3.8698329834723046e-05, "loss": 0.4447, "step": 46625 }, { "epoch": 
1.6805420405809637, "grad_norm": 0.1866021603345871, "learning_rate": 3.8695888658134446e-05, "loss": 0.3986, "step": 46630 }, { "epoch": 1.6807222402421882, "grad_norm": 0.17100781202316284, "learning_rate": 3.8693447294942e-05, "loss": 0.4129, "step": 46635 }, { "epoch": 1.680902439903413, "grad_norm": 0.16829904913902283, "learning_rate": 3.8691005745179e-05, "loss": 0.436, "step": 46640 }, { "epoch": 1.6810826395646377, "grad_norm": 0.17675045132637024, "learning_rate": 3.868856400887868e-05, "loss": 0.4185, "step": 46645 }, { "epoch": 1.6812628392258624, "grad_norm": 0.1742866337299347, "learning_rate": 3.868612208607434e-05, "loss": 0.4504, "step": 46650 }, { "epoch": 1.681443038887087, "grad_norm": 0.21476224064826965, "learning_rate": 3.8683679976799235e-05, "loss": 0.4158, "step": 46655 }, { "epoch": 1.6816232385483114, "grad_norm": 0.19867900013923645, "learning_rate": 3.868123768108664e-05, "loss": 0.3975, "step": 46660 }, { "epoch": 1.6818034382095362, "grad_norm": 0.15411974489688873, "learning_rate": 3.867879519896983e-05, "loss": 0.4003, "step": 46665 }, { "epoch": 1.6819836378707609, "grad_norm": 0.20566970109939575, "learning_rate": 3.8676352530482074e-05, "loss": 0.4074, "step": 46670 }, { "epoch": 1.6821638375319854, "grad_norm": 0.17975671589374542, "learning_rate": 3.8673909675656675e-05, "loss": 0.4181, "step": 46675 }, { "epoch": 1.6823440371932101, "grad_norm": 0.16022039949893951, "learning_rate": 3.86714666345269e-05, "loss": 0.4104, "step": 46680 }, { "epoch": 1.6825242368544346, "grad_norm": 0.22043001651763916, "learning_rate": 3.8669023407126035e-05, "loss": 0.4131, "step": 46685 }, { "epoch": 1.6827044365156594, "grad_norm": 0.14950603246688843, "learning_rate": 3.866657999348737e-05, "loss": 0.3596, "step": 46690 }, { "epoch": 1.682884636176884, "grad_norm": 0.1892220824956894, "learning_rate": 3.866413639364421e-05, "loss": 0.3928, "step": 46695 }, { "epoch": 1.6830648358381086, "grad_norm": 0.15742996335029602, "learning_rate": 
3.8661692607629826e-05, "loss": 0.41, "step": 46700 }, { "epoch": 1.6832450354993331, "grad_norm": 0.18481925129890442, "learning_rate": 3.865924863547753e-05, "loss": 0.4061, "step": 46705 }, { "epoch": 1.6834252351605579, "grad_norm": 0.2112767994403839, "learning_rate": 3.865680447722062e-05, "loss": 0.4467, "step": 46710 }, { "epoch": 1.6836054348217826, "grad_norm": 0.19676971435546875, "learning_rate": 3.865436013289239e-05, "loss": 0.4542, "step": 46715 }, { "epoch": 1.6837856344830073, "grad_norm": 0.18890570104122162, "learning_rate": 3.865191560252614e-05, "loss": 0.4311, "step": 46720 }, { "epoch": 1.6839658341442318, "grad_norm": 0.17850306630134583, "learning_rate": 3.864947088615519e-05, "loss": 0.424, "step": 46725 }, { "epoch": 1.6841460338054564, "grad_norm": 0.13823658227920532, "learning_rate": 3.8647025983812844e-05, "loss": 0.4182, "step": 46730 }, { "epoch": 1.684326233466681, "grad_norm": 0.17909295856952667, "learning_rate": 3.864458089553241e-05, "loss": 0.4016, "step": 46735 }, { "epoch": 1.6845064331279058, "grad_norm": 0.20117956399917603, "learning_rate": 3.8642135621347195e-05, "loss": 0.3977, "step": 46740 }, { "epoch": 1.6846866327891303, "grad_norm": 0.19415679574012756, "learning_rate": 3.863969016129053e-05, "loss": 0.3985, "step": 46745 }, { "epoch": 1.6848668324503548, "grad_norm": 0.2219560146331787, "learning_rate": 3.8637244515395734e-05, "loss": 0.4205, "step": 46750 }, { "epoch": 1.6850470321115796, "grad_norm": 0.1750744730234146, "learning_rate": 3.8634798683696114e-05, "loss": 0.428, "step": 46755 }, { "epoch": 1.6852272317728043, "grad_norm": 0.19463662803173065, "learning_rate": 3.8632352666225005e-05, "loss": 0.4025, "step": 46760 }, { "epoch": 1.685407431434029, "grad_norm": 0.18877053260803223, "learning_rate": 3.862990646301572e-05, "loss": 0.4338, "step": 46765 }, { "epoch": 1.6855876310952536, "grad_norm": 0.17238645255565643, "learning_rate": 3.8627460074101606e-05, "loss": 0.4193, "step": 46770 }, { "epoch": 
1.685767830756478, "grad_norm": 0.20663443207740784, "learning_rate": 3.862501349951599e-05, "loss": 0.4149, "step": 46775 }, { "epoch": 1.6859480304177028, "grad_norm": 0.17570826411247253, "learning_rate": 3.86225667392922e-05, "loss": 0.4042, "step": 46780 }, { "epoch": 1.6861282300789275, "grad_norm": 0.1705494225025177, "learning_rate": 3.8620119793463573e-05, "loss": 0.432, "step": 46785 }, { "epoch": 1.686308429740152, "grad_norm": 0.18460239470005035, "learning_rate": 3.861767266206345e-05, "loss": 0.3972, "step": 46790 }, { "epoch": 1.6864886294013766, "grad_norm": 0.1540331393480301, "learning_rate": 3.861522534512518e-05, "loss": 0.4156, "step": 46795 }, { "epoch": 1.6866688290626013, "grad_norm": 0.2118932008743286, "learning_rate": 3.861277784268209e-05, "loss": 0.4159, "step": 46800 }, { "epoch": 1.686849028723826, "grad_norm": 0.18069863319396973, "learning_rate": 3.861033015476755e-05, "loss": 0.4068, "step": 46805 }, { "epoch": 1.6870292283850508, "grad_norm": 0.2565334439277649, "learning_rate": 3.860788228141489e-05, "loss": 0.4314, "step": 46810 }, { "epoch": 1.6872094280462753, "grad_norm": 0.21841110289096832, "learning_rate": 3.8605434222657465e-05, "loss": 0.4269, "step": 46815 }, { "epoch": 1.6873896277074998, "grad_norm": 0.1737854778766632, "learning_rate": 3.860298597852864e-05, "loss": 0.426, "step": 46820 }, { "epoch": 1.6875698273687245, "grad_norm": 0.17643529176712036, "learning_rate": 3.860053754906176e-05, "loss": 0.4376, "step": 46825 }, { "epoch": 1.6877500270299493, "grad_norm": 0.18308429419994354, "learning_rate": 3.859808893429019e-05, "loss": 0.4082, "step": 46830 }, { "epoch": 1.687930226691174, "grad_norm": 0.1854483038187027, "learning_rate": 3.859564013424729e-05, "loss": 0.4091, "step": 46835 }, { "epoch": 1.6881104263523985, "grad_norm": 0.2083449810743332, "learning_rate": 3.859319114896643e-05, "loss": 0.4431, "step": 46840 }, { "epoch": 1.688290626013623, "grad_norm": 0.15931734442710876, "learning_rate": 
3.859074197848097e-05, "loss": 0.3969, "step": 46845 }, { "epoch": 1.6884708256748477, "grad_norm": 0.20284508168697357, "learning_rate": 3.858829262282428e-05, "loss": 0.4014, "step": 46850 }, { "epoch": 1.6886510253360725, "grad_norm": 0.1814059317111969, "learning_rate": 3.858584308202973e-05, "loss": 0.4241, "step": 46855 }, { "epoch": 1.688831224997297, "grad_norm": 0.24571998417377472, "learning_rate": 3.858339335613071e-05, "loss": 0.4337, "step": 46860 }, { "epoch": 1.6890114246585215, "grad_norm": 0.19344958662986755, "learning_rate": 3.858094344516058e-05, "loss": 0.4392, "step": 46865 }, { "epoch": 1.6891916243197462, "grad_norm": 0.1718379110097885, "learning_rate": 3.8578493349152725e-05, "loss": 0.3957, "step": 46870 }, { "epoch": 1.689371823980971, "grad_norm": 0.18230155110359192, "learning_rate": 3.857604306814052e-05, "loss": 0.4316, "step": 46875 }, { "epoch": 1.6895520236421957, "grad_norm": 0.2310900092124939, "learning_rate": 3.857359260215736e-05, "loss": 0.4239, "step": 46880 }, { "epoch": 1.6897322233034202, "grad_norm": 0.14590436220169067, "learning_rate": 3.8571141951236634e-05, "loss": 0.3717, "step": 46885 }, { "epoch": 1.6899124229646447, "grad_norm": 0.20509690046310425, "learning_rate": 3.8568691115411726e-05, "loss": 0.4504, "step": 46890 }, { "epoch": 1.6900926226258695, "grad_norm": 0.22596745193004608, "learning_rate": 3.856624009471602e-05, "loss": 0.4144, "step": 46895 }, { "epoch": 1.6902728222870942, "grad_norm": 0.21356570720672607, "learning_rate": 3.8563788889182925e-05, "loss": 0.4174, "step": 46900 }, { "epoch": 1.6904530219483187, "grad_norm": 0.16195428371429443, "learning_rate": 3.856133749884584e-05, "loss": 0.4089, "step": 46905 }, { "epoch": 1.6906332216095432, "grad_norm": 0.1678587794303894, "learning_rate": 3.8558885923738144e-05, "loss": 0.4158, "step": 46910 }, { "epoch": 1.690813421270768, "grad_norm": 0.16259028017520905, "learning_rate": 3.8556434163893254e-05, "loss": 0.3828, "step": 46915 }, { "epoch": 
1.6909936209319927, "grad_norm": 0.16952233016490936, "learning_rate": 3.8553982219344584e-05, "loss": 0.4048, "step": 46920 }, { "epoch": 1.6911738205932174, "grad_norm": 0.20961052179336548, "learning_rate": 3.855153009012552e-05, "loss": 0.4091, "step": 46925 }, { "epoch": 1.691354020254442, "grad_norm": 0.17797677218914032, "learning_rate": 3.854907777626948e-05, "loss": 0.4102, "step": 46930 }, { "epoch": 1.6915342199156664, "grad_norm": 0.23650279641151428, "learning_rate": 3.854662527780989e-05, "loss": 0.4334, "step": 46935 }, { "epoch": 1.6917144195768912, "grad_norm": 0.1639034003019333, "learning_rate": 3.8544172594780145e-05, "loss": 0.3668, "step": 46940 }, { "epoch": 1.691894619238116, "grad_norm": 0.14335285127162933, "learning_rate": 3.8541719727213675e-05, "loss": 0.38, "step": 46945 }, { "epoch": 1.6920748188993404, "grad_norm": 0.2120550274848938, "learning_rate": 3.853926667514389e-05, "loss": 0.4064, "step": 46950 }, { "epoch": 1.6922550185605651, "grad_norm": 0.1797804832458496, "learning_rate": 3.853681343860423e-05, "loss": 0.4465, "step": 46955 }, { "epoch": 1.6924352182217897, "grad_norm": 0.17130543291568756, "learning_rate": 3.8534360017628096e-05, "loss": 0.3994, "step": 46960 }, { "epoch": 1.6926154178830144, "grad_norm": 0.17090466618537903, "learning_rate": 3.853190641224893e-05, "loss": 0.4236, "step": 46965 }, { "epoch": 1.6927956175442391, "grad_norm": 0.1600097268819809, "learning_rate": 3.852945262250016e-05, "loss": 0.4157, "step": 46970 }, { "epoch": 1.6929758172054636, "grad_norm": 0.16842542588710785, "learning_rate": 3.8526998648415214e-05, "loss": 0.4017, "step": 46975 }, { "epoch": 1.6931560168666882, "grad_norm": 0.2014317363500595, "learning_rate": 3.8524544490027534e-05, "loss": 0.4108, "step": 46980 }, { "epoch": 1.6933362165279129, "grad_norm": 0.18398632109165192, "learning_rate": 3.852209014737055e-05, "loss": 0.3708, "step": 46985 }, { "epoch": 1.6935164161891376, "grad_norm": 0.217530757188797, "learning_rate": 
3.8519635620477714e-05, "loss": 0.4193, "step": 46990 }, { "epoch": 1.6936966158503624, "grad_norm": 0.15751570463180542, "learning_rate": 3.851718090938245e-05, "loss": 0.4189, "step": 46995 }, { "epoch": 1.6938768155115869, "grad_norm": 0.21628709137439728, "learning_rate": 3.851472601411822e-05, "loss": 0.4166, "step": 47000 }, { "epoch": 1.6938768155115869, "eval_loss": 0.44244661927223206, "eval_runtime": 3.5403, "eval_samples_per_second": 28.246, "eval_steps_per_second": 7.062, "step": 47000 }, { "epoch": 1.6940570151728114, "grad_norm": 0.19788464903831482, "learning_rate": 3.851227093471847e-05, "loss": 0.424, "step": 47005 }, { "epoch": 1.694237214834036, "grad_norm": 0.1924174278974533, "learning_rate": 3.850981567121663e-05, "loss": 0.4527, "step": 47010 }, { "epoch": 1.6944174144952608, "grad_norm": 0.16324442625045776, "learning_rate": 3.850736022364617e-05, "loss": 0.4156, "step": 47015 }, { "epoch": 1.6945976141564854, "grad_norm": 0.23435580730438232, "learning_rate": 3.8504904592040545e-05, "loss": 0.4468, "step": 47020 }, { "epoch": 1.6947778138177099, "grad_norm": 0.15442748367786407, "learning_rate": 3.8502448776433216e-05, "loss": 0.3801, "step": 47025 }, { "epoch": 1.6949580134789346, "grad_norm": 0.18875345587730408, "learning_rate": 3.849999277685763e-05, "loss": 0.3887, "step": 47030 }, { "epoch": 1.6951382131401593, "grad_norm": 0.18448412418365479, "learning_rate": 3.849753659334725e-05, "loss": 0.4143, "step": 47035 }, { "epoch": 1.695318412801384, "grad_norm": 0.20887787640094757, "learning_rate": 3.849508022593556e-05, "loss": 0.4555, "step": 47040 }, { "epoch": 1.6954986124626086, "grad_norm": 0.16566239297389984, "learning_rate": 3.849262367465601e-05, "loss": 0.3988, "step": 47045 }, { "epoch": 1.695678812123833, "grad_norm": 0.2598322033882141, "learning_rate": 3.849016693954207e-05, "loss": 0.431, "step": 47050 }, { "epoch": 1.6958590117850578, "grad_norm": 0.1612170934677124, "learning_rate": 3.848771002062722e-05, "loss": 
0.3811, "step": 47055 }, { "epoch": 1.6960392114462826, "grad_norm": 0.18904070556163788, "learning_rate": 3.848525291794494e-05, "loss": 0.4372, "step": 47060 }, { "epoch": 1.696219411107507, "grad_norm": 0.1836709976196289, "learning_rate": 3.848279563152869e-05, "loss": 0.3892, "step": 47065 }, { "epoch": 1.6963996107687318, "grad_norm": 0.18471242487430573, "learning_rate": 3.848033816141196e-05, "loss": 0.3994, "step": 47070 }, { "epoch": 1.6965798104299563, "grad_norm": 0.18308402597904205, "learning_rate": 3.847788050762824e-05, "loss": 0.4139, "step": 47075 }, { "epoch": 1.696760010091181, "grad_norm": 0.1916336864233017, "learning_rate": 3.8475422670211e-05, "loss": 0.4577, "step": 47080 }, { "epoch": 1.6969402097524058, "grad_norm": 0.17841115593910217, "learning_rate": 3.8472964649193736e-05, "loss": 0.4185, "step": 47085 }, { "epoch": 1.6971204094136303, "grad_norm": 0.16689583659172058, "learning_rate": 3.8470506444609946e-05, "loss": 0.3826, "step": 47090 }, { "epoch": 1.6973006090748548, "grad_norm": 0.21197299659252167, "learning_rate": 3.84680480564931e-05, "loss": 0.4296, "step": 47095 }, { "epoch": 1.6974808087360795, "grad_norm": 0.14996369183063507, "learning_rate": 3.8465589484876716e-05, "loss": 0.4002, "step": 47100 }, { "epoch": 1.6976610083973043, "grad_norm": 0.15397225320339203, "learning_rate": 3.846313072979428e-05, "loss": 0.3986, "step": 47105 }, { "epoch": 1.697841208058529, "grad_norm": 0.1733284294605255, "learning_rate": 3.846067179127929e-05, "loss": 0.4098, "step": 47110 }, { "epoch": 1.6980214077197535, "grad_norm": 0.16923348605632782, "learning_rate": 3.8458212669365256e-05, "loss": 0.3978, "step": 47115 }, { "epoch": 1.698201607380978, "grad_norm": 0.16483382880687714, "learning_rate": 3.845575336408568e-05, "loss": 0.4234, "step": 47120 }, { "epoch": 1.6983818070422028, "grad_norm": 0.18694746494293213, "learning_rate": 3.845329387547407e-05, "loss": 0.4383, "step": 47125 }, { "epoch": 1.6985620067034275, "grad_norm": 
0.20622876286506653, "learning_rate": 3.845083420356393e-05, "loss": 0.4094, "step": 47130 }, { "epoch": 1.698742206364652, "grad_norm": 0.177239328622818, "learning_rate": 3.8448374348388796e-05, "loss": 0.4394, "step": 47135 }, { "epoch": 1.6989224060258765, "grad_norm": 0.21104155480861664, "learning_rate": 3.8445914309982145e-05, "loss": 0.4234, "step": 47140 }, { "epoch": 1.6991026056871013, "grad_norm": 0.19302360713481903, "learning_rate": 3.844345408837753e-05, "loss": 0.3983, "step": 47145 }, { "epoch": 1.699282805348326, "grad_norm": 0.18512195348739624, "learning_rate": 3.844099368360845e-05, "loss": 0.4173, "step": 47150 }, { "epoch": 1.6994630050095507, "grad_norm": 0.17956486344337463, "learning_rate": 3.8438533095708426e-05, "loss": 0.4372, "step": 47155 }, { "epoch": 1.6996432046707752, "grad_norm": 0.19222790002822876, "learning_rate": 3.8436072324710995e-05, "loss": 0.3728, "step": 47160 }, { "epoch": 1.6998234043319997, "grad_norm": 0.15799042582511902, "learning_rate": 3.8433611370649686e-05, "loss": 0.4133, "step": 47165 }, { "epoch": 1.7000036039932245, "grad_norm": 0.1787501573562622, "learning_rate": 3.843115023355802e-05, "loss": 0.4264, "step": 47170 }, { "epoch": 1.7001838036544492, "grad_norm": 0.17307919263839722, "learning_rate": 3.842868891346952e-05, "loss": 0.4198, "step": 47175 }, { "epoch": 1.7003640033156737, "grad_norm": 0.1564173549413681, "learning_rate": 3.8426227410417755e-05, "loss": 0.3832, "step": 47180 }, { "epoch": 1.7005442029768985, "grad_norm": 0.21778932213783264, "learning_rate": 3.842376572443623e-05, "loss": 0.4172, "step": 47185 }, { "epoch": 1.700724402638123, "grad_norm": 0.16438071429729462, "learning_rate": 3.8421303855558496e-05, "loss": 0.4138, "step": 47190 }, { "epoch": 1.7009046022993477, "grad_norm": 0.2048167735338211, "learning_rate": 3.8418841803818096e-05, "loss": 0.4055, "step": 47195 }, { "epoch": 1.7010848019605724, "grad_norm": 0.15939681231975555, "learning_rate": 3.841637956924857e-05, 
"loss": 0.4179, "step": 47200 }, { "epoch": 1.701265001621797, "grad_norm": 0.16138513386249542, "learning_rate": 3.841391715188348e-05, "loss": 0.416, "step": 47205 }, { "epoch": 1.7014452012830215, "grad_norm": 0.186210036277771, "learning_rate": 3.841145455175635e-05, "loss": 0.4287, "step": 47210 }, { "epoch": 1.7016254009442462, "grad_norm": 0.17103976011276245, "learning_rate": 3.840899176890076e-05, "loss": 0.3943, "step": 47215 }, { "epoch": 1.701805600605471, "grad_norm": 0.15319930016994476, "learning_rate": 3.840652880335025e-05, "loss": 0.4086, "step": 47220 }, { "epoch": 1.7019858002666957, "grad_norm": 0.1645641177892685, "learning_rate": 3.840406565513838e-05, "loss": 0.3827, "step": 47225 }, { "epoch": 1.7021659999279202, "grad_norm": 0.1592930108308792, "learning_rate": 3.840160232429872e-05, "loss": 0.3865, "step": 47230 }, { "epoch": 1.7023461995891447, "grad_norm": 0.18125946819782257, "learning_rate": 3.839913881086481e-05, "loss": 0.428, "step": 47235 }, { "epoch": 1.7025263992503694, "grad_norm": 0.17202188074588776, "learning_rate": 3.8396675114870234e-05, "loss": 0.4201, "step": 47240 }, { "epoch": 1.7027065989115941, "grad_norm": 0.16342946887016296, "learning_rate": 3.839421123634855e-05, "loss": 0.389, "step": 47245 }, { "epoch": 1.7028867985728187, "grad_norm": 0.17867057025432587, "learning_rate": 3.8391747175333336e-05, "loss": 0.4173, "step": 47250 }, { "epoch": 1.7030669982340432, "grad_norm": 0.20603960752487183, "learning_rate": 3.838928293185815e-05, "loss": 0.4412, "step": 47255 }, { "epoch": 1.703247197895268, "grad_norm": 0.19826002418994904, "learning_rate": 3.838681850595659e-05, "loss": 0.3956, "step": 47260 }, { "epoch": 1.7034273975564926, "grad_norm": 0.18762384355068207, "learning_rate": 3.83843538976622e-05, "loss": 0.4009, "step": 47265 }, { "epoch": 1.7036075972177174, "grad_norm": 0.21648670732975006, "learning_rate": 3.838188910700861e-05, "loss": 0.4453, "step": 47270 }, { "epoch": 1.7037877968789419, "grad_norm": 
0.20147664844989777, "learning_rate": 3.837942413402935e-05, "loss": 0.4143, "step": 47275 }, { "epoch": 1.7039679965401664, "grad_norm": 0.16938365995883942, "learning_rate": 3.837695897875803e-05, "loss": 0.4284, "step": 47280 }, { "epoch": 1.7041481962013911, "grad_norm": 0.16605401039123535, "learning_rate": 3.837449364122823e-05, "loss": 0.4519, "step": 47285 }, { "epoch": 1.7043283958626159, "grad_norm": 0.15813526511192322, "learning_rate": 3.837202812147355e-05, "loss": 0.4502, "step": 47290 }, { "epoch": 1.7045085955238404, "grad_norm": 0.1578982025384903, "learning_rate": 3.8369562419527574e-05, "loss": 0.407, "step": 47295 }, { "epoch": 1.7046887951850649, "grad_norm": 0.20871727168560028, "learning_rate": 3.8367096535423895e-05, "loss": 0.4251, "step": 47300 }, { "epoch": 1.7048689948462896, "grad_norm": 0.1831165850162506, "learning_rate": 3.836463046919612e-05, "loss": 0.4263, "step": 47305 }, { "epoch": 1.7050491945075144, "grad_norm": 0.2124657928943634, "learning_rate": 3.836216422087784e-05, "loss": 0.3903, "step": 47310 }, { "epoch": 1.705229394168739, "grad_norm": 0.2060711830854416, "learning_rate": 3.8359697790502656e-05, "loss": 0.4089, "step": 47315 }, { "epoch": 1.7054095938299636, "grad_norm": 0.1653200387954712, "learning_rate": 3.835723117810418e-05, "loss": 0.376, "step": 47320 }, { "epoch": 1.705589793491188, "grad_norm": 0.15278410911560059, "learning_rate": 3.835476438371601e-05, "loss": 0.4144, "step": 47325 }, { "epoch": 1.7057699931524128, "grad_norm": 0.21837785840034485, "learning_rate": 3.835229740737176e-05, "loss": 0.4179, "step": 47330 }, { "epoch": 1.7059501928136376, "grad_norm": 0.17335210740566254, "learning_rate": 3.8349830249105057e-05, "loss": 0.3892, "step": 47335 }, { "epoch": 1.7061303924748623, "grad_norm": 0.17480070888996124, "learning_rate": 3.8347362908949484e-05, "loss": 0.4117, "step": 47340 }, { "epoch": 1.7063105921360868, "grad_norm": 0.2083815336227417, "learning_rate": 3.834489538693868e-05, "loss": 
0.4644, "step": 47345 }, { "epoch": 1.7064907917973113, "grad_norm": 0.1669875979423523, "learning_rate": 3.8342427683106276e-05, "loss": 0.3913, "step": 47350 }, { "epoch": 1.706670991458536, "grad_norm": 0.16881945729255676, "learning_rate": 3.833995979748587e-05, "loss": 0.4141, "step": 47355 }, { "epoch": 1.7068511911197608, "grad_norm": 0.17470820248126984, "learning_rate": 3.833749173011108e-05, "loss": 0.4245, "step": 47360 }, { "epoch": 1.7070313907809853, "grad_norm": 0.18530166149139404, "learning_rate": 3.833502348101556e-05, "loss": 0.4304, "step": 47365 }, { "epoch": 1.7072115904422098, "grad_norm": 0.206429585814476, "learning_rate": 3.833255505023292e-05, "loss": 0.4014, "step": 47370 }, { "epoch": 1.7073917901034346, "grad_norm": 0.172637477517128, "learning_rate": 3.83300864377968e-05, "loss": 0.3989, "step": 47375 }, { "epoch": 1.7075719897646593, "grad_norm": 0.18102556467056274, "learning_rate": 3.832761764374084e-05, "loss": 0.4066, "step": 47380 }, { "epoch": 1.707752189425884, "grad_norm": 0.16687656939029694, "learning_rate": 3.832514866809866e-05, "loss": 0.4241, "step": 47385 }, { "epoch": 1.7079323890871085, "grad_norm": 0.17765361070632935, "learning_rate": 3.832267951090392e-05, "loss": 0.404, "step": 47390 }, { "epoch": 1.708112588748333, "grad_norm": 0.20449407398700714, "learning_rate": 3.832021017219025e-05, "loss": 0.4424, "step": 47395 }, { "epoch": 1.7082927884095578, "grad_norm": 0.20661450922489166, "learning_rate": 3.831774065199128e-05, "loss": 0.444, "step": 47400 }, { "epoch": 1.7084729880707825, "grad_norm": 0.1424778401851654, "learning_rate": 3.8315270950340684e-05, "loss": 0.3854, "step": 47405 }, { "epoch": 1.708653187732007, "grad_norm": 0.1613854020833969, "learning_rate": 3.831280106727211e-05, "loss": 0.4014, "step": 47410 }, { "epoch": 1.7088333873932315, "grad_norm": 0.19576317071914673, "learning_rate": 3.8310331002819186e-05, "loss": 0.428, "step": 47415 }, { "epoch": 1.7090135870544563, "grad_norm": 
0.19375436007976532, "learning_rate": 3.830786075701558e-05, "loss": 0.4368, "step": 47420 }, { "epoch": 1.709193786715681, "grad_norm": 0.24213196337223053, "learning_rate": 3.8305390329894945e-05, "loss": 0.4112, "step": 47425 }, { "epoch": 1.7093739863769057, "grad_norm": 0.15626302361488342, "learning_rate": 3.830291972149095e-05, "loss": 0.4208, "step": 47430 }, { "epoch": 1.7095541860381303, "grad_norm": 0.18249692022800446, "learning_rate": 3.8300448931837244e-05, "loss": 0.3906, "step": 47435 }, { "epoch": 1.7097343856993548, "grad_norm": 0.15607425570487976, "learning_rate": 3.829797796096749e-05, "loss": 0.4015, "step": 47440 }, { "epoch": 1.7099145853605795, "grad_norm": 0.1599874198436737, "learning_rate": 3.829550680891537e-05, "loss": 0.4124, "step": 47445 }, { "epoch": 1.7100947850218042, "grad_norm": 0.2017892599105835, "learning_rate": 3.8293035475714543e-05, "loss": 0.4204, "step": 47450 }, { "epoch": 1.7102749846830287, "grad_norm": 0.17167171835899353, "learning_rate": 3.8290563961398686e-05, "loss": 0.3933, "step": 47455 }, { "epoch": 1.7104551843442535, "grad_norm": 0.18356946110725403, "learning_rate": 3.828809226600146e-05, "loss": 0.4294, "step": 47460 }, { "epoch": 1.710635384005478, "grad_norm": 0.20430004596710205, "learning_rate": 3.828562038955655e-05, "loss": 0.4215, "step": 47465 }, { "epoch": 1.7108155836667027, "grad_norm": 0.19897079467773438, "learning_rate": 3.828314833209764e-05, "loss": 0.4301, "step": 47470 }, { "epoch": 1.7109957833279275, "grad_norm": 0.17606034874916077, "learning_rate": 3.828067609365841e-05, "loss": 0.4146, "step": 47475 }, { "epoch": 1.711175982989152, "grad_norm": 0.18910571932792664, "learning_rate": 3.827869817262381e-05, "loss": 0.4191, "step": 47480 }, { "epoch": 1.7113561826503765, "grad_norm": 0.16573849320411682, "learning_rate": 3.827622560850488e-05, "loss": 0.4207, "step": 47485 }, { "epoch": 1.7115363823116012, "grad_norm": 0.18701373040676117, "learning_rate": 3.827375286349993e-05, "loss": 
0.4094, "step": 47490 }, { "epoch": 1.711716581972826, "grad_norm": 0.16349554061889648, "learning_rate": 3.827127993764269e-05, "loss": 0.4338, "step": 47495 }, { "epoch": 1.7118967816340507, "grad_norm": 0.17863896489143372, "learning_rate": 3.826880683096681e-05, "loss": 0.4307, "step": 47500 }, { "epoch": 1.7118967816340507, "eval_loss": 0.4411923587322235, "eval_runtime": 3.5372, "eval_samples_per_second": 28.271, "eval_steps_per_second": 7.068, "step": 47500 }, { "epoch": 1.7120769812952752, "grad_norm": 0.2336321473121643, "learning_rate": 3.8266333543506016e-05, "loss": 0.4195, "step": 47505 }, { "epoch": 1.7122571809564997, "grad_norm": 0.22593377530574799, "learning_rate": 3.826386007529399e-05, "loss": 0.4147, "step": 47510 }, { "epoch": 1.7124373806177244, "grad_norm": 0.23086853325366974, "learning_rate": 3.826138642636444e-05, "loss": 0.4112, "step": 47515 }, { "epoch": 1.7126175802789492, "grad_norm": 0.2238939106464386, "learning_rate": 3.8258912596751076e-05, "loss": 0.3977, "step": 47520 }, { "epoch": 1.7127977799401737, "grad_norm": 0.1743045598268509, "learning_rate": 3.8256438586487584e-05, "loss": 0.429, "step": 47525 }, { "epoch": 1.7129779796013982, "grad_norm": 0.1635396033525467, "learning_rate": 3.825396439560769e-05, "loss": 0.3928, "step": 47530 }, { "epoch": 1.713158179262623, "grad_norm": 0.1982748806476593, "learning_rate": 3.8251490024145085e-05, "loss": 0.4473, "step": 47535 }, { "epoch": 1.7133383789238477, "grad_norm": 0.17309851944446564, "learning_rate": 3.824901547213351e-05, "loss": 0.3952, "step": 47540 }, { "epoch": 1.7135185785850724, "grad_norm": 0.17767395079135895, "learning_rate": 3.824654073960665e-05, "loss": 0.379, "step": 47545 }, { "epoch": 1.713698778246297, "grad_norm": 0.19817490875720978, "learning_rate": 3.824406582659824e-05, "loss": 0.4156, "step": 47550 }, { "epoch": 1.7138789779075214, "grad_norm": 0.20314614474773407, "learning_rate": 3.8241590733142004e-05, "loss": 0.3943, "step": 47555 }, { "epoch": 
1.7140591775687462, "grad_norm": 0.17916688323020935, "learning_rate": 3.823911545927166e-05, "loss": 0.3994, "step": 47560 }, { "epoch": 1.7142393772299709, "grad_norm": 0.1822926551103592, "learning_rate": 3.823664000502093e-05, "loss": 0.4236, "step": 47565 }, { "epoch": 1.7144195768911954, "grad_norm": 0.1775011420249939, "learning_rate": 3.823416437042353e-05, "loss": 0.4446, "step": 47570 }, { "epoch": 1.7145997765524201, "grad_norm": 0.1689702570438385, "learning_rate": 3.823168855551321e-05, "loss": 0.3789, "step": 47575 }, { "epoch": 1.7147799762136446, "grad_norm": 0.17035454511642456, "learning_rate": 3.82292125603237e-05, "loss": 0.4214, "step": 47580 }, { "epoch": 1.7149601758748694, "grad_norm": 0.17879173159599304, "learning_rate": 3.822673638488873e-05, "loss": 0.4425, "step": 47585 }, { "epoch": 1.715140375536094, "grad_norm": 0.2069813311100006, "learning_rate": 3.8224260029242034e-05, "loss": 0.4517, "step": 47590 }, { "epoch": 1.7153205751973186, "grad_norm": 0.16887204349040985, "learning_rate": 3.822178349341735e-05, "loss": 0.4043, "step": 47595 }, { "epoch": 1.7155007748585431, "grad_norm": 0.16069333255290985, "learning_rate": 3.821930677744843e-05, "loss": 0.4107, "step": 47600 }, { "epoch": 1.7156809745197679, "grad_norm": 0.1842743456363678, "learning_rate": 3.821682988136902e-05, "loss": 0.4391, "step": 47605 }, { "epoch": 1.7158611741809926, "grad_norm": 0.2043282687664032, "learning_rate": 3.821435280521286e-05, "loss": 0.4216, "step": 47610 }, { "epoch": 1.7160413738422173, "grad_norm": 0.14843593537807465, "learning_rate": 3.82118755490137e-05, "loss": 0.3988, "step": 47615 }, { "epoch": 1.7162215735034418, "grad_norm": 0.17393402755260468, "learning_rate": 3.820939811280528e-05, "loss": 0.4165, "step": 47620 }, { "epoch": 1.7164017731646664, "grad_norm": 0.20571184158325195, "learning_rate": 3.820692049662139e-05, "loss": 0.3767, "step": 47625 }, { "epoch": 1.716581972825891, "grad_norm": 0.18904875218868256, "learning_rate": 
3.820444270049576e-05, "loss": 0.4407, "step": 47630 }, { "epoch": 1.7167621724871158, "grad_norm": 0.16259777545928955, "learning_rate": 3.820196472446215e-05, "loss": 0.4126, "step": 47635 }, { "epoch": 1.7169423721483403, "grad_norm": 0.19046054780483246, "learning_rate": 3.819948656855432e-05, "loss": 0.3814, "step": 47640 }, { "epoch": 1.7171225718095648, "grad_norm": 0.17224107682704926, "learning_rate": 3.819700823280605e-05, "loss": 0.4453, "step": 47645 }, { "epoch": 1.7173027714707896, "grad_norm": 0.22785985469818115, "learning_rate": 3.8194529717251095e-05, "loss": 0.3798, "step": 47650 }, { "epoch": 1.7174829711320143, "grad_norm": 0.17245115339756012, "learning_rate": 3.819205102192323e-05, "loss": 0.3987, "step": 47655 }, { "epoch": 1.717663170793239, "grad_norm": 0.17877335846424103, "learning_rate": 3.818957214685622e-05, "loss": 0.3945, "step": 47660 }, { "epoch": 1.7178433704544636, "grad_norm": 0.20267999172210693, "learning_rate": 3.8187093092083845e-05, "loss": 0.4068, "step": 47665 }, { "epoch": 1.718023570115688, "grad_norm": 0.1948103904724121, "learning_rate": 3.818461385763988e-05, "loss": 0.4051, "step": 47670 }, { "epoch": 1.7182037697769128, "grad_norm": 0.1381908804178238, "learning_rate": 3.81821344435581e-05, "loss": 0.4079, "step": 47675 }, { "epoch": 1.7183839694381375, "grad_norm": 0.16724207997322083, "learning_rate": 3.81796548498723e-05, "loss": 0.387, "step": 47680 }, { "epoch": 1.718564169099362, "grad_norm": 0.19141383469104767, "learning_rate": 3.817717507661625e-05, "loss": 0.4073, "step": 47685 }, { "epoch": 1.7187443687605868, "grad_norm": 0.21919883787631989, "learning_rate": 3.8174695123823734e-05, "loss": 0.4621, "step": 47690 }, { "epoch": 1.7189245684218113, "grad_norm": 0.1990518867969513, "learning_rate": 3.8172214991528554e-05, "loss": 0.4047, "step": 47695 }, { "epoch": 1.719104768083036, "grad_norm": 0.18987984955310822, "learning_rate": 3.8169734679764494e-05, "loss": 0.4128, "step": 47700 }, { "epoch": 
1.7192849677442608, "grad_norm": 0.18330170214176178, "learning_rate": 3.816725418856535e-05, "loss": 0.4213, "step": 47705 }, { "epoch": 1.7194651674054853, "grad_norm": 0.1505930870771408, "learning_rate": 3.816477351796491e-05, "loss": 0.4264, "step": 47710 }, { "epoch": 1.7196453670667098, "grad_norm": 0.18332305550575256, "learning_rate": 3.8162292667996986e-05, "loss": 0.4012, "step": 47715 }, { "epoch": 1.7198255667279345, "grad_norm": 0.21521466970443726, "learning_rate": 3.815981163869537e-05, "loss": 0.4301, "step": 47720 }, { "epoch": 1.7200057663891593, "grad_norm": 0.21533189713954926, "learning_rate": 3.815733043009387e-05, "loss": 0.4403, "step": 47725 }, { "epoch": 1.720185966050384, "grad_norm": 0.22986893355846405, "learning_rate": 3.815484904222629e-05, "loss": 0.4242, "step": 47730 }, { "epoch": 1.7203661657116085, "grad_norm": 0.1726514995098114, "learning_rate": 3.8152367475126436e-05, "loss": 0.4429, "step": 47735 }, { "epoch": 1.720546365372833, "grad_norm": 0.18250474333763123, "learning_rate": 3.814988572882813e-05, "loss": 0.4109, "step": 47740 }, { "epoch": 1.7207265650340577, "grad_norm": 0.15575775504112244, "learning_rate": 3.814740380336517e-05, "loss": 0.3745, "step": 47745 }, { "epoch": 1.7209067646952825, "grad_norm": 0.23164531588554382, "learning_rate": 3.814492169877138e-05, "loss": 0.4285, "step": 47750 }, { "epoch": 1.721086964356507, "grad_norm": 0.18944543600082397, "learning_rate": 3.814243941508058e-05, "loss": 0.4425, "step": 47755 }, { "epoch": 1.7212671640177315, "grad_norm": 0.17386077344417572, "learning_rate": 3.813995695232658e-05, "loss": 0.4277, "step": 47760 }, { "epoch": 1.7214473636789562, "grad_norm": 0.16808000206947327, "learning_rate": 3.813747431054321e-05, "loss": 0.4155, "step": 47765 }, { "epoch": 1.721627563340181, "grad_norm": 0.21575501561164856, "learning_rate": 3.8134991489764305e-05, "loss": 0.4243, "step": 47770 }, { "epoch": 1.7218077630014057, "grad_norm": 0.16547565162181854, "learning_rate": 
3.8132508490023674e-05, "loss": 0.4241, "step": 47775 }, { "epoch": 1.7219879626626302, "grad_norm": 0.17828123271465302, "learning_rate": 3.813002531135517e-05, "loss": 0.4356, "step": 47780 }, { "epoch": 1.7221681623238547, "grad_norm": 0.21467016637325287, "learning_rate": 3.81275419537926e-05, "loss": 0.399, "step": 47785 }, { "epoch": 1.7223483619850795, "grad_norm": 0.16447187960147858, "learning_rate": 3.8125058417369824e-05, "loss": 0.4181, "step": 47790 }, { "epoch": 1.7225285616463042, "grad_norm": 0.17198550701141357, "learning_rate": 3.812257470212066e-05, "loss": 0.3742, "step": 47795 }, { "epoch": 1.7227087613075287, "grad_norm": 0.1887580305337906, "learning_rate": 3.812009080807896e-05, "loss": 0.3888, "step": 47800 }, { "epoch": 1.7228889609687532, "grad_norm": 0.18961556255817413, "learning_rate": 3.8117606735278556e-05, "loss": 0.44, "step": 47805 }, { "epoch": 1.723069160629978, "grad_norm": 0.18733921647071838, "learning_rate": 3.811512248375332e-05, "loss": 0.4275, "step": 47810 }, { "epoch": 1.7232493602912027, "grad_norm": 0.17053326964378357, "learning_rate": 3.811263805353705e-05, "loss": 0.4142, "step": 47815 }, { "epoch": 1.7234295599524274, "grad_norm": 0.2137068808078766, "learning_rate": 3.811015344466365e-05, "loss": 0.4018, "step": 47820 }, { "epoch": 1.723609759613652, "grad_norm": 0.21045798063278198, "learning_rate": 3.810766865716693e-05, "loss": 0.4379, "step": 47825 }, { "epoch": 1.7237899592748764, "grad_norm": 0.2026238739490509, "learning_rate": 3.810518369108077e-05, "loss": 0.4311, "step": 47830 }, { "epoch": 1.7239701589361012, "grad_norm": 0.1852739304304123, "learning_rate": 3.8102698546439025e-05, "loss": 0.397, "step": 47835 }, { "epoch": 1.724150358597326, "grad_norm": 0.20483826100826263, "learning_rate": 3.810021322327554e-05, "loss": 0.4485, "step": 47840 }, { "epoch": 1.7243305582585506, "grad_norm": 0.18066450953483582, "learning_rate": 3.809772772162419e-05, "loss": 0.4222, "step": 47845 }, { "epoch": 
1.7245107579197752, "grad_norm": 0.1802929937839508, "learning_rate": 3.809524204151883e-05, "loss": 0.4147, "step": 47850 }, { "epoch": 1.7246909575809997, "grad_norm": 0.177862748503685, "learning_rate": 3.809275618299335e-05, "loss": 0.445, "step": 47855 }, { "epoch": 1.7248711572422244, "grad_norm": 0.1601111888885498, "learning_rate": 3.809027014608159e-05, "loss": 0.424, "step": 47860 }, { "epoch": 1.7250513569034491, "grad_norm": 0.1917547881603241, "learning_rate": 3.808778393081742e-05, "loss": 0.4559, "step": 47865 }, { "epoch": 1.7252315565646736, "grad_norm": 0.1599884331226349, "learning_rate": 3.808529753723475e-05, "loss": 0.4261, "step": 47870 }, { "epoch": 1.7254117562258982, "grad_norm": 0.1725659966468811, "learning_rate": 3.808281096536742e-05, "loss": 0.4202, "step": 47875 }, { "epoch": 1.7255919558871229, "grad_norm": 0.177949458360672, "learning_rate": 3.808032421524933e-05, "loss": 0.431, "step": 47880 }, { "epoch": 1.7257721555483476, "grad_norm": 0.20164379477500916, "learning_rate": 3.807783728691435e-05, "loss": 0.3792, "step": 47885 }, { "epoch": 1.7259523552095724, "grad_norm": 0.16568000614643097, "learning_rate": 3.8075350180396376e-05, "loss": 0.4351, "step": 47890 }, { "epoch": 1.7261325548707969, "grad_norm": 0.1772766262292862, "learning_rate": 3.807286289572929e-05, "loss": 0.4139, "step": 47895 }, { "epoch": 1.7263127545320214, "grad_norm": 0.20613396167755127, "learning_rate": 3.8070375432946965e-05, "loss": 0.4108, "step": 47900 }, { "epoch": 1.726492954193246, "grad_norm": 0.1459551453590393, "learning_rate": 3.806788779208331e-05, "loss": 0.4489, "step": 47905 }, { "epoch": 1.7266731538544708, "grad_norm": 0.20022889971733093, "learning_rate": 3.806539997317221e-05, "loss": 0.438, "step": 47910 }, { "epoch": 1.7268533535156954, "grad_norm": 0.1898205578327179, "learning_rate": 3.806291197624758e-05, "loss": 0.4088, "step": 47915 }, { "epoch": 1.7270335531769199, "grad_norm": 0.18108853697776794, "learning_rate": 
3.8060423801343294e-05, "loss": 0.4036, "step": 47920 }, { "epoch": 1.7272137528381446, "grad_norm": 0.21101535856723785, "learning_rate": 3.805793544849326e-05, "loss": 0.4368, "step": 47925 }, { "epoch": 1.7273939524993693, "grad_norm": 0.16363751888275146, "learning_rate": 3.8055446917731386e-05, "loss": 0.4097, "step": 47930 }, { "epoch": 1.727574152160594, "grad_norm": 0.21080221235752106, "learning_rate": 3.805295820909158e-05, "loss": 0.4326, "step": 47935 }, { "epoch": 1.7277543518218186, "grad_norm": 0.18741890788078308, "learning_rate": 3.805046932260774e-05, "loss": 0.4329, "step": 47940 }, { "epoch": 1.727934551483043, "grad_norm": 0.20189730823040009, "learning_rate": 3.804798025831379e-05, "loss": 0.4618, "step": 47945 }, { "epoch": 1.7281147511442678, "grad_norm": 0.16234025359153748, "learning_rate": 3.804549101624362e-05, "loss": 0.3968, "step": 47950 }, { "epoch": 1.7282949508054926, "grad_norm": 0.17763473093509674, "learning_rate": 3.804300159643117e-05, "loss": 0.3923, "step": 47955 }, { "epoch": 1.728475150466717, "grad_norm": 0.20635025203227997, "learning_rate": 3.804051199891035e-05, "loss": 0.3992, "step": 47960 }, { "epoch": 1.7286553501279418, "grad_norm": 0.1754833310842514, "learning_rate": 3.803802222371507e-05, "loss": 0.4374, "step": 47965 }, { "epoch": 1.7288355497891663, "grad_norm": 0.17620554566383362, "learning_rate": 3.803553227087928e-05, "loss": 0.409, "step": 47970 }, { "epoch": 1.729015749450391, "grad_norm": 0.1946227103471756, "learning_rate": 3.803304214043687e-05, "loss": 0.4026, "step": 47975 }, { "epoch": 1.7291959491116158, "grad_norm": 0.22340406477451324, "learning_rate": 3.803055183242179e-05, "loss": 0.3881, "step": 47980 }, { "epoch": 1.7293761487728403, "grad_norm": 0.16771641373634338, "learning_rate": 3.8028061346867963e-05, "loss": 0.4304, "step": 47985 }, { "epoch": 1.7295563484340648, "grad_norm": 0.16595138609409332, "learning_rate": 3.802557068380932e-05, "loss": 0.4088, "step": 47990 }, { "epoch": 
1.7297365480952895, "grad_norm": 0.22411175072193146, "learning_rate": 3.80230798432798e-05, "loss": 0.4289, "step": 47995 }, { "epoch": 1.7299167477565143, "grad_norm": 0.2403663694858551, "learning_rate": 3.802058882531334e-05, "loss": 0.4267, "step": 48000 }, { "epoch": 1.7299167477565143, "eval_loss": 0.44221970438957214, "eval_runtime": 3.5221, "eval_samples_per_second": 28.392, "eval_steps_per_second": 7.098, "step": 48000 }, { "epoch": 1.730096947417739, "grad_norm": 0.1950840801000595, "learning_rate": 3.801809762994387e-05, "loss": 0.4326, "step": 48005 }, { "epoch": 1.7302771470789635, "grad_norm": 0.18309806287288666, "learning_rate": 3.801560625720535e-05, "loss": 0.4377, "step": 48010 }, { "epoch": 1.730457346740188, "grad_norm": 0.17351049184799194, "learning_rate": 3.8013114707131716e-05, "loss": 0.3877, "step": 48015 }, { "epoch": 1.7306375464014128, "grad_norm": 0.18052922189235687, "learning_rate": 3.80106229797569e-05, "loss": 0.3962, "step": 48020 }, { "epoch": 1.7308177460626375, "grad_norm": 0.17976753413677216, "learning_rate": 3.8008131075114886e-05, "loss": 0.3876, "step": 48025 }, { "epoch": 1.730997945723862, "grad_norm": 0.1913975179195404, "learning_rate": 3.800563899323959e-05, "loss": 0.3877, "step": 48030 }, { "epoch": 1.7311781453850865, "grad_norm": 0.19166803359985352, "learning_rate": 3.800314673416498e-05, "loss": 0.4104, "step": 48035 }, { "epoch": 1.7313583450463113, "grad_norm": 0.19192728400230408, "learning_rate": 3.800065429792501e-05, "loss": 0.3646, "step": 48040 }, { "epoch": 1.731538544707536, "grad_norm": 0.21600832045078278, "learning_rate": 3.7998161684553656e-05, "loss": 0.4372, "step": 48045 }, { "epoch": 1.7317187443687607, "grad_norm": 0.20406126976013184, "learning_rate": 3.799566889408486e-05, "loss": 0.4218, "step": 48050 }, { "epoch": 1.7318989440299852, "grad_norm": 0.17205806076526642, "learning_rate": 3.7993175926552596e-05, "loss": 0.3918, "step": 48055 }, { "epoch": 1.7320791436912097, "grad_norm": 
0.15462535619735718, "learning_rate": 3.799068278199081e-05, "loss": 0.3719, "step": 48060 }, { "epoch": 1.7322593433524345, "grad_norm": 0.2172221541404724, "learning_rate": 3.7988189460433496e-05, "loss": 0.4315, "step": 48065 }, { "epoch": 1.7324395430136592, "grad_norm": 0.23806485533714294, "learning_rate": 3.7985695961914614e-05, "loss": 0.4457, "step": 48070 }, { "epoch": 1.7326197426748837, "grad_norm": 0.19144880771636963, "learning_rate": 3.798320228646814e-05, "loss": 0.4054, "step": 48075 }, { "epoch": 1.7327999423361085, "grad_norm": 0.13344308733940125, "learning_rate": 3.798070843412805e-05, "loss": 0.4203, "step": 48080 }, { "epoch": 1.732980141997333, "grad_norm": 0.17505653202533722, "learning_rate": 3.7978214404928315e-05, "loss": 0.4313, "step": 48085 }, { "epoch": 1.7331603416585577, "grad_norm": 0.15916766226291656, "learning_rate": 3.7975720198902924e-05, "loss": 0.3778, "step": 48090 }, { "epoch": 1.7333405413197824, "grad_norm": 0.2202104777097702, "learning_rate": 3.797322581608585e-05, "loss": 0.4379, "step": 48095 }, { "epoch": 1.733520740981007, "grad_norm": 0.17521396279335022, "learning_rate": 3.7970731256511104e-05, "loss": 0.4141, "step": 48100 }, { "epoch": 1.7337009406422315, "grad_norm": 0.1417665183544159, "learning_rate": 3.796823652021265e-05, "loss": 0.3653, "step": 48105 }, { "epoch": 1.7338811403034562, "grad_norm": 0.20344464480876923, "learning_rate": 3.796574160722448e-05, "loss": 0.4285, "step": 48110 }, { "epoch": 1.734061339964681, "grad_norm": 0.1791670322418213, "learning_rate": 3.796324651758059e-05, "loss": 0.4186, "step": 48115 }, { "epoch": 1.7342415396259057, "grad_norm": 0.19157084822654724, "learning_rate": 3.796075125131498e-05, "loss": 0.4068, "step": 48120 }, { "epoch": 1.7344217392871302, "grad_norm": 0.19911465048789978, "learning_rate": 3.795825580846164e-05, "loss": 0.4383, "step": 48125 }, { "epoch": 1.7346019389483547, "grad_norm": 0.1901884824037552, "learning_rate": 3.7955760189054566e-05, "loss": 
0.4327, "step": 48130 }, { "epoch": 1.7347821386095794, "grad_norm": 0.17567721009254456, "learning_rate": 3.7953264393127774e-05, "loss": 0.406, "step": 48135 }, { "epoch": 1.7349623382708041, "grad_norm": 0.20942769944667816, "learning_rate": 3.795076842071526e-05, "loss": 0.4276, "step": 48140 }, { "epoch": 1.7351425379320287, "grad_norm": 0.18025130033493042, "learning_rate": 3.794827227185105e-05, "loss": 0.4079, "step": 48145 }, { "epoch": 1.7353227375932532, "grad_norm": 0.21213695406913757, "learning_rate": 3.794577594656912e-05, "loss": 0.4179, "step": 48150 }, { "epoch": 1.735502937254478, "grad_norm": 0.18721318244934082, "learning_rate": 3.79432794449035e-05, "loss": 0.4373, "step": 48155 }, { "epoch": 1.7356831369157026, "grad_norm": 0.2057793140411377, "learning_rate": 3.794078276688822e-05, "loss": 0.3952, "step": 48160 }, { "epoch": 1.7358633365769274, "grad_norm": 0.1834101676940918, "learning_rate": 3.7938285912557256e-05, "loss": 0.4069, "step": 48165 }, { "epoch": 1.7360435362381519, "grad_norm": 0.21136592328548431, "learning_rate": 3.793578888194467e-05, "loss": 0.4132, "step": 48170 }, { "epoch": 1.7362237358993764, "grad_norm": 0.1626768410205841, "learning_rate": 3.793329167508445e-05, "loss": 0.359, "step": 48175 }, { "epoch": 1.7364039355606011, "grad_norm": 0.1921762079000473, "learning_rate": 3.7930794292010654e-05, "loss": 0.4221, "step": 48180 }, { "epoch": 1.7365841352218259, "grad_norm": 0.17365401983261108, "learning_rate": 3.7928296732757276e-05, "loss": 0.4299, "step": 48185 }, { "epoch": 1.7367643348830504, "grad_norm": 0.18681730329990387, "learning_rate": 3.7925798997358354e-05, "loss": 0.4318, "step": 48190 }, { "epoch": 1.736944534544275, "grad_norm": 0.15825678408145905, "learning_rate": 3.792330108584793e-05, "loss": 0.3875, "step": 48195 }, { "epoch": 1.7371247342054996, "grad_norm": 0.17392268776893616, "learning_rate": 3.792080299826003e-05, "loss": 0.4145, "step": 48200 }, { "epoch": 1.7373049338667244, "grad_norm": 
0.17445778846740723, "learning_rate": 3.79183047346287e-05, "loss": 0.3871, "step": 48205 }, { "epoch": 1.737485133527949, "grad_norm": 0.18119192123413086, "learning_rate": 3.7915806294987955e-05, "loss": 0.3867, "step": 48210 }, { "epoch": 1.7376653331891736, "grad_norm": 0.20309992134571075, "learning_rate": 3.7913307679371856e-05, "loss": 0.4133, "step": 48215 }, { "epoch": 1.7378455328503981, "grad_norm": 0.1670549362897873, "learning_rate": 3.791080888781444e-05, "loss": 0.405, "step": 48220 }, { "epoch": 1.7380257325116228, "grad_norm": 0.162832111120224, "learning_rate": 3.790830992034974e-05, "loss": 0.3856, "step": 48225 }, { "epoch": 1.7382059321728476, "grad_norm": 0.16759029030799866, "learning_rate": 3.7905810777011837e-05, "loss": 0.3806, "step": 48230 }, { "epoch": 1.7383861318340723, "grad_norm": 0.18382543325424194, "learning_rate": 3.790331145783474e-05, "loss": 0.3809, "step": 48235 }, { "epoch": 1.7385663314952968, "grad_norm": 0.18206432461738586, "learning_rate": 3.7900811962852544e-05, "loss": 0.4206, "step": 48240 }, { "epoch": 1.7387465311565213, "grad_norm": 0.17950773239135742, "learning_rate": 3.789831229209927e-05, "loss": 0.436, "step": 48245 }, { "epoch": 1.738926730817746, "grad_norm": 0.16286823153495789, "learning_rate": 3.7895812445608994e-05, "loss": 0.4264, "step": 48250 }, { "epoch": 1.7391069304789708, "grad_norm": 0.18310612440109253, "learning_rate": 3.789331242341576e-05, "loss": 0.4107, "step": 48255 }, { "epoch": 1.7392871301401953, "grad_norm": 0.17169633507728577, "learning_rate": 3.789081222555365e-05, "loss": 0.4303, "step": 48260 }, { "epoch": 1.7394673298014198, "grad_norm": 0.20822210609912872, "learning_rate": 3.7888311852056725e-05, "loss": 0.409, "step": 48265 }, { "epoch": 1.7396475294626446, "grad_norm": 0.18613308668136597, "learning_rate": 3.788581130295903e-05, "loss": 0.3899, "step": 48270 }, { "epoch": 1.7398277291238693, "grad_norm": 0.1604495495557785, "learning_rate": 3.788331057829466e-05, "loss": 
0.413, "step": 48275 }, { "epoch": 1.740007928785094, "grad_norm": 0.18513786792755127, "learning_rate": 3.788080967809767e-05, "loss": 0.4236, "step": 48280 }, { "epoch": 1.7401881284463185, "grad_norm": 0.20122095942497253, "learning_rate": 3.7878308602402156e-05, "loss": 0.3951, "step": 48285 }, { "epoch": 1.740368328107543, "grad_norm": 0.22590294480323792, "learning_rate": 3.787580735124217e-05, "loss": 0.4287, "step": 48290 }, { "epoch": 1.7405485277687678, "grad_norm": 0.15910333395004272, "learning_rate": 3.78733059246518e-05, "loss": 0.4064, "step": 48295 }, { "epoch": 1.7407287274299925, "grad_norm": 0.20325346291065216, "learning_rate": 3.787080432266514e-05, "loss": 0.415, "step": 48300 }, { "epoch": 1.740908927091217, "grad_norm": 0.16542761027812958, "learning_rate": 3.786830254531626e-05, "loss": 0.4409, "step": 48305 }, { "epoch": 1.7410891267524415, "grad_norm": 0.19882872700691223, "learning_rate": 3.7865800592639245e-05, "loss": 0.4755, "step": 48310 }, { "epoch": 1.7412693264136663, "grad_norm": 0.17635419964790344, "learning_rate": 3.786329846466818e-05, "loss": 0.3941, "step": 48315 }, { "epoch": 1.741449526074891, "grad_norm": 0.22185000777244568, "learning_rate": 3.786079616143718e-05, "loss": 0.4116, "step": 48320 }, { "epoch": 1.7416297257361157, "grad_norm": 0.21445868909358978, "learning_rate": 3.7858293682980315e-05, "loss": 0.4251, "step": 48325 }, { "epoch": 1.7418099253973403, "grad_norm": 0.18230614066123962, "learning_rate": 3.785579102933168e-05, "loss": 0.4337, "step": 48330 }, { "epoch": 1.7419901250585648, "grad_norm": 0.18611697852611542, "learning_rate": 3.7853288200525394e-05, "loss": 0.4349, "step": 48335 }, { "epoch": 1.7421703247197895, "grad_norm": 0.17176128923892975, "learning_rate": 3.785078519659554e-05, "loss": 0.4123, "step": 48340 }, { "epoch": 1.7423505243810142, "grad_norm": 0.18628238141536713, "learning_rate": 3.784828201757623e-05, "loss": 0.417, "step": 48345 }, { "epoch": 1.742530724042239, "grad_norm": 
0.17633478343486786, "learning_rate": 3.784577866350155e-05, "loss": 0.4247, "step": 48350 }, { "epoch": 1.7427109237034635, "grad_norm": 0.17602220177650452, "learning_rate": 3.7843275134405645e-05, "loss": 0.4179, "step": 48355 }, { "epoch": 1.742891123364688, "grad_norm": 0.15746374428272247, "learning_rate": 3.7840771430322586e-05, "loss": 0.4164, "step": 48360 }, { "epoch": 1.7430713230259127, "grad_norm": 0.22847017645835876, "learning_rate": 3.7838267551286504e-05, "loss": 0.4151, "step": 48365 }, { "epoch": 1.7432515226871375, "grad_norm": 0.19369705021381378, "learning_rate": 3.783576349733152e-05, "loss": 0.4186, "step": 48370 }, { "epoch": 1.743431722348362, "grad_norm": 0.1731320321559906, "learning_rate": 3.7833259268491735e-05, "loss": 0.3926, "step": 48375 }, { "epoch": 1.7436119220095865, "grad_norm": 0.17335784435272217, "learning_rate": 3.7830754864801284e-05, "loss": 0.4057, "step": 48380 }, { "epoch": 1.7437921216708112, "grad_norm": 0.1906140148639679, "learning_rate": 3.782825028629428e-05, "loss": 0.438, "step": 48385 }, { "epoch": 1.743972321332036, "grad_norm": 0.21084898710250854, "learning_rate": 3.782574553300485e-05, "loss": 0.4079, "step": 48390 }, { "epoch": 1.7441525209932607, "grad_norm": 0.2448464035987854, "learning_rate": 3.7823240604967116e-05, "loss": 0.4115, "step": 48395 }, { "epoch": 1.7443327206544852, "grad_norm": 0.185360386967659, "learning_rate": 3.782073550221521e-05, "loss": 0.4011, "step": 48400 }, { "epoch": 1.7445129203157097, "grad_norm": 0.16345396637916565, "learning_rate": 3.781823022478327e-05, "loss": 0.4431, "step": 48405 }, { "epoch": 1.7446931199769344, "grad_norm": 0.13991603255271912, "learning_rate": 3.7815724772705423e-05, "loss": 0.4147, "step": 48410 }, { "epoch": 1.7448733196381592, "grad_norm": 0.15714098513126373, "learning_rate": 3.781321914601581e-05, "loss": 0.3805, "step": 48415 }, { "epoch": 1.7450535192993837, "grad_norm": 0.20374760031700134, "learning_rate": 3.781071334474856e-05, "loss": 
0.4347, "step": 48420 }, { "epoch": 1.7452337189606082, "grad_norm": 0.22393597662448883, "learning_rate": 3.780820736893783e-05, "loss": 0.441, "step": 48425 }, { "epoch": 1.745413918621833, "grad_norm": 0.23388418555259705, "learning_rate": 3.780570121861775e-05, "loss": 0.4262, "step": 48430 }, { "epoch": 1.7455941182830577, "grad_norm": 0.23228850960731506, "learning_rate": 3.7803194893822466e-05, "loss": 0.4326, "step": 48435 }, { "epoch": 1.7457743179442824, "grad_norm": 0.24053829908370972, "learning_rate": 3.780068839458614e-05, "loss": 0.4341, "step": 48440 }, { "epoch": 1.745954517605507, "grad_norm": 0.1622568517923355, "learning_rate": 3.779818172094291e-05, "loss": 0.3959, "step": 48445 }, { "epoch": 1.7461347172667314, "grad_norm": 0.20720703899860382, "learning_rate": 3.779567487292693e-05, "loss": 0.4108, "step": 48450 }, { "epoch": 1.7463149169279562, "grad_norm": 0.20429673790931702, "learning_rate": 3.779316785057235e-05, "loss": 0.434, "step": 48455 }, { "epoch": 1.7464951165891809, "grad_norm": 0.23048081994056702, "learning_rate": 3.7790660653913346e-05, "loss": 0.4158, "step": 48460 }, { "epoch": 1.7466753162504054, "grad_norm": 0.19923274219036102, "learning_rate": 3.778815328298406e-05, "loss": 0.4146, "step": 48465 }, { "epoch": 1.7468555159116301, "grad_norm": 0.18411320447921753, "learning_rate": 3.778564573781866e-05, "loss": 0.4568, "step": 48470 }, { "epoch": 1.7470357155728546, "grad_norm": 0.20448732376098633, "learning_rate": 3.778313801845132e-05, "loss": 0.4085, "step": 48475 }, { "epoch": 1.7472159152340794, "grad_norm": 0.18641623854637146, "learning_rate": 3.7780630124916195e-05, "loss": 0.41, "step": 48480 }, { "epoch": 1.747396114895304, "grad_norm": 0.1633073091506958, "learning_rate": 3.7778122057247464e-05, "loss": 0.4043, "step": 48485 }, { "epoch": 1.7475763145565286, "grad_norm": 0.15964628756046295, "learning_rate": 3.777561381547929e-05, "loss": 0.3637, "step": 48490 }, { "epoch": 1.7477565142177531, "grad_norm": 
0.15613767504692078, "learning_rate": 3.7773105399645845e-05, "loss": 0.402, "step": 48495 }, { "epoch": 1.7479367138789779, "grad_norm": 0.1990021914243698, "learning_rate": 3.777059680978132e-05, "loss": 0.4074, "step": 48500 }, { "epoch": 1.7479367138789779, "eval_loss": 0.4423659145832062, "eval_runtime": 3.5418, "eval_samples_per_second": 28.234, "eval_steps_per_second": 7.059, "step": 48500 }, { "epoch": 1.7481169135402026, "grad_norm": 0.20178571343421936, "learning_rate": 3.776808804591989e-05, "loss": 0.4374, "step": 48505 }, { "epoch": 1.7482971132014273, "grad_norm": 0.16582556068897247, "learning_rate": 3.776557910809572e-05, "loss": 0.4201, "step": 48510 }, { "epoch": 1.7484773128626518, "grad_norm": 0.2059725672006607, "learning_rate": 3.7763069996343015e-05, "loss": 0.4146, "step": 48515 }, { "epoch": 1.7486575125238764, "grad_norm": 0.24231673777103424, "learning_rate": 3.776056071069595e-05, "loss": 0.4654, "step": 48520 }, { "epoch": 1.748837712185101, "grad_norm": 0.16822253167629242, "learning_rate": 3.775805125118871e-05, "loss": 0.4132, "step": 48525 }, { "epoch": 1.7490179118463258, "grad_norm": 0.21261481940746307, "learning_rate": 3.7755541617855505e-05, "loss": 0.4319, "step": 48530 }, { "epoch": 1.7491981115075503, "grad_norm": 0.15500931441783905, "learning_rate": 3.77530318107305e-05, "loss": 0.3957, "step": 48535 }, { "epoch": 1.7493783111687748, "grad_norm": 0.24944768846035004, "learning_rate": 3.775052182984792e-05, "loss": 0.4093, "step": 48540 }, { "epoch": 1.7495585108299996, "grad_norm": 0.17092691361904144, "learning_rate": 3.7748011675241935e-05, "loss": 0.4218, "step": 48545 }, { "epoch": 1.7497387104912243, "grad_norm": 0.20408369600772858, "learning_rate": 3.7745501346946764e-05, "loss": 0.4123, "step": 48550 }, { "epoch": 1.749918910152449, "grad_norm": 0.25046631693840027, "learning_rate": 3.7742990844996606e-05, "loss": 0.4413, "step": 48555 }, { "epoch": 1.7500991098136736, "grad_norm": 0.1823837161064148, 
"learning_rate": 3.7740480169425666e-05, "loss": 0.4226, "step": 48560 }, { "epoch": 1.750279309474898, "grad_norm": 0.19407956302165985, "learning_rate": 3.7737969320268143e-05, "loss": 0.4308, "step": 48565 }, { "epoch": 1.7504595091361228, "grad_norm": 0.23119215667247772, "learning_rate": 3.7735458297558266e-05, "loss": 0.4491, "step": 48570 }, { "epoch": 1.7506397087973475, "grad_norm": 0.2448967546224594, "learning_rate": 3.773294710133023e-05, "loss": 0.4618, "step": 48575 }, { "epoch": 1.750819908458572, "grad_norm": 0.16788001358509064, "learning_rate": 3.773043573161825e-05, "loss": 0.3828, "step": 48580 }, { "epoch": 1.7510001081197968, "grad_norm": 0.1867685317993164, "learning_rate": 3.772792418845655e-05, "loss": 0.3888, "step": 48585 }, { "epoch": 1.7511803077810213, "grad_norm": 0.2027750462293625, "learning_rate": 3.7725412471879354e-05, "loss": 0.4142, "step": 48590 }, { "epoch": 1.751360507442246, "grad_norm": 0.19157275557518005, "learning_rate": 3.7722900581920875e-05, "loss": 0.3886, "step": 48595 }, { "epoch": 1.7515407071034708, "grad_norm": 0.19362173974514008, "learning_rate": 3.7720388518615335e-05, "loss": 0.4182, "step": 48600 }, { "epoch": 1.7517209067646953, "grad_norm": 0.1794130504131317, "learning_rate": 3.771787628199696e-05, "loss": 0.3702, "step": 48605 }, { "epoch": 1.7519011064259198, "grad_norm": 0.1916467696428299, "learning_rate": 3.771536387209999e-05, "loss": 0.3976, "step": 48610 }, { "epoch": 1.7520813060871445, "grad_norm": 0.2182328701019287, "learning_rate": 3.771285128895865e-05, "loss": 0.4402, "step": 48615 }, { "epoch": 1.7522615057483693, "grad_norm": 0.1866350769996643, "learning_rate": 3.771033853260717e-05, "loss": 0.4152, "step": 48620 }, { "epoch": 1.752441705409594, "grad_norm": 0.15760236978530884, "learning_rate": 3.770782560307978e-05, "loss": 0.4226, "step": 48625 }, { "epoch": 1.7526219050708185, "grad_norm": 0.2243875414133072, "learning_rate": 3.770531250041074e-05, "loss": 0.4394, "step": 48630 }, 
{ "epoch": 1.752802104732043, "grad_norm": 0.16506066918373108, "learning_rate": 3.770279922463428e-05, "loss": 0.3997, "step": 48635 }, { "epoch": 1.7529823043932677, "grad_norm": 0.17547442018985748, "learning_rate": 3.770028577578462e-05, "loss": 0.4006, "step": 48640 }, { "epoch": 1.7531625040544925, "grad_norm": 0.18571364879608154, "learning_rate": 3.769777215389604e-05, "loss": 0.4179, "step": 48645 }, { "epoch": 1.753342703715717, "grad_norm": 0.17581316828727722, "learning_rate": 3.7695258359002775e-05, "loss": 0.4329, "step": 48650 }, { "epoch": 1.7535229033769415, "grad_norm": 0.22021692991256714, "learning_rate": 3.769274439113906e-05, "loss": 0.4256, "step": 48655 }, { "epoch": 1.7537031030381662, "grad_norm": 0.16692475974559784, "learning_rate": 3.769023025033917e-05, "loss": 0.4274, "step": 48660 }, { "epoch": 1.753883302699391, "grad_norm": 0.16848334670066833, "learning_rate": 3.768771593663735e-05, "loss": 0.4117, "step": 48665 }, { "epoch": 1.7540635023606157, "grad_norm": 0.2044806033372879, "learning_rate": 3.7685201450067845e-05, "loss": 0.4221, "step": 48670 }, { "epoch": 1.7542437020218402, "grad_norm": 0.16209609806537628, "learning_rate": 3.768268679066494e-05, "loss": 0.389, "step": 48675 }, { "epoch": 1.7544239016830647, "grad_norm": 0.18497797846794128, "learning_rate": 3.7680171958462875e-05, "loss": 0.4079, "step": 48680 }, { "epoch": 1.7546041013442895, "grad_norm": 0.15725673735141754, "learning_rate": 3.767765695349592e-05, "loss": 0.3912, "step": 48685 }, { "epoch": 1.7547843010055142, "grad_norm": 0.19795434176921844, "learning_rate": 3.767514177579836e-05, "loss": 0.4147, "step": 48690 }, { "epoch": 1.7549645006667387, "grad_norm": 0.17479757964611053, "learning_rate": 3.7672626425404436e-05, "loss": 0.4329, "step": 48695 }, { "epoch": 1.7551447003279634, "grad_norm": 0.20223133265972137, "learning_rate": 3.767011090234842e-05, "loss": 0.3739, "step": 48700 }, { "epoch": 1.755324899989188, "grad_norm": 0.2165326178073883, 
"learning_rate": 3.7667595206664606e-05, "loss": 0.4324, "step": 48705 }, { "epoch": 1.7555050996504127, "grad_norm": 0.2068873792886734, "learning_rate": 3.766507933838726e-05, "loss": 0.3831, "step": 48710 }, { "epoch": 1.7556852993116374, "grad_norm": 0.1555756777524948, "learning_rate": 3.7662563297550666e-05, "loss": 0.4153, "step": 48715 }, { "epoch": 1.755865498972862, "grad_norm": 0.21677540242671967, "learning_rate": 3.766004708418909e-05, "loss": 0.4148, "step": 48720 }, { "epoch": 1.7560456986340864, "grad_norm": 0.18430842459201813, "learning_rate": 3.765753069833683e-05, "loss": 0.4374, "step": 48725 }, { "epoch": 1.7562258982953112, "grad_norm": 0.16822421550750732, "learning_rate": 3.7655014140028156e-05, "loss": 0.3953, "step": 48730 }, { "epoch": 1.756406097956536, "grad_norm": 0.18837454915046692, "learning_rate": 3.765249740929737e-05, "loss": 0.3994, "step": 48735 }, { "epoch": 1.7565862976177606, "grad_norm": 0.21585826575756073, "learning_rate": 3.764998050617876e-05, "loss": 0.4374, "step": 48740 }, { "epoch": 1.7567664972789852, "grad_norm": 0.18385829031467438, "learning_rate": 3.7647463430706605e-05, "loss": 0.4148, "step": 48745 }, { "epoch": 1.7569466969402097, "grad_norm": 0.17298060655593872, "learning_rate": 3.7644946182915215e-05, "loss": 0.4257, "step": 48750 }, { "epoch": 1.7571268966014344, "grad_norm": 0.17734064161777496, "learning_rate": 3.764242876283888e-05, "loss": 0.3634, "step": 48755 }, { "epoch": 1.7573070962626591, "grad_norm": 0.19183120131492615, "learning_rate": 3.76399111705119e-05, "loss": 0.3898, "step": 48760 }, { "epoch": 1.7574872959238836, "grad_norm": 0.19240742921829224, "learning_rate": 3.763739340596858e-05, "loss": 0.4226, "step": 48765 }, { "epoch": 1.7576674955851082, "grad_norm": 0.19744504988193512, "learning_rate": 3.763487546924322e-05, "loss": 0.4182, "step": 48770 }, { "epoch": 1.7578476952463329, "grad_norm": 0.15184414386749268, "learning_rate": 3.763235736037014e-05, "loss": 0.368, "step": 
48775 }, { "epoch": 1.7580278949075576, "grad_norm": 0.1860179305076599, "learning_rate": 3.762983907938362e-05, "loss": 0.3983, "step": 48780 }, { "epoch": 1.7582080945687824, "grad_norm": 0.1840306669473648, "learning_rate": 3.7627320626317994e-05, "loss": 0.4329, "step": 48785 }, { "epoch": 1.7583882942300069, "grad_norm": 0.20998184382915497, "learning_rate": 3.762480200120756e-05, "loss": 0.3834, "step": 48790 }, { "epoch": 1.7585684938912314, "grad_norm": 0.22181224822998047, "learning_rate": 3.7622283204086653e-05, "loss": 0.4068, "step": 48795 }, { "epoch": 1.758748693552456, "grad_norm": 0.16627205908298492, "learning_rate": 3.761976423498958e-05, "loss": 0.3937, "step": 48800 }, { "epoch": 1.7589288932136808, "grad_norm": 0.15934346616268158, "learning_rate": 3.761724509395066e-05, "loss": 0.4018, "step": 48805 }, { "epoch": 1.7591090928749054, "grad_norm": 0.1558726578950882, "learning_rate": 3.761472578100422e-05, "loss": 0.4015, "step": 48810 }, { "epoch": 1.7592892925361299, "grad_norm": 0.17845207452774048, "learning_rate": 3.7612206296184594e-05, "loss": 0.3892, "step": 48815 }, { "epoch": 1.7594694921973546, "grad_norm": 0.1628832072019577, "learning_rate": 3.7609686639526086e-05, "loss": 0.394, "step": 48820 }, { "epoch": 1.7596496918585793, "grad_norm": 0.21811261773109436, "learning_rate": 3.760716681106304e-05, "loss": 0.4561, "step": 48825 }, { "epoch": 1.759829891519804, "grad_norm": 0.2100425362586975, "learning_rate": 3.760464681082979e-05, "loss": 0.4025, "step": 48830 }, { "epoch": 1.7600100911810286, "grad_norm": 0.25296419858932495, "learning_rate": 3.760212663886067e-05, "loss": 0.4405, "step": 48835 }, { "epoch": 1.760190290842253, "grad_norm": 0.22018037736415863, "learning_rate": 3.759960629519e-05, "loss": 0.4108, "step": 48840 }, { "epoch": 1.7603704905034778, "grad_norm": 0.17130319774150848, "learning_rate": 3.759708577985215e-05, "loss": 0.4407, "step": 48845 }, { "epoch": 1.7605506901647026, "grad_norm": 0.2034503072500229, 
"learning_rate": 3.759456509288144e-05, "loss": 0.4196, "step": 48850 }, { "epoch": 1.7607308898259273, "grad_norm": 0.17805613577365875, "learning_rate": 3.759204423431222e-05, "loss": 0.4007, "step": 48855 }, { "epoch": 1.7609110894871518, "grad_norm": 0.1614164412021637, "learning_rate": 3.7589523204178836e-05, "loss": 0.4025, "step": 48860 }, { "epoch": 1.7610912891483763, "grad_norm": 0.17002156376838684, "learning_rate": 3.7587002002515623e-05, "loss": 0.4165, "step": 48865 }, { "epoch": 1.761271488809601, "grad_norm": 0.18788275122642517, "learning_rate": 3.758448062935696e-05, "loss": 0.3999, "step": 48870 }, { "epoch": 1.7614516884708258, "grad_norm": 0.2487161010503769, "learning_rate": 3.758195908473717e-05, "loss": 0.4245, "step": 48875 }, { "epoch": 1.7616318881320503, "grad_norm": 0.18604964017868042, "learning_rate": 3.757943736869064e-05, "loss": 0.4501, "step": 48880 }, { "epoch": 1.7618120877932748, "grad_norm": 0.15470033884048462, "learning_rate": 3.757691548125171e-05, "loss": 0.3926, "step": 48885 }, { "epoch": 1.7619922874544995, "grad_norm": 0.1927955448627472, "learning_rate": 3.757439342245473e-05, "loss": 0.4354, "step": 48890 }, { "epoch": 1.7621724871157243, "grad_norm": 0.20000748336315155, "learning_rate": 3.757187119233408e-05, "loss": 0.4443, "step": 48895 }, { "epoch": 1.762352686776949, "grad_norm": 0.18299151957035065, "learning_rate": 3.756934879092412e-05, "loss": 0.3823, "step": 48900 }, { "epoch": 1.7625328864381735, "grad_norm": 0.17722856998443604, "learning_rate": 3.756682621825922e-05, "loss": 0.3988, "step": 48905 }, { "epoch": 1.762713086099398, "grad_norm": 0.20240166783332825, "learning_rate": 3.756430347437374e-05, "loss": 0.4445, "step": 48910 }, { "epoch": 1.7628932857606228, "grad_norm": 0.17536598443984985, "learning_rate": 3.7561780559302064e-05, "loss": 0.4041, "step": 48915 }, { "epoch": 1.7630734854218475, "grad_norm": 0.19098421931266785, "learning_rate": 3.7559257473078554e-05, "loss": 0.4033, "step": 48920 
}, { "epoch": 1.763253685083072, "grad_norm": 0.17195981740951538, "learning_rate": 3.755673421573759e-05, "loss": 0.4251, "step": 48925 }, { "epoch": 1.7634338847442965, "grad_norm": 0.15965524315834045, "learning_rate": 3.7554210787313554e-05, "loss": 0.378, "step": 48930 }, { "epoch": 1.7636140844055213, "grad_norm": 0.20123085379600525, "learning_rate": 3.755168718784083e-05, "loss": 0.4222, "step": 48935 }, { "epoch": 1.763794284066746, "grad_norm": 0.1748918741941452, "learning_rate": 3.75491634173538e-05, "loss": 0.4014, "step": 48940 }, { "epoch": 1.7639744837279707, "grad_norm": 0.1668645590543747, "learning_rate": 3.7546639475886844e-05, "loss": 0.4385, "step": 48945 }, { "epoch": 1.7641546833891952, "grad_norm": 0.17461763322353363, "learning_rate": 3.754411536347435e-05, "loss": 0.3705, "step": 48950 }, { "epoch": 1.7643348830504197, "grad_norm": 0.19829685986042023, "learning_rate": 3.7541591080150725e-05, "loss": 0.4261, "step": 48955 }, { "epoch": 1.7645150827116445, "grad_norm": 0.16521485149860382, "learning_rate": 3.753906662595035e-05, "loss": 0.3963, "step": 48960 }, { "epoch": 1.7646952823728692, "grad_norm": 0.19236351549625397, "learning_rate": 3.75365420009076e-05, "loss": 0.454, "step": 48965 }, { "epoch": 1.7648754820340937, "grad_norm": 0.19585488736629486, "learning_rate": 3.7534017205056915e-05, "loss": 0.4209, "step": 48970 }, { "epoch": 1.7650556816953185, "grad_norm": 0.13935211300849915, "learning_rate": 3.7531492238432656e-05, "loss": 0.386, "step": 48975 }, { "epoch": 1.765235881356543, "grad_norm": 0.182199165225029, "learning_rate": 3.7528967101069254e-05, "loss": 0.4283, "step": 48980 }, { "epoch": 1.7654160810177677, "grad_norm": 0.23854130506515503, "learning_rate": 3.7526441793001094e-05, "loss": 0.4379, "step": 48985 }, { "epoch": 1.7655962806789924, "grad_norm": 0.20375332236289978, "learning_rate": 3.7523916314262585e-05, "loss": 0.3951, "step": 48990 }, { "epoch": 1.765776480340217, "grad_norm": 0.20095662772655487, 
"learning_rate": 3.752139066488815e-05, "loss": 0.4075, "step": 48995 }, { "epoch": 1.7659566800014415, "grad_norm": 0.21261079609394073, "learning_rate": 3.751886484491219e-05, "loss": 0.4467, "step": 49000 }, { "epoch": 1.7659566800014415, "eval_loss": 0.4405106008052826, "eval_runtime": 3.5203, "eval_samples_per_second": 28.407, "eval_steps_per_second": 7.102, "step": 49000 }, { "epoch": 1.7661368796626662, "grad_norm": 0.13927486538887024, "learning_rate": 3.751633885436912e-05, "loss": 0.4111, "step": 49005 }, { "epoch": 1.766317079323891, "grad_norm": 0.18607810139656067, "learning_rate": 3.751381269329335e-05, "loss": 0.4157, "step": 49010 }, { "epoch": 1.7664972789851157, "grad_norm": 0.20816226303577423, "learning_rate": 3.751128636171931e-05, "loss": 0.4413, "step": 49015 }, { "epoch": 1.7666774786463402, "grad_norm": 0.17857316136360168, "learning_rate": 3.7508759859681416e-05, "loss": 0.4119, "step": 49020 }, { "epoch": 1.7668576783075647, "grad_norm": 0.24799132347106934, "learning_rate": 3.750623318721409e-05, "loss": 0.3855, "step": 49025 }, { "epoch": 1.7670378779687894, "grad_norm": 0.15814100205898285, "learning_rate": 3.7503706344351766e-05, "loss": 0.3871, "step": 49030 }, { "epoch": 1.7672180776300142, "grad_norm": 0.19214874505996704, "learning_rate": 3.7501179331128844e-05, "loss": 0.391, "step": 49035 }, { "epoch": 1.7673982772912387, "grad_norm": 0.17339515686035156, "learning_rate": 3.7498652147579786e-05, "loss": 0.3669, "step": 49040 }, { "epoch": 1.7675784769524632, "grad_norm": 0.1755550056695938, "learning_rate": 3.749612479373902e-05, "loss": 0.4258, "step": 49045 }, { "epoch": 1.767758676613688, "grad_norm": 0.18907277286052704, "learning_rate": 3.749359726964096e-05, "loss": 0.398, "step": 49050 }, { "epoch": 1.7679388762749126, "grad_norm": 0.1572108417749405, "learning_rate": 3.749106957532006e-05, "loss": 0.4311, "step": 49055 }, { "epoch": 1.7681190759361374, "grad_norm": 0.17797333002090454, "learning_rate": 
3.748854171081076e-05, "loss": 0.3821, "step": 49060 }, { "epoch": 1.7682992755973619, "grad_norm": 0.18352951109409332, "learning_rate": 3.7486013676147495e-05, "loss": 0.3894, "step": 49065 }, { "epoch": 1.7684794752585864, "grad_norm": 0.22129513323307037, "learning_rate": 3.748348547136471e-05, "loss": 0.4171, "step": 49070 }, { "epoch": 1.7686596749198111, "grad_norm": 0.17449241876602173, "learning_rate": 3.748095709649685e-05, "loss": 0.3955, "step": 49075 }, { "epoch": 1.7688398745810359, "grad_norm": 0.18049213290214539, "learning_rate": 3.747842855157836e-05, "loss": 0.3908, "step": 49080 }, { "epoch": 1.7690200742422604, "grad_norm": 0.22616390883922577, "learning_rate": 3.747589983664371e-05, "loss": 0.4427, "step": 49085 }, { "epoch": 1.769200273903485, "grad_norm": 0.17422033846378326, "learning_rate": 3.7473370951727335e-05, "loss": 0.4336, "step": 49090 }, { "epoch": 1.7693804735647096, "grad_norm": 0.20560961961746216, "learning_rate": 3.74708418968637e-05, "loss": 0.4182, "step": 49095 }, { "epoch": 1.7695606732259344, "grad_norm": 0.208149254322052, "learning_rate": 3.7468312672087245e-05, "loss": 0.3874, "step": 49100 }, { "epoch": 1.769740872887159, "grad_norm": 0.18670162558555603, "learning_rate": 3.746578327743246e-05, "loss": 0.4345, "step": 49105 }, { "epoch": 1.7699210725483836, "grad_norm": 0.17205697298049927, "learning_rate": 3.746325371293379e-05, "loss": 0.4163, "step": 49110 }, { "epoch": 1.7701012722096081, "grad_norm": 0.16193123161792755, "learning_rate": 3.746072397862569e-05, "loss": 0.4363, "step": 49115 }, { "epoch": 1.7702814718708328, "grad_norm": 0.16992096602916718, "learning_rate": 3.7458194074542643e-05, "loss": 0.4117, "step": 49120 }, { "epoch": 1.7704616715320576, "grad_norm": 0.19616004824638367, "learning_rate": 3.7455664000719113e-05, "loss": 0.4117, "step": 49125 }, { "epoch": 1.7706418711932823, "grad_norm": 0.21857887506484985, "learning_rate": 3.745313375718957e-05, "loss": 0.3996, "step": 49130 }, { "epoch": 
1.7708220708545068, "grad_norm": 0.2159646898508072, "learning_rate": 3.74506033439885e-05, "loss": 0.414, "step": 49135 }, { "epoch": 1.7710022705157313, "grad_norm": 0.18626536428928375, "learning_rate": 3.744807276115036e-05, "loss": 0.3965, "step": 49140 }, { "epoch": 1.771182470176956, "grad_norm": 0.19798725843429565, "learning_rate": 3.744554200870965e-05, "loss": 0.421, "step": 49145 }, { "epoch": 1.7713626698381808, "grad_norm": 0.170378640294075, "learning_rate": 3.744301108670083e-05, "loss": 0.3878, "step": 49150 }, { "epoch": 1.7715428694994053, "grad_norm": 0.1625005006790161, "learning_rate": 3.744047999515839e-05, "loss": 0.3824, "step": 49155 }, { "epoch": 1.7717230691606298, "grad_norm": 0.21197515726089478, "learning_rate": 3.743794873411682e-05, "loss": 0.4288, "step": 49160 }, { "epoch": 1.7719032688218546, "grad_norm": 0.21333040297031403, "learning_rate": 3.743541730361062e-05, "loss": 0.4542, "step": 49165 }, { "epoch": 1.7720834684830793, "grad_norm": 0.18313553929328918, "learning_rate": 3.743288570367426e-05, "loss": 0.4284, "step": 49170 }, { "epoch": 1.772263668144304, "grad_norm": 0.1629858762025833, "learning_rate": 3.7430353934342235e-05, "loss": 0.4091, "step": 49175 }, { "epoch": 1.7724438678055285, "grad_norm": 0.1397615224123001, "learning_rate": 3.7427821995649044e-05, "loss": 0.4355, "step": 49180 }, { "epoch": 1.772624067466753, "grad_norm": 0.18151454627513885, "learning_rate": 3.742528988762919e-05, "loss": 0.4128, "step": 49185 }, { "epoch": 1.7728042671279778, "grad_norm": 0.17477920651435852, "learning_rate": 3.742275761031716e-05, "loss": 0.4322, "step": 49190 }, { "epoch": 1.7729844667892025, "grad_norm": 0.15377044677734375, "learning_rate": 3.742022516374747e-05, "loss": 0.3747, "step": 49195 }, { "epoch": 1.773164666450427, "grad_norm": 0.22250111401081085, "learning_rate": 3.741769254795461e-05, "loss": 0.4256, "step": 49200 }, { "epoch": 1.7733448661116518, "grad_norm": 0.20984488725662231, "learning_rate": 
3.7415159762973094e-05, "loss": 0.4101, "step": 49205 }, { "epoch": 1.7735250657728763, "grad_norm": 0.15702414512634277, "learning_rate": 3.741262680883743e-05, "loss": 0.4372, "step": 49210 }, { "epoch": 1.773705265434101, "grad_norm": 0.170258030295372, "learning_rate": 3.7410093685582135e-05, "loss": 0.3799, "step": 49215 }, { "epoch": 1.7738854650953257, "grad_norm": 0.15923821926116943, "learning_rate": 3.740756039324171e-05, "loss": 0.414, "step": 49220 }, { "epoch": 1.7740656647565503, "grad_norm": 0.15464822947978973, "learning_rate": 3.7405026931850676e-05, "loss": 0.4139, "step": 49225 }, { "epoch": 1.7742458644177748, "grad_norm": 0.16088750958442688, "learning_rate": 3.7402493301443556e-05, "loss": 0.3763, "step": 49230 }, { "epoch": 1.7744260640789995, "grad_norm": 0.1694214642047882, "learning_rate": 3.739995950205487e-05, "loss": 0.3844, "step": 49235 }, { "epoch": 1.7746062637402242, "grad_norm": 0.16688519716262817, "learning_rate": 3.739742553371913e-05, "loss": 0.4343, "step": 49240 }, { "epoch": 1.774786463401449, "grad_norm": 0.1897389143705368, "learning_rate": 3.7394891396470866e-05, "loss": 0.4178, "step": 49245 }, { "epoch": 1.7749666630626735, "grad_norm": 0.19561418890953064, "learning_rate": 3.739235709034461e-05, "loss": 0.3847, "step": 49250 }, { "epoch": 1.775146862723898, "grad_norm": 0.17393264174461365, "learning_rate": 3.7389822615374884e-05, "loss": 0.4522, "step": 49255 }, { "epoch": 1.7753270623851227, "grad_norm": 0.18216289579868317, "learning_rate": 3.738728797159623e-05, "loss": 0.4269, "step": 49260 }, { "epoch": 1.7755072620463475, "grad_norm": 0.1977429836988449, "learning_rate": 3.738475315904317e-05, "loss": 0.3856, "step": 49265 }, { "epoch": 1.775687461707572, "grad_norm": 0.21220578253269196, "learning_rate": 3.738221817775025e-05, "loss": 0.4363, "step": 49270 }, { "epoch": 1.7758676613687965, "grad_norm": 0.1830170750617981, "learning_rate": 3.7379683027752e-05, "loss": 0.4057, "step": 49275 }, { "epoch": 
1.7760478610300212, "grad_norm": 0.1913861781358719, "learning_rate": 3.7377147709082966e-05, "loss": 0.4363, "step": 49280 }, { "epoch": 1.776228060691246, "grad_norm": 0.16212737560272217, "learning_rate": 3.7374612221777694e-05, "loss": 0.3936, "step": 49285 }, { "epoch": 1.7764082603524707, "grad_norm": 0.2292328029870987, "learning_rate": 3.737207656587073e-05, "loss": 0.417, "step": 49290 }, { "epoch": 1.7765884600136952, "grad_norm": 0.19917726516723633, "learning_rate": 3.7369540741396614e-05, "loss": 0.4154, "step": 49295 }, { "epoch": 1.7767686596749197, "grad_norm": 0.18198366463184357, "learning_rate": 3.7367004748389897e-05, "loss": 0.4043, "step": 49300 }, { "epoch": 1.7769488593361444, "grad_norm": 0.2522968053817749, "learning_rate": 3.736446858688513e-05, "loss": 0.4596, "step": 49305 }, { "epoch": 1.7771290589973692, "grad_norm": 0.21617339551448822, "learning_rate": 3.736193225691689e-05, "loss": 0.4162, "step": 49310 }, { "epoch": 1.7773092586585937, "grad_norm": 0.1531769037246704, "learning_rate": 3.73593957585197e-05, "loss": 0.4273, "step": 49315 }, { "epoch": 1.7774894583198182, "grad_norm": 0.16418246924877167, "learning_rate": 3.735685909172815e-05, "loss": 0.3954, "step": 49320 }, { "epoch": 1.777669657981043, "grad_norm": 0.1850021332502365, "learning_rate": 3.7354322256576765e-05, "loss": 0.4432, "step": 49325 }, { "epoch": 1.7778498576422677, "grad_norm": 0.14618489146232605, "learning_rate": 3.735178525310015e-05, "loss": 0.3764, "step": 49330 }, { "epoch": 1.7780300573034924, "grad_norm": 0.22677135467529297, "learning_rate": 3.734924808133285e-05, "loss": 0.4131, "step": 49335 }, { "epoch": 1.778210256964717, "grad_norm": 0.1776285022497177, "learning_rate": 3.734671074130943e-05, "loss": 0.4312, "step": 49340 }, { "epoch": 1.7783904566259414, "grad_norm": 0.1667148470878601, "learning_rate": 3.734417323306447e-05, "loss": 0.3951, "step": 49345 }, { "epoch": 1.7785706562871662, "grad_norm": 0.18254049122333527, "learning_rate": 
3.7341635556632544e-05, "loss": 0.4046, "step": 49350 }, { "epoch": 1.7787508559483909, "grad_norm": 0.18212547898292542, "learning_rate": 3.733909771204821e-05, "loss": 0.4332, "step": 49355 }, { "epoch": 1.7789310556096156, "grad_norm": 0.19516201317310333, "learning_rate": 3.733655969934607e-05, "loss": 0.4102, "step": 49360 }, { "epoch": 1.7791112552708401, "grad_norm": 0.17682771384716034, "learning_rate": 3.733402151856069e-05, "loss": 0.4214, "step": 49365 }, { "epoch": 1.7792914549320646, "grad_norm": 0.1816743165254593, "learning_rate": 3.733148316972665e-05, "loss": 0.4073, "step": 49370 }, { "epoch": 1.7794716545932894, "grad_norm": 0.20628765225410461, "learning_rate": 3.732894465287854e-05, "loss": 0.4256, "step": 49375 }, { "epoch": 1.779651854254514, "grad_norm": 0.1907653659582138, "learning_rate": 3.732640596805096e-05, "loss": 0.4183, "step": 49380 }, { "epoch": 1.7798320539157386, "grad_norm": 0.18405283987522125, "learning_rate": 3.732386711527847e-05, "loss": 0.4181, "step": 49385 }, { "epoch": 1.7800122535769631, "grad_norm": 0.24257473647594452, "learning_rate": 3.7321328094595685e-05, "loss": 0.4125, "step": 49390 }, { "epoch": 1.7801924532381879, "grad_norm": 0.19096507132053375, "learning_rate": 3.731878890603718e-05, "loss": 0.3772, "step": 49395 }, { "epoch": 1.7803726528994126, "grad_norm": 0.19265110790729523, "learning_rate": 3.731624954963757e-05, "loss": 0.4049, "step": 49400 }, { "epoch": 1.7805528525606373, "grad_norm": 0.24484902620315552, "learning_rate": 3.731371002543144e-05, "loss": 0.4594, "step": 49405 }, { "epoch": 1.7807330522218618, "grad_norm": 0.19407442212104797, "learning_rate": 3.73111703334534e-05, "loss": 0.3982, "step": 49410 }, { "epoch": 1.7809132518830864, "grad_norm": 0.18388943374156952, "learning_rate": 3.7308630473738046e-05, "loss": 0.3976, "step": 49415 }, { "epoch": 1.781093451544311, "grad_norm": 0.15484626591205597, "learning_rate": 3.730609044631998e-05, "loss": 0.4456, "step": 49420 }, { "epoch": 
1.7812736512055358, "grad_norm": 0.18442393839359283, "learning_rate": 3.7303550251233824e-05, "loss": 0.3943, "step": 49425 }, { "epoch": 1.7814538508667603, "grad_norm": 0.18085482716560364, "learning_rate": 3.730100988851417e-05, "loss": 0.4069, "step": 49430 }, { "epoch": 1.7816340505279848, "grad_norm": 0.21936087310314178, "learning_rate": 3.7298469358195635e-05, "loss": 0.4314, "step": 49435 }, { "epoch": 1.7818142501892096, "grad_norm": 0.18822318315505981, "learning_rate": 3.729592866031284e-05, "loss": 0.4145, "step": 49440 }, { "epoch": 1.7819944498504343, "grad_norm": 0.2043982595205307, "learning_rate": 3.729338779490039e-05, "loss": 0.429, "step": 49445 }, { "epoch": 1.782174649511659, "grad_norm": 0.1583716869354248, "learning_rate": 3.7290846761992924e-05, "loss": 0.4125, "step": 49450 }, { "epoch": 1.7823548491728836, "grad_norm": 0.20296910405158997, "learning_rate": 3.728830556162505e-05, "loss": 0.4143, "step": 49455 }, { "epoch": 1.782535048834108, "grad_norm": 0.20263749361038208, "learning_rate": 3.7285764193831384e-05, "loss": 0.3639, "step": 49460 }, { "epoch": 1.7827152484953328, "grad_norm": 0.15118259191513062, "learning_rate": 3.728322265864656e-05, "loss": 0.4173, "step": 49465 }, { "epoch": 1.7828954481565575, "grad_norm": 0.15944749116897583, "learning_rate": 3.7280680956105206e-05, "loss": 0.3725, "step": 49470 }, { "epoch": 1.783075647817782, "grad_norm": 0.1850617229938507, "learning_rate": 3.727813908624196e-05, "loss": 0.4237, "step": 49475 }, { "epoch": 1.7832558474790068, "grad_norm": 0.17814558744430542, "learning_rate": 3.727559704909144e-05, "loss": 0.4384, "step": 49480 }, { "epoch": 1.7834360471402313, "grad_norm": 0.18063758313655853, "learning_rate": 3.727305484468828e-05, "loss": 0.4244, "step": 49485 }, { "epoch": 1.783616246801456, "grad_norm": 0.21019330620765686, "learning_rate": 3.727051247306713e-05, "loss": 0.3935, "step": 49490 }, { "epoch": 1.7837964464626808, "grad_norm": 0.17506244778633118, "learning_rate": 
3.726796993426263e-05, "loss": 0.4377, "step": 49495 }, { "epoch": 1.7839766461239053, "grad_norm": 0.17135731875896454, "learning_rate": 3.726542722830941e-05, "loss": 0.3957, "step": 49500 }, { "epoch": 1.7839766461239053, "eval_loss": 0.4405691623687744, "eval_runtime": 3.5268, "eval_samples_per_second": 28.354, "eval_steps_per_second": 7.089, "step": 49500 }, { "epoch": 1.7841568457851298, "grad_norm": 0.17667745053768158, "learning_rate": 3.7262884355242116e-05, "loss": 0.4162, "step": 49505 }, { "epoch": 1.7843370454463545, "grad_norm": 0.2062585949897766, "learning_rate": 3.7260341315095394e-05, "loss": 0.3877, "step": 49510 }, { "epoch": 1.7845172451075793, "grad_norm": 0.1851799339056015, "learning_rate": 3.72577981079039e-05, "loss": 0.4212, "step": 49515 }, { "epoch": 1.784697444768804, "grad_norm": 0.1553850919008255, "learning_rate": 3.725525473370228e-05, "loss": 0.4183, "step": 49520 }, { "epoch": 1.7848776444300285, "grad_norm": 0.18418540060520172, "learning_rate": 3.725271119252519e-05, "loss": 0.4034, "step": 49525 }, { "epoch": 1.785057844091253, "grad_norm": 0.21145012974739075, "learning_rate": 3.7250167484407274e-05, "loss": 0.4491, "step": 49530 }, { "epoch": 1.7852380437524777, "grad_norm": 0.18339501321315765, "learning_rate": 3.7247623609383206e-05, "loss": 0.4113, "step": 49535 }, { "epoch": 1.7854182434137025, "grad_norm": 0.1943238079547882, "learning_rate": 3.7245079567487635e-05, "loss": 0.3917, "step": 49540 }, { "epoch": 1.785598443074927, "grad_norm": 0.15589359402656555, "learning_rate": 3.724253535875522e-05, "loss": 0.4486, "step": 49545 }, { "epoch": 1.7857786427361515, "grad_norm": 0.25339236855506897, "learning_rate": 3.723999098322064e-05, "loss": 0.3768, "step": 49550 }, { "epoch": 1.7859588423973762, "grad_norm": 0.2046034038066864, "learning_rate": 3.7237446440918545e-05, "loss": 0.4172, "step": 49555 }, { "epoch": 1.786139042058601, "grad_norm": 0.19488854706287384, "learning_rate": 3.723490173188362e-05, "loss": 
0.4283, "step": 49560 }, { "epoch": 1.7863192417198257, "grad_norm": 0.16177228093147278, "learning_rate": 3.723235685615052e-05, "loss": 0.3981, "step": 49565 }, { "epoch": 1.7864994413810502, "grad_norm": 0.15859396755695343, "learning_rate": 3.722981181375392e-05, "loss": 0.399, "step": 49570 }, { "epoch": 1.7866796410422747, "grad_norm": 0.17841722071170807, "learning_rate": 3.7227266604728516e-05, "loss": 0.4186, "step": 49575 }, { "epoch": 1.7868598407034995, "grad_norm": 0.22205208241939545, "learning_rate": 3.722472122910896e-05, "loss": 0.4336, "step": 49580 }, { "epoch": 1.7870400403647242, "grad_norm": 0.17837925255298615, "learning_rate": 3.7222175686929947e-05, "loss": 0.4137, "step": 49585 }, { "epoch": 1.7872202400259487, "grad_norm": 0.19479599595069885, "learning_rate": 3.7219629978226165e-05, "loss": 0.4216, "step": 49590 }, { "epoch": 1.7874004396871734, "grad_norm": 0.19044643640518188, "learning_rate": 3.7217084103032284e-05, "loss": 0.4043, "step": 49595 }, { "epoch": 1.787580639348398, "grad_norm": 0.17032626271247864, "learning_rate": 3.7214538061383e-05, "loss": 0.4044, "step": 49600 }, { "epoch": 1.7877608390096227, "grad_norm": 0.19500906765460968, "learning_rate": 3.7211991853312996e-05, "loss": 0.3888, "step": 49605 }, { "epoch": 1.7879410386708474, "grad_norm": 0.20601950585842133, "learning_rate": 3.720944547885697e-05, "loss": 0.3977, "step": 49610 }, { "epoch": 1.788121238332072, "grad_norm": 0.15395066142082214, "learning_rate": 3.720689893804962e-05, "loss": 0.4093, "step": 49615 }, { "epoch": 1.7883014379932964, "grad_norm": 0.21535900235176086, "learning_rate": 3.720435223092562e-05, "loss": 0.4189, "step": 49620 }, { "epoch": 1.7884816376545212, "grad_norm": 0.1768050193786621, "learning_rate": 3.720180535751969e-05, "loss": 0.4222, "step": 49625 }, { "epoch": 1.788661837315746, "grad_norm": 0.1698492169380188, "learning_rate": 3.719925831786653e-05, "loss": 0.3944, "step": 49630 }, { "epoch": 1.7888420369769706, "grad_norm": 
0.1947326809167862, "learning_rate": 3.7196711112000835e-05, "loss": 0.43, "step": 49635 }, { "epoch": 1.7890222366381952, "grad_norm": 0.15801461040973663, "learning_rate": 3.7194163739957306e-05, "loss": 0.4035, "step": 49640 }, { "epoch": 1.7892024362994197, "grad_norm": 0.2287774682044983, "learning_rate": 3.719161620177066e-05, "loss": 0.4457, "step": 49645 }, { "epoch": 1.7893826359606444, "grad_norm": 0.19257451593875885, "learning_rate": 3.71890684974756e-05, "loss": 0.4043, "step": 49650 }, { "epoch": 1.7895628356218691, "grad_norm": 0.22990871965885162, "learning_rate": 3.7186520627106855e-05, "loss": 0.4, "step": 49655 }, { "epoch": 1.7897430352830936, "grad_norm": 0.17866025865077972, "learning_rate": 3.718397259069911e-05, "loss": 0.418, "step": 49660 }, { "epoch": 1.7899232349443182, "grad_norm": 0.18929140269756317, "learning_rate": 3.718142438828711e-05, "loss": 0.408, "step": 49665 }, { "epoch": 1.7901034346055429, "grad_norm": 0.19318966567516327, "learning_rate": 3.717887601990555e-05, "loss": 0.4305, "step": 49670 }, { "epoch": 1.7902836342667676, "grad_norm": 0.21049553155899048, "learning_rate": 3.717632748558917e-05, "loss": 0.4117, "step": 49675 }, { "epoch": 1.7904638339279924, "grad_norm": 0.20448461174964905, "learning_rate": 3.717377878537269e-05, "loss": 0.4479, "step": 49680 }, { "epoch": 1.7906440335892169, "grad_norm": 0.15858854353427887, "learning_rate": 3.717122991929082e-05, "loss": 0.3854, "step": 49685 }, { "epoch": 1.7908242332504414, "grad_norm": 0.18554039299488068, "learning_rate": 3.7168680887378306e-05, "loss": 0.4105, "step": 49690 }, { "epoch": 1.7910044329116661, "grad_norm": 0.21041366457939148, "learning_rate": 3.716613168966986e-05, "loss": 0.4343, "step": 49695 }, { "epoch": 1.7911846325728908, "grad_norm": 0.18072649836540222, "learning_rate": 3.716358232620024e-05, "loss": 0.4132, "step": 49700 }, { "epoch": 1.7913648322341154, "grad_norm": 0.16237974166870117, "learning_rate": 3.716103279700416e-05, "loss": 
0.4437, "step": 49705 }, { "epoch": 1.79154503189534, "grad_norm": 0.21563240885734558, "learning_rate": 3.7158483102116354e-05, "loss": 0.4425, "step": 49710 }, { "epoch": 1.7917252315565646, "grad_norm": 0.18444405496120453, "learning_rate": 3.715593324157158e-05, "loss": 0.4327, "step": 49715 }, { "epoch": 1.7919054312177893, "grad_norm": 0.20448465645313263, "learning_rate": 3.715338321540457e-05, "loss": 0.414, "step": 49720 }, { "epoch": 1.792085630879014, "grad_norm": 0.1820807009935379, "learning_rate": 3.715083302365006e-05, "loss": 0.407, "step": 49725 }, { "epoch": 1.7922658305402386, "grad_norm": 0.17193655669689178, "learning_rate": 3.7148282666342804e-05, "loss": 0.4177, "step": 49730 }, { "epoch": 1.792446030201463, "grad_norm": 0.1984802633523941, "learning_rate": 3.714573214351754e-05, "loss": 0.4338, "step": 49735 }, { "epoch": 1.7926262298626878, "grad_norm": 0.18184858560562134, "learning_rate": 3.714318145520905e-05, "loss": 0.4176, "step": 49740 }, { "epoch": 1.7928064295239126, "grad_norm": 0.16971564292907715, "learning_rate": 3.7140630601452045e-05, "loss": 0.3892, "step": 49745 }, { "epoch": 1.7929866291851373, "grad_norm": 0.15988574922084808, "learning_rate": 3.7138079582281306e-05, "loss": 0.4246, "step": 49750 }, { "epoch": 1.7931668288463618, "grad_norm": 0.1885288655757904, "learning_rate": 3.713552839773158e-05, "loss": 0.4083, "step": 49755 }, { "epoch": 1.7933470285075863, "grad_norm": 0.15891823172569275, "learning_rate": 3.713297704783763e-05, "loss": 0.3774, "step": 49760 }, { "epoch": 1.793527228168811, "grad_norm": 0.21999859809875488, "learning_rate": 3.7130425532634214e-05, "loss": 0.4091, "step": 49765 }, { "epoch": 1.7937074278300358, "grad_norm": 0.21735869348049164, "learning_rate": 3.71278738521561e-05, "loss": 0.4353, "step": 49770 }, { "epoch": 1.7938876274912603, "grad_norm": 0.17658278346061707, "learning_rate": 3.7125322006438055e-05, "loss": 0.3801, "step": 49775 }, { "epoch": 1.7940678271524848, "grad_norm": 
0.17942824959754944, "learning_rate": 3.712276999551485e-05, "loss": 0.4222, "step": 49780 }, { "epoch": 1.7942480268137095, "grad_norm": 0.17095422744750977, "learning_rate": 3.712021781942124e-05, "loss": 0.3908, "step": 49785 }, { "epoch": 1.7944282264749343, "grad_norm": 0.20140354335308075, "learning_rate": 3.711766547819201e-05, "loss": 0.3921, "step": 49790 }, { "epoch": 1.794608426136159, "grad_norm": 0.1793317049741745, "learning_rate": 3.711511297186194e-05, "loss": 0.4273, "step": 49795 }, { "epoch": 1.7947886257973835, "grad_norm": 0.1982317417860031, "learning_rate": 3.711256030046581e-05, "loss": 0.3903, "step": 49800 }, { "epoch": 1.794968825458608, "grad_norm": 0.18103168904781342, "learning_rate": 3.7110007464038375e-05, "loss": 0.4436, "step": 49805 }, { "epoch": 1.7951490251198328, "grad_norm": 0.20677992701530457, "learning_rate": 3.710745446261444e-05, "loss": 0.3946, "step": 49810 }, { "epoch": 1.7953292247810575, "grad_norm": 0.17719873785972595, "learning_rate": 3.710490129622878e-05, "loss": 0.4143, "step": 49815 }, { "epoch": 1.795509424442282, "grad_norm": 0.15315468609333038, "learning_rate": 3.7102347964916183e-05, "loss": 0.3798, "step": 49820 }, { "epoch": 1.7956896241035065, "grad_norm": 0.15976719558238983, "learning_rate": 3.709979446871144e-05, "loss": 0.4025, "step": 49825 }, { "epoch": 1.7958698237647313, "grad_norm": 0.19692429900169373, "learning_rate": 3.7097240807649334e-05, "loss": 0.4113, "step": 49830 }, { "epoch": 1.796050023425956, "grad_norm": 0.2119201272726059, "learning_rate": 3.709468698176467e-05, "loss": 0.4152, "step": 49835 }, { "epoch": 1.7962302230871807, "grad_norm": 0.17838400602340698, "learning_rate": 3.7092132991092236e-05, "loss": 0.4198, "step": 49840 }, { "epoch": 1.7964104227484052, "grad_norm": 0.17005471885204315, "learning_rate": 3.7089578835666834e-05, "loss": 0.3886, "step": 49845 }, { "epoch": 1.7965906224096297, "grad_norm": 0.18709275126457214, "learning_rate": 3.708702451552326e-05, "loss": 
0.4055, "step": 49850 }, { "epoch": 1.7967708220708545, "grad_norm": 0.21572504937648773, "learning_rate": 3.708447003069631e-05, "loss": 0.4095, "step": 49855 }, { "epoch": 1.7969510217320792, "grad_norm": 0.17971207201480865, "learning_rate": 3.70819153812208e-05, "loss": 0.4189, "step": 49860 }, { "epoch": 1.797131221393304, "grad_norm": 0.20288041234016418, "learning_rate": 3.707936056713154e-05, "loss": 0.4218, "step": 49865 }, { "epoch": 1.7973114210545285, "grad_norm": 0.18527214229106903, "learning_rate": 3.7076805588463324e-05, "loss": 0.4071, "step": 49870 }, { "epoch": 1.797491620715753, "grad_norm": 0.17355002462863922, "learning_rate": 3.7074250445250954e-05, "loss": 0.408, "step": 49875 }, { "epoch": 1.7976718203769777, "grad_norm": 0.16921040415763855, "learning_rate": 3.707169513752928e-05, "loss": 0.4111, "step": 49880 }, { "epoch": 1.7978520200382024, "grad_norm": 0.1597539633512497, "learning_rate": 3.70691396653331e-05, "loss": 0.4486, "step": 49885 }, { "epoch": 1.798032219699427, "grad_norm": 0.1976759433746338, "learning_rate": 3.7066584028697224e-05, "loss": 0.3784, "step": 49890 }, { "epoch": 1.7982124193606515, "grad_norm": 0.1847647726535797, "learning_rate": 3.706402822765647e-05, "loss": 0.4264, "step": 49895 }, { "epoch": 1.7983926190218762, "grad_norm": 0.2059379369020462, "learning_rate": 3.7061472262245664e-05, "loss": 0.3677, "step": 49900 }, { "epoch": 1.798572818683101, "grad_norm": 0.18027707934379578, "learning_rate": 3.7058916132499645e-05, "loss": 0.4286, "step": 49905 }, { "epoch": 1.7987530183443257, "grad_norm": 0.1781684011220932, "learning_rate": 3.705635983845322e-05, "loss": 0.447, "step": 49910 }, { "epoch": 1.7989332180055502, "grad_norm": 0.14217472076416016, "learning_rate": 3.7053803380141233e-05, "loss": 0.3952, "step": 49915 }, { "epoch": 1.7991134176667747, "grad_norm": 0.17906373739242554, "learning_rate": 3.70512467575985e-05, "loss": 0.4536, "step": 49920 }, { "epoch": 1.7992936173279994, "grad_norm": 
0.18036003410816193, "learning_rate": 3.7048689970859874e-05, "loss": 0.3784, "step": 49925 }, { "epoch": 1.7994738169892242, "grad_norm": 0.15445788204669952, "learning_rate": 3.704613301996017e-05, "loss": 0.4119, "step": 49930 }, { "epoch": 1.7996540166504487, "grad_norm": 0.20820528268814087, "learning_rate": 3.7043575904934246e-05, "loss": 0.4091, "step": 49935 }, { "epoch": 1.7998342163116732, "grad_norm": 0.1830679327249527, "learning_rate": 3.7041018625816926e-05, "loss": 0.4069, "step": 49940 }, { "epoch": 1.800014415972898, "grad_norm": 0.18425950407981873, "learning_rate": 3.703846118264306e-05, "loss": 0.4254, "step": 49945 }, { "epoch": 1.8001946156341226, "grad_norm": 0.19316412508487701, "learning_rate": 3.703590357544749e-05, "loss": 0.4153, "step": 49950 }, { "epoch": 1.8003748152953474, "grad_norm": 0.17735867202281952, "learning_rate": 3.7033345804265054e-05, "loss": 0.3632, "step": 49955 }, { "epoch": 1.8005550149565719, "grad_norm": 0.22728107869625092, "learning_rate": 3.703078786913063e-05, "loss": 0.3849, "step": 49960 }, { "epoch": 1.8007352146177964, "grad_norm": 0.16942258179187775, "learning_rate": 3.702822977007904e-05, "loss": 0.3961, "step": 49965 }, { "epoch": 1.8009154142790211, "grad_norm": 0.1619909405708313, "learning_rate": 3.7025671507145156e-05, "loss": 0.4082, "step": 49970 }, { "epoch": 1.8010956139402459, "grad_norm": 0.17971543967723846, "learning_rate": 3.702311308036381e-05, "loss": 0.4318, "step": 49975 }, { "epoch": 1.8012758136014704, "grad_norm": 0.19710752367973328, "learning_rate": 3.702055448976989e-05, "loss": 0.4109, "step": 49980 }, { "epoch": 1.801456013262695, "grad_norm": 0.21794439852237701, "learning_rate": 3.7017995735398237e-05, "loss": 0.3768, "step": 49985 }, { "epoch": 1.8016362129239196, "grad_norm": 0.21101748943328857, "learning_rate": 3.7015436817283724e-05, "loss": 0.4516, "step": 49990 }, { "epoch": 1.8018164125851444, "grad_norm": 0.21973469853401184, "learning_rate": 3.70128777354612e-05, 
"loss": 0.4238, "step": 49995 }, { "epoch": 1.801996612246369, "grad_norm": 0.1689206063747406, "learning_rate": 3.701031848996555e-05, "loss": 0.4313, "step": 50000 }, { "epoch": 1.801996612246369, "eval_loss": 0.4403511881828308, "eval_runtime": 3.5415, "eval_samples_per_second": 28.237, "eval_steps_per_second": 7.059, "step": 50000 }, { "epoch": 1.8021768119075936, "grad_norm": 0.17889846861362457, "learning_rate": 3.700775908083164e-05, "loss": 0.4117, "step": 50005 }, { "epoch": 1.8023570115688181, "grad_norm": 0.18117192387580872, "learning_rate": 3.7005199508094326e-05, "loss": 0.4065, "step": 50010 }, { "epoch": 1.8025372112300428, "grad_norm": 0.18847104907035828, "learning_rate": 3.700263977178851e-05, "loss": 0.4004, "step": 50015 }, { "epoch": 1.8027174108912676, "grad_norm": 0.19416354596614838, "learning_rate": 3.700007987194903e-05, "loss": 0.4118, "step": 50020 }, { "epoch": 1.8028976105524923, "grad_norm": 0.19143450260162354, "learning_rate": 3.69975198086108e-05, "loss": 0.4157, "step": 50025 }, { "epoch": 1.8030778102137168, "grad_norm": 0.17741017043590546, "learning_rate": 3.699495958180868e-05, "loss": 0.3936, "step": 50030 }, { "epoch": 1.8032580098749413, "grad_norm": 0.18034659326076508, "learning_rate": 3.6992399191577554e-05, "loss": 0.4407, "step": 50035 }, { "epoch": 1.803438209536166, "grad_norm": 0.17005987465381622, "learning_rate": 3.698983863795232e-05, "loss": 0.4033, "step": 50040 }, { "epoch": 1.8036184091973908, "grad_norm": 0.19476930797100067, "learning_rate": 3.698727792096785e-05, "loss": 0.4351, "step": 50045 }, { "epoch": 1.8037986088586153, "grad_norm": 0.2065165936946869, "learning_rate": 3.698471704065904e-05, "loss": 0.4046, "step": 50050 }, { "epoch": 1.8039788085198398, "grad_norm": 0.21584558486938477, "learning_rate": 3.698215599706078e-05, "loss": 0.3787, "step": 50055 }, { "epoch": 1.8041590081810646, "grad_norm": 0.19947922229766846, "learning_rate": 3.6979594790207964e-05, "loss": 0.4095, "step": 50060 }, { 
"epoch": 1.8043392078422893, "grad_norm": 0.1538824588060379, "learning_rate": 3.697703342013549e-05, "loss": 0.39, "step": 50065 }, { "epoch": 1.804519407503514, "grad_norm": 0.19563712179660797, "learning_rate": 3.6974471886878255e-05, "loss": 0.4194, "step": 50070 }, { "epoch": 1.8046996071647385, "grad_norm": 0.18470360338687897, "learning_rate": 3.697191019047116e-05, "loss": 0.3901, "step": 50075 }, { "epoch": 1.804879806825963, "grad_norm": 0.24419862031936646, "learning_rate": 3.69693483309491e-05, "loss": 0.4253, "step": 50080 }, { "epoch": 1.8050600064871878, "grad_norm": 0.1856248527765274, "learning_rate": 3.696678630834699e-05, "loss": 0.4468, "step": 50085 }, { "epoch": 1.8052402061484125, "grad_norm": 0.2281470149755478, "learning_rate": 3.696422412269974e-05, "loss": 0.3804, "step": 50090 }, { "epoch": 1.805420405809637, "grad_norm": 0.17991898953914642, "learning_rate": 3.696166177404224e-05, "loss": 0.3965, "step": 50095 }, { "epoch": 1.8056006054708618, "grad_norm": 0.15711066126823425, "learning_rate": 3.6959099262409425e-05, "loss": 0.3968, "step": 50100 }, { "epoch": 1.8057808051320863, "grad_norm": 0.18689851462841034, "learning_rate": 3.6956536587836186e-05, "loss": 0.3862, "step": 50105 }, { "epoch": 1.805961004793311, "grad_norm": 0.13950034976005554, "learning_rate": 3.695397375035747e-05, "loss": 0.4016, "step": 50110 }, { "epoch": 1.8061412044545357, "grad_norm": 0.19982732832431793, "learning_rate": 3.695141075000816e-05, "loss": 0.4065, "step": 50115 }, { "epoch": 1.8063214041157603, "grad_norm": 0.1736806035041809, "learning_rate": 3.6948847586823196e-05, "loss": 0.4303, "step": 50120 }, { "epoch": 1.8065016037769848, "grad_norm": 0.18712177872657776, "learning_rate": 3.69462842608375e-05, "loss": 0.4068, "step": 50125 }, { "epoch": 1.8066818034382095, "grad_norm": 0.2055433690547943, "learning_rate": 3.694372077208599e-05, "loss": 0.4056, "step": 50130 }, { "epoch": 1.8068620030994342, "grad_norm": 0.1870342344045639, 
"learning_rate": 3.69411571206036e-05, "loss": 0.4669, "step": 50135 }, { "epoch": 1.807042202760659, "grad_norm": 0.224535271525383, "learning_rate": 3.6938593306425255e-05, "loss": 0.4317, "step": 50140 }, { "epoch": 1.8072224024218835, "grad_norm": 0.2587486505508423, "learning_rate": 3.693602932958589e-05, "loss": 0.4924, "step": 50145 }, { "epoch": 1.807402602083108, "grad_norm": 0.22826701402664185, "learning_rate": 3.6933465190120434e-05, "loss": 0.4257, "step": 50150 }, { "epoch": 1.8075828017443327, "grad_norm": 0.1683250367641449, "learning_rate": 3.693090088806383e-05, "loss": 0.4344, "step": 50155 }, { "epoch": 1.8077630014055575, "grad_norm": 0.1752661168575287, "learning_rate": 3.6928336423451e-05, "loss": 0.4172, "step": 50160 }, { "epoch": 1.807943201066782, "grad_norm": 0.16361607611179352, "learning_rate": 3.69257717963169e-05, "loss": 0.4046, "step": 50165 }, { "epoch": 1.8081234007280065, "grad_norm": 0.18544571101665497, "learning_rate": 3.692320700669648e-05, "loss": 0.4402, "step": 50170 }, { "epoch": 1.8083036003892312, "grad_norm": 0.16442377865314484, "learning_rate": 3.6920642054624655e-05, "loss": 0.3902, "step": 50175 }, { "epoch": 1.808483800050456, "grad_norm": 0.19460293650627136, "learning_rate": 3.6918076940136406e-05, "loss": 0.4246, "step": 50180 }, { "epoch": 1.8086639997116807, "grad_norm": 0.1653093844652176, "learning_rate": 3.691551166326665e-05, "loss": 0.4109, "step": 50185 }, { "epoch": 1.8088441993729052, "grad_norm": 0.17277853190898895, "learning_rate": 3.691294622405037e-05, "loss": 0.3898, "step": 50190 }, { "epoch": 1.8090243990341297, "grad_norm": 0.19189266860485077, "learning_rate": 3.69103806225225e-05, "loss": 0.4178, "step": 50195 }, { "epoch": 1.8092045986953544, "grad_norm": 0.18027593195438385, "learning_rate": 3.690781485871799e-05, "loss": 0.4507, "step": 50200 }, { "epoch": 1.8093847983565792, "grad_norm": 0.17248685657978058, "learning_rate": 3.690524893267181e-05, "loss": 0.3912, "step": 50205 }, { 
"epoch": 1.8095649980178037, "grad_norm": 0.21308831870555878, "learning_rate": 3.690268284441893e-05, "loss": 0.4291, "step": 50210 }, { "epoch": 1.8097451976790284, "grad_norm": 0.19350473582744598, "learning_rate": 3.6900116593994295e-05, "loss": 0.4422, "step": 50215 }, { "epoch": 1.809925397340253, "grad_norm": 0.17806485295295715, "learning_rate": 3.6897550181432865e-05, "loss": 0.3978, "step": 50220 }, { "epoch": 1.8101055970014777, "grad_norm": 0.17208920419216156, "learning_rate": 3.689498360676963e-05, "loss": 0.3817, "step": 50225 }, { "epoch": 1.8102857966627024, "grad_norm": 0.21627385914325714, "learning_rate": 3.689241687003955e-05, "loss": 0.3997, "step": 50230 }, { "epoch": 1.810465996323927, "grad_norm": 0.16115298867225647, "learning_rate": 3.688984997127758e-05, "loss": 0.4023, "step": 50235 }, { "epoch": 1.8106461959851514, "grad_norm": 0.19665555655956268, "learning_rate": 3.688728291051871e-05, "loss": 0.4401, "step": 50240 }, { "epoch": 1.8108263956463762, "grad_norm": 0.1811586618423462, "learning_rate": 3.688471568779791e-05, "loss": 0.4217, "step": 50245 }, { "epoch": 1.8110065953076009, "grad_norm": 0.19190897047519684, "learning_rate": 3.6882148303150166e-05, "loss": 0.4226, "step": 50250 }, { "epoch": 1.8111867949688256, "grad_norm": 0.16638103127479553, "learning_rate": 3.687958075661045e-05, "loss": 0.3893, "step": 50255 }, { "epoch": 1.8113669946300501, "grad_norm": 0.18631191551685333, "learning_rate": 3.687701304821374e-05, "loss": 0.3919, "step": 50260 }, { "epoch": 1.8115471942912746, "grad_norm": 0.16074827313423157, "learning_rate": 3.687444517799503e-05, "loss": 0.3798, "step": 50265 }, { "epoch": 1.8117273939524994, "grad_norm": 0.18698133528232574, "learning_rate": 3.6871877145989305e-05, "loss": 0.3859, "step": 50270 }, { "epoch": 1.811907593613724, "grad_norm": 0.1874270737171173, "learning_rate": 3.686930895223156e-05, "loss": 0.3785, "step": 50275 }, { "epoch": 1.8120877932749486, "grad_norm": 0.17781777679920197, 
"learning_rate": 3.686674059675677e-05, "loss": 0.3977, "step": 50280 }, { "epoch": 1.8122679929361731, "grad_norm": 0.17600201070308685, "learning_rate": 3.686417207959994e-05, "loss": 0.4425, "step": 50285 }, { "epoch": 1.8124481925973979, "grad_norm": 0.17503990232944489, "learning_rate": 3.686160340079605e-05, "loss": 0.3825, "step": 50290 }, { "epoch": 1.8126283922586226, "grad_norm": 0.18147842586040497, "learning_rate": 3.6859034560380125e-05, "loss": 0.4118, "step": 50295 }, { "epoch": 1.8128085919198473, "grad_norm": 0.19389557838439941, "learning_rate": 3.6856465558387144e-05, "loss": 0.3737, "step": 50300 }, { "epoch": 1.8129887915810718, "grad_norm": 0.17562417685985565, "learning_rate": 3.685389639485211e-05, "loss": 0.3936, "step": 50305 }, { "epoch": 1.8131689912422964, "grad_norm": 0.16235031187534332, "learning_rate": 3.685132706981004e-05, "loss": 0.4133, "step": 50310 }, { "epoch": 1.813349190903521, "grad_norm": 0.16100172698497772, "learning_rate": 3.684875758329593e-05, "loss": 0.401, "step": 50315 }, { "epoch": 1.8135293905647458, "grad_norm": 0.1928766369819641, "learning_rate": 3.68461879353448e-05, "loss": 0.4323, "step": 50320 }, { "epoch": 1.8137095902259703, "grad_norm": 0.1594439446926117, "learning_rate": 3.684361812599164e-05, "loss": 0.4094, "step": 50325 }, { "epoch": 1.8138897898871948, "grad_norm": 0.24857677519321442, "learning_rate": 3.684104815527149e-05, "loss": 0.3897, "step": 50330 }, { "epoch": 1.8140699895484196, "grad_norm": 0.17496556043624878, "learning_rate": 3.6838478023219344e-05, "loss": 0.3763, "step": 50335 }, { "epoch": 1.8142501892096443, "grad_norm": 0.2180209904909134, "learning_rate": 3.683590772987022e-05, "loss": 0.3818, "step": 50340 }, { "epoch": 1.814430388870869, "grad_norm": 0.18124771118164062, "learning_rate": 3.683333727525916e-05, "loss": 0.4156, "step": 50345 }, { "epoch": 1.8146105885320936, "grad_norm": 0.2114446610212326, "learning_rate": 3.683076665942115e-05, "loss": 0.4311, "step": 50350 }, 
{ "epoch": 1.814790788193318, "grad_norm": 0.1923656463623047, "learning_rate": 3.682819588239126e-05, "loss": 0.4145, "step": 50355 }, { "epoch": 1.8149709878545428, "grad_norm": 0.16528750956058502, "learning_rate": 3.682562494420447e-05, "loss": 0.3791, "step": 50360 }, { "epoch": 1.8151511875157675, "grad_norm": 0.18421389162540436, "learning_rate": 3.682305384489585e-05, "loss": 0.473, "step": 50365 }, { "epoch": 1.8153313871769923, "grad_norm": 0.18511919677257538, "learning_rate": 3.682048258450039e-05, "loss": 0.4415, "step": 50370 }, { "epoch": 1.8155115868382168, "grad_norm": 0.1795085072517395, "learning_rate": 3.681791116305315e-05, "loss": 0.4432, "step": 50375 }, { "epoch": 1.8156917864994413, "grad_norm": 0.16460351645946503, "learning_rate": 3.6815339580589165e-05, "loss": 0.3958, "step": 50380 }, { "epoch": 1.815871986160666, "grad_norm": 0.18292105197906494, "learning_rate": 3.6812767837143455e-05, "loss": 0.4311, "step": 50385 }, { "epoch": 1.8160521858218908, "grad_norm": 0.19992749392986298, "learning_rate": 3.681019593275108e-05, "loss": 0.4179, "step": 50390 }, { "epoch": 1.8162323854831153, "grad_norm": 0.1999056190252304, "learning_rate": 3.680762386744707e-05, "loss": 0.4146, "step": 50395 }, { "epoch": 1.8164125851443398, "grad_norm": 0.16766919195652008, "learning_rate": 3.6805051641266476e-05, "loss": 0.3939, "step": 50400 }, { "epoch": 1.8165927848055645, "grad_norm": 0.1673685610294342, "learning_rate": 3.6802479254244327e-05, "loss": 0.416, "step": 50405 }, { "epoch": 1.8167729844667893, "grad_norm": 0.2412656992673874, "learning_rate": 3.679990670641569e-05, "loss": 0.4164, "step": 50410 }, { "epoch": 1.816953184128014, "grad_norm": 0.18288861215114594, "learning_rate": 3.679733399781561e-05, "loss": 0.4088, "step": 50415 }, { "epoch": 1.8171333837892385, "grad_norm": 0.15355655550956726, "learning_rate": 3.6794761128479125e-05, "loss": 0.4283, "step": 50420 }, { "epoch": 1.817313583450463, "grad_norm": 0.1338133066892624, 
"learning_rate": 3.679218809844132e-05, "loss": 0.3979, "step": 50425 }, { "epoch": 1.8174937831116877, "grad_norm": 0.2555032968521118, "learning_rate": 3.6789614907737226e-05, "loss": 0.4351, "step": 50430 }, { "epoch": 1.8176739827729125, "grad_norm": 0.1840798258781433, "learning_rate": 3.6787041556401914e-05, "loss": 0.3992, "step": 50435 }, { "epoch": 1.817854182434137, "grad_norm": 0.18239086866378784, "learning_rate": 3.678446804447044e-05, "loss": 0.3901, "step": 50440 }, { "epoch": 1.8180343820953615, "grad_norm": 0.2021910846233368, "learning_rate": 3.678189437197788e-05, "loss": 0.4164, "step": 50445 }, { "epoch": 1.8182145817565862, "grad_norm": 0.22721827030181885, "learning_rate": 3.6779320538959275e-05, "loss": 0.4761, "step": 50450 }, { "epoch": 1.818394781417811, "grad_norm": 0.148165762424469, "learning_rate": 3.6776746545449715e-05, "loss": 0.3898, "step": 50455 }, { "epoch": 1.8185749810790357, "grad_norm": 0.1854783147573471, "learning_rate": 3.677417239148428e-05, "loss": 0.3955, "step": 50460 }, { "epoch": 1.8187551807402602, "grad_norm": 0.18385660648345947, "learning_rate": 3.6771598077098e-05, "loss": 0.4202, "step": 50465 }, { "epoch": 1.8189353804014847, "grad_norm": 0.19717253744602203, "learning_rate": 3.6769023602325985e-05, "loss": 0.3799, "step": 50470 }, { "epoch": 1.8191155800627095, "grad_norm": 0.20641092956066132, "learning_rate": 3.67664489672033e-05, "loss": 0.4063, "step": 50475 }, { "epoch": 1.8192957797239342, "grad_norm": 0.18371793627738953, "learning_rate": 3.676387417176503e-05, "loss": 0.4177, "step": 50480 }, { "epoch": 1.8194759793851587, "grad_norm": 0.16387097537517548, "learning_rate": 3.676129921604625e-05, "loss": 0.4037, "step": 50485 }, { "epoch": 1.8196561790463834, "grad_norm": 0.2013285756111145, "learning_rate": 3.675872410008204e-05, "loss": 0.4411, "step": 50490 }, { "epoch": 1.819836378707608, "grad_norm": 0.17570237815380096, "learning_rate": 3.675614882390751e-05, "loss": 0.4142, "step": 50495 }, { 
"epoch": 1.8200165783688327, "grad_norm": 0.17556174099445343, "learning_rate": 3.675357338755771e-05, "loss": 0.4109, "step": 50500 }, { "epoch": 1.8200165783688327, "eval_loss": 0.4410496950149536, "eval_runtime": 3.5389, "eval_samples_per_second": 28.257, "eval_steps_per_second": 7.064, "step": 50500 }, { "epoch": 1.8201967780300574, "grad_norm": 0.18107643723487854, "learning_rate": 3.675099779106775e-05, "loss": 0.3993, "step": 50505 }, { "epoch": 1.820376977691282, "grad_norm": 0.20702654123306274, "learning_rate": 3.6748422034472725e-05, "loss": 0.4126, "step": 50510 }, { "epoch": 1.8205571773525064, "grad_norm": 0.15964628756046295, "learning_rate": 3.674584611780772e-05, "loss": 0.406, "step": 50515 }, { "epoch": 1.8207373770137312, "grad_norm": 0.16199252009391785, "learning_rate": 3.6743270041107846e-05, "loss": 0.4537, "step": 50520 }, { "epoch": 1.820917576674956, "grad_norm": 0.23481187224388123, "learning_rate": 3.6740693804408175e-05, "loss": 0.4111, "step": 50525 }, { "epoch": 1.8210977763361806, "grad_norm": 0.1678016185760498, "learning_rate": 3.673811740774384e-05, "loss": 0.4329, "step": 50530 }, { "epoch": 1.8212779759974052, "grad_norm": 0.16724498569965363, "learning_rate": 3.673554085114991e-05, "loss": 0.4037, "step": 50535 }, { "epoch": 1.8214581756586297, "grad_norm": 0.17332807183265686, "learning_rate": 3.673296413466153e-05, "loss": 0.3876, "step": 50540 }, { "epoch": 1.8216383753198544, "grad_norm": 0.20103994011878967, "learning_rate": 3.673038725831377e-05, "loss": 0.4242, "step": 50545 }, { "epoch": 1.8218185749810791, "grad_norm": 0.1513690948486328, "learning_rate": 3.672781022214176e-05, "loss": 0.3812, "step": 50550 }, { "epoch": 1.8219987746423036, "grad_norm": 0.14303380250930786, "learning_rate": 3.67252330261806e-05, "loss": 0.4238, "step": 50555 }, { "epoch": 1.8221789743035282, "grad_norm": 0.17810964584350586, "learning_rate": 3.6722655670465416e-05, "loss": 0.4074, "step": 50560 }, { "epoch": 1.8223591739647529, 
"grad_norm": 0.15270282328128815, "learning_rate": 3.672007815503132e-05, "loss": 0.4043, "step": 50565 }, { "epoch": 1.8225393736259776, "grad_norm": 0.2379712015390396, "learning_rate": 3.671750047991343e-05, "loss": 0.4042, "step": 50570 }, { "epoch": 1.8227195732872024, "grad_norm": 0.19908632338047028, "learning_rate": 3.6714922645146856e-05, "loss": 0.4034, "step": 50575 }, { "epoch": 1.8228997729484269, "grad_norm": 0.16394317150115967, "learning_rate": 3.671234465076673e-05, "loss": 0.4139, "step": 50580 }, { "epoch": 1.8230799726096514, "grad_norm": 0.18769471347332, "learning_rate": 3.670976649680819e-05, "loss": 0.4194, "step": 50585 }, { "epoch": 1.8232601722708761, "grad_norm": 0.18766231834888458, "learning_rate": 3.6707188183306326e-05, "loss": 0.4128, "step": 50590 }, { "epoch": 1.8234403719321008, "grad_norm": 0.15433186292648315, "learning_rate": 3.6704609710296295e-05, "loss": 0.4008, "step": 50595 }, { "epoch": 1.8236205715933254, "grad_norm": 0.20321685075759888, "learning_rate": 3.670203107781324e-05, "loss": 0.3944, "step": 50600 }, { "epoch": 1.82380077125455, "grad_norm": 0.21775129437446594, "learning_rate": 3.669945228589225e-05, "loss": 0.4413, "step": 50605 }, { "epoch": 1.8239809709157746, "grad_norm": 0.2147761881351471, "learning_rate": 3.669687333456852e-05, "loss": 0.418, "step": 50610 }, { "epoch": 1.8241611705769993, "grad_norm": 0.20212721824645996, "learning_rate": 3.669429422387713e-05, "loss": 0.3872, "step": 50615 }, { "epoch": 1.824341370238224, "grad_norm": 0.1724279671907425, "learning_rate": 3.669171495385325e-05, "loss": 0.3979, "step": 50620 }, { "epoch": 1.8245215698994486, "grad_norm": 0.1798325926065445, "learning_rate": 3.668913552453203e-05, "loss": 0.404, "step": 50625 }, { "epoch": 1.824701769560673, "grad_norm": 0.1844148486852646, "learning_rate": 3.668655593594858e-05, "loss": 0.4004, "step": 50630 }, { "epoch": 1.8248819692218978, "grad_norm": 0.22137939929962158, "learning_rate": 3.6683976188138084e-05, 
"loss": 0.3921, "step": 50635 }, { "epoch": 1.8250621688831226, "grad_norm": 0.1671324074268341, "learning_rate": 3.6681396281135676e-05, "loss": 0.3937, "step": 50640 }, { "epoch": 1.8252423685443473, "grad_norm": 0.21605345606803894, "learning_rate": 3.6678816214976504e-05, "loss": 0.4299, "step": 50645 }, { "epoch": 1.8254225682055718, "grad_norm": 0.16101548075675964, "learning_rate": 3.667623598969572e-05, "loss": 0.3997, "step": 50650 }, { "epoch": 1.8256027678667963, "grad_norm": 0.1385546624660492, "learning_rate": 3.6673655605328475e-05, "loss": 0.3785, "step": 50655 }, { "epoch": 1.825782967528021, "grad_norm": 0.1622258722782135, "learning_rate": 3.667107506190993e-05, "loss": 0.4387, "step": 50660 }, { "epoch": 1.8259631671892458, "grad_norm": 0.14885656535625458, "learning_rate": 3.666849435947526e-05, "loss": 0.4072, "step": 50665 }, { "epoch": 1.8261433668504703, "grad_norm": 0.20128627121448517, "learning_rate": 3.6665913498059615e-05, "loss": 0.4241, "step": 50670 }, { "epoch": 1.8263235665116948, "grad_norm": 0.17123094201087952, "learning_rate": 3.666333247769814e-05, "loss": 0.4182, "step": 50675 }, { "epoch": 1.8265037661729195, "grad_norm": 0.19727930426597595, "learning_rate": 3.666075129842603e-05, "loss": 0.3987, "step": 50680 }, { "epoch": 1.8266839658341443, "grad_norm": 0.2271791249513626, "learning_rate": 3.665816996027844e-05, "loss": 0.4111, "step": 50685 }, { "epoch": 1.826864165495369, "grad_norm": 0.18186520040035248, "learning_rate": 3.665558846329055e-05, "loss": 0.4242, "step": 50690 }, { "epoch": 1.8270443651565935, "grad_norm": 0.18122968077659607, "learning_rate": 3.66530068074975e-05, "loss": 0.4055, "step": 50695 }, { "epoch": 1.827224564817818, "grad_norm": 0.20953094959259033, "learning_rate": 3.6650424992934504e-05, "loss": 0.4515, "step": 50700 }, { "epoch": 1.8274047644790428, "grad_norm": 0.1821168065071106, "learning_rate": 3.664784301963673e-05, "loss": 0.4363, "step": 50705 }, { "epoch": 1.8275849641402675, 
"grad_norm": 0.16025103628635406, "learning_rate": 3.664526088763934e-05, "loss": 0.3871, "step": 50710 }, { "epoch": 1.827765163801492, "grad_norm": 0.1833401620388031, "learning_rate": 3.6642678596977526e-05, "loss": 0.4348, "step": 50715 }, { "epoch": 1.8279453634627167, "grad_norm": 0.14645916223526, "learning_rate": 3.6640096147686467e-05, "loss": 0.4141, "step": 50720 }, { "epoch": 1.8281255631239413, "grad_norm": 0.2093980610370636, "learning_rate": 3.663751353980136e-05, "loss": 0.4121, "step": 50725 }, { "epoch": 1.828305762785166, "grad_norm": 0.19935870170593262, "learning_rate": 3.663493077335738e-05, "loss": 0.4305, "step": 50730 }, { "epoch": 1.8284859624463907, "grad_norm": 0.17722731828689575, "learning_rate": 3.663234784838972e-05, "loss": 0.4046, "step": 50735 }, { "epoch": 1.8286661621076152, "grad_norm": 0.1792575865983963, "learning_rate": 3.662976476493357e-05, "loss": 0.433, "step": 50740 }, { "epoch": 1.8288463617688397, "grad_norm": 0.15851470828056335, "learning_rate": 3.662718152302413e-05, "loss": 0.417, "step": 50745 }, { "epoch": 1.8290265614300645, "grad_norm": 0.15749147534370422, "learning_rate": 3.6624598122696595e-05, "loss": 0.4008, "step": 50750 }, { "epoch": 1.8292067610912892, "grad_norm": 0.16477638483047485, "learning_rate": 3.6622014563986155e-05, "loss": 0.403, "step": 50755 }, { "epoch": 1.829386960752514, "grad_norm": 0.16740094125270844, "learning_rate": 3.661943084692802e-05, "loss": 0.4044, "step": 50760 }, { "epoch": 1.8295671604137385, "grad_norm": 0.2223920226097107, "learning_rate": 3.661684697155739e-05, "loss": 0.4402, "step": 50765 }, { "epoch": 1.829747360074963, "grad_norm": 0.21728269755840302, "learning_rate": 3.6614262937909474e-05, "loss": 0.4326, "step": 50770 }, { "epoch": 1.8299275597361877, "grad_norm": 0.1693524718284607, "learning_rate": 3.6611678746019464e-05, "loss": 0.3812, "step": 50775 }, { "epoch": 1.8301077593974124, "grad_norm": 0.2092827707529068, "learning_rate": 3.6609094395922585e-05, 
"loss": 0.4101, "step": 50780 }, { "epoch": 1.830287959058637, "grad_norm": 0.19062267243862152, "learning_rate": 3.6606509887654046e-05, "loss": 0.4504, "step": 50785 }, { "epoch": 1.8304681587198615, "grad_norm": 0.4250420928001404, "learning_rate": 3.660392522124905e-05, "loss": 0.4012, "step": 50790 }, { "epoch": 1.8306483583810862, "grad_norm": 0.18966785073280334, "learning_rate": 3.660134039674282e-05, "loss": 0.4067, "step": 50795 }, { "epoch": 1.830828558042311, "grad_norm": 0.17551647126674652, "learning_rate": 3.659875541417057e-05, "loss": 0.408, "step": 50800 }, { "epoch": 1.8310087577035357, "grad_norm": 0.16543635725975037, "learning_rate": 3.659617027356753e-05, "loss": 0.4234, "step": 50805 }, { "epoch": 1.8311889573647602, "grad_norm": 0.20097027719020844, "learning_rate": 3.6593584974968916e-05, "loss": 0.4096, "step": 50810 }, { "epoch": 1.8313691570259847, "grad_norm": 0.1641996055841446, "learning_rate": 3.659099951840995e-05, "loss": 0.4007, "step": 50815 }, { "epoch": 1.8315493566872094, "grad_norm": 0.20720233023166656, "learning_rate": 3.658841390392585e-05, "loss": 0.4468, "step": 50820 }, { "epoch": 1.8317295563484342, "grad_norm": 0.1876114308834076, "learning_rate": 3.658582813155187e-05, "loss": 0.404, "step": 50825 }, { "epoch": 1.8319097560096587, "grad_norm": 0.18697507679462433, "learning_rate": 3.658324220132322e-05, "loss": 0.3958, "step": 50830 }, { "epoch": 1.8320899556708832, "grad_norm": 0.1502731889486313, "learning_rate": 3.658065611327513e-05, "loss": 0.4068, "step": 50835 }, { "epoch": 1.832270155332108, "grad_norm": 0.18511810898780823, "learning_rate": 3.6578069867442846e-05, "loss": 0.426, "step": 50840 }, { "epoch": 1.8324503549933326, "grad_norm": 0.1467100828886032, "learning_rate": 3.6575483463861604e-05, "loss": 0.4337, "step": 50845 }, { "epoch": 1.8326305546545574, "grad_norm": 0.147383913397789, "learning_rate": 3.657289690256664e-05, "loss": 0.3969, "step": 50850 }, { "epoch": 1.8328107543157819, "grad_norm": 
0.16182565689086914, "learning_rate": 3.65703101835932e-05, "loss": 0.4109, "step": 50855 }, { "epoch": 1.8329909539770064, "grad_norm": 0.21511298418045044, "learning_rate": 3.656772330697651e-05, "loss": 0.3946, "step": 50860 }, { "epoch": 1.8331711536382311, "grad_norm": 0.2106020152568817, "learning_rate": 3.6565136272751844e-05, "loss": 0.3559, "step": 50865 }, { "epoch": 1.8333513532994559, "grad_norm": 0.22178612649440765, "learning_rate": 3.656254908095443e-05, "loss": 0.4217, "step": 50870 }, { "epoch": 1.8335315529606806, "grad_norm": 0.1671254187822342, "learning_rate": 3.655996173161953e-05, "loss": 0.4105, "step": 50875 }, { "epoch": 1.8337117526219051, "grad_norm": 0.18210411071777344, "learning_rate": 3.6557374224782384e-05, "loss": 0.4154, "step": 50880 }, { "epoch": 1.8338919522831296, "grad_norm": 0.19609753787517548, "learning_rate": 3.6554786560478245e-05, "loss": 0.4294, "step": 50885 }, { "epoch": 1.8340721519443544, "grad_norm": 0.16133646667003632, "learning_rate": 3.655219873874238e-05, "loss": 0.425, "step": 50890 }, { "epoch": 1.834252351605579, "grad_norm": 0.19089598953723907, "learning_rate": 3.654961075961005e-05, "loss": 0.4355, "step": 50895 }, { "epoch": 1.8344325512668036, "grad_norm": 0.176603302359581, "learning_rate": 3.654702262311651e-05, "loss": 0.3786, "step": 50900 }, { "epoch": 1.8346127509280281, "grad_norm": 0.18047241866588593, "learning_rate": 3.654443432929701e-05, "loss": 0.3985, "step": 50905 }, { "epoch": 1.8347929505892528, "grad_norm": 0.17832064628601074, "learning_rate": 3.654184587818684e-05, "loss": 0.3826, "step": 50910 }, { "epoch": 1.8349731502504776, "grad_norm": 0.1874881535768509, "learning_rate": 3.653925726982125e-05, "loss": 0.4242, "step": 50915 }, { "epoch": 1.8351533499117023, "grad_norm": 0.16851532459259033, "learning_rate": 3.653666850423551e-05, "loss": 0.4039, "step": 50920 }, { "epoch": 1.8353335495729268, "grad_norm": 0.19694097340106964, "learning_rate": 3.65340795814649e-05, "loss": 
0.429, "step": 50925 }, { "epoch": 1.8355137492341513, "grad_norm": 0.1746322214603424, "learning_rate": 3.653149050154469e-05, "loss": 0.406, "step": 50930 }, { "epoch": 1.835693948895376, "grad_norm": 0.21489328145980835, "learning_rate": 3.652890126451015e-05, "loss": 0.4699, "step": 50935 }, { "epoch": 1.8358741485566008, "grad_norm": 0.2216479480266571, "learning_rate": 3.652631187039657e-05, "loss": 0.4144, "step": 50940 }, { "epoch": 1.8360543482178253, "grad_norm": 0.21755515038967133, "learning_rate": 3.6523722319239214e-05, "loss": 0.43, "step": 50945 }, { "epoch": 1.8362345478790498, "grad_norm": 0.22237977385520935, "learning_rate": 3.652113261107338e-05, "loss": 0.4274, "step": 50950 }, { "epoch": 1.8364147475402746, "grad_norm": 0.1792164444923401, "learning_rate": 3.651854274593433e-05, "loss": 0.4141, "step": 50955 }, { "epoch": 1.8365949472014993, "grad_norm": 0.16846583783626556, "learning_rate": 3.651595272385738e-05, "loss": 0.3788, "step": 50960 }, { "epoch": 1.836775146862724, "grad_norm": 0.23637813329696655, "learning_rate": 3.6513362544877794e-05, "loss": 0.4256, "step": 50965 }, { "epoch": 1.8369553465239485, "grad_norm": 0.21190734207630157, "learning_rate": 3.6510772209030885e-05, "loss": 0.4456, "step": 50970 }, { "epoch": 1.837135546185173, "grad_norm": 0.1966804713010788, "learning_rate": 3.650818171635192e-05, "loss": 0.3872, "step": 50975 }, { "epoch": 1.8373157458463978, "grad_norm": 0.24481886625289917, "learning_rate": 3.650559106687621e-05, "loss": 0.4143, "step": 50980 }, { "epoch": 1.8374959455076225, "grad_norm": 0.2108885794878006, "learning_rate": 3.650300026063905e-05, "loss": 0.3876, "step": 50985 }, { "epoch": 1.837676145168847, "grad_norm": 0.17803071439266205, "learning_rate": 3.650040929767575e-05, "loss": 0.3957, "step": 50990 }, { "epoch": 1.8378563448300718, "grad_norm": 0.1737845540046692, "learning_rate": 3.649781817802159e-05, "loss": 0.3848, "step": 50995 }, { "epoch": 1.8380365444912963, "grad_norm": 
0.16461852192878723, "learning_rate": 3.649522690171188e-05, "loss": 0.41, "step": 51000 }, { "epoch": 1.8380365444912963, "eval_loss": 0.4396840035915375, "eval_runtime": 3.5402, "eval_samples_per_second": 28.247, "eval_steps_per_second": 7.062, "step": 51000 }, { "epoch": 1.838216744152521, "grad_norm": 0.19908668100833893, "learning_rate": 3.6492635468781934e-05, "loss": 0.411, "step": 51005 }, { "epoch": 1.8383969438137457, "grad_norm": 0.23171748220920563, "learning_rate": 3.649004387926705e-05, "loss": 0.3974, "step": 51010 }, { "epoch": 1.8385771434749703, "grad_norm": 0.19724468886852264, "learning_rate": 3.6487452133202546e-05, "loss": 0.4076, "step": 51015 }, { "epoch": 1.8387573431361948, "grad_norm": 0.1985427588224411, "learning_rate": 3.648486023062373e-05, "loss": 0.45, "step": 51020 }, { "epoch": 1.8389375427974195, "grad_norm": 0.16115103662014008, "learning_rate": 3.6482268171565914e-05, "loss": 0.3991, "step": 51025 }, { "epoch": 1.8391177424586442, "grad_norm": 0.20082534849643707, "learning_rate": 3.647967595606443e-05, "loss": 0.4553, "step": 51030 }, { "epoch": 1.839297942119869, "grad_norm": 0.1648685485124588, "learning_rate": 3.647708358415457e-05, "loss": 0.4469, "step": 51035 }, { "epoch": 1.8394781417810935, "grad_norm": 0.16533087193965912, "learning_rate": 3.6474491055871675e-05, "loss": 0.3725, "step": 51040 }, { "epoch": 1.839658341442318, "grad_norm": 0.18278469145298004, "learning_rate": 3.647189837125106e-05, "loss": 0.427, "step": 51045 }, { "epoch": 1.8398385411035427, "grad_norm": 0.18159344792366028, "learning_rate": 3.646930553032805e-05, "loss": 0.4254, "step": 51050 }, { "epoch": 1.8400187407647675, "grad_norm": 0.16814720630645752, "learning_rate": 3.646671253313797e-05, "loss": 0.4013, "step": 51055 }, { "epoch": 1.840198940425992, "grad_norm": 0.15060807764530182, "learning_rate": 3.646411937971615e-05, "loss": 0.3897, "step": 51060 }, { "epoch": 1.8403791400872165, "grad_norm": 0.19889427721500397, "learning_rate": 
3.646152607009793e-05, "loss": 0.436, "step": 51065 }, { "epoch": 1.8405593397484412, "grad_norm": 0.22724641859531403, "learning_rate": 3.645893260431863e-05, "loss": 0.4595, "step": 51070 }, { "epoch": 1.840739539409666, "grad_norm": 0.19296084344387054, "learning_rate": 3.64563389824136e-05, "loss": 0.3611, "step": 51075 }, { "epoch": 1.8409197390708907, "grad_norm": 0.16096292436122894, "learning_rate": 3.6453745204418164e-05, "loss": 0.4039, "step": 51080 }, { "epoch": 1.8410999387321152, "grad_norm": 0.18182432651519775, "learning_rate": 3.6451151270367665e-05, "loss": 0.3959, "step": 51085 }, { "epoch": 1.8412801383933397, "grad_norm": 0.14909334480762482, "learning_rate": 3.644855718029745e-05, "loss": 0.4084, "step": 51090 }, { "epoch": 1.8414603380545644, "grad_norm": 0.2044646441936493, "learning_rate": 3.644596293424286e-05, "loss": 0.4397, "step": 51095 }, { "epoch": 1.8416405377157892, "grad_norm": 0.20742791891098022, "learning_rate": 3.644336853223924e-05, "loss": 0.4362, "step": 51100 }, { "epoch": 1.8418207373770137, "grad_norm": 0.17310570180416107, "learning_rate": 3.644077397432194e-05, "loss": 0.4036, "step": 51105 }, { "epoch": 1.8420009370382384, "grad_norm": 0.17185135185718536, "learning_rate": 3.64381792605263e-05, "loss": 0.4234, "step": 51110 }, { "epoch": 1.842181136699463, "grad_norm": 0.18162250518798828, "learning_rate": 3.64355843908877e-05, "loss": 0.396, "step": 51115 }, { "epoch": 1.8423613363606877, "grad_norm": 0.18723055720329285, "learning_rate": 3.6432989365441474e-05, "loss": 0.3928, "step": 51120 }, { "epoch": 1.8425415360219124, "grad_norm": 0.21605227887630463, "learning_rate": 3.643039418422297e-05, "loss": 0.3746, "step": 51125 }, { "epoch": 1.842721735683137, "grad_norm": 0.15051497519016266, "learning_rate": 3.642779884726757e-05, "loss": 0.4126, "step": 51130 }, { "epoch": 1.8429019353443614, "grad_norm": 0.19938229024410248, "learning_rate": 3.642520335461061e-05, "loss": 0.4506, "step": 51135 }, { "epoch": 
1.8430821350055862, "grad_norm": 0.17573609948158264, "learning_rate": 3.642260770628747e-05, "loss": 0.4178, "step": 51140 }, { "epoch": 1.8432623346668109, "grad_norm": 0.16785641014575958, "learning_rate": 3.6420011902333516e-05, "loss": 0.4336, "step": 51145 }, { "epoch": 1.8434425343280356, "grad_norm": 0.21680709719657898, "learning_rate": 3.641741594278411e-05, "loss": 0.4317, "step": 51150 }, { "epoch": 1.8436227339892601, "grad_norm": 0.18228915333747864, "learning_rate": 3.641481982767463e-05, "loss": 0.4118, "step": 51155 }, { "epoch": 1.8438029336504846, "grad_norm": 0.2226298749446869, "learning_rate": 3.641222355704042e-05, "loss": 0.4553, "step": 51160 }, { "epoch": 1.8439831333117094, "grad_norm": 0.20143426954746246, "learning_rate": 3.640962713091689e-05, "loss": 0.43, "step": 51165 }, { "epoch": 1.844163332972934, "grad_norm": 0.1602238118648529, "learning_rate": 3.64070305493394e-05, "loss": 0.4138, "step": 51170 }, { "epoch": 1.8443435326341586, "grad_norm": 0.19529132544994354, "learning_rate": 3.640443381234331e-05, "loss": 0.4168, "step": 51175 }, { "epoch": 1.8445237322953831, "grad_norm": 0.19434118270874023, "learning_rate": 3.6401836919964035e-05, "loss": 0.4032, "step": 51180 }, { "epoch": 1.8447039319566079, "grad_norm": 0.1953050196170807, "learning_rate": 3.639923987223693e-05, "loss": 0.4151, "step": 51185 }, { "epoch": 1.8448841316178326, "grad_norm": 0.16159437596797943, "learning_rate": 3.639664266919739e-05, "loss": 0.4209, "step": 51190 }, { "epoch": 1.8450643312790573, "grad_norm": 0.22107087075710297, "learning_rate": 3.639404531088081e-05, "loss": 0.4232, "step": 51195 }, { "epoch": 1.8452445309402818, "grad_norm": 0.1914900243282318, "learning_rate": 3.6391447797322556e-05, "loss": 0.4235, "step": 51200 }, { "epoch": 1.8454247306015064, "grad_norm": 0.1916929930448532, "learning_rate": 3.6388850128558036e-05, "loss": 0.4633, "step": 51205 }, { "epoch": 1.845604930262731, "grad_norm": 0.16584375500679016, "learning_rate": 
3.6386252304622636e-05, "loss": 0.4198, "step": 51210 }, { "epoch": 1.8457851299239558, "grad_norm": 0.19661644101142883, "learning_rate": 3.6383654325551756e-05, "loss": 0.3869, "step": 51215 }, { "epoch": 1.8459653295851803, "grad_norm": 0.17381379008293152, "learning_rate": 3.638105619138079e-05, "loss": 0.3688, "step": 51220 }, { "epoch": 1.846145529246405, "grad_norm": 0.18140283226966858, "learning_rate": 3.637845790214513e-05, "loss": 0.464, "step": 51225 }, { "epoch": 1.8463257289076296, "grad_norm": 0.18589642643928528, "learning_rate": 3.637585945788019e-05, "loss": 0.4818, "step": 51230 }, { "epoch": 1.8465059285688543, "grad_norm": 0.1755005270242691, "learning_rate": 3.6373260858621364e-05, "loss": 0.4101, "step": 51235 }, { "epoch": 1.846686128230079, "grad_norm": 0.14478670060634613, "learning_rate": 3.637066210440407e-05, "loss": 0.3988, "step": 51240 }, { "epoch": 1.8468663278913036, "grad_norm": 0.1898811310529709, "learning_rate": 3.6368063195263694e-05, "loss": 0.3998, "step": 51245 }, { "epoch": 1.847046527552528, "grad_norm": 0.2270967662334442, "learning_rate": 3.6365464131235664e-05, "loss": 0.4444, "step": 51250 }, { "epoch": 1.8472267272137528, "grad_norm": 0.17820177972316742, "learning_rate": 3.636286491235538e-05, "loss": 0.4097, "step": 51255 }, { "epoch": 1.8474069268749775, "grad_norm": 0.19704335927963257, "learning_rate": 3.636026553865828e-05, "loss": 0.4587, "step": 51260 }, { "epoch": 1.8475871265362023, "grad_norm": 0.18560896813869476, "learning_rate": 3.635766601017974e-05, "loss": 0.4076, "step": 51265 }, { "epoch": 1.8477673261974268, "grad_norm": 0.19222392141819, "learning_rate": 3.635506632695521e-05, "loss": 0.4196, "step": 51270 }, { "epoch": 1.8479475258586513, "grad_norm": 0.1405927538871765, "learning_rate": 3.6352466489020095e-05, "loss": 0.4081, "step": 51275 }, { "epoch": 1.848127725519876, "grad_norm": 0.16948674619197845, "learning_rate": 3.634986649640982e-05, "loss": 0.3944, "step": 51280 }, { "epoch": 
1.8483079251811008, "grad_norm": 0.16560353338718414, "learning_rate": 3.6347266349159826e-05, "loss": 0.4439, "step": 51285 }, { "epoch": 1.8484881248423253, "grad_norm": 0.15636521577835083, "learning_rate": 3.634466604730551e-05, "loss": 0.4157, "step": 51290 }, { "epoch": 1.8486683245035498, "grad_norm": 0.19085612893104553, "learning_rate": 3.634206559088232e-05, "loss": 0.4038, "step": 51295 }, { "epoch": 1.8488485241647745, "grad_norm": 0.1767541766166687, "learning_rate": 3.633946497992568e-05, "loss": 0.4058, "step": 51300 }, { "epoch": 1.8490287238259993, "grad_norm": 0.1703364998102188, "learning_rate": 3.6336864214471035e-05, "loss": 0.3893, "step": 51305 }, { "epoch": 1.849208923487224, "grad_norm": 0.19773104786872864, "learning_rate": 3.63342632945538e-05, "loss": 0.4547, "step": 51310 }, { "epoch": 1.8493891231484485, "grad_norm": 0.20999063551425934, "learning_rate": 3.6331662220209416e-05, "loss": 0.4344, "step": 51315 }, { "epoch": 1.849569322809673, "grad_norm": 0.16823193430900574, "learning_rate": 3.6329060991473344e-05, "loss": 0.4112, "step": 51320 }, { "epoch": 1.8497495224708977, "grad_norm": 0.21957112848758698, "learning_rate": 3.6326459608380994e-05, "loss": 0.3883, "step": 51325 }, { "epoch": 1.8499297221321225, "grad_norm": 0.17172305285930634, "learning_rate": 3.6323858070967834e-05, "loss": 0.3975, "step": 51330 }, { "epoch": 1.850109921793347, "grad_norm": 0.23369702696800232, "learning_rate": 3.6321256379269296e-05, "loss": 0.4123, "step": 51335 }, { "epoch": 1.8502901214545715, "grad_norm": 0.19112427532672882, "learning_rate": 3.631865453332084e-05, "loss": 0.4109, "step": 51340 }, { "epoch": 1.8504703211157962, "grad_norm": 0.19734065234661102, "learning_rate": 3.63160525331579e-05, "loss": 0.4006, "step": 51345 }, { "epoch": 1.850650520777021, "grad_norm": 0.15451741218566895, "learning_rate": 3.631345037881593e-05, "loss": 0.4063, "step": 51350 }, { "epoch": 1.8508307204382457, "grad_norm": 0.1700822114944458, 
"learning_rate": 3.631084807033041e-05, "loss": 0.4052, "step": 51355 }, { "epoch": 1.8510109200994702, "grad_norm": 0.1707632839679718, "learning_rate": 3.630824560773676e-05, "loss": 0.3775, "step": 51360 }, { "epoch": 1.8511911197606947, "grad_norm": 0.16627883911132812, "learning_rate": 3.630564299107045e-05, "loss": 0.4075, "step": 51365 }, { "epoch": 1.8513713194219195, "grad_norm": 0.24305926263332367, "learning_rate": 3.630304022036694e-05, "loss": 0.4367, "step": 51370 }, { "epoch": 1.8515515190831442, "grad_norm": 0.15088029205799103, "learning_rate": 3.6300437295661706e-05, "loss": 0.398, "step": 51375 }, { "epoch": 1.851731718744369, "grad_norm": 0.16519147157669067, "learning_rate": 3.62978342169902e-05, "loss": 0.4295, "step": 51380 }, { "epoch": 1.8519119184055934, "grad_norm": 0.19771799445152283, "learning_rate": 3.6295230984387884e-05, "loss": 0.3928, "step": 51385 }, { "epoch": 1.852092118066818, "grad_norm": 0.164763942360878, "learning_rate": 3.629262759789024e-05, "loss": 0.4141, "step": 51390 }, { "epoch": 1.8522723177280427, "grad_norm": 0.18047907948493958, "learning_rate": 3.6290024057532726e-05, "loss": 0.4259, "step": 51395 }, { "epoch": 1.8524525173892674, "grad_norm": 0.21674178540706635, "learning_rate": 3.6287420363350824e-05, "loss": 0.3962, "step": 51400 }, { "epoch": 1.852632717050492, "grad_norm": 0.1720811426639557, "learning_rate": 3.628481651538e-05, "loss": 0.4129, "step": 51405 }, { "epoch": 1.8528129167117164, "grad_norm": 0.201126828789711, "learning_rate": 3.628221251365574e-05, "loss": 0.4334, "step": 51410 }, { "epoch": 1.8529931163729412, "grad_norm": 0.17763501405715942, "learning_rate": 3.627960835821351e-05, "loss": 0.4179, "step": 51415 }, { "epoch": 1.853173316034166, "grad_norm": 0.16252952814102173, "learning_rate": 3.6277004049088815e-05, "loss": 0.3806, "step": 51420 }, { "epoch": 1.8533535156953906, "grad_norm": 0.17400288581848145, "learning_rate": 3.627439958631712e-05, "loss": 0.3974, "step": 51425 }, { 
"epoch": 1.8535337153566152, "grad_norm": 0.1547224074602127, "learning_rate": 3.6271794969933895e-05, "loss": 0.373, "step": 51430 }, { "epoch": 1.8537139150178397, "grad_norm": 0.17776109278202057, "learning_rate": 3.626919019997467e-05, "loss": 0.3768, "step": 51435 }, { "epoch": 1.8538941146790644, "grad_norm": 0.188429594039917, "learning_rate": 3.6266585276474896e-05, "loss": 0.4463, "step": 51440 }, { "epoch": 1.8540743143402891, "grad_norm": 0.18767085671424866, "learning_rate": 3.626398019947008e-05, "loss": 0.4271, "step": 51445 }, { "epoch": 1.8542545140015136, "grad_norm": 0.19436992704868317, "learning_rate": 3.626137496899572e-05, "loss": 0.399, "step": 51450 }, { "epoch": 1.8544347136627382, "grad_norm": 0.2516634166240692, "learning_rate": 3.62587695850873e-05, "loss": 0.4383, "step": 51455 }, { "epoch": 1.8546149133239629, "grad_norm": 0.18735727667808533, "learning_rate": 3.625616404778033e-05, "loss": 0.4246, "step": 51460 }, { "epoch": 1.8547951129851876, "grad_norm": 0.19105082750320435, "learning_rate": 3.62535583571103e-05, "loss": 0.4179, "step": 51465 }, { "epoch": 1.8549753126464124, "grad_norm": 0.20437583327293396, "learning_rate": 3.625095251311272e-05, "loss": 0.4275, "step": 51470 }, { "epoch": 1.8551555123076369, "grad_norm": 0.1884140968322754, "learning_rate": 3.6248346515823084e-05, "loss": 0.3885, "step": 51475 }, { "epoch": 1.8553357119688614, "grad_norm": 0.22424067556858063, "learning_rate": 3.6245740365276914e-05, "loss": 0.4377, "step": 51480 }, { "epoch": 1.8555159116300861, "grad_norm": 0.1814982295036316, "learning_rate": 3.62431340615097e-05, "loss": 0.4373, "step": 51485 }, { "epoch": 1.8556961112913108, "grad_norm": 0.18222209811210632, "learning_rate": 3.624052760455696e-05, "loss": 0.3891, "step": 51490 }, { "epoch": 1.8558763109525354, "grad_norm": 0.21884121000766754, "learning_rate": 3.6237920994454216e-05, "loss": 0.436, "step": 51495 }, { "epoch": 1.85605651061376, "grad_norm": 0.16417382657527924, 
"learning_rate": 3.623531423123697e-05, "loss": 0.4172, "step": 51500 }, { "epoch": 1.85605651061376, "eval_loss": 0.43941619992256165, "eval_runtime": 3.5304, "eval_samples_per_second": 28.326, "eval_steps_per_second": 7.081, "step": 51500 }, { "epoch": 1.8562367102749846, "grad_norm": 0.20107519626617432, "learning_rate": 3.623270731494075e-05, "loss": 0.4246, "step": 51505 }, { "epoch": 1.8564169099362093, "grad_norm": 0.18021084368228912, "learning_rate": 3.6230100245601056e-05, "loss": 0.4206, "step": 51510 }, { "epoch": 1.856597109597434, "grad_norm": 0.16509023308753967, "learning_rate": 3.6227493023253425e-05, "loss": 0.4185, "step": 51515 }, { "epoch": 1.8567773092586586, "grad_norm": 0.2372467815876007, "learning_rate": 3.622488564793337e-05, "loss": 0.4057, "step": 51520 }, { "epoch": 1.856957508919883, "grad_norm": 0.2038879245519638, "learning_rate": 3.622227811967643e-05, "loss": 0.3873, "step": 51525 }, { "epoch": 1.8571377085811078, "grad_norm": 0.1562618464231491, "learning_rate": 3.6219670438518125e-05, "loss": 0.3674, "step": 51530 }, { "epoch": 1.8573179082423326, "grad_norm": 0.20719584822654724, "learning_rate": 3.621706260449397e-05, "loss": 0.4226, "step": 51535 }, { "epoch": 1.8574981079035573, "grad_norm": 0.15713489055633545, "learning_rate": 3.621445461763952e-05, "loss": 0.4243, "step": 51540 }, { "epoch": 1.8576783075647818, "grad_norm": 0.19836826622486115, "learning_rate": 3.62118464779903e-05, "loss": 0.4348, "step": 51545 }, { "epoch": 1.8578585072260063, "grad_norm": 0.17970958352088928, "learning_rate": 3.620923818558183e-05, "loss": 0.4002, "step": 51550 }, { "epoch": 1.858038706887231, "grad_norm": 0.2058597356081009, "learning_rate": 3.6206629740449666e-05, "loss": 0.3783, "step": 51555 }, { "epoch": 1.8582189065484558, "grad_norm": 0.17197424173355103, "learning_rate": 3.620402114262934e-05, "loss": 0.423, "step": 51560 }, { "epoch": 1.8583991062096803, "grad_norm": 0.2183474451303482, "learning_rate": 3.6201412392156395e-05, 
"loss": 0.4269, "step": 51565 }, { "epoch": 1.8585793058709048, "grad_norm": 0.1597500592470169, "learning_rate": 3.619880348906638e-05, "loss": 0.3744, "step": 51570 }, { "epoch": 1.8587595055321295, "grad_norm": 0.14877550303936005, "learning_rate": 3.619619443339483e-05, "loss": 0.3873, "step": 51575 }, { "epoch": 1.8589397051933543, "grad_norm": 0.21616920828819275, "learning_rate": 3.6193585225177296e-05, "loss": 0.4411, "step": 51580 }, { "epoch": 1.859119904854579, "grad_norm": 0.20081162452697754, "learning_rate": 3.619097586444934e-05, "loss": 0.4237, "step": 51585 }, { "epoch": 1.8593001045158035, "grad_norm": 0.1473800092935562, "learning_rate": 3.618836635124649e-05, "loss": 0.3905, "step": 51590 }, { "epoch": 1.859480304177028, "grad_norm": 0.18639175593852997, "learning_rate": 3.618575668560433e-05, "loss": 0.4168, "step": 51595 }, { "epoch": 1.8596605038382528, "grad_norm": 0.17644469439983368, "learning_rate": 3.6183146867558394e-05, "loss": 0.4074, "step": 51600 }, { "epoch": 1.8598407034994775, "grad_norm": 0.15397772192955017, "learning_rate": 3.6180536897144245e-05, "loss": 0.4132, "step": 51605 }, { "epoch": 1.860020903160702, "grad_norm": 0.17601633071899414, "learning_rate": 3.6177926774397455e-05, "loss": 0.4003, "step": 51610 }, { "epoch": 1.8602011028219267, "grad_norm": 0.20962753891944885, "learning_rate": 3.617531649935356e-05, "loss": 0.431, "step": 51615 }, { "epoch": 1.8603813024831513, "grad_norm": 0.18711386620998383, "learning_rate": 3.617270607204816e-05, "loss": 0.4172, "step": 51620 }, { "epoch": 1.860561502144376, "grad_norm": 0.19060426950454712, "learning_rate": 3.6170095492516795e-05, "loss": 0.4218, "step": 51625 }, { "epoch": 1.8607417018056007, "grad_norm": 0.24624811112880707, "learning_rate": 3.616748476079504e-05, "loss": 0.4342, "step": 51630 }, { "epoch": 1.8609219014668252, "grad_norm": 0.20159181952476501, "learning_rate": 3.616487387691847e-05, "loss": 0.3987, "step": 51635 }, { "epoch": 1.8611021011280497, 
"grad_norm": 0.18336619436740875, "learning_rate": 3.616226284092264e-05, "loss": 0.4065, "step": 51640 }, { "epoch": 1.8612823007892745, "grad_norm": 0.21609511971473694, "learning_rate": 3.615965165284316e-05, "loss": 0.4113, "step": 51645 }, { "epoch": 1.8614625004504992, "grad_norm": 0.22596147656440735, "learning_rate": 3.615704031271558e-05, "loss": 0.4017, "step": 51650 }, { "epoch": 1.861642700111724, "grad_norm": 0.15656688809394836, "learning_rate": 3.6154428820575484e-05, "loss": 0.4422, "step": 51655 }, { "epoch": 1.8618228997729485, "grad_norm": 0.19237256050109863, "learning_rate": 3.615181717645845e-05, "loss": 0.4242, "step": 51660 }, { "epoch": 1.862003099434173, "grad_norm": 0.18129973113536835, "learning_rate": 3.6149205380400074e-05, "loss": 0.4334, "step": 51665 }, { "epoch": 1.8621832990953977, "grad_norm": 0.19113698601722717, "learning_rate": 3.614659343243594e-05, "loss": 0.4155, "step": 51670 }, { "epoch": 1.8623634987566224, "grad_norm": 0.20505818724632263, "learning_rate": 3.6143981332601615e-05, "loss": 0.4176, "step": 51675 }, { "epoch": 1.862543698417847, "grad_norm": 0.16761788725852966, "learning_rate": 3.6141369080932705e-05, "loss": 0.422, "step": 51680 }, { "epoch": 1.8627238980790715, "grad_norm": 0.18111896514892578, "learning_rate": 3.61387566774648e-05, "loss": 0.4167, "step": 51685 }, { "epoch": 1.8629040977402962, "grad_norm": 0.15977542102336884, "learning_rate": 3.6136144122233497e-05, "loss": 0.3934, "step": 51690 }, { "epoch": 1.863084297401521, "grad_norm": 0.17929469048976898, "learning_rate": 3.6133531415274376e-05, "loss": 0.4395, "step": 51695 }, { "epoch": 1.8632644970627457, "grad_norm": 0.19982290267944336, "learning_rate": 3.613091855662305e-05, "loss": 0.3916, "step": 51700 }, { "epoch": 1.8634446967239702, "grad_norm": 0.16737203299999237, "learning_rate": 3.6128305546315114e-05, "loss": 0.4053, "step": 51705 }, { "epoch": 1.8636248963851947, "grad_norm": 0.1759185642004013, "learning_rate": 
3.6125692384386164e-05, "loss": 0.3961, "step": 51710 }, { "epoch": 1.8638050960464194, "grad_norm": 0.1947273313999176, "learning_rate": 3.612307907087182e-05, "loss": 0.4288, "step": 51715 }, { "epoch": 1.8639852957076442, "grad_norm": 0.14398063719272614, "learning_rate": 3.6120465605807666e-05, "loss": 0.3918, "step": 51720 }, { "epoch": 1.8641654953688687, "grad_norm": 0.18036890029907227, "learning_rate": 3.611785198922933e-05, "loss": 0.422, "step": 51725 }, { "epoch": 1.8643456950300934, "grad_norm": 0.18519078195095062, "learning_rate": 3.611523822117241e-05, "loss": 0.4418, "step": 51730 }, { "epoch": 1.864525894691318, "grad_norm": 0.20603454113006592, "learning_rate": 3.611262430167253e-05, "loss": 0.4232, "step": 51735 }, { "epoch": 1.8647060943525426, "grad_norm": 0.18835075199604034, "learning_rate": 3.6110010230765276e-05, "loss": 0.4057, "step": 51740 }, { "epoch": 1.8648862940137674, "grad_norm": 0.18818290531635284, "learning_rate": 3.6107396008486296e-05, "loss": 0.4408, "step": 51745 }, { "epoch": 1.8650664936749919, "grad_norm": 0.16613999009132385, "learning_rate": 3.61047816348712e-05, "loss": 0.3983, "step": 51750 }, { "epoch": 1.8652466933362164, "grad_norm": 0.19603601098060608, "learning_rate": 3.6102167109955594e-05, "loss": 0.433, "step": 51755 }, { "epoch": 1.8654268929974411, "grad_norm": 0.1884978711605072, "learning_rate": 3.609955243377511e-05, "loss": 0.4285, "step": 51760 }, { "epoch": 1.8656070926586659, "grad_norm": 0.19464312493801117, "learning_rate": 3.609693760636538e-05, "loss": 0.3915, "step": 51765 }, { "epoch": 1.8657872923198906, "grad_norm": 0.20593151450157166, "learning_rate": 3.609432262776202e-05, "loss": 0.4311, "step": 51770 }, { "epoch": 1.8659674919811151, "grad_norm": 0.1879161298274994, "learning_rate": 3.6091707498000666e-05, "loss": 0.4121, "step": 51775 }, { "epoch": 1.8661476916423396, "grad_norm": 0.19726385176181793, "learning_rate": 3.608909221711694e-05, "loss": 0.3844, "step": 51780 }, { "epoch": 
1.8663278913035644, "grad_norm": 0.1846427619457245, "learning_rate": 3.6086476785146486e-05, "loss": 0.4014, "step": 51785 }, { "epoch": 1.866508090964789, "grad_norm": 0.1984597146511078, "learning_rate": 3.6083861202124926e-05, "loss": 0.4117, "step": 51790 }, { "epoch": 1.8666882906260136, "grad_norm": 0.14796842634677887, "learning_rate": 3.60812454680879e-05, "loss": 0.3917, "step": 51795 }, { "epoch": 1.8668684902872381, "grad_norm": 0.21340517699718475, "learning_rate": 3.607862958307106e-05, "loss": 0.3993, "step": 51800 }, { "epoch": 1.8670486899484628, "grad_norm": 0.17566043138504028, "learning_rate": 3.607601354711003e-05, "loss": 0.4282, "step": 51805 }, { "epoch": 1.8672288896096876, "grad_norm": 0.19569726288318634, "learning_rate": 3.607339736024046e-05, "loss": 0.3953, "step": 51810 }, { "epoch": 1.8674090892709123, "grad_norm": 0.18134260177612305, "learning_rate": 3.6070781022497996e-05, "loss": 0.4215, "step": 51815 }, { "epoch": 1.8675892889321368, "grad_norm": 0.18229928612709045, "learning_rate": 3.606816453391828e-05, "loss": 0.3922, "step": 51820 }, { "epoch": 1.8677694885933613, "grad_norm": 0.16680172085762024, "learning_rate": 3.606554789453697e-05, "loss": 0.3905, "step": 51825 }, { "epoch": 1.867949688254586, "grad_norm": 0.20481085777282715, "learning_rate": 3.606293110438972e-05, "loss": 0.4282, "step": 51830 }, { "epoch": 1.8681298879158108, "grad_norm": 0.17302840948104858, "learning_rate": 3.6060314163512164e-05, "loss": 0.4102, "step": 51835 }, { "epoch": 1.8683100875770353, "grad_norm": 0.1852729618549347, "learning_rate": 3.605769707193997e-05, "loss": 0.4104, "step": 51840 }, { "epoch": 1.8684902872382598, "grad_norm": 0.1932128518819809, "learning_rate": 3.6055079829708795e-05, "loss": 0.4389, "step": 51845 }, { "epoch": 1.8686704868994846, "grad_norm": 0.20994549989700317, "learning_rate": 3.60524624368543e-05, "loss": 0.4512, "step": 51850 }, { "epoch": 1.8688506865607093, "grad_norm": 0.2091553956270218, "learning_rate": 
3.6049844893412143e-05, "loss": 0.4099, "step": 51855 }, { "epoch": 1.869030886221934, "grad_norm": 0.16755080223083496, "learning_rate": 3.6047227199417987e-05, "loss": 0.4143, "step": 51860 }, { "epoch": 1.8692110858831585, "grad_norm": 0.23165898025035858, "learning_rate": 3.6044609354907505e-05, "loss": 0.4135, "step": 51865 }, { "epoch": 1.869391285544383, "grad_norm": 0.20910170674324036, "learning_rate": 3.604199135991635e-05, "loss": 0.4071, "step": 51870 }, { "epoch": 1.8695714852056078, "grad_norm": 0.17225544154644012, "learning_rate": 3.603937321448021e-05, "loss": 0.4114, "step": 51875 }, { "epoch": 1.8697516848668325, "grad_norm": 0.17193418741226196, "learning_rate": 3.6036754918634744e-05, "loss": 0.4112, "step": 51880 }, { "epoch": 1.8699318845280573, "grad_norm": 0.17545709013938904, "learning_rate": 3.6034136472415624e-05, "loss": 0.4146, "step": 51885 }, { "epoch": 1.8701120841892818, "grad_norm": 0.17598995566368103, "learning_rate": 3.603151787585853e-05, "loss": 0.3975, "step": 51890 }, { "epoch": 1.8702922838505063, "grad_norm": 0.19214802980422974, "learning_rate": 3.602889912899915e-05, "loss": 0.4483, "step": 51895 }, { "epoch": 1.870472483511731, "grad_norm": 0.19266071915626526, "learning_rate": 3.602628023187315e-05, "loss": 0.4325, "step": 51900 }, { "epoch": 1.8706526831729557, "grad_norm": 0.2069164216518402, "learning_rate": 3.602366118451621e-05, "loss": 0.4078, "step": 51905 }, { "epoch": 1.8708328828341803, "grad_norm": 0.18193595111370087, "learning_rate": 3.6021041986964035e-05, "loss": 0.4199, "step": 51910 }, { "epoch": 1.8710130824954048, "grad_norm": 0.15143196284770966, "learning_rate": 3.601842263925228e-05, "loss": 0.3955, "step": 51915 }, { "epoch": 1.8711932821566295, "grad_norm": 0.1509236842393875, "learning_rate": 3.601580314141666e-05, "loss": 0.3925, "step": 51920 }, { "epoch": 1.8713734818178542, "grad_norm": 0.20824167132377625, "learning_rate": 3.601318349349285e-05, "loss": 0.4159, "step": 51925 }, { "epoch": 
1.871553681479079, "grad_norm": 0.1683613657951355, "learning_rate": 3.601056369551655e-05, "loss": 0.3946, "step": 51930 }, { "epoch": 1.8717338811403035, "grad_norm": 0.20717251300811768, "learning_rate": 3.600794374752346e-05, "loss": 0.4327, "step": 51935 }, { "epoch": 1.871914080801528, "grad_norm": 0.1377881020307541, "learning_rate": 3.600532364954926e-05, "loss": 0.4057, "step": 51940 }, { "epoch": 1.8720942804627527, "grad_norm": 0.20057149231433868, "learning_rate": 3.600270340162966e-05, "loss": 0.4513, "step": 51945 }, { "epoch": 1.8722744801239775, "grad_norm": 0.1850452721118927, "learning_rate": 3.600008300380035e-05, "loss": 0.3977, "step": 51950 }, { "epoch": 1.872454679785202, "grad_norm": 0.21717745065689087, "learning_rate": 3.599746245609703e-05, "loss": 0.3927, "step": 51955 }, { "epoch": 1.8726348794464265, "grad_norm": 0.17817428708076477, "learning_rate": 3.599484175855543e-05, "loss": 0.4003, "step": 51960 }, { "epoch": 1.8728150791076512, "grad_norm": 0.1645549237728119, "learning_rate": 3.599222091121123e-05, "loss": 0.4091, "step": 51965 }, { "epoch": 1.872995278768876, "grad_norm": 0.26613032817840576, "learning_rate": 3.598959991410015e-05, "loss": 0.4515, "step": 51970 }, { "epoch": 1.8731754784301007, "grad_norm": 0.15055564045906067, "learning_rate": 3.5986978767257906e-05, "loss": 0.4026, "step": 51975 }, { "epoch": 1.8733556780913252, "grad_norm": 0.19910356402397156, "learning_rate": 3.59843574707202e-05, "loss": 0.4153, "step": 51980 }, { "epoch": 1.8735358777525497, "grad_norm": 0.17660050094127655, "learning_rate": 3.598173602452274e-05, "loss": 0.4038, "step": 51985 }, { "epoch": 1.8737160774137744, "grad_norm": 0.19887585937976837, "learning_rate": 3.5979114428701265e-05, "loss": 0.4239, "step": 51990 }, { "epoch": 1.8738962770749992, "grad_norm": 0.1948709338903427, "learning_rate": 3.597649268329148e-05, "loss": 0.4, "step": 51995 }, { "epoch": 1.8740764767362237, "grad_norm": 0.2058577537536621, "learning_rate": 
3.59738707883291e-05, "loss": 0.4223, "step": 52000 }, { "epoch": 1.8740764767362237, "eval_loss": 0.4389624297618866, "eval_runtime": 3.5417, "eval_samples_per_second": 28.235, "eval_steps_per_second": 7.059, "step": 52000 }, { "epoch": 1.8742566763974484, "grad_norm": 0.19060571491718292, "learning_rate": 3.5971248743849864e-05, "loss": 0.4135, "step": 52005 }, { "epoch": 1.874436876058673, "grad_norm": 0.14245115220546722, "learning_rate": 3.596862654988948e-05, "loss": 0.3737, "step": 52010 }, { "epoch": 1.8746170757198977, "grad_norm": 0.1959165632724762, "learning_rate": 3.5966004206483687e-05, "loss": 0.4575, "step": 52015 }, { "epoch": 1.8747972753811224, "grad_norm": 0.17014524340629578, "learning_rate": 3.596338171366821e-05, "loss": 0.3932, "step": 52020 }, { "epoch": 1.874977475042347, "grad_norm": 0.18894478678703308, "learning_rate": 3.596075907147878e-05, "loss": 0.387, "step": 52025 }, { "epoch": 1.8751576747035714, "grad_norm": 0.1582806408405304, "learning_rate": 3.595813627995113e-05, "loss": 0.39, "step": 52030 }, { "epoch": 1.8753378743647962, "grad_norm": 0.20237891376018524, "learning_rate": 3.595551333912099e-05, "loss": 0.4171, "step": 52035 }, { "epoch": 1.8755180740260209, "grad_norm": 0.1860736459493637, "learning_rate": 3.595289024902411e-05, "loss": 0.406, "step": 52040 }, { "epoch": 1.8756982736872456, "grad_norm": 0.191467747092247, "learning_rate": 3.5950267009696206e-05, "loss": 0.4538, "step": 52045 }, { "epoch": 1.8758784733484701, "grad_norm": 0.1970950961112976, "learning_rate": 3.594764362117305e-05, "loss": 0.3997, "step": 52050 }, { "epoch": 1.8760586730096946, "grad_norm": 0.2063102424144745, "learning_rate": 3.594502008349036e-05, "loss": 0.4225, "step": 52055 }, { "epoch": 1.8762388726709194, "grad_norm": 0.18709443509578705, "learning_rate": 3.59423963966839e-05, "loss": 0.3833, "step": 52060 }, { "epoch": 1.8764190723321441, "grad_norm": 0.1764218658208847, "learning_rate": 3.59397725607894e-05, "loss": 0.4111, "step": 
52065 }, { "epoch": 1.8765992719933686, "grad_norm": 0.1787770837545395, "learning_rate": 3.593714857584261e-05, "loss": 0.4018, "step": 52070 }, { "epoch": 1.8767794716545931, "grad_norm": 0.1761235147714615, "learning_rate": 3.59345244418793e-05, "loss": 0.4015, "step": 52075 }, { "epoch": 1.8769596713158179, "grad_norm": 0.1807931810617447, "learning_rate": 3.593190015893521e-05, "loss": 0.4127, "step": 52080 }, { "epoch": 1.8771398709770426, "grad_norm": 0.16054491698741913, "learning_rate": 3.5929275727046095e-05, "loss": 0.4037, "step": 52085 }, { "epoch": 1.8773200706382673, "grad_norm": 0.23387478291988373, "learning_rate": 3.5926651146247715e-05, "loss": 0.429, "step": 52090 }, { "epoch": 1.8775002702994918, "grad_norm": 0.17151391506195068, "learning_rate": 3.5924026416575826e-05, "loss": 0.432, "step": 52095 }, { "epoch": 1.8776804699607164, "grad_norm": 0.20386628806591034, "learning_rate": 3.5921401538066195e-05, "loss": 0.419, "step": 52100 }, { "epoch": 1.877860669621941, "grad_norm": 0.16705821454524994, "learning_rate": 3.591877651075458e-05, "loss": 0.4308, "step": 52105 }, { "epoch": 1.8780408692831658, "grad_norm": 0.16681896150112152, "learning_rate": 3.591615133467675e-05, "loss": 0.4394, "step": 52110 }, { "epoch": 1.8782210689443903, "grad_norm": 0.16043955087661743, "learning_rate": 3.591352600986847e-05, "loss": 0.3904, "step": 52115 }, { "epoch": 1.878401268605615, "grad_norm": 0.24545589089393616, "learning_rate": 3.5910900536365517e-05, "loss": 0.4429, "step": 52120 }, { "epoch": 1.8785814682668396, "grad_norm": 0.18531683087348938, "learning_rate": 3.590827491420365e-05, "loss": 0.3819, "step": 52125 }, { "epoch": 1.8787616679280643, "grad_norm": 0.21406356990337372, "learning_rate": 3.5905649143418654e-05, "loss": 0.4126, "step": 52130 }, { "epoch": 1.878941867589289, "grad_norm": 0.2111523151397705, "learning_rate": 3.590302322404629e-05, "loss": 0.4129, "step": 52135 }, { "epoch": 1.8791220672505136, "grad_norm": 
0.19074268639087677, "learning_rate": 3.590039715612236e-05, "loss": 0.3669, "step": 52140 }, { "epoch": 1.879302266911738, "grad_norm": 0.1679174304008484, "learning_rate": 3.5897770939682616e-05, "loss": 0.4018, "step": 52145 }, { "epoch": 1.8794824665729628, "grad_norm": 0.28113430738449097, "learning_rate": 3.5895144574762855e-05, "loss": 0.4003, "step": 52150 }, { "epoch": 1.8796626662341875, "grad_norm": 0.1835562139749527, "learning_rate": 3.589251806139887e-05, "loss": 0.3801, "step": 52155 }, { "epoch": 1.8798428658954123, "grad_norm": 0.21328282356262207, "learning_rate": 3.588989139962642e-05, "loss": 0.4041, "step": 52160 }, { "epoch": 1.8800230655566368, "grad_norm": 0.15331655740737915, "learning_rate": 3.5887264589481324e-05, "loss": 0.399, "step": 52165 }, { "epoch": 1.8802032652178613, "grad_norm": 0.20487786829471588, "learning_rate": 3.588463763099934e-05, "loss": 0.4275, "step": 52170 }, { "epoch": 1.880383464879086, "grad_norm": 0.17890450358390808, "learning_rate": 3.5882010524216284e-05, "loss": 0.4195, "step": 52175 }, { "epoch": 1.8805636645403108, "grad_norm": 0.1512042135000229, "learning_rate": 3.587938326916794e-05, "loss": 0.398, "step": 52180 }, { "epoch": 1.8807438642015353, "grad_norm": 0.17925597727298737, "learning_rate": 3.587675586589011e-05, "loss": 0.4313, "step": 52185 }, { "epoch": 1.8809240638627598, "grad_norm": 0.20325055718421936, "learning_rate": 3.587412831441858e-05, "loss": 0.3756, "step": 52190 }, { "epoch": 1.8811042635239845, "grad_norm": 0.1322261095046997, "learning_rate": 3.5871500614789155e-05, "loss": 0.3894, "step": 52195 }, { "epoch": 1.8812844631852093, "grad_norm": 0.1917523741722107, "learning_rate": 3.5868872767037646e-05, "loss": 0.4009, "step": 52200 }, { "epoch": 1.881464662846434, "grad_norm": 0.174628347158432, "learning_rate": 3.5866244771199855e-05, "loss": 0.395, "step": 52205 }, { "epoch": 1.8816448625076585, "grad_norm": 0.17742396891117096, "learning_rate": 3.586361662731157e-05, "loss": 
0.4242, "step": 52210 }, { "epoch": 1.881825062168883, "grad_norm": 0.1793578863143921, "learning_rate": 3.5860988335408616e-05, "loss": 0.4331, "step": 52215 }, { "epoch": 1.8820052618301077, "grad_norm": 0.18306861817836761, "learning_rate": 3.5858359895526807e-05, "loss": 0.4215, "step": 52220 }, { "epoch": 1.8821854614913325, "grad_norm": 0.16969892382621765, "learning_rate": 3.585573130770193e-05, "loss": 0.4109, "step": 52225 }, { "epoch": 1.882365661152557, "grad_norm": 0.1748346984386444, "learning_rate": 3.585310257196983e-05, "loss": 0.3951, "step": 52230 }, { "epoch": 1.8825458608137817, "grad_norm": 0.20251350104808807, "learning_rate": 3.5850473688366306e-05, "loss": 0.4113, "step": 52235 }, { "epoch": 1.8827260604750062, "grad_norm": 0.19220057129859924, "learning_rate": 3.5847844656927176e-05, "loss": 0.3578, "step": 52240 }, { "epoch": 1.882906260136231, "grad_norm": 0.17758119106292725, "learning_rate": 3.584521547768826e-05, "loss": 0.3993, "step": 52245 }, { "epoch": 1.8830864597974557, "grad_norm": 0.17641225457191467, "learning_rate": 3.584258615068539e-05, "loss": 0.4343, "step": 52250 }, { "epoch": 1.8832666594586802, "grad_norm": 0.1818428784608841, "learning_rate": 3.583995667595437e-05, "loss": 0.3988, "step": 52255 }, { "epoch": 1.8834468591199047, "grad_norm": 0.16791364550590515, "learning_rate": 3.583732705353105e-05, "loss": 0.4238, "step": 52260 }, { "epoch": 1.8836270587811295, "grad_norm": 0.17646898329257965, "learning_rate": 3.5834697283451244e-05, "loss": 0.4154, "step": 52265 }, { "epoch": 1.8838072584423542, "grad_norm": 0.1645815521478653, "learning_rate": 3.5832067365750794e-05, "loss": 0.4345, "step": 52270 }, { "epoch": 1.883987458103579, "grad_norm": 0.1957109421491623, "learning_rate": 3.5829437300465504e-05, "loss": 0.3884, "step": 52275 }, { "epoch": 1.8841676577648034, "grad_norm": 0.20072638988494873, "learning_rate": 3.5826807087631243e-05, "loss": 0.3996, "step": 52280 }, { "epoch": 1.884347857426028, "grad_norm": 
0.1894032508134842, "learning_rate": 3.582417672728383e-05, "loss": 0.3905, "step": 52285 }, { "epoch": 1.8845280570872527, "grad_norm": 0.20433971285820007, "learning_rate": 3.58215462194591e-05, "loss": 0.4112, "step": 52290 }, { "epoch": 1.8847082567484774, "grad_norm": 0.19983576238155365, "learning_rate": 3.58189155641929e-05, "loss": 0.4133, "step": 52295 }, { "epoch": 1.884888456409702, "grad_norm": 0.1698193997144699, "learning_rate": 3.581628476152107e-05, "loss": 0.4222, "step": 52300 }, { "epoch": 1.8850686560709264, "grad_norm": 0.18257997930049896, "learning_rate": 3.581365381147946e-05, "loss": 0.4192, "step": 52305 }, { "epoch": 1.8852488557321512, "grad_norm": 0.22537992894649506, "learning_rate": 3.5811022714103906e-05, "loss": 0.4401, "step": 52310 }, { "epoch": 1.885429055393376, "grad_norm": 0.19952908158302307, "learning_rate": 3.580839146943026e-05, "loss": 0.3866, "step": 52315 }, { "epoch": 1.8856092550546006, "grad_norm": 0.25064241886138916, "learning_rate": 3.5805760077494366e-05, "loss": 0.4678, "step": 52320 }, { "epoch": 1.8857894547158252, "grad_norm": 0.19262924790382385, "learning_rate": 3.580312853833209e-05, "loss": 0.4435, "step": 52325 }, { "epoch": 1.8859696543770497, "grad_norm": 0.12994927167892456, "learning_rate": 3.580049685197928e-05, "loss": 0.3827, "step": 52330 }, { "epoch": 1.8861498540382744, "grad_norm": 0.12715496122837067, "learning_rate": 3.5797865018471785e-05, "loss": 0.4211, "step": 52335 }, { "epoch": 1.8863300536994991, "grad_norm": 0.18410317599773407, "learning_rate": 3.5795233037845475e-05, "loss": 0.4057, "step": 52340 }, { "epoch": 1.8865102533607236, "grad_norm": 0.2159508764743805, "learning_rate": 3.579260091013621e-05, "loss": 0.4253, "step": 52345 }, { "epoch": 1.8866904530219482, "grad_norm": 0.23478345572948456, "learning_rate": 3.578996863537983e-05, "loss": 0.4077, "step": 52350 }, { "epoch": 1.8868706526831729, "grad_norm": 0.13263140618801117, "learning_rate": 3.578733621361223e-05, "loss": 
0.381, "step": 52355 }, { "epoch": 1.8870508523443976, "grad_norm": 0.19846735894680023, "learning_rate": 3.578470364486926e-05, "loss": 0.4142, "step": 52360 }, { "epoch": 1.8872310520056224, "grad_norm": 0.1773809790611267, "learning_rate": 3.578259748407673e-05, "loss": 0.4116, "step": 52365 }, { "epoch": 1.8874112516668469, "grad_norm": 0.1703425943851471, "learning_rate": 3.577996465086848e-05, "loss": 0.4306, "step": 52370 }, { "epoch": 1.8875914513280714, "grad_norm": 0.1680893748998642, "learning_rate": 3.5777331670785305e-05, "loss": 0.4214, "step": 52375 }, { "epoch": 1.8877716509892961, "grad_norm": 0.2334313541650772, "learning_rate": 3.577469854386308e-05, "loss": 0.4327, "step": 52380 }, { "epoch": 1.8879518506505208, "grad_norm": 0.19660332798957825, "learning_rate": 3.5772065270137665e-05, "loss": 0.4319, "step": 52385 }, { "epoch": 1.8881320503117456, "grad_norm": 0.20360225439071655, "learning_rate": 3.5769431849644955e-05, "loss": 0.4157, "step": 52390 }, { "epoch": 1.88831224997297, "grad_norm": 0.1686132401227951, "learning_rate": 3.5766798282420814e-05, "loss": 0.3861, "step": 52395 }, { "epoch": 1.8884924496341946, "grad_norm": 0.2118513137102127, "learning_rate": 3.576416456850113e-05, "loss": 0.4282, "step": 52400 }, { "epoch": 1.8886726492954193, "grad_norm": 0.16424524784088135, "learning_rate": 3.5761530707921794e-05, "loss": 0.4134, "step": 52405 }, { "epoch": 1.888852848956644, "grad_norm": 0.17415526509284973, "learning_rate": 3.575889670071868e-05, "loss": 0.4505, "step": 52410 }, { "epoch": 1.8890330486178686, "grad_norm": 0.21902436017990112, "learning_rate": 3.575626254692768e-05, "loss": 0.3874, "step": 52415 }, { "epoch": 1.889213248279093, "grad_norm": 0.16111133992671967, "learning_rate": 3.5753628246584694e-05, "loss": 0.4127, "step": 52420 }, { "epoch": 1.8893934479403178, "grad_norm": 0.1768888384103775, "learning_rate": 3.57509937997256e-05, "loss": 0.4033, "step": 52425 }, { "epoch": 1.8895736476015426, "grad_norm": 
0.18009909987449646, "learning_rate": 3.574835920638629e-05, "loss": 0.4052, "step": 52430 }, { "epoch": 1.8897538472627673, "grad_norm": 0.21755820512771606, "learning_rate": 3.574572446660268e-05, "loss": 0.4571, "step": 52435 }, { "epoch": 1.8899340469239918, "grad_norm": 0.21082909405231476, "learning_rate": 3.574308958041064e-05, "loss": 0.3826, "step": 52440 }, { "epoch": 1.8901142465852163, "grad_norm": 0.23044735193252563, "learning_rate": 3.57404545478461e-05, "loss": 0.4161, "step": 52445 }, { "epoch": 1.890294446246441, "grad_norm": 0.21793107688426971, "learning_rate": 3.573781936894493e-05, "loss": 0.4393, "step": 52450 }, { "epoch": 1.8904746459076658, "grad_norm": 0.15808548033237457, "learning_rate": 3.573518404374306e-05, "loss": 0.3924, "step": 52455 }, { "epoch": 1.8906548455688903, "grad_norm": 0.18602894246578217, "learning_rate": 3.5732548572276386e-05, "loss": 0.4046, "step": 52460 }, { "epoch": 1.8908350452301148, "grad_norm": 0.22336095571517944, "learning_rate": 3.572991295458081e-05, "loss": 0.4219, "step": 52465 }, { "epoch": 1.8910152448913395, "grad_norm": 0.1639654040336609, "learning_rate": 3.572727719069225e-05, "loss": 0.4163, "step": 52470 }, { "epoch": 1.8911954445525643, "grad_norm": 0.14199529588222504, "learning_rate": 3.572464128064662e-05, "loss": 0.4071, "step": 52475 }, { "epoch": 1.891375644213789, "grad_norm": 0.22127693891525269, "learning_rate": 3.5722005224479826e-05, "loss": 0.3902, "step": 52480 }, { "epoch": 1.8915558438750135, "grad_norm": 0.19781959056854248, "learning_rate": 3.571936902222778e-05, "loss": 0.4002, "step": 52485 }, { "epoch": 1.891736043536238, "grad_norm": 0.19771304726600647, "learning_rate": 3.571673267392642e-05, "loss": 0.4017, "step": 52490 }, { "epoch": 1.8919162431974628, "grad_norm": 0.22566063702106476, "learning_rate": 3.571409617961164e-05, "loss": 0.3925, "step": 52495 }, { "epoch": 1.8920964428586875, "grad_norm": 0.1664046347141266, "learning_rate": 3.571145953931938e-05, "loss": 
0.3956, "step": 52500 }, { "epoch": 1.8920964428586875, "eval_loss": 0.4388228952884674, "eval_runtime": 3.5346, "eval_samples_per_second": 28.292, "eval_steps_per_second": 7.073, "step": 52500 }, { "epoch": 1.892276642519912, "grad_norm": 0.18760746717453003, "learning_rate": 3.5708822753085555e-05, "loss": 0.407, "step": 52505 }, { "epoch": 1.8924568421811367, "grad_norm": 0.19066941738128662, "learning_rate": 3.5706185820946094e-05, "loss": 0.3779, "step": 52510 }, { "epoch": 1.8926370418423613, "grad_norm": 0.18899337947368622, "learning_rate": 3.570354874293692e-05, "loss": 0.355, "step": 52515 }, { "epoch": 1.892817241503586, "grad_norm": 0.22324331104755402, "learning_rate": 3.570091151909397e-05, "loss": 0.406, "step": 52520 }, { "epoch": 1.8929974411648107, "grad_norm": 0.18958257138729095, "learning_rate": 3.569827414945317e-05, "loss": 0.426, "step": 52525 }, { "epoch": 1.8931776408260352, "grad_norm": 0.20323504507541656, "learning_rate": 3.5695636634050466e-05, "loss": 0.4198, "step": 52530 }, { "epoch": 1.8933578404872597, "grad_norm": 0.2171027809381485, "learning_rate": 3.569299897292177e-05, "loss": 0.4075, "step": 52535 }, { "epoch": 1.8935380401484845, "grad_norm": 0.17089755833148956, "learning_rate": 3.5690361166103045e-05, "loss": 0.3932, "step": 52540 }, { "epoch": 1.8937182398097092, "grad_norm": 0.21072880923748016, "learning_rate": 3.568772321363021e-05, "loss": 0.4229, "step": 52545 }, { "epoch": 1.893898439470934, "grad_norm": 0.20696797966957092, "learning_rate": 3.5685085115539225e-05, "loss": 0.4261, "step": 52550 }, { "epoch": 1.8940786391321585, "grad_norm": 0.1554139405488968, "learning_rate": 3.568244687186602e-05, "loss": 0.388, "step": 52555 }, { "epoch": 1.894258838793383, "grad_norm": 0.19356495141983032, "learning_rate": 3.5679808482646535e-05, "loss": 0.4374, "step": 52560 }, { "epoch": 1.8944390384546077, "grad_norm": 0.14796990156173706, "learning_rate": 3.567716994791674e-05, "loss": 0.3883, "step": 52565 }, { "epoch": 
1.8946192381158324, "grad_norm": 0.21466509997844696, "learning_rate": 3.5674531267712566e-05, "loss": 0.435, "step": 52570 }, { "epoch": 1.894799437777057, "grad_norm": 0.16786347329616547, "learning_rate": 3.567189244206996e-05, "loss": 0.3936, "step": 52575 }, { "epoch": 1.8949796374382815, "grad_norm": 0.2334979772567749, "learning_rate": 3.5669253471024904e-05, "loss": 0.4226, "step": 52580 }, { "epoch": 1.8951598370995062, "grad_norm": 0.14717628061771393, "learning_rate": 3.5666614354613325e-05, "loss": 0.3976, "step": 52585 }, { "epoch": 1.895340036760731, "grad_norm": 0.17818215489387512, "learning_rate": 3.5663975092871194e-05, "loss": 0.3997, "step": 52590 }, { "epoch": 1.8955202364219557, "grad_norm": 0.20797117054462433, "learning_rate": 3.5661335685834466e-05, "loss": 0.4407, "step": 52595 }, { "epoch": 1.8957004360831802, "grad_norm": 0.17417512834072113, "learning_rate": 3.56586961335391e-05, "loss": 0.4273, "step": 52600 }, { "epoch": 1.8958806357444047, "grad_norm": 0.22176715731620789, "learning_rate": 3.565605643602107e-05, "loss": 0.4448, "step": 52605 }, { "epoch": 1.8960608354056294, "grad_norm": 0.1701037585735321, "learning_rate": 3.565341659331633e-05, "loss": 0.3807, "step": 52610 }, { "epoch": 1.8962410350668542, "grad_norm": 0.17541612684726715, "learning_rate": 3.565077660546085e-05, "loss": 0.4121, "step": 52615 }, { "epoch": 1.8964212347280787, "grad_norm": 0.17228418588638306, "learning_rate": 3.5648136472490604e-05, "loss": 0.4114, "step": 52620 }, { "epoch": 1.8966014343893034, "grad_norm": 0.1823887676000595, "learning_rate": 3.5645496194441555e-05, "loss": 0.422, "step": 52625 }, { "epoch": 1.896781634050528, "grad_norm": 0.181901216506958, "learning_rate": 3.564285577134969e-05, "loss": 0.4025, "step": 52630 }, { "epoch": 1.8969618337117526, "grad_norm": 0.19372449815273285, "learning_rate": 3.564021520325096e-05, "loss": 0.4179, "step": 52635 }, { "epoch": 1.8971420333729774, "grad_norm": 0.2167021781206131, "learning_rate": 
3.5637574490181376e-05, "loss": 0.4143, "step": 52640 }, { "epoch": 1.8973222330342019, "grad_norm": 0.20177282392978668, "learning_rate": 3.563493363217689e-05, "loss": 0.3899, "step": 52645 }, { "epoch": 1.8975024326954264, "grad_norm": 0.18920306861400604, "learning_rate": 3.563229262927349e-05, "loss": 0.4006, "step": 52650 }, { "epoch": 1.8976826323566511, "grad_norm": 0.14058130979537964, "learning_rate": 3.562965148150716e-05, "loss": 0.3701, "step": 52655 }, { "epoch": 1.8978628320178759, "grad_norm": 0.20638848841190338, "learning_rate": 3.56270101889139e-05, "loss": 0.4196, "step": 52660 }, { "epoch": 1.8980430316791006, "grad_norm": 0.22467444837093353, "learning_rate": 3.562436875152967e-05, "loss": 0.4454, "step": 52665 }, { "epoch": 1.8982232313403251, "grad_norm": 0.17210647463798523, "learning_rate": 3.562172716939048e-05, "loss": 0.4351, "step": 52670 }, { "epoch": 1.8984034310015496, "grad_norm": 0.1578112095594406, "learning_rate": 3.561908544253231e-05, "loss": 0.4413, "step": 52675 }, { "epoch": 1.8985836306627744, "grad_norm": 0.15398284792900085, "learning_rate": 3.561644357099116e-05, "loss": 0.3976, "step": 52680 }, { "epoch": 1.898763830323999, "grad_norm": 0.18252988159656525, "learning_rate": 3.561380155480302e-05, "loss": 0.4166, "step": 52685 }, { "epoch": 1.8989440299852236, "grad_norm": 0.16655157506465912, "learning_rate": 3.561115939400389e-05, "loss": 0.4013, "step": 52690 }, { "epoch": 1.8991242296464481, "grad_norm": 0.1663331389427185, "learning_rate": 3.560851708862977e-05, "loss": 0.3884, "step": 52695 }, { "epoch": 1.8993044293076728, "grad_norm": 0.16247087717056274, "learning_rate": 3.560587463871665e-05, "loss": 0.3918, "step": 52700 }, { "epoch": 1.8994846289688976, "grad_norm": 0.18449072539806366, "learning_rate": 3.560323204430055e-05, "loss": 0.4069, "step": 52705 }, { "epoch": 1.8996648286301223, "grad_norm": 0.19125069677829742, "learning_rate": 3.560058930541746e-05, "loss": 0.4051, "step": 52710 }, { "epoch": 
1.8998450282913468, "grad_norm": 0.15312950313091278, "learning_rate": 3.55979464221034e-05, "loss": 0.4207, "step": 52715 }, { "epoch": 1.9000252279525713, "grad_norm": 0.2004600614309311, "learning_rate": 3.559530339439436e-05, "loss": 0.4074, "step": 52720 }, { "epoch": 1.900205427613796, "grad_norm": 0.1745137721300125, "learning_rate": 3.5592660222326375e-05, "loss": 0.3963, "step": 52725 }, { "epoch": 1.9003856272750208, "grad_norm": 0.15934212505817413, "learning_rate": 3.5590016905935436e-05, "loss": 0.3571, "step": 52730 }, { "epoch": 1.9005658269362453, "grad_norm": 0.20348936319351196, "learning_rate": 3.558737344525758e-05, "loss": 0.4435, "step": 52735 }, { "epoch": 1.90074602659747, "grad_norm": 0.20140963792800903, "learning_rate": 3.558472984032879e-05, "loss": 0.4502, "step": 52740 }, { "epoch": 1.9009262262586946, "grad_norm": 0.16998355090618134, "learning_rate": 3.558208609118512e-05, "loss": 0.4119, "step": 52745 }, { "epoch": 1.9011064259199193, "grad_norm": 0.17498774826526642, "learning_rate": 3.5579442197862575e-05, "loss": 0.3913, "step": 52750 }, { "epoch": 1.901286625581144, "grad_norm": 0.1859028935432434, "learning_rate": 3.557679816039717e-05, "loss": 0.4298, "step": 52755 }, { "epoch": 1.9014668252423685, "grad_norm": 0.21767857670783997, "learning_rate": 3.5574153978824945e-05, "loss": 0.4364, "step": 52760 }, { "epoch": 1.901647024903593, "grad_norm": 0.1995837390422821, "learning_rate": 3.557150965318192e-05, "loss": 0.408, "step": 52765 }, { "epoch": 1.9018272245648178, "grad_norm": 0.22697186470031738, "learning_rate": 3.5568865183504127e-05, "loss": 0.4431, "step": 52770 }, { "epoch": 1.9020074242260425, "grad_norm": 0.17042651772499084, "learning_rate": 3.556622056982758e-05, "loss": 0.3834, "step": 52775 }, { "epoch": 1.9021876238872673, "grad_norm": 0.16556668281555176, "learning_rate": 3.556357581218833e-05, "loss": 0.4083, "step": 52780 }, { "epoch": 1.9023678235484918, "grad_norm": 0.16349823772907257, "learning_rate": 
3.556093091062241e-05, "loss": 0.3985, "step": 52785 }, { "epoch": 1.9025480232097163, "grad_norm": 0.18236073851585388, "learning_rate": 3.555828586516584e-05, "loss": 0.4089, "step": 52790 }, { "epoch": 1.902728222870941, "grad_norm": 0.15176883339881897, "learning_rate": 3.5555640675854675e-05, "loss": 0.4, "step": 52795 }, { "epoch": 1.9029084225321657, "grad_norm": 0.16588926315307617, "learning_rate": 3.555299534272495e-05, "loss": 0.4054, "step": 52800 }, { "epoch": 1.9030886221933903, "grad_norm": 0.17433229088783264, "learning_rate": 3.55503498658127e-05, "loss": 0.4082, "step": 52805 }, { "epoch": 1.9032688218546148, "grad_norm": 0.178829625248909, "learning_rate": 3.5547704245153984e-05, "loss": 0.4344, "step": 52810 }, { "epoch": 1.9034490215158395, "grad_norm": 0.259390652179718, "learning_rate": 3.554505848078483e-05, "loss": 0.407, "step": 52815 }, { "epoch": 1.9036292211770642, "grad_norm": 0.20816822350025177, "learning_rate": 3.554241257274131e-05, "loss": 0.4278, "step": 52820 }, { "epoch": 1.903809420838289, "grad_norm": 0.22725282609462738, "learning_rate": 3.5539766521059455e-05, "loss": 0.414, "step": 52825 }, { "epoch": 1.9039896204995135, "grad_norm": 0.20979876816272736, "learning_rate": 3.553712032577532e-05, "loss": 0.4301, "step": 52830 }, { "epoch": 1.904169820160738, "grad_norm": 0.16655777394771576, "learning_rate": 3.5534473986924954e-05, "loss": 0.4324, "step": 52835 }, { "epoch": 1.9043500198219627, "grad_norm": 0.165932297706604, "learning_rate": 3.553182750454442e-05, "loss": 0.4295, "step": 52840 }, { "epoch": 1.9045302194831875, "grad_norm": 0.1914936900138855, "learning_rate": 3.552918087866979e-05, "loss": 0.4105, "step": 52845 }, { "epoch": 1.904710419144412, "grad_norm": 0.15866640210151672, "learning_rate": 3.552653410933709e-05, "loss": 0.3874, "step": 52850 }, { "epoch": 1.9048906188056365, "grad_norm": 0.15294025838375092, "learning_rate": 3.552388719658242e-05, "loss": 0.3981, "step": 52855 }, { "epoch": 
1.9050708184668612, "grad_norm": 0.19104799628257751, "learning_rate": 3.552124014044181e-05, "loss": 0.4481, "step": 52860 }, { "epoch": 1.905251018128086, "grad_norm": 0.1697293370962143, "learning_rate": 3.551859294095135e-05, "loss": 0.4, "step": 52865 }, { "epoch": 1.9054312177893107, "grad_norm": 0.17631365358829498, "learning_rate": 3.551594559814709e-05, "loss": 0.3584, "step": 52870 }, { "epoch": 1.9056114174505352, "grad_norm": 0.20345862209796906, "learning_rate": 3.551329811206511e-05, "loss": 0.4251, "step": 52875 }, { "epoch": 1.9057916171117597, "grad_norm": 0.1694919317960739, "learning_rate": 3.551065048274147e-05, "loss": 0.4144, "step": 52880 }, { "epoch": 1.9059718167729844, "grad_norm": 0.1483502835035324, "learning_rate": 3.550800271021226e-05, "loss": 0.3912, "step": 52885 }, { "epoch": 1.9061520164342092, "grad_norm": 0.169077530503273, "learning_rate": 3.550535479451356e-05, "loss": 0.4053, "step": 52890 }, { "epoch": 1.906332216095434, "grad_norm": 0.20600876212120056, "learning_rate": 3.550270673568141e-05, "loss": 0.4135, "step": 52895 }, { "epoch": 1.9065124157566584, "grad_norm": 0.2073904126882553, "learning_rate": 3.550005853375193e-05, "loss": 0.4526, "step": 52900 }, { "epoch": 1.906692615417883, "grad_norm": 0.17720499634742737, "learning_rate": 3.549741018876118e-05, "loss": 0.4018, "step": 52905 }, { "epoch": 1.9068728150791077, "grad_norm": 0.1841294914484024, "learning_rate": 3.549476170074526e-05, "loss": 0.4054, "step": 52910 }, { "epoch": 1.9070530147403324, "grad_norm": 0.20570293068885803, "learning_rate": 3.549211306974023e-05, "loss": 0.4207, "step": 52915 }, { "epoch": 1.907233214401557, "grad_norm": 0.1706700176000595, "learning_rate": 3.548946429578219e-05, "loss": 0.4166, "step": 52920 }, { "epoch": 1.9074134140627814, "grad_norm": 0.19486333429813385, "learning_rate": 3.5486815378907234e-05, "loss": 0.4408, "step": 52925 }, { "epoch": 1.9075936137240062, "grad_norm": 0.15442302823066711, "learning_rate": 
3.548416631915146e-05, "loss": 0.4376, "step": 52930 }, { "epoch": 1.9077738133852309, "grad_norm": 0.1942632794380188, "learning_rate": 3.5481517116550936e-05, "loss": 0.3904, "step": 52935 }, { "epoch": 1.9079540130464556, "grad_norm": 0.16237981617450714, "learning_rate": 3.547886777114177e-05, "loss": 0.353, "step": 52940 }, { "epoch": 1.9081342127076801, "grad_norm": 0.2059374898672104, "learning_rate": 3.5476218282960064e-05, "loss": 0.4277, "step": 52945 }, { "epoch": 1.9083144123689046, "grad_norm": 0.1902727484703064, "learning_rate": 3.547356865204191e-05, "loss": 0.4187, "step": 52950 }, { "epoch": 1.9084946120301294, "grad_norm": 0.19110523164272308, "learning_rate": 3.5470918878423414e-05, "loss": 0.4122, "step": 52955 }, { "epoch": 1.9086748116913541, "grad_norm": 0.19622482359409332, "learning_rate": 3.546826896214067e-05, "loss": 0.3987, "step": 52960 }, { "epoch": 1.9088550113525786, "grad_norm": 0.19886255264282227, "learning_rate": 3.546561890322979e-05, "loss": 0.4114, "step": 52965 }, { "epoch": 1.9090352110138031, "grad_norm": 0.1465664505958557, "learning_rate": 3.546296870172689e-05, "loss": 0.3839, "step": 52970 }, { "epoch": 1.9092154106750279, "grad_norm": 0.19967186450958252, "learning_rate": 3.546031835766806e-05, "loss": 0.4433, "step": 52975 }, { "epoch": 1.9093956103362526, "grad_norm": 0.17404790222644806, "learning_rate": 3.545766787108941e-05, "loss": 0.4215, "step": 52980 }, { "epoch": 1.9095758099974773, "grad_norm": 0.15910834074020386, "learning_rate": 3.545501724202706e-05, "loss": 0.3836, "step": 52985 }, { "epoch": 1.9097560096587018, "grad_norm": 0.17526118457317352, "learning_rate": 3.545236647051713e-05, "loss": 0.3867, "step": 52990 }, { "epoch": 1.9099362093199264, "grad_norm": 0.1866360753774643, "learning_rate": 3.544971555659574e-05, "loss": 0.3812, "step": 52995 }, { "epoch": 1.910116408981151, "grad_norm": 0.1662186235189438, "learning_rate": 3.544706450029898e-05, "loss": 0.4114, "step": 53000 }, { "epoch": 
1.910116408981151, "eval_loss": 0.4382114112377167, "eval_runtime": 3.5422, "eval_samples_per_second": 28.231, "eval_steps_per_second": 7.058, "step": 53000 }, { "epoch": 1.9102966086423758, "grad_norm": 0.1859690397977829, "learning_rate": 3.5444413301662996e-05, "loss": 0.4291, "step": 53005 }, { "epoch": 1.9104768083036003, "grad_norm": 0.2037961333990097, "learning_rate": 3.544176196072391e-05, "loss": 0.4178, "step": 53010 }, { "epoch": 1.910657007964825, "grad_norm": 0.19950434565544128, "learning_rate": 3.543911047751783e-05, "loss": 0.3977, "step": 53015 }, { "epoch": 1.9108372076260496, "grad_norm": 0.169523224234581, "learning_rate": 3.5436458852080895e-05, "loss": 0.4187, "step": 53020 }, { "epoch": 1.9110174072872743, "grad_norm": 0.19487643241882324, "learning_rate": 3.543380708444923e-05, "loss": 0.4207, "step": 53025 }, { "epoch": 1.911197606948499, "grad_norm": 0.1871878057718277, "learning_rate": 3.543115517465896e-05, "loss": 0.4376, "step": 53030 }, { "epoch": 1.9113778066097236, "grad_norm": 0.17613734304904938, "learning_rate": 3.542850312274622e-05, "loss": 0.4136, "step": 53035 }, { "epoch": 1.911558006270948, "grad_norm": 0.17650562524795532, "learning_rate": 3.542585092874714e-05, "loss": 0.3665, "step": 53040 }, { "epoch": 1.9117382059321728, "grad_norm": 0.16049924492835999, "learning_rate": 3.542319859269787e-05, "loss": 0.3824, "step": 53045 }, { "epoch": 1.9119184055933975, "grad_norm": 0.1577320694923401, "learning_rate": 3.5420546114634535e-05, "loss": 0.3998, "step": 53050 }, { "epoch": 1.9120986052546223, "grad_norm": 0.22599272429943085, "learning_rate": 3.541789349459327e-05, "loss": 0.4059, "step": 53055 }, { "epoch": 1.9122788049158468, "grad_norm": 0.16319070756435394, "learning_rate": 3.541524073261023e-05, "loss": 0.3643, "step": 53060 }, { "epoch": 1.9124590045770713, "grad_norm": 0.20491214096546173, "learning_rate": 3.541258782872154e-05, "loss": 0.412, "step": 53065 }, { "epoch": 1.912639204238296, "grad_norm": 
0.14605240523815155, "learning_rate": 3.540993478296337e-05, "loss": 0.3939, "step": 53070 }, { "epoch": 1.9128194038995208, "grad_norm": 0.18107111752033234, "learning_rate": 3.540728159537185e-05, "loss": 0.4318, "step": 53075 }, { "epoch": 1.9129996035607453, "grad_norm": 0.16898123919963837, "learning_rate": 3.540462826598313e-05, "loss": 0.4414, "step": 53080 }, { "epoch": 1.9131798032219698, "grad_norm": 0.1803431212902069, "learning_rate": 3.540197479483337e-05, "loss": 0.4014, "step": 53085 }, { "epoch": 1.9133600028831945, "grad_norm": 0.17735476791858673, "learning_rate": 3.539932118195871e-05, "loss": 0.3945, "step": 53090 }, { "epoch": 1.9135402025444193, "grad_norm": 0.1508413851261139, "learning_rate": 3.539666742739532e-05, "loss": 0.4031, "step": 53095 }, { "epoch": 1.913720402205644, "grad_norm": 0.21895846724510193, "learning_rate": 3.539401353117935e-05, "loss": 0.4405, "step": 53100 }, { "epoch": 1.9139006018668685, "grad_norm": 0.22758044302463531, "learning_rate": 3.539135949334695e-05, "loss": 0.4415, "step": 53105 }, { "epoch": 1.914080801528093, "grad_norm": 0.17309550940990448, "learning_rate": 3.538870531393429e-05, "loss": 0.4228, "step": 53110 }, { "epoch": 1.9142610011893177, "grad_norm": 0.2273998111486435, "learning_rate": 3.538605099297754e-05, "loss": 0.4158, "step": 53115 }, { "epoch": 1.9144412008505425, "grad_norm": 0.17294365167617798, "learning_rate": 3.538339653051285e-05, "loss": 0.3855, "step": 53120 }, { "epoch": 1.914621400511767, "grad_norm": 0.2204235941171646, "learning_rate": 3.5380741926576385e-05, "loss": 0.4159, "step": 53125 }, { "epoch": 1.9148016001729917, "grad_norm": 0.18429414927959442, "learning_rate": 3.537808718120433e-05, "loss": 0.4255, "step": 53130 }, { "epoch": 1.9149817998342162, "grad_norm": 0.18683667480945587, "learning_rate": 3.537543229443285e-05, "loss": 0.4042, "step": 53135 }, { "epoch": 1.915161999495441, "grad_norm": 0.20948028564453125, "learning_rate": 3.537330828323237e-05, "loss": 
0.3947, "step": 53140 }, { "epoch": 1.9153421991566657, "grad_norm": 0.17518873512744904, "learning_rate": 3.5370653142033075e-05, "loss": 0.4003, "step": 53145 }, { "epoch": 1.9155223988178902, "grad_norm": 0.13344982266426086, "learning_rate": 3.536799785953563e-05, "loss": 0.4035, "step": 53150 }, { "epoch": 1.9157025984791147, "grad_norm": 0.1912095695734024, "learning_rate": 3.5365342435776225e-05, "loss": 0.4021, "step": 53155 }, { "epoch": 1.9158827981403395, "grad_norm": 0.19049333035945892, "learning_rate": 3.536268687079104e-05, "loss": 0.3966, "step": 53160 }, { "epoch": 1.9160629978015642, "grad_norm": 0.14792177081108093, "learning_rate": 3.5360031164616244e-05, "loss": 0.4132, "step": 53165 }, { "epoch": 1.916243197462789, "grad_norm": 0.1720864176750183, "learning_rate": 3.535737531728803e-05, "loss": 0.4254, "step": 53170 }, { "epoch": 1.9164233971240134, "grad_norm": 0.2024461328983307, "learning_rate": 3.5354719328842585e-05, "loss": 0.4462, "step": 53175 }, { "epoch": 1.916603596785238, "grad_norm": 0.19676236808300018, "learning_rate": 3.53520631993161e-05, "loss": 0.4637, "step": 53180 }, { "epoch": 1.9167837964464627, "grad_norm": 0.19073833525180817, "learning_rate": 3.534940692874475e-05, "loss": 0.4156, "step": 53185 }, { "epoch": 1.9169639961076874, "grad_norm": 0.18771030008792877, "learning_rate": 3.5346750517164736e-05, "loss": 0.3749, "step": 53190 }, { "epoch": 1.917144195768912, "grad_norm": 0.20612174272537231, "learning_rate": 3.534409396461225e-05, "loss": 0.4305, "step": 53195 }, { "epoch": 1.9173243954301364, "grad_norm": 0.18494312465190887, "learning_rate": 3.534143727112349e-05, "loss": 0.4385, "step": 53200 }, { "epoch": 1.9175045950913612, "grad_norm": 0.22901670634746552, "learning_rate": 3.533878043673464e-05, "loss": 0.4131, "step": 53205 }, { "epoch": 1.917684794752586, "grad_norm": 0.18775056302547455, "learning_rate": 3.533612346148192e-05, "loss": 0.4204, "step": 53210 }, { "epoch": 1.9178649944138106, "grad_norm": 
0.2237263172864914, "learning_rate": 3.533346634540151e-05, "loss": 0.4032, "step": 53215 }, { "epoch": 1.9180451940750352, "grad_norm": 0.24082402884960175, "learning_rate": 3.533080908852962e-05, "loss": 0.3854, "step": 53220 }, { "epoch": 1.9182253937362597, "grad_norm": 0.18689344823360443, "learning_rate": 3.5328151690902465e-05, "loss": 0.3829, "step": 53225 }, { "epoch": 1.9184055933974844, "grad_norm": 0.20460011065006256, "learning_rate": 3.532549415255624e-05, "loss": 0.4378, "step": 53230 }, { "epoch": 1.9185857930587091, "grad_norm": 0.18038150668144226, "learning_rate": 3.5322836473527154e-05, "loss": 0.4306, "step": 53235 }, { "epoch": 1.9187659927199336, "grad_norm": 0.18723049759864807, "learning_rate": 3.5320178653851425e-05, "loss": 0.384, "step": 53240 }, { "epoch": 1.9189461923811584, "grad_norm": 0.21049338579177856, "learning_rate": 3.531752069356525e-05, "loss": 0.4149, "step": 53245 }, { "epoch": 1.919126392042383, "grad_norm": 0.17135761678218842, "learning_rate": 3.531486259270486e-05, "loss": 0.4073, "step": 53250 }, { "epoch": 1.9193065917036076, "grad_norm": 0.1982400119304657, "learning_rate": 3.531220435130646e-05, "loss": 0.3981, "step": 53255 }, { "epoch": 1.9194867913648324, "grad_norm": 0.18633022904396057, "learning_rate": 3.530954596940628e-05, "loss": 0.3868, "step": 53260 }, { "epoch": 1.9196669910260569, "grad_norm": 0.20298829674720764, "learning_rate": 3.530688744704053e-05, "loss": 0.4442, "step": 53265 }, { "epoch": 1.9198471906872814, "grad_norm": 0.17261956632137299, "learning_rate": 3.530422878424543e-05, "loss": 0.3916, "step": 53270 }, { "epoch": 1.9200273903485061, "grad_norm": 0.2012072056531906, "learning_rate": 3.5301569981057205e-05, "loss": 0.4336, "step": 53275 }, { "epoch": 1.9202075900097308, "grad_norm": 0.17168846726417542, "learning_rate": 3.5298911037512086e-05, "loss": 0.3852, "step": 53280 }, { "epoch": 1.9203877896709556, "grad_norm": 0.21444140374660492, "learning_rate": 3.52962519536463e-05, "loss": 
0.4031, "step": 53285 }, { "epoch": 1.92056798933218, "grad_norm": 0.206976979970932, "learning_rate": 3.5293592729496076e-05, "loss": 0.3876, "step": 53290 }, { "epoch": 1.9207481889934046, "grad_norm": 0.17099922895431519, "learning_rate": 3.529093336509764e-05, "loss": 0.4347, "step": 53295 }, { "epoch": 1.9209283886546293, "grad_norm": 0.19171853363513947, "learning_rate": 3.528827386048723e-05, "loss": 0.4066, "step": 53300 }, { "epoch": 1.921108588315854, "grad_norm": 0.24429461359977722, "learning_rate": 3.528561421570108e-05, "loss": 0.4134, "step": 53305 }, { "epoch": 1.9212887879770786, "grad_norm": 0.1823347806930542, "learning_rate": 3.528295443077543e-05, "loss": 0.4363, "step": 53310 }, { "epoch": 1.921468987638303, "grad_norm": 0.22016625106334686, "learning_rate": 3.528029450574651e-05, "loss": 0.4154, "step": 53315 }, { "epoch": 1.9216491872995278, "grad_norm": 0.18516206741333008, "learning_rate": 3.5277634440650584e-05, "loss": 0.4497, "step": 53320 }, { "epoch": 1.9218293869607526, "grad_norm": 0.18409641087055206, "learning_rate": 3.527497423552386e-05, "loss": 0.4074, "step": 53325 }, { "epoch": 1.9220095866219773, "grad_norm": 0.18460574746131897, "learning_rate": 3.527231389040262e-05, "loss": 0.385, "step": 53330 }, { "epoch": 1.9221897862832018, "grad_norm": 0.18730106949806213, "learning_rate": 3.526965340532308e-05, "loss": 0.4215, "step": 53335 }, { "epoch": 1.9223699859444263, "grad_norm": 0.20932596921920776, "learning_rate": 3.52669927803215e-05, "loss": 0.38, "step": 53340 }, { "epoch": 1.922550185605651, "grad_norm": 0.20945709943771362, "learning_rate": 3.5264332015434134e-05, "loss": 0.386, "step": 53345 }, { "epoch": 1.9227303852668758, "grad_norm": 0.19625355303287506, "learning_rate": 3.5261671110697234e-05, "loss": 0.4177, "step": 53350 }, { "epoch": 1.9229105849281003, "grad_norm": 0.16801773011684418, "learning_rate": 3.525901006614705e-05, "loss": 0.4078, "step": 53355 }, { "epoch": 1.9230907845893248, "grad_norm": 
0.23051504790782928, "learning_rate": 3.525634888181983e-05, "loss": 0.4157, "step": 53360 }, { "epoch": 1.9232709842505495, "grad_norm": 0.17896528542041779, "learning_rate": 3.525368755775186e-05, "loss": 0.4013, "step": 53365 }, { "epoch": 1.9234511839117743, "grad_norm": 0.24281032383441925, "learning_rate": 3.525102609397937e-05, "loss": 0.4071, "step": 53370 }, { "epoch": 1.923631383572999, "grad_norm": 0.19194422662258148, "learning_rate": 3.524836449053864e-05, "loss": 0.4225, "step": 53375 }, { "epoch": 1.9238115832342235, "grad_norm": 0.18676306307315826, "learning_rate": 3.5245702747465924e-05, "loss": 0.4121, "step": 53380 }, { "epoch": 1.923991782895448, "grad_norm": 0.1551360785961151, "learning_rate": 3.524304086479749e-05, "loss": 0.4243, "step": 53385 }, { "epoch": 1.9241719825566728, "grad_norm": 0.1882885843515396, "learning_rate": 3.5240378842569614e-05, "loss": 0.3898, "step": 53390 }, { "epoch": 1.9243521822178975, "grad_norm": 0.22999207675457, "learning_rate": 3.5237716680818554e-05, "loss": 0.4386, "step": 53395 }, { "epoch": 1.9245323818791222, "grad_norm": 0.18411701917648315, "learning_rate": 3.523505437958059e-05, "loss": 0.3995, "step": 53400 }, { "epoch": 1.9247125815403467, "grad_norm": 0.1591692715883255, "learning_rate": 3.5232391938891983e-05, "loss": 0.3968, "step": 53405 }, { "epoch": 1.9248927812015713, "grad_norm": 0.1875472217798233, "learning_rate": 3.5229729358789026e-05, "loss": 0.4125, "step": 53410 }, { "epoch": 1.925072980862796, "grad_norm": 0.19325365126132965, "learning_rate": 3.522706663930799e-05, "loss": 0.404, "step": 53415 }, { "epoch": 1.9252531805240207, "grad_norm": 0.17323115468025208, "learning_rate": 3.5224403780485136e-05, "loss": 0.4047, "step": 53420 }, { "epoch": 1.9254333801852452, "grad_norm": 0.15711285173892975, "learning_rate": 3.522174078235677e-05, "loss": 0.3875, "step": 53425 }, { "epoch": 1.9256135798464697, "grad_norm": 0.1933702826499939, "learning_rate": 3.521907764495917e-05, "loss": 
0.4531, "step": 53430 }, { "epoch": 1.9257937795076945, "grad_norm": 0.20678189396858215, "learning_rate": 3.5216414368328607e-05, "loss": 0.4352, "step": 53435 }, { "epoch": 1.9259739791689192, "grad_norm": 0.17905567586421967, "learning_rate": 3.521375095250138e-05, "loss": 0.4394, "step": 53440 }, { "epoch": 1.926154178830144, "grad_norm": 0.19163614511489868, "learning_rate": 3.521108739751377e-05, "loss": 0.3998, "step": 53445 }, { "epoch": 1.9263343784913685, "grad_norm": 0.1524534672498703, "learning_rate": 3.5208423703402075e-05, "loss": 0.3911, "step": 53450 }, { "epoch": 1.926514578152593, "grad_norm": 0.15856072306632996, "learning_rate": 3.520575987020258e-05, "loss": 0.4365, "step": 53455 }, { "epoch": 1.9266947778138177, "grad_norm": 0.20642130076885223, "learning_rate": 3.520309589795159e-05, "loss": 0.4418, "step": 53460 }, { "epoch": 1.9268749774750424, "grad_norm": 0.1950474977493286, "learning_rate": 3.520043178668538e-05, "loss": 0.4325, "step": 53465 }, { "epoch": 1.927055177136267, "grad_norm": 0.1779826283454895, "learning_rate": 3.519776753644028e-05, "loss": 0.4068, "step": 53470 }, { "epoch": 1.9272353767974915, "grad_norm": 0.19303326308727264, "learning_rate": 3.5195103147252564e-05, "loss": 0.4287, "step": 53475 }, { "epoch": 1.9274155764587162, "grad_norm": 0.20207709074020386, "learning_rate": 3.5192438619158536e-05, "loss": 0.4266, "step": 53480 }, { "epoch": 1.927595776119941, "grad_norm": 0.16578246653079987, "learning_rate": 3.5189773952194506e-05, "loss": 0.377, "step": 53485 }, { "epoch": 1.9277759757811657, "grad_norm": 0.17136074602603912, "learning_rate": 3.518710914639678e-05, "loss": 0.3963, "step": 53490 }, { "epoch": 1.9279561754423902, "grad_norm": 0.1691044121980667, "learning_rate": 3.518444420180167e-05, "loss": 0.4504, "step": 53495 }, { "epoch": 1.9281363751036147, "grad_norm": 0.1666347235441208, "learning_rate": 3.518177911844547e-05, "loss": 0.4177, "step": 53500 }, { "epoch": 1.9281363751036147, "eval_loss": 
0.43853840231895447, "eval_runtime": 3.5305, "eval_samples_per_second": 28.325, "eval_steps_per_second": 7.081, "step": 53500 }, { "epoch": 1.9283165747648394, "grad_norm": 0.18548129498958588, "learning_rate": 3.5179113896364504e-05, "loss": 0.4455, "step": 53505 }, { "epoch": 1.9284967744260642, "grad_norm": 0.16688363254070282, "learning_rate": 3.517644853559509e-05, "loss": 0.3915, "step": 53510 }, { "epoch": 1.9286769740872887, "grad_norm": 0.14306481182575226, "learning_rate": 3.517378303617353e-05, "loss": 0.425, "step": 53515 }, { "epoch": 1.9288571737485134, "grad_norm": 0.13732267916202545, "learning_rate": 3.517111739813615e-05, "loss": 0.4324, "step": 53520 }, { "epoch": 1.929037373409738, "grad_norm": 0.1663437783718109, "learning_rate": 3.516845162151925e-05, "loss": 0.4314, "step": 53525 }, { "epoch": 1.9292175730709626, "grad_norm": 0.18951812386512756, "learning_rate": 3.516578570635917e-05, "loss": 0.4099, "step": 53530 }, { "epoch": 1.9293977727321874, "grad_norm": 0.17902812361717224, "learning_rate": 3.5163119652692236e-05, "loss": 0.4343, "step": 53535 }, { "epoch": 1.9295779723934119, "grad_norm": 0.1834252029657364, "learning_rate": 3.5160453460554766e-05, "loss": 0.3859, "step": 53540 }, { "epoch": 1.9297581720546364, "grad_norm": 0.21351586282253265, "learning_rate": 3.515778712998307e-05, "loss": 0.4282, "step": 53545 }, { "epoch": 1.9299383717158611, "grad_norm": 0.18002468347549438, "learning_rate": 3.515512066101351e-05, "loss": 0.4183, "step": 53550 }, { "epoch": 1.9301185713770859, "grad_norm": 0.18808430433273315, "learning_rate": 3.515245405368238e-05, "loss": 0.3664, "step": 53555 }, { "epoch": 1.9302987710383106, "grad_norm": 0.17705509066581726, "learning_rate": 3.5149787308026036e-05, "loss": 0.3765, "step": 53560 }, { "epoch": 1.9304789706995351, "grad_norm": 0.17073272168636322, "learning_rate": 3.514712042408081e-05, "loss": 0.4367, "step": 53565 }, { "epoch": 1.9306591703607596, "grad_norm": 0.19067902863025665, 
"learning_rate": 3.514445340188303e-05, "loss": 0.4082, "step": 53570 }, { "epoch": 1.9308393700219844, "grad_norm": 0.19914193451404572, "learning_rate": 3.5141786241469033e-05, "loss": 0.4077, "step": 53575 }, { "epoch": 1.931019569683209, "grad_norm": 0.15693803131580353, "learning_rate": 3.513911894287516e-05, "loss": 0.3914, "step": 53580 }, { "epoch": 1.9311997693444336, "grad_norm": 0.18516100943088531, "learning_rate": 3.5136451506137766e-05, "loss": 0.4613, "step": 53585 }, { "epoch": 1.9313799690056581, "grad_norm": 0.18044835329055786, "learning_rate": 3.513378393129317e-05, "loss": 0.4471, "step": 53590 }, { "epoch": 1.9315601686668828, "grad_norm": 0.19149909913539886, "learning_rate": 3.5131116218377735e-05, "loss": 0.4116, "step": 53595 }, { "epoch": 1.9317403683281076, "grad_norm": 0.1986377239227295, "learning_rate": 3.51284483674278e-05, "loss": 0.4369, "step": 53600 }, { "epoch": 1.9319205679893323, "grad_norm": 0.16995567083358765, "learning_rate": 3.5125780378479725e-05, "loss": 0.4208, "step": 53605 }, { "epoch": 1.9321007676505568, "grad_norm": 0.16595984995365143, "learning_rate": 3.5123112251569844e-05, "loss": 0.3805, "step": 53610 }, { "epoch": 1.9322809673117813, "grad_norm": 0.2101103663444519, "learning_rate": 3.5120443986734526e-05, "loss": 0.3964, "step": 53615 }, { "epoch": 1.932461166973006, "grad_norm": 0.15714338421821594, "learning_rate": 3.511777558401012e-05, "loss": 0.3843, "step": 53620 }, { "epoch": 1.9326413666342308, "grad_norm": 0.14750663936138153, "learning_rate": 3.511510704343297e-05, "loss": 0.4092, "step": 53625 }, { "epoch": 1.9328215662954553, "grad_norm": 0.16553977131843567, "learning_rate": 3.511243836503944e-05, "loss": 0.419, "step": 53630 }, { "epoch": 1.93300176595668, "grad_norm": 0.23781326413154602, "learning_rate": 3.5109769548865914e-05, "loss": 0.3938, "step": 53635 }, { "epoch": 1.9331819656179046, "grad_norm": 0.19198031723499298, "learning_rate": 3.510710059494871e-05, "loss": 0.4057, "step": 
53640 }, { "epoch": 1.9333621652791293, "grad_norm": 0.1985791027545929, "learning_rate": 3.5104431503324244e-05, "loss": 0.4248, "step": 53645 }, { "epoch": 1.933542364940354, "grad_norm": 0.18147054314613342, "learning_rate": 3.510176227402884e-05, "loss": 0.3811, "step": 53650 }, { "epoch": 1.9337225646015785, "grad_norm": 0.20852439105510712, "learning_rate": 3.509909290709889e-05, "loss": 0.4246, "step": 53655 }, { "epoch": 1.933902764262803, "grad_norm": 0.19854693114757538, "learning_rate": 3.5096423402570746e-05, "loss": 0.4155, "step": 53660 }, { "epoch": 1.9340829639240278, "grad_norm": 0.15732385218143463, "learning_rate": 3.5093753760480794e-05, "loss": 0.4039, "step": 53665 }, { "epoch": 1.9342631635852525, "grad_norm": 0.17299677431583405, "learning_rate": 3.509108398086539e-05, "loss": 0.4189, "step": 53670 }, { "epoch": 1.9344433632464773, "grad_norm": 0.17880766093730927, "learning_rate": 3.508841406376093e-05, "loss": 0.4369, "step": 53675 }, { "epoch": 1.9346235629077018, "grad_norm": 0.23910261690616608, "learning_rate": 3.508574400920379e-05, "loss": 0.377, "step": 53680 }, { "epoch": 1.9348037625689263, "grad_norm": 0.165726900100708, "learning_rate": 3.508307381723032e-05, "loss": 0.4052, "step": 53685 }, { "epoch": 1.934983962230151, "grad_norm": 0.18893593549728394, "learning_rate": 3.508040348787694e-05, "loss": 0.3806, "step": 53690 }, { "epoch": 1.9351641618913757, "grad_norm": 0.22258023917675018, "learning_rate": 3.5077733021180006e-05, "loss": 0.402, "step": 53695 }, { "epoch": 1.9353443615526003, "grad_norm": 0.17295654118061066, "learning_rate": 3.5075062417175905e-05, "loss": 0.4317, "step": 53700 }, { "epoch": 1.9355245612138248, "grad_norm": 0.1404528021812439, "learning_rate": 3.507239167590104e-05, "loss": 0.4074, "step": 53705 }, { "epoch": 1.9357047608750495, "grad_norm": 0.16644835472106934, "learning_rate": 3.5069720797391784e-05, "loss": 0.4192, "step": 53710 }, { "epoch": 1.9358849605362742, "grad_norm": 
0.21839553117752075, "learning_rate": 3.506704978168453e-05, "loss": 0.4101, "step": 53715 }, { "epoch": 1.936065160197499, "grad_norm": 0.21238647401332855, "learning_rate": 3.506437862881567e-05, "loss": 0.4314, "step": 53720 }, { "epoch": 1.9362453598587235, "grad_norm": 0.19997538626194, "learning_rate": 3.506170733882161e-05, "loss": 0.371, "step": 53725 }, { "epoch": 1.936425559519948, "grad_norm": 0.18178488314151764, "learning_rate": 3.505903591173872e-05, "loss": 0.4023, "step": 53730 }, { "epoch": 1.9366057591811727, "grad_norm": 0.161050945520401, "learning_rate": 3.505636434760343e-05, "loss": 0.4041, "step": 53735 }, { "epoch": 1.9367859588423975, "grad_norm": 0.22217293083667755, "learning_rate": 3.505369264645211e-05, "loss": 0.4249, "step": 53740 }, { "epoch": 1.936966158503622, "grad_norm": 0.1790328472852707, "learning_rate": 3.505102080832118e-05, "loss": 0.405, "step": 53745 }, { "epoch": 1.9371463581648467, "grad_norm": 0.17486228048801422, "learning_rate": 3.504834883324704e-05, "loss": 0.3947, "step": 53750 }, { "epoch": 1.9373265578260712, "grad_norm": 0.1511772871017456, "learning_rate": 3.504567672126608e-05, "loss": 0.3633, "step": 53755 }, { "epoch": 1.937506757487296, "grad_norm": 0.18033269047737122, "learning_rate": 3.504300447241473e-05, "loss": 0.393, "step": 53760 }, { "epoch": 1.9376869571485207, "grad_norm": 0.20827585458755493, "learning_rate": 3.504033208672939e-05, "loss": 0.4061, "step": 53765 }, { "epoch": 1.9378671568097452, "grad_norm": 0.22322550415992737, "learning_rate": 3.5037659564246464e-05, "loss": 0.4297, "step": 53770 }, { "epoch": 1.9380473564709697, "grad_norm": 0.22495147585868835, "learning_rate": 3.5034986905002365e-05, "loss": 0.396, "step": 53775 }, { "epoch": 1.9382275561321944, "grad_norm": 0.1754370629787445, "learning_rate": 3.503231410903352e-05, "loss": 0.4142, "step": 53780 }, { "epoch": 1.9384077557934192, "grad_norm": 0.168187215924263, "learning_rate": 3.5029641176376335e-05, "loss": 0.413, 
"step": 53785 }, { "epoch": 1.938587955454644, "grad_norm": 0.17752215266227722, "learning_rate": 3.502696810706723e-05, "loss": 0.4108, "step": 53790 }, { "epoch": 1.9387681551158684, "grad_norm": 0.19909372925758362, "learning_rate": 3.502429490114263e-05, "loss": 0.4196, "step": 53795 }, { "epoch": 1.938948354777093, "grad_norm": 0.16391627490520477, "learning_rate": 3.502162155863896e-05, "loss": 0.429, "step": 53800 }, { "epoch": 1.9391285544383177, "grad_norm": 0.19088739156723022, "learning_rate": 3.5018948079592626e-05, "loss": 0.356, "step": 53805 }, { "epoch": 1.9393087540995424, "grad_norm": 0.16824400424957275, "learning_rate": 3.501627446404006e-05, "loss": 0.3539, "step": 53810 }, { "epoch": 1.939488953760767, "grad_norm": 0.19399607181549072, "learning_rate": 3.5013600712017704e-05, "loss": 0.3882, "step": 53815 }, { "epoch": 1.9396691534219914, "grad_norm": 0.159646674990654, "learning_rate": 3.501092682356197e-05, "loss": 0.3816, "step": 53820 }, { "epoch": 1.9398493530832162, "grad_norm": 0.22390936315059662, "learning_rate": 3.5008252798709294e-05, "loss": 0.4189, "step": 53825 }, { "epoch": 1.9400295527444409, "grad_norm": 0.18720601499080658, "learning_rate": 3.5005578637496114e-05, "loss": 0.3893, "step": 53830 }, { "epoch": 1.9402097524056656, "grad_norm": 0.2625311613082886, "learning_rate": 3.500290433995886e-05, "loss": 0.386, "step": 53835 }, { "epoch": 1.9403899520668901, "grad_norm": 0.19546449184417725, "learning_rate": 3.500022990613398e-05, "loss": 0.4227, "step": 53840 }, { "epoch": 1.9405701517281146, "grad_norm": 0.19765974581241608, "learning_rate": 3.49975553360579e-05, "loss": 0.3913, "step": 53845 }, { "epoch": 1.9407503513893394, "grad_norm": 0.17598837614059448, "learning_rate": 3.4994880629767056e-05, "loss": 0.3854, "step": 53850 }, { "epoch": 1.9409305510505641, "grad_norm": 0.17875751852989197, "learning_rate": 3.499220578729791e-05, "loss": 0.407, "step": 53855 }, { "epoch": 1.9411107507117886, "grad_norm": 
0.20579689741134644, "learning_rate": 3.498953080868689e-05, "loss": 0.3687, "step": 53860 }, { "epoch": 1.9412909503730131, "grad_norm": 0.20091566443443298, "learning_rate": 3.498685569397045e-05, "loss": 0.4192, "step": 53865 }, { "epoch": 1.9414711500342379, "grad_norm": 0.17543958127498627, "learning_rate": 3.498418044318502e-05, "loss": 0.3872, "step": 53870 }, { "epoch": 1.9416513496954626, "grad_norm": 0.1831250786781311, "learning_rate": 3.498150505636708e-05, "loss": 0.3941, "step": 53875 }, { "epoch": 1.9418315493566873, "grad_norm": 0.18950533866882324, "learning_rate": 3.4978829533553064e-05, "loss": 0.4167, "step": 53880 }, { "epoch": 1.9420117490179118, "grad_norm": 0.2181074172258377, "learning_rate": 3.497615387477942e-05, "loss": 0.3822, "step": 53885 }, { "epoch": 1.9421919486791364, "grad_norm": 0.20885571837425232, "learning_rate": 3.497347808008262e-05, "loss": 0.4137, "step": 53890 }, { "epoch": 1.942372148340361, "grad_norm": 0.18355844914913177, "learning_rate": 3.4970802149499106e-05, "loss": 0.439, "step": 53895 }, { "epoch": 1.9425523480015858, "grad_norm": 0.17857640981674194, "learning_rate": 3.496812608306535e-05, "loss": 0.4262, "step": 53900 }, { "epoch": 1.9427325476628103, "grad_norm": 0.21502956748008728, "learning_rate": 3.4965449880817795e-05, "loss": 0.438, "step": 53905 }, { "epoch": 1.942912747324035, "grad_norm": 0.17269867658615112, "learning_rate": 3.4962773542792925e-05, "loss": 0.3961, "step": 53910 }, { "epoch": 1.9430929469852596, "grad_norm": 0.1496947705745697, "learning_rate": 3.4960097069027184e-05, "loss": 0.4213, "step": 53915 }, { "epoch": 1.9432731466464843, "grad_norm": 0.19583889842033386, "learning_rate": 3.495742045955706e-05, "loss": 0.4293, "step": 53920 }, { "epoch": 1.943453346307709, "grad_norm": 0.1784418672323227, "learning_rate": 3.4954743714419006e-05, "loss": 0.3941, "step": 53925 }, { "epoch": 1.9436335459689336, "grad_norm": 0.2068331241607666, "learning_rate": 3.4952066833649495e-05, "loss": 
0.4088, "step": 53930 }, { "epoch": 1.943813745630158, "grad_norm": 0.18008601665496826, "learning_rate": 3.4949389817285e-05, "loss": 0.3759, "step": 53935 }, { "epoch": 1.9439939452913828, "grad_norm": 0.17084579169750214, "learning_rate": 3.494671266536199e-05, "loss": 0.3866, "step": 53940 }, { "epoch": 1.9441741449526075, "grad_norm": 0.1742352992296219, "learning_rate": 3.494403537791696e-05, "loss": 0.4175, "step": 53945 }, { "epoch": 1.9443543446138323, "grad_norm": 0.2424996942281723, "learning_rate": 3.494135795498636e-05, "loss": 0.416, "step": 53950 }, { "epoch": 1.9445345442750568, "grad_norm": 0.17355568706989288, "learning_rate": 3.493868039660669e-05, "loss": 0.4145, "step": 53955 }, { "epoch": 1.9447147439362813, "grad_norm": 0.15699560940265656, "learning_rate": 3.493600270281442e-05, "loss": 0.4125, "step": 53960 }, { "epoch": 1.944894943597506, "grad_norm": 0.16016830503940582, "learning_rate": 3.493332487364604e-05, "loss": 0.4297, "step": 53965 }, { "epoch": 1.9450751432587308, "grad_norm": 0.23133453726768494, "learning_rate": 3.4930646909138026e-05, "loss": 0.4289, "step": 53970 }, { "epoch": 1.9452553429199553, "grad_norm": 0.23153449594974518, "learning_rate": 3.492796880932687e-05, "loss": 0.4518, "step": 53975 }, { "epoch": 1.9454355425811798, "grad_norm": 0.1748327612876892, "learning_rate": 3.492529057424907e-05, "loss": 0.41, "step": 53980 }, { "epoch": 1.9456157422424045, "grad_norm": 0.2023671418428421, "learning_rate": 3.49226122039411e-05, "loss": 0.4217, "step": 53985 }, { "epoch": 1.9457959419036293, "grad_norm": 0.15689332783222198, "learning_rate": 3.491993369843946e-05, "loss": 0.3638, "step": 53990 }, { "epoch": 1.945976141564854, "grad_norm": 0.19829018414020538, "learning_rate": 3.4917255057780646e-05, "loss": 0.4338, "step": 53995 }, { "epoch": 1.9461563412260785, "grad_norm": 0.19208745658397675, "learning_rate": 3.491457628200115e-05, "loss": 0.4192, "step": 54000 }, { "epoch": 1.9461563412260785, "eval_loss": 
0.4374231994152069, "eval_runtime": 3.5248, "eval_samples_per_second": 28.37, "eval_steps_per_second": 7.093, "step": 54000 }, { "epoch": 1.946336540887303, "grad_norm": 0.20834042131900787, "learning_rate": 3.491189737113748e-05, "loss": 0.4054, "step": 54005 }, { "epoch": 1.9465167405485277, "grad_norm": 0.2228306233882904, "learning_rate": 3.490921832522612e-05, "loss": 0.394, "step": 54010 }, { "epoch": 1.9466969402097525, "grad_norm": 0.1861565113067627, "learning_rate": 3.490653914430358e-05, "loss": 0.4106, "step": 54015 }, { "epoch": 1.946877139870977, "grad_norm": 0.1798921376466751, "learning_rate": 3.490385982840636e-05, "loss": 0.4095, "step": 54020 }, { "epoch": 1.9470573395322017, "grad_norm": 0.15824037790298462, "learning_rate": 3.490118037757097e-05, "loss": 0.4057, "step": 54025 }, { "epoch": 1.9472375391934262, "grad_norm": 0.18199415504932404, "learning_rate": 3.489850079183391e-05, "loss": 0.3894, "step": 54030 }, { "epoch": 1.947417738854651, "grad_norm": 0.18391664326190948, "learning_rate": 3.489582107123169e-05, "loss": 0.4136, "step": 54035 }, { "epoch": 1.9475979385158757, "grad_norm": 0.1818101406097412, "learning_rate": 3.489314121580084e-05, "loss": 0.3785, "step": 54040 }, { "epoch": 1.9477781381771002, "grad_norm": 0.2508971095085144, "learning_rate": 3.4890461225577844e-05, "loss": 0.4335, "step": 54045 }, { "epoch": 1.9479583378383247, "grad_norm": 0.17361201345920563, "learning_rate": 3.488778110059924e-05, "loss": 0.3918, "step": 54050 }, { "epoch": 1.9481385374995495, "grad_norm": 0.19530875980854034, "learning_rate": 3.488510084090152e-05, "loss": 0.4348, "step": 54055 }, { "epoch": 1.9483187371607742, "grad_norm": 0.19267144799232483, "learning_rate": 3.488242044652122e-05, "loss": 0.4229, "step": 54060 }, { "epoch": 1.948498936821999, "grad_norm": 0.1983078569173813, "learning_rate": 3.487973991749485e-05, "loss": 0.4049, "step": 54065 }, { "epoch": 1.9486791364832234, "grad_norm": 0.156129390001297, "learning_rate": 
3.487705925385894e-05, "loss": 0.4582, "step": 54070 }, { "epoch": 1.948859336144448, "grad_norm": 0.17622871696949005, "learning_rate": 3.4874378455650016e-05, "loss": 0.3908, "step": 54075 }, { "epoch": 1.9490395358056727, "grad_norm": 0.21170586347579956, "learning_rate": 3.487169752290458e-05, "loss": 0.4382, "step": 54080 }, { "epoch": 1.9492197354668974, "grad_norm": 0.2054653912782669, "learning_rate": 3.48690164556592e-05, "loss": 0.4048, "step": 54085 }, { "epoch": 1.949399935128122, "grad_norm": 0.17421171069145203, "learning_rate": 3.486633525395037e-05, "loss": 0.3885, "step": 54090 }, { "epoch": 1.9495801347893464, "grad_norm": 0.21200191974639893, "learning_rate": 3.4863653917814626e-05, "loss": 0.439, "step": 54095 }, { "epoch": 1.9497603344505712, "grad_norm": 0.17257794737815857, "learning_rate": 3.486097244728851e-05, "loss": 0.3801, "step": 54100 }, { "epoch": 1.949940534111796, "grad_norm": 0.1847560554742813, "learning_rate": 3.485829084240856e-05, "loss": 0.3753, "step": 54105 }, { "epoch": 1.9501207337730206, "grad_norm": 0.2439274787902832, "learning_rate": 3.48556091032113e-05, "loss": 0.4313, "step": 54110 }, { "epoch": 1.9503009334342452, "grad_norm": 0.1944105327129364, "learning_rate": 3.485292722973327e-05, "loss": 0.4109, "step": 54115 }, { "epoch": 1.9504811330954697, "grad_norm": 0.189129039645195, "learning_rate": 3.4850245222011025e-05, "loss": 0.3661, "step": 54120 }, { "epoch": 1.9506613327566944, "grad_norm": 0.2111881524324417, "learning_rate": 3.484756308008109e-05, "loss": 0.4375, "step": 54125 }, { "epoch": 1.9508415324179191, "grad_norm": 0.14287078380584717, "learning_rate": 3.484488080398002e-05, "loss": 0.399, "step": 54130 }, { "epoch": 1.9510217320791436, "grad_norm": 0.1772930771112442, "learning_rate": 3.4842198393744354e-05, "loss": 0.4539, "step": 54135 }, { "epoch": 1.9512019317403684, "grad_norm": 0.143804669380188, "learning_rate": 3.483951584941063e-05, "loss": 0.4164, "step": 54140 }, { "epoch": 
1.951382131401593, "grad_norm": 0.2153077870607376, "learning_rate": 3.483683317101541e-05, "loss": 0.4466, "step": 54145 }, { "epoch": 1.9515623310628176, "grad_norm": 0.2091062068939209, "learning_rate": 3.483415035859525e-05, "loss": 0.4344, "step": 54150 }, { "epoch": 1.9517425307240424, "grad_norm": 0.19160053133964539, "learning_rate": 3.48314674121867e-05, "loss": 0.3617, "step": 54155 }, { "epoch": 1.9519227303852669, "grad_norm": 0.1973278522491455, "learning_rate": 3.48287843318263e-05, "loss": 0.3868, "step": 54160 }, { "epoch": 1.9521029300464914, "grad_norm": 0.24400056898593903, "learning_rate": 3.482610111755062e-05, "loss": 0.4246, "step": 54165 }, { "epoch": 1.9522831297077161, "grad_norm": 0.18288040161132812, "learning_rate": 3.4823417769396214e-05, "loss": 0.4284, "step": 54170 }, { "epoch": 1.9524633293689408, "grad_norm": 0.18425007164478302, "learning_rate": 3.482073428739964e-05, "loss": 0.4324, "step": 54175 }, { "epoch": 1.9526435290301656, "grad_norm": 0.1836194396018982, "learning_rate": 3.481805067159747e-05, "loss": 0.4368, "step": 54180 }, { "epoch": 1.95282372869139, "grad_norm": 0.1682935655117035, "learning_rate": 3.481536692202625e-05, "loss": 0.4079, "step": 54185 }, { "epoch": 1.9530039283526146, "grad_norm": 0.18854603171348572, "learning_rate": 3.4812683038722565e-05, "loss": 0.4287, "step": 54190 }, { "epoch": 1.9531841280138393, "grad_norm": 0.1703491359949112, "learning_rate": 3.4809999021722964e-05, "loss": 0.3948, "step": 54195 }, { "epoch": 1.953364327675064, "grad_norm": 0.18725082278251648, "learning_rate": 3.480731487106404e-05, "loss": 0.4472, "step": 54200 }, { "epoch": 1.9535445273362886, "grad_norm": 0.17395414412021637, "learning_rate": 3.4804630586782336e-05, "loss": 0.4191, "step": 54205 }, { "epoch": 1.953724726997513, "grad_norm": 0.20272298157215118, "learning_rate": 3.4801946168914445e-05, "loss": 0.4106, "step": 54210 }, { "epoch": 1.9539049266587378, "grad_norm": 0.2384205460548401, "learning_rate": 
3.4799261617496936e-05, "loss": 0.3815, "step": 54215 }, { "epoch": 1.9540851263199626, "grad_norm": 0.1440509706735611, "learning_rate": 3.4796576932566374e-05, "loss": 0.4261, "step": 54220 }, { "epoch": 1.9542653259811873, "grad_norm": 0.20167045295238495, "learning_rate": 3.4793892114159364e-05, "loss": 0.4292, "step": 54225 }, { "epoch": 1.9544455256424118, "grad_norm": 0.179216668009758, "learning_rate": 3.4791207162312464e-05, "loss": 0.399, "step": 54230 }, { "epoch": 1.9546257253036363, "grad_norm": 0.18981371819972992, "learning_rate": 3.478852207706226e-05, "loss": 0.4152, "step": 54235 }, { "epoch": 1.954805924964861, "grad_norm": 0.1702110916376114, "learning_rate": 3.4785836858445334e-05, "loss": 0.4375, "step": 54240 }, { "epoch": 1.9549861246260858, "grad_norm": 0.17524947226047516, "learning_rate": 3.478315150649829e-05, "loss": 0.4262, "step": 54245 }, { "epoch": 1.9551663242873103, "grad_norm": 0.20712916553020477, "learning_rate": 3.4780466021257685e-05, "loss": 0.4056, "step": 54250 }, { "epoch": 1.955346523948535, "grad_norm": 0.171823188662529, "learning_rate": 3.477778040276013e-05, "loss": 0.3729, "step": 54255 }, { "epoch": 1.9555267236097595, "grad_norm": 0.16928203403949738, "learning_rate": 3.4775094651042204e-05, "loss": 0.4196, "step": 54260 }, { "epoch": 1.9557069232709843, "grad_norm": 0.19515909254550934, "learning_rate": 3.4772408766140516e-05, "loss": 0.4314, "step": 54265 }, { "epoch": 1.955887122932209, "grad_norm": 0.1939311921596527, "learning_rate": 3.4769722748091646e-05, "loss": 0.4058, "step": 54270 }, { "epoch": 1.9560673225934335, "grad_norm": 0.2215987592935562, "learning_rate": 3.47670365969322e-05, "loss": 0.3656, "step": 54275 }, { "epoch": 1.956247522254658, "grad_norm": 0.18643298745155334, "learning_rate": 3.476435031269876e-05, "loss": 0.4374, "step": 54280 }, { "epoch": 1.9564277219158828, "grad_norm": 0.20477153360843658, "learning_rate": 3.4761663895427946e-05, "loss": 0.419, "step": 54285 }, { "epoch": 
1.9566079215771075, "grad_norm": 0.1482023149728775, "learning_rate": 3.475897734515635e-05, "loss": 0.4041, "step": 54290 }, { "epoch": 1.9567881212383322, "grad_norm": 0.17524316906929016, "learning_rate": 3.4756290661920575e-05, "loss": 0.4322, "step": 54295 }, { "epoch": 1.9569683208995567, "grad_norm": 0.21270814538002014, "learning_rate": 3.4753603845757235e-05, "loss": 0.373, "step": 54300 }, { "epoch": 1.9571485205607813, "grad_norm": 0.1944366991519928, "learning_rate": 3.475091689670292e-05, "loss": 0.4083, "step": 54305 }, { "epoch": 1.957328720222006, "grad_norm": 0.2040632963180542, "learning_rate": 3.4748229814794256e-05, "loss": 0.386, "step": 54310 }, { "epoch": 1.9575089198832307, "grad_norm": 0.19412831962108612, "learning_rate": 3.474554260006785e-05, "loss": 0.4259, "step": 54315 }, { "epoch": 1.9576891195444552, "grad_norm": 0.17978821694850922, "learning_rate": 3.4742855252560315e-05, "loss": 0.404, "step": 54320 }, { "epoch": 1.9578693192056797, "grad_norm": 0.19575315713882446, "learning_rate": 3.474016777230825e-05, "loss": 0.4097, "step": 54325 }, { "epoch": 1.9580495188669045, "grad_norm": 0.16315071284770966, "learning_rate": 3.473748015934829e-05, "loss": 0.4187, "step": 54330 }, { "epoch": 1.9582297185281292, "grad_norm": 0.20219777524471283, "learning_rate": 3.473479241371706e-05, "loss": 0.4193, "step": 54335 }, { "epoch": 1.958409918189354, "grad_norm": 0.17881831526756287, "learning_rate": 3.473210453545116e-05, "loss": 0.3867, "step": 54340 }, { "epoch": 1.9585901178505785, "grad_norm": 0.17221704125404358, "learning_rate": 3.472941652458722e-05, "loss": 0.4314, "step": 54345 }, { "epoch": 1.958770317511803, "grad_norm": 0.15523254871368408, "learning_rate": 3.472672838116187e-05, "loss": 0.3914, "step": 54350 }, { "epoch": 1.9589505171730277, "grad_norm": 0.19016127288341522, "learning_rate": 3.472404010521172e-05, "loss": 0.4245, "step": 54355 }, { "epoch": 1.9591307168342524, "grad_norm": 0.17222009599208832, "learning_rate": 
3.472135169677341e-05, "loss": 0.4452, "step": 54360 }, { "epoch": 1.959310916495477, "grad_norm": 0.205901101231575, "learning_rate": 3.471866315588356e-05, "loss": 0.4402, "step": 54365 }, { "epoch": 1.9594911161567015, "grad_norm": 0.21535156667232513, "learning_rate": 3.4715974482578814e-05, "loss": 0.4408, "step": 54370 }, { "epoch": 1.9596713158179262, "grad_norm": 0.18163524568080902, "learning_rate": 3.471328567689579e-05, "loss": 0.4052, "step": 54375 }, { "epoch": 1.959851515479151, "grad_norm": 0.20107874274253845, "learning_rate": 3.471059673887114e-05, "loss": 0.3852, "step": 54380 }, { "epoch": 1.9600317151403757, "grad_norm": 0.19046540558338165, "learning_rate": 3.470790766854148e-05, "loss": 0.4287, "step": 54385 }, { "epoch": 1.9602119148016002, "grad_norm": 0.16740958392620087, "learning_rate": 3.4705218465943456e-05, "loss": 0.4202, "step": 54390 }, { "epoch": 1.9603921144628247, "grad_norm": 0.21119292080402374, "learning_rate": 3.4702529131113715e-05, "loss": 0.4048, "step": 54395 }, { "epoch": 1.9605723141240494, "grad_norm": 0.20554935932159424, "learning_rate": 3.469983966408889e-05, "loss": 0.4192, "step": 54400 }, { "epoch": 1.9607525137852742, "grad_norm": 0.2031758427619934, "learning_rate": 3.469715006490563e-05, "loss": 0.4199, "step": 54405 }, { "epoch": 1.9609327134464987, "grad_norm": 0.18050605058670044, "learning_rate": 3.4694460333600574e-05, "loss": 0.3987, "step": 54410 }, { "epoch": 1.9611129131077234, "grad_norm": 0.19206275045871735, "learning_rate": 3.4691770470210374e-05, "loss": 0.4103, "step": 54415 }, { "epoch": 1.961293112768948, "grad_norm": 0.16883911192417145, "learning_rate": 3.4689080474771676e-05, "loss": 0.4011, "step": 54420 }, { "epoch": 1.9614733124301726, "grad_norm": 0.1800912320613861, "learning_rate": 3.4686390347321144e-05, "loss": 0.3789, "step": 54425 }, { "epoch": 1.9616535120913974, "grad_norm": 0.1670406460762024, "learning_rate": 3.46837000878954e-05, "loss": 0.4023, "step": 54430 }, { "epoch": 
1.961833711752622, "grad_norm": 0.14875911176204681, "learning_rate": 3.468100969653114e-05, "loss": 0.3879, "step": 54435 }, { "epoch": 1.9620139114138464, "grad_norm": 0.1597927361726761, "learning_rate": 3.4678319173264975e-05, "loss": 0.4384, "step": 54440 }, { "epoch": 1.9621941110750711, "grad_norm": 0.15800374746322632, "learning_rate": 3.46756285181336e-05, "loss": 0.378, "step": 54445 }, { "epoch": 1.9623743107362959, "grad_norm": 0.1771484762430191, "learning_rate": 3.467293773117365e-05, "loss": 0.4328, "step": 54450 }, { "epoch": 1.9625545103975206, "grad_norm": 0.16408397257328033, "learning_rate": 3.46702468124218e-05, "loss": 0.3782, "step": 54455 }, { "epoch": 1.9627347100587451, "grad_norm": 0.17288720607757568, "learning_rate": 3.466755576191471e-05, "loss": 0.4191, "step": 54460 }, { "epoch": 1.9629149097199696, "grad_norm": 0.1751626580953598, "learning_rate": 3.4664864579689036e-05, "loss": 0.4097, "step": 54465 }, { "epoch": 1.9630951093811944, "grad_norm": 0.19084684550762177, "learning_rate": 3.4662173265781464e-05, "loss": 0.4035, "step": 54470 }, { "epoch": 1.963275309042419, "grad_norm": 0.1722564846277237, "learning_rate": 3.4659481820228654e-05, "loss": 0.4273, "step": 54475 }, { "epoch": 1.9634555087036436, "grad_norm": 0.17883461713790894, "learning_rate": 3.4656790243067274e-05, "loss": 0.397, "step": 54480 }, { "epoch": 1.9636357083648681, "grad_norm": 0.1756594032049179, "learning_rate": 3.465409853433399e-05, "loss": 0.39, "step": 54485 }, { "epoch": 1.9638159080260928, "grad_norm": 0.1727340817451477, "learning_rate": 3.465140669406548e-05, "loss": 0.4083, "step": 54490 }, { "epoch": 1.9639961076873176, "grad_norm": 0.16905559599399567, "learning_rate": 3.464871472229843e-05, "loss": 0.4025, "step": 54495 }, { "epoch": 1.9641763073485423, "grad_norm": 0.1733274906873703, "learning_rate": 3.464602261906951e-05, "loss": 0.4237, "step": 54500 }, { "epoch": 1.9641763073485423, "eval_loss": 0.4376771152019501, "eval_runtime": 3.5433, 
"eval_samples_per_second": 28.223, "eval_steps_per_second": 7.056, "step": 54500 }, { "epoch": 1.9643565070097668, "grad_norm": 0.19322708249092102, "learning_rate": 3.4643330384415396e-05, "loss": 0.4047, "step": 54505 }, { "epoch": 1.9645367066709913, "grad_norm": 0.18007676303386688, "learning_rate": 3.464063801837277e-05, "loss": 0.4018, "step": 54510 }, { "epoch": 1.964716906332216, "grad_norm": 0.17371296882629395, "learning_rate": 3.463794552097832e-05, "loss": 0.3826, "step": 54515 }, { "epoch": 1.9648971059934408, "grad_norm": 0.1630249321460724, "learning_rate": 3.463525289226873e-05, "loss": 0.3994, "step": 54520 }, { "epoch": 1.9650773056546653, "grad_norm": 0.15053841471672058, "learning_rate": 3.463256013228068e-05, "loss": 0.385, "step": 54525 }, { "epoch": 1.96525750531589, "grad_norm": 0.19676600396633148, "learning_rate": 3.462986724105087e-05, "loss": 0.3725, "step": 54530 }, { "epoch": 1.9654377049771146, "grad_norm": 0.1895761638879776, "learning_rate": 3.462717421861597e-05, "loss": 0.4309, "step": 54535 }, { "epoch": 1.9656179046383393, "grad_norm": 0.19437263906002045, "learning_rate": 3.46244810650127e-05, "loss": 0.3823, "step": 54540 }, { "epoch": 1.965798104299564, "grad_norm": 0.19000405073165894, "learning_rate": 3.462178778027774e-05, "loss": 0.4287, "step": 54545 }, { "epoch": 1.9659783039607885, "grad_norm": 0.22047989070415497, "learning_rate": 3.4619094364447776e-05, "loss": 0.4186, "step": 54550 }, { "epoch": 1.966158503622013, "grad_norm": 0.22895020246505737, "learning_rate": 3.461640081755951e-05, "loss": 0.4308, "step": 54555 }, { "epoch": 1.9663387032832378, "grad_norm": 0.1648799479007721, "learning_rate": 3.4613707139649654e-05, "loss": 0.4343, "step": 54560 }, { "epoch": 1.9665189029444625, "grad_norm": 0.22859375178813934, "learning_rate": 3.46110133307549e-05, "loss": 0.4118, "step": 54565 }, { "epoch": 1.9666991026056873, "grad_norm": 0.14714932441711426, "learning_rate": 3.4608319390911937e-05, "loss": 0.4129, "step": 
54570 }, { "epoch": 1.9668793022669118, "grad_norm": 0.16554409265518188, "learning_rate": 3.4605625320157496e-05, "loss": 0.4315, "step": 54575 }, { "epoch": 1.9670595019281363, "grad_norm": 0.1864306479692459, "learning_rate": 3.460293111852827e-05, "loss": 0.4123, "step": 54580 }, { "epoch": 1.967239701589361, "grad_norm": 0.22947263717651367, "learning_rate": 3.460023678606096e-05, "loss": 0.3806, "step": 54585 }, { "epoch": 1.9674199012505857, "grad_norm": 0.19999592006206512, "learning_rate": 3.459754232279228e-05, "loss": 0.3781, "step": 54590 }, { "epoch": 1.9676001009118103, "grad_norm": 0.15868158638477325, "learning_rate": 3.459484772875895e-05, "loss": 0.4076, "step": 54595 }, { "epoch": 1.9677803005730348, "grad_norm": 0.18088065087795258, "learning_rate": 3.459215300399768e-05, "loss": 0.3592, "step": 54600 }, { "epoch": 1.9679605002342595, "grad_norm": 0.19539353251457214, "learning_rate": 3.4589458148545174e-05, "loss": 0.4784, "step": 54605 }, { "epoch": 1.9681406998954842, "grad_norm": 0.1806805580854416, "learning_rate": 3.458676316243816e-05, "loss": 0.3996, "step": 54610 }, { "epoch": 1.968320899556709, "grad_norm": 0.22320657968521118, "learning_rate": 3.458406804571335e-05, "loss": 0.3974, "step": 54615 }, { "epoch": 1.9685010992179335, "grad_norm": 0.21854592859745026, "learning_rate": 3.458137279840749e-05, "loss": 0.3921, "step": 54620 }, { "epoch": 1.968681298879158, "grad_norm": 0.20072756707668304, "learning_rate": 3.457867742055725e-05, "loss": 0.3974, "step": 54625 }, { "epoch": 1.9688614985403827, "grad_norm": 0.19224333763122559, "learning_rate": 3.4575981912199404e-05, "loss": 0.436, "step": 54630 }, { "epoch": 1.9690416982016075, "grad_norm": 0.1824558526277542, "learning_rate": 3.457328627337065e-05, "loss": 0.4161, "step": 54635 }, { "epoch": 1.969221897862832, "grad_norm": 0.16836610436439514, "learning_rate": 3.457059050410773e-05, "loss": 0.4122, "step": 54640 }, { "epoch": 1.9694020975240567, "grad_norm": 
0.17469918727874756, "learning_rate": 3.456789460444737e-05, "loss": 0.4447, "step": 54645 }, { "epoch": 1.9695822971852812, "grad_norm": 0.18369092047214508, "learning_rate": 3.456519857442629e-05, "loss": 0.4229, "step": 54650 }, { "epoch": 1.969762496846506, "grad_norm": 0.20481358468532562, "learning_rate": 3.456250241408123e-05, "loss": 0.3953, "step": 54655 }, { "epoch": 1.9699426965077307, "grad_norm": 0.277506023645401, "learning_rate": 3.455980612344894e-05, "loss": 0.4109, "step": 54660 }, { "epoch": 1.9701228961689552, "grad_norm": 0.18882182240486145, "learning_rate": 3.455710970256613e-05, "loss": 0.3827, "step": 54665 }, { "epoch": 1.9703030958301797, "grad_norm": 0.19233731925487518, "learning_rate": 3.4554413151469553e-05, "loss": 0.4228, "step": 54670 }, { "epoch": 1.9704832954914044, "grad_norm": 0.19764070212841034, "learning_rate": 3.455171647019595e-05, "loss": 0.401, "step": 54675 }, { "epoch": 1.9706634951526292, "grad_norm": 0.17623180150985718, "learning_rate": 3.454901965878205e-05, "loss": 0.4398, "step": 54680 }, { "epoch": 1.970843694813854, "grad_norm": 0.16016143560409546, "learning_rate": 3.454632271726461e-05, "loss": 0.4032, "step": 54685 }, { "epoch": 1.9710238944750784, "grad_norm": 0.1639336496591568, "learning_rate": 3.4543625645680375e-05, "loss": 0.4031, "step": 54690 }, { "epoch": 1.971204094136303, "grad_norm": 0.1621045023202896, "learning_rate": 3.454092844406609e-05, "loss": 0.3798, "step": 54695 }, { "epoch": 1.9713842937975277, "grad_norm": 0.19391906261444092, "learning_rate": 3.4538231112458497e-05, "loss": 0.4085, "step": 54700 }, { "epoch": 1.9715644934587524, "grad_norm": 0.2154162973165512, "learning_rate": 3.4535533650894356e-05, "loss": 0.4131, "step": 54705 }, { "epoch": 1.971744693119977, "grad_norm": 0.16932529211044312, "learning_rate": 3.453283605941041e-05, "loss": 0.3887, "step": 54710 }, { "epoch": 1.9719248927812014, "grad_norm": 0.18449749052524567, "learning_rate": 3.453013833804342e-05, "loss": 
0.4192, "step": 54715 }, { "epoch": 1.9721050924424262, "grad_norm": 0.1401815116405487, "learning_rate": 3.452744048683014e-05, "loss": 0.4199, "step": 54720 }, { "epoch": 1.9722852921036509, "grad_norm": 0.17515966296195984, "learning_rate": 3.452474250580734e-05, "loss": 0.3881, "step": 54725 }, { "epoch": 1.9724654917648756, "grad_norm": 0.19562271237373352, "learning_rate": 3.452204439501175e-05, "loss": 0.4005, "step": 54730 }, { "epoch": 1.9726456914261001, "grad_norm": 0.16657692193984985, "learning_rate": 3.451934615448016e-05, "loss": 0.3684, "step": 54735 }, { "epoch": 1.9728258910873246, "grad_norm": 0.2561399042606354, "learning_rate": 3.451664778424931e-05, "loss": 0.4216, "step": 54740 }, { "epoch": 1.9730060907485494, "grad_norm": 0.18860171735286713, "learning_rate": 3.451394928435598e-05, "loss": 0.4218, "step": 54745 }, { "epoch": 1.9731862904097741, "grad_norm": 0.17932887375354767, "learning_rate": 3.451125065483695e-05, "loss": 0.3872, "step": 54750 }, { "epoch": 1.9733664900709986, "grad_norm": 0.19358305633068085, "learning_rate": 3.450855189572895e-05, "loss": 0.4098, "step": 54755 }, { "epoch": 1.9735466897322234, "grad_norm": 0.1724853366613388, "learning_rate": 3.450585300706878e-05, "loss": 0.4266, "step": 54760 }, { "epoch": 1.9737268893934479, "grad_norm": 0.17911700904369354, "learning_rate": 3.45031539888932e-05, "loss": 0.417, "step": 54765 }, { "epoch": 1.9739070890546726, "grad_norm": 0.16080361604690552, "learning_rate": 3.450045484123899e-05, "loss": 0.4101, "step": 54770 }, { "epoch": 1.9740872887158973, "grad_norm": 0.19026781618595123, "learning_rate": 3.449775556414292e-05, "loss": 0.4418, "step": 54775 }, { "epoch": 1.9742674883771218, "grad_norm": 0.16302798688411713, "learning_rate": 3.449505615764177e-05, "loss": 0.4291, "step": 54780 }, { "epoch": 1.9744476880383464, "grad_norm": 0.13728991150856018, "learning_rate": 3.449235662177233e-05, "loss": 0.4054, "step": 54785 }, { "epoch": 1.974627887699571, "grad_norm": 
0.16925320029258728, "learning_rate": 3.4489656956571345e-05, "loss": 0.3945, "step": 54790 }, { "epoch": 1.9748080873607958, "grad_norm": 0.20769056677818298, "learning_rate": 3.448695716207564e-05, "loss": 0.416, "step": 54795 }, { "epoch": 1.9749882870220206, "grad_norm": 0.20202518999576569, "learning_rate": 3.448425723832197e-05, "loss": 0.3943, "step": 54800 }, { "epoch": 1.975168486683245, "grad_norm": 0.19320270419120789, "learning_rate": 3.448155718534714e-05, "loss": 0.4378, "step": 54805 }, { "epoch": 1.9753486863444696, "grad_norm": 0.15490759909152985, "learning_rate": 3.447885700318792e-05, "loss": 0.3863, "step": 54810 }, { "epoch": 1.9755288860056943, "grad_norm": 0.1758003681898117, "learning_rate": 3.447615669188111e-05, "loss": 0.4029, "step": 54815 }, { "epoch": 1.975709085666919, "grad_norm": 0.1570878028869629, "learning_rate": 3.44734562514635e-05, "loss": 0.386, "step": 54820 }, { "epoch": 1.9758892853281436, "grad_norm": 0.15345117449760437, "learning_rate": 3.4470755681971886e-05, "loss": 0.401, "step": 54825 }, { "epoch": 1.976069484989368, "grad_norm": 0.201069176197052, "learning_rate": 3.446805498344307e-05, "loss": 0.4219, "step": 54830 }, { "epoch": 1.9762496846505928, "grad_norm": 0.16482049226760864, "learning_rate": 3.446535415591382e-05, "loss": 0.4091, "step": 54835 }, { "epoch": 1.9764298843118175, "grad_norm": 0.19689258933067322, "learning_rate": 3.446265319942096e-05, "loss": 0.3863, "step": 54840 }, { "epoch": 1.9766100839730423, "grad_norm": 0.14975933730602264, "learning_rate": 3.4459952114001284e-05, "loss": 0.426, "step": 54845 }, { "epoch": 1.9767902836342668, "grad_norm": 0.17771407961845398, "learning_rate": 3.4457250899691586e-05, "loss": 0.4053, "step": 54850 }, { "epoch": 1.9769704832954913, "grad_norm": 0.16967308521270752, "learning_rate": 3.4454549556528674e-05, "loss": 0.3714, "step": 54855 }, { "epoch": 1.977150682956716, "grad_norm": 0.19800063967704773, "learning_rate": 3.445184808454936e-05, "loss": 
0.4534, "step": 54860 }, { "epoch": 1.9773308826179408, "grad_norm": 0.19260716438293457, "learning_rate": 3.444914648379045e-05, "loss": 0.4158, "step": 54865 }, { "epoch": 1.9775110822791653, "grad_norm": 0.18546351790428162, "learning_rate": 3.4446444754288745e-05, "loss": 0.3974, "step": 54870 }, { "epoch": 1.9776912819403898, "grad_norm": 0.17617270350456238, "learning_rate": 3.444374289608105e-05, "loss": 0.3895, "step": 54875 }, { "epoch": 1.9778714816016145, "grad_norm": 0.20239382982254028, "learning_rate": 3.444104090920419e-05, "loss": 0.417, "step": 54880 }, { "epoch": 1.9780516812628393, "grad_norm": 0.15842540562152863, "learning_rate": 3.443833879369499e-05, "loss": 0.4095, "step": 54885 }, { "epoch": 1.978231880924064, "grad_norm": 0.1709548830986023, "learning_rate": 3.443563654959024e-05, "loss": 0.4075, "step": 54890 }, { "epoch": 1.9784120805852885, "grad_norm": 0.20768848061561584, "learning_rate": 3.4432934176926765e-05, "loss": 0.427, "step": 54895 }, { "epoch": 1.978592280246513, "grad_norm": 0.16096894443035126, "learning_rate": 3.443023167574139e-05, "loss": 0.4047, "step": 54900 }, { "epoch": 1.9787724799077377, "grad_norm": 0.17708556354045868, "learning_rate": 3.4427529046070936e-05, "loss": 0.4069, "step": 54905 }, { "epoch": 1.9789526795689625, "grad_norm": 0.1786879301071167, "learning_rate": 3.442482628795223e-05, "loss": 0.3877, "step": 54910 }, { "epoch": 1.979132879230187, "grad_norm": 0.2042560875415802, "learning_rate": 3.442212340142209e-05, "loss": 0.4307, "step": 54915 }, { "epoch": 1.9793130788914117, "grad_norm": 0.16689689457416534, "learning_rate": 3.441942038651733e-05, "loss": 0.4006, "step": 54920 }, { "epoch": 1.9794932785526362, "grad_norm": 0.16733959317207336, "learning_rate": 3.44167172432748e-05, "loss": 0.4051, "step": 54925 }, { "epoch": 1.979673478213861, "grad_norm": 0.18891684710979462, "learning_rate": 3.4414013971731323e-05, "loss": 0.4374, "step": 54930 }, { "epoch": 1.9798536778750857, "grad_norm": 
0.18657353520393372, "learning_rate": 3.441131057192373e-05, "loss": 0.4465, "step": 54935 }, { "epoch": 1.9800338775363102, "grad_norm": 0.1951577365398407, "learning_rate": 3.440860704388884e-05, "loss": 0.3765, "step": 54940 }, { "epoch": 1.9802140771975347, "grad_norm": 0.21163040399551392, "learning_rate": 3.440590338766351e-05, "loss": 0.4099, "step": 54945 }, { "epoch": 1.9803942768587595, "grad_norm": 0.19105908274650574, "learning_rate": 3.4403199603284567e-05, "loss": 0.4014, "step": 54950 }, { "epoch": 1.9805744765199842, "grad_norm": 0.18041996657848358, "learning_rate": 3.440049569078885e-05, "loss": 0.4271, "step": 54955 }, { "epoch": 1.980754676181209, "grad_norm": 0.19663773477077484, "learning_rate": 3.4397791650213193e-05, "loss": 0.4073, "step": 54960 }, { "epoch": 1.9809348758424334, "grad_norm": 0.17666995525360107, "learning_rate": 3.439508748159445e-05, "loss": 0.3892, "step": 54965 }, { "epoch": 1.981115075503658, "grad_norm": 0.14386975765228271, "learning_rate": 3.4392383184969464e-05, "loss": 0.3762, "step": 54970 }, { "epoch": 1.9812952751648827, "grad_norm": 0.19409483671188354, "learning_rate": 3.438967876037507e-05, "loss": 0.375, "step": 54975 }, { "epoch": 1.9814754748261074, "grad_norm": 0.22416892647743225, "learning_rate": 3.438697420784812e-05, "loss": 0.4125, "step": 54980 }, { "epoch": 1.981655674487332, "grad_norm": 0.20359428226947784, "learning_rate": 3.438426952742546e-05, "loss": 0.4154, "step": 54985 }, { "epoch": 1.9818358741485564, "grad_norm": 0.20030748844146729, "learning_rate": 3.4381564719143947e-05, "loss": 0.3902, "step": 54990 }, { "epoch": 1.9820160738097812, "grad_norm": 0.15663334727287292, "learning_rate": 3.437885978304043e-05, "loss": 0.4205, "step": 54995 }, { "epoch": 1.982196273471006, "grad_norm": 0.17842374742031097, "learning_rate": 3.437615471915177e-05, "loss": 0.4141, "step": 55000 }, { "epoch": 1.982196273471006, "eval_loss": 0.43720561265945435, "eval_runtime": 3.5352, 
"eval_samples_per_second": 28.287, "eval_steps_per_second": 7.072, "step": 55000 }, { "epoch": 1.9823764731322306, "grad_norm": 0.155527725815773, "learning_rate": 3.437344952751481e-05, "loss": 0.4354, "step": 55005 }, { "epoch": 1.9825566727934552, "grad_norm": 0.16779151558876038, "learning_rate": 3.4370744208166424e-05, "loss": 0.4442, "step": 55010 }, { "epoch": 1.9827368724546797, "grad_norm": 0.25572511553764343, "learning_rate": 3.436803876114346e-05, "loss": 0.4068, "step": 55015 }, { "epoch": 1.9829170721159044, "grad_norm": 0.1537223905324936, "learning_rate": 3.436533318648277e-05, "loss": 0.4256, "step": 55020 }, { "epoch": 1.9830972717771291, "grad_norm": 0.17157311737537384, "learning_rate": 3.436262748422124e-05, "loss": 0.3851, "step": 55025 }, { "epoch": 1.9832774714383536, "grad_norm": 0.21722643077373505, "learning_rate": 3.435992165439572e-05, "loss": 0.4225, "step": 55030 }, { "epoch": 1.9834576710995784, "grad_norm": 0.20483043789863586, "learning_rate": 3.435721569704308e-05, "loss": 0.4113, "step": 55035 }, { "epoch": 1.983637870760803, "grad_norm": 0.2097228616476059, "learning_rate": 3.4354509612200194e-05, "loss": 0.407, "step": 55040 }, { "epoch": 1.9838180704220276, "grad_norm": 0.1592007428407669, "learning_rate": 3.435180339990391e-05, "loss": 0.3818, "step": 55045 }, { "epoch": 1.9839982700832524, "grad_norm": 0.1988215148448944, "learning_rate": 3.4349097060191135e-05, "loss": 0.3992, "step": 55050 }, { "epoch": 1.9841784697444769, "grad_norm": 0.19926537573337555, "learning_rate": 3.434639059309871e-05, "loss": 0.3907, "step": 55055 }, { "epoch": 1.9843586694057014, "grad_norm": 0.1831171214580536, "learning_rate": 3.434368399866352e-05, "loss": 0.4176, "step": 55060 }, { "epoch": 1.9845388690669261, "grad_norm": 0.17820048332214355, "learning_rate": 3.434097727692245e-05, "loss": 0.4282, "step": 55065 }, { "epoch": 1.9847190687281508, "grad_norm": 0.21258282661437988, "learning_rate": 3.433827042791238e-05, "loss": 0.4548, 
"step": 55070 }, { "epoch": 1.9848992683893756, "grad_norm": 0.17607107758522034, "learning_rate": 3.433556345167017e-05, "loss": 0.4106, "step": 55075 }, { "epoch": 1.9850794680506, "grad_norm": 0.18194575607776642, "learning_rate": 3.4332856348232725e-05, "loss": 0.4119, "step": 55080 }, { "epoch": 1.9852596677118246, "grad_norm": 0.16195902228355408, "learning_rate": 3.433014911763691e-05, "loss": 0.3892, "step": 55085 }, { "epoch": 1.9854398673730493, "grad_norm": 0.17873816192150116, "learning_rate": 3.432744175991963e-05, "loss": 0.406, "step": 55090 }, { "epoch": 1.985620067034274, "grad_norm": 0.17679697275161743, "learning_rate": 3.432473427511776e-05, "loss": 0.4588, "step": 55095 }, { "epoch": 1.9858002666954986, "grad_norm": 0.1970539093017578, "learning_rate": 3.4322026663268186e-05, "loss": 0.3898, "step": 55100 }, { "epoch": 1.985980466356723, "grad_norm": 0.2121034860610962, "learning_rate": 3.4319318924407806e-05, "loss": 0.4556, "step": 55105 }, { "epoch": 1.9861606660179478, "grad_norm": 0.1846599280834198, "learning_rate": 3.431661105857351e-05, "loss": 0.4054, "step": 55110 }, { "epoch": 1.9863408656791726, "grad_norm": 0.17753110826015472, "learning_rate": 3.4313903065802187e-05, "loss": 0.3984, "step": 55115 }, { "epoch": 1.9865210653403973, "grad_norm": 0.1930995136499405, "learning_rate": 3.431119494613075e-05, "loss": 0.4278, "step": 55120 }, { "epoch": 1.9867012650016218, "grad_norm": 0.17854326963424683, "learning_rate": 3.430848669959607e-05, "loss": 0.406, "step": 55125 }, { "epoch": 1.9868814646628463, "grad_norm": 0.16902780532836914, "learning_rate": 3.430577832623507e-05, "loss": 0.3918, "step": 55130 }, { "epoch": 1.987061664324071, "grad_norm": 0.23383301496505737, "learning_rate": 3.4303069826084646e-05, "loss": 0.3994, "step": 55135 }, { "epoch": 1.9872418639852958, "grad_norm": 0.19351869821548462, "learning_rate": 3.430036119918168e-05, "loss": 0.4034, "step": 55140 }, { "epoch": 1.9874220636465203, "grad_norm": 
0.19689901173114777, "learning_rate": 3.429765244556311e-05, "loss": 0.4055, "step": 55145 }, { "epoch": 1.987602263307745, "grad_norm": 0.19208216667175293, "learning_rate": 3.429494356526581e-05, "loss": 0.3989, "step": 55150 }, { "epoch": 1.9877824629689695, "grad_norm": 0.19520142674446106, "learning_rate": 3.4292234558326724e-05, "loss": 0.3776, "step": 55155 }, { "epoch": 1.9879626626301943, "grad_norm": 0.22114428877830505, "learning_rate": 3.4289525424782726e-05, "loss": 0.422, "step": 55160 }, { "epoch": 1.988142862291419, "grad_norm": 0.16735602915287018, "learning_rate": 3.428681616467075e-05, "loss": 0.4248, "step": 55165 }, { "epoch": 1.9883230619526435, "grad_norm": 0.18746241927146912, "learning_rate": 3.4284106778027696e-05, "loss": 0.4177, "step": 55170 }, { "epoch": 1.988503261613868, "grad_norm": 0.24923117458820343, "learning_rate": 3.4281397264890484e-05, "loss": 0.4006, "step": 55175 }, { "epoch": 1.9886834612750928, "grad_norm": 0.147854283452034, "learning_rate": 3.427868762529604e-05, "loss": 0.4182, "step": 55180 }, { "epoch": 1.9888636609363175, "grad_norm": 0.14005811512470245, "learning_rate": 3.427597785928126e-05, "loss": 0.3873, "step": 55185 }, { "epoch": 1.9890438605975422, "grad_norm": 0.18232989311218262, "learning_rate": 3.4273267966883094e-05, "loss": 0.374, "step": 55190 }, { "epoch": 1.9892240602587667, "grad_norm": 0.18193449079990387, "learning_rate": 3.427055794813844e-05, "loss": 0.3967, "step": 55195 }, { "epoch": 1.9894042599199913, "grad_norm": 0.1848289668560028, "learning_rate": 3.426784780308423e-05, "loss": 0.3852, "step": 55200 }, { "epoch": 1.989584459581216, "grad_norm": 0.2115050107240677, "learning_rate": 3.426513753175738e-05, "loss": 0.4434, "step": 55205 }, { "epoch": 1.9897646592424407, "grad_norm": 0.20298829674720764, "learning_rate": 3.426242713419483e-05, "loss": 0.4076, "step": 55210 }, { "epoch": 1.9899448589036652, "grad_norm": 0.20630566775798798, "learning_rate": 3.425971661043351e-05, "loss": 
0.434, "step": 55215 }, { "epoch": 1.9901250585648897, "grad_norm": 0.1815614402294159, "learning_rate": 3.425700596051033e-05, "loss": 0.4045, "step": 55220 }, { "epoch": 1.9903052582261145, "grad_norm": 0.1584119349718094, "learning_rate": 3.425429518446225e-05, "loss": 0.4018, "step": 55225 }, { "epoch": 1.9904854578873392, "grad_norm": 0.17491492629051208, "learning_rate": 3.425158428232618e-05, "loss": 0.4208, "step": 55230 }, { "epoch": 1.990665657548564, "grad_norm": 0.16318289935588837, "learning_rate": 3.4248873254139066e-05, "loss": 0.4531, "step": 55235 }, { "epoch": 1.9908458572097885, "grad_norm": 0.18993204832077026, "learning_rate": 3.424616209993785e-05, "loss": 0.3925, "step": 55240 }, { "epoch": 1.991026056871013, "grad_norm": 0.20490875840187073, "learning_rate": 3.4243450819759464e-05, "loss": 0.3857, "step": 55245 }, { "epoch": 1.9912062565322377, "grad_norm": 0.22841264307498932, "learning_rate": 3.424073941364085e-05, "loss": 0.4436, "step": 55250 }, { "epoch": 1.9913864561934624, "grad_norm": 0.18176591396331787, "learning_rate": 3.423802788161895e-05, "loss": 0.4367, "step": 55255 }, { "epoch": 1.991566655854687, "grad_norm": 0.20801039040088654, "learning_rate": 3.4235316223730706e-05, "loss": 0.4058, "step": 55260 }, { "epoch": 1.9917468555159117, "grad_norm": 0.19156023859977722, "learning_rate": 3.423260444001307e-05, "loss": 0.4368, "step": 55265 }, { "epoch": 1.9919270551771362, "grad_norm": 0.1914074718952179, "learning_rate": 3.422989253050298e-05, "loss": 0.4253, "step": 55270 }, { "epoch": 1.992107254838361, "grad_norm": 0.15868034958839417, "learning_rate": 3.42271804952374e-05, "loss": 0.3822, "step": 55275 }, { "epoch": 1.9922874544995857, "grad_norm": 0.20103418827056885, "learning_rate": 3.4224468334253265e-05, "loss": 0.4241, "step": 55280 }, { "epoch": 1.9924676541608102, "grad_norm": 0.22683289647102356, "learning_rate": 3.422175604758754e-05, "loss": 0.4063, "step": 55285 }, { "epoch": 1.9926478538220347, "grad_norm": 
0.20226216316223145, "learning_rate": 3.4219043635277173e-05, "loss": 0.417, "step": 55290 }, { "epoch": 1.9928280534832594, "grad_norm": 0.20145370066165924, "learning_rate": 3.4216331097359123e-05, "loss": 0.4072, "step": 55295 }, { "epoch": 1.9930082531444842, "grad_norm": 0.1815030723810196, "learning_rate": 3.421361843387034e-05, "loss": 0.4481, "step": 55300 }, { "epoch": 1.9931884528057089, "grad_norm": 0.16565322875976562, "learning_rate": 3.42109056448478e-05, "loss": 0.4235, "step": 55305 }, { "epoch": 1.9933686524669334, "grad_norm": 0.19166086614131927, "learning_rate": 3.420819273032844e-05, "loss": 0.4324, "step": 55310 }, { "epoch": 1.993548852128158, "grad_norm": 0.1797381490468979, "learning_rate": 3.4205479690349246e-05, "loss": 0.4356, "step": 55315 }, { "epoch": 1.9937290517893826, "grad_norm": 0.19711998105049133, "learning_rate": 3.420276652494717e-05, "loss": 0.3925, "step": 55320 }, { "epoch": 1.9939092514506074, "grad_norm": 0.157618910074234, "learning_rate": 3.4200053234159185e-05, "loss": 0.386, "step": 55325 }, { "epoch": 1.994089451111832, "grad_norm": 0.2216421216726303, "learning_rate": 3.419733981802226e-05, "loss": 0.4304, "step": 55330 }, { "epoch": 1.9942696507730564, "grad_norm": 0.16847671568393707, "learning_rate": 3.419462627657335e-05, "loss": 0.4074, "step": 55335 }, { "epoch": 1.9944498504342811, "grad_norm": 0.19874538481235504, "learning_rate": 3.4191912609849444e-05, "loss": 0.386, "step": 55340 }, { "epoch": 1.9946300500955059, "grad_norm": 0.19660434126853943, "learning_rate": 3.41891988178875e-05, "loss": 0.3841, "step": 55345 }, { "epoch": 1.9948102497567306, "grad_norm": 0.2128075808286667, "learning_rate": 3.4186484900724514e-05, "loss": 0.4227, "step": 55350 }, { "epoch": 1.9949904494179551, "grad_norm": 0.1658129245042801, "learning_rate": 3.418377085839744e-05, "loss": 0.4055, "step": 55355 }, { "epoch": 1.9951706490791796, "grad_norm": 0.1386236697435379, "learning_rate": 3.418105669094327e-05, "loss": 0.3855, 
"step": 55360 }, { "epoch": 1.9953508487404044, "grad_norm": 0.16813713312149048, "learning_rate": 3.4178342398398985e-05, "loss": 0.4049, "step": 55365 }, { "epoch": 1.995531048401629, "grad_norm": 0.19493988156318665, "learning_rate": 3.417562798080155e-05, "loss": 0.3933, "step": 55370 }, { "epoch": 1.9957112480628536, "grad_norm": 0.19702892005443573, "learning_rate": 3.417291343818797e-05, "loss": 0.453, "step": 55375 }, { "epoch": 1.9958914477240781, "grad_norm": 0.18380559980869293, "learning_rate": 3.417019877059522e-05, "loss": 0.3968, "step": 55380 }, { "epoch": 1.9960716473853028, "grad_norm": 0.2419026494026184, "learning_rate": 3.4167483978060285e-05, "loss": 0.4219, "step": 55385 }, { "epoch": 1.9962518470465276, "grad_norm": 0.19167737662792206, "learning_rate": 3.416476906062015e-05, "loss": 0.4485, "step": 55390 }, { "epoch": 1.9964320467077523, "grad_norm": 0.18639305233955383, "learning_rate": 3.416205401831182e-05, "loss": 0.4201, "step": 55395 }, { "epoch": 1.9966122463689768, "grad_norm": 0.2547195851802826, "learning_rate": 3.4159338851172276e-05, "loss": 0.4503, "step": 55400 }, { "epoch": 1.9967924460302013, "grad_norm": 0.19804906845092773, "learning_rate": 3.41566235592385e-05, "loss": 0.4057, "step": 55405 }, { "epoch": 1.996972645691426, "grad_norm": 0.21647123992443085, "learning_rate": 3.415390814254752e-05, "loss": 0.4265, "step": 55410 }, { "epoch": 1.9971528453526508, "grad_norm": 0.13523387908935547, "learning_rate": 3.41511926011363e-05, "loss": 0.4253, "step": 55415 }, { "epoch": 1.9973330450138753, "grad_norm": 0.16237081587314606, "learning_rate": 3.414847693504186e-05, "loss": 0.3982, "step": 55420 }, { "epoch": 1.9975132446751, "grad_norm": 0.12674693763256073, "learning_rate": 3.414576114430119e-05, "loss": 0.4324, "step": 55425 }, { "epoch": 1.9976934443363246, "grad_norm": 0.18315039575099945, "learning_rate": 3.4143045228951296e-05, "loss": 0.4158, "step": 55430 }, { "epoch": 1.9978736439975493, "grad_norm": 
0.1764761209487915, "learning_rate": 3.414032918902918e-05, "loss": 0.3859, "step": 55435 }, { "epoch": 1.998053843658774, "grad_norm": 0.19060806930065155, "learning_rate": 3.413761302457185e-05, "loss": 0.4238, "step": 55440 }, { "epoch": 1.9982340433199985, "grad_norm": 0.14990010857582092, "learning_rate": 3.413489673561632e-05, "loss": 0.4259, "step": 55445 }, { "epoch": 1.998414242981223, "grad_norm": 0.16765926778316498, "learning_rate": 3.413218032219957e-05, "loss": 0.4103, "step": 55450 }, { "epoch": 1.9985944426424478, "grad_norm": 0.1568935662508011, "learning_rate": 3.412946378435865e-05, "loss": 0.4334, "step": 55455 }, { "epoch": 1.9987746423036725, "grad_norm": 0.1629842072725296, "learning_rate": 3.4126747122130545e-05, "loss": 0.4099, "step": 55460 }, { "epoch": 1.9989548419648973, "grad_norm": 0.1970033496618271, "learning_rate": 3.4124030335552274e-05, "loss": 0.4265, "step": 55465 }, { "epoch": 1.9991350416261218, "grad_norm": 0.2183963507413864, "learning_rate": 3.4121313424660866e-05, "loss": 0.4178, "step": 55470 }, { "epoch": 1.9993152412873463, "grad_norm": 0.15539808571338654, "learning_rate": 3.411859638949332e-05, "loss": 0.4105, "step": 55475 }, { "epoch": 1.999495440948571, "grad_norm": 0.22591222822666168, "learning_rate": 3.411642267190535e-05, "loss": 0.4401, "step": 55480 }, { "epoch": 1.9996756406097957, "grad_norm": 0.13716307282447815, "learning_rate": 3.4113705413134064e-05, "loss": 0.3836, "step": 55485 }, { "epoch": 1.9998558402710203, "grad_norm": 0.20352575182914734, "learning_rate": 3.411098803019031e-05, "loss": 0.4013, "step": 55490 }, { "epoch": 2.0000360399322448, "grad_norm": 0.2378479391336441, "learning_rate": 3.41082705231111e-05, "loss": 0.4204, "step": 55495 }, { "epoch": 2.0002162395934695, "grad_norm": 0.17885233461856842, "learning_rate": 3.410555289193347e-05, "loss": 0.3819, "step": 55500 }, { "epoch": 2.0002162395934695, "eval_loss": 0.43618687987327576, "eval_runtime": 3.5358, "eval_samples_per_second": 
28.282, "eval_steps_per_second": 7.071, "step": 55500 }, { "epoch": 2.0003964392546942, "grad_norm": 0.21153418719768524, "learning_rate": 3.4102835136694445e-05, "loss": 0.4348, "step": 55505 }, { "epoch": 2.000576638915919, "grad_norm": 0.18446187674999237, "learning_rate": 3.4100117257431055e-05, "loss": 0.387, "step": 55510 }, { "epoch": 2.0007568385771433, "grad_norm": 0.16465526819229126, "learning_rate": 3.409739925418034e-05, "loss": 0.3821, "step": 55515 }, { "epoch": 2.000937038238368, "grad_norm": 0.19307419657707214, "learning_rate": 3.40946811269793e-05, "loss": 0.4331, "step": 55520 }, { "epoch": 2.0011172378995927, "grad_norm": 0.18775279819965363, "learning_rate": 3.4091962875865e-05, "loss": 0.3975, "step": 55525 }, { "epoch": 2.0012974375608175, "grad_norm": 0.1970471441745758, "learning_rate": 3.4089244500874474e-05, "loss": 0.4205, "step": 55530 }, { "epoch": 2.001477637222042, "grad_norm": 0.15047413110733032, "learning_rate": 3.408652600204474e-05, "loss": 0.4004, "step": 55535 }, { "epoch": 2.0016578368832665, "grad_norm": 0.18766699731349945, "learning_rate": 3.408380737941285e-05, "loss": 0.3961, "step": 55540 }, { "epoch": 2.001838036544491, "grad_norm": 0.16125249862670898, "learning_rate": 3.4081088633015845e-05, "loss": 0.3797, "step": 55545 }, { "epoch": 2.002018236205716, "grad_norm": 0.23382239043712616, "learning_rate": 3.407836976289077e-05, "loss": 0.414, "step": 55550 }, { "epoch": 2.0021984358669407, "grad_norm": 0.17505022883415222, "learning_rate": 3.4075650769074664e-05, "loss": 0.3889, "step": 55555 }, { "epoch": 2.0023786355281654, "grad_norm": 0.21252861618995667, "learning_rate": 3.4072931651604566e-05, "loss": 0.4107, "step": 55560 }, { "epoch": 2.0025588351893897, "grad_norm": 0.1734471619129181, "learning_rate": 3.407021241051754e-05, "loss": 0.3442, "step": 55565 }, { "epoch": 2.0027390348506144, "grad_norm": 0.21779297292232513, "learning_rate": 3.406749304585062e-05, "loss": 0.3749, "step": 55570 }, { "epoch": 
2.002919234511839, "grad_norm": 0.17229339480400085, "learning_rate": 3.406477355764087e-05, "loss": 0.3806, "step": 55575 }, { "epoch": 2.003099434173064, "grad_norm": 0.18596251308918, "learning_rate": 3.406205394592532e-05, "loss": 0.3808, "step": 55580 }, { "epoch": 2.003279633834288, "grad_norm": 0.20308445394039154, "learning_rate": 3.405933421074105e-05, "loss": 0.395, "step": 55585 }, { "epoch": 2.003459833495513, "grad_norm": 0.26746681332588196, "learning_rate": 3.40566143521251e-05, "loss": 0.3969, "step": 55590 }, { "epoch": 2.0036400331567377, "grad_norm": 0.20009200274944305, "learning_rate": 3.405389437011454e-05, "loss": 0.3977, "step": 55595 }, { "epoch": 2.0038202328179624, "grad_norm": 0.17441947758197784, "learning_rate": 3.405117426474642e-05, "loss": 0.3884, "step": 55600 }, { "epoch": 2.004000432479187, "grad_norm": 0.19016499817371368, "learning_rate": 3.40484540360578e-05, "loss": 0.3964, "step": 55605 }, { "epoch": 2.0041806321404114, "grad_norm": 0.16852574050426483, "learning_rate": 3.4045733684085745e-05, "loss": 0.4337, "step": 55610 }, { "epoch": 2.004360831801636, "grad_norm": 0.20671193301677704, "learning_rate": 3.404301320886732e-05, "loss": 0.4207, "step": 55615 }, { "epoch": 2.004541031462861, "grad_norm": 0.17593398690223694, "learning_rate": 3.404029261043961e-05, "loss": 0.3864, "step": 55620 }, { "epoch": 2.0047212311240856, "grad_norm": 0.18823134899139404, "learning_rate": 3.4037571888839635e-05, "loss": 0.3806, "step": 55625 }, { "epoch": 2.00490143078531, "grad_norm": 0.16644109785556793, "learning_rate": 3.403485104410451e-05, "loss": 0.3839, "step": 55630 }, { "epoch": 2.0050816304465346, "grad_norm": 0.16421851515769958, "learning_rate": 3.403213007627128e-05, "loss": 0.3643, "step": 55635 }, { "epoch": 2.0052618301077594, "grad_norm": 0.14267808198928833, "learning_rate": 3.402940898537703e-05, "loss": 0.4057, "step": 55640 }, { "epoch": 2.005442029768984, "grad_norm": 0.19570884108543396, "learning_rate": 
3.402668777145883e-05, "loss": 0.3704, "step": 55645 }, { "epoch": 2.005622229430209, "grad_norm": 0.20084281265735626, "learning_rate": 3.402396643455376e-05, "loss": 0.43, "step": 55650 }, { "epoch": 2.005802429091433, "grad_norm": 0.17763961851596832, "learning_rate": 3.402124497469889e-05, "loss": 0.4088, "step": 55655 }, { "epoch": 2.005982628752658, "grad_norm": 0.19127719104290009, "learning_rate": 3.40185233919313e-05, "loss": 0.3433, "step": 55660 }, { "epoch": 2.0061628284138826, "grad_norm": 0.20189210772514343, "learning_rate": 3.4015801686288086e-05, "loss": 0.3741, "step": 55665 }, { "epoch": 2.0063430280751073, "grad_norm": 0.16015605628490448, "learning_rate": 3.401307985780631e-05, "loss": 0.3657, "step": 55670 }, { "epoch": 2.006523227736332, "grad_norm": 0.16415055096149445, "learning_rate": 3.401035790652307e-05, "loss": 0.3841, "step": 55675 }, { "epoch": 2.0067034273975564, "grad_norm": 0.169295534491539, "learning_rate": 3.400763583247545e-05, "loss": 0.3687, "step": 55680 }, { "epoch": 2.006883627058781, "grad_norm": 0.17803806066513062, "learning_rate": 3.400491363570053e-05, "loss": 0.3908, "step": 55685 }, { "epoch": 2.007063826720006, "grad_norm": 0.1836196929216385, "learning_rate": 3.400219131623541e-05, "loss": 0.4002, "step": 55690 }, { "epoch": 2.0072440263812306, "grad_norm": 0.21328406035900116, "learning_rate": 3.3999468874117176e-05, "loss": 0.3604, "step": 55695 }, { "epoch": 2.007424226042455, "grad_norm": 0.1810711771249771, "learning_rate": 3.399674630938292e-05, "loss": 0.4015, "step": 55700 }, { "epoch": 2.0076044257036796, "grad_norm": 0.21469327807426453, "learning_rate": 3.3994023622069734e-05, "loss": 0.4499, "step": 55705 }, { "epoch": 2.0077846253649043, "grad_norm": 0.20458552241325378, "learning_rate": 3.3991300812214724e-05, "loss": 0.4261, "step": 55710 }, { "epoch": 2.007964825026129, "grad_norm": 0.13773076236248016, "learning_rate": 3.398857787985498e-05, "loss": 0.3517, "step": 55715 }, { "epoch": 
2.008145024687354, "grad_norm": 0.17209510505199432, "learning_rate": 3.39858548250276e-05, "loss": 0.3871, "step": 55720 }, { "epoch": 2.008325224348578, "grad_norm": 0.18736940622329712, "learning_rate": 3.398313164776968e-05, "loss": 0.3906, "step": 55725 }, { "epoch": 2.008505424009803, "grad_norm": 0.17324689030647278, "learning_rate": 3.3980408348118337e-05, "loss": 0.4141, "step": 55730 }, { "epoch": 2.0086856236710275, "grad_norm": 0.21437758207321167, "learning_rate": 3.397768492611068e-05, "loss": 0.3743, "step": 55735 }, { "epoch": 2.0088658233322523, "grad_norm": 0.20862148702144623, "learning_rate": 3.397496138178379e-05, "loss": 0.3897, "step": 55740 }, { "epoch": 2.0090460229934766, "grad_norm": 0.2141301929950714, "learning_rate": 3.397223771517479e-05, "loss": 0.4016, "step": 55745 }, { "epoch": 2.0092262226547013, "grad_norm": 0.21781469881534576, "learning_rate": 3.39695139263208e-05, "loss": 0.4502, "step": 55750 }, { "epoch": 2.009406422315926, "grad_norm": 0.1999913901090622, "learning_rate": 3.39667900152589e-05, "loss": 0.4362, "step": 55755 }, { "epoch": 2.0095866219771508, "grad_norm": 0.17094168066978455, "learning_rate": 3.396406598202624e-05, "loss": 0.4162, "step": 55760 }, { "epoch": 2.0097668216383755, "grad_norm": 0.19397181272506714, "learning_rate": 3.39613418266599e-05, "loss": 0.4285, "step": 55765 }, { "epoch": 2.0099470212996, "grad_norm": 0.1974741369485855, "learning_rate": 3.3958617549197024e-05, "loss": 0.3795, "step": 55770 }, { "epoch": 2.0101272209608245, "grad_norm": 0.16916494071483612, "learning_rate": 3.395589314967472e-05, "loss": 0.4219, "step": 55775 }, { "epoch": 2.0103074206220493, "grad_norm": 0.1723833829164505, "learning_rate": 3.39531686281301e-05, "loss": 0.3858, "step": 55780 }, { "epoch": 2.010487620283274, "grad_norm": 0.19679874181747437, "learning_rate": 3.39504439846003e-05, "loss": 0.4087, "step": 55785 }, { "epoch": 2.0106678199444987, "grad_norm": 0.18012060225009918, "learning_rate": 
3.3947719219122415e-05, "loss": 0.4004, "step": 55790 }, { "epoch": 2.010848019605723, "grad_norm": 0.190474733710289, "learning_rate": 3.39449943317336e-05, "loss": 0.3732, "step": 55795 }, { "epoch": 2.0110282192669477, "grad_norm": 0.19163033366203308, "learning_rate": 3.394226932247097e-05, "loss": 0.4261, "step": 55800 }, { "epoch": 2.0112084189281725, "grad_norm": 0.18268360197544098, "learning_rate": 3.3939544191371654e-05, "loss": 0.3969, "step": 55805 }, { "epoch": 2.011388618589397, "grad_norm": 0.1998191624879837, "learning_rate": 3.393681893847277e-05, "loss": 0.3728, "step": 55810 }, { "epoch": 2.0115688182506215, "grad_norm": 0.20111285150051117, "learning_rate": 3.393409356381147e-05, "loss": 0.3964, "step": 55815 }, { "epoch": 2.0117490179118462, "grad_norm": 0.18400463461875916, "learning_rate": 3.393136806742487e-05, "loss": 0.3896, "step": 55820 }, { "epoch": 2.011929217573071, "grad_norm": 0.16936840116977692, "learning_rate": 3.39286424493501e-05, "loss": 0.3907, "step": 55825 }, { "epoch": 2.0121094172342957, "grad_norm": 0.1799679696559906, "learning_rate": 3.392591670962432e-05, "loss": 0.3636, "step": 55830 }, { "epoch": 2.0122896168955204, "grad_norm": 0.17350183427333832, "learning_rate": 3.392319084828464e-05, "loss": 0.3845, "step": 55835 }, { "epoch": 2.0124698165567447, "grad_norm": 0.18203559517860413, "learning_rate": 3.3920464865368225e-05, "loss": 0.4245, "step": 55840 }, { "epoch": 2.0126500162179695, "grad_norm": 0.18975740671157837, "learning_rate": 3.3917738760912186e-05, "loss": 0.3997, "step": 55845 }, { "epoch": 2.012830215879194, "grad_norm": 0.2037028819322586, "learning_rate": 3.39150125349537e-05, "loss": 0.3931, "step": 55850 }, { "epoch": 2.013010415540419, "grad_norm": 0.2006373107433319, "learning_rate": 3.391228618752988e-05, "loss": 0.3418, "step": 55855 }, { "epoch": 2.013190615201643, "grad_norm": 0.2143949270248413, "learning_rate": 3.39095597186779e-05, "loss": 0.3607, "step": 55860 }, { "epoch": 
2.013370814862868, "grad_norm": 0.18995679914951324, "learning_rate": 3.390683312843489e-05, "loss": 0.3774, "step": 55865 }, { "epoch": 2.0135510145240927, "grad_norm": 0.2137516885995865, "learning_rate": 3.3904106416837994e-05, "loss": 0.3916, "step": 55870 }, { "epoch": 2.0137312141853174, "grad_norm": 0.18241751194000244, "learning_rate": 3.390137958392438e-05, "loss": 0.3773, "step": 55875 }, { "epoch": 2.013911413846542, "grad_norm": 0.19162051379680634, "learning_rate": 3.3898652629731195e-05, "loss": 0.3743, "step": 55880 }, { "epoch": 2.0140916135077664, "grad_norm": 0.17561465501785278, "learning_rate": 3.389592555429558e-05, "loss": 0.3987, "step": 55885 }, { "epoch": 2.014271813168991, "grad_norm": 0.20353040099143982, "learning_rate": 3.389319835765471e-05, "loss": 0.3807, "step": 55890 }, { "epoch": 2.014452012830216, "grad_norm": 0.1713477373123169, "learning_rate": 3.389047103984573e-05, "loss": 0.4177, "step": 55895 }, { "epoch": 2.0146322124914406, "grad_norm": 0.18497739732265472, "learning_rate": 3.388774360090581e-05, "loss": 0.3734, "step": 55900 }, { "epoch": 2.014812412152665, "grad_norm": 0.1842721551656723, "learning_rate": 3.38850160408721e-05, "loss": 0.367, "step": 55905 }, { "epoch": 2.0149926118138897, "grad_norm": 0.14854677021503448, "learning_rate": 3.3882288359781764e-05, "loss": 0.4289, "step": 55910 }, { "epoch": 2.0151728114751144, "grad_norm": 0.2103215456008911, "learning_rate": 3.387956055767197e-05, "loss": 0.3974, "step": 55915 }, { "epoch": 2.015353011136339, "grad_norm": 0.2002989947795868, "learning_rate": 3.387683263457989e-05, "loss": 0.3743, "step": 55920 }, { "epoch": 2.015533210797564, "grad_norm": 0.1742120236158371, "learning_rate": 3.3874104590542676e-05, "loss": 0.386, "step": 55925 }, { "epoch": 2.015713410458788, "grad_norm": 0.19794981181621552, "learning_rate": 3.38713764255975e-05, "loss": 0.4048, "step": 55930 }, { "epoch": 2.015893610120013, "grad_norm": 0.21913039684295654, "learning_rate": 
3.3868648139781545e-05, "loss": 0.4255, "step": 55935 }, { "epoch": 2.0160738097812376, "grad_norm": 0.19685392081737518, "learning_rate": 3.3865919733131975e-05, "loss": 0.3708, "step": 55940 }, { "epoch": 2.0162540094424624, "grad_norm": 0.23018737137317657, "learning_rate": 3.386319120568597e-05, "loss": 0.411, "step": 55945 }, { "epoch": 2.016434209103687, "grad_norm": 0.196548730134964, "learning_rate": 3.38604625574807e-05, "loss": 0.3945, "step": 55950 }, { "epoch": 2.0166144087649114, "grad_norm": 0.24264010787010193, "learning_rate": 3.3857733788553335e-05, "loss": 0.4077, "step": 55955 }, { "epoch": 2.016794608426136, "grad_norm": 0.24170154333114624, "learning_rate": 3.385500489894107e-05, "loss": 0.3743, "step": 55960 }, { "epoch": 2.016974808087361, "grad_norm": 0.1775246411561966, "learning_rate": 3.3852275888681076e-05, "loss": 0.3975, "step": 55965 }, { "epoch": 2.0171550077485856, "grad_norm": 0.18977300822734833, "learning_rate": 3.3849546757810536e-05, "loss": 0.4093, "step": 55970 }, { "epoch": 2.01733520740981, "grad_norm": 0.1697954684495926, "learning_rate": 3.3846817506366625e-05, "loss": 0.4306, "step": 55975 }, { "epoch": 2.0175154070710346, "grad_norm": 0.20978444814682007, "learning_rate": 3.384408813438655e-05, "loss": 0.4178, "step": 55980 }, { "epoch": 2.0176956067322593, "grad_norm": 0.23438295722007751, "learning_rate": 3.384135864190748e-05, "loss": 0.4378, "step": 55985 }, { "epoch": 2.017875806393484, "grad_norm": 0.19239699840545654, "learning_rate": 3.383862902896662e-05, "loss": 0.3842, "step": 55990 }, { "epoch": 2.018056006054709, "grad_norm": 0.1987629383802414, "learning_rate": 3.3835899295601146e-05, "loss": 0.3805, "step": 55995 }, { "epoch": 2.018236205715933, "grad_norm": 0.19622477889060974, "learning_rate": 3.383316944184825e-05, "loss": 0.3615, "step": 56000 }, { "epoch": 2.018236205715933, "eval_loss": 0.4372096359729767, "eval_runtime": 3.5364, "eval_samples_per_second": 28.278, "eval_steps_per_second": 7.069, 
"step": 56000 }, { "epoch": 2.018416405377158, "grad_norm": 0.1874462217092514, "learning_rate": 3.383043946774514e-05, "loss": 0.4029, "step": 56005 }, { "epoch": 2.0185966050383826, "grad_norm": 0.1793491393327713, "learning_rate": 3.382770937332899e-05, "loss": 0.4104, "step": 56010 }, { "epoch": 2.0187768046996073, "grad_norm": 0.15393778681755066, "learning_rate": 3.382497915863702e-05, "loss": 0.4142, "step": 56015 }, { "epoch": 2.0189570043608316, "grad_norm": 0.17542561888694763, "learning_rate": 3.3822248823706406e-05, "loss": 0.3852, "step": 56020 }, { "epoch": 2.0191372040220563, "grad_norm": 0.20072583854198456, "learning_rate": 3.3819518368574365e-05, "loss": 0.4079, "step": 56025 }, { "epoch": 2.019317403683281, "grad_norm": 0.24251943826675415, "learning_rate": 3.38167877932781e-05, "loss": 0.4273, "step": 56030 }, { "epoch": 2.019497603344506, "grad_norm": 0.19765913486480713, "learning_rate": 3.3814057097854804e-05, "loss": 0.3785, "step": 56035 }, { "epoch": 2.0196778030057305, "grad_norm": 0.1533811390399933, "learning_rate": 3.381132628234168e-05, "loss": 0.3913, "step": 56040 }, { "epoch": 2.019858002666955, "grad_norm": 0.15910130739212036, "learning_rate": 3.380859534677595e-05, "loss": 0.3877, "step": 56045 }, { "epoch": 2.0200382023281795, "grad_norm": 0.298079252243042, "learning_rate": 3.380586429119481e-05, "loss": 0.418, "step": 56050 }, { "epoch": 2.0202184019894043, "grad_norm": 0.22058376669883728, "learning_rate": 3.380313311563548e-05, "loss": 0.4053, "step": 56055 }, { "epoch": 2.020398601650629, "grad_norm": 0.2122090458869934, "learning_rate": 3.380040182013515e-05, "loss": 0.4353, "step": 56060 }, { "epoch": 2.0205788013118537, "grad_norm": 0.15873844921588898, "learning_rate": 3.3797670404731066e-05, "loss": 0.4032, "step": 56065 }, { "epoch": 2.020759000973078, "grad_norm": 0.24122844636440277, "learning_rate": 3.3794938869460416e-05, "loss": 0.3695, "step": 56070 }, { "epoch": 2.0209392006343028, "grad_norm": 
0.15981855988502502, "learning_rate": 3.3792207214360434e-05, "loss": 0.3844, "step": 56075 }, { "epoch": 2.0211194002955275, "grad_norm": 0.19484391808509827, "learning_rate": 3.3789475439468324e-05, "loss": 0.4208, "step": 56080 }, { "epoch": 2.0212995999567522, "grad_norm": 0.21466252207756042, "learning_rate": 3.378674354482132e-05, "loss": 0.4127, "step": 56085 }, { "epoch": 2.0214797996179765, "grad_norm": 0.20310963690280914, "learning_rate": 3.378401153045663e-05, "loss": 0.4158, "step": 56090 }, { "epoch": 2.0216599992792013, "grad_norm": 0.21700692176818848, "learning_rate": 3.3781279396411484e-05, "loss": 0.4001, "step": 56095 }, { "epoch": 2.021840198940426, "grad_norm": 0.17587092518806458, "learning_rate": 3.3778547142723104e-05, "loss": 0.3782, "step": 56100 }, { "epoch": 2.0220203986016507, "grad_norm": 0.2222931832075119, "learning_rate": 3.3775814769428725e-05, "loss": 0.388, "step": 56105 }, { "epoch": 2.0222005982628755, "grad_norm": 0.2034565955400467, "learning_rate": 3.377308227656557e-05, "loss": 0.4348, "step": 56110 }, { "epoch": 2.0223807979240997, "grad_norm": 0.17816239595413208, "learning_rate": 3.3770349664170865e-05, "loss": 0.3529, "step": 56115 }, { "epoch": 2.0225609975853245, "grad_norm": 0.18056336045265198, "learning_rate": 3.376761693228184e-05, "loss": 0.429, "step": 56120 }, { "epoch": 2.022741197246549, "grad_norm": 0.21382421255111694, "learning_rate": 3.3764884080935735e-05, "loss": 0.3775, "step": 56125 }, { "epoch": 2.022921396907774, "grad_norm": 0.19596946239471436, "learning_rate": 3.376215111016978e-05, "loss": 0.4107, "step": 56130 }, { "epoch": 2.0231015965689982, "grad_norm": 0.1608893871307373, "learning_rate": 3.375941802002122e-05, "loss": 0.3723, "step": 56135 }, { "epoch": 2.023281796230223, "grad_norm": 0.22571824491024017, "learning_rate": 3.3756684810527275e-05, "loss": 0.4514, "step": 56140 }, { "epoch": 2.0234619958914477, "grad_norm": 0.17446362972259521, "learning_rate": 3.3753951481725196e-05, 
"loss": 0.3783, "step": 56145 }, { "epoch": 2.0236421955526724, "grad_norm": 0.2468831092119217, "learning_rate": 3.3751218033652225e-05, "loss": 0.4116, "step": 56150 }, { "epoch": 2.023822395213897, "grad_norm": 0.22208808362483978, "learning_rate": 3.374848446634561e-05, "loss": 0.4228, "step": 56155 }, { "epoch": 2.0240025948751215, "grad_norm": 0.20974206924438477, "learning_rate": 3.3745750779842575e-05, "loss": 0.406, "step": 56160 }, { "epoch": 2.024182794536346, "grad_norm": 0.14893017709255219, "learning_rate": 3.374301697418039e-05, "loss": 0.3712, "step": 56165 }, { "epoch": 2.024362994197571, "grad_norm": 0.16727864742279053, "learning_rate": 3.3740283049396285e-05, "loss": 0.415, "step": 56170 }, { "epoch": 2.0245431938587957, "grad_norm": 0.2555207908153534, "learning_rate": 3.3737549005527514e-05, "loss": 0.4133, "step": 56175 }, { "epoch": 2.0247233935200204, "grad_norm": 0.22973132133483887, "learning_rate": 3.3734814842611326e-05, "loss": 0.4124, "step": 56180 }, { "epoch": 2.0249035931812447, "grad_norm": 0.21145066618919373, "learning_rate": 3.373208056068499e-05, "loss": 0.3879, "step": 56185 }, { "epoch": 2.0250837928424694, "grad_norm": 0.19409604370594025, "learning_rate": 3.372934615978574e-05, "loss": 0.3742, "step": 56190 }, { "epoch": 2.025263992503694, "grad_norm": 0.21370406448841095, "learning_rate": 3.372661163995084e-05, "loss": 0.3909, "step": 56195 }, { "epoch": 2.025444192164919, "grad_norm": 0.22447529435157776, "learning_rate": 3.372387700121754e-05, "loss": 0.4121, "step": 56200 }, { "epoch": 2.025624391826143, "grad_norm": 0.22073820233345032, "learning_rate": 3.372114224362311e-05, "loss": 0.3848, "step": 56205 }, { "epoch": 2.025804591487368, "grad_norm": 0.18990257382392883, "learning_rate": 3.3718407367204807e-05, "loss": 0.3688, "step": 56210 }, { "epoch": 2.0259847911485926, "grad_norm": 0.172977477312088, "learning_rate": 3.371567237199989e-05, "loss": 0.3931, "step": 56215 }, { "epoch": 2.0261649908098174, 
"grad_norm": 0.2128041833639145, "learning_rate": 3.371293725804562e-05, "loss": 0.4185, "step": 56220 }, { "epoch": 2.026345190471042, "grad_norm": 0.1763525903224945, "learning_rate": 3.3710202025379276e-05, "loss": 0.4008, "step": 56225 }, { "epoch": 2.0265253901322664, "grad_norm": 0.19905276596546173, "learning_rate": 3.370746667403811e-05, "loss": 0.3789, "step": 56230 }, { "epoch": 2.026705589793491, "grad_norm": 0.19667592644691467, "learning_rate": 3.37047312040594e-05, "loss": 0.3738, "step": 56235 }, { "epoch": 2.026885789454716, "grad_norm": 0.19164909422397614, "learning_rate": 3.37019956154804e-05, "loss": 0.433, "step": 56240 }, { "epoch": 2.0270659891159406, "grad_norm": 0.1493864804506302, "learning_rate": 3.369925990833841e-05, "loss": 0.4096, "step": 56245 }, { "epoch": 2.027246188777165, "grad_norm": 0.18463338911533356, "learning_rate": 3.3696524082670684e-05, "loss": 0.3974, "step": 56250 }, { "epoch": 2.0274263884383896, "grad_norm": 0.22145789861679077, "learning_rate": 3.369378813851449e-05, "loss": 0.3888, "step": 56255 }, { "epoch": 2.0276065880996144, "grad_norm": 0.1729925125837326, "learning_rate": 3.369105207590713e-05, "loss": 0.4284, "step": 56260 }, { "epoch": 2.027786787760839, "grad_norm": 0.1994604617357254, "learning_rate": 3.368831589488586e-05, "loss": 0.4122, "step": 56265 }, { "epoch": 2.027966987422064, "grad_norm": 0.16139256954193115, "learning_rate": 3.3685579595487985e-05, "loss": 0.3947, "step": 56270 }, { "epoch": 2.028147187083288, "grad_norm": 0.1860417276620865, "learning_rate": 3.3682843177750745e-05, "loss": 0.4223, "step": 56275 }, { "epoch": 2.028327386744513, "grad_norm": 0.1829526275396347, "learning_rate": 3.368010664171146e-05, "loss": 0.3896, "step": 56280 }, { "epoch": 2.0285075864057376, "grad_norm": 0.23350392282009125, "learning_rate": 3.36773699874074e-05, "loss": 0.4205, "step": 56285 }, { "epoch": 2.0286877860669623, "grad_norm": 0.18550008535385132, "learning_rate": 3.3674633214875855e-05, "loss": 
0.4167, "step": 56290 }, { "epoch": 2.028867985728187, "grad_norm": 0.2336433231830597, "learning_rate": 3.367189632415411e-05, "loss": 0.3576, "step": 56295 }, { "epoch": 2.0290481853894113, "grad_norm": 0.18545769155025482, "learning_rate": 3.366915931527946e-05, "loss": 0.4111, "step": 56300 }, { "epoch": 2.029228385050636, "grad_norm": 0.1865958422422409, "learning_rate": 3.3666969623134705e-05, "loss": 0.4044, "step": 56305 }, { "epoch": 2.029408584711861, "grad_norm": 0.202582448720932, "learning_rate": 3.366423240167879e-05, "loss": 0.3743, "step": 56310 }, { "epoch": 2.0295887843730855, "grad_norm": 0.19475452601909637, "learning_rate": 3.366149506217439e-05, "loss": 0.419, "step": 56315 }, { "epoch": 2.02976898403431, "grad_norm": 0.19832675158977509, "learning_rate": 3.36587576046588e-05, "loss": 0.387, "step": 56320 }, { "epoch": 2.0299491836955346, "grad_norm": 0.19266216456890106, "learning_rate": 3.3656020029169306e-05, "loss": 0.4015, "step": 56325 }, { "epoch": 2.0301293833567593, "grad_norm": 0.19489936530590057, "learning_rate": 3.365328233574322e-05, "loss": 0.4126, "step": 56330 }, { "epoch": 2.030309583017984, "grad_norm": 0.1996261477470398, "learning_rate": 3.365054452441783e-05, "loss": 0.4134, "step": 56335 }, { "epoch": 2.0304897826792088, "grad_norm": 0.1988709717988968, "learning_rate": 3.364780659523046e-05, "loss": 0.4044, "step": 56340 }, { "epoch": 2.030669982340433, "grad_norm": 0.21509073674678802, "learning_rate": 3.3645068548218385e-05, "loss": 0.3803, "step": 56345 }, { "epoch": 2.030850182001658, "grad_norm": 0.20546337962150574, "learning_rate": 3.364233038341893e-05, "loss": 0.3971, "step": 56350 }, { "epoch": 2.0310303816628825, "grad_norm": 0.20153018832206726, "learning_rate": 3.36395921008694e-05, "loss": 0.3793, "step": 56355 }, { "epoch": 2.0312105813241073, "grad_norm": 0.23244844377040863, "learning_rate": 3.3636853700607095e-05, "loss": 0.3696, "step": 56360 }, { "epoch": 2.0313907809853315, "grad_norm": 
0.20720942318439484, "learning_rate": 3.3634115182669335e-05, "loss": 0.3894, "step": 56365 }, { "epoch": 2.0315709806465563, "grad_norm": 0.19618476927280426, "learning_rate": 3.363137654709342e-05, "loss": 0.4301, "step": 56370 }, { "epoch": 2.031751180307781, "grad_norm": 0.1612580120563507, "learning_rate": 3.362863779391668e-05, "loss": 0.3662, "step": 56375 }, { "epoch": 2.0319313799690057, "grad_norm": 0.14148502051830292, "learning_rate": 3.362589892317641e-05, "loss": 0.3881, "step": 56380 }, { "epoch": 2.0321115796302305, "grad_norm": 0.19896137714385986, "learning_rate": 3.3623159934909954e-05, "loss": 0.3847, "step": 56385 }, { "epoch": 2.0322917792914548, "grad_norm": 0.16365410387516022, "learning_rate": 3.3620420829154605e-05, "loss": 0.3822, "step": 56390 }, { "epoch": 2.0324719789526795, "grad_norm": 0.1631069928407669, "learning_rate": 3.361768160594768e-05, "loss": 0.4095, "step": 56395 }, { "epoch": 2.0326521786139042, "grad_norm": 0.22321099042892456, "learning_rate": 3.361494226532653e-05, "loss": 0.3992, "step": 56400 }, { "epoch": 2.032832378275129, "grad_norm": 0.2479621022939682, "learning_rate": 3.361220280732845e-05, "loss": 0.3764, "step": 56405 }, { "epoch": 2.0330125779363533, "grad_norm": 0.2009827196598053, "learning_rate": 3.3609463231990774e-05, "loss": 0.4009, "step": 56410 }, { "epoch": 2.033192777597578, "grad_norm": 0.20739832520484924, "learning_rate": 3.3606723539350825e-05, "loss": 0.3841, "step": 56415 }, { "epoch": 2.0333729772588027, "grad_norm": 0.2028648406267166, "learning_rate": 3.360398372944594e-05, "loss": 0.4247, "step": 56420 }, { "epoch": 2.0335531769200275, "grad_norm": 0.19367647171020508, "learning_rate": 3.360124380231344e-05, "loss": 0.381, "step": 56425 }, { "epoch": 2.033733376581252, "grad_norm": 0.1919804960489273, "learning_rate": 3.3598503757990664e-05, "loss": 0.4649, "step": 56430 }, { "epoch": 2.0339135762424765, "grad_norm": 0.17203310132026672, "learning_rate": 3.359576359651493e-05, "loss": 
0.3987, "step": 56435 }, { "epoch": 2.034093775903701, "grad_norm": 0.23200438916683197, "learning_rate": 3.359302331792358e-05, "loss": 0.385, "step": 56440 }, { "epoch": 2.034273975564926, "grad_norm": 0.20147477090358734, "learning_rate": 3.3590282922253966e-05, "loss": 0.3981, "step": 56445 }, { "epoch": 2.0344541752261507, "grad_norm": 0.20963551104068756, "learning_rate": 3.3587542409543396e-05, "loss": 0.4105, "step": 56450 }, { "epoch": 2.0346343748873754, "grad_norm": 0.20766286551952362, "learning_rate": 3.358480177982923e-05, "loss": 0.4136, "step": 56455 }, { "epoch": 2.0348145745485997, "grad_norm": 0.22245804965496063, "learning_rate": 3.3582061033148795e-05, "loss": 0.4043, "step": 56460 }, { "epoch": 2.0349947742098244, "grad_norm": 0.18696880340576172, "learning_rate": 3.3579320169539445e-05, "loss": 0.3749, "step": 56465 }, { "epoch": 2.035174973871049, "grad_norm": 0.19157637655735016, "learning_rate": 3.3576579189038525e-05, "loss": 0.4225, "step": 56470 }, { "epoch": 2.035355173532274, "grad_norm": 0.18588903546333313, "learning_rate": 3.3573838091683366e-05, "loss": 0.3961, "step": 56475 }, { "epoch": 2.035535373193498, "grad_norm": 0.196426123380661, "learning_rate": 3.3571096877511324e-05, "loss": 0.4047, "step": 56480 }, { "epoch": 2.035715572854723, "grad_norm": 0.1676120012998581, "learning_rate": 3.356835554655975e-05, "loss": 0.4061, "step": 56485 }, { "epoch": 2.0358957725159477, "grad_norm": 0.22126492857933044, "learning_rate": 3.3565614098865985e-05, "loss": 0.4079, "step": 56490 }, { "epoch": 2.0360759721771724, "grad_norm": 0.1804613471031189, "learning_rate": 3.356287253446738e-05, "loss": 0.3552, "step": 56495 }, { "epoch": 2.036256171838397, "grad_norm": 0.16997934877872467, "learning_rate": 3.3560130853401306e-05, "loss": 0.3867, "step": 56500 }, { "epoch": 2.036256171838397, "eval_loss": 0.43729928135871887, "eval_runtime": 3.5477, "eval_samples_per_second": 28.187, "eval_steps_per_second": 7.047, "step": 56500 }, { "epoch": 
2.0364363714996214, "grad_norm": 0.1884368509054184, "learning_rate": 3.355738905570511e-05, "loss": 0.3983, "step": 56505 }, { "epoch": 2.036616571160846, "grad_norm": 0.21548138558864594, "learning_rate": 3.3554647141416136e-05, "loss": 0.4085, "step": 56510 }, { "epoch": 2.036796770822071, "grad_norm": 0.23149925470352173, "learning_rate": 3.355190511057175e-05, "loss": 0.3907, "step": 56515 }, { "epoch": 2.0369769704832956, "grad_norm": 0.18845608830451965, "learning_rate": 3.354916296320931e-05, "loss": 0.4065, "step": 56520 }, { "epoch": 2.03715717014452, "grad_norm": 0.19569791853427887, "learning_rate": 3.354642069936619e-05, "loss": 0.3941, "step": 56525 }, { "epoch": 2.0373373698057446, "grad_norm": 0.24140161275863647, "learning_rate": 3.354367831907974e-05, "loss": 0.4019, "step": 56530 }, { "epoch": 2.0375175694669694, "grad_norm": 0.15203697979450226, "learning_rate": 3.354093582238732e-05, "loss": 0.3785, "step": 56535 }, { "epoch": 2.037697769128194, "grad_norm": 0.17884066700935364, "learning_rate": 3.353819320932632e-05, "loss": 0.3748, "step": 56540 }, { "epoch": 2.037877968789419, "grad_norm": 0.20854458212852478, "learning_rate": 3.353545047993408e-05, "loss": 0.3853, "step": 56545 }, { "epoch": 2.038058168450643, "grad_norm": 0.20811249315738678, "learning_rate": 3.3532707634247975e-05, "loss": 0.3865, "step": 56550 }, { "epoch": 2.038238368111868, "grad_norm": 0.1722935438156128, "learning_rate": 3.352996467230538e-05, "loss": 0.3549, "step": 56555 }, { "epoch": 2.0384185677730926, "grad_norm": 0.2165447473526001, "learning_rate": 3.3527221594143675e-05, "loss": 0.3848, "step": 56560 }, { "epoch": 2.0385987674343173, "grad_norm": 0.221002995967865, "learning_rate": 3.352447839980022e-05, "loss": 0.3889, "step": 56565 }, { "epoch": 2.038778967095542, "grad_norm": 0.2625134587287903, "learning_rate": 3.35217350893124e-05, "loss": 0.3828, "step": 56570 }, { "epoch": 2.0389591667567664, "grad_norm": 0.21043117344379425, "learning_rate": 
3.35189916627176e-05, "loss": 0.3867, "step": 56575 }, { "epoch": 2.039139366417991, "grad_norm": 0.19859947264194489, "learning_rate": 3.351624812005317e-05, "loss": 0.3922, "step": 56580 }, { "epoch": 2.039319566079216, "grad_norm": 0.15382780134677887, "learning_rate": 3.351350446135651e-05, "loss": 0.3614, "step": 56585 }, { "epoch": 2.0394997657404406, "grad_norm": 0.21880534291267395, "learning_rate": 3.3510760686665014e-05, "loss": 0.396, "step": 56590 }, { "epoch": 2.039679965401665, "grad_norm": 0.21891264617443085, "learning_rate": 3.3508016796016047e-05, "loss": 0.392, "step": 56595 }, { "epoch": 2.0398601650628896, "grad_norm": 0.20019477605819702, "learning_rate": 3.350527278944699e-05, "loss": 0.3729, "step": 56600 }, { "epoch": 2.0400403647241143, "grad_norm": 0.18630538880825043, "learning_rate": 3.350252866699525e-05, "loss": 0.3704, "step": 56605 }, { "epoch": 2.040220564385339, "grad_norm": 0.20929007232189178, "learning_rate": 3.349978442869819e-05, "loss": 0.4028, "step": 56610 }, { "epoch": 2.040400764046564, "grad_norm": 0.18913328647613525, "learning_rate": 3.3497040074593215e-05, "loss": 0.3956, "step": 56615 }, { "epoch": 2.040580963707788, "grad_norm": 0.1790267825126648, "learning_rate": 3.349429560471772e-05, "loss": 0.4009, "step": 56620 }, { "epoch": 2.040761163369013, "grad_norm": 0.20545370876789093, "learning_rate": 3.349155101910909e-05, "loss": 0.3951, "step": 56625 }, { "epoch": 2.0409413630302375, "grad_norm": 0.21455180644989014, "learning_rate": 3.348880631780472e-05, "loss": 0.4135, "step": 56630 }, { "epoch": 2.0411215626914623, "grad_norm": 0.19538956880569458, "learning_rate": 3.3486061500842014e-05, "loss": 0.4057, "step": 56635 }, { "epoch": 2.0413017623526866, "grad_norm": 0.19726307690143585, "learning_rate": 3.348331656825835e-05, "loss": 0.4318, "step": 56640 }, { "epoch": 2.0414819620139113, "grad_norm": 0.21314232051372528, "learning_rate": 3.348057152009115e-05, "loss": 0.4373, "step": 56645 }, { "epoch": 
2.041662161675136, "grad_norm": 0.19441983103752136, "learning_rate": 3.347782635637781e-05, "loss": 0.3647, "step": 56650 }, { "epoch": 2.0418423613363608, "grad_norm": 0.19044964015483856, "learning_rate": 3.347508107715572e-05, "loss": 0.3825, "step": 56655 }, { "epoch": 2.0420225609975855, "grad_norm": 0.19193421304225922, "learning_rate": 3.347233568246228e-05, "loss": 0.4275, "step": 56660 }, { "epoch": 2.04220276065881, "grad_norm": 0.225186288356781, "learning_rate": 3.3469590172334926e-05, "loss": 0.3661, "step": 56665 }, { "epoch": 2.0423829603200345, "grad_norm": 0.16008292138576508, "learning_rate": 3.346684454681104e-05, "loss": 0.355, "step": 56670 }, { "epoch": 2.0425631599812593, "grad_norm": 0.20448531210422516, "learning_rate": 3.346409880592802e-05, "loss": 0.3863, "step": 56675 }, { "epoch": 2.042743359642484, "grad_norm": 0.15298992395401, "learning_rate": 3.346135294972331e-05, "loss": 0.3827, "step": 56680 }, { "epoch": 2.0429235593037087, "grad_norm": 0.1888943463563919, "learning_rate": 3.34586069782343e-05, "loss": 0.367, "step": 56685 }, { "epoch": 2.043103758964933, "grad_norm": 0.18571306765079498, "learning_rate": 3.345586089149841e-05, "loss": 0.3533, "step": 56690 }, { "epoch": 2.0432839586261577, "grad_norm": 0.2252894788980484, "learning_rate": 3.345311468955305e-05, "loss": 0.3834, "step": 56695 }, { "epoch": 2.0434641582873825, "grad_norm": 0.20659992098808289, "learning_rate": 3.3450368372435643e-05, "loss": 0.4081, "step": 56700 }, { "epoch": 2.043644357948607, "grad_norm": 0.20119290053844452, "learning_rate": 3.344762194018359e-05, "loss": 0.3907, "step": 56705 }, { "epoch": 2.0438245576098315, "grad_norm": 0.17512202262878418, "learning_rate": 3.344487539283434e-05, "loss": 0.3676, "step": 56710 }, { "epoch": 2.0440047572710562, "grad_norm": 0.18519236147403717, "learning_rate": 3.3442128730425295e-05, "loss": 0.4245, "step": 56715 }, { "epoch": 2.044184956932281, "grad_norm": 0.20480823516845703, "learning_rate": 
3.343938195299387e-05, "loss": 0.4162, "step": 56720 }, { "epoch": 2.0443651565935057, "grad_norm": 0.2772522568702698, "learning_rate": 3.3436635060577506e-05, "loss": 0.3941, "step": 56725 }, { "epoch": 2.0445453562547304, "grad_norm": 0.19306902587413788, "learning_rate": 3.3433888053213624e-05, "loss": 0.4025, "step": 56730 }, { "epoch": 2.0447255559159547, "grad_norm": 0.22703427076339722, "learning_rate": 3.3431140930939653e-05, "loss": 0.3602, "step": 56735 }, { "epoch": 2.0449057555771795, "grad_norm": 0.17691659927368164, "learning_rate": 3.3428393693793006e-05, "loss": 0.3831, "step": 56740 }, { "epoch": 2.045085955238404, "grad_norm": 0.2599165439605713, "learning_rate": 3.342564634181114e-05, "loss": 0.4334, "step": 56745 }, { "epoch": 2.045266154899629, "grad_norm": 0.16780352592468262, "learning_rate": 3.3422898875031475e-05, "loss": 0.4114, "step": 56750 }, { "epoch": 2.045446354560853, "grad_norm": 0.21082206070423126, "learning_rate": 3.342015129349143e-05, "loss": 0.3919, "step": 56755 }, { "epoch": 2.045626554222078, "grad_norm": 0.24321195483207703, "learning_rate": 3.341740359722846e-05, "loss": 0.4228, "step": 56760 }, { "epoch": 2.0458067538833027, "grad_norm": 0.189229354262352, "learning_rate": 3.341465578628e-05, "loss": 0.378, "step": 56765 }, { "epoch": 2.0459869535445274, "grad_norm": 0.2039814442396164, "learning_rate": 3.341190786068349e-05, "loss": 0.3869, "step": 56770 }, { "epoch": 2.046167153205752, "grad_norm": 0.15500949323177338, "learning_rate": 3.340915982047635e-05, "loss": 0.3749, "step": 56775 }, { "epoch": 2.0463473528669764, "grad_norm": 0.21698758006095886, "learning_rate": 3.340641166569604e-05, "loss": 0.3944, "step": 56780 }, { "epoch": 2.046527552528201, "grad_norm": 0.1889837086200714, "learning_rate": 3.3403663396379997e-05, "loss": 0.3515, "step": 56785 }, { "epoch": 2.046707752189426, "grad_norm": 0.1569356471300125, "learning_rate": 3.340091501256567e-05, "loss": 0.4297, "step": 56790 }, { "epoch": 
2.0468879518506506, "grad_norm": 0.17683616280555725, "learning_rate": 3.339816651429051e-05, "loss": 0.4058, "step": 56795 }, { "epoch": 2.0470681515118754, "grad_norm": 0.22128205001354218, "learning_rate": 3.339541790159194e-05, "loss": 0.429, "step": 56800 }, { "epoch": 2.0472483511730997, "grad_norm": 0.1818923056125641, "learning_rate": 3.339266917450744e-05, "loss": 0.4136, "step": 56805 }, { "epoch": 2.0474285508343244, "grad_norm": 0.19361944496631622, "learning_rate": 3.3389920333074434e-05, "loss": 0.4062, "step": 56810 }, { "epoch": 2.047608750495549, "grad_norm": 0.1958514004945755, "learning_rate": 3.33871713773304e-05, "loss": 0.4092, "step": 56815 }, { "epoch": 2.047788950156774, "grad_norm": 0.2066863626241684, "learning_rate": 3.338442230731278e-05, "loss": 0.4023, "step": 56820 }, { "epoch": 2.047969149817998, "grad_norm": 0.18415698409080505, "learning_rate": 3.338167312305902e-05, "loss": 0.3767, "step": 56825 }, { "epoch": 2.048149349479223, "grad_norm": 0.19590577483177185, "learning_rate": 3.33789238246066e-05, "loss": 0.3947, "step": 56830 }, { "epoch": 2.0483295491404476, "grad_norm": 0.17477436363697052, "learning_rate": 3.337617441199295e-05, "loss": 0.3764, "step": 56835 }, { "epoch": 2.0485097488016724, "grad_norm": 0.20771732926368713, "learning_rate": 3.337342488525556e-05, "loss": 0.437, "step": 56840 }, { "epoch": 2.048689948462897, "grad_norm": 0.18982523679733276, "learning_rate": 3.337067524443187e-05, "loss": 0.4064, "step": 56845 }, { "epoch": 2.0488701481241214, "grad_norm": 0.19457025825977325, "learning_rate": 3.3367925489559346e-05, "loss": 0.4211, "step": 56850 }, { "epoch": 2.049050347785346, "grad_norm": 0.17263978719711304, "learning_rate": 3.336517562067546e-05, "loss": 0.366, "step": 56855 }, { "epoch": 2.049230547446571, "grad_norm": 0.22003693878650665, "learning_rate": 3.336242563781768e-05, "loss": 0.4007, "step": 56860 }, { "epoch": 2.0494107471077956, "grad_norm": 0.24091579020023346, "learning_rate": 
3.3359675541023474e-05, "loss": 0.4483, "step": 56865 }, { "epoch": 2.04959094676902, "grad_norm": 0.1734263151884079, "learning_rate": 3.3356925330330294e-05, "loss": 0.3737, "step": 56870 }, { "epoch": 2.0497711464302446, "grad_norm": 0.20855608582496643, "learning_rate": 3.335417500577563e-05, "loss": 0.4281, "step": 56875 }, { "epoch": 2.0499513460914693, "grad_norm": 0.2285163253545761, "learning_rate": 3.335142456739695e-05, "loss": 0.4231, "step": 56880 }, { "epoch": 2.050131545752694, "grad_norm": 0.17819838225841522, "learning_rate": 3.334867401523173e-05, "loss": 0.3831, "step": 56885 }, { "epoch": 2.050311745413919, "grad_norm": 0.2181788682937622, "learning_rate": 3.3345923349317436e-05, "loss": 0.4211, "step": 56890 }, { "epoch": 2.050491945075143, "grad_norm": 0.18571996688842773, "learning_rate": 3.334317256969156e-05, "loss": 0.4292, "step": 56895 }, { "epoch": 2.050672144736368, "grad_norm": 0.20971627533435822, "learning_rate": 3.334042167639157e-05, "loss": 0.4386, "step": 56900 }, { "epoch": 2.0508523443975926, "grad_norm": 0.17739515006542206, "learning_rate": 3.3337670669454945e-05, "loss": 0.3931, "step": 56905 }, { "epoch": 2.0510325440588173, "grad_norm": 0.24408304691314697, "learning_rate": 3.3334919548919177e-05, "loss": 0.3871, "step": 56910 }, { "epoch": 2.0512127437200416, "grad_norm": 0.21893776953220367, "learning_rate": 3.333216831482174e-05, "loss": 0.381, "step": 56915 }, { "epoch": 2.0513929433812663, "grad_norm": 0.19919031858444214, "learning_rate": 3.332941696720012e-05, "loss": 0.432, "step": 56920 }, { "epoch": 2.051573143042491, "grad_norm": 0.18088489770889282, "learning_rate": 3.3326665506091816e-05, "loss": 0.3544, "step": 56925 }, { "epoch": 2.051753342703716, "grad_norm": 0.19796021282672882, "learning_rate": 3.332391393153429e-05, "loss": 0.3834, "step": 56930 }, { "epoch": 2.0519335423649405, "grad_norm": 0.15804626047611237, "learning_rate": 3.3321162243565074e-05, "loss": 0.351, "step": 56935 }, { "epoch": 
2.052113742026165, "grad_norm": 0.18689633905887604, "learning_rate": 3.3318410442221614e-05, "loss": 0.4289, "step": 56940 }, { "epoch": 2.0522939416873895, "grad_norm": 0.18491008877754211, "learning_rate": 3.3315658527541425e-05, "loss": 0.3782, "step": 56945 }, { "epoch": 2.0524741413486143, "grad_norm": 0.29741668701171875, "learning_rate": 3.3312906499562e-05, "loss": 0.4184, "step": 56950 }, { "epoch": 2.052654341009839, "grad_norm": 0.20080535113811493, "learning_rate": 3.331015435832084e-05, "loss": 0.3833, "step": 56955 }, { "epoch": 2.0528345406710637, "grad_norm": 0.16579574346542358, "learning_rate": 3.3307402103855426e-05, "loss": 0.414, "step": 56960 }, { "epoch": 2.053014740332288, "grad_norm": 0.2213381975889206, "learning_rate": 3.330464973620327e-05, "loss": 0.3911, "step": 56965 }, { "epoch": 2.0531949399935128, "grad_norm": 0.1835223287343979, "learning_rate": 3.330189725540187e-05, "loss": 0.4325, "step": 56970 }, { "epoch": 2.0533751396547375, "grad_norm": 0.18675781786441803, "learning_rate": 3.329914466148872e-05, "loss": 0.4228, "step": 56975 }, { "epoch": 2.0535553393159622, "grad_norm": 0.26438936591148376, "learning_rate": 3.329639195450135e-05, "loss": 0.4218, "step": 56980 }, { "epoch": 2.0537355389771865, "grad_norm": 0.1843392252922058, "learning_rate": 3.329363913447723e-05, "loss": 0.3871, "step": 56985 }, { "epoch": 2.0539157386384113, "grad_norm": 0.1797383427619934, "learning_rate": 3.329088620145389e-05, "loss": 0.3869, "step": 56990 }, { "epoch": 2.054095938299636, "grad_norm": 0.19929052889347076, "learning_rate": 3.3288133155468826e-05, "loss": 0.4024, "step": 56995 }, { "epoch": 2.0542761379608607, "grad_norm": 0.2309020608663559, "learning_rate": 3.328537999655955e-05, "loss": 0.394, "step": 57000 }, { "epoch": 2.0542761379608607, "eval_loss": 0.43758705258369446, "eval_runtime": 3.5257, "eval_samples_per_second": 28.363, "eval_steps_per_second": 7.091, "step": 57000 }, { "epoch": 2.0544563376220855, "grad_norm": 
0.1692163497209549, "learning_rate": 3.328262672476358e-05, "loss": 0.3992, "step": 57005 }, { "epoch": 2.0546365372833097, "grad_norm": 0.2034154087305069, "learning_rate": 3.327987334011842e-05, "loss": 0.4069, "step": 57010 }, { "epoch": 2.0548167369445345, "grad_norm": 0.20764844119548798, "learning_rate": 3.32771198426616e-05, "loss": 0.3666, "step": 57015 }, { "epoch": 2.054996936605759, "grad_norm": 0.1821412444114685, "learning_rate": 3.327436623243061e-05, "loss": 0.4083, "step": 57020 }, { "epoch": 2.055177136266984, "grad_norm": 0.17971596121788025, "learning_rate": 3.3271612509463e-05, "loss": 0.4364, "step": 57025 }, { "epoch": 2.0553573359282082, "grad_norm": 0.17184744775295258, "learning_rate": 3.326885867379625e-05, "loss": 0.3749, "step": 57030 }, { "epoch": 2.055537535589433, "grad_norm": 0.20721890032291412, "learning_rate": 3.326610472546792e-05, "loss": 0.3955, "step": 57035 }, { "epoch": 2.0557177352506577, "grad_norm": 0.1466008871793747, "learning_rate": 3.32633506645155e-05, "loss": 0.4099, "step": 57040 }, { "epoch": 2.0558979349118824, "grad_norm": 0.1782764494419098, "learning_rate": 3.326059649097652e-05, "loss": 0.4146, "step": 57045 }, { "epoch": 2.056078134573107, "grad_norm": 0.21149560809135437, "learning_rate": 3.3257842204888536e-05, "loss": 0.4045, "step": 57050 }, { "epoch": 2.0562583342343315, "grad_norm": 0.1797577291727066, "learning_rate": 3.325508780628903e-05, "loss": 0.356, "step": 57055 }, { "epoch": 2.056438533895556, "grad_norm": 0.17433103919029236, "learning_rate": 3.325233329521557e-05, "loss": 0.4083, "step": 57060 }, { "epoch": 2.056618733556781, "grad_norm": 0.24138209223747253, "learning_rate": 3.324957867170565e-05, "loss": 0.3696, "step": 57065 }, { "epoch": 2.0567989332180057, "grad_norm": 0.17933684587478638, "learning_rate": 3.324682393579682e-05, "loss": 0.4377, "step": 57070 }, { "epoch": 2.0569791328792304, "grad_norm": 0.21966612339019775, "learning_rate": 3.324406908752662e-05, "loss": 0.3704, 
"step": 57075 }, { "epoch": 2.0571593325404547, "grad_norm": 0.21100081503391266, "learning_rate": 3.324131412693257e-05, "loss": 0.382, "step": 57080 }, { "epoch": 2.0573395322016794, "grad_norm": 0.17946301400661469, "learning_rate": 3.3238559054052206e-05, "loss": 0.3886, "step": 57085 }, { "epoch": 2.057519731862904, "grad_norm": 0.19525077939033508, "learning_rate": 3.3235803868923077e-05, "loss": 0.3696, "step": 57090 }, { "epoch": 2.057699931524129, "grad_norm": 0.22821834683418274, "learning_rate": 3.3233048571582715e-05, "loss": 0.4039, "step": 57095 }, { "epoch": 2.057880131185353, "grad_norm": 0.23264740407466888, "learning_rate": 3.3230293162068655e-05, "loss": 0.406, "step": 57100 }, { "epoch": 2.058060330846578, "grad_norm": 0.22399888932704926, "learning_rate": 3.3227537640418445e-05, "loss": 0.3937, "step": 57105 }, { "epoch": 2.0582405305078026, "grad_norm": 0.18442541360855103, "learning_rate": 3.3224782006669624e-05, "loss": 0.412, "step": 57110 }, { "epoch": 2.0584207301690274, "grad_norm": 0.23033219575881958, "learning_rate": 3.3222026260859747e-05, "loss": 0.3833, "step": 57115 }, { "epoch": 2.058600929830252, "grad_norm": 0.18063490092754364, "learning_rate": 3.3219270403026354e-05, "loss": 0.3814, "step": 57120 }, { "epoch": 2.0587811294914764, "grad_norm": 0.17391562461853027, "learning_rate": 3.321651443320699e-05, "loss": 0.3861, "step": 57125 }, { "epoch": 2.058961329152701, "grad_norm": 0.21318939328193665, "learning_rate": 3.32137583514392e-05, "loss": 0.4278, "step": 57130 }, { "epoch": 2.059141528813926, "grad_norm": 0.17975524067878723, "learning_rate": 3.3211002157760554e-05, "loss": 0.3949, "step": 57135 }, { "epoch": 2.0593217284751506, "grad_norm": 0.18724536895751953, "learning_rate": 3.32082458522086e-05, "loss": 0.3923, "step": 57140 }, { "epoch": 2.059501928136375, "grad_norm": 0.19911803305149078, "learning_rate": 3.3205489434820876e-05, "loss": 0.3708, "step": 57145 }, { "epoch": 2.0596821277975996, "grad_norm": 
0.21658672392368317, "learning_rate": 3.3202732905634936e-05, "loss": 0.4215, "step": 57150 }, { "epoch": 2.0598623274588244, "grad_norm": 0.2002650946378708, "learning_rate": 3.3199976264688365e-05, "loss": 0.4014, "step": 57155 }, { "epoch": 2.060042527120049, "grad_norm": 0.20283368229866028, "learning_rate": 3.3197219512018704e-05, "loss": 0.4014, "step": 57160 }, { "epoch": 2.060222726781274, "grad_norm": 0.1828315407037735, "learning_rate": 3.319446264766351e-05, "loss": 0.4021, "step": 57165 }, { "epoch": 2.060402926442498, "grad_norm": 0.1759759783744812, "learning_rate": 3.319170567166034e-05, "loss": 0.4176, "step": 57170 }, { "epoch": 2.060583126103723, "grad_norm": 0.19315093755722046, "learning_rate": 3.318894858404678e-05, "loss": 0.3938, "step": 57175 }, { "epoch": 2.0607633257649476, "grad_norm": 0.19143354892730713, "learning_rate": 3.318619138486037e-05, "loss": 0.3987, "step": 57180 }, { "epoch": 2.0609435254261723, "grad_norm": 0.21193043887615204, "learning_rate": 3.31834340741387e-05, "loss": 0.4053, "step": 57185 }, { "epoch": 2.061123725087397, "grad_norm": 0.17882151901721954, "learning_rate": 3.318067665191932e-05, "loss": 0.3777, "step": 57190 }, { "epoch": 2.0613039247486213, "grad_norm": 0.19089268147945404, "learning_rate": 3.3177919118239794e-05, "loss": 0.3783, "step": 57195 }, { "epoch": 2.061484124409846, "grad_norm": 0.1889115571975708, "learning_rate": 3.317516147313771e-05, "loss": 0.4081, "step": 57200 }, { "epoch": 2.061664324071071, "grad_norm": 0.1909630447626114, "learning_rate": 3.317240371665064e-05, "loss": 0.3764, "step": 57205 }, { "epoch": 2.0618445237322955, "grad_norm": 0.1816696971654892, "learning_rate": 3.3169645848816146e-05, "loss": 0.4064, "step": 57210 }, { "epoch": 2.06202472339352, "grad_norm": 0.15952745079994202, "learning_rate": 3.316688786967181e-05, "loss": 0.3982, "step": 57215 }, { "epoch": 2.0622049230547446, "grad_norm": 0.17730703949928284, "learning_rate": 3.31641297792552e-05, "loss": 0.421, 
"step": 57220 }, { "epoch": 2.0623851227159693, "grad_norm": 0.19057762622833252, "learning_rate": 3.316137157760392e-05, "loss": 0.3998, "step": 57225 }, { "epoch": 2.062565322377194, "grad_norm": 0.20309016108512878, "learning_rate": 3.315861326475552e-05, "loss": 0.3808, "step": 57230 }, { "epoch": 2.0627455220384188, "grad_norm": 0.1632905900478363, "learning_rate": 3.3155854840747596e-05, "loss": 0.3994, "step": 57235 }, { "epoch": 2.062925721699643, "grad_norm": 0.20555098354816437, "learning_rate": 3.315364802153166e-05, "loss": 0.4086, "step": 57240 }, { "epoch": 2.063105921360868, "grad_norm": 0.22128038108348846, "learning_rate": 3.315088939753129e-05, "loss": 0.4045, "step": 57245 }, { "epoch": 2.0632861210220925, "grad_norm": 0.23114217817783356, "learning_rate": 3.314813066247664e-05, "loss": 0.3849, "step": 57250 }, { "epoch": 2.0634663206833173, "grad_norm": 0.17400681972503662, "learning_rate": 3.314537181640529e-05, "loss": 0.3957, "step": 57255 }, { "epoch": 2.0636465203445415, "grad_norm": 0.18826141953468323, "learning_rate": 3.314261285935483e-05, "loss": 0.3964, "step": 57260 }, { "epoch": 2.0638267200057663, "grad_norm": 0.21883268654346466, "learning_rate": 3.313985379136283e-05, "loss": 0.4175, "step": 57265 }, { "epoch": 2.064006919666991, "grad_norm": 0.1698734164237976, "learning_rate": 3.3137094612466914e-05, "loss": 0.4036, "step": 57270 }, { "epoch": 2.0641871193282157, "grad_norm": 0.17218467593193054, "learning_rate": 3.3134335322704646e-05, "loss": 0.3493, "step": 57275 }, { "epoch": 2.0643673189894405, "grad_norm": 0.19277000427246094, "learning_rate": 3.313157592211364e-05, "loss": 0.4359, "step": 57280 }, { "epoch": 2.0645475186506648, "grad_norm": 0.1942652016878128, "learning_rate": 3.312881641073149e-05, "loss": 0.4229, "step": 57285 }, { "epoch": 2.0647277183118895, "grad_norm": 0.23124489188194275, "learning_rate": 3.3126056788595786e-05, "loss": 0.38, "step": 57290 }, { "epoch": 2.0649079179731142, "grad_norm": 
0.1828756481409073, "learning_rate": 3.312329705574413e-05, "loss": 0.4185, "step": 57295 }, { "epoch": 2.065088117634339, "grad_norm": 0.18582990765571594, "learning_rate": 3.3120537212214134e-05, "loss": 0.376, "step": 57300 }, { "epoch": 2.0652683172955637, "grad_norm": 0.18808506429195404, "learning_rate": 3.3117777258043384e-05, "loss": 0.3729, "step": 57305 }, { "epoch": 2.065448516956788, "grad_norm": 0.2194932997226715, "learning_rate": 3.311501719326948e-05, "loss": 0.4381, "step": 57310 }, { "epoch": 2.0656287166180127, "grad_norm": 0.1534089893102646, "learning_rate": 3.311225701793005e-05, "loss": 0.3966, "step": 57315 }, { "epoch": 2.0658089162792375, "grad_norm": 0.19418910145759583, "learning_rate": 3.3109496732062686e-05, "loss": 0.4222, "step": 57320 }, { "epoch": 2.065989115940462, "grad_norm": 0.20485571026802063, "learning_rate": 3.3106736335705e-05, "loss": 0.3795, "step": 57325 }, { "epoch": 2.0661693156016865, "grad_norm": 0.198880136013031, "learning_rate": 3.31039758288946e-05, "loss": 0.4168, "step": 57330 }, { "epoch": 2.066349515262911, "grad_norm": 0.18784551322460175, "learning_rate": 3.3101215211669104e-05, "loss": 0.3876, "step": 57335 }, { "epoch": 2.066529714924136, "grad_norm": 0.1909671276807785, "learning_rate": 3.309845448406612e-05, "loss": 0.3914, "step": 57340 }, { "epoch": 2.0667099145853607, "grad_norm": 0.18267397582530975, "learning_rate": 3.309569364612325e-05, "loss": 0.3791, "step": 57345 }, { "epoch": 2.0668901142465854, "grad_norm": 0.16980095207691193, "learning_rate": 3.3092932697878127e-05, "loss": 0.3921, "step": 57350 }, { "epoch": 2.0670703139078097, "grad_norm": 0.2372709959745407, "learning_rate": 3.3090171639368364e-05, "loss": 0.4233, "step": 57355 }, { "epoch": 2.0672505135690344, "grad_norm": 0.2137833833694458, "learning_rate": 3.3087410470631575e-05, "loss": 0.4325, "step": 57360 }, { "epoch": 2.067430713230259, "grad_norm": 0.20736674964427948, "learning_rate": 3.308464919170539e-05, "loss": 0.4262, 
"step": 57365 }, { "epoch": 2.067610912891484, "grad_norm": 0.23613141477108002, "learning_rate": 3.308188780262742e-05, "loss": 0.3844, "step": 57370 }, { "epoch": 2.067791112552708, "grad_norm": 0.23751363158226013, "learning_rate": 3.30791263034353e-05, "loss": 0.3884, "step": 57375 }, { "epoch": 2.067971312213933, "grad_norm": 0.22667773067951202, "learning_rate": 3.307636469416664e-05, "loss": 0.3871, "step": 57380 }, { "epoch": 2.0681515118751577, "grad_norm": 0.1814824640750885, "learning_rate": 3.307360297485907e-05, "loss": 0.3844, "step": 57385 }, { "epoch": 2.0683317115363824, "grad_norm": 0.19579993188381195, "learning_rate": 3.307084114555023e-05, "loss": 0.4153, "step": 57390 }, { "epoch": 2.068511911197607, "grad_norm": 0.2431478351354599, "learning_rate": 3.306807920627775e-05, "loss": 0.3946, "step": 57395 }, { "epoch": 2.0686921108588314, "grad_norm": 0.16477234661579132, "learning_rate": 3.306531715707925e-05, "loss": 0.3787, "step": 57400 }, { "epoch": 2.068872310520056, "grad_norm": 0.14775709807872772, "learning_rate": 3.306255499799235e-05, "loss": 0.3659, "step": 57405 }, { "epoch": 2.069052510181281, "grad_norm": 0.2197529375553131, "learning_rate": 3.305979272905472e-05, "loss": 0.4249, "step": 57410 }, { "epoch": 2.0692327098425056, "grad_norm": 0.1822102814912796, "learning_rate": 3.3057030350303954e-05, "loss": 0.4078, "step": 57415 }, { "epoch": 2.06941290950373, "grad_norm": 0.2318163812160492, "learning_rate": 3.305426786177772e-05, "loss": 0.4369, "step": 57420 }, { "epoch": 2.0695931091649546, "grad_norm": 0.1871294230222702, "learning_rate": 3.305150526351365e-05, "loss": 0.3798, "step": 57425 }, { "epoch": 2.0697733088261794, "grad_norm": 0.20110271871089935, "learning_rate": 3.304874255554937e-05, "loss": 0.3754, "step": 57430 }, { "epoch": 2.069953508487404, "grad_norm": 0.17223583161830902, "learning_rate": 3.304597973792254e-05, "loss": 0.3773, "step": 57435 }, { "epoch": 2.070133708148629, "grad_norm": 0.19843506813049316, 
"learning_rate": 3.304321681067079e-05, "loss": 0.3627, "step": 57440 }, { "epoch": 2.070313907809853, "grad_norm": 0.21979078650474548, "learning_rate": 3.304045377383177e-05, "loss": 0.401, "step": 57445 }, { "epoch": 2.070494107471078, "grad_norm": 0.18887178599834442, "learning_rate": 3.303769062744312e-05, "loss": 0.3996, "step": 57450 }, { "epoch": 2.0706743071323026, "grad_norm": 0.19170895218849182, "learning_rate": 3.303492737154249e-05, "loss": 0.3618, "step": 57455 }, { "epoch": 2.0708545067935273, "grad_norm": 0.19946110248565674, "learning_rate": 3.303216400616754e-05, "loss": 0.3732, "step": 57460 }, { "epoch": 2.071034706454752, "grad_norm": 0.20270924270153046, "learning_rate": 3.3029400531355897e-05, "loss": 0.3963, "step": 57465 }, { "epoch": 2.0712149061159764, "grad_norm": 0.24232977628707886, "learning_rate": 3.302663694714524e-05, "loss": 0.3853, "step": 57470 }, { "epoch": 2.071395105777201, "grad_norm": 0.17532899975776672, "learning_rate": 3.30238732535732e-05, "loss": 0.3874, "step": 57475 }, { "epoch": 2.071575305438426, "grad_norm": 0.18288403749465942, "learning_rate": 3.3021109450677445e-05, "loss": 0.3988, "step": 57480 }, { "epoch": 2.0717555050996506, "grad_norm": 0.22599361836910248, "learning_rate": 3.301834553849562e-05, "loss": 0.4176, "step": 57485 }, { "epoch": 2.071935704760875, "grad_norm": 0.20328468084335327, "learning_rate": 3.30155815170654e-05, "loss": 0.3994, "step": 57490 }, { "epoch": 2.0721159044220996, "grad_norm": 0.2231900542974472, "learning_rate": 3.3012817386424416e-05, "loss": 0.3832, "step": 57495 }, { "epoch": 2.0722961040833243, "grad_norm": 0.19576068222522736, "learning_rate": 3.301005314661037e-05, "loss": 0.3742, "step": 57500 }, { "epoch": 2.0722961040833243, "eval_loss": 0.43739715218544006, "eval_runtime": 3.5417, "eval_samples_per_second": 28.235, "eval_steps_per_second": 7.059, "step": 57500 }, { "epoch": 2.072476303744549, "grad_norm": 0.18939457833766937, "learning_rate": 3.3007288797660886e-05, 
"loss": 0.3979, "step": 57505 }, { "epoch": 2.072656503405774, "grad_norm": 0.15962184965610504, "learning_rate": 3.3004524339613636e-05, "loss": 0.3678, "step": 57510 }, { "epoch": 2.072836703066998, "grad_norm": 0.20595142245292664, "learning_rate": 3.30017597725063e-05, "loss": 0.4048, "step": 57515 }, { "epoch": 2.073016902728223, "grad_norm": 0.23062437772750854, "learning_rate": 3.299899509637654e-05, "loss": 0.3793, "step": 57520 }, { "epoch": 2.0731971023894475, "grad_norm": 0.23280833661556244, "learning_rate": 3.299623031126201e-05, "loss": 0.4115, "step": 57525 }, { "epoch": 2.0733773020506723, "grad_norm": 0.18664391338825226, "learning_rate": 3.2993465417200406e-05, "loss": 0.4042, "step": 57530 }, { "epoch": 2.0735575017118966, "grad_norm": 0.19512483477592468, "learning_rate": 3.299070041422937e-05, "loss": 0.4043, "step": 57535 }, { "epoch": 2.0737377013731213, "grad_norm": 0.2118794023990631, "learning_rate": 3.2987935302386594e-05, "loss": 0.4164, "step": 57540 }, { "epoch": 2.073917901034346, "grad_norm": 0.20261244475841522, "learning_rate": 3.298517008170974e-05, "loss": 0.4024, "step": 57545 }, { "epoch": 2.0740981006955708, "grad_norm": 0.18971537053585052, "learning_rate": 3.29824047522365e-05, "loss": 0.4133, "step": 57550 }, { "epoch": 2.0742783003567955, "grad_norm": 0.26411035656929016, "learning_rate": 3.297963931400453e-05, "loss": 0.397, "step": 57555 }, { "epoch": 2.07445850001802, "grad_norm": 0.19595226645469666, "learning_rate": 3.297687376705153e-05, "loss": 0.4261, "step": 57560 }, { "epoch": 2.0746386996792445, "grad_norm": 0.18524418771266937, "learning_rate": 3.297410811141516e-05, "loss": 0.3958, "step": 57565 }, { "epoch": 2.0748188993404693, "grad_norm": 0.1883489340543747, "learning_rate": 3.297134234713311e-05, "loss": 0.3905, "step": 57570 }, { "epoch": 2.074999099001694, "grad_norm": 0.1916968673467636, "learning_rate": 3.296857647424307e-05, "loss": 0.3914, "step": 57575 }, { "epoch": 2.0751792986629187, "grad_norm": 
0.19922398030757904, "learning_rate": 3.296581049278272e-05, "loss": 0.3591, "step": 57580 }, { "epoch": 2.075359498324143, "grad_norm": 0.18488508462905884, "learning_rate": 3.2963044402789736e-05, "loss": 0.3581, "step": 57585 }, { "epoch": 2.0755396979853677, "grad_norm": 0.21221880614757538, "learning_rate": 3.296027820430182e-05, "loss": 0.4096, "step": 57590 }, { "epoch": 2.0757198976465925, "grad_norm": 0.1988646239042282, "learning_rate": 3.295751189735665e-05, "loss": 0.3703, "step": 57595 }, { "epoch": 2.075900097307817, "grad_norm": 0.17219915986061096, "learning_rate": 3.295474548199193e-05, "loss": 0.3574, "step": 57600 }, { "epoch": 2.0760802969690415, "grad_norm": 0.21070365607738495, "learning_rate": 3.2951978958245336e-05, "loss": 0.4147, "step": 57605 }, { "epoch": 2.0762604966302662, "grad_norm": 0.18376260995864868, "learning_rate": 3.294921232615457e-05, "loss": 0.4007, "step": 57610 }, { "epoch": 2.076440696291491, "grad_norm": 0.21397769451141357, "learning_rate": 3.294644558575732e-05, "loss": 0.3699, "step": 57615 }, { "epoch": 2.0766208959527157, "grad_norm": 0.2026335448026657, "learning_rate": 3.294367873709129e-05, "loss": 0.4035, "step": 57620 }, { "epoch": 2.0768010956139404, "grad_norm": 0.21820026636123657, "learning_rate": 3.294091178019418e-05, "loss": 0.3559, "step": 57625 }, { "epoch": 2.0769812952751647, "grad_norm": 0.1874457746744156, "learning_rate": 3.293814471510368e-05, "loss": 0.398, "step": 57630 }, { "epoch": 2.0771614949363895, "grad_norm": 0.20042584836483002, "learning_rate": 3.293537754185749e-05, "loss": 0.366, "step": 57635 }, { "epoch": 2.077341694597614, "grad_norm": 0.24218867719173431, "learning_rate": 3.293261026049333e-05, "loss": 0.4413, "step": 57640 }, { "epoch": 2.077521894258839, "grad_norm": 0.17626816034317017, "learning_rate": 3.2929842871048885e-05, "loss": 0.3598, "step": 57645 }, { "epoch": 2.077702093920063, "grad_norm": 0.2106868326663971, "learning_rate": 3.292707537356186e-05, "loss": 0.3975, 
"step": 57650 }, { "epoch": 2.077882293581288, "grad_norm": 0.19412852823734283, "learning_rate": 3.292430776806997e-05, "loss": 0.3979, "step": 57655 }, { "epoch": 2.0780624932425127, "grad_norm": 0.2266625612974167, "learning_rate": 3.292154005461093e-05, "loss": 0.4145, "step": 57660 }, { "epoch": 2.0782426929037374, "grad_norm": 0.2035531848669052, "learning_rate": 3.291877223322244e-05, "loss": 0.3795, "step": 57665 }, { "epoch": 2.078422892564962, "grad_norm": 0.22533543407917023, "learning_rate": 3.29160043039422e-05, "loss": 0.4332, "step": 57670 }, { "epoch": 2.0786030922261864, "grad_norm": 0.13781999051570892, "learning_rate": 3.291323626680793e-05, "loss": 0.3766, "step": 57675 }, { "epoch": 2.078783291887411, "grad_norm": 0.20056338608264923, "learning_rate": 3.291046812185736e-05, "loss": 0.3835, "step": 57680 }, { "epoch": 2.078963491548636, "grad_norm": 0.17711946368217468, "learning_rate": 3.29076998691282e-05, "loss": 0.4235, "step": 57685 }, { "epoch": 2.0791436912098606, "grad_norm": 0.19499550759792328, "learning_rate": 3.2904931508658154e-05, "loss": 0.3613, "step": 57690 }, { "epoch": 2.079323890871085, "grad_norm": 0.20534376800060272, "learning_rate": 3.290216304048494e-05, "loss": 0.3908, "step": 57695 }, { "epoch": 2.0795040905323097, "grad_norm": 0.18097397685050964, "learning_rate": 3.289939446464629e-05, "loss": 0.409, "step": 57700 }, { "epoch": 2.0796842901935344, "grad_norm": 0.18338914215564728, "learning_rate": 3.289662578117992e-05, "loss": 0.3821, "step": 57705 }, { "epoch": 2.079864489854759, "grad_norm": 0.1914772242307663, "learning_rate": 3.2893856990123544e-05, "loss": 0.3561, "step": 57710 }, { "epoch": 2.080044689515984, "grad_norm": 0.25630638003349304, "learning_rate": 3.2891088091514905e-05, "loss": 0.3759, "step": 57715 }, { "epoch": 2.080224889177208, "grad_norm": 0.18817178905010223, "learning_rate": 3.288831908539171e-05, "loss": 0.3603, "step": 57720 }, { "epoch": 2.080405088838433, "grad_norm": 
0.15790888667106628, "learning_rate": 3.288554997179171e-05, "loss": 0.3602, "step": 57725 }, { "epoch": 2.0805852884996576, "grad_norm": 0.17841728031635284, "learning_rate": 3.2882780750752604e-05, "loss": 0.3972, "step": 57730 }, { "epoch": 2.0807654881608824, "grad_norm": 0.20370912551879883, "learning_rate": 3.288001142231214e-05, "loss": 0.3692, "step": 57735 }, { "epoch": 2.080945687822107, "grad_norm": 0.19954347610473633, "learning_rate": 3.2877241986508045e-05, "loss": 0.3974, "step": 57740 }, { "epoch": 2.0811258874833314, "grad_norm": 0.1677468866109848, "learning_rate": 3.2874472443378056e-05, "loss": 0.3678, "step": 57745 }, { "epoch": 2.081306087144556, "grad_norm": 0.2071567326784134, "learning_rate": 3.287170279295991e-05, "loss": 0.3873, "step": 57750 }, { "epoch": 2.081486286805781, "grad_norm": 0.1877603381872177, "learning_rate": 3.286893303529132e-05, "loss": 0.3882, "step": 57755 }, { "epoch": 2.0816664864670056, "grad_norm": 0.20741944015026093, "learning_rate": 3.286616317041006e-05, "loss": 0.4103, "step": 57760 }, { "epoch": 2.08184668612823, "grad_norm": 0.18143180012702942, "learning_rate": 3.286339319835384e-05, "loss": 0.4236, "step": 57765 }, { "epoch": 2.0820268857894546, "grad_norm": 0.19399969279766083, "learning_rate": 3.286062311916041e-05, "loss": 0.3963, "step": 57770 }, { "epoch": 2.0822070854506793, "grad_norm": 0.19400350749492645, "learning_rate": 3.285785293286751e-05, "loss": 0.4274, "step": 57775 }, { "epoch": 2.082387285111904, "grad_norm": 0.21012353897094727, "learning_rate": 3.285508263951289e-05, "loss": 0.4118, "step": 57780 }, { "epoch": 2.082567484773129, "grad_norm": 0.21954594552516937, "learning_rate": 3.285231223913429e-05, "loss": 0.4276, "step": 57785 }, { "epoch": 2.082747684434353, "grad_norm": 0.1681157350540161, "learning_rate": 3.284954173176945e-05, "loss": 0.3828, "step": 57790 }, { "epoch": 2.082927884095578, "grad_norm": 0.19216038286685944, "learning_rate": 3.2846771117456134e-05, "loss": 0.3608, 
"step": 57795 }, { "epoch": 2.0831080837568026, "grad_norm": 0.20407375693321228, "learning_rate": 3.2844000396232064e-05, "loss": 0.3638, "step": 57800 }, { "epoch": 2.0832882834180273, "grad_norm": 0.186093270778656, "learning_rate": 3.284122956813503e-05, "loss": 0.4093, "step": 57805 }, { "epoch": 2.083468483079252, "grad_norm": 0.19043242931365967, "learning_rate": 3.2838458633202745e-05, "loss": 0.4267, "step": 57810 }, { "epoch": 2.0836486827404763, "grad_norm": 0.22312848269939423, "learning_rate": 3.283568759147298e-05, "loss": 0.3784, "step": 57815 }, { "epoch": 2.083828882401701, "grad_norm": 0.207689568400383, "learning_rate": 3.283291644298349e-05, "loss": 0.3612, "step": 57820 }, { "epoch": 2.084009082062926, "grad_norm": 0.1909976601600647, "learning_rate": 3.283014518777203e-05, "loss": 0.412, "step": 57825 }, { "epoch": 2.0841892817241505, "grad_norm": 0.17157649993896484, "learning_rate": 3.2827373825876364e-05, "loss": 0.4281, "step": 57830 }, { "epoch": 2.084369481385375, "grad_norm": 0.20599566400051117, "learning_rate": 3.282460235733424e-05, "loss": 0.4165, "step": 57835 }, { "epoch": 2.0845496810465995, "grad_norm": 0.23705652356147766, "learning_rate": 3.282183078218342e-05, "loss": 0.4269, "step": 57840 }, { "epoch": 2.0847298807078243, "grad_norm": 0.17649759352207184, "learning_rate": 3.281905910046167e-05, "loss": 0.3876, "step": 57845 }, { "epoch": 2.084910080369049, "grad_norm": 0.16404461860656738, "learning_rate": 3.281628731220676e-05, "loss": 0.3754, "step": 57850 }, { "epoch": 2.0850902800302737, "grad_norm": 0.1914927363395691, "learning_rate": 3.2813515417456456e-05, "loss": 0.4207, "step": 57855 }, { "epoch": 2.085270479691498, "grad_norm": 0.19702866673469543, "learning_rate": 3.2810743416248496e-05, "loss": 0.4151, "step": 57860 }, { "epoch": 2.0854506793527228, "grad_norm": 0.19254851341247559, "learning_rate": 3.2807971308620685e-05, "loss": 0.389, "step": 57865 }, { "epoch": 2.0856308790139475, "grad_norm": 
0.2003370225429535, "learning_rate": 3.2805199094610774e-05, "loss": 0.3934, "step": 57870 }, { "epoch": 2.0858110786751722, "grad_norm": 0.18016155064105988, "learning_rate": 3.2802426774256534e-05, "loss": 0.3857, "step": 57875 }, { "epoch": 2.0859912783363965, "grad_norm": 0.16226695477962494, "learning_rate": 3.2799654347595736e-05, "loss": 0.4044, "step": 57880 }, { "epoch": 2.0861714779976213, "grad_norm": 0.1907840520143509, "learning_rate": 3.2796881814666164e-05, "loss": 0.4128, "step": 57885 }, { "epoch": 2.086351677658846, "grad_norm": 0.19597387313842773, "learning_rate": 3.279410917550559e-05, "loss": 0.4094, "step": 57890 }, { "epoch": 2.0865318773200707, "grad_norm": 0.2085808515548706, "learning_rate": 3.2791336430151775e-05, "loss": 0.416, "step": 57895 }, { "epoch": 2.0867120769812955, "grad_norm": 0.22985954582691193, "learning_rate": 3.278856357864252e-05, "loss": 0.4266, "step": 57900 }, { "epoch": 2.0868922766425198, "grad_norm": 0.2437005490064621, "learning_rate": 3.278579062101559e-05, "loss": 0.4272, "step": 57905 }, { "epoch": 2.0870724763037445, "grad_norm": 0.1723114401102066, "learning_rate": 3.278301755730878e-05, "loss": 0.3741, "step": 57910 }, { "epoch": 2.087252675964969, "grad_norm": 0.1853064000606537, "learning_rate": 3.2780244387559846e-05, "loss": 0.4152, "step": 57915 }, { "epoch": 2.087432875626194, "grad_norm": 0.2420477718114853, "learning_rate": 3.27774711118066e-05, "loss": 0.3977, "step": 57920 }, { "epoch": 2.0876130752874182, "grad_norm": 0.23371827602386475, "learning_rate": 3.277469773008681e-05, "loss": 0.4223, "step": 57925 }, { "epoch": 2.087793274948643, "grad_norm": 0.1876314878463745, "learning_rate": 3.277192424243827e-05, "loss": 0.4122, "step": 57930 }, { "epoch": 2.0879734746098677, "grad_norm": 0.1702897995710373, "learning_rate": 3.276915064889877e-05, "loss": 0.3766, "step": 57935 }, { "epoch": 2.0881536742710924, "grad_norm": 0.22066879272460938, "learning_rate": 3.2766376949506085e-05, "loss": 
0.4061, "step": 57940 }, { "epoch": 2.088333873932317, "grad_norm": 0.19222991168498993, "learning_rate": 3.2763603144298026e-05, "loss": 0.372, "step": 57945 }, { "epoch": 2.0885140735935415, "grad_norm": 0.17189811170101166, "learning_rate": 3.2760829233312385e-05, "loss": 0.3949, "step": 57950 }, { "epoch": 2.088694273254766, "grad_norm": 0.19438187777996063, "learning_rate": 3.275805521658694e-05, "loss": 0.4451, "step": 57955 }, { "epoch": 2.088874472915991, "grad_norm": 0.17549294233322144, "learning_rate": 3.27552810941595e-05, "loss": 0.39, "step": 57960 }, { "epoch": 2.0890546725772157, "grad_norm": 0.18093203008174896, "learning_rate": 3.275250686606784e-05, "loss": 0.3649, "step": 57965 }, { "epoch": 2.0892348722384404, "grad_norm": 0.17545856535434723, "learning_rate": 3.2749732532349796e-05, "loss": 0.4106, "step": 57970 }, { "epoch": 2.0894150718996647, "grad_norm": 0.21285952627658844, "learning_rate": 3.2746958093043136e-05, "loss": 0.3845, "step": 57975 }, { "epoch": 2.0895952715608894, "grad_norm": 0.2009621560573578, "learning_rate": 3.2744183548185674e-05, "loss": 0.3827, "step": 57980 }, { "epoch": 2.089775471222114, "grad_norm": 0.21972990036010742, "learning_rate": 3.274140889781521e-05, "loss": 0.367, "step": 57985 }, { "epoch": 2.089955670883339, "grad_norm": 0.1965899020433426, "learning_rate": 3.273863414196955e-05, "loss": 0.3854, "step": 57990 }, { "epoch": 2.090135870544563, "grad_norm": 0.17252902686595917, "learning_rate": 3.27358592806865e-05, "loss": 0.4063, "step": 57995 }, { "epoch": 2.090316070205788, "grad_norm": 0.1891898810863495, "learning_rate": 3.273308431400386e-05, "loss": 0.4243, "step": 58000 }, { "epoch": 2.090316070205788, "eval_loss": 0.4369398057460785, "eval_runtime": 3.5377, "eval_samples_per_second": 28.267, "eval_steps_per_second": 7.067, "step": 58000 }, { "epoch": 2.0904962698670126, "grad_norm": 0.17718040943145752, "learning_rate": 3.2730309241959446e-05, "loss": 0.4142, "step": 58005 }, { "epoch": 
2.0906764695282374, "grad_norm": 0.16075636446475983, "learning_rate": 3.272753406459106e-05, "loss": 0.4091, "step": 58010 }, { "epoch": 2.090856669189462, "grad_norm": 0.21720968186855316, "learning_rate": 3.272475878193653e-05, "loss": 0.404, "step": 58015 }, { "epoch": 2.0910368688506864, "grad_norm": 0.19920498132705688, "learning_rate": 3.2721983394033645e-05, "loss": 0.39, "step": 58020 }, { "epoch": 2.091217068511911, "grad_norm": 0.17115192115306854, "learning_rate": 3.271920790092024e-05, "loss": 0.398, "step": 58025 }, { "epoch": 2.091397268173136, "grad_norm": 0.15494772791862488, "learning_rate": 3.2716432302634116e-05, "loss": 0.4062, "step": 58030 }, { "epoch": 2.0915774678343606, "grad_norm": 0.1952543705701828, "learning_rate": 3.2713656599213094e-05, "loss": 0.4006, "step": 58035 }, { "epoch": 2.091757667495585, "grad_norm": 0.2297617346048355, "learning_rate": 3.2710880790695e-05, "loss": 0.393, "step": 58040 }, { "epoch": 2.0919378671568096, "grad_norm": 0.18533745408058167, "learning_rate": 3.2708104877117644e-05, "loss": 0.375, "step": 58045 }, { "epoch": 2.0921180668180344, "grad_norm": 0.19725355505943298, "learning_rate": 3.270532885851886e-05, "loss": 0.3976, "step": 58050 }, { "epoch": 2.092298266479259, "grad_norm": 0.2088615596294403, "learning_rate": 3.2702552734936454e-05, "loss": 0.423, "step": 58055 }, { "epoch": 2.092478466140484, "grad_norm": 0.1870979368686676, "learning_rate": 3.2699776506408266e-05, "loss": 0.3923, "step": 58060 }, { "epoch": 2.092658665801708, "grad_norm": 0.18204568326473236, "learning_rate": 3.2697000172972106e-05, "loss": 0.3888, "step": 58065 }, { "epoch": 2.092838865462933, "grad_norm": 0.16376551985740662, "learning_rate": 3.2694223734665806e-05, "loss": 0.3746, "step": 58070 }, { "epoch": 2.0930190651241576, "grad_norm": 0.21093137562274933, "learning_rate": 3.26914471915272e-05, "loss": 0.4078, "step": 58075 }, { "epoch": 2.0931992647853823, "grad_norm": 0.199448361992836, "learning_rate": 
3.2688670543594116e-05, "loss": 0.3742, "step": 58080 }, { "epoch": 2.093379464446607, "grad_norm": 0.21736840903759003, "learning_rate": 3.268589379090439e-05, "loss": 0.3834, "step": 58085 }, { "epoch": 2.0935596641078313, "grad_norm": 0.22230195999145508, "learning_rate": 3.2683116933495844e-05, "loss": 0.4269, "step": 58090 }, { "epoch": 2.093739863769056, "grad_norm": 0.20408831536769867, "learning_rate": 3.2680339971406325e-05, "loss": 0.4019, "step": 58095 }, { "epoch": 2.093920063430281, "grad_norm": 0.25087887048721313, "learning_rate": 3.267756290467365e-05, "loss": 0.4346, "step": 58100 }, { "epoch": 2.0941002630915055, "grad_norm": 0.2030712068080902, "learning_rate": 3.267478573333567e-05, "loss": 0.3983, "step": 58105 }, { "epoch": 2.09428046275273, "grad_norm": 0.24952729046344757, "learning_rate": 3.267200845743022e-05, "loss": 0.3967, "step": 58110 }, { "epoch": 2.0944606624139546, "grad_norm": 0.16245847940444946, "learning_rate": 3.2669231076995146e-05, "loss": 0.3667, "step": 58115 }, { "epoch": 2.0946408620751793, "grad_norm": 0.22543512284755707, "learning_rate": 3.266645359206827e-05, "loss": 0.3892, "step": 58120 }, { "epoch": 2.094821061736404, "grad_norm": 0.1827520877122879, "learning_rate": 3.266367600268746e-05, "loss": 0.3929, "step": 58125 }, { "epoch": 2.0950012613976288, "grad_norm": 0.20801417529582977, "learning_rate": 3.2660898308890546e-05, "loss": 0.3763, "step": 58130 }, { "epoch": 2.095181461058853, "grad_norm": 0.19347546994686127, "learning_rate": 3.265812051071537e-05, "loss": 0.3982, "step": 58135 }, { "epoch": 2.095361660720078, "grad_norm": 0.19839556515216827, "learning_rate": 3.265534260819979e-05, "loss": 0.4412, "step": 58140 }, { "epoch": 2.0955418603813025, "grad_norm": 0.20302700996398926, "learning_rate": 3.265256460138165e-05, "loss": 0.4398, "step": 58145 }, { "epoch": 2.0957220600425273, "grad_norm": 0.19693662226200104, "learning_rate": 3.2649786490298796e-05, "loss": 0.4202, "step": 58150 }, { "epoch": 
2.0959022597037515, "grad_norm": 0.17689436674118042, "learning_rate": 3.2647008274989087e-05, "loss": 0.4202, "step": 58155 }, { "epoch": 2.0960824593649763, "grad_norm": 0.20521299540996552, "learning_rate": 3.264422995549037e-05, "loss": 0.3863, "step": 58160 }, { "epoch": 2.096262659026201, "grad_norm": 0.16490758955478668, "learning_rate": 3.26414515318405e-05, "loss": 0.372, "step": 58165 }, { "epoch": 2.0964428586874257, "grad_norm": 0.19816817343235016, "learning_rate": 3.263867300407732e-05, "loss": 0.4008, "step": 58170 }, { "epoch": 2.0966230583486505, "grad_norm": 0.19456301629543304, "learning_rate": 3.263589437223871e-05, "loss": 0.3828, "step": 58175 }, { "epoch": 2.0968032580098748, "grad_norm": 0.20074453949928284, "learning_rate": 3.263311563636252e-05, "loss": 0.3943, "step": 58180 }, { "epoch": 2.0969834576710995, "grad_norm": 0.21630464494228363, "learning_rate": 3.26303367964866e-05, "loss": 0.3669, "step": 58185 }, { "epoch": 2.0971636573323242, "grad_norm": 0.17707021534442902, "learning_rate": 3.262755785264882e-05, "loss": 0.3598, "step": 58190 }, { "epoch": 2.097343856993549, "grad_norm": 0.1683293730020523, "learning_rate": 3.262477880488705e-05, "loss": 0.3759, "step": 58195 }, { "epoch": 2.0975240566547733, "grad_norm": 0.20540519058704376, "learning_rate": 3.262199965323913e-05, "loss": 0.4094, "step": 58200 }, { "epoch": 2.097704256315998, "grad_norm": 0.17812873423099518, "learning_rate": 3.261922039774295e-05, "loss": 0.401, "step": 58205 }, { "epoch": 2.0978844559772227, "grad_norm": 0.21470516920089722, "learning_rate": 3.261644103843637e-05, "loss": 0.3805, "step": 58210 }, { "epoch": 2.0980646556384475, "grad_norm": 0.23254641890525818, "learning_rate": 3.261366157535725e-05, "loss": 0.4054, "step": 58215 }, { "epoch": 2.098244855299672, "grad_norm": 0.2531212568283081, "learning_rate": 3.2610882008543466e-05, "loss": 0.3981, "step": 58220 }, { "epoch": 2.0984250549608965, "grad_norm": 0.2381640523672104, "learning_rate": 
3.260810233803289e-05, "loss": 0.4355, "step": 58225 }, { "epoch": 2.098605254622121, "grad_norm": 0.1578020453453064, "learning_rate": 3.260532256386338e-05, "loss": 0.3769, "step": 58230 }, { "epoch": 2.098785454283346, "grad_norm": 0.23701877892017365, "learning_rate": 3.2602542686072835e-05, "loss": 0.4272, "step": 58235 }, { "epoch": 2.0989656539445707, "grad_norm": 0.2273239940404892, "learning_rate": 3.259976270469912e-05, "loss": 0.4038, "step": 58240 }, { "epoch": 2.0991458536057954, "grad_norm": 0.1812596321105957, "learning_rate": 3.25969826197801e-05, "loss": 0.3835, "step": 58245 }, { "epoch": 2.0993260532670197, "grad_norm": 0.21768318116664886, "learning_rate": 3.259420243135367e-05, "loss": 0.4035, "step": 58250 }, { "epoch": 2.0995062529282444, "grad_norm": 0.16613122820854187, "learning_rate": 3.25914221394577e-05, "loss": 0.3626, "step": 58255 }, { "epoch": 2.099686452589469, "grad_norm": 0.21264012157917023, "learning_rate": 3.258864174413008e-05, "loss": 0.4092, "step": 58260 }, { "epoch": 2.099866652250694, "grad_norm": 0.22502678632736206, "learning_rate": 3.2585861245408676e-05, "loss": 0.4298, "step": 58265 }, { "epoch": 2.100046851911918, "grad_norm": 0.17495203018188477, "learning_rate": 3.2583080643331385e-05, "loss": 0.3757, "step": 58270 }, { "epoch": 2.100227051573143, "grad_norm": 0.16976486146450043, "learning_rate": 3.2580299937936084e-05, "loss": 0.4213, "step": 58275 }, { "epoch": 2.1004072512343677, "grad_norm": 0.25477150082588196, "learning_rate": 3.2577519129260666e-05, "loss": 0.4203, "step": 58280 }, { "epoch": 2.1005874508955924, "grad_norm": 0.27590665221214294, "learning_rate": 3.257473821734302e-05, "loss": 0.4257, "step": 58285 }, { "epoch": 2.100767650556817, "grad_norm": 0.18008361756801605, "learning_rate": 3.257195720222103e-05, "loss": 0.3971, "step": 58290 }, { "epoch": 2.1009478502180414, "grad_norm": 0.22707924246788025, "learning_rate": 3.256917608393259e-05, "loss": 0.3946, "step": 58295 }, { "epoch": 
2.101128049879266, "grad_norm": 0.18620999157428741, "learning_rate": 3.2566394862515596e-05, "loss": 0.3527, "step": 58300 }, { "epoch": 2.101308249540491, "grad_norm": 0.20725572109222412, "learning_rate": 3.256361353800793e-05, "loss": 0.4064, "step": 58305 }, { "epoch": 2.1014884492017156, "grad_norm": 0.2142266035079956, "learning_rate": 3.2560832110447495e-05, "loss": 0.4219, "step": 58310 }, { "epoch": 2.1016686488629404, "grad_norm": 0.2480008453130722, "learning_rate": 3.255805057987218e-05, "loss": 0.4156, "step": 58315 }, { "epoch": 2.1018488485241646, "grad_norm": 0.22101950645446777, "learning_rate": 3.255526894631991e-05, "loss": 0.3769, "step": 58320 }, { "epoch": 2.1020290481853894, "grad_norm": 0.1711684614419937, "learning_rate": 3.255248720982854e-05, "loss": 0.4069, "step": 58325 }, { "epoch": 2.102209247846614, "grad_norm": 0.2133340984582901, "learning_rate": 3.254970537043601e-05, "loss": 0.4075, "step": 58330 }, { "epoch": 2.102389447507839, "grad_norm": 0.27530649304389954, "learning_rate": 3.2546923428180184e-05, "loss": 0.4109, "step": 58335 }, { "epoch": 2.102569647169063, "grad_norm": 0.19913306832313538, "learning_rate": 3.2544141383099014e-05, "loss": 0.4136, "step": 58340 }, { "epoch": 2.102749846830288, "grad_norm": 0.2158804088830948, "learning_rate": 3.254135923523037e-05, "loss": 0.3929, "step": 58345 }, { "epoch": 2.1029300464915126, "grad_norm": 0.23708494007587433, "learning_rate": 3.253857698461216e-05, "loss": 0.4019, "step": 58350 }, { "epoch": 2.1031102461527373, "grad_norm": 0.20912696421146393, "learning_rate": 3.25357946312823e-05, "loss": 0.4115, "step": 58355 }, { "epoch": 2.103290445813962, "grad_norm": 0.1815493106842041, "learning_rate": 3.25330121752787e-05, "loss": 0.4154, "step": 58360 }, { "epoch": 2.1034706454751864, "grad_norm": 0.2595524489879608, "learning_rate": 3.253022961663927e-05, "loss": 0.4105, "step": 58365 }, { "epoch": 2.103650845136411, "grad_norm": 0.1517713963985443, "learning_rate": 
3.252744695540191e-05, "loss": 0.4013, "step": 58370 }, { "epoch": 2.103831044797636, "grad_norm": 0.1964835673570633, "learning_rate": 3.252466419160455e-05, "loss": 0.4256, "step": 58375 }, { "epoch": 2.1040112444588606, "grad_norm": 0.18157154321670532, "learning_rate": 3.25218813252851e-05, "loss": 0.3714, "step": 58380 }, { "epoch": 2.104191444120085, "grad_norm": 0.18907707929611206, "learning_rate": 3.251909835648147e-05, "loss": 0.4031, "step": 58385 }, { "epoch": 2.1043716437813096, "grad_norm": 0.17940135300159454, "learning_rate": 3.2516315285231576e-05, "loss": 0.414, "step": 58390 }, { "epoch": 2.1045518434425343, "grad_norm": 0.18015572428703308, "learning_rate": 3.251353211157334e-05, "loss": 0.3899, "step": 58395 }, { "epoch": 2.104732043103759, "grad_norm": 0.20908917486667633, "learning_rate": 3.25107488355447e-05, "loss": 0.4163, "step": 58400 }, { "epoch": 2.104912242764984, "grad_norm": 0.1619907170534134, "learning_rate": 3.2507965457183545e-05, "loss": 0.4148, "step": 58405 }, { "epoch": 2.105092442426208, "grad_norm": 0.1768513172864914, "learning_rate": 3.2505181976527815e-05, "loss": 0.4, "step": 58410 }, { "epoch": 2.105272642087433, "grad_norm": 0.17166374623775482, "learning_rate": 3.2502398393615435e-05, "loss": 0.394, "step": 58415 }, { "epoch": 2.1054528417486575, "grad_norm": 0.21615466475486755, "learning_rate": 3.249961470848433e-05, "loss": 0.3977, "step": 58420 }, { "epoch": 2.1056330414098823, "grad_norm": 0.1769254207611084, "learning_rate": 3.249683092117243e-05, "loss": 0.3984, "step": 58425 }, { "epoch": 2.1058132410711066, "grad_norm": 0.19884678721427917, "learning_rate": 3.249404703171766e-05, "loss": 0.3915, "step": 58430 }, { "epoch": 2.1059934407323313, "grad_norm": 0.186051145195961, "learning_rate": 3.249126304015795e-05, "loss": 0.388, "step": 58435 }, { "epoch": 2.106173640393556, "grad_norm": 0.16118311882019043, "learning_rate": 3.248847894653122e-05, "loss": 0.3806, "step": 58440 }, { "epoch": 
2.1063538400547808, "grad_norm": 0.24056154489517212, "learning_rate": 3.248569475087544e-05, "loss": 0.4179, "step": 58445 }, { "epoch": 2.1065340397160055, "grad_norm": 0.204376682639122, "learning_rate": 3.24829104532285e-05, "loss": 0.3641, "step": 58450 }, { "epoch": 2.10671423937723, "grad_norm": 0.2513475716114044, "learning_rate": 3.248012605362836e-05, "loss": 0.3944, "step": 58455 }, { "epoch": 2.1068944390384545, "grad_norm": 0.24770568311214447, "learning_rate": 3.247734155211294e-05, "loss": 0.3903, "step": 58460 }, { "epoch": 2.1070746386996793, "grad_norm": 0.28164035081863403, "learning_rate": 3.2474556948720197e-05, "loss": 0.3925, "step": 58465 }, { "epoch": 2.107254838360904, "grad_norm": 0.2520608603954315, "learning_rate": 3.2471772243488064e-05, "loss": 0.3713, "step": 58470 }, { "epoch": 2.1074350380221287, "grad_norm": 0.18132475018501282, "learning_rate": 3.2468987436454476e-05, "loss": 0.4178, "step": 58475 }, { "epoch": 2.107615237683353, "grad_norm": 0.20063307881355286, "learning_rate": 3.246620252765739e-05, "loss": 0.417, "step": 58480 }, { "epoch": 2.1077954373445777, "grad_norm": 0.227056086063385, "learning_rate": 3.2463417517134734e-05, "loss": 0.4046, "step": 58485 }, { "epoch": 2.1079756370058025, "grad_norm": 0.16620944440364838, "learning_rate": 3.246063240492445e-05, "loss": 0.3735, "step": 58490 }, { "epoch": 2.108155836667027, "grad_norm": 0.18251557648181915, "learning_rate": 3.2457847191064505e-05, "loss": 0.3888, "step": 58495 }, { "epoch": 2.1083360363282515, "grad_norm": 0.20727889239788055, "learning_rate": 3.245506187559283e-05, "loss": 0.4018, "step": 58500 }, { "epoch": 2.1083360363282515, "eval_loss": 0.4373342990875244, "eval_runtime": 3.5409, "eval_samples_per_second": 28.242, "eval_steps_per_second": 7.06, "step": 58500 }, { "epoch": 2.1085162359894762, "grad_norm": 0.2316305935382843, "learning_rate": 3.245227645854739e-05, "loss": 0.3647, "step": 58505 }, { "epoch": 2.108696435650701, "grad_norm": 
0.22615645825862885, "learning_rate": 3.244949093996612e-05, "loss": 0.3822, "step": 58510 }, { "epoch": 2.1088766353119257, "grad_norm": 0.2182559221982956, "learning_rate": 3.244670531988697e-05, "loss": 0.3833, "step": 58515 }, { "epoch": 2.1090568349731504, "grad_norm": 0.19747740030288696, "learning_rate": 3.244391959834791e-05, "loss": 0.3731, "step": 58520 }, { "epoch": 2.1092370346343747, "grad_norm": 0.165822371840477, "learning_rate": 3.244113377538689e-05, "loss": 0.3825, "step": 58525 }, { "epoch": 2.1094172342955995, "grad_norm": 0.19013039767742157, "learning_rate": 3.243834785104186e-05, "loss": 0.3978, "step": 58530 }, { "epoch": 2.109597433956824, "grad_norm": 0.16244149208068848, "learning_rate": 3.243556182535077e-05, "loss": 0.3966, "step": 58535 }, { "epoch": 2.109777633618049, "grad_norm": 0.19332407414913177, "learning_rate": 3.2432775698351605e-05, "loss": 0.3793, "step": 58540 }, { "epoch": 2.109957833279273, "grad_norm": 0.18343211710453033, "learning_rate": 3.242998947008231e-05, "loss": 0.4015, "step": 58545 }, { "epoch": 2.110138032940498, "grad_norm": 0.17457517981529236, "learning_rate": 3.242720314058084e-05, "loss": 0.3965, "step": 58550 }, { "epoch": 2.1103182326017227, "grad_norm": 0.1578681319952011, "learning_rate": 3.2424416709885165e-05, "loss": 0.4025, "step": 58555 }, { "epoch": 2.1104984322629474, "grad_norm": 0.20855949819087982, "learning_rate": 3.242163017803325e-05, "loss": 0.4154, "step": 58560 }, { "epoch": 2.110678631924172, "grad_norm": 0.15797370672225952, "learning_rate": 3.2418843545063065e-05, "loss": 0.3684, "step": 58565 }, { "epoch": 2.1108588315853964, "grad_norm": 0.22277599573135376, "learning_rate": 3.241605681101256e-05, "loss": 0.401, "step": 58570 }, { "epoch": 2.111039031246621, "grad_norm": 0.19849325716495514, "learning_rate": 3.2413269975919736e-05, "loss": 0.4138, "step": 58575 }, { "epoch": 2.111219230907846, "grad_norm": 0.22037936747074127, "learning_rate": 3.2410483039822527e-05, "loss": 
0.3959, "step": 58580 }, { "epoch": 2.1113994305690706, "grad_norm": 0.1882595717906952, "learning_rate": 3.2407696002758936e-05, "loss": 0.3915, "step": 58585 }, { "epoch": 2.1115796302302954, "grad_norm": 0.20610591769218445, "learning_rate": 3.240490886476691e-05, "loss": 0.4286, "step": 58590 }, { "epoch": 2.1117598298915197, "grad_norm": 0.22938185930252075, "learning_rate": 3.240212162588444e-05, "loss": 0.3866, "step": 58595 }, { "epoch": 2.1119400295527444, "grad_norm": 0.20700225234031677, "learning_rate": 3.2399334286149495e-05, "loss": 0.3753, "step": 58600 }, { "epoch": 2.112120229213969, "grad_norm": 0.19341452419757843, "learning_rate": 3.239654684560005e-05, "loss": 0.4153, "step": 58605 }, { "epoch": 2.112300428875194, "grad_norm": 0.20919573307037354, "learning_rate": 3.23937593042741e-05, "loss": 0.4088, "step": 58610 }, { "epoch": 2.112480628536418, "grad_norm": 0.20595782995224, "learning_rate": 3.239097166220959e-05, "loss": 0.3805, "step": 58615 }, { "epoch": 2.112660828197643, "grad_norm": 0.18179921805858612, "learning_rate": 3.238818391944453e-05, "loss": 0.4064, "step": 58620 }, { "epoch": 2.1128410278588676, "grad_norm": 0.1617589145898819, "learning_rate": 3.2385396076016896e-05, "loss": 0.395, "step": 58625 }, { "epoch": 2.1130212275200924, "grad_norm": 0.195877343416214, "learning_rate": 3.2382608131964676e-05, "loss": 0.4229, "step": 58630 }, { "epoch": 2.113201427181317, "grad_norm": 0.20379838347434998, "learning_rate": 3.2379820087325844e-05, "loss": 0.4083, "step": 58635 }, { "epoch": 2.1133816268425414, "grad_norm": 0.2131894826889038, "learning_rate": 3.237703194213839e-05, "loss": 0.3798, "step": 58640 }, { "epoch": 2.113561826503766, "grad_norm": 0.23146046698093414, "learning_rate": 3.2374243696440305e-05, "loss": 0.4218, "step": 58645 }, { "epoch": 2.113742026164991, "grad_norm": 0.1999361366033554, "learning_rate": 3.2371455350269574e-05, "loss": 0.4121, "step": 58650 }, { "epoch": 2.1139222258262156, "grad_norm": 
0.16494305431842804, "learning_rate": 3.23686669036642e-05, "loss": 0.392, "step": 58655 }, { "epoch": 2.11410242548744, "grad_norm": 0.24381648004055023, "learning_rate": 3.236587835666216e-05, "loss": 0.3991, "step": 58660 }, { "epoch": 2.1142826251486646, "grad_norm": 0.15115749835968018, "learning_rate": 3.236308970930145e-05, "loss": 0.3447, "step": 58665 }, { "epoch": 2.1144628248098893, "grad_norm": 0.19442135095596313, "learning_rate": 3.236030096162008e-05, "loss": 0.3937, "step": 58670 }, { "epoch": 2.114643024471114, "grad_norm": 0.23755992949008942, "learning_rate": 3.235751211365602e-05, "loss": 0.381, "step": 58675 }, { "epoch": 2.114823224132339, "grad_norm": 0.19326242804527283, "learning_rate": 3.23547231654473e-05, "loss": 0.3761, "step": 58680 }, { "epoch": 2.115003423793563, "grad_norm": 0.2071000635623932, "learning_rate": 3.2351934117031877e-05, "loss": 0.4316, "step": 58685 }, { "epoch": 2.115183623454788, "grad_norm": 0.17056724429130554, "learning_rate": 3.23491449684478e-05, "loss": 0.3951, "step": 58690 }, { "epoch": 2.1153638231160126, "grad_norm": 0.172649085521698, "learning_rate": 3.234635571973303e-05, "loss": 0.3895, "step": 58695 }, { "epoch": 2.1155440227772373, "grad_norm": 0.21372824907302856, "learning_rate": 3.2343566370925594e-05, "loss": 0.398, "step": 58700 }, { "epoch": 2.1157242224384616, "grad_norm": 0.23039843142032623, "learning_rate": 3.234077692206347e-05, "loss": 0.4055, "step": 58705 }, { "epoch": 2.1159044220996863, "grad_norm": 0.22688081860542297, "learning_rate": 3.2337987373184704e-05, "loss": 0.3998, "step": 58710 }, { "epoch": 2.116084621760911, "grad_norm": 0.15555809438228607, "learning_rate": 3.233519772432727e-05, "loss": 0.3709, "step": 58715 }, { "epoch": 2.116264821422136, "grad_norm": 0.22005966305732727, "learning_rate": 3.233240797552919e-05, "loss": 0.3701, "step": 58720 }, { "epoch": 2.1164450210833605, "grad_norm": 0.21118348836898804, "learning_rate": 3.232961812682847e-05, "loss": 0.3888, 
"step": 58725 }, { "epoch": 2.116625220744585, "grad_norm": 0.18728987872600555, "learning_rate": 3.2326828178263125e-05, "loss": 0.4143, "step": 58730 }, { "epoch": 2.1168054204058095, "grad_norm": 0.19817517697811127, "learning_rate": 3.2324038129871166e-05, "loss": 0.4269, "step": 58735 }, { "epoch": 2.1169856200670343, "grad_norm": 0.19187265634536743, "learning_rate": 3.232124798169059e-05, "loss": 0.4259, "step": 58740 }, { "epoch": 2.117165819728259, "grad_norm": 0.2186998724937439, "learning_rate": 3.231845773375944e-05, "loss": 0.4291, "step": 58745 }, { "epoch": 2.1173460193894837, "grad_norm": 0.22712989151477814, "learning_rate": 3.231566738611572e-05, "loss": 0.3747, "step": 58750 }, { "epoch": 2.117526219050708, "grad_norm": 0.15697424113750458, "learning_rate": 3.231287693879745e-05, "loss": 0.3601, "step": 58755 }, { "epoch": 2.1177064187119328, "grad_norm": 0.23286950588226318, "learning_rate": 3.231008639184265e-05, "loss": 0.388, "step": 58760 }, { "epoch": 2.1178866183731575, "grad_norm": 0.19460979104042053, "learning_rate": 3.230729574528932e-05, "loss": 0.4061, "step": 58765 }, { "epoch": 2.1180668180343822, "grad_norm": 0.1705007404088974, "learning_rate": 3.230450499917552e-05, "loss": 0.4024, "step": 58770 }, { "epoch": 2.1182470176956065, "grad_norm": 0.20845447480678558, "learning_rate": 3.2301714153539244e-05, "loss": 0.4182, "step": 58775 }, { "epoch": 2.1184272173568313, "grad_norm": 0.18309171497821808, "learning_rate": 3.2298923208418535e-05, "loss": 0.3745, "step": 58780 }, { "epoch": 2.118607417018056, "grad_norm": 0.2173132747411728, "learning_rate": 3.22961321638514e-05, "loss": 0.397, "step": 58785 }, { "epoch": 2.1187876166792807, "grad_norm": 0.16794531047344208, "learning_rate": 3.229334101987588e-05, "loss": 0.3844, "step": 58790 }, { "epoch": 2.1189678163405055, "grad_norm": 0.21813854575157166, "learning_rate": 3.229054977653001e-05, "loss": 0.42, "step": 58795 }, { "epoch": 2.1191480160017298, "grad_norm": 
0.21310067176818848, "learning_rate": 3.22877584338518e-05, "loss": 0.413, "step": 58800 }, { "epoch": 2.1193282156629545, "grad_norm": 0.20644977688789368, "learning_rate": 3.2284966991879295e-05, "loss": 0.3717, "step": 58805 }, { "epoch": 2.119508415324179, "grad_norm": 0.2088310867547989, "learning_rate": 3.228217545065052e-05, "loss": 0.4002, "step": 58810 }, { "epoch": 2.119688614985404, "grad_norm": 0.23381415009498596, "learning_rate": 3.227938381020353e-05, "loss": 0.392, "step": 58815 }, { "epoch": 2.1198688146466287, "grad_norm": 0.19192180037498474, "learning_rate": 3.227659207057633e-05, "loss": 0.3862, "step": 58820 }, { "epoch": 2.120049014307853, "grad_norm": 0.17391976714134216, "learning_rate": 3.2273800231806974e-05, "loss": 0.3916, "step": 58825 }, { "epoch": 2.1202292139690777, "grad_norm": 0.236240416765213, "learning_rate": 3.2271008293933496e-05, "loss": 0.3979, "step": 58830 }, { "epoch": 2.1204094136303024, "grad_norm": 0.15316404402256012, "learning_rate": 3.2268216256993944e-05, "loss": 0.3986, "step": 58835 }, { "epoch": 2.120589613291527, "grad_norm": 0.20581558346748352, "learning_rate": 3.2265424121026355e-05, "loss": 0.3874, "step": 58840 }, { "epoch": 2.1207698129527515, "grad_norm": 0.1844107061624527, "learning_rate": 3.226263188606876e-05, "loss": 0.4366, "step": 58845 }, { "epoch": 2.120950012613976, "grad_norm": 0.17652803659439087, "learning_rate": 3.225983955215922e-05, "loss": 0.3898, "step": 58850 }, { "epoch": 2.121130212275201, "grad_norm": 0.20647066831588745, "learning_rate": 3.225704711933577e-05, "loss": 0.3942, "step": 58855 }, { "epoch": 2.1213104119364257, "grad_norm": 0.24524955451488495, "learning_rate": 3.2254254587636454e-05, "loss": 0.4065, "step": 58860 }, { "epoch": 2.1214906115976504, "grad_norm": 0.23697435855865479, "learning_rate": 3.225146195709933e-05, "loss": 0.4129, "step": 58865 }, { "epoch": 2.1216708112588747, "grad_norm": 0.2048749029636383, "learning_rate": 3.2248669227762427e-05, "loss": 0.39, 
"step": 58870 }, { "epoch": 2.1218510109200994, "grad_norm": 0.18117062747478485, "learning_rate": 3.2245876399663826e-05, "loss": 0.3636, "step": 58875 }, { "epoch": 2.122031210581324, "grad_norm": 0.20858252048492432, "learning_rate": 3.224308347284155e-05, "loss": 0.3803, "step": 58880 }, { "epoch": 2.122211410242549, "grad_norm": 0.23120179772377014, "learning_rate": 3.224029044733367e-05, "loss": 0.4179, "step": 58885 }, { "epoch": 2.122391609903773, "grad_norm": 0.20885401964187622, "learning_rate": 3.2237497323178234e-05, "loss": 0.3951, "step": 58890 }, { "epoch": 2.122571809564998, "grad_norm": 0.22970037162303925, "learning_rate": 3.223470410041329e-05, "loss": 0.4036, "step": 58895 }, { "epoch": 2.1227520092262226, "grad_norm": 0.2186799943447113, "learning_rate": 3.2231910779076916e-05, "loss": 0.4109, "step": 58900 }, { "epoch": 2.1229322088874474, "grad_norm": 0.19462229311466217, "learning_rate": 3.222911735920715e-05, "loss": 0.4233, "step": 58905 }, { "epoch": 2.123112408548672, "grad_norm": 0.18175840377807617, "learning_rate": 3.222632384084207e-05, "loss": 0.3827, "step": 58910 }, { "epoch": 2.1232926082098964, "grad_norm": 0.2110246866941452, "learning_rate": 3.222353022401971e-05, "loss": 0.3749, "step": 58915 }, { "epoch": 2.123472807871121, "grad_norm": 0.18468889594078064, "learning_rate": 3.2220736508778166e-05, "loss": 0.396, "step": 58920 }, { "epoch": 2.123653007532346, "grad_norm": 0.18783201277256012, "learning_rate": 3.221794269515547e-05, "loss": 0.3639, "step": 58925 }, { "epoch": 2.1238332071935706, "grad_norm": 0.21580557525157928, "learning_rate": 3.2215148783189716e-05, "loss": 0.4192, "step": 58930 }, { "epoch": 2.124013406854795, "grad_norm": 0.18120792508125305, "learning_rate": 3.221235477291895e-05, "loss": 0.3901, "step": 58935 }, { "epoch": 2.1241936065160196, "grad_norm": 0.18788616359233856, "learning_rate": 3.2209560664381244e-05, "loss": 0.3894, "step": 58940 }, { "epoch": 2.1243738061772444, "grad_norm": 
0.18515391647815704, "learning_rate": 3.220676645761467e-05, "loss": 0.4085, "step": 58945 }, { "epoch": 2.124554005838469, "grad_norm": 0.17624923586845398, "learning_rate": 3.2203972152657294e-05, "loss": 0.4079, "step": 58950 }, { "epoch": 2.124734205499694, "grad_norm": 0.2669857144355774, "learning_rate": 3.22011777495472e-05, "loss": 0.3783, "step": 58955 }, { "epoch": 2.124914405160918, "grad_norm": 0.21977299451828003, "learning_rate": 3.219838324832246e-05, "loss": 0.3794, "step": 58960 }, { "epoch": 2.125094604822143, "grad_norm": 0.18140974640846252, "learning_rate": 3.219558864902113e-05, "loss": 0.3855, "step": 58965 }, { "epoch": 2.1252748044833676, "grad_norm": 0.2620275020599365, "learning_rate": 3.21927939516813e-05, "loss": 0.3959, "step": 58970 }, { "epoch": 2.1254550041445923, "grad_norm": 0.2260800153017044, "learning_rate": 3.218999915634105e-05, "loss": 0.3754, "step": 58975 }, { "epoch": 2.125635203805817, "grad_norm": 0.22423382103443146, "learning_rate": 3.218720426303845e-05, "loss": 0.3836, "step": 58980 }, { "epoch": 2.1258154034670413, "grad_norm": 0.21626518666744232, "learning_rate": 3.2184409271811586e-05, "loss": 0.4172, "step": 58985 }, { "epoch": 2.125995603128266, "grad_norm": 0.2051631510257721, "learning_rate": 3.218161418269853e-05, "loss": 0.384, "step": 58990 }, { "epoch": 2.126175802789491, "grad_norm": 0.16707560420036316, "learning_rate": 3.217881899573738e-05, "loss": 0.4404, "step": 58995 }, { "epoch": 2.1263560024507155, "grad_norm": 0.20359165966510773, "learning_rate": 3.21760237109662e-05, "loss": 0.383, "step": 59000 }, { "epoch": 2.1263560024507155, "eval_loss": 0.43649742007255554, "eval_runtime": 3.5299, "eval_samples_per_second": 28.33, "eval_steps_per_second": 7.082, "step": 59000 }, { "epoch": 2.12653620211194, "grad_norm": 0.24366778135299683, "learning_rate": 3.2173228328423095e-05, "loss": 0.3883, "step": 59005 }, { "epoch": 2.1267164017731646, "grad_norm": 0.18067045509815216, "learning_rate": 
3.217043284814614e-05, "loss": 0.4133, "step": 59010 }, { "epoch": 2.1268966014343893, "grad_norm": 0.18387949466705322, "learning_rate": 3.2167637270173425e-05, "loss": 0.4048, "step": 59015 }, { "epoch": 2.127076801095614, "grad_norm": 0.19910840690135956, "learning_rate": 3.2164841594543044e-05, "loss": 0.4296, "step": 59020 }, { "epoch": 2.1272570007568388, "grad_norm": 0.24461115896701813, "learning_rate": 3.216204582129308e-05, "loss": 0.3926, "step": 59025 }, { "epoch": 2.127437200418063, "grad_norm": 0.2087002843618393, "learning_rate": 3.215924995046163e-05, "loss": 0.4611, "step": 59030 }, { "epoch": 2.127617400079288, "grad_norm": 0.20317687094211578, "learning_rate": 3.215645398208678e-05, "loss": 0.4292, "step": 59035 }, { "epoch": 2.1277975997405125, "grad_norm": 0.17067931592464447, "learning_rate": 3.215365791620664e-05, "loss": 0.4072, "step": 59040 }, { "epoch": 2.1279777994017373, "grad_norm": 0.23524482548236847, "learning_rate": 3.2150861752859286e-05, "loss": 0.405, "step": 59045 }, { "epoch": 2.128157999062962, "grad_norm": 0.24678029119968414, "learning_rate": 3.214806549208283e-05, "loss": 0.4318, "step": 59050 }, { "epoch": 2.1283381987241863, "grad_norm": 0.22866828739643097, "learning_rate": 3.214526913391536e-05, "loss": 0.3886, "step": 59055 }, { "epoch": 2.128518398385411, "grad_norm": 0.2560563385486603, "learning_rate": 3.2142472678395e-05, "loss": 0.422, "step": 59060 }, { "epoch": 2.1286985980466357, "grad_norm": 0.23088377714157104, "learning_rate": 3.213967612555981e-05, "loss": 0.4044, "step": 59065 }, { "epoch": 2.1288787977078605, "grad_norm": 0.21493729948997498, "learning_rate": 3.2136879475447924e-05, "loss": 0.3938, "step": 59070 }, { "epoch": 2.1290589973690848, "grad_norm": 0.1736428588628769, "learning_rate": 3.213408272809744e-05, "loss": 0.4205, "step": 59075 }, { "epoch": 2.1292391970303095, "grad_norm": 0.18080323934555054, "learning_rate": 3.213128588354645e-05, "loss": 0.363, "step": 59080 }, { "epoch": 
2.1294193966915342, "grad_norm": 0.19538411498069763, "learning_rate": 3.212848894183308e-05, "loss": 0.3855, "step": 59085 }, { "epoch": 2.129599596352759, "grad_norm": 0.2233283966779709, "learning_rate": 3.212569190299542e-05, "loss": 0.4047, "step": 59090 }, { "epoch": 2.1297797960139837, "grad_norm": 0.19378229975700378, "learning_rate": 3.21228947670716e-05, "loss": 0.3827, "step": 59095 }, { "epoch": 2.129959995675208, "grad_norm": 0.18152061104774475, "learning_rate": 3.212009753409971e-05, "loss": 0.4019, "step": 59100 }, { "epoch": 2.1301401953364327, "grad_norm": 0.2508603632450104, "learning_rate": 3.2117300204117876e-05, "loss": 0.399, "step": 59105 }, { "epoch": 2.1303203949976575, "grad_norm": 0.16633902490139008, "learning_rate": 3.211450277716419e-05, "loss": 0.3941, "step": 59110 }, { "epoch": 2.130500594658882, "grad_norm": 0.1936190277338028, "learning_rate": 3.211170525327679e-05, "loss": 0.4041, "step": 59115 }, { "epoch": 2.1306807943201065, "grad_norm": 0.2269958257675171, "learning_rate": 3.210890763249379e-05, "loss": 0.4195, "step": 59120 }, { "epoch": 2.130860993981331, "grad_norm": 0.18237541615962982, "learning_rate": 3.210610991485329e-05, "loss": 0.3866, "step": 59125 }, { "epoch": 2.131041193642556, "grad_norm": 0.19430314004421234, "learning_rate": 3.210331210039342e-05, "loss": 0.4121, "step": 59130 }, { "epoch": 2.1312213933037807, "grad_norm": 0.18985673785209656, "learning_rate": 3.2100514189152297e-05, "loss": 0.4231, "step": 59135 }, { "epoch": 2.1314015929650054, "grad_norm": 0.19704566895961761, "learning_rate": 3.209771618116805e-05, "loss": 0.3891, "step": 59140 }, { "epoch": 2.1315817926262297, "grad_norm": 0.20834749937057495, "learning_rate": 3.209491807647879e-05, "loss": 0.3716, "step": 59145 }, { "epoch": 2.1317619922874544, "grad_norm": 0.15605294704437256, "learning_rate": 3.2092119875122636e-05, "loss": 0.3832, "step": 59150 }, { "epoch": 2.131942191948679, "grad_norm": 0.19391027092933655, "learning_rate": 
3.208932157713773e-05, "loss": 0.4034, "step": 59155 }, { "epoch": 2.132122391609904, "grad_norm": 0.20829804241657257, "learning_rate": 3.208652318256219e-05, "loss": 0.3805, "step": 59160 }, { "epoch": 2.132302591271128, "grad_norm": 0.21649335324764252, "learning_rate": 3.2083724691434145e-05, "loss": 0.4418, "step": 59165 }, { "epoch": 2.132482790932353, "grad_norm": 0.18842458724975586, "learning_rate": 3.208092610379172e-05, "loss": 0.396, "step": 59170 }, { "epoch": 2.1326629905935777, "grad_norm": 0.21697010099887848, "learning_rate": 3.2078127419673046e-05, "loss": 0.3576, "step": 59175 }, { "epoch": 2.1328431902548024, "grad_norm": 0.20509681105613708, "learning_rate": 3.207532863911627e-05, "loss": 0.4059, "step": 59180 }, { "epoch": 2.133023389916027, "grad_norm": 0.19861997663974762, "learning_rate": 3.2072529762159495e-05, "loss": 0.3848, "step": 59185 }, { "epoch": 2.1332035895772514, "grad_norm": 0.17639335989952087, "learning_rate": 3.206973078884087e-05, "loss": 0.4051, "step": 59190 }, { "epoch": 2.133383789238476, "grad_norm": 0.2027795910835266, "learning_rate": 3.206693171919854e-05, "loss": 0.4244, "step": 59195 }, { "epoch": 2.133563988899701, "grad_norm": 0.18750731647014618, "learning_rate": 3.206413255327063e-05, "loss": 0.4224, "step": 59200 }, { "epoch": 2.1337441885609256, "grad_norm": 0.21276669204235077, "learning_rate": 3.206133329109529e-05, "loss": 0.4186, "step": 59205 }, { "epoch": 2.13392438822215, "grad_norm": 0.17720721662044525, "learning_rate": 3.205853393271064e-05, "loss": 0.3877, "step": 59210 }, { "epoch": 2.1341045878833746, "grad_norm": 0.2149345576763153, "learning_rate": 3.2055734478154834e-05, "loss": 0.3937, "step": 59215 }, { "epoch": 2.1342847875445994, "grad_norm": 0.18279078602790833, "learning_rate": 3.2052934927466016e-05, "loss": 0.3948, "step": 59220 }, { "epoch": 2.134464987205824, "grad_norm": 0.24067635834217072, "learning_rate": 3.2050135280682327e-05, "loss": 0.4092, "step": 59225 }, { "epoch": 
2.134645186867049, "grad_norm": 0.22247518599033356, "learning_rate": 3.20473355378419e-05, "loss": 0.4003, "step": 59230 }, { "epoch": 2.134825386528273, "grad_norm": 0.16726024448871613, "learning_rate": 3.2044535698982895e-05, "loss": 0.3806, "step": 59235 }, { "epoch": 2.135005586189498, "grad_norm": 0.18711987137794495, "learning_rate": 3.204173576414345e-05, "loss": 0.3994, "step": 59240 }, { "epoch": 2.1351857858507226, "grad_norm": 0.1729484349489212, "learning_rate": 3.2038935733361734e-05, "loss": 0.3954, "step": 59245 }, { "epoch": 2.1353659855119473, "grad_norm": 0.16871348023414612, "learning_rate": 3.203613560667587e-05, "loss": 0.3775, "step": 59250 }, { "epoch": 2.135546185173172, "grad_norm": 0.21796941757202148, "learning_rate": 3.203333538412402e-05, "loss": 0.4258, "step": 59255 }, { "epoch": 2.1357263848343964, "grad_norm": 0.15358436107635498, "learning_rate": 3.203053506574434e-05, "loss": 0.3841, "step": 59260 }, { "epoch": 2.135906584495621, "grad_norm": 0.2047918438911438, "learning_rate": 3.202773465157498e-05, "loss": 0.4093, "step": 59265 }, { "epoch": 2.136086784156846, "grad_norm": 0.22522784769535065, "learning_rate": 3.20249341416541e-05, "loss": 0.3782, "step": 59270 }, { "epoch": 2.1362669838180706, "grad_norm": 0.20467866957187653, "learning_rate": 3.202213353601985e-05, "loss": 0.3872, "step": 59275 }, { "epoch": 2.136447183479295, "grad_norm": 0.24087628722190857, "learning_rate": 3.201933283471039e-05, "loss": 0.4176, "step": 59280 }, { "epoch": 2.1366273831405196, "grad_norm": 0.20225749909877777, "learning_rate": 3.201653203776388e-05, "loss": 0.3515, "step": 59285 }, { "epoch": 2.1368075828017443, "grad_norm": 0.24749909341335297, "learning_rate": 3.201373114521847e-05, "loss": 0.3979, "step": 59290 }, { "epoch": 2.136987782462969, "grad_norm": 0.14851811528205872, "learning_rate": 3.201093015711234e-05, "loss": 0.3873, "step": 59295 }, { "epoch": 2.137167982124194, "grad_norm": 0.2001311480998993, "learning_rate": 
3.200812907348364e-05, "loss": 0.3825, "step": 59300 }, { "epoch": 2.137348181785418, "grad_norm": 0.1819891780614853, "learning_rate": 3.200532789437055e-05, "loss": 0.4069, "step": 59305 }, { "epoch": 2.137528381446643, "grad_norm": 0.20626020431518555, "learning_rate": 3.200252661981121e-05, "loss": 0.4005, "step": 59310 }, { "epoch": 2.1377085811078675, "grad_norm": 0.15189993381500244, "learning_rate": 3.1999725249843806e-05, "loss": 0.3802, "step": 59315 }, { "epoch": 2.1378887807690923, "grad_norm": 0.2139143943786621, "learning_rate": 3.1996923784506494e-05, "loss": 0.3658, "step": 59320 }, { "epoch": 2.138068980430317, "grad_norm": 0.17770539224147797, "learning_rate": 3.199412222383746e-05, "loss": 0.3744, "step": 59325 }, { "epoch": 2.1382491800915413, "grad_norm": 0.18923962116241455, "learning_rate": 3.1991320567874863e-05, "loss": 0.399, "step": 59330 }, { "epoch": 2.138429379752766, "grad_norm": 0.2178812175989151, "learning_rate": 3.198851881665687e-05, "loss": 0.3778, "step": 59335 }, { "epoch": 2.1386095794139908, "grad_norm": 0.16876627504825592, "learning_rate": 3.198571697022167e-05, "loss": 0.4179, "step": 59340 }, { "epoch": 2.1387897790752155, "grad_norm": 0.17547792196273804, "learning_rate": 3.198291502860742e-05, "loss": 0.3674, "step": 59345 }, { "epoch": 2.13896997873644, "grad_norm": 0.20084445178508759, "learning_rate": 3.198011299185232e-05, "loss": 0.3989, "step": 59350 }, { "epoch": 2.1391501783976645, "grad_norm": 0.1786831021308899, "learning_rate": 3.197731085999451e-05, "loss": 0.3793, "step": 59355 }, { "epoch": 2.1393303780588893, "grad_norm": 0.1777740865945816, "learning_rate": 3.197450863307221e-05, "loss": 0.3599, "step": 59360 }, { "epoch": 2.139510577720114, "grad_norm": 0.19308814406394958, "learning_rate": 3.1971706311123564e-05, "loss": 0.422, "step": 59365 }, { "epoch": 2.1396907773813387, "grad_norm": 0.23540256917476654, "learning_rate": 3.196890389418678e-05, "loss": 0.4141, "step": 59370 }, { "epoch": 
2.139870977042563, "grad_norm": 0.2817511260509491, "learning_rate": 3.196610138230003e-05, "loss": 0.3921, "step": 59375 }, { "epoch": 2.1400511767037877, "grad_norm": 0.2218995839357376, "learning_rate": 3.196329877550149e-05, "loss": 0.3879, "step": 59380 }, { "epoch": 2.1402313763650125, "grad_norm": 0.1908760666847229, "learning_rate": 3.196049607382936e-05, "loss": 0.3864, "step": 59385 }, { "epoch": 2.140411576026237, "grad_norm": 0.22384685277938843, "learning_rate": 3.195769327732181e-05, "loss": 0.3835, "step": 59390 }, { "epoch": 2.1405917756874615, "grad_norm": 0.20056529343128204, "learning_rate": 3.195489038601704e-05, "loss": 0.3914, "step": 59395 }, { "epoch": 2.1407719753486862, "grad_norm": 0.1963140219449997, "learning_rate": 3.1952087399953236e-05, "loss": 0.4141, "step": 59400 }, { "epoch": 2.140952175009911, "grad_norm": 0.20005357265472412, "learning_rate": 3.194928431916858e-05, "loss": 0.4379, "step": 59405 }, { "epoch": 2.1411323746711357, "grad_norm": 0.1524960994720459, "learning_rate": 3.194648114370129e-05, "loss": 0.3916, "step": 59410 }, { "epoch": 2.1413125743323604, "grad_norm": 0.20259173214435577, "learning_rate": 3.1943677873589525e-05, "loss": 0.4089, "step": 59415 }, { "epoch": 2.1414927739935847, "grad_norm": 0.23878759145736694, "learning_rate": 3.1940874508871496e-05, "loss": 0.4275, "step": 59420 }, { "epoch": 2.1416729736548095, "grad_norm": 0.15623390674591064, "learning_rate": 3.193807104958539e-05, "loss": 0.4258, "step": 59425 }, { "epoch": 2.141853173316034, "grad_norm": 0.1934848129749298, "learning_rate": 3.1935267495769416e-05, "loss": 0.3741, "step": 59430 }, { "epoch": 2.142033372977259, "grad_norm": 0.2278786599636078, "learning_rate": 3.193246384746176e-05, "loss": 0.417, "step": 59435 }, { "epoch": 2.142213572638483, "grad_norm": 0.22830450534820557, "learning_rate": 3.192966010470063e-05, "loss": 0.3854, "step": 59440 }, { "epoch": 2.142393772299708, "grad_norm": 0.236038476228714, "learning_rate": 
3.192685626752422e-05, "loss": 0.4147, "step": 59445 }, { "epoch": 2.1425739719609327, "grad_norm": 0.2309621423482895, "learning_rate": 3.1924052335970736e-05, "loss": 0.4035, "step": 59450 }, { "epoch": 2.1427541716221574, "grad_norm": 0.2083134800195694, "learning_rate": 3.1921248310078386e-05, "loss": 0.4002, "step": 59455 }, { "epoch": 2.142934371283382, "grad_norm": 0.22884568572044373, "learning_rate": 3.1918444189885355e-05, "loss": 0.4022, "step": 59460 }, { "epoch": 2.1431145709446064, "grad_norm": 0.18977230787277222, "learning_rate": 3.191563997542987e-05, "loss": 0.375, "step": 59465 }, { "epoch": 2.143294770605831, "grad_norm": 0.21738559007644653, "learning_rate": 3.191283566675013e-05, "loss": 0.3757, "step": 59470 }, { "epoch": 2.143474970267056, "grad_norm": 0.19232676923274994, "learning_rate": 3.1910031263884335e-05, "loss": 0.4058, "step": 59475 }, { "epoch": 2.1436551699282806, "grad_norm": 0.21828657388687134, "learning_rate": 3.1907226766870714e-05, "loss": 0.4174, "step": 59480 }, { "epoch": 2.1438353695895054, "grad_norm": 0.19182120263576508, "learning_rate": 3.190442217574745e-05, "loss": 0.3884, "step": 59485 }, { "epoch": 2.1440155692507297, "grad_norm": 0.16395203769207, "learning_rate": 3.190161749055279e-05, "loss": 0.3748, "step": 59490 }, { "epoch": 2.1441957689119544, "grad_norm": 0.18212929368019104, "learning_rate": 3.189881271132491e-05, "loss": 0.3492, "step": 59495 }, { "epoch": 2.144375968573179, "grad_norm": 0.19863685965538025, "learning_rate": 3.189600783810205e-05, "loss": 0.3785, "step": 59500 }, { "epoch": 2.144375968573179, "eval_loss": 0.43618133664131165, "eval_runtime": 3.5383, "eval_samples_per_second": 28.262, "eval_steps_per_second": 7.065, "step": 59500 }, { "epoch": 2.144556168234404, "grad_norm": 0.17493978142738342, "learning_rate": 3.1893202870922414e-05, "loss": 0.3907, "step": 59505 }, { "epoch": 2.144736367895628, "grad_norm": 0.18247364461421967, "learning_rate": 3.189039780982423e-05, "loss": 0.4129, 
"step": 59510 }, { "epoch": 2.144916567556853, "grad_norm": 0.2383251041173935, "learning_rate": 3.188759265484571e-05, "loss": 0.3971, "step": 59515 }, { "epoch": 2.1450967672180776, "grad_norm": 0.1882295161485672, "learning_rate": 3.188478740602506e-05, "loss": 0.3815, "step": 59520 }, { "epoch": 2.1452769668793024, "grad_norm": 0.19644425809383392, "learning_rate": 3.1881982063400526e-05, "loss": 0.3999, "step": 59525 }, { "epoch": 2.145457166540527, "grad_norm": 0.1579708456993103, "learning_rate": 3.1879176627010324e-05, "loss": 0.3654, "step": 59530 }, { "epoch": 2.1456373662017514, "grad_norm": 0.23646564781665802, "learning_rate": 3.187637109689267e-05, "loss": 0.3908, "step": 59535 }, { "epoch": 2.145817565862976, "grad_norm": 0.2409384548664093, "learning_rate": 3.187356547308578e-05, "loss": 0.4183, "step": 59540 }, { "epoch": 2.145997765524201, "grad_norm": 0.19728174805641174, "learning_rate": 3.18707597556279e-05, "loss": 0.4135, "step": 59545 }, { "epoch": 2.1461779651854256, "grad_norm": 0.22944805026054382, "learning_rate": 3.186795394455725e-05, "loss": 0.4424, "step": 59550 }, { "epoch": 2.1463581648466503, "grad_norm": 0.21976561844348907, "learning_rate": 3.186514803991205e-05, "loss": 0.4277, "step": 59555 }, { "epoch": 2.1465383645078746, "grad_norm": 0.2405269593000412, "learning_rate": 3.1862342041730545e-05, "loss": 0.3874, "step": 59560 }, { "epoch": 2.1467185641690993, "grad_norm": 0.18481574952602386, "learning_rate": 3.185953595005095e-05, "loss": 0.3858, "step": 59565 }, { "epoch": 2.146898763830324, "grad_norm": 0.1834956705570221, "learning_rate": 3.1856729764911506e-05, "loss": 0.4078, "step": 59570 }, { "epoch": 2.147078963491549, "grad_norm": 0.2032243311405182, "learning_rate": 3.1853923486350455e-05, "loss": 0.4089, "step": 59575 }, { "epoch": 2.147259163152773, "grad_norm": 0.19479459524154663, "learning_rate": 3.185111711440601e-05, "loss": 0.4115, "step": 59580 }, { "epoch": 2.147439362813998, "grad_norm": 
0.20436975359916687, "learning_rate": 3.184831064911644e-05, "loss": 0.3796, "step": 59585 }, { "epoch": 2.1476195624752226, "grad_norm": 0.2070939838886261, "learning_rate": 3.1845504090519954e-05, "loss": 0.3942, "step": 59590 }, { "epoch": 2.1477997621364473, "grad_norm": 0.18167893588542938, "learning_rate": 3.18426974386548e-05, "loss": 0.4023, "step": 59595 }, { "epoch": 2.147979961797672, "grad_norm": 0.19285474717617035, "learning_rate": 3.1839890693559216e-05, "loss": 0.4162, "step": 59600 }, { "epoch": 2.1481601614588963, "grad_norm": 0.23616044223308563, "learning_rate": 3.183708385527144e-05, "loss": 0.3866, "step": 59605 }, { "epoch": 2.148340361120121, "grad_norm": 0.2100156992673874, "learning_rate": 3.1834276923829734e-05, "loss": 0.4258, "step": 59610 }, { "epoch": 2.148520560781346, "grad_norm": 0.23628145456314087, "learning_rate": 3.183146989927232e-05, "loss": 0.4279, "step": 59615 }, { "epoch": 2.1487007604425705, "grad_norm": 0.17379525303840637, "learning_rate": 3.1828662781637455e-05, "loss": 0.4271, "step": 59620 }, { "epoch": 2.148880960103795, "grad_norm": 0.20716167986392975, "learning_rate": 3.182585557096337e-05, "loss": 0.4243, "step": 59625 }, { "epoch": 2.1490611597650195, "grad_norm": 0.19413475692272186, "learning_rate": 3.1823048267288336e-05, "loss": 0.3949, "step": 59630 }, { "epoch": 2.1492413594262443, "grad_norm": 0.2092110812664032, "learning_rate": 3.182024087065059e-05, "loss": 0.3402, "step": 59635 }, { "epoch": 2.149421559087469, "grad_norm": 0.18373681604862213, "learning_rate": 3.1817433381088385e-05, "loss": 0.3934, "step": 59640 }, { "epoch": 2.1496017587486937, "grad_norm": 0.2630181908607483, "learning_rate": 3.181462579863996e-05, "loss": 0.415, "step": 59645 }, { "epoch": 2.149781958409918, "grad_norm": 0.20736628770828247, "learning_rate": 3.1811818123343584e-05, "loss": 0.4196, "step": 59650 }, { "epoch": 2.1499621580711428, "grad_norm": 0.2118118554353714, "learning_rate": 3.180901035523751e-05, "loss": 
0.408, "step": 59655 }, { "epoch": 2.1501423577323675, "grad_norm": 0.218112975358963, "learning_rate": 3.180620249435998e-05, "loss": 0.3967, "step": 59660 }, { "epoch": 2.1503225573935922, "grad_norm": 0.19679492712020874, "learning_rate": 3.180339454074926e-05, "loss": 0.3891, "step": 59665 }, { "epoch": 2.1505027570548165, "grad_norm": 0.19406099617481232, "learning_rate": 3.18005864944436e-05, "loss": 0.406, "step": 59670 }, { "epoch": 2.1506829567160413, "grad_norm": 0.21432217955589294, "learning_rate": 3.1797778355481285e-05, "loss": 0.4209, "step": 59675 }, { "epoch": 2.150863156377266, "grad_norm": 0.20044691860675812, "learning_rate": 3.179497012390054e-05, "loss": 0.3638, "step": 59680 }, { "epoch": 2.1510433560384907, "grad_norm": 0.2646994888782501, "learning_rate": 3.179216179973964e-05, "loss": 0.4082, "step": 59685 }, { "epoch": 2.1512235556997155, "grad_norm": 0.1870126724243164, "learning_rate": 3.178935338303686e-05, "loss": 0.3992, "step": 59690 }, { "epoch": 2.1514037553609398, "grad_norm": 0.19518372416496277, "learning_rate": 3.178654487383045e-05, "loss": 0.359, "step": 59695 }, { "epoch": 2.1515839550221645, "grad_norm": 0.18647782504558563, "learning_rate": 3.178373627215869e-05, "loss": 0.4009, "step": 59700 }, { "epoch": 2.151764154683389, "grad_norm": 0.16734398901462555, "learning_rate": 3.178092757805982e-05, "loss": 0.4091, "step": 59705 }, { "epoch": 2.151944354344614, "grad_norm": 0.1782519370317459, "learning_rate": 3.177811879157214e-05, "loss": 0.384, "step": 59710 }, { "epoch": 2.1521245540058382, "grad_norm": 0.16030040383338928, "learning_rate": 3.1775309912733897e-05, "loss": 0.3843, "step": 59715 }, { "epoch": 2.152304753667063, "grad_norm": 0.19871199131011963, "learning_rate": 3.177250094158336e-05, "loss": 0.4164, "step": 59720 }, { "epoch": 2.1524849533282877, "grad_norm": 0.2312350869178772, "learning_rate": 3.1769691878158823e-05, "loss": 0.3661, "step": 59725 }, { "epoch": 2.1526651529895124, "grad_norm": 
0.20747533440589905, "learning_rate": 3.176688272249854e-05, "loss": 0.3765, "step": 59730 }, { "epoch": 2.152845352650737, "grad_norm": 0.19539090991020203, "learning_rate": 3.176407347464079e-05, "loss": 0.4251, "step": 59735 }, { "epoch": 2.1530255523119615, "grad_norm": 0.21648086607456207, "learning_rate": 3.1761264134623844e-05, "loss": 0.3954, "step": 59740 }, { "epoch": 2.153205751973186, "grad_norm": 0.21644406020641327, "learning_rate": 3.175845470248599e-05, "loss": 0.3859, "step": 59745 }, { "epoch": 2.153385951634411, "grad_norm": 0.19673486053943634, "learning_rate": 3.175564517826549e-05, "loss": 0.3904, "step": 59750 }, { "epoch": 2.1535661512956357, "grad_norm": 0.22667108476161957, "learning_rate": 3.1752835562000636e-05, "loss": 0.4018, "step": 59755 }, { "epoch": 2.1537463509568604, "grad_norm": 0.22213202714920044, "learning_rate": 3.175002585372971e-05, "loss": 0.3882, "step": 59760 }, { "epoch": 2.1539265506180847, "grad_norm": 0.1795310080051422, "learning_rate": 3.174721605349099e-05, "loss": 0.3719, "step": 59765 }, { "epoch": 2.1541067502793094, "grad_norm": 0.20347318053245544, "learning_rate": 3.174440616132275e-05, "loss": 0.3895, "step": 59770 }, { "epoch": 2.154286949940534, "grad_norm": 0.2064104527235031, "learning_rate": 3.174159617726329e-05, "loss": 0.3691, "step": 59775 }, { "epoch": 2.154467149601759, "grad_norm": 0.22160401940345764, "learning_rate": 3.1738786101350884e-05, "loss": 0.394, "step": 59780 }, { "epoch": 2.154647349262983, "grad_norm": 0.18739666044712067, "learning_rate": 3.173597593362381e-05, "loss": 0.393, "step": 59785 }, { "epoch": 2.154827548924208, "grad_norm": 0.21694833040237427, "learning_rate": 3.173316567412038e-05, "loss": 0.3736, "step": 59790 }, { "epoch": 2.1550077485854326, "grad_norm": 0.2537897229194641, "learning_rate": 3.173035532287887e-05, "loss": 0.3989, "step": 59795 }, { "epoch": 2.1551879482466574, "grad_norm": 0.16658885776996613, "learning_rate": 3.172754487993757e-05, "loss": 0.4302, 
"step": 59800 }, { "epoch": 2.155368147907882, "grad_norm": 0.18496111035346985, "learning_rate": 3.1724734345334775e-05, "loss": 0.3784, "step": 59805 }, { "epoch": 2.1555483475691064, "grad_norm": 0.19085146486759186, "learning_rate": 3.1721923719108775e-05, "loss": 0.3818, "step": 59810 }, { "epoch": 2.155728547230331, "grad_norm": 0.21254359185695648, "learning_rate": 3.1719113001297866e-05, "loss": 0.4093, "step": 59815 }, { "epoch": 2.155908746891556, "grad_norm": 0.16708926856517792, "learning_rate": 3.171630219194035e-05, "loss": 0.4064, "step": 59820 }, { "epoch": 2.1560889465527806, "grad_norm": 0.20576465129852295, "learning_rate": 3.171349129107451e-05, "loss": 0.4028, "step": 59825 }, { "epoch": 2.1562691462140053, "grad_norm": 0.16969583928585052, "learning_rate": 3.171068029873865e-05, "loss": 0.3922, "step": 59830 }, { "epoch": 2.1564493458752296, "grad_norm": 0.1903369426727295, "learning_rate": 3.170786921497107e-05, "loss": 0.3691, "step": 59835 }, { "epoch": 2.1566295455364544, "grad_norm": 0.1657458245754242, "learning_rate": 3.1705058039810075e-05, "loss": 0.385, "step": 59840 }, { "epoch": 2.156809745197679, "grad_norm": 0.17674793303012848, "learning_rate": 3.170224677329396e-05, "loss": 0.3697, "step": 59845 }, { "epoch": 2.156989944858904, "grad_norm": 0.1935698390007019, "learning_rate": 3.1699435415461034e-05, "loss": 0.3688, "step": 59850 }, { "epoch": 2.157170144520128, "grad_norm": 0.22389110922813416, "learning_rate": 3.1696623966349586e-05, "loss": 0.4113, "step": 59855 }, { "epoch": 2.157350344181353, "grad_norm": 0.2064765840768814, "learning_rate": 3.1693812425997946e-05, "loss": 0.3933, "step": 59860 }, { "epoch": 2.1575305438425776, "grad_norm": 0.17828290164470673, "learning_rate": 3.1691000794444404e-05, "loss": 0.4174, "step": 59865 }, { "epoch": 2.1577107435038023, "grad_norm": 0.18194176256656647, "learning_rate": 3.168818907172727e-05, "loss": 0.421, "step": 59870 }, { "epoch": 2.157890943165027, "grad_norm": 
0.1584172546863556, "learning_rate": 3.168537725788485e-05, "loss": 0.4051, "step": 59875 }, { "epoch": 2.1580711428262513, "grad_norm": 0.21431034803390503, "learning_rate": 3.168256535295547e-05, "loss": 0.4027, "step": 59880 }, { "epoch": 2.158251342487476, "grad_norm": 0.18599197268486023, "learning_rate": 3.167975335697743e-05, "loss": 0.4038, "step": 59885 }, { "epoch": 2.158431542148701, "grad_norm": 0.18698103725910187, "learning_rate": 3.167694126998903e-05, "loss": 0.4269, "step": 59890 }, { "epoch": 2.1586117418099255, "grad_norm": 0.2108622044324875, "learning_rate": 3.1674129092028607e-05, "loss": 0.4332, "step": 59895 }, { "epoch": 2.15879194147115, "grad_norm": 0.22758710384368896, "learning_rate": 3.167131682313447e-05, "loss": 0.4095, "step": 59900 }, { "epoch": 2.1589721411323746, "grad_norm": 0.17931969463825226, "learning_rate": 3.1668504463344926e-05, "loss": 0.4192, "step": 59905 }, { "epoch": 2.1591523407935993, "grad_norm": 0.1740187406539917, "learning_rate": 3.166569201269831e-05, "loss": 0.4038, "step": 59910 }, { "epoch": 2.159332540454824, "grad_norm": 0.199792742729187, "learning_rate": 3.166287947123292e-05, "loss": 0.4077, "step": 59915 }, { "epoch": 2.1595127401160488, "grad_norm": 0.22314900159835815, "learning_rate": 3.1660066838987095e-05, "loss": 0.4176, "step": 59920 }, { "epoch": 2.159692939777273, "grad_norm": 0.2095964401960373, "learning_rate": 3.165725411599914e-05, "loss": 0.391, "step": 59925 }, { "epoch": 2.159873139438498, "grad_norm": 0.2291499823331833, "learning_rate": 3.16544413023074e-05, "loss": 0.4169, "step": 59930 }, { "epoch": 2.1600533390997225, "grad_norm": 0.17338679730892181, "learning_rate": 3.165162839795017e-05, "loss": 0.3882, "step": 59935 }, { "epoch": 2.1602335387609473, "grad_norm": 0.16860181093215942, "learning_rate": 3.16488154029658e-05, "loss": 0.3689, "step": 59940 }, { "epoch": 2.1604137384221715, "grad_norm": 0.2292938083410263, "learning_rate": 3.164600231739261e-05, "loss": 0.3986, 
"step": 59945 }, { "epoch": 2.1605939380833963, "grad_norm": 0.20005054771900177, "learning_rate": 3.164318914126891e-05, "loss": 0.4175, "step": 59950 }, { "epoch": 2.160774137744621, "grad_norm": 0.18958815932273865, "learning_rate": 3.164037587463306e-05, "loss": 0.3758, "step": 59955 }, { "epoch": 2.1609543374058457, "grad_norm": 0.218599334359169, "learning_rate": 3.1637562517523374e-05, "loss": 0.3645, "step": 59960 }, { "epoch": 2.1611345370670705, "grad_norm": 0.19358861446380615, "learning_rate": 3.163474906997818e-05, "loss": 0.3981, "step": 59965 }, { "epoch": 2.1613147367282948, "grad_norm": 0.21078740060329437, "learning_rate": 3.1631935532035814e-05, "loss": 0.3875, "step": 59970 }, { "epoch": 2.1614949363895195, "grad_norm": 0.14373530447483063, "learning_rate": 3.16291219037346e-05, "loss": 0.4011, "step": 59975 }, { "epoch": 2.1616751360507442, "grad_norm": 0.19193744659423828, "learning_rate": 3.16263081851129e-05, "loss": 0.3839, "step": 59980 }, { "epoch": 2.161855335711969, "grad_norm": 0.24244289100170135, "learning_rate": 3.162349437620903e-05, "loss": 0.3978, "step": 59985 }, { "epoch": 2.1620355353731937, "grad_norm": 0.24157002568244934, "learning_rate": 3.162068047706133e-05, "loss": 0.3703, "step": 59990 }, { "epoch": 2.162215735034418, "grad_norm": 0.20742827653884888, "learning_rate": 3.161786648770813e-05, "loss": 0.375, "step": 59995 }, { "epoch": 2.1623959346956427, "grad_norm": 0.1863773912191391, "learning_rate": 3.1615052408187797e-05, "loss": 0.3623, "step": 60000 }, { "epoch": 2.1623959346956427, "eval_loss": 0.4364284873008728, "eval_runtime": 3.5244, "eval_samples_per_second": 28.373, "eval_steps_per_second": 7.093, "step": 60000 }, { "epoch": 2.1625761343568675, "grad_norm": 0.21003571152687073, "learning_rate": 3.1612238238538646e-05, "loss": 0.3717, "step": 60005 }, { "epoch": 2.162756334018092, "grad_norm": 0.1988876461982727, "learning_rate": 3.160942397879903e-05, "loss": 0.3693, "step": 60010 }, { "epoch": 
2.1629365336793165, "grad_norm": 0.22915050387382507, "learning_rate": 3.160660962900729e-05, "loss": 0.3922, "step": 60015 }, { "epoch": 2.163116733340541, "grad_norm": 0.18933100998401642, "learning_rate": 3.160379518920177e-05, "loss": 0.4227, "step": 60020 }, { "epoch": 2.163296933001766, "grad_norm": 0.19733920693397522, "learning_rate": 3.160098065942084e-05, "loss": 0.3909, "step": 60025 }, { "epoch": 2.1634771326629907, "grad_norm": 0.21198596060276031, "learning_rate": 3.1598166039702805e-05, "loss": 0.3822, "step": 60030 }, { "epoch": 2.1636573323242154, "grad_norm": 0.20547007024288177, "learning_rate": 3.159535133008604e-05, "loss": 0.3746, "step": 60035 }, { "epoch": 2.1638375319854397, "grad_norm": 0.21266385912895203, "learning_rate": 3.1592536530608894e-05, "loss": 0.372, "step": 60040 }, { "epoch": 2.1640177316466644, "grad_norm": 0.21298128366470337, "learning_rate": 3.158972164130971e-05, "loss": 0.3811, "step": 60045 }, { "epoch": 2.164197931307889, "grad_norm": 0.17953209578990936, "learning_rate": 3.158690666222685e-05, "loss": 0.3941, "step": 60050 }, { "epoch": 2.164378130969114, "grad_norm": 0.15556363761425018, "learning_rate": 3.158409159339866e-05, "loss": 0.3773, "step": 60055 }, { "epoch": 2.1645583306303386, "grad_norm": 0.1946418732404709, "learning_rate": 3.158127643486349e-05, "loss": 0.3913, "step": 60060 }, { "epoch": 2.164738530291563, "grad_norm": 0.23327438533306122, "learning_rate": 3.157846118665971e-05, "loss": 0.4158, "step": 60065 }, { "epoch": 2.1649187299527877, "grad_norm": 0.1919216811656952, "learning_rate": 3.1575645848825675e-05, "loss": 0.3949, "step": 60070 }, { "epoch": 2.1650989296140124, "grad_norm": 0.18845520913600922, "learning_rate": 3.1572830421399724e-05, "loss": 0.4004, "step": 60075 }, { "epoch": 2.165279129275237, "grad_norm": 0.19371269643306732, "learning_rate": 3.1570014904420245e-05, "loss": 0.4108, "step": 60080 }, { "epoch": 2.1654593289364614, "grad_norm": 0.1754084974527359, "learning_rate": 
3.156719929792557e-05, "loss": 0.4292, "step": 60085 }, { "epoch": 2.165639528597686, "grad_norm": 0.2297254204750061, "learning_rate": 3.156438360195409e-05, "loss": 0.3867, "step": 60090 }, { "epoch": 2.165819728258911, "grad_norm": 0.18021254241466522, "learning_rate": 3.156156781654415e-05, "loss": 0.4032, "step": 60095 }, { "epoch": 2.1659999279201356, "grad_norm": 0.20905311405658722, "learning_rate": 3.155875194173411e-05, "loss": 0.4037, "step": 60100 }, { "epoch": 2.1661801275813604, "grad_norm": 0.1627390831708908, "learning_rate": 3.1555935977562354e-05, "loss": 0.3906, "step": 60105 }, { "epoch": 2.1663603272425846, "grad_norm": 0.2780887484550476, "learning_rate": 3.155311992406724e-05, "loss": 0.4073, "step": 60110 }, { "epoch": 2.1665405269038094, "grad_norm": 0.17551511526107788, "learning_rate": 3.155030378128713e-05, "loss": 0.4206, "step": 60115 }, { "epoch": 2.166720726565034, "grad_norm": 0.16955679655075073, "learning_rate": 3.15474875492604e-05, "loss": 0.402, "step": 60120 }, { "epoch": 2.166900926226259, "grad_norm": 0.16937977075576782, "learning_rate": 3.154467122802543e-05, "loss": 0.3718, "step": 60125 }, { "epoch": 2.167081125887483, "grad_norm": 0.18345296382904053, "learning_rate": 3.154185481762057e-05, "loss": 0.4043, "step": 60130 }, { "epoch": 2.167261325548708, "grad_norm": 0.19128628075122833, "learning_rate": 3.153903831808421e-05, "loss": 0.4128, "step": 60135 }, { "epoch": 2.1674415252099326, "grad_norm": 0.18635614216327667, "learning_rate": 3.153622172945472e-05, "loss": 0.3931, "step": 60140 }, { "epoch": 2.1676217248711573, "grad_norm": 0.22392097115516663, "learning_rate": 3.153340505177047e-05, "loss": 0.3779, "step": 60145 }, { "epoch": 2.167801924532382, "grad_norm": 0.18912804126739502, "learning_rate": 3.153058828506984e-05, "loss": 0.387, "step": 60150 }, { "epoch": 2.1679821241936064, "grad_norm": 0.17587023973464966, "learning_rate": 3.152777142939122e-05, "loss": 0.4231, "step": 60155 }, { "epoch": 
2.168162323854831, "grad_norm": 0.19323548674583435, "learning_rate": 3.152495448477296e-05, "loss": 0.3857, "step": 60160 }, { "epoch": 2.168342523516056, "grad_norm": 0.2253098487854004, "learning_rate": 3.152213745125348e-05, "loss": 0.3647, "step": 60165 }, { "epoch": 2.1685227231772806, "grad_norm": 0.26896703243255615, "learning_rate": 3.151932032887112e-05, "loss": 0.4163, "step": 60170 }, { "epoch": 2.168702922838505, "grad_norm": 0.16771872341632843, "learning_rate": 3.151650311766429e-05, "loss": 0.3935, "step": 60175 }, { "epoch": 2.1688831224997296, "grad_norm": 0.1727340817451477, "learning_rate": 3.1513685817671365e-05, "loss": 0.3809, "step": 60180 }, { "epoch": 2.1690633221609543, "grad_norm": 0.17910057306289673, "learning_rate": 3.151086842893074e-05, "loss": 0.4137, "step": 60185 }, { "epoch": 2.169243521822179, "grad_norm": 0.21039491891860962, "learning_rate": 3.150805095148079e-05, "loss": 0.3931, "step": 60190 }, { "epoch": 2.169423721483404, "grad_norm": 0.15780936181545258, "learning_rate": 3.15052333853599e-05, "loss": 0.3762, "step": 60195 }, { "epoch": 2.169603921144628, "grad_norm": 0.2158997654914856, "learning_rate": 3.150241573060647e-05, "loss": 0.413, "step": 60200 }, { "epoch": 2.169784120805853, "grad_norm": 0.22214800119400024, "learning_rate": 3.1499597987258876e-05, "loss": 0.406, "step": 60205 }, { "epoch": 2.1699643204670775, "grad_norm": 0.21257710456848145, "learning_rate": 3.149678015535553e-05, "loss": 0.4273, "step": 60210 }, { "epoch": 2.1701445201283023, "grad_norm": 0.18004845082759857, "learning_rate": 3.14939622349348e-05, "loss": 0.3658, "step": 60215 }, { "epoch": 2.1703247197895266, "grad_norm": 0.20979435741901398, "learning_rate": 3.149114422603511e-05, "loss": 0.397, "step": 60220 }, { "epoch": 2.1705049194507513, "grad_norm": 0.20553478598594666, "learning_rate": 3.148832612869482e-05, "loss": 0.3749, "step": 60225 }, { "epoch": 2.170685119111976, "grad_norm": 0.15849637985229492, "learning_rate": 
3.148550794295235e-05, "loss": 0.3821, "step": 60230 }, { "epoch": 2.1708653187732008, "grad_norm": 0.19608749449253082, "learning_rate": 3.148268966884609e-05, "loss": 0.3915, "step": 60235 }, { "epoch": 2.1710455184344255, "grad_norm": 0.16606873273849487, "learning_rate": 3.147987130641443e-05, "loss": 0.3923, "step": 60240 }, { "epoch": 2.17122571809565, "grad_norm": 0.17005008459091187, "learning_rate": 3.147705285569579e-05, "loss": 0.417, "step": 60245 }, { "epoch": 2.1714059177568745, "grad_norm": 0.1739397495985031, "learning_rate": 3.1474234316728554e-05, "loss": 0.4078, "step": 60250 }, { "epoch": 2.1715861174180993, "grad_norm": 0.21615131199359894, "learning_rate": 3.1471415689551124e-05, "loss": 0.3828, "step": 60255 }, { "epoch": 2.171766317079324, "grad_norm": 0.18913507461547852, "learning_rate": 3.1468596974201915e-05, "loss": 0.4276, "step": 60260 }, { "epoch": 2.1719465167405487, "grad_norm": 0.17284362018108368, "learning_rate": 3.1465778170719314e-05, "loss": 0.4342, "step": 60265 }, { "epoch": 2.172126716401773, "grad_norm": 0.21640555560588837, "learning_rate": 3.146295927914175e-05, "loss": 0.4252, "step": 60270 }, { "epoch": 2.1723069160629978, "grad_norm": 0.1958705186843872, "learning_rate": 3.1460140299507614e-05, "loss": 0.4443, "step": 60275 }, { "epoch": 2.1724871157242225, "grad_norm": 0.18334457278251648, "learning_rate": 3.145732123185531e-05, "loss": 0.3744, "step": 60280 }, { "epoch": 2.172667315385447, "grad_norm": 0.2335662692785263, "learning_rate": 3.1454502076223255e-05, "loss": 0.4132, "step": 60285 }, { "epoch": 2.1728475150466715, "grad_norm": 0.1668962836265564, "learning_rate": 3.145168283264987e-05, "loss": 0.3935, "step": 60290 }, { "epoch": 2.1730277147078962, "grad_norm": 0.2275708168745041, "learning_rate": 3.144886350117355e-05, "loss": 0.3886, "step": 60295 }, { "epoch": 2.173207914369121, "grad_norm": 0.21035301685333252, "learning_rate": 3.144604408183271e-05, "loss": 0.4101, "step": 60300 }, { "epoch": 
2.1733881140303457, "grad_norm": 0.20323359966278076, "learning_rate": 3.144322457466577e-05, "loss": 0.3783, "step": 60305 }, { "epoch": 2.1735683136915704, "grad_norm": 0.21606206893920898, "learning_rate": 3.144040497971115e-05, "loss": 0.4244, "step": 60310 }, { "epoch": 2.1737485133527947, "grad_norm": 0.1799047589302063, "learning_rate": 3.143758529700724e-05, "loss": 0.3813, "step": 60315 }, { "epoch": 2.1739287130140195, "grad_norm": 0.17974518239498138, "learning_rate": 3.143476552659249e-05, "loss": 0.4338, "step": 60320 }, { "epoch": 2.174108912675244, "grad_norm": 0.17822134494781494, "learning_rate": 3.14319456685053e-05, "loss": 0.3918, "step": 60325 }, { "epoch": 2.174289112336469, "grad_norm": 0.21287919580936432, "learning_rate": 3.1429125722784105e-05, "loss": 0.4043, "step": 60330 }, { "epoch": 2.1744693119976937, "grad_norm": 0.2515278160572052, "learning_rate": 3.1426305689467304e-05, "loss": 0.3708, "step": 60335 }, { "epoch": 2.174649511658918, "grad_norm": 0.20817367732524872, "learning_rate": 3.142348556859335e-05, "loss": 0.4246, "step": 60340 }, { "epoch": 2.1748297113201427, "grad_norm": 0.20883257687091827, "learning_rate": 3.142066536020063e-05, "loss": 0.4121, "step": 60345 }, { "epoch": 2.1750099109813674, "grad_norm": 0.2272225171327591, "learning_rate": 3.14178450643276e-05, "loss": 0.4404, "step": 60350 }, { "epoch": 2.175190110642592, "grad_norm": 0.20350000262260437, "learning_rate": 3.141502468101267e-05, "loss": 0.4316, "step": 60355 }, { "epoch": 2.1753703103038164, "grad_norm": 0.16587962210178375, "learning_rate": 3.141220421029427e-05, "loss": 0.367, "step": 60360 }, { "epoch": 2.175550509965041, "grad_norm": 0.16492369771003723, "learning_rate": 3.140938365221082e-05, "loss": 0.417, "step": 60365 }, { "epoch": 2.175730709626266, "grad_norm": 0.1600433737039566, "learning_rate": 3.140656300680077e-05, "loss": 0.3949, "step": 60370 }, { "epoch": 2.1759109092874906, "grad_norm": 0.2284023016691208, "learning_rate": 
3.140374227410254e-05, "loss": 0.433, "step": 60375 }, { "epoch": 2.1760911089487154, "grad_norm": 0.267976313829422, "learning_rate": 3.1400921454154553e-05, "loss": 0.3941, "step": 60380 }, { "epoch": 2.1762713086099397, "grad_norm": 0.19785836338996887, "learning_rate": 3.1398100546995256e-05, "loss": 0.4047, "step": 60385 }, { "epoch": 2.1764515082711644, "grad_norm": 0.19465455412864685, "learning_rate": 3.1395279552663075e-05, "loss": 0.4211, "step": 60390 }, { "epoch": 2.176631707932389, "grad_norm": 0.22500787675380707, "learning_rate": 3.139245847119646e-05, "loss": 0.4166, "step": 60395 }, { "epoch": 2.176811907593614, "grad_norm": 0.19240856170654297, "learning_rate": 3.1389637302633816e-05, "loss": 0.3926, "step": 60400 }, { "epoch": 2.176992107254838, "grad_norm": 0.22199541330337524, "learning_rate": 3.1386816047013615e-05, "loss": 0.3848, "step": 60405 }, { "epoch": 2.177172306916063, "grad_norm": 0.21987438201904297, "learning_rate": 3.1383994704374276e-05, "loss": 0.3841, "step": 60410 }, { "epoch": 2.1773525065772876, "grad_norm": 0.21120600402355194, "learning_rate": 3.138117327475425e-05, "loss": 0.3895, "step": 60415 }, { "epoch": 2.1775327062385124, "grad_norm": 0.14966708421707153, "learning_rate": 3.137835175819197e-05, "loss": 0.3998, "step": 60420 }, { "epoch": 2.177712905899737, "grad_norm": 0.18459533154964447, "learning_rate": 3.137553015472588e-05, "loss": 0.3855, "step": 60425 }, { "epoch": 2.1778931055609614, "grad_norm": 0.18099567294120789, "learning_rate": 3.1372708464394427e-05, "loss": 0.3645, "step": 60430 }, { "epoch": 2.178073305222186, "grad_norm": 0.20282356441020966, "learning_rate": 3.136988668723606e-05, "loss": 0.4062, "step": 60435 }, { "epoch": 2.178253504883411, "grad_norm": 0.22229717671871185, "learning_rate": 3.136706482328922e-05, "loss": 0.3824, "step": 60440 }, { "epoch": 2.1784337045446356, "grad_norm": 0.22084426879882812, "learning_rate": 3.1364242872592345e-05, "loss": 0.3796, "step": 60445 }, { "epoch": 
2.17861390420586, "grad_norm": 0.24100591242313385, "learning_rate": 3.1361420835183894e-05, "loss": 0.4144, "step": 60450 }, { "epoch": 2.1787941038670846, "grad_norm": 0.18646858632564545, "learning_rate": 3.1358598711102336e-05, "loss": 0.3665, "step": 60455 }, { "epoch": 2.1789743035283093, "grad_norm": 0.22386908531188965, "learning_rate": 3.135577650038608e-05, "loss": 0.3888, "step": 60460 }, { "epoch": 2.179154503189534, "grad_norm": 0.1991269886493683, "learning_rate": 3.135295420307361e-05, "loss": 0.4148, "step": 60465 }, { "epoch": 2.179334702850759, "grad_norm": 0.18925313651561737, "learning_rate": 3.135013181920336e-05, "loss": 0.3654, "step": 60470 }, { "epoch": 2.179514902511983, "grad_norm": 0.2189246565103531, "learning_rate": 3.13473093488138e-05, "loss": 0.3729, "step": 60475 }, { "epoch": 2.179695102173208, "grad_norm": 0.17129361629486084, "learning_rate": 3.134448679194338e-05, "loss": 0.4165, "step": 60480 }, { "epoch": 2.1798753018344326, "grad_norm": 0.20182783901691437, "learning_rate": 3.134166414863055e-05, "loss": 0.3914, "step": 60485 }, { "epoch": 2.1800555014956573, "grad_norm": 0.18997080624103546, "learning_rate": 3.1338841418913776e-05, "loss": 0.3937, "step": 60490 }, { "epoch": 2.1802357011568816, "grad_norm": 0.1698470115661621, "learning_rate": 3.133601860283152e-05, "loss": 0.3883, "step": 60495 }, { "epoch": 2.1804159008181063, "grad_norm": 0.2348821759223938, "learning_rate": 3.133319570042224e-05, "loss": 0.4298, "step": 60500 }, { "epoch": 2.1804159008181063, "eval_loss": 0.4370137155056, "eval_runtime": 3.5451, "eval_samples_per_second": 28.208, "eval_steps_per_second": 7.052, "step": 60500 }, { "epoch": 2.180596100479331, "grad_norm": 0.16450156271457672, "learning_rate": 3.1330372711724385e-05, "loss": 0.3918, "step": 60505 }, { "epoch": 2.180776300140556, "grad_norm": 0.23515671491622925, "learning_rate": 3.132754963677643e-05, "loss": 0.3826, "step": 60510 }, { "epoch": 2.1809564998017805, "grad_norm": 
0.1602397859096527, "learning_rate": 3.132472647561684e-05, "loss": 0.3654, "step": 60515 }, { "epoch": 2.181136699463005, "grad_norm": 0.15597917139530182, "learning_rate": 3.1321903228284064e-05, "loss": 0.3848, "step": 60520 }, { "epoch": 2.1813168991242295, "grad_norm": 0.1703244298696518, "learning_rate": 3.1319079894816595e-05, "loss": 0.4033, "step": 60525 }, { "epoch": 2.1814970987854543, "grad_norm": 0.19135898351669312, "learning_rate": 3.131625647525288e-05, "loss": 0.389, "step": 60530 }, { "epoch": 2.181677298446679, "grad_norm": 0.22922751307487488, "learning_rate": 3.1313432969631397e-05, "loss": 0.422, "step": 60535 }, { "epoch": 2.1818574981079037, "grad_norm": 0.24647971987724304, "learning_rate": 3.1310609377990603e-05, "loss": 0.4223, "step": 60540 }, { "epoch": 2.182037697769128, "grad_norm": 0.2550751268863678, "learning_rate": 3.130778570036898e-05, "loss": 0.4325, "step": 60545 }, { "epoch": 2.1822178974303528, "grad_norm": 0.20791274309158325, "learning_rate": 3.130496193680501e-05, "loss": 0.4028, "step": 60550 }, { "epoch": 2.1823980970915775, "grad_norm": 0.1871381402015686, "learning_rate": 3.130213808733714e-05, "loss": 0.3844, "step": 60555 }, { "epoch": 2.1825782967528022, "grad_norm": 0.2246485948562622, "learning_rate": 3.129931415200387e-05, "loss": 0.4274, "step": 60560 }, { "epoch": 2.182758496414027, "grad_norm": 0.22834356129169464, "learning_rate": 3.129649013084365e-05, "loss": 0.4065, "step": 60565 }, { "epoch": 2.1829386960752513, "grad_norm": 0.2202819585800171, "learning_rate": 3.1293666023894984e-05, "loss": 0.3958, "step": 60570 }, { "epoch": 2.183118895736476, "grad_norm": 0.23861894011497498, "learning_rate": 3.129084183119634e-05, "loss": 0.3895, "step": 60575 }, { "epoch": 2.1832990953977007, "grad_norm": 0.20570367574691772, "learning_rate": 3.128801755278618e-05, "loss": 0.3865, "step": 60580 }, { "epoch": 2.1834792950589255, "grad_norm": 0.2041841596364975, "learning_rate": 3.128519318870301e-05, "loss": 0.417, 
"step": 60585 }, { "epoch": 2.1836594947201498, "grad_norm": 0.19558176398277283, "learning_rate": 3.128236873898529e-05, "loss": 0.3732, "step": 60590 }, { "epoch": 2.1838396943813745, "grad_norm": 0.2021927386522293, "learning_rate": 3.1279544203671516e-05, "loss": 0.4129, "step": 60595 }, { "epoch": 2.184019894042599, "grad_norm": 0.19117502868175507, "learning_rate": 3.1276719582800176e-05, "loss": 0.4208, "step": 60600 }, { "epoch": 2.184200093703824, "grad_norm": 0.18669839203357697, "learning_rate": 3.127389487640974e-05, "loss": 0.4171, "step": 60605 }, { "epoch": 2.1843802933650487, "grad_norm": 0.18443822860717773, "learning_rate": 3.1271070084538703e-05, "loss": 0.4149, "step": 60610 }, { "epoch": 2.184560493026273, "grad_norm": 0.20405270159244537, "learning_rate": 3.126824520722554e-05, "loss": 0.3927, "step": 60615 }, { "epoch": 2.1847406926874977, "grad_norm": 0.19594557583332062, "learning_rate": 3.126542024450876e-05, "loss": 0.3956, "step": 60620 }, { "epoch": 2.1849208923487224, "grad_norm": 0.21037110686302185, "learning_rate": 3.126259519642684e-05, "loss": 0.3782, "step": 60625 }, { "epoch": 2.185101092009947, "grad_norm": 0.21924084424972534, "learning_rate": 3.125977006301828e-05, "loss": 0.3774, "step": 60630 }, { "epoch": 2.1852812916711715, "grad_norm": 0.22989316284656525, "learning_rate": 3.125694484432155e-05, "loss": 0.4115, "step": 60635 }, { "epoch": 2.185461491332396, "grad_norm": 0.20585152506828308, "learning_rate": 3.1254119540375173e-05, "loss": 0.3983, "step": 60640 }, { "epoch": 2.185641690993621, "grad_norm": 0.20063400268554688, "learning_rate": 3.1251294151217614e-05, "loss": 0.3712, "step": 60645 }, { "epoch": 2.1858218906548457, "grad_norm": 0.18336834013462067, "learning_rate": 3.1248468676887396e-05, "loss": 0.4123, "step": 60650 }, { "epoch": 2.1860020903160704, "grad_norm": 0.2062855362892151, "learning_rate": 3.124564311742299e-05, "loss": 0.3857, "step": 60655 }, { "epoch": 2.1861822899772947, "grad_norm": 
0.2320207953453064, "learning_rate": 3.124281747286291e-05, "loss": 0.4176, "step": 60660 }, { "epoch": 2.1863624896385194, "grad_norm": 0.17851118743419647, "learning_rate": 3.1239991743245656e-05, "loss": 0.3797, "step": 60665 }, { "epoch": 2.186542689299744, "grad_norm": 0.17879897356033325, "learning_rate": 3.123716592860971e-05, "loss": 0.3958, "step": 60670 }, { "epoch": 2.186722888960969, "grad_norm": 0.24118387699127197, "learning_rate": 3.12343400289936e-05, "loss": 0.4083, "step": 60675 }, { "epoch": 2.186903088622193, "grad_norm": 0.17323794960975647, "learning_rate": 3.123151404443581e-05, "loss": 0.3856, "step": 60680 }, { "epoch": 2.187083288283418, "grad_norm": 0.19309526681900024, "learning_rate": 3.122868797497485e-05, "loss": 0.3939, "step": 60685 }, { "epoch": 2.1872634879446426, "grad_norm": 0.20537187159061432, "learning_rate": 3.122586182064921e-05, "loss": 0.3747, "step": 60690 }, { "epoch": 2.1874436876058674, "grad_norm": 0.22961921989917755, "learning_rate": 3.122303558149742e-05, "loss": 0.4318, "step": 60695 }, { "epoch": 2.187623887267092, "grad_norm": 0.2008984237909317, "learning_rate": 3.122020925755797e-05, "loss": 0.4176, "step": 60700 }, { "epoch": 2.1878040869283164, "grad_norm": 0.2251536101102829, "learning_rate": 3.121738284886938e-05, "loss": 0.3891, "step": 60705 }, { "epoch": 2.187984286589541, "grad_norm": 0.16935603320598602, "learning_rate": 3.121455635547014e-05, "loss": 0.3819, "step": 60710 }, { "epoch": 2.188164486250766, "grad_norm": 0.20971350371837616, "learning_rate": 3.121172977739878e-05, "loss": 0.4172, "step": 60715 }, { "epoch": 2.1883446859119906, "grad_norm": 0.25330233573913574, "learning_rate": 3.1208903114693804e-05, "loss": 0.403, "step": 60720 }, { "epoch": 2.188524885573215, "grad_norm": 0.17658570408821106, "learning_rate": 3.1206076367393724e-05, "loss": 0.3783, "step": 60725 }, { "epoch": 2.1887050852344396, "grad_norm": 0.18725471198558807, "learning_rate": 3.1203249535537056e-05, "loss": 0.3887, 
"step": 60730 }, { "epoch": 2.1888852848956644, "grad_norm": 0.17408056557178497, "learning_rate": 3.1200422619162315e-05, "loss": 0.3688, "step": 60735 }, { "epoch": 2.189065484556889, "grad_norm": 0.21276924014091492, "learning_rate": 3.119759561830802e-05, "loss": 0.4114, "step": 60740 }, { "epoch": 2.189245684218114, "grad_norm": 0.2514096200466156, "learning_rate": 3.119476853301268e-05, "loss": 0.4205, "step": 60745 }, { "epoch": 2.189425883879338, "grad_norm": 0.1897374540567398, "learning_rate": 3.1191941363314814e-05, "loss": 0.383, "step": 60750 }, { "epoch": 2.189606083540563, "grad_norm": 0.23337703943252563, "learning_rate": 3.1189114109252946e-05, "loss": 0.4357, "step": 60755 }, { "epoch": 2.1897862832017876, "grad_norm": 0.2705073654651642, "learning_rate": 3.118628677086561e-05, "loss": 0.4014, "step": 60760 }, { "epoch": 2.1899664828630123, "grad_norm": 0.20422200858592987, "learning_rate": 3.1183459348191296e-05, "loss": 0.4171, "step": 60765 }, { "epoch": 2.190146682524237, "grad_norm": 0.20398341119289398, "learning_rate": 3.118063184126856e-05, "loss": 0.3475, "step": 60770 }, { "epoch": 2.1903268821854613, "grad_norm": 0.17860305309295654, "learning_rate": 3.11778042501359e-05, "loss": 0.3701, "step": 60775 }, { "epoch": 2.190507081846686, "grad_norm": 0.19039638340473175, "learning_rate": 3.117497657483187e-05, "loss": 0.4299, "step": 60780 }, { "epoch": 2.190687281507911, "grad_norm": 0.2850659489631653, "learning_rate": 3.117214881539496e-05, "loss": 0.4103, "step": 60785 }, { "epoch": 2.1908674811691355, "grad_norm": 0.24027515947818756, "learning_rate": 3.116932097186373e-05, "loss": 0.422, "step": 60790 }, { "epoch": 2.19104768083036, "grad_norm": 0.18420986831188202, "learning_rate": 3.116649304427669e-05, "loss": 0.3831, "step": 60795 }, { "epoch": 2.1912278804915846, "grad_norm": 0.23363029956817627, "learning_rate": 3.116366503267238e-05, "loss": 0.4279, "step": 60800 }, { "epoch": 2.1914080801528093, "grad_norm": 
0.18705283105373383, "learning_rate": 3.116083693708933e-05, "loss": 0.377, "step": 60805 }, { "epoch": 2.191588279814034, "grad_norm": 0.22469288110733032, "learning_rate": 3.115800875756606e-05, "loss": 0.3951, "step": 60810 }, { "epoch": 2.1917684794752588, "grad_norm": 0.17964445054531097, "learning_rate": 3.115518049414112e-05, "loss": 0.3898, "step": 60815 }, { "epoch": 2.191948679136483, "grad_norm": 0.22461974620819092, "learning_rate": 3.115235214685303e-05, "loss": 0.4117, "step": 60820 }, { "epoch": 2.192128878797708, "grad_norm": 0.1743023842573166, "learning_rate": 3.114952371574035e-05, "loss": 0.4163, "step": 60825 }, { "epoch": 2.1923090784589325, "grad_norm": 0.22755619883537292, "learning_rate": 3.114669520084158e-05, "loss": 0.4297, "step": 60830 }, { "epoch": 2.1924892781201573, "grad_norm": 0.21366189420223236, "learning_rate": 3.114386660219528e-05, "loss": 0.3677, "step": 60835 }, { "epoch": 2.192669477781382, "grad_norm": 0.19875569641590118, "learning_rate": 3.1141037919839996e-05, "loss": 0.377, "step": 60840 }, { "epoch": 2.1928496774426063, "grad_norm": 0.18235017359256744, "learning_rate": 3.113820915381426e-05, "loss": 0.4023, "step": 60845 }, { "epoch": 2.193029877103831, "grad_norm": 0.22413848340511322, "learning_rate": 3.1135380304156614e-05, "loss": 0.4025, "step": 60850 }, { "epoch": 2.1932100767650557, "grad_norm": 0.226275235414505, "learning_rate": 3.11325513709056e-05, "loss": 0.4236, "step": 60855 }, { "epoch": 2.1933902764262805, "grad_norm": 0.23043504357337952, "learning_rate": 3.1129722354099746e-05, "loss": 0.3911, "step": 60860 }, { "epoch": 2.1935704760875048, "grad_norm": 0.22926190495491028, "learning_rate": 3.1126893253777626e-05, "loss": 0.3957, "step": 60865 }, { "epoch": 2.1937506757487295, "grad_norm": 0.24694472551345825, "learning_rate": 3.1124064069977766e-05, "loss": 0.3715, "step": 60870 }, { "epoch": 2.1939308754099542, "grad_norm": 0.16308090090751648, "learning_rate": 3.112123480273872e-05, "loss": 
0.3668, "step": 60875 }, { "epoch": 2.194111075071179, "grad_norm": 0.17201615869998932, "learning_rate": 3.111840545209903e-05, "loss": 0.3633, "step": 60880 }, { "epoch": 2.1942912747324037, "grad_norm": 0.20584902167320251, "learning_rate": 3.1115576018097264e-05, "loss": 0.4049, "step": 60885 }, { "epoch": 2.194471474393628, "grad_norm": 0.16585664451122284, "learning_rate": 3.111274650077195e-05, "loss": 0.3564, "step": 60890 }, { "epoch": 2.1946516740548527, "grad_norm": 0.18358787894248962, "learning_rate": 3.110991690016165e-05, "loss": 0.4176, "step": 60895 }, { "epoch": 2.1948318737160775, "grad_norm": 0.28625836968421936, "learning_rate": 3.11070872163049e-05, "loss": 0.4292, "step": 60900 }, { "epoch": 2.195012073377302, "grad_norm": 0.2175116091966629, "learning_rate": 3.110425744924029e-05, "loss": 0.4274, "step": 60905 }, { "epoch": 2.1951922730385265, "grad_norm": 0.18882659077644348, "learning_rate": 3.1101427599006346e-05, "loss": 0.3686, "step": 60910 }, { "epoch": 2.195372472699751, "grad_norm": 0.2079334855079651, "learning_rate": 3.109859766564163e-05, "loss": 0.4185, "step": 60915 }, { "epoch": 2.195552672360976, "grad_norm": 0.18339209258556366, "learning_rate": 3.1095767649184704e-05, "loss": 0.399, "step": 60920 }, { "epoch": 2.1957328720222007, "grad_norm": 0.17973054945468903, "learning_rate": 3.1092937549674126e-05, "loss": 0.3799, "step": 60925 }, { "epoch": 2.1959130716834254, "grad_norm": 0.24256204068660736, "learning_rate": 3.109010736714845e-05, "loss": 0.3867, "step": 60930 }, { "epoch": 2.1960932713446497, "grad_norm": 0.21241244673728943, "learning_rate": 3.1087277101646244e-05, "loss": 0.4304, "step": 60935 }, { "epoch": 2.1962734710058744, "grad_norm": 0.1754169911146164, "learning_rate": 3.108444675320607e-05, "loss": 0.4074, "step": 60940 }, { "epoch": 2.196453670667099, "grad_norm": 0.20329289138317108, "learning_rate": 3.108161632186648e-05, "loss": 0.4086, "step": 60945 }, { "epoch": 2.196633870328324, "grad_norm": 
0.2189503312110901, "learning_rate": 3.107878580766604e-05, "loss": 0.4192, "step": 60950 }, { "epoch": 2.196814069989548, "grad_norm": 0.1868715137243271, "learning_rate": 3.107595521064333e-05, "loss": 0.3519, "step": 60955 }, { "epoch": 2.196994269650773, "grad_norm": 0.25199243426322937, "learning_rate": 3.1073124530836894e-05, "loss": 0.3997, "step": 60960 }, { "epoch": 2.1971744693119977, "grad_norm": 0.21293796598911285, "learning_rate": 3.107029376828533e-05, "loss": 0.3592, "step": 60965 }, { "epoch": 2.1973546689732224, "grad_norm": 0.21643023192882538, "learning_rate": 3.1067462923027174e-05, "loss": 0.418, "step": 60970 }, { "epoch": 2.197534868634447, "grad_norm": 0.20323172211647034, "learning_rate": 3.106463199510102e-05, "loss": 0.4273, "step": 60975 }, { "epoch": 2.1977150682956714, "grad_norm": 0.20973147451877594, "learning_rate": 3.106180098454542e-05, "loss": 0.4134, "step": 60980 }, { "epoch": 2.197895267956896, "grad_norm": 0.19453154504299164, "learning_rate": 3.105896989139896e-05, "loss": 0.3847, "step": 60985 }, { "epoch": 2.198075467618121, "grad_norm": 0.2046874463558197, "learning_rate": 3.105613871570021e-05, "loss": 0.3834, "step": 60990 }, { "epoch": 2.1982556672793456, "grad_norm": 0.2236749827861786, "learning_rate": 3.105330745748774e-05, "loss": 0.4089, "step": 60995 }, { "epoch": 2.19843586694057, "grad_norm": 0.2126908153295517, "learning_rate": 3.105047611680013e-05, "loss": 0.4004, "step": 61000 }, { "epoch": 2.19843586694057, "eval_loss": 0.43686825037002563, "eval_runtime": 3.5139, "eval_samples_per_second": 28.458, "eval_steps_per_second": 7.115, "step": 61000 }, { "epoch": 2.1986160666017946, "grad_norm": 0.19092722237110138, "learning_rate": 3.1047644693675956e-05, "loss": 0.3829, "step": 61005 }, { "epoch": 2.1987962662630194, "grad_norm": 0.24691295623779297, "learning_rate": 3.104481318815379e-05, "loss": 0.4313, "step": 61010 }, { "epoch": 2.198976465924244, "grad_norm": 0.2079208791255951, "learning_rate": 
3.104198160027222e-05, "loss": 0.425, "step": 61015 }, { "epoch": 2.199156665585469, "grad_norm": 0.19778048992156982, "learning_rate": 3.103914993006981e-05, "loss": 0.3969, "step": 61020 }, { "epoch": 2.199336865246693, "grad_norm": 0.16332995891571045, "learning_rate": 3.1036318177585156e-05, "loss": 0.4092, "step": 61025 }, { "epoch": 2.199517064907918, "grad_norm": 0.1742788553237915, "learning_rate": 3.103348634285684e-05, "loss": 0.4048, "step": 61030 }, { "epoch": 2.1996972645691426, "grad_norm": 0.14753888547420502, "learning_rate": 3.103065442592344e-05, "loss": 0.3834, "step": 61035 }, { "epoch": 2.1998774642303673, "grad_norm": 0.22011804580688477, "learning_rate": 3.1027822426823536e-05, "loss": 0.4417, "step": 61040 }, { "epoch": 2.200057663891592, "grad_norm": 0.20552678406238556, "learning_rate": 3.1024990345595725e-05, "loss": 0.4126, "step": 61045 }, { "epoch": 2.2002378635528164, "grad_norm": 0.18421411514282227, "learning_rate": 3.1022158182278584e-05, "loss": 0.4111, "step": 61050 }, { "epoch": 2.200418063214041, "grad_norm": 0.21071533858776093, "learning_rate": 3.1019325936910696e-05, "loss": 0.4239, "step": 61055 }, { "epoch": 2.200598262875266, "grad_norm": 0.19925324618816376, "learning_rate": 3.1016493609530666e-05, "loss": 0.3629, "step": 61060 }, { "epoch": 2.2007784625364906, "grad_norm": 0.19704802334308624, "learning_rate": 3.101366120017707e-05, "loss": 0.3847, "step": 61065 }, { "epoch": 2.2009586621977153, "grad_norm": 0.17782747745513916, "learning_rate": 3.1010828708888516e-05, "loss": 0.3889, "step": 61070 }, { "epoch": 2.2011388618589396, "grad_norm": 0.1703190803527832, "learning_rate": 3.1007996135703576e-05, "loss": 0.3616, "step": 61075 }, { "epoch": 2.2013190615201643, "grad_norm": 0.22071696817874908, "learning_rate": 3.100516348066085e-05, "loss": 0.4017, "step": 61080 }, { "epoch": 2.201499261181389, "grad_norm": 0.20108157396316528, "learning_rate": 3.100233074379894e-05, "loss": 0.3965, "step": 61085 }, { "epoch": 
2.201679460842614, "grad_norm": 0.16138023138046265, "learning_rate": 3.099949792515643e-05, "loss": 0.4049, "step": 61090 }, { "epoch": 2.201859660503838, "grad_norm": 0.1845240294933319, "learning_rate": 3.0996665024771924e-05, "loss": 0.4032, "step": 61095 }, { "epoch": 2.202039860165063, "grad_norm": 0.17944936454296112, "learning_rate": 3.099383204268402e-05, "loss": 0.3838, "step": 61100 }, { "epoch": 2.2022200598262875, "grad_norm": 0.17411671578884125, "learning_rate": 3.0990998978931315e-05, "loss": 0.4018, "step": 61105 }, { "epoch": 2.2024002594875123, "grad_norm": 0.1903943568468094, "learning_rate": 3.0988165833552404e-05, "loss": 0.3924, "step": 61110 }, { "epoch": 2.202580459148737, "grad_norm": 0.1921924352645874, "learning_rate": 3.0985332606585905e-05, "loss": 0.3686, "step": 61115 }, { "epoch": 2.2027606588099613, "grad_norm": 0.21751470863819122, "learning_rate": 3.0982499298070394e-05, "loss": 0.387, "step": 61120 }, { "epoch": 2.202940858471186, "grad_norm": 0.18325336277484894, "learning_rate": 3.0979665908044495e-05, "loss": 0.3853, "step": 61125 }, { "epoch": 2.2031210581324108, "grad_norm": 0.21470201015472412, "learning_rate": 3.097683243654681e-05, "loss": 0.4171, "step": 61130 }, { "epoch": 2.2033012577936355, "grad_norm": 0.2272443175315857, "learning_rate": 3.097399888361593e-05, "loss": 0.3425, "step": 61135 }, { "epoch": 2.20348145745486, "grad_norm": 0.22367946803569794, "learning_rate": 3.097116524929048e-05, "loss": 0.3819, "step": 61140 }, { "epoch": 2.2036616571160845, "grad_norm": 0.16753867268562317, "learning_rate": 3.096833153360905e-05, "loss": 0.3832, "step": 61145 }, { "epoch": 2.2038418567773093, "grad_norm": 0.18004858493804932, "learning_rate": 3.096549773661027e-05, "loss": 0.4408, "step": 61150 }, { "epoch": 2.204022056438534, "grad_norm": 0.20204921066761017, "learning_rate": 3.096266385833273e-05, "loss": 0.3311, "step": 61155 }, { "epoch": 2.2042022560997587, "grad_norm": 0.19747331738471985, "learning_rate": 
3.0959829898815053e-05, "loss": 0.4104, "step": 61160 }, { "epoch": 2.204382455760983, "grad_norm": 0.20761699974536896, "learning_rate": 3.095699585809584e-05, "loss": 0.3564, "step": 61165 }, { "epoch": 2.2045626554222078, "grad_norm": 0.18775121867656708, "learning_rate": 3.0954161736213725e-05, "loss": 0.3941, "step": 61170 }, { "epoch": 2.2047428550834325, "grad_norm": 0.1855042576789856, "learning_rate": 3.0951327533207305e-05, "loss": 0.398, "step": 61175 }, { "epoch": 2.204923054744657, "grad_norm": 0.15789079666137695, "learning_rate": 3.094849324911519e-05, "loss": 0.3827, "step": 61180 }, { "epoch": 2.2051032544058815, "grad_norm": 0.20551757514476776, "learning_rate": 3.0945658883976014e-05, "loss": 0.4138, "step": 61185 }, { "epoch": 2.2052834540671062, "grad_norm": 0.20195716619491577, "learning_rate": 3.0942824437828386e-05, "loss": 0.4424, "step": 61190 }, { "epoch": 2.205463653728331, "grad_norm": 0.18200279772281647, "learning_rate": 3.093998991071093e-05, "loss": 0.4024, "step": 61195 }, { "epoch": 2.2056438533895557, "grad_norm": 0.21861431002616882, "learning_rate": 3.0937155302662256e-05, "loss": 0.4019, "step": 61200 }, { "epoch": 2.2058240530507804, "grad_norm": 0.18421269953250885, "learning_rate": 3.093432061372098e-05, "loss": 0.3865, "step": 61205 }, { "epoch": 2.2060042527120047, "grad_norm": 0.2282218486070633, "learning_rate": 3.093148584392576e-05, "loss": 0.3667, "step": 61210 }, { "epoch": 2.2061844523732295, "grad_norm": 0.17307542264461517, "learning_rate": 3.0928650993315175e-05, "loss": 0.4014, "step": 61215 }, { "epoch": 2.206364652034454, "grad_norm": 0.23555909097194672, "learning_rate": 3.092581606192788e-05, "loss": 0.4212, "step": 61220 }, { "epoch": 2.206544851695679, "grad_norm": 0.2506897747516632, "learning_rate": 3.092298104980247e-05, "loss": 0.3723, "step": 61225 }, { "epoch": 2.2067250513569032, "grad_norm": 0.2201319932937622, "learning_rate": 3.0920145956977606e-05, "loss": 0.3987, "step": 61230 }, { "epoch": 
2.206905251018128, "grad_norm": 0.17496994137763977, "learning_rate": 3.09173107834919e-05, "loss": 0.3906, "step": 61235 }, { "epoch": 2.2070854506793527, "grad_norm": 0.20881304144859314, "learning_rate": 3.0914475529383966e-05, "loss": 0.3911, "step": 61240 }, { "epoch": 2.2072656503405774, "grad_norm": 0.21811749041080475, "learning_rate": 3.091164019469246e-05, "loss": 0.3785, "step": 61245 }, { "epoch": 2.207445850001802, "grad_norm": 0.18549318611621857, "learning_rate": 3.0908804779456e-05, "loss": 0.3929, "step": 61250 }, { "epoch": 2.2076260496630264, "grad_norm": 0.21845227479934692, "learning_rate": 3.090596928371322e-05, "loss": 0.3835, "step": 61255 }, { "epoch": 2.207806249324251, "grad_norm": 0.2296115756034851, "learning_rate": 3.0903133707502744e-05, "loss": 0.4349, "step": 61260 }, { "epoch": 2.207986448985476, "grad_norm": 0.19739088416099548, "learning_rate": 3.090029805086322e-05, "loss": 0.3676, "step": 61265 }, { "epoch": 2.2081666486467006, "grad_norm": 0.23218046128749847, "learning_rate": 3.089746231383327e-05, "loss": 0.3863, "step": 61270 }, { "epoch": 2.2083468483079254, "grad_norm": 0.17996297776699066, "learning_rate": 3.089462649645155e-05, "loss": 0.3793, "step": 61275 }, { "epoch": 2.2085270479691497, "grad_norm": 0.22149422764778137, "learning_rate": 3.089179059875668e-05, "loss": 0.4028, "step": 61280 }, { "epoch": 2.2087072476303744, "grad_norm": 0.18242286145687103, "learning_rate": 3.0888954620787306e-05, "loss": 0.3801, "step": 61285 }, { "epoch": 2.208887447291599, "grad_norm": 0.18819750845432281, "learning_rate": 3.0886118562582056e-05, "loss": 0.3964, "step": 61290 }, { "epoch": 2.209067646952824, "grad_norm": 0.19808700680732727, "learning_rate": 3.0883282424179586e-05, "loss": 0.4066, "step": 61295 }, { "epoch": 2.209247846614048, "grad_norm": 0.1916135549545288, "learning_rate": 3.088044620561854e-05, "loss": 0.4026, "step": 61300 }, { "epoch": 2.209428046275273, "grad_norm": 0.2114742547273636, "learning_rate": 
3.0877609906937545e-05, "loss": 0.3913, "step": 61305 }, { "epoch": 2.2096082459364976, "grad_norm": 0.2341557890176773, "learning_rate": 3.0874773528175245e-05, "loss": 0.3725, "step": 61310 }, { "epoch": 2.2097884455977224, "grad_norm": 0.20667120814323425, "learning_rate": 3.087193706937031e-05, "loss": 0.4142, "step": 61315 }, { "epoch": 2.209968645258947, "grad_norm": 0.23338265717029572, "learning_rate": 3.086910053056136e-05, "loss": 0.406, "step": 61320 }, { "epoch": 2.2101488449201714, "grad_norm": 0.22917935252189636, "learning_rate": 3.086626391178705e-05, "loss": 0.3956, "step": 61325 }, { "epoch": 2.210329044581396, "grad_norm": 0.2156306803226471, "learning_rate": 3.0863427213086026e-05, "loss": 0.4162, "step": 61330 }, { "epoch": 2.210509244242621, "grad_norm": 0.20012244582176208, "learning_rate": 3.086059043449695e-05, "loss": 0.4189, "step": 61335 }, { "epoch": 2.2106894439038456, "grad_norm": 0.18535156548023224, "learning_rate": 3.085775357605847e-05, "loss": 0.4042, "step": 61340 }, { "epoch": 2.2108696435650703, "grad_norm": 0.18791675567626953, "learning_rate": 3.0854916637809215e-05, "loss": 0.3618, "step": 61345 }, { "epoch": 2.2110498432262946, "grad_norm": 0.2098093181848526, "learning_rate": 3.085207961978786e-05, "loss": 0.4419, "step": 61350 }, { "epoch": 2.2112300428875193, "grad_norm": 0.22902563214302063, "learning_rate": 3.0849242522033064e-05, "loss": 0.4385, "step": 61355 }, { "epoch": 2.211410242548744, "grad_norm": 0.20538847148418427, "learning_rate": 3.084640534458346e-05, "loss": 0.385, "step": 61360 }, { "epoch": 2.211590442209969, "grad_norm": 0.24018481373786926, "learning_rate": 3.084356808747772e-05, "loss": 0.4011, "step": 61365 }, { "epoch": 2.211770641871193, "grad_norm": 0.17867416143417358, "learning_rate": 3.084073075075449e-05, "loss": 0.3826, "step": 61370 }, { "epoch": 2.211950841532418, "grad_norm": 0.22849269211292267, "learning_rate": 3.083789333445244e-05, "loss": 0.4173, "step": 61375 }, { "epoch": 
2.2121310411936426, "grad_norm": 0.20987308025360107, "learning_rate": 3.0835055838610224e-05, "loss": 0.414, "step": 61380 }, { "epoch": 2.2123112408548673, "grad_norm": 0.19698165357112885, "learning_rate": 3.08322182632665e-05, "loss": 0.3898, "step": 61385 }, { "epoch": 2.212491440516092, "grad_norm": 0.2028915286064148, "learning_rate": 3.082938060845993e-05, "loss": 0.4183, "step": 61390 }, { "epoch": 2.2126716401773163, "grad_norm": 0.19117394089698792, "learning_rate": 3.082654287422918e-05, "loss": 0.4264, "step": 61395 }, { "epoch": 2.212851839838541, "grad_norm": 0.23229442536830902, "learning_rate": 3.082370506061291e-05, "loss": 0.3847, "step": 61400 }, { "epoch": 2.213032039499766, "grad_norm": 0.1863051801919937, "learning_rate": 3.0820867167649794e-05, "loss": 0.3803, "step": 61405 }, { "epoch": 2.2132122391609905, "grad_norm": 0.17941780388355255, "learning_rate": 3.081802919537847e-05, "loss": 0.3799, "step": 61410 }, { "epoch": 2.213392438822215, "grad_norm": 0.20453837513923645, "learning_rate": 3.081519114383764e-05, "loss": 0.4076, "step": 61415 }, { "epoch": 2.2135726384834395, "grad_norm": 0.1980455219745636, "learning_rate": 3.081235301306596e-05, "loss": 0.4094, "step": 61420 }, { "epoch": 2.2137528381446643, "grad_norm": 0.23042725026607513, "learning_rate": 3.080951480310209e-05, "loss": 0.4069, "step": 61425 }, { "epoch": 2.213933037805889, "grad_norm": 0.197109192609787, "learning_rate": 3.08066765139847e-05, "loss": 0.3933, "step": 61430 }, { "epoch": 2.2141132374671137, "grad_norm": 0.19078285992145538, "learning_rate": 3.0803838145752465e-05, "loss": 0.4077, "step": 61435 }, { "epoch": 2.214293437128338, "grad_norm": 0.20791934430599213, "learning_rate": 3.0800999698444065e-05, "loss": 0.3886, "step": 61440 }, { "epoch": 2.2144736367895628, "grad_norm": 0.24572139978408813, "learning_rate": 3.0798161172098175e-05, "loss": 0.4195, "step": 61445 }, { "epoch": 2.2146538364507875, "grad_norm": 0.2402421236038208, "learning_rate": 
3.079532256675345e-05, "loss": 0.3967, "step": 61450 }, { "epoch": 2.2148340361120122, "grad_norm": 0.20814205706119537, "learning_rate": 3.079248388244858e-05, "loss": 0.4013, "step": 61455 }, { "epoch": 2.2150142357732365, "grad_norm": 0.17785018682479858, "learning_rate": 3.078964511922224e-05, "loss": 0.4192, "step": 61460 }, { "epoch": 2.2151944354344613, "grad_norm": 0.2151138335466385, "learning_rate": 3.0786806277113115e-05, "loss": 0.3994, "step": 61465 }, { "epoch": 2.215374635095686, "grad_norm": 0.18419857323169708, "learning_rate": 3.0783967356159856e-05, "loss": 0.4021, "step": 61470 }, { "epoch": 2.2155548347569107, "grad_norm": 0.18258464336395264, "learning_rate": 3.078112835640117e-05, "loss": 0.3915, "step": 61475 }, { "epoch": 2.2157350344181355, "grad_norm": 0.2090165615081787, "learning_rate": 3.077828927787574e-05, "loss": 0.3759, "step": 61480 }, { "epoch": 2.2159152340793598, "grad_norm": 0.21204036474227905, "learning_rate": 3.0775450120622215e-05, "loss": 0.4124, "step": 61485 }, { "epoch": 2.2160954337405845, "grad_norm": 0.1751544028520584, "learning_rate": 3.077261088467932e-05, "loss": 0.3981, "step": 61490 }, { "epoch": 2.216275633401809, "grad_norm": 0.23898641765117645, "learning_rate": 3.076977157008571e-05, "loss": 0.395, "step": 61495 }, { "epoch": 2.216455833063034, "grad_norm": 0.17369025945663452, "learning_rate": 3.076693217688009e-05, "loss": 0.4193, "step": 61500 }, { "epoch": 2.216455833063034, "eval_loss": 0.4364326000213623, "eval_runtime": 3.523, "eval_samples_per_second": 28.385, "eval_steps_per_second": 7.096, "step": 61500 }, { "epoch": 2.2166360327242582, "grad_norm": 0.22703151404857635, "learning_rate": 3.076409270510112e-05, "loss": 0.3983, "step": 61505 }, { "epoch": 2.216816232385483, "grad_norm": 0.21053709089756012, "learning_rate": 3.076125315478752e-05, "loss": 0.3839, "step": 61510 }, { "epoch": 2.2169964320467077, "grad_norm": 0.17118942737579346, "learning_rate": 3.075841352597794e-05, "loss": 0.3901, 
"step": 61515 }, { "epoch": 2.2171766317079324, "grad_norm": 0.21972143650054932, "learning_rate": 3.07555738187111e-05, "loss": 0.4219, "step": 61520 }, { "epoch": 2.217356831369157, "grad_norm": 0.1972840577363968, "learning_rate": 3.0752734033025694e-05, "loss": 0.3747, "step": 61525 }, { "epoch": 2.2175370310303815, "grad_norm": 0.20041783154010773, "learning_rate": 3.074989416896038e-05, "loss": 0.3963, "step": 61530 }, { "epoch": 2.217717230691606, "grad_norm": 0.23045070469379425, "learning_rate": 3.0747054226553886e-05, "loss": 0.3746, "step": 61535 }, { "epoch": 2.217897430352831, "grad_norm": 0.17599715292453766, "learning_rate": 3.0744214205844884e-05, "loss": 0.393, "step": 61540 }, { "epoch": 2.2180776300140557, "grad_norm": 0.17511135339736938, "learning_rate": 3.0741374106872084e-05, "loss": 0.4001, "step": 61545 }, { "epoch": 2.2182578296752804, "grad_norm": 0.18678990006446838, "learning_rate": 3.0738533929674165e-05, "loss": 0.4134, "step": 61550 }, { "epoch": 2.2184380293365047, "grad_norm": 0.21317550539970398, "learning_rate": 3.0735693674289826e-05, "loss": 0.4136, "step": 61555 }, { "epoch": 2.2186182289977294, "grad_norm": 0.24046799540519714, "learning_rate": 3.073285334075778e-05, "loss": 0.3942, "step": 61560 }, { "epoch": 2.218798428658954, "grad_norm": 0.19992442429065704, "learning_rate": 3.073001292911672e-05, "loss": 0.3546, "step": 61565 }, { "epoch": 2.218978628320179, "grad_norm": 0.2136695832014084, "learning_rate": 3.072717243940534e-05, "loss": 0.407, "step": 61570 }, { "epoch": 2.2191588279814036, "grad_norm": 0.2068341076374054, "learning_rate": 3.072433187166234e-05, "loss": 0.3849, "step": 61575 }, { "epoch": 2.219339027642628, "grad_norm": 0.2191094160079956, "learning_rate": 3.072149122592643e-05, "loss": 0.4173, "step": 61580 }, { "epoch": 2.2195192273038526, "grad_norm": 0.20730826258659363, "learning_rate": 3.071865050223631e-05, "loss": 0.3846, "step": 61585 }, { "epoch": 2.2196994269650774, "grad_norm": 
0.20395426452159882, "learning_rate": 3.071580970063068e-05, "loss": 0.3963, "step": 61590 }, { "epoch": 2.219879626626302, "grad_norm": 0.19093595445156097, "learning_rate": 3.071296882114826e-05, "loss": 0.39, "step": 61595 }, { "epoch": 2.2200598262875264, "grad_norm": 0.2352960854768753, "learning_rate": 3.071012786382774e-05, "loss": 0.4021, "step": 61600 }, { "epoch": 2.220240025948751, "grad_norm": 0.21295884251594543, "learning_rate": 3.0707286828707835e-05, "loss": 0.3812, "step": 61605 }, { "epoch": 2.220420225609976, "grad_norm": 0.19988127052783966, "learning_rate": 3.0704445715827246e-05, "loss": 0.4401, "step": 61610 }, { "epoch": 2.2206004252712006, "grad_norm": 0.19108517467975616, "learning_rate": 3.0701604525224695e-05, "loss": 0.3947, "step": 61615 }, { "epoch": 2.2207806249324253, "grad_norm": 0.21383409202098846, "learning_rate": 3.0698763256938876e-05, "loss": 0.3947, "step": 61620 }, { "epoch": 2.2209608245936496, "grad_norm": 0.201728954911232, "learning_rate": 3.069592191100852e-05, "loss": 0.3971, "step": 61625 }, { "epoch": 2.2211410242548744, "grad_norm": 0.20201417803764343, "learning_rate": 3.069308048747233e-05, "loss": 0.3725, "step": 61630 }, { "epoch": 2.221321223916099, "grad_norm": 0.19340789318084717, "learning_rate": 3.069023898636902e-05, "loss": 0.3866, "step": 61635 }, { "epoch": 2.221501423577324, "grad_norm": 0.22743280231952667, "learning_rate": 3.0687397407737306e-05, "loss": 0.3558, "step": 61640 }, { "epoch": 2.221681623238548, "grad_norm": 0.18240687251091003, "learning_rate": 3.06845557516159e-05, "loss": 0.4093, "step": 61645 }, { "epoch": 2.221861822899773, "grad_norm": 0.22113995254039764, "learning_rate": 3.068171401804353e-05, "loss": 0.3851, "step": 61650 }, { "epoch": 2.2220420225609976, "grad_norm": 0.22573597729206085, "learning_rate": 3.0678872207058903e-05, "loss": 0.4202, "step": 61655 }, { "epoch": 2.2222222222222223, "grad_norm": 0.1990000456571579, "learning_rate": 3.067603031870074e-05, "loss": 
0.4169, "step": 61660 }, { "epoch": 2.222402421883447, "grad_norm": 0.18506182730197906, "learning_rate": 3.067318835300777e-05, "loss": 0.3866, "step": 61665 }, { "epoch": 2.2225826215446713, "grad_norm": 0.16143712401390076, "learning_rate": 3.067034631001869e-05, "loss": 0.3977, "step": 61670 }, { "epoch": 2.222762821205896, "grad_norm": 0.2105492502450943, "learning_rate": 3.066750418977225e-05, "loss": 0.3876, "step": 61675 }, { "epoch": 2.222943020867121, "grad_norm": 0.2138456106185913, "learning_rate": 3.066466199230716e-05, "loss": 0.4023, "step": 61680 }, { "epoch": 2.2231232205283455, "grad_norm": 0.2341010868549347, "learning_rate": 3.066181971766215e-05, "loss": 0.394, "step": 61685 }, { "epoch": 2.22330342018957, "grad_norm": 0.2601710557937622, "learning_rate": 3.065897736587594e-05, "loss": 0.36, "step": 61690 }, { "epoch": 2.2234836198507946, "grad_norm": 0.20886510610580444, "learning_rate": 3.065613493698726e-05, "loss": 0.4128, "step": 61695 }, { "epoch": 2.2236638195120193, "grad_norm": 0.21699224412441254, "learning_rate": 3.0653292431034845e-05, "loss": 0.385, "step": 61700 }, { "epoch": 2.223844019173244, "grad_norm": 0.22566765546798706, "learning_rate": 3.0650449848057404e-05, "loss": 0.3951, "step": 61705 }, { "epoch": 2.2240242188344688, "grad_norm": 0.16451287269592285, "learning_rate": 3.064760718809368e-05, "loss": 0.3828, "step": 61710 }, { "epoch": 2.224204418495693, "grad_norm": 0.2075955718755722, "learning_rate": 3.06447644511824e-05, "loss": 0.3986, "step": 61715 }, { "epoch": 2.224384618156918, "grad_norm": 0.1905994862318039, "learning_rate": 3.06419216373623e-05, "loss": 0.4061, "step": 61720 }, { "epoch": 2.2245648178181425, "grad_norm": 0.2250581681728363, "learning_rate": 3.063907874667211e-05, "loss": 0.4116, "step": 61725 }, { "epoch": 2.2247450174793673, "grad_norm": 0.20644237101078033, "learning_rate": 3.063623577915056e-05, "loss": 0.3753, "step": 61730 }, { "epoch": 2.2249252171405915, "grad_norm": 
0.2077893614768982, "learning_rate": 3.0633392734836395e-05, "loss": 0.3853, "step": 61735 }, { "epoch": 2.2251054168018163, "grad_norm": 0.18702587485313416, "learning_rate": 3.063054961376834e-05, "loss": 0.412, "step": 61740 }, { "epoch": 2.225285616463041, "grad_norm": 0.20379160344600677, "learning_rate": 3.0627706415985146e-05, "loss": 0.4043, "step": 61745 }, { "epoch": 2.2254658161242657, "grad_norm": 0.17889107763767242, "learning_rate": 3.062486314152553e-05, "loss": 0.4084, "step": 61750 }, { "epoch": 2.2256460157854905, "grad_norm": 0.23296573758125305, "learning_rate": 3.0622019790428254e-05, "loss": 0.4243, "step": 61755 }, { "epoch": 2.2258262154467148, "grad_norm": 0.22724519670009613, "learning_rate": 3.0619176362732035e-05, "loss": 0.4073, "step": 61760 }, { "epoch": 2.2260064151079395, "grad_norm": 0.27276527881622314, "learning_rate": 3.0616332858475625e-05, "loss": 0.4086, "step": 61765 }, { "epoch": 2.2261866147691642, "grad_norm": 0.21575415134429932, "learning_rate": 3.0613489277697784e-05, "loss": 0.4389, "step": 61770 }, { "epoch": 2.226366814430389, "grad_norm": 0.19603478908538818, "learning_rate": 3.061064562043722e-05, "loss": 0.4233, "step": 61775 }, { "epoch": 2.2265470140916137, "grad_norm": 0.2517644464969635, "learning_rate": 3.06078018867327e-05, "loss": 0.4004, "step": 61780 }, { "epoch": 2.226727213752838, "grad_norm": 0.2613866627216339, "learning_rate": 3.060495807662297e-05, "loss": 0.3955, "step": 61785 }, { "epoch": 2.2269074134140627, "grad_norm": 0.18380023539066315, "learning_rate": 3.0602114190146775e-05, "loss": 0.3754, "step": 61790 }, { "epoch": 2.2270876130752875, "grad_norm": 0.1809166818857193, "learning_rate": 3.0599270227342845e-05, "loss": 0.3961, "step": 61795 }, { "epoch": 2.227267812736512, "grad_norm": 0.1867736577987671, "learning_rate": 3.059642618824996e-05, "loss": 0.3826, "step": 61800 }, { "epoch": 2.2274480123977365, "grad_norm": 0.24860471487045288, "learning_rate": 3.059358207290683e-05, "loss": 
0.4294, "step": 61805 }, { "epoch": 2.227628212058961, "grad_norm": 0.2238049954175949, "learning_rate": 3.059073788135224e-05, "loss": 0.44, "step": 61810 }, { "epoch": 2.227808411720186, "grad_norm": 0.1938982456922531, "learning_rate": 3.0587893613624934e-05, "loss": 0.3959, "step": 61815 }, { "epoch": 2.2279886113814107, "grad_norm": 0.1805550903081894, "learning_rate": 3.058504926976365e-05, "loss": 0.3997, "step": 61820 }, { "epoch": 2.2281688110426354, "grad_norm": 0.2023649662733078, "learning_rate": 3.0582204849807154e-05, "loss": 0.4009, "step": 61825 }, { "epoch": 2.2283490107038597, "grad_norm": 0.18612384796142578, "learning_rate": 3.05793603537942e-05, "loss": 0.4259, "step": 61830 }, { "epoch": 2.2285292103650844, "grad_norm": 0.22503447532653809, "learning_rate": 3.0576515781763545e-05, "loss": 0.4017, "step": 61835 }, { "epoch": 2.228709410026309, "grad_norm": 0.2309921830892563, "learning_rate": 3.057367113375393e-05, "loss": 0.3924, "step": 61840 }, { "epoch": 2.228889609687534, "grad_norm": 0.18473617732524872, "learning_rate": 3.0570826409804135e-05, "loss": 0.3755, "step": 61845 }, { "epoch": 2.2290698093487586, "grad_norm": 0.19264580309391022, "learning_rate": 3.056798160995291e-05, "loss": 0.4029, "step": 61850 }, { "epoch": 2.229250009009983, "grad_norm": 0.21103538572788239, "learning_rate": 3.0565136734239006e-05, "loss": 0.4169, "step": 61855 }, { "epoch": 2.2294302086712077, "grad_norm": 0.21922504901885986, "learning_rate": 3.0562291782701194e-05, "loss": 0.3899, "step": 61860 }, { "epoch": 2.2296104083324324, "grad_norm": 0.19922494888305664, "learning_rate": 3.0559446755378226e-05, "loss": 0.3969, "step": 61865 }, { "epoch": 2.229790607993657, "grad_norm": 0.2152629941701889, "learning_rate": 3.055660165230888e-05, "loss": 0.4128, "step": 61870 }, { "epoch": 2.2299708076548814, "grad_norm": 0.19293451309204102, "learning_rate": 3.055375647353191e-05, "loss": 0.3871, "step": 61875 }, { "epoch": 2.230151007316106, "grad_norm": 
0.2509141266345978, "learning_rate": 3.0550911219086084e-05, "loss": 0.4476, "step": 61880 }, { "epoch": 2.230331206977331, "grad_norm": 0.24453610181808472, "learning_rate": 3.0548065889010176e-05, "loss": 0.3912, "step": 61885 }, { "epoch": 2.2305114066385556, "grad_norm": 0.24755702912807465, "learning_rate": 3.054522048334294e-05, "loss": 0.3986, "step": 61890 }, { "epoch": 2.2306916062997804, "grad_norm": 0.1772449016571045, "learning_rate": 3.0542375002123145e-05, "loss": 0.3765, "step": 61895 }, { "epoch": 2.2308718059610047, "grad_norm": 0.1797766089439392, "learning_rate": 3.053952944538956e-05, "loss": 0.3711, "step": 61900 }, { "epoch": 2.2310520056222294, "grad_norm": 0.21006938815116882, "learning_rate": 3.0536683813180966e-05, "loss": 0.4086, "step": 61905 }, { "epoch": 2.231232205283454, "grad_norm": 0.18526072800159454, "learning_rate": 3.053383810553613e-05, "loss": 0.3676, "step": 61910 }, { "epoch": 2.231412404944679, "grad_norm": 0.20477519929409027, "learning_rate": 3.053099232249381e-05, "loss": 0.4048, "step": 61915 }, { "epoch": 2.231592604605903, "grad_norm": 0.17895658314228058, "learning_rate": 3.05281464640928e-05, "loss": 0.4049, "step": 61920 }, { "epoch": 2.231772804267128, "grad_norm": 0.20034639537334442, "learning_rate": 3.0525300530371846e-05, "loss": 0.4089, "step": 61925 }, { "epoch": 2.2319530039283526, "grad_norm": 0.24000723659992218, "learning_rate": 3.052245452136977e-05, "loss": 0.3823, "step": 61930 }, { "epoch": 2.2321332035895773, "grad_norm": 0.20230047404766083, "learning_rate": 3.0519608437125305e-05, "loss": 0.4144, "step": 61935 }, { "epoch": 2.232313403250802, "grad_norm": 0.16788794100284576, "learning_rate": 3.0516762277677243e-05, "loss": 0.3911, "step": 61940 }, { "epoch": 2.2324936029120264, "grad_norm": 0.1820405274629593, "learning_rate": 3.051391604306436e-05, "loss": 0.3892, "step": 61945 }, { "epoch": 2.232673802573251, "grad_norm": 0.24724507331848145, "learning_rate": 3.0511069733325447e-05, "loss": 
0.376, "step": 61950 }, { "epoch": 2.232854002234476, "grad_norm": 0.21663494408130646, "learning_rate": 3.0508223348499275e-05, "loss": 0.393, "step": 61955 }, { "epoch": 2.2330342018957006, "grad_norm": 0.17999057471752167, "learning_rate": 3.0505376888624622e-05, "loss": 0.4099, "step": 61960 }, { "epoch": 2.233214401556925, "grad_norm": 0.15646670758724213, "learning_rate": 3.0502530353740278e-05, "loss": 0.3565, "step": 61965 }, { "epoch": 2.2333946012181496, "grad_norm": 0.1919204592704773, "learning_rate": 3.049968374388502e-05, "loss": 0.4127, "step": 61970 }, { "epoch": 2.2335748008793743, "grad_norm": 0.1685272455215454, "learning_rate": 3.0496837059097645e-05, "loss": 0.4332, "step": 61975 }, { "epoch": 2.233755000540599, "grad_norm": 0.1549319177865982, "learning_rate": 3.0493990299416913e-05, "loss": 0.4114, "step": 61980 }, { "epoch": 2.233935200201824, "grad_norm": 0.22232964634895325, "learning_rate": 3.0491143464881634e-05, "loss": 0.3857, "step": 61985 }, { "epoch": 2.234115399863048, "grad_norm": 0.14861559867858887, "learning_rate": 3.0488296555530594e-05, "loss": 0.3871, "step": 61990 }, { "epoch": 2.234295599524273, "grad_norm": 0.20374061167240143, "learning_rate": 3.0485449571402573e-05, "loss": 0.3995, "step": 61995 }, { "epoch": 2.2344757991854975, "grad_norm": 0.21584060788154602, "learning_rate": 3.048260251253636e-05, "loss": 0.4383, "step": 62000 }, { "epoch": 2.2344757991854975, "eval_loss": 0.43634411692619324, "eval_runtime": 3.5361, "eval_samples_per_second": 28.28, "eval_steps_per_second": 7.07, "step": 62000 }, { "epoch": 2.2346559988467223, "grad_norm": 0.18200920522212982, "learning_rate": 3.0479755378970753e-05, "loss": 0.4092, "step": 62005 }, { "epoch": 2.2348361985079466, "grad_norm": 0.18603946268558502, "learning_rate": 3.047690817074454e-05, "loss": 0.3547, "step": 62010 }, { "epoch": 2.2350163981691713, "grad_norm": 0.17530210316181183, "learning_rate": 3.0474060887896518e-05, "loss": 0.3952, "step": 62015 }, { "epoch": 
2.235196597830396, "grad_norm": 0.1511596292257309, "learning_rate": 3.0471213530465465e-05, "loss": 0.3703, "step": 62020 }, { "epoch": 2.2353767974916208, "grad_norm": 0.22197118401527405, "learning_rate": 3.046836609849019e-05, "loss": 0.4399, "step": 62025 }, { "epoch": 2.2355569971528455, "grad_norm": 0.2044990509748459, "learning_rate": 3.046551859200949e-05, "loss": 0.3614, "step": 62030 }, { "epoch": 2.23573719681407, "grad_norm": 0.2116348147392273, "learning_rate": 3.0462671011062162e-05, "loss": 0.3879, "step": 62035 }, { "epoch": 2.2359173964752945, "grad_norm": 0.18771792948246002, "learning_rate": 3.045982335568699e-05, "loss": 0.4121, "step": 62040 }, { "epoch": 2.2360975961365193, "grad_norm": 0.2089640349149704, "learning_rate": 3.0456975625922784e-05, "loss": 0.3884, "step": 62045 }, { "epoch": 2.236277795797744, "grad_norm": 0.1947159469127655, "learning_rate": 3.0454127821808337e-05, "loss": 0.3953, "step": 62050 }, { "epoch": 2.2364579954589687, "grad_norm": 0.23277081549167633, "learning_rate": 3.045127994338246e-05, "loss": 0.397, "step": 62055 }, { "epoch": 2.236638195120193, "grad_norm": 0.17750267684459686, "learning_rate": 3.0448431990683958e-05, "loss": 0.3889, "step": 62060 }, { "epoch": 2.2368183947814178, "grad_norm": 0.16454583406448364, "learning_rate": 3.0445583963751606e-05, "loss": 0.3797, "step": 62065 }, { "epoch": 2.2369985944426425, "grad_norm": 0.20463277399539948, "learning_rate": 3.044273586262424e-05, "loss": 0.3917, "step": 62070 }, { "epoch": 2.237178794103867, "grad_norm": 0.20936442911624908, "learning_rate": 3.043988768734065e-05, "loss": 0.4067, "step": 62075 }, { "epoch": 2.237358993765092, "grad_norm": 0.20374515652656555, "learning_rate": 3.043703943793964e-05, "loss": 0.392, "step": 62080 }, { "epoch": 2.2375391934263162, "grad_norm": 0.20407363772392273, "learning_rate": 3.043419111446002e-05, "loss": 0.3983, "step": 62085 }, { "epoch": 2.237719393087541, "grad_norm": 0.19166499376296997, "learning_rate": 
3.0431342716940602e-05, "loss": 0.396, "step": 62090 }, { "epoch": 2.2378995927487657, "grad_norm": 0.22127671539783478, "learning_rate": 3.0428494245420192e-05, "loss": 0.3952, "step": 62095 }, { "epoch": 2.2380797924099904, "grad_norm": 0.19556500017642975, "learning_rate": 3.0425645699937594e-05, "loss": 0.4074, "step": 62100 }, { "epoch": 2.2382599920712147, "grad_norm": 0.18051710724830627, "learning_rate": 3.0422797080531628e-05, "loss": 0.3663, "step": 62105 }, { "epoch": 2.2384401917324395, "grad_norm": 0.1607939898967743, "learning_rate": 3.0419948387241093e-05, "loss": 0.3728, "step": 62110 }, { "epoch": 2.238620391393664, "grad_norm": 0.21384070813655853, "learning_rate": 3.0417099620104826e-05, "loss": 0.4021, "step": 62115 }, { "epoch": 2.238800591054889, "grad_norm": 0.16077342629432678, "learning_rate": 3.041425077916161e-05, "loss": 0.3476, "step": 62120 }, { "epoch": 2.2389807907161137, "grad_norm": 0.20714950561523438, "learning_rate": 3.0411401864450283e-05, "loss": 0.402, "step": 62125 }, { "epoch": 2.239160990377338, "grad_norm": 0.2111266702413559, "learning_rate": 3.040855287600965e-05, "loss": 0.4138, "step": 62130 }, { "epoch": 2.2393411900385627, "grad_norm": 0.21749116480350494, "learning_rate": 3.0405703813878534e-05, "loss": 0.3802, "step": 62135 }, { "epoch": 2.2395213896997874, "grad_norm": 0.2501288056373596, "learning_rate": 3.0402854678095754e-05, "loss": 0.4045, "step": 62140 }, { "epoch": 2.239701589361012, "grad_norm": 0.18549959361553192, "learning_rate": 3.0400005468700115e-05, "loss": 0.4179, "step": 62145 }, { "epoch": 2.2398817890222364, "grad_norm": 0.19732961058616638, "learning_rate": 3.0397156185730453e-05, "loss": 0.3776, "step": 62150 }, { "epoch": 2.240061988683461, "grad_norm": 0.2629784345626831, "learning_rate": 3.039430682922558e-05, "loss": 0.3572, "step": 62155 }, { "epoch": 2.240242188344686, "grad_norm": 0.1689624935388565, "learning_rate": 3.0391457399224325e-05, "loss": 0.3499, "step": 62160 }, { "epoch": 
2.2404223880059106, "grad_norm": 0.20784690976142883, "learning_rate": 3.038860789576551e-05, "loss": 0.4062, "step": 62165 }, { "epoch": 2.2406025876671354, "grad_norm": 0.17468421161174774, "learning_rate": 3.038575831888794e-05, "loss": 0.4152, "step": 62170 }, { "epoch": 2.2407827873283597, "grad_norm": 0.21627172827720642, "learning_rate": 3.0382908668630477e-05, "loss": 0.4106, "step": 62175 }, { "epoch": 2.2409629869895844, "grad_norm": 0.19633768498897552, "learning_rate": 3.0380058945031907e-05, "loss": 0.3889, "step": 62180 }, { "epoch": 2.241143186650809, "grad_norm": 0.18723514676094055, "learning_rate": 3.0377209148131086e-05, "loss": 0.4013, "step": 62185 }, { "epoch": 2.241323386312034, "grad_norm": 0.17972959578037262, "learning_rate": 3.037435927796683e-05, "loss": 0.3765, "step": 62190 }, { "epoch": 2.241503585973258, "grad_norm": 0.21043039858341217, "learning_rate": 3.037150933457797e-05, "loss": 0.3866, "step": 62195 }, { "epoch": 2.241683785634483, "grad_norm": 0.20892179012298584, "learning_rate": 3.0368659318003335e-05, "loss": 0.4267, "step": 62200 }, { "epoch": 2.2418639852957076, "grad_norm": 0.22525620460510254, "learning_rate": 3.036580922828175e-05, "loss": 0.3884, "step": 62205 }, { "epoch": 2.2420441849569324, "grad_norm": 0.23556070029735565, "learning_rate": 3.0362959065452057e-05, "loss": 0.4409, "step": 62210 }, { "epoch": 2.242224384618157, "grad_norm": 0.20461376011371613, "learning_rate": 3.036010882955308e-05, "loss": 0.4232, "step": 62215 }, { "epoch": 2.2424045842793814, "grad_norm": 0.19230584800243378, "learning_rate": 3.035725852062367e-05, "loss": 0.391, "step": 62220 }, { "epoch": 2.242584783940606, "grad_norm": 0.17546550929546356, "learning_rate": 3.0354408138702645e-05, "loss": 0.3665, "step": 62225 }, { "epoch": 2.242764983601831, "grad_norm": 0.24618138372898102, "learning_rate": 3.0351557683828846e-05, "loss": 0.4114, "step": 62230 }, { "epoch": 2.2429451832630556, "grad_norm": 0.2337779998779297, 
"learning_rate": 3.03487071560411e-05, "loss": 0.3832, "step": 62235 }, { "epoch": 2.24312538292428, "grad_norm": 0.19449256360530853, "learning_rate": 3.0345856555378267e-05, "loss": 0.4175, "step": 62240 }, { "epoch": 2.2433055825855046, "grad_norm": 0.19468238949775696, "learning_rate": 3.0343005881879172e-05, "loss": 0.3843, "step": 62245 }, { "epoch": 2.2434857822467293, "grad_norm": 0.24619287252426147, "learning_rate": 3.0340155135582648e-05, "loss": 0.403, "step": 62250 }, { "epoch": 2.243665981907954, "grad_norm": 0.15675576031208038, "learning_rate": 3.0337304316527548e-05, "loss": 0.3894, "step": 62255 }, { "epoch": 2.243846181569179, "grad_norm": 0.18250007927417755, "learning_rate": 3.0334453424752712e-05, "loss": 0.4095, "step": 62260 }, { "epoch": 2.244026381230403, "grad_norm": 0.17939797043800354, "learning_rate": 3.033160246029698e-05, "loss": 0.3893, "step": 62265 }, { "epoch": 2.244206580891628, "grad_norm": 0.15464815497398376, "learning_rate": 3.0328751423199198e-05, "loss": 0.3849, "step": 62270 }, { "epoch": 2.2443867805528526, "grad_norm": 0.1741357445716858, "learning_rate": 3.03259003134982e-05, "loss": 0.4079, "step": 62275 }, { "epoch": 2.2445669802140773, "grad_norm": 0.1852835714817047, "learning_rate": 3.0323049131232863e-05, "loss": 0.388, "step": 62280 }, { "epoch": 2.244747179875302, "grad_norm": 0.17500340938568115, "learning_rate": 3.0320197876441992e-05, "loss": 0.4356, "step": 62285 }, { "epoch": 2.2449273795365263, "grad_norm": 0.20107300579547882, "learning_rate": 3.0317346549164465e-05, "loss": 0.3873, "step": 62290 }, { "epoch": 2.245107579197751, "grad_norm": 0.19302281737327576, "learning_rate": 3.0314495149439115e-05, "loss": 0.3943, "step": 62295 }, { "epoch": 2.245287778858976, "grad_norm": 0.20569072663784027, "learning_rate": 3.03116436773048e-05, "loss": 0.4247, "step": 62300 }, { "epoch": 2.2454679785202005, "grad_norm": 0.1855345070362091, "learning_rate": 3.0308792132800368e-05, "loss": 0.3976, "step": 62305 }, 
{ "epoch": 2.245648178181425, "grad_norm": 0.1881013661623001, "learning_rate": 3.0305940515964672e-05, "loss": 0.4136, "step": 62310 }, { "epoch": 2.2458283778426495, "grad_norm": 0.20543043315410614, "learning_rate": 3.0303088826836563e-05, "loss": 0.3816, "step": 62315 }, { "epoch": 2.2460085775038743, "grad_norm": 0.28218919038772583, "learning_rate": 3.0300237065454894e-05, "loss": 0.4104, "step": 62320 }, { "epoch": 2.246188777165099, "grad_norm": 0.21315787732601166, "learning_rate": 3.0297385231858526e-05, "loss": 0.4041, "step": 62325 }, { "epoch": 2.2463689768263237, "grad_norm": 0.19718264043331146, "learning_rate": 3.0294533326086305e-05, "loss": 0.4086, "step": 62330 }, { "epoch": 2.246549176487548, "grad_norm": 0.1942020058631897, "learning_rate": 3.0291681348177092e-05, "loss": 0.3855, "step": 62335 }, { "epoch": 2.2467293761487728, "grad_norm": 0.1929059475660324, "learning_rate": 3.0288829298169753e-05, "loss": 0.3879, "step": 62340 }, { "epoch": 2.2469095758099975, "grad_norm": 0.2083449363708496, "learning_rate": 3.0285977176103132e-05, "loss": 0.3884, "step": 62345 }, { "epoch": 2.2470897754712222, "grad_norm": 0.22706344723701477, "learning_rate": 3.028312498201609e-05, "loss": 0.4189, "step": 62350 }, { "epoch": 2.247269975132447, "grad_norm": 0.2006312906742096, "learning_rate": 3.02802727159475e-05, "loss": 0.408, "step": 62355 }, { "epoch": 2.2474501747936713, "grad_norm": 0.2271738350391388, "learning_rate": 3.0277420377936222e-05, "loss": 0.4563, "step": 62360 }, { "epoch": 2.247630374454896, "grad_norm": 0.19731751084327698, "learning_rate": 3.0274567968021107e-05, "loss": 0.4227, "step": 62365 }, { "epoch": 2.2478105741161207, "grad_norm": 0.19308514893054962, "learning_rate": 3.0271715486241026e-05, "loss": 0.4012, "step": 62370 }, { "epoch": 2.2479907737773455, "grad_norm": 0.19526423513889313, "learning_rate": 3.0268862932634832e-05, "loss": 0.3879, "step": 62375 }, { "epoch": 2.2481709734385698, "grad_norm": 0.20854395627975464, 
"learning_rate": 3.026601030724141e-05, "loss": 0.3913, "step": 62380 }, { "epoch": 2.2483511730997945, "grad_norm": 0.1848597228527069, "learning_rate": 3.0263157610099625e-05, "loss": 0.3934, "step": 62385 }, { "epoch": 2.248531372761019, "grad_norm": 0.22897033393383026, "learning_rate": 3.0260304841248322e-05, "loss": 0.3878, "step": 62390 }, { "epoch": 2.248711572422244, "grad_norm": 0.1810678094625473, "learning_rate": 3.025745200072639e-05, "loss": 0.4209, "step": 62395 }, { "epoch": 2.2488917720834687, "grad_norm": 0.19008022546768188, "learning_rate": 3.0254599088572688e-05, "loss": 0.3718, "step": 62400 }, { "epoch": 2.249071971744693, "grad_norm": 0.18570220470428467, "learning_rate": 3.02517461048261e-05, "loss": 0.3892, "step": 62405 }, { "epoch": 2.2492521714059177, "grad_norm": 0.23810695111751556, "learning_rate": 3.0248893049525483e-05, "loss": 0.42, "step": 62410 }, { "epoch": 2.2494323710671424, "grad_norm": 0.23687207698822021, "learning_rate": 3.024603992270971e-05, "loss": 0.4312, "step": 62415 }, { "epoch": 2.249612570728367, "grad_norm": 0.1891470104455948, "learning_rate": 3.0243186724417664e-05, "loss": 0.3563, "step": 62420 }, { "epoch": 2.2497927703895915, "grad_norm": 0.18046557903289795, "learning_rate": 3.0240333454688214e-05, "loss": 0.3756, "step": 62425 }, { "epoch": 2.249972970050816, "grad_norm": 0.19262628257274628, "learning_rate": 3.0237480113560235e-05, "loss": 0.3925, "step": 62430 }, { "epoch": 2.250153169712041, "grad_norm": 0.22571761906147003, "learning_rate": 3.02346267010726e-05, "loss": 0.3438, "step": 62435 }, { "epoch": 2.2503333693732657, "grad_norm": 0.19609612226486206, "learning_rate": 3.02317732172642e-05, "loss": 0.3906, "step": 62440 }, { "epoch": 2.2505135690344904, "grad_norm": 0.2036208212375641, "learning_rate": 3.0228919662173895e-05, "loss": 0.4103, "step": 62445 }, { "epoch": 2.2506937686957147, "grad_norm": 0.20205596089363098, "learning_rate": 3.0226066035840568e-05, "loss": 0.4045, "step": 62450 }, 
{ "epoch": 2.2508739683569394, "grad_norm": 0.18515092134475708, "learning_rate": 3.0223212338303108e-05, "loss": 0.4139, "step": 62455 }, { "epoch": 2.251054168018164, "grad_norm": 0.19203773140907288, "learning_rate": 3.0220358569600388e-05, "loss": 0.3927, "step": 62460 }, { "epoch": 2.251234367679389, "grad_norm": 0.2266424596309662, "learning_rate": 3.02175047297713e-05, "loss": 0.4254, "step": 62465 }, { "epoch": 2.251414567340613, "grad_norm": 0.23780296742916107, "learning_rate": 3.0214650818854713e-05, "loss": 0.3954, "step": 62470 }, { "epoch": 2.251594767001838, "grad_norm": 0.2071855366230011, "learning_rate": 3.021179683688952e-05, "loss": 0.3667, "step": 62475 }, { "epoch": 2.2517749666630626, "grad_norm": 0.23853279650211334, "learning_rate": 3.0208942783914606e-05, "loss": 0.4328, "step": 62480 }, { "epoch": 2.2519551663242874, "grad_norm": 0.23091068863868713, "learning_rate": 3.0206088659968857e-05, "loss": 0.4059, "step": 62485 }, { "epoch": 2.252135365985512, "grad_norm": 0.23153997957706451, "learning_rate": 3.0203234465091157e-05, "loss": 0.3665, "step": 62490 }, { "epoch": 2.2523155656467364, "grad_norm": 0.18207654356956482, "learning_rate": 3.020038019932039e-05, "loss": 0.3626, "step": 62495 }, { "epoch": 2.252495765307961, "grad_norm": 0.19852490723133087, "learning_rate": 3.0197525862695452e-05, "loss": 0.4133, "step": 62500 }, { "epoch": 2.252495765307961, "eval_loss": 0.4360164999961853, "eval_runtime": 3.5278, "eval_samples_per_second": 28.347, "eval_steps_per_second": 7.087, "step": 62500 }, { "epoch": 2.252675964969186, "grad_norm": 0.23163332045078278, "learning_rate": 3.019467145525523e-05, "loss": 0.3957, "step": 62505 }, { "epoch": 2.2528561646304106, "grad_norm": 0.21983997523784637, "learning_rate": 3.0191816977038622e-05, "loss": 0.3898, "step": 62510 }, { "epoch": 2.253036364291635, "grad_norm": 0.1701703816652298, "learning_rate": 3.01889624280845e-05, "loss": 0.3719, "step": 62515 }, { "epoch": 2.2532165639528596, 
"grad_norm": 0.17912349104881287, "learning_rate": 3.018610780843178e-05, "loss": 0.3743, "step": 62520 }, { "epoch": 2.2533967636140844, "grad_norm": 0.20103944838047028, "learning_rate": 3.0183253118119342e-05, "loss": 0.4066, "step": 62525 }, { "epoch": 2.253576963275309, "grad_norm": 0.20848768949508667, "learning_rate": 3.018039835718608e-05, "loss": 0.4139, "step": 62530 }, { "epoch": 2.253757162936534, "grad_norm": 0.1789763867855072, "learning_rate": 3.0177543525670898e-05, "loss": 0.404, "step": 62535 }, { "epoch": 2.2539373625977586, "grad_norm": 0.1968272179365158, "learning_rate": 3.0174688623612684e-05, "loss": 0.39, "step": 62540 }, { "epoch": 2.254117562258983, "grad_norm": 0.2528758645057678, "learning_rate": 3.0171833651050352e-05, "loss": 0.4127, "step": 62545 }, { "epoch": 2.2542977619202076, "grad_norm": 0.23864948749542236, "learning_rate": 3.016897860802277e-05, "loss": 0.3495, "step": 62550 }, { "epoch": 2.2544779615814323, "grad_norm": 0.21808521449565887, "learning_rate": 3.0166123494568865e-05, "loss": 0.3913, "step": 62555 }, { "epoch": 2.254658161242657, "grad_norm": 0.17956504225730896, "learning_rate": 3.016326831072752e-05, "loss": 0.3843, "step": 62560 }, { "epoch": 2.2548383609038813, "grad_norm": 0.1821889877319336, "learning_rate": 3.0160413056537656e-05, "loss": 0.376, "step": 62565 }, { "epoch": 2.255018560565106, "grad_norm": 0.1842850148677826, "learning_rate": 3.0157557732038155e-05, "loss": 0.3938, "step": 62570 }, { "epoch": 2.255198760226331, "grad_norm": 0.17056186497211456, "learning_rate": 3.0154702337267926e-05, "loss": 0.3657, "step": 62575 }, { "epoch": 2.2553789598875555, "grad_norm": 0.23995059728622437, "learning_rate": 3.0151846872265883e-05, "loss": 0.4019, "step": 62580 }, { "epoch": 2.2555591595487803, "grad_norm": 0.25965413451194763, "learning_rate": 3.014899133707092e-05, "loss": 0.4349, "step": 62585 }, { "epoch": 2.2557393592100046, "grad_norm": 0.16279491782188416, "learning_rate": 
3.0146135731721946e-05, "loss": 0.3995, "step": 62590 }, { "epoch": 2.2559195588712293, "grad_norm": 0.2433793842792511, "learning_rate": 3.0143280056257874e-05, "loss": 0.3998, "step": 62595 }, { "epoch": 2.256099758532454, "grad_norm": 0.17817862331867218, "learning_rate": 3.0140424310717597e-05, "loss": 0.3904, "step": 62600 }, { "epoch": 2.2562799581936788, "grad_norm": 0.18165291845798492, "learning_rate": 3.0137568495140045e-05, "loss": 0.3814, "step": 62605 }, { "epoch": 2.256460157854903, "grad_norm": 0.17060628533363342, "learning_rate": 3.0134712609564114e-05, "loss": 0.4033, "step": 62610 }, { "epoch": 2.256640357516128, "grad_norm": 0.19508397579193115, "learning_rate": 3.0131856654028716e-05, "loss": 0.3854, "step": 62615 }, { "epoch": 2.2568205571773525, "grad_norm": 0.21551352739334106, "learning_rate": 3.012900062857276e-05, "loss": 0.4103, "step": 62620 }, { "epoch": 2.2570007568385773, "grad_norm": 0.20918194949626923, "learning_rate": 3.0126144533235172e-05, "loss": 0.3812, "step": 62625 }, { "epoch": 2.257180956499802, "grad_norm": 0.2055574357509613, "learning_rate": 3.0123288368054857e-05, "loss": 0.3614, "step": 62630 }, { "epoch": 2.2573611561610263, "grad_norm": 0.201625257730484, "learning_rate": 3.012043213307072e-05, "loss": 0.3726, "step": 62635 }, { "epoch": 2.257541355822251, "grad_norm": 0.2094610631465912, "learning_rate": 3.0117575828321697e-05, "loss": 0.3753, "step": 62640 }, { "epoch": 2.2577215554834758, "grad_norm": 0.19469302892684937, "learning_rate": 3.0114719453846684e-05, "loss": 0.3766, "step": 62645 }, { "epoch": 2.2579017551447005, "grad_norm": 0.26172351837158203, "learning_rate": 3.0111863009684627e-05, "loss": 0.4036, "step": 62650 }, { "epoch": 2.2580819548059248, "grad_norm": 0.16828452050685883, "learning_rate": 3.0109006495874408e-05, "loss": 0.3963, "step": 62655 }, { "epoch": 2.2582621544671495, "grad_norm": 0.17929872870445251, "learning_rate": 3.0106149912454974e-05, "loss": 0.3774, "step": 62660 }, { 
"epoch": 2.2584423541283742, "grad_norm": 0.19596625864505768, "learning_rate": 3.0103293259465227e-05, "loss": 0.3821, "step": 62665 }, { "epoch": 2.258622553789599, "grad_norm": 0.17597244679927826, "learning_rate": 3.0100436536944105e-05, "loss": 0.4049, "step": 62670 }, { "epoch": 2.2588027534508237, "grad_norm": 0.22724290192127228, "learning_rate": 3.0097579744930522e-05, "loss": 0.4355, "step": 62675 }, { "epoch": 2.258982953112048, "grad_norm": 0.24098409712314606, "learning_rate": 3.0094722883463396e-05, "loss": 0.4125, "step": 62680 }, { "epoch": 2.2591631527732727, "grad_norm": 0.22858020663261414, "learning_rate": 3.0091865952581665e-05, "loss": 0.4162, "step": 62685 }, { "epoch": 2.2593433524344975, "grad_norm": 0.16358384490013123, "learning_rate": 3.0089008952324243e-05, "loss": 0.3844, "step": 62690 }, { "epoch": 2.259523552095722, "grad_norm": 0.2142842710018158, "learning_rate": 3.0086151882730063e-05, "loss": 0.3829, "step": 62695 }, { "epoch": 2.2597037517569465, "grad_norm": 0.26690196990966797, "learning_rate": 3.0083294743838036e-05, "loss": 0.4132, "step": 62700 }, { "epoch": 2.259883951418171, "grad_norm": 0.16452406346797943, "learning_rate": 3.008043753568711e-05, "loss": 0.3578, "step": 62705 }, { "epoch": 2.260064151079396, "grad_norm": 0.21890616416931152, "learning_rate": 3.0077580258316213e-05, "loss": 0.3875, "step": 62710 }, { "epoch": 2.2602443507406207, "grad_norm": 0.22171573340892792, "learning_rate": 3.0074722911764258e-05, "loss": 0.3939, "step": 62715 }, { "epoch": 2.2604245504018454, "grad_norm": 0.2403387874364853, "learning_rate": 3.007186549607019e-05, "loss": 0.4446, "step": 62720 }, { "epoch": 2.2606047500630697, "grad_norm": 0.1787687987089157, "learning_rate": 3.0069008011272936e-05, "loss": 0.3989, "step": 62725 }, { "epoch": 2.2607849497242944, "grad_norm": 0.17443282902240753, "learning_rate": 3.006615045741143e-05, "loss": 0.4032, "step": 62730 }, { "epoch": 2.260965149385519, "grad_norm": 0.18043413758277893, 
"learning_rate": 3.0063292834524604e-05, "loss": 0.415, "step": 62735 }, { "epoch": 2.261145349046744, "grad_norm": 0.22148096561431885, "learning_rate": 3.0060435142651387e-05, "loss": 0.423, "step": 62740 }, { "epoch": 2.261325548707968, "grad_norm": 0.18598996102809906, "learning_rate": 3.0057577381830732e-05, "loss": 0.3616, "step": 62745 }, { "epoch": 2.261505748369193, "grad_norm": 0.17888624966144562, "learning_rate": 3.005471955210156e-05, "loss": 0.3796, "step": 62750 }, { "epoch": 2.2616859480304177, "grad_norm": 0.18961434066295624, "learning_rate": 3.0051861653502815e-05, "loss": 0.3933, "step": 62755 }, { "epoch": 2.2618661476916424, "grad_norm": 0.23519083857536316, "learning_rate": 3.004900368607343e-05, "loss": 0.3984, "step": 62760 }, { "epoch": 2.262046347352867, "grad_norm": 0.24493834376335144, "learning_rate": 3.0046145649852347e-05, "loss": 0.3918, "step": 62765 }, { "epoch": 2.2622265470140914, "grad_norm": 0.16111089289188385, "learning_rate": 3.0043287544878513e-05, "loss": 0.4249, "step": 62770 }, { "epoch": 2.262406746675316, "grad_norm": 0.24691814184188843, "learning_rate": 3.0040429371190855e-05, "loss": 0.3989, "step": 62775 }, { "epoch": 2.262586946336541, "grad_norm": 0.22405491769313812, "learning_rate": 3.0037571128828323e-05, "loss": 0.3911, "step": 62780 }, { "epoch": 2.2627671459977656, "grad_norm": 0.16816742718219757, "learning_rate": 3.003471281782986e-05, "loss": 0.3844, "step": 62785 }, { "epoch": 2.26294734565899, "grad_norm": 0.21033096313476562, "learning_rate": 3.0031854438234413e-05, "loss": 0.4017, "step": 62790 }, { "epoch": 2.2631275453202147, "grad_norm": 0.16821134090423584, "learning_rate": 3.002899599008092e-05, "loss": 0.3908, "step": 62795 }, { "epoch": 2.2633077449814394, "grad_norm": 0.23085516691207886, "learning_rate": 3.0026137473408332e-05, "loss": 0.4195, "step": 62800 }, { "epoch": 2.263487944642664, "grad_norm": 0.22191166877746582, "learning_rate": 3.0023278888255595e-05, "loss": 0.4187, "step": 
62805 }, { "epoch": 2.263668144303889, "grad_norm": 0.173319473862648, "learning_rate": 3.0020420234661655e-05, "loss": 0.3752, "step": 62810 }, { "epoch": 2.2638483439651136, "grad_norm": 0.20444796979427338, "learning_rate": 3.001756151266546e-05, "loss": 0.4048, "step": 62815 }, { "epoch": 2.264028543626338, "grad_norm": 0.18994398415088654, "learning_rate": 3.0014702722305958e-05, "loss": 0.4, "step": 62820 }, { "epoch": 2.2642087432875626, "grad_norm": 0.1826678216457367, "learning_rate": 3.0011843863622112e-05, "loss": 0.4135, "step": 62825 }, { "epoch": 2.2643889429487873, "grad_norm": 0.19503554701805115, "learning_rate": 3.000898493665285e-05, "loss": 0.4056, "step": 62830 }, { "epoch": 2.264569142610012, "grad_norm": 0.22407963871955872, "learning_rate": 3.0006125941437157e-05, "loss": 0.4232, "step": 62835 }, { "epoch": 2.2647493422712364, "grad_norm": 0.1850503832101822, "learning_rate": 3.000326687801395e-05, "loss": 0.4157, "step": 62840 }, { "epoch": 2.264929541932461, "grad_norm": 0.17998819053173065, "learning_rate": 3.000040774642221e-05, "loss": 0.3862, "step": 62845 }, { "epoch": 2.265109741593686, "grad_norm": 0.27859246730804443, "learning_rate": 2.999754854670087e-05, "loss": 0.4156, "step": 62850 }, { "epoch": 2.2652899412549106, "grad_norm": 0.21499688923358917, "learning_rate": 2.9994689278888914e-05, "loss": 0.3995, "step": 62855 }, { "epoch": 2.2654701409161353, "grad_norm": 0.1794780045747757, "learning_rate": 2.999182994302528e-05, "loss": 0.3926, "step": 62860 }, { "epoch": 2.2656503405773596, "grad_norm": 0.20833353698253632, "learning_rate": 2.998897053914892e-05, "loss": 0.4182, "step": 62865 }, { "epoch": 2.2658305402385843, "grad_norm": 0.2357635349035263, "learning_rate": 2.998611106729881e-05, "loss": 0.4268, "step": 62870 }, { "epoch": 2.266010739899809, "grad_norm": 0.18633919954299927, "learning_rate": 2.9983251527513906e-05, "loss": 0.3511, "step": 62875 }, { "epoch": 2.266190939561034, "grad_norm": 0.2481304407119751, 
"learning_rate": 2.9980391919833156e-05, "loss": 0.3642, "step": 62880 }, { "epoch": 2.266371139222258, "grad_norm": 0.24019500613212585, "learning_rate": 2.9977532244295537e-05, "loss": 0.391, "step": 62885 }, { "epoch": 2.266551338883483, "grad_norm": 0.21231551468372345, "learning_rate": 2.9974672500939994e-05, "loss": 0.3907, "step": 62890 }, { "epoch": 2.2667315385447075, "grad_norm": 0.19400948286056519, "learning_rate": 2.997181268980552e-05, "loss": 0.3995, "step": 62895 }, { "epoch": 2.2669117382059323, "grad_norm": 0.18428564071655273, "learning_rate": 2.9968952810931044e-05, "loss": 0.4017, "step": 62900 }, { "epoch": 2.267091937867157, "grad_norm": 0.20284946262836456, "learning_rate": 2.9966092864355556e-05, "loss": 0.3902, "step": 62905 }, { "epoch": 2.2672721375283813, "grad_norm": 0.21810269355773926, "learning_rate": 2.9963232850118006e-05, "loss": 0.4044, "step": 62910 }, { "epoch": 2.267452337189606, "grad_norm": 0.18810847401618958, "learning_rate": 2.9960372768257378e-05, "loss": 0.4189, "step": 62915 }, { "epoch": 2.2676325368508308, "grad_norm": 0.2173568606376648, "learning_rate": 2.9957512618812634e-05, "loss": 0.3888, "step": 62920 }, { "epoch": 2.2678127365120555, "grad_norm": 0.2114708125591278, "learning_rate": 2.9954652401822732e-05, "loss": 0.4262, "step": 62925 }, { "epoch": 2.26799293617328, "grad_norm": 0.20578879117965698, "learning_rate": 2.9951792117326648e-05, "loss": 0.3686, "step": 62930 }, { "epoch": 2.2681731358345045, "grad_norm": 0.18849804997444153, "learning_rate": 2.9948931765363364e-05, "loss": 0.4066, "step": 62935 }, { "epoch": 2.2683533354957293, "grad_norm": 0.2172732949256897, "learning_rate": 2.9946071345971842e-05, "loss": 0.4361, "step": 62940 }, { "epoch": 2.268533535156954, "grad_norm": 0.19015686213970184, "learning_rate": 2.994321085919105e-05, "loss": 0.4048, "step": 62945 }, { "epoch": 2.2687137348181787, "grad_norm": 0.18530982732772827, "learning_rate": 2.9940350305059972e-05, "loss": 0.3978, "step": 
62950 }, { "epoch": 2.268893934479403, "grad_norm": 0.2398093044757843, "learning_rate": 2.9937489683617577e-05, "loss": 0.4006, "step": 62955 }, { "epoch": 2.2690741341406278, "grad_norm": 0.18880394101142883, "learning_rate": 2.9934628994902836e-05, "loss": 0.3719, "step": 62960 }, { "epoch": 2.2692543338018525, "grad_norm": 0.21370282769203186, "learning_rate": 2.993176823895474e-05, "loss": 0.4061, "step": 62965 }, { "epoch": 2.269434533463077, "grad_norm": 0.20947441458702087, "learning_rate": 2.992890741581224e-05, "loss": 0.4062, "step": 62970 }, { "epoch": 2.2696147331243015, "grad_norm": 0.18674728274345398, "learning_rate": 2.9926046525514345e-05, "loss": 0.3753, "step": 62975 }, { "epoch": 2.2697949327855262, "grad_norm": 0.19669872522354126, "learning_rate": 2.9923185568100014e-05, "loss": 0.3642, "step": 62980 }, { "epoch": 2.269975132446751, "grad_norm": 0.18584516644477844, "learning_rate": 2.992032454360824e-05, "loss": 0.373, "step": 62985 }, { "epoch": 2.2701553321079757, "grad_norm": 0.19654756784439087, "learning_rate": 2.9917463452077986e-05, "loss": 0.3898, "step": 62990 }, { "epoch": 2.2703355317692004, "grad_norm": 0.18995268642902374, "learning_rate": 2.991460229354825e-05, "loss": 0.4269, "step": 62995 }, { "epoch": 2.2705157314304247, "grad_norm": 0.2119891196489334, "learning_rate": 2.9911741068058012e-05, "loss": 0.3993, "step": 63000 }, { "epoch": 2.2705157314304247, "eval_loss": 0.4354766011238098, "eval_runtime": 3.5274, "eval_samples_per_second": 28.349, "eval_steps_per_second": 7.087, "step": 63000 }, { "epoch": 2.2706959310916495, "grad_norm": 0.22078485786914825, "learning_rate": 2.9908879775646247e-05, "loss": 0.3753, "step": 63005 }, { "epoch": 2.270876130752874, "grad_norm": 0.20009629428386688, "learning_rate": 2.990601841635195e-05, "loss": 0.4031, "step": 63010 }, { "epoch": 2.271056330414099, "grad_norm": 0.17347082495689392, "learning_rate": 2.9903156990214097e-05, "loss": 0.3555, "step": 63015 }, { "epoch": 
2.2712365300753232, "grad_norm": 0.261214941740036, "learning_rate": 2.9900295497271687e-05, "loss": 0.3902, "step": 63020 }, { "epoch": 2.271416729736548, "grad_norm": 0.19732660055160522, "learning_rate": 2.9897433937563696e-05, "loss": 0.4313, "step": 63025 }, { "epoch": 2.2715969293977727, "grad_norm": 0.22915062308311462, "learning_rate": 2.989457231112911e-05, "loss": 0.4075, "step": 63030 }, { "epoch": 2.2717771290589974, "grad_norm": 0.20028504729270935, "learning_rate": 2.989171061800693e-05, "loss": 0.4022, "step": 63035 }, { "epoch": 2.271957328720222, "grad_norm": 0.23854713141918182, "learning_rate": 2.988884885823614e-05, "loss": 0.3984, "step": 63040 }, { "epoch": 2.272137528381447, "grad_norm": 0.2059304416179657, "learning_rate": 2.9885987031855733e-05, "loss": 0.4019, "step": 63045 }, { "epoch": 2.272317728042671, "grad_norm": 0.16523705422878265, "learning_rate": 2.9883125138904693e-05, "loss": 0.3699, "step": 63050 }, { "epoch": 2.272497927703896, "grad_norm": 0.20402154326438904, "learning_rate": 2.988026317942202e-05, "loss": 0.3868, "step": 63055 }, { "epoch": 2.2726781273651206, "grad_norm": 0.22401557862758636, "learning_rate": 2.987740115344671e-05, "loss": 0.3785, "step": 63060 }, { "epoch": 2.2728583270263454, "grad_norm": 0.21607942879199982, "learning_rate": 2.9874539061017746e-05, "loss": 0.3896, "step": 63065 }, { "epoch": 2.2730385266875697, "grad_norm": 0.2318323701620102, "learning_rate": 2.987167690217414e-05, "loss": 0.4374, "step": 63070 }, { "epoch": 2.2732187263487944, "grad_norm": 0.20749138295650482, "learning_rate": 2.986881467695487e-05, "loss": 0.4096, "step": 63075 }, { "epoch": 2.273398926010019, "grad_norm": 0.20667487382888794, "learning_rate": 2.9865952385398955e-05, "loss": 0.4047, "step": 63080 }, { "epoch": 2.273579125671244, "grad_norm": 0.23267313838005066, "learning_rate": 2.9863090027545364e-05, "loss": 0.4173, "step": 63085 }, { "epoch": 2.2737593253324686, "grad_norm": 0.19276480376720428, "learning_rate": 
2.986022760343313e-05, "loss": 0.3679, "step": 63090 }, { "epoch": 2.273939524993693, "grad_norm": 0.20015855133533478, "learning_rate": 2.985736511310122e-05, "loss": 0.38, "step": 63095 }, { "epoch": 2.2741197246549176, "grad_norm": 0.2319357693195343, "learning_rate": 2.985450255658866e-05, "loss": 0.3908, "step": 63100 }, { "epoch": 2.2742999243161424, "grad_norm": 0.2397158294916153, "learning_rate": 2.9851639933934446e-05, "loss": 0.3892, "step": 63105 }, { "epoch": 2.274480123977367, "grad_norm": 0.162200927734375, "learning_rate": 2.9848777245177568e-05, "loss": 0.4061, "step": 63110 }, { "epoch": 2.2746603236385914, "grad_norm": 0.2283955216407776, "learning_rate": 2.9845914490357046e-05, "loss": 0.4059, "step": 63115 }, { "epoch": 2.274840523299816, "grad_norm": 0.20629531145095825, "learning_rate": 2.9843051669511872e-05, "loss": 0.3769, "step": 63120 }, { "epoch": 2.275020722961041, "grad_norm": 0.16891133785247803, "learning_rate": 2.9840188782681062e-05, "loss": 0.3967, "step": 63125 }, { "epoch": 2.2752009226222656, "grad_norm": 0.19010712206363678, "learning_rate": 2.983732582990361e-05, "loss": 0.4143, "step": 63130 }, { "epoch": 2.2753811222834903, "grad_norm": 0.2108030766248703, "learning_rate": 2.9834462811218534e-05, "loss": 0.3775, "step": 63135 }, { "epoch": 2.2755613219447146, "grad_norm": 0.21494841575622559, "learning_rate": 2.9831599726664844e-05, "loss": 0.3723, "step": 63140 }, { "epoch": 2.2757415216059393, "grad_norm": 0.19775624573230743, "learning_rate": 2.9828736576281535e-05, "loss": 0.4012, "step": 63145 }, { "epoch": 2.275921721267164, "grad_norm": 0.20917865633964539, "learning_rate": 2.9825873360107626e-05, "loss": 0.3865, "step": 63150 }, { "epoch": 2.276101920928389, "grad_norm": 0.1742408573627472, "learning_rate": 2.982301007818213e-05, "loss": 0.3552, "step": 63155 }, { "epoch": 2.276282120589613, "grad_norm": 0.20653748512268066, "learning_rate": 2.9820146730544052e-05, "loss": 0.3645, "step": 63160 }, { "epoch": 
2.276462320250838, "grad_norm": 0.20753364264965057, "learning_rate": 2.9817283317232413e-05, "loss": 0.3959, "step": 63165 }, { "epoch": 2.2766425199120626, "grad_norm": 0.20545120537281036, "learning_rate": 2.9814419838286213e-05, "loss": 0.4067, "step": 63170 }, { "epoch": 2.2768227195732873, "grad_norm": 0.1817047894001007, "learning_rate": 2.981155629374448e-05, "loss": 0.4138, "step": 63175 }, { "epoch": 2.277002919234512, "grad_norm": 0.1619819700717926, "learning_rate": 2.980869268364622e-05, "loss": 0.4093, "step": 63180 }, { "epoch": 2.2771831188957363, "grad_norm": 0.18216536939144135, "learning_rate": 2.9805829008030466e-05, "loss": 0.388, "step": 63185 }, { "epoch": 2.277363318556961, "grad_norm": 0.23608942329883575, "learning_rate": 2.9802965266936213e-05, "loss": 0.3851, "step": 63190 }, { "epoch": 2.277543518218186, "grad_norm": 0.22420279681682587, "learning_rate": 2.9800101460402484e-05, "loss": 0.3876, "step": 63195 }, { "epoch": 2.2777237178794105, "grad_norm": 0.20337367057800293, "learning_rate": 2.9797237588468308e-05, "loss": 0.398, "step": 63200 }, { "epoch": 2.277903917540635, "grad_norm": 0.20395952463150024, "learning_rate": 2.9794373651172696e-05, "loss": 0.4199, "step": 63205 }, { "epoch": 2.2780841172018595, "grad_norm": 0.19904713332653046, "learning_rate": 2.9791509648554678e-05, "loss": 0.3991, "step": 63210 }, { "epoch": 2.2782643168630843, "grad_norm": 0.17146888375282288, "learning_rate": 2.9788645580653257e-05, "loss": 0.3952, "step": 63215 }, { "epoch": 2.278444516524309, "grad_norm": 0.2011823207139969, "learning_rate": 2.9785781447507476e-05, "loss": 0.3915, "step": 63220 }, { "epoch": 2.2786247161855337, "grad_norm": 0.2076421082019806, "learning_rate": 2.9782917249156346e-05, "loss": 0.386, "step": 63225 }, { "epoch": 2.278804915846758, "grad_norm": 0.18694275617599487, "learning_rate": 2.9780052985638896e-05, "loss": 0.3882, "step": 63230 }, { "epoch": 2.2789851155079828, "grad_norm": 0.2005622684955597, "learning_rate": 
2.9777188656994147e-05, "loss": 0.3676, "step": 63235 }, { "epoch": 2.2791653151692075, "grad_norm": 0.17751555144786835, "learning_rate": 2.9774324263261126e-05, "loss": 0.3941, "step": 63240 }, { "epoch": 2.2793455148304322, "grad_norm": 0.20505878329277039, "learning_rate": 2.9771459804478868e-05, "loss": 0.3837, "step": 63245 }, { "epoch": 2.2795257144916565, "grad_norm": 0.20655593276023865, "learning_rate": 2.976859528068639e-05, "loss": 0.4162, "step": 63250 }, { "epoch": 2.2797059141528813, "grad_norm": 0.21149560809135437, "learning_rate": 2.9765730691922723e-05, "loss": 0.3893, "step": 63255 }, { "epoch": 2.279886113814106, "grad_norm": 0.2167680561542511, "learning_rate": 2.97628660382269e-05, "loss": 0.4075, "step": 63260 }, { "epoch": 2.2800663134753307, "grad_norm": 0.23179109394550323, "learning_rate": 2.9760001319637955e-05, "loss": 0.3927, "step": 63265 }, { "epoch": 2.2802465131365555, "grad_norm": 0.19186845421791077, "learning_rate": 2.97571365361949e-05, "loss": 0.3835, "step": 63270 }, { "epoch": 2.2804267127977798, "grad_norm": 0.21390306949615479, "learning_rate": 2.975427168793679e-05, "loss": 0.4458, "step": 63275 }, { "epoch": 2.2806069124590045, "grad_norm": 0.2087387591600418, "learning_rate": 2.9751406774902645e-05, "loss": 0.3978, "step": 63280 }, { "epoch": 2.280787112120229, "grad_norm": 0.2157135307788849, "learning_rate": 2.9748541797131506e-05, "loss": 0.4081, "step": 63285 }, { "epoch": 2.280967311781454, "grad_norm": 0.18895265460014343, "learning_rate": 2.9745676754662405e-05, "loss": 0.3776, "step": 63290 }, { "epoch": 2.2811475114426782, "grad_norm": 0.2101006656885147, "learning_rate": 2.9742811647534373e-05, "loss": 0.3925, "step": 63295 }, { "epoch": 2.281327711103903, "grad_norm": 0.21534328162670135, "learning_rate": 2.9739946475786452e-05, "loss": 0.4033, "step": 63300 }, { "epoch": 2.2815079107651277, "grad_norm": 0.17622357606887817, "learning_rate": 2.9737081239457683e-05, "loss": 0.41, "step": 63305 }, { "epoch": 
2.2816881104263524, "grad_norm": 0.2728975713253021, "learning_rate": 2.973421593858709e-05, "loss": 0.4151, "step": 63310 }, { "epoch": 2.281868310087577, "grad_norm": 0.18991267681121826, "learning_rate": 2.973135057321373e-05, "loss": 0.426, "step": 63315 }, { "epoch": 2.282048509748802, "grad_norm": 0.16258344054222107, "learning_rate": 2.972848514337662e-05, "loss": 0.3736, "step": 63320 }, { "epoch": 2.282228709410026, "grad_norm": 0.19868828356266022, "learning_rate": 2.972561964911484e-05, "loss": 0.4063, "step": 63325 }, { "epoch": 2.282408909071251, "grad_norm": 0.21427805721759796, "learning_rate": 2.9722754090467385e-05, "loss": 0.3628, "step": 63330 }, { "epoch": 2.2825891087324757, "grad_norm": 0.16630151867866516, "learning_rate": 2.9719888467473333e-05, "loss": 0.3724, "step": 63335 }, { "epoch": 2.2827693083937004, "grad_norm": 0.20577584207057953, "learning_rate": 2.9717022780171704e-05, "loss": 0.4154, "step": 63340 }, { "epoch": 2.2829495080549247, "grad_norm": 0.24059414863586426, "learning_rate": 2.9714157028601558e-05, "loss": 0.3972, "step": 63345 }, { "epoch": 2.2831297077161494, "grad_norm": 0.1928698569536209, "learning_rate": 2.971129121280194e-05, "loss": 0.3794, "step": 63350 }, { "epoch": 2.283309907377374, "grad_norm": 0.20987224578857422, "learning_rate": 2.9708425332811883e-05, "loss": 0.3816, "step": 63355 }, { "epoch": 2.283490107038599, "grad_norm": 0.23037847876548767, "learning_rate": 2.9705559388670446e-05, "loss": 0.4389, "step": 63360 }, { "epoch": 2.2836703066998236, "grad_norm": 0.19015249609947205, "learning_rate": 2.970269338041668e-05, "loss": 0.3849, "step": 63365 }, { "epoch": 2.283850506361048, "grad_norm": 0.21437907218933105, "learning_rate": 2.9699827308089617e-05, "loss": 0.4241, "step": 63370 }, { "epoch": 2.2840307060222726, "grad_norm": 0.2129935771226883, "learning_rate": 2.969696117172832e-05, "loss": 0.402, "step": 63375 }, { "epoch": 2.2842109056834974, "grad_norm": 0.2034277319908142, "learning_rate": 
2.969409497137184e-05, "loss": 0.4077, "step": 63380 }, { "epoch": 2.284391105344722, "grad_norm": 0.21894314885139465, "learning_rate": 2.9691228707059216e-05, "loss": 0.4069, "step": 63385 }, { "epoch": 2.2845713050059464, "grad_norm": 0.20878823101520538, "learning_rate": 2.9688362378829514e-05, "loss": 0.3912, "step": 63390 }, { "epoch": 2.284751504667171, "grad_norm": 0.21701130270957947, "learning_rate": 2.968549598672179e-05, "loss": 0.3831, "step": 63395 }, { "epoch": 2.284931704328396, "grad_norm": 0.20921416580677032, "learning_rate": 2.9682629530775075e-05, "loss": 0.3879, "step": 63400 }, { "epoch": 2.2851119039896206, "grad_norm": 0.22157233953475952, "learning_rate": 2.967976301102845e-05, "loss": 0.4179, "step": 63405 }, { "epoch": 2.2852921036508453, "grad_norm": 0.2359781712293625, "learning_rate": 2.967689642752096e-05, "loss": 0.3867, "step": 63410 }, { "epoch": 2.2854723033120696, "grad_norm": 0.18056300282478333, "learning_rate": 2.967402978029166e-05, "loss": 0.3854, "step": 63415 }, { "epoch": 2.2856525029732944, "grad_norm": 0.1796717792749405, "learning_rate": 2.967116306937961e-05, "loss": 0.3722, "step": 63420 }, { "epoch": 2.285832702634519, "grad_norm": 0.21330176293849945, "learning_rate": 2.9668296294823862e-05, "loss": 0.4205, "step": 63425 }, { "epoch": 2.286012902295744, "grad_norm": 0.16301804780960083, "learning_rate": 2.966542945666349e-05, "loss": 0.4064, "step": 63430 }, { "epoch": 2.286193101956968, "grad_norm": 0.2423224300146103, "learning_rate": 2.9662562554937534e-05, "loss": 0.4057, "step": 63435 }, { "epoch": 2.286373301618193, "grad_norm": 0.18112295866012573, "learning_rate": 2.9659695589685076e-05, "loss": 0.3972, "step": 63440 }, { "epoch": 2.2865535012794176, "grad_norm": 0.1735352873802185, "learning_rate": 2.965682856094516e-05, "loss": 0.391, "step": 63445 }, { "epoch": 2.2867337009406423, "grad_norm": 0.208638533949852, "learning_rate": 2.9653961468756863e-05, "loss": 0.3966, "step": 63450 }, { "epoch": 
2.286913900601867, "grad_norm": 0.19702517986297607, "learning_rate": 2.965109431315924e-05, "loss": 0.412, "step": 63455 }, { "epoch": 2.2870941002630913, "grad_norm": 0.21106953918933868, "learning_rate": 2.964822709419135e-05, "loss": 0.4016, "step": 63460 }, { "epoch": 2.287274299924316, "grad_norm": 0.18838860094547272, "learning_rate": 2.9645359811892275e-05, "loss": 0.3739, "step": 63465 }, { "epoch": 2.287454499585541, "grad_norm": 0.22702044248580933, "learning_rate": 2.964249246630107e-05, "loss": 0.3855, "step": 63470 }, { "epoch": 2.2876346992467655, "grad_norm": 0.18174223601818085, "learning_rate": 2.9639625057456805e-05, "loss": 0.3885, "step": 63475 }, { "epoch": 2.28781489890799, "grad_norm": 0.19665606319904327, "learning_rate": 2.9636757585398544e-05, "loss": 0.4166, "step": 63480 }, { "epoch": 2.2879950985692146, "grad_norm": 0.1689072996377945, "learning_rate": 2.9633890050165357e-05, "loss": 0.3923, "step": 63485 }, { "epoch": 2.2881752982304393, "grad_norm": 0.20292653143405914, "learning_rate": 2.963102245179632e-05, "loss": 0.3948, "step": 63490 }, { "epoch": 2.288355497891664, "grad_norm": 0.1763313263654709, "learning_rate": 2.9628154790330498e-05, "loss": 0.3818, "step": 63495 }, { "epoch": 2.2885356975528888, "grad_norm": 0.2226899415254593, "learning_rate": 2.9625287065806962e-05, "loss": 0.393, "step": 63500 }, { "epoch": 2.2885356975528888, "eval_loss": 0.4357220530509949, "eval_runtime": 3.5317, "eval_samples_per_second": 28.315, "eval_steps_per_second": 7.079, "step": 63500 }, { "epoch": 2.288715897214113, "grad_norm": 0.17390502989292145, "learning_rate": 2.962241927826478e-05, "loss": 0.4268, "step": 63505 }, { "epoch": 2.288896096875338, "grad_norm": 0.250418484210968, "learning_rate": 2.9619551427743042e-05, "loss": 0.4187, "step": 63510 }, { "epoch": 2.2890762965365625, "grad_norm": 0.25360366702079773, "learning_rate": 2.9616683514280798e-05, "loss": 0.3917, "step": 63515 }, { "epoch": 2.2892564961977873, "grad_norm": 
0.18742060661315918, "learning_rate": 2.9613815537917145e-05, "loss": 0.3793, "step": 63520 }, { "epoch": 2.2894366958590116, "grad_norm": 0.17973770201206207, "learning_rate": 2.961094749869114e-05, "loss": 0.3881, "step": 63525 }, { "epoch": 2.2896168955202363, "grad_norm": 0.229409322142601, "learning_rate": 2.9608079396641868e-05, "loss": 0.4357, "step": 63530 }, { "epoch": 2.289797095181461, "grad_norm": 0.2348993420600891, "learning_rate": 2.9605211231808417e-05, "loss": 0.4039, "step": 63535 }, { "epoch": 2.2899772948426858, "grad_norm": 0.21660517156124115, "learning_rate": 2.9602343004229842e-05, "loss": 0.4293, "step": 63540 }, { "epoch": 2.2901574945039105, "grad_norm": 0.1841536909341812, "learning_rate": 2.9599474713945242e-05, "loss": 0.394, "step": 63545 }, { "epoch": 2.290337694165135, "grad_norm": 0.2169848531484604, "learning_rate": 2.959660636099369e-05, "loss": 0.3922, "step": 63550 }, { "epoch": 2.2905178938263595, "grad_norm": 0.22825659811496735, "learning_rate": 2.9593737945414264e-05, "loss": 0.3977, "step": 63555 }, { "epoch": 2.2906980934875842, "grad_norm": 0.21935486793518066, "learning_rate": 2.9590869467246047e-05, "loss": 0.4143, "step": 63560 }, { "epoch": 2.290878293148809, "grad_norm": 0.18911194801330566, "learning_rate": 2.9588000926528126e-05, "loss": 0.3835, "step": 63565 }, { "epoch": 2.2910584928100337, "grad_norm": 0.16770300269126892, "learning_rate": 2.958513232329957e-05, "loss": 0.4211, "step": 63570 }, { "epoch": 2.291238692471258, "grad_norm": 0.20675276219844818, "learning_rate": 2.9582263657599485e-05, "loss": 0.384, "step": 63575 }, { "epoch": 2.2914188921324827, "grad_norm": 0.26239025592803955, "learning_rate": 2.9579394929466943e-05, "loss": 0.394, "step": 63580 }, { "epoch": 2.2915990917937075, "grad_norm": 0.2336747944355011, "learning_rate": 2.9576526138941025e-05, "loss": 0.4154, "step": 63585 }, { "epoch": 2.291779291454932, "grad_norm": 0.18274347484111786, "learning_rate": 2.957365728606083e-05, "loss": 
0.3395, "step": 63590 }, { "epoch": 2.291959491116157, "grad_norm": 0.17019642889499664, "learning_rate": 2.9570788370865443e-05, "loss": 0.3737, "step": 63595 }, { "epoch": 2.2921396907773812, "grad_norm": 0.2089766263961792, "learning_rate": 2.956791939339394e-05, "loss": 0.3783, "step": 63600 }, { "epoch": 2.292319890438606, "grad_norm": 0.2029276192188263, "learning_rate": 2.956505035368543e-05, "loss": 0.4223, "step": 63605 }, { "epoch": 2.2925000900998307, "grad_norm": 0.1698850840330124, "learning_rate": 2.9562181251778986e-05, "loss": 0.4032, "step": 63610 }, { "epoch": 2.2926802897610554, "grad_norm": 0.17825381457805634, "learning_rate": 2.9559312087713714e-05, "loss": 0.3808, "step": 63615 }, { "epoch": 2.2928604894222797, "grad_norm": 0.21084287762641907, "learning_rate": 2.9556442861528688e-05, "loss": 0.4042, "step": 63620 }, { "epoch": 2.2930406890835044, "grad_norm": 0.21861131489276886, "learning_rate": 2.9553573573263016e-05, "loss": 0.4222, "step": 63625 }, { "epoch": 2.293220888744729, "grad_norm": 0.1877928525209427, "learning_rate": 2.9550704222955778e-05, "loss": 0.378, "step": 63630 }, { "epoch": 2.293401088405954, "grad_norm": 0.23301534354686737, "learning_rate": 2.954783481064608e-05, "loss": 0.407, "step": 63635 }, { "epoch": 2.2935812880671786, "grad_norm": 0.2327142059803009, "learning_rate": 2.954496533637302e-05, "loss": 0.396, "step": 63640 }, { "epoch": 2.293761487728403, "grad_norm": 0.22101731598377228, "learning_rate": 2.954209580017568e-05, "loss": 0.378, "step": 63645 }, { "epoch": 2.2939416873896277, "grad_norm": 0.22895368933677673, "learning_rate": 2.9539226202093162e-05, "loss": 0.4179, "step": 63650 }, { "epoch": 2.2941218870508524, "grad_norm": 0.19855442643165588, "learning_rate": 2.9536356542164573e-05, "loss": 0.3768, "step": 63655 }, { "epoch": 2.294302086712077, "grad_norm": 0.20044022798538208, "learning_rate": 2.9533486820429e-05, "loss": 0.3758, "step": 63660 }, { "epoch": 2.2944822863733014, "grad_norm": 
0.23310525715351105, "learning_rate": 2.9530617036925545e-05, "loss": 0.4063, "step": 63665 }, { "epoch": 2.294662486034526, "grad_norm": 0.2389223724603653, "learning_rate": 2.9527747191693318e-05, "loss": 0.3821, "step": 63670 }, { "epoch": 2.294842685695751, "grad_norm": 0.21088966727256775, "learning_rate": 2.9524877284771406e-05, "loss": 0.4099, "step": 63675 }, { "epoch": 2.2950228853569756, "grad_norm": 0.19195705652236938, "learning_rate": 2.952200731619892e-05, "loss": 0.3833, "step": 63680 }, { "epoch": 2.2952030850182004, "grad_norm": 0.1707751750946045, "learning_rate": 2.9519137286014957e-05, "loss": 0.3551, "step": 63685 }, { "epoch": 2.2953832846794247, "grad_norm": 0.23566772043704987, "learning_rate": 2.9516267194258618e-05, "loss": 0.3843, "step": 63690 }, { "epoch": 2.2955634843406494, "grad_norm": 0.19582681357860565, "learning_rate": 2.9513397040969025e-05, "loss": 0.4201, "step": 63695 }, { "epoch": 2.295743684001874, "grad_norm": 0.2070237249135971, "learning_rate": 2.9510526826185263e-05, "loss": 0.3976, "step": 63700 }, { "epoch": 2.295923883663099, "grad_norm": 0.19391392171382904, "learning_rate": 2.950765654994645e-05, "loss": 0.4019, "step": 63705 }, { "epoch": 2.296104083324323, "grad_norm": 0.2209123820066452, "learning_rate": 2.9504786212291685e-05, "loss": 0.3918, "step": 63710 }, { "epoch": 2.296284282985548, "grad_norm": 0.18248969316482544, "learning_rate": 2.9501915813260084e-05, "loss": 0.3817, "step": 63715 }, { "epoch": 2.2964644826467726, "grad_norm": 0.17542307078838348, "learning_rate": 2.9499045352890754e-05, "loss": 0.3842, "step": 63720 }, { "epoch": 2.2966446823079973, "grad_norm": 0.19635504484176636, "learning_rate": 2.9496174831222796e-05, "loss": 0.3686, "step": 63725 }, { "epoch": 2.296824881969222, "grad_norm": 0.1903241127729416, "learning_rate": 2.9493304248295327e-05, "loss": 0.3946, "step": 63730 }, { "epoch": 2.2970050816304464, "grad_norm": 0.19925326108932495, "learning_rate": 2.949043360414746e-05, 
"loss": 0.3765, "step": 63735 }, { "epoch": 2.297185281291671, "grad_norm": 0.19926919043064117, "learning_rate": 2.9487562898818304e-05, "loss": 0.4365, "step": 63740 }, { "epoch": 2.297365480952896, "grad_norm": 0.23359358310699463, "learning_rate": 2.9484692132346974e-05, "loss": 0.3822, "step": 63745 }, { "epoch": 2.2975456806141206, "grad_norm": 0.2015082836151123, "learning_rate": 2.9481821304772572e-05, "loss": 0.404, "step": 63750 }, { "epoch": 2.297725880275345, "grad_norm": 0.2047175019979477, "learning_rate": 2.9478950416134243e-05, "loss": 0.4028, "step": 63755 }, { "epoch": 2.2979060799365696, "grad_norm": 0.16767185926437378, "learning_rate": 2.9476079466471063e-05, "loss": 0.412, "step": 63760 }, { "epoch": 2.2980862795977943, "grad_norm": 0.21474462747573853, "learning_rate": 2.9473208455822178e-05, "loss": 0.386, "step": 63765 }, { "epoch": 2.298266479259019, "grad_norm": 0.22542273998260498, "learning_rate": 2.947033738422668e-05, "loss": 0.3809, "step": 63770 }, { "epoch": 2.298446678920244, "grad_norm": 0.18820738792419434, "learning_rate": 2.9467466251723713e-05, "loss": 0.4045, "step": 63775 }, { "epoch": 2.298626878581468, "grad_norm": 0.23645508289337158, "learning_rate": 2.9464595058352383e-05, "loss": 0.3902, "step": 63780 }, { "epoch": 2.298807078242693, "grad_norm": 0.2576570510864258, "learning_rate": 2.9461723804151802e-05, "loss": 0.4025, "step": 63785 }, { "epoch": 2.2989872779039175, "grad_norm": 0.2036072313785553, "learning_rate": 2.9458852489161102e-05, "loss": 0.4015, "step": 63790 }, { "epoch": 2.2991674775651423, "grad_norm": 0.1797807663679123, "learning_rate": 2.94559811134194e-05, "loss": 0.3861, "step": 63795 }, { "epoch": 2.2993476772263666, "grad_norm": 0.2656286954879761, "learning_rate": 2.945310967696583e-05, "loss": 0.3931, "step": 63800 }, { "epoch": 2.2995278768875913, "grad_norm": 0.2161501944065094, "learning_rate": 2.9450238179839483e-05, "loss": 0.4066, "step": 63805 }, { "epoch": 2.299708076548816, 
"grad_norm": 0.24081888794898987, "learning_rate": 2.9447366622079515e-05, "loss": 0.4195, "step": 63810 }, { "epoch": 2.2998882762100408, "grad_norm": 0.18698875606060028, "learning_rate": 2.9444495003725033e-05, "loss": 0.3576, "step": 63815 }, { "epoch": 2.3000684758712655, "grad_norm": 0.22651271522045135, "learning_rate": 2.9441623324815166e-05, "loss": 0.4172, "step": 63820 }, { "epoch": 2.3002486755324902, "grad_norm": 0.23613341152668, "learning_rate": 2.9438751585389047e-05, "loss": 0.4038, "step": 63825 }, { "epoch": 2.3004288751937145, "grad_norm": 0.19608378410339355, "learning_rate": 2.9435879785485788e-05, "loss": 0.3828, "step": 63830 }, { "epoch": 2.3006090748549393, "grad_norm": 0.19310517609119415, "learning_rate": 2.943300792514453e-05, "loss": 0.3665, "step": 63835 }, { "epoch": 2.300789274516164, "grad_norm": 0.23690401017665863, "learning_rate": 2.9430136004404402e-05, "loss": 0.4279, "step": 63840 }, { "epoch": 2.3009694741773887, "grad_norm": 0.201737180352211, "learning_rate": 2.9427264023304523e-05, "loss": 0.3869, "step": 63845 }, { "epoch": 2.301149673838613, "grad_norm": 0.192640483379364, "learning_rate": 2.942439198188403e-05, "loss": 0.4047, "step": 63850 }, { "epoch": 2.3013298734998378, "grad_norm": 0.17696553468704224, "learning_rate": 2.9421519880182047e-05, "loss": 0.3942, "step": 63855 }, { "epoch": 2.3015100731610625, "grad_norm": 0.19767604768276215, "learning_rate": 2.9418647718237724e-05, "loss": 0.4076, "step": 63860 }, { "epoch": 2.301690272822287, "grad_norm": 0.25544825196266174, "learning_rate": 2.9415775496090174e-05, "loss": 0.3982, "step": 63865 }, { "epoch": 2.301870472483512, "grad_norm": 0.20909026265144348, "learning_rate": 2.941290321377854e-05, "loss": 0.4101, "step": 63870 }, { "epoch": 2.3020506721447362, "grad_norm": 0.16474294662475586, "learning_rate": 2.941003087134195e-05, "loss": 0.3868, "step": 63875 }, { "epoch": 2.302230871805961, "grad_norm": 0.18018871545791626, "learning_rate": 
2.940715846881955e-05, "loss": 0.3999, "step": 63880 }, { "epoch": 2.3024110714671857, "grad_norm": 0.20449526607990265, "learning_rate": 2.9404286006250464e-05, "loss": 0.4119, "step": 63885 }, { "epoch": 2.3025912711284104, "grad_norm": 0.2095811516046524, "learning_rate": 2.9401413483673833e-05, "loss": 0.3833, "step": 63890 }, { "epoch": 2.3027714707896347, "grad_norm": 0.19565238058567047, "learning_rate": 2.93985409011288e-05, "loss": 0.3665, "step": 63895 }, { "epoch": 2.3029516704508595, "grad_norm": 0.18948253989219666, "learning_rate": 2.9395668258654497e-05, "loss": 0.3988, "step": 63900 }, { "epoch": 2.303131870112084, "grad_norm": 0.19964943826198578, "learning_rate": 2.9392795556290064e-05, "loss": 0.4107, "step": 63905 }, { "epoch": 2.303312069773309, "grad_norm": 0.2215508222579956, "learning_rate": 2.9389922794074643e-05, "loss": 0.379, "step": 63910 }, { "epoch": 2.3034922694345337, "grad_norm": 0.20707103610038757, "learning_rate": 2.9387049972047376e-05, "loss": 0.3797, "step": 63915 }, { "epoch": 2.303672469095758, "grad_norm": 0.24852395057678223, "learning_rate": 2.93841770902474e-05, "loss": 0.4041, "step": 63920 }, { "epoch": 2.3038526687569827, "grad_norm": 0.2447209507226944, "learning_rate": 2.9381304148713856e-05, "loss": 0.4051, "step": 63925 }, { "epoch": 2.3040328684182074, "grad_norm": 0.22265037894248962, "learning_rate": 2.9378431147485896e-05, "loss": 0.396, "step": 63930 }, { "epoch": 2.304213068079432, "grad_norm": 0.18129128217697144, "learning_rate": 2.9375558086602657e-05, "loss": 0.3808, "step": 63935 }, { "epoch": 2.3043932677406564, "grad_norm": 0.25198519229888916, "learning_rate": 2.9372684966103293e-05, "loss": 0.4207, "step": 63940 }, { "epoch": 2.304573467401881, "grad_norm": 0.2193840593099594, "learning_rate": 2.9369811786026935e-05, "loss": 0.4081, "step": 63945 }, { "epoch": 2.304753667063106, "grad_norm": 0.25426429510116577, "learning_rate": 2.9366938546412744e-05, "loss": 0.3851, "step": 63950 }, { "epoch": 
2.3049338667243306, "grad_norm": 0.21279068291187286, "learning_rate": 2.9364065247299848e-05, "loss": 0.4081, "step": 63955 }, { "epoch": 2.3051140663855554, "grad_norm": 0.1979341357946396, "learning_rate": 2.9361191888727417e-05, "loss": 0.3702, "step": 63960 }, { "epoch": 2.3052942660467797, "grad_norm": 0.1835552603006363, "learning_rate": 2.93583184707346e-05, "loss": 0.3754, "step": 63965 }, { "epoch": 2.3054744657080044, "grad_norm": 0.16004249453544617, "learning_rate": 2.935544499336052e-05, "loss": 0.4186, "step": 63970 }, { "epoch": 2.305654665369229, "grad_norm": 0.22096280753612518, "learning_rate": 2.9352571456644357e-05, "loss": 0.3709, "step": 63975 }, { "epoch": 2.305834865030454, "grad_norm": 0.19364888966083527, "learning_rate": 2.934969786062524e-05, "loss": 0.3871, "step": 63980 }, { "epoch": 2.306015064691678, "grad_norm": 0.17691142857074738, "learning_rate": 2.934682420534235e-05, "loss": 0.4034, "step": 63985 }, { "epoch": 2.306195264352903, "grad_norm": 0.239870086312294, "learning_rate": 2.9343950490834806e-05, "loss": 0.4087, "step": 63990 }, { "epoch": 2.3063754640141276, "grad_norm": 0.2206905335187912, "learning_rate": 2.934107671714178e-05, "loss": 0.3875, "step": 63995 }, { "epoch": 2.3065556636753524, "grad_norm": 0.18567106127738953, "learning_rate": 2.9338202884302423e-05, "loss": 0.4075, "step": 64000 }, { "epoch": 2.3065556636753524, "eval_loss": 0.43620339035987854, "eval_runtime": 3.5308, "eval_samples_per_second": 28.322, "eval_steps_per_second": 7.081, "step": 64000 }, { "epoch": 2.306735863336577, "grad_norm": 0.1770346313714981, "learning_rate": 2.93353289923559e-05, "loss": 0.395, "step": 64005 }, { "epoch": 2.3069160629978014, "grad_norm": 0.20676358044147491, "learning_rate": 2.9332455041341355e-05, "loss": 0.3684, "step": 64010 }, { "epoch": 2.307096262659026, "grad_norm": 0.21041785180568695, "learning_rate": 2.932958103129794e-05, "loss": 0.3907, "step": 64015 }, { "epoch": 2.307276462320251, "grad_norm": 
0.19155463576316833, "learning_rate": 2.9326706962264832e-05, "loss": 0.3741, "step": 64020 }, { "epoch": 2.3074566619814756, "grad_norm": 0.1932283639907837, "learning_rate": 2.9323832834281177e-05, "loss": 0.3953, "step": 64025 }, { "epoch": 2.3076368616427, "grad_norm": 0.19772790372371674, "learning_rate": 2.932095864738613e-05, "loss": 0.4258, "step": 64030 }, { "epoch": 2.3078170613039246, "grad_norm": 0.18976783752441406, "learning_rate": 2.9318084401618872e-05, "loss": 0.403, "step": 64035 }, { "epoch": 2.3079972609651493, "grad_norm": 0.19546537101268768, "learning_rate": 2.9315210097018535e-05, "loss": 0.3811, "step": 64040 }, { "epoch": 2.308177460626374, "grad_norm": 0.17727042734622955, "learning_rate": 2.9312335733624312e-05, "loss": 0.3558, "step": 64045 }, { "epoch": 2.308357660287599, "grad_norm": 0.17102521657943726, "learning_rate": 2.930946131147534e-05, "loss": 0.3756, "step": 64050 }, { "epoch": 2.3085378599488235, "grad_norm": 0.21491068601608276, "learning_rate": 2.9306586830610794e-05, "loss": 0.4153, "step": 64055 }, { "epoch": 2.308718059610048, "grad_norm": 0.18855595588684082, "learning_rate": 2.930371229106983e-05, "loss": 0.3981, "step": 64060 }, { "epoch": 2.3088982592712726, "grad_norm": 0.19503138959407806, "learning_rate": 2.9300837692891627e-05, "loss": 0.4133, "step": 64065 }, { "epoch": 2.3090784589324973, "grad_norm": 0.18976575136184692, "learning_rate": 2.929796303611534e-05, "loss": 0.3941, "step": 64070 }, { "epoch": 2.309258658593722, "grad_norm": 0.1784740537405014, "learning_rate": 2.9295088320780133e-05, "loss": 0.408, "step": 64075 }, { "epoch": 2.3094388582549463, "grad_norm": 0.21722306311130524, "learning_rate": 2.929221354692519e-05, "loss": 0.4086, "step": 64080 }, { "epoch": 2.309619057916171, "grad_norm": 0.2144099622964859, "learning_rate": 2.9289338714589664e-05, "loss": 0.3952, "step": 64085 }, { "epoch": 2.309799257577396, "grad_norm": 0.1988193541765213, "learning_rate": 2.9286463823812732e-05, "loss": 
0.3712, "step": 64090 }, { "epoch": 2.3099794572386205, "grad_norm": 0.18340793251991272, "learning_rate": 2.928358887463355e-05, "loss": 0.4021, "step": 64095 }, { "epoch": 2.3101596568998453, "grad_norm": 0.24040386080741882, "learning_rate": 2.928071386709131e-05, "loss": 0.4141, "step": 64100 }, { "epoch": 2.3103398565610695, "grad_norm": 0.19517391920089722, "learning_rate": 2.9277838801225165e-05, "loss": 0.368, "step": 64105 }, { "epoch": 2.3105200562222943, "grad_norm": 0.1790241003036499, "learning_rate": 2.9274963677074298e-05, "loss": 0.3828, "step": 64110 }, { "epoch": 2.310700255883519, "grad_norm": 0.15677717328071594, "learning_rate": 2.927208849467788e-05, "loss": 0.4316, "step": 64115 }, { "epoch": 2.3108804555447438, "grad_norm": 0.21783356368541718, "learning_rate": 2.9269213254075074e-05, "loss": 0.4188, "step": 64120 }, { "epoch": 2.311060655205968, "grad_norm": 0.23642495274543762, "learning_rate": 2.926633795530508e-05, "loss": 0.3957, "step": 64125 }, { "epoch": 2.3112408548671928, "grad_norm": 0.18061873316764832, "learning_rate": 2.9263462598407044e-05, "loss": 0.3771, "step": 64130 }, { "epoch": 2.3114210545284175, "grad_norm": 0.1819910854101181, "learning_rate": 2.9260587183420164e-05, "loss": 0.3468, "step": 64135 }, { "epoch": 2.3116012541896422, "grad_norm": 0.2120652049779892, "learning_rate": 2.9257711710383595e-05, "loss": 0.4319, "step": 64140 }, { "epoch": 2.311781453850867, "grad_norm": 0.20193518698215485, "learning_rate": 2.925483617933654e-05, "loss": 0.3839, "step": 64145 }, { "epoch": 2.3119616535120913, "grad_norm": 0.19727912545204163, "learning_rate": 2.9251960590318167e-05, "loss": 0.4062, "step": 64150 }, { "epoch": 2.312141853173316, "grad_norm": 0.17611241340637207, "learning_rate": 2.9249084943367648e-05, "loss": 0.3645, "step": 64155 }, { "epoch": 2.3123220528345407, "grad_norm": 0.17656007409095764, "learning_rate": 2.9246209238524176e-05, "loss": 0.3498, "step": 64160 }, { "epoch": 2.3125022524957655, 
"grad_norm": 0.1993638426065445, "learning_rate": 2.9243333475826916e-05, "loss": 0.381, "step": 64165 }, { "epoch": 2.3126824521569898, "grad_norm": 0.23477478325366974, "learning_rate": 2.924045765531506e-05, "loss": 0.361, "step": 64170 }, { "epoch": 2.3128626518182145, "grad_norm": 0.23515748977661133, "learning_rate": 2.92375817770278e-05, "loss": 0.4028, "step": 64175 }, { "epoch": 2.313042851479439, "grad_norm": 0.17302124202251434, "learning_rate": 2.9234705841004295e-05, "loss": 0.4029, "step": 64180 }, { "epoch": 2.313223051140664, "grad_norm": 0.2219003438949585, "learning_rate": 2.9231829847283753e-05, "loss": 0.3592, "step": 64185 }, { "epoch": 2.3134032508018887, "grad_norm": 0.23449034988880157, "learning_rate": 2.922895379590535e-05, "loss": 0.408, "step": 64190 }, { "epoch": 2.313583450463113, "grad_norm": 0.21402695775032043, "learning_rate": 2.9226077686908265e-05, "loss": 0.4229, "step": 64195 }, { "epoch": 2.3137636501243377, "grad_norm": 0.2237950712442398, "learning_rate": 2.922320152033169e-05, "loss": 0.4359, "step": 64200 }, { "epoch": 2.3139438497855624, "grad_norm": 0.20469123125076294, "learning_rate": 2.922032529621481e-05, "loss": 0.3704, "step": 64205 }, { "epoch": 2.314124049446787, "grad_norm": 0.17067334055900574, "learning_rate": 2.921744901459682e-05, "loss": 0.3781, "step": 64210 }, { "epoch": 2.3143042491080115, "grad_norm": 0.2015264630317688, "learning_rate": 2.92145726755169e-05, "loss": 0.4129, "step": 64215 }, { "epoch": 2.314484448769236, "grad_norm": 0.19434256851673126, "learning_rate": 2.9211696279014244e-05, "loss": 0.3847, "step": 64220 }, { "epoch": 2.314664648430461, "grad_norm": 0.18357601761817932, "learning_rate": 2.920881982512804e-05, "loss": 0.4103, "step": 64225 }, { "epoch": 2.3148448480916857, "grad_norm": 0.1933751106262207, "learning_rate": 2.9205943313897487e-05, "loss": 0.3704, "step": 64230 }, { "epoch": 2.3150250477529104, "grad_norm": 0.20393973588943481, "learning_rate": 2.920306674536177e-05, 
"loss": 0.3816, "step": 64235 }, { "epoch": 2.3152052474141347, "grad_norm": 0.23503683507442474, "learning_rate": 2.9200190119560077e-05, "loss": 0.3898, "step": 64240 }, { "epoch": 2.3153854470753594, "grad_norm": 0.18053038418293, "learning_rate": 2.9197313436531604e-05, "loss": 0.393, "step": 64245 }, { "epoch": 2.315565646736584, "grad_norm": 0.21024245023727417, "learning_rate": 2.919443669631555e-05, "loss": 0.4292, "step": 64250 }, { "epoch": 2.315745846397809, "grad_norm": 0.1868125945329666, "learning_rate": 2.9191559898951115e-05, "loss": 0.4292, "step": 64255 }, { "epoch": 2.315926046059033, "grad_norm": 0.19413365423679352, "learning_rate": 2.9188683044477484e-05, "loss": 0.4133, "step": 64260 }, { "epoch": 2.316106245720258, "grad_norm": 0.25181156396865845, "learning_rate": 2.9185806132933856e-05, "loss": 0.4184, "step": 64265 }, { "epoch": 2.3162864453814827, "grad_norm": 0.20910336077213287, "learning_rate": 2.9182929164359436e-05, "loss": 0.375, "step": 64270 }, { "epoch": 2.3164666450427074, "grad_norm": 0.1750982105731964, "learning_rate": 2.9180052138793413e-05, "loss": 0.3782, "step": 64275 }, { "epoch": 2.316646844703932, "grad_norm": 0.18748337030410767, "learning_rate": 2.917717505627498e-05, "loss": 0.3698, "step": 64280 }, { "epoch": 2.3168270443651564, "grad_norm": 0.2219618856906891, "learning_rate": 2.917429791684335e-05, "loss": 0.4027, "step": 64285 }, { "epoch": 2.317007244026381, "grad_norm": 0.19255468249320984, "learning_rate": 2.917142072053773e-05, "loss": 0.373, "step": 64290 }, { "epoch": 2.317187443687606, "grad_norm": 0.19871120154857635, "learning_rate": 2.9168543467397298e-05, "loss": 0.3987, "step": 64295 }, { "epoch": 2.3173676433488306, "grad_norm": 0.2183811515569687, "learning_rate": 2.9165666157461268e-05, "loss": 0.3834, "step": 64300 }, { "epoch": 2.317547843010055, "grad_norm": 0.19588343799114227, "learning_rate": 2.9162788790768847e-05, "loss": 0.4093, "step": 64305 }, { "epoch": 2.3177280426712796, 
"grad_norm": 0.1578301340341568, "learning_rate": 2.9159911367359238e-05, "loss": 0.3794, "step": 64310 }, { "epoch": 2.3179082423325044, "grad_norm": 0.21837523579597473, "learning_rate": 2.915703388727164e-05, "loss": 0.4073, "step": 64315 }, { "epoch": 2.318088441993729, "grad_norm": 0.2273416817188263, "learning_rate": 2.9154156350545252e-05, "loss": 0.3673, "step": 64320 }, { "epoch": 2.318268641654954, "grad_norm": 0.2129017859697342, "learning_rate": 2.9151278757219296e-05, "loss": 0.3983, "step": 64325 }, { "epoch": 2.3184488413161786, "grad_norm": 0.17560027539730072, "learning_rate": 2.9148401107332972e-05, "loss": 0.3978, "step": 64330 }, { "epoch": 2.318629040977403, "grad_norm": 0.20387177169322968, "learning_rate": 2.914552340092549e-05, "loss": 0.4097, "step": 64335 }, { "epoch": 2.3188092406386276, "grad_norm": 0.23885492980480194, "learning_rate": 2.9142645638036042e-05, "loss": 0.3813, "step": 64340 }, { "epoch": 2.3189894402998523, "grad_norm": 0.20437073707580566, "learning_rate": 2.9139767818703857e-05, "loss": 0.3662, "step": 64345 }, { "epoch": 2.319169639961077, "grad_norm": 0.20243240892887115, "learning_rate": 2.9136889942968138e-05, "loss": 0.3974, "step": 64350 }, { "epoch": 2.3193498396223013, "grad_norm": 0.19991594552993774, "learning_rate": 2.9134012010868082e-05, "loss": 0.4184, "step": 64355 }, { "epoch": 2.319530039283526, "grad_norm": 0.1909300833940506, "learning_rate": 2.913113402244293e-05, "loss": 0.3714, "step": 64360 }, { "epoch": 2.319710238944751, "grad_norm": 0.21020810306072235, "learning_rate": 2.912825597773186e-05, "loss": 0.4092, "step": 64365 }, { "epoch": 2.3198904386059755, "grad_norm": 0.29738765954971313, "learning_rate": 2.9125377876774118e-05, "loss": 0.4022, "step": 64370 }, { "epoch": 2.3200706382672003, "grad_norm": 0.21587322652339935, "learning_rate": 2.9122499719608894e-05, "loss": 0.4157, "step": 64375 }, { "epoch": 2.3202508379284246, "grad_norm": 0.19749046862125397, "learning_rate": 
2.9119621506275407e-05, "loss": 0.3893, "step": 64380 }, { "epoch": 2.3204310375896493, "grad_norm": 0.24363020062446594, "learning_rate": 2.9116743236812878e-05, "loss": 0.4346, "step": 64385 }, { "epoch": 2.320611237250874, "grad_norm": 0.18595632910728455, "learning_rate": 2.911386491126052e-05, "loss": 0.3947, "step": 64390 }, { "epoch": 2.3207914369120988, "grad_norm": 0.2038854956626892, "learning_rate": 2.9110986529657547e-05, "loss": 0.4065, "step": 64395 }, { "epoch": 2.320971636573323, "grad_norm": 0.26512575149536133, "learning_rate": 2.9108108092043173e-05, "loss": 0.3958, "step": 64400 }, { "epoch": 2.321151836234548, "grad_norm": 0.2215341031551361, "learning_rate": 2.910522959845663e-05, "loss": 0.3719, "step": 64405 }, { "epoch": 2.3213320358957725, "grad_norm": 0.20210741460323334, "learning_rate": 2.9102351048937122e-05, "loss": 0.4205, "step": 64410 }, { "epoch": 2.3215122355569973, "grad_norm": 0.21246850490570068, "learning_rate": 2.9099472443523885e-05, "loss": 0.4175, "step": 64415 }, { "epoch": 2.321692435218222, "grad_norm": 0.23316890001296997, "learning_rate": 2.9096593782256122e-05, "loss": 0.4295, "step": 64420 }, { "epoch": 2.3218726348794463, "grad_norm": 0.17650622129440308, "learning_rate": 2.9093715065173066e-05, "loss": 0.4047, "step": 64425 }, { "epoch": 2.322052834540671, "grad_norm": 0.20012375712394714, "learning_rate": 2.9090836292313928e-05, "loss": 0.4085, "step": 64430 }, { "epoch": 2.3222330342018958, "grad_norm": 0.2151358723640442, "learning_rate": 2.908795746371794e-05, "loss": 0.3799, "step": 64435 }, { "epoch": 2.3224132338631205, "grad_norm": 0.20195366442203522, "learning_rate": 2.908507857942433e-05, "loss": 0.3912, "step": 64440 }, { "epoch": 2.3225934335243448, "grad_norm": 0.24572651088237762, "learning_rate": 2.9082199639472306e-05, "loss": 0.3691, "step": 64445 }, { "epoch": 2.3227736331855695, "grad_norm": 0.20580525696277618, "learning_rate": 2.907932064390111e-05, "loss": 0.3815, "step": 64450 }, { 
"epoch": 2.3229538328467942, "grad_norm": 0.26529133319854736, "learning_rate": 2.9076441592749954e-05, "loss": 0.3954, "step": 64455 }, { "epoch": 2.323134032508019, "grad_norm": 0.19922053813934326, "learning_rate": 2.907356248605807e-05, "loss": 0.3686, "step": 64460 }, { "epoch": 2.3233142321692437, "grad_norm": 0.22310146689414978, "learning_rate": 2.9071259160741603e-05, "loss": 0.4364, "step": 64465 }, { "epoch": 2.323494431830468, "grad_norm": 0.19857893884181976, "learning_rate": 2.9068379954175267e-05, "loss": 0.4276, "step": 64470 }, { "epoch": 2.3236746314916927, "grad_norm": 0.20761960744857788, "learning_rate": 2.906550069217804e-05, "loss": 0.3837, "step": 64475 }, { "epoch": 2.3238548311529175, "grad_norm": 0.22300156950950623, "learning_rate": 2.906262137478915e-05, "loss": 0.421, "step": 64480 }, { "epoch": 2.324035030814142, "grad_norm": 0.19981767237186432, "learning_rate": 2.9059742002047835e-05, "loss": 0.3987, "step": 64485 }, { "epoch": 2.3242152304753665, "grad_norm": 0.21243509650230408, "learning_rate": 2.9056862573993322e-05, "loss": 0.4114, "step": 64490 }, { "epoch": 2.3243954301365912, "grad_norm": 0.22393706440925598, "learning_rate": 2.9053983090664838e-05, "loss": 0.3966, "step": 64495 }, { "epoch": 2.324575629797816, "grad_norm": 0.19257164001464844, "learning_rate": 2.9051103552101623e-05, "loss": 0.389, "step": 64500 }, { "epoch": 2.324575629797816, "eval_loss": 0.4352463483810425, "eval_runtime": 3.5326, "eval_samples_per_second": 28.308, "eval_steps_per_second": 7.077, "step": 64500 }, { "epoch": 2.3247558294590407, "grad_norm": 0.18349726498126984, "learning_rate": 2.9048223958342906e-05, "loss": 0.407, "step": 64505 }, { "epoch": 2.3249360291202654, "grad_norm": 0.2214827537536621, "learning_rate": 2.9045344309427924e-05, "loss": 0.3542, "step": 64510 }, { "epoch": 2.3251162287814897, "grad_norm": 0.19785992801189423, "learning_rate": 2.9042464605395898e-05, "loss": 0.4088, "step": 64515 }, { "epoch": 2.3252964284427144, 
"grad_norm": 0.24504734575748444, "learning_rate": 2.9039584846286083e-05, "loss": 0.3953, "step": 64520 }, { "epoch": 2.325476628103939, "grad_norm": 0.22237750887870789, "learning_rate": 2.9036705032137707e-05, "loss": 0.3808, "step": 64525 }, { "epoch": 2.325656827765164, "grad_norm": 0.23211441934108734, "learning_rate": 2.903382516299e-05, "loss": 0.4144, "step": 64530 }, { "epoch": 2.325837027426388, "grad_norm": 0.21520470082759857, "learning_rate": 2.9030945238882214e-05, "loss": 0.4161, "step": 64535 }, { "epoch": 2.326017227087613, "grad_norm": 0.21746765077114105, "learning_rate": 2.9028065259853572e-05, "loss": 0.4375, "step": 64540 }, { "epoch": 2.3261974267488377, "grad_norm": 0.22863152623176575, "learning_rate": 2.902518522594333e-05, "loss": 0.409, "step": 64545 }, { "epoch": 2.3263776264100624, "grad_norm": 0.1936892420053482, "learning_rate": 2.902230513719071e-05, "loss": 0.4021, "step": 64550 }, { "epoch": 2.326557826071287, "grad_norm": 0.22578535974025726, "learning_rate": 2.9019424993634968e-05, "loss": 0.402, "step": 64555 }, { "epoch": 2.326738025732512, "grad_norm": 0.23165376484394073, "learning_rate": 2.901654479531533e-05, "loss": 0.4211, "step": 64560 }, { "epoch": 2.326918225393736, "grad_norm": 0.1904148906469345, "learning_rate": 2.9013664542271057e-05, "loss": 0.4131, "step": 64565 }, { "epoch": 2.327098425054961, "grad_norm": 0.16796857118606567, "learning_rate": 2.9010784234541376e-05, "loss": 0.4064, "step": 64570 }, { "epoch": 2.3272786247161856, "grad_norm": 0.1999785602092743, "learning_rate": 2.900790387216553e-05, "loss": 0.3945, "step": 64575 }, { "epoch": 2.3274588243774104, "grad_norm": 0.19684360921382904, "learning_rate": 2.9005023455182784e-05, "loss": 0.3999, "step": 64580 }, { "epoch": 2.3276390240386347, "grad_norm": 0.20736943185329437, "learning_rate": 2.9002142983632365e-05, "loss": 0.3924, "step": 64585 }, { "epoch": 2.3278192236998594, "grad_norm": 0.21448162198066711, "learning_rate": 2.8999262457553518e-05, 
"loss": 0.3711, "step": 64590 }, { "epoch": 2.327999423361084, "grad_norm": 0.17774830758571625, "learning_rate": 2.899638187698549e-05, "loss": 0.3944, "step": 64595 }, { "epoch": 2.328179623022309, "grad_norm": 0.19229644536972046, "learning_rate": 2.899350124196754e-05, "loss": 0.3717, "step": 64600 }, { "epoch": 2.3283598226835336, "grad_norm": 0.22813409566879272, "learning_rate": 2.8990620552538905e-05, "loss": 0.4118, "step": 64605 }, { "epoch": 2.328540022344758, "grad_norm": 0.16227515041828156, "learning_rate": 2.898773980873884e-05, "loss": 0.4017, "step": 64610 }, { "epoch": 2.3287202220059826, "grad_norm": 0.20297592878341675, "learning_rate": 2.898485901060659e-05, "loss": 0.4339, "step": 64615 }, { "epoch": 2.3289004216672073, "grad_norm": 0.2151886522769928, "learning_rate": 2.8981978158181412e-05, "loss": 0.4276, "step": 64620 }, { "epoch": 2.329080621328432, "grad_norm": 0.21326357126235962, "learning_rate": 2.8979097251502548e-05, "loss": 0.36, "step": 64625 }, { "epoch": 2.3292608209896564, "grad_norm": 0.20007814466953278, "learning_rate": 2.8976216290609258e-05, "loss": 0.4148, "step": 64630 }, { "epoch": 2.329441020650881, "grad_norm": 0.20268851518630981, "learning_rate": 2.8973335275540785e-05, "loss": 0.407, "step": 64635 }, { "epoch": 2.329621220312106, "grad_norm": 0.2361319661140442, "learning_rate": 2.8970454206336393e-05, "loss": 0.4504, "step": 64640 }, { "epoch": 2.3298014199733306, "grad_norm": 0.17205223441123962, "learning_rate": 2.8967573083035327e-05, "loss": 0.3959, "step": 64645 }, { "epoch": 2.3299816196345553, "grad_norm": 0.19118523597717285, "learning_rate": 2.8964691905676856e-05, "loss": 0.4025, "step": 64650 }, { "epoch": 2.3301618192957796, "grad_norm": 0.20011954009532928, "learning_rate": 2.8961810674300217e-05, "loss": 0.4142, "step": 64655 }, { "epoch": 2.3303420189570043, "grad_norm": 0.21023406088352203, "learning_rate": 2.895892938894468e-05, "loss": 0.4246, "step": 64660 }, { "epoch": 2.330522218618229, 
"grad_norm": 0.19074782729148865, "learning_rate": 2.8956048049649487e-05, "loss": 0.4024, "step": 64665 }, { "epoch": 2.330702418279454, "grad_norm": 0.24010269343852997, "learning_rate": 2.895316665645391e-05, "loss": 0.3951, "step": 64670 }, { "epoch": 2.330882617940678, "grad_norm": 0.23700617253780365, "learning_rate": 2.8950285209397206e-05, "loss": 0.3924, "step": 64675 }, { "epoch": 2.331062817601903, "grad_norm": 0.23753370344638824, "learning_rate": 2.8947403708518623e-05, "loss": 0.4263, "step": 64680 }, { "epoch": 2.3312430172631275, "grad_norm": 0.17111685872077942, "learning_rate": 2.8944522153857433e-05, "loss": 0.4019, "step": 64685 }, { "epoch": 2.3314232169243523, "grad_norm": 0.18538156151771545, "learning_rate": 2.8941640545452898e-05, "loss": 0.4268, "step": 64690 }, { "epoch": 2.331603416585577, "grad_norm": 0.2004711627960205, "learning_rate": 2.893875888334427e-05, "loss": 0.3993, "step": 64695 }, { "epoch": 2.3317836162468013, "grad_norm": 0.18873223662376404, "learning_rate": 2.8935877167570814e-05, "loss": 0.3939, "step": 64700 }, { "epoch": 2.331963815908026, "grad_norm": 0.1974310278892517, "learning_rate": 2.8932995398171793e-05, "loss": 0.3892, "step": 64705 }, { "epoch": 2.3321440155692508, "grad_norm": 0.2253679633140564, "learning_rate": 2.8930113575186473e-05, "loss": 0.4034, "step": 64710 }, { "epoch": 2.3323242152304755, "grad_norm": 0.2432539016008377, "learning_rate": 2.892723169865411e-05, "loss": 0.4074, "step": 64715 }, { "epoch": 2.3325044148917, "grad_norm": 0.22504882514476776, "learning_rate": 2.8924349768613984e-05, "loss": 0.4394, "step": 64720 }, { "epoch": 2.3326846145529245, "grad_norm": 0.24001994729042053, "learning_rate": 2.8921467785105342e-05, "loss": 0.3778, "step": 64725 }, { "epoch": 2.3328648142141493, "grad_norm": 0.19272451102733612, "learning_rate": 2.8918585748167475e-05, "loss": 0.4748, "step": 64730 }, { "epoch": 2.333045013875374, "grad_norm": 0.20509444177150726, "learning_rate": 
2.8915703657839627e-05, "loss": 0.4025, "step": 64735 }, { "epoch": 2.3332252135365987, "grad_norm": 0.1780424416065216, "learning_rate": 2.8912821514161077e-05, "loss": 0.3849, "step": 64740 }, { "epoch": 2.333405413197823, "grad_norm": 0.20018140971660614, "learning_rate": 2.8909939317171086e-05, "loss": 0.4019, "step": 64745 }, { "epoch": 2.3335856128590478, "grad_norm": 0.22508153319358826, "learning_rate": 2.8907057066908934e-05, "loss": 0.3987, "step": 64750 }, { "epoch": 2.3337658125202725, "grad_norm": 0.22255435585975647, "learning_rate": 2.8904174763413888e-05, "loss": 0.3795, "step": 64755 }, { "epoch": 2.333946012181497, "grad_norm": 0.17312286794185638, "learning_rate": 2.890129240672521e-05, "loss": 0.4022, "step": 64760 }, { "epoch": 2.3341262118427215, "grad_norm": 0.18471607565879822, "learning_rate": 2.8898409996882182e-05, "loss": 0.3776, "step": 64765 }, { "epoch": 2.3343064115039462, "grad_norm": 0.19598588347434998, "learning_rate": 2.8895527533924076e-05, "loss": 0.3784, "step": 64770 }, { "epoch": 2.334486611165171, "grad_norm": 0.20340387523174286, "learning_rate": 2.889264501789015e-05, "loss": 0.3934, "step": 64775 }, { "epoch": 2.3346668108263957, "grad_norm": 0.2492077350616455, "learning_rate": 2.88897624488197e-05, "loss": 0.4267, "step": 64780 }, { "epoch": 2.3348470104876204, "grad_norm": 0.21682408452033997, "learning_rate": 2.8886879826751982e-05, "loss": 0.4055, "step": 64785 }, { "epoch": 2.3350272101488447, "grad_norm": 0.2086905837059021, "learning_rate": 2.8883997151726293e-05, "loss": 0.353, "step": 64790 }, { "epoch": 2.3352074098100695, "grad_norm": 0.18733155727386475, "learning_rate": 2.8881114423781885e-05, "loss": 0.372, "step": 64795 }, { "epoch": 2.335387609471294, "grad_norm": 0.18295986950397491, "learning_rate": 2.8878231642958044e-05, "loss": 0.3722, "step": 64800 }, { "epoch": 2.335567809132519, "grad_norm": 0.21052880585193634, "learning_rate": 2.8875348809294044e-05, "loss": 0.3814, "step": 64805 }, { "epoch": 
2.3357480087937432, "grad_norm": 0.20123307406902313, "learning_rate": 2.887246592282917e-05, "loss": 0.3888, "step": 64810 }, { "epoch": 2.335928208454968, "grad_norm": 0.23528067767620087, "learning_rate": 2.8869582983602705e-05, "loss": 0.3984, "step": 64815 }, { "epoch": 2.3361084081161927, "grad_norm": 0.22291633486747742, "learning_rate": 2.8866699991653913e-05, "loss": 0.4111, "step": 64820 }, { "epoch": 2.3362886077774174, "grad_norm": 0.16556419432163239, "learning_rate": 2.8863816947022087e-05, "loss": 0.3962, "step": 64825 }, { "epoch": 2.336468807438642, "grad_norm": 0.2400919497013092, "learning_rate": 2.8860933849746496e-05, "loss": 0.3776, "step": 64830 }, { "epoch": 2.336649007099867, "grad_norm": 0.2065679430961609, "learning_rate": 2.8858050699866442e-05, "loss": 0.4298, "step": 64835 }, { "epoch": 2.336829206761091, "grad_norm": 0.19219578802585602, "learning_rate": 2.885516749742118e-05, "loss": 0.3546, "step": 64840 }, { "epoch": 2.337009406422316, "grad_norm": 0.2160220742225647, "learning_rate": 2.8852284242450013e-05, "loss": 0.4224, "step": 64845 }, { "epoch": 2.3371896060835406, "grad_norm": 0.17602595686912537, "learning_rate": 2.8849400934992215e-05, "loss": 0.4115, "step": 64850 }, { "epoch": 2.3373698057447654, "grad_norm": 0.21248827874660492, "learning_rate": 2.8846517575087086e-05, "loss": 0.4115, "step": 64855 }, { "epoch": 2.3375500054059897, "grad_norm": 0.1953934133052826, "learning_rate": 2.884363416277389e-05, "loss": 0.388, "step": 64860 }, { "epoch": 2.3377302050672144, "grad_norm": 0.19074764847755432, "learning_rate": 2.8840750698091924e-05, "loss": 0.3821, "step": 64865 }, { "epoch": 2.337910404728439, "grad_norm": 0.20078332722187042, "learning_rate": 2.883786718108047e-05, "loss": 0.4138, "step": 64870 }, { "epoch": 2.338090604389664, "grad_norm": 0.20209373533725739, "learning_rate": 2.8834983611778827e-05, "loss": 0.4012, "step": 64875 }, { "epoch": 2.3382708040508886, "grad_norm": 0.21952852606773376, 
"learning_rate": 2.8832099990226268e-05, "loss": 0.3731, "step": 64880 }, { "epoch": 2.338451003712113, "grad_norm": 0.18298974633216858, "learning_rate": 2.8829216316462092e-05, "loss": 0.3778, "step": 64885 }, { "epoch": 2.3386312033733376, "grad_norm": 0.2111566960811615, "learning_rate": 2.8826332590525583e-05, "loss": 0.4025, "step": 64890 }, { "epoch": 2.3388114030345624, "grad_norm": 0.21694448590278625, "learning_rate": 2.882344881245604e-05, "loss": 0.414, "step": 64895 }, { "epoch": 2.338991602695787, "grad_norm": 0.188668355345726, "learning_rate": 2.8820564982292736e-05, "loss": 0.4043, "step": 64900 }, { "epoch": 2.3391718023570114, "grad_norm": 0.18398909270763397, "learning_rate": 2.881768110007498e-05, "loss": 0.3847, "step": 64905 }, { "epoch": 2.339352002018236, "grad_norm": 0.19492432475090027, "learning_rate": 2.8814797165842057e-05, "loss": 0.38, "step": 64910 }, { "epoch": 2.339532201679461, "grad_norm": 0.16589581966400146, "learning_rate": 2.881191317963326e-05, "loss": 0.3701, "step": 64915 }, { "epoch": 2.3397124013406856, "grad_norm": 0.23120105266571045, "learning_rate": 2.8809029141487886e-05, "loss": 0.4066, "step": 64920 }, { "epoch": 2.3398926010019103, "grad_norm": 0.19977350533008575, "learning_rate": 2.8806145051445225e-05, "loss": 0.3919, "step": 64925 }, { "epoch": 2.3400728006631346, "grad_norm": 0.2010623812675476, "learning_rate": 2.8803260909544578e-05, "loss": 0.4305, "step": 64930 }, { "epoch": 2.3402530003243593, "grad_norm": 0.17667075991630554, "learning_rate": 2.880037671582524e-05, "loss": 0.3747, "step": 64935 }, { "epoch": 2.340433199985584, "grad_norm": 0.15319837629795074, "learning_rate": 2.8797492470326497e-05, "loss": 0.3922, "step": 64940 }, { "epoch": 2.340613399646809, "grad_norm": 0.20747162401676178, "learning_rate": 2.8794608173087655e-05, "loss": 0.3958, "step": 64945 }, { "epoch": 2.340793599308033, "grad_norm": 0.2174191176891327, "learning_rate": 2.879172382414802e-05, "loss": 0.3928, "step": 64950 }, 
{ "epoch": 2.340973798969258, "grad_norm": 0.21791304647922516, "learning_rate": 2.8788839423546877e-05, "loss": 0.4441, "step": 64955 }, { "epoch": 2.3411539986304826, "grad_norm": 0.22638213634490967, "learning_rate": 2.8785954971323526e-05, "loss": 0.402, "step": 64960 }, { "epoch": 2.3413341982917073, "grad_norm": 0.20641210675239563, "learning_rate": 2.8783070467517277e-05, "loss": 0.4119, "step": 64965 }, { "epoch": 2.341514397952932, "grad_norm": 0.2296299785375595, "learning_rate": 2.8780185912167424e-05, "loss": 0.4335, "step": 64970 }, { "epoch": 2.3416945976141563, "grad_norm": 0.18867331743240356, "learning_rate": 2.8777301305313276e-05, "loss": 0.3613, "step": 64975 }, { "epoch": 2.341874797275381, "grad_norm": 0.27165400981903076, "learning_rate": 2.8774416646994117e-05, "loss": 0.4134, "step": 64980 }, { "epoch": 2.342054996936606, "grad_norm": 0.23474054038524628, "learning_rate": 2.877153193724927e-05, "loss": 0.3988, "step": 64985 }, { "epoch": 2.3422351965978305, "grad_norm": 0.2028888314962387, "learning_rate": 2.8768647176118024e-05, "loss": 0.3996, "step": 64990 }, { "epoch": 2.342415396259055, "grad_norm": 0.29794272780418396, "learning_rate": 2.8765762363639692e-05, "loss": 0.4289, "step": 64995 }, { "epoch": 2.3425955959202795, "grad_norm": 0.2336840033531189, "learning_rate": 2.8762877499853586e-05, "loss": 0.3917, "step": 65000 }, { "epoch": 2.3425955959202795, "eval_loss": 0.4352496862411499, "eval_runtime": 3.5177, "eval_samples_per_second": 28.428, "eval_steps_per_second": 7.107, "step": 65000 }, { "epoch": 2.3427757955815043, "grad_norm": 0.21938785910606384, "learning_rate": 2.8759992584798988e-05, "loss": 0.4125, "step": 65005 }, { "epoch": 2.342955995242729, "grad_norm": 0.1729237288236618, "learning_rate": 2.8757107618515227e-05, "loss": 0.405, "step": 65010 }, { "epoch": 2.3431361949039538, "grad_norm": 0.19324518740177155, "learning_rate": 2.8754222601041603e-05, "loss": 0.3719, "step": 65015 }, { "epoch": 2.343316394565178, 
"grad_norm": 0.2103443443775177, "learning_rate": 2.8751337532417428e-05, "loss": 0.4041, "step": 65020 }, { "epoch": 2.3434965942264028, "grad_norm": 0.2286413609981537, "learning_rate": 2.8748452412681994e-05, "loss": 0.4066, "step": 65025 }, { "epoch": 2.3436767938876275, "grad_norm": 0.2286667674779892, "learning_rate": 2.8745567241874627e-05, "loss": 0.3864, "step": 65030 }, { "epoch": 2.3438569935488522, "grad_norm": 0.25851571559906006, "learning_rate": 2.874268202003463e-05, "loss": 0.4042, "step": 65035 }, { "epoch": 2.3440371932100765, "grad_norm": 0.17988860607147217, "learning_rate": 2.8739796747201313e-05, "loss": 0.412, "step": 65040 }, { "epoch": 2.3442173928713013, "grad_norm": 0.21041817963123322, "learning_rate": 2.8736911423414e-05, "loss": 0.4372, "step": 65045 }, { "epoch": 2.344397592532526, "grad_norm": 0.25124192237854004, "learning_rate": 2.8734026048711975e-05, "loss": 0.3904, "step": 65050 }, { "epoch": 2.3445777921937507, "grad_norm": 0.18272101879119873, "learning_rate": 2.8731140623134583e-05, "loss": 0.3723, "step": 65055 }, { "epoch": 2.3447579918549755, "grad_norm": 0.2411685287952423, "learning_rate": 2.8728255146721117e-05, "loss": 0.3785, "step": 65060 }, { "epoch": 2.3449381915162, "grad_norm": 0.2054976224899292, "learning_rate": 2.8725369619510895e-05, "loss": 0.4264, "step": 65065 }, { "epoch": 2.3451183911774245, "grad_norm": 0.3253336250782013, "learning_rate": 2.8722484041543233e-05, "loss": 0.3911, "step": 65070 }, { "epoch": 2.345298590838649, "grad_norm": 0.23166240751743317, "learning_rate": 2.8719598412857445e-05, "loss": 0.3862, "step": 65075 }, { "epoch": 2.345478790499874, "grad_norm": 0.17862895131111145, "learning_rate": 2.871671273349286e-05, "loss": 0.3967, "step": 65080 }, { "epoch": 2.3456589901610987, "grad_norm": 0.2060977816581726, "learning_rate": 2.8713827003488776e-05, "loss": 0.3993, "step": 65085 }, { "epoch": 2.345839189822323, "grad_norm": 0.1897481232881546, "learning_rate": 2.8710941222884515e-05, 
"loss": 0.3673, "step": 65090 }, { "epoch": 2.3460193894835477, "grad_norm": 0.2062787413597107, "learning_rate": 2.8708055391719396e-05, "loss": 0.429, "step": 65095 }, { "epoch": 2.3461995891447724, "grad_norm": 0.21658870577812195, "learning_rate": 2.8705169510032747e-05, "loss": 0.4001, "step": 65100 }, { "epoch": 2.346379788805997, "grad_norm": 0.2051638811826706, "learning_rate": 2.8702283577863883e-05, "loss": 0.3861, "step": 65105 }, { "epoch": 2.346559988467222, "grad_norm": 0.22257257997989655, "learning_rate": 2.8699397595252116e-05, "loss": 0.3834, "step": 65110 }, { "epoch": 2.346740188128446, "grad_norm": 0.1782115399837494, "learning_rate": 2.8696511562236776e-05, "loss": 0.4001, "step": 65115 }, { "epoch": 2.346920387789671, "grad_norm": 0.2436537742614746, "learning_rate": 2.8693625478857182e-05, "loss": 0.4371, "step": 65120 }, { "epoch": 2.3471005874508957, "grad_norm": 0.22284133732318878, "learning_rate": 2.8690739345152658e-05, "loss": 0.4084, "step": 65125 }, { "epoch": 2.3472807871121204, "grad_norm": 0.19572477042675018, "learning_rate": 2.8687853161162514e-05, "loss": 0.3609, "step": 65130 }, { "epoch": 2.3474609867733447, "grad_norm": 0.21674886345863342, "learning_rate": 2.8684966926926092e-05, "loss": 0.3779, "step": 65135 }, { "epoch": 2.3476411864345694, "grad_norm": 0.2136157900094986, "learning_rate": 2.8682080642482717e-05, "loss": 0.4004, "step": 65140 }, { "epoch": 2.347821386095794, "grad_norm": 0.22586745023727417, "learning_rate": 2.8679194307871694e-05, "loss": 0.4183, "step": 65145 }, { "epoch": 2.348001585757019, "grad_norm": 0.20550450682640076, "learning_rate": 2.8676307923132367e-05, "loss": 0.3483, "step": 65150 }, { "epoch": 2.3481817854182436, "grad_norm": 0.18559207022190094, "learning_rate": 2.8673421488304048e-05, "loss": 0.3986, "step": 65155 }, { "epoch": 2.348361985079468, "grad_norm": 0.23518766462802887, "learning_rate": 2.867053500342609e-05, "loss": 0.3831, "step": 65160 }, { "epoch": 2.3485421847406927, 
"grad_norm": 0.193081796169281, "learning_rate": 2.866764846853779e-05, "loss": 0.4086, "step": 65165 }, { "epoch": 2.3487223844019174, "grad_norm": 0.24136221408843994, "learning_rate": 2.8664761883678493e-05, "loss": 0.391, "step": 65170 }, { "epoch": 2.348902584063142, "grad_norm": 0.1842050552368164, "learning_rate": 2.8661875248887515e-05, "loss": 0.3772, "step": 65175 }, { "epoch": 2.3490827837243664, "grad_norm": 0.16665658354759216, "learning_rate": 2.865898856420421e-05, "loss": 0.3921, "step": 65180 }, { "epoch": 2.349262983385591, "grad_norm": 0.24213731288909912, "learning_rate": 2.8656101829667892e-05, "loss": 0.4142, "step": 65185 }, { "epoch": 2.349443183046816, "grad_norm": 0.16028551757335663, "learning_rate": 2.8653215045317882e-05, "loss": 0.4045, "step": 65190 }, { "epoch": 2.3496233827080406, "grad_norm": 0.21428442001342773, "learning_rate": 2.865032821119354e-05, "loss": 0.3671, "step": 65195 }, { "epoch": 2.3498035823692653, "grad_norm": 0.2171047478914261, "learning_rate": 2.8647441327334166e-05, "loss": 0.3788, "step": 65200 }, { "epoch": 2.3499837820304896, "grad_norm": 0.17178626358509064, "learning_rate": 2.8644554393779127e-05, "loss": 0.404, "step": 65205 }, { "epoch": 2.3501639816917144, "grad_norm": 0.19908472895622253, "learning_rate": 2.8641667410567736e-05, "loss": 0.415, "step": 65210 }, { "epoch": 2.350344181352939, "grad_norm": 0.1920705884695053, "learning_rate": 2.8638780377739323e-05, "loss": 0.4006, "step": 65215 }, { "epoch": 2.350524381014164, "grad_norm": 0.22549034655094147, "learning_rate": 2.863589329533324e-05, "loss": 0.4142, "step": 65220 }, { "epoch": 2.350704580675388, "grad_norm": 0.19695831835269928, "learning_rate": 2.8633006163388815e-05, "loss": 0.3613, "step": 65225 }, { "epoch": 2.350884780336613, "grad_norm": 0.2582644522190094, "learning_rate": 2.8630118981945385e-05, "loss": 0.3895, "step": 65230 }, { "epoch": 2.3510649799978376, "grad_norm": 0.2204061895608902, "learning_rate": 2.862723175104228e-05, 
"loss": 0.4464, "step": 65235 }, { "epoch": 2.3512451796590623, "grad_norm": 0.2712995707988739, "learning_rate": 2.8624344470718852e-05, "loss": 0.4043, "step": 65240 }, { "epoch": 2.351425379320287, "grad_norm": 0.17687354981899261, "learning_rate": 2.862145714101443e-05, "loss": 0.3772, "step": 65245 }, { "epoch": 2.3516055789815113, "grad_norm": 0.22399690747261047, "learning_rate": 2.861856976196835e-05, "loss": 0.4168, "step": 65250 }, { "epoch": 2.351785778642736, "grad_norm": 0.22232884168624878, "learning_rate": 2.8615682333619957e-05, "loss": 0.3784, "step": 65255 }, { "epoch": 2.351965978303961, "grad_norm": 0.23076751828193665, "learning_rate": 2.8612794856008595e-05, "loss": 0.3769, "step": 65260 }, { "epoch": 2.3521461779651855, "grad_norm": 0.21458113193511963, "learning_rate": 2.8609907329173613e-05, "loss": 0.4084, "step": 65265 }, { "epoch": 2.35232637762641, "grad_norm": 0.22074739634990692, "learning_rate": 2.860701975315433e-05, "loss": 0.399, "step": 65270 }, { "epoch": 2.3525065772876346, "grad_norm": 0.20703339576721191, "learning_rate": 2.8604132127990107e-05, "loss": 0.3878, "step": 65275 }, { "epoch": 2.3526867769488593, "grad_norm": 0.25511276721954346, "learning_rate": 2.8601244453720276e-05, "loss": 0.3976, "step": 65280 }, { "epoch": 2.352866976610084, "grad_norm": 0.21042507886886597, "learning_rate": 2.859835673038419e-05, "loss": 0.4136, "step": 65285 }, { "epoch": 2.3530471762713088, "grad_norm": 0.24099446833133698, "learning_rate": 2.85954689580212e-05, "loss": 0.4005, "step": 65290 }, { "epoch": 2.353227375932533, "grad_norm": 0.18244995176792145, "learning_rate": 2.8592581136670625e-05, "loss": 0.3733, "step": 65295 }, { "epoch": 2.353407575593758, "grad_norm": 0.25763794779777527, "learning_rate": 2.8589693266371837e-05, "loss": 0.394, "step": 65300 }, { "epoch": 2.3535877752549825, "grad_norm": 0.19655391573905945, "learning_rate": 2.858680534716417e-05, "loss": 0.4198, "step": 65305 }, { "epoch": 2.3537679749162073, 
"grad_norm": 0.16229073703289032, "learning_rate": 2.858391737908698e-05, "loss": 0.3837, "step": 65310 }, { "epoch": 2.3539481745774316, "grad_norm": 0.2285899817943573, "learning_rate": 2.85810293621796e-05, "loss": 0.4059, "step": 65315 }, { "epoch": 2.3541283742386563, "grad_norm": 0.20628534257411957, "learning_rate": 2.85781412964814e-05, "loss": 0.4214, "step": 65320 }, { "epoch": 2.354308573899881, "grad_norm": 0.1957215815782547, "learning_rate": 2.857525318203171e-05, "loss": 0.391, "step": 65325 }, { "epoch": 2.3544887735611058, "grad_norm": 0.1700507253408432, "learning_rate": 2.8572365018869884e-05, "loss": 0.3891, "step": 65330 }, { "epoch": 2.3546689732223305, "grad_norm": 0.21826016902923584, "learning_rate": 2.8569476807035278e-05, "loss": 0.3958, "step": 65335 }, { "epoch": 2.354849172883555, "grad_norm": 0.1989716738462448, "learning_rate": 2.8566588546567243e-05, "loss": 0.4297, "step": 65340 }, { "epoch": 2.3550293725447795, "grad_norm": 0.20529745519161224, "learning_rate": 2.856370023750513e-05, "loss": 0.3922, "step": 65345 }, { "epoch": 2.3552095722060042, "grad_norm": 0.1638742834329605, "learning_rate": 2.8560811879888294e-05, "loss": 0.3881, "step": 65350 }, { "epoch": 2.355389771867229, "grad_norm": 0.16526326537132263, "learning_rate": 2.855792347375608e-05, "loss": 0.3958, "step": 65355 }, { "epoch": 2.3555699715284537, "grad_norm": 0.17560982704162598, "learning_rate": 2.8555035019147857e-05, "loss": 0.3716, "step": 65360 }, { "epoch": 2.355750171189678, "grad_norm": 0.21271197497844696, "learning_rate": 2.855214651610296e-05, "loss": 0.3795, "step": 65365 }, { "epoch": 2.3559303708509027, "grad_norm": 0.18829388916492462, "learning_rate": 2.8549257964660765e-05, "loss": 0.4005, "step": 65370 }, { "epoch": 2.3561105705121275, "grad_norm": 0.19685223698616028, "learning_rate": 2.8546369364860608e-05, "loss": 0.3688, "step": 65375 }, { "epoch": 2.356290770173352, "grad_norm": 0.2065257430076599, "learning_rate": 2.854348071674186e-05, 
"loss": 0.3742, "step": 65380 }, { "epoch": 2.356470969834577, "grad_norm": 0.22780166566371918, "learning_rate": 2.8540592020343872e-05, "loss": 0.394, "step": 65385 }, { "epoch": 2.3566511694958012, "grad_norm": 0.1743927299976349, "learning_rate": 2.8537703275705997e-05, "loss": 0.4064, "step": 65390 }, { "epoch": 2.356831369157026, "grad_norm": 0.15711626410484314, "learning_rate": 2.853481448286761e-05, "loss": 0.3607, "step": 65395 }, { "epoch": 2.3570115688182507, "grad_norm": 0.2312869429588318, "learning_rate": 2.853192564186805e-05, "loss": 0.3852, "step": 65400 }, { "epoch": 2.3571917684794754, "grad_norm": 0.16384528577327728, "learning_rate": 2.8529036752746697e-05, "loss": 0.4077, "step": 65405 }, { "epoch": 2.3573719681406997, "grad_norm": 0.24304990470409393, "learning_rate": 2.8526147815542898e-05, "loss": 0.4212, "step": 65410 }, { "epoch": 2.3575521678019244, "grad_norm": 0.21454188227653503, "learning_rate": 2.8523258830296017e-05, "loss": 0.4003, "step": 65415 }, { "epoch": 2.357732367463149, "grad_norm": 0.23705005645751953, "learning_rate": 2.852036979704541e-05, "loss": 0.3912, "step": 65420 }, { "epoch": 2.357912567124374, "grad_norm": 0.2235097885131836, "learning_rate": 2.851748071583046e-05, "loss": 0.4037, "step": 65425 }, { "epoch": 2.3580927667855986, "grad_norm": 0.2439454346895218, "learning_rate": 2.851459158669051e-05, "loss": 0.4144, "step": 65430 }, { "epoch": 2.358272966446823, "grad_norm": 0.23461012542247772, "learning_rate": 2.851170240966492e-05, "loss": 0.4088, "step": 65435 }, { "epoch": 2.3584531661080477, "grad_norm": 0.21233990788459778, "learning_rate": 2.8508813184793074e-05, "loss": 0.3875, "step": 65440 }, { "epoch": 2.3586333657692724, "grad_norm": 0.21089306473731995, "learning_rate": 2.8505923912114324e-05, "loss": 0.3812, "step": 65445 }, { "epoch": 2.358813565430497, "grad_norm": 0.23103797435760498, "learning_rate": 2.850303459166805e-05, "loss": 0.4117, "step": 65450 }, { "epoch": 2.3589937650917214, 
"grad_norm": 0.21833232045173645, "learning_rate": 2.8500145223493595e-05, "loss": 0.4149, "step": 65455 }, { "epoch": 2.359173964752946, "grad_norm": 0.20648404955863953, "learning_rate": 2.8497255807630346e-05, "loss": 0.4214, "step": 65460 }, { "epoch": 2.359354164414171, "grad_norm": 0.1955990344285965, "learning_rate": 2.849436634411766e-05, "loss": 0.3793, "step": 65465 }, { "epoch": 2.3595343640753956, "grad_norm": 0.18235990405082703, "learning_rate": 2.8491476832994912e-05, "loss": 0.3988, "step": 65470 }, { "epoch": 2.3597145637366204, "grad_norm": 0.19686979055404663, "learning_rate": 2.848858727430147e-05, "loss": 0.4026, "step": 65475 }, { "epoch": 2.3598947633978447, "grad_norm": 0.16368722915649414, "learning_rate": 2.8485697668076694e-05, "loss": 0.4115, "step": 65480 }, { "epoch": 2.3600749630590694, "grad_norm": 0.21965758502483368, "learning_rate": 2.848280801435997e-05, "loss": 0.3961, "step": 65485 }, { "epoch": 2.360255162720294, "grad_norm": 0.1920892298221588, "learning_rate": 2.8479918313190657e-05, "loss": 0.4003, "step": 65490 }, { "epoch": 2.360435362381519, "grad_norm": 0.18833613395690918, "learning_rate": 2.8477028564608126e-05, "loss": 0.3545, "step": 65495 }, { "epoch": 2.360615562042743, "grad_norm": 0.17606264352798462, "learning_rate": 2.8474138768651764e-05, "loss": 0.3993, "step": 65500 }, { "epoch": 2.360615562042743, "eval_loss": 0.43418970704078674, "eval_runtime": 3.5195, "eval_samples_per_second": 28.413, "eval_steps_per_second": 7.103, "step": 65500 }, { "epoch": 2.360795761703968, "grad_norm": 0.2237166315317154, "learning_rate": 2.8471248925360927e-05, "loss": 0.4224, "step": 65505 }, { "epoch": 2.3609759613651926, "grad_norm": 0.2391594648361206, "learning_rate": 2.8468359034775007e-05, "loss": 0.3891, "step": 65510 }, { "epoch": 2.3611561610264173, "grad_norm": 0.22656749188899994, "learning_rate": 2.8465469096933352e-05, "loss": 0.3941, "step": 65515 }, { "epoch": 2.361336360687642, "grad_norm": 0.24217578768730164, 
"learning_rate": 2.846257911187536e-05, "loss": 0.3866, "step": 65520 }, { "epoch": 2.3615165603488664, "grad_norm": 0.21993565559387207, "learning_rate": 2.845968907964039e-05, "loss": 0.4155, "step": 65525 }, { "epoch": 2.361696760010091, "grad_norm": 0.2261502742767334, "learning_rate": 2.845679900026783e-05, "loss": 0.3222, "step": 65530 }, { "epoch": 2.361876959671316, "grad_norm": 0.22237887978553772, "learning_rate": 2.8453908873797058e-05, "loss": 0.4065, "step": 65535 }, { "epoch": 2.3620571593325406, "grad_norm": 0.22978180646896362, "learning_rate": 2.845101870026744e-05, "loss": 0.3804, "step": 65540 }, { "epoch": 2.362237358993765, "grad_norm": 0.2067323923110962, "learning_rate": 2.8448128479718363e-05, "loss": 0.4058, "step": 65545 }, { "epoch": 2.3624175586549896, "grad_norm": 0.18871329724788666, "learning_rate": 2.8445238212189208e-05, "loss": 0.3929, "step": 65550 }, { "epoch": 2.3625977583162143, "grad_norm": 0.20891360938549042, "learning_rate": 2.8442347897719347e-05, "loss": 0.3813, "step": 65555 }, { "epoch": 2.362777957977439, "grad_norm": 0.20510460436344147, "learning_rate": 2.843945753634815e-05, "loss": 0.3578, "step": 65560 }, { "epoch": 2.362958157638664, "grad_norm": 0.20665214955806732, "learning_rate": 2.8436567128115022e-05, "loss": 0.406, "step": 65565 }, { "epoch": 2.3631383572998885, "grad_norm": 0.22137491405010223, "learning_rate": 2.8433676673059335e-05, "loss": 0.4062, "step": 65570 }, { "epoch": 2.363318556961113, "grad_norm": 0.17183031141757965, "learning_rate": 2.8430786171220457e-05, "loss": 0.3879, "step": 65575 }, { "epoch": 2.3634987566223375, "grad_norm": 0.2274576723575592, "learning_rate": 2.842789562263779e-05, "loss": 0.3876, "step": 65580 }, { "epoch": 2.3636789562835623, "grad_norm": 0.23537667095661163, "learning_rate": 2.8425005027350704e-05, "loss": 0.4282, "step": 65585 }, { "epoch": 2.363859155944787, "grad_norm": 0.2088640332221985, "learning_rate": 2.8422114385398597e-05, "loss": 0.3923, "step": 65590 
}, { "epoch": 2.3640393556060113, "grad_norm": 0.19276586174964905, "learning_rate": 2.841922369682083e-05, "loss": 0.3885, "step": 65595 }, { "epoch": 2.364219555267236, "grad_norm": 0.17863479256629944, "learning_rate": 2.8416332961656812e-05, "loss": 0.3814, "step": 65600 }, { "epoch": 2.3643997549284608, "grad_norm": 0.21695052087306976, "learning_rate": 2.8413442179945915e-05, "loss": 0.395, "step": 65605 }, { "epoch": 2.3645799545896855, "grad_norm": 0.20921245217323303, "learning_rate": 2.8410551351727532e-05, "loss": 0.3627, "step": 65610 }, { "epoch": 2.3647601542509102, "grad_norm": 0.18168587982654572, "learning_rate": 2.8407660477041044e-05, "loss": 0.3791, "step": 65615 }, { "epoch": 2.3649403539121345, "grad_norm": 0.21238493919372559, "learning_rate": 2.840476955592584e-05, "loss": 0.4178, "step": 65620 }, { "epoch": 2.3651205535733593, "grad_norm": 0.19079379737377167, "learning_rate": 2.8401878588421316e-05, "loss": 0.3875, "step": 65625 }, { "epoch": 2.365300753234584, "grad_norm": 0.17326843738555908, "learning_rate": 2.8398987574566848e-05, "loss": 0.3903, "step": 65630 }, { "epoch": 2.3654809528958087, "grad_norm": 0.2072458267211914, "learning_rate": 2.8396096514401838e-05, "loss": 0.4034, "step": 65635 }, { "epoch": 2.365661152557033, "grad_norm": 0.22681084275245667, "learning_rate": 2.839320540796567e-05, "loss": 0.3799, "step": 65640 }, { "epoch": 2.3658413522182578, "grad_norm": 0.20382151007652283, "learning_rate": 2.839031425529773e-05, "loss": 0.4067, "step": 65645 }, { "epoch": 2.3660215518794825, "grad_norm": 0.19017788767814636, "learning_rate": 2.8387423056437417e-05, "loss": 0.4157, "step": 65650 }, { "epoch": 2.366201751540707, "grad_norm": 0.24520789086818695, "learning_rate": 2.8384531811424127e-05, "loss": 0.415, "step": 65655 }, { "epoch": 2.366381951201932, "grad_norm": 0.23502486944198608, "learning_rate": 2.8381640520297244e-05, "loss": 0.4009, "step": 65660 }, { "epoch": 2.3665621508631562, "grad_norm": 
0.2212107628583908, "learning_rate": 2.8378749183096154e-05, "loss": 0.3814, "step": 65665 }, { "epoch": 2.366742350524381, "grad_norm": 0.16753430664539337, "learning_rate": 2.8375857799860272e-05, "loss": 0.3745, "step": 65670 }, { "epoch": 2.3669225501856057, "grad_norm": 0.1829628348350525, "learning_rate": 2.8372966370628978e-05, "loss": 0.4017, "step": 65675 }, { "epoch": 2.3671027498468304, "grad_norm": 0.210256427526474, "learning_rate": 2.8370074895441666e-05, "loss": 0.3829, "step": 65680 }, { "epoch": 2.3672829495080547, "grad_norm": 0.21395932137966156, "learning_rate": 2.8367183374337743e-05, "loss": 0.3984, "step": 65685 }, { "epoch": 2.3674631491692795, "grad_norm": 0.22579269111156464, "learning_rate": 2.8364291807356587e-05, "loss": 0.4096, "step": 65690 }, { "epoch": 2.367643348830504, "grad_norm": 0.22070454061031342, "learning_rate": 2.8361400194537623e-05, "loss": 0.4038, "step": 65695 }, { "epoch": 2.367823548491729, "grad_norm": 0.18013955652713776, "learning_rate": 2.835850853592022e-05, "loss": 0.3868, "step": 65700 }, { "epoch": 2.3680037481529537, "grad_norm": 0.1805589497089386, "learning_rate": 2.835561683154379e-05, "loss": 0.3842, "step": 65705 }, { "epoch": 2.368183947814178, "grad_norm": 0.17691951990127563, "learning_rate": 2.835272508144773e-05, "loss": 0.3759, "step": 65710 }, { "epoch": 2.3683641474754027, "grad_norm": 0.18184374272823334, "learning_rate": 2.8349833285671444e-05, "loss": 0.3885, "step": 65715 }, { "epoch": 2.3685443471366274, "grad_norm": 0.18779203295707703, "learning_rate": 2.8346941444254327e-05, "loss": 0.3972, "step": 65720 }, { "epoch": 2.368724546797852, "grad_norm": 0.20252971351146698, "learning_rate": 2.8344049557235775e-05, "loss": 0.3733, "step": 65725 }, { "epoch": 2.3689047464590764, "grad_norm": 0.17999647557735443, "learning_rate": 2.8341157624655202e-05, "loss": 0.3842, "step": 65730 }, { "epoch": 2.369084946120301, "grad_norm": 0.2151997685432434, "learning_rate": 2.8338265646552002e-05, 
"loss": 0.4194, "step": 65735 }, { "epoch": 2.369265145781526, "grad_norm": 0.22379030287265778, "learning_rate": 2.8335373622965576e-05, "loss": 0.4103, "step": 65740 }, { "epoch": 2.3694453454427507, "grad_norm": 0.20626239478588104, "learning_rate": 2.833248155393533e-05, "loss": 0.4013, "step": 65745 }, { "epoch": 2.3696255451039754, "grad_norm": 0.2125103622674942, "learning_rate": 2.8329589439500677e-05, "loss": 0.4139, "step": 65750 }, { "epoch": 2.3698057447651997, "grad_norm": 0.20649980008602142, "learning_rate": 2.8326697279701002e-05, "loss": 0.4195, "step": 65755 }, { "epoch": 2.3699859444264244, "grad_norm": 0.20015624165534973, "learning_rate": 2.832380507457572e-05, "loss": 0.3944, "step": 65760 }, { "epoch": 2.370166144087649, "grad_norm": 0.20391733944416046, "learning_rate": 2.8320912824164248e-05, "loss": 0.4466, "step": 65765 }, { "epoch": 2.370346343748874, "grad_norm": 0.21429723501205444, "learning_rate": 2.8318020528505967e-05, "loss": 0.4048, "step": 65770 }, { "epoch": 2.370526543410098, "grad_norm": 0.2534154951572418, "learning_rate": 2.831512818764031e-05, "loss": 0.3978, "step": 65775 }, { "epoch": 2.370706743071323, "grad_norm": 0.20116816461086273, "learning_rate": 2.8312235801606674e-05, "loss": 0.3887, "step": 65780 }, { "epoch": 2.3708869427325476, "grad_norm": 0.199702188372612, "learning_rate": 2.8309343370444457e-05, "loss": 0.3907, "step": 65785 }, { "epoch": 2.3710671423937724, "grad_norm": 0.21394966542720795, "learning_rate": 2.8306450894193086e-05, "loss": 0.3903, "step": 65790 }, { "epoch": 2.371247342054997, "grad_norm": 0.21755504608154297, "learning_rate": 2.8303558372891953e-05, "loss": 0.3855, "step": 65795 }, { "epoch": 2.3714275417162214, "grad_norm": 0.23619835078716278, "learning_rate": 2.830066580658049e-05, "loss": 0.4036, "step": 65800 }, { "epoch": 2.371607741377446, "grad_norm": 0.16918082535266876, "learning_rate": 2.8297773195298084e-05, "loss": 0.39, "step": 65805 }, { "epoch": 2.371787941038671, 
"grad_norm": 0.20656408369541168, "learning_rate": 2.8294880539084163e-05, "loss": 0.3941, "step": 65810 }, { "epoch": 2.3719681406998956, "grad_norm": 0.18240268528461456, "learning_rate": 2.8291987837978125e-05, "loss": 0.4155, "step": 65815 }, { "epoch": 2.37214834036112, "grad_norm": 0.19765466451644897, "learning_rate": 2.8289095092019396e-05, "loss": 0.3989, "step": 65820 }, { "epoch": 2.3723285400223446, "grad_norm": 0.2510075271129608, "learning_rate": 2.8286202301247382e-05, "loss": 0.401, "step": 65825 }, { "epoch": 2.3725087396835693, "grad_norm": 0.23590603470802307, "learning_rate": 2.828330946570149e-05, "loss": 0.4273, "step": 65830 }, { "epoch": 2.372688939344794, "grad_norm": 0.20524172484874725, "learning_rate": 2.8280416585421155e-05, "loss": 0.3926, "step": 65835 }, { "epoch": 2.372869139006019, "grad_norm": 0.21674352884292603, "learning_rate": 2.8277523660445776e-05, "loss": 0.3717, "step": 65840 }, { "epoch": 2.3730493386672435, "grad_norm": 0.20333261787891388, "learning_rate": 2.827463069081477e-05, "loss": 0.4245, "step": 65845 }, { "epoch": 2.373229538328468, "grad_norm": 0.16487130522727966, "learning_rate": 2.827173767656755e-05, "loss": 0.3902, "step": 65850 }, { "epoch": 2.3734097379896926, "grad_norm": 0.1875753402709961, "learning_rate": 2.8268844617743544e-05, "loss": 0.4189, "step": 65855 }, { "epoch": 2.3735899376509173, "grad_norm": 0.20067238807678223, "learning_rate": 2.826595151438216e-05, "loss": 0.3785, "step": 65860 }, { "epoch": 2.373770137312142, "grad_norm": 0.23048771917819977, "learning_rate": 2.826305836652282e-05, "loss": 0.4162, "step": 65865 }, { "epoch": 2.3739503369733663, "grad_norm": 0.19079884886741638, "learning_rate": 2.8260165174204938e-05, "loss": 0.3493, "step": 65870 }, { "epoch": 2.374130536634591, "grad_norm": 0.1478530913591385, "learning_rate": 2.825727193746794e-05, "loss": 0.3911, "step": 65875 }, { "epoch": 2.374310736295816, "grad_norm": 0.2297230362892151, "learning_rate": 
2.825437865635125e-05, "loss": 0.4319, "step": 65880 }, { "epoch": 2.3744909359570405, "grad_norm": 0.22544041275978088, "learning_rate": 2.8251485330894266e-05, "loss": 0.3452, "step": 65885 }, { "epoch": 2.3746711356182653, "grad_norm": 0.19545534253120422, "learning_rate": 2.8248591961136435e-05, "loss": 0.4169, "step": 65890 }, { "epoch": 2.3748513352794896, "grad_norm": 0.18258170783519745, "learning_rate": 2.8245698547117162e-05, "loss": 0.3857, "step": 65895 }, { "epoch": 2.3750315349407143, "grad_norm": 0.19753779470920563, "learning_rate": 2.8242805088875874e-05, "loss": 0.3941, "step": 65900 }, { "epoch": 2.375211734601939, "grad_norm": 0.21284906566143036, "learning_rate": 2.8239911586452e-05, "loss": 0.3825, "step": 65905 }, { "epoch": 2.3753919342631638, "grad_norm": 0.18012243509292603, "learning_rate": 2.823701803988495e-05, "loss": 0.3937, "step": 65910 }, { "epoch": 2.375572133924388, "grad_norm": 0.2071056365966797, "learning_rate": 2.8234124449214163e-05, "loss": 0.3982, "step": 65915 }, { "epoch": 2.3757523335856128, "grad_norm": 0.21273143589496613, "learning_rate": 2.8231230814479052e-05, "loss": 0.3928, "step": 65920 }, { "epoch": 2.3759325332468375, "grad_norm": 0.21067939698696136, "learning_rate": 2.8228337135719046e-05, "loss": 0.3612, "step": 65925 }, { "epoch": 2.3761127329080622, "grad_norm": 0.22029119729995728, "learning_rate": 2.822544341297358e-05, "loss": 0.3829, "step": 65930 }, { "epoch": 2.376292932569287, "grad_norm": 0.2073868066072464, "learning_rate": 2.822254964628206e-05, "loss": 0.4605, "step": 65935 }, { "epoch": 2.3764731322305113, "grad_norm": 0.1779036968946457, "learning_rate": 2.821965583568394e-05, "loss": 0.3912, "step": 65940 }, { "epoch": 2.376653331891736, "grad_norm": 0.1958044022321701, "learning_rate": 2.821676198121862e-05, "loss": 0.4263, "step": 65945 }, { "epoch": 2.3768335315529607, "grad_norm": 0.1888829916715622, "learning_rate": 2.8213868082925542e-05, "loss": 0.3649, "step": 65950 }, { "epoch": 
2.3770137312141855, "grad_norm": 0.2218841016292572, "learning_rate": 2.8210974140844137e-05, "loss": 0.3925, "step": 65955 }, { "epoch": 2.3771939308754098, "grad_norm": 0.19127243757247925, "learning_rate": 2.8208080155013826e-05, "loss": 0.4105, "step": 65960 }, { "epoch": 2.3773741305366345, "grad_norm": 0.19066543877124786, "learning_rate": 2.8205186125474054e-05, "loss": 0.3651, "step": 65965 }, { "epoch": 2.3775543301978592, "grad_norm": 0.1706237643957138, "learning_rate": 2.8202292052264234e-05, "loss": 0.4082, "step": 65970 }, { "epoch": 2.377734529859084, "grad_norm": 0.21013911068439484, "learning_rate": 2.8199397935423805e-05, "loss": 0.424, "step": 65975 }, { "epoch": 2.3779147295203087, "grad_norm": 0.22082720696926117, "learning_rate": 2.81965037749922e-05, "loss": 0.3968, "step": 65980 }, { "epoch": 2.378094929181533, "grad_norm": 0.21986991167068481, "learning_rate": 2.8193609571008856e-05, "loss": 0.4173, "step": 65985 }, { "epoch": 2.3782751288427577, "grad_norm": 0.22359991073608398, "learning_rate": 2.819071532351319e-05, "loss": 0.4279, "step": 65990 }, { "epoch": 2.3784553285039824, "grad_norm": 0.18317356705665588, "learning_rate": 2.8187821032544648e-05, "loss": 0.382, "step": 65995 }, { "epoch": 2.378635528165207, "grad_norm": 0.21548272669315338, "learning_rate": 2.818492669814266e-05, "loss": 0.3584, "step": 66000 }, { "epoch": 2.378635528165207, "eval_loss": 0.4347288906574249, "eval_runtime": 3.5302, "eval_samples_per_second": 28.327, "eval_steps_per_second": 7.082, "step": 66000 }, { "epoch": 2.3788157278264315, "grad_norm": 0.2195282280445099, "learning_rate": 2.8182032320346668e-05, "loss": 0.3977, "step": 66005 }, { "epoch": 2.378995927487656, "grad_norm": 0.19582293927669525, "learning_rate": 2.81791378991961e-05, "loss": 0.3921, "step": 66010 }, { "epoch": 2.379176127148881, "grad_norm": 0.1854901909828186, "learning_rate": 2.8176243434730388e-05, "loss": 0.3933, "step": 66015 }, { "epoch": 2.3793563268101057, "grad_norm": 
0.2092180699110031, "learning_rate": 2.817334892698898e-05, "loss": 0.4335, "step": 66020 }, { "epoch": 2.3795365264713304, "grad_norm": 0.1754889339208603, "learning_rate": 2.8170454376011307e-05, "loss": 0.4288, "step": 66025 }, { "epoch": 2.3797167261325547, "grad_norm": 0.21382570266723633, "learning_rate": 2.816755978183681e-05, "loss": 0.4057, "step": 66030 }, { "epoch": 2.3798969257937794, "grad_norm": 0.183826744556427, "learning_rate": 2.8164665144504914e-05, "loss": 0.3857, "step": 66035 }, { "epoch": 2.380077125455004, "grad_norm": 0.17409420013427734, "learning_rate": 2.8161770464055077e-05, "loss": 0.394, "step": 66040 }, { "epoch": 2.380257325116229, "grad_norm": 0.19534289836883545, "learning_rate": 2.815887574052673e-05, "loss": 0.4162, "step": 66045 }, { "epoch": 2.380437524777453, "grad_norm": 0.2002120018005371, "learning_rate": 2.8155980973959308e-05, "loss": 0.3953, "step": 66050 }, { "epoch": 2.380617724438678, "grad_norm": 0.23090575635433197, "learning_rate": 2.8153086164392258e-05, "loss": 0.4104, "step": 66055 }, { "epoch": 2.3807979240999027, "grad_norm": 0.22412335872650146, "learning_rate": 2.8150191311865014e-05, "loss": 0.4029, "step": 66060 }, { "epoch": 2.3809781237611274, "grad_norm": 0.1874934881925583, "learning_rate": 2.814729641641703e-05, "loss": 0.4051, "step": 66065 }, { "epoch": 2.381158323422352, "grad_norm": 0.23350152373313904, "learning_rate": 2.8144401478087744e-05, "loss": 0.4113, "step": 66070 }, { "epoch": 2.381338523083577, "grad_norm": 0.21121875941753387, "learning_rate": 2.8141506496916586e-05, "loss": 0.3871, "step": 66075 }, { "epoch": 2.381518722744801, "grad_norm": 0.1994001567363739, "learning_rate": 2.8138611472943023e-05, "loss": 0.4012, "step": 66080 }, { "epoch": 2.381698922406026, "grad_norm": 0.18203619122505188, "learning_rate": 2.813571640620648e-05, "loss": 0.3756, "step": 66085 }, { "epoch": 2.3818791220672506, "grad_norm": 0.18496555089950562, "learning_rate": 2.813282129674641e-05, "loss": 
0.4213, "step": 66090 }, { "epoch": 2.3820593217284753, "grad_norm": 0.22467875480651855, "learning_rate": 2.8129926144602247e-05, "loss": 0.4402, "step": 66095 }, { "epoch": 2.3822395213896996, "grad_norm": 0.1796996146440506, "learning_rate": 2.812703094981346e-05, "loss": 0.4387, "step": 66100 }, { "epoch": 2.3824197210509244, "grad_norm": 0.20980200171470642, "learning_rate": 2.8124135712419476e-05, "loss": 0.4291, "step": 66105 }, { "epoch": 2.382599920712149, "grad_norm": 0.1975623220205307, "learning_rate": 2.8121240432459746e-05, "loss": 0.3491, "step": 66110 }, { "epoch": 2.382780120373374, "grad_norm": 0.2462427318096161, "learning_rate": 2.811834510997372e-05, "loss": 0.3775, "step": 66115 }, { "epoch": 2.3829603200345986, "grad_norm": 0.201690673828125, "learning_rate": 2.8115449745000845e-05, "loss": 0.4109, "step": 66120 }, { "epoch": 2.383140519695823, "grad_norm": 0.18545566499233246, "learning_rate": 2.8112554337580577e-05, "loss": 0.3889, "step": 66125 }, { "epoch": 2.3833207193570476, "grad_norm": 0.19686880707740784, "learning_rate": 2.810965888775235e-05, "loss": 0.3594, "step": 66130 }, { "epoch": 2.3835009190182723, "grad_norm": 0.24028317630290985, "learning_rate": 2.810676339555563e-05, "loss": 0.4291, "step": 66135 }, { "epoch": 2.383681118679497, "grad_norm": 0.2691000998020172, "learning_rate": 2.8103867861029852e-05, "loss": 0.3862, "step": 66140 }, { "epoch": 2.3838613183407213, "grad_norm": 0.18645727634429932, "learning_rate": 2.8100972284214476e-05, "loss": 0.3964, "step": 66145 }, { "epoch": 2.384041518001946, "grad_norm": 0.21478243172168732, "learning_rate": 2.8098076665148965e-05, "loss": 0.4038, "step": 66150 }, { "epoch": 2.384221717663171, "grad_norm": 0.22936570644378662, "learning_rate": 2.8095181003872746e-05, "loss": 0.3795, "step": 66155 }, { "epoch": 2.3844019173243955, "grad_norm": 0.22326894104480743, "learning_rate": 2.809228530042529e-05, "loss": 0.4041, "step": 66160 }, { "epoch": 2.3845821169856203, "grad_norm": 
0.22681626677513123, "learning_rate": 2.8089389554846048e-05, "loss": 0.4157, "step": 66165 }, { "epoch": 2.3847623166468446, "grad_norm": 0.19719672203063965, "learning_rate": 2.808649376717447e-05, "loss": 0.37, "step": 66170 }, { "epoch": 2.3849425163080693, "grad_norm": 0.22357481718063354, "learning_rate": 2.8083597937450006e-05, "loss": 0.385, "step": 66175 }, { "epoch": 2.385122715969294, "grad_norm": 0.22523802518844604, "learning_rate": 2.8080702065712127e-05, "loss": 0.3971, "step": 66180 }, { "epoch": 2.3853029156305188, "grad_norm": 0.2830953001976013, "learning_rate": 2.8077806152000273e-05, "loss": 0.3931, "step": 66185 }, { "epoch": 2.385483115291743, "grad_norm": 0.21233409643173218, "learning_rate": 2.8074910196353904e-05, "loss": 0.3689, "step": 66190 }, { "epoch": 2.385663314952968, "grad_norm": 0.20997081696987152, "learning_rate": 2.8072014198812487e-05, "loss": 0.3721, "step": 66195 }, { "epoch": 2.3858435146141925, "grad_norm": 0.19072303175926208, "learning_rate": 2.8069118159415458e-05, "loss": 0.3859, "step": 66200 }, { "epoch": 2.3860237142754173, "grad_norm": 0.2057289481163025, "learning_rate": 2.8066222078202303e-05, "loss": 0.3966, "step": 66205 }, { "epoch": 2.386203913936642, "grad_norm": 0.1966446340084076, "learning_rate": 2.806332595521246e-05, "loss": 0.3794, "step": 66210 }, { "epoch": 2.3863841135978663, "grad_norm": 0.2165762335062027, "learning_rate": 2.8060429790485386e-05, "loss": 0.4308, "step": 66215 }, { "epoch": 2.386564313259091, "grad_norm": 0.1911364644765854, "learning_rate": 2.8057533584060558e-05, "loss": 0.3758, "step": 66220 }, { "epoch": 2.3867445129203158, "grad_norm": 0.2317955642938614, "learning_rate": 2.8054637335977423e-05, "loss": 0.4062, "step": 66225 }, { "epoch": 2.3869247125815405, "grad_norm": 0.21293920278549194, "learning_rate": 2.8051741046275453e-05, "loss": 0.4013, "step": 66230 }, { "epoch": 2.3871049122427648, "grad_norm": 0.17797954380512238, "learning_rate": 2.8048844714994092e-05, "loss": 
0.3899, "step": 66235 }, { "epoch": 2.3872851119039895, "grad_norm": 0.21565930545330048, "learning_rate": 2.8045948342172823e-05, "loss": 0.3822, "step": 66240 }, { "epoch": 2.3874653115652142, "grad_norm": 0.19301265478134155, "learning_rate": 2.8043051927851083e-05, "loss": 0.4029, "step": 66245 }, { "epoch": 2.387645511226439, "grad_norm": 0.20318666100502014, "learning_rate": 2.8040155472068365e-05, "loss": 0.3968, "step": 66250 }, { "epoch": 2.3878257108876637, "grad_norm": 0.18554876744747162, "learning_rate": 2.8037258974864105e-05, "loss": 0.3907, "step": 66255 }, { "epoch": 2.388005910548888, "grad_norm": 0.2470291256904602, "learning_rate": 2.8034362436277784e-05, "loss": 0.4145, "step": 66260 }, { "epoch": 2.3881861102101127, "grad_norm": 0.20082451403141022, "learning_rate": 2.8031465856348865e-05, "loss": 0.4123, "step": 66265 }, { "epoch": 2.3883663098713375, "grad_norm": 0.19475050270557404, "learning_rate": 2.802856923511681e-05, "loss": 0.4013, "step": 66270 }, { "epoch": 2.388546509532562, "grad_norm": 0.22373121976852417, "learning_rate": 2.8025672572621088e-05, "loss": 0.3921, "step": 66275 }, { "epoch": 2.3887267091937865, "grad_norm": 0.25062522292137146, "learning_rate": 2.8022775868901153e-05, "loss": 0.4029, "step": 66280 }, { "epoch": 2.3889069088550112, "grad_norm": 0.19427599012851715, "learning_rate": 2.801987912399649e-05, "loss": 0.3677, "step": 66285 }, { "epoch": 2.389087108516236, "grad_norm": 0.1876882165670395, "learning_rate": 2.8016982337946556e-05, "loss": 0.3781, "step": 66290 }, { "epoch": 2.3892673081774607, "grad_norm": 0.18821240961551666, "learning_rate": 2.801408551079082e-05, "loss": 0.3557, "step": 66295 }, { "epoch": 2.3894475078386854, "grad_norm": 0.21924707293510437, "learning_rate": 2.801118864256876e-05, "loss": 0.4088, "step": 66300 }, { "epoch": 2.3896277074999097, "grad_norm": 0.19032910466194153, "learning_rate": 2.8008291733319824e-05, "loss": 0.3856, "step": 66305 }, { "epoch": 2.3898079071611344, 
"grad_norm": 0.1727897822856903, "learning_rate": 2.800539478308351e-05, "loss": 0.4037, "step": 66310 }, { "epoch": 2.389988106822359, "grad_norm": 0.21995660662651062, "learning_rate": 2.8002497791899268e-05, "loss": 0.4113, "step": 66315 }, { "epoch": 2.390168306483584, "grad_norm": 0.20189730823040009, "learning_rate": 2.7999600759806577e-05, "loss": 0.3665, "step": 66320 }, { "epoch": 2.390348506144808, "grad_norm": 0.21424639225006104, "learning_rate": 2.79967036868449e-05, "loss": 0.4236, "step": 66325 }, { "epoch": 2.390528705806033, "grad_norm": 0.2475886195898056, "learning_rate": 2.7993806573053723e-05, "loss": 0.4123, "step": 66330 }, { "epoch": 2.3907089054672577, "grad_norm": 0.17715711891651154, "learning_rate": 2.7990909418472505e-05, "loss": 0.4205, "step": 66335 }, { "epoch": 2.3908891051284824, "grad_norm": 0.20529335737228394, "learning_rate": 2.7988012223140726e-05, "loss": 0.4083, "step": 66340 }, { "epoch": 2.391069304789707, "grad_norm": 0.18749858438968658, "learning_rate": 2.798511498709786e-05, "loss": 0.4141, "step": 66345 }, { "epoch": 2.391249504450932, "grad_norm": 0.18986235558986664, "learning_rate": 2.7982217710383386e-05, "loss": 0.4154, "step": 66350 }, { "epoch": 2.391429704112156, "grad_norm": 0.19709980487823486, "learning_rate": 2.7979320393036762e-05, "loss": 0.3889, "step": 66355 }, { "epoch": 2.391609903773381, "grad_norm": 0.17932918667793274, "learning_rate": 2.7976423035097486e-05, "loss": 0.3893, "step": 66360 }, { "epoch": 2.3917901034346056, "grad_norm": 0.20083005726337433, "learning_rate": 2.7973525636605014e-05, "loss": 0.4004, "step": 66365 }, { "epoch": 2.3919703030958304, "grad_norm": 0.21819722652435303, "learning_rate": 2.797062819759884e-05, "loss": 0.414, "step": 66370 }, { "epoch": 2.3921505027570547, "grad_norm": 0.22273634374141693, "learning_rate": 2.7967730718118424e-05, "loss": 0.4096, "step": 66375 }, { "epoch": 2.3923307024182794, "grad_norm": 0.18526360392570496, "learning_rate": 
2.7964833198203254e-05, "loss": 0.3985, "step": 66380 }, { "epoch": 2.392510902079504, "grad_norm": 0.2259715348482132, "learning_rate": 2.79619356378928e-05, "loss": 0.3778, "step": 66385 }, { "epoch": 2.392691101740729, "grad_norm": 0.17341482639312744, "learning_rate": 2.7959038037226554e-05, "loss": 0.3816, "step": 66390 }, { "epoch": 2.3928713014019536, "grad_norm": 0.22242747247219086, "learning_rate": 2.7956140396243986e-05, "loss": 0.378, "step": 66395 }, { "epoch": 2.393051501063178, "grad_norm": 0.19790546596050262, "learning_rate": 2.7953242714984573e-05, "loss": 0.3698, "step": 66400 }, { "epoch": 2.3932317007244026, "grad_norm": 0.2102694809436798, "learning_rate": 2.7950344993487803e-05, "loss": 0.4262, "step": 66405 }, { "epoch": 2.3934119003856273, "grad_norm": 0.19927994906902313, "learning_rate": 2.7947447231793156e-05, "loss": 0.4135, "step": 66410 }, { "epoch": 2.393592100046852, "grad_norm": 0.20652970671653748, "learning_rate": 2.7944549429940115e-05, "loss": 0.3911, "step": 66415 }, { "epoch": 2.3937722997080764, "grad_norm": 0.17121002078056335, "learning_rate": 2.7941651587968147e-05, "loss": 0.394, "step": 66420 }, { "epoch": 2.393952499369301, "grad_norm": 0.20938289165496826, "learning_rate": 2.7938753705916752e-05, "loss": 0.3841, "step": 66425 }, { "epoch": 2.394132699030526, "grad_norm": 0.2229117602109909, "learning_rate": 2.7935855783825406e-05, "loss": 0.4056, "step": 66430 }, { "epoch": 2.3943128986917506, "grad_norm": 0.2586416006088257, "learning_rate": 2.7932957821733592e-05, "loss": 0.4423, "step": 66435 }, { "epoch": 2.3944930983529753, "grad_norm": 0.22627079486846924, "learning_rate": 2.79300598196808e-05, "loss": 0.3877, "step": 66440 }, { "epoch": 2.3946732980141996, "grad_norm": 0.20481657981872559, "learning_rate": 2.79271617777065e-05, "loss": 0.3804, "step": 66445 }, { "epoch": 2.3948534976754243, "grad_norm": 0.18298093974590302, "learning_rate": 2.7924263695850196e-05, "loss": 0.4435, "step": 66450 }, { "epoch": 
2.395033697336649, "grad_norm": 0.2255946695804596, "learning_rate": 2.7921365574151364e-05, "loss": 0.3889, "step": 66455 }, { "epoch": 2.395213896997874, "grad_norm": 0.15484662353992462, "learning_rate": 2.7918467412649495e-05, "loss": 0.3927, "step": 66460 }, { "epoch": 2.395394096659098, "grad_norm": 0.23011015355587006, "learning_rate": 2.7915569211384064e-05, "loss": 0.3822, "step": 66465 }, { "epoch": 2.395574296320323, "grad_norm": 0.21641449630260468, "learning_rate": 2.791267097039457e-05, "loss": 0.3903, "step": 66470 }, { "epoch": 2.3957544959815475, "grad_norm": 0.19246269762516022, "learning_rate": 2.7909772689720503e-05, "loss": 0.3717, "step": 66475 }, { "epoch": 2.3959346956427723, "grad_norm": 0.1879129558801651, "learning_rate": 2.7906874369401342e-05, "loss": 0.4046, "step": 66480 }, { "epoch": 2.396114895303997, "grad_norm": 0.20422397553920746, "learning_rate": 2.7903976009476584e-05, "loss": 0.353, "step": 66485 }, { "epoch": 2.3962950949652213, "grad_norm": 0.19214318692684174, "learning_rate": 2.7901077609985708e-05, "loss": 0.3937, "step": 66490 }, { "epoch": 2.396475294626446, "grad_norm": 0.20487315952777863, "learning_rate": 2.789817917096822e-05, "loss": 0.3955, "step": 66495 }, { "epoch": 2.3966554942876708, "grad_norm": 0.19864313304424286, "learning_rate": 2.78952806924636e-05, "loss": 0.3657, "step": 66500 }, { "epoch": 2.3966554942876708, "eval_loss": 0.43440091609954834, "eval_runtime": 3.5323, "eval_samples_per_second": 28.31, "eval_steps_per_second": 7.077, "step": 66500 }, { "epoch": 2.3968356939488955, "grad_norm": 0.19604401290416718, "learning_rate": 2.7892382174511333e-05, "loss": 0.3751, "step": 66505 }, { "epoch": 2.39701589361012, "grad_norm": 0.27748361229896545, "learning_rate": 2.7889483617150924e-05, "loss": 0.3847, "step": 66510 }, { "epoch": 2.3971960932713445, "grad_norm": 0.20608752965927124, "learning_rate": 2.7886585020421863e-05, "loss": 0.364, "step": 66515 }, { "epoch": 2.3973762929325693, "grad_norm": 
0.20215480029582977, "learning_rate": 2.788368638436364e-05, "loss": 0.3916, "step": 66520 }, { "epoch": 2.397556492593794, "grad_norm": 0.25804924964904785, "learning_rate": 2.7880787709015742e-05, "loss": 0.4008, "step": 66525 }, { "epoch": 2.3977366922550187, "grad_norm": 0.2014016956090927, "learning_rate": 2.787788899441768e-05, "loss": 0.3776, "step": 66530 }, { "epoch": 2.397916891916243, "grad_norm": 0.15427666902542114, "learning_rate": 2.7874990240608927e-05, "loss": 0.3746, "step": 66535 }, { "epoch": 2.3980970915774678, "grad_norm": 0.15082351863384247, "learning_rate": 2.787209144762899e-05, "loss": 0.3814, "step": 66540 }, { "epoch": 2.3982772912386925, "grad_norm": 0.1636931151151657, "learning_rate": 2.786919261551737e-05, "loss": 0.3839, "step": 66545 }, { "epoch": 2.398457490899917, "grad_norm": 0.17769426107406616, "learning_rate": 2.786629374431355e-05, "loss": 0.3759, "step": 66550 }, { "epoch": 2.3986376905611415, "grad_norm": 0.2125890552997589, "learning_rate": 2.7863394834057038e-05, "loss": 0.4056, "step": 66555 }, { "epoch": 2.3988178902223662, "grad_norm": 0.18982288241386414, "learning_rate": 2.7860495884787318e-05, "loss": 0.4115, "step": 66560 }, { "epoch": 2.398998089883591, "grad_norm": 0.18557365238666534, "learning_rate": 2.78575968965439e-05, "loss": 0.3919, "step": 66565 }, { "epoch": 2.3991782895448157, "grad_norm": 0.17741166055202484, "learning_rate": 2.7854697869366277e-05, "loss": 0.4147, "step": 66570 }, { "epoch": 2.3993584892060404, "grad_norm": 0.20264872908592224, "learning_rate": 2.785179880329395e-05, "loss": 0.3773, "step": 66575 }, { "epoch": 2.399538688867265, "grad_norm": 0.20139038562774658, "learning_rate": 2.7848899698366414e-05, "loss": 0.3799, "step": 66580 }, { "epoch": 2.3997188885284895, "grad_norm": 0.21390214562416077, "learning_rate": 2.7846000554623168e-05, "loss": 0.4303, "step": 66585 }, { "epoch": 2.399899088189714, "grad_norm": 0.20651130378246307, "learning_rate": 2.7843101372103726e-05, "loss": 
0.4191, "step": 66590 }, { "epoch": 2.400079287850939, "grad_norm": 0.20792073011398315, "learning_rate": 2.7840202150847566e-05, "loss": 0.3953, "step": 66595 }, { "epoch": 2.4002594875121637, "grad_norm": 0.17147089540958405, "learning_rate": 2.783730289089421e-05, "loss": 0.3817, "step": 66600 }, { "epoch": 2.400439687173388, "grad_norm": 0.1750619262456894, "learning_rate": 2.7834403592283147e-05, "loss": 0.4007, "step": 66605 }, { "epoch": 2.4006198868346127, "grad_norm": 0.20801058411598206, "learning_rate": 2.7831504255053886e-05, "loss": 0.3996, "step": 66610 }, { "epoch": 2.4008000864958374, "grad_norm": 0.2012956142425537, "learning_rate": 2.7828604879245923e-05, "loss": 0.383, "step": 66615 }, { "epoch": 2.400980286157062, "grad_norm": 0.18420088291168213, "learning_rate": 2.7825705464898777e-05, "loss": 0.3725, "step": 66620 }, { "epoch": 2.401160485818287, "grad_norm": 0.2484653741121292, "learning_rate": 2.7822806012051934e-05, "loss": 0.4174, "step": 66625 }, { "epoch": 2.401340685479511, "grad_norm": 0.19384010136127472, "learning_rate": 2.7819906520744903e-05, "loss": 0.3899, "step": 66630 }, { "epoch": 2.401520885140736, "grad_norm": 0.19840480387210846, "learning_rate": 2.7817006991017196e-05, "loss": 0.3819, "step": 66635 }, { "epoch": 2.4017010848019607, "grad_norm": 0.20642586052417755, "learning_rate": 2.7814107422908315e-05, "loss": 0.3706, "step": 66640 }, { "epoch": 2.4018812844631854, "grad_norm": 0.20059265196323395, "learning_rate": 2.7811207816457756e-05, "loss": 0.3905, "step": 66645 }, { "epoch": 2.4020614841244097, "grad_norm": 0.19518163800239563, "learning_rate": 2.7808308171705045e-05, "loss": 0.3732, "step": 66650 }, { "epoch": 2.4022416837856344, "grad_norm": 0.2086179107427597, "learning_rate": 2.780540848868967e-05, "loss": 0.3932, "step": 66655 }, { "epoch": 2.402421883446859, "grad_norm": 0.18908485770225525, "learning_rate": 2.780250876745116e-05, "loss": 0.3602, "step": 66660 }, { "epoch": 2.402602083108084, "grad_norm": 
0.19856788218021393, "learning_rate": 2.7799609008029004e-05, "loss": 0.3977, "step": 66665 }, { "epoch": 2.4027822827693086, "grad_norm": 0.1787065863609314, "learning_rate": 2.779670921046272e-05, "loss": 0.3638, "step": 66670 }, { "epoch": 2.402962482430533, "grad_norm": 0.225900337100029, "learning_rate": 2.7793809374791807e-05, "loss": 0.3711, "step": 66675 }, { "epoch": 2.4031426820917576, "grad_norm": 0.20942628383636475, "learning_rate": 2.779090950105579e-05, "loss": 0.3831, "step": 66680 }, { "epoch": 2.4033228817529824, "grad_norm": 0.1625223010778427, "learning_rate": 2.7788009589294167e-05, "loss": 0.368, "step": 66685 }, { "epoch": 2.403503081414207, "grad_norm": 0.17064473032951355, "learning_rate": 2.7785109639546454e-05, "loss": 0.4337, "step": 66690 }, { "epoch": 2.4036832810754314, "grad_norm": 0.1989968866109848, "learning_rate": 2.778220965185216e-05, "loss": 0.3975, "step": 66695 }, { "epoch": 2.403863480736656, "grad_norm": 0.1930345594882965, "learning_rate": 2.77793096262508e-05, "loss": 0.3987, "step": 66700 }, { "epoch": 2.404043680397881, "grad_norm": 0.22240734100341797, "learning_rate": 2.7776409562781885e-05, "loss": 0.4242, "step": 66705 }, { "epoch": 2.4042238800591056, "grad_norm": 0.22761568427085876, "learning_rate": 2.7773509461484924e-05, "loss": 0.3846, "step": 66710 }, { "epoch": 2.4044040797203303, "grad_norm": 0.18993456661701202, "learning_rate": 2.7770609322399438e-05, "loss": 0.3828, "step": 66715 }, { "epoch": 2.4045842793815546, "grad_norm": 0.253111869096756, "learning_rate": 2.7767709145564936e-05, "loss": 0.3801, "step": 66720 }, { "epoch": 2.4047644790427793, "grad_norm": 0.2091599851846695, "learning_rate": 2.7764808931020923e-05, "loss": 0.3715, "step": 66725 }, { "epoch": 2.404944678704004, "grad_norm": 0.19851459562778473, "learning_rate": 2.7761908678806937e-05, "loss": 0.4124, "step": 66730 }, { "epoch": 2.405124878365229, "grad_norm": 0.19328032433986664, "learning_rate": 2.775900838896247e-05, "loss": 
0.3716, "step": 66735 }, { "epoch": 2.405305078026453, "grad_norm": 0.18993335962295532, "learning_rate": 2.7756108061527062e-05, "loss": 0.3552, "step": 66740 }, { "epoch": 2.405485277687678, "grad_norm": 0.19817394018173218, "learning_rate": 2.77532076965402e-05, "loss": 0.4001, "step": 66745 }, { "epoch": 2.4056654773489026, "grad_norm": 0.19719624519348145, "learning_rate": 2.7750307294041423e-05, "loss": 0.3777, "step": 66750 }, { "epoch": 2.4058456770101273, "grad_norm": 0.20810894668102264, "learning_rate": 2.774740685407024e-05, "loss": 0.3946, "step": 66755 }, { "epoch": 2.406025876671352, "grad_norm": 0.1935310959815979, "learning_rate": 2.7744506376666175e-05, "loss": 0.3903, "step": 66760 }, { "epoch": 2.4062060763325763, "grad_norm": 0.24938994646072388, "learning_rate": 2.7741605861868735e-05, "loss": 0.3914, "step": 66765 }, { "epoch": 2.406386275993801, "grad_norm": 0.22419728338718414, "learning_rate": 2.7738705309717444e-05, "loss": 0.4173, "step": 66770 }, { "epoch": 2.406566475655026, "grad_norm": 0.17850545048713684, "learning_rate": 2.7735804720251835e-05, "loss": 0.4045, "step": 66775 }, { "epoch": 2.4067466753162505, "grad_norm": 0.23113694787025452, "learning_rate": 2.773290409351141e-05, "loss": 0.3859, "step": 66780 }, { "epoch": 2.406926874977475, "grad_norm": 0.24465039372444153, "learning_rate": 2.7730003429535688e-05, "loss": 0.3736, "step": 66785 }, { "epoch": 2.4071070746386996, "grad_norm": 0.22059732675552368, "learning_rate": 2.7727102728364207e-05, "loss": 0.3946, "step": 66790 }, { "epoch": 2.4072872742999243, "grad_norm": 0.20629587769508362, "learning_rate": 2.7724201990036474e-05, "loss": 0.3921, "step": 66795 }, { "epoch": 2.407467473961149, "grad_norm": 0.2906322479248047, "learning_rate": 2.772130121459202e-05, "loss": 0.3928, "step": 66800 }, { "epoch": 2.4076476736223738, "grad_norm": 0.1948506087064743, "learning_rate": 2.771840040207037e-05, "loss": 0.3751, "step": 66805 }, { "epoch": 2.407827873283598, "grad_norm": 
0.1990462988615036, "learning_rate": 2.7715499552511036e-05, "loss": 0.4181, "step": 66810 }, { "epoch": 2.4080080729448228, "grad_norm": 0.23790279030799866, "learning_rate": 2.7712598665953543e-05, "loss": 0.3669, "step": 66815 }, { "epoch": 2.4081882726060475, "grad_norm": 0.18465937674045563, "learning_rate": 2.7709697742437418e-05, "loss": 0.4092, "step": 66820 }, { "epoch": 2.4083684722672722, "grad_norm": 0.16770188510417938, "learning_rate": 2.7706796782002192e-05, "loss": 0.37, "step": 66825 }, { "epoch": 2.4085486719284965, "grad_norm": 0.2589314877986908, "learning_rate": 2.770389578468738e-05, "loss": 0.4184, "step": 66830 }, { "epoch": 2.4087288715897213, "grad_norm": 0.18053090572357178, "learning_rate": 2.770099475053251e-05, "loss": 0.3487, "step": 66835 }, { "epoch": 2.408909071250946, "grad_norm": 0.2627122700214386, "learning_rate": 2.7698093679577108e-05, "loss": 0.4316, "step": 66840 }, { "epoch": 2.4090892709121707, "grad_norm": 0.18675784766674042, "learning_rate": 2.769519257186071e-05, "loss": 0.3685, "step": 66845 }, { "epoch": 2.4092694705733955, "grad_norm": 0.27119845151901245, "learning_rate": 2.7692291427422828e-05, "loss": 0.4013, "step": 66850 }, { "epoch": 2.40944967023462, "grad_norm": 0.19368845224380493, "learning_rate": 2.7689390246302997e-05, "loss": 0.3814, "step": 66855 }, { "epoch": 2.4096298698958445, "grad_norm": 0.19200463593006134, "learning_rate": 2.7686489028540748e-05, "loss": 0.3739, "step": 66860 }, { "epoch": 2.4098100695570692, "grad_norm": 0.22479012608528137, "learning_rate": 2.76835877741756e-05, "loss": 0.4098, "step": 66865 }, { "epoch": 2.409990269218294, "grad_norm": 0.23108981549739838, "learning_rate": 2.7680686483247098e-05, "loss": 0.3996, "step": 66870 }, { "epoch": 2.4101704688795187, "grad_norm": 0.20057092607021332, "learning_rate": 2.767778515579475e-05, "loss": 0.4096, "step": 66875 }, { "epoch": 2.410350668540743, "grad_norm": 0.17036274075508118, "learning_rate": 2.7674883791858107e-05, "loss": 
0.4209, "step": 66880 }, { "epoch": 2.4105308682019677, "grad_norm": 0.19558821618556976, "learning_rate": 2.7671982391476686e-05, "loss": 0.4133, "step": 66885 }, { "epoch": 2.4107110678631924, "grad_norm": 0.2363506406545639, "learning_rate": 2.7669080954690023e-05, "loss": 0.4149, "step": 66890 }, { "epoch": 2.410891267524417, "grad_norm": 0.23225028812885284, "learning_rate": 2.7666179481537646e-05, "loss": 0.4357, "step": 66895 }, { "epoch": 2.411071467185642, "grad_norm": 0.20364603400230408, "learning_rate": 2.76632779720591e-05, "loss": 0.3863, "step": 66900 }, { "epoch": 2.411251666846866, "grad_norm": 0.14624075591564178, "learning_rate": 2.76603764262939e-05, "loss": 0.3693, "step": 66905 }, { "epoch": 2.411431866508091, "grad_norm": 0.20455005764961243, "learning_rate": 2.7657474844281577e-05, "loss": 0.3922, "step": 66910 }, { "epoch": 2.4116120661693157, "grad_norm": 0.19567883014678955, "learning_rate": 2.7654573226061686e-05, "loss": 0.4302, "step": 66915 }, { "epoch": 2.4117922658305404, "grad_norm": 0.2601669430732727, "learning_rate": 2.7651671571673743e-05, "loss": 0.3964, "step": 66920 }, { "epoch": 2.4119724654917647, "grad_norm": 0.22823162376880646, "learning_rate": 2.76487698811573e-05, "loss": 0.3906, "step": 66925 }, { "epoch": 2.4121526651529894, "grad_norm": 0.2383589893579483, "learning_rate": 2.764586815455187e-05, "loss": 0.4254, "step": 66930 }, { "epoch": 2.412332864814214, "grad_norm": 0.18745693564414978, "learning_rate": 2.764296639189699e-05, "loss": 0.3825, "step": 66935 }, { "epoch": 2.412513064475439, "grad_norm": 0.2072102427482605, "learning_rate": 2.7640064593232218e-05, "loss": 0.3881, "step": 66940 }, { "epoch": 2.4126932641366636, "grad_norm": 0.1848241686820984, "learning_rate": 2.7637162758597073e-05, "loss": 0.3747, "step": 66945 }, { "epoch": 2.412873463797888, "grad_norm": 0.1639891117811203, "learning_rate": 2.7634260888031098e-05, "loss": 0.3715, "step": 66950 }, { "epoch": 2.4130536634591127, "grad_norm": 
0.20181863009929657, "learning_rate": 2.7631358981573824e-05, "loss": 0.4048, "step": 66955 }, { "epoch": 2.4132338631203374, "grad_norm": 0.1724659949541092, "learning_rate": 2.76284570392648e-05, "loss": 0.4168, "step": 66960 }, { "epoch": 2.413414062781562, "grad_norm": 0.23063142597675323, "learning_rate": 2.7625555061143553e-05, "loss": 0.4061, "step": 66965 }, { "epoch": 2.4135942624427864, "grad_norm": 0.23275382816791534, "learning_rate": 2.7622653047249626e-05, "loss": 0.4226, "step": 66970 }, { "epoch": 2.413774462104011, "grad_norm": 0.17771199345588684, "learning_rate": 2.7619750997622564e-05, "loss": 0.4079, "step": 66975 }, { "epoch": 2.413954661765236, "grad_norm": 0.20910033583641052, "learning_rate": 2.7616848912301892e-05, "loss": 0.4074, "step": 66980 }, { "epoch": 2.4141348614264606, "grad_norm": 0.2285621017217636, "learning_rate": 2.7613946791327167e-05, "loss": 0.4133, "step": 66985 }, { "epoch": 2.4143150610876853, "grad_norm": 0.22376610338687897, "learning_rate": 2.7611044634737926e-05, "loss": 0.3761, "step": 66990 }, { "epoch": 2.4144952607489096, "grad_norm": 0.20609983801841736, "learning_rate": 2.7608142442573704e-05, "loss": 0.376, "step": 66995 }, { "epoch": 2.4146754604101344, "grad_norm": 0.20516866445541382, "learning_rate": 2.7605240214874038e-05, "loss": 0.3812, "step": 67000 }, { "epoch": 2.4146754604101344, "eval_loss": 0.4341346025466919, "eval_runtime": 3.524, "eval_samples_per_second": 28.377, "eval_steps_per_second": 7.094, "step": 67000 }, { "epoch": 2.414855660071359, "grad_norm": 0.19318906962871552, "learning_rate": 2.760233795167849e-05, "loss": 0.376, "step": 67005 }, { "epoch": 2.415035859732584, "grad_norm": 0.19978414475917816, "learning_rate": 2.759943565302659e-05, "loss": 0.3926, "step": 67010 }, { "epoch": 2.415216059393808, "grad_norm": 0.19029457867145538, "learning_rate": 2.759653331895788e-05, "loss": 0.3724, "step": 67015 }, { "epoch": 2.415396259055033, "grad_norm": 0.1965416818857193, "learning_rate": 
2.7593630949511908e-05, "loss": 0.4342, "step": 67020 }, { "epoch": 2.4155764587162576, "grad_norm": 0.19365379214286804, "learning_rate": 2.7590728544728213e-05, "loss": 0.3787, "step": 67025 }, { "epoch": 2.4157566583774823, "grad_norm": 0.24059845507144928, "learning_rate": 2.7587826104646348e-05, "loss": 0.3632, "step": 67030 }, { "epoch": 2.415936858038707, "grad_norm": 0.2164888083934784, "learning_rate": 2.758492362930585e-05, "loss": 0.4078, "step": 67035 }, { "epoch": 2.4161170576999313, "grad_norm": 0.21107304096221924, "learning_rate": 2.7582021118746266e-05, "loss": 0.3813, "step": 67040 }, { "epoch": 2.416297257361156, "grad_norm": 0.23840226233005524, "learning_rate": 2.7579118573007144e-05, "loss": 0.3851, "step": 67045 }, { "epoch": 2.416477457022381, "grad_norm": 0.23817472159862518, "learning_rate": 2.7576215992128034e-05, "loss": 0.3961, "step": 67050 }, { "epoch": 2.4166576566836055, "grad_norm": 0.23038597404956818, "learning_rate": 2.7573313376148478e-05, "loss": 0.4176, "step": 67055 }, { "epoch": 2.41683785634483, "grad_norm": 0.18033789098262787, "learning_rate": 2.7570410725108027e-05, "loss": 0.4053, "step": 67060 }, { "epoch": 2.4170180560060546, "grad_norm": 0.24066656827926636, "learning_rate": 2.7567508039046232e-05, "loss": 0.4378, "step": 67065 }, { "epoch": 2.4171982556672793, "grad_norm": 0.2486283779144287, "learning_rate": 2.7564605318002627e-05, "loss": 0.4038, "step": 67070 }, { "epoch": 2.417378455328504, "grad_norm": 0.2280510812997818, "learning_rate": 2.756170256201678e-05, "loss": 0.3974, "step": 67075 }, { "epoch": 2.4175586549897288, "grad_norm": 0.19952905178070068, "learning_rate": 2.7558799771128223e-05, "loss": 0.408, "step": 67080 }, { "epoch": 2.417738854650953, "grad_norm": 0.19479811191558838, "learning_rate": 2.755589694537652e-05, "loss": 0.3922, "step": 67085 }, { "epoch": 2.417919054312178, "grad_norm": 0.2107398509979248, "learning_rate": 2.755299408480122e-05, "loss": 0.4211, "step": 67090 }, { "epoch": 
2.4180992539734025, "grad_norm": 0.21813732385635376, "learning_rate": 2.755009118944186e-05, "loss": 0.3841, "step": 67095 }, { "epoch": 2.4182794536346273, "grad_norm": 0.18200333416461945, "learning_rate": 2.754718825933801e-05, "loss": 0.4034, "step": 67100 }, { "epoch": 2.418459653295852, "grad_norm": 0.18498298525810242, "learning_rate": 2.7544285294529204e-05, "loss": 0.3802, "step": 67105 }, { "epoch": 2.4186398529570763, "grad_norm": 0.21120643615722656, "learning_rate": 2.7541382295055014e-05, "loss": 0.4255, "step": 67110 }, { "epoch": 2.418820052618301, "grad_norm": 0.2860146164894104, "learning_rate": 2.753847926095498e-05, "loss": 0.402, "step": 67115 }, { "epoch": 2.4190002522795258, "grad_norm": 0.21103060245513916, "learning_rate": 2.7535576192268648e-05, "loss": 0.3978, "step": 67120 }, { "epoch": 2.4191804519407505, "grad_norm": 0.1569247990846634, "learning_rate": 2.7532673089035588e-05, "loss": 0.3973, "step": 67125 }, { "epoch": 2.419360651601975, "grad_norm": 0.23274268209934235, "learning_rate": 2.7529769951295355e-05, "loss": 0.3947, "step": 67130 }, { "epoch": 2.4195408512631995, "grad_norm": 0.1998492181301117, "learning_rate": 2.752686677908749e-05, "loss": 0.4082, "step": 67135 }, { "epoch": 2.4197210509244242, "grad_norm": 0.19233012199401855, "learning_rate": 2.7523963572451545e-05, "loss": 0.4093, "step": 67140 }, { "epoch": 2.419901250585649, "grad_norm": 0.22969095408916473, "learning_rate": 2.7521060331427096e-05, "loss": 0.4131, "step": 67145 }, { "epoch": 2.4200814502468737, "grad_norm": 0.2119283676147461, "learning_rate": 2.751815705605369e-05, "loss": 0.4162, "step": 67150 }, { "epoch": 2.420261649908098, "grad_norm": 0.18394353985786438, "learning_rate": 2.7515253746370873e-05, "loss": 0.3837, "step": 67155 }, { "epoch": 2.4204418495693227, "grad_norm": 0.19230255484580994, "learning_rate": 2.7512350402418212e-05, "loss": 0.3603, "step": 67160 }, { "epoch": 2.4206220492305475, "grad_norm": 0.2101142406463623, 
"learning_rate": 2.7509447024235262e-05, "loss": 0.3784, "step": 67165 }, { "epoch": 2.420802248891772, "grad_norm": 0.1899431347846985, "learning_rate": 2.7506543611861598e-05, "loss": 0.3301, "step": 67170 }, { "epoch": 2.420982448552997, "grad_norm": 0.2313234508037567, "learning_rate": 2.750364016533674e-05, "loss": 0.3929, "step": 67175 }, { "epoch": 2.4211626482142212, "grad_norm": 0.20797547698020935, "learning_rate": 2.7500736684700283e-05, "loss": 0.3923, "step": 67180 }, { "epoch": 2.421342847875446, "grad_norm": 0.2217523455619812, "learning_rate": 2.7497833169991767e-05, "loss": 0.4297, "step": 67185 }, { "epoch": 2.4215230475366707, "grad_norm": 0.19359783828258514, "learning_rate": 2.7494929621250765e-05, "loss": 0.3811, "step": 67190 }, { "epoch": 2.4217032471978954, "grad_norm": 0.21590293943881989, "learning_rate": 2.7492026038516826e-05, "loss": 0.3841, "step": 67195 }, { "epoch": 2.4218834468591197, "grad_norm": 0.17126692831516266, "learning_rate": 2.748912242182951e-05, "loss": 0.3761, "step": 67200 }, { "epoch": 2.4220636465203444, "grad_norm": 0.24167068302631378, "learning_rate": 2.7486218771228388e-05, "loss": 0.3784, "step": 67205 }, { "epoch": 2.422243846181569, "grad_norm": 0.21758897602558136, "learning_rate": 2.7483315086753015e-05, "loss": 0.4132, "step": 67210 }, { "epoch": 2.422424045842794, "grad_norm": 0.20795123279094696, "learning_rate": 2.7480411368442956e-05, "loss": 0.3909, "step": 67215 }, { "epoch": 2.4226042455040186, "grad_norm": 0.1906597912311554, "learning_rate": 2.7477507616337773e-05, "loss": 0.3742, "step": 67220 }, { "epoch": 2.422784445165243, "grad_norm": 0.22008202970027924, "learning_rate": 2.747460383047702e-05, "loss": 0.4114, "step": 67225 }, { "epoch": 2.4229646448264677, "grad_norm": 0.2209016978740692, "learning_rate": 2.747170001090028e-05, "loss": 0.3567, "step": 67230 }, { "epoch": 2.4231448444876924, "grad_norm": 0.2042957842350006, "learning_rate": 2.74687961576471e-05, "loss": 0.3943, "step": 67235 
}, { "epoch": 2.423325044148917, "grad_norm": 0.2651365399360657, "learning_rate": 2.7465892270757055e-05, "loss": 0.4139, "step": 67240 }, { "epoch": 2.4235052438101414, "grad_norm": 0.2116497904062271, "learning_rate": 2.7462988350269698e-05, "loss": 0.4203, "step": 67245 }, { "epoch": 2.423685443471366, "grad_norm": 0.24418489634990692, "learning_rate": 2.7460084396224606e-05, "loss": 0.3874, "step": 67250 }, { "epoch": 2.423865643132591, "grad_norm": 0.18240700662136078, "learning_rate": 2.7457180408661343e-05, "loss": 0.3952, "step": 67255 }, { "epoch": 2.4240458427938156, "grad_norm": 0.20729464292526245, "learning_rate": 2.7454276387619465e-05, "loss": 0.378, "step": 67260 }, { "epoch": 2.4242260424550404, "grad_norm": 0.22618576884269714, "learning_rate": 2.745137233313856e-05, "loss": 0.408, "step": 67265 }, { "epoch": 2.4244062421162647, "grad_norm": 0.1970742791891098, "learning_rate": 2.7448468245258162e-05, "loss": 0.4034, "step": 67270 }, { "epoch": 2.4245864417774894, "grad_norm": 0.18698358535766602, "learning_rate": 2.744556412401788e-05, "loss": 0.3862, "step": 67275 }, { "epoch": 2.424766641438714, "grad_norm": 0.18384528160095215, "learning_rate": 2.7442659969457247e-05, "loss": 0.3994, "step": 67280 }, { "epoch": 2.424946841099939, "grad_norm": 0.199675053358078, "learning_rate": 2.7439755781615845e-05, "loss": 0.3781, "step": 67285 }, { "epoch": 2.425127040761163, "grad_norm": 0.2059258222579956, "learning_rate": 2.7436851560533246e-05, "loss": 0.4037, "step": 67290 }, { "epoch": 2.425307240422388, "grad_norm": 0.21822243928909302, "learning_rate": 2.7433947306249014e-05, "loss": 0.3944, "step": 67295 }, { "epoch": 2.4254874400836126, "grad_norm": 0.19655758142471313, "learning_rate": 2.7431043018802725e-05, "loss": 0.3946, "step": 67300 }, { "epoch": 2.4256676397448373, "grad_norm": 0.2280445247888565, "learning_rate": 2.742813869823394e-05, "loss": 0.3906, "step": 67305 }, { "epoch": 2.425847839406062, "grad_norm": 0.16742421686649323, 
"learning_rate": 2.742523434458224e-05, "loss": 0.3632, "step": 67310 }, { "epoch": 2.4260280390672864, "grad_norm": 0.19926291704177856, "learning_rate": 2.7422329957887188e-05, "loss": 0.4446, "step": 67315 }, { "epoch": 2.426208238728511, "grad_norm": 0.17569011449813843, "learning_rate": 2.7419425538188365e-05, "loss": 0.3709, "step": 67320 }, { "epoch": 2.426388438389736, "grad_norm": 0.26853156089782715, "learning_rate": 2.741652108552533e-05, "loss": 0.4286, "step": 67325 }, { "epoch": 2.4265686380509606, "grad_norm": 0.20490770041942596, "learning_rate": 2.741361659993767e-05, "loss": 0.3812, "step": 67330 }, { "epoch": 2.426748837712185, "grad_norm": 0.24885760247707367, "learning_rate": 2.7410712081464947e-05, "loss": 0.4166, "step": 67335 }, { "epoch": 2.4269290373734096, "grad_norm": 0.24645163118839264, "learning_rate": 2.7407807530146735e-05, "loss": 0.3814, "step": 67340 }, { "epoch": 2.4271092370346343, "grad_norm": 0.24521587789058685, "learning_rate": 2.740490294602262e-05, "loss": 0.3759, "step": 67345 }, { "epoch": 2.427289436695859, "grad_norm": 0.20544666051864624, "learning_rate": 2.7401998329132162e-05, "loss": 0.404, "step": 67350 }, { "epoch": 2.427469636357084, "grad_norm": 0.17196045815944672, "learning_rate": 2.7399093679514947e-05, "loss": 0.3892, "step": 67355 }, { "epoch": 2.4276498360183085, "grad_norm": 0.18687868118286133, "learning_rate": 2.7396188997210538e-05, "loss": 0.3991, "step": 67360 }, { "epoch": 2.427830035679533, "grad_norm": 0.23785381019115448, "learning_rate": 2.7393284282258517e-05, "loss": 0.4111, "step": 67365 }, { "epoch": 2.4280102353407575, "grad_norm": 0.23783884942531586, "learning_rate": 2.739037953469847e-05, "loss": 0.3777, "step": 67370 }, { "epoch": 2.4281904350019823, "grad_norm": 0.16781599819660187, "learning_rate": 2.7387474754569964e-05, "loss": 0.3757, "step": 67375 }, { "epoch": 2.428370634663207, "grad_norm": 0.15680092573165894, "learning_rate": 2.7384569941912574e-05, "loss": 0.4104, "step": 
67380 }, { "epoch": 2.4285508343244313, "grad_norm": 0.23837818205356598, "learning_rate": 2.7381665096765874e-05, "loss": 0.3977, "step": 67385 }, { "epoch": 2.428731033985656, "grad_norm": 0.2140997052192688, "learning_rate": 2.7378760219169453e-05, "loss": 0.4186, "step": 67390 }, { "epoch": 2.4289112336468808, "grad_norm": 0.25618883967399597, "learning_rate": 2.7375855309162886e-05, "loss": 0.3869, "step": 67395 }, { "epoch": 2.4290914333081055, "grad_norm": 0.26297783851623535, "learning_rate": 2.7372950366785748e-05, "loss": 0.411, "step": 67400 }, { "epoch": 2.4292716329693302, "grad_norm": 0.22148928046226501, "learning_rate": 2.7370045392077626e-05, "loss": 0.3997, "step": 67405 }, { "epoch": 2.4294518326305545, "grad_norm": 0.21536856889724731, "learning_rate": 2.736714038507808e-05, "loss": 0.4008, "step": 67410 }, { "epoch": 2.4296320322917793, "grad_norm": 0.18646790087223053, "learning_rate": 2.736423534582672e-05, "loss": 0.3861, "step": 67415 }, { "epoch": 2.429812231953004, "grad_norm": 0.17240692675113678, "learning_rate": 2.7361330274363113e-05, "loss": 0.3718, "step": 67420 }, { "epoch": 2.4299924316142287, "grad_norm": 0.17794083058834076, "learning_rate": 2.735842517072683e-05, "loss": 0.4088, "step": 67425 }, { "epoch": 2.430172631275453, "grad_norm": 0.17455384135246277, "learning_rate": 2.735552003495746e-05, "loss": 0.4008, "step": 67430 }, { "epoch": 2.4303528309366778, "grad_norm": 0.22316579520702362, "learning_rate": 2.735261486709459e-05, "loss": 0.4001, "step": 67435 }, { "epoch": 2.4305330305979025, "grad_norm": 0.18151316046714783, "learning_rate": 2.734970966717779e-05, "loss": 0.4002, "step": 67440 }, { "epoch": 2.4307132302591272, "grad_norm": 0.27039554715156555, "learning_rate": 2.7346804435246658e-05, "loss": 0.4181, "step": 67445 }, { "epoch": 2.430893429920352, "grad_norm": 0.24719054996967316, "learning_rate": 2.734389917134077e-05, "loss": 0.417, "step": 67450 }, { "epoch": 2.4310736295815762, "grad_norm": 
0.2097766101360321, "learning_rate": 2.7340993875499704e-05, "loss": 0.387, "step": 67455 }, { "epoch": 2.431253829242801, "grad_norm": 0.23086480796337128, "learning_rate": 2.7338088547763057e-05, "loss": 0.4455, "step": 67460 }, { "epoch": 2.4314340289040257, "grad_norm": 0.21231964230537415, "learning_rate": 2.7335183188170398e-05, "loss": 0.3746, "step": 67465 }, { "epoch": 2.4316142285652504, "grad_norm": 0.1851814240217209, "learning_rate": 2.733227779676133e-05, "loss": 0.4, "step": 67470 }, { "epoch": 2.4317944282264747, "grad_norm": 0.1971953958272934, "learning_rate": 2.7329372373575418e-05, "loss": 0.398, "step": 67475 }, { "epoch": 2.4319746278876995, "grad_norm": 0.24778111279010773, "learning_rate": 2.7326466918652267e-05, "loss": 0.4247, "step": 67480 }, { "epoch": 2.432154827548924, "grad_norm": 0.20682400465011597, "learning_rate": 2.7323561432031452e-05, "loss": 0.383, "step": 67485 }, { "epoch": 2.432335027210149, "grad_norm": 0.20953048765659332, "learning_rate": 2.7320655913752558e-05, "loss": 0.4237, "step": 67490 }, { "epoch": 2.4325152268713737, "grad_norm": 0.20623236894607544, "learning_rate": 2.7317750363855184e-05, "loss": 0.3984, "step": 67495 }, { "epoch": 2.432695426532598, "grad_norm": 0.19964554905891418, "learning_rate": 2.7314844782378907e-05, "loss": 0.3993, "step": 67500 }, { "epoch": 2.432695426532598, "eval_loss": 0.43458136916160583, "eval_runtime": 3.5307, "eval_samples_per_second": 28.323, "eval_steps_per_second": 7.081, "step": 67500 }, { "epoch": 2.4328756261938227, "grad_norm": 0.2280506193637848, "learning_rate": 2.731193916936331e-05, "loss": 0.3918, "step": 67505 }, { "epoch": 2.4330558258550474, "grad_norm": 0.2009294182062149, "learning_rate": 2.7309033524848e-05, "loss": 0.3965, "step": 67510 }, { "epoch": 2.433236025516272, "grad_norm": 0.18741028010845184, "learning_rate": 2.7306127848872547e-05, "loss": 0.3826, "step": 67515 }, { "epoch": 2.4334162251774965, "grad_norm": 0.2168714553117752, "learning_rate": 
2.7303222141476566e-05, "loss": 0.3582, "step": 67520 }, { "epoch": 2.433596424838721, "grad_norm": 0.2223115712404251, "learning_rate": 2.730031640269961e-05, "loss": 0.3941, "step": 67525 }, { "epoch": 2.433776624499946, "grad_norm": 0.24095144867897034, "learning_rate": 2.7297410632581293e-05, "loss": 0.4198, "step": 67530 }, { "epoch": 2.4339568241611707, "grad_norm": 0.162398099899292, "learning_rate": 2.72945048311612e-05, "loss": 0.3884, "step": 67535 }, { "epoch": 2.4341370238223954, "grad_norm": 0.2290612757205963, "learning_rate": 2.7291598998478927e-05, "loss": 0.3902, "step": 67540 }, { "epoch": 2.4343172234836197, "grad_norm": 0.19016504287719727, "learning_rate": 2.7288693134574063e-05, "loss": 0.3899, "step": 67545 }, { "epoch": 2.4344974231448444, "grad_norm": 0.19795559346675873, "learning_rate": 2.7285787239486192e-05, "loss": 0.3749, "step": 67550 }, { "epoch": 2.434677622806069, "grad_norm": 0.18482926487922668, "learning_rate": 2.7282881313254916e-05, "loss": 0.3787, "step": 67555 }, { "epoch": 2.434857822467294, "grad_norm": 0.23885639011859894, "learning_rate": 2.727997535591983e-05, "loss": 0.4029, "step": 67560 }, { "epoch": 2.435038022128518, "grad_norm": 0.17567263543605804, "learning_rate": 2.7277069367520513e-05, "loss": 0.3902, "step": 67565 }, { "epoch": 2.435218221789743, "grad_norm": 0.195444256067276, "learning_rate": 2.7274163348096564e-05, "loss": 0.4088, "step": 67570 }, { "epoch": 2.4353984214509676, "grad_norm": 0.20873726904392242, "learning_rate": 2.727125729768759e-05, "loss": 0.4304, "step": 67575 }, { "epoch": 2.4355786211121924, "grad_norm": 0.22101178765296936, "learning_rate": 2.7268351216333164e-05, "loss": 0.4032, "step": 67580 }, { "epoch": 2.435758820773417, "grad_norm": 0.23092830181121826, "learning_rate": 2.72654451040729e-05, "loss": 0.4214, "step": 67585 }, { "epoch": 2.4359390204346414, "grad_norm": 0.21683692932128906, "learning_rate": 2.7262538960946382e-05, "loss": 0.4102, "step": 67590 }, { "epoch": 
2.436119220095866, "grad_norm": 0.21636758744716644, "learning_rate": 2.7259632786993205e-05, "loss": 0.3927, "step": 67595 }, { "epoch": 2.436299419757091, "grad_norm": 0.22299599647521973, "learning_rate": 2.725672658225298e-05, "loss": 0.3997, "step": 67600 }, { "epoch": 2.4364796194183156, "grad_norm": 0.1862391084432602, "learning_rate": 2.7253820346765285e-05, "loss": 0.4002, "step": 67605 }, { "epoch": 2.4366598190795403, "grad_norm": 0.18761450052261353, "learning_rate": 2.725091408056972e-05, "loss": 0.3688, "step": 67610 }, { "epoch": 2.4368400187407646, "grad_norm": 0.19558748602867126, "learning_rate": 2.724800778370589e-05, "loss": 0.4267, "step": 67615 }, { "epoch": 2.4370202184019893, "grad_norm": 0.154653400182724, "learning_rate": 2.724510145621339e-05, "loss": 0.3866, "step": 67620 }, { "epoch": 2.437200418063214, "grad_norm": 0.18840929865837097, "learning_rate": 2.7242195098131824e-05, "loss": 0.3955, "step": 67625 }, { "epoch": 2.437380617724439, "grad_norm": 0.2518814206123352, "learning_rate": 2.7239288709500772e-05, "loss": 0.4223, "step": 67630 }, { "epoch": 2.4375608173856635, "grad_norm": 0.2483137845993042, "learning_rate": 2.723638229035985e-05, "loss": 0.3919, "step": 67635 }, { "epoch": 2.437741017046888, "grad_norm": 0.18424572050571442, "learning_rate": 2.7233475840748645e-05, "loss": 0.3912, "step": 67640 }, { "epoch": 2.4379212167081126, "grad_norm": 0.18739376962184906, "learning_rate": 2.7230569360706777e-05, "loss": 0.4361, "step": 67645 }, { "epoch": 2.4381014163693373, "grad_norm": 0.16617445647716522, "learning_rate": 2.7227662850273827e-05, "loss": 0.3941, "step": 67650 }, { "epoch": 2.438281616030562, "grad_norm": 0.23537923395633698, "learning_rate": 2.7224756309489403e-05, "loss": 0.3962, "step": 67655 }, { "epoch": 2.4384618156917863, "grad_norm": 0.21504931151866913, "learning_rate": 2.7221849738393103e-05, "loss": 0.4205, "step": 67660 }, { "epoch": 2.438642015353011, "grad_norm": 0.18787220120429993, "learning_rate": 
2.7218943137024532e-05, "loss": 0.3643, "step": 67665 }, { "epoch": 2.438822215014236, "grad_norm": 0.24440403282642365, "learning_rate": 2.7216036505423293e-05, "loss": 0.3797, "step": 67670 }, { "epoch": 2.4390024146754605, "grad_norm": 0.2223052680492401, "learning_rate": 2.721312984362898e-05, "loss": 0.4005, "step": 67675 }, { "epoch": 2.4391826143366853, "grad_norm": 0.18885168433189392, "learning_rate": 2.7210223151681207e-05, "loss": 0.3727, "step": 67680 }, { "epoch": 2.4393628139979096, "grad_norm": 0.1814367175102234, "learning_rate": 2.7207316429619566e-05, "loss": 0.3837, "step": 67685 }, { "epoch": 2.4395430136591343, "grad_norm": 0.18942394852638245, "learning_rate": 2.7204409677483668e-05, "loss": 0.4423, "step": 67690 }, { "epoch": 2.439723213320359, "grad_norm": 0.16898776590824127, "learning_rate": 2.720150289531312e-05, "loss": 0.3894, "step": 67695 }, { "epoch": 2.4399034129815838, "grad_norm": 0.1935228556394577, "learning_rate": 2.719859608314751e-05, "loss": 0.4158, "step": 67700 }, { "epoch": 2.440083612642808, "grad_norm": 0.22555245459079742, "learning_rate": 2.719568924102647e-05, "loss": 0.3861, "step": 67705 }, { "epoch": 2.4402638123040328, "grad_norm": 0.22015783190727234, "learning_rate": 2.7192782368989577e-05, "loss": 0.3656, "step": 67710 }, { "epoch": 2.4404440119652575, "grad_norm": 0.20583826303482056, "learning_rate": 2.7189875467076454e-05, "loss": 0.4075, "step": 67715 }, { "epoch": 2.4406242116264822, "grad_norm": 0.2194293737411499, "learning_rate": 2.7186968535326694e-05, "loss": 0.4094, "step": 67720 }, { "epoch": 2.440804411287707, "grad_norm": 0.15374071896076202, "learning_rate": 2.7184061573779918e-05, "loss": 0.3868, "step": 67725 }, { "epoch": 2.4409846109489313, "grad_norm": 0.2262052297592163, "learning_rate": 2.7181154582475726e-05, "loss": 0.4121, "step": 67730 }, { "epoch": 2.441164810610156, "grad_norm": 0.21393641829490662, "learning_rate": 2.7178247561453724e-05, "loss": 0.3887, "step": 67735 }, { "epoch": 
2.4413450102713807, "grad_norm": 0.23438535630702972, "learning_rate": 2.717534051075352e-05, "loss": 0.4055, "step": 67740 }, { "epoch": 2.4415252099326055, "grad_norm": 0.1914755254983902, "learning_rate": 2.7172433430414725e-05, "loss": 0.4044, "step": 67745 }, { "epoch": 2.4417054095938298, "grad_norm": 0.22475279867649078, "learning_rate": 2.7169526320476944e-05, "loss": 0.3659, "step": 67750 }, { "epoch": 2.4418856092550545, "grad_norm": 0.2241634875535965, "learning_rate": 2.7166619180979784e-05, "loss": 0.4037, "step": 67755 }, { "epoch": 2.4420658089162792, "grad_norm": 0.23309601843357086, "learning_rate": 2.716371201196286e-05, "loss": 0.3942, "step": 67760 }, { "epoch": 2.442246008577504, "grad_norm": 0.20057342946529388, "learning_rate": 2.716080481346578e-05, "loss": 0.3939, "step": 67765 }, { "epoch": 2.4424262082387287, "grad_norm": 0.19580478966236115, "learning_rate": 2.715789758552815e-05, "loss": 0.3698, "step": 67770 }, { "epoch": 2.442606407899953, "grad_norm": 0.21272897720336914, "learning_rate": 2.7154990328189583e-05, "loss": 0.4033, "step": 67775 }, { "epoch": 2.4427866075611777, "grad_norm": 0.22313345968723297, "learning_rate": 2.715208304148969e-05, "loss": 0.3641, "step": 67780 }, { "epoch": 2.4429668072224024, "grad_norm": 0.20916157960891724, "learning_rate": 2.7149175725468086e-05, "loss": 0.3875, "step": 67785 }, { "epoch": 2.443147006883627, "grad_norm": 0.20672792196273804, "learning_rate": 2.7146268380164376e-05, "loss": 0.4256, "step": 67790 }, { "epoch": 2.4433272065448515, "grad_norm": 0.19972656667232513, "learning_rate": 2.7143361005618168e-05, "loss": 0.4102, "step": 67795 }, { "epoch": 2.443507406206076, "grad_norm": 0.18391171097755432, "learning_rate": 2.7140453601869093e-05, "loss": 0.4145, "step": 67800 }, { "epoch": 2.443687605867301, "grad_norm": 0.17627693712711334, "learning_rate": 2.713754616895674e-05, "loss": 0.3597, "step": 67805 }, { "epoch": 2.4438678055285257, "grad_norm": 0.25177159905433655, 
"learning_rate": 2.7134638706920752e-05, "loss": 0.4215, "step": 67810 }, { "epoch": 2.4440480051897504, "grad_norm": 0.2532104253768921, "learning_rate": 2.713173121580071e-05, "loss": 0.3984, "step": 67815 }, { "epoch": 2.4442282048509747, "grad_norm": 0.19765953719615936, "learning_rate": 2.7128823695636253e-05, "loss": 0.4179, "step": 67820 }, { "epoch": 2.4444084045121994, "grad_norm": 0.2110573947429657, "learning_rate": 2.7125916146466973e-05, "loss": 0.3967, "step": 67825 }, { "epoch": 2.444588604173424, "grad_norm": 0.22106719017028809, "learning_rate": 2.7123008568332504e-05, "loss": 0.4195, "step": 67830 }, { "epoch": 2.444768803834649, "grad_norm": 0.21176251769065857, "learning_rate": 2.7120100961272455e-05, "loss": 0.3917, "step": 67835 }, { "epoch": 2.444949003495873, "grad_norm": 0.2130795568227768, "learning_rate": 2.711719332532644e-05, "loss": 0.3974, "step": 67840 }, { "epoch": 2.445129203157098, "grad_norm": 0.20041079819202423, "learning_rate": 2.7114285660534078e-05, "loss": 0.415, "step": 67845 }, { "epoch": 2.4453094028183227, "grad_norm": 0.2205459475517273, "learning_rate": 2.711137796693498e-05, "loss": 0.4196, "step": 67850 }, { "epoch": 2.4454896024795474, "grad_norm": 0.2311592847108841, "learning_rate": 2.7108470244568767e-05, "loss": 0.4111, "step": 67855 }, { "epoch": 2.445669802140772, "grad_norm": 0.2623636722564697, "learning_rate": 2.7105562493475052e-05, "loss": 0.4283, "step": 67860 }, { "epoch": 2.445850001801997, "grad_norm": 0.21312451362609863, "learning_rate": 2.710265471369346e-05, "loss": 0.3945, "step": 67865 }, { "epoch": 2.446030201463221, "grad_norm": 0.1985563039779663, "learning_rate": 2.7099746905263606e-05, "loss": 0.356, "step": 67870 }, { "epoch": 2.446210401124446, "grad_norm": 0.20180106163024902, "learning_rate": 2.70968390682251e-05, "loss": 0.3865, "step": 67875 }, { "epoch": 2.4463906007856706, "grad_norm": 0.1858830749988556, "learning_rate": 2.7093931202617572e-05, "loss": 0.417, "step": 67880 }, { 
"epoch": 2.4465708004468953, "grad_norm": 0.2230648547410965, "learning_rate": 2.7091023308480632e-05, "loss": 0.4184, "step": 67885 }, { "epoch": 2.4467510001081196, "grad_norm": 0.24601806700229645, "learning_rate": 2.708811538585392e-05, "loss": 0.4014, "step": 67890 }, { "epoch": 2.4469311997693444, "grad_norm": 0.1883206069469452, "learning_rate": 2.7085207434777026e-05, "loss": 0.3919, "step": 67895 }, { "epoch": 2.447111399430569, "grad_norm": 0.22426816821098328, "learning_rate": 2.7082299455289588e-05, "loss": 0.4058, "step": 67900 }, { "epoch": 2.447291599091794, "grad_norm": 0.2288169413805008, "learning_rate": 2.7079391447431218e-05, "loss": 0.3892, "step": 67905 }, { "epoch": 2.4474717987530186, "grad_norm": 0.2036815732717514, "learning_rate": 2.7076483411241547e-05, "loss": 0.3782, "step": 67910 }, { "epoch": 2.447651998414243, "grad_norm": 0.1831190586090088, "learning_rate": 2.7073575346760194e-05, "loss": 0.3705, "step": 67915 }, { "epoch": 2.4478321980754676, "grad_norm": 0.1885637491941452, "learning_rate": 2.707066725402677e-05, "loss": 0.4212, "step": 67920 }, { "epoch": 2.4480123977366923, "grad_norm": 0.202529639005661, "learning_rate": 2.7067759133080917e-05, "loss": 0.4009, "step": 67925 }, { "epoch": 2.448192597397917, "grad_norm": 0.22661788761615753, "learning_rate": 2.7064850983962243e-05, "loss": 0.4497, "step": 67930 }, { "epoch": 2.4483727970591413, "grad_norm": 0.2267916351556778, "learning_rate": 2.7061942806710368e-05, "loss": 0.3874, "step": 67935 }, { "epoch": 2.448552996720366, "grad_norm": 0.22525878250598907, "learning_rate": 2.7059034601364925e-05, "loss": 0.3741, "step": 67940 }, { "epoch": 2.448733196381591, "grad_norm": 0.18289059400558472, "learning_rate": 2.705612636796553e-05, "loss": 0.3917, "step": 67945 }, { "epoch": 2.4489133960428155, "grad_norm": 0.24663716554641724, "learning_rate": 2.7053218106551825e-05, "loss": 0.3758, "step": 67950 }, { "epoch": 2.4490935957040403, "grad_norm": 0.25891220569610596, 
"learning_rate": 2.7050309817163407e-05, "loss": 0.4179, "step": 67955 }, { "epoch": 2.4492737953652646, "grad_norm": 0.20873096585273743, "learning_rate": 2.704740149983992e-05, "loss": 0.4246, "step": 67960 }, { "epoch": 2.4494539950264893, "grad_norm": 0.2050144225358963, "learning_rate": 2.7044493154620975e-05, "loss": 0.4267, "step": 67965 }, { "epoch": 2.449634194687714, "grad_norm": 0.189696803689003, "learning_rate": 2.704158478154622e-05, "loss": 0.3697, "step": 67970 }, { "epoch": 2.4498143943489388, "grad_norm": 0.2150600403547287, "learning_rate": 2.7038676380655265e-05, "loss": 0.3711, "step": 67975 }, { "epoch": 2.449994594010163, "grad_norm": 0.2863604724407196, "learning_rate": 2.7035767951987735e-05, "loss": 0.4377, "step": 67980 }, { "epoch": 2.450174793671388, "grad_norm": 0.21974705159664154, "learning_rate": 2.7032859495583258e-05, "loss": 0.4083, "step": 67985 }, { "epoch": 2.4503549933326125, "grad_norm": 0.22870804369449615, "learning_rate": 2.7029951011481464e-05, "loss": 0.3969, "step": 67990 }, { "epoch": 2.4505351929938373, "grad_norm": 0.19309473037719727, "learning_rate": 2.7027042499721994e-05, "loss": 0.3831, "step": 67995 }, { "epoch": 2.450715392655062, "grad_norm": 0.22232699394226074, "learning_rate": 2.702413396034445e-05, "loss": 0.3634, "step": 68000 }, { "epoch": 2.450715392655062, "eval_loss": 0.43447232246398926, "eval_runtime": 3.5294, "eval_samples_per_second": 28.333, "eval_steps_per_second": 7.083, "step": 68000 }, { "epoch": 2.4508955923162863, "grad_norm": 0.16693948209285736, "learning_rate": 2.7021225393388477e-05, "loss": 0.3795, "step": 68005 }, { "epoch": 2.451075791977511, "grad_norm": 0.19517579674720764, "learning_rate": 2.7018316798893695e-05, "loss": 0.396, "step": 68010 }, { "epoch": 2.4512559916387358, "grad_norm": 0.18335868418216705, "learning_rate": 2.701540817689974e-05, "loss": 0.3768, "step": 68015 }, { "epoch": 2.4514361912999605, "grad_norm": 0.24918295443058014, "learning_rate": 
2.701249952744624e-05, "loss": 0.3984, "step": 68020 }, { "epoch": 2.4516163909611848, "grad_norm": 0.1807776242494583, "learning_rate": 2.700959085057282e-05, "loss": 0.41, "step": 68025 }, { "epoch": 2.4517965906224095, "grad_norm": 0.18439331650733948, "learning_rate": 2.700668214631912e-05, "loss": 0.3853, "step": 68030 }, { "epoch": 2.4519767902836342, "grad_norm": 0.19809627532958984, "learning_rate": 2.700377341472476e-05, "loss": 0.3885, "step": 68035 }, { "epoch": 2.452156989944859, "grad_norm": 0.22589033842086792, "learning_rate": 2.7000864655829377e-05, "loss": 0.4058, "step": 68040 }, { "epoch": 2.4523371896060837, "grad_norm": 0.2336047887802124, "learning_rate": 2.699795586967259e-05, "loss": 0.435, "step": 68045 }, { "epoch": 2.452517389267308, "grad_norm": 0.20116937160491943, "learning_rate": 2.6995047056294054e-05, "loss": 0.3978, "step": 68050 }, { "epoch": 2.4526975889285327, "grad_norm": 0.2507151961326599, "learning_rate": 2.6992138215733385e-05, "loss": 0.4326, "step": 68055 }, { "epoch": 2.4528777885897575, "grad_norm": 0.21996821463108063, "learning_rate": 2.6989229348030214e-05, "loss": 0.3858, "step": 68060 }, { "epoch": 2.453057988250982, "grad_norm": 0.2189149707555771, "learning_rate": 2.698632045322419e-05, "loss": 0.434, "step": 68065 }, { "epoch": 2.4532381879122065, "grad_norm": 0.17697672545909882, "learning_rate": 2.6983411531354918e-05, "loss": 0.3774, "step": 68070 }, { "epoch": 2.4534183875734312, "grad_norm": 0.17975100874900818, "learning_rate": 2.6980502582462063e-05, "loss": 0.4279, "step": 68075 }, { "epoch": 2.453598587234656, "grad_norm": 0.2552946209907532, "learning_rate": 2.6977593606585232e-05, "loss": 0.3939, "step": 68080 }, { "epoch": 2.4537787868958807, "grad_norm": 0.23870886862277985, "learning_rate": 2.697468460376408e-05, "loss": 0.3806, "step": 68085 }, { "epoch": 2.4539589865571054, "grad_norm": 0.27226266264915466, "learning_rate": 2.6971775574038226e-05, "loss": 0.4057, "step": 68090 }, { "epoch": 
2.4541391862183297, "grad_norm": 0.1971409022808075, "learning_rate": 2.6968866517447317e-05, "loss": 0.3958, "step": 68095 }, { "epoch": 2.4543193858795544, "grad_norm": 0.21888650953769684, "learning_rate": 2.696595743403098e-05, "loss": 0.3885, "step": 68100 }, { "epoch": 2.454499585540779, "grad_norm": 0.2371811419725418, "learning_rate": 2.6963048323828848e-05, "loss": 0.3722, "step": 68105 }, { "epoch": 2.454679785202004, "grad_norm": 0.23788423836231232, "learning_rate": 2.6960139186880574e-05, "loss": 0.3851, "step": 68110 }, { "epoch": 2.4548599848632287, "grad_norm": 0.23677048087120056, "learning_rate": 2.6957230023225778e-05, "loss": 0.4163, "step": 68115 }, { "epoch": 2.455040184524453, "grad_norm": 0.17837996780872345, "learning_rate": 2.6954320832904094e-05, "loss": 0.388, "step": 68120 }, { "epoch": 2.4552203841856777, "grad_norm": 0.19991345703601837, "learning_rate": 2.695141161595518e-05, "loss": 0.381, "step": 68125 }, { "epoch": 2.4554005838469024, "grad_norm": 0.22142352163791656, "learning_rate": 2.694850237241865e-05, "loss": 0.357, "step": 68130 }, { "epoch": 2.455580783508127, "grad_norm": 0.2008516937494278, "learning_rate": 2.6945593102334166e-05, "loss": 0.3499, "step": 68135 }, { "epoch": 2.455760983169352, "grad_norm": 0.2108553797006607, "learning_rate": 2.6942683805741337e-05, "loss": 0.4014, "step": 68140 }, { "epoch": 2.455941182830576, "grad_norm": 0.21089521050453186, "learning_rate": 2.693977448267983e-05, "loss": 0.4221, "step": 68145 }, { "epoch": 2.456121382491801, "grad_norm": 0.22154204547405243, "learning_rate": 2.693686513318926e-05, "loss": 0.3958, "step": 68150 }, { "epoch": 2.4563015821530256, "grad_norm": 0.17124603688716888, "learning_rate": 2.693395575730928e-05, "loss": 0.4072, "step": 68155 }, { "epoch": 2.4564817818142504, "grad_norm": 0.2264595329761505, "learning_rate": 2.6931046355079538e-05, "loss": 0.3937, "step": 68160 }, { "epoch": 2.4566619814754747, "grad_norm": 0.22285620868206024, "learning_rate": 
2.692813692653965e-05, "loss": 0.3988, "step": 68165 }, { "epoch": 2.4568421811366994, "grad_norm": 0.20908579230308533, "learning_rate": 2.6925227471729275e-05, "loss": 0.3614, "step": 68170 }, { "epoch": 2.457022380797924, "grad_norm": 0.18623116612434387, "learning_rate": 2.692231799068805e-05, "loss": 0.4202, "step": 68175 }, { "epoch": 2.457202580459149, "grad_norm": 0.216128870844841, "learning_rate": 2.6919408483455615e-05, "loss": 0.3951, "step": 68180 }, { "epoch": 2.4573827801203736, "grad_norm": 0.18577782809734344, "learning_rate": 2.6916498950071605e-05, "loss": 0.4192, "step": 68185 }, { "epoch": 2.457562979781598, "grad_norm": 0.18905296921730042, "learning_rate": 2.6913589390575677e-05, "loss": 0.3804, "step": 68190 }, { "epoch": 2.4577431794428226, "grad_norm": 0.23588091135025024, "learning_rate": 2.6910679805007454e-05, "loss": 0.4285, "step": 68195 }, { "epoch": 2.4579233791040473, "grad_norm": 0.23398438096046448, "learning_rate": 2.6907770193406595e-05, "loss": 0.3918, "step": 68200 }, { "epoch": 2.458103578765272, "grad_norm": 0.22657303512096405, "learning_rate": 2.6904860555812738e-05, "loss": 0.3779, "step": 68205 }, { "epoch": 2.4582837784264964, "grad_norm": 0.17407497763633728, "learning_rate": 2.6901950892265514e-05, "loss": 0.4045, "step": 68210 }, { "epoch": 2.458463978087721, "grad_norm": 0.19006119668483734, "learning_rate": 2.6899041202804587e-05, "loss": 0.3903, "step": 68215 }, { "epoch": 2.458644177748946, "grad_norm": 0.22295033931732178, "learning_rate": 2.6896131487469595e-05, "loss": 0.3926, "step": 68220 }, { "epoch": 2.4588243774101706, "grad_norm": 0.21144932508468628, "learning_rate": 2.6893221746300167e-05, "loss": 0.4296, "step": 68225 }, { "epoch": 2.4590045770713953, "grad_norm": 0.2753607928752899, "learning_rate": 2.689031197933597e-05, "loss": 0.4244, "step": 68230 }, { "epoch": 2.4591847767326196, "grad_norm": 0.18635913729667664, "learning_rate": 2.6887402186616627e-05, "loss": 0.4052, "step": 68235 }, { 
"epoch": 2.4593649763938443, "grad_norm": 0.1916455328464508, "learning_rate": 2.6884492368181808e-05, "loss": 0.4064, "step": 68240 }, { "epoch": 2.459545176055069, "grad_norm": 0.21135953068733215, "learning_rate": 2.6881582524071137e-05, "loss": 0.4178, "step": 68245 }, { "epoch": 2.459725375716294, "grad_norm": 0.20750294625759125, "learning_rate": 2.6878672654324272e-05, "loss": 0.4014, "step": 68250 }, { "epoch": 2.459905575377518, "grad_norm": 0.18974362313747406, "learning_rate": 2.687576275898085e-05, "loss": 0.3691, "step": 68255 }, { "epoch": 2.460085775038743, "grad_norm": 0.19954413175582886, "learning_rate": 2.6872852838080524e-05, "loss": 0.3698, "step": 68260 }, { "epoch": 2.4602659746999676, "grad_norm": 0.20801982283592224, "learning_rate": 2.6869942891662947e-05, "loss": 0.3933, "step": 68265 }, { "epoch": 2.4604461743611923, "grad_norm": 0.2402622401714325, "learning_rate": 2.6867032919767754e-05, "loss": 0.4223, "step": 68270 }, { "epoch": 2.460626374022417, "grad_norm": 0.2088380753993988, "learning_rate": 2.6864122922434603e-05, "loss": 0.4071, "step": 68275 }, { "epoch": 2.4608065736836413, "grad_norm": 0.2807213366031647, "learning_rate": 2.6861212899703142e-05, "loss": 0.4163, "step": 68280 }, { "epoch": 2.460986773344866, "grad_norm": 0.1840617060661316, "learning_rate": 2.685830285161301e-05, "loss": 0.39, "step": 68285 }, { "epoch": 2.4611669730060908, "grad_norm": 0.21558882296085358, "learning_rate": 2.6855392778203858e-05, "loss": 0.3968, "step": 68290 }, { "epoch": 2.4613471726673155, "grad_norm": 0.2157306671142578, "learning_rate": 2.6852482679515345e-05, "loss": 0.4016, "step": 68295 }, { "epoch": 2.46152737232854, "grad_norm": 0.18490919470787048, "learning_rate": 2.6849572555587116e-05, "loss": 0.4155, "step": 68300 }, { "epoch": 2.4617075719897645, "grad_norm": 0.2350340336561203, "learning_rate": 2.684666240645881e-05, "loss": 0.4039, "step": 68305 }, { "epoch": 2.4618877716509893, "grad_norm": 0.1901131123304367, 
"learning_rate": 2.6843752232170095e-05, "loss": 0.3888, "step": 68310 }, { "epoch": 2.462067971312214, "grad_norm": 0.1914856880903244, "learning_rate": 2.68408420327606e-05, "loss": 0.3938, "step": 68315 }, { "epoch": 2.4622481709734387, "grad_norm": 0.23871468007564545, "learning_rate": 2.6837931808270006e-05, "loss": 0.3918, "step": 68320 }, { "epoch": 2.462428370634663, "grad_norm": 0.1966165006160736, "learning_rate": 2.6835021558737932e-05, "loss": 0.4099, "step": 68325 }, { "epoch": 2.4626085702958878, "grad_norm": 0.224965900182724, "learning_rate": 2.6832111284204054e-05, "loss": 0.4068, "step": 68330 }, { "epoch": 2.4627887699571125, "grad_norm": 0.20366400480270386, "learning_rate": 2.682920098470801e-05, "loss": 0.3795, "step": 68335 }, { "epoch": 2.4629689696183372, "grad_norm": 0.20222342014312744, "learning_rate": 2.682629066028946e-05, "loss": 0.3814, "step": 68340 }, { "epoch": 2.4631491692795615, "grad_norm": 0.21054887771606445, "learning_rate": 2.682338031098805e-05, "loss": 0.4116, "step": 68345 }, { "epoch": 2.4633293689407862, "grad_norm": 0.23802423477172852, "learning_rate": 2.6820469936843434e-05, "loss": 0.4045, "step": 68350 }, { "epoch": 2.463509568602011, "grad_norm": 0.2170737087726593, "learning_rate": 2.6817559537895276e-05, "loss": 0.3764, "step": 68355 }, { "epoch": 2.4636897682632357, "grad_norm": 0.2287246584892273, "learning_rate": 2.6814649114183216e-05, "loss": 0.4225, "step": 68360 }, { "epoch": 2.4638699679244604, "grad_norm": 0.17304402589797974, "learning_rate": 2.681173866574691e-05, "loss": 0.3751, "step": 68365 }, { "epoch": 2.464050167585685, "grad_norm": 0.22758109867572784, "learning_rate": 2.6808828192626022e-05, "loss": 0.4037, "step": 68370 }, { "epoch": 2.4642303672469095, "grad_norm": 0.22057542204856873, "learning_rate": 2.6805917694860195e-05, "loss": 0.4412, "step": 68375 }, { "epoch": 2.464410566908134, "grad_norm": 0.22053518891334534, "learning_rate": 2.680300717248909e-05, "loss": 0.3812, "step": 68380 
}, { "epoch": 2.464590766569359, "grad_norm": 0.21665138006210327, "learning_rate": 2.6800096625552368e-05, "loss": 0.3685, "step": 68385 }, { "epoch": 2.4647709662305837, "grad_norm": 0.1874067336320877, "learning_rate": 2.6797186054089674e-05, "loss": 0.4096, "step": 68390 }, { "epoch": 2.464951165891808, "grad_norm": 0.2060876190662384, "learning_rate": 2.679427545814066e-05, "loss": 0.3738, "step": 68395 }, { "epoch": 2.4651313655530327, "grad_norm": 0.2187156081199646, "learning_rate": 2.6791364837744998e-05, "loss": 0.4185, "step": 68400 }, { "epoch": 2.4653115652142574, "grad_norm": 0.214223712682724, "learning_rate": 2.6788454192942342e-05, "loss": 0.4253, "step": 68405 }, { "epoch": 2.465491764875482, "grad_norm": 0.1790245622396469, "learning_rate": 2.6785543523772334e-05, "loss": 0.4309, "step": 68410 }, { "epoch": 2.465671964536707, "grad_norm": 0.20714068412780762, "learning_rate": 2.6782632830274645e-05, "loss": 0.4007, "step": 68415 }, { "epoch": 2.465852164197931, "grad_norm": 0.21486568450927734, "learning_rate": 2.6779722112488924e-05, "loss": 0.3859, "step": 68420 }, { "epoch": 2.466032363859156, "grad_norm": 0.24496997892856598, "learning_rate": 2.6776811370454848e-05, "loss": 0.3808, "step": 68425 }, { "epoch": 2.4662125635203807, "grad_norm": 0.24687924981117249, "learning_rate": 2.677390060421204e-05, "loss": 0.3642, "step": 68430 }, { "epoch": 2.4663927631816054, "grad_norm": 0.17569184303283691, "learning_rate": 2.6770989813800197e-05, "loss": 0.4155, "step": 68435 }, { "epoch": 2.4665729628428297, "grad_norm": 0.2444978952407837, "learning_rate": 2.676807899925895e-05, "loss": 0.4116, "step": 68440 }, { "epoch": 2.4667531625040544, "grad_norm": 0.16109678149223328, "learning_rate": 2.676516816062798e-05, "loss": 0.3992, "step": 68445 }, { "epoch": 2.466933362165279, "grad_norm": 0.20728158950805664, "learning_rate": 2.676225729794693e-05, "loss": 0.3597, "step": 68450 }, { "epoch": 2.467113561826504, "grad_norm": 0.2165517657995224, 
"learning_rate": 2.675934641125546e-05, "loss": 0.3792, "step": 68455 }, { "epoch": 2.4672937614877286, "grad_norm": 0.274471253156662, "learning_rate": 2.6756435500593242e-05, "loss": 0.4132, "step": 68460 }, { "epoch": 2.467473961148953, "grad_norm": 0.23226816952228546, "learning_rate": 2.675352456599993e-05, "loss": 0.4197, "step": 68465 }, { "epoch": 2.4676541608101776, "grad_norm": 0.24921727180480957, "learning_rate": 2.6750613607515184e-05, "loss": 0.4038, "step": 68470 }, { "epoch": 2.4678343604714024, "grad_norm": 0.17365936934947968, "learning_rate": 2.6747702625178667e-05, "loss": 0.4263, "step": 68475 }, { "epoch": 2.468014560132627, "grad_norm": 0.24769355356693268, "learning_rate": 2.6744791619030042e-05, "loss": 0.3998, "step": 68480 }, { "epoch": 2.4681947597938514, "grad_norm": 0.24163182079792023, "learning_rate": 2.674188058910897e-05, "loss": 0.3979, "step": 68485 }, { "epoch": 2.468374959455076, "grad_norm": 0.21235689520835876, "learning_rate": 2.6738969535455104e-05, "loss": 0.3893, "step": 68490 }, { "epoch": 2.468555159116301, "grad_norm": 0.23081441223621368, "learning_rate": 2.673605845810812e-05, "loss": 0.3736, "step": 68495 }, { "epoch": 2.4687353587775256, "grad_norm": 0.2072082906961441, "learning_rate": 2.6733147357107673e-05, "loss": 0.4183, "step": 68500 }, { "epoch": 2.4687353587775256, "eval_loss": 0.43393146991729736, "eval_runtime": 3.5293, "eval_samples_per_second": 28.334, "eval_steps_per_second": 7.084, "step": 68500 }, { "epoch": 2.4689155584387503, "grad_norm": 0.245635524392128, "learning_rate": 2.673023623249344e-05, "loss": 0.3877, "step": 68505 }, { "epoch": 2.4690957580999746, "grad_norm": 0.21268950402736664, "learning_rate": 2.672732508430506e-05, "loss": 0.3474, "step": 68510 }, { "epoch": 2.4692759577611993, "grad_norm": 0.22742435336112976, "learning_rate": 2.6724413912582208e-05, "loss": 0.3821, "step": 68515 }, { "epoch": 2.469456157422424, "grad_norm": 0.26614660024642944, "learning_rate": 
2.672150271736456e-05, "loss": 0.3755, "step": 68520 }, { "epoch": 2.469636357083649, "grad_norm": 0.20712517201900482, "learning_rate": 2.671859149869177e-05, "loss": 0.4151, "step": 68525 }, { "epoch": 2.469816556744873, "grad_norm": 0.16104906797409058, "learning_rate": 2.6715680256603503e-05, "loss": 0.3874, "step": 68530 }, { "epoch": 2.469996756406098, "grad_norm": 0.20770768821239471, "learning_rate": 2.671276899113942e-05, "loss": 0.4159, "step": 68535 }, { "epoch": 2.4701769560673226, "grad_norm": 0.16630055010318756, "learning_rate": 2.670985770233919e-05, "loss": 0.4349, "step": 68540 }, { "epoch": 2.4703571557285473, "grad_norm": 0.18788853287696838, "learning_rate": 2.6706946390242488e-05, "loss": 0.3773, "step": 68545 }, { "epoch": 2.470537355389772, "grad_norm": 0.24081262946128845, "learning_rate": 2.6704035054888964e-05, "loss": 0.3926, "step": 68550 }, { "epoch": 2.4707175550509963, "grad_norm": 0.21271935105323792, "learning_rate": 2.67011236963183e-05, "loss": 0.3867, "step": 68555 }, { "epoch": 2.470897754712221, "grad_norm": 0.19515545666217804, "learning_rate": 2.6698212314570148e-05, "loss": 0.3917, "step": 68560 }, { "epoch": 2.471077954373446, "grad_norm": 0.18669338524341583, "learning_rate": 2.669530090968419e-05, "loss": 0.3656, "step": 68565 }, { "epoch": 2.4712581540346705, "grad_norm": 0.19376404583454132, "learning_rate": 2.669238948170008e-05, "loss": 0.3925, "step": 68570 }, { "epoch": 2.471438353695895, "grad_norm": 0.22779560089111328, "learning_rate": 2.6689478030657496e-05, "loss": 0.4522, "step": 68575 }, { "epoch": 2.4716185533571196, "grad_norm": 0.19855950772762299, "learning_rate": 2.6686566556596087e-05, "loss": 0.3975, "step": 68580 }, { "epoch": 2.4717987530183443, "grad_norm": 0.2209222912788391, "learning_rate": 2.6683655059555544e-05, "loss": 0.3871, "step": 68585 }, { "epoch": 2.471978952679569, "grad_norm": 0.1904340535402298, "learning_rate": 2.668074353957553e-05, "loss": 0.3682, "step": 68590 }, { "epoch": 
2.4721591523407938, "grad_norm": 0.2192251831293106, "learning_rate": 2.6677831996695706e-05, "loss": 0.3725, "step": 68595 }, { "epoch": 2.472339352002018, "grad_norm": 0.20693185925483704, "learning_rate": 2.6674920430955753e-05, "loss": 0.3858, "step": 68600 }, { "epoch": 2.4725195516632428, "grad_norm": 0.25332531332969666, "learning_rate": 2.6672008842395323e-05, "loss": 0.3742, "step": 68605 }, { "epoch": 2.4726997513244675, "grad_norm": 0.2141808420419693, "learning_rate": 2.666909723105411e-05, "loss": 0.4053, "step": 68610 }, { "epoch": 2.4728799509856922, "grad_norm": 0.20601968467235565, "learning_rate": 2.6666185596971753e-05, "loss": 0.3986, "step": 68615 }, { "epoch": 2.473060150646917, "grad_norm": 0.21490851044654846, "learning_rate": 2.6663273940187956e-05, "loss": 0.3877, "step": 68620 }, { "epoch": 2.4732403503081413, "grad_norm": 0.1867503970861435, "learning_rate": 2.666036226074236e-05, "loss": 0.3738, "step": 68625 }, { "epoch": 2.473420549969366, "grad_norm": 0.21378225088119507, "learning_rate": 2.6657450558674658e-05, "loss": 0.3711, "step": 68630 }, { "epoch": 2.4736007496305907, "grad_norm": 0.20687870681285858, "learning_rate": 2.665453883402451e-05, "loss": 0.4117, "step": 68635 }, { "epoch": 2.4737809492918155, "grad_norm": 0.17772191762924194, "learning_rate": 2.665162708683159e-05, "loss": 0.3953, "step": 68640 }, { "epoch": 2.47396114895304, "grad_norm": 0.19218339025974274, "learning_rate": 2.6648715317135575e-05, "loss": 0.3936, "step": 68645 }, { "epoch": 2.4741413486142645, "grad_norm": 0.2298266589641571, "learning_rate": 2.6645803524976133e-05, "loss": 0.3804, "step": 68650 }, { "epoch": 2.4743215482754892, "grad_norm": 0.1857258528470993, "learning_rate": 2.664289171039293e-05, "loss": 0.3942, "step": 68655 }, { "epoch": 2.474501747936714, "grad_norm": 0.1851481795310974, "learning_rate": 2.6639979873425652e-05, "loss": 0.4023, "step": 68660 }, { "epoch": 2.4746819475979387, "grad_norm": 0.19881199300289154, "learning_rate": 
2.663706801411396e-05, "loss": 0.4297, "step": 68665 }, { "epoch": 2.474862147259163, "grad_norm": 0.22843848168849945, "learning_rate": 2.6634156132497538e-05, "loss": 0.3793, "step": 68670 }, { "epoch": 2.4750423469203877, "grad_norm": 0.20685456693172455, "learning_rate": 2.6631244228616053e-05, "loss": 0.3826, "step": 68675 }, { "epoch": 2.4752225465816124, "grad_norm": 0.24268147349357605, "learning_rate": 2.6628332302509186e-05, "loss": 0.3882, "step": 68680 }, { "epoch": 2.475402746242837, "grad_norm": 0.21436165273189545, "learning_rate": 2.6625420354216597e-05, "loss": 0.4387, "step": 68685 }, { "epoch": 2.475582945904062, "grad_norm": 0.20121796429157257, "learning_rate": 2.6622508383777977e-05, "loss": 0.4059, "step": 68690 }, { "epoch": 2.475763145565286, "grad_norm": 0.1770712435245514, "learning_rate": 2.6619596391233e-05, "loss": 0.3906, "step": 68695 }, { "epoch": 2.475943345226511, "grad_norm": 0.2208649069070816, "learning_rate": 2.661668437662132e-05, "loss": 0.4162, "step": 68700 }, { "epoch": 2.4761235448877357, "grad_norm": 0.23539263010025024, "learning_rate": 2.661377233998264e-05, "loss": 0.4129, "step": 68705 }, { "epoch": 2.4763037445489604, "grad_norm": 0.18581153452396393, "learning_rate": 2.6610860281356627e-05, "loss": 0.4237, "step": 68710 }, { "epoch": 2.4764839442101847, "grad_norm": 0.20653104782104492, "learning_rate": 2.6607948200782944e-05, "loss": 0.3858, "step": 68715 }, { "epoch": 2.4766641438714094, "grad_norm": 0.18141742050647736, "learning_rate": 2.6605036098301283e-05, "loss": 0.3642, "step": 68720 }, { "epoch": 2.476844343532634, "grad_norm": 0.22775548696517944, "learning_rate": 2.6602123973951314e-05, "loss": 0.4237, "step": 68725 }, { "epoch": 2.477024543193859, "grad_norm": 0.1954440176486969, "learning_rate": 2.6599211827772724e-05, "loss": 0.403, "step": 68730 }, { "epoch": 2.4772047428550836, "grad_norm": 0.18037401139736176, "learning_rate": 2.6596299659805173e-05, "loss": 0.3839, "step": 68735 }, { "epoch": 
2.477384942516308, "grad_norm": 0.2156188189983368, "learning_rate": 2.6593387470088354e-05, "loss": 0.4249, "step": 68740 }, { "epoch": 2.4775651421775327, "grad_norm": 0.2691105008125305, "learning_rate": 2.6590475258661935e-05, "loss": 0.4494, "step": 68745 }, { "epoch": 2.4777453418387574, "grad_norm": 0.1580956131219864, "learning_rate": 2.6587563025565604e-05, "loss": 0.3839, "step": 68750 }, { "epoch": 2.477925541499982, "grad_norm": 0.17688532173633575, "learning_rate": 2.6584650770839026e-05, "loss": 0.4133, "step": 68755 }, { "epoch": 2.4781057411612064, "grad_norm": 0.19186891615390778, "learning_rate": 2.6581738494521898e-05, "loss": 0.3996, "step": 68760 }, { "epoch": 2.478285940822431, "grad_norm": 0.23146666586399078, "learning_rate": 2.657882619665388e-05, "loss": 0.3699, "step": 68765 }, { "epoch": 2.478466140483656, "grad_norm": 0.2393793761730194, "learning_rate": 2.6575913877274666e-05, "loss": 0.3812, "step": 68770 }, { "epoch": 2.4786463401448806, "grad_norm": 0.18306025862693787, "learning_rate": 2.657300153642393e-05, "loss": 0.4051, "step": 68775 }, { "epoch": 2.4788265398061053, "grad_norm": 0.22833152115345, "learning_rate": 2.657008917414135e-05, "loss": 0.3908, "step": 68780 }, { "epoch": 2.4790067394673296, "grad_norm": 0.19252647459506989, "learning_rate": 2.6567176790466613e-05, "loss": 0.3735, "step": 68785 }, { "epoch": 2.4791869391285544, "grad_norm": 0.202143132686615, "learning_rate": 2.6564264385439385e-05, "loss": 0.411, "step": 68790 }, { "epoch": 2.479367138789779, "grad_norm": 0.1779770851135254, "learning_rate": 2.656135195909938e-05, "loss": 0.3841, "step": 68795 }, { "epoch": 2.479547338451004, "grad_norm": 0.17505523562431335, "learning_rate": 2.6558439511486232e-05, "loss": 0.3515, "step": 68800 }, { "epoch": 2.479727538112228, "grad_norm": 0.2236313372850418, "learning_rate": 2.655552704263966e-05, "loss": 0.4296, "step": 68805 }, { "epoch": 2.479907737773453, "grad_norm": 0.21467982232570648, "learning_rate": 
2.655261455259933e-05, "loss": 0.3938, "step": 68810 }, { "epoch": 2.4800879374346776, "grad_norm": 0.1988895684480667, "learning_rate": 2.6549702041404932e-05, "loss": 0.4324, "step": 68815 }, { "epoch": 2.4802681370959023, "grad_norm": 0.17672614753246307, "learning_rate": 2.6546789509096144e-05, "loss": 0.4032, "step": 68820 }, { "epoch": 2.480448336757127, "grad_norm": 0.22681747376918793, "learning_rate": 2.6543876955712637e-05, "loss": 0.4158, "step": 68825 }, { "epoch": 2.4806285364183513, "grad_norm": 0.16268977522850037, "learning_rate": 2.654096438129412e-05, "loss": 0.3774, "step": 68830 }, { "epoch": 2.480808736079576, "grad_norm": 0.18678313493728638, "learning_rate": 2.6538051785880254e-05, "loss": 0.3649, "step": 68835 }, { "epoch": 2.480988935740801, "grad_norm": 0.2044302076101303, "learning_rate": 2.6535139169510727e-05, "loss": 0.3747, "step": 68840 }, { "epoch": 2.4811691354020255, "grad_norm": 0.2173137664794922, "learning_rate": 2.6532226532225235e-05, "loss": 0.3782, "step": 68845 }, { "epoch": 2.48134933506325, "grad_norm": 0.22002023458480835, "learning_rate": 2.6529313874063445e-05, "loss": 0.3879, "step": 68850 }, { "epoch": 2.4815295347244746, "grad_norm": 0.19581471383571625, "learning_rate": 2.652640119506506e-05, "loss": 0.3761, "step": 68855 }, { "epoch": 2.4817097343856993, "grad_norm": 0.18339914083480835, "learning_rate": 2.6523488495269744e-05, "loss": 0.4203, "step": 68860 }, { "epoch": 2.481889934046924, "grad_norm": 0.19573746621608734, "learning_rate": 2.6520575774717194e-05, "loss": 0.4061, "step": 68865 }, { "epoch": 2.4820701337081488, "grad_norm": 0.24082912504673004, "learning_rate": 2.6517663033447092e-05, "loss": 0.4076, "step": 68870 }, { "epoch": 2.4822503333693735, "grad_norm": 0.1869727075099945, "learning_rate": 2.6514750271499127e-05, "loss": 0.4011, "step": 68875 }, { "epoch": 2.482430533030598, "grad_norm": 0.19371429085731506, "learning_rate": 2.6511837488912988e-05, "loss": 0.3798, "step": 68880 }, { "epoch": 
2.4826107326918225, "grad_norm": 0.20784878730773926, "learning_rate": 2.650892468572835e-05, "loss": 0.4097, "step": 68885 }, { "epoch": 2.4827909323530473, "grad_norm": 0.17571403086185455, "learning_rate": 2.65060118619849e-05, "loss": 0.3819, "step": 68890 }, { "epoch": 2.482971132014272, "grad_norm": 0.17903995513916016, "learning_rate": 2.6503099017722343e-05, "loss": 0.3743, "step": 68895 }, { "epoch": 2.4831513316754963, "grad_norm": 0.23476238548755646, "learning_rate": 2.650018615298035e-05, "loss": 0.3742, "step": 68900 }, { "epoch": 2.483331531336721, "grad_norm": 0.21859110891819, "learning_rate": 2.6497273267798605e-05, "loss": 0.3606, "step": 68905 }, { "epoch": 2.4835117309979458, "grad_norm": 0.2518290579319, "learning_rate": 2.6494360362216803e-05, "loss": 0.4162, "step": 68910 }, { "epoch": 2.4836919306591705, "grad_norm": 0.1847134232521057, "learning_rate": 2.6491447436274637e-05, "loss": 0.3972, "step": 68915 }, { "epoch": 2.483872130320395, "grad_norm": 0.18543338775634766, "learning_rate": 2.648853449001178e-05, "loss": 0.4026, "step": 68920 }, { "epoch": 2.4840523299816195, "grad_norm": 0.20667828619480133, "learning_rate": 2.648562152346793e-05, "loss": 0.4239, "step": 68925 }, { "epoch": 2.4842325296428442, "grad_norm": 0.21871261298656464, "learning_rate": 2.6482708536682777e-05, "loss": 0.4119, "step": 68930 }, { "epoch": 2.484412729304069, "grad_norm": 0.18584778904914856, "learning_rate": 2.647979552969601e-05, "loss": 0.3987, "step": 68935 }, { "epoch": 2.4845929289652937, "grad_norm": 0.1998424530029297, "learning_rate": 2.6476882502547305e-05, "loss": 0.3779, "step": 68940 }, { "epoch": 2.484773128626518, "grad_norm": 0.18726879358291626, "learning_rate": 2.647396945527637e-05, "loss": 0.4084, "step": 68945 }, { "epoch": 2.4849533282877427, "grad_norm": 0.19069020450115204, "learning_rate": 2.6471056387922886e-05, "loss": 0.3962, "step": 68950 }, { "epoch": 2.4851335279489675, "grad_norm": 0.231498122215271, "learning_rate": 
2.6468143300526543e-05, "loss": 0.3819, "step": 68955 }, { "epoch": 2.485313727610192, "grad_norm": 0.19024336338043213, "learning_rate": 2.6465230193127033e-05, "loss": 0.3738, "step": 68960 }, { "epoch": 2.485493927271417, "grad_norm": 0.23656608164310455, "learning_rate": 2.6462317065764043e-05, "loss": 0.4132, "step": 68965 }, { "epoch": 2.4856741269326412, "grad_norm": 0.2034633606672287, "learning_rate": 2.645940391847727e-05, "loss": 0.4149, "step": 68970 }, { "epoch": 2.485854326593866, "grad_norm": 0.20113669335842133, "learning_rate": 2.6456490751306395e-05, "loss": 0.3942, "step": 68975 }, { "epoch": 2.4860345262550907, "grad_norm": 0.18040354549884796, "learning_rate": 2.645357756429112e-05, "loss": 0.403, "step": 68980 }, { "epoch": 2.4862147259163154, "grad_norm": 0.2093953937292099, "learning_rate": 2.645066435747113e-05, "loss": 0.3853, "step": 68985 }, { "epoch": 2.4863949255775397, "grad_norm": 0.18795111775398254, "learning_rate": 2.6447751130886117e-05, "loss": 0.4241, "step": 68990 }, { "epoch": 2.4865751252387644, "grad_norm": 0.24407252669334412, "learning_rate": 2.644483788457578e-05, "loss": 0.3794, "step": 68995 }, { "epoch": 2.486755324899989, "grad_norm": 0.18721212446689606, "learning_rate": 2.6441924618579807e-05, "loss": 0.3853, "step": 69000 }, { "epoch": 2.486755324899989, "eval_loss": 0.43400391936302185, "eval_runtime": 3.5358, "eval_samples_per_second": 28.282, "eval_steps_per_second": 7.071, "step": 69000 }, { "epoch": 2.486935524561214, "grad_norm": 0.21904613077640533, "learning_rate": 2.643901133293789e-05, "loss": 0.4267, "step": 69005 }, { "epoch": 2.4871157242224387, "grad_norm": 0.18053406476974487, "learning_rate": 2.6436098027689714e-05, "loss": 0.3885, "step": 69010 }, { "epoch": 2.487295923883663, "grad_norm": 0.18730615079402924, "learning_rate": 2.6433184702874993e-05, "loss": 0.371, "step": 69015 }, { "epoch": 2.4874761235448877, "grad_norm": 0.21903234720230103, "learning_rate": 2.6430271358533397e-05, "loss": 
0.4299, "step": 69020 }, { "epoch": 2.4876563232061124, "grad_norm": 0.21092742681503296, "learning_rate": 2.6427357994704633e-05, "loss": 0.4174, "step": 69025 }, { "epoch": 2.487836522867337, "grad_norm": 0.20386511087417603, "learning_rate": 2.6424444611428396e-05, "loss": 0.4047, "step": 69030 }, { "epoch": 2.4880167225285614, "grad_norm": 0.2634836733341217, "learning_rate": 2.642153120874437e-05, "loss": 0.424, "step": 69035 }, { "epoch": 2.488196922189786, "grad_norm": 0.22535258531570435, "learning_rate": 2.6418617786692273e-05, "loss": 0.4279, "step": 69040 }, { "epoch": 2.488377121851011, "grad_norm": 0.19794537127017975, "learning_rate": 2.6415704345311764e-05, "loss": 0.3665, "step": 69045 }, { "epoch": 2.4885573215122356, "grad_norm": 0.20642319321632385, "learning_rate": 2.641279088464257e-05, "loss": 0.3978, "step": 69050 }, { "epoch": 2.4887375211734604, "grad_norm": 0.22682100534439087, "learning_rate": 2.6409877404724363e-05, "loss": 0.4194, "step": 69055 }, { "epoch": 2.4889177208346847, "grad_norm": 0.24420538544654846, "learning_rate": 2.640696390559686e-05, "loss": 0.3943, "step": 69060 }, { "epoch": 2.4890979204959094, "grad_norm": 0.16333971917629242, "learning_rate": 2.6404050387299744e-05, "loss": 0.3718, "step": 69065 }, { "epoch": 2.489278120157134, "grad_norm": 0.19951343536376953, "learning_rate": 2.640113684987271e-05, "loss": 0.3919, "step": 69070 }, { "epoch": 2.489458319818359, "grad_norm": 0.18329237401485443, "learning_rate": 2.6398223293355455e-05, "loss": 0.408, "step": 69075 }, { "epoch": 2.489638519479583, "grad_norm": 0.2317817211151123, "learning_rate": 2.6395309717787686e-05, "loss": 0.384, "step": 69080 }, { "epoch": 2.489818719140808, "grad_norm": 0.20693840086460114, "learning_rate": 2.6392396123209085e-05, "loss": 0.3992, "step": 69085 }, { "epoch": 2.4899989188020326, "grad_norm": 0.18415316939353943, "learning_rate": 2.6389482509659365e-05, "loss": 0.3955, "step": 69090 }, { "epoch": 2.4901791184632573, "grad_norm": 
0.18562282621860504, "learning_rate": 2.6386568877178204e-05, "loss": 0.3756, "step": 69095 }, { "epoch": 2.490359318124482, "grad_norm": 0.1960134655237198, "learning_rate": 2.6383655225805326e-05, "loss": 0.3731, "step": 69100 }, { "epoch": 2.4905395177857064, "grad_norm": 0.22196051478385925, "learning_rate": 2.6380741555580398e-05, "loss": 0.3898, "step": 69105 }, { "epoch": 2.490719717446931, "grad_norm": 0.19862577319145203, "learning_rate": 2.6377827866543142e-05, "loss": 0.371, "step": 69110 }, { "epoch": 2.490899917108156, "grad_norm": 0.1888255774974823, "learning_rate": 2.6374914158733238e-05, "loss": 0.3924, "step": 69115 }, { "epoch": 2.4910801167693806, "grad_norm": 0.18701927363872528, "learning_rate": 2.6372000432190407e-05, "loss": 0.369, "step": 69120 }, { "epoch": 2.4912603164306053, "grad_norm": 0.25110724568367004, "learning_rate": 2.636908668695433e-05, "loss": 0.3578, "step": 69125 }, { "epoch": 2.4914405160918296, "grad_norm": 0.21141697466373444, "learning_rate": 2.6366172923064714e-05, "loss": 0.4043, "step": 69130 }, { "epoch": 2.4916207157530543, "grad_norm": 0.23328730463981628, "learning_rate": 2.6363259140561252e-05, "loss": 0.4044, "step": 69135 }, { "epoch": 2.491800915414279, "grad_norm": 0.17210394144058228, "learning_rate": 2.6360345339483655e-05, "loss": 0.3999, "step": 69140 }, { "epoch": 2.491981115075504, "grad_norm": 0.19488251209259033, "learning_rate": 2.6357431519871612e-05, "loss": 0.3879, "step": 69145 }, { "epoch": 2.4921613147367285, "grad_norm": 0.23553623259067535, "learning_rate": 2.635451768176483e-05, "loss": 0.412, "step": 69150 }, { "epoch": 2.492341514397953, "grad_norm": 0.21684016287326813, "learning_rate": 2.6351603825203003e-05, "loss": 0.425, "step": 69155 }, { "epoch": 2.4925217140591776, "grad_norm": 0.22947993874549866, "learning_rate": 2.634868995022584e-05, "loss": 0.3921, "step": 69160 }, { "epoch": 2.4927019137204023, "grad_norm": 0.1686110943555832, "learning_rate": 2.634577605687303e-05, "loss": 
0.3758, "step": 69165 }, { "epoch": 2.492882113381627, "grad_norm": 0.2178329974412918, "learning_rate": 2.6342862145184287e-05, "loss": 0.3766, "step": 69170 }, { "epoch": 2.4930623130428513, "grad_norm": 0.16872194409370422, "learning_rate": 2.6339948215199304e-05, "loss": 0.3641, "step": 69175 }, { "epoch": 2.493242512704076, "grad_norm": 0.2486531138420105, "learning_rate": 2.633703426695779e-05, "loss": 0.4011, "step": 69180 }, { "epoch": 2.4934227123653008, "grad_norm": 0.1963934600353241, "learning_rate": 2.6334120300499443e-05, "loss": 0.3931, "step": 69185 }, { "epoch": 2.4936029120265255, "grad_norm": 0.20075222849845886, "learning_rate": 2.6331206315863966e-05, "loss": 0.3863, "step": 69190 }, { "epoch": 2.4937831116877502, "grad_norm": 0.19349929690361023, "learning_rate": 2.6328292313091056e-05, "loss": 0.3631, "step": 69195 }, { "epoch": 2.4939633113489745, "grad_norm": 0.14709793031215668, "learning_rate": 2.6325378292220428e-05, "loss": 0.3709, "step": 69200 }, { "epoch": 2.4941435110101993, "grad_norm": 0.21333067119121552, "learning_rate": 2.6322464253291775e-05, "loss": 0.3988, "step": 69205 }, { "epoch": 2.494323710671424, "grad_norm": 0.19537271559238434, "learning_rate": 2.6319550196344793e-05, "loss": 0.3719, "step": 69210 }, { "epoch": 2.4945039103326487, "grad_norm": 0.22581617534160614, "learning_rate": 2.63166361214192e-05, "loss": 0.4023, "step": 69215 }, { "epoch": 2.494684109993873, "grad_norm": 0.2060622274875641, "learning_rate": 2.6313722028554692e-05, "loss": 0.3808, "step": 69220 }, { "epoch": 2.4948643096550978, "grad_norm": 0.19554631412029266, "learning_rate": 2.631080791779099e-05, "loss": 0.4116, "step": 69225 }, { "epoch": 2.4950445093163225, "grad_norm": 0.1858394294977188, "learning_rate": 2.630789378916777e-05, "loss": 0.3857, "step": 69230 }, { "epoch": 2.4952247089775472, "grad_norm": 0.23908282816410065, "learning_rate": 2.6304979642724754e-05, "loss": 0.3515, "step": 69235 }, { "epoch": 2.495404908638772, "grad_norm": 
0.1817556619644165, "learning_rate": 2.630206547850165e-05, "loss": 0.4181, "step": 69240 }, { "epoch": 2.4955851082999962, "grad_norm": 0.19453182816505432, "learning_rate": 2.629915129653815e-05, "loss": 0.3915, "step": 69245 }, { "epoch": 2.495765307961221, "grad_norm": 0.18795980513095856, "learning_rate": 2.6296237096873964e-05, "loss": 0.3749, "step": 69250 }, { "epoch": 2.4959455076224457, "grad_norm": 0.16127502918243408, "learning_rate": 2.6293322879548792e-05, "loss": 0.3953, "step": 69255 }, { "epoch": 2.4961257072836704, "grad_norm": 0.20099008083343506, "learning_rate": 2.629040864460236e-05, "loss": 0.3947, "step": 69260 }, { "epoch": 2.4963059069448947, "grad_norm": 0.18468354642391205, "learning_rate": 2.6287494392074352e-05, "loss": 0.3996, "step": 69265 }, { "epoch": 2.4964861066061195, "grad_norm": 0.190368190407753, "learning_rate": 2.6284580122004482e-05, "loss": 0.3393, "step": 69270 }, { "epoch": 2.496666306267344, "grad_norm": 0.2398798167705536, "learning_rate": 2.6281665834432462e-05, "loss": 0.396, "step": 69275 }, { "epoch": 2.496846505928569, "grad_norm": 0.21134978532791138, "learning_rate": 2.6278751529397983e-05, "loss": 0.3844, "step": 69280 }, { "epoch": 2.4970267055897937, "grad_norm": 0.221641406416893, "learning_rate": 2.6275837206940772e-05, "loss": 0.3982, "step": 69285 }, { "epoch": 2.497206905251018, "grad_norm": 0.2198064923286438, "learning_rate": 2.6272922867100524e-05, "loss": 0.4137, "step": 69290 }, { "epoch": 2.4973871049122427, "grad_norm": 0.26445555686950684, "learning_rate": 2.627000850991695e-05, "loss": 0.4162, "step": 69295 }, { "epoch": 2.4975673045734674, "grad_norm": 0.2406071126461029, "learning_rate": 2.6267094135429748e-05, "loss": 0.3863, "step": 69300 }, { "epoch": 2.497747504234692, "grad_norm": 0.21584929525852203, "learning_rate": 2.6264179743678642e-05, "loss": 0.3893, "step": 69305 }, { "epoch": 2.4979277038959165, "grad_norm": 0.19201698899269104, "learning_rate": 2.6261265334703327e-05, "loss": 
0.4101, "step": 69310 }, { "epoch": 2.498107903557141, "grad_norm": 0.22103743255138397, "learning_rate": 2.625835090854351e-05, "loss": 0.4085, "step": 69315 }, { "epoch": 2.498288103218366, "grad_norm": 0.24371977150440216, "learning_rate": 2.625543646523892e-05, "loss": 0.4247, "step": 69320 }, { "epoch": 2.4984683028795907, "grad_norm": 0.21865001320838928, "learning_rate": 2.6252522004829243e-05, "loss": 0.3971, "step": 69325 }, { "epoch": 2.4986485025408154, "grad_norm": 0.18151873350143433, "learning_rate": 2.6249607527354198e-05, "loss": 0.4078, "step": 69330 }, { "epoch": 2.4988287022020397, "grad_norm": 0.19175831973552704, "learning_rate": 2.6246693032853486e-05, "loss": 0.4046, "step": 69335 }, { "epoch": 2.4990089018632644, "grad_norm": 0.18904618918895721, "learning_rate": 2.6243778521366835e-05, "loss": 0.3663, "step": 69340 }, { "epoch": 2.499189101524489, "grad_norm": 0.21790605783462524, "learning_rate": 2.6240863992933936e-05, "loss": 0.4033, "step": 69345 }, { "epoch": 2.499369301185714, "grad_norm": 0.20878206193447113, "learning_rate": 2.6237949447594502e-05, "loss": 0.4134, "step": 69350 }, { "epoch": 2.499549500846938, "grad_norm": 0.2429923415184021, "learning_rate": 2.6235034885388247e-05, "loss": 0.3973, "step": 69355 }, { "epoch": 2.499729700508163, "grad_norm": 0.19370168447494507, "learning_rate": 2.6232120306354884e-05, "loss": 0.3918, "step": 69360 }, { "epoch": 2.4999099001693876, "grad_norm": 0.22662904858589172, "learning_rate": 2.6229205710534126e-05, "loss": 0.37, "step": 69365 }, { "epoch": 2.5000900998306124, "grad_norm": 0.18225222826004028, "learning_rate": 2.6226291097965668e-05, "loss": 0.4206, "step": 69370 }, { "epoch": 2.500270299491837, "grad_norm": 0.2310526967048645, "learning_rate": 2.622337646868923e-05, "loss": 0.4082, "step": 69375 }, { "epoch": 2.500450499153062, "grad_norm": 0.20121249556541443, "learning_rate": 2.6220461822744536e-05, "loss": 0.407, "step": 69380 }, { "epoch": 2.500630698814286, "grad_norm": 
0.24337324500083923, "learning_rate": 2.6217547160171274e-05, "loss": 0.4463, "step": 69385 }, { "epoch": 2.500810898475511, "grad_norm": 0.21460457146167755, "learning_rate": 2.6214632481009176e-05, "loss": 0.3511, "step": 69390 }, { "epoch": 2.5009910981367356, "grad_norm": 0.19160525500774384, "learning_rate": 2.621171778529794e-05, "loss": 0.3843, "step": 69395 }, { "epoch": 2.50117129779796, "grad_norm": 0.20841006934642792, "learning_rate": 2.6208803073077294e-05, "loss": 0.4167, "step": 69400 }, { "epoch": 2.5013514974591846, "grad_norm": 0.21460412442684174, "learning_rate": 2.6205888344386927e-05, "loss": 0.3881, "step": 69405 }, { "epoch": 2.5015316971204093, "grad_norm": 0.20934461057186127, "learning_rate": 2.6202973599266573e-05, "loss": 0.371, "step": 69410 }, { "epoch": 2.501711896781634, "grad_norm": 0.23594777286052704, "learning_rate": 2.620005883775593e-05, "loss": 0.3876, "step": 69415 }, { "epoch": 2.501892096442859, "grad_norm": 0.21076056361198425, "learning_rate": 2.6197144059894724e-05, "loss": 0.4082, "step": 69420 }, { "epoch": 2.5020722961040835, "grad_norm": 0.21848683059215546, "learning_rate": 2.619422926572266e-05, "loss": 0.3747, "step": 69425 }, { "epoch": 2.502252495765308, "grad_norm": 0.20957939326763153, "learning_rate": 2.6191314455279453e-05, "loss": 0.4058, "step": 69430 }, { "epoch": 2.5024326954265326, "grad_norm": 0.23492544889450073, "learning_rate": 2.618839962860482e-05, "loss": 0.383, "step": 69435 }, { "epoch": 2.5026128950877573, "grad_norm": 0.16732336580753326, "learning_rate": 2.6185484785738467e-05, "loss": 0.391, "step": 69440 }, { "epoch": 2.5027930947489816, "grad_norm": 0.18726083636283875, "learning_rate": 2.6182569926720117e-05, "loss": 0.381, "step": 69445 }, { "epoch": 2.5029732944102063, "grad_norm": 0.201826810836792, "learning_rate": 2.617965505158948e-05, "loss": 0.3941, "step": 69450 }, { "epoch": 2.503153494071431, "grad_norm": 0.20765550434589386, "learning_rate": 2.617674016038627e-05, "loss": 
0.4062, "step": 69455 }, { "epoch": 2.503333693732656, "grad_norm": 0.23174385726451874, "learning_rate": 2.61738252531502e-05, "loss": 0.3968, "step": 69460 }, { "epoch": 2.5035138933938805, "grad_norm": 0.21468636393547058, "learning_rate": 2.6170910329920994e-05, "loss": 0.3938, "step": 69465 }, { "epoch": 2.5036940930551053, "grad_norm": 0.23031465709209442, "learning_rate": 2.6167995390738366e-05, "loss": 0.4022, "step": 69470 }, { "epoch": 2.5038742927163296, "grad_norm": 0.19280219078063965, "learning_rate": 2.6165080435642014e-05, "loss": 0.3947, "step": 69475 }, { "epoch": 2.5040544923775543, "grad_norm": 0.20321400463581085, "learning_rate": 2.616216546467168e-05, "loss": 0.3962, "step": 69480 }, { "epoch": 2.504234692038779, "grad_norm": 0.16147299110889435, "learning_rate": 2.6159250477867053e-05, "loss": 0.4046, "step": 69485 }, { "epoch": 2.5044148917000038, "grad_norm": 0.21102051436901093, "learning_rate": 2.6156335475267874e-05, "loss": 0.4235, "step": 69490 }, { "epoch": 2.504595091361228, "grad_norm": 0.22560648620128632, "learning_rate": 2.615342045691384e-05, "loss": 0.4053, "step": 69495 }, { "epoch": 2.5047752910224528, "grad_norm": 0.17885218560695648, "learning_rate": 2.615050542284468e-05, "loss": 0.35, "step": 69500 }, { "epoch": 2.5047752910224528, "eval_loss": 0.4340173304080963, "eval_runtime": 3.5362, "eval_samples_per_second": 28.279, "eval_steps_per_second": 7.07, "step": 69500 }, { "epoch": 2.5049554906836775, "grad_norm": 0.23639249801635742, "learning_rate": 2.6147590373100106e-05, "loss": 0.4151, "step": 69505 }, { "epoch": 2.5051356903449022, "grad_norm": 0.260985404253006, "learning_rate": 2.6144675307719835e-05, "loss": 0.4056, "step": 69510 }, { "epoch": 2.505315890006127, "grad_norm": 0.21319666504859924, "learning_rate": 2.614176022674359e-05, "loss": 0.3559, "step": 69515 }, { "epoch": 2.5054960896673513, "grad_norm": 0.20121723413467407, "learning_rate": 2.613884513021107e-05, "loss": 0.4013, "step": 69520 }, { "epoch": 
2.505676289328576, "grad_norm": 0.1814257949590683, "learning_rate": 2.613593001816201e-05, "loss": 0.3781, "step": 69525 }, { "epoch": 2.5058564889898007, "grad_norm": 0.20692424476146698, "learning_rate": 2.613301489063613e-05, "loss": 0.3848, "step": 69530 }, { "epoch": 2.5060366886510255, "grad_norm": 0.21745166182518005, "learning_rate": 2.6130099747673136e-05, "loss": 0.3875, "step": 69535 }, { "epoch": 2.5062168883122498, "grad_norm": 0.19533394277095795, "learning_rate": 2.612718458931276e-05, "loss": 0.4273, "step": 69540 }, { "epoch": 2.5063970879734745, "grad_norm": 0.18785609304904938, "learning_rate": 2.6124269415594698e-05, "loss": 0.4146, "step": 69545 }, { "epoch": 2.5065772876346992, "grad_norm": 0.21127115190029144, "learning_rate": 2.6121354226558692e-05, "loss": 0.4068, "step": 69550 }, { "epoch": 2.506757487295924, "grad_norm": 0.2023439109325409, "learning_rate": 2.611843902224445e-05, "loss": 0.4257, "step": 69555 }, { "epoch": 2.5069376869571487, "grad_norm": 0.1987898051738739, "learning_rate": 2.6115523802691695e-05, "loss": 0.4159, "step": 69560 }, { "epoch": 2.5071178866183734, "grad_norm": 0.2078784704208374, "learning_rate": 2.6112608567940138e-05, "loss": 0.3997, "step": 69565 }, { "epoch": 2.5072980862795977, "grad_norm": 0.27035263180732727, "learning_rate": 2.610969331802951e-05, "loss": 0.4235, "step": 69570 }, { "epoch": 2.5074782859408224, "grad_norm": 0.22846317291259766, "learning_rate": 2.610677805299953e-05, "loss": 0.3812, "step": 69575 }, { "epoch": 2.507658485602047, "grad_norm": 0.20302127301692963, "learning_rate": 2.6103862772889902e-05, "loss": 0.3991, "step": 69580 }, { "epoch": 2.5078386852632715, "grad_norm": 0.20338794589042664, "learning_rate": 2.6100947477740367e-05, "loss": 0.3879, "step": 69585 }, { "epoch": 2.508018884924496, "grad_norm": 0.1619461178779602, "learning_rate": 2.609803216759063e-05, "loss": 0.4004, "step": 69590 }, { "epoch": 2.508199084585721, "grad_norm": 0.2222161740064621, "learning_rate": 
2.6095116842480417e-05, "loss": 0.3937, "step": 69595 }, { "epoch": 2.5083792842469457, "grad_norm": 0.18493251502513885, "learning_rate": 2.6092201502449455e-05, "loss": 0.3873, "step": 69600 }, { "epoch": 2.5085594839081704, "grad_norm": 0.21007417142391205, "learning_rate": 2.6089286147537452e-05, "loss": 0.3758, "step": 69605 }, { "epoch": 2.508739683569395, "grad_norm": 0.2484482377767563, "learning_rate": 2.608637077778414e-05, "loss": 0.4411, "step": 69610 }, { "epoch": 2.5089198832306194, "grad_norm": 0.2558552920818329, "learning_rate": 2.608345539322924e-05, "loss": 0.4278, "step": 69615 }, { "epoch": 2.509100082891844, "grad_norm": 0.2762976586818695, "learning_rate": 2.6080539993912467e-05, "loss": 0.4425, "step": 69620 }, { "epoch": 2.509280282553069, "grad_norm": 0.2021670788526535, "learning_rate": 2.607762457987354e-05, "loss": 0.3999, "step": 69625 }, { "epoch": 2.509460482214293, "grad_norm": 0.22645121812820435, "learning_rate": 2.6074709151152193e-05, "loss": 0.4255, "step": 69630 }, { "epoch": 2.509640681875518, "grad_norm": 0.23237501084804535, "learning_rate": 2.6071793707788138e-05, "loss": 0.431, "step": 69635 }, { "epoch": 2.5098208815367427, "grad_norm": 0.22652441263198853, "learning_rate": 2.6068878249821106e-05, "loss": 0.3742, "step": 69640 }, { "epoch": 2.5100010811979674, "grad_norm": 0.17889447510242462, "learning_rate": 2.606596277729081e-05, "loss": 0.3447, "step": 69645 }, { "epoch": 2.510181280859192, "grad_norm": 0.21000555157661438, "learning_rate": 2.6063047290236974e-05, "loss": 0.3839, "step": 69650 }, { "epoch": 2.510361480520417, "grad_norm": 0.18811647593975067, "learning_rate": 2.6060131788699343e-05, "loss": 0.4068, "step": 69655 }, { "epoch": 2.510541680181641, "grad_norm": 0.21040557324886322, "learning_rate": 2.6057216272717605e-05, "loss": 0.3839, "step": 69660 }, { "epoch": 2.510721879842866, "grad_norm": 0.23716652393341064, "learning_rate": 2.6054300742331498e-05, "loss": 0.4053, "step": 69665 }, { "epoch": 
2.5109020795040906, "grad_norm": 0.2132147252559662, "learning_rate": 2.6051385197580757e-05, "loss": 0.3949, "step": 69670 }, { "epoch": 2.511082279165315, "grad_norm": 0.20189203321933746, "learning_rate": 2.6048469638505092e-05, "loss": 0.4238, "step": 69675 }, { "epoch": 2.5112624788265396, "grad_norm": 0.2348521649837494, "learning_rate": 2.6045554065144234e-05, "loss": 0.3931, "step": 69680 }, { "epoch": 2.5114426784877644, "grad_norm": 0.1996840536594391, "learning_rate": 2.60426384775379e-05, "loss": 0.4236, "step": 69685 }, { "epoch": 2.511622878148989, "grad_norm": 0.20005884766578674, "learning_rate": 2.603972287572582e-05, "loss": 0.4177, "step": 69690 }, { "epoch": 2.511803077810214, "grad_norm": 0.19069145619869232, "learning_rate": 2.603680725974772e-05, "loss": 0.4015, "step": 69695 }, { "epoch": 2.5119832774714386, "grad_norm": 0.211561918258667, "learning_rate": 2.6033891629643314e-05, "loss": 0.3864, "step": 69700 }, { "epoch": 2.512163477132663, "grad_norm": 0.2280474752187729, "learning_rate": 2.6030975985452344e-05, "loss": 0.3813, "step": 69705 }, { "epoch": 2.5123436767938876, "grad_norm": 0.1532917618751526, "learning_rate": 2.602806032721452e-05, "loss": 0.3638, "step": 69710 }, { "epoch": 2.5125238764551123, "grad_norm": 0.22681502997875214, "learning_rate": 2.602514465496958e-05, "loss": 0.4054, "step": 69715 }, { "epoch": 2.512704076116337, "grad_norm": 0.18224921822547913, "learning_rate": 2.6022228968757233e-05, "loss": 0.36, "step": 69720 }, { "epoch": 2.5128842757775613, "grad_norm": 0.18726272881031036, "learning_rate": 2.6019313268617223e-05, "loss": 0.3718, "step": 69725 }, { "epoch": 2.513064475438786, "grad_norm": 0.21877476572990417, "learning_rate": 2.601639755458926e-05, "loss": 0.398, "step": 69730 }, { "epoch": 2.513244675100011, "grad_norm": 0.2230885922908783, "learning_rate": 2.6013481826713083e-05, "loss": 0.3982, "step": 69735 }, { "epoch": 2.5134248747612356, "grad_norm": 0.22673091292381287, "learning_rate": 
2.6010566085028408e-05, "loss": 0.4168, "step": 69740 }, { "epoch": 2.5136050744224603, "grad_norm": 0.19574376940727234, "learning_rate": 2.6007650329574968e-05, "loss": 0.3864, "step": 69745 }, { "epoch": 2.5137852740836846, "grad_norm": 0.1650337278842926, "learning_rate": 2.6004734560392487e-05, "loss": 0.4042, "step": 69750 }, { "epoch": 2.5139654737449093, "grad_norm": 0.19529931247234344, "learning_rate": 2.6001818777520692e-05, "loss": 0.3919, "step": 69755 }, { "epoch": 2.514145673406134, "grad_norm": 0.21473079919815063, "learning_rate": 2.5998902980999314e-05, "loss": 0.3822, "step": 69760 }, { "epoch": 2.5143258730673588, "grad_norm": 0.20333951711654663, "learning_rate": 2.5995987170868068e-05, "loss": 0.3926, "step": 69765 }, { "epoch": 2.514506072728583, "grad_norm": 0.2425466775894165, "learning_rate": 2.5993071347166693e-05, "loss": 0.3759, "step": 69770 }, { "epoch": 2.514686272389808, "grad_norm": 0.20622824132442474, "learning_rate": 2.599015550993491e-05, "loss": 0.3813, "step": 69775 }, { "epoch": 2.5148664720510325, "grad_norm": 0.18303707242012024, "learning_rate": 2.598723965921246e-05, "loss": 0.4201, "step": 69780 }, { "epoch": 2.5150466717122573, "grad_norm": 0.1895645707845688, "learning_rate": 2.5984323795039057e-05, "loss": 0.362, "step": 69785 }, { "epoch": 2.515226871373482, "grad_norm": 0.20545552670955658, "learning_rate": 2.5981407917454427e-05, "loss": 0.384, "step": 69790 }, { "epoch": 2.5154070710347067, "grad_norm": 0.20499677956104279, "learning_rate": 2.5978492026498308e-05, "loss": 0.3918, "step": 69795 }, { "epoch": 2.515587270695931, "grad_norm": 0.22519873082637787, "learning_rate": 2.5975576122210422e-05, "loss": 0.4266, "step": 69800 }, { "epoch": 2.5157674703571558, "grad_norm": 0.21015673875808716, "learning_rate": 2.59726602046305e-05, "loss": 0.3873, "step": 69805 }, { "epoch": 2.5159476700183805, "grad_norm": 0.19208891689777374, "learning_rate": 2.5969744273798274e-05, "loss": 0.397, "step": 69810 }, { "epoch": 
2.516127869679605, "grad_norm": 0.23260699212551117, "learning_rate": 2.5966828329753467e-05, "loss": 0.4168, "step": 69815 }, { "epoch": 2.5163080693408295, "grad_norm": 0.2245369702577591, "learning_rate": 2.596391237253582e-05, "loss": 0.4131, "step": 69820 }, { "epoch": 2.5164882690020542, "grad_norm": 0.17223899066448212, "learning_rate": 2.5960996402185043e-05, "loss": 0.373, "step": 69825 }, { "epoch": 2.516668468663279, "grad_norm": 0.23170410096645355, "learning_rate": 2.595808041874088e-05, "loss": 0.4081, "step": 69830 }, { "epoch": 2.5168486683245037, "grad_norm": 0.21683074533939362, "learning_rate": 2.5955164422243054e-05, "loss": 0.4064, "step": 69835 }, { "epoch": 2.5170288679857284, "grad_norm": 0.16481082141399384, "learning_rate": 2.5952248412731305e-05, "loss": 0.3402, "step": 69840 }, { "epoch": 2.5172090676469527, "grad_norm": 0.15683116018772125, "learning_rate": 2.594933239024535e-05, "loss": 0.4006, "step": 69845 }, { "epoch": 2.5173892673081775, "grad_norm": 0.18406714498996735, "learning_rate": 2.5946416354824927e-05, "loss": 0.3865, "step": 69850 }, { "epoch": 2.517569466969402, "grad_norm": 0.19561605155467987, "learning_rate": 2.5943500306509765e-05, "loss": 0.4131, "step": 69855 }, { "epoch": 2.5177496666306265, "grad_norm": 0.19253993034362793, "learning_rate": 2.5940584245339593e-05, "loss": 0.3926, "step": 69860 }, { "epoch": 2.5179298662918512, "grad_norm": 0.24067455530166626, "learning_rate": 2.5937668171354145e-05, "loss": 0.3809, "step": 69865 }, { "epoch": 2.518110065953076, "grad_norm": 0.17989800870418549, "learning_rate": 2.5934752084593143e-05, "loss": 0.3891, "step": 69870 }, { "epoch": 2.5182902656143007, "grad_norm": 0.21168993413448334, "learning_rate": 2.593183598509633e-05, "loss": 0.3663, "step": 69875 }, { "epoch": 2.5184704652755254, "grad_norm": 0.19849541783332825, "learning_rate": 2.5928919872903434e-05, "loss": 0.4195, "step": 69880 }, { "epoch": 2.51865066493675, "grad_norm": 0.22839267551898956, 
"learning_rate": 2.592600374805418e-05, "loss": 0.4412, "step": 69885 }, { "epoch": 2.5188308645979745, "grad_norm": 0.2030458301305771, "learning_rate": 2.5923087610588305e-05, "loss": 0.3982, "step": 69890 }, { "epoch": 2.519011064259199, "grad_norm": 0.161631777882576, "learning_rate": 2.592017146054554e-05, "loss": 0.3629, "step": 69895 }, { "epoch": 2.519191263920424, "grad_norm": 0.21375270187854767, "learning_rate": 2.5917255297965625e-05, "loss": 0.3821, "step": 69900 }, { "epoch": 2.519371463581648, "grad_norm": 0.19490370154380798, "learning_rate": 2.5914339122888272e-05, "loss": 0.411, "step": 69905 }, { "epoch": 2.519551663242873, "grad_norm": 0.2110588401556015, "learning_rate": 2.591142293535323e-05, "loss": 0.3623, "step": 69910 }, { "epoch": 2.5197318629040977, "grad_norm": 0.21977630257606506, "learning_rate": 2.5908506735400223e-05, "loss": 0.4114, "step": 69915 }, { "epoch": 2.5199120625653224, "grad_norm": 0.20621120929718018, "learning_rate": 2.5905590523068995e-05, "loss": 0.3517, "step": 69920 }, { "epoch": 2.520092262226547, "grad_norm": 0.20464351773262024, "learning_rate": 2.5902674298399273e-05, "loss": 0.3803, "step": 69925 }, { "epoch": 2.520272461887772, "grad_norm": 0.20108066499233246, "learning_rate": 2.5899758061430777e-05, "loss": 0.4085, "step": 69930 }, { "epoch": 2.520452661548996, "grad_norm": 0.2749893367290497, "learning_rate": 2.5896841812203265e-05, "loss": 0.3936, "step": 69935 }, { "epoch": 2.520632861210221, "grad_norm": 0.21574847400188446, "learning_rate": 2.589392555075645e-05, "loss": 0.3946, "step": 69940 }, { "epoch": 2.5208130608714456, "grad_norm": 0.17388522624969482, "learning_rate": 2.5891009277130073e-05, "loss": 0.4072, "step": 69945 }, { "epoch": 2.52099326053267, "grad_norm": 0.21994316577911377, "learning_rate": 2.5888092991363867e-05, "loss": 0.4174, "step": 69950 }, { "epoch": 2.5211734601938947, "grad_norm": 0.17600910365581512, "learning_rate": 2.5885176693497558e-05, "loss": 0.3881, "step": 69955 }, 
{ "epoch": 2.5213536598551194, "grad_norm": 0.19850102066993713, "learning_rate": 2.5882260383570906e-05, "loss": 0.3902, "step": 69960 }, { "epoch": 2.521533859516344, "grad_norm": 0.20933623611927032, "learning_rate": 2.587934406162361e-05, "loss": 0.3793, "step": 69965 }, { "epoch": 2.521714059177569, "grad_norm": 0.2151513695716858, "learning_rate": 2.5876427727695433e-05, "loss": 0.3761, "step": 69970 }, { "epoch": 2.5218942588387936, "grad_norm": 0.19325150549411774, "learning_rate": 2.5873511381826087e-05, "loss": 0.3629, "step": 69975 }, { "epoch": 2.522074458500018, "grad_norm": 0.1916101574897766, "learning_rate": 2.5870595024055328e-05, "loss": 0.3827, "step": 69980 }, { "epoch": 2.5222546581612426, "grad_norm": 0.17457756400108337, "learning_rate": 2.586767865442288e-05, "loss": 0.4084, "step": 69985 }, { "epoch": 2.5224348578224673, "grad_norm": 0.19067886471748352, "learning_rate": 2.586476227296847e-05, "loss": 0.3825, "step": 69990 }, { "epoch": 2.522615057483692, "grad_norm": 0.18429118394851685, "learning_rate": 2.586184587973185e-05, "loss": 0.3885, "step": 69995 }, { "epoch": 2.5227952571449164, "grad_norm": 0.18464790284633636, "learning_rate": 2.5858929474752734e-05, "loss": 0.4124, "step": 70000 }, { "epoch": 2.5227952571449164, "eval_loss": 0.43379154801368713, "eval_runtime": 3.5327, "eval_samples_per_second": 28.307, "eval_steps_per_second": 7.077, "step": 70000 }, { "epoch": 2.522975456806141, "grad_norm": 0.20850814878940582, "learning_rate": 2.5856013058070888e-05, "loss": 0.3697, "step": 70005 }, { "epoch": 2.523155656467366, "grad_norm": 0.23533137142658234, "learning_rate": 2.5853096629726022e-05, "loss": 0.4432, "step": 70010 }, { "epoch": 2.5233358561285906, "grad_norm": 0.22087322175502777, "learning_rate": 2.5850180189757878e-05, "loss": 0.3875, "step": 70015 }, { "epoch": 2.5235160557898153, "grad_norm": 0.23013746738433838, "learning_rate": 2.584726373820619e-05, "loss": 0.3855, "step": 70020 }, { "epoch": 2.5236962554510396, 
"grad_norm": 0.17639127373695374, "learning_rate": 2.5844347275110702e-05, "loss": 0.3714, "step": 70025 }, { "epoch": 2.5238764551122643, "grad_norm": 0.18888616561889648, "learning_rate": 2.5841430800511145e-05, "loss": 0.4043, "step": 70030 }, { "epoch": 2.524056654773489, "grad_norm": 0.2257877141237259, "learning_rate": 2.5838514314447255e-05, "loss": 0.3873, "step": 70035 }, { "epoch": 2.524236854434714, "grad_norm": 0.20586729049682617, "learning_rate": 2.5835597816958774e-05, "loss": 0.406, "step": 70040 }, { "epoch": 2.524417054095938, "grad_norm": 0.2077612429857254, "learning_rate": 2.583268130808543e-05, "loss": 0.3753, "step": 70045 }, { "epoch": 2.524597253757163, "grad_norm": 0.17256224155426025, "learning_rate": 2.5829764787866974e-05, "loss": 0.35, "step": 70050 }, { "epoch": 2.5247774534183876, "grad_norm": 0.28003284335136414, "learning_rate": 2.5826848256343116e-05, "loss": 0.4164, "step": 70055 }, { "epoch": 2.5249576530796123, "grad_norm": 0.234117329120636, "learning_rate": 2.582393171355363e-05, "loss": 0.3949, "step": 70060 }, { "epoch": 2.525137852740837, "grad_norm": 0.21253302693367004, "learning_rate": 2.582101515953822e-05, "loss": 0.3964, "step": 70065 }, { "epoch": 2.5253180524020618, "grad_norm": 0.2189071625471115, "learning_rate": 2.5818098594336636e-05, "loss": 0.3859, "step": 70070 }, { "epoch": 2.525498252063286, "grad_norm": 0.2182706594467163, "learning_rate": 2.5815182017988626e-05, "loss": 0.3934, "step": 70075 }, { "epoch": 2.5256784517245108, "grad_norm": 0.1897723227739334, "learning_rate": 2.5812265430533916e-05, "loss": 0.4036, "step": 70080 }, { "epoch": 2.5258586513857355, "grad_norm": 0.20511159300804138, "learning_rate": 2.5809348832012253e-05, "loss": 0.4152, "step": 70085 }, { "epoch": 2.52603885104696, "grad_norm": 0.21636447310447693, "learning_rate": 2.5806432222463356e-05, "loss": 0.379, "step": 70090 }, { "epoch": 2.5262190507081845, "grad_norm": 0.2109370082616806, "learning_rate": 2.580351560192698e-05, 
"loss": 0.386, "step": 70095 }, { "epoch": 2.5263992503694093, "grad_norm": 0.20028775930404663, "learning_rate": 2.5800598970442862e-05, "loss": 0.4101, "step": 70100 }, { "epoch": 2.526579450030634, "grad_norm": 0.2018347680568695, "learning_rate": 2.5797682328050744e-05, "loss": 0.3857, "step": 70105 }, { "epoch": 2.5267596496918587, "grad_norm": 0.2127305567264557, "learning_rate": 2.579476567479035e-05, "loss": 0.3927, "step": 70110 }, { "epoch": 2.5269398493530835, "grad_norm": 0.19696669280529022, "learning_rate": 2.579184901070143e-05, "loss": 0.4074, "step": 70115 }, { "epoch": 2.5271200490143078, "grad_norm": 0.2948433458805084, "learning_rate": 2.578893233582372e-05, "loss": 0.3763, "step": 70120 }, { "epoch": 2.5273002486755325, "grad_norm": 0.1763259321451187, "learning_rate": 2.578601565019696e-05, "loss": 0.3824, "step": 70125 }, { "epoch": 2.5274804483367572, "grad_norm": 0.22030363976955414, "learning_rate": 2.5783098953860883e-05, "loss": 0.424, "step": 70130 }, { "epoch": 2.5276606479979815, "grad_norm": 0.2222214639186859, "learning_rate": 2.5780182246855245e-05, "loss": 0.421, "step": 70135 }, { "epoch": 2.5278408476592062, "grad_norm": 0.2385231852531433, "learning_rate": 2.5777265529219767e-05, "loss": 0.3931, "step": 70140 }, { "epoch": 2.528021047320431, "grad_norm": 0.1919146180152893, "learning_rate": 2.577434880099421e-05, "loss": 0.3618, "step": 70145 }, { "epoch": 2.5282012469816557, "grad_norm": 0.21808387339115143, "learning_rate": 2.5771432062218286e-05, "loss": 0.4243, "step": 70150 }, { "epoch": 2.5283814466428804, "grad_norm": 0.1588166505098343, "learning_rate": 2.576851531293176e-05, "loss": 0.3825, "step": 70155 }, { "epoch": 2.528561646304105, "grad_norm": 0.20755480229854584, "learning_rate": 2.576559855317435e-05, "loss": 0.4111, "step": 70160 }, { "epoch": 2.5287418459653295, "grad_norm": 0.22904981672763824, "learning_rate": 2.5762681782985816e-05, "loss": 0.3904, "step": 70165 }, { "epoch": 2.528922045626554, 
"grad_norm": 0.18714340031147003, "learning_rate": 2.575976500240589e-05, "loss": 0.3692, "step": 70170 }, { "epoch": 2.529102245287779, "grad_norm": 0.2257447987794876, "learning_rate": 2.5756848211474306e-05, "loss": 0.398, "step": 70175 }, { "epoch": 2.5292824449490032, "grad_norm": 0.21989066898822784, "learning_rate": 2.575393141023082e-05, "loss": 0.3885, "step": 70180 }, { "epoch": 2.529462644610228, "grad_norm": 0.21285291016101837, "learning_rate": 2.5751014598715155e-05, "loss": 0.3877, "step": 70185 }, { "epoch": 2.5296428442714527, "grad_norm": 0.20206022262573242, "learning_rate": 2.5748097776967073e-05, "loss": 0.4259, "step": 70190 }, { "epoch": 2.5298230439326774, "grad_norm": 0.18147850036621094, "learning_rate": 2.5745180945026298e-05, "loss": 0.3914, "step": 70195 }, { "epoch": 2.530003243593902, "grad_norm": 0.18609163165092468, "learning_rate": 2.574226410293258e-05, "loss": 0.4179, "step": 70200 }, { "epoch": 2.530183443255127, "grad_norm": 0.1647646427154541, "learning_rate": 2.573934725072565e-05, "loss": 0.3955, "step": 70205 }, { "epoch": 2.530363642916351, "grad_norm": 0.21977296471595764, "learning_rate": 2.573643038844526e-05, "loss": 0.3912, "step": 70210 }, { "epoch": 2.530543842577576, "grad_norm": 0.18793781101703644, "learning_rate": 2.5733513516131153e-05, "loss": 0.3975, "step": 70215 }, { "epoch": 2.5307240422388007, "grad_norm": 0.18933290243148804, "learning_rate": 2.573059663382306e-05, "loss": 0.4017, "step": 70220 }, { "epoch": 2.5309042419000254, "grad_norm": 0.17556174099445343, "learning_rate": 2.5727679741560734e-05, "loss": 0.4003, "step": 70225 }, { "epoch": 2.5310844415612497, "grad_norm": 0.18023037910461426, "learning_rate": 2.5724762839383915e-05, "loss": 0.4288, "step": 70230 }, { "epoch": 2.5312646412224744, "grad_norm": 0.1804884523153305, "learning_rate": 2.5721845927332333e-05, "loss": 0.3854, "step": 70235 }, { "epoch": 2.531444840883699, "grad_norm": 0.26398801803588867, "learning_rate": 
2.5718929005445746e-05, "loss": 0.4287, "step": 70240 }, { "epoch": 2.531625040544924, "grad_norm": 0.2065039724111557, "learning_rate": 2.5716012073763883e-05, "loss": 0.397, "step": 70245 }, { "epoch": 2.5318052402061486, "grad_norm": 0.1797478199005127, "learning_rate": 2.5713095132326515e-05, "loss": 0.3692, "step": 70250 }, { "epoch": 2.531985439867373, "grad_norm": 0.19305941462516785, "learning_rate": 2.5710178181173344e-05, "loss": 0.3386, "step": 70255 }, { "epoch": 2.5321656395285976, "grad_norm": 0.2119571417570114, "learning_rate": 2.5707261220344143e-05, "loss": 0.4244, "step": 70260 }, { "epoch": 2.5323458391898224, "grad_norm": 0.21070972084999084, "learning_rate": 2.5704344249878637e-05, "loss": 0.3988, "step": 70265 }, { "epoch": 2.532526038851047, "grad_norm": 0.2026742547750473, "learning_rate": 2.570142726981658e-05, "loss": 0.4084, "step": 70270 }, { "epoch": 2.5327062385122714, "grad_norm": 0.16865421831607819, "learning_rate": 2.569851028019771e-05, "loss": 0.3662, "step": 70275 }, { "epoch": 2.532886438173496, "grad_norm": 0.23617038130760193, "learning_rate": 2.5695593281061774e-05, "loss": 0.3986, "step": 70280 }, { "epoch": 2.533066637834721, "grad_norm": 0.2046816498041153, "learning_rate": 2.5692676272448517e-05, "loss": 0.3803, "step": 70285 }, { "epoch": 2.5332468374959456, "grad_norm": 0.18710558116436005, "learning_rate": 2.5689759254397683e-05, "loss": 0.3968, "step": 70290 }, { "epoch": 2.5334270371571703, "grad_norm": 0.21419739723205566, "learning_rate": 2.5686842226949008e-05, "loss": 0.4224, "step": 70295 }, { "epoch": 2.533607236818395, "grad_norm": 0.18646739423274994, "learning_rate": 2.568392519014224e-05, "loss": 0.4011, "step": 70300 }, { "epoch": 2.5337874364796193, "grad_norm": 0.19873277842998505, "learning_rate": 2.5681008144017128e-05, "loss": 0.3635, "step": 70305 }, { "epoch": 2.533967636140844, "grad_norm": 0.18021991848945618, "learning_rate": 2.5678091088613408e-05, "loss": 0.3867, "step": 70310 }, { "epoch": 
2.534147835802069, "grad_norm": 0.17604483664035797, "learning_rate": 2.5675174023970826e-05, "loss": 0.3857, "step": 70315 }, { "epoch": 2.534328035463293, "grad_norm": 0.2240837961435318, "learning_rate": 2.567225695012913e-05, "loss": 0.3711, "step": 70320 }, { "epoch": 2.534508235124518, "grad_norm": 0.1983606368303299, "learning_rate": 2.566933986712806e-05, "loss": 0.3828, "step": 70325 }, { "epoch": 2.5346884347857426, "grad_norm": 0.22788797318935394, "learning_rate": 2.566642277500738e-05, "loss": 0.3859, "step": 70330 }, { "epoch": 2.5348686344469673, "grad_norm": 0.2181435376405716, "learning_rate": 2.5663505673806805e-05, "loss": 0.4207, "step": 70335 }, { "epoch": 2.535048834108192, "grad_norm": 0.19292283058166504, "learning_rate": 2.5660588563566097e-05, "loss": 0.4102, "step": 70340 }, { "epoch": 2.5352290337694168, "grad_norm": 0.18651723861694336, "learning_rate": 2.5657671444324994e-05, "loss": 0.4028, "step": 70345 }, { "epoch": 2.535409233430641, "grad_norm": 0.20241175591945648, "learning_rate": 2.5654754316123248e-05, "loss": 0.4, "step": 70350 }, { "epoch": 2.535589433091866, "grad_norm": 0.19530391693115234, "learning_rate": 2.5651837179000605e-05, "loss": 0.3573, "step": 70355 }, { "epoch": 2.5357696327530905, "grad_norm": 0.1831565499305725, "learning_rate": 2.5648920032996794e-05, "loss": 0.3696, "step": 70360 }, { "epoch": 2.535949832414315, "grad_norm": 0.22338707745075226, "learning_rate": 2.5646002878151586e-05, "loss": 0.3642, "step": 70365 }, { "epoch": 2.5361300320755396, "grad_norm": 0.23138903081417084, "learning_rate": 2.5643669147936305e-05, "loss": 0.3546, "step": 70370 }, { "epoch": 2.5363102317367643, "grad_norm": 0.18024463951587677, "learning_rate": 2.5640751977276717e-05, "loss": 0.377, "step": 70375 }, { "epoch": 2.536490431397989, "grad_norm": 0.19997356832027435, "learning_rate": 2.5637834797887005e-05, "loss": 0.3983, "step": 70380 }, { "epoch": 2.5366706310592138, "grad_norm": 0.20950496196746826, "learning_rate": 
2.5634917609806917e-05, "loss": 0.3991, "step": 70385 }, { "epoch": 2.5368508307204385, "grad_norm": 0.20799300074577332, "learning_rate": 2.56320004130762e-05, "loss": 0.3923, "step": 70390 }, { "epoch": 2.5370310303816628, "grad_norm": 0.23899616301059723, "learning_rate": 2.5629083207734595e-05, "loss": 0.4264, "step": 70395 }, { "epoch": 2.5372112300428875, "grad_norm": 0.18154381215572357, "learning_rate": 2.5626165993821866e-05, "loss": 0.3814, "step": 70400 }, { "epoch": 2.5373914297041122, "grad_norm": 0.1529223769903183, "learning_rate": 2.5623248771377733e-05, "loss": 0.3888, "step": 70405 }, { "epoch": 2.5375716293653365, "grad_norm": 0.18600383400917053, "learning_rate": 2.5620331540441956e-05, "loss": 0.3829, "step": 70410 }, { "epoch": 2.5377518290265613, "grad_norm": 0.17343242466449738, "learning_rate": 2.5617414301054288e-05, "loss": 0.3795, "step": 70415 }, { "epoch": 2.537932028687786, "grad_norm": 0.33770492672920227, "learning_rate": 2.5614497053254464e-05, "loss": 0.3925, "step": 70420 }, { "epoch": 2.5381122283490107, "grad_norm": 0.20434433221817017, "learning_rate": 2.5611579797082252e-05, "loss": 0.3664, "step": 70425 }, { "epoch": 2.5382924280102355, "grad_norm": 0.2171066850423813, "learning_rate": 2.5608662532577367e-05, "loss": 0.403, "step": 70430 }, { "epoch": 2.53847262767146, "grad_norm": 0.21163609623908997, "learning_rate": 2.5605745259779578e-05, "loss": 0.4073, "step": 70435 }, { "epoch": 2.5386528273326845, "grad_norm": 0.19679218530654907, "learning_rate": 2.5602827978728626e-05, "loss": 0.3859, "step": 70440 }, { "epoch": 2.5388330269939092, "grad_norm": 0.19141094386577606, "learning_rate": 2.5599910689464263e-05, "loss": 0.3611, "step": 70445 }, { "epoch": 2.539013226655134, "grad_norm": 0.18265005946159363, "learning_rate": 2.559699339202623e-05, "loss": 0.4121, "step": 70450 }, { "epoch": 2.5391934263163582, "grad_norm": 0.1961100548505783, "learning_rate": 2.559407608645428e-05, "loss": 0.387, "step": 70455 }, { 
"epoch": 2.539373625977583, "grad_norm": 0.1709757298231125, "learning_rate": 2.559115877278816e-05, "loss": 0.3952, "step": 70460 }, { "epoch": 2.5395538256388077, "grad_norm": 0.193019837141037, "learning_rate": 2.558824145106761e-05, "loss": 0.4121, "step": 70465 }, { "epoch": 2.5397340253000324, "grad_norm": 0.23720510303974152, "learning_rate": 2.5585324121332394e-05, "loss": 0.3731, "step": 70470 }, { "epoch": 2.539914224961257, "grad_norm": 0.21066009998321533, "learning_rate": 2.558240678362224e-05, "loss": 0.4199, "step": 70475 }, { "epoch": 2.540094424622482, "grad_norm": 0.23059865832328796, "learning_rate": 2.557948943797691e-05, "loss": 0.3951, "step": 70480 }, { "epoch": 2.540274624283706, "grad_norm": 0.19941085577011108, "learning_rate": 2.5576572084436153e-05, "loss": 0.4155, "step": 70485 }, { "epoch": 2.540454823944931, "grad_norm": 0.20750625431537628, "learning_rate": 2.5573654723039704e-05, "loss": 0.3913, "step": 70490 }, { "epoch": 2.5406350236061557, "grad_norm": 0.1879345029592514, "learning_rate": 2.5570737353827323e-05, "loss": 0.4201, "step": 70495 }, { "epoch": 2.5408152232673804, "grad_norm": 0.217643141746521, "learning_rate": 2.5567819976838752e-05, "loss": 0.3992, "step": 70500 }, { "epoch": 2.5408152232673804, "eval_loss": 0.4338361620903015, "eval_runtime": 3.5352, "eval_samples_per_second": 28.287, "eval_steps_per_second": 7.072, "step": 70500 }, { "epoch": 2.5409954229286047, "grad_norm": 0.24064627289772034, "learning_rate": 2.556490259211376e-05, "loss": 0.3785, "step": 70505 }, { "epoch": 2.5411756225898294, "grad_norm": 0.18428386747837067, "learning_rate": 2.5561985199692062e-05, "loss": 0.4018, "step": 70510 }, { "epoch": 2.541355822251054, "grad_norm": 0.20441985130310059, "learning_rate": 2.5559067799613434e-05, "loss": 0.3607, "step": 70515 }, { "epoch": 2.541536021912279, "grad_norm": 0.20763367414474487, "learning_rate": 2.555615039191761e-05, "loss": 0.3537, "step": 70520 }, { "epoch": 2.5417162215735036, 
"grad_norm": 0.2506287693977356, "learning_rate": 2.5553232976644352e-05, "loss": 0.382, "step": 70525 }, { "epoch": 2.541896421234728, "grad_norm": 0.2257665991783142, "learning_rate": 2.55503155538334e-05, "loss": 0.4206, "step": 70530 }, { "epoch": 2.5420766208959527, "grad_norm": 0.2143712192773819, "learning_rate": 2.5547398123524495e-05, "loss": 0.3677, "step": 70535 }, { "epoch": 2.5422568205571774, "grad_norm": 0.21773605048656464, "learning_rate": 2.554448068575741e-05, "loss": 0.4063, "step": 70540 }, { "epoch": 2.542437020218402, "grad_norm": 0.19206026196479797, "learning_rate": 2.5541563240571877e-05, "loss": 0.4034, "step": 70545 }, { "epoch": 2.5426172198796264, "grad_norm": 0.19149447977542877, "learning_rate": 2.553864578800765e-05, "loss": 0.377, "step": 70550 }, { "epoch": 2.542797419540851, "grad_norm": 0.18062478303909302, "learning_rate": 2.553572832810447e-05, "loss": 0.3932, "step": 70555 }, { "epoch": 2.542977619202076, "grad_norm": 0.1797667294740677, "learning_rate": 2.55328108609021e-05, "loss": 0.3771, "step": 70560 }, { "epoch": 2.5431578188633006, "grad_norm": 0.25208544731140137, "learning_rate": 2.5529893386440295e-05, "loss": 0.4192, "step": 70565 }, { "epoch": 2.5433380185245253, "grad_norm": 0.2537004351615906, "learning_rate": 2.552697590475878e-05, "loss": 0.3734, "step": 70570 }, { "epoch": 2.54351821818575, "grad_norm": 0.27715036273002625, "learning_rate": 2.5524058415897328e-05, "loss": 0.3997, "step": 70575 }, { "epoch": 2.5436984178469744, "grad_norm": 0.19086778163909912, "learning_rate": 2.552114091989568e-05, "loss": 0.3765, "step": 70580 }, { "epoch": 2.543878617508199, "grad_norm": 0.22903601825237274, "learning_rate": 2.5518223416793592e-05, "loss": 0.4433, "step": 70585 }, { "epoch": 2.544058817169424, "grad_norm": 0.21384993195533752, "learning_rate": 2.5515305906630805e-05, "loss": 0.4386, "step": 70590 }, { "epoch": 2.544239016830648, "grad_norm": 0.2099718600511551, "learning_rate": 2.5512388389447074e-05, 
"loss": 0.4044, "step": 70595 }, { "epoch": 2.544419216491873, "grad_norm": 0.20232221484184265, "learning_rate": 2.5509470865282155e-05, "loss": 0.4041, "step": 70600 }, { "epoch": 2.5445994161530976, "grad_norm": 0.19650457799434662, "learning_rate": 2.550655333417579e-05, "loss": 0.3656, "step": 70605 }, { "epoch": 2.5447796158143223, "grad_norm": 0.25365570187568665, "learning_rate": 2.550363579616774e-05, "loss": 0.3818, "step": 70610 }, { "epoch": 2.544959815475547, "grad_norm": 0.19008086621761322, "learning_rate": 2.5500718251297746e-05, "loss": 0.4, "step": 70615 }, { "epoch": 2.545140015136772, "grad_norm": 0.17244011163711548, "learning_rate": 2.5497800699605563e-05, "loss": 0.4169, "step": 70620 }, { "epoch": 2.545320214797996, "grad_norm": 0.21005220711231232, "learning_rate": 2.5494883141130938e-05, "loss": 0.3926, "step": 70625 }, { "epoch": 2.545500414459221, "grad_norm": 0.15076076984405518, "learning_rate": 2.5491965575913635e-05, "loss": 0.3612, "step": 70630 }, { "epoch": 2.5456806141204456, "grad_norm": 0.2208922952413559, "learning_rate": 2.548904800399339e-05, "loss": 0.4058, "step": 70635 }, { "epoch": 2.54586081378167, "grad_norm": 0.2186022400856018, "learning_rate": 2.548613042540996e-05, "loss": 0.4301, "step": 70640 }, { "epoch": 2.5460410134428946, "grad_norm": 0.2868672311306, "learning_rate": 2.5483212840203097e-05, "loss": 0.3947, "step": 70645 }, { "epoch": 2.5462212131041193, "grad_norm": 0.22895467281341553, "learning_rate": 2.5480295248412555e-05, "loss": 0.4076, "step": 70650 }, { "epoch": 2.546401412765344, "grad_norm": 0.23754490911960602, "learning_rate": 2.5477377650078087e-05, "loss": 0.3852, "step": 70655 }, { "epoch": 2.5465816124265688, "grad_norm": 0.17783097922801971, "learning_rate": 2.5474460045239435e-05, "loss": 0.408, "step": 70660 }, { "epoch": 2.5467618120877935, "grad_norm": 0.21512363851070404, "learning_rate": 2.5471542433936358e-05, "loss": 0.4015, "step": 70665 }, { "epoch": 2.546942011749018, "grad_norm": 
0.2190975397825241, "learning_rate": 2.546862481620861e-05, "loss": 0.3845, "step": 70670 }, { "epoch": 2.5471222114102425, "grad_norm": 0.19571487605571747, "learning_rate": 2.5465707192095927e-05, "loss": 0.4478, "step": 70675 }, { "epoch": 2.5473024110714673, "grad_norm": 0.2442859262228012, "learning_rate": 2.546278956163809e-05, "loss": 0.3756, "step": 70680 }, { "epoch": 2.5474826107326916, "grad_norm": 0.20132243633270264, "learning_rate": 2.5459871924874822e-05, "loss": 0.3883, "step": 70685 }, { "epoch": 2.5476628103939163, "grad_norm": 0.22069872915744781, "learning_rate": 2.54569542818459e-05, "loss": 0.4054, "step": 70690 }, { "epoch": 2.547843010055141, "grad_norm": 0.1703534722328186, "learning_rate": 2.5454036632591055e-05, "loss": 0.4059, "step": 70695 }, { "epoch": 2.5480232097163658, "grad_norm": 0.1875164955854416, "learning_rate": 2.5451118977150053e-05, "loss": 0.3853, "step": 70700 }, { "epoch": 2.5482034093775905, "grad_norm": 0.22562925517559052, "learning_rate": 2.544820131556264e-05, "loss": 0.4122, "step": 70705 }, { "epoch": 2.5483836090388152, "grad_norm": 0.1860308051109314, "learning_rate": 2.5445283647868574e-05, "loss": 0.3849, "step": 70710 }, { "epoch": 2.5485638087000395, "grad_norm": 0.21737992763519287, "learning_rate": 2.54423659741076e-05, "loss": 0.4249, "step": 70715 }, { "epoch": 2.5487440083612642, "grad_norm": 0.2317013442516327, "learning_rate": 2.543944829431948e-05, "loss": 0.3782, "step": 70720 }, { "epoch": 2.548924208022489, "grad_norm": 0.183481365442276, "learning_rate": 2.543653060854396e-05, "loss": 0.4066, "step": 70725 }, { "epoch": 2.5491044076837137, "grad_norm": 0.2388121783733368, "learning_rate": 2.5433612916820798e-05, "loss": 0.3918, "step": 70730 }, { "epoch": 2.549284607344938, "grad_norm": 0.22414498031139374, "learning_rate": 2.5430695219189738e-05, "loss": 0.4132, "step": 70735 }, { "epoch": 2.5494648070061627, "grad_norm": 0.18682889640331268, "learning_rate": 2.5427777515690543e-05, "loss": 
0.4106, "step": 70740 }, { "epoch": 2.5496450066673875, "grad_norm": 0.18605414032936096, "learning_rate": 2.5424859806362954e-05, "loss": 0.3922, "step": 70745 }, { "epoch": 2.549825206328612, "grad_norm": 0.19950343668460846, "learning_rate": 2.542194209124675e-05, "loss": 0.3529, "step": 70750 }, { "epoch": 2.550005405989837, "grad_norm": 0.22607798874378204, "learning_rate": 2.541902437038165e-05, "loss": 0.3833, "step": 70755 }, { "epoch": 2.5501856056510612, "grad_norm": 0.24269573390483856, "learning_rate": 2.5416106643807434e-05, "loss": 0.4602, "step": 70760 }, { "epoch": 2.550365805312286, "grad_norm": 0.22042149305343628, "learning_rate": 2.541318891156384e-05, "loss": 0.385, "step": 70765 }, { "epoch": 2.5505460049735107, "grad_norm": 0.21216388046741486, "learning_rate": 2.541027117369063e-05, "loss": 0.4038, "step": 70770 }, { "epoch": 2.5507262046347354, "grad_norm": 0.21207298338413239, "learning_rate": 2.5407353430227554e-05, "loss": 0.4195, "step": 70775 }, { "epoch": 2.5509064042959597, "grad_norm": 0.219013512134552, "learning_rate": 2.540443568121436e-05, "loss": 0.4065, "step": 70780 }, { "epoch": 2.5510866039571845, "grad_norm": 0.23736967146396637, "learning_rate": 2.5401517926690816e-05, "loss": 0.3827, "step": 70785 }, { "epoch": 2.551266803618409, "grad_norm": 0.20479167997837067, "learning_rate": 2.5398600166696668e-05, "loss": 0.4108, "step": 70790 }, { "epoch": 2.551447003279634, "grad_norm": 0.24934591352939606, "learning_rate": 2.539568240127167e-05, "loss": 0.3904, "step": 70795 }, { "epoch": 2.5516272029408587, "grad_norm": 0.20699314773082733, "learning_rate": 2.5392764630455574e-05, "loss": 0.4098, "step": 70800 }, { "epoch": 2.5518074026020834, "grad_norm": 0.22364868223667145, "learning_rate": 2.5389846854288135e-05, "loss": 0.4066, "step": 70805 }, { "epoch": 2.5519876022633077, "grad_norm": 0.23530325293540955, "learning_rate": 2.5386929072809107e-05, "loss": 0.4167, "step": 70810 }, { "epoch": 2.5521678019245324, 
"grad_norm": 0.2264833003282547, "learning_rate": 2.5384011286058245e-05, "loss": 0.3679, "step": 70815 }, { "epoch": 2.552348001585757, "grad_norm": 0.20546197891235352, "learning_rate": 2.5381093494075308e-05, "loss": 0.3812, "step": 70820 }, { "epoch": 2.5525282012469814, "grad_norm": 0.1657361537218094, "learning_rate": 2.5378175696900042e-05, "loss": 0.3598, "step": 70825 }, { "epoch": 2.552708400908206, "grad_norm": 0.24184440076351166, "learning_rate": 2.5375257894572207e-05, "loss": 0.3985, "step": 70830 }, { "epoch": 2.552888600569431, "grad_norm": 0.22822557389736176, "learning_rate": 2.537234008713156e-05, "loss": 0.3811, "step": 70835 }, { "epoch": 2.5530688002306556, "grad_norm": 0.2141774296760559, "learning_rate": 2.5369422274617844e-05, "loss": 0.4142, "step": 70840 }, { "epoch": 2.5532489998918804, "grad_norm": 0.16231632232666016, "learning_rate": 2.5366504457070826e-05, "loss": 0.3693, "step": 70845 }, { "epoch": 2.553429199553105, "grad_norm": 0.1983339488506317, "learning_rate": 2.536358663453025e-05, "loss": 0.4081, "step": 70850 }, { "epoch": 2.5536093992143294, "grad_norm": 0.21507583558559418, "learning_rate": 2.5360668807035887e-05, "loss": 0.3717, "step": 70855 }, { "epoch": 2.553789598875554, "grad_norm": 0.2187241017818451, "learning_rate": 2.535775097462747e-05, "loss": 0.3672, "step": 70860 }, { "epoch": 2.553969798536779, "grad_norm": 0.20706336200237274, "learning_rate": 2.5354833137344773e-05, "loss": 0.407, "step": 70865 }, { "epoch": 2.554149998198003, "grad_norm": 0.21873357892036438, "learning_rate": 2.5351915295227535e-05, "loss": 0.4204, "step": 70870 }, { "epoch": 2.554330197859228, "grad_norm": 0.1820431351661682, "learning_rate": 2.534899744831553e-05, "loss": 0.3873, "step": 70875 }, { "epoch": 2.5545103975204526, "grad_norm": 0.17599505186080933, "learning_rate": 2.5346079596648497e-05, "loss": 0.3794, "step": 70880 }, { "epoch": 2.5546905971816773, "grad_norm": 0.2262696623802185, "learning_rate": 2.534316174026619e-05, 
"loss": 0.4126, "step": 70885 }, { "epoch": 2.554870796842902, "grad_norm": 0.23906999826431274, "learning_rate": 2.534024387920838e-05, "loss": 0.3999, "step": 70890 }, { "epoch": 2.555050996504127, "grad_norm": 0.16976365447044373, "learning_rate": 2.5337326013514817e-05, "loss": 0.3864, "step": 70895 }, { "epoch": 2.555231196165351, "grad_norm": 0.2087818831205368, "learning_rate": 2.5334408143225246e-05, "loss": 0.4226, "step": 70900 }, { "epoch": 2.555411395826576, "grad_norm": 0.21390338242053986, "learning_rate": 2.533149026837942e-05, "loss": 0.3674, "step": 70905 }, { "epoch": 2.5555915954878006, "grad_norm": 0.22753164172172546, "learning_rate": 2.532857238901712e-05, "loss": 0.4407, "step": 70910 }, { "epoch": 2.555771795149025, "grad_norm": 0.179393008351326, "learning_rate": 2.5325654505178074e-05, "loss": 0.4156, "step": 70915 }, { "epoch": 2.5559519948102496, "grad_norm": 0.22798627614974976, "learning_rate": 2.53233201949103e-05, "loss": 0.4018, "step": 70920 }, { "epoch": 2.5561321944714743, "grad_norm": 0.203562393784523, "learning_rate": 2.5320402303113327e-05, "loss": 0.4037, "step": 70925 }, { "epoch": 2.556312394132699, "grad_norm": 0.22589832544326782, "learning_rate": 2.531748440695092e-05, "loss": 0.3967, "step": 70930 }, { "epoch": 2.556492593793924, "grad_norm": 0.23674431443214417, "learning_rate": 2.5314566506462856e-05, "loss": 0.3992, "step": 70935 }, { "epoch": 2.5566727934551485, "grad_norm": 0.20412121713161469, "learning_rate": 2.5311648601688876e-05, "loss": 0.4023, "step": 70940 }, { "epoch": 2.556852993116373, "grad_norm": 0.17974796891212463, "learning_rate": 2.5308730692668754e-05, "loss": 0.3787, "step": 70945 }, { "epoch": 2.5570331927775976, "grad_norm": 0.16718092560768127, "learning_rate": 2.5305812779442232e-05, "loss": 0.407, "step": 70950 }, { "epoch": 2.5572133924388223, "grad_norm": 0.16526854038238525, "learning_rate": 2.530289486204907e-05, "loss": 0.3996, "step": 70955 }, { "epoch": 2.5573935921000466, 
"grad_norm": 0.2542569935321808, "learning_rate": 2.529997694052903e-05, "loss": 0.3735, "step": 70960 }, { "epoch": 2.5575737917612713, "grad_norm": 0.21731750667095184, "learning_rate": 2.5297059014921857e-05, "loss": 0.381, "step": 70965 }, { "epoch": 2.557753991422496, "grad_norm": 0.18733660876750946, "learning_rate": 2.5294141085267315e-05, "loss": 0.3858, "step": 70970 }, { "epoch": 2.5579341910837208, "grad_norm": 0.23402203619480133, "learning_rate": 2.5291223151605147e-05, "loss": 0.4118, "step": 70975 }, { "epoch": 2.5581143907449455, "grad_norm": 0.19820962846279144, "learning_rate": 2.5288305213975132e-05, "loss": 0.3861, "step": 70980 }, { "epoch": 2.5582945904061702, "grad_norm": 0.2706112861633301, "learning_rate": 2.5285387272417015e-05, "loss": 0.4072, "step": 70985 }, { "epoch": 2.5584747900673945, "grad_norm": 0.21151851117610931, "learning_rate": 2.5282469326970543e-05, "loss": 0.3774, "step": 70990 }, { "epoch": 2.5586549897286193, "grad_norm": 0.1977061629295349, "learning_rate": 2.5279551377675488e-05, "loss": 0.4204, "step": 70995 }, { "epoch": 2.558835189389844, "grad_norm": 0.19445201754570007, "learning_rate": 2.5276633424571594e-05, "loss": 0.3868, "step": 71000 }, { "epoch": 2.558835189389844, "eval_loss": 0.43283751606941223, "eval_runtime": 3.5328, "eval_samples_per_second": 28.306, "eval_steps_per_second": 7.077, "step": 71000 }, { "epoch": 2.5590153890510687, "grad_norm": 0.17734093964099884, "learning_rate": 2.5273715467698633e-05, "loss": 0.3773, "step": 71005 }, { "epoch": 2.559195588712293, "grad_norm": 0.23937411606311798, "learning_rate": 2.527079750709634e-05, "loss": 0.4013, "step": 71010 }, { "epoch": 2.5593757883735178, "grad_norm": 0.1706862598657608, "learning_rate": 2.5267879542804484e-05, "loss": 0.3689, "step": 71015 }, { "epoch": 2.5595559880347425, "grad_norm": 0.20813287794589996, "learning_rate": 2.526496157486283e-05, "loss": 0.3894, "step": 71020 }, { "epoch": 2.5597361876959672, "grad_norm": 
0.24396944046020508, "learning_rate": 2.526204360331112e-05, "loss": 0.3836, "step": 71025 }, { "epoch": 2.559916387357192, "grad_norm": 0.18609154224395752, "learning_rate": 2.525912562818912e-05, "loss": 0.3683, "step": 71030 }, { "epoch": 2.5600965870184162, "grad_norm": 0.24493709206581116, "learning_rate": 2.525620764953658e-05, "loss": 0.3732, "step": 71035 }, { "epoch": 2.560276786679641, "grad_norm": 0.19621089100837708, "learning_rate": 2.525328966739326e-05, "loss": 0.3917, "step": 71040 }, { "epoch": 2.5604569863408657, "grad_norm": 0.20630283653736115, "learning_rate": 2.525037168179892e-05, "loss": 0.4063, "step": 71045 }, { "epoch": 2.5606371860020904, "grad_norm": 0.2241489142179489, "learning_rate": 2.524745369279331e-05, "loss": 0.4282, "step": 71050 }, { "epoch": 2.5608173856633147, "grad_norm": 0.20686471462249756, "learning_rate": 2.52445357004162e-05, "loss": 0.4066, "step": 71055 }, { "epoch": 2.5609975853245395, "grad_norm": 0.2341819703578949, "learning_rate": 2.5241617704707328e-05, "loss": 0.4269, "step": 71060 }, { "epoch": 2.561177784985764, "grad_norm": 0.17863284051418304, "learning_rate": 2.523869970570647e-05, "loss": 0.4245, "step": 71065 }, { "epoch": 2.561357984646989, "grad_norm": 0.31622689962387085, "learning_rate": 2.523578170345337e-05, "loss": 0.437, "step": 71070 }, { "epoch": 2.5615381843082137, "grad_norm": 0.24306558072566986, "learning_rate": 2.5232863697987796e-05, "loss": 0.4115, "step": 71075 }, { "epoch": 2.5617183839694384, "grad_norm": 0.23051370680332184, "learning_rate": 2.5229945689349487e-05, "loss": 0.4229, "step": 71080 }, { "epoch": 2.5618985836306627, "grad_norm": 0.22686095535755157, "learning_rate": 2.5227027677578224e-05, "loss": 0.3949, "step": 71085 }, { "epoch": 2.5620787832918874, "grad_norm": 0.24122454226016998, "learning_rate": 2.5224109662713752e-05, "loss": 0.382, "step": 71090 }, { "epoch": 2.562258982953112, "grad_norm": 0.19796620309352875, "learning_rate": 2.5221191644795822e-05, "loss": 
0.3849, "step": 71095 }, { "epoch": 2.5624391826143365, "grad_norm": 0.20853497087955475, "learning_rate": 2.5218273623864202e-05, "loss": 0.3628, "step": 71100 }, { "epoch": 2.562619382275561, "grad_norm": 0.24205127358436584, "learning_rate": 2.5215355599958647e-05, "loss": 0.4302, "step": 71105 }, { "epoch": 2.562799581936786, "grad_norm": 0.18552717566490173, "learning_rate": 2.521243757311892e-05, "loss": 0.3786, "step": 71110 }, { "epoch": 2.5629797815980107, "grad_norm": 0.15225747227668762, "learning_rate": 2.5209519543384763e-05, "loss": 0.3853, "step": 71115 }, { "epoch": 2.5631599812592354, "grad_norm": 0.2108543962240219, "learning_rate": 2.5206601510795948e-05, "loss": 0.4087, "step": 71120 }, { "epoch": 2.56334018092046, "grad_norm": 0.26248225569725037, "learning_rate": 2.520368347539222e-05, "loss": 0.3905, "step": 71125 }, { "epoch": 2.5635203805816844, "grad_norm": 0.18176917731761932, "learning_rate": 2.5200765437213347e-05, "loss": 0.3922, "step": 71130 }, { "epoch": 2.563700580242909, "grad_norm": 0.1985873579978943, "learning_rate": 2.519784739629909e-05, "loss": 0.3832, "step": 71135 }, { "epoch": 2.563880779904134, "grad_norm": 0.217198446393013, "learning_rate": 2.519492935268919e-05, "loss": 0.4287, "step": 71140 }, { "epoch": 2.564060979565358, "grad_norm": 0.18571309745311737, "learning_rate": 2.5192011306423424e-05, "loss": 0.3855, "step": 71145 }, { "epoch": 2.564241179226583, "grad_norm": 0.22361566126346588, "learning_rate": 2.518909325754154e-05, "loss": 0.419, "step": 71150 }, { "epoch": 2.5644213788878076, "grad_norm": 0.16923829913139343, "learning_rate": 2.51861752060833e-05, "loss": 0.3755, "step": 71155 }, { "epoch": 2.5646015785490324, "grad_norm": 0.19118502736091614, "learning_rate": 2.518325715208845e-05, "loss": 0.3829, "step": 71160 }, { "epoch": 2.564781778210257, "grad_norm": 0.2314773052930832, "learning_rate": 2.5180339095596755e-05, "loss": 0.3592, "step": 71165 }, { "epoch": 2.564961977871482, "grad_norm": 
0.2128879874944687, "learning_rate": 2.517742103664799e-05, "loss": 0.3965, "step": 71170 }, { "epoch": 2.565142177532706, "grad_norm": 0.20926570892333984, "learning_rate": 2.5174502975281887e-05, "loss": 0.3829, "step": 71175 }, { "epoch": 2.565322377193931, "grad_norm": 0.2084009349346161, "learning_rate": 2.517158491153822e-05, "loss": 0.3937, "step": 71180 }, { "epoch": 2.5655025768551556, "grad_norm": 0.1534343957901001, "learning_rate": 2.5168666845456733e-05, "loss": 0.3473, "step": 71185 }, { "epoch": 2.56568277651638, "grad_norm": 0.16337619721889496, "learning_rate": 2.5165748777077197e-05, "loss": 0.3902, "step": 71190 }, { "epoch": 2.5658629761776046, "grad_norm": 0.23396508395671844, "learning_rate": 2.516283070643937e-05, "loss": 0.4145, "step": 71195 }, { "epoch": 2.5660431758388293, "grad_norm": 0.22363629937171936, "learning_rate": 2.5159912633582998e-05, "loss": 0.3752, "step": 71200 }, { "epoch": 2.566223375500054, "grad_norm": 0.22132565081119537, "learning_rate": 2.5156994558547857e-05, "loss": 0.3862, "step": 71205 }, { "epoch": 2.566403575161279, "grad_norm": 0.16280272603034973, "learning_rate": 2.5154076481373694e-05, "loss": 0.3692, "step": 71210 }, { "epoch": 2.5665837748225035, "grad_norm": 0.17420467734336853, "learning_rate": 2.5151158402100268e-05, "loss": 0.3694, "step": 71215 }, { "epoch": 2.566763974483728, "grad_norm": 0.22650916874408722, "learning_rate": 2.514824032076733e-05, "loss": 0.4037, "step": 71220 }, { "epoch": 2.5669441741449526, "grad_norm": 0.22971905767917633, "learning_rate": 2.514532223741466e-05, "loss": 0.4182, "step": 71225 }, { "epoch": 2.5671243738061773, "grad_norm": 0.19210931658744812, "learning_rate": 2.5142404152081993e-05, "loss": 0.3954, "step": 71230 }, { "epoch": 2.567304573467402, "grad_norm": 0.22867514193058014, "learning_rate": 2.5139486064809097e-05, "loss": 0.3992, "step": 71235 }, { "epoch": 2.5674847731286263, "grad_norm": 0.19859769940376282, "learning_rate": 2.5136567975635733e-05, "loss": 
0.4205, "step": 71240 }, { "epoch": 2.567664972789851, "grad_norm": 0.20891262590885162, "learning_rate": 2.513364988460165e-05, "loss": 0.4269, "step": 71245 }, { "epoch": 2.567845172451076, "grad_norm": 0.20854581892490387, "learning_rate": 2.5130731791746627e-05, "loss": 0.3829, "step": 71250 }, { "epoch": 2.5680253721123005, "grad_norm": 0.23713940382003784, "learning_rate": 2.5127813697110398e-05, "loss": 0.3889, "step": 71255 }, { "epoch": 2.5682055717735253, "grad_norm": 0.17710144817829132, "learning_rate": 2.512489560073274e-05, "loss": 0.3795, "step": 71260 }, { "epoch": 2.5683857714347496, "grad_norm": 0.16682571172714233, "learning_rate": 2.5121977502653395e-05, "loss": 0.3858, "step": 71265 }, { "epoch": 2.5685659710959743, "grad_norm": 0.2379252016544342, "learning_rate": 2.511905940291214e-05, "loss": 0.4112, "step": 71270 }, { "epoch": 2.568746170757199, "grad_norm": 0.19735756516456604, "learning_rate": 2.5116141301548714e-05, "loss": 0.3893, "step": 71275 }, { "epoch": 2.5689263704184238, "grad_norm": 0.2579612135887146, "learning_rate": 2.5113223198602885e-05, "loss": 0.3624, "step": 71280 }, { "epoch": 2.569106570079648, "grad_norm": 0.2095087617635727, "learning_rate": 2.5110305094114416e-05, "loss": 0.377, "step": 71285 }, { "epoch": 2.5692867697408728, "grad_norm": 0.19634920358657837, "learning_rate": 2.510738698812306e-05, "loss": 0.3583, "step": 71290 }, { "epoch": 2.5694669694020975, "grad_norm": 0.2694518268108368, "learning_rate": 2.5104468880668587e-05, "loss": 0.4229, "step": 71295 }, { "epoch": 2.5696471690633222, "grad_norm": 0.1848263442516327, "learning_rate": 2.5101550771790733e-05, "loss": 0.4172, "step": 71300 }, { "epoch": 2.569827368724547, "grad_norm": 0.16099122166633606, "learning_rate": 2.509863266152927e-05, "loss": 0.3887, "step": 71305 }, { "epoch": 2.5700075683857713, "grad_norm": 0.2326761782169342, "learning_rate": 2.509571454992396e-05, "loss": 0.4158, "step": 71310 }, { "epoch": 2.570187768046996, "grad_norm": 
0.23651954531669617, "learning_rate": 2.509279643701456e-05, "loss": 0.3646, "step": 71315 }, { "epoch": 2.5703679677082207, "grad_norm": 0.21195195615291595, "learning_rate": 2.5089878322840826e-05, "loss": 0.3927, "step": 71320 }, { "epoch": 2.5705481673694455, "grad_norm": 0.23943771421909332, "learning_rate": 2.5086960207442512e-05, "loss": 0.3641, "step": 71325 }, { "epoch": 2.5707283670306698, "grad_norm": 0.2217801809310913, "learning_rate": 2.5084042090859382e-05, "loss": 0.3776, "step": 71330 }, { "epoch": 2.5709085666918945, "grad_norm": 0.17442776262760162, "learning_rate": 2.5081123973131205e-05, "loss": 0.3989, "step": 71335 }, { "epoch": 2.5710887663531192, "grad_norm": 0.21039848029613495, "learning_rate": 2.5078205854297715e-05, "loss": 0.4116, "step": 71340 }, { "epoch": 2.571268966014344, "grad_norm": 0.1870678812265396, "learning_rate": 2.5075287734398695e-05, "loss": 0.3692, "step": 71345 }, { "epoch": 2.5714491656755687, "grad_norm": 0.22971993684768677, "learning_rate": 2.507236961347389e-05, "loss": 0.4334, "step": 71350 }, { "epoch": 2.5716293653367934, "grad_norm": 0.1942625790834427, "learning_rate": 2.5069451491563073e-05, "loss": 0.3755, "step": 71355 }, { "epoch": 2.5718095649980177, "grad_norm": 0.20081962645053864, "learning_rate": 2.506653336870598e-05, "loss": 0.381, "step": 71360 }, { "epoch": 2.5719897646592425, "grad_norm": 0.23155467212200165, "learning_rate": 2.506361524494239e-05, "loss": 0.3836, "step": 71365 }, { "epoch": 2.572169964320467, "grad_norm": 0.21292930841445923, "learning_rate": 2.506069712031205e-05, "loss": 0.3763, "step": 71370 }, { "epoch": 2.5723501639816915, "grad_norm": 0.1886298656463623, "learning_rate": 2.5057778994854724e-05, "loss": 0.3779, "step": 71375 }, { "epoch": 2.572530363642916, "grad_norm": 0.211809903383255, "learning_rate": 2.5054860868610173e-05, "loss": 0.3735, "step": 71380 }, { "epoch": 2.572710563304141, "grad_norm": 0.18282023072242737, "learning_rate": 2.505194274161815e-05, "loss": 
0.3909, "step": 71385 }, { "epoch": 2.5728907629653657, "grad_norm": 0.22584885358810425, "learning_rate": 2.504902461391842e-05, "loss": 0.4002, "step": 71390 }, { "epoch": 2.5730709626265904, "grad_norm": 0.18001751601696014, "learning_rate": 2.504610648555074e-05, "loss": 0.4059, "step": 71395 }, { "epoch": 2.573251162287815, "grad_norm": 0.22520719468593597, "learning_rate": 2.504318835655487e-05, "loss": 0.4089, "step": 71400 }, { "epoch": 2.5734313619490394, "grad_norm": 0.17423519492149353, "learning_rate": 2.5040270226970558e-05, "loss": 0.3849, "step": 71405 }, { "epoch": 2.573611561610264, "grad_norm": 0.2317894846200943, "learning_rate": 2.503735209683758e-05, "loss": 0.3856, "step": 71410 }, { "epoch": 2.573791761271489, "grad_norm": 0.2146165519952774, "learning_rate": 2.5034433966195686e-05, "loss": 0.3974, "step": 71415 }, { "epoch": 2.573971960932713, "grad_norm": 0.23812200129032135, "learning_rate": 2.5031515835084636e-05, "loss": 0.4132, "step": 71420 }, { "epoch": 2.574152160593938, "grad_norm": 0.229581817984581, "learning_rate": 2.502859770354419e-05, "loss": 0.3816, "step": 71425 }, { "epoch": 2.5743323602551627, "grad_norm": 0.2108045518398285, "learning_rate": 2.5025679571614095e-05, "loss": 0.3487, "step": 71430 }, { "epoch": 2.5745125599163874, "grad_norm": 0.21576263010501862, "learning_rate": 2.502276143933413e-05, "loss": 0.3742, "step": 71435 }, { "epoch": 2.574692759577612, "grad_norm": 0.21896515786647797, "learning_rate": 2.5019843306744045e-05, "loss": 0.386, "step": 71440 }, { "epoch": 2.574872959238837, "grad_norm": 0.25227516889572144, "learning_rate": 2.50169251738836e-05, "loss": 0.3987, "step": 71445 }, { "epoch": 2.575053158900061, "grad_norm": 0.24805094301700592, "learning_rate": 2.5014007040792548e-05, "loss": 0.3862, "step": 71450 }, { "epoch": 2.575233358561286, "grad_norm": 0.1973733752965927, "learning_rate": 2.5011088907510648e-05, "loss": 0.3989, "step": 71455 }, { "epoch": 2.5754135582225106, "grad_norm": 
0.20170655846595764, "learning_rate": 2.500817077407768e-05, "loss": 0.3698, "step": 71460 }, { "epoch": 2.575593757883735, "grad_norm": 0.22908920049667358, "learning_rate": 2.5005252640533374e-05, "loss": 0.3889, "step": 71465 }, { "epoch": 2.5757739575449596, "grad_norm": 0.17910641431808472, "learning_rate": 2.500233450691751e-05, "loss": 0.361, "step": 71470 }, { "epoch": 2.5759541572061844, "grad_norm": 0.22365345060825348, "learning_rate": 2.4999416373269836e-05, "loss": 0.4144, "step": 71475 }, { "epoch": 2.576134356867409, "grad_norm": 0.20202086865901947, "learning_rate": 2.4996498239630105e-05, "loss": 0.3748, "step": 71480 }, { "epoch": 2.576314556528634, "grad_norm": 0.17802977561950684, "learning_rate": 2.4993580106038096e-05, "loss": 0.4047, "step": 71485 }, { "epoch": 2.5764947561898586, "grad_norm": 0.22806064784526825, "learning_rate": 2.499066197253355e-05, "loss": 0.388, "step": 71490 }, { "epoch": 2.576674955851083, "grad_norm": 0.1772589087486267, "learning_rate": 2.4987743839156234e-05, "loss": 0.3769, "step": 71495 }, { "epoch": 2.5768551555123076, "grad_norm": 0.2450006902217865, "learning_rate": 2.4984825705945912e-05, "loss": 0.3806, "step": 71500 }, { "epoch": 2.5768551555123076, "eval_loss": 0.4331854581832886, "eval_runtime": 3.5274, "eval_samples_per_second": 28.349, "eval_steps_per_second": 7.087, "step": 71500 }, { "epoch": 2.5770353551735323, "grad_norm": 0.1859433799982071, "learning_rate": 2.4981907572942326e-05, "loss": 0.3643, "step": 71505 }, { "epoch": 2.577215554834757, "grad_norm": 0.21410761773586273, "learning_rate": 2.4978989440185254e-05, "loss": 0.3997, "step": 71510 }, { "epoch": 2.5773957544959814, "grad_norm": 0.19530677795410156, "learning_rate": 2.4976071307714446e-05, "loss": 0.403, "step": 71515 }, { "epoch": 2.577575954157206, "grad_norm": 0.19589672982692719, "learning_rate": 2.4973153175569657e-05, "loss": 0.3772, "step": 71520 }, { "epoch": 2.577756153818431, "grad_norm": 0.16859115660190582, 
"learning_rate": 2.4970235043790657e-05, "loss": 0.3551, "step": 71525 }, { "epoch": 2.5779363534796556, "grad_norm": 0.2540879547595978, "learning_rate": 2.4967316912417204e-05, "loss": 0.3946, "step": 71530 }, { "epoch": 2.5781165531408803, "grad_norm": 0.1967131346464157, "learning_rate": 2.4964398781489035e-05, "loss": 0.4273, "step": 71535 }, { "epoch": 2.5782967528021046, "grad_norm": 0.2016594558954239, "learning_rate": 2.496148065104594e-05, "loss": 0.4031, "step": 71540 }, { "epoch": 2.5784769524633293, "grad_norm": 0.22788941860198975, "learning_rate": 2.495856252112765e-05, "loss": 0.3956, "step": 71545 }, { "epoch": 2.578657152124554, "grad_norm": 0.18338678777217865, "learning_rate": 2.4955644391773954e-05, "loss": 0.3795, "step": 71550 }, { "epoch": 2.5788373517857788, "grad_norm": 0.16157367825508118, "learning_rate": 2.4952726263024588e-05, "loss": 0.3509, "step": 71555 }, { "epoch": 2.579017551447003, "grad_norm": 0.1809634417295456, "learning_rate": 2.4949808134919312e-05, "loss": 0.3658, "step": 71560 }, { "epoch": 2.579197751108228, "grad_norm": 0.2137366533279419, "learning_rate": 2.4946890007497898e-05, "loss": 0.3747, "step": 71565 }, { "epoch": 2.5793779507694525, "grad_norm": 0.1947045475244522, "learning_rate": 2.4943971880800093e-05, "loss": 0.4341, "step": 71570 }, { "epoch": 2.5795581504306773, "grad_norm": 0.2671882212162018, "learning_rate": 2.4941053754865658e-05, "loss": 0.4122, "step": 71575 }, { "epoch": 2.579738350091902, "grad_norm": 0.17521758377552032, "learning_rate": 2.4938135629734356e-05, "loss": 0.3967, "step": 71580 }, { "epoch": 2.5799185497531267, "grad_norm": 0.1912911832332611, "learning_rate": 2.4935217505445947e-05, "loss": 0.384, "step": 71585 }, { "epoch": 2.580098749414351, "grad_norm": 0.19323128461837769, "learning_rate": 2.4932299382040183e-05, "loss": 0.4119, "step": 71590 }, { "epoch": 2.5802789490755758, "grad_norm": 0.22955751419067383, "learning_rate": 2.4929381259556835e-05, "loss": 0.4337, "step": 
71595 }, { "epoch": 2.5804591487368005, "grad_norm": 0.21088866889476776, "learning_rate": 2.492646313803564e-05, "loss": 0.3519, "step": 71600 }, { "epoch": 2.580639348398025, "grad_norm": 0.23416496813297272, "learning_rate": 2.492354501751638e-05, "loss": 0.4205, "step": 71605 }, { "epoch": 2.5808195480592495, "grad_norm": 0.22170627117156982, "learning_rate": 2.4920626898038806e-05, "loss": 0.4031, "step": 71610 }, { "epoch": 2.5809997477204742, "grad_norm": 0.18647152185440063, "learning_rate": 2.491770877964267e-05, "loss": 0.3669, "step": 71615 }, { "epoch": 2.581179947381699, "grad_norm": 0.24669143557548523, "learning_rate": 2.4914790662367737e-05, "loss": 0.4038, "step": 71620 }, { "epoch": 2.5813601470429237, "grad_norm": 0.2305600941181183, "learning_rate": 2.491187254625376e-05, "loss": 0.3989, "step": 71625 }, { "epoch": 2.5815403467041484, "grad_norm": 0.17700450122356415, "learning_rate": 2.4908954431340513e-05, "loss": 0.3687, "step": 71630 }, { "epoch": 2.5817205463653727, "grad_norm": 0.20750737190246582, "learning_rate": 2.4906036317667744e-05, "loss": 0.3824, "step": 71635 }, { "epoch": 2.5819007460265975, "grad_norm": 0.2014245092868805, "learning_rate": 2.49031182052752e-05, "loss": 0.3933, "step": 71640 }, { "epoch": 2.582080945687822, "grad_norm": 0.21627309918403625, "learning_rate": 2.4900200094202663e-05, "loss": 0.4101, "step": 71645 }, { "epoch": 2.5822611453490465, "grad_norm": 0.24448803067207336, "learning_rate": 2.4897281984489868e-05, "loss": 0.3986, "step": 71650 }, { "epoch": 2.5824413450102712, "grad_norm": 0.20513613522052765, "learning_rate": 2.4894363876176602e-05, "loss": 0.3929, "step": 71655 }, { "epoch": 2.582621544671496, "grad_norm": 0.16580136120319366, "learning_rate": 2.48914457693026e-05, "loss": 0.3893, "step": 71660 }, { "epoch": 2.5828017443327207, "grad_norm": 0.17390620708465576, "learning_rate": 2.4888527663907627e-05, "loss": 0.3854, "step": 71665 }, { "epoch": 2.5829819439939454, "grad_norm": 
0.21692463755607605, "learning_rate": 2.4885609560031445e-05, "loss": 0.3975, "step": 71670 }, { "epoch": 2.58316214365517, "grad_norm": 0.20750007033348083, "learning_rate": 2.4882691457713813e-05, "loss": 0.4188, "step": 71675 }, { "epoch": 2.5833423433163945, "grad_norm": 0.1784081906080246, "learning_rate": 2.4879773356994478e-05, "loss": 0.3624, "step": 71680 }, { "epoch": 2.583522542977619, "grad_norm": 0.2542138993740082, "learning_rate": 2.4876855257913217e-05, "loss": 0.3848, "step": 71685 }, { "epoch": 2.583702742638844, "grad_norm": 0.19215676188468933, "learning_rate": 2.4873937160509772e-05, "loss": 0.3796, "step": 71690 }, { "epoch": 2.583882942300068, "grad_norm": 0.23548319935798645, "learning_rate": 2.4871019064823918e-05, "loss": 0.4226, "step": 71695 }, { "epoch": 2.584063141961293, "grad_norm": 0.23626303672790527, "learning_rate": 2.48681009708954e-05, "loss": 0.4119, "step": 71700 }, { "epoch": 2.5842433416225177, "grad_norm": 0.20605653524398804, "learning_rate": 2.4865182878763975e-05, "loss": 0.3605, "step": 71705 }, { "epoch": 2.5844235412837424, "grad_norm": 0.23323751986026764, "learning_rate": 2.4862264788469414e-05, "loss": 0.4294, "step": 71710 }, { "epoch": 2.584603740944967, "grad_norm": 0.19780725240707397, "learning_rate": 2.4859346700051474e-05, "loss": 0.3779, "step": 71715 }, { "epoch": 2.584783940606192, "grad_norm": 0.2039288431406021, "learning_rate": 2.4856428613549892e-05, "loss": 0.3828, "step": 71720 }, { "epoch": 2.584964140267416, "grad_norm": 0.1562710702419281, "learning_rate": 2.485351052900446e-05, "loss": 0.3812, "step": 71725 }, { "epoch": 2.585144339928641, "grad_norm": 0.18016739189624786, "learning_rate": 2.48505924464549e-05, "loss": 0.3886, "step": 71730 }, { "epoch": 2.5853245395898656, "grad_norm": 0.1871825009584427, "learning_rate": 2.4847674365941e-05, "loss": 0.3554, "step": 71735 }, { "epoch": 2.5855047392510904, "grad_norm": 0.2108829915523529, "learning_rate": 2.4844756287502514e-05, "loss": 0.37, 
"step": 71740 }, { "epoch": 2.5856849389123147, "grad_norm": 0.2470017671585083, "learning_rate": 2.4841838211179175e-05, "loss": 0.3934, "step": 71745 }, { "epoch": 2.5858651385735394, "grad_norm": 0.22116664052009583, "learning_rate": 2.4838920137010776e-05, "loss": 0.4159, "step": 71750 }, { "epoch": 2.586045338234764, "grad_norm": 0.1974770575761795, "learning_rate": 2.4836002065037056e-05, "loss": 0.4125, "step": 71755 }, { "epoch": 2.586225537895989, "grad_norm": 0.2147388607263565, "learning_rate": 2.4833083995297772e-05, "loss": 0.4346, "step": 71760 }, { "epoch": 2.5864057375572136, "grad_norm": 0.21751351654529572, "learning_rate": 2.483016592783269e-05, "loss": 0.354, "step": 71765 }, { "epoch": 2.586585937218438, "grad_norm": 0.1897956132888794, "learning_rate": 2.4827247862681556e-05, "loss": 0.3771, "step": 71770 }, { "epoch": 2.5867661368796626, "grad_norm": 0.2261020839214325, "learning_rate": 2.4824329799884144e-05, "loss": 0.3636, "step": 71775 }, { "epoch": 2.5869463365408873, "grad_norm": 0.20903125405311584, "learning_rate": 2.4821411739480206e-05, "loss": 0.4079, "step": 71780 }, { "epoch": 2.587126536202112, "grad_norm": 0.21350517868995667, "learning_rate": 2.481849368150949e-05, "loss": 0.3354, "step": 71785 }, { "epoch": 2.5873067358633364, "grad_norm": 0.18678854405879974, "learning_rate": 2.481557562601177e-05, "loss": 0.4028, "step": 71790 }, { "epoch": 2.587486935524561, "grad_norm": 0.20988419651985168, "learning_rate": 2.4812657573026797e-05, "loss": 0.3798, "step": 71795 }, { "epoch": 2.587667135185786, "grad_norm": 0.21831747889518738, "learning_rate": 2.4809739522594318e-05, "loss": 0.3999, "step": 71800 }, { "epoch": 2.5878473348470106, "grad_norm": 0.18378116190433502, "learning_rate": 2.4806821474754112e-05, "loss": 0.41, "step": 71805 }, { "epoch": 2.5880275345082353, "grad_norm": 0.22383366525173187, "learning_rate": 2.4803903429545918e-05, "loss": 0.3942, "step": 71810 }, { "epoch": 2.5882077341694596, "grad_norm": 
0.15554063022136688, "learning_rate": 2.480098538700951e-05, "loss": 0.3572, "step": 71815 }, { "epoch": 2.5883879338306843, "grad_norm": 0.20337338745594025, "learning_rate": 2.4798067347184638e-05, "loss": 0.4069, "step": 71820 }, { "epoch": 2.588568133491909, "grad_norm": 0.25683242082595825, "learning_rate": 2.4795149310111047e-05, "loss": 0.4472, "step": 71825 }, { "epoch": 2.588748333153134, "grad_norm": 0.16255348920822144, "learning_rate": 2.479223127582852e-05, "loss": 0.3799, "step": 71830 }, { "epoch": 2.588928532814358, "grad_norm": 0.23991474509239197, "learning_rate": 2.478931324437679e-05, "loss": 0.3806, "step": 71835 }, { "epoch": 2.589108732475583, "grad_norm": 0.17780093848705292, "learning_rate": 2.478639521579564e-05, "loss": 0.371, "step": 71840 }, { "epoch": 2.5892889321368076, "grad_norm": 0.21101722121238708, "learning_rate": 2.478347719012481e-05, "loss": 0.3911, "step": 71845 }, { "epoch": 2.5894691317980323, "grad_norm": 0.1497720628976822, "learning_rate": 2.4780559167404054e-05, "loss": 0.3663, "step": 71850 }, { "epoch": 2.589649331459257, "grad_norm": 0.14714904129505157, "learning_rate": 2.4777641147673144e-05, "loss": 0.3627, "step": 71855 }, { "epoch": 2.5898295311204818, "grad_norm": 0.1989908665418625, "learning_rate": 2.477472313097183e-05, "loss": 0.3685, "step": 71860 }, { "epoch": 2.590009730781706, "grad_norm": 0.2075403332710266, "learning_rate": 2.4771805117339863e-05, "loss": 0.3824, "step": 71865 }, { "epoch": 2.5901899304429308, "grad_norm": 0.17004312574863434, "learning_rate": 2.476888710681701e-05, "loss": 0.3466, "step": 71870 }, { "epoch": 2.5903701301041555, "grad_norm": 0.20493172109127045, "learning_rate": 2.4765969099443025e-05, "loss": 0.4133, "step": 71875 }, { "epoch": 2.59055032976538, "grad_norm": 0.2063346803188324, "learning_rate": 2.476305109525767e-05, "loss": 0.4252, "step": 71880 }, { "epoch": 2.5907305294266045, "grad_norm": 0.19341862201690674, "learning_rate": 2.4760133094300697e-05, "loss": 
0.3719, "step": 71885 }, { "epoch": 2.5909107290878293, "grad_norm": 0.17939716577529907, "learning_rate": 2.475721509661186e-05, "loss": 0.4111, "step": 71890 }, { "epoch": 2.591090928749054, "grad_norm": 0.19604428112506866, "learning_rate": 2.4754297102230923e-05, "loss": 0.3756, "step": 71895 }, { "epoch": 2.5912711284102787, "grad_norm": 0.20184142887592316, "learning_rate": 2.4751379111197642e-05, "loss": 0.3813, "step": 71900 }, { "epoch": 2.5914513280715035, "grad_norm": 0.21380701661109924, "learning_rate": 2.4748461123551768e-05, "loss": 0.3822, "step": 71905 }, { "epoch": 2.5916315277327278, "grad_norm": 0.22639767825603485, "learning_rate": 2.4745543139333067e-05, "loss": 0.4095, "step": 71910 }, { "epoch": 2.5918117273939525, "grad_norm": 0.21239355206489563, "learning_rate": 2.4742625158581286e-05, "loss": 0.4324, "step": 71915 }, { "epoch": 2.5919919270551772, "grad_norm": 0.19603148102760315, "learning_rate": 2.4739707181336193e-05, "loss": 0.3851, "step": 71920 }, { "epoch": 2.5921721267164015, "grad_norm": 0.21165809035301208, "learning_rate": 2.4736789207637543e-05, "loss": 0.3982, "step": 71925 }, { "epoch": 2.5923523263776262, "grad_norm": 0.22291170060634613, "learning_rate": 2.473387123752508e-05, "loss": 0.3934, "step": 71930 }, { "epoch": 2.592532526038851, "grad_norm": 0.1911042034626007, "learning_rate": 2.4730953271038575e-05, "loss": 0.3858, "step": 71935 }, { "epoch": 2.5927127257000757, "grad_norm": 0.2135748714208603, "learning_rate": 2.472803530821778e-05, "loss": 0.3912, "step": 71940 }, { "epoch": 2.5928929253613004, "grad_norm": 0.20679445564746857, "learning_rate": 2.472511734910245e-05, "loss": 0.369, "step": 71945 }, { "epoch": 2.593073125022525, "grad_norm": 0.22252236306667328, "learning_rate": 2.472219939373234e-05, "loss": 0.4035, "step": 71950 }, { "epoch": 2.5932533246837495, "grad_norm": 0.20789240300655365, "learning_rate": 2.471928144214721e-05, "loss": 0.3496, "step": 71955 }, { "epoch": 2.593433524344974, 
"grad_norm": 0.24072077870368958, "learning_rate": 2.4716363494386817e-05, "loss": 0.3821, "step": 71960 }, { "epoch": 2.593613724006199, "grad_norm": 0.21749582886695862, "learning_rate": 2.471344555049092e-05, "loss": 0.3826, "step": 71965 }, { "epoch": 2.5937939236674232, "grad_norm": 0.17097270488739014, "learning_rate": 2.4710527610499265e-05, "loss": 0.352, "step": 71970 }, { "epoch": 2.593974123328648, "grad_norm": 0.20925599336624146, "learning_rate": 2.470760967445162e-05, "loss": 0.3946, "step": 71975 }, { "epoch": 2.5941543229898727, "grad_norm": 0.16980622708797455, "learning_rate": 2.470469174238774e-05, "loss": 0.3889, "step": 71980 }, { "epoch": 2.5943345226510974, "grad_norm": 0.1989830583333969, "learning_rate": 2.4701773814347366e-05, "loss": 0.3903, "step": 71985 }, { "epoch": 2.594514722312322, "grad_norm": 0.22819073498249054, "learning_rate": 2.469885589037028e-05, "loss": 0.3905, "step": 71990 }, { "epoch": 2.594694921973547, "grad_norm": 0.23240187764167786, "learning_rate": 2.4695937970496212e-05, "loss": 0.3952, "step": 71995 }, { "epoch": 2.594875121634771, "grad_norm": 0.20458155870437622, "learning_rate": 2.4693020054764936e-05, "loss": 0.3712, "step": 72000 }, { "epoch": 2.594875121634771, "eval_loss": 0.43242183327674866, "eval_runtime": 3.5303, "eval_samples_per_second": 28.327, "eval_steps_per_second": 7.082, "step": 72000 }, { "epoch": 2.595055321295996, "grad_norm": 0.2061101496219635, "learning_rate": 2.4690102143216214e-05, "loss": 0.3734, "step": 72005 }, { "epoch": 2.5952355209572207, "grad_norm": 0.2840164303779602, "learning_rate": 2.4687184235889768e-05, "loss": 0.3939, "step": 72010 }, { "epoch": 2.5954157206184454, "grad_norm": 0.3389698266983032, "learning_rate": 2.468426633282539e-05, "loss": 0.3925, "step": 72015 }, { "epoch": 2.5955959202796697, "grad_norm": 0.22343645989894867, "learning_rate": 2.4681348434062825e-05, "loss": 0.4075, "step": 72020 }, { "epoch": 2.5957761199408944, "grad_norm": 0.23399618268013, 
"learning_rate": 2.467843053964181e-05, "loss": 0.3981, "step": 72025 }, { "epoch": 2.595956319602119, "grad_norm": 0.25679734349250793, "learning_rate": 2.4675512649602134e-05, "loss": 0.3848, "step": 72030 }, { "epoch": 2.596136519263344, "grad_norm": 0.19120201468467712, "learning_rate": 2.467259476398352e-05, "loss": 0.3813, "step": 72035 }, { "epoch": 2.5963167189245686, "grad_norm": 0.24785567820072174, "learning_rate": 2.4669676882825754e-05, "loss": 0.3926, "step": 72040 }, { "epoch": 2.596496918585793, "grad_norm": 0.21031877398490906, "learning_rate": 2.4666759006168572e-05, "loss": 0.3782, "step": 72045 }, { "epoch": 2.5966771182470176, "grad_norm": 0.22502188384532928, "learning_rate": 2.4663841134051727e-05, "loss": 0.389, "step": 72050 }, { "epoch": 2.5968573179082424, "grad_norm": 0.20378191769123077, "learning_rate": 2.4660923266514986e-05, "loss": 0.4028, "step": 72055 }, { "epoch": 2.597037517569467, "grad_norm": 0.19002820551395416, "learning_rate": 2.4658005403598098e-05, "loss": 0.3727, "step": 72060 }, { "epoch": 2.5972177172306914, "grad_norm": 0.19522058963775635, "learning_rate": 2.4655087545340823e-05, "loss": 0.3779, "step": 72065 }, { "epoch": 2.597397916891916, "grad_norm": 0.23693622648715973, "learning_rate": 2.4652169691782914e-05, "loss": 0.3623, "step": 72070 }, { "epoch": 2.597578116553141, "grad_norm": 0.18634121119976044, "learning_rate": 2.464925184296412e-05, "loss": 0.3996, "step": 72075 }, { "epoch": 2.5977583162143656, "grad_norm": 0.18742458522319794, "learning_rate": 2.4646333998924205e-05, "loss": 0.3633, "step": 72080 }, { "epoch": 2.5979385158755903, "grad_norm": 0.21051351726055145, "learning_rate": 2.4643416159702925e-05, "loss": 0.4086, "step": 72085 }, { "epoch": 2.598118715536815, "grad_norm": 0.1712328940629959, "learning_rate": 2.4640498325340022e-05, "loss": 0.3798, "step": 72090 }, { "epoch": 2.5982989151980393, "grad_norm": 0.19666485488414764, "learning_rate": 2.4637580495875267e-05, "loss": 0.3856, "step": 
72095 }, { "epoch": 2.598479114859264, "grad_norm": 0.15711526572704315, "learning_rate": 2.4634662671348403e-05, "loss": 0.3996, "step": 72100 }, { "epoch": 2.598659314520489, "grad_norm": 0.21674515306949615, "learning_rate": 2.4631744851799192e-05, "loss": 0.3851, "step": 72105 }, { "epoch": 2.598839514181713, "grad_norm": 0.1570635885000229, "learning_rate": 2.4628827037267397e-05, "loss": 0.3768, "step": 72110 }, { "epoch": 2.599019713842938, "grad_norm": 0.188796266913414, "learning_rate": 2.462590922779274e-05, "loss": 0.4152, "step": 72115 }, { "epoch": 2.5991999135041626, "grad_norm": 0.22848844528198242, "learning_rate": 2.4622991423415016e-05, "loss": 0.3614, "step": 72120 }, { "epoch": 2.5993801131653873, "grad_norm": 0.23881231248378754, "learning_rate": 2.4620073624173952e-05, "loss": 0.3662, "step": 72125 }, { "epoch": 2.599560312826612, "grad_norm": 0.1681346446275711, "learning_rate": 2.461715583010931e-05, "loss": 0.4039, "step": 72130 }, { "epoch": 2.5997405124878368, "grad_norm": 0.19078193604946136, "learning_rate": 2.461423804126085e-05, "loss": 0.3721, "step": 72135 }, { "epoch": 2.599920712149061, "grad_norm": 0.19204656779766083, "learning_rate": 2.4611320257668318e-05, "loss": 0.3735, "step": 72140 }, { "epoch": 2.600100911810286, "grad_norm": 0.2286890149116516, "learning_rate": 2.4608402479371475e-05, "loss": 0.3929, "step": 72145 }, { "epoch": 2.6002811114715105, "grad_norm": 0.18372632563114166, "learning_rate": 2.4605484706410072e-05, "loss": 0.3658, "step": 72150 }, { "epoch": 2.600461311132735, "grad_norm": 0.1332695037126541, "learning_rate": 2.460256693882386e-05, "loss": 0.3649, "step": 72155 }, { "epoch": 2.6006415107939596, "grad_norm": 0.22151631116867065, "learning_rate": 2.4599649176652602e-05, "loss": 0.425, "step": 72160 }, { "epoch": 2.6008217104551843, "grad_norm": 0.25563111901283264, "learning_rate": 2.4596731419936044e-05, "loss": 0.4232, "step": 72165 }, { "epoch": 2.601001910116409, "grad_norm": 0.19068589806556702, 
"learning_rate": 2.4593813668713942e-05, "loss": 0.3822, "step": 72170 }, { "epoch": 2.6011821097776338, "grad_norm": 0.2579037845134735, "learning_rate": 2.459089592302605e-05, "loss": 0.3904, "step": 72175 }, { "epoch": 2.6013623094388585, "grad_norm": 0.1858731061220169, "learning_rate": 2.458797818291212e-05, "loss": 0.3894, "step": 72180 }, { "epoch": 2.601542509100083, "grad_norm": 0.2007628083229065, "learning_rate": 2.458506044841191e-05, "loss": 0.399, "step": 72185 }, { "epoch": 2.6017227087613075, "grad_norm": 0.22041386365890503, "learning_rate": 2.4582142719565173e-05, "loss": 0.4187, "step": 72190 }, { "epoch": 2.6019029084225322, "grad_norm": 0.22453515231609344, "learning_rate": 2.4579224996411655e-05, "loss": 0.4086, "step": 72195 }, { "epoch": 2.6020831080837565, "grad_norm": 0.2213824838399887, "learning_rate": 2.457630727899112e-05, "loss": 0.4141, "step": 72200 }, { "epoch": 2.6022633077449813, "grad_norm": 0.20723369717597961, "learning_rate": 2.4573389567343323e-05, "loss": 0.3806, "step": 72205 }, { "epoch": 2.602443507406206, "grad_norm": 0.2115570306777954, "learning_rate": 2.4570471861507994e-05, "loss": 0.38, "step": 72210 }, { "epoch": 2.6026237070674307, "grad_norm": 0.21702751517295837, "learning_rate": 2.4567554161524917e-05, "loss": 0.3766, "step": 72215 }, { "epoch": 2.6028039067286555, "grad_norm": 0.22824609279632568, "learning_rate": 2.4564636467433814e-05, "loss": 0.3602, "step": 72220 }, { "epoch": 2.60298410638988, "grad_norm": 0.2511116564273834, "learning_rate": 2.4561718779274474e-05, "loss": 0.3884, "step": 72225 }, { "epoch": 2.6031643060511045, "grad_norm": 0.19371597468852997, "learning_rate": 2.4558801097086627e-05, "loss": 0.3869, "step": 72230 }, { "epoch": 2.6033445057123292, "grad_norm": 0.27392515540122986, "learning_rate": 2.455588342091002e-05, "loss": 0.4111, "step": 72235 }, { "epoch": 2.603524705373554, "grad_norm": 0.20092566311359406, "learning_rate": 2.4552965750784422e-05, "loss": 0.4155, "step": 72240 
}, { "epoch": 2.6037049050347787, "grad_norm": 0.18741832673549652, "learning_rate": 2.4550048086749572e-05, "loss": 0.4058, "step": 72245 }, { "epoch": 2.603885104696003, "grad_norm": 0.20643602311611176, "learning_rate": 2.454713042884524e-05, "loss": 0.4002, "step": 72250 }, { "epoch": 2.6040653043572277, "grad_norm": 0.1865357607603073, "learning_rate": 2.4544212777111164e-05, "loss": 0.3739, "step": 72255 }, { "epoch": 2.6042455040184525, "grad_norm": 0.165154829621315, "learning_rate": 2.4541295131587098e-05, "loss": 0.4122, "step": 72260 }, { "epoch": 2.604425703679677, "grad_norm": 0.19582092761993408, "learning_rate": 2.4538377492312797e-05, "loss": 0.4369, "step": 72265 }, { "epoch": 2.604605903340902, "grad_norm": 0.18715718388557434, "learning_rate": 2.453545985932802e-05, "loss": 0.3845, "step": 72270 }, { "epoch": 2.604786103002126, "grad_norm": 0.22059041261672974, "learning_rate": 2.4532542232672504e-05, "loss": 0.3972, "step": 72275 }, { "epoch": 2.604966302663351, "grad_norm": 0.22253043949604034, "learning_rate": 2.4529624612386015e-05, "loss": 0.4599, "step": 72280 }, { "epoch": 2.6051465023245757, "grad_norm": 0.22209565341472626, "learning_rate": 2.4526706998508296e-05, "loss": 0.39, "step": 72285 }, { "epoch": 2.6053267019858004, "grad_norm": 0.2502153515815735, "learning_rate": 2.4523789391079103e-05, "loss": 0.3973, "step": 72290 }, { "epoch": 2.6055069016470247, "grad_norm": 0.2121538519859314, "learning_rate": 2.4520871790138196e-05, "loss": 0.362, "step": 72295 }, { "epoch": 2.6056871013082494, "grad_norm": 0.22222217917442322, "learning_rate": 2.4517954195725305e-05, "loss": 0.4204, "step": 72300 }, { "epoch": 2.605867300969474, "grad_norm": 0.1976117193698883, "learning_rate": 2.4515036607880208e-05, "loss": 0.3909, "step": 72305 }, { "epoch": 2.606047500630699, "grad_norm": 0.16748464107513428, "learning_rate": 2.451211902664264e-05, "loss": 0.3622, "step": 72310 }, { "epoch": 2.6062277002919236, "grad_norm": 0.15831992030143738, 
"learning_rate": 2.4509201452052338e-05, "loss": 0.3806, "step": 72315 }, { "epoch": 2.606407899953148, "grad_norm": 0.18840594589710236, "learning_rate": 2.4506283884149094e-05, "loss": 0.4006, "step": 72320 }, { "epoch": 2.6065880996143727, "grad_norm": 0.28430864214897156, "learning_rate": 2.450336632297262e-05, "loss": 0.4045, "step": 72325 }, { "epoch": 2.6067682992755974, "grad_norm": 0.24897444248199463, "learning_rate": 2.45004487685627e-05, "loss": 0.4207, "step": 72330 }, { "epoch": 2.606948498936822, "grad_norm": 0.2031540423631668, "learning_rate": 2.4497531220959056e-05, "loss": 0.3999, "step": 72335 }, { "epoch": 2.6071286985980464, "grad_norm": 0.26375287771224976, "learning_rate": 2.4494613680201456e-05, "loss": 0.4072, "step": 72340 }, { "epoch": 2.607308898259271, "grad_norm": 0.2009999305009842, "learning_rate": 2.4491696146329648e-05, "loss": 0.366, "step": 72345 }, { "epoch": 2.607489097920496, "grad_norm": 0.24075284600257874, "learning_rate": 2.448877861938338e-05, "loss": 0.4361, "step": 72350 }, { "epoch": 2.6076692975817206, "grad_norm": 0.22679272294044495, "learning_rate": 2.4485861099402402e-05, "loss": 0.4212, "step": 72355 }, { "epoch": 2.6078494972429453, "grad_norm": 0.18782539665699005, "learning_rate": 2.4482943586426473e-05, "loss": 0.3904, "step": 72360 }, { "epoch": 2.60802969690417, "grad_norm": 0.18936201930046082, "learning_rate": 2.448002608049533e-05, "loss": 0.3494, "step": 72365 }, { "epoch": 2.6082098965653944, "grad_norm": 0.22821679711341858, "learning_rate": 2.4477108581648734e-05, "loss": 0.4238, "step": 72370 }, { "epoch": 2.608390096226619, "grad_norm": 0.19849362969398499, "learning_rate": 2.4474191089926438e-05, "loss": 0.3797, "step": 72375 }, { "epoch": 2.608570295887844, "grad_norm": 0.21136881411075592, "learning_rate": 2.447127360536818e-05, "loss": 0.3807, "step": 72380 }, { "epoch": 2.608750495549068, "grad_norm": 0.23229160904884338, "learning_rate": 2.446835612801372e-05, "loss": 0.4021, "step": 72385 
}, { "epoch": 2.608930695210293, "grad_norm": 0.2442099153995514, "learning_rate": 2.446543865790281e-05, "loss": 0.4003, "step": 72390 }, { "epoch": 2.6091108948715176, "grad_norm": 0.2331792116165161, "learning_rate": 2.446252119507518e-05, "loss": 0.4425, "step": 72395 }, { "epoch": 2.6092910945327423, "grad_norm": 0.16892333328723907, "learning_rate": 2.445960373957061e-05, "loss": 0.3777, "step": 72400 }, { "epoch": 2.609471294193967, "grad_norm": 0.18657229840755463, "learning_rate": 2.445668629142882e-05, "loss": 0.3742, "step": 72405 }, { "epoch": 2.609651493855192, "grad_norm": 0.2221583127975464, "learning_rate": 2.4453768850689587e-05, "loss": 0.4194, "step": 72410 }, { "epoch": 2.609831693516416, "grad_norm": 0.1793508529663086, "learning_rate": 2.4450851417392645e-05, "loss": 0.397, "step": 72415 }, { "epoch": 2.610011893177641, "grad_norm": 0.1674244999885559, "learning_rate": 2.444793399157774e-05, "loss": 0.3605, "step": 72420 }, { "epoch": 2.6101920928388656, "grad_norm": 0.18199168145656586, "learning_rate": 2.4445016573284632e-05, "loss": 0.3821, "step": 72425 }, { "epoch": 2.61037229250009, "grad_norm": 0.20687104761600494, "learning_rate": 2.444209916255306e-05, "loss": 0.3999, "step": 72430 }, { "epoch": 2.6105524921613146, "grad_norm": 0.21444140374660492, "learning_rate": 2.4439181759422787e-05, "loss": 0.3575, "step": 72435 }, { "epoch": 2.6107326918225393, "grad_norm": 0.18158109486103058, "learning_rate": 2.4436264363933554e-05, "loss": 0.3997, "step": 72440 }, { "epoch": 2.610912891483764, "grad_norm": 0.23257912695407867, "learning_rate": 2.4433346976125103e-05, "loss": 0.4247, "step": 72445 }, { "epoch": 2.6110930911449888, "grad_norm": 0.24175713956356049, "learning_rate": 2.4430429596037195e-05, "loss": 0.407, "step": 72450 }, { "epoch": 2.6112732908062135, "grad_norm": 0.24281500279903412, "learning_rate": 2.442751222370957e-05, "loss": 0.4246, "step": 72455 }, { "epoch": 2.611453490467438, "grad_norm": 0.21452686190605164, 
"learning_rate": 2.4424594859181978e-05, "loss": 0.3739, "step": 72460 }, { "epoch": 2.6116336901286625, "grad_norm": 0.23282234370708466, "learning_rate": 2.4421677502494175e-05, "loss": 0.3948, "step": 72465 }, { "epoch": 2.6118138897898873, "grad_norm": 0.24855384230613708, "learning_rate": 2.44187601536859e-05, "loss": 0.4206, "step": 72470 }, { "epoch": 2.6119940894511116, "grad_norm": 0.22032064199447632, "learning_rate": 2.441584281279691e-05, "loss": 0.3899, "step": 72475 }, { "epoch": 2.6121742891123363, "grad_norm": 0.28760719299316406, "learning_rate": 2.4412925479866946e-05, "loss": 0.3476, "step": 72480 }, { "epoch": 2.612354488773561, "grad_norm": 0.22636237740516663, "learning_rate": 2.4410008154935757e-05, "loss": 0.3523, "step": 72485 }, { "epoch": 2.6125346884347858, "grad_norm": 0.19048437476158142, "learning_rate": 2.4407090838043097e-05, "loss": 0.3876, "step": 72490 }, { "epoch": 2.6127148880960105, "grad_norm": 0.20822523534297943, "learning_rate": 2.440417352922871e-05, "loss": 0.4092, "step": 72495 }, { "epoch": 2.6128950877572352, "grad_norm": 0.20144416391849518, "learning_rate": 2.4401256228532334e-05, "loss": 0.3772, "step": 72500 }, { "epoch": 2.6128950877572352, "eval_loss": 0.432780385017395, "eval_runtime": 3.5337, "eval_samples_per_second": 28.299, "eval_steps_per_second": 7.075, "step": 72500 }, { "epoch": 2.6130752874184595, "grad_norm": 0.20547565817832947, "learning_rate": 2.4398338935993742e-05, "loss": 0.4123, "step": 72505 }, { "epoch": 2.6132554870796842, "grad_norm": 0.2339104562997818, "learning_rate": 2.4395421651652646e-05, "loss": 0.4156, "step": 72510 }, { "epoch": 2.613435686740909, "grad_norm": 0.209024116396904, "learning_rate": 2.439250437554883e-05, "loss": 0.3592, "step": 72515 }, { "epoch": 2.6136158864021337, "grad_norm": 0.20917484164237976, "learning_rate": 2.438958710772202e-05, "loss": 0.3925, "step": 72520 }, { "epoch": 2.613796086063358, "grad_norm": 0.23410077393054962, "learning_rate": 
2.438666984821196e-05, "loss": 0.3747, "step": 72525 }, { "epoch": 2.6139762857245827, "grad_norm": 0.18121454119682312, "learning_rate": 2.4383752597058414e-05, "loss": 0.3926, "step": 72530 }, { "epoch": 2.6141564853858075, "grad_norm": 0.1945260465145111, "learning_rate": 2.4380835354301117e-05, "loss": 0.4095, "step": 72535 }, { "epoch": 2.614336685047032, "grad_norm": 0.21491825580596924, "learning_rate": 2.437791811997981e-05, "loss": 0.4031, "step": 72540 }, { "epoch": 2.614516884708257, "grad_norm": 0.23276425898075104, "learning_rate": 2.4375000894134257e-05, "loss": 0.3753, "step": 72545 }, { "epoch": 2.6146970843694812, "grad_norm": 0.20412658154964447, "learning_rate": 2.4372083676804187e-05, "loss": 0.4058, "step": 72550 }, { "epoch": 2.614877284030706, "grad_norm": 0.2030048817396164, "learning_rate": 2.4369166468029367e-05, "loss": 0.371, "step": 72555 }, { "epoch": 2.6150574836919307, "grad_norm": 0.22809742391109467, "learning_rate": 2.436624926784953e-05, "loss": 0.3964, "step": 72560 }, { "epoch": 2.6152376833531554, "grad_norm": 0.20488815009593964, "learning_rate": 2.4363332076304413e-05, "loss": 0.3581, "step": 72565 }, { "epoch": 2.6154178830143797, "grad_norm": 0.2278917282819748, "learning_rate": 2.4360414893433784e-05, "loss": 0.3737, "step": 72570 }, { "epoch": 2.6155980826756045, "grad_norm": 0.2145715057849884, "learning_rate": 2.435749771927738e-05, "loss": 0.3794, "step": 72575 }, { "epoch": 2.615778282336829, "grad_norm": 0.1873069554567337, "learning_rate": 2.435458055387493e-05, "loss": 0.3869, "step": 72580 }, { "epoch": 2.615958481998054, "grad_norm": 0.16591453552246094, "learning_rate": 2.4351663397266213e-05, "loss": 0.3601, "step": 72585 }, { "epoch": 2.6161386816592787, "grad_norm": 0.21023093163967133, "learning_rate": 2.434874624949094e-05, "loss": 0.41, "step": 72590 }, { "epoch": 2.6163188813205034, "grad_norm": 0.17836223542690277, "learning_rate": 2.4345829110588892e-05, "loss": 0.3976, "step": 72595 }, { "epoch": 
2.6164990809817277, "grad_norm": 0.2159523069858551, "learning_rate": 2.4342911980599788e-05, "loss": 0.4025, "step": 72600 }, { "epoch": 2.6166792806429524, "grad_norm": 0.23821072280406952, "learning_rate": 2.4339994859563368e-05, "loss": 0.4003, "step": 72605 }, { "epoch": 2.616859480304177, "grad_norm": 0.2293204665184021, "learning_rate": 2.433707774751941e-05, "loss": 0.3821, "step": 72610 }, { "epoch": 2.6170396799654014, "grad_norm": 0.2255268394947052, "learning_rate": 2.433416064450763e-05, "loss": 0.3848, "step": 72615 }, { "epoch": 2.617219879626626, "grad_norm": 0.18856073915958405, "learning_rate": 2.433124355056778e-05, "loss": 0.4097, "step": 72620 }, { "epoch": 2.617400079287851, "grad_norm": 0.221920907497406, "learning_rate": 2.432832646573961e-05, "loss": 0.3756, "step": 72625 }, { "epoch": 2.6175802789490756, "grad_norm": 0.23313437402248383, "learning_rate": 2.432540939006286e-05, "loss": 0.4022, "step": 72630 }, { "epoch": 2.6177604786103004, "grad_norm": 0.17918843030929565, "learning_rate": 2.432249232357728e-05, "loss": 0.394, "step": 72635 }, { "epoch": 2.617940678271525, "grad_norm": 0.1900051385164261, "learning_rate": 2.4319575266322607e-05, "loss": 0.4176, "step": 72640 }, { "epoch": 2.6181208779327494, "grad_norm": 0.21642592549324036, "learning_rate": 2.431665821833859e-05, "loss": 0.4371, "step": 72645 }, { "epoch": 2.618301077593974, "grad_norm": 0.21715706586837769, "learning_rate": 2.4313741179664974e-05, "loss": 0.3757, "step": 72650 }, { "epoch": 2.618481277255199, "grad_norm": 0.21024832129478455, "learning_rate": 2.4310824150341497e-05, "loss": 0.3762, "step": 72655 }, { "epoch": 2.618661476916423, "grad_norm": 0.2558997869491577, "learning_rate": 2.4307907130407916e-05, "loss": 0.4358, "step": 72660 }, { "epoch": 2.618841676577648, "grad_norm": 0.1785580962896347, "learning_rate": 2.4304990119903966e-05, "loss": 0.4231, "step": 72665 }, { "epoch": 2.6190218762388726, "grad_norm": 0.18506887555122375, "learning_rate": 
2.430207311886938e-05, "loss": 0.3657, "step": 72670 }, { "epoch": 2.6192020759000973, "grad_norm": 0.1763153374195099, "learning_rate": 2.429915612734392e-05, "loss": 0.3529, "step": 72675 }, { "epoch": 2.619382275561322, "grad_norm": 0.21150760352611542, "learning_rate": 2.4296239145367333e-05, "loss": 0.3704, "step": 72680 }, { "epoch": 2.619562475222547, "grad_norm": 0.2270565927028656, "learning_rate": 2.429332217297933e-05, "loss": 0.3814, "step": 72685 }, { "epoch": 2.619742674883771, "grad_norm": 0.20172244310379028, "learning_rate": 2.4290405210219694e-05, "loss": 0.3692, "step": 72690 }, { "epoch": 2.619922874544996, "grad_norm": 0.20030339062213898, "learning_rate": 2.428748825712813e-05, "loss": 0.3969, "step": 72695 }, { "epoch": 2.6201030742062206, "grad_norm": 0.23000763356685638, "learning_rate": 2.4284571313744424e-05, "loss": 0.4319, "step": 72700 }, { "epoch": 2.620283273867445, "grad_norm": 0.18827667832374573, "learning_rate": 2.428165438010828e-05, "loss": 0.3748, "step": 72705 }, { "epoch": 2.6204634735286696, "grad_norm": 0.23635447025299072, "learning_rate": 2.427873745625946e-05, "loss": 0.4177, "step": 72710 }, { "epoch": 2.6206436731898943, "grad_norm": 0.18466027081012726, "learning_rate": 2.4275820542237703e-05, "loss": 0.3716, "step": 72715 }, { "epoch": 2.620823872851119, "grad_norm": 0.21398019790649414, "learning_rate": 2.427290363808275e-05, "loss": 0.3842, "step": 72720 }, { "epoch": 2.621004072512344, "grad_norm": 0.24221466481685638, "learning_rate": 2.426998674383434e-05, "loss": 0.4314, "step": 72725 }, { "epoch": 2.6211842721735685, "grad_norm": 0.18980202078819275, "learning_rate": 2.4267069859532228e-05, "loss": 0.3927, "step": 72730 }, { "epoch": 2.621364471834793, "grad_norm": 0.2422516644001007, "learning_rate": 2.4264152985216136e-05, "loss": 0.398, "step": 72735 }, { "epoch": 2.6215446714960176, "grad_norm": 0.18269525468349457, "learning_rate": 2.4261236120925828e-05, "loss": 0.4142, "step": 72740 }, { "epoch": 
2.6217248711572423, "grad_norm": 0.17914442718029022, "learning_rate": 2.4258319266701032e-05, "loss": 0.3988, "step": 72745 }, { "epoch": 2.621905070818467, "grad_norm": 0.22300316393375397, "learning_rate": 2.4255402422581485e-05, "loss": 0.3862, "step": 72750 }, { "epoch": 2.6220852704796913, "grad_norm": 0.21765939891338348, "learning_rate": 2.4252485588606947e-05, "loss": 0.3836, "step": 72755 }, { "epoch": 2.622265470140916, "grad_norm": 0.2101963460445404, "learning_rate": 2.424956876481715e-05, "loss": 0.4349, "step": 72760 }, { "epoch": 2.6224456698021408, "grad_norm": 0.20395536720752716, "learning_rate": 2.4246651951251815e-05, "loss": 0.3813, "step": 72765 }, { "epoch": 2.6226258694633655, "grad_norm": 0.208245649933815, "learning_rate": 2.4243735147950715e-05, "loss": 0.3975, "step": 72770 }, { "epoch": 2.6228060691245902, "grad_norm": 0.20807571709156036, "learning_rate": 2.424081835495357e-05, "loss": 0.3674, "step": 72775 }, { "epoch": 2.6229862687858145, "grad_norm": 0.2246702015399933, "learning_rate": 2.423790157230014e-05, "loss": 0.3766, "step": 72780 }, { "epoch": 2.6231664684470393, "grad_norm": 0.16798308491706848, "learning_rate": 2.423498480003015e-05, "loss": 0.3983, "step": 72785 }, { "epoch": 2.623346668108264, "grad_norm": 0.18455500900745392, "learning_rate": 2.423206803818333e-05, "loss": 0.3787, "step": 72790 }, { "epoch": 2.6235268677694887, "grad_norm": 0.22922283411026, "learning_rate": 2.422915128679945e-05, "loss": 0.4127, "step": 72795 }, { "epoch": 2.623707067430713, "grad_norm": 0.19777072966098785, "learning_rate": 2.422623454591823e-05, "loss": 0.379, "step": 72800 }, { "epoch": 2.6238872670919378, "grad_norm": 0.2073335349559784, "learning_rate": 2.4223317815579414e-05, "loss": 0.3929, "step": 72805 }, { "epoch": 2.6240674667531625, "grad_norm": 0.19318436086177826, "learning_rate": 2.4220401095822742e-05, "loss": 0.3986, "step": 72810 }, { "epoch": 2.6242476664143872, "grad_norm": 0.20948757231235504, "learning_rate": 
2.421748438668795e-05, "loss": 0.432, "step": 72815 }, { "epoch": 2.624427866075612, "grad_norm": 0.24110877513885498, "learning_rate": 2.4214567688214788e-05, "loss": 0.4188, "step": 72820 }, { "epoch": 2.6246080657368362, "grad_norm": 0.24411988258361816, "learning_rate": 2.4211651000442988e-05, "loss": 0.3898, "step": 72825 }, { "epoch": 2.624788265398061, "grad_norm": 0.21839167177677155, "learning_rate": 2.4208734323412284e-05, "loss": 0.4199, "step": 72830 }, { "epoch": 2.6249684650592857, "grad_norm": 0.2261667400598526, "learning_rate": 2.420581765716243e-05, "loss": 0.4064, "step": 72835 }, { "epoch": 2.6251486647205104, "grad_norm": 0.20573599636554718, "learning_rate": 2.420290100173315e-05, "loss": 0.3813, "step": 72840 }, { "epoch": 2.6253288643817347, "grad_norm": 0.17270725965499878, "learning_rate": 2.4199984357164197e-05, "loss": 0.3894, "step": 72845 }, { "epoch": 2.6255090640429595, "grad_norm": 0.2366410195827484, "learning_rate": 2.4197067723495298e-05, "loss": 0.3717, "step": 72850 }, { "epoch": 2.625689263704184, "grad_norm": 0.2237681895494461, "learning_rate": 2.4194151100766193e-05, "loss": 0.3762, "step": 72855 }, { "epoch": 2.625869463365409, "grad_norm": 0.20603452622890472, "learning_rate": 2.419123448901663e-05, "loss": 0.4403, "step": 72860 }, { "epoch": 2.6260496630266337, "grad_norm": 0.21571598947048187, "learning_rate": 2.4188317888286345e-05, "loss": 0.3842, "step": 72865 }, { "epoch": 2.6262298626878584, "grad_norm": 0.20403259992599487, "learning_rate": 2.4185401298615052e-05, "loss": 0.3822, "step": 72870 }, { "epoch": 2.6264100623490827, "grad_norm": 0.20591306686401367, "learning_rate": 2.418248472004253e-05, "loss": 0.4068, "step": 72875 }, { "epoch": 2.6265902620103074, "grad_norm": 0.233648881316185, "learning_rate": 2.4179568152608476e-05, "loss": 0.3983, "step": 72880 }, { "epoch": 2.626770461671532, "grad_norm": 0.21279099583625793, "learning_rate": 2.4176651596352657e-05, "loss": 0.3876, "step": 72885 }, { "epoch": 
2.6269506613327565, "grad_norm": 0.24593648314476013, "learning_rate": 2.417373505131481e-05, "loss": 0.4046, "step": 72890 }, { "epoch": 2.627130860993981, "grad_norm": 0.18338236212730408, "learning_rate": 2.4170818517534642e-05, "loss": 0.3814, "step": 72895 }, { "epoch": 2.627311060655206, "grad_norm": 0.19615016877651215, "learning_rate": 2.416790199505193e-05, "loss": 0.3829, "step": 72900 }, { "epoch": 2.6274912603164307, "grad_norm": 0.17791742086410522, "learning_rate": 2.4164985483906384e-05, "loss": 0.3914, "step": 72905 }, { "epoch": 2.6276714599776554, "grad_norm": 0.22163866460323334, "learning_rate": 2.416206898413775e-05, "loss": 0.4144, "step": 72910 }, { "epoch": 2.62785165963888, "grad_norm": 0.18475517630577087, "learning_rate": 2.4159152495785765e-05, "loss": 0.3815, "step": 72915 }, { "epoch": 2.6280318593001044, "grad_norm": 0.22653162479400635, "learning_rate": 2.415623601889016e-05, "loss": 0.403, "step": 72920 }, { "epoch": 2.628212058961329, "grad_norm": 0.1823846399784088, "learning_rate": 2.4153319553490677e-05, "loss": 0.3878, "step": 72925 }, { "epoch": 2.628392258622554, "grad_norm": 0.21723927557468414, "learning_rate": 2.4150403099627056e-05, "loss": 0.4188, "step": 72930 }, { "epoch": 2.628572458283778, "grad_norm": 0.2315875142812729, "learning_rate": 2.414748665733902e-05, "loss": 0.3892, "step": 72935 }, { "epoch": 2.628752657945003, "grad_norm": 0.24299949407577515, "learning_rate": 2.4144570226666325e-05, "loss": 0.4066, "step": 72940 }, { "epoch": 2.6289328576062276, "grad_norm": 0.19684675335884094, "learning_rate": 2.414165380764869e-05, "loss": 0.4139, "step": 72945 }, { "epoch": 2.6291130572674524, "grad_norm": 0.232985258102417, "learning_rate": 2.413873740032585e-05, "loss": 0.3786, "step": 72950 }, { "epoch": 2.629293256928677, "grad_norm": 0.20983463525772095, "learning_rate": 2.413582100473755e-05, "loss": 0.3665, "step": 72955 }, { "epoch": 2.629473456589902, "grad_norm": 0.1940142810344696, "learning_rate": 
2.4132904620923518e-05, "loss": 0.3979, "step": 72960 }, { "epoch": 2.629653656251126, "grad_norm": 0.20459094643592834, "learning_rate": 2.41299882489235e-05, "loss": 0.3806, "step": 72965 }, { "epoch": 2.629833855912351, "grad_norm": 0.22843003273010254, "learning_rate": 2.4127071888777227e-05, "loss": 0.3792, "step": 72970 }, { "epoch": 2.6300140555735756, "grad_norm": 0.25140973925590515, "learning_rate": 2.4124155540524414e-05, "loss": 0.4148, "step": 72975 }, { "epoch": 2.6301942552348, "grad_norm": 0.2235340029001236, "learning_rate": 2.4121239204204833e-05, "loss": 0.3699, "step": 72980 }, { "epoch": 2.6303744548960246, "grad_norm": 0.2354481816291809, "learning_rate": 2.4118322879858187e-05, "loss": 0.4014, "step": 72985 }, { "epoch": 2.6305546545572493, "grad_norm": 0.17458322644233704, "learning_rate": 2.4115406567524217e-05, "loss": 0.3573, "step": 72990 }, { "epoch": 2.630734854218474, "grad_norm": 0.1971106380224228, "learning_rate": 2.4112490267242665e-05, "loss": 0.3703, "step": 72995 }, { "epoch": 2.630915053879699, "grad_norm": 0.2097444236278534, "learning_rate": 2.410957397905326e-05, "loss": 0.4161, "step": 73000 }, { "epoch": 2.630915053879699, "eval_loss": 0.43243083357810974, "eval_runtime": 3.5273, "eval_samples_per_second": 28.35, "eval_steps_per_second": 7.087, "step": 73000 }, { "epoch": 2.6310952535409236, "grad_norm": 0.2296389639377594, "learning_rate": 2.410665770299574e-05, "loss": 0.386, "step": 73005 }, { "epoch": 2.631275453202148, "grad_norm": 0.2250434011220932, "learning_rate": 2.4103741439109835e-05, "loss": 0.3863, "step": 73010 }, { "epoch": 2.6314556528633726, "grad_norm": 0.20868182182312012, "learning_rate": 2.410082518743528e-05, "loss": 0.366, "step": 73015 }, { "epoch": 2.6316358525245973, "grad_norm": 0.2829134166240692, "learning_rate": 2.4097908948011804e-05, "loss": 0.3913, "step": 73020 }, { "epoch": 2.631816052185822, "grad_norm": 0.21935230493545532, "learning_rate": 2.4094992720879144e-05, "loss": 0.3815, 
"step": 73025 }, { "epoch": 2.6319962518470463, "grad_norm": 0.1936255842447281, "learning_rate": 2.4092076506077036e-05, "loss": 0.3789, "step": 73030 }, { "epoch": 2.632176451508271, "grad_norm": 0.18805243074893951, "learning_rate": 2.408916030364521e-05, "loss": 0.3998, "step": 73035 }, { "epoch": 2.632356651169496, "grad_norm": 0.19446292519569397, "learning_rate": 2.4086244113623395e-05, "loss": 0.4257, "step": 73040 }, { "epoch": 2.6325368508307205, "grad_norm": 0.2560482621192932, "learning_rate": 2.408332793605133e-05, "loss": 0.387, "step": 73045 }, { "epoch": 2.6327170504919453, "grad_norm": 0.1912938952445984, "learning_rate": 2.4080411770968746e-05, "loss": 0.4343, "step": 73050 }, { "epoch": 2.6328972501531696, "grad_norm": 0.2362576127052307, "learning_rate": 2.4077495618415367e-05, "loss": 0.3869, "step": 73055 }, { "epoch": 2.6330774498143943, "grad_norm": 0.19982530176639557, "learning_rate": 2.4074579478430942e-05, "loss": 0.3916, "step": 73060 }, { "epoch": 2.633257649475619, "grad_norm": 0.2339782565832138, "learning_rate": 2.407166335105518e-05, "loss": 0.3988, "step": 73065 }, { "epoch": 2.6334378491368438, "grad_norm": 0.2075488567352295, "learning_rate": 2.4068747236327838e-05, "loss": 0.4047, "step": 73070 }, { "epoch": 2.633618048798068, "grad_norm": 0.20350724458694458, "learning_rate": 2.4065831134288635e-05, "loss": 0.3793, "step": 73075 }, { "epoch": 2.633798248459293, "grad_norm": 0.20746393501758575, "learning_rate": 2.4062915044977284e-05, "loss": 0.401, "step": 73080 }, { "epoch": 2.6339784481205175, "grad_norm": 0.21754801273345947, "learning_rate": 2.4059998968433553e-05, "loss": 0.3765, "step": 73085 }, { "epoch": 2.6341586477817422, "grad_norm": 0.16761621832847595, "learning_rate": 2.4057082904697152e-05, "loss": 0.393, "step": 73090 }, { "epoch": 2.634338847442967, "grad_norm": 0.23849821090698242, "learning_rate": 2.4054166853807803e-05, "loss": 0.4177, "step": 73095 }, { "epoch": 2.6345190471041917, "grad_norm": 
0.2702219784259796, "learning_rate": 2.4051250815805253e-05, "loss": 0.4136, "step": 73100 }, { "epoch": 2.634699246765416, "grad_norm": 0.22448520362377167, "learning_rate": 2.4048334790729225e-05, "loss": 0.423, "step": 73105 }, { "epoch": 2.6348794464266407, "grad_norm": 0.24827606976032257, "learning_rate": 2.4045418778619456e-05, "loss": 0.4137, "step": 73110 }, { "epoch": 2.6350596460878655, "grad_norm": 0.2294238954782486, "learning_rate": 2.4042502779515668e-05, "loss": 0.3948, "step": 73115 }, { "epoch": 2.6352398457490898, "grad_norm": 0.2006445825099945, "learning_rate": 2.4039586793457593e-05, "loss": 0.3847, "step": 73120 }, { "epoch": 2.6354200454103145, "grad_norm": 0.2071392983198166, "learning_rate": 2.4036670820484964e-05, "loss": 0.3991, "step": 73125 }, { "epoch": 2.6356002450715392, "grad_norm": 0.23408040404319763, "learning_rate": 2.403375486063751e-05, "loss": 0.4106, "step": 73130 }, { "epoch": 2.635780444732764, "grad_norm": 0.2418873906135559, "learning_rate": 2.4030838913954955e-05, "loss": 0.4135, "step": 73135 }, { "epoch": 2.6359606443939887, "grad_norm": 0.18823467195034027, "learning_rate": 2.4027922980477036e-05, "loss": 0.3632, "step": 73140 }, { "epoch": 2.6361408440552134, "grad_norm": 0.22256679832935333, "learning_rate": 2.402500706024347e-05, "loss": 0.3916, "step": 73145 }, { "epoch": 2.6363210437164377, "grad_norm": 0.2467431128025055, "learning_rate": 2.4022091153294004e-05, "loss": 0.4101, "step": 73150 }, { "epoch": 2.6365012433776625, "grad_norm": 0.20016492903232574, "learning_rate": 2.4019175259668362e-05, "loss": 0.3946, "step": 73155 }, { "epoch": 2.636681443038887, "grad_norm": 0.2600384056568146, "learning_rate": 2.4016259379406247e-05, "loss": 0.4024, "step": 73160 }, { "epoch": 2.6368616427001115, "grad_norm": 0.2585957646369934, "learning_rate": 2.401334351254743e-05, "loss": 0.3854, "step": 73165 }, { "epoch": 2.637041842361336, "grad_norm": 0.1720488965511322, "learning_rate": 2.4010427659131604e-05, "loss": 
0.3705, "step": 73170 }, { "epoch": 2.637222042022561, "grad_norm": 0.23503340780735016, "learning_rate": 2.4007511819198503e-05, "loss": 0.4219, "step": 73175 }, { "epoch": 2.6374022416837857, "grad_norm": 0.21345233917236328, "learning_rate": 2.4004595992787877e-05, "loss": 0.4177, "step": 73180 }, { "epoch": 2.6375824413450104, "grad_norm": 0.2077561467885971, "learning_rate": 2.400168017993942e-05, "loss": 0.4072, "step": 73185 }, { "epoch": 2.637762641006235, "grad_norm": 0.2376868724822998, "learning_rate": 2.3998764380692896e-05, "loss": 0.4042, "step": 73190 }, { "epoch": 2.6379428406674594, "grad_norm": 0.24904000759124756, "learning_rate": 2.3995848595088008e-05, "loss": 0.423, "step": 73195 }, { "epoch": 2.638123040328684, "grad_norm": 0.20672129094600677, "learning_rate": 2.3992932823164483e-05, "loss": 0.397, "step": 73200 }, { "epoch": 2.638303239989909, "grad_norm": 0.17566558718681335, "learning_rate": 2.3990017064962056e-05, "loss": 0.4138, "step": 73205 }, { "epoch": 2.638483439651133, "grad_norm": 0.19264771044254303, "learning_rate": 2.398710132052045e-05, "loss": 0.3775, "step": 73210 }, { "epoch": 2.638663639312358, "grad_norm": 0.17429447174072266, "learning_rate": 2.3984185589879395e-05, "loss": 0.3963, "step": 73215 }, { "epoch": 2.6388438389735827, "grad_norm": 0.19570399820804596, "learning_rate": 2.3981269873078613e-05, "loss": 0.4127, "step": 73220 }, { "epoch": 2.6390240386348074, "grad_norm": 0.23415052890777588, "learning_rate": 2.3978354170157828e-05, "loss": 0.3912, "step": 73225 }, { "epoch": 2.639204238296032, "grad_norm": 0.22329415380954742, "learning_rate": 2.3975438481156772e-05, "loss": 0.4042, "step": 73230 }, { "epoch": 2.639384437957257, "grad_norm": 0.17532110214233398, "learning_rate": 2.3972522806115176e-05, "loss": 0.4019, "step": 73235 }, { "epoch": 2.639564637618481, "grad_norm": 0.2220272421836853, "learning_rate": 2.3969607145072747e-05, "loss": 0.4198, "step": 73240 }, { "epoch": 2.639744837279706, "grad_norm": 
0.2223370224237442, "learning_rate": 2.3966691498069228e-05, "loss": 0.4365, "step": 73245 }, { "epoch": 2.6399250369409306, "grad_norm": 0.2924058437347412, "learning_rate": 2.396377586514433e-05, "loss": 0.3981, "step": 73250 }, { "epoch": 2.6401052366021553, "grad_norm": 0.22655771672725677, "learning_rate": 2.396086024633779e-05, "loss": 0.4427, "step": 73255 }, { "epoch": 2.6402854362633796, "grad_norm": 0.17349043488502502, "learning_rate": 2.3957944641689335e-05, "loss": 0.3575, "step": 73260 }, { "epoch": 2.6404656359246044, "grad_norm": 0.18977303802967072, "learning_rate": 2.3955029051238666e-05, "loss": 0.3798, "step": 73265 }, { "epoch": 2.640645835585829, "grad_norm": 0.2203160524368286, "learning_rate": 2.3952113475025543e-05, "loss": 0.3796, "step": 73270 }, { "epoch": 2.640826035247054, "grad_norm": 0.16892023384571075, "learning_rate": 2.394919791308966e-05, "loss": 0.4015, "step": 73275 }, { "epoch": 2.6410062349082786, "grad_norm": 0.22856315970420837, "learning_rate": 2.3946282365470755e-05, "loss": 0.4011, "step": 73280 }, { "epoch": 2.641186434569503, "grad_norm": 0.21979033946990967, "learning_rate": 2.3943366832208548e-05, "loss": 0.4229, "step": 73285 }, { "epoch": 2.6413666342307276, "grad_norm": 0.180423304438591, "learning_rate": 2.3940451313342757e-05, "loss": 0.3786, "step": 73290 }, { "epoch": 2.6415468338919523, "grad_norm": 0.2113245725631714, "learning_rate": 2.393753580891312e-05, "loss": 0.3818, "step": 73295 }, { "epoch": 2.641727033553177, "grad_norm": 0.21729253232479095, "learning_rate": 2.393462031895935e-05, "loss": 0.4212, "step": 73300 }, { "epoch": 2.6419072332144014, "grad_norm": 0.20087195932865143, "learning_rate": 2.393170484352117e-05, "loss": 0.4411, "step": 73305 }, { "epoch": 2.642087432875626, "grad_norm": 0.21028025448322296, "learning_rate": 2.3928789382638305e-05, "loss": 0.424, "step": 73310 }, { "epoch": 2.642267632536851, "grad_norm": 0.18576066195964813, "learning_rate": 2.392587393635048e-05, "loss": 
0.3946, "step": 73315 }, { "epoch": 2.6424478321980756, "grad_norm": 0.18550072610378265, "learning_rate": 2.392295850469741e-05, "loss": 0.4047, "step": 73320 }, { "epoch": 2.6426280318593003, "grad_norm": 0.17761926352977753, "learning_rate": 2.3920043087718826e-05, "loss": 0.3817, "step": 73325 }, { "epoch": 2.6428082315205246, "grad_norm": 0.20507977902889252, "learning_rate": 2.3917127685454442e-05, "loss": 0.409, "step": 73330 }, { "epoch": 2.6429884311817493, "grad_norm": 0.24363189935684204, "learning_rate": 2.391421229794399e-05, "loss": 0.4085, "step": 73335 }, { "epoch": 2.643168630842974, "grad_norm": 0.17502768337726593, "learning_rate": 2.3911296925227182e-05, "loss": 0.4124, "step": 73340 }, { "epoch": 2.6433488305041988, "grad_norm": 0.21647201478481293, "learning_rate": 2.3908381567343736e-05, "loss": 0.3596, "step": 73345 }, { "epoch": 2.643529030165423, "grad_norm": 0.24254228174686432, "learning_rate": 2.390546622433339e-05, "loss": 0.4148, "step": 73350 }, { "epoch": 2.643709229826648, "grad_norm": 0.22495651245117188, "learning_rate": 2.3902550896235855e-05, "loss": 0.3867, "step": 73355 }, { "epoch": 2.6438894294878725, "grad_norm": 0.23512765765190125, "learning_rate": 2.3899635583090837e-05, "loss": 0.4377, "step": 73360 }, { "epoch": 2.6440696291490973, "grad_norm": 0.18619103729724884, "learning_rate": 2.389672028493809e-05, "loss": 0.4124, "step": 73365 }, { "epoch": 2.644249828810322, "grad_norm": 0.2616622745990753, "learning_rate": 2.3893805001817298e-05, "loss": 0.3899, "step": 73370 }, { "epoch": 2.6444300284715467, "grad_norm": 0.23114174604415894, "learning_rate": 2.3890889733768215e-05, "loss": 0.4002, "step": 73375 }, { "epoch": 2.644610228132771, "grad_norm": 0.19029228389263153, "learning_rate": 2.388797448083054e-05, "loss": 0.3984, "step": 73380 }, { "epoch": 2.6447904277939958, "grad_norm": 0.2512267529964447, "learning_rate": 2.388505924304399e-05, "loss": 0.4141, "step": 73385 }, { "epoch": 2.6449706274552205, 
"grad_norm": 0.21497133374214172, "learning_rate": 2.3882144020448297e-05, "loss": 0.4142, "step": 73390 }, { "epoch": 2.645150827116445, "grad_norm": 0.20400524139404297, "learning_rate": 2.3879228813083175e-05, "loss": 0.4061, "step": 73395 }, { "epoch": 2.6453310267776695, "grad_norm": 0.1923159807920456, "learning_rate": 2.387631362098834e-05, "loss": 0.4065, "step": 73400 }, { "epoch": 2.6455112264388942, "grad_norm": 0.2456676959991455, "learning_rate": 2.387339844420352e-05, "loss": 0.3786, "step": 73405 }, { "epoch": 2.645691426100119, "grad_norm": 0.20096588134765625, "learning_rate": 2.3870483282768422e-05, "loss": 0.4352, "step": 73410 }, { "epoch": 2.6458716257613437, "grad_norm": 0.19349773228168488, "learning_rate": 2.3867568136722777e-05, "loss": 0.4179, "step": 73415 }, { "epoch": 2.6460518254225684, "grad_norm": 0.24120834469795227, "learning_rate": 2.3864653006106298e-05, "loss": 0.3935, "step": 73420 }, { "epoch": 2.6462320250837927, "grad_norm": 0.1798119693994522, "learning_rate": 2.386173789095869e-05, "loss": 0.3727, "step": 73425 }, { "epoch": 2.6464122247450175, "grad_norm": 0.2536999583244324, "learning_rate": 2.3858822791319693e-05, "loss": 0.3995, "step": 73430 }, { "epoch": 2.646592424406242, "grad_norm": 0.24796532094478607, "learning_rate": 2.385590770722901e-05, "loss": 0.4144, "step": 73435 }, { "epoch": 2.6467726240674665, "grad_norm": 0.17443668842315674, "learning_rate": 2.3852992638726368e-05, "loss": 0.3669, "step": 73440 }, { "epoch": 2.6469528237286912, "grad_norm": 0.2030688226222992, "learning_rate": 2.385007758585148e-05, "loss": 0.3755, "step": 73445 }, { "epoch": 2.647133023389916, "grad_norm": 0.1865500956773758, "learning_rate": 2.3847162548644054e-05, "loss": 0.3985, "step": 73450 }, { "epoch": 2.6473132230511407, "grad_norm": 0.16397204995155334, "learning_rate": 2.3844247527143826e-05, "loss": 0.3599, "step": 73455 }, { "epoch": 2.6474934227123654, "grad_norm": 0.16662755608558655, "learning_rate": 
2.3841332521390496e-05, "loss": 0.3815, "step": 73460 }, { "epoch": 2.64767362237359, "grad_norm": 0.2906810939311981, "learning_rate": 2.383841753142378e-05, "loss": 0.3871, "step": 73465 }, { "epoch": 2.6478538220348145, "grad_norm": 0.21434706449508667, "learning_rate": 2.383550255728341e-05, "loss": 0.3876, "step": 73470 }, { "epoch": 2.648034021696039, "grad_norm": 0.212555930018425, "learning_rate": 2.3832587599009083e-05, "loss": 0.4124, "step": 73475 }, { "epoch": 2.648214221357264, "grad_norm": 0.26868897676467896, "learning_rate": 2.3829672656640534e-05, "loss": 0.4283, "step": 73480 }, { "epoch": 2.648394421018488, "grad_norm": 0.20437636971473694, "learning_rate": 2.3826757730217467e-05, "loss": 0.3905, "step": 73485 }, { "epoch": 2.648574620679713, "grad_norm": 0.2129250317811966, "learning_rate": 2.382384281977959e-05, "loss": 0.4188, "step": 73490 }, { "epoch": 2.6487548203409377, "grad_norm": 0.2285563051700592, "learning_rate": 2.3820927925366634e-05, "loss": 0.3996, "step": 73495 }, { "epoch": 2.6489350200021624, "grad_norm": 0.20043610036373138, "learning_rate": 2.3818013047018304e-05, "loss": 0.403, "step": 73500 }, { "epoch": 2.6489350200021624, "eval_loss": 0.4324740469455719, "eval_runtime": 3.5359, "eval_samples_per_second": 28.282, "eval_steps_per_second": 7.07, "step": 73500 }, { "epoch": 2.649115219663387, "grad_norm": 0.21913573145866394, "learning_rate": 2.3815098184774318e-05, "loss": 0.4078, "step": 73505 }, { "epoch": 2.649295419324612, "grad_norm": 0.1827915608882904, "learning_rate": 2.3812183338674393e-05, "loss": 0.4111, "step": 73510 }, { "epoch": 2.649475618985836, "grad_norm": 0.2369619905948639, "learning_rate": 2.3809268508758232e-05, "loss": 0.3877, "step": 73515 }, { "epoch": 2.649655818647061, "grad_norm": 0.19070202112197876, "learning_rate": 2.3806353695065564e-05, "loss": 0.4125, "step": 73520 }, { "epoch": 2.6498360183082856, "grad_norm": 0.2124054878950119, "learning_rate": 2.3803438897636095e-05, "loss": 0.3944, 
"step": 73525 }, { "epoch": 2.6500162179695104, "grad_norm": 0.22031430900096893, "learning_rate": 2.3800524116509537e-05, "loss": 0.3687, "step": 73530 }, { "epoch": 2.6501964176307347, "grad_norm": 0.20872965455055237, "learning_rate": 2.379760935172561e-05, "loss": 0.376, "step": 73535 }, { "epoch": 2.6503766172919594, "grad_norm": 0.2545884847640991, "learning_rate": 2.3794694603324026e-05, "loss": 0.3869, "step": 73540 }, { "epoch": 2.650556816953184, "grad_norm": 0.23649878799915314, "learning_rate": 2.379177987134448e-05, "loss": 0.3706, "step": 73545 }, { "epoch": 2.650737016614409, "grad_norm": 0.20820178091526031, "learning_rate": 2.3788865155826716e-05, "loss": 0.4016, "step": 73550 }, { "epoch": 2.6509172162756336, "grad_norm": 0.2441217005252838, "learning_rate": 2.378595045681041e-05, "loss": 0.3979, "step": 73555 }, { "epoch": 2.651097415936858, "grad_norm": 0.2145996391773224, "learning_rate": 2.3783035774335313e-05, "loss": 0.4015, "step": 73560 }, { "epoch": 2.6512776155980826, "grad_norm": 0.2503126263618469, "learning_rate": 2.3780121108441116e-05, "loss": 0.4389, "step": 73565 }, { "epoch": 2.6514578152593073, "grad_norm": 0.19581691920757294, "learning_rate": 2.377720645916752e-05, "loss": 0.3767, "step": 73570 }, { "epoch": 2.651638014920532, "grad_norm": 0.19556576013565063, "learning_rate": 2.377429182655426e-05, "loss": 0.389, "step": 73575 }, { "epoch": 2.6518182145817564, "grad_norm": 0.2089264839887619, "learning_rate": 2.3771377210641035e-05, "loss": 0.3602, "step": 73580 }, { "epoch": 2.651998414242981, "grad_norm": 0.21499642729759216, "learning_rate": 2.3768462611467552e-05, "loss": 0.4022, "step": 73585 }, { "epoch": 2.652178613904206, "grad_norm": 0.27903100848197937, "learning_rate": 2.3765548029073535e-05, "loss": 0.417, "step": 73590 }, { "epoch": 2.6523588135654306, "grad_norm": 0.2061304748058319, "learning_rate": 2.3762633463498677e-05, "loss": 0.3962, "step": 73595 }, { "epoch": 2.6525390132266553, "grad_norm": 
0.23685584962368011, "learning_rate": 2.375971891478271e-05, "loss": 0.3888, "step": 73600 }, { "epoch": 2.65271921288788, "grad_norm": 0.22389626502990723, "learning_rate": 2.3756804382965324e-05, "loss": 0.4113, "step": 73605 }, { "epoch": 2.6528994125491043, "grad_norm": 0.1898953765630722, "learning_rate": 2.375388986808624e-05, "loss": 0.3737, "step": 73610 }, { "epoch": 2.653079612210329, "grad_norm": 0.22866353392601013, "learning_rate": 2.375097537018517e-05, "loss": 0.3694, "step": 73615 }, { "epoch": 2.653259811871554, "grad_norm": 0.22122244536876678, "learning_rate": 2.374806088930181e-05, "loss": 0.4329, "step": 73620 }, { "epoch": 2.653440011532778, "grad_norm": 0.17833849787712097, "learning_rate": 2.3745146425475884e-05, "loss": 0.4094, "step": 73625 }, { "epoch": 2.653620211194003, "grad_norm": 0.2136964350938797, "learning_rate": 2.37422319787471e-05, "loss": 0.4217, "step": 73630 }, { "epoch": 2.6538004108552276, "grad_norm": 0.18438690900802612, "learning_rate": 2.3739317549155148e-05, "loss": 0.4136, "step": 73635 }, { "epoch": 2.6539806105164523, "grad_norm": 0.23062624037265778, "learning_rate": 2.373640313673976e-05, "loss": 0.3857, "step": 73640 }, { "epoch": 2.654160810177677, "grad_norm": 0.26239144802093506, "learning_rate": 2.373348874154064e-05, "loss": 0.4017, "step": 73645 }, { "epoch": 2.6543410098389018, "grad_norm": 0.22294588387012482, "learning_rate": 2.3730574363597476e-05, "loss": 0.4033, "step": 73650 }, { "epoch": 2.654521209500126, "grad_norm": 0.17945276200771332, "learning_rate": 2.3727660002950006e-05, "loss": 0.3721, "step": 73655 }, { "epoch": 2.654701409161351, "grad_norm": 0.22688071429729462, "learning_rate": 2.3724745659637902e-05, "loss": 0.3883, "step": 73660 }, { "epoch": 2.6548816088225755, "grad_norm": 0.21472230553627014, "learning_rate": 2.3721831333700913e-05, "loss": 0.4231, "step": 73665 }, { "epoch": 2.6550618084838, "grad_norm": 0.23527327179908752, "learning_rate": 2.371891702517872e-05, "loss": 
0.3958, "step": 73670 }, { "epoch": 2.6552420081450245, "grad_norm": 0.19936297833919525, "learning_rate": 2.3716002734111024e-05, "loss": 0.3937, "step": 73675 }, { "epoch": 2.6554222078062493, "grad_norm": 0.19073469936847687, "learning_rate": 2.371308846053755e-05, "loss": 0.3815, "step": 73680 }, { "epoch": 2.655602407467474, "grad_norm": 0.19665153324604034, "learning_rate": 2.3710174204497997e-05, "loss": 0.4036, "step": 73685 }, { "epoch": 2.6557826071286987, "grad_norm": 0.2125919610261917, "learning_rate": 2.3707259966032064e-05, "loss": 0.4332, "step": 73690 }, { "epoch": 2.6559628067899235, "grad_norm": 0.16980135440826416, "learning_rate": 2.370434574517947e-05, "loss": 0.4054, "step": 73695 }, { "epoch": 2.6561430064511478, "grad_norm": 0.16812558472156525, "learning_rate": 2.370143154197991e-05, "loss": 0.3967, "step": 73700 }, { "epoch": 2.6563232061123725, "grad_norm": 0.23627859354019165, "learning_rate": 2.3698517356473098e-05, "loss": 0.3897, "step": 73705 }, { "epoch": 2.6565034057735972, "grad_norm": 0.2503635287284851, "learning_rate": 2.3695603188698733e-05, "loss": 0.3958, "step": 73710 }, { "epoch": 2.6566836054348215, "grad_norm": 0.2157202512025833, "learning_rate": 2.3692689038696518e-05, "loss": 0.4087, "step": 73715 }, { "epoch": 2.6568638050960462, "grad_norm": 0.1902592033147812, "learning_rate": 2.368977490650617e-05, "loss": 0.389, "step": 73720 }, { "epoch": 2.657044004757271, "grad_norm": 0.188002347946167, "learning_rate": 2.368686079216739e-05, "loss": 0.3733, "step": 73725 }, { "epoch": 2.6572242044184957, "grad_norm": 0.22462768852710724, "learning_rate": 2.3683946695719857e-05, "loss": 0.3922, "step": 73730 }, { "epoch": 2.6574044040797205, "grad_norm": 0.22756509482860565, "learning_rate": 2.3681032617203317e-05, "loss": 0.4083, "step": 73735 }, { "epoch": 2.657584603740945, "grad_norm": 0.2529270052909851, "learning_rate": 2.367811855665743e-05, "loss": 0.3848, "step": 73740 }, { "epoch": 2.6577648034021695, "grad_norm": 
0.2393999844789505, "learning_rate": 2.3675204514121942e-05, "loss": 0.3802, "step": 73745 }, { "epoch": 2.657945003063394, "grad_norm": 0.2106008678674698, "learning_rate": 2.3672290489636533e-05, "loss": 0.4216, "step": 73750 }, { "epoch": 2.658125202724619, "grad_norm": 0.19306325912475586, "learning_rate": 2.3669376483240894e-05, "loss": 0.3937, "step": 73755 }, { "epoch": 2.6583054023858437, "grad_norm": 0.2211955040693283, "learning_rate": 2.366646249497476e-05, "loss": 0.4201, "step": 73760 }, { "epoch": 2.658485602047068, "grad_norm": 0.20790930092334747, "learning_rate": 2.366354852487781e-05, "loss": 0.368, "step": 73765 }, { "epoch": 2.6586658017082927, "grad_norm": 0.2273816168308258, "learning_rate": 2.3660634572989747e-05, "loss": 0.3965, "step": 73770 }, { "epoch": 2.6588460013695174, "grad_norm": 0.22291794419288635, "learning_rate": 2.3657720639350288e-05, "loss": 0.4491, "step": 73775 }, { "epoch": 2.659026201030742, "grad_norm": 0.21958285570144653, "learning_rate": 2.3654806723999117e-05, "loss": 0.3876, "step": 73780 }, { "epoch": 2.659206400691967, "grad_norm": 0.23919081687927246, "learning_rate": 2.365189282697595e-05, "loss": 0.3955, "step": 73785 }, { "epoch": 2.659386600353191, "grad_norm": 0.2069309651851654, "learning_rate": 2.3648978948320483e-05, "loss": 0.4031, "step": 73790 }, { "epoch": 2.659566800014416, "grad_norm": 0.22566114366054535, "learning_rate": 2.3646065088072407e-05, "loss": 0.3822, "step": 73795 }, { "epoch": 2.6597469996756407, "grad_norm": 0.193036288022995, "learning_rate": 2.364315124627144e-05, "loss": 0.3603, "step": 73800 }, { "epoch": 2.6599271993368654, "grad_norm": 0.19962070882320404, "learning_rate": 2.3640237422957275e-05, "loss": 0.3869, "step": 73805 }, { "epoch": 2.6601073989980897, "grad_norm": 0.20868006348609924, "learning_rate": 2.3637323618169606e-05, "loss": 0.3864, "step": 73810 }, { "epoch": 2.6602875986593144, "grad_norm": 0.22715534269809723, "learning_rate": 2.3634409831948144e-05, "loss": 
0.3757, "step": 73815 }, { "epoch": 2.660467798320539, "grad_norm": 0.22947651147842407, "learning_rate": 2.363149606433258e-05, "loss": 0.3707, "step": 73820 }, { "epoch": 2.660647997981764, "grad_norm": 0.17915059626102448, "learning_rate": 2.3628582315362625e-05, "loss": 0.4104, "step": 73825 }, { "epoch": 2.6608281976429886, "grad_norm": 0.2261868566274643, "learning_rate": 2.362566858507797e-05, "loss": 0.3774, "step": 73830 }, { "epoch": 2.661008397304213, "grad_norm": 0.23218120634555817, "learning_rate": 2.3622754873518303e-05, "loss": 0.4169, "step": 73835 }, { "epoch": 2.6611885969654376, "grad_norm": 0.2152547836303711, "learning_rate": 2.3619841180723345e-05, "loss": 0.4173, "step": 73840 }, { "epoch": 2.6613687966266624, "grad_norm": 0.22858308255672455, "learning_rate": 2.3616927506732773e-05, "loss": 0.3866, "step": 73845 }, { "epoch": 2.661548996287887, "grad_norm": 0.2074364870786667, "learning_rate": 2.361401385158631e-05, "loss": 0.3856, "step": 73850 }, { "epoch": 2.6617291959491114, "grad_norm": 0.1993214190006256, "learning_rate": 2.361110021532363e-05, "loss": 0.4015, "step": 73855 }, { "epoch": 2.661909395610336, "grad_norm": 0.22513341903686523, "learning_rate": 2.360818659798444e-05, "loss": 0.3876, "step": 73860 }, { "epoch": 2.662089595271561, "grad_norm": 0.20720557868480682, "learning_rate": 2.3605272999608442e-05, "loss": 0.4208, "step": 73865 }, { "epoch": 2.6622697949327856, "grad_norm": 0.19978566467761993, "learning_rate": 2.3602359420235333e-05, "loss": 0.4022, "step": 73870 }, { "epoch": 2.6624499945940103, "grad_norm": 0.23757880926132202, "learning_rate": 2.3599445859904798e-05, "loss": 0.3755, "step": 73875 }, { "epoch": 2.662630194255235, "grad_norm": 0.17947709560394287, "learning_rate": 2.3596532318656547e-05, "loss": 0.4252, "step": 73880 }, { "epoch": 2.6628103939164594, "grad_norm": 0.22774136066436768, "learning_rate": 2.3593618796530268e-05, "loss": 0.3899, "step": 73885 }, { "epoch": 2.662990593577684, "grad_norm": 
0.23940308392047882, "learning_rate": 2.3590705293565663e-05, "loss": 0.3636, "step": 73890 }, { "epoch": 2.663170793238909, "grad_norm": 0.1792476326227188, "learning_rate": 2.3587791809802427e-05, "loss": 0.3771, "step": 73895 }, { "epoch": 2.663350992900133, "grad_norm": 0.15978147089481354, "learning_rate": 2.358487834528025e-05, "loss": 0.4011, "step": 73900 }, { "epoch": 2.663531192561358, "grad_norm": 0.20006360113620758, "learning_rate": 2.3581964900038836e-05, "loss": 0.3993, "step": 73905 }, { "epoch": 2.6637113922225826, "grad_norm": 0.1868210732936859, "learning_rate": 2.357905147411788e-05, "loss": 0.3967, "step": 73910 }, { "epoch": 2.6638915918838073, "grad_norm": 0.21746650338172913, "learning_rate": 2.3576138067557058e-05, "loss": 0.3769, "step": 73915 }, { "epoch": 2.664071791545032, "grad_norm": 0.21393951773643494, "learning_rate": 2.357322468039609e-05, "loss": 0.4063, "step": 73920 }, { "epoch": 2.6642519912062568, "grad_norm": 0.21100027859210968, "learning_rate": 2.3570311312674654e-05, "loss": 0.4181, "step": 73925 }, { "epoch": 2.664432190867481, "grad_norm": 0.20578476786613464, "learning_rate": 2.356739796443245e-05, "loss": 0.396, "step": 73930 }, { "epoch": 2.664612390528706, "grad_norm": 0.2200337052345276, "learning_rate": 2.356448463570918e-05, "loss": 0.3953, "step": 73935 }, { "epoch": 2.6647925901899305, "grad_norm": 0.21242652833461761, "learning_rate": 2.356157132654452e-05, "loss": 0.4087, "step": 73940 }, { "epoch": 2.664972789851155, "grad_norm": 0.2014140486717224, "learning_rate": 2.3558658036978183e-05, "loss": 0.3968, "step": 73945 }, { "epoch": 2.6651529895123796, "grad_norm": 0.2067977786064148, "learning_rate": 2.355574476704984e-05, "loss": 0.3766, "step": 73950 }, { "epoch": 2.6653331891736043, "grad_norm": 0.18431727588176727, "learning_rate": 2.35528315167992e-05, "loss": 0.3723, "step": 73955 }, { "epoch": 2.665513388834829, "grad_norm": 0.2466651052236557, "learning_rate": 2.3549918286265947e-05, "loss": 0.3828, 
"step": 73960 }, { "epoch": 2.6656935884960538, "grad_norm": 0.22020891308784485, "learning_rate": 2.3547005075489776e-05, "loss": 0.3619, "step": 73965 }, { "epoch": 2.6658737881572785, "grad_norm": 0.173415869474411, "learning_rate": 2.3544091884510383e-05, "loss": 0.3719, "step": 73970 }, { "epoch": 2.666053987818503, "grad_norm": 0.17796094715595245, "learning_rate": 2.3541178713367456e-05, "loss": 0.4016, "step": 73975 }, { "epoch": 2.6662341874797275, "grad_norm": 0.235768124461174, "learning_rate": 2.353826556210068e-05, "loss": 0.3881, "step": 73980 }, { "epoch": 2.6664143871409522, "grad_norm": 0.253508597612381, "learning_rate": 2.3535352430749763e-05, "loss": 0.4, "step": 73985 }, { "epoch": 2.6665945868021765, "grad_norm": 0.20636709034442902, "learning_rate": 2.353243931935438e-05, "loss": 0.3943, "step": 73990 }, { "epoch": 2.6667747864634013, "grad_norm": 0.20948311686515808, "learning_rate": 2.3529526227954225e-05, "loss": 0.3992, "step": 73995 }, { "epoch": 2.666954986124626, "grad_norm": 0.19002006947994232, "learning_rate": 2.3526613156588997e-05, "loss": 0.3784, "step": 74000 }, { "epoch": 2.666954986124626, "eval_loss": 0.4320438802242279, "eval_runtime": 3.5274, "eval_samples_per_second": 28.35, "eval_steps_per_second": 7.087, "step": 74000 }, { "epoch": 2.6671351857858507, "grad_norm": 0.2230537235736847, "learning_rate": 2.3523700105298372e-05, "loss": 0.3806, "step": 74005 }, { "epoch": 2.6673153854470755, "grad_norm": 0.20656515657901764, "learning_rate": 2.3520787074122052e-05, "loss": 0.3979, "step": 74010 }, { "epoch": 2.6674955851083, "grad_norm": 0.2289830446243286, "learning_rate": 2.351787406309973e-05, "loss": 0.4243, "step": 74015 }, { "epoch": 2.6676757847695245, "grad_norm": 0.22350530326366425, "learning_rate": 2.3514961072271068e-05, "loss": 0.407, "step": 74020 }, { "epoch": 2.6678559844307492, "grad_norm": 0.224583700299263, "learning_rate": 2.351204810167579e-05, "loss": 0.3582, "step": 74025 }, { "epoch": 
2.668036184091974, "grad_norm": 0.24546033143997192, "learning_rate": 2.3509135151353553e-05, "loss": 0.4033, "step": 74030 }, { "epoch": 2.6682163837531987, "grad_norm": 0.19116534292697906, "learning_rate": 2.350622222134408e-05, "loss": 0.4039, "step": 74035 }, { "epoch": 2.668396583414423, "grad_norm": 0.1550498604774475, "learning_rate": 2.350330931168703e-05, "loss": 0.3879, "step": 74040 }, { "epoch": 2.6685767830756477, "grad_norm": 0.2186582088470459, "learning_rate": 2.3500396422422092e-05, "loss": 0.3957, "step": 74045 }, { "epoch": 2.6687569827368725, "grad_norm": 0.21175815165042877, "learning_rate": 2.3497483553588977e-05, "loss": 0.3772, "step": 74050 }, { "epoch": 2.668937182398097, "grad_norm": 0.25158798694610596, "learning_rate": 2.3494570705227355e-05, "loss": 0.3367, "step": 74055 }, { "epoch": 2.669117382059322, "grad_norm": 0.21257662773132324, "learning_rate": 2.349165787737691e-05, "loss": 0.3836, "step": 74060 }, { "epoch": 2.669297581720546, "grad_norm": 0.22067958116531372, "learning_rate": 2.348874507007734e-05, "loss": 0.4208, "step": 74065 }, { "epoch": 2.669477781381771, "grad_norm": 0.18371091783046722, "learning_rate": 2.348583228336832e-05, "loss": 0.3965, "step": 74070 }, { "epoch": 2.6696579810429957, "grad_norm": 0.2284574806690216, "learning_rate": 2.3482919517289543e-05, "loss": 0.3892, "step": 74075 }, { "epoch": 2.6698381807042204, "grad_norm": 0.20297889411449432, "learning_rate": 2.34800067718807e-05, "loss": 0.4246, "step": 74080 }, { "epoch": 2.6700183803654447, "grad_norm": 0.23391233384609222, "learning_rate": 2.3477094047181463e-05, "loss": 0.4102, "step": 74085 }, { "epoch": 2.6701985800266694, "grad_norm": 0.1801353245973587, "learning_rate": 2.347418134323153e-05, "loss": 0.3542, "step": 74090 }, { "epoch": 2.670378779687894, "grad_norm": 0.20619447529315948, "learning_rate": 2.347126866007058e-05, "loss": 0.3497, "step": 74095 }, { "epoch": 2.670558979349119, "grad_norm": 0.15153087675571442, "learning_rate": 
2.3468355997738293e-05, "loss": 0.4128, "step": 74100 }, { "epoch": 2.6707391790103436, "grad_norm": 0.26451894640922546, "learning_rate": 2.3465443356274365e-05, "loss": 0.4019, "step": 74105 }, { "epoch": 2.6709193786715684, "grad_norm": 0.24107202887535095, "learning_rate": 2.3462530735718472e-05, "loss": 0.4251, "step": 74110 }, { "epoch": 2.6710995783327927, "grad_norm": 0.2847641110420227, "learning_rate": 2.34596181361103e-05, "loss": 0.4012, "step": 74115 }, { "epoch": 2.6712797779940174, "grad_norm": 0.25060930848121643, "learning_rate": 2.3456705557489543e-05, "loss": 0.4052, "step": 74120 }, { "epoch": 2.671459977655242, "grad_norm": 0.2532309591770172, "learning_rate": 2.3453792999895855e-05, "loss": 0.3964, "step": 74125 }, { "epoch": 2.6716401773164664, "grad_norm": 0.1897568553686142, "learning_rate": 2.3450880463368955e-05, "loss": 0.3838, "step": 74130 }, { "epoch": 2.671820376977691, "grad_norm": 0.2430182695388794, "learning_rate": 2.3447967947948503e-05, "loss": 0.4141, "step": 74135 }, { "epoch": 2.672000576638916, "grad_norm": 0.22214165329933167, "learning_rate": 2.344505545367418e-05, "loss": 0.4151, "step": 74140 }, { "epoch": 2.6721807763001406, "grad_norm": 0.19647006690502167, "learning_rate": 2.344214298058568e-05, "loss": 0.4073, "step": 74145 }, { "epoch": 2.6723609759613653, "grad_norm": 0.18476322293281555, "learning_rate": 2.343923052872268e-05, "loss": 0.3706, "step": 74150 }, { "epoch": 2.67254117562259, "grad_norm": 0.18611617386341095, "learning_rate": 2.3436318098124864e-05, "loss": 0.3995, "step": 74155 }, { "epoch": 2.6727213752838144, "grad_norm": 0.2218163013458252, "learning_rate": 2.3433405688831915e-05, "loss": 0.3928, "step": 74160 }, { "epoch": 2.672901574945039, "grad_norm": 0.16507494449615479, "learning_rate": 2.34304933008835e-05, "loss": 0.3753, "step": 74165 }, { "epoch": 2.673081774606264, "grad_norm": 0.19643135368824005, "learning_rate": 2.3427580934319314e-05, "loss": 0.4089, "step": 74170 }, { "epoch": 
2.673261974267488, "grad_norm": 0.22430889308452606, "learning_rate": 2.3424668589179037e-05, "loss": 0.3765, "step": 74175 }, { "epoch": 2.673442173928713, "grad_norm": 0.2428254932165146, "learning_rate": 2.342175626550234e-05, "loss": 0.3831, "step": 74180 }, { "epoch": 2.6736223735899376, "grad_norm": 0.2370181381702423, "learning_rate": 2.3418843963328912e-05, "loss": 0.3586, "step": 74185 }, { "epoch": 2.6738025732511623, "grad_norm": 0.19188377261161804, "learning_rate": 2.3415931682698427e-05, "loss": 0.3616, "step": 74190 }, { "epoch": 2.673982772912387, "grad_norm": 0.22248739004135132, "learning_rate": 2.341301942365057e-05, "loss": 0.4284, "step": 74195 }, { "epoch": 2.674162972573612, "grad_norm": 0.2631896138191223, "learning_rate": 2.3410107186225015e-05, "loss": 0.4025, "step": 74200 }, { "epoch": 2.674343172234836, "grad_norm": 0.19557088613510132, "learning_rate": 2.3407194970461435e-05, "loss": 0.4003, "step": 74205 }, { "epoch": 2.674523371896061, "grad_norm": 0.20168626308441162, "learning_rate": 2.3404282776399523e-05, "loss": 0.3501, "step": 74210 }, { "epoch": 2.6747035715572856, "grad_norm": 0.23849454522132874, "learning_rate": 2.340137060407894e-05, "loss": 0.3978, "step": 74215 }, { "epoch": 2.67488377121851, "grad_norm": 0.2284935563802719, "learning_rate": 2.3398458453539385e-05, "loss": 0.3623, "step": 74220 }, { "epoch": 2.6750639708797346, "grad_norm": 0.2417210191488266, "learning_rate": 2.3395546324820526e-05, "loss": 0.4028, "step": 74225 }, { "epoch": 2.6752441705409593, "grad_norm": 0.18951061367988586, "learning_rate": 2.3392634217962018e-05, "loss": 0.3889, "step": 74230 }, { "epoch": 2.675424370202184, "grad_norm": 0.24638362228870392, "learning_rate": 2.3389722133003577e-05, "loss": 0.3558, "step": 74235 }, { "epoch": 2.6756045698634088, "grad_norm": 0.18396976590156555, "learning_rate": 2.3386810069984856e-05, "loss": 0.3805, "step": 74240 }, { "epoch": 2.6757847695246335, "grad_norm": 0.20471645891666412, "learning_rate": 
2.3383898028945528e-05, "loss": 0.4248, "step": 74245 }, { "epoch": 2.675964969185858, "grad_norm": 0.1948273777961731, "learning_rate": 2.338098600992528e-05, "loss": 0.3789, "step": 74250 }, { "epoch": 2.6761451688470825, "grad_norm": 0.22328773140907288, "learning_rate": 2.3378074012963783e-05, "loss": 0.4053, "step": 74255 }, { "epoch": 2.6763253685083073, "grad_norm": 0.25258857011795044, "learning_rate": 2.3375162038100716e-05, "loss": 0.4313, "step": 74260 }, { "epoch": 2.676505568169532, "grad_norm": 0.1960865557193756, "learning_rate": 2.337225008537575e-05, "loss": 0.3928, "step": 74265 }, { "epoch": 2.6766857678307563, "grad_norm": 0.190299391746521, "learning_rate": 2.3369338154828564e-05, "loss": 0.3747, "step": 74270 }, { "epoch": 2.676865967491981, "grad_norm": 0.2319035530090332, "learning_rate": 2.336642624649883e-05, "loss": 0.3932, "step": 74275 }, { "epoch": 2.6770461671532058, "grad_norm": 0.17021389305591583, "learning_rate": 2.336351436042622e-05, "loss": 0.362, "step": 74280 }, { "epoch": 2.6772263668144305, "grad_norm": 0.1885906457901001, "learning_rate": 2.3360602496650406e-05, "loss": 0.4038, "step": 74285 }, { "epoch": 2.6774065664756552, "grad_norm": 0.2009781450033188, "learning_rate": 2.3357690655211072e-05, "loss": 0.4274, "step": 74290 }, { "epoch": 2.6775867661368795, "grad_norm": 0.2083226591348648, "learning_rate": 2.335477883614788e-05, "loss": 0.4083, "step": 74295 }, { "epoch": 2.6777669657981042, "grad_norm": 0.18861836194992065, "learning_rate": 2.3351867039500513e-05, "loss": 0.3834, "step": 74300 }, { "epoch": 2.677947165459329, "grad_norm": 0.2695733904838562, "learning_rate": 2.3348955265308642e-05, "loss": 0.4182, "step": 74305 }, { "epoch": 2.6781273651205537, "grad_norm": 0.17389146983623505, "learning_rate": 2.334604351361192e-05, "loss": 0.4131, "step": 74310 }, { "epoch": 2.678307564781778, "grad_norm": 0.18311482667922974, "learning_rate": 2.3343131784450055e-05, "loss": 0.4198, "step": 74315 }, { "epoch": 
2.6784877644430027, "grad_norm": 0.2671164572238922, "learning_rate": 2.334022007786269e-05, "loss": 0.3622, "step": 74320 }, { "epoch": 2.6786679641042275, "grad_norm": 0.2175154834985733, "learning_rate": 2.3337308393889493e-05, "loss": 0.4046, "step": 74325 }, { "epoch": 2.678848163765452, "grad_norm": 0.22409701347351074, "learning_rate": 2.3334396732570167e-05, "loss": 0.3906, "step": 74330 }, { "epoch": 2.679028363426677, "grad_norm": 0.203602135181427, "learning_rate": 2.3331485093944344e-05, "loss": 0.4466, "step": 74335 }, { "epoch": 2.6792085630879012, "grad_norm": 0.19964513182640076, "learning_rate": 2.332857347805173e-05, "loss": 0.4134, "step": 74340 }, { "epoch": 2.679388762749126, "grad_norm": 0.1884617805480957, "learning_rate": 2.3325661884931972e-05, "loss": 0.3872, "step": 74345 }, { "epoch": 2.6795689624103507, "grad_norm": 0.2019912749528885, "learning_rate": 2.3322750314624747e-05, "loss": 0.4068, "step": 74350 }, { "epoch": 2.6797491620715754, "grad_norm": 0.2299565076828003, "learning_rate": 2.3319838767169725e-05, "loss": 0.4014, "step": 74355 }, { "epoch": 2.6799293617327997, "grad_norm": 0.24402816593647003, "learning_rate": 2.3316927242606575e-05, "loss": 0.3874, "step": 74360 }, { "epoch": 2.6801095613940245, "grad_norm": 0.25074100494384766, "learning_rate": 2.331401574097496e-05, "loss": 0.3988, "step": 74365 }, { "epoch": 2.680289761055249, "grad_norm": 0.19405324757099152, "learning_rate": 2.3311104262314563e-05, "loss": 0.3689, "step": 74370 }, { "epoch": 2.680469960716474, "grad_norm": 0.19997498393058777, "learning_rate": 2.3308192806665034e-05, "loss": 0.3886, "step": 74375 }, { "epoch": 2.6806501603776987, "grad_norm": 0.22032709419727325, "learning_rate": 2.3305281374066057e-05, "loss": 0.4064, "step": 74380 }, { "epoch": 2.6808303600389234, "grad_norm": 0.25573626160621643, "learning_rate": 2.3302369964557292e-05, "loss": 0.3774, "step": 74385 }, { "epoch": 2.6810105597001477, "grad_norm": 0.23341487348079681, 
"learning_rate": 2.3299458578178403e-05, "loss": 0.3918, "step": 74390 }, { "epoch": 2.6811907593613724, "grad_norm": 0.19004227221012115, "learning_rate": 2.3296547214969066e-05, "loss": 0.4126, "step": 74395 }, { "epoch": 2.681370959022597, "grad_norm": 0.18891394138336182, "learning_rate": 2.3293635874968954e-05, "loss": 0.3896, "step": 74400 }, { "epoch": 2.6815511586838214, "grad_norm": 0.20582923293113708, "learning_rate": 2.32907245582177e-05, "loss": 0.433, "step": 74405 }, { "epoch": 2.681731358345046, "grad_norm": 0.22365623712539673, "learning_rate": 2.328781326475501e-05, "loss": 0.3804, "step": 74410 }, { "epoch": 2.681911558006271, "grad_norm": 0.2613740563392639, "learning_rate": 2.328490199462052e-05, "loss": 0.3879, "step": 74415 }, { "epoch": 2.6820917576674956, "grad_norm": 0.16647237539291382, "learning_rate": 2.3281990747853925e-05, "loss": 0.3906, "step": 74420 }, { "epoch": 2.6822719573287204, "grad_norm": 0.21178102493286133, "learning_rate": 2.3279079524494864e-05, "loss": 0.4273, "step": 74425 }, { "epoch": 2.682452156989945, "grad_norm": 0.22928111255168915, "learning_rate": 2.327616832458301e-05, "loss": 0.3926, "step": 74430 }, { "epoch": 2.6826323566511694, "grad_norm": 0.29340752959251404, "learning_rate": 2.327325714815803e-05, "loss": 0.3821, "step": 74435 }, { "epoch": 2.682812556312394, "grad_norm": 0.22144730389118195, "learning_rate": 2.3270345995259586e-05, "loss": 0.4207, "step": 74440 }, { "epoch": 2.682992755973619, "grad_norm": 0.2856822907924652, "learning_rate": 2.3267434865927345e-05, "loss": 0.3741, "step": 74445 }, { "epoch": 2.683172955634843, "grad_norm": 0.16166545450687408, "learning_rate": 2.326452376020097e-05, "loss": 0.3838, "step": 74450 }, { "epoch": 2.683353155296068, "grad_norm": 0.21917329728603363, "learning_rate": 2.3261612678120118e-05, "loss": 0.4113, "step": 74455 }, { "epoch": 2.6835333549572926, "grad_norm": 0.25646519660949707, "learning_rate": 2.325870161972446e-05, "loss": 0.4198, "step": 74460 
}, { "epoch": 2.6837135546185173, "grad_norm": 0.18003955483436584, "learning_rate": 2.3255790585053654e-05, "loss": 0.4351, "step": 74465 }, { "epoch": 2.683893754279742, "grad_norm": 0.2252659797668457, "learning_rate": 2.3252879574147363e-05, "loss": 0.3775, "step": 74470 }, { "epoch": 2.684073953940967, "grad_norm": 0.21669524908065796, "learning_rate": 2.3249968587045253e-05, "loss": 0.4371, "step": 74475 }, { "epoch": 2.684254153602191, "grad_norm": 0.2279970496892929, "learning_rate": 2.3247057623786974e-05, "loss": 0.4017, "step": 74480 }, { "epoch": 2.684434353263416, "grad_norm": 0.1842222362756729, "learning_rate": 2.3244146684412205e-05, "loss": 0.3811, "step": 74485 }, { "epoch": 2.6846145529246406, "grad_norm": 0.24382725358009338, "learning_rate": 2.3241235768960595e-05, "loss": 0.423, "step": 74490 }, { "epoch": 2.684794752585865, "grad_norm": 0.24431829154491425, "learning_rate": 2.3238324877471804e-05, "loss": 0.3774, "step": 74495 }, { "epoch": 2.6849749522470896, "grad_norm": 0.22015175223350525, "learning_rate": 2.3235414009985498e-05, "loss": 0.4201, "step": 74500 }, { "epoch": 2.6849749522470896, "eval_loss": 0.4314359724521637, "eval_runtime": 3.5281, "eval_samples_per_second": 28.344, "eval_steps_per_second": 7.086, "step": 74500 }, { "epoch": 2.6851551519083143, "grad_norm": 0.19529901444911957, "learning_rate": 2.323250316654134e-05, "loss": 0.4097, "step": 74505 }, { "epoch": 2.685335351569539, "grad_norm": 0.1773121953010559, "learning_rate": 2.322959234717897e-05, "loss": 0.4016, "step": 74510 }, { "epoch": 2.685515551230764, "grad_norm": 0.24559174478054047, "learning_rate": 2.322668155193808e-05, "loss": 0.3864, "step": 74515 }, { "epoch": 2.6856957508919885, "grad_norm": 0.214651420712471, "learning_rate": 2.322377078085829e-05, "loss": 0.4201, "step": 74520 }, { "epoch": 2.685875950553213, "grad_norm": 0.19618816673755646, "learning_rate": 2.3220860033979296e-05, "loss": 0.4003, "step": 74525 }, { "epoch": 2.6860561502144376, 
"grad_norm": 0.22414274513721466, "learning_rate": 2.3217949311340733e-05, "loss": 0.3972, "step": 74530 }, { "epoch": 2.6862363498756623, "grad_norm": 0.1811375916004181, "learning_rate": 2.3215038612982265e-05, "loss": 0.4129, "step": 74535 }, { "epoch": 2.686416549536887, "grad_norm": 0.21368905901908875, "learning_rate": 2.3212127938943552e-05, "loss": 0.3923, "step": 74540 }, { "epoch": 2.6865967491981113, "grad_norm": 0.2091556340456009, "learning_rate": 2.320921728926425e-05, "loss": 0.3873, "step": 74545 }, { "epoch": 2.686776948859336, "grad_norm": 0.246855229139328, "learning_rate": 2.3206306663984013e-05, "loss": 0.4176, "step": 74550 }, { "epoch": 2.686957148520561, "grad_norm": 0.20908492803573608, "learning_rate": 2.3203396063142503e-05, "loss": 0.3964, "step": 74555 }, { "epoch": 2.6871373481817855, "grad_norm": 0.21159091591835022, "learning_rate": 2.3200485486779367e-05, "loss": 0.3901, "step": 74560 }, { "epoch": 2.6873175478430102, "grad_norm": 0.20965726673603058, "learning_rate": 2.3197574934934274e-05, "loss": 0.3859, "step": 74565 }, { "epoch": 2.6874977475042345, "grad_norm": 0.2037397176027298, "learning_rate": 2.3194664407646876e-05, "loss": 0.3793, "step": 74570 }, { "epoch": 2.6876779471654593, "grad_norm": 0.21732820570468903, "learning_rate": 2.319175390495682e-05, "loss": 0.4485, "step": 74575 }, { "epoch": 2.687858146826684, "grad_norm": 0.19209305942058563, "learning_rate": 2.3188843426903774e-05, "loss": 0.3827, "step": 74580 }, { "epoch": 2.6880383464879087, "grad_norm": 0.2674555480480194, "learning_rate": 2.3185932973527386e-05, "loss": 0.4225, "step": 74585 }, { "epoch": 2.688218546149133, "grad_norm": 0.1785704642534256, "learning_rate": 2.3183022544867298e-05, "loss": 0.4047, "step": 74590 }, { "epoch": 2.6883987458103578, "grad_norm": 0.18283165991306305, "learning_rate": 2.318011214096319e-05, "loss": 0.3662, "step": 74595 }, { "epoch": 2.6885789454715825, "grad_norm": 0.22287532687187195, "learning_rate": 
2.3177201761854686e-05, "loss": 0.4041, "step": 74600 }, { "epoch": 2.6887591451328072, "grad_norm": 0.2034824639558792, "learning_rate": 2.317429140758147e-05, "loss": 0.3986, "step": 74605 }, { "epoch": 2.688939344794032, "grad_norm": 0.23879247903823853, "learning_rate": 2.317138107818318e-05, "loss": 0.3647, "step": 74610 }, { "epoch": 2.6891195444552567, "grad_norm": 0.2760174572467804, "learning_rate": 2.3168470773699452e-05, "loss": 0.3891, "step": 74615 }, { "epoch": 2.689299744116481, "grad_norm": 0.20346973836421967, "learning_rate": 2.3165560494169973e-05, "loss": 0.4036, "step": 74620 }, { "epoch": 2.6894799437777057, "grad_norm": 0.27176687121391296, "learning_rate": 2.3162650239634363e-05, "loss": 0.4122, "step": 74625 }, { "epoch": 2.6896601434389305, "grad_norm": 0.232636496424675, "learning_rate": 2.3159740010132304e-05, "loss": 0.4164, "step": 74630 }, { "epoch": 2.6898403431001547, "grad_norm": 0.1852894425392151, "learning_rate": 2.3156829805703422e-05, "loss": 0.4268, "step": 74635 }, { "epoch": 2.6900205427613795, "grad_norm": 0.1811005026102066, "learning_rate": 2.3153919626387374e-05, "loss": 0.4163, "step": 74640 }, { "epoch": 2.690200742422604, "grad_norm": 0.22391211986541748, "learning_rate": 2.315100947222382e-05, "loss": 0.3755, "step": 74645 }, { "epoch": 2.690380942083829, "grad_norm": 0.19471041858196259, "learning_rate": 2.31480993432524e-05, "loss": 0.3779, "step": 74650 }, { "epoch": 2.6905611417450537, "grad_norm": 0.21801535785198212, "learning_rate": 2.3145189239512765e-05, "loss": 0.3664, "step": 74655 }, { "epoch": 2.6907413414062784, "grad_norm": 0.21162468194961548, "learning_rate": 2.3142279161044575e-05, "loss": 0.3548, "step": 74660 }, { "epoch": 2.6909215410675027, "grad_norm": 0.20754042267799377, "learning_rate": 2.3139369107887467e-05, "loss": 0.3827, "step": 74665 }, { "epoch": 2.6911017407287274, "grad_norm": 0.175797700881958, "learning_rate": 2.3136459080081096e-05, "loss": 0.3674, "step": 74670 }, { "epoch": 
2.691281940389952, "grad_norm": 0.19099979102611542, "learning_rate": 2.3133549077665114e-05, "loss": 0.411, "step": 74675 }, { "epoch": 2.6914621400511765, "grad_norm": 0.20772726833820343, "learning_rate": 2.313063910067915e-05, "loss": 0.3753, "step": 74680 }, { "epoch": 2.691642339712401, "grad_norm": 0.24013733863830566, "learning_rate": 2.312772914916288e-05, "loss": 0.3797, "step": 74685 }, { "epoch": 2.691822539373626, "grad_norm": 0.191719189286232, "learning_rate": 2.312481922315594e-05, "loss": 0.3634, "step": 74690 }, { "epoch": 2.6920027390348507, "grad_norm": 0.1952929049730301, "learning_rate": 2.312190932269796e-05, "loss": 0.374, "step": 74695 }, { "epoch": 2.6921829386960754, "grad_norm": 0.19985808432102203, "learning_rate": 2.3118999447828617e-05, "loss": 0.3761, "step": 74700 }, { "epoch": 2.6923631383573, "grad_norm": 0.22037459909915924, "learning_rate": 2.311608959858753e-05, "loss": 0.3537, "step": 74705 }, { "epoch": 2.6925433380185244, "grad_norm": 0.25286975502967834, "learning_rate": 2.311317977501437e-05, "loss": 0.3598, "step": 74710 }, { "epoch": 2.692723537679749, "grad_norm": 0.21672144532203674, "learning_rate": 2.3110269977148765e-05, "loss": 0.3825, "step": 74715 }, { "epoch": 2.692903737340974, "grad_norm": 0.2222987562417984, "learning_rate": 2.310736020503036e-05, "loss": 0.3665, "step": 74720 }, { "epoch": 2.693083937002198, "grad_norm": 0.1981443464756012, "learning_rate": 2.3104450458698816e-05, "loss": 0.4095, "step": 74725 }, { "epoch": 2.693264136663423, "grad_norm": 0.19497603178024292, "learning_rate": 2.3101540738193762e-05, "loss": 0.4022, "step": 74730 }, { "epoch": 2.6934443363246476, "grad_norm": 0.20269820094108582, "learning_rate": 2.3098631043554845e-05, "loss": 0.3861, "step": 74735 }, { "epoch": 2.6936245359858724, "grad_norm": 0.20779010653495789, "learning_rate": 2.3095721374821716e-05, "loss": 0.3832, "step": 74740 }, { "epoch": 2.693804735647097, "grad_norm": 0.18731550872325897, "learning_rate": 
2.309281173203401e-05, "loss": 0.3904, "step": 74745 }, { "epoch": 2.693984935308322, "grad_norm": 0.2547980844974518, "learning_rate": 2.3089902115231378e-05, "loss": 0.3837, "step": 74750 }, { "epoch": 2.694165134969546, "grad_norm": 0.20064884424209595, "learning_rate": 2.3086992524453462e-05, "loss": 0.3908, "step": 74755 }, { "epoch": 2.694345334630771, "grad_norm": 0.1967686116695404, "learning_rate": 2.30840829597399e-05, "loss": 0.423, "step": 74760 }, { "epoch": 2.6945255342919956, "grad_norm": 0.21911115944385529, "learning_rate": 2.3081173421130336e-05, "loss": 0.4434, "step": 74765 }, { "epoch": 2.6947057339532203, "grad_norm": 0.20522983372211456, "learning_rate": 2.307826390866442e-05, "loss": 0.403, "step": 74770 }, { "epoch": 2.6948859336144446, "grad_norm": 0.19616533815860748, "learning_rate": 2.307535442238177e-05, "loss": 0.3686, "step": 74775 }, { "epoch": 2.6950661332756694, "grad_norm": 0.2152736634016037, "learning_rate": 2.3072444962322056e-05, "loss": 0.4022, "step": 74780 }, { "epoch": 2.695246332936894, "grad_norm": 0.17979800701141357, "learning_rate": 2.3069535528524902e-05, "loss": 0.3845, "step": 74785 }, { "epoch": 2.695426532598119, "grad_norm": 0.2075699120759964, "learning_rate": 2.3066626121029954e-05, "loss": 0.4106, "step": 74790 }, { "epoch": 2.6956067322593436, "grad_norm": 0.1615006923675537, "learning_rate": 2.306371673987686e-05, "loss": 0.3906, "step": 74795 }, { "epoch": 2.695786931920568, "grad_norm": 0.2201845496892929, "learning_rate": 2.306080738510523e-05, "loss": 0.3822, "step": 74800 }, { "epoch": 2.6959671315817926, "grad_norm": 0.17273396253585815, "learning_rate": 2.3057898056754744e-05, "loss": 0.401, "step": 74805 }, { "epoch": 2.6961473312430173, "grad_norm": 0.1817549616098404, "learning_rate": 2.3054988754865015e-05, "loss": 0.3613, "step": 74810 }, { "epoch": 2.696327530904242, "grad_norm": 0.18268698453903198, "learning_rate": 2.3052079479475683e-05, "loss": 0.4106, "step": 74815 }, { "epoch": 
2.6965077305654663, "grad_norm": 0.22823894023895264, "learning_rate": 2.3049170230626395e-05, "loss": 0.3843, "step": 74820 }, { "epoch": 2.696687930226691, "grad_norm": 0.22048458456993103, "learning_rate": 2.304626100835678e-05, "loss": 0.4166, "step": 74825 }, { "epoch": 2.696868129887916, "grad_norm": 0.22666330635547638, "learning_rate": 2.3043351812706486e-05, "loss": 0.3873, "step": 74830 }, { "epoch": 2.6970483295491405, "grad_norm": 0.22461573779582977, "learning_rate": 2.3040442643715142e-05, "loss": 0.4052, "step": 74835 }, { "epoch": 2.6972285292103653, "grad_norm": 0.2268482744693756, "learning_rate": 2.3037533501422384e-05, "loss": 0.4011, "step": 74840 }, { "epoch": 2.6974087288715896, "grad_norm": 0.2219397872686386, "learning_rate": 2.303462438586786e-05, "loss": 0.387, "step": 74845 }, { "epoch": 2.6975889285328143, "grad_norm": 0.21480488777160645, "learning_rate": 2.3031715297091188e-05, "loss": 0.4225, "step": 74850 }, { "epoch": 2.697769128194039, "grad_norm": 0.1800355166196823, "learning_rate": 2.302880623513202e-05, "loss": 0.3738, "step": 74855 }, { "epoch": 2.6979493278552638, "grad_norm": 0.2048688679933548, "learning_rate": 2.302589720002999e-05, "loss": 0.3953, "step": 74860 }, { "epoch": 2.698129527516488, "grad_norm": 0.19619779288768768, "learning_rate": 2.302298819182472e-05, "loss": 0.3967, "step": 74865 }, { "epoch": 2.698309727177713, "grad_norm": 0.22370363771915436, "learning_rate": 2.302007921055586e-05, "loss": 0.4012, "step": 74870 }, { "epoch": 2.6984899268389375, "grad_norm": 0.2369144707918167, "learning_rate": 2.301717025626304e-05, "loss": 0.3758, "step": 74875 }, { "epoch": 2.6986701265001622, "grad_norm": 0.18146106600761414, "learning_rate": 2.301426132898588e-05, "loss": 0.4064, "step": 74880 }, { "epoch": 2.698850326161387, "grad_norm": 0.2013833373785019, "learning_rate": 2.301135242876404e-05, "loss": 0.3753, "step": 74885 }, { "epoch": 2.6990305258226117, "grad_norm": 0.18798790872097015, "learning_rate": 
2.3008443555637116e-05, "loss": 0.4047, "step": 74890 }, { "epoch": 2.699210725483836, "grad_norm": 0.23676003515720367, "learning_rate": 2.3005534709644784e-05, "loss": 0.3704, "step": 74895 }, { "epoch": 2.6993909251450607, "grad_norm": 0.20541468262672424, "learning_rate": 2.300262589082665e-05, "loss": 0.3796, "step": 74900 }, { "epoch": 2.6995711248062855, "grad_norm": 0.2497328221797943, "learning_rate": 2.299971709922234e-05, "loss": 0.3954, "step": 74905 }, { "epoch": 2.6997513244675098, "grad_norm": 0.2306727170944214, "learning_rate": 2.2996808334871513e-05, "loss": 0.4071, "step": 74910 }, { "epoch": 2.6999315241287345, "grad_norm": 0.2239367663860321, "learning_rate": 2.2993899597813778e-05, "loss": 0.3672, "step": 74915 }, { "epoch": 2.7001117237899592, "grad_norm": 0.19781452417373657, "learning_rate": 2.299099088808877e-05, "loss": 0.3691, "step": 74920 }, { "epoch": 2.700291923451184, "grad_norm": 0.21767933666706085, "learning_rate": 2.298808220573613e-05, "loss": 0.4065, "step": 74925 }, { "epoch": 2.7004721231124087, "grad_norm": 0.18852004408836365, "learning_rate": 2.298517355079547e-05, "loss": 0.3676, "step": 74930 }, { "epoch": 2.7006523227736334, "grad_norm": 0.21407105028629303, "learning_rate": 2.2982264923306435e-05, "loss": 0.4, "step": 74935 }, { "epoch": 2.7008325224348577, "grad_norm": 0.1654965728521347, "learning_rate": 2.2979356323308653e-05, "loss": 0.4009, "step": 74940 }, { "epoch": 2.7010127220960825, "grad_norm": 0.1940935105085373, "learning_rate": 2.2976447750841742e-05, "loss": 0.379, "step": 74945 }, { "epoch": 2.701192921757307, "grad_norm": 0.21042941510677338, "learning_rate": 2.2973539205945347e-05, "loss": 0.437, "step": 74950 }, { "epoch": 2.7013731214185315, "grad_norm": 0.26631516218185425, "learning_rate": 2.2970630688659086e-05, "loss": 0.4402, "step": 74955 }, { "epoch": 2.701553321079756, "grad_norm": 0.23533588647842407, "learning_rate": 2.2967722199022585e-05, "loss": 0.3937, "step": 74960 }, { "epoch": 
2.701733520740981, "grad_norm": 0.22266995906829834, "learning_rate": 2.296481373707548e-05, "loss": 0.3798, "step": 74965 }, { "epoch": 2.7019137204022057, "grad_norm": 0.22777952253818512, "learning_rate": 2.2961905302857387e-05, "loss": 0.4028, "step": 74970 }, { "epoch": 2.7020939200634304, "grad_norm": 0.2449411302804947, "learning_rate": 2.295899689640795e-05, "loss": 0.3865, "step": 74975 }, { "epoch": 2.702274119724655, "grad_norm": 0.188294917345047, "learning_rate": 2.2956088517766784e-05, "loss": 0.4135, "step": 74980 }, { "epoch": 2.7024543193858794, "grad_norm": 0.19857686758041382, "learning_rate": 2.2953180166973505e-05, "loss": 0.3972, "step": 74985 }, { "epoch": 2.702634519047104, "grad_norm": 0.2035861313343048, "learning_rate": 2.295027184406776e-05, "loss": 0.3869, "step": 74990 }, { "epoch": 2.702814718708329, "grad_norm": 0.20147527754306793, "learning_rate": 2.2947363549089164e-05, "loss": 0.3817, "step": 74995 }, { "epoch": 2.702994918369553, "grad_norm": 0.18223348259925842, "learning_rate": 2.2944455282077337e-05, "loss": 0.4237, "step": 75000 }, { "epoch": 2.702994918369553, "eval_loss": 0.43115875124931335, "eval_runtime": 3.5382, "eval_samples_per_second": 28.263, "eval_steps_per_second": 7.066, "step": 75000 }, { "epoch": 2.703175118030778, "grad_norm": 0.1930094212293625, "learning_rate": 2.2941547043071916e-05, "loss": 0.3965, "step": 75005 }, { "epoch": 2.7033553176920027, "grad_norm": 0.1886611133813858, "learning_rate": 2.2938638832112507e-05, "loss": 0.4005, "step": 75010 }, { "epoch": 2.7035355173532274, "grad_norm": 0.2501041293144226, "learning_rate": 2.2935730649238753e-05, "loss": 0.4165, "step": 75015 }, { "epoch": 2.703715717014452, "grad_norm": 0.2163308709859848, "learning_rate": 2.293282249449027e-05, "loss": 0.4063, "step": 75020 }, { "epoch": 2.703895916675677, "grad_norm": 0.20869512856006622, "learning_rate": 2.2929914367906674e-05, "loss": 0.4237, "step": 75025 }, { "epoch": 2.704076116336901, "grad_norm": 
0.22830967605113983, "learning_rate": 2.2927006269527597e-05, "loss": 0.3773, "step": 75030 }, { "epoch": 2.704256315998126, "grad_norm": 0.2236100733280182, "learning_rate": 2.2924098199392658e-05, "loss": 0.4216, "step": 75035 }, { "epoch": 2.7044365156593506, "grad_norm": 0.20641185343265533, "learning_rate": 2.292119015754148e-05, "loss": 0.3819, "step": 75040 }, { "epoch": 2.7046167153205753, "grad_norm": 0.2077999860048294, "learning_rate": 2.2918282144013685e-05, "loss": 0.4211, "step": 75045 }, { "epoch": 2.7047969149817996, "grad_norm": 0.21933238208293915, "learning_rate": 2.291537415884889e-05, "loss": 0.3865, "step": 75050 }, { "epoch": 2.7049771146430244, "grad_norm": 0.1914396733045578, "learning_rate": 2.291246620208672e-05, "loss": 0.3647, "step": 75055 }, { "epoch": 2.705157314304249, "grad_norm": 0.24134762585163116, "learning_rate": 2.2909558273766802e-05, "loss": 0.3848, "step": 75060 }, { "epoch": 2.705337513965474, "grad_norm": 0.22134195268154144, "learning_rate": 2.290665037392873e-05, "loss": 0.3949, "step": 75065 }, { "epoch": 2.7055177136266986, "grad_norm": 0.17406292259693146, "learning_rate": 2.2903742502612153e-05, "loss": 0.3943, "step": 75070 }, { "epoch": 2.705697913287923, "grad_norm": 0.20366698503494263, "learning_rate": 2.2900834659856673e-05, "loss": 0.3911, "step": 75075 }, { "epoch": 2.7058781129491476, "grad_norm": 0.1711958944797516, "learning_rate": 2.289792684570192e-05, "loss": 0.3821, "step": 75080 }, { "epoch": 2.7060583126103723, "grad_norm": 0.21541287004947662, "learning_rate": 2.289501906018751e-05, "loss": 0.4022, "step": 75085 }, { "epoch": 2.706238512271597, "grad_norm": 0.2087329775094986, "learning_rate": 2.2892111303353046e-05, "loss": 0.3882, "step": 75090 }, { "epoch": 2.7064187119328214, "grad_norm": 0.2614741325378418, "learning_rate": 2.288920357523817e-05, "loss": 0.3778, "step": 75095 }, { "epoch": 2.706598911594046, "grad_norm": 0.286384642124176, "learning_rate": 2.2886295875882484e-05, "loss": 
0.4149, "step": 75100 }, { "epoch": 2.706779111255271, "grad_norm": 0.22412830591201782, "learning_rate": 2.28833882053256e-05, "loss": 0.4088, "step": 75105 }, { "epoch": 2.7069593109164956, "grad_norm": 0.1880389153957367, "learning_rate": 2.2880480563607145e-05, "loss": 0.4153, "step": 75110 }, { "epoch": 2.7071395105777203, "grad_norm": 0.19842107594013214, "learning_rate": 2.287757295076673e-05, "loss": 0.3912, "step": 75115 }, { "epoch": 2.707319710238945, "grad_norm": 0.20793603360652924, "learning_rate": 2.2874665366843977e-05, "loss": 0.4447, "step": 75120 }, { "epoch": 2.7074999099001693, "grad_norm": 0.18394573032855988, "learning_rate": 2.2871757811878497e-05, "loss": 0.3807, "step": 75125 }, { "epoch": 2.707680109561394, "grad_norm": 0.2318859100341797, "learning_rate": 2.2868850285909897e-05, "loss": 0.407, "step": 75130 }, { "epoch": 2.7078603092226188, "grad_norm": 0.19001048803329468, "learning_rate": 2.286594278897781e-05, "loss": 0.3766, "step": 75135 }, { "epoch": 2.708040508883843, "grad_norm": 0.2057991772890091, "learning_rate": 2.2863035321121836e-05, "loss": 0.3665, "step": 75140 }, { "epoch": 2.708220708545068, "grad_norm": 0.23362144827842712, "learning_rate": 2.286012788238159e-05, "loss": 0.3889, "step": 75145 }, { "epoch": 2.7084009082062925, "grad_norm": 0.1938500553369522, "learning_rate": 2.2857220472796688e-05, "loss": 0.4234, "step": 75150 }, { "epoch": 2.7085811078675173, "grad_norm": 0.17589519917964935, "learning_rate": 2.285431309240674e-05, "loss": 0.3794, "step": 75155 }, { "epoch": 2.708761307528742, "grad_norm": 0.2224626988172531, "learning_rate": 2.285140574125136e-05, "loss": 0.3915, "step": 75160 }, { "epoch": 2.7089415071899667, "grad_norm": 0.17219646275043488, "learning_rate": 2.2848498419370174e-05, "loss": 0.4047, "step": 75165 }, { "epoch": 2.709121706851191, "grad_norm": 0.26903530955314636, "learning_rate": 2.284559112680276e-05, "loss": 0.3869, "step": 75170 }, { "epoch": 2.7093019065124158, "grad_norm": 
0.18370231986045837, "learning_rate": 2.2842683863588766e-05, "loss": 0.3703, "step": 75175 }, { "epoch": 2.7094821061736405, "grad_norm": 0.20962488651275635, "learning_rate": 2.2839776629767785e-05, "loss": 0.3904, "step": 75180 }, { "epoch": 2.709662305834865, "grad_norm": 0.2684512734413147, "learning_rate": 2.283686942537942e-05, "loss": 0.3828, "step": 75185 }, { "epoch": 2.7098425054960895, "grad_norm": 0.20460106432437897, "learning_rate": 2.2833962250463293e-05, "loss": 0.4142, "step": 75190 }, { "epoch": 2.7100227051573142, "grad_norm": 0.21397659182548523, "learning_rate": 2.2831055105059007e-05, "loss": 0.3773, "step": 75195 }, { "epoch": 2.710202904818539, "grad_norm": 0.20380306243896484, "learning_rate": 2.282814798920619e-05, "loss": 0.3858, "step": 75200 }, { "epoch": 2.7103831044797637, "grad_norm": 0.2668623626232147, "learning_rate": 2.282524090294443e-05, "loss": 0.3665, "step": 75205 }, { "epoch": 2.7105633041409884, "grad_norm": 0.1861308068037033, "learning_rate": 2.2822333846313332e-05, "loss": 0.3891, "step": 75210 }, { "epoch": 2.7107435038022127, "grad_norm": 0.25733551383018494, "learning_rate": 2.2819426819352525e-05, "loss": 0.3773, "step": 75215 }, { "epoch": 2.7109237034634375, "grad_norm": 0.19054418802261353, "learning_rate": 2.2816519822101596e-05, "loss": 0.3619, "step": 75220 }, { "epoch": 2.711103903124662, "grad_norm": 0.2656010687351227, "learning_rate": 2.281361285460017e-05, "loss": 0.3848, "step": 75225 }, { "epoch": 2.7112841027858865, "grad_norm": 0.21391278505325317, "learning_rate": 2.281070591688784e-05, "loss": 0.3942, "step": 75230 }, { "epoch": 2.7114643024471112, "grad_norm": 0.20552653074264526, "learning_rate": 2.280779900900422e-05, "loss": 0.3606, "step": 75235 }, { "epoch": 2.711644502108336, "grad_norm": 0.2136029154062271, "learning_rate": 2.2804892130988916e-05, "loss": 0.4099, "step": 75240 }, { "epoch": 2.7118247017695607, "grad_norm": 0.20137493312358856, "learning_rate": 2.2801985282881532e-05, 
"loss": 0.3833, "step": 75245 }, { "epoch": 2.7120049014307854, "grad_norm": 0.17652560770511627, "learning_rate": 2.279907846472167e-05, "loss": 0.3546, "step": 75250 }, { "epoch": 2.71218510109201, "grad_norm": 0.2107645869255066, "learning_rate": 2.279617167654894e-05, "loss": 0.3693, "step": 75255 }, { "epoch": 2.7123653007532345, "grad_norm": 0.17879261076450348, "learning_rate": 2.279326491840294e-05, "loss": 0.4006, "step": 75260 }, { "epoch": 2.712545500414459, "grad_norm": 0.1885097324848175, "learning_rate": 2.279035819032328e-05, "loss": 0.4, "step": 75265 }, { "epoch": 2.712725700075684, "grad_norm": 0.19376368820667267, "learning_rate": 2.2787451492349574e-05, "loss": 0.3943, "step": 75270 }, { "epoch": 2.7129058997369087, "grad_norm": 0.22324879467487335, "learning_rate": 2.2784544824521392e-05, "loss": 0.4092, "step": 75275 }, { "epoch": 2.713086099398133, "grad_norm": 0.18715398013591766, "learning_rate": 2.2781638186878375e-05, "loss": 0.4188, "step": 75280 }, { "epoch": 2.7132662990593577, "grad_norm": 0.24565525352954865, "learning_rate": 2.2778731579460105e-05, "loss": 0.4029, "step": 75285 }, { "epoch": 2.7134464987205824, "grad_norm": 0.28024908900260925, "learning_rate": 2.277582500230618e-05, "loss": 0.3778, "step": 75290 }, { "epoch": 2.713626698381807, "grad_norm": 0.1823398321866989, "learning_rate": 2.2772918455456215e-05, "loss": 0.3794, "step": 75295 }, { "epoch": 2.713806898043032, "grad_norm": 0.22716470062732697, "learning_rate": 2.27700119389498e-05, "loss": 0.3845, "step": 75300 }, { "epoch": 2.713987097704256, "grad_norm": 0.21729101240634918, "learning_rate": 2.2767105452826542e-05, "loss": 0.3898, "step": 75305 }, { "epoch": 2.714167297365481, "grad_norm": 0.22123655676841736, "learning_rate": 2.2764198997126043e-05, "loss": 0.3791, "step": 75310 }, { "epoch": 2.7143474970267056, "grad_norm": 0.222514346241951, "learning_rate": 2.2761292571887894e-05, "loss": 0.3756, "step": 75315 }, { "epoch": 2.7145276966879304, "grad_norm": 
0.23015157878398895, "learning_rate": 2.2758386177151707e-05, "loss": 0.3883, "step": 75320 }, { "epoch": 2.7147078963491547, "grad_norm": 0.23218706250190735, "learning_rate": 2.2755479812957074e-05, "loss": 0.3997, "step": 75325 }, { "epoch": 2.7148880960103794, "grad_norm": 0.22764332592487335, "learning_rate": 2.2752573479343588e-05, "loss": 0.3802, "step": 75330 }, { "epoch": 2.715068295671604, "grad_norm": 0.21155443787574768, "learning_rate": 2.2749667176350857e-05, "loss": 0.4231, "step": 75335 }, { "epoch": 2.715248495332829, "grad_norm": 0.19925571978092194, "learning_rate": 2.2746760904018472e-05, "loss": 0.3896, "step": 75340 }, { "epoch": 2.7154286949940536, "grad_norm": 0.22160978615283966, "learning_rate": 2.2743854662386035e-05, "loss": 0.3815, "step": 75345 }, { "epoch": 2.715608894655278, "grad_norm": 0.22966960072517395, "learning_rate": 2.2740948451493148e-05, "loss": 0.3712, "step": 75350 }, { "epoch": 2.7157890943165026, "grad_norm": 0.1914483606815338, "learning_rate": 2.2738042271379388e-05, "loss": 0.3998, "step": 75355 }, { "epoch": 2.7159692939777274, "grad_norm": 0.22551338374614716, "learning_rate": 2.2735136122084374e-05, "loss": 0.3848, "step": 75360 }, { "epoch": 2.716149493638952, "grad_norm": 0.2782670259475708, "learning_rate": 2.2732230003647698e-05, "loss": 0.4082, "step": 75365 }, { "epoch": 2.7163296933001764, "grad_norm": 0.21808646619319916, "learning_rate": 2.272932391610893e-05, "loss": 0.3989, "step": 75370 }, { "epoch": 2.716509892961401, "grad_norm": 0.22317920625209808, "learning_rate": 2.2726417859507703e-05, "loss": 0.4206, "step": 75375 }, { "epoch": 2.716690092622626, "grad_norm": 0.19897177815437317, "learning_rate": 2.2723511833883574e-05, "loss": 0.3832, "step": 75380 }, { "epoch": 2.7168702922838506, "grad_norm": 0.20371364057064056, "learning_rate": 2.2720605839276173e-05, "loss": 0.4227, "step": 75385 }, { "epoch": 2.7170504919450753, "grad_norm": 0.2331681102514267, "learning_rate": 2.2717699875725072e-05, 
"loss": 0.3592, "step": 75390 }, { "epoch": 2.7172306916063, "grad_norm": 0.2340395599603653, "learning_rate": 2.271479394326986e-05, "loss": 0.4067, "step": 75395 }, { "epoch": 2.7174108912675243, "grad_norm": 0.2065713256597519, "learning_rate": 2.2711888041950143e-05, "loss": 0.3794, "step": 75400 }, { "epoch": 2.717591090928749, "grad_norm": 0.19182537496089935, "learning_rate": 2.2708982171805512e-05, "loss": 0.388, "step": 75405 }, { "epoch": 2.717771290589974, "grad_norm": 0.2015332728624344, "learning_rate": 2.2706076332875546e-05, "loss": 0.3948, "step": 75410 }, { "epoch": 2.717951490251198, "grad_norm": 0.2081226408481598, "learning_rate": 2.2703170525199856e-05, "loss": 0.3549, "step": 75415 }, { "epoch": 2.718131689912423, "grad_norm": 0.23630069196224213, "learning_rate": 2.2700264748818015e-05, "loss": 0.4129, "step": 75420 }, { "epoch": 2.7183118895736476, "grad_norm": 0.20923583209514618, "learning_rate": 2.2697359003769627e-05, "loss": 0.4091, "step": 75425 }, { "epoch": 2.7184920892348723, "grad_norm": 0.2418445646762848, "learning_rate": 2.2694453290094276e-05, "loss": 0.4168, "step": 75430 }, { "epoch": 2.718672288896097, "grad_norm": 0.20278342068195343, "learning_rate": 2.2691547607831547e-05, "loss": 0.4111, "step": 75435 }, { "epoch": 2.7188524885573218, "grad_norm": 0.2284698486328125, "learning_rate": 2.2688641957021043e-05, "loss": 0.3705, "step": 75440 }, { "epoch": 2.719032688218546, "grad_norm": 0.21057669818401337, "learning_rate": 2.2685736337702336e-05, "loss": 0.3889, "step": 75445 }, { "epoch": 2.719212887879771, "grad_norm": 0.23061637580394745, "learning_rate": 2.268283074991503e-05, "loss": 0.412, "step": 75450 }, { "epoch": 2.7193930875409955, "grad_norm": 0.1986396461725235, "learning_rate": 2.2679925193698713e-05, "loss": 0.3599, "step": 75455 }, { "epoch": 2.71957328720222, "grad_norm": 0.20687183737754822, "learning_rate": 2.267701966909295e-05, "loss": 0.3999, "step": 75460 }, { "epoch": 2.7197534868634445, "grad_norm": 
0.22581927478313446, "learning_rate": 2.267411417613736e-05, "loss": 0.3888, "step": 75465 }, { "epoch": 2.7199336865246693, "grad_norm": 0.1779545694589615, "learning_rate": 2.2671208714871507e-05, "loss": 0.3675, "step": 75470 }, { "epoch": 2.720113886185894, "grad_norm": 0.19196881353855133, "learning_rate": 2.2668303285334974e-05, "loss": 0.3927, "step": 75475 }, { "epoch": 2.7202940858471187, "grad_norm": 0.21891996264457703, "learning_rate": 2.2665397887567374e-05, "loss": 0.3826, "step": 75480 }, { "epoch": 2.7204742855083435, "grad_norm": 0.2428896129131317, "learning_rate": 2.2662492521608263e-05, "loss": 0.3847, "step": 75485 }, { "epoch": 2.7206544851695678, "grad_norm": 0.21766485273838043, "learning_rate": 2.2659587187497248e-05, "loss": 0.408, "step": 75490 }, { "epoch": 2.7208346848307925, "grad_norm": 0.26046082377433777, "learning_rate": 2.2656681885273907e-05, "loss": 0.3949, "step": 75495 }, { "epoch": 2.7210148844920172, "grad_norm": 0.2007281333208084, "learning_rate": 2.2653776614977813e-05, "loss": 0.3955, "step": 75500 }, { "epoch": 2.7210148844920172, "eval_loss": 0.4320552349090576, "eval_runtime": 3.5362, "eval_samples_per_second": 28.279, "eval_steps_per_second": 7.07, "step": 75500 }, { "epoch": 2.7211950841532415, "grad_norm": 0.29919010400772095, "learning_rate": 2.265087137664856e-05, "loss": 0.3914, "step": 75505 }, { "epoch": 2.7213752838144663, "grad_norm": 0.24532043933868408, "learning_rate": 2.2647966170325733e-05, "loss": 0.4182, "step": 75510 }, { "epoch": 2.721555483475691, "grad_norm": 0.2970276176929474, "learning_rate": 2.2645060996048904e-05, "loss": 0.3703, "step": 75515 }, { "epoch": 2.7217356831369157, "grad_norm": 0.19525529444217682, "learning_rate": 2.2642155853857673e-05, "loss": 0.4027, "step": 75520 }, { "epoch": 2.7219158827981405, "grad_norm": 0.21934610605239868, "learning_rate": 2.26392507437916e-05, "loss": 0.4019, "step": 75525 }, { "epoch": 2.722096082459365, "grad_norm": 0.20460151135921478, 
"learning_rate": 2.263634566589029e-05, "loss": 0.3934, "step": 75530 }, { "epoch": 2.7222762821205895, "grad_norm": 0.18550460040569305, "learning_rate": 2.263344062019331e-05, "loss": 0.4085, "step": 75535 }, { "epoch": 2.722456481781814, "grad_norm": 0.23762153089046478, "learning_rate": 2.263053560674024e-05, "loss": 0.4393, "step": 75540 }, { "epoch": 2.722636681443039, "grad_norm": 0.20546315610408783, "learning_rate": 2.2627630625570666e-05, "loss": 0.3945, "step": 75545 }, { "epoch": 2.7228168811042637, "grad_norm": 0.22104191780090332, "learning_rate": 2.2624725676724175e-05, "loss": 0.3628, "step": 75550 }, { "epoch": 2.722997080765488, "grad_norm": 0.21103298664093018, "learning_rate": 2.2621820760240316e-05, "loss": 0.3632, "step": 75555 }, { "epoch": 2.7231772804267127, "grad_norm": 0.20641304552555084, "learning_rate": 2.261891587615871e-05, "loss": 0.4007, "step": 75560 }, { "epoch": 2.7233574800879374, "grad_norm": 0.19765135645866394, "learning_rate": 2.261601102451889e-05, "loss": 0.3494, "step": 75565 }, { "epoch": 2.723537679749162, "grad_norm": 0.22192902863025665, "learning_rate": 2.261310620536048e-05, "loss": 0.3931, "step": 75570 }, { "epoch": 2.723717879410387, "grad_norm": 0.23835474252700806, "learning_rate": 2.261020141872303e-05, "loss": 0.3943, "step": 75575 }, { "epoch": 2.723898079071611, "grad_norm": 0.19383686780929565, "learning_rate": 2.260729666464612e-05, "loss": 0.3632, "step": 75580 }, { "epoch": 2.724078278732836, "grad_norm": 0.18293000757694244, "learning_rate": 2.260439194316933e-05, "loss": 0.3706, "step": 75585 }, { "epoch": 2.7242584783940607, "grad_norm": 0.27218204736709595, "learning_rate": 2.260148725433224e-05, "loss": 0.4146, "step": 75590 }, { "epoch": 2.7244386780552854, "grad_norm": 0.22335471212863922, "learning_rate": 2.259858259817441e-05, "loss": 0.3762, "step": 75595 }, { "epoch": 2.7246188777165097, "grad_norm": 0.19925439357757568, "learning_rate": 2.259567797473544e-05, "loss": 0.3917, "step": 75600 
}, { "epoch": 2.7247990773777344, "grad_norm": 0.18860496580600739, "learning_rate": 2.2592773384054883e-05, "loss": 0.3926, "step": 75605 }, { "epoch": 2.724979277038959, "grad_norm": 0.23276196420192719, "learning_rate": 2.258986882617233e-05, "loss": 0.4083, "step": 75610 }, { "epoch": 2.725159476700184, "grad_norm": 0.21294569969177246, "learning_rate": 2.258696430112735e-05, "loss": 0.392, "step": 75615 }, { "epoch": 2.7253396763614086, "grad_norm": 0.21455052495002747, "learning_rate": 2.25840598089595e-05, "loss": 0.4136, "step": 75620 }, { "epoch": 2.7255198760226333, "grad_norm": 0.2718030512332916, "learning_rate": 2.2581736238923366e-05, "loss": 0.4269, "step": 75625 }, { "epoch": 2.7257000756838576, "grad_norm": 0.21406759321689606, "learning_rate": 2.2578831806034116e-05, "loss": 0.3959, "step": 75630 }, { "epoch": 2.7258802753450824, "grad_norm": 0.21770311892032623, "learning_rate": 2.2575927406132795e-05, "loss": 0.3844, "step": 75635 }, { "epoch": 2.726060475006307, "grad_norm": 0.22125771641731262, "learning_rate": 2.2573023039259013e-05, "loss": 0.3554, "step": 75640 }, { "epoch": 2.7262406746675314, "grad_norm": 0.2546500861644745, "learning_rate": 2.2570118705452317e-05, "loss": 0.4081, "step": 75645 }, { "epoch": 2.726420874328756, "grad_norm": 0.19551582634449005, "learning_rate": 2.2567214404752273e-05, "loss": 0.4039, "step": 75650 }, { "epoch": 2.726601073989981, "grad_norm": 0.21097378432750702, "learning_rate": 2.2564310137198474e-05, "loss": 0.4068, "step": 75655 }, { "epoch": 2.7267812736512056, "grad_norm": 0.20981697738170624, "learning_rate": 2.2561405902830464e-05, "loss": 0.3682, "step": 75660 }, { "epoch": 2.7269614733124303, "grad_norm": 0.22551362216472626, "learning_rate": 2.2558501701687847e-05, "loss": 0.3908, "step": 75665 }, { "epoch": 2.727141672973655, "grad_norm": 0.23293738067150116, "learning_rate": 2.255559753381016e-05, "loss": 0.3983, "step": 75670 }, { "epoch": 2.7273218726348794, "grad_norm": 0.19451351463794708, 
"learning_rate": 2.2552693399236978e-05, "loss": 0.3788, "step": 75675 }, { "epoch": 2.727502072296104, "grad_norm": 0.22846585512161255, "learning_rate": 2.2549789298007884e-05, "loss": 0.3828, "step": 75680 }, { "epoch": 2.727682271957329, "grad_norm": 0.20074519515037537, "learning_rate": 2.2546885230162435e-05, "loss": 0.3768, "step": 75685 }, { "epoch": 2.727862471618553, "grad_norm": 0.23105072975158691, "learning_rate": 2.2543981195740194e-05, "loss": 0.4379, "step": 75690 }, { "epoch": 2.728042671279778, "grad_norm": 0.20044337213039398, "learning_rate": 2.254107719478074e-05, "loss": 0.3861, "step": 75695 }, { "epoch": 2.7282228709410026, "grad_norm": 0.1897222101688385, "learning_rate": 2.2538173227323626e-05, "loss": 0.3739, "step": 75700 }, { "epoch": 2.7284030706022273, "grad_norm": 0.19628849625587463, "learning_rate": 2.253526929340843e-05, "loss": 0.3837, "step": 75705 }, { "epoch": 2.728583270263452, "grad_norm": 0.21223433315753937, "learning_rate": 2.2532365393074715e-05, "loss": 0.4052, "step": 75710 }, { "epoch": 2.7287634699246768, "grad_norm": 0.19551092386245728, "learning_rate": 2.2529461526362037e-05, "loss": 0.4148, "step": 75715 }, { "epoch": 2.728943669585901, "grad_norm": 0.1927630603313446, "learning_rate": 2.2526557693309974e-05, "loss": 0.39, "step": 75720 }, { "epoch": 2.729123869247126, "grad_norm": 0.21203365921974182, "learning_rate": 2.252365389395809e-05, "loss": 0.4016, "step": 75725 }, { "epoch": 2.7293040689083505, "grad_norm": 0.18629692494869232, "learning_rate": 2.252075012834592e-05, "loss": 0.4215, "step": 75730 }, { "epoch": 2.729484268569575, "grad_norm": 0.24014006555080414, "learning_rate": 2.251784639651307e-05, "loss": 0.4002, "step": 75735 }, { "epoch": 2.7296644682307996, "grad_norm": 0.17442671954631805, "learning_rate": 2.2514942698499067e-05, "loss": 0.4086, "step": 75740 }, { "epoch": 2.7298446678920243, "grad_norm": 0.23989979922771454, "learning_rate": 2.2512039034343504e-05, "loss": 0.3988, "step": 75745 
}, { "epoch": 2.730024867553249, "grad_norm": 0.23518751561641693, "learning_rate": 2.250913540408592e-05, "loss": 0.3873, "step": 75750 }, { "epoch": 2.7302050672144738, "grad_norm": 0.23601964116096497, "learning_rate": 2.2506231807765882e-05, "loss": 0.3869, "step": 75755 }, { "epoch": 2.7303852668756985, "grad_norm": 0.269535630941391, "learning_rate": 2.2503328245422957e-05, "loss": 0.3967, "step": 75760 }, { "epoch": 2.730565466536923, "grad_norm": 0.20272400975227356, "learning_rate": 2.2500424717096702e-05, "loss": 0.3563, "step": 75765 }, { "epoch": 2.7307456661981475, "grad_norm": 0.19777542352676392, "learning_rate": 2.2497521222826667e-05, "loss": 0.4028, "step": 75770 }, { "epoch": 2.7309258658593722, "grad_norm": 0.18318623304367065, "learning_rate": 2.2494617762652433e-05, "loss": 0.3846, "step": 75775 }, { "epoch": 2.731106065520597, "grad_norm": 0.17845581471920013, "learning_rate": 2.2491714336613534e-05, "loss": 0.4036, "step": 75780 }, { "epoch": 2.7312862651818213, "grad_norm": 0.20689518749713898, "learning_rate": 2.248881094474955e-05, "loss": 0.3367, "step": 75785 }, { "epoch": 2.731466464843046, "grad_norm": 0.22084331512451172, "learning_rate": 2.2485907587100034e-05, "loss": 0.3819, "step": 75790 }, { "epoch": 2.7316466645042707, "grad_norm": 0.2153303027153015, "learning_rate": 2.248300426370453e-05, "loss": 0.3989, "step": 75795 }, { "epoch": 2.7318268641654955, "grad_norm": 0.23561865091323853, "learning_rate": 2.2480100974602613e-05, "loss": 0.3838, "step": 75800 }, { "epoch": 2.73200706382672, "grad_norm": 0.24491235613822937, "learning_rate": 2.247719771983384e-05, "loss": 0.3881, "step": 75805 }, { "epoch": 2.7321872634879445, "grad_norm": 0.1986062377691269, "learning_rate": 2.247429449943774e-05, "loss": 0.3913, "step": 75810 }, { "epoch": 2.7323674631491692, "grad_norm": 0.20403479039669037, "learning_rate": 2.24713913134539e-05, "loss": 0.3769, "step": 75815 }, { "epoch": 2.732547662810394, "grad_norm": 0.232151597738266, 
"learning_rate": 2.2468488161921858e-05, "loss": 0.4194, "step": 75820 }, { "epoch": 2.7327278624716187, "grad_norm": 0.19631794095039368, "learning_rate": 2.2465585044881182e-05, "loss": 0.4294, "step": 75825 }, { "epoch": 2.732908062132843, "grad_norm": 0.20293517410755157, "learning_rate": 2.2462681962371424e-05, "loss": 0.3644, "step": 75830 }, { "epoch": 2.7330882617940677, "grad_norm": 0.19923901557922363, "learning_rate": 2.2459778914432116e-05, "loss": 0.3929, "step": 75835 }, { "epoch": 2.7332684614552925, "grad_norm": 0.24110738933086395, "learning_rate": 2.2456875901102845e-05, "loss": 0.3879, "step": 75840 }, { "epoch": 2.733448661116517, "grad_norm": 0.21238499879837036, "learning_rate": 2.245397292242313e-05, "loss": 0.3906, "step": 75845 }, { "epoch": 2.733628860777742, "grad_norm": 0.23390400409698486, "learning_rate": 2.245106997843256e-05, "loss": 0.4043, "step": 75850 }, { "epoch": 2.733809060438966, "grad_norm": 0.17990170419216156, "learning_rate": 2.244816706917066e-05, "loss": 0.4187, "step": 75855 }, { "epoch": 2.733989260100191, "grad_norm": 0.20971287786960602, "learning_rate": 2.244526419467699e-05, "loss": 0.3834, "step": 75860 }, { "epoch": 2.7341694597614157, "grad_norm": 0.21100729703903198, "learning_rate": 2.24423613549911e-05, "loss": 0.3752, "step": 75865 }, { "epoch": 2.7343496594226404, "grad_norm": 0.19430191814899445, "learning_rate": 2.2439458550152544e-05, "loss": 0.4059, "step": 75870 }, { "epoch": 2.7345298590838647, "grad_norm": 0.1902761608362198, "learning_rate": 2.243655578020086e-05, "loss": 0.4116, "step": 75875 }, { "epoch": 2.7347100587450894, "grad_norm": 0.20415811240673065, "learning_rate": 2.2433653045175614e-05, "loss": 0.3747, "step": 75880 }, { "epoch": 2.734890258406314, "grad_norm": 0.21149209141731262, "learning_rate": 2.2430750345116346e-05, "loss": 0.3851, "step": 75885 }, { "epoch": 2.735070458067539, "grad_norm": 0.2101927250623703, "learning_rate": 2.2427847680062612e-05, "loss": 0.4146, "step": 
75890 }, { "epoch": 2.7352506577287636, "grad_norm": 0.18154318630695343, "learning_rate": 2.2424945050053954e-05, "loss": 0.3912, "step": 75895 }, { "epoch": 2.7354308573899884, "grad_norm": 0.2463860660791397, "learning_rate": 2.2422042455129916e-05, "loss": 0.3867, "step": 75900 }, { "epoch": 2.7356110570512127, "grad_norm": 0.23593199253082275, "learning_rate": 2.2419139895330058e-05, "loss": 0.4111, "step": 75905 }, { "epoch": 2.7357912567124374, "grad_norm": 0.24448728561401367, "learning_rate": 2.2416237370693922e-05, "loss": 0.4308, "step": 75910 }, { "epoch": 2.735971456373662, "grad_norm": 0.21843284368515015, "learning_rate": 2.2413334881261038e-05, "loss": 0.3489, "step": 75915 }, { "epoch": 2.7361516560348864, "grad_norm": 0.20660178363323212, "learning_rate": 2.2410432427070975e-05, "loss": 0.3702, "step": 75920 }, { "epoch": 2.736331855696111, "grad_norm": 0.19898898899555206, "learning_rate": 2.240753000816326e-05, "loss": 0.3856, "step": 75925 }, { "epoch": 2.736512055357336, "grad_norm": 0.2137184739112854, "learning_rate": 2.240462762457746e-05, "loss": 0.3925, "step": 75930 }, { "epoch": 2.7366922550185606, "grad_norm": 0.21520856022834778, "learning_rate": 2.2401725276353103e-05, "loss": 0.4215, "step": 75935 }, { "epoch": 2.7368724546797853, "grad_norm": 0.22829484939575195, "learning_rate": 2.2398822963529722e-05, "loss": 0.3858, "step": 75940 }, { "epoch": 2.73705265434101, "grad_norm": 0.19753465056419373, "learning_rate": 2.2395920686146894e-05, "loss": 0.418, "step": 75945 }, { "epoch": 2.7372328540022344, "grad_norm": 0.19238927960395813, "learning_rate": 2.2393018444244132e-05, "loss": 0.4194, "step": 75950 }, { "epoch": 2.737413053663459, "grad_norm": 0.2262929379940033, "learning_rate": 2.2390116237860985e-05, "loss": 0.373, "step": 75955 }, { "epoch": 2.737593253324684, "grad_norm": 0.1645156592130661, "learning_rate": 2.238721406703701e-05, "loss": 0.3634, "step": 75960 }, { "epoch": 2.737773452985908, "grad_norm": 
0.20983080565929413, "learning_rate": 2.2384311931811728e-05, "loss": 0.3576, "step": 75965 }, { "epoch": 2.737953652647133, "grad_norm": 0.21075214445590973, "learning_rate": 2.238199024928914e-05, "loss": 0.4086, "step": 75970 }, { "epoch": 2.7381338523083576, "grad_norm": 0.2087734490633011, "learning_rate": 2.237908817824117e-05, "loss": 0.428, "step": 75975 }, { "epoch": 2.7383140519695823, "grad_norm": 0.22745364904403687, "learning_rate": 2.2376186142902626e-05, "loss": 0.4003, "step": 75980 }, { "epoch": 2.738494251630807, "grad_norm": 0.214681476354599, "learning_rate": 2.2373284143313015e-05, "loss": 0.3532, "step": 75985 }, { "epoch": 2.738674451292032, "grad_norm": 0.25850626826286316, "learning_rate": 2.2370382179511915e-05, "loss": 0.3884, "step": 75990 }, { "epoch": 2.738854650953256, "grad_norm": 0.2193749099969864, "learning_rate": 2.2367480251538842e-05, "loss": 0.3821, "step": 75995 }, { "epoch": 2.739034850614481, "grad_norm": 0.24161560833454132, "learning_rate": 2.2364578359433345e-05, "loss": 0.3848, "step": 76000 }, { "epoch": 2.739034850614481, "eval_loss": 0.43177616596221924, "eval_runtime": 3.531, "eval_samples_per_second": 28.321, "eval_steps_per_second": 7.08, "step": 76000 }, { "epoch": 2.7392150502757056, "grad_norm": 0.21785877645015717, "learning_rate": 2.2361676503234963e-05, "loss": 0.3629, "step": 76005 }, { "epoch": 2.73939524993693, "grad_norm": 0.24404390156269073, "learning_rate": 2.2358774682983213e-05, "loss": 0.4146, "step": 76010 }, { "epoch": 2.7395754495981546, "grad_norm": 0.20576035976409912, "learning_rate": 2.235587289871766e-05, "loss": 0.4005, "step": 76015 }, { "epoch": 2.7397556492593793, "grad_norm": 0.1887839436531067, "learning_rate": 2.2352971150477824e-05, "loss": 0.3787, "step": 76020 }, { "epoch": 2.739935848920604, "grad_norm": 0.20635947585105896, "learning_rate": 2.235006943830324e-05, "loss": 0.3778, "step": 76025 }, { "epoch": 2.740116048581829, "grad_norm": 0.21008598804473877, "learning_rate": 
2.234716776223345e-05, "loss": 0.3746, "step": 76030 }, { "epoch": 2.7402962482430535, "grad_norm": 0.2424369752407074, "learning_rate": 2.234426612230798e-05, "loss": 0.4026, "step": 76035 }, { "epoch": 2.740476447904278, "grad_norm": 0.2240283191204071, "learning_rate": 2.2341364518566378e-05, "loss": 0.4191, "step": 76040 }, { "epoch": 2.7406566475655025, "grad_norm": 0.22698961198329926, "learning_rate": 2.2338462951048165e-05, "loss": 0.4108, "step": 76045 }, { "epoch": 2.7408368472267273, "grad_norm": 0.1691308468580246, "learning_rate": 2.2335561419792876e-05, "loss": 0.3827, "step": 76050 }, { "epoch": 2.741017046887952, "grad_norm": 0.18437933921813965, "learning_rate": 2.233265992484005e-05, "loss": 0.3761, "step": 76055 }, { "epoch": 2.7411972465491763, "grad_norm": 0.23850589990615845, "learning_rate": 2.232975846622922e-05, "loss": 0.4086, "step": 76060 }, { "epoch": 2.741377446210401, "grad_norm": 0.20492541790008545, "learning_rate": 2.2326857043999906e-05, "loss": 0.3932, "step": 76065 }, { "epoch": 2.7415576458716258, "grad_norm": 0.219580739736557, "learning_rate": 2.2323955658191653e-05, "loss": 0.3816, "step": 76070 }, { "epoch": 2.7417378455328505, "grad_norm": 0.23200367391109467, "learning_rate": 2.232105430884398e-05, "loss": 0.3838, "step": 76075 }, { "epoch": 2.7419180451940752, "grad_norm": 0.24806389212608337, "learning_rate": 2.231815299599643e-05, "loss": 0.4068, "step": 76080 }, { "epoch": 2.7420982448552995, "grad_norm": 0.17643065750598907, "learning_rate": 2.2315251719688536e-05, "loss": 0.3776, "step": 76085 }, { "epoch": 2.7422784445165242, "grad_norm": 0.2253817468881607, "learning_rate": 2.231235047995979e-05, "loss": 0.3983, "step": 76090 }, { "epoch": 2.742458644177749, "grad_norm": 0.24909770488739014, "learning_rate": 2.2309449276849775e-05, "loss": 0.4178, "step": 76095 }, { "epoch": 2.7426388438389737, "grad_norm": 0.21218940615653992, "learning_rate": 2.2306548110397973e-05, "loss": 0.3915, "step": 76100 }, { "epoch": 
2.742819043500198, "grad_norm": 0.19259501993656158, "learning_rate": 2.2303646980643948e-05, "loss": 0.3977, "step": 76105 }, { "epoch": 2.7429992431614227, "grad_norm": 0.22517803311347961, "learning_rate": 2.2300745887627206e-05, "loss": 0.383, "step": 76110 }, { "epoch": 2.7431794428226475, "grad_norm": 0.2479083389043808, "learning_rate": 2.2297844831387265e-05, "loss": 0.392, "step": 76115 }, { "epoch": 2.743359642483872, "grad_norm": 0.2069377303123474, "learning_rate": 2.2294943811963682e-05, "loss": 0.4116, "step": 76120 }, { "epoch": 2.743539842145097, "grad_norm": 0.21161840856075287, "learning_rate": 2.2292042829395964e-05, "loss": 0.4078, "step": 76125 }, { "epoch": 2.7437200418063217, "grad_norm": 0.24700872600078583, "learning_rate": 2.228914188372363e-05, "loss": 0.4206, "step": 76130 }, { "epoch": 2.743900241467546, "grad_norm": 0.19092321395874023, "learning_rate": 2.2286240974986218e-05, "loss": 0.4151, "step": 76135 }, { "epoch": 2.7440804411287707, "grad_norm": 0.20470459759235382, "learning_rate": 2.2283340103223243e-05, "loss": 0.4057, "step": 76140 }, { "epoch": 2.7442606407899954, "grad_norm": 0.18111029267311096, "learning_rate": 2.2280439268474236e-05, "loss": 0.3995, "step": 76145 }, { "epoch": 2.7444408404512197, "grad_norm": 0.20974625647068024, "learning_rate": 2.2277538470778722e-05, "loss": 0.3997, "step": 76150 }, { "epoch": 2.7446210401124445, "grad_norm": 0.24757510423660278, "learning_rate": 2.227463771017621e-05, "loss": 0.4023, "step": 76155 }, { "epoch": 2.744801239773669, "grad_norm": 0.2829555869102478, "learning_rate": 2.227173698670624e-05, "loss": 0.3991, "step": 76160 }, { "epoch": 2.744981439434894, "grad_norm": 0.1985725462436676, "learning_rate": 2.2268836300408323e-05, "loss": 0.4086, "step": 76165 }, { "epoch": 2.7451616390961187, "grad_norm": 0.23659586906433105, "learning_rate": 2.226593565132198e-05, "loss": 0.3794, "step": 76170 }, { "epoch": 2.7453418387573434, "grad_norm": 0.21936027705669403, 
"learning_rate": 2.2263035039486734e-05, "loss": 0.3971, "step": 76175 }, { "epoch": 2.7455220384185677, "grad_norm": 0.21939417719841003, "learning_rate": 2.2260134464942108e-05, "loss": 0.4245, "step": 76180 }, { "epoch": 2.7457022380797924, "grad_norm": 0.2699432969093323, "learning_rate": 2.225723392772762e-05, "loss": 0.4004, "step": 76185 }, { "epoch": 2.745882437741017, "grad_norm": 0.20219436287879944, "learning_rate": 2.2254333427882795e-05, "loss": 0.4058, "step": 76190 }, { "epoch": 2.7460626374022414, "grad_norm": 0.2475823611021042, "learning_rate": 2.2251432965447126e-05, "loss": 0.4338, "step": 76195 }, { "epoch": 2.746242837063466, "grad_norm": 0.18280702829360962, "learning_rate": 2.2248532540460173e-05, "loss": 0.3821, "step": 76200 }, { "epoch": 2.746423036724691, "grad_norm": 0.2569371461868286, "learning_rate": 2.2245632152961425e-05, "loss": 0.4106, "step": 76205 }, { "epoch": 2.7466032363859156, "grad_norm": 0.22354593873023987, "learning_rate": 2.2242731802990396e-05, "loss": 0.3885, "step": 76210 }, { "epoch": 2.7467834360471404, "grad_norm": 0.17847710847854614, "learning_rate": 2.223983149058662e-05, "loss": 0.4149, "step": 76215 }, { "epoch": 2.746963635708365, "grad_norm": 0.22239890694618225, "learning_rate": 2.2236931215789604e-05, "loss": 0.3831, "step": 76220 }, { "epoch": 2.7471438353695894, "grad_norm": 0.22584442794322968, "learning_rate": 2.2234030978638865e-05, "loss": 0.3784, "step": 76225 }, { "epoch": 2.747324035030814, "grad_norm": 0.2300465852022171, "learning_rate": 2.223113077917392e-05, "loss": 0.3863, "step": 76230 }, { "epoch": 2.747504234692039, "grad_norm": 0.21806028485298157, "learning_rate": 2.2228230617434276e-05, "loss": 0.408, "step": 76235 }, { "epoch": 2.747684434353263, "grad_norm": 0.26781561970710754, "learning_rate": 2.222533049345946e-05, "loss": 0.4062, "step": 76240 }, { "epoch": 2.747864634014488, "grad_norm": 0.23583123087882996, "learning_rate": 2.222243040728898e-05, "loss": 0.3975, "step": 76245 
}, { "epoch": 2.7480448336757126, "grad_norm": 0.1862766593694687, "learning_rate": 2.221953035896234e-05, "loss": 0.4025, "step": 76250 }, { "epoch": 2.7482250333369374, "grad_norm": 0.21025675535202026, "learning_rate": 2.2216630348519067e-05, "loss": 0.4092, "step": 76255 }, { "epoch": 2.748405232998162, "grad_norm": 0.1862116903066635, "learning_rate": 2.2213730375998663e-05, "loss": 0.398, "step": 76260 }, { "epoch": 2.748585432659387, "grad_norm": 0.18690145015716553, "learning_rate": 2.2210830441440646e-05, "loss": 0.3963, "step": 76265 }, { "epoch": 2.748765632320611, "grad_norm": 0.20919784903526306, "learning_rate": 2.220793054488453e-05, "loss": 0.4236, "step": 76270 }, { "epoch": 2.748945831981836, "grad_norm": 0.24846714735031128, "learning_rate": 2.2205030686369805e-05, "loss": 0.3915, "step": 76275 }, { "epoch": 2.7491260316430606, "grad_norm": 0.19307979941368103, "learning_rate": 2.2202130865936006e-05, "loss": 0.3978, "step": 76280 }, { "epoch": 2.7493062313042853, "grad_norm": 0.2004070281982422, "learning_rate": 2.2199231083622627e-05, "loss": 0.4069, "step": 76285 }, { "epoch": 2.7494864309655096, "grad_norm": 0.20964334905147552, "learning_rate": 2.2196331339469187e-05, "loss": 0.3888, "step": 76290 }, { "epoch": 2.7496666306267343, "grad_norm": 0.1950366199016571, "learning_rate": 2.2193431633515194e-05, "loss": 0.3997, "step": 76295 }, { "epoch": 2.749846830287959, "grad_norm": 0.19967371225357056, "learning_rate": 2.2190531965800138e-05, "loss": 0.4061, "step": 76300 }, { "epoch": 2.750027029949184, "grad_norm": 0.22163501381874084, "learning_rate": 2.2187632336363555e-05, "loss": 0.3802, "step": 76305 }, { "epoch": 2.7502072296104085, "grad_norm": 0.21866455674171448, "learning_rate": 2.218473274524493e-05, "loss": 0.4057, "step": 76310 }, { "epoch": 2.750387429271633, "grad_norm": 0.20910955965518951, "learning_rate": 2.2181833192483774e-05, "loss": 0.4417, "step": 76315 }, { "epoch": 2.7505676289328576, "grad_norm": 0.21740221977233887, 
"learning_rate": 2.2178933678119598e-05, "loss": 0.3984, "step": 76320 }, { "epoch": 2.7507478285940823, "grad_norm": 0.28222358226776123, "learning_rate": 2.21760342021919e-05, "loss": 0.3966, "step": 76325 }, { "epoch": 2.750928028255307, "grad_norm": 0.16867958009243011, "learning_rate": 2.2173134764740196e-05, "loss": 0.4051, "step": 76330 }, { "epoch": 2.7511082279165313, "grad_norm": 0.26838427782058716, "learning_rate": 2.217023536580398e-05, "loss": 0.4368, "step": 76335 }, { "epoch": 2.751288427577756, "grad_norm": 0.21444910764694214, "learning_rate": 2.2167336005422758e-05, "loss": 0.3754, "step": 76340 }, { "epoch": 2.751468627238981, "grad_norm": 0.23337894678115845, "learning_rate": 2.2164436683636035e-05, "loss": 0.4093, "step": 76345 }, { "epoch": 2.7516488269002055, "grad_norm": 0.2542593479156494, "learning_rate": 2.216153740048332e-05, "loss": 0.4148, "step": 76350 }, { "epoch": 2.7518290265614302, "grad_norm": 0.24587145447731018, "learning_rate": 2.2158638156004098e-05, "loss": 0.3793, "step": 76355 }, { "epoch": 2.7520092262226545, "grad_norm": 0.19757163524627686, "learning_rate": 2.2155738950237887e-05, "loss": 0.4132, "step": 76360 }, { "epoch": 2.7521894258838793, "grad_norm": 0.21361534297466278, "learning_rate": 2.215283978322418e-05, "loss": 0.3932, "step": 76365 }, { "epoch": 2.752369625545104, "grad_norm": 0.17598572373390198, "learning_rate": 2.214994065500248e-05, "loss": 0.3777, "step": 76370 }, { "epoch": 2.7525498252063287, "grad_norm": 0.17586344480514526, "learning_rate": 2.2147041565612294e-05, "loss": 0.39, "step": 76375 }, { "epoch": 2.752730024867553, "grad_norm": 0.17030631005764008, "learning_rate": 2.2144142515093097e-05, "loss": 0.3821, "step": 76380 }, { "epoch": 2.7529102245287778, "grad_norm": 0.22826333343982697, "learning_rate": 2.2141243503484426e-05, "loss": 0.4265, "step": 76385 }, { "epoch": 2.7530904241900025, "grad_norm": 0.2388213574886322, "learning_rate": 2.213834453082575e-05, "loss": 0.3792, "step": 
76390 }, { "epoch": 2.7532706238512272, "grad_norm": 0.19387775659561157, "learning_rate": 2.213544559715657e-05, "loss": 0.4057, "step": 76395 }, { "epoch": 2.753450823512452, "grad_norm": 0.22617575526237488, "learning_rate": 2.2132546702516395e-05, "loss": 0.4345, "step": 76400 }, { "epoch": 2.7536310231736767, "grad_norm": 0.19259920716285706, "learning_rate": 2.2129647846944708e-05, "loss": 0.367, "step": 76405 }, { "epoch": 2.753811222834901, "grad_norm": 0.20966337621212006, "learning_rate": 2.2126749030481026e-05, "loss": 0.3878, "step": 76410 }, { "epoch": 2.7539914224961257, "grad_norm": 0.22299322485923767, "learning_rate": 2.2123850253164826e-05, "loss": 0.4122, "step": 76415 }, { "epoch": 2.7541716221573505, "grad_norm": 0.1778857707977295, "learning_rate": 2.2120951515035605e-05, "loss": 0.4066, "step": 76420 }, { "epoch": 2.7543518218185747, "grad_norm": 0.22316712141036987, "learning_rate": 2.2118052816132873e-05, "loss": 0.4025, "step": 76425 }, { "epoch": 2.7545320214797995, "grad_norm": 0.19544607400894165, "learning_rate": 2.2115154156496105e-05, "loss": 0.4107, "step": 76430 }, { "epoch": 2.754712221141024, "grad_norm": 0.25387707352638245, "learning_rate": 2.2112255536164807e-05, "loss": 0.3657, "step": 76435 }, { "epoch": 2.754892420802249, "grad_norm": 0.21383005380630493, "learning_rate": 2.2109356955178463e-05, "loss": 0.3826, "step": 76440 }, { "epoch": 2.7550726204634737, "grad_norm": 0.19232189655303955, "learning_rate": 2.2106458413576573e-05, "loss": 0.403, "step": 76445 }, { "epoch": 2.7552528201246984, "grad_norm": 0.23429124057292938, "learning_rate": 2.210355991139863e-05, "loss": 0.3711, "step": 76450 }, { "epoch": 2.7554330197859227, "grad_norm": 0.2584487795829773, "learning_rate": 2.2100661448684123e-05, "loss": 0.389, "step": 76455 }, { "epoch": 2.7556132194471474, "grad_norm": 0.2131696194410324, "learning_rate": 2.2097763025472536e-05, "loss": 0.3589, "step": 76460 }, { "epoch": 2.755793419108372, "grad_norm": 
0.21315625309944153, "learning_rate": 2.2094864641803372e-05, "loss": 0.3698, "step": 76465 }, { "epoch": 2.7559736187695965, "grad_norm": 0.2009851336479187, "learning_rate": 2.209196629771612e-05, "loss": 0.3787, "step": 76470 }, { "epoch": 2.756153818430821, "grad_norm": 0.24308665096759796, "learning_rate": 2.208906799325025e-05, "loss": 0.3962, "step": 76475 }, { "epoch": 2.756334018092046, "grad_norm": 0.2490856796503067, "learning_rate": 2.2086169728445276e-05, "loss": 0.4196, "step": 76480 }, { "epoch": 2.7565142177532707, "grad_norm": 0.20798978209495544, "learning_rate": 2.2083271503340662e-05, "loss": 0.3762, "step": 76485 }, { "epoch": 2.7566944174144954, "grad_norm": 0.21434953808784485, "learning_rate": 2.2080373317975927e-05, "loss": 0.4108, "step": 76490 }, { "epoch": 2.75687461707572, "grad_norm": 0.19935797154903412, "learning_rate": 2.2077475172390536e-05, "loss": 0.4004, "step": 76495 }, { "epoch": 2.7570548167369444, "grad_norm": 0.19653324782848358, "learning_rate": 2.2074577066623974e-05, "loss": 0.3803, "step": 76500 }, { "epoch": 2.7570548167369444, "eval_loss": 0.4315706491470337, "eval_runtime": 3.524, "eval_samples_per_second": 28.377, "eval_steps_per_second": 7.094, "step": 76500 }, { "epoch": 2.757235016398169, "grad_norm": 0.2254483997821808, "learning_rate": 2.2071679000715733e-05, "loss": 0.4047, "step": 76505 }, { "epoch": 2.757415216059394, "grad_norm": 0.20184098184108734, "learning_rate": 2.2068780974705298e-05, "loss": 0.386, "step": 76510 }, { "epoch": 2.757595415720618, "grad_norm": 0.20205454528331757, "learning_rate": 2.206588298863216e-05, "loss": 0.3598, "step": 76515 }, { "epoch": 2.757775615381843, "grad_norm": 0.18086735904216766, "learning_rate": 2.2062985042535797e-05, "loss": 0.3592, "step": 76520 }, { "epoch": 2.7579558150430676, "grad_norm": 0.18240606784820557, "learning_rate": 2.2060087136455687e-05, "loss": 0.3759, "step": 76525 }, { "epoch": 2.7581360147042924, "grad_norm": 0.19021601974964142, 
"learning_rate": 2.205718927043133e-05, "loss": 0.4057, "step": 76530 }, { "epoch": 2.758316214365517, "grad_norm": 0.19175371527671814, "learning_rate": 2.2054291444502198e-05, "loss": 0.3937, "step": 76535 }, { "epoch": 2.758496414026742, "grad_norm": 0.2313561886548996, "learning_rate": 2.2051393658707766e-05, "loss": 0.3928, "step": 76540 }, { "epoch": 2.758676613687966, "grad_norm": 0.19368553161621094, "learning_rate": 2.2048495913087535e-05, "loss": 0.3741, "step": 76545 }, { "epoch": 2.758856813349191, "grad_norm": 0.20152942836284637, "learning_rate": 2.2045598207680968e-05, "loss": 0.3813, "step": 76550 }, { "epoch": 2.7590370130104156, "grad_norm": 0.2724568843841553, "learning_rate": 2.204270054252756e-05, "loss": 0.386, "step": 76555 }, { "epoch": 2.7592172126716403, "grad_norm": 0.2629745602607727, "learning_rate": 2.203980291766679e-05, "loss": 0.4035, "step": 76560 }, { "epoch": 2.7593974123328646, "grad_norm": 0.19395357370376587, "learning_rate": 2.2036905333138115e-05, "loss": 0.3902, "step": 76565 }, { "epoch": 2.7595776119940894, "grad_norm": 0.19974055886268616, "learning_rate": 2.203400778898104e-05, "loss": 0.3891, "step": 76570 }, { "epoch": 2.759757811655314, "grad_norm": 0.21690985560417175, "learning_rate": 2.203111028523504e-05, "loss": 0.3544, "step": 76575 }, { "epoch": 2.759938011316539, "grad_norm": 0.16456058621406555, "learning_rate": 2.2028212821939576e-05, "loss": 0.378, "step": 76580 }, { "epoch": 2.7601182109777636, "grad_norm": 0.17220036685466766, "learning_rate": 2.202531539913415e-05, "loss": 0.3982, "step": 76585 }, { "epoch": 2.760298410638988, "grad_norm": 0.2109745293855667, "learning_rate": 2.202241801685821e-05, "loss": 0.3956, "step": 76590 }, { "epoch": 2.7604786103002126, "grad_norm": 0.18770918250083923, "learning_rate": 2.2019520675151263e-05, "loss": 0.3672, "step": 76595 }, { "epoch": 2.7606588099614373, "grad_norm": 0.22950011491775513, "learning_rate": 2.2016623374052766e-05, "loss": 0.3903, "step": 76600 }, 
{ "epoch": 2.760839009622662, "grad_norm": 0.19520814716815948, "learning_rate": 2.2013726113602192e-05, "loss": 0.3848, "step": 76605 }, { "epoch": 2.7610192092838863, "grad_norm": 0.20134703814983368, "learning_rate": 2.201082889383903e-05, "loss": 0.4014, "step": 76610 }, { "epoch": 2.761199408945111, "grad_norm": 0.19556838274002075, "learning_rate": 2.200793171480274e-05, "loss": 0.4127, "step": 76615 }, { "epoch": 2.761379608606336, "grad_norm": 0.20522183179855347, "learning_rate": 2.20050345765328e-05, "loss": 0.4185, "step": 76620 }, { "epoch": 2.7615598082675605, "grad_norm": 0.20291735231876373, "learning_rate": 2.2002137479068684e-05, "loss": 0.3848, "step": 76625 }, { "epoch": 2.7617400079287853, "grad_norm": 0.18468672037124634, "learning_rate": 2.1999240422449863e-05, "loss": 0.4037, "step": 76630 }, { "epoch": 2.76192020759001, "grad_norm": 0.19746728241443634, "learning_rate": 2.1996343406715815e-05, "loss": 0.3784, "step": 76635 }, { "epoch": 2.7621004072512343, "grad_norm": 0.17763979732990265, "learning_rate": 2.1993446431906007e-05, "loss": 0.4116, "step": 76640 }, { "epoch": 2.762280606912459, "grad_norm": 0.2670569121837616, "learning_rate": 2.1990549498059906e-05, "loss": 0.3857, "step": 76645 }, { "epoch": 2.7624608065736838, "grad_norm": 0.21598556637763977, "learning_rate": 2.198765260521699e-05, "loss": 0.3965, "step": 76650 }, { "epoch": 2.762641006234908, "grad_norm": 0.17488926649093628, "learning_rate": 2.1984755753416728e-05, "loss": 0.3939, "step": 76655 }, { "epoch": 2.762821205896133, "grad_norm": 0.2201049029827118, "learning_rate": 2.1981858942698568e-05, "loss": 0.4167, "step": 76660 }, { "epoch": 2.7630014055573575, "grad_norm": 0.21279020607471466, "learning_rate": 2.1978962173102015e-05, "loss": 0.3794, "step": 76665 }, { "epoch": 2.7631816052185822, "grad_norm": 0.1922799050807953, "learning_rate": 2.1976065444666495e-05, "loss": 0.4126, "step": 76670 }, { "epoch": 2.763361804879807, "grad_norm": 0.19106921553611755, 
"learning_rate": 2.197316875743152e-05, "loss": 0.3927, "step": 76675 }, { "epoch": 2.7635420045410317, "grad_norm": 0.25177741050720215, "learning_rate": 2.1970272111436527e-05, "loss": 0.3636, "step": 76680 }, { "epoch": 2.763722204202256, "grad_norm": 0.21334508061408997, "learning_rate": 2.196737550672098e-05, "loss": 0.3626, "step": 76685 }, { "epoch": 2.7639024038634807, "grad_norm": 0.21705521643161774, "learning_rate": 2.196447894332437e-05, "loss": 0.377, "step": 76690 }, { "epoch": 2.7640826035247055, "grad_norm": 0.20139169692993164, "learning_rate": 2.196158242128613e-05, "loss": 0.4, "step": 76695 }, { "epoch": 2.7642628031859298, "grad_norm": 0.17421554028987885, "learning_rate": 2.195868594064576e-05, "loss": 0.3998, "step": 76700 }, { "epoch": 2.7644430028471545, "grad_norm": 0.1658887267112732, "learning_rate": 2.1955789501442696e-05, "loss": 0.3811, "step": 76705 }, { "epoch": 2.7646232025083792, "grad_norm": 0.1670427918434143, "learning_rate": 2.1952893103716408e-05, "loss": 0.3959, "step": 76710 }, { "epoch": 2.764803402169604, "grad_norm": 0.21571218967437744, "learning_rate": 2.194999674750637e-05, "loss": 0.4167, "step": 76715 }, { "epoch": 2.7649836018308287, "grad_norm": 0.25938844680786133, "learning_rate": 2.194710043285203e-05, "loss": 0.3906, "step": 76720 }, { "epoch": 2.7651638014920534, "grad_norm": 0.1947462111711502, "learning_rate": 2.1944204159792854e-05, "loss": 0.3942, "step": 76725 }, { "epoch": 2.7653440011532777, "grad_norm": 0.19629132747650146, "learning_rate": 2.1941307928368305e-05, "loss": 0.4012, "step": 76730 }, { "epoch": 2.7655242008145025, "grad_norm": 0.20872166752815247, "learning_rate": 2.1938411738617843e-05, "loss": 0.4147, "step": 76735 }, { "epoch": 2.765704400475727, "grad_norm": 0.2229888141155243, "learning_rate": 2.1935515590580934e-05, "loss": 0.372, "step": 76740 }, { "epoch": 2.7658846001369515, "grad_norm": 0.23434416949748993, "learning_rate": 2.193261948429703e-05, "loss": 0.404, "step": 76745 }, 
{ "epoch": 2.766064799798176, "grad_norm": 0.2255534827709198, "learning_rate": 2.1929723419805582e-05, "loss": 0.4228, "step": 76750 }, { "epoch": 2.766244999459401, "grad_norm": 0.21123521029949188, "learning_rate": 2.192682739714607e-05, "loss": 0.371, "step": 76755 }, { "epoch": 2.7664251991206257, "grad_norm": 0.18391209840774536, "learning_rate": 2.1923931416357944e-05, "loss": 0.3993, "step": 76760 }, { "epoch": 2.7666053987818504, "grad_norm": 0.19269075989723206, "learning_rate": 2.1921035477480636e-05, "loss": 0.3941, "step": 76765 }, { "epoch": 2.766785598443075, "grad_norm": 0.18515953421592712, "learning_rate": 2.1918139580553644e-05, "loss": 0.4063, "step": 76770 }, { "epoch": 2.7669657981042994, "grad_norm": 0.21497590839862823, "learning_rate": 2.1915243725616386e-05, "loss": 0.4265, "step": 76775 }, { "epoch": 2.767145997765524, "grad_norm": 0.19425266981124878, "learning_rate": 2.1912347912708354e-05, "loss": 0.405, "step": 76780 }, { "epoch": 2.767326197426749, "grad_norm": 0.24012631177902222, "learning_rate": 2.1909452141868975e-05, "loss": 0.4209, "step": 76785 }, { "epoch": 2.7675063970879736, "grad_norm": 0.2343771904706955, "learning_rate": 2.190655641313771e-05, "loss": 0.3743, "step": 76790 }, { "epoch": 2.767686596749198, "grad_norm": 0.23053786158561707, "learning_rate": 2.1903660726554016e-05, "loss": 0.4217, "step": 76795 }, { "epoch": 2.7678667964104227, "grad_norm": 0.19305402040481567, "learning_rate": 2.1900765082157347e-05, "loss": 0.3926, "step": 76800 }, { "epoch": 2.7680469960716474, "grad_norm": 0.18887373805046082, "learning_rate": 2.1897869479987148e-05, "loss": 0.3979, "step": 76805 }, { "epoch": 2.768227195732872, "grad_norm": 0.2448185682296753, "learning_rate": 2.1894973920082884e-05, "loss": 0.4011, "step": 76810 }, { "epoch": 2.768407395394097, "grad_norm": 0.2089599370956421, "learning_rate": 2.189207840248399e-05, "loss": 0.383, "step": 76815 }, { "epoch": 2.768587595055321, "grad_norm": 0.24234536290168762, 
"learning_rate": 2.1889182927229936e-05, "loss": 0.3862, "step": 76820 }, { "epoch": 2.768767794716546, "grad_norm": 0.28005969524383545, "learning_rate": 2.188628749436016e-05, "loss": 0.405, "step": 76825 }, { "epoch": 2.7689479943777706, "grad_norm": 0.20914676785469055, "learning_rate": 2.188339210391411e-05, "loss": 0.3951, "step": 76830 }, { "epoch": 2.7691281940389953, "grad_norm": 0.20570850372314453, "learning_rate": 2.1880496755931244e-05, "loss": 0.4507, "step": 76835 }, { "epoch": 2.7693083937002196, "grad_norm": 0.20999915897846222, "learning_rate": 2.187760145045101e-05, "loss": 0.4017, "step": 76840 }, { "epoch": 2.7694885933614444, "grad_norm": 0.16798564791679382, "learning_rate": 2.1874706187512836e-05, "loss": 0.3827, "step": 76845 }, { "epoch": 2.769668793022669, "grad_norm": 0.20106874406337738, "learning_rate": 2.18718109671562e-05, "loss": 0.4012, "step": 76850 }, { "epoch": 2.769848992683894, "grad_norm": 0.20237289369106293, "learning_rate": 2.1868915789420522e-05, "loss": 0.3978, "step": 76855 }, { "epoch": 2.7700291923451186, "grad_norm": 0.2165011614561081, "learning_rate": 2.186602065434527e-05, "loss": 0.4484, "step": 76860 }, { "epoch": 2.770209392006343, "grad_norm": 0.19253584742546082, "learning_rate": 2.1863125561969883e-05, "loss": 0.3802, "step": 76865 }, { "epoch": 2.7703895916675676, "grad_norm": 0.19273778796195984, "learning_rate": 2.1860230512333784e-05, "loss": 0.3859, "step": 76870 }, { "epoch": 2.7705697913287923, "grad_norm": 0.1797461360692978, "learning_rate": 2.185733550547646e-05, "loss": 0.3808, "step": 76875 }, { "epoch": 2.770749990990017, "grad_norm": 0.2192516326904297, "learning_rate": 2.185444054143731e-05, "loss": 0.4253, "step": 76880 }, { "epoch": 2.7709301906512414, "grad_norm": 0.17714661359786987, "learning_rate": 2.1851545620255816e-05, "loss": 0.4122, "step": 76885 }, { "epoch": 2.771110390312466, "grad_norm": 0.26212644577026367, "learning_rate": 2.1848650741971395e-05, "loss": 0.4195, "step": 76890 
}, { "epoch": 2.771290589973691, "grad_norm": 0.1727093756198883, "learning_rate": 2.1845755906623498e-05, "loss": 0.4085, "step": 76895 }, { "epoch": 2.7714707896349156, "grad_norm": 0.18318147957324982, "learning_rate": 2.184286111425157e-05, "loss": 0.349, "step": 76900 }, { "epoch": 2.7716509892961403, "grad_norm": 0.23283013701438904, "learning_rate": 2.1839966364895042e-05, "loss": 0.3714, "step": 76905 }, { "epoch": 2.771831188957365, "grad_norm": 0.20274753868579865, "learning_rate": 2.183707165859336e-05, "loss": 0.3953, "step": 76910 }, { "epoch": 2.7720113886185893, "grad_norm": 0.25053125619888306, "learning_rate": 2.183417699538597e-05, "loss": 0.4063, "step": 76915 }, { "epoch": 2.772191588279814, "grad_norm": 0.17475423216819763, "learning_rate": 2.18312823753123e-05, "loss": 0.386, "step": 76920 }, { "epoch": 2.772371787941039, "grad_norm": 0.19454878568649292, "learning_rate": 2.18283877984118e-05, "loss": 0.382, "step": 76925 }, { "epoch": 2.772551987602263, "grad_norm": 0.2621554434299469, "learning_rate": 2.1825493264723902e-05, "loss": 0.4036, "step": 76930 }, { "epoch": 2.772732187263488, "grad_norm": 0.211991548538208, "learning_rate": 2.1822598774288034e-05, "loss": 0.3894, "step": 76935 }, { "epoch": 2.7729123869247125, "grad_norm": 0.20913714170455933, "learning_rate": 2.181970432714365e-05, "loss": 0.3712, "step": 76940 }, { "epoch": 2.7730925865859373, "grad_norm": 0.22146856784820557, "learning_rate": 2.1816809923330188e-05, "loss": 0.3821, "step": 76945 }, { "epoch": 2.773272786247162, "grad_norm": 0.16859367489814758, "learning_rate": 2.1813915562887054e-05, "loss": 0.4161, "step": 76950 }, { "epoch": 2.7734529859083867, "grad_norm": 0.2146807461977005, "learning_rate": 2.1811021245853724e-05, "loss": 0.4039, "step": 76955 }, { "epoch": 2.773633185569611, "grad_norm": 0.23717908561229706, "learning_rate": 2.1808126972269594e-05, "loss": 0.403, "step": 76960 }, { "epoch": 2.7738133852308358, "grad_norm": 0.23532505333423615, 
"learning_rate": 2.180523274217413e-05, "loss": 0.3809, "step": 76965 }, { "epoch": 2.7739935848920605, "grad_norm": 0.21510958671569824, "learning_rate": 2.180233855560675e-05, "loss": 0.3985, "step": 76970 }, { "epoch": 2.774173784553285, "grad_norm": 0.21763652563095093, "learning_rate": 2.1799444412606873e-05, "loss": 0.3586, "step": 76975 }, { "epoch": 2.7743539842145095, "grad_norm": 0.23686784505844116, "learning_rate": 2.1796550313213963e-05, "loss": 0.4179, "step": 76980 }, { "epoch": 2.7745341838757343, "grad_norm": 0.1995023488998413, "learning_rate": 2.1793656257467432e-05, "loss": 0.4106, "step": 76985 }, { "epoch": 2.774714383536959, "grad_norm": 0.20156660676002502, "learning_rate": 2.179076224540671e-05, "loss": 0.3956, "step": 76990 }, { "epoch": 2.7748945831981837, "grad_norm": 0.19944174587726593, "learning_rate": 2.1787868277071234e-05, "loss": 0.3759, "step": 76995 }, { "epoch": 2.7750747828594085, "grad_norm": 0.22275997698307037, "learning_rate": 2.1784974352500423e-05, "loss": 0.4041, "step": 77000 }, { "epoch": 2.7750747828594085, "eval_loss": 0.4315272569656372, "eval_runtime": 3.5286, "eval_samples_per_second": 28.34, "eval_steps_per_second": 7.085, "step": 77000 }, { "epoch": 2.7752549825206327, "grad_norm": 0.19483999907970428, "learning_rate": 2.178208047173372e-05, "loss": 0.411, "step": 77005 }, { "epoch": 2.7754351821818575, "grad_norm": 0.18567782640457153, "learning_rate": 2.177918663481055e-05, "loss": 0.3731, "step": 77010 }, { "epoch": 2.775615381843082, "grad_norm": 0.22382299602031708, "learning_rate": 2.1776292841770333e-05, "loss": 0.4029, "step": 77015 }, { "epoch": 2.7757955815043065, "grad_norm": 0.20525696873664856, "learning_rate": 2.1773399092652507e-05, "loss": 0.3781, "step": 77020 }, { "epoch": 2.7759757811655312, "grad_norm": 0.21928441524505615, "learning_rate": 2.1770505387496494e-05, "loss": 0.3852, "step": 77025 }, { "epoch": 2.776155980826756, "grad_norm": 0.25740548968315125, "learning_rate": 
2.1767611726341714e-05, "loss": 0.4268, "step": 77030 }, { "epoch": 2.7763361804879807, "grad_norm": 0.21519418060779572, "learning_rate": 2.17647181092276e-05, "loss": 0.4104, "step": 77035 }, { "epoch": 2.7765163801492054, "grad_norm": 0.19074714183807373, "learning_rate": 2.1761824536193575e-05, "loss": 0.4052, "step": 77040 }, { "epoch": 2.77669657981043, "grad_norm": 0.20910242199897766, "learning_rate": 2.1758931007279067e-05, "loss": 0.3869, "step": 77045 }, { "epoch": 2.7768767794716545, "grad_norm": 0.22091153264045715, "learning_rate": 2.1756037522523503e-05, "loss": 0.4058, "step": 77050 }, { "epoch": 2.777056979132879, "grad_norm": 0.18852129578590393, "learning_rate": 2.175314408196628e-05, "loss": 0.4099, "step": 77055 }, { "epoch": 2.777237178794104, "grad_norm": 0.23839035630226135, "learning_rate": 2.1750250685646863e-05, "loss": 0.3927, "step": 77060 }, { "epoch": 2.7774173784553287, "grad_norm": 0.20940519869327545, "learning_rate": 2.174735733360464e-05, "loss": 0.3632, "step": 77065 }, { "epoch": 2.777597578116553, "grad_norm": 0.24172858893871307, "learning_rate": 2.174446402587904e-05, "loss": 0.3759, "step": 77070 }, { "epoch": 2.7777777777777777, "grad_norm": 0.2583145201206207, "learning_rate": 2.1741570762509495e-05, "loss": 0.3658, "step": 77075 }, { "epoch": 2.7779579774390024, "grad_norm": 0.21428237855434418, "learning_rate": 2.173867754353541e-05, "loss": 0.4038, "step": 77080 }, { "epoch": 2.778138177100227, "grad_norm": 0.1586180180311203, "learning_rate": 2.173578436899622e-05, "loss": 0.3844, "step": 77085 }, { "epoch": 2.778318376761452, "grad_norm": 0.22997400164604187, "learning_rate": 2.1732891238931334e-05, "loss": 0.3767, "step": 77090 }, { "epoch": 2.778498576422676, "grad_norm": 0.22127170860767365, "learning_rate": 2.1729998153380165e-05, "loss": 0.3797, "step": 77095 }, { "epoch": 2.778678776083901, "grad_norm": 0.25849655270576477, "learning_rate": 2.1727105112382147e-05, "loss": 0.4181, "step": 77100 }, { "epoch": 
2.7788589757451256, "grad_norm": 0.18317921459674835, "learning_rate": 2.172421211597668e-05, "loss": 0.3667, "step": 77105 }, { "epoch": 2.7790391754063504, "grad_norm": 0.21405282616615295, "learning_rate": 2.1721319164203195e-05, "loss": 0.4073, "step": 77110 }, { "epoch": 2.7792193750675747, "grad_norm": 0.18047797679901123, "learning_rate": 2.1718426257101103e-05, "loss": 0.4006, "step": 77115 }, { "epoch": 2.7793995747287994, "grad_norm": 0.1952604204416275, "learning_rate": 2.1715533394709807e-05, "loss": 0.3855, "step": 77120 }, { "epoch": 2.779579774390024, "grad_norm": 0.18268941342830658, "learning_rate": 2.1712640577068743e-05, "loss": 0.435, "step": 77125 }, { "epoch": 2.779759974051249, "grad_norm": 0.23162731528282166, "learning_rate": 2.1709747804217324e-05, "loss": 0.3652, "step": 77130 }, { "epoch": 2.7799401737124736, "grad_norm": 0.22689977288246155, "learning_rate": 2.170685507619493e-05, "loss": 0.3839, "step": 77135 }, { "epoch": 2.7801203733736983, "grad_norm": 0.2621477544307709, "learning_rate": 2.1703962393041015e-05, "loss": 0.4038, "step": 77140 }, { "epoch": 2.7803005730349226, "grad_norm": 0.22769121825695038, "learning_rate": 2.1701069754794966e-05, "loss": 0.4086, "step": 77145 }, { "epoch": 2.7804807726961474, "grad_norm": 0.21255040168762207, "learning_rate": 2.1698177161496205e-05, "loss": 0.4162, "step": 77150 }, { "epoch": 2.780660972357372, "grad_norm": 0.1877208799123764, "learning_rate": 2.1695284613184154e-05, "loss": 0.3944, "step": 77155 }, { "epoch": 2.7808411720185964, "grad_norm": 0.24076056480407715, "learning_rate": 2.1692392109898185e-05, "loss": 0.4121, "step": 77160 }, { "epoch": 2.781021371679821, "grad_norm": 0.22234618663787842, "learning_rate": 2.1689499651677754e-05, "loss": 0.3533, "step": 77165 }, { "epoch": 2.781201571341046, "grad_norm": 0.21065817773342133, "learning_rate": 2.1686607238562245e-05, "loss": 0.3904, "step": 77170 }, { "epoch": 2.7813817710022706, "grad_norm": 0.24153132736682892, 
"learning_rate": 2.1683714870591067e-05, "loss": 0.4196, "step": 77175 }, { "epoch": 2.7815619706634953, "grad_norm": 0.25936198234558105, "learning_rate": 2.1680822547803635e-05, "loss": 0.4077, "step": 77180 }, { "epoch": 2.78174217032472, "grad_norm": 0.257414847612381, "learning_rate": 2.1677930270239343e-05, "loss": 0.4039, "step": 77185 }, { "epoch": 2.7819223699859443, "grad_norm": 0.17230790853500366, "learning_rate": 2.167503803793762e-05, "loss": 0.3765, "step": 77190 }, { "epoch": 2.782102569647169, "grad_norm": 0.21409516036510468, "learning_rate": 2.167214585093786e-05, "loss": 0.3749, "step": 77195 }, { "epoch": 2.782282769308394, "grad_norm": 0.17981880903244019, "learning_rate": 2.1669253709279458e-05, "loss": 0.4021, "step": 77200 }, { "epoch": 2.782462968969618, "grad_norm": 0.1959773600101471, "learning_rate": 2.166636161300184e-05, "loss": 0.4146, "step": 77205 }, { "epoch": 2.782643168630843, "grad_norm": 0.30680885910987854, "learning_rate": 2.1663469562144395e-05, "loss": 0.3919, "step": 77210 }, { "epoch": 2.7828233682920676, "grad_norm": 0.2044559270143509, "learning_rate": 2.1660577556746527e-05, "loss": 0.391, "step": 77215 }, { "epoch": 2.7830035679532923, "grad_norm": 0.2195388227701187, "learning_rate": 2.165768559684765e-05, "loss": 0.395, "step": 77220 }, { "epoch": 2.783183767614517, "grad_norm": 0.2089926302433014, "learning_rate": 2.1654793682487157e-05, "loss": 0.3878, "step": 77225 }, { "epoch": 2.7833639672757418, "grad_norm": 0.21790191531181335, "learning_rate": 2.1651901813704452e-05, "loss": 0.4098, "step": 77230 }, { "epoch": 2.783544166936966, "grad_norm": 0.22714762389659882, "learning_rate": 2.1649009990538947e-05, "loss": 0.3506, "step": 77235 }, { "epoch": 2.783724366598191, "grad_norm": 0.18509992957115173, "learning_rate": 2.1646118213030015e-05, "loss": 0.3868, "step": 77240 }, { "epoch": 2.7839045662594155, "grad_norm": 0.1982908993959427, "learning_rate": 2.1643226481217084e-05, "loss": 0.4092, "step": 77245 }, { 
"epoch": 2.78408476592064, "grad_norm": 0.1664113849401474, "learning_rate": 2.1640334795139545e-05, "loss": 0.3934, "step": 77250 }, { "epoch": 2.7842649655818645, "grad_norm": 0.22491081058979034, "learning_rate": 2.163744315483678e-05, "loss": 0.4087, "step": 77255 }, { "epoch": 2.7844451652430893, "grad_norm": 0.22320492565631866, "learning_rate": 2.1634551560348213e-05, "loss": 0.4185, "step": 77260 }, { "epoch": 2.784625364904314, "grad_norm": 0.1935398429632187, "learning_rate": 2.1631660011713218e-05, "loss": 0.3915, "step": 77265 }, { "epoch": 2.7848055645655387, "grad_norm": 0.205779567360878, "learning_rate": 2.1628768508971213e-05, "loss": 0.4085, "step": 77270 }, { "epoch": 2.7849857642267635, "grad_norm": 0.19906838238239288, "learning_rate": 2.1625877052161586e-05, "loss": 0.4113, "step": 77275 }, { "epoch": 2.7851659638879878, "grad_norm": 0.23320987820625305, "learning_rate": 2.1622985641323724e-05, "loss": 0.3884, "step": 77280 }, { "epoch": 2.7853461635492125, "grad_norm": 0.22347044944763184, "learning_rate": 2.1620094276497033e-05, "loss": 0.3888, "step": 77285 }, { "epoch": 2.7855263632104372, "grad_norm": 0.27736568450927734, "learning_rate": 2.16172029577209e-05, "loss": 0.4216, "step": 77290 }, { "epoch": 2.785706562871662, "grad_norm": 0.2748807668685913, "learning_rate": 2.1614311685034726e-05, "loss": 0.4152, "step": 77295 }, { "epoch": 2.7858867625328863, "grad_norm": 0.2487899661064148, "learning_rate": 2.16114204584779e-05, "loss": 0.4088, "step": 77300 }, { "epoch": 2.786066962194111, "grad_norm": 0.21085642278194427, "learning_rate": 2.1608529278089808e-05, "loss": 0.3912, "step": 77305 }, { "epoch": 2.7862471618553357, "grad_norm": 0.19442741572856903, "learning_rate": 2.160563814390985e-05, "loss": 0.3965, "step": 77310 }, { "epoch": 2.7864273615165605, "grad_norm": 0.19160184264183044, "learning_rate": 2.1602747055977417e-05, "loss": 0.37, "step": 77315 }, { "epoch": 2.786607561177785, "grad_norm": 0.2595144212245941, 
"learning_rate": 2.1599856014331895e-05, "loss": 0.4168, "step": 77320 }, { "epoch": 2.7867877608390095, "grad_norm": 0.23235271871089935, "learning_rate": 2.1596965019012682e-05, "loss": 0.4176, "step": 77325 }, { "epoch": 2.786967960500234, "grad_norm": 0.1904749721288681, "learning_rate": 2.1594074070059155e-05, "loss": 0.3688, "step": 77330 }, { "epoch": 2.787148160161459, "grad_norm": 0.22936037182807922, "learning_rate": 2.1591183167510714e-05, "loss": 0.3988, "step": 77335 }, { "epoch": 2.7873283598226837, "grad_norm": 0.21783271431922913, "learning_rate": 2.158829231140675e-05, "loss": 0.4026, "step": 77340 }, { "epoch": 2.787508559483908, "grad_norm": 0.193325474858284, "learning_rate": 2.1585401501786622e-05, "loss": 0.4321, "step": 77345 }, { "epoch": 2.7876887591451327, "grad_norm": 0.24753624200820923, "learning_rate": 2.158251073868976e-05, "loss": 0.4161, "step": 77350 }, { "epoch": 2.7878689588063574, "grad_norm": 0.18600989878177643, "learning_rate": 2.1579620022155516e-05, "loss": 0.3775, "step": 77355 }, { "epoch": 2.788049158467582, "grad_norm": 0.19241438806056976, "learning_rate": 2.1576729352223285e-05, "loss": 0.395, "step": 77360 }, { "epoch": 2.788229358128807, "grad_norm": 0.2080182284116745, "learning_rate": 2.1573838728932462e-05, "loss": 0.3651, "step": 77365 }, { "epoch": 2.788409557790031, "grad_norm": 0.23465201258659363, "learning_rate": 2.157094815232241e-05, "loss": 0.3976, "step": 77370 }, { "epoch": 2.788589757451256, "grad_norm": 0.21152657270431519, "learning_rate": 2.1568057622432536e-05, "loss": 0.4053, "step": 77375 }, { "epoch": 2.7887699571124807, "grad_norm": 0.20607246458530426, "learning_rate": 2.1565167139302213e-05, "loss": 0.3984, "step": 77380 }, { "epoch": 2.7889501567737054, "grad_norm": 0.20786413550376892, "learning_rate": 2.1562276702970816e-05, "loss": 0.3772, "step": 77385 }, { "epoch": 2.7891303564349297, "grad_norm": 0.21368098258972168, "learning_rate": 2.155938631347774e-05, "loss": 0.3946, "step": 
77390 }, { "epoch": 2.7893105560961544, "grad_norm": 0.22205038368701935, "learning_rate": 2.1556495970862356e-05, "loss": 0.3638, "step": 77395 }, { "epoch": 2.789490755757379, "grad_norm": 0.22499975562095642, "learning_rate": 2.1553605675164047e-05, "loss": 0.4032, "step": 77400 }, { "epoch": 2.789670955418604, "grad_norm": 0.2418082356452942, "learning_rate": 2.1550715426422194e-05, "loss": 0.3953, "step": 77405 }, { "epoch": 2.7898511550798286, "grad_norm": 0.19211691617965698, "learning_rate": 2.1547825224676173e-05, "loss": 0.3994, "step": 77410 }, { "epoch": 2.7900313547410533, "grad_norm": 0.2310890257358551, "learning_rate": 2.154493506996537e-05, "loss": 0.3642, "step": 77415 }, { "epoch": 2.7902115544022776, "grad_norm": 0.24673838913440704, "learning_rate": 2.1542044962329163e-05, "loss": 0.4185, "step": 77420 }, { "epoch": 2.7903917540635024, "grad_norm": 0.2144293338060379, "learning_rate": 2.153915490180691e-05, "loss": 0.4018, "step": 77425 }, { "epoch": 2.790571953724727, "grad_norm": 0.1878860741853714, "learning_rate": 2.1536264888438005e-05, "loss": 0.3942, "step": 77430 }, { "epoch": 2.7907521533859514, "grad_norm": 0.2370370328426361, "learning_rate": 2.1533374922261835e-05, "loss": 0.3942, "step": 77435 }, { "epoch": 2.790932353047176, "grad_norm": 0.20887020230293274, "learning_rate": 2.153048500331774e-05, "loss": 0.4095, "step": 77440 }, { "epoch": 2.791112552708401, "grad_norm": 0.21834127604961395, "learning_rate": 2.152759513164513e-05, "loss": 0.379, "step": 77445 }, { "epoch": 2.7912927523696256, "grad_norm": 0.16670918464660645, "learning_rate": 2.1524705307283348e-05, "loss": 0.3774, "step": 77450 }, { "epoch": 2.7914729520308503, "grad_norm": 0.20966099202632904, "learning_rate": 2.15218155302718e-05, "loss": 0.3779, "step": 77455 }, { "epoch": 2.791653151692075, "grad_norm": 0.1819358766078949, "learning_rate": 2.1518925800649836e-05, "loss": 0.4119, "step": 77460 }, { "epoch": 2.7918333513532994, "grad_norm": 
0.22907419502735138, "learning_rate": 2.151603611845683e-05, "loss": 0.3981, "step": 77465 }, { "epoch": 2.792013551014524, "grad_norm": 0.2390015870332718, "learning_rate": 2.1513146483732163e-05, "loss": 0.3907, "step": 77470 }, { "epoch": 2.792193750675749, "grad_norm": 0.20397347211837769, "learning_rate": 2.1510256896515195e-05, "loss": 0.3836, "step": 77475 }, { "epoch": 2.792373950336973, "grad_norm": 0.2034919261932373, "learning_rate": 2.1507367356845304e-05, "loss": 0.398, "step": 77480 }, { "epoch": 2.792554149998198, "grad_norm": 0.1811286211013794, "learning_rate": 2.150447786476186e-05, "loss": 0.3666, "step": 77485 }, { "epoch": 2.7927343496594226, "grad_norm": 0.23901815712451935, "learning_rate": 2.1501588420304218e-05, "loss": 0.4266, "step": 77490 }, { "epoch": 2.7929145493206473, "grad_norm": 0.23680728673934937, "learning_rate": 2.1498699023511766e-05, "loss": 0.419, "step": 77495 }, { "epoch": 2.793094748981872, "grad_norm": 0.22944191098213196, "learning_rate": 2.1495809674423865e-05, "loss": 0.3911, "step": 77500 }, { "epoch": 2.793094748981872, "eval_loss": 0.4314689338207245, "eval_runtime": 3.521, "eval_samples_per_second": 28.401, "eval_steps_per_second": 7.1, "step": 77500 }, { "epoch": 2.7932749486430968, "grad_norm": 0.22808948159217834, "learning_rate": 2.1492920373079867e-05, "loss": 0.3854, "step": 77505 }, { "epoch": 2.793455148304321, "grad_norm": 0.17345234751701355, "learning_rate": 2.149003111951916e-05, "loss": 0.3919, "step": 77510 }, { "epoch": 2.793635347965546, "grad_norm": 0.18954497575759888, "learning_rate": 2.148714191378109e-05, "loss": 0.3631, "step": 77515 }, { "epoch": 2.7938155476267705, "grad_norm": 0.28941574692726135, "learning_rate": 2.148425275590504e-05, "loss": 0.396, "step": 77520 }, { "epoch": 2.793995747287995, "grad_norm": 0.2047412395477295, "learning_rate": 2.1481363645930367e-05, "loss": 0.3768, "step": 77525 }, { "epoch": 2.7941759469492196, "grad_norm": 0.17408131062984467, "learning_rate": 
2.147847458389642e-05, "loss": 0.409, "step": 77530 }, { "epoch": 2.7943561466104443, "grad_norm": 0.23604151606559753, "learning_rate": 2.147558556984259e-05, "loss": 0.3933, "step": 77535 }, { "epoch": 2.794536346271669, "grad_norm": 0.22431814670562744, "learning_rate": 2.147269660380822e-05, "loss": 0.4178, "step": 77540 }, { "epoch": 2.7947165459328938, "grad_norm": 0.2082727700471878, "learning_rate": 2.1469807685832672e-05, "loss": 0.3803, "step": 77545 }, { "epoch": 2.7948967455941185, "grad_norm": 0.2045706808567047, "learning_rate": 2.146691881595531e-05, "loss": 0.401, "step": 77550 }, { "epoch": 2.795076945255343, "grad_norm": 0.17748849093914032, "learning_rate": 2.146402999421549e-05, "loss": 0.3753, "step": 77555 }, { "epoch": 2.7952571449165675, "grad_norm": 0.19480100274085999, "learning_rate": 2.1461141220652593e-05, "loss": 0.407, "step": 77560 }, { "epoch": 2.7954373445777922, "grad_norm": 0.2130713313817978, "learning_rate": 2.1458252495305954e-05, "loss": 0.368, "step": 77565 }, { "epoch": 2.795617544239017, "grad_norm": 0.22572968900203705, "learning_rate": 2.145536381821493e-05, "loss": 0.396, "step": 77570 }, { "epoch": 2.7957977439002413, "grad_norm": 0.21448364853858948, "learning_rate": 2.14524751894189e-05, "loss": 0.3612, "step": 77575 }, { "epoch": 2.795977943561466, "grad_norm": 0.2889059782028198, "learning_rate": 2.1449586608957207e-05, "loss": 0.4111, "step": 77580 }, { "epoch": 2.7961581432226907, "grad_norm": 0.23263846337795258, "learning_rate": 2.1446698076869203e-05, "loss": 0.4039, "step": 77585 }, { "epoch": 2.7963383428839155, "grad_norm": 0.22178532183170319, "learning_rate": 2.1443809593194253e-05, "loss": 0.4065, "step": 77590 }, { "epoch": 2.79651854254514, "grad_norm": 0.19402441382408142, "learning_rate": 2.1440921157971706e-05, "loss": 0.3759, "step": 77595 }, { "epoch": 2.7966987422063645, "grad_norm": 0.22122874855995178, "learning_rate": 2.1438032771240925e-05, "loss": 0.4054, "step": 77600 }, { "epoch": 
2.7968789418675892, "grad_norm": 0.2299935668706894, "learning_rate": 2.143514443304126e-05, "loss": 0.3833, "step": 77605 }, { "epoch": 2.797059141528814, "grad_norm": 0.23036040365695953, "learning_rate": 2.143225614341205e-05, "loss": 0.4046, "step": 77610 }, { "epoch": 2.7972393411900387, "grad_norm": 0.21337628364562988, "learning_rate": 2.1429367902392668e-05, "loss": 0.3539, "step": 77615 }, { "epoch": 2.797419540851263, "grad_norm": 0.2017783671617508, "learning_rate": 2.1426479710022463e-05, "loss": 0.3555, "step": 77620 }, { "epoch": 2.7975997405124877, "grad_norm": 0.2635432183742523, "learning_rate": 2.142359156634076e-05, "loss": 0.4249, "step": 77625 }, { "epoch": 2.7977799401737125, "grad_norm": 0.1856364905834198, "learning_rate": 2.1420703471386952e-05, "loss": 0.41, "step": 77630 }, { "epoch": 2.797960139834937, "grad_norm": 0.1753203123807907, "learning_rate": 2.1417815425200346e-05, "loss": 0.3796, "step": 77635 }, { "epoch": 2.798140339496162, "grad_norm": 0.23167075216770172, "learning_rate": 2.141492742782033e-05, "loss": 0.3789, "step": 77640 }, { "epoch": 2.7983205391573867, "grad_norm": 0.17792221903800964, "learning_rate": 2.141203947928623e-05, "loss": 0.3429, "step": 77645 }, { "epoch": 2.798500738818611, "grad_norm": 0.23163169622421265, "learning_rate": 2.1409151579637386e-05, "loss": 0.4213, "step": 77650 }, { "epoch": 2.7986809384798357, "grad_norm": 0.21783442795276642, "learning_rate": 2.1406263728913163e-05, "loss": 0.3908, "step": 77655 }, { "epoch": 2.7988611381410604, "grad_norm": 0.2356899380683899, "learning_rate": 2.1403375927152903e-05, "loss": 0.3945, "step": 77660 }, { "epoch": 2.7990413378022847, "grad_norm": 0.23855198919773102, "learning_rate": 2.1400488174395946e-05, "loss": 0.4182, "step": 77665 }, { "epoch": 2.7992215374635094, "grad_norm": 0.20778150856494904, "learning_rate": 2.1397600470681643e-05, "loss": 0.3909, "step": 77670 }, { "epoch": 2.799401737124734, "grad_norm": 0.19535931944847107, "learning_rate": 
2.1394712816049332e-05, "loss": 0.3873, "step": 77675 }, { "epoch": 2.799581936785959, "grad_norm": 0.21138401329517365, "learning_rate": 2.1391825210538366e-05, "loss": 0.4423, "step": 77680 }, { "epoch": 2.7997621364471836, "grad_norm": 0.25198328495025635, "learning_rate": 2.138893765418808e-05, "loss": 0.417, "step": 77685 }, { "epoch": 2.7999423361084084, "grad_norm": 0.1913152039051056, "learning_rate": 2.138605014703782e-05, "loss": 0.3844, "step": 77690 }, { "epoch": 2.8001225357696327, "grad_norm": 0.19458575546741486, "learning_rate": 2.138316268912693e-05, "loss": 0.3818, "step": 77695 }, { "epoch": 2.8003027354308574, "grad_norm": 0.25212234258651733, "learning_rate": 2.138027528049474e-05, "loss": 0.3994, "step": 77700 }, { "epoch": 2.800482935092082, "grad_norm": 0.19638745486736298, "learning_rate": 2.137738792118061e-05, "loss": 0.3779, "step": 77705 }, { "epoch": 2.8006631347533064, "grad_norm": 0.2299245297908783, "learning_rate": 2.1374500611223867e-05, "loss": 0.4019, "step": 77710 }, { "epoch": 2.800843334414531, "grad_norm": 0.2781182527542114, "learning_rate": 2.1371613350663837e-05, "loss": 0.3782, "step": 77715 }, { "epoch": 2.801023534075756, "grad_norm": 0.2580137848854065, "learning_rate": 2.1368726139539885e-05, "loss": 0.4054, "step": 77720 }, { "epoch": 2.8012037337369806, "grad_norm": 0.2500860095024109, "learning_rate": 2.1365838977891344e-05, "loss": 0.4145, "step": 77725 }, { "epoch": 2.8013839333982054, "grad_norm": 0.26811492443084717, "learning_rate": 2.1362951865757523e-05, "loss": 0.3767, "step": 77730 }, { "epoch": 2.80156413305943, "grad_norm": 0.22250352799892426, "learning_rate": 2.1360064803177793e-05, "loss": 0.3912, "step": 77735 }, { "epoch": 2.8017443327206544, "grad_norm": 0.16757109761238098, "learning_rate": 2.1357177790191463e-05, "loss": 0.3749, "step": 77740 }, { "epoch": 2.801924532381879, "grad_norm": 0.1849580705165863, "learning_rate": 2.13542908268379e-05, "loss": 0.3972, "step": 77745 }, { "epoch": 
2.802104732043104, "grad_norm": 0.2086080014705658, "learning_rate": 2.1351403913156403e-05, "loss": 0.4013, "step": 77750 }, { "epoch": 2.802284931704328, "grad_norm": 0.22372741997241974, "learning_rate": 2.1348517049186323e-05, "loss": 0.4117, "step": 77755 }, { "epoch": 2.802465131365553, "grad_norm": 0.18635831773281097, "learning_rate": 2.1345630234966997e-05, "loss": 0.3779, "step": 77760 }, { "epoch": 2.8026453310267776, "grad_norm": 0.19005261361598969, "learning_rate": 2.134274347053775e-05, "loss": 0.4248, "step": 77765 }, { "epoch": 2.8028255306880023, "grad_norm": 0.19459028542041779, "learning_rate": 2.133985675593791e-05, "loss": 0.4361, "step": 77770 }, { "epoch": 2.803005730349227, "grad_norm": 0.19851277768611908, "learning_rate": 2.1336970091206814e-05, "loss": 0.3814, "step": 77775 }, { "epoch": 2.803185930010452, "grad_norm": 0.1843992918729782, "learning_rate": 2.1334083476383794e-05, "loss": 0.4244, "step": 77780 }, { "epoch": 2.803366129671676, "grad_norm": 0.24044600129127502, "learning_rate": 2.133119691150818e-05, "loss": 0.3871, "step": 77785 }, { "epoch": 2.803546329332901, "grad_norm": 0.1807718724012375, "learning_rate": 2.1328310396619296e-05, "loss": 0.4099, "step": 77790 }, { "epoch": 2.8037265289941256, "grad_norm": 0.22799517214298248, "learning_rate": 2.1325423931756463e-05, "loss": 0.3899, "step": 77795 }, { "epoch": 2.8039067286553503, "grad_norm": 0.2450421303510666, "learning_rate": 2.1322537516959026e-05, "loss": 0.3654, "step": 77800 }, { "epoch": 2.8040869283165746, "grad_norm": 0.21578456461429596, "learning_rate": 2.1319651152266313e-05, "loss": 0.3766, "step": 77805 }, { "epoch": 2.8042671279777993, "grad_norm": 0.22823040187358856, "learning_rate": 2.1316764837717618e-05, "loss": 0.4372, "step": 77810 }, { "epoch": 2.804447327639024, "grad_norm": 0.2035275250673294, "learning_rate": 2.131387857335231e-05, "loss": 0.3643, "step": 77815 }, { "epoch": 2.804627527300249, "grad_norm": 0.23069792985916138, "learning_rate": 
2.1310992359209673e-05, "loss": 0.3928, "step": 77820 }, { "epoch": 2.8048077269614735, "grad_norm": 0.18597543239593506, "learning_rate": 2.130810619532907e-05, "loss": 0.3802, "step": 77825 }, { "epoch": 2.804987926622698, "grad_norm": 0.14916902780532837, "learning_rate": 2.1305220081749798e-05, "loss": 0.3602, "step": 77830 }, { "epoch": 2.8051681262839225, "grad_norm": 0.20437051355838776, "learning_rate": 2.130233401851118e-05, "loss": 0.3659, "step": 77835 }, { "epoch": 2.8053483259451473, "grad_norm": 0.25175103545188904, "learning_rate": 2.1299448005652552e-05, "loss": 0.3947, "step": 77840 }, { "epoch": 2.805528525606372, "grad_norm": 0.18190394341945648, "learning_rate": 2.129656204321323e-05, "loss": 0.3906, "step": 77845 }, { "epoch": 2.8057087252675963, "grad_norm": 0.16877366602420807, "learning_rate": 2.1293676131232526e-05, "loss": 0.3896, "step": 77850 }, { "epoch": 2.805888924928821, "grad_norm": 0.19917549192905426, "learning_rate": 2.1290790269749774e-05, "loss": 0.3843, "step": 77855 }, { "epoch": 2.8060691245900458, "grad_norm": 0.23629820346832275, "learning_rate": 2.128790445880428e-05, "loss": 0.3683, "step": 77860 }, { "epoch": 2.8062493242512705, "grad_norm": 0.2526760697364807, "learning_rate": 2.128501869843538e-05, "loss": 0.3883, "step": 77865 }, { "epoch": 2.8064295239124952, "grad_norm": 0.22685180604457855, "learning_rate": 2.1282132988682374e-05, "loss": 0.3851, "step": 77870 }, { "epoch": 2.8066097235737195, "grad_norm": 0.24433378875255585, "learning_rate": 2.1279247329584582e-05, "loss": 0.4229, "step": 77875 }, { "epoch": 2.8067899232349443, "grad_norm": 0.2599703371524811, "learning_rate": 2.1276361721181332e-05, "loss": 0.4136, "step": 77880 }, { "epoch": 2.806970122896169, "grad_norm": 0.22800195217132568, "learning_rate": 2.1273476163511924e-05, "loss": 0.3937, "step": 77885 }, { "epoch": 2.8071503225573937, "grad_norm": 0.26085367798805237, "learning_rate": 2.127059065661569e-05, "loss": 0.4329, "step": 77890 }, { 
"epoch": 2.807330522218618, "grad_norm": 0.1912272721529007, "learning_rate": 2.1267705200531933e-05, "loss": 0.4017, "step": 77895 }, { "epoch": 2.8075107218798427, "grad_norm": 0.2735271751880646, "learning_rate": 2.126481979529997e-05, "loss": 0.4036, "step": 77900 }, { "epoch": 2.8076909215410675, "grad_norm": 0.2603655755519867, "learning_rate": 2.1261934440959115e-05, "loss": 0.4244, "step": 77905 }, { "epoch": 2.807871121202292, "grad_norm": 0.21912328898906708, "learning_rate": 2.1259049137548686e-05, "loss": 0.4253, "step": 77910 }, { "epoch": 2.808051320863517, "grad_norm": 0.19126076996326447, "learning_rate": 2.1256163885107973e-05, "loss": 0.4077, "step": 77915 }, { "epoch": 2.8082315205247417, "grad_norm": 0.19802503287792206, "learning_rate": 2.125327868367632e-05, "loss": 0.3619, "step": 77920 }, { "epoch": 2.808411720185966, "grad_norm": 0.2620820105075836, "learning_rate": 2.1250393533293e-05, "loss": 0.4068, "step": 77925 }, { "epoch": 2.8085919198471907, "grad_norm": 0.269182413816452, "learning_rate": 2.124750843399736e-05, "loss": 0.4109, "step": 77930 }, { "epoch": 2.8087721195084154, "grad_norm": 0.2248949408531189, "learning_rate": 2.1244623385828687e-05, "loss": 0.4336, "step": 77935 }, { "epoch": 2.8089523191696397, "grad_norm": 0.2180756777524948, "learning_rate": 2.1241738388826288e-05, "loss": 0.3815, "step": 77940 }, { "epoch": 2.8091325188308645, "grad_norm": 0.22202380001544952, "learning_rate": 2.1238853443029476e-05, "loss": 0.3617, "step": 77945 }, { "epoch": 2.809312718492089, "grad_norm": 0.24584850668907166, "learning_rate": 2.1235968548477564e-05, "loss": 0.3859, "step": 77950 }, { "epoch": 2.809492918153314, "grad_norm": 0.18127258121967316, "learning_rate": 2.1233083705209845e-05, "loss": 0.3916, "step": 77955 }, { "epoch": 2.8096731178145387, "grad_norm": 0.2721177637577057, "learning_rate": 2.1230198913265635e-05, "loss": 0.3872, "step": 77960 }, { "epoch": 2.8098533174757634, "grad_norm": 0.23304437100887299, 
"learning_rate": 2.122731417268423e-05, "loss": 0.4107, "step": 77965 }, { "epoch": 2.8100335171369877, "grad_norm": 0.21200516819953918, "learning_rate": 2.122442948350494e-05, "loss": 0.4215, "step": 77970 }, { "epoch": 2.8102137167982124, "grad_norm": 0.1965387612581253, "learning_rate": 2.1221544845767074e-05, "loss": 0.4209, "step": 77975 }, { "epoch": 2.810393916459437, "grad_norm": 0.19175685942173004, "learning_rate": 2.121866025950992e-05, "loss": 0.383, "step": 77980 }, { "epoch": 2.8105741161206614, "grad_norm": 0.2140149623155594, "learning_rate": 2.1215775724772794e-05, "loss": 0.4087, "step": 77985 }, { "epoch": 2.810754315781886, "grad_norm": 0.239614799618721, "learning_rate": 2.1212891241594996e-05, "loss": 0.3841, "step": 77990 }, { "epoch": 2.810934515443111, "grad_norm": 0.24127493798732758, "learning_rate": 2.1210006810015803e-05, "loss": 0.375, "step": 77995 }, { "epoch": 2.8111147151043356, "grad_norm": 0.20837131142616272, "learning_rate": 2.120712243007455e-05, "loss": 0.4066, "step": 78000 }, { "epoch": 2.8111147151043356, "eval_loss": 0.4307841956615448, "eval_runtime": 3.5376, "eval_samples_per_second": 28.268, "eval_steps_per_second": 7.067, "step": 78000 }, { "epoch": 2.8112949147655604, "grad_norm": 0.21309497952461243, "learning_rate": 2.1204238101810507e-05, "loss": 0.3937, "step": 78005 }, { "epoch": 2.811475114426785, "grad_norm": 0.23278546333312988, "learning_rate": 2.1201353825262996e-05, "loss": 0.4088, "step": 78010 }, { "epoch": 2.8116553140880094, "grad_norm": 0.1590275913476944, "learning_rate": 2.1198469600471308e-05, "loss": 0.3719, "step": 78015 }, { "epoch": 2.811835513749234, "grad_norm": 0.19892996549606323, "learning_rate": 2.119558542747472e-05, "loss": 0.3786, "step": 78020 }, { "epoch": 2.812015713410459, "grad_norm": 0.22030261158943176, "learning_rate": 2.1192701306312556e-05, "loss": 0.3856, "step": 78025 }, { "epoch": 2.812195913071683, "grad_norm": 0.2299405336380005, "learning_rate": 2.11898172370241e-05, 
"loss": 0.4055, "step": 78030 }, { "epoch": 2.812376112732908, "grad_norm": 0.22635617852210999, "learning_rate": 2.1186933219648636e-05, "loss": 0.3831, "step": 78035 }, { "epoch": 2.8125563123941326, "grad_norm": 0.1920858919620514, "learning_rate": 2.1184049254225477e-05, "loss": 0.3861, "step": 78040 }, { "epoch": 2.8127365120553574, "grad_norm": 0.256536066532135, "learning_rate": 2.1181165340793902e-05, "loss": 0.4304, "step": 78045 }, { "epoch": 2.812916711716582, "grad_norm": 0.1779921054840088, "learning_rate": 2.1178281479393213e-05, "loss": 0.3844, "step": 78050 }, { "epoch": 2.813096911377807, "grad_norm": 0.20656806230545044, "learning_rate": 2.11753976700627e-05, "loss": 0.3907, "step": 78055 }, { "epoch": 2.813277111039031, "grad_norm": 0.2381805181503296, "learning_rate": 2.117251391284165e-05, "loss": 0.4102, "step": 78060 }, { "epoch": 2.813457310700256, "grad_norm": 0.2606278657913208, "learning_rate": 2.1169630207769366e-05, "loss": 0.3904, "step": 78065 }, { "epoch": 2.8136375103614806, "grad_norm": 0.24265384674072266, "learning_rate": 2.116674655488512e-05, "loss": 0.3964, "step": 78070 }, { "epoch": 2.8138177100227053, "grad_norm": 0.20753905177116394, "learning_rate": 2.116386295422821e-05, "loss": 0.395, "step": 78075 }, { "epoch": 2.8139979096839296, "grad_norm": 0.19484953582286835, "learning_rate": 2.116097940583793e-05, "loss": 0.4158, "step": 78080 }, { "epoch": 2.8141781093451543, "grad_norm": 0.17793631553649902, "learning_rate": 2.1158095909753555e-05, "loss": 0.404, "step": 78085 }, { "epoch": 2.814358309006379, "grad_norm": 0.226637065410614, "learning_rate": 2.1155212466014383e-05, "loss": 0.3846, "step": 78090 }, { "epoch": 2.814538508667604, "grad_norm": 0.24022680521011353, "learning_rate": 2.1152329074659707e-05, "loss": 0.3918, "step": 78095 }, { "epoch": 2.8147187083288285, "grad_norm": 0.23119503259658813, "learning_rate": 2.1149445735728784e-05, "loss": 0.3771, "step": 78100 }, { "epoch": 2.814898907990053, "grad_norm": 
0.1578037291765213, "learning_rate": 2.1146562449260933e-05, "loss": 0.3745, "step": 78105 }, { "epoch": 2.8150791076512776, "grad_norm": 0.22053155303001404, "learning_rate": 2.1143679215295407e-05, "loss": 0.4288, "step": 78110 }, { "epoch": 2.8152593073125023, "grad_norm": 0.21788404881954193, "learning_rate": 2.114079603387152e-05, "loss": 0.4265, "step": 78115 }, { "epoch": 2.815439506973727, "grad_norm": 0.21442186832427979, "learning_rate": 2.1137912905028537e-05, "loss": 0.362, "step": 78120 }, { "epoch": 2.8156197066349513, "grad_norm": 0.2007633000612259, "learning_rate": 2.113502982880573e-05, "loss": 0.3879, "step": 78125 }, { "epoch": 2.815799906296176, "grad_norm": 0.23273354768753052, "learning_rate": 2.113214680524241e-05, "loss": 0.386, "step": 78130 }, { "epoch": 2.815980105957401, "grad_norm": 0.23062081634998322, "learning_rate": 2.1129263834377838e-05, "loss": 0.3886, "step": 78135 }, { "epoch": 2.8161603056186255, "grad_norm": 0.20802180469036102, "learning_rate": 2.1126380916251287e-05, "loss": 0.4092, "step": 78140 }, { "epoch": 2.8163405052798502, "grad_norm": 0.27845528721809387, "learning_rate": 2.1123498050902055e-05, "loss": 0.4517, "step": 78145 }, { "epoch": 2.816520704941075, "grad_norm": 0.20084762573242188, "learning_rate": 2.1120615238369407e-05, "loss": 0.4151, "step": 78150 }, { "epoch": 2.8167009046022993, "grad_norm": 0.2194603532552719, "learning_rate": 2.1117732478692627e-05, "loss": 0.4013, "step": 78155 }, { "epoch": 2.816881104263524, "grad_norm": 0.18530698120594025, "learning_rate": 2.111484977191099e-05, "loss": 0.37, "step": 78160 }, { "epoch": 2.8170613039247487, "grad_norm": 0.21759265661239624, "learning_rate": 2.1111967118063772e-05, "loss": 0.3906, "step": 78165 }, { "epoch": 2.817241503585973, "grad_norm": 0.18584828078746796, "learning_rate": 2.110908451719025e-05, "loss": 0.3837, "step": 78170 }, { "epoch": 2.8174217032471978, "grad_norm": 0.23392297327518463, "learning_rate": 2.11062019693297e-05, "loss": 
0.4138, "step": 78175 }, { "epoch": 2.8176019029084225, "grad_norm": 0.21533243358135223, "learning_rate": 2.110331947452139e-05, "loss": 0.4038, "step": 78180 }, { "epoch": 2.8177821025696472, "grad_norm": 0.18144142627716064, "learning_rate": 2.11004370328046e-05, "loss": 0.4086, "step": 78185 }, { "epoch": 2.817962302230872, "grad_norm": 0.1902519166469574, "learning_rate": 2.10975546442186e-05, "loss": 0.366, "step": 78190 }, { "epoch": 2.8181425018920967, "grad_norm": 0.259445458650589, "learning_rate": 2.1094672308802666e-05, "loss": 0.448, "step": 78195 }, { "epoch": 2.818322701553321, "grad_norm": 0.2690201997756958, "learning_rate": 2.109179002659607e-05, "loss": 0.393, "step": 78200 }, { "epoch": 2.8185029012145457, "grad_norm": 0.17392581701278687, "learning_rate": 2.108890779763806e-05, "loss": 0.4098, "step": 78205 }, { "epoch": 2.8186831008757705, "grad_norm": 0.22515340149402618, "learning_rate": 2.108602562196794e-05, "loss": 0.392, "step": 78210 }, { "epoch": 2.8188633005369947, "grad_norm": 0.27040407061576843, "learning_rate": 2.1083143499624965e-05, "loss": 0.4095, "step": 78215 }, { "epoch": 2.8190435001982195, "grad_norm": 0.2060219943523407, "learning_rate": 2.108026143064839e-05, "loss": 0.4017, "step": 78220 }, { "epoch": 2.819223699859444, "grad_norm": 0.2325987070798874, "learning_rate": 2.10773794150775e-05, "loss": 0.3988, "step": 78225 }, { "epoch": 2.819403899520669, "grad_norm": 0.2505181133747101, "learning_rate": 2.107449745295155e-05, "loss": 0.3817, "step": 78230 }, { "epoch": 2.8195840991818937, "grad_norm": 0.1897851973772049, "learning_rate": 2.107161554430982e-05, "loss": 0.4017, "step": 78235 }, { "epoch": 2.8197642988431184, "grad_norm": 0.19039909541606903, "learning_rate": 2.1068733689191567e-05, "loss": 0.399, "step": 78240 }, { "epoch": 2.8199444985043427, "grad_norm": 0.20617642998695374, "learning_rate": 2.106585188763605e-05, "loss": 0.4053, "step": 78245 }, { "epoch": 2.8201246981655674, "grad_norm": 
0.26967623829841614, "learning_rate": 2.1062970139682546e-05, "loss": 0.3708, "step": 78250 }, { "epoch": 2.820304897826792, "grad_norm": 0.1973973661661148, "learning_rate": 2.106008844537031e-05, "loss": 0.4058, "step": 78255 }, { "epoch": 2.8204850974880165, "grad_norm": 0.2256660908460617, "learning_rate": 2.1057206804738602e-05, "loss": 0.3945, "step": 78260 }, { "epoch": 2.820665297149241, "grad_norm": 0.2466830313205719, "learning_rate": 2.1054325217826694e-05, "loss": 0.4094, "step": 78265 }, { "epoch": 2.820845496810466, "grad_norm": 0.22766727209091187, "learning_rate": 2.1051443684673832e-05, "loss": 0.4063, "step": 78270 }, { "epoch": 2.8210256964716907, "grad_norm": 0.19182877242565155, "learning_rate": 2.1048562205319295e-05, "loss": 0.3953, "step": 78275 }, { "epoch": 2.8212058961329154, "grad_norm": 0.2543238699436188, "learning_rate": 2.1045680779802336e-05, "loss": 0.3709, "step": 78280 }, { "epoch": 2.82138609579414, "grad_norm": 0.16309164464473724, "learning_rate": 2.1042799408162194e-05, "loss": 0.3363, "step": 78285 }, { "epoch": 2.8215662954553644, "grad_norm": 0.1783873587846756, "learning_rate": 2.1039918090438156e-05, "loss": 0.3677, "step": 78290 }, { "epoch": 2.821746495116589, "grad_norm": 0.2764756679534912, "learning_rate": 2.1037036826669463e-05, "loss": 0.4055, "step": 78295 }, { "epoch": 2.821926694777814, "grad_norm": 0.23792728781700134, "learning_rate": 2.103415561689538e-05, "loss": 0.4224, "step": 78300 }, { "epoch": 2.8221068944390386, "grad_norm": 0.18397817015647888, "learning_rate": 2.1031274461155164e-05, "loss": 0.3565, "step": 78305 }, { "epoch": 2.822287094100263, "grad_norm": 0.16802199184894562, "learning_rate": 2.1028393359488048e-05, "loss": 0.3974, "step": 78310 }, { "epoch": 2.8224672937614876, "grad_norm": 0.20538485050201416, "learning_rate": 2.1025512311933324e-05, "loss": 0.3369, "step": 78315 }, { "epoch": 2.8226474934227124, "grad_norm": 0.2252337634563446, "learning_rate": 2.102263131853022e-05, "loss": 
0.3967, "step": 78320 }, { "epoch": 2.822827693083937, "grad_norm": 0.2148447036743164, "learning_rate": 2.101975037931798e-05, "loss": 0.4041, "step": 78325 }, { "epoch": 2.823007892745162, "grad_norm": 0.24140605330467224, "learning_rate": 2.1016869494335882e-05, "loss": 0.365, "step": 78330 }, { "epoch": 2.823188092406386, "grad_norm": 0.21468763053417206, "learning_rate": 2.1013988663623165e-05, "loss": 0.3998, "step": 78335 }, { "epoch": 2.823368292067611, "grad_norm": 0.18902160227298737, "learning_rate": 2.101110788721908e-05, "loss": 0.3891, "step": 78340 }, { "epoch": 2.8235484917288356, "grad_norm": 0.18640626966953278, "learning_rate": 2.1008227165162877e-05, "loss": 0.3851, "step": 78345 }, { "epoch": 2.8237286913900603, "grad_norm": 0.21738696098327637, "learning_rate": 2.1005346497493807e-05, "loss": 0.4259, "step": 78350 }, { "epoch": 2.8239088910512846, "grad_norm": 0.18691019713878632, "learning_rate": 2.1002465884251116e-05, "loss": 0.3784, "step": 78355 }, { "epoch": 2.8240890907125094, "grad_norm": 0.17924901843070984, "learning_rate": 2.0999585325474057e-05, "loss": 0.3571, "step": 78360 }, { "epoch": 2.824269290373734, "grad_norm": 0.20223569869995117, "learning_rate": 2.0996704821201867e-05, "loss": 0.3781, "step": 78365 }, { "epoch": 2.824449490034959, "grad_norm": 0.2247609794139862, "learning_rate": 2.09938243714738e-05, "loss": 0.3776, "step": 78370 }, { "epoch": 2.8246296896961836, "grad_norm": 0.2532680034637451, "learning_rate": 2.09909439763291e-05, "loss": 0.4274, "step": 78375 }, { "epoch": 2.824809889357408, "grad_norm": 0.24293555319309235, "learning_rate": 2.0988063635807022e-05, "loss": 0.3768, "step": 78380 }, { "epoch": 2.8249900890186326, "grad_norm": 0.17587457597255707, "learning_rate": 2.09851833499468e-05, "loss": 0.3666, "step": 78385 }, { "epoch": 2.8251702886798573, "grad_norm": 0.20782296359539032, "learning_rate": 2.0982303118787662e-05, "loss": 0.4201, "step": 78390 }, { "epoch": 2.825350488341082, "grad_norm": 
0.200978621840477, "learning_rate": 2.0979422942368882e-05, "loss": 0.3939, "step": 78395 }, { "epoch": 2.8255306880023063, "grad_norm": 0.18631309270858765, "learning_rate": 2.097654282072968e-05, "loss": 0.4075, "step": 78400 }, { "epoch": 2.825710887663531, "grad_norm": 0.22163410484790802, "learning_rate": 2.09736627539093e-05, "loss": 0.3522, "step": 78405 }, { "epoch": 2.825891087324756, "grad_norm": 0.18656250834465027, "learning_rate": 2.0970782741946987e-05, "loss": 0.4021, "step": 78410 }, { "epoch": 2.8260712869859805, "grad_norm": 0.23782800137996674, "learning_rate": 2.096790278488197e-05, "loss": 0.3833, "step": 78415 }, { "epoch": 2.8262514866472053, "grad_norm": 0.21709121763706207, "learning_rate": 2.0965022882753516e-05, "loss": 0.3792, "step": 78420 }, { "epoch": 2.82643168630843, "grad_norm": 0.22497032582759857, "learning_rate": 2.0962143035600833e-05, "loss": 0.4084, "step": 78425 }, { "epoch": 2.8266118859696543, "grad_norm": 0.2216866910457611, "learning_rate": 2.095926324346317e-05, "loss": 0.406, "step": 78430 }, { "epoch": 2.826792085630879, "grad_norm": 0.2360861450433731, "learning_rate": 2.0956383506379764e-05, "loss": 0.4031, "step": 78435 }, { "epoch": 2.8269722852921038, "grad_norm": 0.2680504620075226, "learning_rate": 2.0953503824389853e-05, "loss": 0.369, "step": 78440 }, { "epoch": 2.827152484953328, "grad_norm": 0.21421213448047638, "learning_rate": 2.0950624197532662e-05, "loss": 0.3846, "step": 78445 }, { "epoch": 2.827332684614553, "grad_norm": 0.1895037740468979, "learning_rate": 2.0947744625847437e-05, "loss": 0.4286, "step": 78450 }, { "epoch": 2.8275128842757775, "grad_norm": 0.20965850353240967, "learning_rate": 2.0944865109373405e-05, "loss": 0.408, "step": 78455 }, { "epoch": 2.8276930839370022, "grad_norm": 0.21087834239006042, "learning_rate": 2.0941985648149804e-05, "loss": 0.3806, "step": 78460 }, { "epoch": 2.827873283598227, "grad_norm": 0.19764740765094757, "learning_rate": 2.093910624221586e-05, "loss": 
0.3897, "step": 78465 }, { "epoch": 2.8280534832594517, "grad_norm": 0.25473564863204956, "learning_rate": 2.0936226891610806e-05, "loss": 0.3687, "step": 78470 }, { "epoch": 2.828233682920676, "grad_norm": 0.2085646688938141, "learning_rate": 2.0933347596373876e-05, "loss": 0.4079, "step": 78475 }, { "epoch": 2.8284138825819007, "grad_norm": 0.22956323623657227, "learning_rate": 2.0930468356544294e-05, "loss": 0.3709, "step": 78480 }, { "epoch": 2.8285940822431255, "grad_norm": 0.1897376924753189, "learning_rate": 2.09275891721613e-05, "loss": 0.3705, "step": 78485 }, { "epoch": 2.8287742819043498, "grad_norm": 0.25354906916618347, "learning_rate": 2.0924710043264116e-05, "loss": 0.4236, "step": 78490 }, { "epoch": 2.8289544815655745, "grad_norm": 0.20143790543079376, "learning_rate": 2.0921830969891955e-05, "loss": 0.3469, "step": 78495 }, { "epoch": 2.8291346812267992, "grad_norm": 0.22372838854789734, "learning_rate": 2.0918951952084077e-05, "loss": 0.3999, "step": 78500 }, { "epoch": 2.8291346812267992, "eval_loss": 0.43108808994293213, "eval_runtime": 3.5275, "eval_samples_per_second": 28.348, "eval_steps_per_second": 7.087, "step": 78500 }, { "epoch": 2.829314880888024, "grad_norm": 0.21421097218990326, "learning_rate": 2.0916072989879678e-05, "loss": 0.3838, "step": 78505 }, { "epoch": 2.8294950805492487, "grad_norm": 0.23731063306331635, "learning_rate": 2.091319408331799e-05, "loss": 0.3827, "step": 78510 }, { "epoch": 2.8296752802104734, "grad_norm": 0.1949160099029541, "learning_rate": 2.0910315232438248e-05, "loss": 0.3793, "step": 78515 }, { "epoch": 2.8298554798716977, "grad_norm": 0.27850422263145447, "learning_rate": 2.0907436437279662e-05, "loss": 0.3917, "step": 78520 }, { "epoch": 2.8300356795329225, "grad_norm": 0.19794079661369324, "learning_rate": 2.090455769788147e-05, "loss": 0.4164, "step": 78525 }, { "epoch": 2.830215879194147, "grad_norm": 0.1963154375553131, "learning_rate": 2.0901679014282882e-05, "loss": 0.3783, "step": 78530 }, { 
"epoch": 2.8303960788553715, "grad_norm": 0.24002398550510406, "learning_rate": 2.0898800386523123e-05, "loss": 0.3993, "step": 78535 }, { "epoch": 2.830576278516596, "grad_norm": 0.20864902436733246, "learning_rate": 2.0895921814641416e-05, "loss": 0.3889, "step": 78540 }, { "epoch": 2.830756478177821, "grad_norm": 0.2638551890850067, "learning_rate": 2.0893043298676986e-05, "loss": 0.3931, "step": 78545 }, { "epoch": 2.8309366778390457, "grad_norm": 0.20083938539028168, "learning_rate": 2.0890164838669036e-05, "loss": 0.3713, "step": 78550 }, { "epoch": 2.8311168775002704, "grad_norm": 0.19976069033145905, "learning_rate": 2.08872864346568e-05, "loss": 0.4218, "step": 78555 }, { "epoch": 2.831297077161495, "grad_norm": 0.19148698449134827, "learning_rate": 2.0884408086679485e-05, "loss": 0.3993, "step": 78560 }, { "epoch": 2.8314772768227194, "grad_norm": 0.20389747619628906, "learning_rate": 2.088152979477632e-05, "loss": 0.3613, "step": 78565 }, { "epoch": 2.831657476483944, "grad_norm": 0.16990236937999725, "learning_rate": 2.087865155898652e-05, "loss": 0.3767, "step": 78570 }, { "epoch": 2.831837676145169, "grad_norm": 0.195389986038208, "learning_rate": 2.0875773379349273e-05, "loss": 0.3924, "step": 78575 }, { "epoch": 2.8320178758063936, "grad_norm": 0.19705142080783844, "learning_rate": 2.087289525590383e-05, "loss": 0.4067, "step": 78580 }, { "epoch": 2.832198075467618, "grad_norm": 0.18426579236984253, "learning_rate": 2.0870017188689394e-05, "loss": 0.4144, "step": 78585 }, { "epoch": 2.8323782751288427, "grad_norm": 0.23681855201721191, "learning_rate": 2.086713917774516e-05, "loss": 0.4152, "step": 78590 }, { "epoch": 2.8325584747900674, "grad_norm": 0.18488551676273346, "learning_rate": 2.0864261223110372e-05, "loss": 0.3904, "step": 78595 }, { "epoch": 2.832738674451292, "grad_norm": 0.21363814175128937, "learning_rate": 2.0861383324824206e-05, "loss": 0.4118, "step": 78600 }, { "epoch": 2.832918874112517, "grad_norm": 0.24424757063388824, 
"learning_rate": 2.0858505482925905e-05, "loss": 0.3585, "step": 78605 }, { "epoch": 2.833099073773741, "grad_norm": 0.20323701202869415, "learning_rate": 2.0855627697454662e-05, "loss": 0.3676, "step": 78610 }, { "epoch": 2.833279273434966, "grad_norm": 0.1850632280111313, "learning_rate": 2.0852749968449684e-05, "loss": 0.3832, "step": 78615 }, { "epoch": 2.8334594730961906, "grad_norm": 0.19925178587436676, "learning_rate": 2.084987229595019e-05, "loss": 0.3802, "step": 78620 }, { "epoch": 2.8336396727574154, "grad_norm": 0.21943655610084534, "learning_rate": 2.084699467999538e-05, "loss": 0.3908, "step": 78625 }, { "epoch": 2.8338198724186396, "grad_norm": 0.2018982470035553, "learning_rate": 2.0844117120624463e-05, "loss": 0.3912, "step": 78630 }, { "epoch": 2.8340000720798644, "grad_norm": 0.20863167941570282, "learning_rate": 2.0841239617876647e-05, "loss": 0.4028, "step": 78635 }, { "epoch": 2.834180271741089, "grad_norm": 0.19766663014888763, "learning_rate": 2.0838362171791133e-05, "loss": 0.3733, "step": 78640 }, { "epoch": 2.834360471402314, "grad_norm": 0.24761301279067993, "learning_rate": 2.083548478240713e-05, "loss": 0.3987, "step": 78645 }, { "epoch": 2.8345406710635386, "grad_norm": 0.22107259929180145, "learning_rate": 2.0832607449763843e-05, "loss": 0.4007, "step": 78650 }, { "epoch": 2.8347208707247633, "grad_norm": 0.2298266589641571, "learning_rate": 2.082973017390047e-05, "loss": 0.3774, "step": 78655 }, { "epoch": 2.8349010703859876, "grad_norm": 0.2100599855184555, "learning_rate": 2.0826852954856217e-05, "loss": 0.3824, "step": 78660 }, { "epoch": 2.8350812700472123, "grad_norm": 0.19169162213802338, "learning_rate": 2.0823975792670292e-05, "loss": 0.3735, "step": 78665 }, { "epoch": 2.835261469708437, "grad_norm": 0.2121647298336029, "learning_rate": 2.0821098687381874e-05, "loss": 0.4001, "step": 78670 }, { "epoch": 2.8354416693696614, "grad_norm": 0.2263890504837036, "learning_rate": 2.081822163903019e-05, "loss": 0.4074, "step": 
78675 }, { "epoch": 2.835621869030886, "grad_norm": 0.2616461515426636, "learning_rate": 2.0815344647654413e-05, "loss": 0.4573, "step": 78680 }, { "epoch": 2.835802068692111, "grad_norm": 0.2435300052165985, "learning_rate": 2.081246771329377e-05, "loss": 0.3558, "step": 78685 }, { "epoch": 2.8359822683533356, "grad_norm": 0.20610463619232178, "learning_rate": 2.080959083598744e-05, "loss": 0.4027, "step": 78690 }, { "epoch": 2.8361624680145603, "grad_norm": 0.23141418397426605, "learning_rate": 2.0806714015774613e-05, "loss": 0.3734, "step": 78695 }, { "epoch": 2.836342667675785, "grad_norm": 0.1831231713294983, "learning_rate": 2.080383725269451e-05, "loss": 0.4006, "step": 78700 }, { "epoch": 2.8365228673370093, "grad_norm": 0.23543448746204376, "learning_rate": 2.0800960546786293e-05, "loss": 0.4156, "step": 78705 }, { "epoch": 2.836703066998234, "grad_norm": 0.1677248328924179, "learning_rate": 2.0798083898089193e-05, "loss": 0.3878, "step": 78710 }, { "epoch": 2.836883266659459, "grad_norm": 0.20878414809703827, "learning_rate": 2.0795207306642383e-05, "loss": 0.3831, "step": 78715 }, { "epoch": 2.837063466320683, "grad_norm": 0.24195106327533722, "learning_rate": 2.0792330772485055e-05, "loss": 0.4067, "step": 78720 }, { "epoch": 2.837243665981908, "grad_norm": 0.19112065434455872, "learning_rate": 2.078945429565641e-05, "loss": 0.4271, "step": 78725 }, { "epoch": 2.8374238656431325, "grad_norm": 0.2498737871646881, "learning_rate": 2.0786577876195633e-05, "loss": 0.3881, "step": 78730 }, { "epoch": 2.8376040653043573, "grad_norm": 0.1864451766014099, "learning_rate": 2.0783701514141916e-05, "loss": 0.3809, "step": 78735 }, { "epoch": 2.837784264965582, "grad_norm": 0.24682430922985077, "learning_rate": 2.0780825209534448e-05, "loss": 0.3981, "step": 78740 }, { "epoch": 2.8379644646268067, "grad_norm": 0.2143888771533966, "learning_rate": 2.077794896241242e-05, "loss": 0.3755, "step": 78745 }, { "epoch": 2.838144664288031, "grad_norm": 0.2417534738779068, 
"learning_rate": 2.0775072772815023e-05, "loss": 0.4129, "step": 78750 }, { "epoch": 2.8383248639492558, "grad_norm": 0.2023160457611084, "learning_rate": 2.0772196640781444e-05, "loss": 0.3503, "step": 78755 }, { "epoch": 2.8385050636104805, "grad_norm": 0.26836487650871277, "learning_rate": 2.076932056635086e-05, "loss": 0.433, "step": 78760 }, { "epoch": 2.838685263271705, "grad_norm": 0.2009926289319992, "learning_rate": 2.076644454956247e-05, "loss": 0.452, "step": 78765 }, { "epoch": 2.8388654629329295, "grad_norm": 0.20589284598827362, "learning_rate": 2.0763568590455458e-05, "loss": 0.3995, "step": 78770 }, { "epoch": 2.8390456625941543, "grad_norm": 0.23348768055438995, "learning_rate": 2.0760692689068988e-05, "loss": 0.3783, "step": 78775 }, { "epoch": 2.839225862255379, "grad_norm": 0.2545948028564453, "learning_rate": 2.0757816845442274e-05, "loss": 0.423, "step": 78780 }, { "epoch": 2.8394060619166037, "grad_norm": 0.19799551367759705, "learning_rate": 2.075494105961447e-05, "loss": 0.417, "step": 78785 }, { "epoch": 2.8395862615778285, "grad_norm": 0.22839927673339844, "learning_rate": 2.0752065331624788e-05, "loss": 0.3828, "step": 78790 }, { "epoch": 2.8397664612390527, "grad_norm": 0.1921931505203247, "learning_rate": 2.0749189661512387e-05, "loss": 0.4218, "step": 78795 }, { "epoch": 2.8399466609002775, "grad_norm": 0.20182907581329346, "learning_rate": 2.074631404931645e-05, "loss": 0.4186, "step": 78800 }, { "epoch": 2.840126860561502, "grad_norm": 0.20706386864185333, "learning_rate": 2.0743438495076164e-05, "loss": 0.394, "step": 78805 }, { "epoch": 2.840307060222727, "grad_norm": 0.23786523938179016, "learning_rate": 2.0740562998830706e-05, "loss": 0.3553, "step": 78810 }, { "epoch": 2.8404872598839512, "grad_norm": 0.1973177045583725, "learning_rate": 2.073768756061925e-05, "loss": 0.3815, "step": 78815 }, { "epoch": 2.840667459545176, "grad_norm": 0.21319210529327393, "learning_rate": 2.0734812180480976e-05, "loss": 0.3736, "step": 78820 }, 
{ "epoch": 2.8408476592064007, "grad_norm": 0.21359285712242126, "learning_rate": 2.0731936858455057e-05, "loss": 0.358, "step": 78825 }, { "epoch": 2.8410278588676254, "grad_norm": 0.24287675321102142, "learning_rate": 2.0729061594580677e-05, "loss": 0.4399, "step": 78830 }, { "epoch": 2.84120805852885, "grad_norm": 0.2060524970293045, "learning_rate": 2.0726186388897007e-05, "loss": 0.3973, "step": 78835 }, { "epoch": 2.8413882581900745, "grad_norm": 0.21056078374385834, "learning_rate": 2.072331124144321e-05, "loss": 0.3884, "step": 78840 }, { "epoch": 2.841568457851299, "grad_norm": 0.22964583337306976, "learning_rate": 2.0720436152258483e-05, "loss": 0.3679, "step": 78845 }, { "epoch": 2.841748657512524, "grad_norm": 0.1646958887577057, "learning_rate": 2.0717561121381983e-05, "loss": 0.3748, "step": 78850 }, { "epoch": 2.8419288571737487, "grad_norm": 0.2664015591144562, "learning_rate": 2.0714686148852873e-05, "loss": 0.4059, "step": 78855 }, { "epoch": 2.842109056834973, "grad_norm": 0.22343656420707703, "learning_rate": 2.0711811234710347e-05, "loss": 0.4021, "step": 78860 }, { "epoch": 2.8422892564961977, "grad_norm": 0.2661823332309723, "learning_rate": 2.0708936378993545e-05, "loss": 0.3696, "step": 78865 }, { "epoch": 2.8424694561574224, "grad_norm": 0.25233519077301025, "learning_rate": 2.0706061581741667e-05, "loss": 0.4061, "step": 78870 }, { "epoch": 2.842649655818647, "grad_norm": 0.19629564881324768, "learning_rate": 2.0703186842993878e-05, "loss": 0.4031, "step": 78875 }, { "epoch": 2.842829855479872, "grad_norm": 0.17642425000667572, "learning_rate": 2.0700312162789316e-05, "loss": 0.4057, "step": 78880 }, { "epoch": 2.843010055141096, "grad_norm": 0.16491912305355072, "learning_rate": 2.0697437541167182e-05, "loss": 0.3817, "step": 78885 }, { "epoch": 2.843190254802321, "grad_norm": 0.18968559801578522, "learning_rate": 2.0694562978166617e-05, "loss": 0.4262, "step": 78890 }, { "epoch": 2.8433704544635456, "grad_norm": 0.23654945194721222, 
"learning_rate": 2.0691688473826813e-05, "loss": 0.4081, "step": 78895 }, { "epoch": 2.8435506541247704, "grad_norm": 0.21302512288093567, "learning_rate": 2.068881402818691e-05, "loss": 0.3762, "step": 78900 }, { "epoch": 2.8437308537859947, "grad_norm": 0.19468043744564056, "learning_rate": 2.068593964128608e-05, "loss": 0.4304, "step": 78905 }, { "epoch": 2.8439110534472194, "grad_norm": 0.20956778526306152, "learning_rate": 2.0683065313163493e-05, "loss": 0.3958, "step": 78910 }, { "epoch": 2.844091253108444, "grad_norm": 0.2271938920021057, "learning_rate": 2.0680191043858303e-05, "loss": 0.4014, "step": 78915 }, { "epoch": 2.844271452769669, "grad_norm": 0.2164512425661087, "learning_rate": 2.0677316833409672e-05, "loss": 0.4368, "step": 78920 }, { "epoch": 2.8444516524308936, "grad_norm": 0.2096407115459442, "learning_rate": 2.0674442681856764e-05, "loss": 0.3887, "step": 78925 }, { "epoch": 2.8446318520921183, "grad_norm": 0.199036106467247, "learning_rate": 2.0671568589238734e-05, "loss": 0.389, "step": 78930 }, { "epoch": 2.8448120517533426, "grad_norm": 0.20308394730091095, "learning_rate": 2.0668694555594746e-05, "loss": 0.3894, "step": 78935 }, { "epoch": 2.8449922514145674, "grad_norm": 0.2252362221479416, "learning_rate": 2.0665820580963957e-05, "loss": 0.3844, "step": 78940 }, { "epoch": 2.845172451075792, "grad_norm": 0.18750686943531036, "learning_rate": 2.066294666538552e-05, "loss": 0.4232, "step": 78945 }, { "epoch": 2.8453526507370164, "grad_norm": 0.20952045917510986, "learning_rate": 2.06600728088986e-05, "loss": 0.4092, "step": 78950 }, { "epoch": 2.845532850398241, "grad_norm": 0.24399419128894806, "learning_rate": 2.0657199011542352e-05, "loss": 0.4103, "step": 78955 }, { "epoch": 2.845713050059466, "grad_norm": 0.20559486746788025, "learning_rate": 2.065432527335591e-05, "loss": 0.3815, "step": 78960 }, { "epoch": 2.8458932497206906, "grad_norm": 0.24112515151500702, "learning_rate": 2.0651451594378462e-05, "loss": 0.4345, "step": 78965 
}, { "epoch": 2.8460734493819153, "grad_norm": 0.24008318781852722, "learning_rate": 2.064857797464913e-05, "loss": 0.4203, "step": 78970 }, { "epoch": 2.84625364904314, "grad_norm": 0.1976160705089569, "learning_rate": 2.0645704414207096e-05, "loss": 0.3979, "step": 78975 }, { "epoch": 2.8464338487043643, "grad_norm": 0.21798163652420044, "learning_rate": 2.064283091309149e-05, "loss": 0.3961, "step": 78980 }, { "epoch": 2.846614048365589, "grad_norm": 0.30236032605171204, "learning_rate": 2.0639957471341463e-05, "loss": 0.3842, "step": 78985 }, { "epoch": 2.846794248026814, "grad_norm": 0.2003178596496582, "learning_rate": 2.0637084088996175e-05, "loss": 0.4055, "step": 78990 }, { "epoch": 2.846974447688038, "grad_norm": 0.18531939387321472, "learning_rate": 2.0634210766094775e-05, "loss": 0.3818, "step": 78995 }, { "epoch": 2.847154647349263, "grad_norm": 0.16672350466251373, "learning_rate": 2.06313375026764e-05, "loss": 0.3738, "step": 79000 }, { "epoch": 2.847154647349263, "eval_loss": 0.4311943054199219, "eval_runtime": 3.5347, "eval_samples_per_second": 28.291, "eval_steps_per_second": 7.073, "step": 79000 }, { "epoch": 2.8473348470104876, "grad_norm": 0.1916167140007019, "learning_rate": 2.0628464298780215e-05, "loss": 0.39, "step": 79005 }, { "epoch": 2.8475150466717123, "grad_norm": 0.4534687399864197, "learning_rate": 2.0625591154445348e-05, "loss": 0.3945, "step": 79010 }, { "epoch": 2.847695246332937, "grad_norm": 0.19475337862968445, "learning_rate": 2.062271806971096e-05, "loss": 0.4248, "step": 79015 }, { "epoch": 2.8478754459941618, "grad_norm": 0.20240464806556702, "learning_rate": 2.0619845044616195e-05, "loss": 0.4073, "step": 79020 }, { "epoch": 2.848055645655386, "grad_norm": 0.21959447860717773, "learning_rate": 2.0616972079200185e-05, "loss": 0.4052, "step": 79025 }, { "epoch": 2.848235845316611, "grad_norm": 0.21866481006145477, "learning_rate": 2.061409917350209e-05, "loss": 0.4036, "step": 79030 }, { "epoch": 2.8484160449778355, 
"grad_norm": 0.19479642808437347, "learning_rate": 2.0611226327561042e-05, "loss": 0.3468, "step": 79035 }, { "epoch": 2.84859624463906, "grad_norm": 0.24999023973941803, "learning_rate": 2.060835354141618e-05, "loss": 0.3691, "step": 79040 }, { "epoch": 2.8487764443002845, "grad_norm": 0.23853568732738495, "learning_rate": 2.0605480815106656e-05, "loss": 0.4224, "step": 79045 }, { "epoch": 2.8489566439615093, "grad_norm": 0.2416432797908783, "learning_rate": 2.0602608148671602e-05, "loss": 0.3812, "step": 79050 }, { "epoch": 2.849136843622734, "grad_norm": 0.18531100451946259, "learning_rate": 2.0599735542150164e-05, "loss": 0.4103, "step": 79055 }, { "epoch": 2.8493170432839587, "grad_norm": 0.1804402768611908, "learning_rate": 2.0596862995581485e-05, "loss": 0.3648, "step": 79060 }, { "epoch": 2.8494972429451835, "grad_norm": 0.21126359701156616, "learning_rate": 2.0593990509004675e-05, "loss": 0.3624, "step": 79065 }, { "epoch": 2.8496774426064078, "grad_norm": 0.23747804760932922, "learning_rate": 2.059111808245891e-05, "loss": 0.4037, "step": 79070 }, { "epoch": 2.8498576422676325, "grad_norm": 0.2273935079574585, "learning_rate": 2.058824571598329e-05, "loss": 0.4083, "step": 79075 }, { "epoch": 2.8500378419288572, "grad_norm": 0.22935499250888824, "learning_rate": 2.0585373409616985e-05, "loss": 0.3983, "step": 79080 }, { "epoch": 2.850218041590082, "grad_norm": 0.28681254386901855, "learning_rate": 2.058250116339911e-05, "loss": 0.39, "step": 79085 }, { "epoch": 2.8503982412513063, "grad_norm": 0.18276835978031158, "learning_rate": 2.0579628977368792e-05, "loss": 0.3573, "step": 79090 }, { "epoch": 2.850578440912531, "grad_norm": 0.19169297814369202, "learning_rate": 2.0576756851565182e-05, "loss": 0.4129, "step": 79095 }, { "epoch": 2.8507586405737557, "grad_norm": 0.2185550034046173, "learning_rate": 2.05738847860274e-05, "loss": 0.4006, "step": 79100 }, { "epoch": 2.8509388402349805, "grad_norm": 0.24736441671848297, "learning_rate": 
2.0571012780794577e-05, "loss": 0.3866, "step": 79105 }, { "epoch": 2.851119039896205, "grad_norm": 0.19851014018058777, "learning_rate": 2.056814083590585e-05, "loss": 0.4227, "step": 79110 }, { "epoch": 2.8512992395574295, "grad_norm": 0.19932295382022858, "learning_rate": 2.0565268951400346e-05, "loss": 0.3941, "step": 79115 }, { "epoch": 2.851479439218654, "grad_norm": 0.21272258460521698, "learning_rate": 2.0562397127317197e-05, "loss": 0.37, "step": 79120 }, { "epoch": 2.851659638879879, "grad_norm": 0.2207152098417282, "learning_rate": 2.055952536369553e-05, "loss": 0.3985, "step": 79125 }, { "epoch": 2.8518398385411037, "grad_norm": 0.2772730588912964, "learning_rate": 2.0556653660574464e-05, "loss": 0.385, "step": 79130 }, { "epoch": 2.852020038202328, "grad_norm": 0.2133176475763321, "learning_rate": 2.0553782017993135e-05, "loss": 0.3741, "step": 79135 }, { "epoch": 2.8522002378635527, "grad_norm": 0.266476571559906, "learning_rate": 2.055091043599067e-05, "loss": 0.4071, "step": 79140 }, { "epoch": 2.8523804375247774, "grad_norm": 0.20669589936733246, "learning_rate": 2.0548038914606174e-05, "loss": 0.417, "step": 79145 }, { "epoch": 2.852560637186002, "grad_norm": 0.21725420653820038, "learning_rate": 2.0545167453878804e-05, "loss": 0.3852, "step": 79150 }, { "epoch": 2.852740836847227, "grad_norm": 0.23247067630290985, "learning_rate": 2.0542296053847647e-05, "loss": 0.399, "step": 79155 }, { "epoch": 2.8529210365084516, "grad_norm": 0.2411661595106125, "learning_rate": 2.0539424714551852e-05, "loss": 0.4155, "step": 79160 }, { "epoch": 2.853101236169676, "grad_norm": 0.2001282423734665, "learning_rate": 2.053655343603054e-05, "loss": 0.4257, "step": 79165 }, { "epoch": 2.8532814358309007, "grad_norm": 0.21642963588237762, "learning_rate": 2.0533682218322807e-05, "loss": 0.4346, "step": 79170 }, { "epoch": 2.8534616354921254, "grad_norm": 0.24576754868030548, "learning_rate": 2.0530811061467802e-05, "loss": 0.4105, "step": 79175 }, { "epoch": 
2.8536418351533497, "grad_norm": 0.16596747934818268, "learning_rate": 2.052793996550463e-05, "loss": 0.3769, "step": 79180 }, { "epoch": 2.8538220348145744, "grad_norm": 0.20932526886463165, "learning_rate": 2.05250689304724e-05, "loss": 0.4079, "step": 79185 }, { "epoch": 2.854002234475799, "grad_norm": 0.23779064416885376, "learning_rate": 2.052219795641025e-05, "loss": 0.4114, "step": 79190 }, { "epoch": 2.854182434137024, "grad_norm": 0.17086093127727509, "learning_rate": 2.0519327043357278e-05, "loss": 0.3968, "step": 79195 }, { "epoch": 2.8543626337982486, "grad_norm": 0.22263990342617035, "learning_rate": 2.0516456191352612e-05, "loss": 0.3955, "step": 79200 }, { "epoch": 2.8545428334594733, "grad_norm": 0.24070154130458832, "learning_rate": 2.0513585400435363e-05, "loss": 0.3586, "step": 79205 }, { "epoch": 2.8547230331206976, "grad_norm": 0.18672329187393188, "learning_rate": 2.0510714670644643e-05, "loss": 0.3952, "step": 79210 }, { "epoch": 2.8549032327819224, "grad_norm": 0.20222359895706177, "learning_rate": 2.0507844002019564e-05, "loss": 0.3989, "step": 79215 }, { "epoch": 2.855083432443147, "grad_norm": 0.19654959440231323, "learning_rate": 2.0504973394599247e-05, "loss": 0.3831, "step": 79220 }, { "epoch": 2.8552636321043714, "grad_norm": 0.24008645117282867, "learning_rate": 2.050210284842279e-05, "loss": 0.39, "step": 79225 }, { "epoch": 2.855443831765596, "grad_norm": 0.20987041294574738, "learning_rate": 2.0499232363529315e-05, "loss": 0.3742, "step": 79230 }, { "epoch": 2.855624031426821, "grad_norm": 0.22069920599460602, "learning_rate": 2.0496361939957926e-05, "loss": 0.4068, "step": 79235 }, { "epoch": 2.8558042310880456, "grad_norm": 0.21462880074977875, "learning_rate": 2.0493491577747738e-05, "loss": 0.4358, "step": 79240 }, { "epoch": 2.8559844307492703, "grad_norm": 0.22585731744766235, "learning_rate": 2.0490621276937853e-05, "loss": 0.3979, "step": 79245 }, { "epoch": 2.856164630410495, "grad_norm": 0.20403482019901276, 
"learning_rate": 2.048775103756737e-05, "loss": 0.386, "step": 79250 }, { "epoch": 2.8563448300717194, "grad_norm": 0.1699545532464981, "learning_rate": 2.0484880859675422e-05, "loss": 0.3828, "step": 79255 }, { "epoch": 2.856525029732944, "grad_norm": 0.15183869004249573, "learning_rate": 2.0482010743301093e-05, "loss": 0.3445, "step": 79260 }, { "epoch": 2.856705229394169, "grad_norm": 0.25381314754486084, "learning_rate": 2.0479140688483485e-05, "loss": 0.4077, "step": 79265 }, { "epoch": 2.856885429055393, "grad_norm": 0.18916507065296173, "learning_rate": 2.0476270695261716e-05, "loss": 0.3801, "step": 79270 }, { "epoch": 2.857065628716618, "grad_norm": 0.18349602818489075, "learning_rate": 2.0473400763674876e-05, "loss": 0.4151, "step": 79275 }, { "epoch": 2.8572458283778426, "grad_norm": 0.1902550309896469, "learning_rate": 2.0470530893762087e-05, "loss": 0.4191, "step": 79280 }, { "epoch": 2.8574260280390673, "grad_norm": 0.23861198127269745, "learning_rate": 2.046766108556243e-05, "loss": 0.3875, "step": 79285 }, { "epoch": 2.857606227700292, "grad_norm": 0.19769497215747833, "learning_rate": 2.0464791339115014e-05, "loss": 0.3807, "step": 79290 }, { "epoch": 2.857786427361517, "grad_norm": 0.20650269091129303, "learning_rate": 2.0461921654458938e-05, "loss": 0.3896, "step": 79295 }, { "epoch": 2.857966627022741, "grad_norm": 0.19639898836612701, "learning_rate": 2.0459052031633297e-05, "loss": 0.3815, "step": 79300 }, { "epoch": 2.858146826683966, "grad_norm": 0.23616774380207062, "learning_rate": 2.0456182470677198e-05, "loss": 0.4199, "step": 79305 }, { "epoch": 2.8583270263451905, "grad_norm": 0.17947378754615784, "learning_rate": 2.0453312971629734e-05, "loss": 0.3959, "step": 79310 }, { "epoch": 2.8585072260064153, "grad_norm": 0.18461818993091583, "learning_rate": 2.0450443534529995e-05, "loss": 0.3596, "step": 79315 }, { "epoch": 2.8586874256676396, "grad_norm": 0.19616423547267914, "learning_rate": 2.044757415941709e-05, "loss": 0.3676, "step": 
79320 }, { "epoch": 2.8588676253288643, "grad_norm": 0.18989914655685425, "learning_rate": 2.0444704846330098e-05, "loss": 0.4095, "step": 79325 }, { "epoch": 2.859047824990089, "grad_norm": 0.20153529942035675, "learning_rate": 2.0441835595308122e-05, "loss": 0.4075, "step": 79330 }, { "epoch": 2.8592280246513138, "grad_norm": 0.1888132393360138, "learning_rate": 2.0438966406390256e-05, "loss": 0.4005, "step": 79335 }, { "epoch": 2.8594082243125385, "grad_norm": 0.21628032624721527, "learning_rate": 2.0436097279615585e-05, "loss": 0.4074, "step": 79340 }, { "epoch": 2.859588423973763, "grad_norm": 0.18922530114650726, "learning_rate": 2.0433228215023213e-05, "loss": 0.3821, "step": 79345 }, { "epoch": 2.8597686236349875, "grad_norm": 0.24143609404563904, "learning_rate": 2.0430359212652224e-05, "loss": 0.3769, "step": 79350 }, { "epoch": 2.8599488232962123, "grad_norm": 0.22710378468036652, "learning_rate": 2.042749027254169e-05, "loss": 0.4179, "step": 79355 }, { "epoch": 2.860129022957437, "grad_norm": 0.23377621173858643, "learning_rate": 2.0424621394730735e-05, "loss": 0.3965, "step": 79360 }, { "epoch": 2.8603092226186613, "grad_norm": 0.19734351336956024, "learning_rate": 2.042175257925842e-05, "loss": 0.3964, "step": 79365 }, { "epoch": 2.860489422279886, "grad_norm": 0.21778082847595215, "learning_rate": 2.0418883826163833e-05, "loss": 0.4076, "step": 79370 }, { "epoch": 2.8606696219411107, "grad_norm": 0.1940212845802307, "learning_rate": 2.0416015135486074e-05, "loss": 0.3652, "step": 79375 }, { "epoch": 2.8608498216023355, "grad_norm": 0.21918119490146637, "learning_rate": 2.0413146507264216e-05, "loss": 0.411, "step": 79380 }, { "epoch": 2.86103002126356, "grad_norm": 0.21737195551395416, "learning_rate": 2.0410277941537352e-05, "loss": 0.3977, "step": 79385 }, { "epoch": 2.8612102209247845, "grad_norm": 0.1796942949295044, "learning_rate": 2.0407409438344566e-05, "loss": 0.3913, "step": 79390 }, { "epoch": 2.8613904205860092, "grad_norm": 
0.20798861980438232, "learning_rate": 2.040454099772493e-05, "loss": 0.4107, "step": 79395 }, { "epoch": 2.861570620247234, "grad_norm": 0.2642776370048523, "learning_rate": 2.040167261971754e-05, "loss": 0.3942, "step": 79400 }, { "epoch": 2.8617508199084587, "grad_norm": 0.2090744972229004, "learning_rate": 2.039880430436147e-05, "loss": 0.3992, "step": 79405 }, { "epoch": 2.861931019569683, "grad_norm": 0.208735391497612, "learning_rate": 2.0395936051695794e-05, "loss": 0.37, "step": 79410 }, { "epoch": 2.8621112192309077, "grad_norm": 0.2147226631641388, "learning_rate": 2.0393067861759604e-05, "loss": 0.4038, "step": 79415 }, { "epoch": 2.8622914188921325, "grad_norm": 0.21452739834785461, "learning_rate": 2.0390199734591967e-05, "loss": 0.4, "step": 79420 }, { "epoch": 2.862471618553357, "grad_norm": 0.21164734661579132, "learning_rate": 2.0387331670231972e-05, "loss": 0.3923, "step": 79425 }, { "epoch": 2.862651818214582, "grad_norm": 0.25900834798812866, "learning_rate": 2.0384463668718695e-05, "loss": 0.4046, "step": 79430 }, { "epoch": 2.8628320178758067, "grad_norm": 0.2033701092004776, "learning_rate": 2.0381595730091187e-05, "loss": 0.4171, "step": 79435 }, { "epoch": 2.863012217537031, "grad_norm": 0.23494963347911835, "learning_rate": 2.0378727854388557e-05, "loss": 0.4213, "step": 79440 }, { "epoch": 2.8631924171982557, "grad_norm": 0.21937774121761322, "learning_rate": 2.0375860041649874e-05, "loss": 0.3904, "step": 79445 }, { "epoch": 2.8633726168594804, "grad_norm": 0.18949083983898163, "learning_rate": 2.0372992291914182e-05, "loss": 0.3691, "step": 79450 }, { "epoch": 2.8635528165207047, "grad_norm": 0.23285529017448425, "learning_rate": 2.037012460522059e-05, "loss": 0.4134, "step": 79455 }, { "epoch": 2.8637330161819294, "grad_norm": 0.2178962379693985, "learning_rate": 2.036725698160814e-05, "loss": 0.3796, "step": 79460 }, { "epoch": 2.863913215843154, "grad_norm": 0.20775160193443298, "learning_rate": 2.0364389421115936e-05, "loss": 
0.3863, "step": 79465 }, { "epoch": 2.864093415504379, "grad_norm": 0.20087379217147827, "learning_rate": 2.0361521923783018e-05, "loss": 0.3904, "step": 79470 }, { "epoch": 2.8642736151656036, "grad_norm": 0.2299887239933014, "learning_rate": 2.035865448964846e-05, "loss": 0.4115, "step": 79475 }, { "epoch": 2.8644538148268284, "grad_norm": 0.22085849940776825, "learning_rate": 2.0355787118751346e-05, "loss": 0.3658, "step": 79480 }, { "epoch": 2.8646340144880527, "grad_norm": 0.22253592312335968, "learning_rate": 2.0352919811130726e-05, "loss": 0.4168, "step": 79485 }, { "epoch": 2.8648142141492774, "grad_norm": 0.20002683997154236, "learning_rate": 2.035005256682568e-05, "loss": 0.4205, "step": 79490 }, { "epoch": 2.864994413810502, "grad_norm": 0.22108720242977142, "learning_rate": 2.034718538587526e-05, "loss": 0.4173, "step": 79495 }, { "epoch": 2.8651746134717264, "grad_norm": 0.20260627567768097, "learning_rate": 2.034431826831854e-05, "loss": 0.4003, "step": 79500 }, { "epoch": 2.8651746134717264, "eval_loss": 0.43116477131843567, "eval_runtime": 3.5328, "eval_samples_per_second": 28.306, "eval_steps_per_second": 7.077, "step": 79500 }, { "epoch": 2.865354813132951, "grad_norm": 0.20049650967121124, "learning_rate": 2.0341451214194586e-05, "loss": 0.3975, "step": 79505 }, { "epoch": 2.865535012794176, "grad_norm": 0.21907275915145874, "learning_rate": 2.0338584223542462e-05, "loss": 0.4186, "step": 79510 }, { "epoch": 2.8657152124554006, "grad_norm": 0.2359967678785324, "learning_rate": 2.033571729640121e-05, "loss": 0.3657, "step": 79515 }, { "epoch": 2.8658954121166254, "grad_norm": 0.24726665019989014, "learning_rate": 2.0332850432809922e-05, "loss": 0.3807, "step": 79520 }, { "epoch": 2.86607561177785, "grad_norm": 0.24529293179512024, "learning_rate": 2.0329983632807632e-05, "loss": 0.4069, "step": 79525 }, { "epoch": 2.8662558114390744, "grad_norm": 0.19809693098068237, "learning_rate": 2.032711689643342e-05, "loss": 0.3777, "step": 79530 }, { 
"epoch": 2.866436011100299, "grad_norm": 0.1797298640012741, "learning_rate": 2.032425022372634e-05, "loss": 0.3542, "step": 79535 }, { "epoch": 2.866616210761524, "grad_norm": 0.1798580288887024, "learning_rate": 2.032138361472543e-05, "loss": 0.3837, "step": 79540 }, { "epoch": 2.866796410422748, "grad_norm": 0.2182854264974594, "learning_rate": 2.0318517069469775e-05, "loss": 0.406, "step": 79545 }, { "epoch": 2.866976610083973, "grad_norm": 0.19084390997886658, "learning_rate": 2.0315650587998416e-05, "loss": 0.3815, "step": 79550 }, { "epoch": 2.8671568097451976, "grad_norm": 0.20089589059352875, "learning_rate": 2.0312784170350404e-05, "loss": 0.3958, "step": 79555 }, { "epoch": 2.8673370094064223, "grad_norm": 0.26835712790489197, "learning_rate": 2.030991781656481e-05, "loss": 0.3866, "step": 79560 }, { "epoch": 2.867517209067647, "grad_norm": 0.1739826798439026, "learning_rate": 2.030705152668066e-05, "loss": 0.3778, "step": 79565 }, { "epoch": 2.867697408728872, "grad_norm": 0.23499102890491486, "learning_rate": 2.0304185300737046e-05, "loss": 0.3978, "step": 79570 }, { "epoch": 2.867877608390096, "grad_norm": 0.19982177019119263, "learning_rate": 2.030131913877299e-05, "loss": 0.4005, "step": 79575 }, { "epoch": 2.868057808051321, "grad_norm": 0.18712134659290314, "learning_rate": 2.0298453040827544e-05, "loss": 0.4348, "step": 79580 }, { "epoch": 2.8682380077125456, "grad_norm": 0.26873958110809326, "learning_rate": 2.0295587006939772e-05, "loss": 0.3693, "step": 79585 }, { "epoch": 2.8684182073737703, "grad_norm": 0.1947907954454422, "learning_rate": 2.0292721037148717e-05, "loss": 0.3872, "step": 79590 }, { "epoch": 2.8685984070349946, "grad_norm": 0.16691918671131134, "learning_rate": 2.0289855131493422e-05, "loss": 0.3799, "step": 79595 }, { "epoch": 2.8687786066962193, "grad_norm": 0.15523312985897064, "learning_rate": 2.0286989290012945e-05, "loss": 0.3837, "step": 79600 }, { "epoch": 2.868958806357444, "grad_norm": 0.22626052796840668, 
"learning_rate": 2.0284123512746317e-05, "loss": 0.3807, "step": 79605 }, { "epoch": 2.869139006018669, "grad_norm": 0.25360921025276184, "learning_rate": 2.0281257799732602e-05, "loss": 0.401, "step": 79610 }, { "epoch": 2.8693192056798935, "grad_norm": 0.21159762144088745, "learning_rate": 2.0278392151010837e-05, "loss": 0.4108, "step": 79615 }, { "epoch": 2.869499405341118, "grad_norm": 0.19640955328941345, "learning_rate": 2.0275526566620058e-05, "loss": 0.3899, "step": 79620 }, { "epoch": 2.8696796050023425, "grad_norm": 0.20014135539531708, "learning_rate": 2.0272661046599318e-05, "loss": 0.3942, "step": 79625 }, { "epoch": 2.8698598046635673, "grad_norm": 0.20158907771110535, "learning_rate": 2.026979559098766e-05, "loss": 0.367, "step": 79630 }, { "epoch": 2.870040004324792, "grad_norm": 0.18245342373847961, "learning_rate": 2.0266930199824108e-05, "loss": 0.3979, "step": 79635 }, { "epoch": 2.8702202039860163, "grad_norm": 0.2652152478694916, "learning_rate": 2.0264064873147735e-05, "loss": 0.3925, "step": 79640 }, { "epoch": 2.870400403647241, "grad_norm": 0.22294139862060547, "learning_rate": 2.026119961099754e-05, "loss": 0.4227, "step": 79645 }, { "epoch": 2.8705806033084658, "grad_norm": 0.20929698646068573, "learning_rate": 2.02583344134126e-05, "loss": 0.3903, "step": 79650 }, { "epoch": 2.8707608029696905, "grad_norm": 0.2380889505147934, "learning_rate": 2.0255469280431932e-05, "loss": 0.3933, "step": 79655 }, { "epoch": 2.8709410026309152, "grad_norm": 0.21243393421173096, "learning_rate": 2.025260421209457e-05, "loss": 0.4065, "step": 79660 }, { "epoch": 2.87112120229214, "grad_norm": 0.2032131552696228, "learning_rate": 2.0249739208439562e-05, "loss": 0.373, "step": 79665 }, { "epoch": 2.8713014019533643, "grad_norm": 0.2194889783859253, "learning_rate": 2.0246874269505934e-05, "loss": 0.4264, "step": 79670 }, { "epoch": 2.871481601614589, "grad_norm": 0.20919016003608704, "learning_rate": 2.0244009395332725e-05, "loss": 0.4062, "step": 79675 
}, { "epoch": 2.8716618012758137, "grad_norm": 0.17519910633563995, "learning_rate": 2.024114458595897e-05, "loss": 0.3697, "step": 79680 }, { "epoch": 2.871842000937038, "grad_norm": 0.2320595383644104, "learning_rate": 2.0238279841423693e-05, "loss": 0.3617, "step": 79685 }, { "epoch": 2.8720222005982627, "grad_norm": 0.20876125991344452, "learning_rate": 2.0235415161765936e-05, "loss": 0.4013, "step": 79690 }, { "epoch": 2.8722024002594875, "grad_norm": 0.2067393809556961, "learning_rate": 2.0232550547024726e-05, "loss": 0.4045, "step": 79695 }, { "epoch": 2.872382599920712, "grad_norm": 0.2197972983121872, "learning_rate": 2.0229685997239088e-05, "loss": 0.3972, "step": 79700 }, { "epoch": 2.872562799581937, "grad_norm": 0.23758886754512787, "learning_rate": 2.0226821512448057e-05, "loss": 0.3835, "step": 79705 }, { "epoch": 2.8727429992431617, "grad_norm": 0.20239755511283875, "learning_rate": 2.0223957092690653e-05, "loss": 0.3908, "step": 79710 }, { "epoch": 2.872923198904386, "grad_norm": 0.23541153967380524, "learning_rate": 2.0221092738005916e-05, "loss": 0.3804, "step": 79715 }, { "epoch": 2.8731033985656107, "grad_norm": 0.19039641320705414, "learning_rate": 2.021822844843287e-05, "loss": 0.3887, "step": 79720 }, { "epoch": 2.8732835982268354, "grad_norm": 0.20644930005073547, "learning_rate": 2.0215364224010522e-05, "loss": 0.3808, "step": 79725 }, { "epoch": 2.8734637978880597, "grad_norm": 0.2427452951669693, "learning_rate": 2.021250006477792e-05, "loss": 0.4126, "step": 79730 }, { "epoch": 2.8736439975492845, "grad_norm": 0.19568517804145813, "learning_rate": 2.020963597077408e-05, "loss": 0.4055, "step": 79735 }, { "epoch": 2.873824197210509, "grad_norm": 0.21443352103233337, "learning_rate": 2.0206771942038008e-05, "loss": 0.3971, "step": 79740 }, { "epoch": 2.874004396871734, "grad_norm": 0.19338439404964447, "learning_rate": 2.020390797860876e-05, "loss": 0.3834, "step": 79745 }, { "epoch": 2.8741845965329587, "grad_norm": 0.1852528303861618, 
"learning_rate": 2.0201044080525314e-05, "loss": 0.3668, "step": 79750 }, { "epoch": 2.8743647961941834, "grad_norm": 0.2564043402671814, "learning_rate": 2.0198180247826734e-05, "loss": 0.4105, "step": 79755 }, { "epoch": 2.8745449958554077, "grad_norm": 0.227681502699852, "learning_rate": 2.0195316480552013e-05, "loss": 0.3855, "step": 79760 }, { "epoch": 2.8747251955166324, "grad_norm": 0.21400003135204315, "learning_rate": 2.0192452778740166e-05, "loss": 0.384, "step": 79765 }, { "epoch": 2.874905395177857, "grad_norm": 0.17560800909996033, "learning_rate": 2.018958914243023e-05, "loss": 0.3919, "step": 79770 }, { "epoch": 2.8750855948390814, "grad_norm": 0.2656613886356354, "learning_rate": 2.0186725571661207e-05, "loss": 0.4083, "step": 79775 }, { "epoch": 2.875265794500306, "grad_norm": 0.23810866475105286, "learning_rate": 2.018386206647211e-05, "loss": 0.3885, "step": 79780 }, { "epoch": 2.875445994161531, "grad_norm": 0.23813889920711517, "learning_rate": 2.0180998626901966e-05, "loss": 0.3846, "step": 79785 }, { "epoch": 2.8756261938227556, "grad_norm": 0.19781778752803802, "learning_rate": 2.017813525298978e-05, "loss": 0.385, "step": 79790 }, { "epoch": 2.8758063934839804, "grad_norm": 0.23655954003334045, "learning_rate": 2.017527194477457e-05, "loss": 0.406, "step": 79795 }, { "epoch": 2.875986593145205, "grad_norm": 0.2517338991165161, "learning_rate": 2.0172408702295347e-05, "loss": 0.3935, "step": 79800 }, { "epoch": 2.8761667928064294, "grad_norm": 0.19464854896068573, "learning_rate": 2.0169545525591117e-05, "loss": 0.41, "step": 79805 }, { "epoch": 2.876346992467654, "grad_norm": 0.2122073471546173, "learning_rate": 2.0166682414700896e-05, "loss": 0.3577, "step": 79810 }, { "epoch": 2.876527192128879, "grad_norm": 0.22305898368358612, "learning_rate": 2.01638193696637e-05, "loss": 0.4146, "step": 79815 }, { "epoch": 2.8767073917901036, "grad_norm": 0.2052910327911377, "learning_rate": 2.0160956390518508e-05, "loss": 0.3865, "step": 79820 }, { 
"epoch": 2.876887591451328, "grad_norm": 0.23175741732120514, "learning_rate": 2.0158093477304367e-05, "loss": 0.3845, "step": 79825 }, { "epoch": 2.8770677911125526, "grad_norm": 0.14851652085781097, "learning_rate": 2.0155230630060252e-05, "loss": 0.3451, "step": 79830 }, { "epoch": 2.8772479907737774, "grad_norm": 0.21722625195980072, "learning_rate": 2.0152367848825195e-05, "loss": 0.4043, "step": 79835 }, { "epoch": 2.877428190435002, "grad_norm": 0.17405612766742706, "learning_rate": 2.014950513363818e-05, "loss": 0.3895, "step": 79840 }, { "epoch": 2.877608390096227, "grad_norm": 0.23024611175060272, "learning_rate": 2.0146642484538217e-05, "loss": 0.4022, "step": 79845 }, { "epoch": 2.877788589757451, "grad_norm": 0.23710598051548004, "learning_rate": 2.0143779901564313e-05, "loss": 0.3921, "step": 79850 }, { "epoch": 2.877968789418676, "grad_norm": 0.19851794838905334, "learning_rate": 2.014091738475547e-05, "loss": 0.4004, "step": 79855 }, { "epoch": 2.8781489890799006, "grad_norm": 0.19404460489749908, "learning_rate": 2.013805493415068e-05, "loss": 0.3714, "step": 79860 }, { "epoch": 2.8783291887411253, "grad_norm": 0.1905554085969925, "learning_rate": 2.0135192549788955e-05, "loss": 0.3373, "step": 79865 }, { "epoch": 2.8785093884023496, "grad_norm": 0.20806780457496643, "learning_rate": 2.0132330231709287e-05, "loss": 0.387, "step": 79870 }, { "epoch": 2.8786895880635743, "grad_norm": 0.2478901594877243, "learning_rate": 2.012946797995068e-05, "loss": 0.4016, "step": 79875 }, { "epoch": 2.878869787724799, "grad_norm": 0.17782168090343475, "learning_rate": 2.012660579455213e-05, "loss": 0.4067, "step": 79880 }, { "epoch": 2.879049987386024, "grad_norm": 0.2104375660419464, "learning_rate": 2.0123743675552624e-05, "loss": 0.3679, "step": 79885 }, { "epoch": 2.8792301870472485, "grad_norm": 0.16701386868953705, "learning_rate": 2.0120881622991178e-05, "loss": 0.3655, "step": 79890 }, { "epoch": 2.879410386708473, "grad_norm": 0.24730798602104187, 
"learning_rate": 2.0118019636906765e-05, "loss": 0.3892, "step": 79895 }, { "epoch": 2.8795905863696976, "grad_norm": 0.19160984456539154, "learning_rate": 2.0115157717338396e-05, "loss": 0.4107, "step": 79900 }, { "epoch": 2.8797707860309223, "grad_norm": 0.22043399512767792, "learning_rate": 2.0112295864325057e-05, "loss": 0.3838, "step": 79905 }, { "epoch": 2.879950985692147, "grad_norm": 0.1867411732673645, "learning_rate": 2.0109434077905737e-05, "loss": 0.4019, "step": 79910 }, { "epoch": 2.8801311853533713, "grad_norm": 0.23410554230213165, "learning_rate": 2.0106572358119433e-05, "loss": 0.398, "step": 79915 }, { "epoch": 2.880311385014596, "grad_norm": 0.2368057370185852, "learning_rate": 2.0103710705005142e-05, "loss": 0.3796, "step": 79920 }, { "epoch": 2.880491584675821, "grad_norm": 0.14740437269210815, "learning_rate": 2.0100849118601824e-05, "loss": 0.3645, "step": 79925 }, { "epoch": 2.8806717843370455, "grad_norm": 0.18294650316238403, "learning_rate": 2.0097987598948507e-05, "loss": 0.3756, "step": 79930 }, { "epoch": 2.8808519839982702, "grad_norm": 0.18560317158699036, "learning_rate": 2.0095126146084145e-05, "loss": 0.3951, "step": 79935 }, { "epoch": 2.881032183659495, "grad_norm": 0.2644931375980377, "learning_rate": 2.0092264760047758e-05, "loss": 0.3868, "step": 79940 }, { "epoch": 2.8812123833207193, "grad_norm": 0.197056844830513, "learning_rate": 2.00894034408783e-05, "loss": 0.3664, "step": 79945 }, { "epoch": 2.881392582981944, "grad_norm": 0.2113136351108551, "learning_rate": 2.0086542188614772e-05, "loss": 0.3959, "step": 79950 }, { "epoch": 2.8815727826431687, "grad_norm": 0.21340829133987427, "learning_rate": 2.0083681003296158e-05, "loss": 0.3783, "step": 79955 }, { "epoch": 2.881752982304393, "grad_norm": 0.2382301241159439, "learning_rate": 2.0080819884961437e-05, "loss": 0.3865, "step": 79960 }, { "epoch": 2.8819331819656178, "grad_norm": 0.2526952028274536, "learning_rate": 2.007795883364959e-05, "loss": 0.3977, "step": 79965 
}, { "epoch": 2.8821133816268425, "grad_norm": 0.2137371152639389, "learning_rate": 2.0075097849399603e-05, "loss": 0.3798, "step": 79970 }, { "epoch": 2.8822935812880672, "grad_norm": 0.2008858621120453, "learning_rate": 2.007223693225045e-05, "loss": 0.4151, "step": 79975 }, { "epoch": 2.882473780949292, "grad_norm": 0.20356670022010803, "learning_rate": 2.006937608224112e-05, "loss": 0.4056, "step": 79980 }, { "epoch": 2.8826539806105167, "grad_norm": 0.23385298252105713, "learning_rate": 2.006651529941059e-05, "loss": 0.4071, "step": 79985 }, { "epoch": 2.882834180271741, "grad_norm": 0.24331295490264893, "learning_rate": 2.0063654583797825e-05, "loss": 0.3944, "step": 79990 }, { "epoch": 2.8830143799329657, "grad_norm": 0.22254779934883118, "learning_rate": 2.0060793935441818e-05, "loss": 0.4021, "step": 79995 }, { "epoch": 2.8831945795941905, "grad_norm": 0.24361281096935272, "learning_rate": 2.0057933354381543e-05, "loss": 0.4402, "step": 80000 }, { "epoch": 2.8831945795941905, "eval_loss": 0.43133535981178284, "eval_runtime": 3.5314, "eval_samples_per_second": 28.317, "eval_steps_per_second": 7.079, "step": 80000 }, { "epoch": 2.8833747792554147, "grad_norm": 0.19643712043762207, "learning_rate": 2.0055072840655952e-05, "loss": 0.3874, "step": 80005 }, { "epoch": 2.8835549789166395, "grad_norm": 0.23182834684848785, "learning_rate": 2.005221239430405e-05, "loss": 0.3392, "step": 80010 }, { "epoch": 2.883735178577864, "grad_norm": 0.19430822134017944, "learning_rate": 2.004935201536478e-05, "loss": 0.4039, "step": 80015 }, { "epoch": 2.883915378239089, "grad_norm": 0.2064819037914276, "learning_rate": 2.0046491703877143e-05, "loss": 0.3976, "step": 80020 }, { "epoch": 2.8840955779003137, "grad_norm": 0.22902020812034607, "learning_rate": 2.0043631459880103e-05, "loss": 0.3979, "step": 80025 }, { "epoch": 2.8842757775615384, "grad_norm": 0.26856574416160583, "learning_rate": 2.004077128341261e-05, "loss": 0.3935, "step": 80030 }, { "epoch": 
2.8844559772227627, "grad_norm": 0.2205887734889984, "learning_rate": 2.0037911174513663e-05, "loss": 0.3823, "step": 80035 }, { "epoch": 2.8846361768839874, "grad_norm": 0.17064271867275238, "learning_rate": 2.003505113322221e-05, "loss": 0.348, "step": 80040 }, { "epoch": 2.884816376545212, "grad_norm": 0.2369140088558197, "learning_rate": 2.003219115957722e-05, "loss": 0.3831, "step": 80045 }, { "epoch": 2.8849965762064365, "grad_norm": 0.19578613340854645, "learning_rate": 2.0029331253617666e-05, "loss": 0.3514, "step": 80050 }, { "epoch": 2.885176775867661, "grad_norm": 0.22336263954639435, "learning_rate": 2.0026471415382507e-05, "loss": 0.4095, "step": 80055 }, { "epoch": 2.885356975528886, "grad_norm": 0.20325559377670288, "learning_rate": 2.0023611644910716e-05, "loss": 0.412, "step": 80060 }, { "epoch": 2.8855371751901107, "grad_norm": 0.21424368023872375, "learning_rate": 2.0020751942241255e-05, "loss": 0.3845, "step": 80065 }, { "epoch": 2.8857173748513354, "grad_norm": 0.2809605300426483, "learning_rate": 2.001789230741308e-05, "loss": 0.3961, "step": 80070 }, { "epoch": 2.88589757451256, "grad_norm": 0.18413975834846497, "learning_rate": 2.0015032740465165e-05, "loss": 0.4107, "step": 80075 }, { "epoch": 2.8860777741737844, "grad_norm": 0.28514033555984497, "learning_rate": 2.0012173241436456e-05, "loss": 0.3854, "step": 80080 }, { "epoch": 2.886257973835009, "grad_norm": 0.27392441034317017, "learning_rate": 2.0009313810365925e-05, "loss": 0.3835, "step": 80085 }, { "epoch": 2.886438173496234, "grad_norm": 0.1907772570848465, "learning_rate": 2.000645444729253e-05, "loss": 0.3915, "step": 80090 }, { "epoch": 2.8866183731574586, "grad_norm": 0.2062581181526184, "learning_rate": 2.0003595152255218e-05, "loss": 0.3623, "step": 80095 }, { "epoch": 2.886798572818683, "grad_norm": 0.20222362875938416, "learning_rate": 2.000073592529296e-05, "loss": 0.369, "step": 80100 }, { "epoch": 2.8869787724799076, "grad_norm": 0.21699479222297668, "learning_rate": 
1.9997876766444716e-05, "loss": 0.3965, "step": 80105 }, { "epoch": 2.8871589721411324, "grad_norm": 0.24619553983211517, "learning_rate": 1.999501767574941e-05, "loss": 0.4102, "step": 80110 }, { "epoch": 2.887339171802357, "grad_norm": 0.21754342317581177, "learning_rate": 1.9992158653246042e-05, "loss": 0.3494, "step": 80115 }, { "epoch": 2.887519371463582, "grad_norm": 0.22781917452812195, "learning_rate": 1.9989299698973525e-05, "loss": 0.3932, "step": 80120 }, { "epoch": 2.887699571124806, "grad_norm": 0.17550964653491974, "learning_rate": 1.9986440812970844e-05, "loss": 0.3504, "step": 80125 }, { "epoch": 2.887879770786031, "grad_norm": 0.20450954139232635, "learning_rate": 1.998358199527693e-05, "loss": 0.4168, "step": 80130 }, { "epoch": 2.8880599704472556, "grad_norm": 0.20474529266357422, "learning_rate": 1.9980723245930737e-05, "loss": 0.3699, "step": 80135 }, { "epoch": 2.8882401701084803, "grad_norm": 0.22211550176143646, "learning_rate": 1.9977864564971225e-05, "loss": 0.3711, "step": 80140 }, { "epoch": 2.8884203697697046, "grad_norm": 0.2340330183506012, "learning_rate": 1.9975005952437336e-05, "loss": 0.3975, "step": 80145 }, { "epoch": 2.8886005694309294, "grad_norm": 0.211472749710083, "learning_rate": 1.9972147408368008e-05, "loss": 0.3676, "step": 80150 }, { "epoch": 2.888780769092154, "grad_norm": 0.17939423024654388, "learning_rate": 1.9969288932802205e-05, "loss": 0.3778, "step": 80155 }, { "epoch": 2.888960968753379, "grad_norm": 0.21015068888664246, "learning_rate": 1.996643052577886e-05, "loss": 0.3659, "step": 80160 }, { "epoch": 2.8891411684146036, "grad_norm": 0.20937329530715942, "learning_rate": 1.9963572187336935e-05, "loss": 0.3988, "step": 80165 }, { "epoch": 2.8893213680758283, "grad_norm": 0.17730148136615753, "learning_rate": 1.996071391751536e-05, "loss": 0.3907, "step": 80170 }, { "epoch": 2.8895015677370526, "grad_norm": 0.18675543367862701, "learning_rate": 1.995785571635308e-05, "loss": 0.4226, "step": 80175 }, { "epoch": 
2.8896817673982773, "grad_norm": 0.20549610257148743, "learning_rate": 1.995499758388904e-05, "loss": 0.3824, "step": 80180 }, { "epoch": 2.889861967059502, "grad_norm": 0.21189923584461212, "learning_rate": 1.9952139520162186e-05, "loss": 0.402, "step": 80185 }, { "epoch": 2.8900421667207263, "grad_norm": 0.24374066293239594, "learning_rate": 1.9949281525211446e-05, "loss": 0.4361, "step": 80190 }, { "epoch": 2.890222366381951, "grad_norm": 0.2334618866443634, "learning_rate": 1.9946423599075774e-05, "loss": 0.3988, "step": 80195 }, { "epoch": 2.890402566043176, "grad_norm": 0.25280308723449707, "learning_rate": 1.9943565741794095e-05, "loss": 0.4341, "step": 80200 }, { "epoch": 2.8905827657044005, "grad_norm": 0.21369872987270355, "learning_rate": 1.994070795340536e-05, "loss": 0.365, "step": 80205 }, { "epoch": 2.8907629653656253, "grad_norm": 0.23660530149936676, "learning_rate": 1.9937850233948505e-05, "loss": 0.3767, "step": 80210 }, { "epoch": 2.89094316502685, "grad_norm": 0.2095620483160019, "learning_rate": 1.9934992583462443e-05, "loss": 0.3988, "step": 80215 }, { "epoch": 2.8911233646880743, "grad_norm": 0.22344286739826202, "learning_rate": 1.9932135001986144e-05, "loss": 0.4129, "step": 80220 }, { "epoch": 2.891303564349299, "grad_norm": 0.18746274709701538, "learning_rate": 1.9929277489558517e-05, "loss": 0.41, "step": 80225 }, { "epoch": 2.8914837640105238, "grad_norm": 0.17481876909732819, "learning_rate": 1.9926420046218503e-05, "loss": 0.3326, "step": 80230 }, { "epoch": 2.891663963671748, "grad_norm": 0.185255765914917, "learning_rate": 1.9923562672005033e-05, "loss": 0.4074, "step": 80235 }, { "epoch": 2.891844163332973, "grad_norm": 0.16853371262550354, "learning_rate": 1.9920705366957035e-05, "loss": 0.3553, "step": 80240 }, { "epoch": 2.8920243629941975, "grad_norm": 0.25939464569091797, "learning_rate": 1.991784813111345e-05, "loss": 0.3817, "step": 80245 }, { "epoch": 2.8922045626554223, "grad_norm": 0.20100079476833344, "learning_rate": 
1.9914990964513196e-05, "loss": 0.3873, "step": 80250 }, { "epoch": 2.892384762316647, "grad_norm": 0.16807907819747925, "learning_rate": 1.9912133867195203e-05, "loss": 0.387, "step": 80255 }, { "epoch": 2.8925649619778717, "grad_norm": 0.2342177778482437, "learning_rate": 1.990927683919841e-05, "loss": 0.3891, "step": 80260 }, { "epoch": 2.892745161639096, "grad_norm": 0.27248871326446533, "learning_rate": 1.9906419880561725e-05, "loss": 0.4153, "step": 80265 }, { "epoch": 2.8929253613003207, "grad_norm": 0.33067587018013, "learning_rate": 1.9903562991324088e-05, "loss": 0.4321, "step": 80270 }, { "epoch": 2.8931055609615455, "grad_norm": 0.22115959227085114, "learning_rate": 1.990070617152442e-05, "loss": 0.3908, "step": 80275 }, { "epoch": 2.8932857606227698, "grad_norm": 0.23916247487068176, "learning_rate": 1.9897849421201636e-05, "loss": 0.4196, "step": 80280 }, { "epoch": 2.8934659602839945, "grad_norm": 0.2263944149017334, "learning_rate": 1.9894992740394674e-05, "loss": 0.3994, "step": 80285 }, { "epoch": 2.8936461599452192, "grad_norm": 0.2417251467704773, "learning_rate": 1.9892136129142452e-05, "loss": 0.3894, "step": 80290 }, { "epoch": 2.893826359606444, "grad_norm": 0.18903349339962006, "learning_rate": 1.988927958748387e-05, "loss": 0.3851, "step": 80295 }, { "epoch": 2.8940065592676687, "grad_norm": 0.23545511066913605, "learning_rate": 1.988642311545788e-05, "loss": 0.3862, "step": 80300 }, { "epoch": 2.8941867589288934, "grad_norm": 0.27978673577308655, "learning_rate": 1.9883566713103368e-05, "loss": 0.4131, "step": 80305 }, { "epoch": 2.8943669585901177, "grad_norm": 0.171412855386734, "learning_rate": 1.988071038045928e-05, "loss": 0.3753, "step": 80310 }, { "epoch": 2.8945471582513425, "grad_norm": 0.18040184676647186, "learning_rate": 1.987785411756453e-05, "loss": 0.3975, "step": 80315 }, { "epoch": 2.894727357912567, "grad_norm": 0.2326379269361496, "learning_rate": 1.9874997924458007e-05, "loss": 0.4205, "step": 80320 }, { "epoch": 
2.894907557573792, "grad_norm": 0.24374447762966156, "learning_rate": 1.987214180117866e-05, "loss": 0.4147, "step": 80325 }, { "epoch": 2.895087757235016, "grad_norm": 0.20812088251113892, "learning_rate": 1.9869285747765387e-05, "loss": 0.3989, "step": 80330 }, { "epoch": 2.895267956896241, "grad_norm": 0.22509217262268066, "learning_rate": 1.9866429764257092e-05, "loss": 0.3967, "step": 80335 }, { "epoch": 2.8954481565574657, "grad_norm": 0.244933620095253, "learning_rate": 1.9863573850692706e-05, "loss": 0.3987, "step": 80340 }, { "epoch": 2.8956283562186904, "grad_norm": 0.17772509157657623, "learning_rate": 1.986071800711113e-05, "loss": 0.4069, "step": 80345 }, { "epoch": 2.895808555879915, "grad_norm": 0.21300829946994781, "learning_rate": 1.9857862233551274e-05, "loss": 0.405, "step": 80350 }, { "epoch": 2.8959887555411394, "grad_norm": 0.20396198332309723, "learning_rate": 1.9855006530052055e-05, "loss": 0.3692, "step": 80355 }, { "epoch": 2.896168955202364, "grad_norm": 0.2265167236328125, "learning_rate": 1.985215089665237e-05, "loss": 0.387, "step": 80360 }, { "epoch": 2.896349154863589, "grad_norm": 0.2407812774181366, "learning_rate": 1.9849295333391134e-05, "loss": 0.3945, "step": 80365 }, { "epoch": 2.8965293545248136, "grad_norm": 0.19000792503356934, "learning_rate": 1.984643984030726e-05, "loss": 0.3897, "step": 80370 }, { "epoch": 2.896709554186038, "grad_norm": 0.21029935777187347, "learning_rate": 1.9843584417439633e-05, "loss": 0.4148, "step": 80375 }, { "epoch": 2.8968897538472627, "grad_norm": 0.21043239533901215, "learning_rate": 1.9840729064827173e-05, "loss": 0.3786, "step": 80380 }, { "epoch": 2.8970699535084874, "grad_norm": 0.24211741983890533, "learning_rate": 1.983787378250878e-05, "loss": 0.4162, "step": 80385 }, { "epoch": 2.897250153169712, "grad_norm": 0.20542925596237183, "learning_rate": 1.9835018570523363e-05, "loss": 0.4269, "step": 80390 }, { "epoch": 2.897430352830937, "grad_norm": 0.22718103229999542, "learning_rate": 
1.983216342890982e-05, "loss": 0.3968, "step": 80395 }, { "epoch": 2.897610552492161, "grad_norm": 0.20570969581604004, "learning_rate": 1.9829308357707037e-05, "loss": 0.4045, "step": 80400 }, { "epoch": 2.897790752153386, "grad_norm": 0.26240652799606323, "learning_rate": 1.982645335695394e-05, "loss": 0.3815, "step": 80405 }, { "epoch": 2.8979709518146106, "grad_norm": 0.23519177734851837, "learning_rate": 1.982359842668941e-05, "loss": 0.4191, "step": 80410 }, { "epoch": 2.8981511514758354, "grad_norm": 0.23879577219486237, "learning_rate": 1.9820743566952348e-05, "loss": 0.4106, "step": 80415 }, { "epoch": 2.8983313511370596, "grad_norm": 0.22001482546329498, "learning_rate": 1.981788877778165e-05, "loss": 0.3793, "step": 80420 }, { "epoch": 2.8985115507982844, "grad_norm": 0.1924242228269577, "learning_rate": 1.9815034059216214e-05, "loss": 0.4246, "step": 80425 }, { "epoch": 2.898691750459509, "grad_norm": 0.2032802700996399, "learning_rate": 1.981217941129494e-05, "loss": 0.3554, "step": 80430 }, { "epoch": 2.898871950120734, "grad_norm": 0.2488231360912323, "learning_rate": 1.9809324834056713e-05, "loss": 0.4157, "step": 80435 }, { "epoch": 2.8990521497819586, "grad_norm": 0.20548661053180695, "learning_rate": 1.980647032754043e-05, "loss": 0.3964, "step": 80440 }, { "epoch": 2.8992323494431833, "grad_norm": 0.20527927577495575, "learning_rate": 1.9803615891784987e-05, "loss": 0.3843, "step": 80445 }, { "epoch": 2.8994125491044076, "grad_norm": 0.20890048146247864, "learning_rate": 1.980076152682927e-05, "loss": 0.418, "step": 80450 }, { "epoch": 2.8995927487656323, "grad_norm": 0.23384051024913788, "learning_rate": 1.9797907232712166e-05, "loss": 0.368, "step": 80455 }, { "epoch": 2.899772948426857, "grad_norm": 0.20361943542957306, "learning_rate": 1.9795053009472574e-05, "loss": 0.4239, "step": 80460 }, { "epoch": 2.8999531480880814, "grad_norm": 0.21155259013175964, "learning_rate": 1.9792198857149375e-05, "loss": 0.4109, "step": 80465 }, { "epoch": 
2.900133347749306, "grad_norm": 0.24116067588329315, "learning_rate": 1.978934477578146e-05, "loss": 0.4045, "step": 80470 }, { "epoch": 2.900313547410531, "grad_norm": 0.20510059595108032, "learning_rate": 1.9786490765407713e-05, "loss": 0.4006, "step": 80475 }, { "epoch": 2.9004937470717556, "grad_norm": 0.2083190232515335, "learning_rate": 1.9783636826067015e-05, "loss": 0.3923, "step": 80480 }, { "epoch": 2.9006739467329803, "grad_norm": 0.22528444230556488, "learning_rate": 1.9780782957798263e-05, "loss": 0.3764, "step": 80485 }, { "epoch": 2.900854146394205, "grad_norm": 0.22572170197963715, "learning_rate": 1.9777929160640325e-05, "loss": 0.3875, "step": 80490 }, { "epoch": 2.9010343460554293, "grad_norm": 0.1767890602350235, "learning_rate": 1.97750754346321e-05, "loss": 0.3535, "step": 80495 }, { "epoch": 2.901214545716654, "grad_norm": 0.24823251366615295, "learning_rate": 1.977222177981246e-05, "loss": 0.4085, "step": 80500 }, { "epoch": 2.901214545716654, "eval_loss": 0.43115851283073425, "eval_runtime": 3.533, "eval_samples_per_second": 28.304, "eval_steps_per_second": 7.076, "step": 80500 }, { "epoch": 2.901394745377879, "grad_norm": 0.284284383058548, "learning_rate": 1.9769368196220275e-05, "loss": 0.4208, "step": 80505 }, { "epoch": 2.901574945039103, "grad_norm": 0.22723108530044556, "learning_rate": 1.976651468389445e-05, "loss": 0.415, "step": 80510 }, { "epoch": 2.901755144700328, "grad_norm": 0.2633325159549713, "learning_rate": 1.9763661242873845e-05, "loss": 0.4283, "step": 80515 }, { "epoch": 2.9019353443615525, "grad_norm": 0.2002272605895996, "learning_rate": 1.9760807873197336e-05, "loss": 0.3805, "step": 80520 }, { "epoch": 2.9021155440227773, "grad_norm": 0.1985533982515335, "learning_rate": 1.975795457490381e-05, "loss": 0.3822, "step": 80525 }, { "epoch": 2.902295743684002, "grad_norm": 0.18595746159553528, "learning_rate": 1.9755101348032136e-05, "loss": 0.3732, "step": 80530 }, { "epoch": 2.9024759433452267, "grad_norm": 
0.22532545030117035, "learning_rate": 1.975224819262119e-05, "loss": 0.3716, "step": 80535 }, { "epoch": 2.902656143006451, "grad_norm": 0.20030663907527924, "learning_rate": 1.974939510870985e-05, "loss": 0.3823, "step": 80540 }, { "epoch": 2.9028363426676758, "grad_norm": 0.2480957806110382, "learning_rate": 1.974654209633698e-05, "loss": 0.3838, "step": 80545 }, { "epoch": 2.9030165423289005, "grad_norm": 0.23294273018836975, "learning_rate": 1.9743689155541458e-05, "loss": 0.412, "step": 80550 }, { "epoch": 2.903196741990125, "grad_norm": 0.2072293609380722, "learning_rate": 1.974083628636216e-05, "loss": 0.4121, "step": 80555 }, { "epoch": 2.9033769416513495, "grad_norm": 0.20918940007686615, "learning_rate": 1.973798348883794e-05, "loss": 0.3944, "step": 80560 }, { "epoch": 2.9035571413125743, "grad_norm": 0.24100269377231598, "learning_rate": 1.973513076300768e-05, "loss": 0.416, "step": 80565 }, { "epoch": 2.903737340973799, "grad_norm": 0.1925951987504959, "learning_rate": 1.9732278108910243e-05, "loss": 0.3716, "step": 80570 }, { "epoch": 2.9039175406350237, "grad_norm": 0.21889275312423706, "learning_rate": 1.97294255265845e-05, "loss": 0.4215, "step": 80575 }, { "epoch": 2.9040977402962485, "grad_norm": 0.20681817829608917, "learning_rate": 1.972657301606932e-05, "loss": 0.4319, "step": 80580 }, { "epoch": 2.9042779399574727, "grad_norm": 0.23390960693359375, "learning_rate": 1.9723720577403546e-05, "loss": 0.4255, "step": 80585 }, { "epoch": 2.9044581396186975, "grad_norm": 0.21536414325237274, "learning_rate": 1.9720868210626067e-05, "loss": 0.3939, "step": 80590 }, { "epoch": 2.904638339279922, "grad_norm": 0.245896577835083, "learning_rate": 1.9718015915775743e-05, "loss": 0.3782, "step": 80595 }, { "epoch": 2.904818538941147, "grad_norm": 0.20185402035713196, "learning_rate": 1.9715163692891416e-05, "loss": 0.4153, "step": 80600 }, { "epoch": 2.9049987386023712, "grad_norm": 0.19402766227722168, "learning_rate": 1.971231154201197e-05, "loss": 
0.4159, "step": 80605 }, { "epoch": 2.905178938263596, "grad_norm": 0.23426468670368195, "learning_rate": 1.9709459463176243e-05, "loss": 0.3907, "step": 80610 }, { "epoch": 2.9053591379248207, "grad_norm": 0.2931303381919861, "learning_rate": 1.9706607456423122e-05, "loss": 0.432, "step": 80615 }, { "epoch": 2.9055393375860454, "grad_norm": 0.25184187293052673, "learning_rate": 1.9703755521791445e-05, "loss": 0.3872, "step": 80620 }, { "epoch": 2.90571953724727, "grad_norm": 0.2202758938074112, "learning_rate": 1.970090365932007e-05, "loss": 0.359, "step": 80625 }, { "epoch": 2.9058997369084945, "grad_norm": 0.2390233725309372, "learning_rate": 1.969805186904786e-05, "loss": 0.3898, "step": 80630 }, { "epoch": 2.906079936569719, "grad_norm": 0.1913604885339737, "learning_rate": 1.9695200151013666e-05, "loss": 0.3645, "step": 80635 }, { "epoch": 2.906260136230944, "grad_norm": 0.22707746922969818, "learning_rate": 1.9692348505256335e-05, "loss": 0.3945, "step": 80640 }, { "epoch": 2.9064403358921687, "grad_norm": 0.2409844696521759, "learning_rate": 1.968949693181474e-05, "loss": 0.3892, "step": 80645 }, { "epoch": 2.906620535553393, "grad_norm": 0.21036100387573242, "learning_rate": 1.968664543072771e-05, "loss": 0.3957, "step": 80650 }, { "epoch": 2.9068007352146177, "grad_norm": 0.2081831395626068, "learning_rate": 1.9683794002034115e-05, "loss": 0.3975, "step": 80655 }, { "epoch": 2.9069809348758424, "grad_norm": 0.17619097232818604, "learning_rate": 1.96809426457728e-05, "loss": 0.3864, "step": 80660 }, { "epoch": 2.907161134537067, "grad_norm": 0.25311923027038574, "learning_rate": 1.9678091361982602e-05, "loss": 0.3982, "step": 80665 }, { "epoch": 2.907341334198292, "grad_norm": 0.25830528140068054, "learning_rate": 1.967524015070239e-05, "loss": 0.4033, "step": 80670 }, { "epoch": 2.9075215338595166, "grad_norm": 0.20592638850212097, "learning_rate": 1.967238901197099e-05, "loss": 0.3828, "step": 80675 }, { "epoch": 2.907701733520741, "grad_norm": 
0.19659774005413055, "learning_rate": 1.9669537945827265e-05, "loss": 0.3768, "step": 80680 }, { "epoch": 2.9078819331819656, "grad_norm": 0.24102912843227386, "learning_rate": 1.9666686952310057e-05, "loss": 0.4266, "step": 80685 }, { "epoch": 2.9080621328431904, "grad_norm": 0.18135298788547516, "learning_rate": 1.9663836031458195e-05, "loss": 0.4413, "step": 80690 }, { "epoch": 2.9082423325044147, "grad_norm": 0.2321069985628128, "learning_rate": 1.9660985183310543e-05, "loss": 0.4058, "step": 80695 }, { "epoch": 2.9084225321656394, "grad_norm": 0.21384575963020325, "learning_rate": 1.9658134407905935e-05, "loss": 0.382, "step": 80700 }, { "epoch": 2.908602731826864, "grad_norm": 0.2350618541240692, "learning_rate": 1.9655283705283205e-05, "loss": 0.4001, "step": 80705 }, { "epoch": 2.908782931488089, "grad_norm": 0.19745108485221863, "learning_rate": 1.9652433075481202e-05, "loss": 0.425, "step": 80710 }, { "epoch": 2.9089631311493136, "grad_norm": 0.21076776087284088, "learning_rate": 1.9649582518538757e-05, "loss": 0.4067, "step": 80715 }, { "epoch": 2.9091433308105383, "grad_norm": 0.20827296376228333, "learning_rate": 1.9646732034494726e-05, "loss": 0.3839, "step": 80720 }, { "epoch": 2.9093235304717626, "grad_norm": 0.18359017372131348, "learning_rate": 1.964388162338793e-05, "loss": 0.3977, "step": 80725 }, { "epoch": 2.9095037301329874, "grad_norm": 0.23850063979625702, "learning_rate": 1.9641031285257205e-05, "loss": 0.4165, "step": 80730 }, { "epoch": 2.909683929794212, "grad_norm": 0.2121899276971817, "learning_rate": 1.96381810201414e-05, "loss": 0.3906, "step": 80735 }, { "epoch": 2.9098641294554364, "grad_norm": 0.26047274470329285, "learning_rate": 1.9635330828079335e-05, "loss": 0.3992, "step": 80740 }, { "epoch": 2.910044329116661, "grad_norm": 0.16215106844902039, "learning_rate": 1.9632480709109845e-05, "loss": 0.4113, "step": 80745 }, { "epoch": 2.910224528777886, "grad_norm": 0.2457340657711029, "learning_rate": 1.9629630663271776e-05, 
"loss": 0.4222, "step": 80750 }, { "epoch": 2.9104047284391106, "grad_norm": 0.24519392848014832, "learning_rate": 1.962678069060394e-05, "loss": 0.4321, "step": 80755 }, { "epoch": 2.9105849281003353, "grad_norm": 0.169399693608284, "learning_rate": 1.9623930791145184e-05, "loss": 0.3873, "step": 80760 }, { "epoch": 2.91076512776156, "grad_norm": 0.20469214022159576, "learning_rate": 1.9621080964934326e-05, "loss": 0.3688, "step": 80765 }, { "epoch": 2.9109453274227843, "grad_norm": 0.21101103723049164, "learning_rate": 1.9618231212010195e-05, "loss": 0.4403, "step": 80770 }, { "epoch": 2.911125527084009, "grad_norm": 0.193876713514328, "learning_rate": 1.9615381532411632e-05, "loss": 0.3991, "step": 80775 }, { "epoch": 2.911305726745234, "grad_norm": 0.1711498647928238, "learning_rate": 1.9612531926177453e-05, "loss": 0.3954, "step": 80780 }, { "epoch": 2.911485926406458, "grad_norm": 0.2698494791984558, "learning_rate": 1.960968239334647e-05, "loss": 0.4084, "step": 80785 }, { "epoch": 2.911666126067683, "grad_norm": 0.22966143488883972, "learning_rate": 1.9606832933957536e-05, "loss": 0.3625, "step": 80790 }, { "epoch": 2.9118463257289076, "grad_norm": 0.22466062009334564, "learning_rate": 1.9603983548049444e-05, "loss": 0.3633, "step": 80795 }, { "epoch": 2.9120265253901323, "grad_norm": 0.23133142292499542, "learning_rate": 1.9601134235661047e-05, "loss": 0.3741, "step": 80800 }, { "epoch": 2.912206725051357, "grad_norm": 0.22116337716579437, "learning_rate": 1.9598284996831145e-05, "loss": 0.3696, "step": 80805 }, { "epoch": 2.9123869247125818, "grad_norm": 0.22359761595726013, "learning_rate": 1.9595435831598558e-05, "loss": 0.4236, "step": 80810 }, { "epoch": 2.912567124373806, "grad_norm": 0.24041809141635895, "learning_rate": 1.9592586740002116e-05, "loss": 0.3961, "step": 80815 }, { "epoch": 2.912747324035031, "grad_norm": 0.23116619884967804, "learning_rate": 1.9589737722080637e-05, "loss": 0.3997, "step": 80820 }, { "epoch": 2.9129275236962555, 
"grad_norm": 0.23118247091770172, "learning_rate": 1.9586888777872925e-05, "loss": 0.389, "step": 80825 }, { "epoch": 2.9131077233574802, "grad_norm": 0.2595424950122833, "learning_rate": 1.9584039907417812e-05, "loss": 0.4273, "step": 80830 }, { "epoch": 2.9132879230187045, "grad_norm": 0.1928250938653946, "learning_rate": 1.95811911107541e-05, "loss": 0.3858, "step": 80835 }, { "epoch": 2.9134681226799293, "grad_norm": 0.22600649297237396, "learning_rate": 1.957834238792062e-05, "loss": 0.402, "step": 80840 }, { "epoch": 2.913648322341154, "grad_norm": 0.1833333820104599, "learning_rate": 1.9575493738956168e-05, "loss": 0.4063, "step": 80845 }, { "epoch": 2.9138285220023787, "grad_norm": 0.21622666716575623, "learning_rate": 1.9572645163899563e-05, "loss": 0.4199, "step": 80850 }, { "epoch": 2.9140087216636035, "grad_norm": 0.21689878404140472, "learning_rate": 1.9569796662789623e-05, "loss": 0.4067, "step": 80855 }, { "epoch": 2.9141889213248278, "grad_norm": 0.21402768790721893, "learning_rate": 1.9566948235665144e-05, "loss": 0.3951, "step": 80860 }, { "epoch": 2.9143691209860525, "grad_norm": 0.24621036648750305, "learning_rate": 1.956409988256495e-05, "loss": 0.3993, "step": 80865 }, { "epoch": 2.9145493206472772, "grad_norm": 0.21831317245960236, "learning_rate": 1.9561251603527846e-05, "loss": 0.3955, "step": 80870 }, { "epoch": 2.914729520308502, "grad_norm": 0.20578089356422424, "learning_rate": 1.9558403398592625e-05, "loss": 0.4011, "step": 80875 }, { "epoch": 2.9149097199697263, "grad_norm": 0.18618744611740112, "learning_rate": 1.955555526779811e-05, "loss": 0.4194, "step": 80880 }, { "epoch": 2.915089919630951, "grad_norm": 0.21536029875278473, "learning_rate": 1.9552707211183107e-05, "loss": 0.3765, "step": 80885 }, { "epoch": 2.9152701192921757, "grad_norm": 0.20636074244976044, "learning_rate": 1.95498592287864e-05, "loss": 0.3903, "step": 80890 }, { "epoch": 2.9154503189534005, "grad_norm": 0.201945498585701, "learning_rate": 
1.9547011320646817e-05, "loss": 0.3891, "step": 80895 }, { "epoch": 2.915630518614625, "grad_norm": 0.16355274617671967, "learning_rate": 1.9544163486803134e-05, "loss": 0.4209, "step": 80900 }, { "epoch": 2.9158107182758495, "grad_norm": 0.23773586750030518, "learning_rate": 1.9541315727294188e-05, "loss": 0.4007, "step": 80905 }, { "epoch": 2.915990917937074, "grad_norm": 0.1974399834871292, "learning_rate": 1.953846804215875e-05, "loss": 0.3999, "step": 80910 }, { "epoch": 2.916171117598299, "grad_norm": 0.24811916053295135, "learning_rate": 1.9535620431435623e-05, "loss": 0.3971, "step": 80915 }, { "epoch": 2.9163513172595237, "grad_norm": 0.20471271872520447, "learning_rate": 1.9532772895163616e-05, "loss": 0.4142, "step": 80920 }, { "epoch": 2.916531516920748, "grad_norm": 0.19927844405174255, "learning_rate": 1.9529925433381517e-05, "loss": 0.354, "step": 80925 }, { "epoch": 2.9167117165819727, "grad_norm": 0.18068204820156097, "learning_rate": 1.9527078046128118e-05, "loss": 0.3941, "step": 80930 }, { "epoch": 2.9168919162431974, "grad_norm": 0.21484220027923584, "learning_rate": 1.952423073344223e-05, "loss": 0.3866, "step": 80935 }, { "epoch": 2.917072115904422, "grad_norm": 0.21851830184459686, "learning_rate": 1.9521383495362634e-05, "loss": 0.3903, "step": 80940 }, { "epoch": 2.917252315565647, "grad_norm": 0.2002904862165451, "learning_rate": 1.9518536331928127e-05, "loss": 0.3791, "step": 80945 }, { "epoch": 2.9174325152268716, "grad_norm": 0.23152127861976624, "learning_rate": 1.9515689243177508e-05, "loss": 0.4406, "step": 80950 }, { "epoch": 2.917612714888096, "grad_norm": 0.1651298850774765, "learning_rate": 1.9512842229149553e-05, "loss": 0.3888, "step": 80955 }, { "epoch": 2.9177929145493207, "grad_norm": 0.17865732312202454, "learning_rate": 1.950999528988306e-05, "loss": 0.3686, "step": 80960 }, { "epoch": 2.9179731142105454, "grad_norm": 0.2451142817735672, "learning_rate": 1.9507148425416832e-05, "loss": 0.423, "step": 80965 }, { "epoch": 
2.9181533138717697, "grad_norm": 0.16584070026874542, "learning_rate": 1.9504301635789623e-05, "loss": 0.3668, "step": 80970 }, { "epoch": 2.9183335135329944, "grad_norm": 0.22449831664562225, "learning_rate": 1.9501454921040256e-05, "loss": 0.3977, "step": 80975 }, { "epoch": 2.918513713194219, "grad_norm": 0.21804749965667725, "learning_rate": 1.949860828120749e-05, "loss": 0.3792, "step": 80980 }, { "epoch": 2.918693912855444, "grad_norm": 0.20953933894634247, "learning_rate": 1.9495761716330133e-05, "loss": 0.4228, "step": 80985 }, { "epoch": 2.9188741125166686, "grad_norm": 0.20794197916984558, "learning_rate": 1.949291522644695e-05, "loss": 0.4034, "step": 80990 }, { "epoch": 2.9190543121778934, "grad_norm": 0.2192896008491516, "learning_rate": 1.949006881159673e-05, "loss": 0.4113, "step": 80995 }, { "epoch": 2.9192345118391176, "grad_norm": 0.18000654876232147, "learning_rate": 1.9487222471818255e-05, "loss": 0.4156, "step": 81000 }, { "epoch": 2.9192345118391176, "eval_loss": 0.4301445782184601, "eval_runtime": 3.5355, "eval_samples_per_second": 28.285, "eval_steps_per_second": 7.071, "step": 81000 }, { "epoch": 2.9194147115003424, "grad_norm": 0.292233943939209, "learning_rate": 1.9484376207150314e-05, "loss": 0.3844, "step": 81005 }, { "epoch": 2.919594911161567, "grad_norm": 0.21577784419059753, "learning_rate": 1.948153001763167e-05, "loss": 0.3826, "step": 81010 }, { "epoch": 2.9197751108227914, "grad_norm": 0.2999797463417053, "learning_rate": 1.9478683903301116e-05, "loss": 0.4494, "step": 81015 }, { "epoch": 2.919955310484016, "grad_norm": 0.24274799227714539, "learning_rate": 1.9475837864197418e-05, "loss": 0.3721, "step": 81020 }, { "epoch": 2.920135510145241, "grad_norm": 0.2308402806520462, "learning_rate": 1.947299190035937e-05, "loss": 0.4185, "step": 81025 }, { "epoch": 2.9203157098064656, "grad_norm": 0.2198919653892517, "learning_rate": 1.9470146011825733e-05, "loss": 0.3646, "step": 81030 }, { "epoch": 2.9204959094676903, "grad_norm": 
0.21217882633209229, "learning_rate": 1.946730019863528e-05, "loss": 0.3997, "step": 81035 }, { "epoch": 2.920676109128915, "grad_norm": 0.17091058194637299, "learning_rate": 1.94644544608268e-05, "loss": 0.3784, "step": 81040 }, { "epoch": 2.9208563087901394, "grad_norm": 0.23678135871887207, "learning_rate": 1.9461608798439055e-05, "loss": 0.3841, "step": 81045 }, { "epoch": 2.921036508451364, "grad_norm": 0.2015039473772049, "learning_rate": 1.945876321151081e-05, "loss": 0.4026, "step": 81050 }, { "epoch": 2.921216708112589, "grad_norm": 0.2211407572031021, "learning_rate": 1.945591770008085e-05, "loss": 0.3967, "step": 81055 }, { "epoch": 2.921396907773813, "grad_norm": 0.21739064157009125, "learning_rate": 1.945307226418793e-05, "loss": 0.3956, "step": 81060 }, { "epoch": 2.921577107435038, "grad_norm": 0.2019902616739273, "learning_rate": 1.945022690387084e-05, "loss": 0.4075, "step": 81065 }, { "epoch": 2.9217573070962626, "grad_norm": 0.1831779032945633, "learning_rate": 1.9447381619168332e-05, "loss": 0.3816, "step": 81070 }, { "epoch": 2.9219375067574873, "grad_norm": 0.21984632313251495, "learning_rate": 1.944453641011916e-05, "loss": 0.4448, "step": 81075 }, { "epoch": 2.922117706418712, "grad_norm": 0.19661669433116913, "learning_rate": 1.9441691276762123e-05, "loss": 0.41, "step": 81080 }, { "epoch": 2.922297906079937, "grad_norm": 0.2180146425962448, "learning_rate": 1.9438846219135948e-05, "loss": 0.3727, "step": 81085 }, { "epoch": 2.922478105741161, "grad_norm": 0.2767757475376129, "learning_rate": 1.9436001237279432e-05, "loss": 0.4416, "step": 81090 }, { "epoch": 2.922658305402386, "grad_norm": 0.20873478055000305, "learning_rate": 1.9433156331231314e-05, "loss": 0.3978, "step": 81095 }, { "epoch": 2.9228385050636105, "grad_norm": 0.26107144355773926, "learning_rate": 1.9430311501030362e-05, "loss": 0.3829, "step": 81100 }, { "epoch": 2.9230187047248353, "grad_norm": 0.267535924911499, "learning_rate": 1.9427466746715342e-05, "loss": 0.4117, 
"step": 81105 }, { "epoch": 2.9231989043860596, "grad_norm": 0.21252234280109406, "learning_rate": 1.942462206832501e-05, "loss": 0.379, "step": 81110 }, { "epoch": 2.9233791040472843, "grad_norm": 0.1903425008058548, "learning_rate": 1.9421777465898114e-05, "loss": 0.352, "step": 81115 }, { "epoch": 2.923559303708509, "grad_norm": 0.1975475549697876, "learning_rate": 1.9418932939473426e-05, "loss": 0.3898, "step": 81120 }, { "epoch": 2.9237395033697338, "grad_norm": 0.22255592048168182, "learning_rate": 1.941608848908969e-05, "loss": 0.3482, "step": 81125 }, { "epoch": 2.9239197030309585, "grad_norm": 0.24407097697257996, "learning_rate": 1.941324411478568e-05, "loss": 0.3865, "step": 81130 }, { "epoch": 2.924099902692183, "grad_norm": 0.18102699518203735, "learning_rate": 1.9410399816600128e-05, "loss": 0.3786, "step": 81135 }, { "epoch": 2.9242801023534075, "grad_norm": 0.23479129374027252, "learning_rate": 1.9407555594571796e-05, "loss": 0.3847, "step": 81140 }, { "epoch": 2.9244603020146323, "grad_norm": 0.20247110724449158, "learning_rate": 1.9404711448739442e-05, "loss": 0.3794, "step": 81145 }, { "epoch": 2.924640501675857, "grad_norm": 0.2692834436893463, "learning_rate": 1.9401867379141815e-05, "loss": 0.4258, "step": 81150 }, { "epoch": 2.9248207013370813, "grad_norm": 0.20718295872211456, "learning_rate": 1.9399023385817644e-05, "loss": 0.4251, "step": 81155 }, { "epoch": 2.925000900998306, "grad_norm": 0.18877676129341125, "learning_rate": 1.939617946880571e-05, "loss": 0.3919, "step": 81160 }, { "epoch": 2.9251811006595307, "grad_norm": 0.2178078591823578, "learning_rate": 1.939333562814473e-05, "loss": 0.4156, "step": 81165 }, { "epoch": 2.9253613003207555, "grad_norm": 0.17405816912651062, "learning_rate": 1.9390491863873482e-05, "loss": 0.371, "step": 81170 }, { "epoch": 2.92554149998198, "grad_norm": 0.20669405162334442, "learning_rate": 1.9387648176030697e-05, "loss": 0.389, "step": 81175 }, { "epoch": 2.925721699643205, "grad_norm": 
0.20877471566200256, "learning_rate": 1.9384804564655106e-05, "loss": 0.3857, "step": 81180 }, { "epoch": 2.9259018993044292, "grad_norm": 0.2562907934188843, "learning_rate": 1.9381961029785485e-05, "loss": 0.396, "step": 81185 }, { "epoch": 2.926082098965654, "grad_norm": 0.2052457183599472, "learning_rate": 1.9379117571460545e-05, "loss": 0.3769, "step": 81190 }, { "epoch": 2.9262622986268787, "grad_norm": 0.26629289984703064, "learning_rate": 1.9376274189719034e-05, "loss": 0.4164, "step": 81195 }, { "epoch": 2.926442498288103, "grad_norm": 0.23448923230171204, "learning_rate": 1.9373430884599707e-05, "loss": 0.3989, "step": 81200 }, { "epoch": 2.9266226979493277, "grad_norm": 0.2015734761953354, "learning_rate": 1.937058765614129e-05, "loss": 0.4129, "step": 81205 }, { "epoch": 2.9268028976105525, "grad_norm": 0.3036739230155945, "learning_rate": 1.936774450438253e-05, "loss": 0.3919, "step": 81210 }, { "epoch": 2.926983097271777, "grad_norm": 0.19649726152420044, "learning_rate": 1.9364901429362164e-05, "loss": 0.4088, "step": 81215 }, { "epoch": 2.927163296933002, "grad_norm": 0.2458265870809555, "learning_rate": 1.9362058431118917e-05, "loss": 0.4255, "step": 81220 }, { "epoch": 2.9273434965942267, "grad_norm": 0.22339041531085968, "learning_rate": 1.935921550969154e-05, "loss": 0.3925, "step": 81225 }, { "epoch": 2.927523696255451, "grad_norm": 0.18713003396987915, "learning_rate": 1.9356372665118754e-05, "loss": 0.3988, "step": 81230 }, { "epoch": 2.9277038959166757, "grad_norm": 0.2202412635087967, "learning_rate": 1.9353529897439298e-05, "loss": 0.3597, "step": 81235 }, { "epoch": 2.9278840955779004, "grad_norm": 0.21688272058963776, "learning_rate": 1.9350687206691904e-05, "loss": 0.4228, "step": 81240 }, { "epoch": 2.9280642952391247, "grad_norm": 0.22464942932128906, "learning_rate": 1.9347844592915303e-05, "loss": 0.4061, "step": 81245 }, { "epoch": 2.9282444949003494, "grad_norm": 0.18823909759521484, "learning_rate": 1.934500205614823e-05, "loss": 
0.4211, "step": 81250 }, { "epoch": 2.928424694561574, "grad_norm": 0.1837102770805359, "learning_rate": 1.9342159596429414e-05, "loss": 0.3719, "step": 81255 }, { "epoch": 2.928604894222799, "grad_norm": 0.19186070561408997, "learning_rate": 1.933931721379756e-05, "loss": 0.3568, "step": 81260 }, { "epoch": 2.9287850938840236, "grad_norm": 0.2203979641199112, "learning_rate": 1.933647490829143e-05, "loss": 0.3601, "step": 81265 }, { "epoch": 2.9289652935452484, "grad_norm": 0.1754385232925415, "learning_rate": 1.9333632679949713e-05, "loss": 0.3799, "step": 81270 }, { "epoch": 2.9291454932064727, "grad_norm": 0.24749146401882172, "learning_rate": 1.9330790528811177e-05, "loss": 0.3833, "step": 81275 }, { "epoch": 2.9293256928676974, "grad_norm": 0.19745169579982758, "learning_rate": 1.9327948454914514e-05, "loss": 0.3694, "step": 81280 }, { "epoch": 2.929505892528922, "grad_norm": 0.24451091885566711, "learning_rate": 1.932510645829845e-05, "loss": 0.3911, "step": 81285 }, { "epoch": 2.9296860921901464, "grad_norm": 0.22178997099399567, "learning_rate": 1.9322264539001716e-05, "loss": 0.3695, "step": 81290 }, { "epoch": 2.929866291851371, "grad_norm": 0.2586722671985626, "learning_rate": 1.9319422697063027e-05, "loss": 0.402, "step": 81295 }, { "epoch": 2.930046491512596, "grad_norm": 0.21260590851306915, "learning_rate": 1.9316580932521102e-05, "loss": 0.4434, "step": 81300 }, { "epoch": 2.9302266911738206, "grad_norm": 0.21419310569763184, "learning_rate": 1.9313739245414665e-05, "loss": 0.3775, "step": 81305 }, { "epoch": 2.9304068908350454, "grad_norm": 0.18965986371040344, "learning_rate": 1.9310897635782426e-05, "loss": 0.4149, "step": 81310 }, { "epoch": 2.93058709049627, "grad_norm": 0.2388392835855484, "learning_rate": 1.930805610366311e-05, "loss": 0.4024, "step": 81315 }, { "epoch": 2.9307672901574944, "grad_norm": 0.21854186058044434, "learning_rate": 1.9305214649095426e-05, "loss": 0.3543, "step": 81320 }, { "epoch": 2.930947489818719, "grad_norm": 
0.18977004289627075, "learning_rate": 1.930237327211809e-05, "loss": 0.3614, "step": 81325 }, { "epoch": 2.931127689479944, "grad_norm": 0.20082025229930878, "learning_rate": 1.9299531972769813e-05, "loss": 0.3993, "step": 81330 }, { "epoch": 2.9313078891411686, "grad_norm": 0.20026427507400513, "learning_rate": 1.9296690751089312e-05, "loss": 0.3545, "step": 81335 }, { "epoch": 2.931488088802393, "grad_norm": 0.22359101474285126, "learning_rate": 1.9293849607115292e-05, "loss": 0.3945, "step": 81340 }, { "epoch": 2.9316682884636176, "grad_norm": 0.2114863246679306, "learning_rate": 1.929100854088647e-05, "loss": 0.3842, "step": 81345 }, { "epoch": 2.9318484881248423, "grad_norm": 0.22665932774543762, "learning_rate": 1.928816755244155e-05, "loss": 0.3776, "step": 81350 }, { "epoch": 2.932028687786067, "grad_norm": 0.27405086159706116, "learning_rate": 1.928532664181924e-05, "loss": 0.4077, "step": 81355 }, { "epoch": 2.932208887447292, "grad_norm": 0.16725726425647736, "learning_rate": 1.9282485809058254e-05, "loss": 0.3516, "step": 81360 }, { "epoch": 2.932389087108516, "grad_norm": 0.22895494103431702, "learning_rate": 1.927964505419728e-05, "loss": 0.3507, "step": 81365 }, { "epoch": 2.932569286769741, "grad_norm": 0.2224702686071396, "learning_rate": 1.9276804377275047e-05, "loss": 0.3727, "step": 81370 }, { "epoch": 2.9327494864309656, "grad_norm": 0.2331514060497284, "learning_rate": 1.927396377833024e-05, "loss": 0.4005, "step": 81375 }, { "epoch": 2.9329296860921903, "grad_norm": 0.18143446743488312, "learning_rate": 1.9271123257401568e-05, "loss": 0.3625, "step": 81380 }, { "epoch": 2.9331098857534146, "grad_norm": 0.20633216202259064, "learning_rate": 1.9268282814527737e-05, "loss": 0.3695, "step": 81385 }, { "epoch": 2.9332900854146393, "grad_norm": 0.2796785235404968, "learning_rate": 1.9265442449747432e-05, "loss": 0.4057, "step": 81390 }, { "epoch": 2.933470285075864, "grad_norm": 0.23631520569324493, "learning_rate": 1.9262602163099375e-05, "loss": 
0.3928, "step": 81395 }, { "epoch": 2.933650484737089, "grad_norm": 0.2131415605545044, "learning_rate": 1.925976195462225e-05, "loss": 0.3849, "step": 81400 }, { "epoch": 2.9338306843983135, "grad_norm": 0.2613097131252289, "learning_rate": 1.925692182435475e-05, "loss": 0.418, "step": 81405 }, { "epoch": 2.934010884059538, "grad_norm": 0.24537603557109833, "learning_rate": 1.9254081772335586e-05, "loss": 0.4091, "step": 81410 }, { "epoch": 2.9341910837207625, "grad_norm": 0.19642150402069092, "learning_rate": 1.9251241798603447e-05, "loss": 0.4022, "step": 81415 }, { "epoch": 2.9343712833819873, "grad_norm": 0.1668914407491684, "learning_rate": 1.9248401903197015e-05, "loss": 0.3614, "step": 81420 }, { "epoch": 2.934551483043212, "grad_norm": 0.20156939327716827, "learning_rate": 1.9245562086155004e-05, "loss": 0.4329, "step": 81425 }, { "epoch": 2.9347316827044363, "grad_norm": 0.22902165353298187, "learning_rate": 1.9242722347516088e-05, "loss": 0.3377, "step": 81430 }, { "epoch": 2.934911882365661, "grad_norm": 0.1990087330341339, "learning_rate": 1.9239882687318972e-05, "loss": 0.4288, "step": 81435 }, { "epoch": 2.9350920820268858, "grad_norm": 0.1762835681438446, "learning_rate": 1.9237043105602342e-05, "loss": 0.3982, "step": 81440 }, { "epoch": 2.9352722816881105, "grad_norm": 0.23856672644615173, "learning_rate": 1.923420360240487e-05, "loss": 0.4047, "step": 81445 }, { "epoch": 2.9354524813493352, "grad_norm": 0.19141538441181183, "learning_rate": 1.923136417776527e-05, "loss": 0.4154, "step": 81450 }, { "epoch": 2.93563268101056, "grad_norm": 0.21707025170326233, "learning_rate": 1.9228524831722206e-05, "loss": 0.3691, "step": 81455 }, { "epoch": 2.9358128806717843, "grad_norm": 0.2077445536851883, "learning_rate": 1.922568556431438e-05, "loss": 0.3898, "step": 81460 }, { "epoch": 2.935993080333009, "grad_norm": 0.17210866510868073, "learning_rate": 1.9222846375580478e-05, "loss": 0.3616, "step": 81465 }, { "epoch": 2.9361732799942337, "grad_norm": 
0.1976180225610733, "learning_rate": 1.922000726555916e-05, "loss": 0.3905, "step": 81470 }, { "epoch": 2.936353479655458, "grad_norm": 0.2516196072101593, "learning_rate": 1.9217168234289136e-05, "loss": 0.4011, "step": 81475 }, { "epoch": 2.9365336793166827, "grad_norm": 0.2320529818534851, "learning_rate": 1.9214329281809074e-05, "loss": 0.3951, "step": 81480 }, { "epoch": 2.9367138789779075, "grad_norm": 0.19517144560813904, "learning_rate": 1.921149040815765e-05, "loss": 0.3724, "step": 81485 }, { "epoch": 2.936894078639132, "grad_norm": 0.21140725910663605, "learning_rate": 1.9208651613373553e-05, "loss": 0.3686, "step": 81490 }, { "epoch": 2.937074278300357, "grad_norm": 0.19208674132823944, "learning_rate": 1.9205812897495448e-05, "loss": 0.3837, "step": 81495 }, { "epoch": 2.9372544779615817, "grad_norm": 0.19858016073703766, "learning_rate": 1.920297426056203e-05, "loss": 0.3975, "step": 81500 }, { "epoch": 2.9372544779615817, "eval_loss": 0.43074142932891846, "eval_runtime": 3.5337, "eval_samples_per_second": 28.299, "eval_steps_per_second": 7.075, "step": 81500 }, { "epoch": 2.937434677622806, "grad_norm": 0.20274978876113892, "learning_rate": 1.920013570261196e-05, "loss": 0.3892, "step": 81505 }, { "epoch": 2.9376148772840307, "grad_norm": 0.2162855863571167, "learning_rate": 1.919729722368392e-05, "loss": 0.4381, "step": 81510 }, { "epoch": 2.9377950769452554, "grad_norm": 0.212267205119133, "learning_rate": 1.9194458823816583e-05, "loss": 0.3685, "step": 81515 }, { "epoch": 2.9379752766064797, "grad_norm": 0.18356037139892578, "learning_rate": 1.919162050304862e-05, "loss": 0.3908, "step": 81520 }, { "epoch": 2.9381554762677045, "grad_norm": 0.23290087282657623, "learning_rate": 1.91887822614187e-05, "loss": 0.4314, "step": 81525 }, { "epoch": 2.938335675928929, "grad_norm": 0.2154729664325714, "learning_rate": 1.9185944098965502e-05, "loss": 0.3872, "step": 81530 }, { "epoch": 2.938515875590154, "grad_norm": 0.19503213465213776, "learning_rate": 
1.918310601572768e-05, "loss": 0.3756, "step": 81535 }, { "epoch": 2.9386960752513787, "grad_norm": 0.22458268702030182, "learning_rate": 1.9180268011743925e-05, "loss": 0.3672, "step": 81540 }, { "epoch": 2.9388762749126034, "grad_norm": 0.20593036711215973, "learning_rate": 1.9177430087052892e-05, "loss": 0.4, "step": 81545 }, { "epoch": 2.9390564745738277, "grad_norm": 0.21392624080181122, "learning_rate": 1.917459224169323e-05, "loss": 0.3966, "step": 81550 }, { "epoch": 2.9392366742350524, "grad_norm": 0.19831807911396027, "learning_rate": 1.917175447570364e-05, "loss": 0.3983, "step": 81555 }, { "epoch": 2.939416873896277, "grad_norm": 0.20181818306446075, "learning_rate": 1.916891678912276e-05, "loss": 0.3699, "step": 81560 }, { "epoch": 2.9395970735575014, "grad_norm": 0.19284601509571075, "learning_rate": 1.916607918198926e-05, "loss": 0.3628, "step": 81565 }, { "epoch": 2.939777273218726, "grad_norm": 0.20783284306526184, "learning_rate": 1.91632416543418e-05, "loss": 0.4283, "step": 81570 }, { "epoch": 2.939957472879951, "grad_norm": 0.20689019560813904, "learning_rate": 1.916040420621904e-05, "loss": 0.3616, "step": 81575 }, { "epoch": 2.9401376725411756, "grad_norm": 0.3156082332134247, "learning_rate": 1.9157566837659645e-05, "loss": 0.4052, "step": 81580 }, { "epoch": 2.9403178722024004, "grad_norm": 0.23160193860530853, "learning_rate": 1.9154729548702272e-05, "loss": 0.3514, "step": 81585 }, { "epoch": 2.940498071863625, "grad_norm": 0.18180207908153534, "learning_rate": 1.9151892339385573e-05, "loss": 0.4118, "step": 81590 }, { "epoch": 2.9406782715248494, "grad_norm": 0.17559707164764404, "learning_rate": 1.9149055209748214e-05, "loss": 0.3459, "step": 81595 }, { "epoch": 2.940858471186074, "grad_norm": 0.2852272689342499, "learning_rate": 1.9146218159828845e-05, "loss": 0.4089, "step": 81600 }, { "epoch": 2.941038670847299, "grad_norm": 0.19265016913414001, "learning_rate": 1.914338118966611e-05, "loss": 0.4113, "step": 81605 }, { "epoch": 
2.9412188705085236, "grad_norm": 0.27521640062332153, "learning_rate": 1.9140544299298687e-05, "loss": 0.4314, "step": 81610 }, { "epoch": 2.941399070169748, "grad_norm": 0.22525955736637115, "learning_rate": 1.91377074887652e-05, "loss": 0.4097, "step": 81615 }, { "epoch": 2.9415792698309726, "grad_norm": 0.1709366887807846, "learning_rate": 1.9134870758104324e-05, "loss": 0.3707, "step": 81620 }, { "epoch": 2.9417594694921974, "grad_norm": 0.20338067412376404, "learning_rate": 1.9132034107354693e-05, "loss": 0.4064, "step": 81625 }, { "epoch": 2.941939669153422, "grad_norm": 0.21200573444366455, "learning_rate": 1.912919753655496e-05, "loss": 0.3752, "step": 81630 }, { "epoch": 2.942119868814647, "grad_norm": 0.19498129189014435, "learning_rate": 1.9126361045743778e-05, "loss": 0.3777, "step": 81635 }, { "epoch": 2.942300068475871, "grad_norm": 0.24946102499961853, "learning_rate": 1.9123524634959794e-05, "loss": 0.4022, "step": 81640 }, { "epoch": 2.942480268137096, "grad_norm": 0.25150594115257263, "learning_rate": 1.9120688304241636e-05, "loss": 0.3575, "step": 81645 }, { "epoch": 2.9426604677983206, "grad_norm": 0.23567576706409454, "learning_rate": 1.9117852053627978e-05, "loss": 0.4141, "step": 81650 }, { "epoch": 2.9428406674595453, "grad_norm": 0.218300923705101, "learning_rate": 1.911501588315743e-05, "loss": 0.3508, "step": 81655 }, { "epoch": 2.9430208671207696, "grad_norm": 0.24794436991214752, "learning_rate": 1.9112179792868662e-05, "loss": 0.3964, "step": 81660 }, { "epoch": 2.9432010667819943, "grad_norm": 0.1559266299009323, "learning_rate": 1.9109343782800305e-05, "loss": 0.4062, "step": 81665 }, { "epoch": 2.943381266443219, "grad_norm": 0.25583502650260925, "learning_rate": 1.910650785299099e-05, "loss": 0.3751, "step": 81670 }, { "epoch": 2.943561466104444, "grad_norm": 0.2357344627380371, "learning_rate": 1.9103672003479372e-05, "loss": 0.3932, "step": 81675 }, { "epoch": 2.9437416657656685, "grad_norm": 0.2820563018321991, "learning_rate": 
1.9100836234304076e-05, "loss": 0.3736, "step": 81680 }, { "epoch": 2.9439218654268933, "grad_norm": 0.227774977684021, "learning_rate": 1.9098000545503745e-05, "loss": 0.4056, "step": 81685 }, { "epoch": 2.9441020650881176, "grad_norm": 0.23130254447460175, "learning_rate": 1.909516493711702e-05, "loss": 0.3757, "step": 81690 }, { "epoch": 2.9442822647493423, "grad_norm": 0.24160918593406677, "learning_rate": 1.909232940918252e-05, "loss": 0.4112, "step": 81695 }, { "epoch": 2.944462464410567, "grad_norm": 0.19820939004421234, "learning_rate": 1.9089493961738896e-05, "loss": 0.3657, "step": 81700 }, { "epoch": 2.9446426640717913, "grad_norm": 0.21389785408973694, "learning_rate": 1.9086658594824774e-05, "loss": 0.3894, "step": 81705 }, { "epoch": 2.944822863733016, "grad_norm": 0.24125300347805023, "learning_rate": 1.9083823308478776e-05, "loss": 0.4037, "step": 81710 }, { "epoch": 2.945003063394241, "grad_norm": 0.23977577686309814, "learning_rate": 1.9080988102739543e-05, "loss": 0.4222, "step": 81715 }, { "epoch": 2.9451832630554655, "grad_norm": 0.2184176743030548, "learning_rate": 1.90781529776457e-05, "loss": 0.3853, "step": 81720 }, { "epoch": 2.9453634627166903, "grad_norm": 0.21365883946418762, "learning_rate": 1.9075884935661265e-05, "loss": 0.3671, "step": 81725 }, { "epoch": 2.945543662377915, "grad_norm": 0.2217915952205658, "learning_rate": 1.9073049955826478e-05, "loss": 0.3974, "step": 81730 }, { "epoch": 2.9457238620391393, "grad_norm": 0.2563817799091339, "learning_rate": 1.9070215056745234e-05, "loss": 0.4093, "step": 81735 }, { "epoch": 2.945904061700364, "grad_norm": 0.21083958446979523, "learning_rate": 1.906738023845615e-05, "loss": 0.3654, "step": 81740 }, { "epoch": 2.9460842613615887, "grad_norm": 0.2113693654537201, "learning_rate": 1.906454550099787e-05, "loss": 0.3854, "step": 81745 }, { "epoch": 2.946264461022813, "grad_norm": 0.2557770311832428, "learning_rate": 1.9061710844409007e-05, "loss": 0.3904, "step": 81750 }, { "epoch": 
2.9464446606840378, "grad_norm": 0.23025763034820557, "learning_rate": 1.9058876268728188e-05, "loss": 0.3876, "step": 81755 }, { "epoch": 2.9466248603452625, "grad_norm": 0.28059622645378113, "learning_rate": 1.9056041773994025e-05, "loss": 0.4097, "step": 81760 }, { "epoch": 2.9468050600064872, "grad_norm": 0.19796496629714966, "learning_rate": 1.905320736024514e-05, "loss": 0.4118, "step": 81765 }, { "epoch": 2.946985259667712, "grad_norm": 0.19060267508029938, "learning_rate": 1.9050373027520157e-05, "loss": 0.3784, "step": 81770 }, { "epoch": 2.9471654593289367, "grad_norm": 0.1780025213956833, "learning_rate": 1.9047538775857694e-05, "loss": 0.384, "step": 81775 }, { "epoch": 2.947345658990161, "grad_norm": 0.20874053239822388, "learning_rate": 1.9044704605296353e-05, "loss": 0.3901, "step": 81780 }, { "epoch": 2.9475258586513857, "grad_norm": 0.18418976664543152, "learning_rate": 1.9041870515874767e-05, "loss": 0.365, "step": 81785 }, { "epoch": 2.9477060583126105, "grad_norm": 0.2519192695617676, "learning_rate": 1.903903650763153e-05, "loss": 0.3867, "step": 81790 }, { "epoch": 2.9478862579738347, "grad_norm": 0.20696429908275604, "learning_rate": 1.903620258060528e-05, "loss": 0.3652, "step": 81795 }, { "epoch": 2.9480664576350595, "grad_norm": 0.25634706020355225, "learning_rate": 1.903336873483461e-05, "loss": 0.366, "step": 81800 }, { "epoch": 2.948246657296284, "grad_norm": 0.17828556895256042, "learning_rate": 1.9030534970358135e-05, "loss": 0.438, "step": 81805 }, { "epoch": 2.948426856957509, "grad_norm": 0.21666783094406128, "learning_rate": 1.9027701287214466e-05, "loss": 0.4145, "step": 81810 }, { "epoch": 2.9486070566187337, "grad_norm": 0.22079980373382568, "learning_rate": 1.902486768544222e-05, "loss": 0.372, "step": 81815 }, { "epoch": 2.9487872562799584, "grad_norm": 0.22830483317375183, "learning_rate": 1.9022034165079977e-05, "loss": 0.3996, "step": 81820 }, { "epoch": 2.9489674559411827, "grad_norm": 0.20484139025211334, 
"learning_rate": 1.901920072616638e-05, "loss": 0.3791, "step": 81825 }, { "epoch": 2.9491476556024074, "grad_norm": 0.2294071912765503, "learning_rate": 1.9016367368739994e-05, "loss": 0.4122, "step": 81830 }, { "epoch": 2.949327855263632, "grad_norm": 0.20010051131248474, "learning_rate": 1.9013534092839463e-05, "loss": 0.3743, "step": 81835 }, { "epoch": 2.949508054924857, "grad_norm": 0.2721427381038666, "learning_rate": 1.9010700898503362e-05, "loss": 0.4054, "step": 81840 }, { "epoch": 2.949688254586081, "grad_norm": 0.21804888546466827, "learning_rate": 1.90078677857703e-05, "loss": 0.4004, "step": 81845 }, { "epoch": 2.949868454247306, "grad_norm": 0.20582912862300873, "learning_rate": 1.9005034754678887e-05, "loss": 0.3752, "step": 81850 }, { "epoch": 2.9500486539085307, "grad_norm": 0.22233439981937408, "learning_rate": 1.900220180526771e-05, "loss": 0.3631, "step": 81855 }, { "epoch": 2.9502288535697554, "grad_norm": 0.20850253105163574, "learning_rate": 1.8999368937575367e-05, "loss": 0.3808, "step": 81860 }, { "epoch": 2.95040905323098, "grad_norm": 0.23026061058044434, "learning_rate": 1.8996536151640472e-05, "loss": 0.4305, "step": 81865 }, { "epoch": 2.9505892528922044, "grad_norm": 0.2322373390197754, "learning_rate": 1.89937034475016e-05, "loss": 0.4094, "step": 81870 }, { "epoch": 2.950769452553429, "grad_norm": 0.2333720475435257, "learning_rate": 1.899087082519736e-05, "loss": 0.4051, "step": 81875 }, { "epoch": 2.950949652214654, "grad_norm": 0.2258276641368866, "learning_rate": 1.898803828476634e-05, "loss": 0.4222, "step": 81880 }, { "epoch": 2.9511298518758786, "grad_norm": 0.21611566841602325, "learning_rate": 1.898520582624713e-05, "loss": 0.4085, "step": 81885 }, { "epoch": 2.951310051537103, "grad_norm": 0.21131113171577454, "learning_rate": 1.8982373449678336e-05, "loss": 0.413, "step": 81890 }, { "epoch": 2.9514902511983276, "grad_norm": 0.21428248286247253, "learning_rate": 1.8979541155098528e-05, "loss": 0.3984, "step": 81895 }, { 
"epoch": 2.9516704508595524, "grad_norm": 0.1810792088508606, "learning_rate": 1.8976708942546316e-05, "loss": 0.4067, "step": 81900 }, { "epoch": 2.951850650520777, "grad_norm": 0.20499011874198914, "learning_rate": 1.897387681206028e-05, "loss": 0.3654, "step": 81905 }, { "epoch": 2.952030850182002, "grad_norm": 0.2227361649274826, "learning_rate": 1.8971044763678995e-05, "loss": 0.3786, "step": 81910 }, { "epoch": 2.952211049843226, "grad_norm": 0.19713594019412994, "learning_rate": 1.8968212797441064e-05, "loss": 0.3725, "step": 81915 }, { "epoch": 2.952391249504451, "grad_norm": 0.250509113073349, "learning_rate": 1.896538091338507e-05, "loss": 0.3395, "step": 81920 }, { "epoch": 2.9525714491656756, "grad_norm": 0.1757110208272934, "learning_rate": 1.8962549111549578e-05, "loss": 0.4056, "step": 81925 }, { "epoch": 2.9527516488269003, "grad_norm": 0.23518085479736328, "learning_rate": 1.8959717391973204e-05, "loss": 0.381, "step": 81930 }, { "epoch": 2.9529318484881246, "grad_norm": 0.2347138673067093, "learning_rate": 1.8956885754694495e-05, "loss": 0.4076, "step": 81935 }, { "epoch": 2.9531120481493494, "grad_norm": 0.19113247096538544, "learning_rate": 1.8954054199752063e-05, "loss": 0.3994, "step": 81940 }, { "epoch": 2.953292247810574, "grad_norm": 0.1934749335050583, "learning_rate": 1.8951222727184466e-05, "loss": 0.381, "step": 81945 }, { "epoch": 2.953472447471799, "grad_norm": 0.21083234250545502, "learning_rate": 1.894839133703028e-05, "loss": 0.3682, "step": 81950 }, { "epoch": 2.9536526471330236, "grad_norm": 0.22241094708442688, "learning_rate": 1.89455600293281e-05, "loss": 0.3661, "step": 81955 }, { "epoch": 2.9538328467942483, "grad_norm": 0.17992718517780304, "learning_rate": 1.894272880411649e-05, "loss": 0.4289, "step": 81960 }, { "epoch": 2.9540130464554726, "grad_norm": 0.20856983959674835, "learning_rate": 1.8939897661434022e-05, "loss": 0.412, "step": 81965 }, { "epoch": 2.9541932461166973, "grad_norm": 0.21314571797847748, 
"learning_rate": 1.893706660131928e-05, "loss": 0.3842, "step": 81970 }, { "epoch": 2.954373445777922, "grad_norm": 0.19296015799045563, "learning_rate": 1.8934235623810833e-05, "loss": 0.3784, "step": 81975 }, { "epoch": 2.9545536454391463, "grad_norm": 0.24017472565174103, "learning_rate": 1.8931404728947248e-05, "loss": 0.4024, "step": 81980 }, { "epoch": 2.954733845100371, "grad_norm": 0.27945849299430847, "learning_rate": 1.89285739167671e-05, "loss": 0.4157, "step": 81985 }, { "epoch": 2.954914044761596, "grad_norm": 0.23929205536842346, "learning_rate": 1.8925743187308954e-05, "loss": 0.385, "step": 81990 }, { "epoch": 2.9550942444228205, "grad_norm": 0.20242777466773987, "learning_rate": 1.8922912540611387e-05, "loss": 0.4123, "step": 81995 }, { "epoch": 2.9552744440840453, "grad_norm": 0.2415585070848465, "learning_rate": 1.8920081976712963e-05, "loss": 0.3732, "step": 82000 }, { "epoch": 2.9552744440840453, "eval_loss": 0.4305243194103241, "eval_runtime": 3.5265, "eval_samples_per_second": 28.356, "eval_steps_per_second": 7.089, "step": 82000 }, { "epoch": 2.95545464374527, "grad_norm": 0.16947853565216064, "learning_rate": 1.8917251495652234e-05, "loss": 0.38, "step": 82005 }, { "epoch": 2.9556348434064943, "grad_norm": 0.24770832061767578, "learning_rate": 1.8914421097467787e-05, "loss": 0.4288, "step": 82010 }, { "epoch": 2.955815043067719, "grad_norm": 0.21834175288677216, "learning_rate": 1.891159078219816e-05, "loss": 0.3732, "step": 82015 }, { "epoch": 2.9559952427289438, "grad_norm": 0.1806267648935318, "learning_rate": 1.8908760549881943e-05, "loss": 0.3953, "step": 82020 }, { "epoch": 2.956175442390168, "grad_norm": 0.26217013597488403, "learning_rate": 1.890593040055768e-05, "loss": 0.4025, "step": 82025 }, { "epoch": 2.956355642051393, "grad_norm": 0.15443582832813263, "learning_rate": 1.8903100334263933e-05, "loss": 0.3855, "step": 82030 }, { "epoch": 2.9565358417126175, "grad_norm": 0.21527697145938873, "learning_rate": 
1.8900270351039263e-05, "loss": 0.3965, "step": 82035 }, { "epoch": 2.9567160413738423, "grad_norm": 0.2215004861354828, "learning_rate": 1.8897440450922234e-05, "loss": 0.3823, "step": 82040 }, { "epoch": 2.956896241035067, "grad_norm": 0.2365996241569519, "learning_rate": 1.8894610633951392e-05, "loss": 0.3848, "step": 82045 }, { "epoch": 2.9570764406962917, "grad_norm": 0.2466432750225067, "learning_rate": 1.88917809001653e-05, "loss": 0.4159, "step": 82050 }, { "epoch": 2.957256640357516, "grad_norm": 0.20089565217494965, "learning_rate": 1.888895124960251e-05, "loss": 0.3936, "step": 82055 }, { "epoch": 2.9574368400187407, "grad_norm": 0.23963125050067902, "learning_rate": 1.8886121682301576e-05, "loss": 0.3979, "step": 82060 }, { "epoch": 2.9576170396799655, "grad_norm": 0.2524862289428711, "learning_rate": 1.8883292198301056e-05, "loss": 0.3805, "step": 82065 }, { "epoch": 2.9577972393411898, "grad_norm": 0.18411079049110413, "learning_rate": 1.8880462797639487e-05, "loss": 0.3945, "step": 82070 }, { "epoch": 2.9579774390024145, "grad_norm": 0.20663435757160187, "learning_rate": 1.8877633480355434e-05, "loss": 0.3686, "step": 82075 }, { "epoch": 2.9581576386636392, "grad_norm": 0.20612770318984985, "learning_rate": 1.8874804246487437e-05, "loss": 0.4012, "step": 82080 }, { "epoch": 2.958337838324864, "grad_norm": 0.18578004837036133, "learning_rate": 1.887197509607404e-05, "loss": 0.3866, "step": 82085 }, { "epoch": 2.9585180379860887, "grad_norm": 0.19520944356918335, "learning_rate": 1.8869146029153805e-05, "loss": 0.4054, "step": 82090 }, { "epoch": 2.9586982376473134, "grad_norm": 0.2129872888326645, "learning_rate": 1.8866317045765264e-05, "loss": 0.3716, "step": 82095 }, { "epoch": 2.9588784373085377, "grad_norm": 0.2196713089942932, "learning_rate": 1.8863488145946965e-05, "loss": 0.3829, "step": 82100 }, { "epoch": 2.9590586369697625, "grad_norm": 0.21013066172599792, "learning_rate": 1.8860659329737467e-05, "loss": 0.403, "step": 82105 }, { "epoch": 
2.959238836630987, "grad_norm": 0.19054462015628815, "learning_rate": 1.8857830597175273e-05, "loss": 0.3886, "step": 82110 }, { "epoch": 2.959419036292212, "grad_norm": 0.2173452377319336, "learning_rate": 1.8855001948298966e-05, "loss": 0.3946, "step": 82115 }, { "epoch": 2.959599235953436, "grad_norm": 0.22703658044338226, "learning_rate": 1.885217338314705e-05, "loss": 0.3912, "step": 82120 }, { "epoch": 2.959779435614661, "grad_norm": 0.1947726011276245, "learning_rate": 1.8849344901758102e-05, "loss": 0.3932, "step": 82125 }, { "epoch": 2.9599596352758857, "grad_norm": 0.23506419360637665, "learning_rate": 1.884651650417063e-05, "loss": 0.3539, "step": 82130 }, { "epoch": 2.9601398349371104, "grad_norm": 0.23253336548805237, "learning_rate": 1.8843688190423175e-05, "loss": 0.3832, "step": 82135 }, { "epoch": 2.960320034598335, "grad_norm": 0.19187670946121216, "learning_rate": 1.884085996055428e-05, "loss": 0.3612, "step": 82140 }, { "epoch": 2.9605002342595594, "grad_norm": 0.2828364670276642, "learning_rate": 1.883803181460248e-05, "loss": 0.3848, "step": 82145 }, { "epoch": 2.960680433920784, "grad_norm": 0.19735077023506165, "learning_rate": 1.8835203752606294e-05, "loss": 0.3615, "step": 82150 }, { "epoch": 2.960860633582009, "grad_norm": 0.2174484133720398, "learning_rate": 1.8832375774604272e-05, "loss": 0.3571, "step": 82155 }, { "epoch": 2.9610408332432336, "grad_norm": 0.1863182634115219, "learning_rate": 1.882954788063493e-05, "loss": 0.3636, "step": 82160 }, { "epoch": 2.961221032904458, "grad_norm": 0.1913149356842041, "learning_rate": 1.8826720070736804e-05, "loss": 0.3881, "step": 82165 }, { "epoch": 2.9614012325656827, "grad_norm": 0.21392560005187988, "learning_rate": 1.8823892344948428e-05, "loss": 0.3875, "step": 82170 }, { "epoch": 2.9615814322269074, "grad_norm": 0.18154218792915344, "learning_rate": 1.8821064703308315e-05, "loss": 0.4115, "step": 82175 }, { "epoch": 2.961761631888132, "grad_norm": 0.2097567617893219, "learning_rate": 
1.8818237145855004e-05, "loss": 0.3887, "step": 82180 }, { "epoch": 2.961941831549357, "grad_norm": 0.218689426779747, "learning_rate": 1.881540967262702e-05, "loss": 0.3997, "step": 82185 }, { "epoch": 2.9621220312105816, "grad_norm": 0.19388610124588013, "learning_rate": 1.8812582283662865e-05, "loss": 0.3715, "step": 82190 }, { "epoch": 2.962302230871806, "grad_norm": 0.19577103853225708, "learning_rate": 1.88097549790011e-05, "loss": 0.3694, "step": 82195 }, { "epoch": 2.9624824305330306, "grad_norm": 0.21979494392871857, "learning_rate": 1.8806927758680203e-05, "loss": 0.4049, "step": 82200 }, { "epoch": 2.9626626301942554, "grad_norm": 0.19626237452030182, "learning_rate": 1.880410062273873e-05, "loss": 0.3738, "step": 82205 }, { "epoch": 2.9628428298554796, "grad_norm": 0.21775542199611664, "learning_rate": 1.8801273571215193e-05, "loss": 0.3772, "step": 82210 }, { "epoch": 2.9630230295167044, "grad_norm": 0.16890302300453186, "learning_rate": 1.8798446604148085e-05, "loss": 0.3957, "step": 82215 }, { "epoch": 2.963203229177929, "grad_norm": 0.2454291582107544, "learning_rate": 1.879561972157596e-05, "loss": 0.367, "step": 82220 }, { "epoch": 2.963383428839154, "grad_norm": 0.21003590524196625, "learning_rate": 1.8792792923537304e-05, "loss": 0.3988, "step": 82225 }, { "epoch": 2.9635636285003786, "grad_norm": 0.21214927732944489, "learning_rate": 1.878996621007064e-05, "loss": 0.3633, "step": 82230 }, { "epoch": 2.9637438281616033, "grad_norm": 0.24998506903648376, "learning_rate": 1.878713958121449e-05, "loss": 0.3908, "step": 82235 }, { "epoch": 2.9639240278228276, "grad_norm": 0.21911552548408508, "learning_rate": 1.8784313037007355e-05, "loss": 0.3708, "step": 82240 }, { "epoch": 2.9641042274840523, "grad_norm": 0.24015755951404572, "learning_rate": 1.8781486577487763e-05, "loss": 0.43, "step": 82245 }, { "epoch": 2.964284427145277, "grad_norm": 0.17950226366519928, "learning_rate": 1.8778660202694203e-05, "loss": 0.3546, "step": 82250 }, { "epoch": 
2.9644646268065014, "grad_norm": 0.14863833785057068, "learning_rate": 1.877583391266519e-05, "loss": 0.3852, "step": 82255 }, { "epoch": 2.964644826467726, "grad_norm": 0.207986518740654, "learning_rate": 1.8773007707439245e-05, "loss": 0.3883, "step": 82260 }, { "epoch": 2.964825026128951, "grad_norm": 0.1966986507177353, "learning_rate": 1.877018158705486e-05, "loss": 0.3868, "step": 82265 }, { "epoch": 2.9650052257901756, "grad_norm": 0.19852174818515778, "learning_rate": 1.876735555155054e-05, "loss": 0.429, "step": 82270 }, { "epoch": 2.9651854254514003, "grad_norm": 0.2330932915210724, "learning_rate": 1.87645296009648e-05, "loss": 0.4144, "step": 82275 }, { "epoch": 2.965365625112625, "grad_norm": 0.2199668288230896, "learning_rate": 1.8761703735336134e-05, "loss": 0.4, "step": 82280 }, { "epoch": 2.9655458247738493, "grad_norm": 0.24273623526096344, "learning_rate": 1.8758877954703047e-05, "loss": 0.3756, "step": 82285 }, { "epoch": 2.965726024435074, "grad_norm": 0.2339378446340561, "learning_rate": 1.8756052259104048e-05, "loss": 0.4029, "step": 82290 }, { "epoch": 2.965906224096299, "grad_norm": 0.1760188341140747, "learning_rate": 1.8753226648577613e-05, "loss": 0.3748, "step": 82295 }, { "epoch": 2.966086423757523, "grad_norm": 0.24585866928100586, "learning_rate": 1.875040112316227e-05, "loss": 0.4141, "step": 82300 }, { "epoch": 2.966266623418748, "grad_norm": 0.2293357104063034, "learning_rate": 1.8747575682896483e-05, "loss": 0.3367, "step": 82305 }, { "epoch": 2.9664468230799725, "grad_norm": 0.19390980899333954, "learning_rate": 1.874475032781879e-05, "loss": 0.3877, "step": 82310 }, { "epoch": 2.9666270227411973, "grad_norm": 0.20741982758045197, "learning_rate": 1.874192505796765e-05, "loss": 0.4016, "step": 82315 }, { "epoch": 2.966807222402422, "grad_norm": 0.21530795097351074, "learning_rate": 1.873909987338157e-05, "loss": 0.3675, "step": 82320 }, { "epoch": 2.9669874220636467, "grad_norm": 0.2282307744026184, "learning_rate": 
1.8736274774099044e-05, "loss": 0.3606, "step": 82325 }, { "epoch": 2.967167621724871, "grad_norm": 0.22964274883270264, "learning_rate": 1.873344976015856e-05, "loss": 0.3933, "step": 82330 }, { "epoch": 2.9673478213860958, "grad_norm": 0.2597173750400543, "learning_rate": 1.873062483159861e-05, "loss": 0.3803, "step": 82335 }, { "epoch": 2.9675280210473205, "grad_norm": 0.19982276856899261, "learning_rate": 1.8727799988457682e-05, "loss": 0.4066, "step": 82340 }, { "epoch": 2.9677082207085452, "grad_norm": 0.19702062010765076, "learning_rate": 1.8724975230774265e-05, "loss": 0.3729, "step": 82345 }, { "epoch": 2.9678884203697695, "grad_norm": 0.19733615219593048, "learning_rate": 1.872215055858685e-05, "loss": 0.4182, "step": 82350 }, { "epoch": 2.9680686200309943, "grad_norm": 0.21504078805446625, "learning_rate": 1.8719325971933912e-05, "loss": 0.3605, "step": 82355 }, { "epoch": 2.968248819692219, "grad_norm": 0.19036151468753815, "learning_rate": 1.8716501470853942e-05, "loss": 0.4204, "step": 82360 }, { "epoch": 2.9684290193534437, "grad_norm": 0.2703343331813812, "learning_rate": 1.8713677055385425e-05, "loss": 0.3874, "step": 82365 }, { "epoch": 2.9686092190146685, "grad_norm": 0.20502404868602753, "learning_rate": 1.8710852725566847e-05, "loss": 0.3794, "step": 82370 }, { "epoch": 2.9687894186758927, "grad_norm": 0.2101486623287201, "learning_rate": 1.8708028481436672e-05, "loss": 0.4003, "step": 82375 }, { "epoch": 2.9689696183371175, "grad_norm": 0.19613702595233917, "learning_rate": 1.8705204323033398e-05, "loss": 0.3774, "step": 82380 }, { "epoch": 2.969149817998342, "grad_norm": 0.22090743482112885, "learning_rate": 1.870238025039549e-05, "loss": 0.4098, "step": 82385 }, { "epoch": 2.969330017659567, "grad_norm": 0.2624460756778717, "learning_rate": 1.869955626356144e-05, "loss": 0.3689, "step": 82390 }, { "epoch": 2.9695102173207912, "grad_norm": 0.2443309724330902, "learning_rate": 1.869673236256972e-05, "loss": 0.4143, "step": 82395 }, { "epoch": 
2.969690416982016, "grad_norm": 0.2217528223991394, "learning_rate": 1.8693908547458782e-05, "loss": 0.386, "step": 82400 }, { "epoch": 2.9698706166432407, "grad_norm": 0.19909748435020447, "learning_rate": 1.869108481826714e-05, "loss": 0.4024, "step": 82405 }, { "epoch": 2.9700508163044654, "grad_norm": 0.23337341845035553, "learning_rate": 1.8688261175033238e-05, "loss": 0.4011, "step": 82410 }, { "epoch": 2.97023101596569, "grad_norm": 0.20584703981876373, "learning_rate": 1.868543761779555e-05, "loss": 0.4166, "step": 82415 }, { "epoch": 2.9704112156269145, "grad_norm": 0.17910712957382202, "learning_rate": 1.8682614146592558e-05, "loss": 0.3962, "step": 82420 }, { "epoch": 2.970591415288139, "grad_norm": 0.16632866859436035, "learning_rate": 1.8679790761462717e-05, "loss": 0.388, "step": 82425 }, { "epoch": 2.970771614949364, "grad_norm": 0.24808615446090698, "learning_rate": 1.8676967462444513e-05, "loss": 0.3801, "step": 82430 }, { "epoch": 2.9709518146105887, "grad_norm": 0.2281518578529358, "learning_rate": 1.8674144249576403e-05, "loss": 0.363, "step": 82435 }, { "epoch": 2.971132014271813, "grad_norm": 0.14790934324264526, "learning_rate": 1.8671321122896847e-05, "loss": 0.3396, "step": 82440 }, { "epoch": 2.9713122139330377, "grad_norm": 0.2927576005458832, "learning_rate": 1.866849808244432e-05, "loss": 0.3867, "step": 82445 }, { "epoch": 2.9714924135942624, "grad_norm": 0.24208620190620422, "learning_rate": 1.866567512825728e-05, "loss": 0.4101, "step": 82450 }, { "epoch": 2.971672613255487, "grad_norm": 0.19803206622600555, "learning_rate": 1.8662852260374183e-05, "loss": 0.3986, "step": 82455 }, { "epoch": 2.971852812916712, "grad_norm": 0.22639377415180206, "learning_rate": 1.866002947883351e-05, "loss": 0.4211, "step": 82460 }, { "epoch": 2.9720330125779366, "grad_norm": 0.20173002779483795, "learning_rate": 1.865720678367369e-05, "loss": 0.3984, "step": 82465 }, { "epoch": 2.972213212239161, "grad_norm": 0.28004732728004456, "learning_rate": 
1.8654384174933214e-05, "loss": 0.4313, "step": 82470 }, { "epoch": 2.9723934119003856, "grad_norm": 0.19623763859272003, "learning_rate": 1.865156165265053e-05, "loss": 0.408, "step": 82475 }, { "epoch": 2.9725736115616104, "grad_norm": 0.17043015360832214, "learning_rate": 1.864873921686407e-05, "loss": 0.3979, "step": 82480 }, { "epoch": 2.9727538112228347, "grad_norm": 0.2070167362689972, "learning_rate": 1.864591686761233e-05, "loss": 0.3852, "step": 82485 }, { "epoch": 2.9729340108840594, "grad_norm": 0.21379338204860687, "learning_rate": 1.864309460493372e-05, "loss": 0.4143, "step": 82490 }, { "epoch": 2.973114210545284, "grad_norm": 0.2483922243118286, "learning_rate": 1.864027242886673e-05, "loss": 0.4423, "step": 82495 }, { "epoch": 2.973294410206509, "grad_norm": 0.2155703455209732, "learning_rate": 1.86374503394498e-05, "loss": 0.4044, "step": 82500 }, { "epoch": 2.973294410206509, "eval_loss": 0.4298676550388336, "eval_runtime": 3.5326, "eval_samples_per_second": 28.308, "eval_steps_per_second": 7.077, "step": 82500 }, { "epoch": 2.9734746098677336, "grad_norm": 0.24112021923065186, "learning_rate": 1.863462833672136e-05, "loss": 0.3763, "step": 82505 }, { "epoch": 2.9736548095289583, "grad_norm": 0.19381298124790192, "learning_rate": 1.8631806420719896e-05, "loss": 0.3785, "step": 82510 }, { "epoch": 2.9738350091901826, "grad_norm": 0.21174710988998413, "learning_rate": 1.862898459148383e-05, "loss": 0.3688, "step": 82515 }, { "epoch": 2.9740152088514074, "grad_norm": 0.18652644753456116, "learning_rate": 1.862616284905161e-05, "loss": 0.3974, "step": 82520 }, { "epoch": 2.974195408512632, "grad_norm": 0.24465037882328033, "learning_rate": 1.8623341193461696e-05, "loss": 0.4212, "step": 82525 }, { "epoch": 2.9743756081738564, "grad_norm": 0.19063496589660645, "learning_rate": 1.8620519624752512e-05, "loss": 0.3724, "step": 82530 }, { "epoch": 2.974555807835081, "grad_norm": 0.21079134941101074, "learning_rate": 1.8617698142962523e-05, "loss": 0.401, 
"step": 82535 }, { "epoch": 2.974736007496306, "grad_norm": 0.22930555045604706, "learning_rate": 1.861487674813016e-05, "loss": 0.3471, "step": 82540 }, { "epoch": 2.9749162071575306, "grad_norm": 0.22107750177383423, "learning_rate": 1.8612055440293856e-05, "loss": 0.3809, "step": 82545 }, { "epoch": 2.9750964068187553, "grad_norm": 0.21329925954341888, "learning_rate": 1.8609234219492072e-05, "loss": 0.3947, "step": 82550 }, { "epoch": 2.97527660647998, "grad_norm": 0.1962546557188034, "learning_rate": 1.860641308576323e-05, "loss": 0.3516, "step": 82555 }, { "epoch": 2.9754568061412043, "grad_norm": 0.19978949427604675, "learning_rate": 1.8603592039145767e-05, "loss": 0.3896, "step": 82560 }, { "epoch": 2.975637005802429, "grad_norm": 0.23347966372966766, "learning_rate": 1.860077107967813e-05, "loss": 0.3658, "step": 82565 }, { "epoch": 2.975817205463654, "grad_norm": 0.19242475926876068, "learning_rate": 1.8597950207398745e-05, "loss": 0.4014, "step": 82570 }, { "epoch": 2.975997405124878, "grad_norm": 0.20746643841266632, "learning_rate": 1.8595129422346054e-05, "loss": 0.3827, "step": 82575 }, { "epoch": 2.976177604786103, "grad_norm": 0.20090548694133759, "learning_rate": 1.8592308724558485e-05, "loss": 0.3629, "step": 82580 }, { "epoch": 2.9763578044473276, "grad_norm": 0.17902597784996033, "learning_rate": 1.8589488114074456e-05, "loss": 0.3842, "step": 82585 }, { "epoch": 2.9765380041085523, "grad_norm": 0.2532462477684021, "learning_rate": 1.8586667590932426e-05, "loss": 0.4095, "step": 82590 }, { "epoch": 2.976718203769777, "grad_norm": 0.26254743337631226, "learning_rate": 1.8583847155170803e-05, "loss": 0.3585, "step": 82595 }, { "epoch": 2.9768984034310018, "grad_norm": 0.21005457639694214, "learning_rate": 1.858102680682802e-05, "loss": 0.3766, "step": 82600 }, { "epoch": 2.977078603092226, "grad_norm": 0.23283547163009644, "learning_rate": 1.85782065459425e-05, "loss": 0.3884, "step": 82605 }, { "epoch": 2.977258802753451, "grad_norm": 
0.21179614961147308, "learning_rate": 1.8575386372552673e-05, "loss": 0.3647, "step": 82610 }, { "epoch": 2.9774390024146755, "grad_norm": 0.1830110400915146, "learning_rate": 1.857256628669697e-05, "loss": 0.3991, "step": 82615 }, { "epoch": 2.9776192020759003, "grad_norm": 0.24650508165359497, "learning_rate": 1.8569746288413802e-05, "loss": 0.363, "step": 82620 }, { "epoch": 2.9777994017371245, "grad_norm": 0.23117853701114655, "learning_rate": 1.856692637774159e-05, "loss": 0.3747, "step": 82625 }, { "epoch": 2.9779796013983493, "grad_norm": 0.18374405801296234, "learning_rate": 1.8564106554718767e-05, "loss": 0.3854, "step": 82630 }, { "epoch": 2.978159801059574, "grad_norm": 0.1933712363243103, "learning_rate": 1.856128681938375e-05, "loss": 0.3719, "step": 82635 }, { "epoch": 2.9783400007207987, "grad_norm": 0.24124352633953094, "learning_rate": 1.8558467171774946e-05, "loss": 0.4093, "step": 82640 }, { "epoch": 2.9785202003820235, "grad_norm": 0.22895336151123047, "learning_rate": 1.8555647611930788e-05, "loss": 0.3979, "step": 82645 }, { "epoch": 2.9787004000432478, "grad_norm": 0.240083709359169, "learning_rate": 1.8552828139889673e-05, "loss": 0.403, "step": 82650 }, { "epoch": 2.9788805997044725, "grad_norm": 0.18165172636508942, "learning_rate": 1.8550008755690034e-05, "loss": 0.3757, "step": 82655 }, { "epoch": 2.9790607993656972, "grad_norm": 0.2198115587234497, "learning_rate": 1.8547189459370275e-05, "loss": 0.374, "step": 82660 }, { "epoch": 2.979240999026922, "grad_norm": 0.19602136313915253, "learning_rate": 1.8544370250968808e-05, "loss": 0.3852, "step": 82665 }, { "epoch": 2.9794211986881463, "grad_norm": 0.24065439403057098, "learning_rate": 1.854155113052405e-05, "loss": 0.3971, "step": 82670 }, { "epoch": 2.979601398349371, "grad_norm": 0.20084550976753235, "learning_rate": 1.853873209807441e-05, "loss": 0.3652, "step": 82675 }, { "epoch": 2.9797815980105957, "grad_norm": 0.18546749651432037, "learning_rate": 1.8535913153658284e-05, "loss": 
0.3908, "step": 82680 }, { "epoch": 2.9799617976718205, "grad_norm": 0.19629612565040588, "learning_rate": 1.8533094297314108e-05, "loss": 0.3711, "step": 82685 }, { "epoch": 2.980141997333045, "grad_norm": 0.19778327643871307, "learning_rate": 1.8530275529080248e-05, "loss": 0.4318, "step": 82690 }, { "epoch": 2.98032219699427, "grad_norm": 0.19688297808170319, "learning_rate": 1.8527456848995148e-05, "loss": 0.3831, "step": 82695 }, { "epoch": 2.980502396655494, "grad_norm": 0.22742833197116852, "learning_rate": 1.852463825709719e-05, "loss": 0.4061, "step": 82700 }, { "epoch": 2.980682596316719, "grad_norm": 0.20782898366451263, "learning_rate": 1.852181975342478e-05, "loss": 0.3998, "step": 82705 }, { "epoch": 2.9808627959779437, "grad_norm": 0.20285415649414062, "learning_rate": 1.8519001338016324e-05, "loss": 0.3738, "step": 82710 }, { "epoch": 2.981042995639168, "grad_norm": 0.26227110624313354, "learning_rate": 1.8516183010910216e-05, "loss": 0.3849, "step": 82715 }, { "epoch": 2.9812231953003927, "grad_norm": 0.20241287350654602, "learning_rate": 1.8513364772144863e-05, "loss": 0.3869, "step": 82720 }, { "epoch": 2.9814033949616174, "grad_norm": 0.25886163115501404, "learning_rate": 1.851054662175866e-05, "loss": 0.43, "step": 82725 }, { "epoch": 2.981583594622842, "grad_norm": 0.20351019501686096, "learning_rate": 1.8507728559789996e-05, "loss": 0.3872, "step": 82730 }, { "epoch": 2.981763794284067, "grad_norm": 0.27192798256874084, "learning_rate": 1.850491058627728e-05, "loss": 0.3813, "step": 82735 }, { "epoch": 2.9819439939452916, "grad_norm": 0.19435738027095795, "learning_rate": 1.85020927012589e-05, "loss": 0.3965, "step": 82740 }, { "epoch": 2.982124193606516, "grad_norm": 0.20769937336444855, "learning_rate": 1.8499274904773245e-05, "loss": 0.3737, "step": 82745 }, { "epoch": 2.9823043932677407, "grad_norm": 0.22676123678684235, "learning_rate": 1.8496457196858714e-05, "loss": 0.3807, "step": 82750 }, { "epoch": 2.9824845929289654, "grad_norm": 
0.18023423850536346, "learning_rate": 1.849363957755369e-05, "loss": 0.4336, "step": 82755 }, { "epoch": 2.9826647925901897, "grad_norm": 0.2146742194890976, "learning_rate": 1.849082204689657e-05, "loss": 0.3597, "step": 82760 }, { "epoch": 2.9828449922514144, "grad_norm": 0.24750038981437683, "learning_rate": 1.848800460492575e-05, "loss": 0.3814, "step": 82765 }, { "epoch": 2.983025191912639, "grad_norm": 0.20825496315956116, "learning_rate": 1.8485187251679586e-05, "loss": 0.4108, "step": 82770 }, { "epoch": 2.983205391573864, "grad_norm": 0.23861125111579895, "learning_rate": 1.8482369987196503e-05, "loss": 0.4002, "step": 82775 }, { "epoch": 2.9833855912350886, "grad_norm": 0.2297501564025879, "learning_rate": 1.847955281151486e-05, "loss": 0.4099, "step": 82780 }, { "epoch": 2.9835657908963134, "grad_norm": 0.2385878562927246, "learning_rate": 1.847673572467304e-05, "loss": 0.4512, "step": 82785 }, { "epoch": 2.9837459905575376, "grad_norm": 0.21177738904953003, "learning_rate": 1.8473918726709442e-05, "loss": 0.394, "step": 82790 }, { "epoch": 2.9839261902187624, "grad_norm": 0.17175810039043427, "learning_rate": 1.8471101817662428e-05, "loss": 0.4103, "step": 82795 }, { "epoch": 2.984106389879987, "grad_norm": 0.22006794810295105, "learning_rate": 1.8468284997570404e-05, "loss": 0.3797, "step": 82800 }, { "epoch": 2.9842865895412114, "grad_norm": 0.20512345433235168, "learning_rate": 1.8465468266471724e-05, "loss": 0.4287, "step": 82805 }, { "epoch": 2.984466789202436, "grad_norm": 0.24116021394729614, "learning_rate": 1.846265162440477e-05, "loss": 0.3939, "step": 82810 }, { "epoch": 2.984646988863661, "grad_norm": 0.21826866269111633, "learning_rate": 1.8459835071407928e-05, "loss": 0.3739, "step": 82815 }, { "epoch": 2.9848271885248856, "grad_norm": 0.20197410881519318, "learning_rate": 1.8457018607519567e-05, "loss": 0.4008, "step": 82820 }, { "epoch": 2.9850073881861103, "grad_norm": 0.21542875468730927, "learning_rate": 1.845420223277805e-05, "loss": 
0.3851, "step": 82825 }, { "epoch": 2.985187587847335, "grad_norm": 0.19995024800300598, "learning_rate": 1.8451385947221773e-05, "loss": 0.3678, "step": 82830 }, { "epoch": 2.9853677875085594, "grad_norm": 0.20630434155464172, "learning_rate": 1.8448569750889083e-05, "loss": 0.3832, "step": 82835 }, { "epoch": 2.985547987169784, "grad_norm": 0.20057667791843414, "learning_rate": 1.844575364381837e-05, "loss": 0.394, "step": 82840 }, { "epoch": 2.985728186831009, "grad_norm": 0.2470291405916214, "learning_rate": 1.8442937626047995e-05, "loss": 0.3905, "step": 82845 }, { "epoch": 2.9859083864922336, "grad_norm": 0.20389066636562347, "learning_rate": 1.844012169761632e-05, "loss": 0.4083, "step": 82850 }, { "epoch": 2.986088586153458, "grad_norm": 0.26393190026283264, "learning_rate": 1.843730585856172e-05, "loss": 0.4205, "step": 82855 }, { "epoch": 2.9862687858146826, "grad_norm": 0.2350768744945526, "learning_rate": 1.8434490108922566e-05, "loss": 0.3914, "step": 82860 }, { "epoch": 2.9864489854759073, "grad_norm": 0.20153537392616272, "learning_rate": 1.8431674448737195e-05, "loss": 0.3954, "step": 82865 }, { "epoch": 2.986629185137132, "grad_norm": 0.22236128151416779, "learning_rate": 1.8428858878044007e-05, "loss": 0.4122, "step": 82870 }, { "epoch": 2.986809384798357, "grad_norm": 0.18074600398540497, "learning_rate": 1.8426043396881326e-05, "loss": 0.3938, "step": 82875 }, { "epoch": 2.986989584459581, "grad_norm": 0.21660077571868896, "learning_rate": 1.8423228005287546e-05, "loss": 0.4038, "step": 82880 }, { "epoch": 2.987169784120806, "grad_norm": 0.23685570061206818, "learning_rate": 1.8420412703301004e-05, "loss": 0.3751, "step": 82885 }, { "epoch": 2.9873499837820305, "grad_norm": 0.20489926636219025, "learning_rate": 1.8417597490960066e-05, "loss": 0.4195, "step": 82890 }, { "epoch": 2.9875301834432553, "grad_norm": 0.23835612833499908, "learning_rate": 1.8414782368303088e-05, "loss": 0.3734, "step": 82895 }, { "epoch": 2.9877103831044796, 
"grad_norm": 0.24969185888767242, "learning_rate": 1.841196733536842e-05, "loss": 0.3929, "step": 82900 }, { "epoch": 2.9878905827657043, "grad_norm": 0.22482214868068695, "learning_rate": 1.8409152392194427e-05, "loss": 0.4001, "step": 82905 }, { "epoch": 2.988070782426929, "grad_norm": 0.19898438453674316, "learning_rate": 1.8406337538819462e-05, "loss": 0.3767, "step": 82910 }, { "epoch": 2.9882509820881538, "grad_norm": 0.2599036991596222, "learning_rate": 1.840352277528186e-05, "loss": 0.4169, "step": 82915 }, { "epoch": 2.9884311817493785, "grad_norm": 0.1831098049879074, "learning_rate": 1.840070810161999e-05, "loss": 0.3763, "step": 82920 }, { "epoch": 2.988611381410603, "grad_norm": 0.26548436284065247, "learning_rate": 1.8397893517872196e-05, "loss": 0.3815, "step": 82925 }, { "epoch": 2.9887915810718275, "grad_norm": 0.24817430973052979, "learning_rate": 1.839507902407682e-05, "loss": 0.4, "step": 82930 }, { "epoch": 2.9889717807330523, "grad_norm": null, "learning_rate": 1.839282749383203e-05, "loss": 0.4199, "step": 82935 }, { "epoch": 2.989151980394277, "grad_norm": 0.2185213267803192, "learning_rate": 1.839001316204766e-05, "loss": 0.3663, "step": 82940 }, { "epoch": 2.9893321800555013, "grad_norm": 0.24561047554016113, "learning_rate": 1.8387198920323063e-05, "loss": 0.4042, "step": 82945 }, { "epoch": 2.989512379716726, "grad_norm": 0.1906471997499466, "learning_rate": 1.8384384768696615e-05, "loss": 0.3998, "step": 82950 }, { "epoch": 2.9896925793779507, "grad_norm": 0.2308565229177475, "learning_rate": 1.8381570707206637e-05, "loss": 0.3625, "step": 82955 }, { "epoch": 2.9898727790391755, "grad_norm": 0.2136189490556717, "learning_rate": 1.837875673589147e-05, "loss": 0.3868, "step": 82960 }, { "epoch": 2.9900529787004, "grad_norm": 0.21687231957912445, "learning_rate": 1.8375942854789475e-05, "loss": 0.3923, "step": 82965 }, { "epoch": 2.990233178361625, "grad_norm": 0.24081559479236603, "learning_rate": 1.837312906393896e-05, "loss": 0.3877, 
"step": 82970 }, { "epoch": 2.9904133780228492, "grad_norm": 0.19234254956245422, "learning_rate": 1.8370315363378293e-05, "loss": 0.4074, "step": 82975 }, { "epoch": 2.990593577684074, "grad_norm": 0.22776223719120026, "learning_rate": 1.8367501753145792e-05, "loss": 0.4108, "step": 82980 }, { "epoch": 2.9907737773452987, "grad_norm": 0.23385420441627502, "learning_rate": 1.836468823327979e-05, "loss": 0.3966, "step": 82985 }, { "epoch": 2.990953977006523, "grad_norm": 0.25053873658180237, "learning_rate": 1.836187480381863e-05, "loss": 0.4033, "step": 82990 }, { "epoch": 2.9911341766677477, "grad_norm": 0.22640636563301086, "learning_rate": 1.835906146480064e-05, "loss": 0.3687, "step": 82995 }, { "epoch": 2.9913143763289725, "grad_norm": 0.2172614485025406, "learning_rate": 1.835624821626415e-05, "loss": 0.3921, "step": 83000 }, { "epoch": 2.9913143763289725, "eval_loss": 0.4294629693031311, "eval_runtime": 3.5292, "eval_samples_per_second": 28.335, "eval_steps_per_second": 7.084, "step": 83000 }, { "epoch": 2.991494575990197, "grad_norm": 0.17745208740234375, "learning_rate": 1.8353435058247496e-05, "loss": 0.3833, "step": 83005 }, { "epoch": 2.991674775651422, "grad_norm": 0.25056445598602295, "learning_rate": 1.8350621990788998e-05, "loss": 0.4054, "step": 83010 }, { "epoch": 2.9918549753126467, "grad_norm": 0.20053359866142273, "learning_rate": 1.834780901392699e-05, "loss": 0.4106, "step": 83015 }, { "epoch": 2.992035174973871, "grad_norm": 0.18675009906291962, "learning_rate": 1.83449961276998e-05, "loss": 0.3792, "step": 83020 }, { "epoch": 2.9922153746350957, "grad_norm": 0.22878988087177277, "learning_rate": 1.834218333214574e-05, "loss": 0.4279, "step": 83025 }, { "epoch": 2.9923955742963204, "grad_norm": 0.20670390129089355, "learning_rate": 1.833937062730315e-05, "loss": 0.3876, "step": 83030 }, { "epoch": 2.9925757739575447, "grad_norm": 0.18799588084220886, "learning_rate": 1.8336558013210355e-05, "loss": 0.3937, "step": 83035 }, { "epoch": 
2.9927559736187694, "grad_norm": 0.19332025945186615, "learning_rate": 1.8333745489905646e-05, "loss": 0.4095, "step": 83040 }, { "epoch": 2.992936173279994, "grad_norm": 0.22102457284927368, "learning_rate": 1.8330933057427378e-05, "loss": 0.3956, "step": 83045 }, { "epoch": 2.993116372941219, "grad_norm": 0.2692544758319855, "learning_rate": 1.8328120715813845e-05, "loss": 0.403, "step": 83050 }, { "epoch": 2.9932965726024436, "grad_norm": 0.20256653428077698, "learning_rate": 1.832530846510339e-05, "loss": 0.3552, "step": 83055 }, { "epoch": 2.9934767722636684, "grad_norm": 0.2464403659105301, "learning_rate": 1.8322496305334312e-05, "loss": 0.3758, "step": 83060 }, { "epoch": 2.9936569719248927, "grad_norm": 0.20932310819625854, "learning_rate": 1.831968423654492e-05, "loss": 0.3809, "step": 83065 }, { "epoch": 2.9938371715861174, "grad_norm": 0.19015516340732574, "learning_rate": 1.8316872258773543e-05, "loss": 0.4073, "step": 83070 }, { "epoch": 2.994017371247342, "grad_norm": 0.2418580800294876, "learning_rate": 1.831406037205849e-05, "loss": 0.432, "step": 83075 }, { "epoch": 2.9941975709085664, "grad_norm": 0.23939929902553558, "learning_rate": 1.831124857643806e-05, "loss": 0.4024, "step": 83080 }, { "epoch": 2.994377770569791, "grad_norm": 0.259090781211853, "learning_rate": 1.8308436871950584e-05, "loss": 0.3824, "step": 83085 }, { "epoch": 2.994557970231016, "grad_norm": 0.2107851803302765, "learning_rate": 1.8305625258634353e-05, "loss": 0.4087, "step": 83090 }, { "epoch": 2.9947381698922406, "grad_norm": 0.22065068781375885, "learning_rate": 1.8302813736527686e-05, "loss": 0.4059, "step": 83095 }, { "epoch": 2.9949183695534654, "grad_norm": 0.21330714225769043, "learning_rate": 1.8300002305668884e-05, "loss": 0.3922, "step": 83100 }, { "epoch": 2.99509856921469, "grad_norm": 0.19063615798950195, "learning_rate": 1.829719096609625e-05, "loss": 0.3862, "step": 83105 }, { "epoch": 2.9952787688759144, "grad_norm": 0.2311249077320099, "learning_rate": 
1.8294379717848095e-05, "loss": 0.3986, "step": 83110 }, { "epoch": 2.995458968537139, "grad_norm": 0.18495197594165802, "learning_rate": 1.8291568560962723e-05, "loss": 0.4199, "step": 83115 }, { "epoch": 2.995639168198364, "grad_norm": 0.1809265911579132, "learning_rate": 1.828875749547842e-05, "loss": 0.4077, "step": 83120 }, { "epoch": 2.9958193678595886, "grad_norm": 0.2279941588640213, "learning_rate": 1.82859465214335e-05, "loss": 0.4244, "step": 83125 }, { "epoch": 2.995999567520813, "grad_norm": 0.23656921088695526, "learning_rate": 1.8283135638866263e-05, "loss": 0.3996, "step": 83130 }, { "epoch": 2.9961797671820376, "grad_norm": 0.2122529149055481, "learning_rate": 1.8280324847815e-05, "loss": 0.4041, "step": 83135 }, { "epoch": 2.9963599668432623, "grad_norm": 0.21249407529830933, "learning_rate": 1.8277514148318014e-05, "loss": 0.3736, "step": 83140 }, { "epoch": 2.996540166504487, "grad_norm": 0.2389555722475052, "learning_rate": 1.8274703540413584e-05, "loss": 0.3884, "step": 83145 }, { "epoch": 2.996720366165712, "grad_norm": 0.24687263369560242, "learning_rate": 1.8271893024140034e-05, "loss": 0.4034, "step": 83150 }, { "epoch": 2.996900565826936, "grad_norm": 0.19258186221122742, "learning_rate": 1.826908259953562e-05, "loss": 0.3823, "step": 83155 }, { "epoch": 2.997080765488161, "grad_norm": 0.21487314999103546, "learning_rate": 1.826627226663867e-05, "loss": 0.4198, "step": 83160 }, { "epoch": 2.9972609651493856, "grad_norm": 0.17476046085357666, "learning_rate": 1.826346202548745e-05, "loss": 0.3885, "step": 83165 }, { "epoch": 2.9974411648106103, "grad_norm": 0.22289486229419708, "learning_rate": 1.826065187612025e-05, "loss": 0.3922, "step": 83170 }, { "epoch": 2.9976213644718346, "grad_norm": 0.21170969307422638, "learning_rate": 1.825784181857537e-05, "loss": 0.3817, "step": 83175 }, { "epoch": 2.9978015641330593, "grad_norm": 0.17165972292423248, "learning_rate": 1.825503185289109e-05, "loss": 0.3939, "step": 83180 }, { "epoch": 
2.997981763794284, "grad_norm": 0.2255578637123108, "learning_rate": 1.8252221979105687e-05, "loss": 0.3985, "step": 83185 }, { "epoch": 2.998161963455509, "grad_norm": 0.23678001761436462, "learning_rate": 1.824941219725746e-05, "loss": 0.4077, "step": 83190 }, { "epoch": 2.9983421631167335, "grad_norm": 0.32480308413505554, "learning_rate": 1.824660250738468e-05, "loss": 0.4023, "step": 83195 }, { "epoch": 2.9985223627779583, "grad_norm": 0.18831786513328552, "learning_rate": 1.8243792909525643e-05, "loss": 0.3703, "step": 83200 }, { "epoch": 2.9987025624391825, "grad_norm": 0.22749020159244537, "learning_rate": 1.8240983403718614e-05, "loss": 0.3943, "step": 83205 }, { "epoch": 2.9988827621004073, "grad_norm": 0.2319597750902176, "learning_rate": 1.8238173990001874e-05, "loss": 0.3893, "step": 83210 }, { "epoch": 2.999062961761632, "grad_norm": 0.2220054417848587, "learning_rate": 1.8235364668413705e-05, "loss": 0.3845, "step": 83215 }, { "epoch": 2.9992431614228563, "grad_norm": 0.29346784949302673, "learning_rate": 1.8232555438992395e-05, "loss": 0.4334, "step": 83220 }, { "epoch": 2.999423361084081, "grad_norm": 0.19180335104465485, "learning_rate": 1.8229746301776187e-05, "loss": 0.3889, "step": 83225 }, { "epoch": 2.9996035607453058, "grad_norm": 0.2530876100063324, "learning_rate": 1.8226937256803396e-05, "loss": 0.3928, "step": 83230 }, { "epoch": 2.9997837604065305, "grad_norm": 0.1911705732345581, "learning_rate": 1.8224128304112252e-05, "loss": 0.4052, "step": 83235 }, { "epoch": 2.9999639600677552, "grad_norm": 0.21257303655147552, "learning_rate": 1.822131944374106e-05, "loss": 0.3966, "step": 83240 }, { "epoch": 3.0001441597289795, "grad_norm": 0.24478384852409363, "learning_rate": 1.8218510675728085e-05, "loss": 0.3972, "step": 83245 }, { "epoch": 3.0003243593902043, "grad_norm": 0.21207980811595917, "learning_rate": 1.821570200011157e-05, "loss": 0.3956, "step": 83250 }, { "epoch": 3.000504559051429, "grad_norm": 0.19810383021831512, 
"learning_rate": 1.8212893416929817e-05, "loss": 0.3571, "step": 83255 }, { "epoch": 3.0006847587126537, "grad_norm": 0.21242433786392212, "learning_rate": 1.8210084926221078e-05, "loss": 0.3709, "step": 83260 }, { "epoch": 3.0008649583738785, "grad_norm": 0.21151970326900482, "learning_rate": 1.820727652802361e-05, "loss": 0.3948, "step": 83265 }, { "epoch": 3.0010451580351027, "grad_norm": 0.27678802609443665, "learning_rate": 1.8204468222375687e-05, "loss": 0.3665, "step": 83270 }, { "epoch": 3.0012253576963275, "grad_norm": 0.2054269164800644, "learning_rate": 1.8201660009315563e-05, "loss": 0.3515, "step": 83275 }, { "epoch": 3.001405557357552, "grad_norm": 0.27250099182128906, "learning_rate": 1.8198851888881513e-05, "loss": 0.3725, "step": 83280 }, { "epoch": 3.001585757018777, "grad_norm": 0.22905905544757843, "learning_rate": 1.8196043861111788e-05, "loss": 0.3983, "step": 83285 }, { "epoch": 3.0017659566800012, "grad_norm": 0.19048994779586792, "learning_rate": 1.819323592604464e-05, "loss": 0.3538, "step": 83290 }, { "epoch": 3.001946156341226, "grad_norm": 0.2080589234828949, "learning_rate": 1.8190428083718346e-05, "loss": 0.3769, "step": 83295 }, { "epoch": 3.0021263560024507, "grad_norm": 0.20519669353961945, "learning_rate": 1.8187620334171147e-05, "loss": 0.38, "step": 83300 }, { "epoch": 3.0023065556636754, "grad_norm": 0.20568309724330902, "learning_rate": 1.81848126774413e-05, "loss": 0.4157, "step": 83305 }, { "epoch": 3.0024867553249, "grad_norm": 0.1868577003479004, "learning_rate": 1.8182005113567064e-05, "loss": 0.3971, "step": 83310 }, { "epoch": 3.0026669549861245, "grad_norm": 0.21570152044296265, "learning_rate": 1.817919764258668e-05, "loss": 0.3962, "step": 83315 }, { "epoch": 3.002847154647349, "grad_norm": 0.2413526475429535, "learning_rate": 1.817639026453842e-05, "loss": 0.3816, "step": 83320 }, { "epoch": 3.003027354308574, "grad_norm": 0.2983512282371521, "learning_rate": 1.8173582979460517e-05, "loss": 0.4202, "step": 83325 }, 
{ "epoch": 3.0032075539697987, "grad_norm": 0.18553189933300018, "learning_rate": 1.8170775787391213e-05, "loss": 0.3656, "step": 83330 }, { "epoch": 3.0033877536310234, "grad_norm": 0.2300480157136917, "learning_rate": 1.8167968688368787e-05, "loss": 0.3999, "step": 83335 }, { "epoch": 3.0035679532922477, "grad_norm": 0.18656012415885925, "learning_rate": 1.8165161682431444e-05, "loss": 0.3956, "step": 83340 }, { "epoch": 3.0037481529534724, "grad_norm": 0.18259282410144806, "learning_rate": 1.8162354769617468e-05, "loss": 0.3691, "step": 83345 }, { "epoch": 3.003928352614697, "grad_norm": 0.21580132842063904, "learning_rate": 1.815954794996508e-05, "loss": 0.3893, "step": 83350 }, { "epoch": 3.004108552275922, "grad_norm": 0.20566125214099884, "learning_rate": 1.815674122351252e-05, "loss": 0.3643, "step": 83355 }, { "epoch": 3.004288751937146, "grad_norm": 0.2558181583881378, "learning_rate": 1.815393459029804e-05, "loss": 0.3879, "step": 83360 }, { "epoch": 3.004468951598371, "grad_norm": 0.1984751671552658, "learning_rate": 1.815112805035988e-05, "loss": 0.4131, "step": 83365 }, { "epoch": 3.0046491512595956, "grad_norm": 0.19502590596675873, "learning_rate": 1.8148321603736263e-05, "loss": 0.3646, "step": 83370 }, { "epoch": 3.0048293509208204, "grad_norm": 0.22936676442623138, "learning_rate": 1.8145515250465446e-05, "loss": 0.3653, "step": 83375 }, { "epoch": 3.005009550582045, "grad_norm": 0.2669048309326172, "learning_rate": 1.814270899058565e-05, "loss": 0.4142, "step": 83380 }, { "epoch": 3.0051897502432694, "grad_norm": 0.21119844913482666, "learning_rate": 1.8139902824135124e-05, "loss": 0.4132, "step": 83385 }, { "epoch": 3.005369949904494, "grad_norm": 0.20271877944469452, "learning_rate": 1.8137096751152093e-05, "loss": 0.3699, "step": 83390 }, { "epoch": 3.005550149565719, "grad_norm": 0.21155229210853577, "learning_rate": 1.8134290771674784e-05, "loss": 0.3365, "step": 83395 }, { "epoch": 3.0057303492269436, "grad_norm": 0.21775977313518524, 
"learning_rate": 1.813148488574144e-05, "loss": 0.3988, "step": 83400 }, { "epoch": 3.005910548888168, "grad_norm": 0.2257860004901886, "learning_rate": 1.8128679093390282e-05, "loss": 0.3699, "step": 83405 }, { "epoch": 3.0060907485493926, "grad_norm": 0.1963503360748291, "learning_rate": 1.8125873394659543e-05, "loss": 0.358, "step": 83410 }, { "epoch": 3.0062709482106174, "grad_norm": 0.17363089323043823, "learning_rate": 1.8123067789587443e-05, "loss": 0.3867, "step": 83415 }, { "epoch": 3.006451147871842, "grad_norm": 0.22728854417800903, "learning_rate": 1.8120262278212216e-05, "loss": 0.3821, "step": 83420 }, { "epoch": 3.006631347533067, "grad_norm": 0.2198934406042099, "learning_rate": 1.8117456860572085e-05, "loss": 0.3799, "step": 83425 }, { "epoch": 3.006811547194291, "grad_norm": 0.27746328711509705, "learning_rate": 1.811465153670528e-05, "loss": 0.3917, "step": 83430 }, { "epoch": 3.006991746855516, "grad_norm": 0.25935912132263184, "learning_rate": 1.811184630665e-05, "loss": 0.3629, "step": 83435 }, { "epoch": 3.0071719465167406, "grad_norm": 0.2055579125881195, "learning_rate": 1.810904117044449e-05, "loss": 0.361, "step": 83440 }, { "epoch": 3.0073521461779653, "grad_norm": 0.2863004207611084, "learning_rate": 1.810623612812696e-05, "loss": 0.3877, "step": 83445 }, { "epoch": 3.0075323458391896, "grad_norm": 0.21766206622123718, "learning_rate": 1.810343117973562e-05, "loss": 0.3755, "step": 83450 }, { "epoch": 3.0077125455004143, "grad_norm": 0.21764706075191498, "learning_rate": 1.81006263253087e-05, "loss": 0.4227, "step": 83455 }, { "epoch": 3.007892745161639, "grad_norm": 0.25549066066741943, "learning_rate": 1.8097821564884408e-05, "loss": 0.4138, "step": 83460 }, { "epoch": 3.008072944822864, "grad_norm": 0.22530651092529297, "learning_rate": 1.8095016898500962e-05, "loss": 0.4006, "step": 83465 }, { "epoch": 3.0082531444840885, "grad_norm": 0.20702116191387177, "learning_rate": 1.8092212326196578e-05, "loss": 0.3831, "step": 83470 }, { 
"epoch": 3.008433344145313, "grad_norm": 0.21814846992492676, "learning_rate": 1.8089407848009457e-05, "loss": 0.3723, "step": 83475 }, { "epoch": 3.0086135438065376, "grad_norm": 0.19958257675170898, "learning_rate": 1.808660346397782e-05, "loss": 0.3971, "step": 83480 }, { "epoch": 3.0087937434677623, "grad_norm": 0.2572198510169983, "learning_rate": 1.8083799174139876e-05, "loss": 0.3909, "step": 83485 }, { "epoch": 3.008973943128987, "grad_norm": 0.1767335683107376, "learning_rate": 1.8080994978533822e-05, "loss": 0.3866, "step": 83490 }, { "epoch": 3.0091541427902118, "grad_norm": 0.22340673208236694, "learning_rate": 1.8078190877197876e-05, "loss": 0.3917, "step": 83495 }, { "epoch": 3.009334342451436, "grad_norm": 0.21768926084041595, "learning_rate": 1.8075386870170233e-05, "loss": 0.3831, "step": 83500 }, { "epoch": 3.009334342451436, "eval_loss": 0.4312828481197357, "eval_runtime": 3.5249, "eval_samples_per_second": 28.369, "eval_steps_per_second": 7.092, "step": 83500 }, { "epoch": 3.009514542112661, "grad_norm": 0.23713305592536926, "learning_rate": 1.8072582957489108e-05, "loss": 0.3742, "step": 83505 }, { "epoch": 3.0096947417738855, "grad_norm": 0.18255163729190826, "learning_rate": 1.806977913919271e-05, "loss": 0.407, "step": 83510 }, { "epoch": 3.0098749414351103, "grad_norm": 0.20699362456798553, "learning_rate": 1.806697541531921e-05, "loss": 0.348, "step": 83515 }, { "epoch": 3.0100551410963345, "grad_norm": 0.22750404477119446, "learning_rate": 1.806417178590684e-05, "loss": 0.3584, "step": 83520 }, { "epoch": 3.0102353407575593, "grad_norm": 0.2072783261537552, "learning_rate": 1.8061368250993777e-05, "loss": 0.42, "step": 83525 }, { "epoch": 3.010415540418784, "grad_norm": 0.18794316053390503, "learning_rate": 1.8058564810618233e-05, "loss": 0.3754, "step": 83530 }, { "epoch": 3.0105957400800087, "grad_norm": 0.15637893974781036, "learning_rate": 1.805576146481841e-05, "loss": 0.3791, "step": 83535 }, { "epoch": 3.0107759397412335, 
"grad_norm": 0.23354141414165497, "learning_rate": 1.8052958213632477e-05, "loss": 0.3636, "step": 83540 }, { "epoch": 3.0109561394024578, "grad_norm": 0.20593926310539246, "learning_rate": 1.8050155057098656e-05, "loss": 0.35, "step": 83545 }, { "epoch": 3.0111363390636825, "grad_norm": 0.26360243558883667, "learning_rate": 1.8047351995255123e-05, "loss": 0.3946, "step": 83550 }, { "epoch": 3.0113165387249072, "grad_norm": 0.20182831585407257, "learning_rate": 1.8044549028140068e-05, "loss": 0.3674, "step": 83555 }, { "epoch": 3.011496738386132, "grad_norm": 0.1877097487449646, "learning_rate": 1.804174615579169e-05, "loss": 0.3906, "step": 83560 }, { "epoch": 3.0116769380473563, "grad_norm": 0.23039346933364868, "learning_rate": 1.8038943378248165e-05, "loss": 0.3893, "step": 83565 }, { "epoch": 3.011857137708581, "grad_norm": 0.25139108300209045, "learning_rate": 1.80361406955477e-05, "loss": 0.3811, "step": 83570 }, { "epoch": 3.0120373373698057, "grad_norm": 0.2260846644639969, "learning_rate": 1.8033338107728465e-05, "loss": 0.3805, "step": 83575 }, { "epoch": 3.0122175370310305, "grad_norm": 0.21495521068572998, "learning_rate": 1.803053561482865e-05, "loss": 0.3935, "step": 83580 }, { "epoch": 3.012397736692255, "grad_norm": 0.24546730518341064, "learning_rate": 1.802773321688644e-05, "loss": 0.3861, "step": 83585 }, { "epoch": 3.0125779363534795, "grad_norm": 0.2514590322971344, "learning_rate": 1.8024930913940015e-05, "loss": 0.3718, "step": 83590 }, { "epoch": 3.012758136014704, "grad_norm": 0.23116329312324524, "learning_rate": 1.8022128706027547e-05, "loss": 0.354, "step": 83595 }, { "epoch": 3.012938335675929, "grad_norm": 0.17475327849388123, "learning_rate": 1.8019326593187232e-05, "loss": 0.3732, "step": 83600 }, { "epoch": 3.0131185353371537, "grad_norm": 0.2183002233505249, "learning_rate": 1.801652457545724e-05, "loss": 0.3881, "step": 83605 }, { "epoch": 3.0132987349983784, "grad_norm": 0.23634588718414307, "learning_rate": 
1.8013722652875747e-05, "loss": 0.4003, "step": 83610 }, { "epoch": 3.0134789346596027, "grad_norm": 0.22611938416957855, "learning_rate": 1.801092082548094e-05, "loss": 0.3966, "step": 83615 }, { "epoch": 3.0136591343208274, "grad_norm": 0.23099973797798157, "learning_rate": 1.8008119093310965e-05, "loss": 0.3739, "step": 83620 }, { "epoch": 3.013839333982052, "grad_norm": 0.2718352675437927, "learning_rate": 1.8005317456404034e-05, "loss": 0.3796, "step": 83625 }, { "epoch": 3.014019533643277, "grad_norm": 0.18968239426612854, "learning_rate": 1.800251591479829e-05, "loss": 0.3575, "step": 83630 }, { "epoch": 3.014199733304501, "grad_norm": 0.1754741370677948, "learning_rate": 1.7999714468531906e-05, "loss": 0.374, "step": 83635 }, { "epoch": 3.014379932965726, "grad_norm": 0.25998741388320923, "learning_rate": 1.7996913117643064e-05, "loss": 0.3882, "step": 83640 }, { "epoch": 3.0145601326269507, "grad_norm": 0.25412383675575256, "learning_rate": 1.799411186216992e-05, "loss": 0.3873, "step": 83645 }, { "epoch": 3.0147403322881754, "grad_norm": 0.23785263299942017, "learning_rate": 1.7991310702150647e-05, "loss": 0.3314, "step": 83650 }, { "epoch": 3.0149205319494, "grad_norm": 0.21944360435009003, "learning_rate": 1.7988509637623414e-05, "loss": 0.3885, "step": 83655 }, { "epoch": 3.0151007316106244, "grad_norm": 0.20874114334583282, "learning_rate": 1.7985708668626373e-05, "loss": 0.3503, "step": 83660 }, { "epoch": 3.015280931271849, "grad_norm": 0.19543851912021637, "learning_rate": 1.7982907795197695e-05, "loss": 0.3552, "step": 83665 }, { "epoch": 3.015461130933074, "grad_norm": 0.17719921469688416, "learning_rate": 1.7980107017375543e-05, "loss": 0.3681, "step": 83670 }, { "epoch": 3.0156413305942986, "grad_norm": 0.20741024613380432, "learning_rate": 1.7977306335198067e-05, "loss": 0.3569, "step": 83675 }, { "epoch": 3.015821530255523, "grad_norm": 0.2016208916902542, "learning_rate": 1.797450574870344e-05, "loss": 0.3684, "step": 83680 }, { "epoch": 
3.0160017299167476, "grad_norm": 0.23120005428791046, "learning_rate": 1.7971705257929806e-05, "loss": 0.3627, "step": 83685 }, { "epoch": 3.0161819295779724, "grad_norm": 0.22338317334651947, "learning_rate": 1.796890486291533e-05, "loss": 0.3849, "step": 83690 }, { "epoch": 3.016362129239197, "grad_norm": 0.2729624807834625, "learning_rate": 1.7966104563698165e-05, "loss": 0.3771, "step": 83695 }, { "epoch": 3.016542328900422, "grad_norm": 0.24947573244571686, "learning_rate": 1.796330436031646e-05, "loss": 0.3744, "step": 83700 }, { "epoch": 3.016722528561646, "grad_norm": 0.22686618566513062, "learning_rate": 1.7960504252808374e-05, "loss": 0.3786, "step": 83705 }, { "epoch": 3.016902728222871, "grad_norm": 0.2134973108768463, "learning_rate": 1.7957704241212063e-05, "loss": 0.4183, "step": 83710 }, { "epoch": 3.0170829278840956, "grad_norm": 0.17952316999435425, "learning_rate": 1.795490432556565e-05, "loss": 0.3963, "step": 83715 }, { "epoch": 3.0172631275453203, "grad_norm": 0.22514955699443817, "learning_rate": 1.795210450590732e-05, "loss": 0.3789, "step": 83720 }, { "epoch": 3.017443327206545, "grad_norm": 0.19981583952903748, "learning_rate": 1.794930478227518e-05, "loss": 0.3763, "step": 83725 }, { "epoch": 3.0176235268677694, "grad_norm": 0.2752089500427246, "learning_rate": 1.7946505154707416e-05, "loss": 0.3775, "step": 83730 }, { "epoch": 3.017803726528994, "grad_norm": 0.21153953671455383, "learning_rate": 1.7943705623242153e-05, "loss": 0.3939, "step": 83735 }, { "epoch": 3.017983926190219, "grad_norm": 0.22683672606945038, "learning_rate": 1.794090618791752e-05, "loss": 0.3739, "step": 83740 }, { "epoch": 3.0181641258514436, "grad_norm": 0.2554776668548584, "learning_rate": 1.793810684877169e-05, "loss": 0.4071, "step": 83745 }, { "epoch": 3.018344325512668, "grad_norm": 0.2107311338186264, "learning_rate": 1.7935307605842775e-05, "loss": 0.3646, "step": 83750 }, { "epoch": 3.0185245251738926, "grad_norm": 0.22267073392868042, "learning_rate": 
1.793250845916893e-05, "loss": 0.4121, "step": 83755 }, { "epoch": 3.0187047248351173, "grad_norm": 0.2603355646133423, "learning_rate": 1.7929709408788292e-05, "loss": 0.371, "step": 83760 }, { "epoch": 3.018884924496342, "grad_norm": 0.22211819887161255, "learning_rate": 1.792691045473899e-05, "loss": 0.353, "step": 83765 }, { "epoch": 3.019065124157567, "grad_norm": 0.1939542144536972, "learning_rate": 1.792411159705917e-05, "loss": 0.3708, "step": 83770 }, { "epoch": 3.019245323818791, "grad_norm": 0.23032665252685547, "learning_rate": 1.7921312835786952e-05, "loss": 0.3856, "step": 83775 }, { "epoch": 3.019425523480016, "grad_norm": 0.22775132954120636, "learning_rate": 1.7918514170960478e-05, "loss": 0.3811, "step": 83780 }, { "epoch": 3.0196057231412405, "grad_norm": 0.2607553005218506, "learning_rate": 1.791571560261788e-05, "loss": 0.3733, "step": 83785 }, { "epoch": 3.0197859228024653, "grad_norm": 0.22313840687274933, "learning_rate": 1.791291713079728e-05, "loss": 0.4006, "step": 83790 }, { "epoch": 3.0199661224636896, "grad_norm": 0.22880607843399048, "learning_rate": 1.791011875553682e-05, "loss": 0.4368, "step": 83795 }, { "epoch": 3.0201463221249143, "grad_norm": 0.2502082288265228, "learning_rate": 1.790732047687462e-05, "loss": 0.3866, "step": 83800 }, { "epoch": 3.020326521786139, "grad_norm": 0.23791562020778656, "learning_rate": 1.7904522294848792e-05, "loss": 0.3749, "step": 83805 }, { "epoch": 3.0205067214473638, "grad_norm": 0.22234022617340088, "learning_rate": 1.790172420949749e-05, "loss": 0.3662, "step": 83810 }, { "epoch": 3.0206869211085885, "grad_norm": 0.169847309589386, "learning_rate": 1.7898926220858817e-05, "loss": 0.361, "step": 83815 }, { "epoch": 3.020867120769813, "grad_norm": 0.1907045543193817, "learning_rate": 1.7896128328970886e-05, "loss": 0.3964, "step": 83820 }, { "epoch": 3.0210473204310375, "grad_norm": 0.21488279104232788, "learning_rate": 1.789333053387185e-05, "loss": 0.3986, "step": 83825 }, { "epoch": 
3.0212275200922623, "grad_norm": 0.193211168050766, "learning_rate": 1.7890532835599793e-05, "loss": 0.3955, "step": 83830 }, { "epoch": 3.021407719753487, "grad_norm": 0.2113611400127411, "learning_rate": 1.7887735234192864e-05, "loss": 0.3767, "step": 83835 }, { "epoch": 3.0215879194147117, "grad_norm": 0.2665266990661621, "learning_rate": 1.788493772968916e-05, "loss": 0.3705, "step": 83840 }, { "epoch": 3.021768119075936, "grad_norm": 0.24903304874897003, "learning_rate": 1.78821403221268e-05, "loss": 0.4004, "step": 83845 }, { "epoch": 3.0219483187371607, "grad_norm": 0.2464405745267868, "learning_rate": 1.7879343011543905e-05, "loss": 0.3908, "step": 83850 }, { "epoch": 3.0221285183983855, "grad_norm": 0.2581247389316559, "learning_rate": 1.7876545797978576e-05, "loss": 0.3983, "step": 83855 }, { "epoch": 3.02230871805961, "grad_norm": 0.3195907473564148, "learning_rate": 1.7873748681468932e-05, "loss": 0.3965, "step": 83860 }, { "epoch": 3.0224889177208345, "grad_norm": 0.22984401881694794, "learning_rate": 1.787095166205309e-05, "loss": 0.3762, "step": 83865 }, { "epoch": 3.0226691173820592, "grad_norm": 0.22616103291511536, "learning_rate": 1.7868154739769144e-05, "loss": 0.3776, "step": 83870 }, { "epoch": 3.022849317043284, "grad_norm": 0.17865821719169617, "learning_rate": 1.7865357914655212e-05, "loss": 0.3676, "step": 83875 }, { "epoch": 3.0230295167045087, "grad_norm": 0.21580396592617035, "learning_rate": 1.78625611867494e-05, "loss": 0.3549, "step": 83880 }, { "epoch": 3.0232097163657334, "grad_norm": 0.22355122864246368, "learning_rate": 1.7859764556089804e-05, "loss": 0.4141, "step": 83885 }, { "epoch": 3.0233899160269577, "grad_norm": 0.2217842936515808, "learning_rate": 1.7856968022714535e-05, "loss": 0.387, "step": 83890 }, { "epoch": 3.0235701156881825, "grad_norm": 0.29023489356040955, "learning_rate": 1.78541715866617e-05, "loss": 0.3755, "step": 83895 }, { "epoch": 3.023750315349407, "grad_norm": 0.19948424398899078, "learning_rate": 
1.785137524796938e-05, "loss": 0.3434, "step": 83900 }, { "epoch": 3.023930515010632, "grad_norm": 0.1968270093202591, "learning_rate": 1.7848579006675707e-05, "loss": 0.3828, "step": 83905 }, { "epoch": 3.024110714671856, "grad_norm": 0.20622436702251434, "learning_rate": 1.784578286281874e-05, "loss": 0.3828, "step": 83910 }, { "epoch": 3.024290914333081, "grad_norm": 0.25507232546806335, "learning_rate": 1.7842986816436613e-05, "loss": 0.3875, "step": 83915 }, { "epoch": 3.0244711139943057, "grad_norm": 0.18650664389133453, "learning_rate": 1.7840190867567403e-05, "loss": 0.3826, "step": 83920 }, { "epoch": 3.0246513136555304, "grad_norm": 0.23238393664360046, "learning_rate": 1.7837395016249198e-05, "loss": 0.3914, "step": 83925 }, { "epoch": 3.024831513316755, "grad_norm": 0.23915240168571472, "learning_rate": 1.7834599262520102e-05, "loss": 0.3834, "step": 83930 }, { "epoch": 3.0250117129779794, "grad_norm": 0.2193489819765091, "learning_rate": 1.78318036064182e-05, "loss": 0.4019, "step": 83935 }, { "epoch": 3.025191912639204, "grad_norm": 0.2331564873456955, "learning_rate": 1.7829008047981594e-05, "loss": 0.3714, "step": 83940 }, { "epoch": 3.025372112300429, "grad_norm": 0.22607645392417908, "learning_rate": 1.7826212587248364e-05, "loss": 0.4046, "step": 83945 }, { "epoch": 3.0255523119616536, "grad_norm": 0.235866516828537, "learning_rate": 1.7823417224256594e-05, "loss": 0.3759, "step": 83950 }, { "epoch": 3.025732511622878, "grad_norm": 0.23294222354888916, "learning_rate": 1.782062195904438e-05, "loss": 0.3864, "step": 83955 }, { "epoch": 3.0259127112841027, "grad_norm": 0.22986343502998352, "learning_rate": 1.78178267916498e-05, "loss": 0.3758, "step": 83960 }, { "epoch": 3.0260929109453274, "grad_norm": 0.17476344108581543, "learning_rate": 1.7815031722110935e-05, "loss": 0.3921, "step": 83965 }, { "epoch": 3.026273110606552, "grad_norm": 0.23283426463603973, "learning_rate": 1.7812236750465876e-05, "loss": 0.4051, "step": 83970 }, { "epoch": 
3.026453310267777, "grad_norm": 0.19704559445381165, "learning_rate": 1.7809441876752695e-05, "loss": 0.3977, "step": 83975 }, { "epoch": 3.026633509929001, "grad_norm": 0.1991409957408905, "learning_rate": 1.7806647101009484e-05, "loss": 0.3977, "step": 83980 }, { "epoch": 3.026813709590226, "grad_norm": 0.2306070774793625, "learning_rate": 1.780385242327431e-05, "loss": 0.3909, "step": 83985 }, { "epoch": 3.0269939092514506, "grad_norm": 0.25500649213790894, "learning_rate": 1.780105784358525e-05, "loss": 0.4118, "step": 83990 }, { "epoch": 3.0271741089126754, "grad_norm": 0.19943642616271973, "learning_rate": 1.7798263361980388e-05, "loss": 0.366, "step": 83995 }, { "epoch": 3.0273543085739, "grad_norm": 0.22723056375980377, "learning_rate": 1.77954689784978e-05, "loss": 0.4048, "step": 84000 }, { "epoch": 3.0273543085739, "eval_loss": 0.4311674237251282, "eval_runtime": 3.5287, "eval_samples_per_second": 28.339, "eval_steps_per_second": 7.085, "step": 84000 }, { "epoch": 3.0275345082351244, "grad_norm": 0.20490515232086182, "learning_rate": 1.7792674693175534e-05, "loss": 0.3958, "step": 84005 }, { "epoch": 3.027714707896349, "grad_norm": 0.24055179953575134, "learning_rate": 1.77898805060517e-05, "loss": 0.3762, "step": 84010 }, { "epoch": 3.027894907557574, "grad_norm": 0.25771138072013855, "learning_rate": 1.778708641716433e-05, "loss": 0.4009, "step": 84015 }, { "epoch": 3.0280751072187986, "grad_norm": 0.19675886631011963, "learning_rate": 1.7784292426551525e-05, "loss": 0.3787, "step": 84020 }, { "epoch": 3.028255306880023, "grad_norm": 0.2063799649477005, "learning_rate": 1.7781498534251334e-05, "loss": 0.428, "step": 84025 }, { "epoch": 3.0284355065412476, "grad_norm": 0.23454803228378296, "learning_rate": 1.7778704740301823e-05, "loss": 0.3933, "step": 84030 }, { "epoch": 3.0286157062024723, "grad_norm": 0.20957069098949432, "learning_rate": 1.7775911044741067e-05, "loss": 0.3725, "step": 84035 }, { "epoch": 3.028795905863697, "grad_norm": 
0.24163176119327545, "learning_rate": 1.7773117447607128e-05, "loss": 0.3617, "step": 84040 }, { "epoch": 3.028976105524922, "grad_norm": 0.19776524603366852, "learning_rate": 1.7770323948938056e-05, "loss": 0.3595, "step": 84045 }, { "epoch": 3.029156305186146, "grad_norm": 0.21786345541477203, "learning_rate": 1.7767530548771926e-05, "loss": 0.4257, "step": 84050 }, { "epoch": 3.029336504847371, "grad_norm": 0.2272718995809555, "learning_rate": 1.7764737247146783e-05, "loss": 0.3771, "step": 84055 }, { "epoch": 3.0295167045085956, "grad_norm": 0.2050115466117859, "learning_rate": 1.7761944044100707e-05, "loss": 0.3718, "step": 84060 }, { "epoch": 3.0296969041698203, "grad_norm": 0.26552021503448486, "learning_rate": 1.7759150939671735e-05, "loss": 0.3983, "step": 84065 }, { "epoch": 3.0298771038310446, "grad_norm": 0.22904445230960846, "learning_rate": 1.775635793389793e-05, "loss": 0.377, "step": 84070 }, { "epoch": 3.0300573034922693, "grad_norm": 0.1924734264612198, "learning_rate": 1.7753565026817348e-05, "loss": 0.3702, "step": 84075 }, { "epoch": 3.030237503153494, "grad_norm": 0.2412857860326767, "learning_rate": 1.775077221846805e-05, "loss": 0.3691, "step": 84080 }, { "epoch": 3.030417702814719, "grad_norm": 0.2012851983308792, "learning_rate": 1.7747979508888053e-05, "loss": 0.3935, "step": 84085 }, { "epoch": 3.0305979024759435, "grad_norm": 0.21327096223831177, "learning_rate": 1.774518689811545e-05, "loss": 0.3731, "step": 84090 }, { "epoch": 3.030778102137168, "grad_norm": 0.19363613426685333, "learning_rate": 1.7742394386188255e-05, "loss": 0.3753, "step": 84095 }, { "epoch": 3.0309583017983925, "grad_norm": 0.19046692550182343, "learning_rate": 1.7739601973144547e-05, "loss": 0.3963, "step": 84100 }, { "epoch": 3.0311385014596173, "grad_norm": 0.2194364368915558, "learning_rate": 1.7736809659022354e-05, "loss": 0.374, "step": 84105 }, { "epoch": 3.031318701120842, "grad_norm": 0.1966681331396103, "learning_rate": 1.7734017443859715e-05, "loss": 
0.3419, "step": 84110 }, { "epoch": 3.0314989007820667, "grad_norm": 0.2259567826986313, "learning_rate": 1.7731225327694688e-05, "loss": 0.3931, "step": 84115 }, { "epoch": 3.031679100443291, "grad_norm": 0.21452829241752625, "learning_rate": 1.7728433310565302e-05, "loss": 0.3811, "step": 84120 }, { "epoch": 3.0318593001045158, "grad_norm": 0.20977729558944702, "learning_rate": 1.7725641392509614e-05, "loss": 0.3805, "step": 84125 }, { "epoch": 3.0320394997657405, "grad_norm": 0.2387772798538208, "learning_rate": 1.772284957356565e-05, "loss": 0.3516, "step": 84130 }, { "epoch": 3.0322196994269652, "grad_norm": 0.21212005615234375, "learning_rate": 1.772005785377145e-05, "loss": 0.3751, "step": 84135 }, { "epoch": 3.0323998990881895, "grad_norm": 0.22513705492019653, "learning_rate": 1.7717266233165054e-05, "loss": 0.3696, "step": 84140 }, { "epoch": 3.0325800987494143, "grad_norm": 0.25777748227119446, "learning_rate": 1.7714474711784496e-05, "loss": 0.3954, "step": 84145 }, { "epoch": 3.032760298410639, "grad_norm": 0.20755411684513092, "learning_rate": 1.77116832896678e-05, "loss": 0.3365, "step": 84150 }, { "epoch": 3.0329404980718637, "grad_norm": 0.18738418817520142, "learning_rate": 1.770889196685302e-05, "loss": 0.3949, "step": 84155 }, { "epoch": 3.0331206977330885, "grad_norm": 0.21156027913093567, "learning_rate": 1.7706100743378168e-05, "loss": 0.3566, "step": 84160 }, { "epoch": 3.0333008973943127, "grad_norm": 0.28246161341667175, "learning_rate": 1.770330961928129e-05, "loss": 0.3625, "step": 84165 }, { "epoch": 3.0334810970555375, "grad_norm": 0.22130875289440155, "learning_rate": 1.77005185946004e-05, "loss": 0.3402, "step": 84170 }, { "epoch": 3.033661296716762, "grad_norm": 0.22155120968818665, "learning_rate": 1.7697727669373525e-05, "loss": 0.375, "step": 84175 }, { "epoch": 3.033841496377987, "grad_norm": 0.22286799550056458, "learning_rate": 1.7694936843638707e-05, "loss": 0.3858, "step": 84180 }, { "epoch": 3.0340216960392112, "grad_norm": 
0.1948540061712265, "learning_rate": 1.7692146117433965e-05, "loss": 0.3905, "step": 84185 }, { "epoch": 3.034201895700436, "grad_norm": 0.228069469332695, "learning_rate": 1.76893554907973e-05, "loss": 0.41, "step": 84190 }, { "epoch": 3.0343820953616607, "grad_norm": 0.21206477284431458, "learning_rate": 1.7686564963766765e-05, "loss": 0.3649, "step": 84195 }, { "epoch": 3.0345622950228854, "grad_norm": 0.2614726722240448, "learning_rate": 1.768377453638035e-05, "loss": 0.377, "step": 84200 }, { "epoch": 3.03474249468411, "grad_norm": 0.21512839198112488, "learning_rate": 1.768098420867611e-05, "loss": 0.3931, "step": 84205 }, { "epoch": 3.0349226943453345, "grad_norm": 0.22823360562324524, "learning_rate": 1.7678193980692036e-05, "loss": 0.368, "step": 84210 }, { "epoch": 3.035102894006559, "grad_norm": 0.2092510610818863, "learning_rate": 1.767540385246615e-05, "loss": 0.3904, "step": 84215 }, { "epoch": 3.035283093667784, "grad_norm": 0.22960545122623444, "learning_rate": 1.767261382403647e-05, "loss": 0.4228, "step": 84220 }, { "epoch": 3.0354632933290087, "grad_norm": 0.2698182761669159, "learning_rate": 1.7669823895441007e-05, "loss": 0.3652, "step": 84225 }, { "epoch": 3.0356434929902334, "grad_norm": 0.27109652757644653, "learning_rate": 1.7667034066717768e-05, "loss": 0.3765, "step": 84230 }, { "epoch": 3.0358236926514577, "grad_norm": 0.178133025765419, "learning_rate": 1.766424433790478e-05, "loss": 0.3916, "step": 84235 }, { "epoch": 3.0360038923126824, "grad_norm": 0.26019546389579773, "learning_rate": 1.7661454709040036e-05, "loss": 0.4084, "step": 84240 }, { "epoch": 3.036184091973907, "grad_norm": 0.21639469265937805, "learning_rate": 1.7658665180161555e-05, "loss": 0.3909, "step": 84245 }, { "epoch": 3.036364291635132, "grad_norm": 0.2332758754491806, "learning_rate": 1.7655875751307338e-05, "loss": 0.3712, "step": 84250 }, { "epoch": 3.036544491296356, "grad_norm": 0.19095510244369507, "learning_rate": 1.765308642251539e-05, "loss": 0.3552, 
"step": 84255 }, { "epoch": 3.036724690957581, "grad_norm": 0.18508897721767426, "learning_rate": 1.765029719382372e-05, "loss": 0.4014, "step": 84260 }, { "epoch": 3.0369048906188056, "grad_norm": 0.22159694135189056, "learning_rate": 1.7647508065270335e-05, "loss": 0.3882, "step": 84265 }, { "epoch": 3.0370850902800304, "grad_norm": 0.22813962399959564, "learning_rate": 1.764471903689321e-05, "loss": 0.3698, "step": 84270 }, { "epoch": 3.037265289941255, "grad_norm": 0.21553412079811096, "learning_rate": 1.7641930108730377e-05, "loss": 0.3682, "step": 84275 }, { "epoch": 3.0374454896024794, "grad_norm": 0.2294461727142334, "learning_rate": 1.7639141280819815e-05, "loss": 0.3852, "step": 84280 }, { "epoch": 3.037625689263704, "grad_norm": 0.240443617105484, "learning_rate": 1.7636352553199537e-05, "loss": 0.3859, "step": 84285 }, { "epoch": 3.037805888924929, "grad_norm": 0.21389326453208923, "learning_rate": 1.7633563925907533e-05, "loss": 0.36, "step": 84290 }, { "epoch": 3.0379860885861536, "grad_norm": 0.21817508339881897, "learning_rate": 1.7630775398981777e-05, "loss": 0.4133, "step": 84295 }, { "epoch": 3.038166288247378, "grad_norm": 0.2270621359348297, "learning_rate": 1.7627986972460298e-05, "loss": 0.4165, "step": 84300 }, { "epoch": 3.0383464879086026, "grad_norm": 0.24196819961071014, "learning_rate": 1.762519864638106e-05, "loss": 0.393, "step": 84305 }, { "epoch": 3.0385266875698274, "grad_norm": 0.2399672567844391, "learning_rate": 1.7622410420782058e-05, "loss": 0.4056, "step": 84310 }, { "epoch": 3.038706887231052, "grad_norm": 0.23226651549339294, "learning_rate": 1.7619622295701296e-05, "loss": 0.3552, "step": 84315 }, { "epoch": 3.038887086892277, "grad_norm": 0.2692866027355194, "learning_rate": 1.761683427117674e-05, "loss": 0.3849, "step": 84320 }, { "epoch": 3.039067286553501, "grad_norm": 0.2093464732170105, "learning_rate": 1.76140463472464e-05, "loss": 0.3731, "step": 84325 }, { "epoch": 3.039247486214726, "grad_norm": 
0.227401465177536, "learning_rate": 1.7611258523948242e-05, "loss": 0.3783, "step": 84330 }, { "epoch": 3.0394276858759506, "grad_norm": 0.212887704372406, "learning_rate": 1.7608470801320253e-05, "loss": 0.354, "step": 84335 }, { "epoch": 3.0396078855371753, "grad_norm": 0.2043050229549408, "learning_rate": 1.7605683179400423e-05, "loss": 0.3788, "step": 84340 }, { "epoch": 3.0397880851984, "grad_norm": 0.24479785561561584, "learning_rate": 1.7602895658226725e-05, "loss": 0.3753, "step": 84345 }, { "epoch": 3.0399682848596243, "grad_norm": 0.2218136191368103, "learning_rate": 1.7600108237837144e-05, "loss": 0.4084, "step": 84350 }, { "epoch": 3.040148484520849, "grad_norm": 0.23123931884765625, "learning_rate": 1.7597320918269658e-05, "loss": 0.4059, "step": 84355 }, { "epoch": 3.040328684182074, "grad_norm": 0.2367209494113922, "learning_rate": 1.7594533699562234e-05, "loss": 0.3901, "step": 84360 }, { "epoch": 3.0405088838432985, "grad_norm": 0.22968047857284546, "learning_rate": 1.759174658175286e-05, "loss": 0.3823, "step": 84365 }, { "epoch": 3.040689083504523, "grad_norm": 0.23370803892612457, "learning_rate": 1.758895956487951e-05, "loss": 0.384, "step": 84370 }, { "epoch": 3.0408692831657476, "grad_norm": 0.19679541885852814, "learning_rate": 1.758617264898014e-05, "loss": 0.3785, "step": 84375 }, { "epoch": 3.0410494828269723, "grad_norm": 0.2017953097820282, "learning_rate": 1.758338583409274e-05, "loss": 0.3986, "step": 84380 }, { "epoch": 3.041229682488197, "grad_norm": 0.1906876564025879, "learning_rate": 1.7580599120255263e-05, "loss": 0.3619, "step": 84385 }, { "epoch": 3.0414098821494218, "grad_norm": 0.1913730502128601, "learning_rate": 1.75778125075057e-05, "loss": 0.3926, "step": 84390 }, { "epoch": 3.041590081810646, "grad_norm": 0.24644358456134796, "learning_rate": 1.7575025995881998e-05, "loss": 0.3757, "step": 84395 }, { "epoch": 3.041770281471871, "grad_norm": 0.2385040670633316, "learning_rate": 1.7572239585422116e-05, "loss": 0.4107, 
"step": 84400 }, { "epoch": 3.0419504811330955, "grad_norm": 0.18101677298545837, "learning_rate": 1.7569453276164054e-05, "loss": 0.387, "step": 84405 }, { "epoch": 3.0421306807943203, "grad_norm": 0.21769458055496216, "learning_rate": 1.756666706814574e-05, "loss": 0.4045, "step": 84410 }, { "epoch": 3.0423108804555445, "grad_norm": 0.2499932199716568, "learning_rate": 1.7563880961405148e-05, "loss": 0.3898, "step": 84415 }, { "epoch": 3.0424910801167693, "grad_norm": 0.24783965945243835, "learning_rate": 1.7561094955980247e-05, "loss": 0.3607, "step": 84420 }, { "epoch": 3.042671279777994, "grad_norm": 0.21402627229690552, "learning_rate": 1.7558309051908976e-05, "loss": 0.3521, "step": 84425 }, { "epoch": 3.0428514794392187, "grad_norm": 0.21035562455654144, "learning_rate": 1.7555523249229312e-05, "loss": 0.4272, "step": 84430 }, { "epoch": 3.0430316791004435, "grad_norm": 0.2314184308052063, "learning_rate": 1.7552737547979197e-05, "loss": 0.3946, "step": 84435 }, { "epoch": 3.0432118787616678, "grad_norm": 0.18554693460464478, "learning_rate": 1.754995194819659e-05, "loss": 0.3658, "step": 84440 }, { "epoch": 3.0433920784228925, "grad_norm": 0.20421580970287323, "learning_rate": 1.754716644991945e-05, "loss": 0.3572, "step": 84445 }, { "epoch": 3.0435722780841172, "grad_norm": 0.29508501291275024, "learning_rate": 1.7544381053185723e-05, "loss": 0.4097, "step": 84450 }, { "epoch": 3.043752477745342, "grad_norm": 0.24256087839603424, "learning_rate": 1.7541595758033357e-05, "loss": 0.3888, "step": 84455 }, { "epoch": 3.0439326774065663, "grad_norm": 0.17925018072128296, "learning_rate": 1.753881056450031e-05, "loss": 0.3381, "step": 84460 }, { "epoch": 3.044112877067791, "grad_norm": 0.2275160253047943, "learning_rate": 1.7536025472624518e-05, "loss": 0.4123, "step": 84465 }, { "epoch": 3.0442930767290157, "grad_norm": 0.2509187161922455, "learning_rate": 1.7533240482443942e-05, "loss": 0.3866, "step": 84470 }, { "epoch": 3.0444732763902405, "grad_norm": 
0.231712207198143, "learning_rate": 1.753045559399652e-05, "loss": 0.4006, "step": 84475 }, { "epoch": 3.044653476051465, "grad_norm": 0.23315437138080597, "learning_rate": 1.7527670807320183e-05, "loss": 0.4009, "step": 84480 }, { "epoch": 3.0448336757126895, "grad_norm": 0.24659566581249237, "learning_rate": 1.7524886122452897e-05, "loss": 0.4056, "step": 84485 }, { "epoch": 3.045013875373914, "grad_norm": 0.22627463936805725, "learning_rate": 1.7522101539432582e-05, "loss": 0.3803, "step": 84490 }, { "epoch": 3.045194075035139, "grad_norm": 0.2701631486415863, "learning_rate": 1.7519317058297188e-05, "loss": 0.3936, "step": 84495 }, { "epoch": 3.0453742746963637, "grad_norm": 0.23009376227855682, "learning_rate": 1.7516532679084652e-05, "loss": 0.3687, "step": 84500 }, { "epoch": 3.0453742746963637, "eval_loss": 0.43199047446250916, "eval_runtime": 3.5354, "eval_samples_per_second": 28.286, "eval_steps_per_second": 7.071, "step": 84500 }, { "epoch": 3.0455544743575884, "grad_norm": 0.24204762279987335, "learning_rate": 1.7513748401832904e-05, "loss": 0.3531, "step": 84505 }, { "epoch": 3.0457346740188127, "grad_norm": 0.27944275736808777, "learning_rate": 1.751096422657989e-05, "loss": 0.3562, "step": 84510 }, { "epoch": 3.0459148736800374, "grad_norm": 0.2434234917163849, "learning_rate": 1.750818015336354e-05, "loss": 0.406, "step": 84515 }, { "epoch": 3.046095073341262, "grad_norm": 0.18763190507888794, "learning_rate": 1.7505396182221777e-05, "loss": 0.3698, "step": 84520 }, { "epoch": 3.046275273002487, "grad_norm": 0.21916723251342773, "learning_rate": 1.750261231319255e-05, "loss": 0.4321, "step": 84525 }, { "epoch": 3.046455472663711, "grad_norm": 0.22559642791748047, "learning_rate": 1.749982854631377e-05, "loss": 0.3608, "step": 84530 }, { "epoch": 3.046635672324936, "grad_norm": 0.20946092903614044, "learning_rate": 1.749704488162338e-05, "loss": 0.3531, "step": 84535 }, { "epoch": 3.0468158719861607, "grad_norm": 0.2142314463853836, "learning_rate": 
1.74942613191593e-05, "loss": 0.3761, "step": 84540 }, { "epoch": 3.0469960716473854, "grad_norm": 0.18263986706733704, "learning_rate": 1.7491477858959453e-05, "loss": 0.3924, "step": 84545 }, { "epoch": 3.04717627130861, "grad_norm": 0.19309696555137634, "learning_rate": 1.748869450106177e-05, "loss": 0.3453, "step": 84550 }, { "epoch": 3.0473564709698344, "grad_norm": 0.2552222013473511, "learning_rate": 1.7485911245504173e-05, "loss": 0.3855, "step": 84555 }, { "epoch": 3.047536670631059, "grad_norm": 0.2853567600250244, "learning_rate": 1.7483128092324567e-05, "loss": 0.4424, "step": 84560 }, { "epoch": 3.047716870292284, "grad_norm": 0.2120617926120758, "learning_rate": 1.74803450415609e-05, "loss": 0.3929, "step": 84565 }, { "epoch": 3.0478970699535086, "grad_norm": 0.2228347659111023, "learning_rate": 1.7477562093251066e-05, "loss": 0.3744, "step": 84570 }, { "epoch": 3.048077269614733, "grad_norm": 0.2142227590084076, "learning_rate": 1.7474779247432998e-05, "loss": 0.4072, "step": 84575 }, { "epoch": 3.0482574692759576, "grad_norm": 0.23196277022361755, "learning_rate": 1.7471996504144612e-05, "loss": 0.3609, "step": 84580 }, { "epoch": 3.0484376689371824, "grad_norm": 0.20943579077720642, "learning_rate": 1.74692138634238e-05, "loss": 0.3989, "step": 84585 }, { "epoch": 3.048617868598407, "grad_norm": 0.28706827759742737, "learning_rate": 1.7466431325308507e-05, "loss": 0.3669, "step": 84590 }, { "epoch": 3.048798068259632, "grad_norm": 0.20854423940181732, "learning_rate": 1.7463648889836627e-05, "loss": 0.3551, "step": 84595 }, { "epoch": 3.048978267920856, "grad_norm": 0.20572051405906677, "learning_rate": 1.746086655704606e-05, "loss": 0.3609, "step": 84600 }, { "epoch": 3.049158467582081, "grad_norm": 0.1955312341451645, "learning_rate": 1.7458084326974732e-05, "loss": 0.3764, "step": 84605 }, { "epoch": 3.0493386672433056, "grad_norm": 0.1977349817752838, "learning_rate": 1.7455302199660544e-05, "loss": 0.3836, "step": 84610 }, { "epoch": 
3.0495188669045303, "grad_norm": 0.2958627939224243, "learning_rate": 1.7452520175141406e-05, "loss": 0.4121, "step": 84615 }, { "epoch": 3.049699066565755, "grad_norm": 0.20005503296852112, "learning_rate": 1.7449738253455223e-05, "loss": 0.3998, "step": 84620 }, { "epoch": 3.0498792662269794, "grad_norm": 0.2881629168987274, "learning_rate": 1.7446956434639884e-05, "loss": 0.4127, "step": 84625 }, { "epoch": 3.050059465888204, "grad_norm": 0.2254873812198639, "learning_rate": 1.7444174718733308e-05, "loss": 0.4088, "step": 84630 }, { "epoch": 3.050239665549429, "grad_norm": 0.21899674832820892, "learning_rate": 1.744139310577339e-05, "loss": 0.3857, "step": 84635 }, { "epoch": 3.0504198652106536, "grad_norm": 0.23545971512794495, "learning_rate": 1.743861159579802e-05, "loss": 0.3672, "step": 84640 }, { "epoch": 3.050600064871878, "grad_norm": 0.23571446537971497, "learning_rate": 1.743583018884511e-05, "loss": 0.3931, "step": 84645 }, { "epoch": 3.0507802645331026, "grad_norm": 0.23098576068878174, "learning_rate": 1.7433048884952548e-05, "loss": 0.3968, "step": 84650 }, { "epoch": 3.0509604641943273, "grad_norm": 0.2752792537212372, "learning_rate": 1.7430267684158226e-05, "loss": 0.3622, "step": 84655 }, { "epoch": 3.051140663855552, "grad_norm": 0.2383970022201538, "learning_rate": 1.7427486586500053e-05, "loss": 0.4099, "step": 84660 }, { "epoch": 3.051320863516777, "grad_norm": 0.22088317573070526, "learning_rate": 1.742470559201589e-05, "loss": 0.382, "step": 84665 }, { "epoch": 3.051501063178001, "grad_norm": 0.2405635267496109, "learning_rate": 1.7421924700743668e-05, "loss": 0.3674, "step": 84670 }, { "epoch": 3.051681262839226, "grad_norm": 0.2541629672050476, "learning_rate": 1.741914391272124e-05, "loss": 0.3796, "step": 84675 }, { "epoch": 3.0518614625004505, "grad_norm": 0.18502090871334076, "learning_rate": 1.7416363227986507e-05, "loss": 0.4146, "step": 84680 }, { "epoch": 3.0520416621616753, "grad_norm": 0.20814743638038635, "learning_rate": 
1.741358264657737e-05, "loss": 0.3825, "step": 84685 }, { "epoch": 3.0522218618228996, "grad_norm": 0.22809316217899323, "learning_rate": 1.7410802168531684e-05, "loss": 0.4106, "step": 84690 }, { "epoch": 3.0524020614841243, "grad_norm": 0.2442580610513687, "learning_rate": 1.7408021793887363e-05, "loss": 0.3786, "step": 84695 }, { "epoch": 3.052582261145349, "grad_norm": 0.2511539161205292, "learning_rate": 1.7405241522682276e-05, "loss": 0.4042, "step": 84700 }, { "epoch": 3.0527624608065738, "grad_norm": 0.20690898597240448, "learning_rate": 1.740246135495429e-05, "loss": 0.3782, "step": 84705 }, { "epoch": 3.0529426604677985, "grad_norm": 0.20105348527431488, "learning_rate": 1.7399681290741308e-05, "loss": 0.3946, "step": 84710 }, { "epoch": 3.053122860129023, "grad_norm": 0.2233029156923294, "learning_rate": 1.739690133008119e-05, "loss": 0.3783, "step": 84715 }, { "epoch": 3.0533030597902475, "grad_norm": 0.2557271122932434, "learning_rate": 1.7394121473011825e-05, "loss": 0.3672, "step": 84720 }, { "epoch": 3.0534832594514723, "grad_norm": 0.2504826784133911, "learning_rate": 1.739134171957108e-05, "loss": 0.3653, "step": 84725 }, { "epoch": 3.053663459112697, "grad_norm": 0.19489239156246185, "learning_rate": 1.7388562069796827e-05, "loss": 0.3771, "step": 84730 }, { "epoch": 3.0538436587739217, "grad_norm": 0.18892985582351685, "learning_rate": 1.738578252372695e-05, "loss": 0.3507, "step": 84735 }, { "epoch": 3.054023858435146, "grad_norm": 0.2819010317325592, "learning_rate": 1.7383003081399308e-05, "loss": 0.3892, "step": 84740 }, { "epoch": 3.0542040580963707, "grad_norm": 0.22074325382709503, "learning_rate": 1.738022374285177e-05, "loss": 0.3679, "step": 84745 }, { "epoch": 3.0543842577575955, "grad_norm": 0.23581378161907196, "learning_rate": 1.7377444508122215e-05, "loss": 0.3676, "step": 84750 }, { "epoch": 3.05456445741882, "grad_norm": 0.23171649873256683, "learning_rate": 1.73746653772485e-05, "loss": 0.3905, "step": 84755 }, { "epoch": 
3.0547446570800445, "grad_norm": 0.22440069913864136, "learning_rate": 1.7371886350268494e-05, "loss": 0.3745, "step": 84760 }, { "epoch": 3.0549248567412692, "grad_norm": 0.17826688289642334, "learning_rate": 1.7369107427220066e-05, "loss": 0.3795, "step": 84765 }, { "epoch": 3.055105056402494, "grad_norm": 0.2648460268974304, "learning_rate": 1.7366328608141057e-05, "loss": 0.3735, "step": 84770 }, { "epoch": 3.0552852560637187, "grad_norm": 0.18997584283351898, "learning_rate": 1.7363549893069355e-05, "loss": 0.3955, "step": 84775 }, { "epoch": 3.0554654557249434, "grad_norm": 0.18987253308296204, "learning_rate": 1.7360771282042807e-05, "loss": 0.383, "step": 84780 }, { "epoch": 3.0556456553861677, "grad_norm": 0.23954762518405914, "learning_rate": 1.7357992775099264e-05, "loss": 0.407, "step": 84785 }, { "epoch": 3.0558258550473925, "grad_norm": 0.19741961359977722, "learning_rate": 1.7355214372276596e-05, "loss": 0.3922, "step": 84790 }, { "epoch": 3.056006054708617, "grad_norm": 0.20754849910736084, "learning_rate": 1.7352436073612644e-05, "loss": 0.3883, "step": 84795 }, { "epoch": 3.056186254369842, "grad_norm": 0.2479812055826187, "learning_rate": 1.7349657879145274e-05, "loss": 0.3505, "step": 84800 }, { "epoch": 3.056366454031066, "grad_norm": 0.26793172955513, "learning_rate": 1.734687978891234e-05, "loss": 0.4124, "step": 84805 }, { "epoch": 3.056546653692291, "grad_norm": 0.2121279090642929, "learning_rate": 1.734410180295168e-05, "loss": 0.3928, "step": 84810 }, { "epoch": 3.0567268533535157, "grad_norm": 0.2421426624059677, "learning_rate": 1.7341323921301154e-05, "loss": 0.3936, "step": 84815 }, { "epoch": 3.0569070530147404, "grad_norm": 0.21488474309444427, "learning_rate": 1.733854614399861e-05, "loss": 0.3913, "step": 84820 }, { "epoch": 3.057087252675965, "grad_norm": 0.18251733481884003, "learning_rate": 1.733576847108188e-05, "loss": 0.4056, "step": 84825 }, { "epoch": 3.0572674523371894, "grad_norm": 0.25495290756225586, "learning_rate": 
1.733299090258883e-05, "loss": 0.3676, "step": 84830 }, { "epoch": 3.057447651998414, "grad_norm": 0.23162417113780975, "learning_rate": 1.733021343855729e-05, "loss": 0.3768, "step": 84835 }, { "epoch": 3.057627851659639, "grad_norm": 0.20512209832668304, "learning_rate": 1.7327436079025112e-05, "loss": 0.3293, "step": 84840 }, { "epoch": 3.0578080513208636, "grad_norm": 0.197612926363945, "learning_rate": 1.7324658824030133e-05, "loss": 0.3904, "step": 84845 }, { "epoch": 3.0579882509820884, "grad_norm": 0.29313725233078003, "learning_rate": 1.7321881673610184e-05, "loss": 0.396, "step": 84850 }, { "epoch": 3.0581684506433127, "grad_norm": 0.2089136838912964, "learning_rate": 1.7319104627803117e-05, "loss": 0.3205, "step": 84855 }, { "epoch": 3.0583486503045374, "grad_norm": 0.25626543164253235, "learning_rate": 1.7316327686646767e-05, "loss": 0.3811, "step": 84860 }, { "epoch": 3.058528849965762, "grad_norm": 0.2188102900981903, "learning_rate": 1.731355085017895e-05, "loss": 0.3782, "step": 84865 }, { "epoch": 3.058709049626987, "grad_norm": 0.2118126004934311, "learning_rate": 1.731077411843753e-05, "loss": 0.3928, "step": 84870 }, { "epoch": 3.058889249288211, "grad_norm": 0.25317180156707764, "learning_rate": 1.7307997491460306e-05, "loss": 0.3693, "step": 84875 }, { "epoch": 3.059069448949436, "grad_norm": 0.2602880299091339, "learning_rate": 1.7305220969285148e-05, "loss": 0.3928, "step": 84880 }, { "epoch": 3.0592496486106606, "grad_norm": 0.23756670951843262, "learning_rate": 1.7302444551949853e-05, "loss": 0.3839, "step": 84885 }, { "epoch": 3.0594298482718854, "grad_norm": 0.2189938724040985, "learning_rate": 1.729966823949226e-05, "loss": 0.3825, "step": 84890 }, { "epoch": 3.05961004793311, "grad_norm": 0.24415577948093414, "learning_rate": 1.72968920319502e-05, "loss": 0.3731, "step": 84895 }, { "epoch": 3.0597902475943344, "grad_norm": 0.1736556589603424, "learning_rate": 1.7294115929361492e-05, "loss": 0.3822, "step": 84900 }, { "epoch": 
3.059970447255559, "grad_norm": 0.2100590169429779, "learning_rate": 1.7291339931763962e-05, "loss": 0.3883, "step": 84905 }, { "epoch": 3.060150646916784, "grad_norm": 0.2625679671764374, "learning_rate": 1.7288564039195434e-05, "loss": 0.3733, "step": 84910 }, { "epoch": 3.0603308465780086, "grad_norm": 0.2331903725862503, "learning_rate": 1.7285788251693723e-05, "loss": 0.3834, "step": 84915 }, { "epoch": 3.060511046239233, "grad_norm": 0.21731112897396088, "learning_rate": 1.7283012569296665e-05, "loss": 0.3721, "step": 84920 }, { "epoch": 3.0606912459004576, "grad_norm": 0.22888799011707306, "learning_rate": 1.728023699204206e-05, "loss": 0.3532, "step": 84925 }, { "epoch": 3.0608714455616823, "grad_norm": 0.2064668983221054, "learning_rate": 1.727746151996773e-05, "loss": 0.3789, "step": 84930 }, { "epoch": 3.061051645222907, "grad_norm": 0.1944301277399063, "learning_rate": 1.7274686153111496e-05, "loss": 0.3762, "step": 84935 }, { "epoch": 3.061231844884132, "grad_norm": 0.2432466596364975, "learning_rate": 1.7271910891511163e-05, "loss": 0.3953, "step": 84940 }, { "epoch": 3.061412044545356, "grad_norm": 0.1888858526945114, "learning_rate": 1.726913573520455e-05, "loss": 0.3832, "step": 84945 }, { "epoch": 3.061592244206581, "grad_norm": 0.27773165702819824, "learning_rate": 1.7266360684229473e-05, "loss": 0.3922, "step": 84950 }, { "epoch": 3.0617724438678056, "grad_norm": 0.1935848444700241, "learning_rate": 1.7263585738623715e-05, "loss": 0.3716, "step": 84955 }, { "epoch": 3.0619526435290303, "grad_norm": 0.2068513184785843, "learning_rate": 1.7260810898425126e-05, "loss": 0.3646, "step": 84960 }, { "epoch": 3.0621328431902546, "grad_norm": 0.22322344779968262, "learning_rate": 1.725803616367148e-05, "loss": 0.3975, "step": 84965 }, { "epoch": 3.0623130428514793, "grad_norm": 0.19500023126602173, "learning_rate": 1.7255261534400585e-05, "loss": 0.3771, "step": 84970 }, { "epoch": 3.062493242512704, "grad_norm": 0.20665855705738068, "learning_rate": 
1.7252487010650266e-05, "loss": 0.3912, "step": 84975 }, { "epoch": 3.062673442173929, "grad_norm": 0.24489200115203857, "learning_rate": 1.7249712592458294e-05, "loss": 0.3861, "step": 84980 }, { "epoch": 3.0628536418351535, "grad_norm": 0.22476324439048767, "learning_rate": 1.7246938279862508e-05, "loss": 0.35, "step": 84985 }, { "epoch": 3.063033841496378, "grad_norm": 0.2135080248117447, "learning_rate": 1.7244164072900678e-05, "loss": 0.3628, "step": 84990 }, { "epoch": 3.0632140411576025, "grad_norm": 0.2153581976890564, "learning_rate": 1.7241389971610607e-05, "loss": 0.3808, "step": 84995 }, { "epoch": 3.0633942408188273, "grad_norm": 0.27249398827552795, "learning_rate": 1.72386159760301e-05, "loss": 0.3824, "step": 85000 }, { "epoch": 3.0633942408188273, "eval_loss": 0.43113651871681213, "eval_runtime": 3.5309, "eval_samples_per_second": 28.322, "eval_steps_per_second": 7.08, "step": 85000 }, { "epoch": 3.063574440480052, "grad_norm": 0.23285353183746338, "learning_rate": 1.723584208619695e-05, "loss": 0.3787, "step": 85005 }, { "epoch": 3.0637546401412767, "grad_norm": 0.21958500146865845, "learning_rate": 1.7233068302148943e-05, "loss": 0.4006, "step": 85010 }, { "epoch": 3.063934839802501, "grad_norm": 0.26001232862472534, "learning_rate": 1.7230294623923876e-05, "loss": 0.3732, "step": 85015 }, { "epoch": 3.0641150394637258, "grad_norm": 0.20157134532928467, "learning_rate": 1.722752105155954e-05, "loss": 0.3474, "step": 85020 }, { "epoch": 3.0642952391249505, "grad_norm": 0.2562786638736725, "learning_rate": 1.722474758509373e-05, "loss": 0.4045, "step": 85025 }, { "epoch": 3.0644754387861752, "grad_norm": 0.22175267338752747, "learning_rate": 1.722197422456423e-05, "loss": 0.3797, "step": 85030 }, { "epoch": 3.0646556384473995, "grad_norm": 0.19869141280651093, "learning_rate": 1.721920097000882e-05, "loss": 0.3716, "step": 85035 }, { "epoch": 3.0648358381086243, "grad_norm": 0.2057720124721527, "learning_rate": 1.7216427821465292e-05, "loss": 
0.3932, "step": 85040 }, { "epoch": 3.065016037769849, "grad_norm": 0.24935896694660187, "learning_rate": 1.7213654778971436e-05, "loss": 0.3798, "step": 85045 }, { "epoch": 3.0651962374310737, "grad_norm": 0.2382826954126358, "learning_rate": 1.7210881842565007e-05, "loss": 0.3826, "step": 85050 }, { "epoch": 3.0653764370922985, "grad_norm": 0.2057560682296753, "learning_rate": 1.7208109012283824e-05, "loss": 0.3825, "step": 85055 }, { "epoch": 3.0655566367535227, "grad_norm": 0.20751681923866272, "learning_rate": 1.720533628816563e-05, "loss": 0.3988, "step": 85060 }, { "epoch": 3.0657368364147475, "grad_norm": 0.22527416050434113, "learning_rate": 1.720256367024824e-05, "loss": 0.3742, "step": 85065 }, { "epoch": 3.065917036075972, "grad_norm": 0.24858717620372772, "learning_rate": 1.71997911585694e-05, "loss": 0.385, "step": 85070 }, { "epoch": 3.066097235737197, "grad_norm": 0.2016977220773697, "learning_rate": 1.7197018753166895e-05, "loss": 0.3803, "step": 85075 }, { "epoch": 3.0662774353984212, "grad_norm": 0.23507557809352875, "learning_rate": 1.71942464540785e-05, "loss": 0.3956, "step": 85080 }, { "epoch": 3.066457635059646, "grad_norm": 0.18204432725906372, "learning_rate": 1.7191474261341982e-05, "loss": 0.4038, "step": 85085 }, { "epoch": 3.0666378347208707, "grad_norm": 0.19573719799518585, "learning_rate": 1.7188702174995115e-05, "loss": 0.3751, "step": 85090 }, { "epoch": 3.0668180343820954, "grad_norm": 0.2764735221862793, "learning_rate": 1.7185930195075675e-05, "loss": 0.3545, "step": 85095 }, { "epoch": 3.06699823404332, "grad_norm": 0.2690604031085968, "learning_rate": 1.718315832162142e-05, "loss": 0.3845, "step": 85100 }, { "epoch": 3.0671784337045445, "grad_norm": 0.22735117375850677, "learning_rate": 1.718038655467012e-05, "loss": 0.4114, "step": 85105 }, { "epoch": 3.067358633365769, "grad_norm": 0.25227391719818115, "learning_rate": 1.7177614894259538e-05, "loss": 0.4067, "step": 85110 }, { "epoch": 3.067538833026994, "grad_norm": 
0.22190016508102417, "learning_rate": 1.717484334042744e-05, "loss": 0.4165, "step": 85115 }, { "epoch": 3.0677190326882187, "grad_norm": 0.2443481832742691, "learning_rate": 1.7172071893211583e-05, "loss": 0.4004, "step": 85120 }, { "epoch": 3.0678992323494434, "grad_norm": 0.19374127686023712, "learning_rate": 1.716930055264973e-05, "loss": 0.3809, "step": 85125 }, { "epoch": 3.0680794320106677, "grad_norm": 0.21551166474819183, "learning_rate": 1.716652931877965e-05, "loss": 0.3959, "step": 85130 }, { "epoch": 3.0682596316718924, "grad_norm": 0.2144090086221695, "learning_rate": 1.7163758191639085e-05, "loss": 0.4067, "step": 85135 }, { "epoch": 3.068439831333117, "grad_norm": 0.25093093514442444, "learning_rate": 1.7160987171265798e-05, "loss": 0.3913, "step": 85140 }, { "epoch": 3.068620030994342, "grad_norm": 0.24009042978286743, "learning_rate": 1.7158216257697545e-05, "loss": 0.3445, "step": 85145 }, { "epoch": 3.068800230655566, "grad_norm": 0.21893219649791718, "learning_rate": 1.715544545097208e-05, "loss": 0.3871, "step": 85150 }, { "epoch": 3.068980430316791, "grad_norm": 0.21242910623550415, "learning_rate": 1.715267475112714e-05, "loss": 0.3822, "step": 85155 }, { "epoch": 3.0691606299780156, "grad_norm": 0.209553524851799, "learning_rate": 1.7149904158200504e-05, "loss": 0.379, "step": 85160 }, { "epoch": 3.0693408296392404, "grad_norm": 0.2270086407661438, "learning_rate": 1.7147133672229885e-05, "loss": 0.3807, "step": 85165 }, { "epoch": 3.069521029300465, "grad_norm": 0.1918162852525711, "learning_rate": 1.7144363293253066e-05, "loss": 0.3994, "step": 85170 }, { "epoch": 3.0697012289616894, "grad_norm": 0.18857698142528534, "learning_rate": 1.7141593021307774e-05, "loss": 0.412, "step": 85175 }, { "epoch": 3.069881428622914, "grad_norm": 0.27960577607154846, "learning_rate": 1.7138822856431746e-05, "loss": 0.3907, "step": 85180 }, { "epoch": 3.070061628284139, "grad_norm": 0.21300694346427917, "learning_rate": 1.713605279866274e-05, "loss": 
0.3445, "step": 85185 }, { "epoch": 3.0702418279453636, "grad_norm": 0.2177317589521408, "learning_rate": 1.7133282848038495e-05, "loss": 0.3532, "step": 85190 }, { "epoch": 3.070422027606588, "grad_norm": 0.2585134208202362, "learning_rate": 1.713051300459674e-05, "loss": 0.4105, "step": 85195 }, { "epoch": 3.0706022272678126, "grad_norm": 0.2013564258813858, "learning_rate": 1.712774326837523e-05, "loss": 0.3809, "step": 85200 }, { "epoch": 3.0707824269290374, "grad_norm": 0.1960568130016327, "learning_rate": 1.7124973639411686e-05, "loss": 0.3597, "step": 85205 }, { "epoch": 3.070962626590262, "grad_norm": 0.2271984964609146, "learning_rate": 1.712220411774386e-05, "loss": 0.3732, "step": 85210 }, { "epoch": 3.071142826251487, "grad_norm": 0.2209077626466751, "learning_rate": 1.7119434703409475e-05, "loss": 0.3632, "step": 85215 }, { "epoch": 3.071323025912711, "grad_norm": 0.24356262385845184, "learning_rate": 1.7116665396446262e-05, "loss": 0.3656, "step": 85220 }, { "epoch": 3.071503225573936, "grad_norm": 0.2605443298816681, "learning_rate": 1.7113896196891963e-05, "loss": 0.3577, "step": 85225 }, { "epoch": 3.0716834252351606, "grad_norm": 0.23770549893379211, "learning_rate": 1.7111127104784305e-05, "loss": 0.3835, "step": 85230 }, { "epoch": 3.0718636248963853, "grad_norm": 0.225930318236351, "learning_rate": 1.7108358120160997e-05, "loss": 0.4269, "step": 85235 }, { "epoch": 3.0720438245576096, "grad_norm": 0.24659954011440277, "learning_rate": 1.7105589243059798e-05, "loss": 0.3578, "step": 85240 }, { "epoch": 3.0722240242188343, "grad_norm": 0.17880700528621674, "learning_rate": 1.7102820473518404e-05, "loss": 0.3679, "step": 85245 }, { "epoch": 3.072404223880059, "grad_norm": 0.21226419508457184, "learning_rate": 1.7100051811574564e-05, "loss": 0.3876, "step": 85250 }, { "epoch": 3.072584423541284, "grad_norm": 0.23851273953914642, "learning_rate": 1.7097283257265983e-05, "loss": 0.3621, "step": 85255 }, { "epoch": 3.0727646232025085, "grad_norm": 
0.20797961950302124, "learning_rate": 1.709451481063038e-05, "loss": 0.3759, "step": 85260 }, { "epoch": 3.072944822863733, "grad_norm": 0.22600434720516205, "learning_rate": 1.70917464717055e-05, "loss": 0.3779, "step": 85265 }, { "epoch": 3.0731250225249576, "grad_norm": 0.2033650130033493, "learning_rate": 1.7088978240529034e-05, "loss": 0.3632, "step": 85270 }, { "epoch": 3.0733052221861823, "grad_norm": 0.22226615250110626, "learning_rate": 1.70862101171387e-05, "loss": 0.3842, "step": 85275 }, { "epoch": 3.073485421847407, "grad_norm": 0.22112597525119781, "learning_rate": 1.7083442101572235e-05, "loss": 0.3838, "step": 85280 }, { "epoch": 3.0736656215086318, "grad_norm": 0.2128104269504547, "learning_rate": 1.7080674193867325e-05, "loss": 0.3375, "step": 85285 }, { "epoch": 3.073845821169856, "grad_norm": 0.2777286469936371, "learning_rate": 1.7077906394061703e-05, "loss": 0.3895, "step": 85290 }, { "epoch": 3.074026020831081, "grad_norm": 0.22353100776672363, "learning_rate": 1.7075138702193074e-05, "loss": 0.3957, "step": 85295 }, { "epoch": 3.0742062204923055, "grad_norm": 0.2158191204071045, "learning_rate": 1.7072371118299142e-05, "loss": 0.4044, "step": 85300 }, { "epoch": 3.0743864201535303, "grad_norm": 0.2143658846616745, "learning_rate": 1.7069603642417622e-05, "loss": 0.3425, "step": 85305 }, { "epoch": 3.0745666198147545, "grad_norm": 0.21374057233333588, "learning_rate": 1.7066836274586214e-05, "loss": 0.3924, "step": 85310 }, { "epoch": 3.0747468194759793, "grad_norm": 0.21760691702365875, "learning_rate": 1.7064069014842626e-05, "loss": 0.4043, "step": 85315 }, { "epoch": 3.074927019137204, "grad_norm": 0.25695380568504333, "learning_rate": 1.7061301863224566e-05, "loss": 0.3793, "step": 85320 }, { "epoch": 3.0751072187984287, "grad_norm": 0.22985416650772095, "learning_rate": 1.7058534819769724e-05, "loss": 0.3659, "step": 85325 }, { "epoch": 3.0752874184596535, "grad_norm": 0.2104073315858841, "learning_rate": 1.7055767884515815e-05, "loss": 
0.3891, "step": 85330 }, { "epoch": 3.0754676181208778, "grad_norm": 0.22502779960632324, "learning_rate": 1.7053001057500534e-05, "loss": 0.3922, "step": 85335 }, { "epoch": 3.0756478177821025, "grad_norm": 0.26956361532211304, "learning_rate": 1.705023433876156e-05, "loss": 0.3993, "step": 85340 }, { "epoch": 3.0758280174433272, "grad_norm": 0.22516946494579315, "learning_rate": 1.704746772833662e-05, "loss": 0.3681, "step": 85345 }, { "epoch": 3.076008217104552, "grad_norm": 0.2101748287677765, "learning_rate": 1.7044701226263374e-05, "loss": 0.3783, "step": 85350 }, { "epoch": 3.0761884167657767, "grad_norm": 0.2022121697664261, "learning_rate": 1.704193483257955e-05, "loss": 0.4104, "step": 85355 }, { "epoch": 3.076368616427001, "grad_norm": 0.2616555690765381, "learning_rate": 1.703916854732282e-05, "loss": 0.395, "step": 85360 }, { "epoch": 3.0765488160882257, "grad_norm": 0.2326449602842331, "learning_rate": 1.7036402370530867e-05, "loss": 0.3604, "step": 85365 }, { "epoch": 3.0767290157494505, "grad_norm": 0.25773555040359497, "learning_rate": 1.70336363022414e-05, "loss": 0.3656, "step": 85370 }, { "epoch": 3.076909215410675, "grad_norm": 0.228920578956604, "learning_rate": 1.7030870342492098e-05, "loss": 0.3941, "step": 85375 }, { "epoch": 3.0770894150718995, "grad_norm": 0.21083427965641022, "learning_rate": 1.7028104491320636e-05, "loss": 0.4001, "step": 85380 }, { "epoch": 3.077269614733124, "grad_norm": 0.20418711006641388, "learning_rate": 1.7025338748764713e-05, "loss": 0.37, "step": 85385 }, { "epoch": 3.077449814394349, "grad_norm": 0.2817372977733612, "learning_rate": 1.7022573114862e-05, "loss": 0.3804, "step": 85390 }, { "epoch": 3.0776300140555737, "grad_norm": 0.2094438374042511, "learning_rate": 1.7019807589650187e-05, "loss": 0.3896, "step": 85395 }, { "epoch": 3.0778102137167984, "grad_norm": 0.2494175136089325, "learning_rate": 1.701704217316696e-05, "loss": 0.3862, "step": 85400 }, { "epoch": 3.0779904133780227, "grad_norm": 
0.19059158861637115, "learning_rate": 1.7014276865449973e-05, "loss": 0.3787, "step": 85405 }, { "epoch": 3.0781706130392474, "grad_norm": 0.21895138919353485, "learning_rate": 1.7011511666536923e-05, "loss": 0.3882, "step": 85410 }, { "epoch": 3.078350812700472, "grad_norm": 0.20317794382572174, "learning_rate": 1.700874657646549e-05, "loss": 0.3827, "step": 85415 }, { "epoch": 3.078531012361697, "grad_norm": 0.23056404292583466, "learning_rate": 1.700598159527332e-05, "loss": 0.3973, "step": 85420 }, { "epoch": 3.078711212022921, "grad_norm": 0.22910591959953308, "learning_rate": 1.7003216722998112e-05, "loss": 0.3841, "step": 85425 }, { "epoch": 3.078891411684146, "grad_norm": 0.22995778918266296, "learning_rate": 1.700045195967752e-05, "loss": 0.4064, "step": 85430 }, { "epoch": 3.0790716113453707, "grad_norm": 0.24501994252204895, "learning_rate": 1.6997687305349234e-05, "loss": 0.3691, "step": 85435 }, { "epoch": 3.0792518110065954, "grad_norm": 0.19479656219482422, "learning_rate": 1.699492276005091e-05, "loss": 0.3536, "step": 85440 }, { "epoch": 3.07943201066782, "grad_norm": 0.24624988436698914, "learning_rate": 1.69921583238202e-05, "loss": 0.3855, "step": 85445 }, { "epoch": 3.0796122103290444, "grad_norm": 0.23105396330356598, "learning_rate": 1.6989393996694797e-05, "loss": 0.3399, "step": 85450 }, { "epoch": 3.079792409990269, "grad_norm": 0.23639746010303497, "learning_rate": 1.6986629778712344e-05, "loss": 0.4176, "step": 85455 }, { "epoch": 3.079972609651494, "grad_norm": 0.27155548334121704, "learning_rate": 1.6983865669910504e-05, "loss": 0.3482, "step": 85460 }, { "epoch": 3.0801528093127186, "grad_norm": 0.2704147398471832, "learning_rate": 1.6981101670326945e-05, "loss": 0.3908, "step": 85465 }, { "epoch": 3.080333008973943, "grad_norm": 0.21449273824691772, "learning_rate": 1.697833777999932e-05, "loss": 0.3481, "step": 85470 }, { "epoch": 3.0805132086351676, "grad_norm": 0.2628542482852936, "learning_rate": 1.6975573998965298e-05, "loss": 
0.3956, "step": 85475 }, { "epoch": 3.0806934082963924, "grad_norm": 0.20272031426429749, "learning_rate": 1.697281032726252e-05, "loss": 0.3353, "step": 85480 }, { "epoch": 3.080873607957617, "grad_norm": 0.20484726130962372, "learning_rate": 1.697004676492865e-05, "loss": 0.3335, "step": 85485 }, { "epoch": 3.081053807618842, "grad_norm": 0.21416746079921722, "learning_rate": 1.696728331200134e-05, "loss": 0.3687, "step": 85490 }, { "epoch": 3.081234007280066, "grad_norm": 0.2598028779029846, "learning_rate": 1.696451996851824e-05, "loss": 0.3842, "step": 85495 }, { "epoch": 3.081414206941291, "grad_norm": 0.23536814749240875, "learning_rate": 1.6961756734516994e-05, "loss": 0.3904, "step": 85500 }, { "epoch": 3.081414206941291, "eval_loss": 0.4317238926887512, "eval_runtime": 3.5362, "eval_samples_per_second": 28.279, "eval_steps_per_second": 7.07, "step": 85500 }, { "epoch": 3.0815944066025156, "grad_norm": 0.17988234758377075, "learning_rate": 1.695899361003526e-05, "loss": 0.3702, "step": 85505 }, { "epoch": 3.0817746062637403, "grad_norm": 0.22040648758411407, "learning_rate": 1.695623059511068e-05, "loss": 0.3716, "step": 85510 }, { "epoch": 3.081954805924965, "grad_norm": 0.23150260746479034, "learning_rate": 1.69534676897809e-05, "loss": 0.3891, "step": 85515 }, { "epoch": 3.0821350055861894, "grad_norm": 0.21168793737888336, "learning_rate": 1.6950704894083575e-05, "loss": 0.363, "step": 85520 }, { "epoch": 3.082315205247414, "grad_norm": 0.21894006431102753, "learning_rate": 1.6947942208056316e-05, "loss": 0.3604, "step": 85525 }, { "epoch": 3.082495404908639, "grad_norm": 0.23681262135505676, "learning_rate": 1.6945179631736807e-05, "loss": 0.3973, "step": 85530 }, { "epoch": 3.0826756045698636, "grad_norm": 0.2256009578704834, "learning_rate": 1.6942417165162648e-05, "loss": 0.3661, "step": 85535 }, { "epoch": 3.082855804231088, "grad_norm": 0.21623054146766663, "learning_rate": 1.6939654808371515e-05, "loss": 0.3936, "step": 85540 }, { "epoch": 
3.0830360038923126, "grad_norm": 0.2143433690071106, "learning_rate": 1.693689256140101e-05, "loss": 0.3677, "step": 85545 }, { "epoch": 3.0832162035535373, "grad_norm": 0.2282250076532364, "learning_rate": 1.693413042428878e-05, "loss": 0.3712, "step": 85550 }, { "epoch": 3.083396403214762, "grad_norm": 0.19568027555942535, "learning_rate": 1.693136839707248e-05, "loss": 0.3908, "step": 85555 }, { "epoch": 3.083576602875987, "grad_norm": 0.20706535875797272, "learning_rate": 1.692860647978971e-05, "loss": 0.388, "step": 85560 }, { "epoch": 3.083756802537211, "grad_norm": 0.23546600341796875, "learning_rate": 1.6925844672478115e-05, "loss": 0.3772, "step": 85565 }, { "epoch": 3.083937002198436, "grad_norm": 0.2380140721797943, "learning_rate": 1.6923082975175325e-05, "loss": 0.3468, "step": 85570 }, { "epoch": 3.0841172018596605, "grad_norm": 0.23840586841106415, "learning_rate": 1.6920321387918957e-05, "loss": 0.3679, "step": 85575 }, { "epoch": 3.0842974015208853, "grad_norm": 0.2252638339996338, "learning_rate": 1.6917559910746657e-05, "loss": 0.4021, "step": 85580 }, { "epoch": 3.0844776011821096, "grad_norm": 0.20066937804222107, "learning_rate": 1.6914798543696036e-05, "loss": 0.4079, "step": 85585 }, { "epoch": 3.0846578008433343, "grad_norm": 0.22038015723228455, "learning_rate": 1.6912037286804717e-05, "loss": 0.3824, "step": 85590 }, { "epoch": 3.084838000504559, "grad_norm": 0.21279798448085785, "learning_rate": 1.6909276140110324e-05, "loss": 0.3655, "step": 85595 }, { "epoch": 3.0850182001657838, "grad_norm": 0.24839600920677185, "learning_rate": 1.6906515103650482e-05, "loss": 0.3607, "step": 85600 }, { "epoch": 3.0851983998270085, "grad_norm": 0.23541690409183502, "learning_rate": 1.69037541774628e-05, "loss": 0.3618, "step": 85605 }, { "epoch": 3.085378599488233, "grad_norm": 0.24776875972747803, "learning_rate": 1.69009933615849e-05, "loss": 0.3814, "step": 85610 }, { "epoch": 3.0855587991494575, "grad_norm": 0.19632692635059357, "learning_rate": 
1.6898232656054398e-05, "loss": 0.413, "step": 85615 }, { "epoch": 3.0857389988106823, "grad_norm": 0.24694256484508514, "learning_rate": 1.6895472060908907e-05, "loss": 0.387, "step": 85620 }, { "epoch": 3.085919198471907, "grad_norm": 0.21106529235839844, "learning_rate": 1.689271157618605e-05, "loss": 0.3476, "step": 85625 }, { "epoch": 3.0860993981331317, "grad_norm": 0.24675370752811432, "learning_rate": 1.688995120192341e-05, "loss": 0.396, "step": 85630 }, { "epoch": 3.086279597794356, "grad_norm": 0.20427581667900085, "learning_rate": 1.688719093815863e-05, "loss": 0.3869, "step": 85635 }, { "epoch": 3.0864597974555807, "grad_norm": 0.2514338493347168, "learning_rate": 1.68844307849293e-05, "loss": 0.3824, "step": 85640 }, { "epoch": 3.0866399971168055, "grad_norm": 0.25234454870224, "learning_rate": 1.6881670742273023e-05, "loss": 0.3886, "step": 85645 }, { "epoch": 3.08682019677803, "grad_norm": 0.26698270440101624, "learning_rate": 1.6878910810227417e-05, "loss": 0.3554, "step": 85650 }, { "epoch": 3.0870003964392545, "grad_norm": 0.21970665454864502, "learning_rate": 1.687615098883007e-05, "loss": 0.3698, "step": 85655 }, { "epoch": 3.0871805961004792, "grad_norm": 0.2174971103668213, "learning_rate": 1.68733912781186e-05, "loss": 0.3598, "step": 85660 }, { "epoch": 3.087360795761704, "grad_norm": 0.26610609889030457, "learning_rate": 1.68706316781306e-05, "loss": 0.3654, "step": 85665 }, { "epoch": 3.0875409954229287, "grad_norm": 0.22846719622612, "learning_rate": 1.6867872188903667e-05, "loss": 0.3823, "step": 85670 }, { "epoch": 3.0877211950841534, "grad_norm": 0.23386985063552856, "learning_rate": 1.6865112810475403e-05, "loss": 0.3609, "step": 85675 }, { "epoch": 3.0879013947453777, "grad_norm": 0.19239924848079681, "learning_rate": 1.6862353542883404e-05, "loss": 0.3711, "step": 85680 }, { "epoch": 3.0880815944066025, "grad_norm": 0.18704915046691895, "learning_rate": 1.6859594386165255e-05, "loss": 0.3522, "step": 85685 }, { "epoch": 
3.088261794067827, "grad_norm": 0.2616778016090393, "learning_rate": 1.6856835340358563e-05, "loss": 0.3941, "step": 85690 }, { "epoch": 3.088441993729052, "grad_norm": 0.21047228574752808, "learning_rate": 1.685407640550091e-05, "loss": 0.3471, "step": 85695 }, { "epoch": 3.088622193390276, "grad_norm": 0.23516976833343506, "learning_rate": 1.685131758162989e-05, "loss": 0.3674, "step": 85700 }, { "epoch": 3.088802393051501, "grad_norm": 0.23034684360027313, "learning_rate": 1.6848558868783098e-05, "loss": 0.394, "step": 85705 }, { "epoch": 3.0889825927127257, "grad_norm": 0.2383045256137848, "learning_rate": 1.6845800266998098e-05, "loss": 0.3637, "step": 85710 }, { "epoch": 3.0891627923739504, "grad_norm": 0.21178589761257172, "learning_rate": 1.6843041776312503e-05, "loss": 0.364, "step": 85715 }, { "epoch": 3.089342992035175, "grad_norm": 0.2509348392486572, "learning_rate": 1.6840283396763872e-05, "loss": 0.3614, "step": 85720 }, { "epoch": 3.0895231916963994, "grad_norm": 0.24692773818969727, "learning_rate": 1.683752512838981e-05, "loss": 0.3893, "step": 85725 }, { "epoch": 3.089703391357624, "grad_norm": 0.19897426664829254, "learning_rate": 1.6834766971227893e-05, "loss": 0.3856, "step": 85730 }, { "epoch": 3.089883591018849, "grad_norm": 0.21682973206043243, "learning_rate": 1.6832008925315677e-05, "loss": 0.3699, "step": 85735 }, { "epoch": 3.0900637906800736, "grad_norm": 0.27642059326171875, "learning_rate": 1.682925099069078e-05, "loss": 0.3887, "step": 85740 }, { "epoch": 3.090243990341298, "grad_norm": 0.23973903059959412, "learning_rate": 1.6826493167390746e-05, "loss": 0.3442, "step": 85745 }, { "epoch": 3.0904241900025227, "grad_norm": 0.21967796981334686, "learning_rate": 1.6823735455453155e-05, "loss": 0.4142, "step": 85750 }, { "epoch": 3.0906043896637474, "grad_norm": 0.1877850890159607, "learning_rate": 1.6820977854915593e-05, "loss": 0.3623, "step": 85755 }, { "epoch": 3.090784589324972, "grad_norm": 0.22713260352611542, "learning_rate": 
1.681822036581562e-05, "loss": 0.3917, "step": 85760 }, { "epoch": 3.090964788986197, "grad_norm": 0.2456943690776825, "learning_rate": 1.681546298819081e-05, "loss": 0.3744, "step": 85765 }, { "epoch": 3.091144988647421, "grad_norm": 0.2482386976480484, "learning_rate": 1.6812705722078738e-05, "loss": 0.3686, "step": 85770 }, { "epoch": 3.091325188308646, "grad_norm": 0.28733116388320923, "learning_rate": 1.6809948567516955e-05, "loss": 0.3872, "step": 85775 }, { "epoch": 3.0915053879698706, "grad_norm": 0.266248881816864, "learning_rate": 1.6807191524543045e-05, "loss": 0.3957, "step": 85780 }, { "epoch": 3.0916855876310954, "grad_norm": 0.21898359060287476, "learning_rate": 1.6804434593194565e-05, "loss": 0.3749, "step": 85785 }, { "epoch": 3.09186578729232, "grad_norm": 0.22204652428627014, "learning_rate": 1.6801677773509074e-05, "loss": 0.3562, "step": 85790 }, { "epoch": 3.0920459869535444, "grad_norm": 0.228468656539917, "learning_rate": 1.6798921065524138e-05, "loss": 0.3875, "step": 85795 }, { "epoch": 3.092226186614769, "grad_norm": 0.2888953685760498, "learning_rate": 1.679616446927731e-05, "loss": 0.3727, "step": 85800 }, { "epoch": 3.092406386275994, "grad_norm": 0.2353111207485199, "learning_rate": 1.6793407984806153e-05, "loss": 0.4015, "step": 85805 }, { "epoch": 3.0925865859372186, "grad_norm": 0.21720938384532928, "learning_rate": 1.6790651612148235e-05, "loss": 0.3622, "step": 85810 }, { "epoch": 3.092766785598443, "grad_norm": 0.25723299384117126, "learning_rate": 1.6787895351341083e-05, "loss": 0.4135, "step": 85815 }, { "epoch": 3.0929469852596676, "grad_norm": 0.2206987887620926, "learning_rate": 1.678513920242228e-05, "loss": 0.4013, "step": 85820 }, { "epoch": 3.0931271849208923, "grad_norm": 0.23632338643074036, "learning_rate": 1.6782383165429364e-05, "loss": 0.3843, "step": 85825 }, { "epoch": 3.093307384582117, "grad_norm": 0.19661295413970947, "learning_rate": 1.677962724039987e-05, "loss": 0.3777, "step": 85830 }, { "epoch": 
3.093487584243342, "grad_norm": 0.2105666697025299, "learning_rate": 1.6776871427371384e-05, "loss": 0.3652, "step": 85835 }, { "epoch": 3.093667783904566, "grad_norm": 0.21203243732452393, "learning_rate": 1.677411572638142e-05, "loss": 0.3687, "step": 85840 }, { "epoch": 3.093847983565791, "grad_norm": 0.20673683285713196, "learning_rate": 1.6771360137467546e-05, "loss": 0.3816, "step": 85845 }, { "epoch": 3.0940281832270156, "grad_norm": 0.23549970984458923, "learning_rate": 1.6768604660667293e-05, "loss": 0.3927, "step": 85850 }, { "epoch": 3.0942083828882403, "grad_norm": 0.18549276888370514, "learning_rate": 1.6765849296018203e-05, "loss": 0.3757, "step": 85855 }, { "epoch": 3.094388582549465, "grad_norm": 0.2178041934967041, "learning_rate": 1.6763094043557825e-05, "loss": 0.3638, "step": 85860 }, { "epoch": 3.0945687822106893, "grad_norm": 0.25327563285827637, "learning_rate": 1.67603389033237e-05, "loss": 0.3797, "step": 85865 }, { "epoch": 3.094748981871914, "grad_norm": 0.24706877768039703, "learning_rate": 1.6757583875353355e-05, "loss": 0.3832, "step": 85870 }, { "epoch": 3.094929181533139, "grad_norm": 0.20161230862140656, "learning_rate": 1.675482895968434e-05, "loss": 0.3857, "step": 85875 }, { "epoch": 3.0951093811943635, "grad_norm": 0.23628760874271393, "learning_rate": 1.675207415635418e-05, "loss": 0.3709, "step": 85880 }, { "epoch": 3.095289580855588, "grad_norm": 0.2771933376789093, "learning_rate": 1.6749319465400414e-05, "loss": 0.3934, "step": 85885 }, { "epoch": 3.0954697805168125, "grad_norm": 0.2511608898639679, "learning_rate": 1.6746564886860577e-05, "loss": 0.4084, "step": 85890 }, { "epoch": 3.0956499801780373, "grad_norm": 0.2165483683347702, "learning_rate": 1.6743810420772186e-05, "loss": 0.3714, "step": 85895 }, { "epoch": 3.095830179839262, "grad_norm": 0.22518706321716309, "learning_rate": 1.674105606717279e-05, "loss": 0.375, "step": 85900 }, { "epoch": 3.0960103795004867, "grad_norm": 0.21256521344184875, "learning_rate": 
1.67383018260999e-05, "loss": 0.3688, "step": 85905 }, { "epoch": 3.096190579161711, "grad_norm": 0.2093464583158493, "learning_rate": 1.673554769759105e-05, "loss": 0.3451, "step": 85910 }, { "epoch": 3.0963707788229358, "grad_norm": 0.16468511521816254, "learning_rate": 1.673279368168377e-05, "loss": 0.4124, "step": 85915 }, { "epoch": 3.0965509784841605, "grad_norm": 0.21614669263362885, "learning_rate": 1.673003977841556e-05, "loss": 0.3886, "step": 85920 }, { "epoch": 3.0967311781453852, "grad_norm": 0.25867360830307007, "learning_rate": 1.672728598782397e-05, "loss": 0.388, "step": 85925 }, { "epoch": 3.0969113778066095, "grad_norm": 0.25877588987350464, "learning_rate": 1.6724532309946506e-05, "loss": 0.4002, "step": 85930 }, { "epoch": 3.0970915774678343, "grad_norm": 0.22398416697978973, "learning_rate": 1.6721778744820678e-05, "loss": 0.4284, "step": 85935 }, { "epoch": 3.097271777129059, "grad_norm": 0.2191021740436554, "learning_rate": 1.671902529248402e-05, "loss": 0.4128, "step": 85940 }, { "epoch": 3.0974519767902837, "grad_norm": 0.23091648519039154, "learning_rate": 1.6716271952974033e-05, "loss": 0.3747, "step": 85945 }, { "epoch": 3.0976321764515085, "grad_norm": 0.2016901671886444, "learning_rate": 1.671351872632824e-05, "loss": 0.3443, "step": 85950 }, { "epoch": 3.0978123761127327, "grad_norm": 0.2278120517730713, "learning_rate": 1.671076561258415e-05, "loss": 0.4025, "step": 85955 }, { "epoch": 3.0979925757739575, "grad_norm": 0.18883484601974487, "learning_rate": 1.6708012611779273e-05, "loss": 0.3807, "step": 85960 }, { "epoch": 3.098172775435182, "grad_norm": 0.2371702492237091, "learning_rate": 1.670525972395112e-05, "loss": 0.3861, "step": 85965 }, { "epoch": 3.098352975096407, "grad_norm": 0.2216133028268814, "learning_rate": 1.6702506949137197e-05, "loss": 0.3951, "step": 85970 }, { "epoch": 3.0985331747576312, "grad_norm": 0.25640398263931274, "learning_rate": 1.669975428737501e-05, "loss": 0.3929, "step": 85975 }, { "epoch": 
3.098713374418856, "grad_norm": 0.22617505490779877, "learning_rate": 1.669700173870206e-05, "loss": 0.4057, "step": 85980 }, { "epoch": 3.0988935740800807, "grad_norm": 0.25993672013282776, "learning_rate": 1.6694249303155857e-05, "loss": 0.3954, "step": 85985 }, { "epoch": 3.0990737737413054, "grad_norm": 0.23373205959796906, "learning_rate": 1.6691496980773903e-05, "loss": 0.4111, "step": 85990 }, { "epoch": 3.09925397340253, "grad_norm": 0.19884000718593597, "learning_rate": 1.668874477159369e-05, "loss": 0.4071, "step": 85995 }, { "epoch": 3.0994341730637545, "grad_norm": 0.23313197493553162, "learning_rate": 1.6685992675652717e-05, "loss": 0.3718, "step": 86000 }, { "epoch": 3.0994341730637545, "eval_loss": 0.4313110411167145, "eval_runtime": 3.5318, "eval_samples_per_second": 28.314, "eval_steps_per_second": 7.079, "step": 86000 }, { "epoch": 3.099614372724979, "grad_norm": 0.22738061845302582, "learning_rate": 1.6683240692988492e-05, "loss": 0.3886, "step": 86005 }, { "epoch": 3.099794572386204, "grad_norm": 0.21752703189849854, "learning_rate": 1.6680488823638508e-05, "loss": 0.4185, "step": 86010 }, { "epoch": 3.0999747720474287, "grad_norm": 0.22272847592830658, "learning_rate": 1.6677737067640232e-05, "loss": 0.3903, "step": 86015 }, { "epoch": 3.1001549717086534, "grad_norm": 0.2768046259880066, "learning_rate": 1.6674985425031197e-05, "loss": 0.3713, "step": 86020 }, { "epoch": 3.1003351713698777, "grad_norm": 0.23231996595859528, "learning_rate": 1.667223389584886e-05, "loss": 0.3758, "step": 86025 }, { "epoch": 3.1005153710311024, "grad_norm": 0.21626095473766327, "learning_rate": 1.6669482480130734e-05, "loss": 0.3965, "step": 86030 }, { "epoch": 3.100695570692327, "grad_norm": 0.2107696682214737, "learning_rate": 1.6666731177914292e-05, "loss": 0.3717, "step": 86035 }, { "epoch": 3.100875770353552, "grad_norm": 0.24785815179347992, "learning_rate": 1.666397998923702e-05, "loss": 0.3698, "step": 86040 }, { "epoch": 3.101055970014776, "grad_norm": 
0.21663641929626465, "learning_rate": 1.6661228914136414e-05, "loss": 0.3634, "step": 86045 }, { "epoch": 3.101236169676001, "grad_norm": 0.23445236682891846, "learning_rate": 1.6658477952649946e-05, "loss": 0.387, "step": 86050 }, { "epoch": 3.1014163693372256, "grad_norm": 0.20155306160449982, "learning_rate": 1.6655727104815104e-05, "loss": 0.3826, "step": 86055 }, { "epoch": 3.1015965689984504, "grad_norm": 0.20625048875808716, "learning_rate": 1.6652976370669362e-05, "loss": 0.388, "step": 86060 }, { "epoch": 3.101776768659675, "grad_norm": 0.2746394872665405, "learning_rate": 1.6650225750250197e-05, "loss": 0.3988, "step": 86065 }, { "epoch": 3.1019569683208994, "grad_norm": 0.21470344066619873, "learning_rate": 1.6647475243595096e-05, "loss": 0.3931, "step": 86070 }, { "epoch": 3.102137167982124, "grad_norm": 0.22546519339084625, "learning_rate": 1.6644724850741528e-05, "loss": 0.3552, "step": 86075 }, { "epoch": 3.102317367643349, "grad_norm": 0.24429123103618622, "learning_rate": 1.6641974571726958e-05, "loss": 0.3673, "step": 86080 }, { "epoch": 3.1024975673045736, "grad_norm": 0.17867647111415863, "learning_rate": 1.6639224406588876e-05, "loss": 0.3625, "step": 86085 }, { "epoch": 3.102677766965798, "grad_norm": 0.23398911952972412, "learning_rate": 1.6636474355364746e-05, "loss": 0.3595, "step": 86090 }, { "epoch": 3.1028579666270226, "grad_norm": 0.2413238137960434, "learning_rate": 1.6633724418092016e-05, "loss": 0.3635, "step": 86095 }, { "epoch": 3.1030381662882474, "grad_norm": 0.23708127439022064, "learning_rate": 1.6630974594808192e-05, "loss": 0.3597, "step": 86100 }, { "epoch": 3.103218365949472, "grad_norm": 0.20175603032112122, "learning_rate": 1.6628224885550697e-05, "loss": 0.4123, "step": 86105 }, { "epoch": 3.103398565610697, "grad_norm": 0.2378152459859848, "learning_rate": 1.6625475290357037e-05, "loss": 0.3938, "step": 86110 }, { "epoch": 3.103578765271921, "grad_norm": 0.19391091167926788, "learning_rate": 1.662272580926465e-05, 
"loss": 0.3861, "step": 86115 }, { "epoch": 3.103758964933146, "grad_norm": 0.20529715716838837, "learning_rate": 1.6619976442310987e-05, "loss": 0.3877, "step": 86120 }, { "epoch": 3.1039391645943706, "grad_norm": 0.21286916732788086, "learning_rate": 1.661722718953354e-05, "loss": 0.3867, "step": 86125 }, { "epoch": 3.1041193642555953, "grad_norm": 0.2500961422920227, "learning_rate": 1.6614478050969738e-05, "loss": 0.3611, "step": 86130 }, { "epoch": 3.10429956391682, "grad_norm": 0.2693064212799072, "learning_rate": 1.661172902665706e-05, "loss": 0.3954, "step": 86135 }, { "epoch": 3.1044797635780443, "grad_norm": 0.22724072635173798, "learning_rate": 1.6608980116632945e-05, "loss": 0.3488, "step": 86140 }, { "epoch": 3.104659963239269, "grad_norm": 0.24723055958747864, "learning_rate": 1.660623132093485e-05, "loss": 0.3801, "step": 86145 }, { "epoch": 3.104840162900494, "grad_norm": 0.26212194561958313, "learning_rate": 1.6603482639600232e-05, "loss": 0.3625, "step": 86150 }, { "epoch": 3.1050203625617185, "grad_norm": 0.19167543947696686, "learning_rate": 1.6600734072666535e-05, "loss": 0.3962, "step": 86155 }, { "epoch": 3.105200562222943, "grad_norm": 0.2559322118759155, "learning_rate": 1.6597985620171207e-05, "loss": 0.3864, "step": 86160 }, { "epoch": 3.1053807618841676, "grad_norm": 0.17590028047561646, "learning_rate": 1.65952372821517e-05, "loss": 0.368, "step": 86165 }, { "epoch": 3.1055609615453923, "grad_norm": 0.23424407839775085, "learning_rate": 1.6592489058645455e-05, "loss": 0.3734, "step": 86170 }, { "epoch": 3.105741161206617, "grad_norm": 0.215243399143219, "learning_rate": 1.6589740949689926e-05, "loss": 0.3799, "step": 86175 }, { "epoch": 3.1059213608678418, "grad_norm": 0.22049564123153687, "learning_rate": 1.6586992955322546e-05, "loss": 0.3542, "step": 86180 }, { "epoch": 3.106101560529066, "grad_norm": 0.24883852899074554, "learning_rate": 1.6584245075580753e-05, "loss": 0.3818, "step": 86185 }, { "epoch": 3.106281760190291, 
"grad_norm": 0.25981536507606506, "learning_rate": 1.6581497310502e-05, "loss": 0.3753, "step": 86190 }, { "epoch": 3.1064619598515155, "grad_norm": 0.23729638755321503, "learning_rate": 1.6578749660123715e-05, "loss": 0.3649, "step": 86195 }, { "epoch": 3.1066421595127403, "grad_norm": 0.1951550394296646, "learning_rate": 1.6576002124483324e-05, "loss": 0.3514, "step": 86200 }, { "epoch": 3.1068223591739645, "grad_norm": 0.23986147344112396, "learning_rate": 1.657325470361829e-05, "loss": 0.3732, "step": 86205 }, { "epoch": 3.1070025588351893, "grad_norm": 0.27552303671836853, "learning_rate": 1.657050739756601e-05, "loss": 0.3641, "step": 86210 }, { "epoch": 3.107182758496414, "grad_norm": 0.2763289511203766, "learning_rate": 1.6567760206363953e-05, "loss": 0.3584, "step": 86215 }, { "epoch": 3.1073629581576387, "grad_norm": 0.21052367985248566, "learning_rate": 1.6565013130049526e-05, "loss": 0.3756, "step": 86220 }, { "epoch": 3.1075431578188635, "grad_norm": 0.2306980937719345, "learning_rate": 1.6562266168660153e-05, "loss": 0.3861, "step": 86225 }, { "epoch": 3.1077233574800878, "grad_norm": 0.2257169485092163, "learning_rate": 1.655951932223328e-05, "loss": 0.4018, "step": 86230 }, { "epoch": 3.1079035571413125, "grad_norm": 0.20288598537445068, "learning_rate": 1.6556772590806318e-05, "loss": 0.3691, "step": 86235 }, { "epoch": 3.1080837568025372, "grad_norm": 0.23873363435268402, "learning_rate": 1.6554025974416693e-05, "loss": 0.3547, "step": 86240 }, { "epoch": 3.108263956463762, "grad_norm": 0.2184680998325348, "learning_rate": 1.6551279473101834e-05, "loss": 0.3309, "step": 86245 }, { "epoch": 3.1084441561249863, "grad_norm": 0.26064011454582214, "learning_rate": 1.654853308689915e-05, "loss": 0.3793, "step": 86250 }, { "epoch": 3.108624355786211, "grad_norm": 0.22625210881233215, "learning_rate": 1.6545786815846067e-05, "loss": 0.401, "step": 86255 }, { "epoch": 3.1088045554474357, "grad_norm": 0.21573030948638916, "learning_rate": 
1.654304065998001e-05, "loss": 0.3825, "step": 86260 }, { "epoch": 3.1089847551086605, "grad_norm": 0.21487176418304443, "learning_rate": 1.6540294619338377e-05, "loss": 0.3793, "step": 86265 }, { "epoch": 3.109164954769885, "grad_norm": 0.19863709807395935, "learning_rate": 1.6537548693958593e-05, "loss": 0.3945, "step": 86270 }, { "epoch": 3.1093451544311095, "grad_norm": 0.2572442889213562, "learning_rate": 1.6534802883878083e-05, "loss": 0.3819, "step": 86275 }, { "epoch": 3.109525354092334, "grad_norm": 0.26214733719825745, "learning_rate": 1.6532057189134227e-05, "loss": 0.4062, "step": 86280 }, { "epoch": 3.109705553753559, "grad_norm": 0.21623064577579498, "learning_rate": 1.6529311609764458e-05, "loss": 0.3626, "step": 86285 }, { "epoch": 3.1098857534147837, "grad_norm": 0.24655792117118835, "learning_rate": 1.6526566145806177e-05, "loss": 0.4102, "step": 86290 }, { "epoch": 3.1100659530760084, "grad_norm": 0.1786569356918335, "learning_rate": 1.6523820797296796e-05, "loss": 0.3942, "step": 86295 }, { "epoch": 3.1102461527372327, "grad_norm": 0.23387044668197632, "learning_rate": 1.652107556427372e-05, "loss": 0.3798, "step": 86300 }, { "epoch": 3.1104263523984574, "grad_norm": 0.21776267886161804, "learning_rate": 1.6518330446774334e-05, "loss": 0.3593, "step": 86305 }, { "epoch": 3.110606552059682, "grad_norm": 0.21297864615917206, "learning_rate": 1.651558544483607e-05, "loss": 0.3782, "step": 86310 }, { "epoch": 3.110786751720907, "grad_norm": 0.2200326770544052, "learning_rate": 1.6512840558496295e-05, "loss": 0.3814, "step": 86315 }, { "epoch": 3.110966951382131, "grad_norm": 0.2481064796447754, "learning_rate": 1.651009578779244e-05, "loss": 0.3499, "step": 86320 }, { "epoch": 3.111147151043356, "grad_norm": 0.20087507367134094, "learning_rate": 1.650735113276188e-05, "loss": 0.3656, "step": 86325 }, { "epoch": 3.1113273507045807, "grad_norm": 0.225091814994812, "learning_rate": 1.6504606593442014e-05, "loss": 0.3703, "step": 86330 }, { "epoch": 
3.1115075503658054, "grad_norm": 0.20192007720470428, "learning_rate": 1.650186216987024e-05, "loss": 0.4075, "step": 86335 }, { "epoch": 3.11168775002703, "grad_norm": 0.2589082717895508, "learning_rate": 1.6499117862083953e-05, "loss": 0.405, "step": 86340 }, { "epoch": 3.1118679496882544, "grad_norm": 0.2047402262687683, "learning_rate": 1.649637367012053e-05, "loss": 0.3555, "step": 86345 }, { "epoch": 3.112048149349479, "grad_norm": 0.2570793032646179, "learning_rate": 1.6493629594017376e-05, "loss": 0.3869, "step": 86350 }, { "epoch": 3.112228349010704, "grad_norm": 0.23956820368766785, "learning_rate": 1.6490885633811868e-05, "loss": 0.3472, "step": 86355 }, { "epoch": 3.1124085486719286, "grad_norm": 0.24650585651397705, "learning_rate": 1.64881417895414e-05, "loss": 0.3631, "step": 86360 }, { "epoch": 3.1125887483331534, "grad_norm": 0.22965595126152039, "learning_rate": 1.6485398061243353e-05, "loss": 0.3844, "step": 86365 }, { "epoch": 3.1127689479943776, "grad_norm": 0.22221916913986206, "learning_rate": 1.6482654448955097e-05, "loss": 0.3907, "step": 86370 }, { "epoch": 3.1129491476556024, "grad_norm": 0.22202599048614502, "learning_rate": 1.6479910952714038e-05, "loss": 0.4044, "step": 86375 }, { "epoch": 3.113129347316827, "grad_norm": 0.21764975786209106, "learning_rate": 1.6477167572557547e-05, "loss": 0.4054, "step": 86380 }, { "epoch": 3.113309546978052, "grad_norm": 0.29039353132247925, "learning_rate": 1.647442430852298e-05, "loss": 0.3839, "step": 86385 }, { "epoch": 3.113489746639276, "grad_norm": 0.1968279629945755, "learning_rate": 1.6471681160647752e-05, "loss": 0.3557, "step": 86390 }, { "epoch": 3.113669946300501, "grad_norm": 0.22088350355625153, "learning_rate": 1.6468938128969194e-05, "loss": 0.3762, "step": 86395 }, { "epoch": 3.1138501459617256, "grad_norm": 0.30237895250320435, "learning_rate": 1.6466195213524722e-05, "loss": 0.4007, "step": 86400 }, { "epoch": 3.1140303456229503, "grad_norm": 0.21387836337089539, "learning_rate": 
1.6463452414351683e-05, "loss": 0.3915, "step": 86405 }, { "epoch": 3.114210545284175, "grad_norm": 0.22359785437583923, "learning_rate": 1.646070973148744e-05, "loss": 0.3664, "step": 86410 }, { "epoch": 3.1143907449453994, "grad_norm": 0.2460165023803711, "learning_rate": 1.645796716496939e-05, "loss": 0.4142, "step": 86415 }, { "epoch": 3.114570944606624, "grad_norm": 0.20692665874958038, "learning_rate": 1.6455224714834876e-05, "loss": 0.3838, "step": 86420 }, { "epoch": 3.114751144267849, "grad_norm": 0.18075329065322876, "learning_rate": 1.645248238112127e-05, "loss": 0.3807, "step": 86425 }, { "epoch": 3.1149313439290736, "grad_norm": 0.21312923729419708, "learning_rate": 1.644974016386594e-05, "loss": 0.3996, "step": 86430 }, { "epoch": 3.115111543590298, "grad_norm": 0.1856447458267212, "learning_rate": 1.644699806310624e-05, "loss": 0.3807, "step": 86435 }, { "epoch": 3.1152917432515226, "grad_norm": 0.23847022652626038, "learning_rate": 1.6444256078879537e-05, "loss": 0.3593, "step": 86440 }, { "epoch": 3.1154719429127473, "grad_norm": 0.2461748719215393, "learning_rate": 1.6441514211223197e-05, "loss": 0.3845, "step": 86445 }, { "epoch": 3.115652142573972, "grad_norm": 0.17290343344211578, "learning_rate": 1.6438772460174558e-05, "loss": 0.385, "step": 86450 }, { "epoch": 3.115832342235197, "grad_norm": 0.2519152760505676, "learning_rate": 1.6436030825770992e-05, "loss": 0.3341, "step": 86455 }, { "epoch": 3.116012541896421, "grad_norm": 0.22315552830696106, "learning_rate": 1.6433289308049847e-05, "loss": 0.3806, "step": 86460 }, { "epoch": 3.116192741557646, "grad_norm": 0.20392777025699615, "learning_rate": 1.6430547907048474e-05, "loss": 0.3915, "step": 86465 }, { "epoch": 3.1163729412188705, "grad_norm": 0.2624046206474304, "learning_rate": 1.6427806622804233e-05, "loss": 0.4327, "step": 86470 }, { "epoch": 3.1165531408800953, "grad_norm": 0.1981981098651886, "learning_rate": 1.6425065455354457e-05, "loss": 0.3494, "step": 86475 }, { "epoch": 
3.1167333405413196, "grad_norm": 0.2108498364686966, "learning_rate": 1.6422324404736512e-05, "loss": 0.3365, "step": 86480 }, { "epoch": 3.1169135402025443, "grad_norm": 0.30326491594314575, "learning_rate": 1.6419583470987742e-05, "loss": 0.3961, "step": 86485 }, { "epoch": 3.117093739863769, "grad_norm": 0.22434242069721222, "learning_rate": 1.641684265414547e-05, "loss": 0.3698, "step": 86490 }, { "epoch": 3.1172739395249938, "grad_norm": 0.2361210584640503, "learning_rate": 1.641410195424707e-05, "loss": 0.3758, "step": 86495 }, { "epoch": 3.1174541391862185, "grad_norm": 0.23360228538513184, "learning_rate": 1.6411361371329852e-05, "loss": 0.3541, "step": 86500 }, { "epoch": 3.1174541391862185, "eval_loss": 0.43187737464904785, "eval_runtime": 3.5321, "eval_samples_per_second": 28.311, "eval_steps_per_second": 7.078, "step": 86500 }, { "epoch": 3.117634338847443, "grad_norm": 0.26229509711265564, "learning_rate": 1.6408620905431192e-05, "loss": 0.3866, "step": 86505 }, { "epoch": 3.1178145385086675, "grad_norm": 0.22185193002223969, "learning_rate": 1.64058805565884e-05, "loss": 0.3794, "step": 86510 }, { "epoch": 3.1179947381698923, "grad_norm": 0.26995113492012024, "learning_rate": 1.6403140324838817e-05, "loss": 0.3945, "step": 86515 }, { "epoch": 3.118174937831117, "grad_norm": 0.30093368887901306, "learning_rate": 1.640040021021979e-05, "loss": 0.3729, "step": 86520 }, { "epoch": 3.1183551374923417, "grad_norm": 0.24983744323253632, "learning_rate": 1.6397660212768643e-05, "loss": 0.369, "step": 86525 }, { "epoch": 3.118535337153566, "grad_norm": 0.2591390609741211, "learning_rate": 1.6394920332522705e-05, "loss": 0.398, "step": 86530 }, { "epoch": 3.1187155368147907, "grad_norm": 0.23025575280189514, "learning_rate": 1.6392180569519316e-05, "loss": 0.38, "step": 86535 }, { "epoch": 3.1188957364760155, "grad_norm": 0.1807858794927597, "learning_rate": 1.6389440923795795e-05, "loss": 0.3726, "step": 86540 }, { "epoch": 3.11907593613724, "grad_norm": 
0.24324113130569458, "learning_rate": 1.6386701395389484e-05, "loss": 0.3855, "step": 86545 }, { "epoch": 3.1192561357984645, "grad_norm": 0.17595435678958893, "learning_rate": 1.6383961984337694e-05, "loss": 0.3635, "step": 86550 }, { "epoch": 3.1194363354596892, "grad_norm": 0.2983537018299103, "learning_rate": 1.638122269067775e-05, "loss": 0.393, "step": 86555 }, { "epoch": 3.119616535120914, "grad_norm": 0.25049325823783875, "learning_rate": 1.6378483514446984e-05, "loss": 0.3885, "step": 86560 }, { "epoch": 3.1197967347821387, "grad_norm": 0.26442599296569824, "learning_rate": 1.637574445568272e-05, "loss": 0.3671, "step": 86565 }, { "epoch": 3.1199769344433634, "grad_norm": 0.2337508350610733, "learning_rate": 1.6373005514422246e-05, "loss": 0.3758, "step": 86570 }, { "epoch": 3.1201571341045877, "grad_norm": 0.2032548189163208, "learning_rate": 1.6370266690702912e-05, "loss": 0.3562, "step": 86575 }, { "epoch": 3.1203373337658125, "grad_norm": 0.24068522453308105, "learning_rate": 1.6367527984562024e-05, "loss": 0.36, "step": 86580 }, { "epoch": 3.120517533427037, "grad_norm": 0.26474300026893616, "learning_rate": 1.6364789396036895e-05, "loss": 0.4097, "step": 86585 }, { "epoch": 3.120697733088262, "grad_norm": 0.17933666706085205, "learning_rate": 1.6362050925164846e-05, "loss": 0.3599, "step": 86590 }, { "epoch": 3.120877932749486, "grad_norm": 0.23188619315624237, "learning_rate": 1.635931257198317e-05, "loss": 0.3753, "step": 86595 }, { "epoch": 3.121058132410711, "grad_norm": 0.19330739974975586, "learning_rate": 1.6356574336529196e-05, "loss": 0.3503, "step": 86600 }, { "epoch": 3.1212383320719357, "grad_norm": 0.21732577681541443, "learning_rate": 1.6353836218840223e-05, "loss": 0.4139, "step": 86605 }, { "epoch": 3.1214185317331604, "grad_norm": 0.2698850929737091, "learning_rate": 1.635109821895355e-05, "loss": 0.3719, "step": 86610 }, { "epoch": 3.121598731394385, "grad_norm": 0.21727710962295532, "learning_rate": 1.6348360336906492e-05, "loss": 
0.367, "step": 86615 }, { "epoch": 3.1217789310556094, "grad_norm": 0.23365622758865356, "learning_rate": 1.6345622572736345e-05, "loss": 0.4036, "step": 86620 }, { "epoch": 3.121959130716834, "grad_norm": 0.23919281363487244, "learning_rate": 1.6342884926480425e-05, "loss": 0.3967, "step": 86625 }, { "epoch": 3.122139330378059, "grad_norm": 0.1928279846906662, "learning_rate": 1.634014739817602e-05, "loss": 0.4055, "step": 86630 }, { "epoch": 3.1223195300392836, "grad_norm": 0.21561673283576965, "learning_rate": 1.6337409987860423e-05, "loss": 0.3846, "step": 86635 }, { "epoch": 3.1224997297005084, "grad_norm": 0.2624969780445099, "learning_rate": 1.6334672695570942e-05, "loss": 0.4019, "step": 86640 }, { "epoch": 3.1226799293617327, "grad_norm": 0.18644605576992035, "learning_rate": 1.633193552134487e-05, "loss": 0.3881, "step": 86645 }, { "epoch": 3.1228601290229574, "grad_norm": 0.20565631985664368, "learning_rate": 1.632919846521949e-05, "loss": 0.3493, "step": 86650 }, { "epoch": 3.123040328684182, "grad_norm": 0.2321755588054657, "learning_rate": 1.632646152723211e-05, "loss": 0.3675, "step": 86655 }, { "epoch": 3.123220528345407, "grad_norm": 0.20580174028873444, "learning_rate": 1.6323724707420013e-05, "loss": 0.3705, "step": 86660 }, { "epoch": 3.123400728006631, "grad_norm": 0.19330060482025146, "learning_rate": 1.6320988005820484e-05, "loss": 0.3382, "step": 86665 }, { "epoch": 3.123580927667856, "grad_norm": 0.20522934198379517, "learning_rate": 1.631825142247082e-05, "loss": 0.3555, "step": 86670 }, { "epoch": 3.1237611273290806, "grad_norm": 0.23507462441921234, "learning_rate": 1.6315514957408284e-05, "loss": 0.3862, "step": 86675 }, { "epoch": 3.1239413269903054, "grad_norm": 0.2328176349401474, "learning_rate": 1.6312778610670193e-05, "loss": 0.3903, "step": 86680 }, { "epoch": 3.12412152665153, "grad_norm": 0.220947727560997, "learning_rate": 1.6310042382293806e-05, "loss": 0.3965, "step": 86685 }, { "epoch": 3.1243017263127544, "grad_norm": 
0.2658975124359131, "learning_rate": 1.6307306272316402e-05, "loss": 0.3763, "step": 86690 }, { "epoch": 3.124481925973979, "grad_norm": 0.2326037585735321, "learning_rate": 1.6304570280775275e-05, "loss": 0.4033, "step": 86695 }, { "epoch": 3.124662125635204, "grad_norm": 0.22777698934078217, "learning_rate": 1.6301834407707684e-05, "loss": 0.4019, "step": 86700 }, { "epoch": 3.1248423252964286, "grad_norm": 0.2394534796476364, "learning_rate": 1.6299098653150926e-05, "loss": 0.3871, "step": 86705 }, { "epoch": 3.125022524957653, "grad_norm": 0.18719437718391418, "learning_rate": 1.6296363017142264e-05, "loss": 0.3503, "step": 86710 }, { "epoch": 3.1252027246188776, "grad_norm": 0.24379460513591766, "learning_rate": 1.6293627499718962e-05, "loss": 0.3867, "step": 86715 }, { "epoch": 3.1253829242801023, "grad_norm": 0.20296038687229156, "learning_rate": 1.629089210091831e-05, "loss": 0.3609, "step": 86720 }, { "epoch": 3.125563123941327, "grad_norm": 0.21820303797721863, "learning_rate": 1.6288156820777556e-05, "loss": 0.3685, "step": 86725 }, { "epoch": 3.125743323602552, "grad_norm": 0.25657927989959717, "learning_rate": 1.628542165933399e-05, "loss": 0.4238, "step": 86730 }, { "epoch": 3.125923523263776, "grad_norm": 0.26692578196525574, "learning_rate": 1.628268661662486e-05, "loss": 0.3551, "step": 86735 }, { "epoch": 3.126103722925001, "grad_norm": 0.23433983325958252, "learning_rate": 1.6279951692687433e-05, "loss": 0.3719, "step": 86740 }, { "epoch": 3.1262839225862256, "grad_norm": 0.19793805480003357, "learning_rate": 1.6277216887558982e-05, "loss": 0.3766, "step": 86745 }, { "epoch": 3.1264641222474503, "grad_norm": 0.2359396368265152, "learning_rate": 1.6274482201276765e-05, "loss": 0.3728, "step": 86750 }, { "epoch": 3.1266443219086746, "grad_norm": 0.24847714602947235, "learning_rate": 1.6271747633878032e-05, "loss": 0.3969, "step": 86755 }, { "epoch": 3.1268245215698993, "grad_norm": 0.26364535093307495, "learning_rate": 1.6269013185400054e-05, 
"loss": 0.3949, "step": 86760 }, { "epoch": 3.127004721231124, "grad_norm": 0.22220592200756073, "learning_rate": 1.6266278855880075e-05, "loss": 0.3691, "step": 86765 }, { "epoch": 3.127184920892349, "grad_norm": 0.2486381232738495, "learning_rate": 1.626354464535536e-05, "loss": 0.3708, "step": 86770 }, { "epoch": 3.1273651205535735, "grad_norm": 0.23804841935634613, "learning_rate": 1.626081055386316e-05, "loss": 0.3858, "step": 86775 }, { "epoch": 3.127545320214798, "grad_norm": 0.19268541038036346, "learning_rate": 1.6258076581440712e-05, "loss": 0.3594, "step": 86780 }, { "epoch": 3.1277255198760225, "grad_norm": 0.2013455331325531, "learning_rate": 1.6255342728125295e-05, "loss": 0.3667, "step": 86785 }, { "epoch": 3.1279057195372473, "grad_norm": 0.26369357109069824, "learning_rate": 1.6252608993954132e-05, "loss": 0.4069, "step": 86790 }, { "epoch": 3.128085919198472, "grad_norm": 0.23781251907348633, "learning_rate": 1.6249875378964475e-05, "loss": 0.3672, "step": 86795 }, { "epoch": 3.1282661188596967, "grad_norm": 0.1853938102722168, "learning_rate": 1.6247141883193578e-05, "loss": 0.3969, "step": 86800 }, { "epoch": 3.128446318520921, "grad_norm": 0.1940661072731018, "learning_rate": 1.624440850667867e-05, "loss": 0.3897, "step": 86805 }, { "epoch": 3.1286265181821458, "grad_norm": 0.21411295235157013, "learning_rate": 1.624167524945701e-05, "loss": 0.386, "step": 86810 }, { "epoch": 3.1288067178433705, "grad_norm": 0.19931474328041077, "learning_rate": 1.6238942111565826e-05, "loss": 0.4114, "step": 86815 }, { "epoch": 3.1289869175045952, "grad_norm": 0.19459813833236694, "learning_rate": 1.6236209093042355e-05, "loss": 0.3829, "step": 86820 }, { "epoch": 3.1291671171658195, "grad_norm": 0.18339698016643524, "learning_rate": 1.623347619392384e-05, "loss": 0.3981, "step": 86825 }, { "epoch": 3.1293473168270443, "grad_norm": 0.21319806575775146, "learning_rate": 1.6230743414247522e-05, "loss": 0.3896, "step": 86830 }, { "epoch": 3.129527516488269, 
"grad_norm": 0.2697208523750305, "learning_rate": 1.6228010754050615e-05, "loss": 0.4042, "step": 86835 }, { "epoch": 3.1297077161494937, "grad_norm": 0.2193807065486908, "learning_rate": 1.6225278213370373e-05, "loss": 0.4035, "step": 86840 }, { "epoch": 3.1298879158107185, "grad_norm": 0.195016011595726, "learning_rate": 1.622254579224401e-05, "loss": 0.3613, "step": 86845 }, { "epoch": 3.1300681154719427, "grad_norm": 0.19086046516895294, "learning_rate": 1.6219813490708767e-05, "loss": 0.4081, "step": 86850 }, { "epoch": 3.1302483151331675, "grad_norm": 0.1884526163339615, "learning_rate": 1.621708130880187e-05, "loss": 0.3415, "step": 86855 }, { "epoch": 3.130428514794392, "grad_norm": 0.2053939402103424, "learning_rate": 1.6214349246560522e-05, "loss": 0.38, "step": 86860 }, { "epoch": 3.130608714455617, "grad_norm": 0.18308919668197632, "learning_rate": 1.6211617304021977e-05, "loss": 0.3578, "step": 86865 }, { "epoch": 3.1307889141168417, "grad_norm": 0.21582022309303284, "learning_rate": 1.620888548122345e-05, "loss": 0.3648, "step": 86870 }, { "epoch": 3.130969113778066, "grad_norm": 0.2522687315940857, "learning_rate": 1.6206153778202144e-05, "loss": 0.3881, "step": 86875 }, { "epoch": 3.1311493134392907, "grad_norm": 0.2382495403289795, "learning_rate": 1.62034221949953e-05, "loss": 0.3718, "step": 86880 }, { "epoch": 3.1313295131005154, "grad_norm": 0.2659468650817871, "learning_rate": 1.6200690731640112e-05, "loss": 0.3654, "step": 86885 }, { "epoch": 3.13150971276174, "grad_norm": 0.24328552186489105, "learning_rate": 1.6197959388173825e-05, "loss": 0.3785, "step": 86890 }, { "epoch": 3.1316899124229645, "grad_norm": 0.22692778706550598, "learning_rate": 1.6195228164633634e-05, "loss": 0.363, "step": 86895 }, { "epoch": 3.131870112084189, "grad_norm": 0.24278061091899872, "learning_rate": 1.6192497061056747e-05, "loss": 0.3806, "step": 86900 }, { "epoch": 3.132050311745414, "grad_norm": 0.23407572507858276, "learning_rate": 1.6189766077480386e-05, 
"loss": 0.3659, "step": 86905 }, { "epoch": 3.1322305114066387, "grad_norm": 0.24145354330539703, "learning_rate": 1.6187035213941754e-05, "loss": 0.3704, "step": 86910 }, { "epoch": 3.1324107110678634, "grad_norm": 0.18080276250839233, "learning_rate": 1.618430447047807e-05, "loss": 0.3888, "step": 86915 }, { "epoch": 3.1325909107290877, "grad_norm": 0.2304733842611313, "learning_rate": 1.6181573847126525e-05, "loss": 0.3904, "step": 86920 }, { "epoch": 3.1327711103903124, "grad_norm": 0.261940598487854, "learning_rate": 1.617884334392432e-05, "loss": 0.3794, "step": 86925 }, { "epoch": 3.132951310051537, "grad_norm": 0.21325023472309113, "learning_rate": 1.617611296090868e-05, "loss": 0.3466, "step": 86930 }, { "epoch": 3.133131509712762, "grad_norm": 0.21175888180732727, "learning_rate": 1.617338269811679e-05, "loss": 0.3665, "step": 86935 }, { "epoch": 3.133311709373986, "grad_norm": 0.2089497148990631, "learning_rate": 1.6170652555585847e-05, "loss": 0.3469, "step": 86940 }, { "epoch": 3.133491909035211, "grad_norm": 0.2231149524450302, "learning_rate": 1.6167922533353057e-05, "loss": 0.3621, "step": 86945 }, { "epoch": 3.1336721086964356, "grad_norm": 0.19638942182064056, "learning_rate": 1.6165192631455605e-05, "loss": 0.3358, "step": 86950 }, { "epoch": 3.1338523083576604, "grad_norm": 0.22249744832515717, "learning_rate": 1.6162462849930704e-05, "loss": 0.3728, "step": 86955 }, { "epoch": 3.134032508018885, "grad_norm": 0.27574974298477173, "learning_rate": 1.6159733188815537e-05, "loss": 0.3883, "step": 86960 }, { "epoch": 3.1342127076801094, "grad_norm": 0.23349010944366455, "learning_rate": 1.615700364814728e-05, "loss": 0.3996, "step": 86965 }, { "epoch": 3.134392907341334, "grad_norm": 0.2402789145708084, "learning_rate": 1.615427422796315e-05, "loss": 0.4117, "step": 86970 }, { "epoch": 3.134573107002559, "grad_norm": 0.2527726888656616, "learning_rate": 1.6151544928300317e-05, "loss": 0.3607, "step": 86975 }, { "epoch": 3.1347533066637836, 
"grad_norm": 0.2573539614677429, "learning_rate": 1.614881574919596e-05, "loss": 0.3634, "step": 86980 }, { "epoch": 3.134933506325008, "grad_norm": 0.2581014037132263, "learning_rate": 1.6146086690687294e-05, "loss": 0.3679, "step": 86985 }, { "epoch": 3.1351137059862326, "grad_norm": 0.2501007616519928, "learning_rate": 1.6143357752811463e-05, "loss": 0.3852, "step": 86990 }, { "epoch": 3.1352939056474574, "grad_norm": 0.2757335603237152, "learning_rate": 1.6140628935605684e-05, "loss": 0.3801, "step": 86995 }, { "epoch": 3.135474105308682, "grad_norm": 0.22444647550582886, "learning_rate": 1.6137900239107118e-05, "loss": 0.4285, "step": 87000 }, { "epoch": 3.135474105308682, "eval_loss": 0.43149107694625854, "eval_runtime": 3.5463, "eval_samples_per_second": 28.199, "eval_steps_per_second": 7.05, "step": 87000 }, { "epoch": 3.135654304969907, "grad_norm": 0.20283716917037964, "learning_rate": 1.613517166335294e-05, "loss": 0.3473, "step": 87005 }, { "epoch": 3.135834504631131, "grad_norm": 0.2266281247138977, "learning_rate": 1.6132443208380333e-05, "loss": 0.3585, "step": 87010 }, { "epoch": 3.136014704292356, "grad_norm": 0.21769681572914124, "learning_rate": 1.6129714874226473e-05, "loss": 0.3786, "step": 87015 }, { "epoch": 3.1361949039535806, "grad_norm": 0.19556161761283875, "learning_rate": 1.6126986660928525e-05, "loss": 0.3848, "step": 87020 }, { "epoch": 3.1363751036148053, "grad_norm": 0.2081514149904251, "learning_rate": 1.6124258568523672e-05, "loss": 0.4028, "step": 87025 }, { "epoch": 3.13655530327603, "grad_norm": 0.2071877270936966, "learning_rate": 1.6121530597049072e-05, "loss": 0.4077, "step": 87030 }, { "epoch": 3.1367355029372543, "grad_norm": 0.1915891468524933, "learning_rate": 1.61188027465419e-05, "loss": 0.3885, "step": 87035 }, { "epoch": 3.136915702598479, "grad_norm": 0.2584543228149414, "learning_rate": 1.611607501703933e-05, "loss": 0.3638, "step": 87040 }, { "epoch": 3.137095902259704, "grad_norm": 0.20328520238399506, 
"learning_rate": 1.6113347408578504e-05, "loss": 0.3868, "step": 87045 }, { "epoch": 3.1372761019209285, "grad_norm": 0.2620421350002289, "learning_rate": 1.6110619921196607e-05, "loss": 0.4069, "step": 87050 }, { "epoch": 3.137456301582153, "grad_norm": 0.20489437878131866, "learning_rate": 1.61078925549308e-05, "loss": 0.352, "step": 87055 }, { "epoch": 3.1376365012433776, "grad_norm": 0.2264631986618042, "learning_rate": 1.610516530981822e-05, "loss": 0.3975, "step": 87060 }, { "epoch": 3.1378167009046023, "grad_norm": 0.21898336708545685, "learning_rate": 1.6102438185896052e-05, "loss": 0.3688, "step": 87065 }, { "epoch": 3.137996900565827, "grad_norm": 0.21995946764945984, "learning_rate": 1.6099711183201427e-05, "loss": 0.388, "step": 87070 }, { "epoch": 3.1381771002270518, "grad_norm": 0.2543283998966217, "learning_rate": 1.6096984301771535e-05, "loss": 0.3847, "step": 87075 }, { "epoch": 3.138357299888276, "grad_norm": 0.210366889834404, "learning_rate": 1.6094257541643494e-05, "loss": 0.4029, "step": 87080 }, { "epoch": 3.138537499549501, "grad_norm": 0.22760994732379913, "learning_rate": 1.609153090285447e-05, "loss": 0.4536, "step": 87085 }, { "epoch": 3.1387176992107255, "grad_norm": 0.23996824026107788, "learning_rate": 1.6088804385441616e-05, "loss": 0.3483, "step": 87090 }, { "epoch": 3.1388978988719503, "grad_norm": 0.2677284777164459, "learning_rate": 1.608607798944207e-05, "loss": 0.3906, "step": 87095 }, { "epoch": 3.139078098533175, "grad_norm": 0.28425002098083496, "learning_rate": 1.6083351714892997e-05, "loss": 0.3969, "step": 87100 }, { "epoch": 3.1392582981943993, "grad_norm": 0.2697744369506836, "learning_rate": 1.6080625561831528e-05, "loss": 0.3645, "step": 87105 }, { "epoch": 3.139438497855624, "grad_norm": 0.1847570687532425, "learning_rate": 1.60778995302948e-05, "loss": 0.3889, "step": 87110 }, { "epoch": 3.1396186975168487, "grad_norm": 0.2160649597644806, "learning_rate": 1.6075173620319972e-05, "loss": 0.3851, "step": 87115 }, { 
"epoch": 3.1397988971780735, "grad_norm": 0.22959807515144348, "learning_rate": 1.6072447831944177e-05, "loss": 0.36, "step": 87120 }, { "epoch": 3.1399790968392978, "grad_norm": 0.2232397347688675, "learning_rate": 1.6069722165204544e-05, "loss": 0.3793, "step": 87125 }, { "epoch": 3.1401592965005225, "grad_norm": 0.22282375395298004, "learning_rate": 1.6066996620138224e-05, "loss": 0.3569, "step": 87130 }, { "epoch": 3.1403394961617472, "grad_norm": 0.25350263714790344, "learning_rate": 1.6064271196782337e-05, "loss": 0.4028, "step": 87135 }, { "epoch": 3.140519695822972, "grad_norm": 0.19245287775993347, "learning_rate": 1.6061545895174036e-05, "loss": 0.3635, "step": 87140 }, { "epoch": 3.1406998954841967, "grad_norm": 0.18547998368740082, "learning_rate": 1.6058820715350438e-05, "loss": 0.3974, "step": 87145 }, { "epoch": 3.140880095145421, "grad_norm": 0.20061656832695007, "learning_rate": 1.605609565734867e-05, "loss": 0.3442, "step": 87150 }, { "epoch": 3.1410602948066457, "grad_norm": 0.2687702178955078, "learning_rate": 1.6053370721205877e-05, "loss": 0.3709, "step": 87155 }, { "epoch": 3.1412404944678705, "grad_norm": 0.22766318917274475, "learning_rate": 1.6050645906959178e-05, "loss": 0.3863, "step": 87160 }, { "epoch": 3.141420694129095, "grad_norm": 0.21267448365688324, "learning_rate": 1.6047921214645677e-05, "loss": 0.3719, "step": 87165 }, { "epoch": 3.1416008937903195, "grad_norm": 0.19360697269439697, "learning_rate": 1.604519664430254e-05, "loss": 0.39, "step": 87170 }, { "epoch": 3.141781093451544, "grad_norm": 0.1883769929409027, "learning_rate": 1.6042472195966843e-05, "loss": 0.3667, "step": 87175 }, { "epoch": 3.141961293112769, "grad_norm": 0.22920463979244232, "learning_rate": 1.6039747869675747e-05, "loss": 0.3861, "step": 87180 }, { "epoch": 3.1421414927739937, "grad_norm": 0.23282212018966675, "learning_rate": 1.6037023665466345e-05, "loss": 0.3846, "step": 87185 }, { "epoch": 3.1423216924352184, "grad_norm": 0.23070281744003296, 
"learning_rate": 1.6034299583375752e-05, "loss": 0.408, "step": 87190 }, { "epoch": 3.1425018920964427, "grad_norm": 0.22182480990886688, "learning_rate": 1.60315756234411e-05, "loss": 0.3623, "step": 87195 }, { "epoch": 3.1426820917576674, "grad_norm": 0.23669365048408508, "learning_rate": 1.6028851785699496e-05, "loss": 0.3811, "step": 87200 }, { "epoch": 3.142862291418892, "grad_norm": 0.2434239238500595, "learning_rate": 1.602612807018804e-05, "loss": 0.4078, "step": 87205 }, { "epoch": 3.143042491080117, "grad_norm": 0.190851092338562, "learning_rate": 1.602340447694386e-05, "loss": 0.363, "step": 87210 }, { "epoch": 3.143222690741341, "grad_norm": 0.26526564359664917, "learning_rate": 1.6020681006004046e-05, "loss": 0.4028, "step": 87215 }, { "epoch": 3.143402890402566, "grad_norm": 0.2453896403312683, "learning_rate": 1.6017957657405724e-05, "loss": 0.3361, "step": 87220 }, { "epoch": 3.1435830900637907, "grad_norm": 0.28423580527305603, "learning_rate": 1.601523443118599e-05, "loss": 0.3749, "step": 87225 }, { "epoch": 3.1437632897250154, "grad_norm": 0.24882547557353973, "learning_rate": 1.601251132738194e-05, "loss": 0.4034, "step": 87230 }, { "epoch": 3.14394348938624, "grad_norm": 0.2349334955215454, "learning_rate": 1.600978834603069e-05, "loss": 0.3975, "step": 87235 }, { "epoch": 3.1441236890474644, "grad_norm": 0.2091837227344513, "learning_rate": 1.6007065487169338e-05, "loss": 0.3942, "step": 87240 }, { "epoch": 3.144303888708689, "grad_norm": 0.19795401394367218, "learning_rate": 1.6004342750834965e-05, "loss": 0.3833, "step": 87245 }, { "epoch": 3.144484088369914, "grad_norm": 0.21962173283100128, "learning_rate": 1.6001620137064694e-05, "loss": 0.3709, "step": 87250 }, { "epoch": 3.1446642880311386, "grad_norm": 0.2462879866361618, "learning_rate": 1.5998897645895587e-05, "loss": 0.3942, "step": 87255 }, { "epoch": 3.144844487692363, "grad_norm": 0.21551023423671722, "learning_rate": 1.5996175277364777e-05, "loss": 0.3778, "step": 87260 }, { 
"epoch": 3.1450246873535876, "grad_norm": 0.23634564876556396, "learning_rate": 1.599345303150933e-05, "loss": 0.3598, "step": 87265 }, { "epoch": 3.1452048870148124, "grad_norm": 0.18265362083911896, "learning_rate": 1.599073090836633e-05, "loss": 0.3772, "step": 87270 }, { "epoch": 3.145385086676037, "grad_norm": 0.1682208925485611, "learning_rate": 1.5988008907972894e-05, "loss": 0.336, "step": 87275 }, { "epoch": 3.145565286337262, "grad_norm": 0.28196775913238525, "learning_rate": 1.5985287030366086e-05, "loss": 0.4021, "step": 87280 }, { "epoch": 3.145745485998486, "grad_norm": 0.23696090281009674, "learning_rate": 1.598256527558299e-05, "loss": 0.4015, "step": 87285 }, { "epoch": 3.145925685659711, "grad_norm": 0.24363209307193756, "learning_rate": 1.5979843643660703e-05, "loss": 0.3606, "step": 87290 }, { "epoch": 3.1461058853209356, "grad_norm": 0.24763990938663483, "learning_rate": 1.5977122134636297e-05, "loss": 0.3774, "step": 87295 }, { "epoch": 3.1462860849821603, "grad_norm": 0.23061999678611755, "learning_rate": 1.597440074854686e-05, "loss": 0.3816, "step": 87300 }, { "epoch": 3.146466284643385, "grad_norm": 0.22484789788722992, "learning_rate": 1.5971679485429457e-05, "loss": 0.3834, "step": 87305 }, { "epoch": 3.1466464843046094, "grad_norm": 0.24081788957118988, "learning_rate": 1.5968958345321178e-05, "loss": 0.3798, "step": 87310 }, { "epoch": 3.146826683965834, "grad_norm": 0.21690495312213898, "learning_rate": 1.5966237328259092e-05, "loss": 0.3613, "step": 87315 }, { "epoch": 3.147006883627059, "grad_norm": 0.1942356824874878, "learning_rate": 1.5963516434280275e-05, "loss": 0.3946, "step": 87320 }, { "epoch": 3.1471870832882836, "grad_norm": 0.23814404010772705, "learning_rate": 1.5960795663421796e-05, "loss": 0.4, "step": 87325 }, { "epoch": 3.147367282949508, "grad_norm": 0.20741520822048187, "learning_rate": 1.595807501572073e-05, "loss": 0.3594, "step": 87330 }, { "epoch": 3.1475474826107326, "grad_norm": 0.24436409771442413, 
"learning_rate": 1.595535449121413e-05, "loss": 0.383, "step": 87335 }, { "epoch": 3.1477276822719573, "grad_norm": 0.24962151050567627, "learning_rate": 1.5952634089939083e-05, "loss": 0.4004, "step": 87340 }, { "epoch": 3.147907881933182, "grad_norm": 0.2126917541027069, "learning_rate": 1.5949913811932654e-05, "loss": 0.4005, "step": 87345 }, { "epoch": 3.148088081594407, "grad_norm": 0.19167962670326233, "learning_rate": 1.5947193657231878e-05, "loss": 0.3429, "step": 87350 }, { "epoch": 3.148268281255631, "grad_norm": 0.22857992351055145, "learning_rate": 1.594447362587385e-05, "loss": 0.368, "step": 87355 }, { "epoch": 3.148448480916856, "grad_norm": 0.25584468245506287, "learning_rate": 1.5941753717895604e-05, "loss": 0.4065, "step": 87360 }, { "epoch": 3.1486286805780805, "grad_norm": 0.25212666392326355, "learning_rate": 1.5939033933334222e-05, "loss": 0.3659, "step": 87365 }, { "epoch": 3.1488088802393053, "grad_norm": 0.21378327906131744, "learning_rate": 1.5936314272226743e-05, "loss": 0.3726, "step": 87370 }, { "epoch": 3.14898907990053, "grad_norm": 0.2281913012266159, "learning_rate": 1.5933594734610225e-05, "loss": 0.3816, "step": 87375 }, { "epoch": 3.1491692795617543, "grad_norm": 0.21938398480415344, "learning_rate": 1.5930875320521728e-05, "loss": 0.3704, "step": 87380 }, { "epoch": 3.149349479222979, "grad_norm": 0.23328323662281036, "learning_rate": 1.5928156029998296e-05, "loss": 0.3732, "step": 87385 }, { "epoch": 3.1495296788842038, "grad_norm": 0.26327353715896606, "learning_rate": 1.5925436863076975e-05, "loss": 0.3539, "step": 87390 }, { "epoch": 3.1497098785454285, "grad_norm": 0.2839113175868988, "learning_rate": 1.592271781979483e-05, "loss": 0.3575, "step": 87395 }, { "epoch": 3.149890078206653, "grad_norm": 0.23041868209838867, "learning_rate": 1.5919998900188894e-05, "loss": 0.3847, "step": 87400 }, { "epoch": 3.1500702778678775, "grad_norm": 0.20966331660747528, "learning_rate": 1.5917280104296213e-05, "loss": 0.3631, "step": 
87405 }, { "epoch": 3.1502504775291023, "grad_norm": 0.25130027532577515, "learning_rate": 1.5914561432153836e-05, "loss": 0.4388, "step": 87410 }, { "epoch": 3.150430677190327, "grad_norm": 0.20757076144218445, "learning_rate": 1.5911842883798796e-05, "loss": 0.3988, "step": 87415 }, { "epoch": 3.1506108768515517, "grad_norm": 0.19644039869308472, "learning_rate": 1.590912445926814e-05, "loss": 0.386, "step": 87420 }, { "epoch": 3.150791076512776, "grad_norm": 0.23030658066272736, "learning_rate": 1.590640615859891e-05, "loss": 0.3882, "step": 87425 }, { "epoch": 3.1509712761740007, "grad_norm": 0.24640274047851562, "learning_rate": 1.590368798182812e-05, "loss": 0.3869, "step": 87430 }, { "epoch": 3.1511514758352255, "grad_norm": 0.24621576070785522, "learning_rate": 1.5900969928992832e-05, "loss": 0.3922, "step": 87435 }, { "epoch": 3.15133167549645, "grad_norm": 0.2239450216293335, "learning_rate": 1.589825200013006e-05, "loss": 0.3877, "step": 87440 }, { "epoch": 3.1515118751576745, "grad_norm": 0.22715716063976288, "learning_rate": 1.5895534195276846e-05, "loss": 0.3739, "step": 87445 }, { "epoch": 3.1516920748188992, "grad_norm": 0.22491981089115143, "learning_rate": 1.5892816514470223e-05, "loss": 0.363, "step": 87450 }, { "epoch": 3.151872274480124, "grad_norm": 0.23676250874996185, "learning_rate": 1.5890098957747197e-05, "loss": 0.3657, "step": 87455 }, { "epoch": 3.1520524741413487, "grad_norm": 0.22871702909469604, "learning_rate": 1.5887381525144823e-05, "loss": 0.4015, "step": 87460 }, { "epoch": 3.1522326738025734, "grad_norm": 0.2012801617383957, "learning_rate": 1.5884664216700107e-05, "loss": 0.3338, "step": 87465 }, { "epoch": 3.1524128734637977, "grad_norm": 0.2810083031654358, "learning_rate": 1.588194703245007e-05, "loss": 0.3743, "step": 87470 }, { "epoch": 3.1525930731250225, "grad_norm": 0.21618230640888214, "learning_rate": 1.5879229972431748e-05, "loss": 0.3838, "step": 87475 }, { "epoch": 3.152773272786247, "grad_norm": 
0.23298971354961395, "learning_rate": 1.5876513036682143e-05, "loss": 0.359, "step": 87480 }, { "epoch": 3.152953472447472, "grad_norm": 0.23836317658424377, "learning_rate": 1.5873796225238284e-05, "loss": 0.3654, "step": 87485 }, { "epoch": 3.153133672108696, "grad_norm": 0.20612043142318726, "learning_rate": 1.587107953813719e-05, "loss": 0.3643, "step": 87490 }, { "epoch": 3.153313871769921, "grad_norm": 0.20500215888023376, "learning_rate": 1.5868362975415862e-05, "loss": 0.3322, "step": 87495 }, { "epoch": 3.1534940714311457, "grad_norm": 0.22878599166870117, "learning_rate": 1.586564653711133e-05, "loss": 0.347, "step": 87500 }, { "epoch": 3.1534940714311457, "eval_loss": 0.4312251806259155, "eval_runtime": 3.5283, "eval_samples_per_second": 28.342, "eval_steps_per_second": 7.086, "step": 87500 }, { "epoch": 3.1536742710923704, "grad_norm": 0.23596695065498352, "learning_rate": 1.5862930223260587e-05, "loss": 0.4027, "step": 87505 }, { "epoch": 3.153854470753595, "grad_norm": 0.2374979853630066, "learning_rate": 1.5860214033900657e-05, "loss": 0.3555, "step": 87510 }, { "epoch": 3.1540346704148194, "grad_norm": 0.2398587167263031, "learning_rate": 1.5857497969068535e-05, "loss": 0.3814, "step": 87515 }, { "epoch": 3.154214870076044, "grad_norm": 0.1979122906923294, "learning_rate": 1.5854782028801235e-05, "loss": 0.3692, "step": 87520 }, { "epoch": 3.154395069737269, "grad_norm": 0.23171058297157288, "learning_rate": 1.5852066213135764e-05, "loss": 0.3734, "step": 87525 }, { "epoch": 3.1545752693984936, "grad_norm": 0.221579447388649, "learning_rate": 1.5849350522109124e-05, "loss": 0.3978, "step": 87530 }, { "epoch": 3.1547554690597184, "grad_norm": 0.23226810991764069, "learning_rate": 1.5846634955758294e-05, "loss": 0.3808, "step": 87535 }, { "epoch": 3.1549356687209427, "grad_norm": 0.25539645552635193, "learning_rate": 1.584391951412031e-05, "loss": 0.3677, "step": 87540 }, { "epoch": 3.1551158683821674, "grad_norm": 0.23339438438415527, 
"learning_rate": 1.5841204197232128e-05, "loss": 0.4242, "step": 87545 }, { "epoch": 3.155296068043392, "grad_norm": 0.20781894028186798, "learning_rate": 1.5838489005130784e-05, "loss": 0.3916, "step": 87550 }, { "epoch": 3.155476267704617, "grad_norm": 0.2721303701400757, "learning_rate": 1.5835773937853254e-05, "loss": 0.3906, "step": 87555 }, { "epoch": 3.155656467365841, "grad_norm": 0.20113010704517365, "learning_rate": 1.5833058995436512e-05, "loss": 0.3924, "step": 87560 }, { "epoch": 3.155836667027066, "grad_norm": 0.22510038316249847, "learning_rate": 1.5830344177917587e-05, "loss": 0.4007, "step": 87565 }, { "epoch": 3.1560168666882906, "grad_norm": 0.2468447983264923, "learning_rate": 1.582762948533344e-05, "loss": 0.3602, "step": 87570 }, { "epoch": 3.1561970663495154, "grad_norm": 0.22670796513557434, "learning_rate": 1.582491491772106e-05, "loss": 0.3703, "step": 87575 }, { "epoch": 3.15637726601074, "grad_norm": 0.2570918798446655, "learning_rate": 1.5822200475117442e-05, "loss": 0.3967, "step": 87580 }, { "epoch": 3.1565574656719644, "grad_norm": 0.23322150111198425, "learning_rate": 1.581948615755956e-05, "loss": 0.416, "step": 87585 }, { "epoch": 3.156737665333189, "grad_norm": 0.20641043782234192, "learning_rate": 1.581677196508441e-05, "loss": 0.3588, "step": 87590 }, { "epoch": 3.156917864994414, "grad_norm": 0.21482810378074646, "learning_rate": 1.5814057897728964e-05, "loss": 0.3548, "step": 87595 }, { "epoch": 3.1570980646556386, "grad_norm": 0.2732873260974884, "learning_rate": 1.5811343955530194e-05, "loss": 0.3961, "step": 87600 }, { "epoch": 3.1572782643168633, "grad_norm": 0.23953673243522644, "learning_rate": 1.5808630138525088e-05, "loss": 0.4481, "step": 87605 }, { "epoch": 3.1574584639780876, "grad_norm": 0.23516350984573364, "learning_rate": 1.580591644675062e-05, "loss": 0.3804, "step": 87610 }, { "epoch": 3.1576386636393123, "grad_norm": 0.21418847143650055, "learning_rate": 1.5803202880243754e-05, "loss": 0.3417, "step": 87615 
}, { "epoch": 3.157818863300537, "grad_norm": 0.21651922166347504, "learning_rate": 1.580048943904147e-05, "loss": 0.3643, "step": 87620 }, { "epoch": 3.157999062961762, "grad_norm": 0.309662789106369, "learning_rate": 1.5797776123180736e-05, "loss": 0.4107, "step": 87625 }, { "epoch": 3.158179262622986, "grad_norm": 0.2053297758102417, "learning_rate": 1.5795062932698522e-05, "loss": 0.3574, "step": 87630 }, { "epoch": 3.158359462284211, "grad_norm": 0.2058108001947403, "learning_rate": 1.57923498676318e-05, "loss": 0.3742, "step": 87635 }, { "epoch": 3.1585396619454356, "grad_norm": 0.22333082556724548, "learning_rate": 1.5789636928017514e-05, "loss": 0.3823, "step": 87640 }, { "epoch": 3.1587198616066603, "grad_norm": 0.2727280259132385, "learning_rate": 1.5786924113892655e-05, "loss": 0.3995, "step": 87645 }, { "epoch": 3.158900061267885, "grad_norm": 0.2285049855709076, "learning_rate": 1.578421142529417e-05, "loss": 0.3648, "step": 87650 }, { "epoch": 3.1590802609291093, "grad_norm": 0.24408283829689026, "learning_rate": 1.5781498862259007e-05, "loss": 0.4071, "step": 87655 }, { "epoch": 3.159260460590334, "grad_norm": 0.23655414581298828, "learning_rate": 1.577878642482415e-05, "loss": 0.3995, "step": 87660 }, { "epoch": 3.159440660251559, "grad_norm": 0.21964088082313538, "learning_rate": 1.5776074113026533e-05, "loss": 0.3779, "step": 87665 }, { "epoch": 3.1596208599127835, "grad_norm": 0.18904650211334229, "learning_rate": 1.577336192690313e-05, "loss": 0.404, "step": 87670 }, { "epoch": 3.159801059574008, "grad_norm": 0.2197270691394806, "learning_rate": 1.577064986649088e-05, "loss": 0.3763, "step": 87675 }, { "epoch": 3.1599812592352325, "grad_norm": 0.2542881667613983, "learning_rate": 1.576793793182674e-05, "loss": 0.4108, "step": 87680 }, { "epoch": 3.1601614588964573, "grad_norm": 0.2189742773771286, "learning_rate": 1.5765768474658893e-05, "loss": 0.4039, "step": 87685 }, { "epoch": 3.160341658557682, "grad_norm": 0.2676211893558502, 
"learning_rate": 1.576305676643446e-05, "loss": 0.3957, "step": 87690 }, { "epoch": 3.1605218582189067, "grad_norm": 0.2037624716758728, "learning_rate": 1.5760345184061586e-05, "loss": 0.3458, "step": 87695 }, { "epoch": 3.160702057880131, "grad_norm": 0.21000508964061737, "learning_rate": 1.575763372757723e-05, "loss": 0.3576, "step": 87700 }, { "epoch": 3.1608822575413558, "grad_norm": 0.27110737562179565, "learning_rate": 1.5754922397018323e-05, "loss": 0.35, "step": 87705 }, { "epoch": 3.1610624572025805, "grad_norm": 0.24699506163597107, "learning_rate": 1.5752211192421794e-05, "loss": 0.3493, "step": 87710 }, { "epoch": 3.1612426568638052, "grad_norm": 0.25574126839637756, "learning_rate": 1.574950011382462e-05, "loss": 0.3795, "step": 87715 }, { "epoch": 3.1614228565250295, "grad_norm": 0.24138563871383667, "learning_rate": 1.5746789161263702e-05, "loss": 0.3382, "step": 87720 }, { "epoch": 3.1616030561862543, "grad_norm": 0.2855709493160248, "learning_rate": 1.574407833477599e-05, "loss": 0.3956, "step": 87725 }, { "epoch": 3.161783255847479, "grad_norm": 0.1987280547618866, "learning_rate": 1.574136763439842e-05, "loss": 0.3667, "step": 87730 }, { "epoch": 3.1619634555087037, "grad_norm": 0.19212450087070465, "learning_rate": 1.5738657060167916e-05, "loss": 0.3855, "step": 87735 }, { "epoch": 3.1621436551699285, "grad_norm": 0.20062510669231415, "learning_rate": 1.573594661212143e-05, "loss": 0.3914, "step": 87740 }, { "epoch": 3.1623238548311527, "grad_norm": 0.24137015640735626, "learning_rate": 1.5733236290295873e-05, "loss": 0.3458, "step": 87745 }, { "epoch": 3.1625040544923775, "grad_norm": 0.1870691478252411, "learning_rate": 1.5730526094728173e-05, "loss": 0.3664, "step": 87750 }, { "epoch": 3.162684254153602, "grad_norm": 0.21274974942207336, "learning_rate": 1.5727816025455267e-05, "loss": 0.3506, "step": 87755 }, { "epoch": 3.162864453814827, "grad_norm": 0.21418865025043488, "learning_rate": 1.5725106082514066e-05, "loss": 0.4101, "step": 
87760 }, { "epoch": 3.1630446534760512, "grad_norm": 0.2074471414089203, "learning_rate": 1.5722396265941504e-05, "loss": 0.3705, "step": 87765 }, { "epoch": 3.163224853137276, "grad_norm": 0.26662349700927734, "learning_rate": 1.5719686575774494e-05, "loss": 0.3715, "step": 87770 }, { "epoch": 3.1634050527985007, "grad_norm": 0.22255268692970276, "learning_rate": 1.5716977012049958e-05, "loss": 0.3973, "step": 87775 }, { "epoch": 3.1635852524597254, "grad_norm": 0.2301638126373291, "learning_rate": 1.5714267574804814e-05, "loss": 0.3646, "step": 87780 }, { "epoch": 3.16376545212095, "grad_norm": 0.2992319166660309, "learning_rate": 1.5711558264075978e-05, "loss": 0.3886, "step": 87785 }, { "epoch": 3.1639456517821745, "grad_norm": 0.2510296106338501, "learning_rate": 1.5708849079900355e-05, "loss": 0.3778, "step": 87790 }, { "epoch": 3.164125851443399, "grad_norm": 0.23558221757411957, "learning_rate": 1.5706140022314874e-05, "loss": 0.3634, "step": 87795 }, { "epoch": 3.164306051104624, "grad_norm": 0.1713656485080719, "learning_rate": 1.570343109135643e-05, "loss": 0.3438, "step": 87800 }, { "epoch": 3.1644862507658487, "grad_norm": 0.26629698276519775, "learning_rate": 1.570072228706194e-05, "loss": 0.3836, "step": 87805 }, { "epoch": 3.1646664504270734, "grad_norm": 0.2552080750465393, "learning_rate": 1.5698013609468314e-05, "loss": 0.3914, "step": 87810 }, { "epoch": 3.1648466500882977, "grad_norm": 0.2017066776752472, "learning_rate": 1.5695305058612436e-05, "loss": 0.3464, "step": 87815 }, { "epoch": 3.1650268497495224, "grad_norm": 0.18393248319625854, "learning_rate": 1.569259663453124e-05, "loss": 0.4058, "step": 87820 }, { "epoch": 3.165207049410747, "grad_norm": 0.2161942571401596, "learning_rate": 1.568988833726161e-05, "loss": 0.3795, "step": 87825 }, { "epoch": 3.165387249071972, "grad_norm": 0.21547988057136536, "learning_rate": 1.5687180166840442e-05, "loss": 0.3861, "step": 87830 }, { "epoch": 3.165567448733196, "grad_norm": 0.27021363377571106, 
"learning_rate": 1.5684472123304648e-05, "loss": 0.3855, "step": 87835 }, { "epoch": 3.165747648394421, "grad_norm": 0.2427399456501007, "learning_rate": 1.5681764206691114e-05, "loss": 0.3669, "step": 87840 }, { "epoch": 3.1659278480556456, "grad_norm": 0.2702856957912445, "learning_rate": 1.5679056417036743e-05, "loss": 0.403, "step": 87845 }, { "epoch": 3.1661080477168704, "grad_norm": 0.24403263628482819, "learning_rate": 1.567634875437842e-05, "loss": 0.3985, "step": 87850 }, { "epoch": 3.166288247378095, "grad_norm": 0.24820050597190857, "learning_rate": 1.567364121875304e-05, "loss": 0.3987, "step": 87855 }, { "epoch": 3.1664684470393194, "grad_norm": 0.20138351619243622, "learning_rate": 1.567093381019749e-05, "loss": 0.3929, "step": 87860 }, { "epoch": 3.166648646700544, "grad_norm": 0.22523026168346405, "learning_rate": 1.566822652874867e-05, "loss": 0.3857, "step": 87865 }, { "epoch": 3.166828846361769, "grad_norm": 0.20283663272857666, "learning_rate": 1.5665519374443447e-05, "loss": 0.3864, "step": 87870 }, { "epoch": 3.1670090460229936, "grad_norm": 0.22071103751659393, "learning_rate": 1.566281234731872e-05, "loss": 0.3571, "step": 87875 }, { "epoch": 3.1671892456842183, "grad_norm": 0.262087345123291, "learning_rate": 1.5660105447411364e-05, "loss": 0.365, "step": 87880 }, { "epoch": 3.1673694453454426, "grad_norm": 0.26408714056015015, "learning_rate": 1.565739867475827e-05, "loss": 0.4295, "step": 87885 }, { "epoch": 3.1675496450066674, "grad_norm": 0.30089375376701355, "learning_rate": 1.5654692029396308e-05, "loss": 0.4188, "step": 87890 }, { "epoch": 3.167729844667892, "grad_norm": 0.24746596813201904, "learning_rate": 1.565198551136235e-05, "loss": 0.3927, "step": 87895 }, { "epoch": 3.167910044329117, "grad_norm": 0.2371826320886612, "learning_rate": 1.5649279120693283e-05, "loss": 0.3807, "step": 87900 }, { "epoch": 3.168090243990341, "grad_norm": 0.2455732524394989, "learning_rate": 1.564657285742599e-05, "loss": 0.3812, "step": 87905 }, { 
"epoch": 3.168270443651566, "grad_norm": 0.22298307716846466, "learning_rate": 1.564386672159731e-05, "loss": 0.4142, "step": 87910 }, { "epoch": 3.1684506433127906, "grad_norm": 0.24019892513751984, "learning_rate": 1.5641160713244153e-05, "loss": 0.3813, "step": 87915 }, { "epoch": 3.1686308429740153, "grad_norm": 0.22240878641605377, "learning_rate": 1.563845483240335e-05, "loss": 0.4095, "step": 87920 }, { "epoch": 3.16881104263524, "grad_norm": 0.22564803063869476, "learning_rate": 1.5635749079111807e-05, "loss": 0.3787, "step": 87925 }, { "epoch": 3.1689912422964643, "grad_norm": 0.23460455238819122, "learning_rate": 1.5633043453406364e-05, "loss": 0.3668, "step": 87930 }, { "epoch": 3.169171441957689, "grad_norm": 0.2320113182067871, "learning_rate": 1.5630337955323883e-05, "loss": 0.401, "step": 87935 }, { "epoch": 3.169351641618914, "grad_norm": 0.21181990206241608, "learning_rate": 1.562763258490124e-05, "loss": 0.379, "step": 87940 }, { "epoch": 3.1695318412801385, "grad_norm": 0.23171097040176392, "learning_rate": 1.5624927342175287e-05, "loss": 0.3618, "step": 87945 }, { "epoch": 3.169712040941363, "grad_norm": 0.2735525965690613, "learning_rate": 1.5622222227182884e-05, "loss": 0.4178, "step": 87950 }, { "epoch": 3.1698922406025876, "grad_norm": 0.23134300112724304, "learning_rate": 1.561951723996089e-05, "loss": 0.3827, "step": 87955 }, { "epoch": 3.1700724402638123, "grad_norm": 0.21240036189556122, "learning_rate": 1.561681238054615e-05, "loss": 0.382, "step": 87960 }, { "epoch": 3.170252639925037, "grad_norm": 0.26621800661087036, "learning_rate": 1.5614107648975532e-05, "loss": 0.3772, "step": 87965 }, { "epoch": 3.1704328395862618, "grad_norm": 0.22227172553539276, "learning_rate": 1.561140304528588e-05, "loss": 0.3594, "step": 87970 }, { "epoch": 3.170613039247486, "grad_norm": 0.23364649713039398, "learning_rate": 1.5608698569514037e-05, "loss": 0.372, "step": 87975 }, { "epoch": 3.170793238908711, "grad_norm": 0.23514457046985626, 
"learning_rate": 1.5605994221696864e-05, "loss": 0.4162, "step": 87980 }, { "epoch": 3.1709734385699355, "grad_norm": 0.25838416814804077, "learning_rate": 1.5603290001871197e-05, "loss": 0.4076, "step": 87985 }, { "epoch": 3.1711536382311603, "grad_norm": 0.2554473280906677, "learning_rate": 1.560058591007389e-05, "loss": 0.422, "step": 87990 }, { "epoch": 3.1713338378923845, "grad_norm": 0.23790916800498962, "learning_rate": 1.559788194634178e-05, "loss": 0.3746, "step": 87995 }, { "epoch": 3.1715140375536093, "grad_norm": 0.21130967140197754, "learning_rate": 1.55951781107117e-05, "loss": 0.3621, "step": 88000 }, { "epoch": 3.1715140375536093, "eval_loss": 0.4313719570636749, "eval_runtime": 3.532, "eval_samples_per_second": 28.313, "eval_steps_per_second": 7.078, "step": 88000 }, { "epoch": 3.171694237214834, "grad_norm": 0.2613160312175751, "learning_rate": 1.559247440322051e-05, "loss": 0.3657, "step": 88005 }, { "epoch": 3.1718744368760587, "grad_norm": 0.17358632385730743, "learning_rate": 1.5589770823905027e-05, "loss": 0.36, "step": 88010 }, { "epoch": 3.1720546365372835, "grad_norm": 0.2220027893781662, "learning_rate": 1.5587067372802092e-05, "loss": 0.3705, "step": 88015 }, { "epoch": 3.1722348361985078, "grad_norm": 0.20974379777908325, "learning_rate": 1.5584364049948548e-05, "loss": 0.3666, "step": 88020 }, { "epoch": 3.1724150358597325, "grad_norm": 0.20734605193138123, "learning_rate": 1.5581660855381207e-05, "loss": 0.3625, "step": 88025 }, { "epoch": 3.1725952355209572, "grad_norm": 0.24206040799617767, "learning_rate": 1.557895778913693e-05, "loss": 0.3759, "step": 88030 }, { "epoch": 3.172775435182182, "grad_norm": 0.20642173290252686, "learning_rate": 1.5576254851252526e-05, "loss": 0.3891, "step": 88035 }, { "epoch": 3.1729556348434063, "grad_norm": 0.23246215283870697, "learning_rate": 1.5573552041764818e-05, "loss": 0.4263, "step": 88040 }, { "epoch": 3.173135834504631, "grad_norm": 0.2090509980916977, "learning_rate": 
1.557084936071064e-05, "loss": 0.3807, "step": 88045 }, { "epoch": 3.1733160341658557, "grad_norm": 0.22444801032543182, "learning_rate": 1.5568146808126823e-05, "loss": 0.3697, "step": 88050 }, { "epoch": 3.1734962338270805, "grad_norm": 0.24101091921329498, "learning_rate": 1.556544438405017e-05, "loss": 0.3802, "step": 88055 }, { "epoch": 3.173676433488305, "grad_norm": 0.2750411927700043, "learning_rate": 1.556274208851752e-05, "loss": 0.391, "step": 88060 }, { "epoch": 3.1738566331495295, "grad_norm": 0.21167200803756714, "learning_rate": 1.5560039921565667e-05, "loss": 0.3587, "step": 88065 }, { "epoch": 3.174036832810754, "grad_norm": 0.28249591588974, "learning_rate": 1.5557337883231455e-05, "loss": 0.3977, "step": 88070 }, { "epoch": 3.174217032471979, "grad_norm": 0.23094874620437622, "learning_rate": 1.5554635973551684e-05, "loss": 0.3555, "step": 88075 }, { "epoch": 3.1743972321332037, "grad_norm": 0.24824340641498566, "learning_rate": 1.5551934192563165e-05, "loss": 0.3596, "step": 88080 }, { "epoch": 3.1745774317944284, "grad_norm": 0.23412884771823883, "learning_rate": 1.5549232540302718e-05, "loss": 0.4005, "step": 88085 }, { "epoch": 3.1747576314556527, "grad_norm": 0.2383238971233368, "learning_rate": 1.5546531016807152e-05, "loss": 0.4006, "step": 88090 }, { "epoch": 3.1749378311168774, "grad_norm": 0.2111070156097412, "learning_rate": 1.5543829622113253e-05, "loss": 0.3623, "step": 88095 }, { "epoch": 3.175118030778102, "grad_norm": 0.2079268842935562, "learning_rate": 1.5541128356257866e-05, "loss": 0.3812, "step": 88100 }, { "epoch": 3.175298230439327, "grad_norm": 0.2111787348985672, "learning_rate": 1.5538427219277756e-05, "loss": 0.4066, "step": 88105 }, { "epoch": 3.1754784301005516, "grad_norm": 0.22174887359142303, "learning_rate": 1.5535726211209758e-05, "loss": 0.3579, "step": 88110 }, { "epoch": 3.175658629761776, "grad_norm": 0.2652639150619507, "learning_rate": 1.5533025332090655e-05, "loss": 0.4119, "step": 88115 }, { "epoch": 
3.1758388294230007, "grad_norm": 0.22351694107055664, "learning_rate": 1.553032458195724e-05, "loss": 0.3969, "step": 88120 }, { "epoch": 3.1760190290842254, "grad_norm": 0.2620088756084442, "learning_rate": 1.5527623960846328e-05, "loss": 0.3849, "step": 88125 }, { "epoch": 3.17619922874545, "grad_norm": 0.24307698011398315, "learning_rate": 1.55249234687947e-05, "loss": 0.38, "step": 88130 }, { "epoch": 3.1763794284066744, "grad_norm": 0.2753578722476959, "learning_rate": 1.5522223105839157e-05, "loss": 0.369, "step": 88135 }, { "epoch": 3.176559628067899, "grad_norm": 0.21059057116508484, "learning_rate": 1.551952287201649e-05, "loss": 0.4148, "step": 88140 }, { "epoch": 3.176739827729124, "grad_norm": 0.21120493113994598, "learning_rate": 1.551682276736349e-05, "loss": 0.3609, "step": 88145 }, { "epoch": 3.1769200273903486, "grad_norm": 0.22285541892051697, "learning_rate": 1.551412279191694e-05, "loss": 0.3983, "step": 88150 }, { "epoch": 3.1771002270515734, "grad_norm": 0.22482122480869293, "learning_rate": 1.5511422945713633e-05, "loss": 0.3582, "step": 88155 }, { "epoch": 3.1772804267127976, "grad_norm": 0.21789222955703735, "learning_rate": 1.550872322879035e-05, "loss": 0.3925, "step": 88160 }, { "epoch": 3.1774606263740224, "grad_norm": 0.21532319486141205, "learning_rate": 1.5506023641183876e-05, "loss": 0.3784, "step": 88165 }, { "epoch": 3.177640826035247, "grad_norm": 0.23733597993850708, "learning_rate": 1.550332418293099e-05, "loss": 0.3538, "step": 88170 }, { "epoch": 3.177821025696472, "grad_norm": 0.18654675781726837, "learning_rate": 1.550062485406848e-05, "loss": 0.4173, "step": 88175 }, { "epoch": 3.178001225357696, "grad_norm": 0.2497045248746872, "learning_rate": 1.5497925654633118e-05, "loss": 0.41, "step": 88180 }, { "epoch": 3.178181425018921, "grad_norm": 0.27806994318962097, "learning_rate": 1.5495226584661664e-05, "loss": 0.428, "step": 88185 }, { "epoch": 3.1783616246801456, "grad_norm": 0.20720724761486053, "learning_rate": 
1.549252764419092e-05, "loss": 0.3697, "step": 88190 }, { "epoch": 3.1785418243413703, "grad_norm": 0.21904359757900238, "learning_rate": 1.548982883325765e-05, "loss": 0.3729, "step": 88195 }, { "epoch": 3.178722024002595, "grad_norm": 0.2742276191711426, "learning_rate": 1.5487130151898603e-05, "loss": 0.3845, "step": 88200 }, { "epoch": 3.1789022236638194, "grad_norm": 0.2257348746061325, "learning_rate": 1.5484431600150587e-05, "loss": 0.3508, "step": 88205 }, { "epoch": 3.179082423325044, "grad_norm": 0.23579144477844238, "learning_rate": 1.548173317805033e-05, "loss": 0.3784, "step": 88210 }, { "epoch": 3.179262622986269, "grad_norm": 0.2549505829811096, "learning_rate": 1.547903488563463e-05, "loss": 0.3866, "step": 88215 }, { "epoch": 3.1794428226474936, "grad_norm": 0.2171338051557541, "learning_rate": 1.5476336722940233e-05, "loss": 0.3705, "step": 88220 }, { "epoch": 3.179623022308718, "grad_norm": 0.21523796021938324, "learning_rate": 1.54736386900039e-05, "loss": 0.3732, "step": 88225 }, { "epoch": 3.1798032219699426, "grad_norm": 0.22669003903865814, "learning_rate": 1.5470940786862397e-05, "loss": 0.3728, "step": 88230 }, { "epoch": 3.1799834216311673, "grad_norm": 0.20555952191352844, "learning_rate": 1.5468243013552486e-05, "loss": 0.3688, "step": 88235 }, { "epoch": 3.180163621292392, "grad_norm": 0.24928292632102966, "learning_rate": 1.546554537011091e-05, "loss": 0.4122, "step": 88240 }, { "epoch": 3.180343820953617, "grad_norm": 0.2341477870941162, "learning_rate": 1.5462847856574437e-05, "loss": 0.396, "step": 88245 }, { "epoch": 3.180524020614841, "grad_norm": 0.21584312617778778, "learning_rate": 1.546015047297981e-05, "loss": 0.3566, "step": 88250 }, { "epoch": 3.180704220276066, "grad_norm": 0.25343307852745056, "learning_rate": 1.5457453219363794e-05, "loss": 0.3825, "step": 88255 }, { "epoch": 3.1808844199372905, "grad_norm": 0.26470595598220825, "learning_rate": 1.545475609576313e-05, "loss": 0.3954, "step": 88260 }, { "epoch": 
3.1810646195985153, "grad_norm": 0.19923968613147736, "learning_rate": 1.5452059102214558e-05, "loss": 0.3574, "step": 88265 }, { "epoch": 3.1812448192597396, "grad_norm": 0.20713098347187042, "learning_rate": 1.5449362238754838e-05, "loss": 0.3896, "step": 88270 }, { "epoch": 3.1814250189209643, "grad_norm": 0.24674662947654724, "learning_rate": 1.5446665505420714e-05, "loss": 0.3901, "step": 88275 }, { "epoch": 3.181605218582189, "grad_norm": 0.22390885651111603, "learning_rate": 1.5443968902248913e-05, "loss": 0.375, "step": 88280 }, { "epoch": 3.1817854182434138, "grad_norm": 0.252699077129364, "learning_rate": 1.5441272429276193e-05, "loss": 0.3771, "step": 88285 }, { "epoch": 3.1819656179046385, "grad_norm": 0.25395792722702026, "learning_rate": 1.5438576086539275e-05, "loss": 0.389, "step": 88290 }, { "epoch": 3.182145817565863, "grad_norm": 0.27715232968330383, "learning_rate": 1.543587987407492e-05, "loss": 0.3793, "step": 88295 }, { "epoch": 3.1823260172270875, "grad_norm": 0.27703672647476196, "learning_rate": 1.5433183791919844e-05, "loss": 0.3654, "step": 88300 }, { "epoch": 3.1825062168883123, "grad_norm": 0.21371644735336304, "learning_rate": 1.543048784011078e-05, "loss": 0.3736, "step": 88305 }, { "epoch": 3.182686416549537, "grad_norm": 0.18627633154392242, "learning_rate": 1.542779201868448e-05, "loss": 0.332, "step": 88310 }, { "epoch": 3.1828666162107617, "grad_norm": 0.24603888392448425, "learning_rate": 1.542509632767765e-05, "loss": 0.3981, "step": 88315 }, { "epoch": 3.183046815871986, "grad_norm": 0.26360347867012024, "learning_rate": 1.5422400767127034e-05, "loss": 0.3647, "step": 88320 }, { "epoch": 3.1832270155332107, "grad_norm": 0.24063238501548767, "learning_rate": 1.5419705337069353e-05, "loss": 0.3559, "step": 88325 }, { "epoch": 3.1834072151944355, "grad_norm": 0.20714794099330902, "learning_rate": 1.5417010037541322e-05, "loss": 0.3922, "step": 88330 }, { "epoch": 3.18358741485566, "grad_norm": 0.24574214220046997, 
"learning_rate": 1.5414314868579687e-05, "loss": 0.384, "step": 88335 }, { "epoch": 3.1837676145168845, "grad_norm": 0.2243025153875351, "learning_rate": 1.541161983022115e-05, "loss": 0.3988, "step": 88340 }, { "epoch": 3.1839478141781092, "grad_norm": 0.22772471606731415, "learning_rate": 1.5408924922502437e-05, "loss": 0.4002, "step": 88345 }, { "epoch": 3.184128013839334, "grad_norm": 0.27433350682258606, "learning_rate": 1.5406230145460264e-05, "loss": 0.3967, "step": 88350 }, { "epoch": 3.1843082135005587, "grad_norm": 0.24873776733875275, "learning_rate": 1.540353549913135e-05, "loss": 0.3847, "step": 88355 }, { "epoch": 3.1844884131617834, "grad_norm": 0.21438409388065338, "learning_rate": 1.5400840983552405e-05, "loss": 0.3762, "step": 88360 }, { "epoch": 3.1846686128230077, "grad_norm": 0.20840908586978912, "learning_rate": 1.539814659876015e-05, "loss": 0.4169, "step": 88365 }, { "epoch": 3.1848488124842325, "grad_norm": 0.30924826860427856, "learning_rate": 1.5395452344791277e-05, "loss": 0.3751, "step": 88370 }, { "epoch": 3.185029012145457, "grad_norm": 0.25707584619522095, "learning_rate": 1.5392758221682516e-05, "loss": 0.4209, "step": 88375 }, { "epoch": 3.185209211806682, "grad_norm": 0.26053330302238464, "learning_rate": 1.5390064229470568e-05, "loss": 0.3743, "step": 88380 }, { "epoch": 3.1853894114679067, "grad_norm": 0.24853472411632538, "learning_rate": 1.5387370368192116e-05, "loss": 0.3825, "step": 88385 }, { "epoch": 3.185569611129131, "grad_norm": 0.20806756615638733, "learning_rate": 1.5384676637883904e-05, "loss": 0.3562, "step": 88390 }, { "epoch": 3.1857498107903557, "grad_norm": 0.2653723359107971, "learning_rate": 1.5381983038582588e-05, "loss": 0.374, "step": 88395 }, { "epoch": 3.1859300104515804, "grad_norm": 0.25022828578948975, "learning_rate": 1.5379289570324912e-05, "loss": 0.3863, "step": 88400 }, { "epoch": 3.186110210112805, "grad_norm": 0.2051696926355362, "learning_rate": 1.5376596233147543e-05, "loss": 0.4092, "step": 
88405 }, { "epoch": 3.1862904097740294, "grad_norm": 0.1869521141052246, "learning_rate": 1.5373903027087185e-05, "loss": 0.3755, "step": 88410 }, { "epoch": 3.186470609435254, "grad_norm": 0.17560672760009766, "learning_rate": 1.537120995218054e-05, "loss": 0.3518, "step": 88415 }, { "epoch": 3.186650809096479, "grad_norm": 0.24432684481143951, "learning_rate": 1.5368517008464295e-05, "loss": 0.3625, "step": 88420 }, { "epoch": 3.1868310087577036, "grad_norm": 0.2695041298866272, "learning_rate": 1.5365824195975137e-05, "loss": 0.3801, "step": 88425 }, { "epoch": 3.1870112084189284, "grad_norm": 0.2649804651737213, "learning_rate": 1.5363131514749762e-05, "loss": 0.3637, "step": 88430 }, { "epoch": 3.1871914080801527, "grad_norm": 0.2614179849624634, "learning_rate": 1.5360438964824846e-05, "loss": 0.3891, "step": 88435 }, { "epoch": 3.1873716077413774, "grad_norm": 0.2579090893268585, "learning_rate": 1.5357746546237093e-05, "loss": 0.3654, "step": 88440 }, { "epoch": 3.187551807402602, "grad_norm": 0.2515721619129181, "learning_rate": 1.5355054259023176e-05, "loss": 0.414, "step": 88445 }, { "epoch": 3.187732007063827, "grad_norm": 0.19676761329174042, "learning_rate": 1.5352362103219772e-05, "loss": 0.4121, "step": 88450 }, { "epoch": 3.187912206725051, "grad_norm": 0.3330390751361847, "learning_rate": 1.534967007886357e-05, "loss": 0.4198, "step": 88455 }, { "epoch": 3.188092406386276, "grad_norm": 0.1731852889060974, "learning_rate": 1.5346978185991254e-05, "loss": 0.3721, "step": 88460 }, { "epoch": 3.1882726060475006, "grad_norm": 0.21727561950683594, "learning_rate": 1.5344286424639473e-05, "loss": 0.3973, "step": 88465 }, { "epoch": 3.1884528057087254, "grad_norm": 0.21226783096790314, "learning_rate": 1.534159479484493e-05, "loss": 0.3798, "step": 88470 }, { "epoch": 3.18863300536995, "grad_norm": 0.21172350645065308, "learning_rate": 1.5338903296644282e-05, "loss": 0.3675, "step": 88475 }, { "epoch": 3.1888132050311744, "grad_norm": 0.23403914272785187, 
"learning_rate": 1.533621193007421e-05, "loss": 0.3956, "step": 88480 }, { "epoch": 3.188993404692399, "grad_norm": 0.24958845973014832, "learning_rate": 1.533352069517139e-05, "loss": 0.3753, "step": 88485 }, { "epoch": 3.189173604353624, "grad_norm": 0.29716435074806213, "learning_rate": 1.5330829591972463e-05, "loss": 0.3674, "step": 88490 }, { "epoch": 3.1893538040148486, "grad_norm": 0.23734360933303833, "learning_rate": 1.5328138620514125e-05, "loss": 0.3635, "step": 88495 }, { "epoch": 3.189534003676073, "grad_norm": 0.22716392576694489, "learning_rate": 1.532544778083302e-05, "loss": 0.3993, "step": 88500 }, { "epoch": 3.189534003676073, "eval_loss": 0.4313284754753113, "eval_runtime": 3.5346, "eval_samples_per_second": 28.292, "eval_steps_per_second": 7.073, "step": 88500 }, { "epoch": 3.1897142033372976, "grad_norm": 0.24912869930267334, "learning_rate": 1.532275707296581e-05, "loss": 0.3939, "step": 88505 }, { "epoch": 3.1898944029985223, "grad_norm": 0.20654475688934326, "learning_rate": 1.5320066496949164e-05, "loss": 0.3825, "step": 88510 }, { "epoch": 3.190074602659747, "grad_norm": 0.23317840695381165, "learning_rate": 1.531737605281974e-05, "loss": 0.3779, "step": 88515 }, { "epoch": 3.190254802320972, "grad_norm": 0.2050599455833435, "learning_rate": 1.531468574061419e-05, "loss": 0.3616, "step": 88520 }, { "epoch": 3.190435001982196, "grad_norm": 0.24573275446891785, "learning_rate": 1.5311995560369176e-05, "loss": 0.3505, "step": 88525 }, { "epoch": 3.190615201643421, "grad_norm": 0.22783836722373962, "learning_rate": 1.530930551212134e-05, "loss": 0.3509, "step": 88530 }, { "epoch": 3.1907954013046456, "grad_norm": 0.22740013897418976, "learning_rate": 1.5306615595907348e-05, "loss": 0.35, "step": 88535 }, { "epoch": 3.1909756009658703, "grad_norm": 0.23683089017868042, "learning_rate": 1.5303925811763833e-05, "loss": 0.3592, "step": 88540 }, { "epoch": 3.1911558006270946, "grad_norm": 0.2351250797510147, "learning_rate": 1.530123615972746e-05, 
"loss": 0.4032, "step": 88545 }, { "epoch": 3.1913360002883193, "grad_norm": 0.23128874599933624, "learning_rate": 1.5298546639834867e-05, "loss": 0.3866, "step": 88550 }, { "epoch": 3.191516199949544, "grad_norm": 0.24116584658622742, "learning_rate": 1.529585725212269e-05, "loss": 0.3896, "step": 88555 }, { "epoch": 3.191696399610769, "grad_norm": 0.19982759654521942, "learning_rate": 1.5293167996627583e-05, "loss": 0.3896, "step": 88560 }, { "epoch": 3.1918765992719935, "grad_norm": 0.19958491623401642, "learning_rate": 1.529047887338619e-05, "loss": 0.3574, "step": 88565 }, { "epoch": 3.192056798933218, "grad_norm": 0.22124148905277252, "learning_rate": 1.5287789882435126e-05, "loss": 0.3592, "step": 88570 }, { "epoch": 3.1922369985944425, "grad_norm": 0.19804465770721436, "learning_rate": 1.528510102381106e-05, "loss": 0.3993, "step": 88575 }, { "epoch": 3.1924171982556673, "grad_norm": 0.18390753865242004, "learning_rate": 1.5282412297550603e-05, "loss": 0.4034, "step": 88580 }, { "epoch": 3.192597397916892, "grad_norm": 0.2427562028169632, "learning_rate": 1.5279723703690404e-05, "loss": 0.3776, "step": 88585 }, { "epoch": 3.1927775975781167, "grad_norm": 0.2625272572040558, "learning_rate": 1.5277035242267085e-05, "loss": 0.4091, "step": 88590 }, { "epoch": 3.192957797239341, "grad_norm": 0.20139677822589874, "learning_rate": 1.527434691331727e-05, "loss": 0.4056, "step": 88595 }, { "epoch": 3.1931379969005658, "grad_norm": 0.22303380072116852, "learning_rate": 1.5271658716877607e-05, "loss": 0.3861, "step": 88600 }, { "epoch": 3.1933181965617905, "grad_norm": 0.24220505356788635, "learning_rate": 1.526897065298471e-05, "loss": 0.3655, "step": 88605 }, { "epoch": 3.1934983962230152, "grad_norm": 0.2237461358308792, "learning_rate": 1.5266282721675196e-05, "loss": 0.3396, "step": 88610 }, { "epoch": 3.19367859588424, "grad_norm": 0.22719275951385498, "learning_rate": 1.5263594922985698e-05, "loss": 0.389, "step": 88615 }, { "epoch": 3.1938587955454643, 
"grad_norm": 0.20860248804092407, "learning_rate": 1.5260907256952832e-05, "loss": 0.376, "step": 88620 }, { "epoch": 3.194038995206689, "grad_norm": 0.25672057271003723, "learning_rate": 1.5258219723613226e-05, "loss": 0.3926, "step": 88625 }, { "epoch": 3.1942191948679137, "grad_norm": 0.24725201725959778, "learning_rate": 1.5255532323003488e-05, "loss": 0.3647, "step": 88630 }, { "epoch": 3.1943993945291385, "grad_norm": 0.21268433332443237, "learning_rate": 1.5252845055160231e-05, "loss": 0.4103, "step": 88635 }, { "epoch": 3.1945795941903627, "grad_norm": 0.2236996740102768, "learning_rate": 1.5250157920120079e-05, "loss": 0.3829, "step": 88640 }, { "epoch": 3.1947597938515875, "grad_norm": 0.2377428263425827, "learning_rate": 1.5247470917919634e-05, "loss": 0.3897, "step": 88645 }, { "epoch": 3.194939993512812, "grad_norm": 0.1965668499469757, "learning_rate": 1.5244784048595506e-05, "loss": 0.4063, "step": 88650 }, { "epoch": 3.195120193174037, "grad_norm": 0.21718864142894745, "learning_rate": 1.5242097312184314e-05, "loss": 0.3925, "step": 88655 }, { "epoch": 3.1953003928352617, "grad_norm": 0.25005796551704407, "learning_rate": 1.5239410708722651e-05, "loss": 0.4104, "step": 88660 }, { "epoch": 3.195480592496486, "grad_norm": 0.18256092071533203, "learning_rate": 1.5236724238247132e-05, "loss": 0.3653, "step": 88665 }, { "epoch": 3.1956607921577107, "grad_norm": 0.25699880719184875, "learning_rate": 1.5234037900794362e-05, "loss": 0.3736, "step": 88670 }, { "epoch": 3.1958409918189354, "grad_norm": 0.20626196265220642, "learning_rate": 1.5231351696400915e-05, "loss": 0.3671, "step": 88675 }, { "epoch": 3.19602119148016, "grad_norm": 0.2111903876066208, "learning_rate": 1.5228665625103428e-05, "loss": 0.3878, "step": 88680 }, { "epoch": 3.1962013911413845, "grad_norm": 0.22062833607196808, "learning_rate": 1.5225979686938474e-05, "loss": 0.3727, "step": 88685 }, { "epoch": 3.196381590802609, "grad_norm": 0.2414456009864807, "learning_rate": 
1.5223293881942653e-05, "loss": 0.3594, "step": 88690 }, { "epoch": 3.196561790463834, "grad_norm": 0.21987886726856232, "learning_rate": 1.5220608210152559e-05, "loss": 0.3376, "step": 88695 }, { "epoch": 3.1967419901250587, "grad_norm": 0.20280367136001587, "learning_rate": 1.5217922671604784e-05, "loss": 0.3523, "step": 88700 }, { "epoch": 3.1969221897862834, "grad_norm": 0.259671151638031, "learning_rate": 1.5215237266335925e-05, "loss": 0.3875, "step": 88705 }, { "epoch": 3.1971023894475077, "grad_norm": 0.26262468099594116, "learning_rate": 1.5212551994382562e-05, "loss": 0.3624, "step": 88710 }, { "epoch": 3.1972825891087324, "grad_norm": 0.2648344933986664, "learning_rate": 1.5209866855781277e-05, "loss": 0.3959, "step": 88715 }, { "epoch": 3.197462788769957, "grad_norm": 0.18271714448928833, "learning_rate": 1.5207181850568666e-05, "loss": 0.3252, "step": 88720 }, { "epoch": 3.197642988431182, "grad_norm": 0.23713482916355133, "learning_rate": 1.5204496978781301e-05, "loss": 0.3915, "step": 88725 }, { "epoch": 3.197823188092406, "grad_norm": 0.2027212232351303, "learning_rate": 1.5201812240455776e-05, "loss": 0.368, "step": 88730 }, { "epoch": 3.198003387753631, "grad_norm": 0.26085880398750305, "learning_rate": 1.519912763562866e-05, "loss": 0.3783, "step": 88735 }, { "epoch": 3.1981835874148556, "grad_norm": 0.23765386641025543, "learning_rate": 1.5196443164336526e-05, "loss": 0.3829, "step": 88740 }, { "epoch": 3.1983637870760804, "grad_norm": 0.2554900348186493, "learning_rate": 1.5193758826615964e-05, "loss": 0.3891, "step": 88745 }, { "epoch": 3.198543986737305, "grad_norm": 0.21230515837669373, "learning_rate": 1.5191074622503542e-05, "loss": 0.4042, "step": 88750 }, { "epoch": 3.1987241863985294, "grad_norm": 0.31726622581481934, "learning_rate": 1.5188390552035813e-05, "loss": 0.3832, "step": 88755 }, { "epoch": 3.198904386059754, "grad_norm": 0.21085330843925476, "learning_rate": 1.5185706615249378e-05, "loss": 0.375, "step": 88760 }, { "epoch": 
3.199084585720979, "grad_norm": 0.27633991837501526, "learning_rate": 1.5183022812180783e-05, "loss": 0.4169, "step": 88765 }, { "epoch": 3.1992647853822036, "grad_norm": 0.21804660558700562, "learning_rate": 1.5180339142866607e-05, "loss": 0.3719, "step": 88770 }, { "epoch": 3.199444985043428, "grad_norm": 0.24333439767360687, "learning_rate": 1.517765560734341e-05, "loss": 0.381, "step": 88775 }, { "epoch": 3.1996251847046526, "grad_norm": 0.23350374400615692, "learning_rate": 1.5174972205647744e-05, "loss": 0.3529, "step": 88780 }, { "epoch": 3.1998053843658774, "grad_norm": 0.23045288026332855, "learning_rate": 1.5172288937816193e-05, "loss": 0.3478, "step": 88785 }, { "epoch": 3.199985584027102, "grad_norm": 0.19861705601215363, "learning_rate": 1.5169605803885296e-05, "loss": 0.3737, "step": 88790 }, { "epoch": 3.200165783688327, "grad_norm": 0.3103479743003845, "learning_rate": 1.5166922803891615e-05, "loss": 0.3906, "step": 88795 }, { "epoch": 3.200345983349551, "grad_norm": 0.2640889585018158, "learning_rate": 1.5164239937871708e-05, "loss": 0.3636, "step": 88800 }, { "epoch": 3.200526183010776, "grad_norm": 0.22964945435523987, "learning_rate": 1.5161557205862127e-05, "loss": 0.3578, "step": 88805 }, { "epoch": 3.2007063826720006, "grad_norm": 0.20563103258609772, "learning_rate": 1.5158874607899423e-05, "loss": 0.3629, "step": 88810 }, { "epoch": 3.2008865823332253, "grad_norm": 0.22984810173511505, "learning_rate": 1.515619214402015e-05, "loss": 0.4054, "step": 88815 }, { "epoch": 3.20106678199445, "grad_norm": 0.23353759944438934, "learning_rate": 1.5153509814260849e-05, "loss": 0.3607, "step": 88820 }, { "epoch": 3.2012469816556743, "grad_norm": 0.2417394518852234, "learning_rate": 1.5150827618658075e-05, "loss": 0.3985, "step": 88825 }, { "epoch": 3.201427181316899, "grad_norm": 0.2204267978668213, "learning_rate": 1.514814555724837e-05, "loss": 0.3738, "step": 88830 }, { "epoch": 3.201607380978124, "grad_norm": 0.2334311157464981, "learning_rate": 
1.5145463630068268e-05, "loss": 0.3758, "step": 88835 }, { "epoch": 3.2017875806393485, "grad_norm": 0.21753154695034027, "learning_rate": 1.5142781837154319e-05, "loss": 0.3782, "step": 88840 }, { "epoch": 3.201967780300573, "grad_norm": 0.27690520882606506, "learning_rate": 1.5140100178543057e-05, "loss": 0.377, "step": 88845 }, { "epoch": 3.2021479799617976, "grad_norm": 0.21353106200695038, "learning_rate": 1.5137418654271025e-05, "loss": 0.3773, "step": 88850 }, { "epoch": 3.2023281796230223, "grad_norm": 0.19500304758548737, "learning_rate": 1.513473726437476e-05, "loss": 0.351, "step": 88855 }, { "epoch": 3.202508379284247, "grad_norm": 0.21235820651054382, "learning_rate": 1.5132056008890771e-05, "loss": 0.3599, "step": 88860 }, { "epoch": 3.2026885789454718, "grad_norm": 0.2564550042152405, "learning_rate": 1.5129374887855629e-05, "loss": 0.3966, "step": 88865 }, { "epoch": 3.202868778606696, "grad_norm": 0.24082764983177185, "learning_rate": 1.5126693901305836e-05, "loss": 0.3829, "step": 88870 }, { "epoch": 3.203048978267921, "grad_norm": 0.22911947965621948, "learning_rate": 1.512401304927792e-05, "loss": 0.3408, "step": 88875 }, { "epoch": 3.2032291779291455, "grad_norm": 0.26279905438423157, "learning_rate": 1.5121332331808421e-05, "loss": 0.4124, "step": 88880 }, { "epoch": 3.2034093775903703, "grad_norm": 0.2682557702064514, "learning_rate": 1.511865174893385e-05, "loss": 0.3681, "step": 88885 }, { "epoch": 3.203589577251595, "grad_norm": 0.25883546471595764, "learning_rate": 1.5115971300690746e-05, "loss": 0.4007, "step": 88890 }, { "epoch": 3.2037697769128193, "grad_norm": 0.21053394675254822, "learning_rate": 1.5113290987115614e-05, "loss": 0.3837, "step": 88895 }, { "epoch": 3.203949976574044, "grad_norm": 0.22569601237773895, "learning_rate": 1.5110610808244974e-05, "loss": 0.3969, "step": 88900 }, { "epoch": 3.2041301762352687, "grad_norm": 0.24073798954486847, "learning_rate": 1.5107930764115352e-05, "loss": 0.3974, "step": 88905 }, { 
"epoch": 3.2043103758964935, "grad_norm": 0.24545975029468536, "learning_rate": 1.5105250854763259e-05, "loss": 0.3976, "step": 88910 }, { "epoch": 3.2044905755577178, "grad_norm": 0.18453896045684814, "learning_rate": 1.5102571080225203e-05, "loss": 0.3786, "step": 88915 }, { "epoch": 3.2046707752189425, "grad_norm": 0.2695164084434509, "learning_rate": 1.5099891440537705e-05, "loss": 0.3783, "step": 88920 }, { "epoch": 3.2048509748801672, "grad_norm": 0.22112925350666046, "learning_rate": 1.5097211935737263e-05, "loss": 0.4013, "step": 88925 }, { "epoch": 3.205031174541392, "grad_norm": 0.2127871960401535, "learning_rate": 1.5094532565860398e-05, "loss": 0.3885, "step": 88930 }, { "epoch": 3.2052113742026167, "grad_norm": 0.22440172731876373, "learning_rate": 1.5091853330943606e-05, "loss": 0.3864, "step": 88935 }, { "epoch": 3.205391573863841, "grad_norm": 0.2635382115840912, "learning_rate": 1.508917423102339e-05, "loss": 0.3542, "step": 88940 }, { "epoch": 3.2055717735250657, "grad_norm": 0.20323844254016876, "learning_rate": 1.5086495266136263e-05, "loss": 0.3546, "step": 88945 }, { "epoch": 3.2057519731862905, "grad_norm": 0.21139764785766602, "learning_rate": 1.5083816436318716e-05, "loss": 0.3542, "step": 88950 }, { "epoch": 3.205932172847515, "grad_norm": 0.20650440454483032, "learning_rate": 1.5081137741607249e-05, "loss": 0.3755, "step": 88955 }, { "epoch": 3.2061123725087395, "grad_norm": 0.22922226786613464, "learning_rate": 1.5078459182038367e-05, "loss": 0.3651, "step": 88960 }, { "epoch": 3.206292572169964, "grad_norm": 0.2639060914516449, "learning_rate": 1.5075780757648544e-05, "loss": 0.4042, "step": 88965 }, { "epoch": 3.206472771831189, "grad_norm": 0.20227624475955963, "learning_rate": 1.5073102468474304e-05, "loss": 0.3607, "step": 88970 }, { "epoch": 3.2066529714924137, "grad_norm": 0.23656994104385376, "learning_rate": 1.5070424314552112e-05, "loss": 0.4143, "step": 88975 }, { "epoch": 3.2068331711536384, "grad_norm": 0.21116869151592255, 
"learning_rate": 1.5067746295918462e-05, "loss": 0.3836, "step": 88980 }, { "epoch": 3.2070133708148627, "grad_norm": 0.20764394104480743, "learning_rate": 1.506506841260985e-05, "loss": 0.3969, "step": 88985 }, { "epoch": 3.2071935704760874, "grad_norm": 0.2260044366121292, "learning_rate": 1.5062390664662757e-05, "loss": 0.3741, "step": 88990 }, { "epoch": 3.207373770137312, "grad_norm": 0.2571514844894409, "learning_rate": 1.5059713052113666e-05, "loss": 0.3532, "step": 88995 }, { "epoch": 3.207553969798537, "grad_norm": 0.27601492404937744, "learning_rate": 1.5057035574999067e-05, "loss": 0.3968, "step": 89000 }, { "epoch": 3.207553969798537, "eval_loss": 0.4308409094810486, "eval_runtime": 3.5383, "eval_samples_per_second": 28.262, "eval_steps_per_second": 7.066, "step": 89000 }, { "epoch": 3.207734169459761, "grad_norm": 0.22598758339881897, "learning_rate": 1.5054358233355423e-05, "loss": 0.373, "step": 89005 }, { "epoch": 3.207914369120986, "grad_norm": 0.23747682571411133, "learning_rate": 1.5051681027219228e-05, "loss": 0.366, "step": 89010 }, { "epoch": 3.2080945687822107, "grad_norm": 0.1627287119626999, "learning_rate": 1.5049003956626957e-05, "loss": 0.3545, "step": 89015 }, { "epoch": 3.2082747684434354, "grad_norm": 0.28767460584640503, "learning_rate": 1.504632702161507e-05, "loss": 0.3935, "step": 89020 }, { "epoch": 3.20845496810466, "grad_norm": 0.28209447860717773, "learning_rate": 1.5043650222220057e-05, "loss": 0.3836, "step": 89025 }, { "epoch": 3.2086351677658844, "grad_norm": 0.2737613022327423, "learning_rate": 1.504097355847838e-05, "loss": 0.3978, "step": 89030 }, { "epoch": 3.208815367427109, "grad_norm": 0.2262285202741623, "learning_rate": 1.5038297030426513e-05, "loss": 0.3948, "step": 89035 }, { "epoch": 3.208995567088334, "grad_norm": 0.24471984803676605, "learning_rate": 1.5035620638100926e-05, "loss": 0.4265, "step": 89040 }, { "epoch": 3.2091757667495586, "grad_norm": 0.2190433293581009, "learning_rate": 1.5032944381538061e-05, 
"loss": 0.3845, "step": 89045 }, { "epoch": 3.209355966410783, "grad_norm": 0.22950243949890137, "learning_rate": 1.503026826077441e-05, "loss": 0.4014, "step": 89050 }, { "epoch": 3.2095361660720076, "grad_norm": 0.23515675961971283, "learning_rate": 1.502759227584643e-05, "loss": 0.3942, "step": 89055 }, { "epoch": 3.2097163657332324, "grad_norm": 0.24395707249641418, "learning_rate": 1.502491642679056e-05, "loss": 0.3746, "step": 89060 }, { "epoch": 3.209896565394457, "grad_norm": 0.28980615735054016, "learning_rate": 1.5022240713643287e-05, "loss": 0.401, "step": 89065 }, { "epoch": 3.210076765055682, "grad_norm": 0.1918734461069107, "learning_rate": 1.5019565136441038e-05, "loss": 0.3971, "step": 89070 }, { "epoch": 3.210256964716906, "grad_norm": 0.22146931290626526, "learning_rate": 1.50168896952203e-05, "loss": 0.3756, "step": 89075 }, { "epoch": 3.210437164378131, "grad_norm": 0.20011606812477112, "learning_rate": 1.5014214390017496e-05, "loss": 0.3584, "step": 89080 }, { "epoch": 3.2106173640393556, "grad_norm": 0.2056010365486145, "learning_rate": 1.5011539220869084e-05, "loss": 0.3509, "step": 89085 }, { "epoch": 3.2107975637005803, "grad_norm": 0.2631101608276367, "learning_rate": 1.5008864187811522e-05, "loss": 0.3855, "step": 89090 }, { "epoch": 3.210977763361805, "grad_norm": 0.2047474980354309, "learning_rate": 1.500618929088125e-05, "loss": 0.3843, "step": 89095 }, { "epoch": 3.2111579630230294, "grad_norm": 0.2543022632598877, "learning_rate": 1.5003514530114712e-05, "loss": 0.3901, "step": 89100 }, { "epoch": 3.211338162684254, "grad_norm": 0.22199147939682007, "learning_rate": 1.5000839905548359e-05, "loss": 0.372, "step": 89105 }, { "epoch": 3.211518362345479, "grad_norm": 0.19281704723834991, "learning_rate": 1.4998165417218618e-05, "loss": 0.341, "step": 89110 }, { "epoch": 3.2116985620067036, "grad_norm": 0.2250460386276245, "learning_rate": 1.499549106516194e-05, "loss": 0.3559, "step": 89115 }, { "epoch": 3.2118787616679283, "grad_norm": 
0.2156555950641632, "learning_rate": 1.4992816849414765e-05, "loss": 0.3516, "step": 89120 }, { "epoch": 3.2120589613291526, "grad_norm": 0.2305130809545517, "learning_rate": 1.4990142770013512e-05, "loss": 0.3911, "step": 89125 }, { "epoch": 3.2122391609903773, "grad_norm": 0.23964077234268188, "learning_rate": 1.4987468826994632e-05, "loss": 0.4231, "step": 89130 }, { "epoch": 3.212419360651602, "grad_norm": 0.2621869742870331, "learning_rate": 1.4984795020394548e-05, "loss": 0.378, "step": 89135 }, { "epoch": 3.212599560312827, "grad_norm": 0.18443959951400757, "learning_rate": 1.49821213502497e-05, "loss": 0.3616, "step": 89140 }, { "epoch": 3.212779759974051, "grad_norm": 0.20639346539974213, "learning_rate": 1.4979447816596508e-05, "loss": 0.3615, "step": 89145 }, { "epoch": 3.212959959635276, "grad_norm": 0.24619300663471222, "learning_rate": 1.497677441947139e-05, "loss": 0.4164, "step": 89150 }, { "epoch": 3.2131401592965005, "grad_norm": 0.3176526427268982, "learning_rate": 1.4974101158910791e-05, "loss": 0.3682, "step": 89155 }, { "epoch": 3.2133203589577253, "grad_norm": 0.2009301781654358, "learning_rate": 1.497142803495112e-05, "loss": 0.3765, "step": 89160 }, { "epoch": 3.21350055861895, "grad_norm": 0.2197410613298416, "learning_rate": 1.4968755047628796e-05, "loss": 0.3795, "step": 89165 }, { "epoch": 3.2136807582801743, "grad_norm": 0.23432065546512604, "learning_rate": 1.4966082196980247e-05, "loss": 0.3779, "step": 89170 }, { "epoch": 3.213860957941399, "grad_norm": 0.29888421297073364, "learning_rate": 1.4963409483041874e-05, "loss": 0.3599, "step": 89175 }, { "epoch": 3.2140411576026238, "grad_norm": 0.19982536137104034, "learning_rate": 1.496073690585012e-05, "loss": 0.3989, "step": 89180 }, { "epoch": 3.2142213572638485, "grad_norm": 0.22147499024868011, "learning_rate": 1.4958064465441374e-05, "loss": 0.4284, "step": 89185 }, { "epoch": 3.214401556925073, "grad_norm": 0.23238062858581543, "learning_rate": 1.4955392161852056e-05, "loss": 
0.368, "step": 89190 }, { "epoch": 3.2145817565862975, "grad_norm": 0.251499205827713, "learning_rate": 1.4952719995118574e-05, "loss": 0.4231, "step": 89195 }, { "epoch": 3.2147619562475223, "grad_norm": 0.24169008433818817, "learning_rate": 1.4950047965277342e-05, "loss": 0.3682, "step": 89200 }, { "epoch": 3.214942155908747, "grad_norm": 0.21621721982955933, "learning_rate": 1.4947376072364752e-05, "loss": 0.3801, "step": 89205 }, { "epoch": 3.2151223555699717, "grad_norm": 0.2918843626976013, "learning_rate": 1.4944704316417223e-05, "loss": 0.3745, "step": 89210 }, { "epoch": 3.215302555231196, "grad_norm": 0.1639867126941681, "learning_rate": 1.4942032697471148e-05, "loss": 0.3755, "step": 89215 }, { "epoch": 3.2154827548924207, "grad_norm": 0.2433352768421173, "learning_rate": 1.493936121556293e-05, "loss": 0.3648, "step": 89220 }, { "epoch": 3.2156629545536455, "grad_norm": 0.20584319531917572, "learning_rate": 1.4936689870728974e-05, "loss": 0.3858, "step": 89225 }, { "epoch": 3.21584315421487, "grad_norm": 0.2562110424041748, "learning_rate": 1.4934018663005662e-05, "loss": 0.3917, "step": 89230 }, { "epoch": 3.2160233538760945, "grad_norm": 0.19980406761169434, "learning_rate": 1.4931347592429401e-05, "loss": 0.3751, "step": 89235 }, { "epoch": 3.2162035535373192, "grad_norm": 0.24331307411193848, "learning_rate": 1.4928676659036586e-05, "loss": 0.3779, "step": 89240 }, { "epoch": 3.216383753198544, "grad_norm": 0.20756937563419342, "learning_rate": 1.4926005862863584e-05, "loss": 0.38, "step": 89245 }, { "epoch": 3.2165639528597687, "grad_norm": 0.21686789393424988, "learning_rate": 1.492333520394682e-05, "loss": 0.3911, "step": 89250 }, { "epoch": 3.2167441525209934, "grad_norm": 0.1949186772108078, "learning_rate": 1.492066468232265e-05, "loss": 0.3791, "step": 89255 }, { "epoch": 3.2169243521822177, "grad_norm": 0.19429568946361542, "learning_rate": 1.4917994298027482e-05, "loss": 0.3407, "step": 89260 }, { "epoch": 3.2171045518434425, "grad_norm": 
0.25820302963256836, "learning_rate": 1.4915324051097688e-05, "loss": 0.3799, "step": 89265 }, { "epoch": 3.217284751504667, "grad_norm": 0.2778339684009552, "learning_rate": 1.4912653941569642e-05, "loss": 0.3973, "step": 89270 }, { "epoch": 3.217464951165892, "grad_norm": 0.2582213878631592, "learning_rate": 1.4909983969479747e-05, "loss": 0.3764, "step": 89275 }, { "epoch": 3.217645150827116, "grad_norm": 0.1809670329093933, "learning_rate": 1.490731413486436e-05, "loss": 0.3945, "step": 89280 }, { "epoch": 3.217825350488341, "grad_norm": 0.2597140371799469, "learning_rate": 1.490464443775986e-05, "loss": 0.4122, "step": 89285 }, { "epoch": 3.2180055501495657, "grad_norm": 0.23340429365634918, "learning_rate": 1.490197487820263e-05, "loss": 0.3487, "step": 89290 }, { "epoch": 3.2181857498107904, "grad_norm": 0.2497396469116211, "learning_rate": 1.4899305456229032e-05, "loss": 0.3891, "step": 89295 }, { "epoch": 3.218365949472015, "grad_norm": 0.28371942043304443, "learning_rate": 1.4896636171875447e-05, "loss": 0.4068, "step": 89300 }, { "epoch": 3.2185461491332394, "grad_norm": 0.21911130845546722, "learning_rate": 1.489396702517824e-05, "loss": 0.4053, "step": 89305 }, { "epoch": 3.218726348794464, "grad_norm": 0.2320789396762848, "learning_rate": 1.489129801617377e-05, "loss": 0.3911, "step": 89310 }, { "epoch": 3.218906548455689, "grad_norm": 0.21250326931476593, "learning_rate": 1.4888629144898409e-05, "loss": 0.3706, "step": 89315 }, { "epoch": 3.2190867481169136, "grad_norm": 0.26723039150238037, "learning_rate": 1.4885960411388517e-05, "loss": 0.3946, "step": 89320 }, { "epoch": 3.2192669477781384, "grad_norm": 0.20231136679649353, "learning_rate": 1.4883291815680459e-05, "loss": 0.3582, "step": 89325 }, { "epoch": 3.2194471474393627, "grad_norm": 0.2787347435951233, "learning_rate": 1.4880623357810597e-05, "loss": 0.3847, "step": 89330 }, { "epoch": 3.2196273471005874, "grad_norm": 0.2298644334077835, "learning_rate": 1.4877955037815267e-05, "loss": 
0.3643, "step": 89335 }, { "epoch": 3.219807546761812, "grad_norm": 0.23724813759326935, "learning_rate": 1.4875286855730852e-05, "loss": 0.4065, "step": 89340 }, { "epoch": 3.219987746423037, "grad_norm": 0.1757110357284546, "learning_rate": 1.4872618811593697e-05, "loss": 0.3819, "step": 89345 }, { "epoch": 3.220167946084261, "grad_norm": 0.20195148885250092, "learning_rate": 1.4869950905440136e-05, "loss": 0.3823, "step": 89350 }, { "epoch": 3.220348145745486, "grad_norm": 0.1882009357213974, "learning_rate": 1.4867283137306547e-05, "loss": 0.339, "step": 89355 }, { "epoch": 3.2205283454067106, "grad_norm": 0.24396227300167084, "learning_rate": 1.4864615507229246e-05, "loss": 0.3963, "step": 89360 }, { "epoch": 3.2207085450679354, "grad_norm": 0.2632978558540344, "learning_rate": 1.4861948015244615e-05, "loss": 0.4177, "step": 89365 }, { "epoch": 3.22088874472916, "grad_norm": 0.20694774389266968, "learning_rate": 1.4859280661388974e-05, "loss": 0.367, "step": 89370 }, { "epoch": 3.2210689443903844, "grad_norm": 0.21386392414569855, "learning_rate": 1.4856613445698664e-05, "loss": 0.359, "step": 89375 }, { "epoch": 3.221249144051609, "grad_norm": 0.1855984479188919, "learning_rate": 1.4853946368210036e-05, "loss": 0.3242, "step": 89380 }, { "epoch": 3.221429343712834, "grad_norm": 0.23867137730121613, "learning_rate": 1.485127942895943e-05, "loss": 0.363, "step": 89385 }, { "epoch": 3.2216095433740586, "grad_norm": 0.19533032178878784, "learning_rate": 1.4848612627983166e-05, "loss": 0.4011, "step": 89390 }, { "epoch": 3.2217897430352833, "grad_norm": 0.22574682533740997, "learning_rate": 1.4845945965317598e-05, "loss": 0.3869, "step": 89395 }, { "epoch": 3.2219699426965076, "grad_norm": 0.268163800239563, "learning_rate": 1.4843279440999044e-05, "loss": 0.3802, "step": 89400 }, { "epoch": 3.2221501423577323, "grad_norm": 0.23362210392951965, "learning_rate": 1.4840613055063845e-05, "loss": 0.4094, "step": 89405 }, { "epoch": 3.222330342018957, "grad_norm": 
0.2540440559387207, "learning_rate": 1.483794680754833e-05, "loss": 0.366, "step": 89410 }, { "epoch": 3.222510541680182, "grad_norm": 0.2499903291463852, "learning_rate": 1.4835280698488812e-05, "loss": 0.3656, "step": 89415 }, { "epoch": 3.222690741341406, "grad_norm": 0.21211077272891998, "learning_rate": 1.4832614727921634e-05, "loss": 0.3654, "step": 89420 }, { "epoch": 3.222870941002631, "grad_norm": 0.20574845373630524, "learning_rate": 1.482994889588312e-05, "loss": 0.39, "step": 89425 }, { "epoch": 3.2230511406638556, "grad_norm": 0.21333856880664825, "learning_rate": 1.4827283202409564e-05, "loss": 0.4014, "step": 89430 }, { "epoch": 3.2232313403250803, "grad_norm": 0.1983533501625061, "learning_rate": 1.4824617647537323e-05, "loss": 0.3733, "step": 89435 }, { "epoch": 3.223411539986305, "grad_norm": 0.1866043210029602, "learning_rate": 1.482195223130268e-05, "loss": 0.3471, "step": 89440 }, { "epoch": 3.2235917396475293, "grad_norm": 0.22153297066688538, "learning_rate": 1.481928695374198e-05, "loss": 0.3847, "step": 89445 }, { "epoch": 3.223771939308754, "grad_norm": 0.26498228311538696, "learning_rate": 1.4816621814891524e-05, "loss": 0.3581, "step": 89450 }, { "epoch": 3.223952138969979, "grad_norm": 0.24101939797401428, "learning_rate": 1.4813956814787616e-05, "loss": 0.3513, "step": 89455 }, { "epoch": 3.2241323386312035, "grad_norm": 0.23949559032917023, "learning_rate": 1.481129195346658e-05, "loss": 0.375, "step": 89460 }, { "epoch": 3.224312538292428, "grad_norm": 0.20637662708759308, "learning_rate": 1.4808627230964717e-05, "loss": 0.3764, "step": 89465 }, { "epoch": 3.2244927379536525, "grad_norm": 0.25291964411735535, "learning_rate": 1.4805962647318333e-05, "loss": 0.3823, "step": 89470 }, { "epoch": 3.2246729376148773, "grad_norm": 0.2150401920080185, "learning_rate": 1.4803298202563737e-05, "loss": 0.3555, "step": 89475 }, { "epoch": 3.224853137276102, "grad_norm": 0.25628653168678284, "learning_rate": 1.4800633896737226e-05, "loss": 
0.3762, "step": 89480 }, { "epoch": 3.2250333369373267, "grad_norm": 0.2144256830215454, "learning_rate": 1.4797969729875106e-05, "loss": 0.3503, "step": 89485 }, { "epoch": 3.225213536598551, "grad_norm": 0.2198164016008377, "learning_rate": 1.4795305702013674e-05, "loss": 0.3467, "step": 89490 }, { "epoch": 3.2253937362597758, "grad_norm": 0.2541256844997406, "learning_rate": 1.4792641813189222e-05, "loss": 0.3787, "step": 89495 }, { "epoch": 3.2255739359210005, "grad_norm": 0.21869847178459167, "learning_rate": 1.4789978063438054e-05, "loss": 0.3738, "step": 89500 }, { "epoch": 3.2255739359210005, "eval_loss": 0.43070846796035767, "eval_runtime": 3.5348, "eval_samples_per_second": 28.29, "eval_steps_per_second": 7.073, "step": 89500 }, { "epoch": 3.2257541355822252, "grad_norm": 0.3209418058395386, "learning_rate": 1.4787314452796458e-05, "loss": 0.3811, "step": 89505 }, { "epoch": 3.2259343352434495, "grad_norm": 0.2772040069103241, "learning_rate": 1.4784650981300718e-05, "loss": 0.3696, "step": 89510 }, { "epoch": 3.2261145349046743, "grad_norm": 0.2352096289396286, "learning_rate": 1.4781987648987138e-05, "loss": 0.3697, "step": 89515 }, { "epoch": 3.226294734565899, "grad_norm": 0.19842156767845154, "learning_rate": 1.4779324455891994e-05, "loss": 0.3671, "step": 89520 }, { "epoch": 3.2264749342271237, "grad_norm": 0.2521081864833832, "learning_rate": 1.4776661402051578e-05, "loss": 0.3927, "step": 89525 }, { "epoch": 3.2266551338883485, "grad_norm": 0.23003843426704407, "learning_rate": 1.4773998487502178e-05, "loss": 0.3673, "step": 89530 }, { "epoch": 3.2268353335495727, "grad_norm": 0.20491987466812134, "learning_rate": 1.4771335712280054e-05, "loss": 0.37, "step": 89535 }, { "epoch": 3.2270155332107975, "grad_norm": 0.21708033978939056, "learning_rate": 1.4768673076421512e-05, "loss": 0.3611, "step": 89540 }, { "epoch": 3.227195732872022, "grad_norm": 0.21462254226207733, "learning_rate": 1.4766010579962808e-05, "loss": 0.392, "step": 89545 }, { 
"epoch": 3.227375932533247, "grad_norm": 0.20276306569576263, "learning_rate": 1.4763348222940243e-05, "loss": 0.3416, "step": 89550 }, { "epoch": 3.2275561321944712, "grad_norm": 0.2577986419200897, "learning_rate": 1.4760686005390068e-05, "loss": 0.3846, "step": 89555 }, { "epoch": 3.227736331855696, "grad_norm": 0.22566761076450348, "learning_rate": 1.4758023927348563e-05, "loss": 0.416, "step": 89560 }, { "epoch": 3.2279165315169207, "grad_norm": 0.2527579665184021, "learning_rate": 1.4755361988852002e-05, "loss": 0.4009, "step": 89565 }, { "epoch": 3.2280967311781454, "grad_norm": 0.23741693794727325, "learning_rate": 1.4752700189936649e-05, "loss": 0.3604, "step": 89570 }, { "epoch": 3.22827693083937, "grad_norm": 0.2663268446922302, "learning_rate": 1.4750038530638769e-05, "loss": 0.3795, "step": 89575 }, { "epoch": 3.2284571305005945, "grad_norm": 0.2541162371635437, "learning_rate": 1.4747377010994634e-05, "loss": 0.367, "step": 89580 }, { "epoch": 3.228637330161819, "grad_norm": 0.21635161340236664, "learning_rate": 1.4744715631040495e-05, "loss": 0.3491, "step": 89585 }, { "epoch": 3.228817529823044, "grad_norm": 0.2601666748523712, "learning_rate": 1.4742054390812631e-05, "loss": 0.4016, "step": 89590 }, { "epoch": 3.2289977294842687, "grad_norm": 0.2198321521282196, "learning_rate": 1.4739393290347286e-05, "loss": 0.3752, "step": 89595 }, { "epoch": 3.2291779291454934, "grad_norm": 0.22527672350406647, "learning_rate": 1.4736732329680714e-05, "loss": 0.3589, "step": 89600 }, { "epoch": 3.2293581288067177, "grad_norm": 0.28329557180404663, "learning_rate": 1.4734071508849184e-05, "loss": 0.3664, "step": 89605 }, { "epoch": 3.2295383284679424, "grad_norm": 0.2408505380153656, "learning_rate": 1.4731410827888947e-05, "loss": 0.4055, "step": 89610 }, { "epoch": 3.229718528129167, "grad_norm": 0.2271721065044403, "learning_rate": 1.4728750286836236e-05, "loss": 0.3716, "step": 89615 }, { "epoch": 3.229898727790392, "grad_norm": 0.20692966878414154, 
"learning_rate": 1.472608988572733e-05, "loss": 0.3708, "step": 89620 }, { "epoch": 3.2300789274516166, "grad_norm": 0.19603079557418823, "learning_rate": 1.472342962459844e-05, "loss": 0.3608, "step": 89625 }, { "epoch": 3.230259127112841, "grad_norm": 0.21330444514751434, "learning_rate": 1.4720769503485845e-05, "loss": 0.3808, "step": 89630 }, { "epoch": 3.2304393267740656, "grad_norm": 0.22534000873565674, "learning_rate": 1.471810952242578e-05, "loss": 0.4035, "step": 89635 }, { "epoch": 3.2306195264352904, "grad_norm": 0.22428487241268158, "learning_rate": 1.4715449681454468e-05, "loss": 0.3776, "step": 89640 }, { "epoch": 3.230799726096515, "grad_norm": 0.2968350052833557, "learning_rate": 1.4712789980608177e-05, "loss": 0.3924, "step": 89645 }, { "epoch": 3.2309799257577394, "grad_norm": 0.2184687852859497, "learning_rate": 1.4710130419923123e-05, "loss": 0.3787, "step": 89650 }, { "epoch": 3.231160125418964, "grad_norm": 0.23696725070476532, "learning_rate": 1.470747099943555e-05, "loss": 0.3938, "step": 89655 }, { "epoch": 3.231340325080189, "grad_norm": 0.24137650430202484, "learning_rate": 1.4704811719181693e-05, "loss": 0.4011, "step": 89660 }, { "epoch": 3.2315205247414136, "grad_norm": 0.2342267483472824, "learning_rate": 1.4702152579197782e-05, "loss": 0.3619, "step": 89665 }, { "epoch": 3.2317007244026383, "grad_norm": 0.2602199912071228, "learning_rate": 1.4699493579520052e-05, "loss": 0.4182, "step": 89670 }, { "epoch": 3.2318809240638626, "grad_norm": 0.2467493712902069, "learning_rate": 1.4696834720184725e-05, "loss": 0.3843, "step": 89675 }, { "epoch": 3.2320611237250874, "grad_norm": 0.23336918652057648, "learning_rate": 1.4694176001228027e-05, "loss": 0.3789, "step": 89680 }, { "epoch": 3.232241323386312, "grad_norm": 0.22967779636383057, "learning_rate": 1.4691517422686191e-05, "loss": 0.3782, "step": 89685 }, { "epoch": 3.232421523047537, "grad_norm": 0.19825366139411926, "learning_rate": 1.4688858984595432e-05, "loss": 0.3776, "step": 
89690 }, { "epoch": 3.232601722708761, "grad_norm": 0.18190155923366547, "learning_rate": 1.4686200686991972e-05, "loss": 0.3398, "step": 89695 }, { "epoch": 3.232781922369986, "grad_norm": 0.21826298534870148, "learning_rate": 1.468354252991203e-05, "loss": 0.371, "step": 89700 }, { "epoch": 3.2329621220312106, "grad_norm": 0.2672412097454071, "learning_rate": 1.4680884513391824e-05, "loss": 0.3691, "step": 89705 }, { "epoch": 3.2331423216924353, "grad_norm": 0.25871455669403076, "learning_rate": 1.4678226637467574e-05, "loss": 0.3965, "step": 89710 }, { "epoch": 3.23332252135366, "grad_norm": 0.24801315367221832, "learning_rate": 1.467556890217549e-05, "loss": 0.4047, "step": 89715 }, { "epoch": 3.2335027210148843, "grad_norm": 0.20735400915145874, "learning_rate": 1.4672911307551768e-05, "loss": 0.3561, "step": 89720 }, { "epoch": 3.233682920676109, "grad_norm": 0.24072608351707458, "learning_rate": 1.4670253853632642e-05, "loss": 0.3899, "step": 89725 }, { "epoch": 3.233863120337334, "grad_norm": 0.24866987764835358, "learning_rate": 1.4667596540454296e-05, "loss": 0.3894, "step": 89730 }, { "epoch": 3.2340433199985585, "grad_norm": 0.2773805856704712, "learning_rate": 1.466493936805296e-05, "loss": 0.4052, "step": 89735 }, { "epoch": 3.234223519659783, "grad_norm": 0.2559952735900879, "learning_rate": 1.4662282336464817e-05, "loss": 0.3995, "step": 89740 }, { "epoch": 3.2344037193210076, "grad_norm": 0.20046544075012207, "learning_rate": 1.4659625445726068e-05, "loss": 0.388, "step": 89745 }, { "epoch": 3.2345839189822323, "grad_norm": 0.24034267663955688, "learning_rate": 1.4656968695872936e-05, "loss": 0.3999, "step": 89750 }, { "epoch": 3.234764118643457, "grad_norm": 0.20936627686023712, "learning_rate": 1.4654312086941597e-05, "loss": 0.3932, "step": 89755 }, { "epoch": 3.2349443183046818, "grad_norm": 0.25857529044151306, "learning_rate": 1.4651655618968246e-05, "loss": 0.3954, "step": 89760 }, { "epoch": 3.235124517965906, "grad_norm": 
0.20144201815128326, "learning_rate": 1.4648999291989093e-05, "loss": 0.3765, "step": 89765 }, { "epoch": 3.235304717627131, "grad_norm": 0.25030654668807983, "learning_rate": 1.4646343106040314e-05, "loss": 0.3872, "step": 89770 }, { "epoch": 3.2354849172883555, "grad_norm": 0.18118837475776672, "learning_rate": 1.464368706115811e-05, "loss": 0.3797, "step": 89775 }, { "epoch": 3.2356651169495803, "grad_norm": 0.2649378180503845, "learning_rate": 1.4641031157378665e-05, "loss": 0.3662, "step": 89780 }, { "epoch": 3.2358453166108045, "grad_norm": 0.22662107646465302, "learning_rate": 1.4638375394738157e-05, "loss": 0.3622, "step": 89785 }, { "epoch": 3.2360255162720293, "grad_norm": 0.2865629494190216, "learning_rate": 1.4635719773272788e-05, "loss": 0.4174, "step": 89790 }, { "epoch": 3.236205715933254, "grad_norm": 0.2678220868110657, "learning_rate": 1.4633064293018728e-05, "loss": 0.3557, "step": 89795 }, { "epoch": 3.2363859155944787, "grad_norm": 0.2645963728427887, "learning_rate": 1.4630408954012153e-05, "loss": 0.3925, "step": 89800 }, { "epoch": 3.2365661152557035, "grad_norm": 0.21265912055969238, "learning_rate": 1.4627753756289258e-05, "loss": 0.349, "step": 89805 }, { "epoch": 3.2367463149169278, "grad_norm": 0.23955461382865906, "learning_rate": 1.4625098699886204e-05, "loss": 0.42, "step": 89810 }, { "epoch": 3.2369265145781525, "grad_norm": 0.28037208318710327, "learning_rate": 1.4622443784839174e-05, "loss": 0.3545, "step": 89815 }, { "epoch": 3.2371067142393772, "grad_norm": 0.20563095808029175, "learning_rate": 1.4619789011184343e-05, "loss": 0.3833, "step": 89820 }, { "epoch": 3.237286913900602, "grad_norm": 0.2627568542957306, "learning_rate": 1.4617134378957864e-05, "loss": 0.3868, "step": 89825 }, { "epoch": 3.2374671135618267, "grad_norm": 0.23573757708072662, "learning_rate": 1.4614479888195931e-05, "loss": 0.3716, "step": 89830 }, { "epoch": 3.237647313223051, "grad_norm": 0.22677142918109894, "learning_rate": 1.4611825538934699e-05, 
"loss": 0.39, "step": 89835 }, { "epoch": 3.2378275128842757, "grad_norm": 0.23571011424064636, "learning_rate": 1.4609171331210324e-05, "loss": 0.383, "step": 89840 }, { "epoch": 3.2380077125455005, "grad_norm": 0.2776787281036377, "learning_rate": 1.4606517265058982e-05, "loss": 0.3464, "step": 89845 }, { "epoch": 3.238187912206725, "grad_norm": 0.21779267489910126, "learning_rate": 1.4603863340516826e-05, "loss": 0.3814, "step": 89850 }, { "epoch": 3.2383681118679495, "grad_norm": 0.2640026807785034, "learning_rate": 1.4601209557620027e-05, "loss": 0.3906, "step": 89855 }, { "epoch": 3.238548311529174, "grad_norm": 0.2678093910217285, "learning_rate": 1.4598555916404732e-05, "loss": 0.3533, "step": 89860 }, { "epoch": 3.238728511190399, "grad_norm": 0.24538059532642365, "learning_rate": 1.4595902416907092e-05, "loss": 0.3712, "step": 89865 }, { "epoch": 3.2389087108516237, "grad_norm": 0.27062126994132996, "learning_rate": 1.4593249059163283e-05, "loss": 0.3648, "step": 89870 }, { "epoch": 3.2390889105128484, "grad_norm": 0.24212132394313812, "learning_rate": 1.4590595843209432e-05, "loss": 0.3855, "step": 89875 }, { "epoch": 3.2392691101740727, "grad_norm": 0.2248118668794632, "learning_rate": 1.4587942769081689e-05, "loss": 0.347, "step": 89880 }, { "epoch": 3.2394493098352974, "grad_norm": 0.21070916950702667, "learning_rate": 1.4585289836816218e-05, "loss": 0.3758, "step": 89885 }, { "epoch": 3.239629509496522, "grad_norm": 0.19322839379310608, "learning_rate": 1.4582637046449148e-05, "loss": 0.3913, "step": 89890 }, { "epoch": 3.239809709157747, "grad_norm": 0.2393057942390442, "learning_rate": 1.4579984398016649e-05, "loss": 0.3938, "step": 89895 }, { "epoch": 3.2399899088189716, "grad_norm": 0.24970926344394684, "learning_rate": 1.4577331891554839e-05, "loss": 0.3932, "step": 89900 }, { "epoch": 3.240170108480196, "grad_norm": 0.23957009613513947, "learning_rate": 1.4574679527099852e-05, "loss": 0.3849, "step": 89905 }, { "epoch": 3.2403503081414207, 
"grad_norm": 0.22815951704978943, "learning_rate": 1.4572027304687847e-05, "loss": 0.3937, "step": 89910 }, { "epoch": 3.2405305078026454, "grad_norm": 0.20002481341362, "learning_rate": 1.4569375224354953e-05, "loss": 0.3943, "step": 89915 }, { "epoch": 3.24071070746387, "grad_norm": 0.2669081687927246, "learning_rate": 1.4566723286137301e-05, "loss": 0.4006, "step": 89920 }, { "epoch": 3.2408909071250944, "grad_norm": 0.2839129567146301, "learning_rate": 1.4564071490071024e-05, "loss": 0.3667, "step": 89925 }, { "epoch": 3.241071106786319, "grad_norm": 0.1999591588973999, "learning_rate": 1.4561419836192241e-05, "loss": 0.3618, "step": 89930 }, { "epoch": 3.241251306447544, "grad_norm": 0.21648964285850525, "learning_rate": 1.4558768324537105e-05, "loss": 0.3854, "step": 89935 }, { "epoch": 3.2414315061087686, "grad_norm": 0.17775964736938477, "learning_rate": 1.4556116955141729e-05, "loss": 0.3296, "step": 89940 }, { "epoch": 3.2416117057699934, "grad_norm": 0.24124689400196075, "learning_rate": 1.4553465728042236e-05, "loss": 0.3892, "step": 89945 }, { "epoch": 3.2417919054312176, "grad_norm": 0.19329720735549927, "learning_rate": 1.4550814643274752e-05, "loss": 0.3676, "step": 89950 }, { "epoch": 3.2419721050924424, "grad_norm": 0.1987384855747223, "learning_rate": 1.4548163700875384e-05, "loss": 0.3755, "step": 89955 }, { "epoch": 3.242152304753667, "grad_norm": 0.2366037666797638, "learning_rate": 1.4545512900880271e-05, "loss": 0.3561, "step": 89960 }, { "epoch": 3.242332504414892, "grad_norm": 0.2112271934747696, "learning_rate": 1.454286224332553e-05, "loss": 0.381, "step": 89965 }, { "epoch": 3.242512704076116, "grad_norm": 0.2448207587003708, "learning_rate": 1.454021172824725e-05, "loss": 0.4151, "step": 89970 }, { "epoch": 3.242692903737341, "grad_norm": 0.24623465538024902, "learning_rate": 1.4537561355681567e-05, "loss": 0.369, "step": 89975 }, { "epoch": 3.2428731033985656, "grad_norm": 0.2527907192707062, "learning_rate": 1.4534911125664585e-05, 
"loss": 0.4135, "step": 89980 }, { "epoch": 3.2430533030597903, "grad_norm": 0.21829581260681152, "learning_rate": 1.4532261038232414e-05, "loss": 0.3546, "step": 89985 }, { "epoch": 3.243233502721015, "grad_norm": 0.196324422955513, "learning_rate": 1.4529611093421158e-05, "loss": 0.4074, "step": 89990 }, { "epoch": 3.2434137023822394, "grad_norm": 0.22843465209007263, "learning_rate": 1.4526961291266916e-05, "loss": 0.3604, "step": 89995 }, { "epoch": 3.243593902043464, "grad_norm": 0.19749246537685394, "learning_rate": 1.452431163180581e-05, "loss": 0.3636, "step": 90000 }, { "epoch": 3.243593902043464, "eval_loss": 0.4301738440990448, "eval_runtime": 3.5334, "eval_samples_per_second": 28.301, "eval_steps_per_second": 7.075, "step": 90000 }, { "epoch": 3.243774101704689, "grad_norm": 0.2458840310573578, "learning_rate": 1.4521662115073925e-05, "loss": 0.3655, "step": 90005 }, { "epoch": 3.2439543013659136, "grad_norm": 0.23517417907714844, "learning_rate": 1.4519012741107368e-05, "loss": 0.3811, "step": 90010 }, { "epoch": 3.244134501027138, "grad_norm": 0.21552741527557373, "learning_rate": 1.4516363509942233e-05, "loss": 0.3699, "step": 90015 }, { "epoch": 3.2443147006883626, "grad_norm": 0.2056073546409607, "learning_rate": 1.4513714421614617e-05, "loss": 0.3778, "step": 90020 }, { "epoch": 3.2444949003495873, "grad_norm": 0.219630628824234, "learning_rate": 1.4511065476160598e-05, "loss": 0.3692, "step": 90025 }, { "epoch": 3.244675100010812, "grad_norm": 0.2458181381225586, "learning_rate": 1.4508416673616292e-05, "loss": 0.3989, "step": 90030 }, { "epoch": 3.244855299672037, "grad_norm": 0.23041844367980957, "learning_rate": 1.4505768014017782e-05, "loss": 0.3763, "step": 90035 }, { "epoch": 3.245035499333261, "grad_norm": 0.2688606083393097, "learning_rate": 1.4503119497401147e-05, "loss": 0.3814, "step": 90040 }, { "epoch": 3.245215698994486, "grad_norm": 0.20905734598636627, "learning_rate": 1.4500471123802478e-05, "loss": 0.3603, "step": 90045 }, { 
"epoch": 3.2453958986557105, "grad_norm": 0.2165406346321106, "learning_rate": 1.4497822893257845e-05, "loss": 0.3778, "step": 90050 }, { "epoch": 3.2455760983169353, "grad_norm": 0.28338685631752014, "learning_rate": 1.4495174805803358e-05, "loss": 0.3849, "step": 90055 }, { "epoch": 3.2457562979781596, "grad_norm": 0.24347181618213654, "learning_rate": 1.4492526861475084e-05, "loss": 0.3794, "step": 90060 }, { "epoch": 3.2459364976393843, "grad_norm": 0.17866498231887817, "learning_rate": 1.448987906030908e-05, "loss": 0.3434, "step": 90065 }, { "epoch": 3.246116697300609, "grad_norm": 0.23330989480018616, "learning_rate": 1.4487231402341447e-05, "loss": 0.377, "step": 90070 }, { "epoch": 3.2462968969618338, "grad_norm": 0.23260824382305145, "learning_rate": 1.4484583887608243e-05, "loss": 0.3774, "step": 90075 }, { "epoch": 3.2464770966230585, "grad_norm": 0.20770181715488434, "learning_rate": 1.4481936516145567e-05, "loss": 0.3724, "step": 90080 }, { "epoch": 3.246657296284283, "grad_norm": 0.20958863198757172, "learning_rate": 1.447928928798946e-05, "loss": 0.3703, "step": 90085 }, { "epoch": 3.2468374959455075, "grad_norm": 0.2624190151691437, "learning_rate": 1.4476642203175988e-05, "loss": 0.3906, "step": 90090 }, { "epoch": 3.2470176956067323, "grad_norm": 0.27643921971321106, "learning_rate": 1.4473995261741238e-05, "loss": 0.36, "step": 90095 }, { "epoch": 3.247197895267957, "grad_norm": 0.24671484529972076, "learning_rate": 1.4471348463721268e-05, "loss": 0.3819, "step": 90100 }, { "epoch": 3.2473780949291817, "grad_norm": 0.22529368102550507, "learning_rate": 1.4468701809152135e-05, "loss": 0.352, "step": 90105 }, { "epoch": 3.247558294590406, "grad_norm": 0.22821401059627533, "learning_rate": 1.44660552980699e-05, "loss": 0.3884, "step": 90110 }, { "epoch": 3.2477384942516307, "grad_norm": 0.22324234247207642, "learning_rate": 1.4463408930510614e-05, "loss": 0.3781, "step": 90115 }, { "epoch": 3.2479186939128555, "grad_norm": 0.25382617115974426, 
"learning_rate": 1.446076270651035e-05, "loss": 0.3944, "step": 90120 }, { "epoch": 3.24809889357408, "grad_norm": 0.2712723910808563, "learning_rate": 1.4458116626105154e-05, "loss": 0.3762, "step": 90125 }, { "epoch": 3.248279093235305, "grad_norm": 0.2119145393371582, "learning_rate": 1.4455470689331078e-05, "loss": 0.3578, "step": 90130 }, { "epoch": 3.2484592928965292, "grad_norm": 0.2648928165435791, "learning_rate": 1.4452824896224171e-05, "loss": 0.3913, "step": 90135 }, { "epoch": 3.248639492557754, "grad_norm": 0.21984988451004028, "learning_rate": 1.4450179246820475e-05, "loss": 0.3591, "step": 90140 }, { "epoch": 3.2488196922189787, "grad_norm": 0.25489935278892517, "learning_rate": 1.4447533741156055e-05, "loss": 0.3913, "step": 90145 }, { "epoch": 3.2489998918802034, "grad_norm": 0.19354936480522156, "learning_rate": 1.4444888379266952e-05, "loss": 0.3815, "step": 90150 }, { "epoch": 3.2491800915414277, "grad_norm": 0.2649206519126892, "learning_rate": 1.4442243161189184e-05, "loss": 0.3611, "step": 90155 }, { "epoch": 3.2493602912026525, "grad_norm": 0.23419325053691864, "learning_rate": 1.4439598086958816e-05, "loss": 0.3869, "step": 90160 }, { "epoch": 3.249540490863877, "grad_norm": 0.2443428486585617, "learning_rate": 1.4436953156611881e-05, "loss": 0.4055, "step": 90165 }, { "epoch": 3.249720690525102, "grad_norm": 0.18674413859844208, "learning_rate": 1.4434308370184413e-05, "loss": 0.3629, "step": 90170 }, { "epoch": 3.2499008901863267, "grad_norm": 0.2267744094133377, "learning_rate": 1.443166372771245e-05, "loss": 0.3401, "step": 90175 }, { "epoch": 3.250081089847551, "grad_norm": 0.1851803958415985, "learning_rate": 1.442901922923201e-05, "loss": 0.3667, "step": 90180 }, { "epoch": 3.2502612895087757, "grad_norm": 0.2092495560646057, "learning_rate": 1.4426374874779146e-05, "loss": 0.4123, "step": 90185 }, { "epoch": 3.2504414891700004, "grad_norm": 0.20582100749015808, "learning_rate": 1.4423730664389883e-05, "loss": 0.4061, "step": 90190 
}, { "epoch": 3.250621688831225, "grad_norm": 0.23701712489128113, "learning_rate": 1.4421086598100237e-05, "loss": 0.3766, "step": 90195 }, { "epoch": 3.2508018884924494, "grad_norm": 0.1818612813949585, "learning_rate": 1.441844267594624e-05, "loss": 0.3682, "step": 90200 }, { "epoch": 3.250982088153674, "grad_norm": 0.23264910280704498, "learning_rate": 1.4415798897963911e-05, "loss": 0.3912, "step": 90205 }, { "epoch": 3.251162287814899, "grad_norm": 0.23024223744869232, "learning_rate": 1.4413155264189266e-05, "loss": 0.3769, "step": 90210 }, { "epoch": 3.2513424874761236, "grad_norm": 0.1898718625307083, "learning_rate": 1.4410511774658336e-05, "loss": 0.3937, "step": 90215 }, { "epoch": 3.2515226871373484, "grad_norm": 0.2725496292114258, "learning_rate": 1.4407868429407138e-05, "loss": 0.4097, "step": 90220 }, { "epoch": 3.2517028867985727, "grad_norm": 0.2061624974012375, "learning_rate": 1.4405225228471678e-05, "loss": 0.3782, "step": 90225 }, { "epoch": 3.2518830864597974, "grad_norm": 0.240001380443573, "learning_rate": 1.4402582171887973e-05, "loss": 0.3728, "step": 90230 }, { "epoch": 3.252063286121022, "grad_norm": 0.22183412313461304, "learning_rate": 1.4399939259692027e-05, "loss": 0.3642, "step": 90235 }, { "epoch": 3.252243485782247, "grad_norm": 0.2705990672111511, "learning_rate": 1.4397296491919865e-05, "loss": 0.3807, "step": 90240 }, { "epoch": 3.252423685443471, "grad_norm": 0.2311464250087738, "learning_rate": 1.4394653868607497e-05, "loss": 0.3711, "step": 90245 }, { "epoch": 3.252603885104696, "grad_norm": 0.226415753364563, "learning_rate": 1.4392011389790893e-05, "loss": 0.4024, "step": 90250 }, { "epoch": 3.2527840847659206, "grad_norm": 0.24060435593128204, "learning_rate": 1.4389369055506094e-05, "loss": 0.4227, "step": 90255 }, { "epoch": 3.2529642844271454, "grad_norm": 0.2353263795375824, "learning_rate": 1.4386726865789077e-05, "loss": 0.3729, "step": 90260 }, { "epoch": 3.25314448408837, "grad_norm": 0.23941341042518616, 
"learning_rate": 1.438408482067587e-05, "loss": 0.3689, "step": 90265 }, { "epoch": 3.2533246837495944, "grad_norm": 0.24477094411849976, "learning_rate": 1.4381442920202442e-05, "loss": 0.4205, "step": 90270 }, { "epoch": 3.253504883410819, "grad_norm": 0.21615129709243774, "learning_rate": 1.437880116440479e-05, "loss": 0.3556, "step": 90275 }, { "epoch": 3.253685083072044, "grad_norm": 0.2210584431886673, "learning_rate": 1.4376159553318924e-05, "loss": 0.3909, "step": 90280 }, { "epoch": 3.2538652827332686, "grad_norm": 0.26296374201774597, "learning_rate": 1.4373518086980827e-05, "loss": 0.3978, "step": 90285 }, { "epoch": 3.254045482394493, "grad_norm": 0.26196447014808655, "learning_rate": 1.437087676542649e-05, "loss": 0.3911, "step": 90290 }, { "epoch": 3.2542256820557176, "grad_norm": 0.2408955991268158, "learning_rate": 1.43682355886919e-05, "loss": 0.38, "step": 90295 }, { "epoch": 3.2544058817169423, "grad_norm": 0.21061177551746368, "learning_rate": 1.436559455681303e-05, "loss": 0.3891, "step": 90300 }, { "epoch": 3.254586081378167, "grad_norm": 0.31329545378685, "learning_rate": 1.4362953669825885e-05, "loss": 0.3841, "step": 90305 }, { "epoch": 3.254766281039392, "grad_norm": 0.22759318351745605, "learning_rate": 1.4360312927766439e-05, "loss": 0.3843, "step": 90310 }, { "epoch": 3.254946480700616, "grad_norm": 0.22500398755073547, "learning_rate": 1.4357672330670668e-05, "loss": 0.389, "step": 90315 }, { "epoch": 3.255126680361841, "grad_norm": 0.2273656278848648, "learning_rate": 1.435503187857455e-05, "loss": 0.4046, "step": 90320 }, { "epoch": 3.2553068800230656, "grad_norm": 0.22215020656585693, "learning_rate": 1.4352391571514053e-05, "loss": 0.4307, "step": 90325 }, { "epoch": 3.2554870796842903, "grad_norm": 0.2710254490375519, "learning_rate": 1.434975140952517e-05, "loss": 0.3778, "step": 90330 }, { "epoch": 3.2556672793455146, "grad_norm": 0.2155771702528, "learning_rate": 1.4347111392643873e-05, "loss": 0.3671, "step": 90335 }, { 
"epoch": 3.2558474790067393, "grad_norm": 0.2274942249059677, "learning_rate": 1.43444715209061e-05, "loss": 0.4002, "step": 90340 }, { "epoch": 3.256027678667964, "grad_norm": 0.22359316051006317, "learning_rate": 1.4341831794347849e-05, "loss": 0.3652, "step": 90345 }, { "epoch": 3.256207878329189, "grad_norm": 0.239881694316864, "learning_rate": 1.4339192213005076e-05, "loss": 0.3881, "step": 90350 }, { "epoch": 3.2563880779904135, "grad_norm": 0.22023212909698486, "learning_rate": 1.4336552776913733e-05, "loss": 0.3707, "step": 90355 }, { "epoch": 3.2565682776516383, "grad_norm": 0.2799815535545349, "learning_rate": 1.4333913486109817e-05, "loss": 0.3659, "step": 90360 }, { "epoch": 3.2567484773128625, "grad_norm": 0.19893276691436768, "learning_rate": 1.4331274340629245e-05, "loss": 0.4441, "step": 90365 }, { "epoch": 3.2569286769740873, "grad_norm": 0.2362791895866394, "learning_rate": 1.4328635340508001e-05, "loss": 0.3203, "step": 90370 }, { "epoch": 3.257108876635312, "grad_norm": 0.29334524273872375, "learning_rate": 1.4325996485782039e-05, "loss": 0.3978, "step": 90375 }, { "epoch": 3.2572890762965367, "grad_norm": 0.20384852588176727, "learning_rate": 1.4323357776487307e-05, "loss": 0.3841, "step": 90380 }, { "epoch": 3.257469275957761, "grad_norm": 0.21409177780151367, "learning_rate": 1.4320719212659756e-05, "loss": 0.3804, "step": 90385 }, { "epoch": 3.2576494756189858, "grad_norm": 0.26546648144721985, "learning_rate": 1.431808079433534e-05, "loss": 0.3746, "step": 90390 }, { "epoch": 3.2578296752802105, "grad_norm": 0.24117311835289001, "learning_rate": 1.4315442521549993e-05, "loss": 0.3813, "step": 90395 }, { "epoch": 3.2580098749414352, "grad_norm": 0.2505226731300354, "learning_rate": 1.4312804394339686e-05, "loss": 0.3831, "step": 90400 }, { "epoch": 3.25819007460266, "grad_norm": 0.24122489988803864, "learning_rate": 1.431016641274035e-05, "loss": 0.3852, "step": 90405 }, { "epoch": 3.2583702742638843, "grad_norm": 0.2857030928134918, 
"learning_rate": 1.4307528576787927e-05, "loss": 0.3887, "step": 90410 }, { "epoch": 3.258550473925109, "grad_norm": 0.22176389396190643, "learning_rate": 1.4304890886518352e-05, "loss": 0.3954, "step": 90415 }, { "epoch": 3.2587306735863337, "grad_norm": 0.2822429835796356, "learning_rate": 1.4302253341967564e-05, "loss": 0.3836, "step": 90420 }, { "epoch": 3.2589108732475585, "grad_norm": 0.23664531111717224, "learning_rate": 1.4299615943171509e-05, "loss": 0.3671, "step": 90425 }, { "epoch": 3.2590910729087827, "grad_norm": 0.24214255809783936, "learning_rate": 1.429697869016613e-05, "loss": 0.404, "step": 90430 }, { "epoch": 3.2592712725700075, "grad_norm": 0.21557815372943878, "learning_rate": 1.429434158298732e-05, "loss": 0.3875, "step": 90435 }, { "epoch": 3.259451472231232, "grad_norm": 0.2581924498081207, "learning_rate": 1.4291704621671043e-05, "loss": 0.3645, "step": 90440 }, { "epoch": 3.259631671892457, "grad_norm": 0.23095859587192535, "learning_rate": 1.4289067806253208e-05, "loss": 0.3912, "step": 90445 }, { "epoch": 3.2598118715536817, "grad_norm": 0.25443336367607117, "learning_rate": 1.4286431136769774e-05, "loss": 0.3588, "step": 90450 }, { "epoch": 3.259992071214906, "grad_norm": 0.3058500289916992, "learning_rate": 1.428379461325663e-05, "loss": 0.4221, "step": 90455 }, { "epoch": 3.2601722708761307, "grad_norm": 0.2576063573360443, "learning_rate": 1.4281158235749698e-05, "loss": 0.3781, "step": 90460 }, { "epoch": 3.2603524705373554, "grad_norm": 0.25434017181396484, "learning_rate": 1.4278522004284919e-05, "loss": 0.3894, "step": 90465 }, { "epoch": 3.26053267019858, "grad_norm": 0.2770051956176758, "learning_rate": 1.4275885918898201e-05, "loss": 0.4269, "step": 90470 }, { "epoch": 3.2607128698598045, "grad_norm": 0.24559049308300018, "learning_rate": 1.4273249979625464e-05, "loss": 0.3713, "step": 90475 }, { "epoch": 3.260893069521029, "grad_norm": 0.24192962050437927, "learning_rate": 1.427061418650262e-05, "loss": 0.3708, "step": 90480 
}, { "epoch": 3.261073269182254, "grad_norm": 0.23208360373973846, "learning_rate": 1.4268505657256397e-05, "loss": 0.3847, "step": 90485 }, { "epoch": 3.2612534688434787, "grad_norm": 0.26153233647346497, "learning_rate": 1.4265870127293846e-05, "loss": 0.425, "step": 90490 }, { "epoch": 3.2614336685047034, "grad_norm": 0.22417205572128296, "learning_rate": 1.4263234743581744e-05, "loss": 0.396, "step": 90495 }, { "epoch": 3.2616138681659277, "grad_norm": 0.3092048764228821, "learning_rate": 1.4260599506156002e-05, "loss": 0.392, "step": 90500 }, { "epoch": 3.2616138681659277, "eval_loss": 0.4304320514202118, "eval_runtime": 3.5275, "eval_samples_per_second": 28.349, "eval_steps_per_second": 7.087, "step": 90500 }, { "epoch": 3.2617940678271524, "grad_norm": 0.19845503568649292, "learning_rate": 1.4257964415052491e-05, "loss": 0.3698, "step": 90505 }, { "epoch": 3.261974267488377, "grad_norm": 0.29068851470947266, "learning_rate": 1.4255329470307146e-05, "loss": 0.393, "step": 90510 }, { "epoch": 3.262154467149602, "grad_norm": 0.22075437009334564, "learning_rate": 1.4252694671955847e-05, "loss": 0.3644, "step": 90515 }, { "epoch": 3.262334666810826, "grad_norm": 0.2166365683078766, "learning_rate": 1.4250060020034511e-05, "loss": 0.362, "step": 90520 }, { "epoch": 3.262514866472051, "grad_norm": 0.23188582062721252, "learning_rate": 1.4247425514579038e-05, "loss": 0.3859, "step": 90525 }, { "epoch": 3.2626950661332756, "grad_norm": 0.1937790811061859, "learning_rate": 1.4244791155625289e-05, "loss": 0.3916, "step": 90530 }, { "epoch": 3.2628752657945004, "grad_norm": 0.20566551387310028, "learning_rate": 1.424215694320919e-05, "loss": 0.3681, "step": 90535 }, { "epoch": 3.263055465455725, "grad_norm": 0.21733631193637848, "learning_rate": 1.4239522877366618e-05, "loss": 0.3715, "step": 90540 }, { "epoch": 3.2632356651169494, "grad_norm": 0.23890431225299835, "learning_rate": 1.4236888958133464e-05, "loss": 0.3793, "step": 90545 }, { "epoch": 3.263415864778174, 
"grad_norm": 0.19045878946781158, "learning_rate": 1.4234255185545614e-05, "loss": 0.3516, "step": 90550 }, { "epoch": 3.263596064439399, "grad_norm": 0.27482134103775024, "learning_rate": 1.4231621559638941e-05, "loss": 0.3766, "step": 90555 }, { "epoch": 3.2637762641006236, "grad_norm": 0.20117837190628052, "learning_rate": 1.422898808044935e-05, "loss": 0.3557, "step": 90560 }, { "epoch": 3.263956463761848, "grad_norm": 0.30366307497024536, "learning_rate": 1.4226354748012716e-05, "loss": 0.4117, "step": 90565 }, { "epoch": 3.2641366634230726, "grad_norm": 0.23265069723129272, "learning_rate": 1.4223721562364908e-05, "loss": 0.3705, "step": 90570 }, { "epoch": 3.2643168630842974, "grad_norm": 0.2623595893383026, "learning_rate": 1.4221088523541809e-05, "loss": 0.3727, "step": 90575 }, { "epoch": 3.264497062745522, "grad_norm": 0.22619140148162842, "learning_rate": 1.421845563157928e-05, "loss": 0.4174, "step": 90580 }, { "epoch": 3.264677262406747, "grad_norm": 0.25267648696899414, "learning_rate": 1.421582288651322e-05, "loss": 0.395, "step": 90585 }, { "epoch": 3.2648574620679716, "grad_norm": 0.30874866247177124, "learning_rate": 1.421319028837948e-05, "loss": 0.4297, "step": 90590 }, { "epoch": 3.265037661729196, "grad_norm": 0.2350592315196991, "learning_rate": 1.421055783721394e-05, "loss": 0.3709, "step": 90595 }, { "epoch": 3.2652178613904206, "grad_norm": 0.22834807634353638, "learning_rate": 1.4207925533052455e-05, "loss": 0.3666, "step": 90600 }, { "epoch": 3.2653980610516453, "grad_norm": 0.21915757656097412, "learning_rate": 1.4205293375930898e-05, "loss": 0.3658, "step": 90605 }, { "epoch": 3.26557826071287, "grad_norm": 0.2512879967689514, "learning_rate": 1.4202661365885116e-05, "loss": 0.3719, "step": 90610 }, { "epoch": 3.2657584603740943, "grad_norm": 0.19401970505714417, "learning_rate": 1.4200029502951007e-05, "loss": 0.3696, "step": 90615 }, { "epoch": 3.265938660035319, "grad_norm": 0.2430254966020584, "learning_rate": 
1.419739778716438e-05, "loss": 0.3845, "step": 90620 }, { "epoch": 3.266118859696544, "grad_norm": 0.24256938695907593, "learning_rate": 1.4194766218561128e-05, "loss": 0.3866, "step": 90625 }, { "epoch": 3.2662990593577685, "grad_norm": 0.2600161135196686, "learning_rate": 1.4192134797177098e-05, "loss": 0.3928, "step": 90630 }, { "epoch": 3.2664792590189933, "grad_norm": 0.21937114000320435, "learning_rate": 1.4189503523048137e-05, "loss": 0.4063, "step": 90635 }, { "epoch": 3.2666594586802176, "grad_norm": 0.21056115627288818, "learning_rate": 1.4186872396210096e-05, "loss": 0.3868, "step": 90640 }, { "epoch": 3.2668396583414423, "grad_norm": 0.2408798485994339, "learning_rate": 1.4184241416698827e-05, "loss": 0.3905, "step": 90645 }, { "epoch": 3.267019858002667, "grad_norm": 0.2301928699016571, "learning_rate": 1.4181610584550165e-05, "loss": 0.4008, "step": 90650 }, { "epoch": 3.2672000576638918, "grad_norm": 0.21639235317707062, "learning_rate": 1.4178979899799971e-05, "loss": 0.3618, "step": 90655 }, { "epoch": 3.267380257325116, "grad_norm": 0.20622292160987854, "learning_rate": 1.4176349362484085e-05, "loss": 0.3987, "step": 90660 }, { "epoch": 3.267560456986341, "grad_norm": 0.24098491668701172, "learning_rate": 1.4173718972638339e-05, "loss": 0.4114, "step": 90665 }, { "epoch": 3.2677406566475655, "grad_norm": 0.2708558142185211, "learning_rate": 1.417108873029858e-05, "loss": 0.388, "step": 90670 }, { "epoch": 3.2679208563087903, "grad_norm": 0.24035510420799255, "learning_rate": 1.4168458635500628e-05, "loss": 0.3668, "step": 90675 }, { "epoch": 3.268101055970015, "grad_norm": 0.22504648566246033, "learning_rate": 1.416582868828034e-05, "loss": 0.4099, "step": 90680 }, { "epoch": 3.2682812556312393, "grad_norm": 0.21226103603839874, "learning_rate": 1.416319888867354e-05, "loss": 0.3926, "step": 90685 }, { "epoch": 3.268461455292464, "grad_norm": 0.2621418535709381, "learning_rate": 1.4160569236716059e-05, "loss": 0.4155, "step": 90690 }, { "epoch": 
3.2686416549536887, "grad_norm": 0.2505376636981964, "learning_rate": 1.415793973244372e-05, "loss": 0.3616, "step": 90695 }, { "epoch": 3.2688218546149135, "grad_norm": 0.24424095451831818, "learning_rate": 1.4155310375892345e-05, "loss": 0.39, "step": 90700 }, { "epoch": 3.2690020542761378, "grad_norm": 0.2086549997329712, "learning_rate": 1.4152681167097775e-05, "loss": 0.3737, "step": 90705 }, { "epoch": 3.2691822539373625, "grad_norm": 0.25878942012786865, "learning_rate": 1.4150052106095834e-05, "loss": 0.4075, "step": 90710 }, { "epoch": 3.2693624535985872, "grad_norm": 0.21511436998844147, "learning_rate": 1.4147423192922314e-05, "loss": 0.3487, "step": 90715 }, { "epoch": 3.269542653259812, "grad_norm": 0.27404505014419556, "learning_rate": 1.4144794427613061e-05, "loss": 0.3914, "step": 90720 }, { "epoch": 3.2697228529210367, "grad_norm": 0.24359211325645447, "learning_rate": 1.4142165810203881e-05, "loss": 0.3774, "step": 90725 }, { "epoch": 3.269903052582261, "grad_norm": 0.19888465106487274, "learning_rate": 1.413953734073059e-05, "loss": 0.3661, "step": 90730 }, { "epoch": 3.2700832522434857, "grad_norm": 0.250573992729187, "learning_rate": 1.4136909019229003e-05, "loss": 0.3367, "step": 90735 }, { "epoch": 3.2702634519047105, "grad_norm": 0.2471703588962555, "learning_rate": 1.4134280845734915e-05, "loss": 0.3536, "step": 90740 }, { "epoch": 3.270443651565935, "grad_norm": 0.2185695767402649, "learning_rate": 1.4131652820284158e-05, "loss": 0.39, "step": 90745 }, { "epoch": 3.2706238512271595, "grad_norm": 0.22518201172351837, "learning_rate": 1.4129024942912525e-05, "loss": 0.3999, "step": 90750 }, { "epoch": 3.270804050888384, "grad_norm": 0.23426784574985504, "learning_rate": 1.4126397213655824e-05, "loss": 0.3719, "step": 90755 }, { "epoch": 3.270984250549609, "grad_norm": 0.28812581300735474, "learning_rate": 1.4123769632549855e-05, "loss": 0.3956, "step": 90760 }, { "epoch": 3.2711644502108337, "grad_norm": 0.2103385478258133, "learning_rate": 
1.412114219963041e-05, "loss": 0.4033, "step": 90765 }, { "epoch": 3.2713446498720584, "grad_norm": 0.23068909347057343, "learning_rate": 1.4118514914933306e-05, "loss": 0.3719, "step": 90770 }, { "epoch": 3.2715248495332827, "grad_norm": 0.219300776720047, "learning_rate": 1.411588777849433e-05, "loss": 0.3917, "step": 90775 }, { "epoch": 3.2717050491945074, "grad_norm": 0.18937250971794128, "learning_rate": 1.4113260790349275e-05, "loss": 0.3667, "step": 90780 }, { "epoch": 3.271885248855732, "grad_norm": 0.24838852882385254, "learning_rate": 1.4110633950533936e-05, "loss": 0.3858, "step": 90785 }, { "epoch": 3.272065448516957, "grad_norm": 0.23256389796733856, "learning_rate": 1.4108007259084099e-05, "loss": 0.3784, "step": 90790 }, { "epoch": 3.272245648178181, "grad_norm": 0.2195148915052414, "learning_rate": 1.4105380716035544e-05, "loss": 0.3441, "step": 90795 }, { "epoch": 3.272425847839406, "grad_norm": 0.23615360260009766, "learning_rate": 1.4102754321424088e-05, "loss": 0.4046, "step": 90800 }, { "epoch": 3.2726060475006307, "grad_norm": 0.25502121448516846, "learning_rate": 1.4100128075285473e-05, "loss": 0.3555, "step": 90805 }, { "epoch": 3.2727862471618554, "grad_norm": 0.23027649521827698, "learning_rate": 1.4097501977655517e-05, "loss": 0.3715, "step": 90810 }, { "epoch": 3.27296644682308, "grad_norm": 0.25443360209465027, "learning_rate": 1.4094876028569983e-05, "loss": 0.3668, "step": 90815 }, { "epoch": 3.2731466464843044, "grad_norm": 0.28707796335220337, "learning_rate": 1.4092250228064643e-05, "loss": 0.4048, "step": 90820 }, { "epoch": 3.273326846145529, "grad_norm": 0.2626727521419525, "learning_rate": 1.4089624576175301e-05, "loss": 0.4063, "step": 90825 }, { "epoch": 3.273507045806754, "grad_norm": 0.2237686812877655, "learning_rate": 1.4086999072937702e-05, "loss": 0.3797, "step": 90830 }, { "epoch": 3.2736872454679786, "grad_norm": 0.20242911577224731, "learning_rate": 1.4084373718387617e-05, "loss": 0.3658, "step": 90835 }, { "epoch": 
3.273867445129203, "grad_norm": 0.18086695671081543, "learning_rate": 1.4081748512560838e-05, "loss": 0.3622, "step": 90840 }, { "epoch": 3.2740476447904276, "grad_norm": 0.269579142332077, "learning_rate": 1.4079123455493121e-05, "loss": 0.4093, "step": 90845 }, { "epoch": 3.2742278444516524, "grad_norm": 0.20566615462303162, "learning_rate": 1.4076498547220234e-05, "loss": 0.3837, "step": 90850 }, { "epoch": 3.274408044112877, "grad_norm": 0.22256791591644287, "learning_rate": 1.4073873787777936e-05, "loss": 0.3359, "step": 90855 }, { "epoch": 3.274588243774102, "grad_norm": 0.2186269313097, "learning_rate": 1.4071249177201984e-05, "loss": 0.372, "step": 90860 }, { "epoch": 3.2747684434353266, "grad_norm": 0.21954451501369476, "learning_rate": 1.4068624715528158e-05, "loss": 0.3913, "step": 90865 }, { "epoch": 3.274948643096551, "grad_norm": 0.2596687972545624, "learning_rate": 1.40660004027922e-05, "loss": 0.371, "step": 90870 }, { "epoch": 3.2751288427577756, "grad_norm": 0.2262258678674698, "learning_rate": 1.4063376239029875e-05, "loss": 0.4175, "step": 90875 }, { "epoch": 3.2753090424190003, "grad_norm": 0.26203519105911255, "learning_rate": 1.406075222427693e-05, "loss": 0.3837, "step": 90880 }, { "epoch": 3.275489242080225, "grad_norm": 0.20488256216049194, "learning_rate": 1.4058128358569106e-05, "loss": 0.3964, "step": 90885 }, { "epoch": 3.2756694417414494, "grad_norm": 0.23428840935230255, "learning_rate": 1.4055504641942175e-05, "loss": 0.3849, "step": 90890 }, { "epoch": 3.275849641402674, "grad_norm": 0.23843036592006683, "learning_rate": 1.4052881074431884e-05, "loss": 0.3872, "step": 90895 }, { "epoch": 3.276029841063899, "grad_norm": 0.22145730257034302, "learning_rate": 1.4050257656073947e-05, "loss": 0.4031, "step": 90900 }, { "epoch": 3.2762100407251236, "grad_norm": 0.2753254175186157, "learning_rate": 1.4047634386904142e-05, "loss": 0.3998, "step": 90905 }, { "epoch": 3.2763902403863483, "grad_norm": 0.18128567934036255, "learning_rate": 
1.4045011266958197e-05, "loss": 0.4172, "step": 90910 }, { "epoch": 3.2765704400475726, "grad_norm": 0.2414032369852066, "learning_rate": 1.4042388296271856e-05, "loss": 0.3816, "step": 90915 }, { "epoch": 3.2767506397087973, "grad_norm": 0.21249370276927948, "learning_rate": 1.4039765474880851e-05, "loss": 0.3528, "step": 90920 }, { "epoch": 3.276930839370022, "grad_norm": 0.238897442817688, "learning_rate": 1.4037142802820907e-05, "loss": 0.4205, "step": 90925 }, { "epoch": 3.277111039031247, "grad_norm": 0.2670360803604126, "learning_rate": 1.4034520280127782e-05, "loss": 0.3664, "step": 90930 }, { "epoch": 3.277291238692471, "grad_norm": 0.20420727133750916, "learning_rate": 1.4031897906837194e-05, "loss": 0.3831, "step": 90935 }, { "epoch": 3.277471438353696, "grad_norm": 0.2537064850330353, "learning_rate": 1.4029275682984878e-05, "loss": 0.3979, "step": 90940 }, { "epoch": 3.2776516380149205, "grad_norm": 0.3141496181488037, "learning_rate": 1.402665360860655e-05, "loss": 0.3572, "step": 90945 }, { "epoch": 3.2778318376761453, "grad_norm": 0.18175874650478363, "learning_rate": 1.4024031683737937e-05, "loss": 0.387, "step": 90950 }, { "epoch": 3.27801203733737, "grad_norm": 0.2895739674568176, "learning_rate": 1.4021409908414773e-05, "loss": 0.367, "step": 90955 }, { "epoch": 3.2781922369985943, "grad_norm": 0.21518366038799286, "learning_rate": 1.401878828267278e-05, "loss": 0.3816, "step": 90960 }, { "epoch": 3.278372436659819, "grad_norm": 0.25440242886543274, "learning_rate": 1.4016166806547664e-05, "loss": 0.3703, "step": 90965 }, { "epoch": 3.2785526363210438, "grad_norm": 0.271392822265625, "learning_rate": 1.4013545480075153e-05, "loss": 0.4075, "step": 90970 }, { "epoch": 3.2787328359822685, "grad_norm": 0.22662685811519623, "learning_rate": 1.4010924303290957e-05, "loss": 0.3931, "step": 90975 }, { "epoch": 3.278913035643493, "grad_norm": 0.24230170249938965, "learning_rate": 1.4008303276230777e-05, "loss": 0.378, "step": 90980 }, { "epoch": 
3.2790932353047175, "grad_norm": 0.2077885866165161, "learning_rate": 1.4005682398930347e-05, "loss": 0.3544, "step": 90985 }, { "epoch": 3.2792734349659423, "grad_norm": 0.22285114228725433, "learning_rate": 1.4003061671425368e-05, "loss": 0.3823, "step": 90990 }, { "epoch": 3.279453634627167, "grad_norm": 0.22684328258037567, "learning_rate": 1.4000441093751546e-05, "loss": 0.3657, "step": 90995 }, { "epoch": 3.2796338342883917, "grad_norm": 0.2049197107553482, "learning_rate": 1.3997820665944584e-05, "loss": 0.4181, "step": 91000 }, { "epoch": 3.2796338342883917, "eval_loss": 0.4301852881908417, "eval_runtime": 3.5299, "eval_samples_per_second": 28.329, "eval_steps_per_second": 7.082, "step": 91000 }, { "epoch": 3.279814033949616, "grad_norm": 0.22128911316394806, "learning_rate": 1.3995200388040172e-05, "loss": 0.3903, "step": 91005 }, { "epoch": 3.2799942336108407, "grad_norm": 0.3121122717857361, "learning_rate": 1.3992580260074046e-05, "loss": 0.4014, "step": 91010 }, { "epoch": 3.2801744332720655, "grad_norm": 0.25039294362068176, "learning_rate": 1.3989960282081874e-05, "loss": 0.3716, "step": 91015 }, { "epoch": 3.28035463293329, "grad_norm": 0.20780634880065918, "learning_rate": 1.398734045409935e-05, "loss": 0.3885, "step": 91020 }, { "epoch": 3.2805348325945145, "grad_norm": 0.204863503575325, "learning_rate": 1.3984720776162192e-05, "loss": 0.3911, "step": 91025 }, { "epoch": 3.2807150322557392, "grad_norm": 0.24138136208057404, "learning_rate": 1.398210124830608e-05, "loss": 0.3948, "step": 91030 }, { "epoch": 3.280895231916964, "grad_norm": 0.21834784746170044, "learning_rate": 1.3979481870566703e-05, "loss": 0.3692, "step": 91035 }, { "epoch": 3.2810754315781887, "grad_norm": 0.20662029087543488, "learning_rate": 1.3976862642979755e-05, "loss": 0.4021, "step": 91040 }, { "epoch": 3.2812556312394134, "grad_norm": 0.2747882604598999, "learning_rate": 1.3974243565580907e-05, "loss": 0.355, "step": 91045 }, { "epoch": 3.2814358309006377, "grad_norm": 
0.2811215817928314, "learning_rate": 1.3971624638405867e-05, "loss": 0.3675, "step": 91050 }, { "epoch": 3.2816160305618625, "grad_norm": 0.3407902121543884, "learning_rate": 1.3969005861490305e-05, "loss": 0.429, "step": 91055 }, { "epoch": 3.281796230223087, "grad_norm": 0.1697690635919571, "learning_rate": 1.3966387234869904e-05, "loss": 0.3446, "step": 91060 }, { "epoch": 3.281976429884312, "grad_norm": 0.20448936522006989, "learning_rate": 1.3963768758580342e-05, "loss": 0.3826, "step": 91065 }, { "epoch": 3.282156629545536, "grad_norm": 0.2799994647502899, "learning_rate": 1.3961150432657283e-05, "loss": 0.3621, "step": 91070 }, { "epoch": 3.282336829206761, "grad_norm": 0.21096967160701752, "learning_rate": 1.395853225713642e-05, "loss": 0.3655, "step": 91075 }, { "epoch": 3.2825170288679857, "grad_norm": 0.23864832520484924, "learning_rate": 1.395591423205343e-05, "loss": 0.3979, "step": 91080 }, { "epoch": 3.2826972285292104, "grad_norm": 0.2114400416612625, "learning_rate": 1.3953296357443946e-05, "loss": 0.4144, "step": 91085 }, { "epoch": 3.282877428190435, "grad_norm": 0.2655703127384186, "learning_rate": 1.3950678633343675e-05, "loss": 0.3678, "step": 91090 }, { "epoch": 3.28305762785166, "grad_norm": 0.24133309721946716, "learning_rate": 1.3948061059788267e-05, "loss": 0.3951, "step": 91095 }, { "epoch": 3.283237827512884, "grad_norm": 0.22033952176570892, "learning_rate": 1.3945443636813376e-05, "loss": 0.3699, "step": 91100 }, { "epoch": 3.283418027174109, "grad_norm": 0.23784691095352173, "learning_rate": 1.3942826364454697e-05, "loss": 0.4171, "step": 91105 }, { "epoch": 3.2835982268353336, "grad_norm": 0.17935135960578918, "learning_rate": 1.3940209242747847e-05, "loss": 0.3836, "step": 91110 }, { "epoch": 3.2837784264965584, "grad_norm": 0.2037777453660965, "learning_rate": 1.3937592271728517e-05, "loss": 0.3883, "step": 91115 }, { "epoch": 3.2839586261577827, "grad_norm": 0.24445310235023499, "learning_rate": 1.3934975451432346e-05, "loss": 
0.3805, "step": 91120 }, { "epoch": 3.2841388258190074, "grad_norm": 0.24390850961208344, "learning_rate": 1.3932358781894996e-05, "loss": 0.3939, "step": 91125 }, { "epoch": 3.284319025480232, "grad_norm": 0.19269582629203796, "learning_rate": 1.3929742263152115e-05, "loss": 0.3994, "step": 91130 }, { "epoch": 3.284499225141457, "grad_norm": 0.2648945748806, "learning_rate": 1.3927125895239352e-05, "loss": 0.3578, "step": 91135 }, { "epoch": 3.2846794248026816, "grad_norm": 0.18639974296092987, "learning_rate": 1.3924509678192343e-05, "loss": 0.3366, "step": 91140 }, { "epoch": 3.284859624463906, "grad_norm": 0.1827782690525055, "learning_rate": 1.3921893612046757e-05, "loss": 0.3588, "step": 91145 }, { "epoch": 3.2850398241251306, "grad_norm": 0.20156405866146088, "learning_rate": 1.3919277696838227e-05, "loss": 0.3694, "step": 91150 }, { "epoch": 3.2852200237863554, "grad_norm": 0.1734452098608017, "learning_rate": 1.391666193260239e-05, "loss": 0.3947, "step": 91155 }, { "epoch": 3.28540022344758, "grad_norm": 0.2353818714618683, "learning_rate": 1.3914046319374891e-05, "loss": 0.3996, "step": 91160 }, { "epoch": 3.2855804231088044, "grad_norm": 0.23739945888519287, "learning_rate": 1.3911430857191351e-05, "loss": 0.3798, "step": 91165 }, { "epoch": 3.285760622770029, "grad_norm": 0.20489168167114258, "learning_rate": 1.3908815546087434e-05, "loss": 0.3993, "step": 91170 }, { "epoch": 3.285940822431254, "grad_norm": 0.22435033321380615, "learning_rate": 1.3906200386098753e-05, "loss": 0.3806, "step": 91175 }, { "epoch": 3.2861210220924786, "grad_norm": 0.227759450674057, "learning_rate": 1.3903585377260947e-05, "loss": 0.3604, "step": 91180 }, { "epoch": 3.2863012217537033, "grad_norm": 0.23122897744178772, "learning_rate": 1.390097051960964e-05, "loss": 0.3973, "step": 91185 }, { "epoch": 3.2864814214149276, "grad_norm": 0.2522181570529938, "learning_rate": 1.3898355813180453e-05, "loss": 0.3383, "step": 91190 }, { "epoch": 3.2866616210761523, "grad_norm": 
0.3048706650733948, "learning_rate": 1.3895741258009038e-05, "loss": 0.3729, "step": 91195 }, { "epoch": 3.286841820737377, "grad_norm": 0.20635949075222015, "learning_rate": 1.3893126854130985e-05, "loss": 0.3899, "step": 91200 }, { "epoch": 3.287022020398602, "grad_norm": 0.2326965183019638, "learning_rate": 1.3890512601581923e-05, "loss": 0.3528, "step": 91205 }, { "epoch": 3.287202220059826, "grad_norm": 0.22335182130336761, "learning_rate": 1.3887898500397484e-05, "loss": 0.3865, "step": 91210 }, { "epoch": 3.287382419721051, "grad_norm": 0.2084750384092331, "learning_rate": 1.3885284550613264e-05, "loss": 0.3724, "step": 91215 }, { "epoch": 3.2875626193822756, "grad_norm": 0.18253043293952942, "learning_rate": 1.3882670752264915e-05, "loss": 0.3941, "step": 91220 }, { "epoch": 3.2877428190435003, "grad_norm": 0.20223382115364075, "learning_rate": 1.388005710538801e-05, "loss": 0.3569, "step": 91225 }, { "epoch": 3.287923018704725, "grad_norm": 0.20182040333747864, "learning_rate": 1.3877443610018168e-05, "loss": 0.3787, "step": 91230 }, { "epoch": 3.2881032183659493, "grad_norm": 0.2429923266172409, "learning_rate": 1.3874830266191014e-05, "loss": 0.3608, "step": 91235 }, { "epoch": 3.288283418027174, "grad_norm": 0.23385964334011078, "learning_rate": 1.387221707394214e-05, "loss": 0.3892, "step": 91240 }, { "epoch": 3.288463617688399, "grad_norm": 0.20496410131454468, "learning_rate": 1.3869604033307154e-05, "loss": 0.3716, "step": 91245 }, { "epoch": 3.2886438173496235, "grad_norm": 0.20968787372112274, "learning_rate": 1.3866991144321661e-05, "loss": 0.3497, "step": 91250 }, { "epoch": 3.288824017010848, "grad_norm": 0.2523500323295593, "learning_rate": 1.3864378407021244e-05, "loss": 0.3817, "step": 91255 }, { "epoch": 3.2890042166720725, "grad_norm": 0.1904435008764267, "learning_rate": 1.386176582144153e-05, "loss": 0.3555, "step": 91260 }, { "epoch": 3.2891844163332973, "grad_norm": 0.2404657006263733, "learning_rate": 1.3859153387618098e-05, "loss": 
0.3649, "step": 91265 }, { "epoch": 3.289364615994522, "grad_norm": 0.23486708104610443, "learning_rate": 1.3856541105586545e-05, "loss": 0.3715, "step": 91270 }, { "epoch": 3.2895448156557467, "grad_norm": 0.20871885120868683, "learning_rate": 1.3853928975382464e-05, "loss": 0.3904, "step": 91275 }, { "epoch": 3.289725015316971, "grad_norm": 0.23163028061389923, "learning_rate": 1.3851316997041438e-05, "loss": 0.3893, "step": 91280 }, { "epoch": 3.2899052149781958, "grad_norm": 0.32008373737335205, "learning_rate": 1.3848705170599053e-05, "loss": 0.3878, "step": 91285 }, { "epoch": 3.2900854146394205, "grad_norm": 0.26068592071533203, "learning_rate": 1.3846093496090918e-05, "loss": 0.3768, "step": 91290 }, { "epoch": 3.2902656143006452, "grad_norm": 0.2309846431016922, "learning_rate": 1.384348197355258e-05, "loss": 0.4094, "step": 91295 }, { "epoch": 3.2904458139618695, "grad_norm": 0.250918447971344, "learning_rate": 1.3840870603019655e-05, "loss": 0.3681, "step": 91300 }, { "epoch": 3.2906260136230943, "grad_norm": 0.22328318655490875, "learning_rate": 1.3838259384527702e-05, "loss": 0.3665, "step": 91305 }, { "epoch": 3.290806213284319, "grad_norm": 0.25075024366378784, "learning_rate": 1.3835648318112307e-05, "loss": 0.3973, "step": 91310 }, { "epoch": 3.2909864129455437, "grad_norm": 0.20350109040737152, "learning_rate": 1.383303740380904e-05, "loss": 0.3423, "step": 91315 }, { "epoch": 3.2911666126067685, "grad_norm": 0.2606266438961029, "learning_rate": 1.383042664165348e-05, "loss": 0.3894, "step": 91320 }, { "epoch": 3.2913468122679927, "grad_norm": 0.20949092507362366, "learning_rate": 1.3827816031681182e-05, "loss": 0.4185, "step": 91325 }, { "epoch": 3.2915270119292175, "grad_norm": 0.23206953704357147, "learning_rate": 1.3825205573927736e-05, "loss": 0.3802, "step": 91330 }, { "epoch": 3.291707211590442, "grad_norm": 0.21940544247627258, "learning_rate": 1.3822595268428703e-05, "loss": 0.3655, "step": 91335 }, { "epoch": 3.291887411251667, 
"grad_norm": 0.1804596483707428, "learning_rate": 1.3819985115219644e-05, "loss": 0.3701, "step": 91340 }, { "epoch": 3.2920676109128912, "grad_norm": 0.22188700735569, "learning_rate": 1.3817375114336123e-05, "loss": 0.3908, "step": 91345 }, { "epoch": 3.292247810574116, "grad_norm": 0.2676689326763153, "learning_rate": 1.3814765265813695e-05, "loss": 0.4065, "step": 91350 }, { "epoch": 3.2924280102353407, "grad_norm": 0.20868746936321259, "learning_rate": 1.381215556968793e-05, "loss": 0.3471, "step": 91355 }, { "epoch": 3.2926082098965654, "grad_norm": 0.21724528074264526, "learning_rate": 1.380954602599438e-05, "loss": 0.382, "step": 91360 }, { "epoch": 3.29278840955779, "grad_norm": 0.21302174031734467, "learning_rate": 1.3806936634768603e-05, "loss": 0.37, "step": 91365 }, { "epoch": 3.292968609219015, "grad_norm": 0.21098196506500244, "learning_rate": 1.3804327396046143e-05, "loss": 0.3938, "step": 91370 }, { "epoch": 3.293148808880239, "grad_norm": 0.20903551578521729, "learning_rate": 1.3801718309862546e-05, "loss": 0.3812, "step": 91375 }, { "epoch": 3.293329008541464, "grad_norm": 0.2137085646390915, "learning_rate": 1.3799109376253378e-05, "loss": 0.3976, "step": 91380 }, { "epoch": 3.2935092082026887, "grad_norm": 0.2167220115661621, "learning_rate": 1.3796500595254187e-05, "loss": 0.3906, "step": 91385 }, { "epoch": 3.2936894078639134, "grad_norm": 0.19560924172401428, "learning_rate": 1.3793891966900488e-05, "loss": 0.3904, "step": 91390 }, { "epoch": 3.2938696075251377, "grad_norm": 0.24281176924705505, "learning_rate": 1.379128349122785e-05, "loss": 0.3647, "step": 91395 }, { "epoch": 3.2940498071863624, "grad_norm": 0.2524743676185608, "learning_rate": 1.3788675168271791e-05, "loss": 0.3615, "step": 91400 }, { "epoch": 3.294230006847587, "grad_norm": 0.2531737983226776, "learning_rate": 1.3786066998067887e-05, "loss": 0.3995, "step": 91405 }, { "epoch": 3.294410206508812, "grad_norm": 0.2063448578119278, "learning_rate": 1.3783458980651637e-05, 
"loss": 0.3758, "step": 91410 }, { "epoch": 3.2945904061700366, "grad_norm": 0.23044107854366302, "learning_rate": 1.3780851116058579e-05, "loss": 0.3673, "step": 91415 }, { "epoch": 3.294770605831261, "grad_norm": 0.23507073521614075, "learning_rate": 1.377824340432426e-05, "loss": 0.3559, "step": 91420 }, { "epoch": 3.2949508054924856, "grad_norm": 0.22942350804805756, "learning_rate": 1.3775635845484203e-05, "loss": 0.3899, "step": 91425 }, { "epoch": 3.2951310051537104, "grad_norm": 0.282632976770401, "learning_rate": 1.3773028439573935e-05, "loss": 0.3621, "step": 91430 }, { "epoch": 3.295311204814935, "grad_norm": 0.2871980369091034, "learning_rate": 1.3770421186628979e-05, "loss": 0.3672, "step": 91435 }, { "epoch": 3.2954914044761594, "grad_norm": 0.27828335762023926, "learning_rate": 1.3767814086684853e-05, "loss": 0.4216, "step": 91440 }, { "epoch": 3.295671604137384, "grad_norm": 0.268162339925766, "learning_rate": 1.3765207139777091e-05, "loss": 0.4116, "step": 91445 }, { "epoch": 3.295851803798609, "grad_norm": 0.20781943202018738, "learning_rate": 1.3762600345941213e-05, "loss": 0.3331, "step": 91450 }, { "epoch": 3.2960320034598336, "grad_norm": 0.3062985837459564, "learning_rate": 1.3759993705212726e-05, "loss": 0.392, "step": 91455 }, { "epoch": 3.2962122031210583, "grad_norm": 0.1981145143508911, "learning_rate": 1.3757387217627146e-05, "loss": 0.3496, "step": 91460 }, { "epoch": 3.2963924027822826, "grad_norm": 0.2891653776168823, "learning_rate": 1.3754780883219992e-05, "loss": 0.4288, "step": 91465 }, { "epoch": 3.2965726024435074, "grad_norm": 0.23202994465827942, "learning_rate": 1.375217470202676e-05, "loss": 0.4005, "step": 91470 }, { "epoch": 3.296752802104732, "grad_norm": 0.26071614027023315, "learning_rate": 1.374956867408299e-05, "loss": 0.4132, "step": 91475 }, { "epoch": 3.296933001765957, "grad_norm": 0.22965455055236816, "learning_rate": 1.3746962799424151e-05, "loss": 0.3659, "step": 91480 }, { "epoch": 3.297113201427181, 
"grad_norm": 0.19163905084133148, "learning_rate": 1.3744357078085773e-05, "loss": 0.3467, "step": 91485 }, { "epoch": 3.297293401088406, "grad_norm": 0.21488019824028015, "learning_rate": 1.3741751510103352e-05, "loss": 0.3741, "step": 91490 }, { "epoch": 3.2974736007496306, "grad_norm": 0.19963303208351135, "learning_rate": 1.3739146095512383e-05, "loss": 0.3876, "step": 91495 }, { "epoch": 3.2976538004108553, "grad_norm": 0.18200761079788208, "learning_rate": 1.3736540834348372e-05, "loss": 0.3782, "step": 91500 }, { "epoch": 3.2976538004108553, "eval_loss": 0.430542528629303, "eval_runtime": 3.5303, "eval_samples_per_second": 28.326, "eval_steps_per_second": 7.081, "step": 91500 }, { "epoch": 3.29783400007208, "grad_norm": 0.2648855149745941, "learning_rate": 1.3733935726646812e-05, "loss": 0.3551, "step": 91505 }, { "epoch": 3.2980141997333043, "grad_norm": 0.26277956366539, "learning_rate": 1.3731330772443186e-05, "loss": 0.4238, "step": 91510 }, { "epoch": 3.298194399394529, "grad_norm": 0.2899339199066162, "learning_rate": 1.3728725971773004e-05, "loss": 0.3986, "step": 91515 }, { "epoch": 3.298374599055754, "grad_norm": 0.23275549709796906, "learning_rate": 1.372612132467175e-05, "loss": 0.3944, "step": 91520 }, { "epoch": 3.2985547987169785, "grad_norm": 0.3062422573566437, "learning_rate": 1.3723516831174912e-05, "loss": 0.4204, "step": 91525 }, { "epoch": 3.298734998378203, "grad_norm": 0.17250539362430573, "learning_rate": 1.3720912491317972e-05, "loss": 0.3552, "step": 91530 }, { "epoch": 3.2989151980394276, "grad_norm": 0.17668567597866058, "learning_rate": 1.3718308305136407e-05, "loss": 0.3811, "step": 91535 }, { "epoch": 3.2990953977006523, "grad_norm": 0.267254501581192, "learning_rate": 1.3715704272665717e-05, "loss": 0.3756, "step": 91540 }, { "epoch": 3.299275597361877, "grad_norm": 0.23662297427654266, "learning_rate": 1.3713100393941369e-05, "loss": 0.366, "step": 91545 }, { "epoch": 3.2994557970231018, "grad_norm": 0.21058867871761322, 
"learning_rate": 1.3710496668998845e-05, "loss": 0.3798, "step": 91550 }, { "epoch": 3.299635996684326, "grad_norm": 0.22281815111637115, "learning_rate": 1.3707893097873619e-05, "loss": 0.3874, "step": 91555 }, { "epoch": 3.299816196345551, "grad_norm": 0.29249441623687744, "learning_rate": 1.3705289680601152e-05, "loss": 0.3828, "step": 91560 }, { "epoch": 3.2999963960067755, "grad_norm": 0.2529570460319519, "learning_rate": 1.3702686417216937e-05, "loss": 0.3894, "step": 91565 }, { "epoch": 3.3001765956680003, "grad_norm": 0.23126383125782013, "learning_rate": 1.3700083307756443e-05, "loss": 0.3985, "step": 91570 }, { "epoch": 3.3003567953292245, "grad_norm": 0.21352465450763702, "learning_rate": 1.3697480352255105e-05, "loss": 0.3434, "step": 91575 }, { "epoch": 3.3005369949904493, "grad_norm": 0.2859989404678345, "learning_rate": 1.3694877550748419e-05, "loss": 0.3767, "step": 91580 }, { "epoch": 3.300717194651674, "grad_norm": 0.2089170217514038, "learning_rate": 1.3692274903271828e-05, "loss": 0.3966, "step": 91585 }, { "epoch": 3.3008973943128987, "grad_norm": 0.24411603808403015, "learning_rate": 1.368967240986082e-05, "loss": 0.3803, "step": 91590 }, { "epoch": 3.3010775939741235, "grad_norm": 0.2391398698091507, "learning_rate": 1.3687070070550829e-05, "loss": 0.361, "step": 91595 }, { "epoch": 3.301257793635348, "grad_norm": 0.23561078310012817, "learning_rate": 1.3684467885377306e-05, "loss": 0.373, "step": 91600 }, { "epoch": 3.3014379932965725, "grad_norm": 0.20145799219608307, "learning_rate": 1.3681865854375728e-05, "loss": 0.3838, "step": 91605 }, { "epoch": 3.3016181929577972, "grad_norm": 0.26969730854034424, "learning_rate": 1.3679263977581536e-05, "loss": 0.388, "step": 91610 }, { "epoch": 3.301798392619022, "grad_norm": 0.28870394825935364, "learning_rate": 1.3676662255030182e-05, "loss": 0.4321, "step": 91615 }, { "epoch": 3.3019785922802467, "grad_norm": 0.2489212453365326, "learning_rate": 1.3674060686757113e-05, "loss": 0.3873, "step": 
91620 }, { "epoch": 3.302158791941471, "grad_norm": 0.2643071115016937, "learning_rate": 1.367145927279776e-05, "loss": 0.3973, "step": 91625 }, { "epoch": 3.3023389916026957, "grad_norm": 0.25098204612731934, "learning_rate": 1.3668858013187597e-05, "loss": 0.3781, "step": 91630 }, { "epoch": 3.3025191912639205, "grad_norm": 0.20655451714992523, "learning_rate": 1.3666256907962044e-05, "loss": 0.3913, "step": 91635 }, { "epoch": 3.302699390925145, "grad_norm": 0.20496463775634766, "learning_rate": 1.366365595715655e-05, "loss": 0.3778, "step": 91640 }, { "epoch": 3.30287959058637, "grad_norm": 0.2476882040500641, "learning_rate": 1.3661055160806546e-05, "loss": 0.398, "step": 91645 }, { "epoch": 3.303059790247594, "grad_norm": 0.20680633187294006, "learning_rate": 1.3658454518947475e-05, "loss": 0.353, "step": 91650 }, { "epoch": 3.303239989908819, "grad_norm": 0.21282508969306946, "learning_rate": 1.3655854031614751e-05, "loss": 0.3972, "step": 91655 }, { "epoch": 3.3034201895700437, "grad_norm": 0.2705020010471344, "learning_rate": 1.3653253698843844e-05, "loss": 0.3935, "step": 91660 }, { "epoch": 3.3036003892312684, "grad_norm": 0.32260480523109436, "learning_rate": 1.3650653520670134e-05, "loss": 0.3799, "step": 91665 }, { "epoch": 3.3037805888924927, "grad_norm": 0.2574411928653717, "learning_rate": 1.3648053497129082e-05, "loss": 0.4028, "step": 91670 }, { "epoch": 3.3039607885537174, "grad_norm": 0.22409643232822418, "learning_rate": 1.3645453628256105e-05, "loss": 0.3615, "step": 91675 }, { "epoch": 3.304140988214942, "grad_norm": 0.2090173065662384, "learning_rate": 1.3642853914086617e-05, "loss": 0.373, "step": 91680 }, { "epoch": 3.304321187876167, "grad_norm": 0.1917799711227417, "learning_rate": 1.3640254354656062e-05, "loss": 0.3696, "step": 91685 }, { "epoch": 3.3045013875373916, "grad_norm": 0.2533906400203705, "learning_rate": 1.3637654949999833e-05, "loss": 0.4081, "step": 91690 }, { "epoch": 3.304681587198616, "grad_norm": 0.19297879934310913, 
"learning_rate": 1.3635055700153346e-05, "loss": 0.3785, "step": 91695 }, { "epoch": 3.3048617868598407, "grad_norm": 0.20561909675598145, "learning_rate": 1.3632456605152033e-05, "loss": 0.3966, "step": 91700 }, { "epoch": 3.3050419865210654, "grad_norm": 0.22225363552570343, "learning_rate": 1.36298576650313e-05, "loss": 0.4088, "step": 91705 }, { "epoch": 3.30522218618229, "grad_norm": 0.23833619058132172, "learning_rate": 1.3627258879826554e-05, "loss": 0.3973, "step": 91710 }, { "epoch": 3.3054023858435144, "grad_norm": 0.23555156588554382, "learning_rate": 1.3624660249573207e-05, "loss": 0.3813, "step": 91715 }, { "epoch": 3.305582585504739, "grad_norm": 0.21567942202091217, "learning_rate": 1.3622061774306647e-05, "loss": 0.381, "step": 91720 }, { "epoch": 3.305762785165964, "grad_norm": 0.2811535894870758, "learning_rate": 1.3619463454062304e-05, "loss": 0.3864, "step": 91725 }, { "epoch": 3.3059429848271886, "grad_norm": 0.23214657604694366, "learning_rate": 1.3616865288875568e-05, "loss": 0.3726, "step": 91730 }, { "epoch": 3.3061231844884134, "grad_norm": 0.2762070596218109, "learning_rate": 1.3614267278781839e-05, "loss": 0.3761, "step": 91735 }, { "epoch": 3.3063033841496376, "grad_norm": 0.21005326509475708, "learning_rate": 1.3611669423816514e-05, "loss": 0.3974, "step": 91740 }, { "epoch": 3.3064835838108624, "grad_norm": 0.2921294569969177, "learning_rate": 1.360907172401498e-05, "loss": 0.3958, "step": 91745 }, { "epoch": 3.306663783472087, "grad_norm": 0.24011696875095367, "learning_rate": 1.3606474179412645e-05, "loss": 0.3846, "step": 91750 }, { "epoch": 3.306843983133312, "grad_norm": 0.2656587064266205, "learning_rate": 1.3603876790044906e-05, "loss": 0.3611, "step": 91755 }, { "epoch": 3.307024182794536, "grad_norm": 0.2405286282300949, "learning_rate": 1.360127955594712e-05, "loss": 0.3816, "step": 91760 }, { "epoch": 3.307204382455761, "grad_norm": 0.23016515374183655, "learning_rate": 1.3598682477154701e-05, "loss": 0.3776, "step": 91765 
}, { "epoch": 3.3073845821169856, "grad_norm": 0.24398526549339294, "learning_rate": 1.3596085553703014e-05, "loss": 0.3715, "step": 91770 }, { "epoch": 3.3075647817782103, "grad_norm": 0.25272274017333984, "learning_rate": 1.3593488785627478e-05, "loss": 0.3561, "step": 91775 }, { "epoch": 3.307744981439435, "grad_norm": 0.22344271838665009, "learning_rate": 1.3590892172963437e-05, "loss": 0.3864, "step": 91780 }, { "epoch": 3.3079251811006594, "grad_norm": 0.2128763496875763, "learning_rate": 1.3588295715746272e-05, "loss": 0.3617, "step": 91785 }, { "epoch": 3.308105380761884, "grad_norm": 0.24568325281143188, "learning_rate": 1.3585699414011376e-05, "loss": 0.3945, "step": 91790 }, { "epoch": 3.308285580423109, "grad_norm": 0.230985626578331, "learning_rate": 1.3583103267794117e-05, "loss": 0.3645, "step": 91795 }, { "epoch": 3.3084657800843336, "grad_norm": 0.25532665848731995, "learning_rate": 1.3580507277129867e-05, "loss": 0.3828, "step": 91800 }, { "epoch": 3.308645979745558, "grad_norm": 0.26051023602485657, "learning_rate": 1.3577911442053992e-05, "loss": 0.377, "step": 91805 }, { "epoch": 3.3088261794067826, "grad_norm": 0.22641240060329437, "learning_rate": 1.3575315762601853e-05, "loss": 0.3672, "step": 91810 }, { "epoch": 3.3090063790680073, "grad_norm": 0.29088813066482544, "learning_rate": 1.357272023880884e-05, "loss": 0.3662, "step": 91815 }, { "epoch": 3.309186578729232, "grad_norm": 0.24642913043498993, "learning_rate": 1.3570124870710293e-05, "loss": 0.3872, "step": 91820 }, { "epoch": 3.309366778390457, "grad_norm": 0.20131155848503113, "learning_rate": 1.3567529658341587e-05, "loss": 0.3784, "step": 91825 }, { "epoch": 3.309546978051681, "grad_norm": 0.2340930700302124, "learning_rate": 1.3564934601738075e-05, "loss": 0.409, "step": 91830 }, { "epoch": 3.309727177712906, "grad_norm": 0.22841165959835052, "learning_rate": 1.3562339700935114e-05, "loss": 0.3586, "step": 91835 }, { "epoch": 3.3099073773741305, "grad_norm": 0.2462332546710968, 
"learning_rate": 1.355974495596805e-05, "loss": 0.4212, "step": 91840 }, { "epoch": 3.3100875770353553, "grad_norm": 0.2860056459903717, "learning_rate": 1.355715036687226e-05, "loss": 0.3739, "step": 91845 }, { "epoch": 3.3102677766965796, "grad_norm": 0.24574606120586395, "learning_rate": 1.3554555933683077e-05, "loss": 0.3489, "step": 91850 }, { "epoch": 3.3104479763578043, "grad_norm": 0.20963186025619507, "learning_rate": 1.3551961656435852e-05, "loss": 0.3463, "step": 91855 }, { "epoch": 3.310628176019029, "grad_norm": 0.2524944841861725, "learning_rate": 1.3549367535165935e-05, "loss": 0.3816, "step": 91860 }, { "epoch": 3.3108083756802538, "grad_norm": 0.22844867408275604, "learning_rate": 1.3546773569908661e-05, "loss": 0.3623, "step": 91865 }, { "epoch": 3.3109885753414785, "grad_norm": 0.20473162829875946, "learning_rate": 1.35441797606994e-05, "loss": 0.4076, "step": 91870 }, { "epoch": 3.3111687750027032, "grad_norm": 0.2806645631790161, "learning_rate": 1.3541586107573456e-05, "loss": 0.3582, "step": 91875 }, { "epoch": 3.3113489746639275, "grad_norm": 0.2380189746618271, "learning_rate": 1.3538992610566175e-05, "loss": 0.3897, "step": 91880 }, { "epoch": 3.3115291743251523, "grad_norm": 0.20420801639556885, "learning_rate": 1.3536399269712912e-05, "loss": 0.3863, "step": 91885 }, { "epoch": 3.311709373986377, "grad_norm": 0.27330854535102844, "learning_rate": 1.3533806085048991e-05, "loss": 0.3967, "step": 91890 }, { "epoch": 3.3118895736476017, "grad_norm": 0.26601719856262207, "learning_rate": 1.3531213056609744e-05, "loss": 0.3883, "step": 91895 }, { "epoch": 3.312069773308826, "grad_norm": 0.22995233535766602, "learning_rate": 1.352862018443049e-05, "loss": 0.3689, "step": 91900 }, { "epoch": 3.3122499729700507, "grad_norm": 0.26952049136161804, "learning_rate": 1.3526027468546562e-05, "loss": 0.3816, "step": 91905 }, { "epoch": 3.3124301726312755, "grad_norm": 0.23863959312438965, "learning_rate": 1.3523434908993299e-05, "loss": 0.3797, "step": 
91910 }, { "epoch": 3.3126103722925, "grad_norm": 0.2401474565267563, "learning_rate": 1.3520842505806008e-05, "loss": 0.3767, "step": 91915 }, { "epoch": 3.312790571953725, "grad_norm": 0.24980275332927704, "learning_rate": 1.3518250259020021e-05, "loss": 0.3933, "step": 91920 }, { "epoch": 3.3129707716149492, "grad_norm": 0.2250710129737854, "learning_rate": 1.3515658168670647e-05, "loss": 0.4035, "step": 91925 }, { "epoch": 3.313150971276174, "grad_norm": 0.23497453331947327, "learning_rate": 1.3513066234793198e-05, "loss": 0.3815, "step": 91930 }, { "epoch": 3.3133311709373987, "grad_norm": null, "learning_rate": 1.3510992800374772e-05, "loss": 0.3933, "step": 91935 }, { "epoch": 3.3135113705986234, "grad_norm": 0.20314912497997284, "learning_rate": 1.3508401148235816e-05, "loss": 0.4087, "step": 91940 }, { "epoch": 3.3136915702598477, "grad_norm": 0.25075605511665344, "learning_rate": 1.3505809652667658e-05, "loss": 0.3749, "step": 91945 }, { "epoch": 3.3138717699210725, "grad_norm": 0.2547110617160797, "learning_rate": 1.3503218313705612e-05, "loss": 0.4021, "step": 91950 }, { "epoch": 3.314051969582297, "grad_norm": 0.28053396940231323, "learning_rate": 1.3500627131384996e-05, "loss": 0.399, "step": 91955 }, { "epoch": 3.314232169243522, "grad_norm": 0.2546713054180145, "learning_rate": 1.3498036105741113e-05, "loss": 0.394, "step": 91960 }, { "epoch": 3.3144123689047467, "grad_norm": 0.24178683757781982, "learning_rate": 1.3495445236809263e-05, "loss": 0.3621, "step": 91965 }, { "epoch": 3.314592568565971, "grad_norm": 0.2593505084514618, "learning_rate": 1.3492854524624737e-05, "loss": 0.3819, "step": 91970 }, { "epoch": 3.3147727682271957, "grad_norm": 0.25846415758132935, "learning_rate": 1.3490263969222838e-05, "loss": 0.3856, "step": 91975 }, { "epoch": 3.3149529678884204, "grad_norm": 0.20164726674556732, "learning_rate": 1.348767357063887e-05, "loss": 0.3679, "step": 91980 }, { "epoch": 3.315133167549645, "grad_norm": 0.22583597898483276, 
"learning_rate": 1.348508332890812e-05, "loss": 0.3902, "step": 91985 }, { "epoch": 3.3153133672108694, "grad_norm": 0.223631352186203, "learning_rate": 1.348249324406588e-05, "loss": 0.3685, "step": 91990 }, { "epoch": 3.315493566872094, "grad_norm": 0.2747786343097687, "learning_rate": 1.347990331614744e-05, "loss": 0.4107, "step": 91995 }, { "epoch": 3.315673766533319, "grad_norm": 0.22893761098384857, "learning_rate": 1.347731354518808e-05, "loss": 0.3957, "step": 92000 }, { "epoch": 3.315673766533319, "eval_loss": 0.4305948317050934, "eval_runtime": 3.5291, "eval_samples_per_second": 28.336, "eval_steps_per_second": 7.084, "step": 92000 }, { "epoch": 3.3158539661945436, "grad_norm": 0.269552081823349, "learning_rate": 1.3474723931223102e-05, "loss": 0.3697, "step": 92005 }, { "epoch": 3.3160341658557684, "grad_norm": 0.21320883929729462, "learning_rate": 1.3472134474287777e-05, "loss": 0.3593, "step": 92010 }, { "epoch": 3.3162143655169927, "grad_norm": 0.25191807746887207, "learning_rate": 1.3469545174417386e-05, "loss": 0.4161, "step": 92015 }, { "epoch": 3.3163945651782174, "grad_norm": 0.3065241575241089, "learning_rate": 1.3466956031647212e-05, "loss": 0.3889, "step": 92020 }, { "epoch": 3.316574764839442, "grad_norm": 0.22870443761348724, "learning_rate": 1.3464367046012521e-05, "loss": 0.3896, "step": 92025 }, { "epoch": 3.316754964500667, "grad_norm": 0.2070244699716568, "learning_rate": 1.3461778217548603e-05, "loss": 0.4069, "step": 92030 }, { "epoch": 3.316935164161891, "grad_norm": 0.20733942091464996, "learning_rate": 1.3459189546290735e-05, "loss": 0.3833, "step": 92035 }, { "epoch": 3.317115363823116, "grad_norm": 0.22954271733760834, "learning_rate": 1.3456601032274153e-05, "loss": 0.4083, "step": 92040 }, { "epoch": 3.3172955634843406, "grad_norm": 0.23598207533359528, "learning_rate": 1.3454012675534156e-05, "loss": 0.3831, "step": 92045 }, { "epoch": 3.3174757631455654, "grad_norm": 0.2571483850479126, "learning_rate": 
1.3451424476106004e-05, "loss": 0.3722, "step": 92050 }, { "epoch": 3.31765596280679, "grad_norm": 0.22039219737052917, "learning_rate": 1.3448836434024955e-05, "loss": 0.3507, "step": 92055 }, { "epoch": 3.3178361624680144, "grad_norm": 0.2569846510887146, "learning_rate": 1.3446248549326274e-05, "loss": 0.3714, "step": 92060 }, { "epoch": 3.318016362129239, "grad_norm": 0.24034719169139862, "learning_rate": 1.3443660822045211e-05, "loss": 0.3962, "step": 92065 }, { "epoch": 3.318196561790464, "grad_norm": 0.27031049132347107, "learning_rate": 1.344107325221704e-05, "loss": 0.3683, "step": 92070 }, { "epoch": 3.3183767614516886, "grad_norm": 0.31037384271621704, "learning_rate": 1.343848583987701e-05, "loss": 0.4435, "step": 92075 }, { "epoch": 3.318556961112913, "grad_norm": 0.21680322289466858, "learning_rate": 1.3435898585060372e-05, "loss": 0.3767, "step": 92080 }, { "epoch": 3.3187371607741376, "grad_norm": 0.22905753552913666, "learning_rate": 1.3433311487802375e-05, "loss": 0.3935, "step": 92085 }, { "epoch": 3.3189173604353623, "grad_norm": 0.27699750661849976, "learning_rate": 1.343072454813827e-05, "loss": 0.4165, "step": 92090 }, { "epoch": 3.319097560096587, "grad_norm": 0.19221347570419312, "learning_rate": 1.3428137766103293e-05, "loss": 0.3773, "step": 92095 }, { "epoch": 3.319277759757812, "grad_norm": 0.2585064172744751, "learning_rate": 1.3425551141732712e-05, "loss": 0.3909, "step": 92100 }, { "epoch": 3.3194579594190365, "grad_norm": 0.21457339823246002, "learning_rate": 1.3422964675061752e-05, "loss": 0.3761, "step": 92105 }, { "epoch": 3.319638159080261, "grad_norm": 0.22903072834014893, "learning_rate": 1.3420378366125657e-05, "loss": 0.3808, "step": 92110 }, { "epoch": 3.3198183587414856, "grad_norm": 0.24865801632404327, "learning_rate": 1.3417792214959668e-05, "loss": 0.3829, "step": 92115 }, { "epoch": 3.3199985584027103, "grad_norm": 0.22578608989715576, "learning_rate": 1.3415206221599006e-05, "loss": 0.3397, "step": 92120 }, { 
"epoch": 3.320178758063935, "grad_norm": 0.21695365011692047, "learning_rate": 1.3412620386078933e-05, "loss": 0.3552, "step": 92125 }, { "epoch": 3.3203589577251593, "grad_norm": 0.19830626249313354, "learning_rate": 1.341003470843466e-05, "loss": 0.3794, "step": 92130 }, { "epoch": 3.320539157386384, "grad_norm": 0.25142696499824524, "learning_rate": 1.3407449188701408e-05, "loss": 0.387, "step": 92135 }, { "epoch": 3.320719357047609, "grad_norm": 0.25763440132141113, "learning_rate": 1.3404863826914424e-05, "loss": 0.4225, "step": 92140 }, { "epoch": 3.3208995567088335, "grad_norm": 0.2544826567173004, "learning_rate": 1.3402278623108915e-05, "loss": 0.3771, "step": 92145 }, { "epoch": 3.3210797563700583, "grad_norm": 0.24022381007671356, "learning_rate": 1.3399693577320137e-05, "loss": 0.3724, "step": 92150 }, { "epoch": 3.3212599560312825, "grad_norm": 0.24866530299186707, "learning_rate": 1.3397108689583276e-05, "loss": 0.3865, "step": 92155 }, { "epoch": 3.3214401556925073, "grad_norm": 0.2202133685350418, "learning_rate": 1.3394523959933559e-05, "loss": 0.3777, "step": 92160 }, { "epoch": 3.321620355353732, "grad_norm": 0.1919008046388626, "learning_rate": 1.339193938840621e-05, "loss": 0.3427, "step": 92165 }, { "epoch": 3.3218005550149567, "grad_norm": 0.21456719934940338, "learning_rate": 1.338935497503644e-05, "loss": 0.406, "step": 92170 }, { "epoch": 3.321980754676181, "grad_norm": 0.17022041976451874, "learning_rate": 1.338677071985946e-05, "loss": 0.3656, "step": 92175 }, { "epoch": 3.3221609543374058, "grad_norm": 0.2558014392852783, "learning_rate": 1.3384186622910482e-05, "loss": 0.3812, "step": 92180 }, { "epoch": 3.3223411539986305, "grad_norm": 0.24474850296974182, "learning_rate": 1.3381602684224703e-05, "loss": 0.372, "step": 92185 }, { "epoch": 3.3225213536598552, "grad_norm": 0.2648925483226776, "learning_rate": 1.3379018903837348e-05, "loss": 0.3879, "step": 92190 }, { "epoch": 3.32270155332108, "grad_norm": 0.2510066330432892, 
"learning_rate": 1.3376435281783608e-05, "loss": 0.3624, "step": 92195 }, { "epoch": 3.3228817529823043, "grad_norm": 0.20649553835391998, "learning_rate": 1.337385181809869e-05, "loss": 0.3811, "step": 92200 }, { "epoch": 3.323061952643529, "grad_norm": 0.24206553399562836, "learning_rate": 1.337126851281779e-05, "loss": 0.3888, "step": 92205 }, { "epoch": 3.3232421523047537, "grad_norm": 0.21972547471523285, "learning_rate": 1.3368685365976091e-05, "loss": 0.3822, "step": 92210 }, { "epoch": 3.3234223519659785, "grad_norm": 0.22937068343162537, "learning_rate": 1.3366102377608817e-05, "loss": 0.3684, "step": 92215 }, { "epoch": 3.3236025516272028, "grad_norm": 0.23047223687171936, "learning_rate": 1.3363519547751152e-05, "loss": 0.3663, "step": 92220 }, { "epoch": 3.3237827512884275, "grad_norm": 0.19826874136924744, "learning_rate": 1.3360936876438263e-05, "loss": 0.3945, "step": 92225 }, { "epoch": 3.323962950949652, "grad_norm": 0.2385808378458023, "learning_rate": 1.3358354363705365e-05, "loss": 0.4018, "step": 92230 }, { "epoch": 3.324143150610877, "grad_norm": 0.24572807550430298, "learning_rate": 1.3355772009587634e-05, "loss": 0.405, "step": 92235 }, { "epoch": 3.3243233502721017, "grad_norm": 0.237590491771698, "learning_rate": 1.3353189814120253e-05, "loss": 0.3893, "step": 92240 }, { "epoch": 3.324503549933326, "grad_norm": 0.20121124386787415, "learning_rate": 1.3350607777338409e-05, "loss": 0.3707, "step": 92245 }, { "epoch": 3.3246837495945507, "grad_norm": 0.260689377784729, "learning_rate": 1.334802589927727e-05, "loss": 0.3887, "step": 92250 }, { "epoch": 3.3248639492557754, "grad_norm": 0.31178975105285645, "learning_rate": 1.3345444179972027e-05, "loss": 0.4013, "step": 92255 }, { "epoch": 3.325044148917, "grad_norm": 0.24525390565395355, "learning_rate": 1.3342862619457858e-05, "loss": 0.3908, "step": 92260 }, { "epoch": 3.3252243485782245, "grad_norm": 0.20837341248989105, "learning_rate": 1.3340281217769922e-05, "loss": 0.3367, "step": 92265 
}, { "epoch": 3.325404548239449, "grad_norm": 0.24146051704883575, "learning_rate": 1.3337699974943401e-05, "loss": 0.3905, "step": 92270 }, { "epoch": 3.325584747900674, "grad_norm": 0.2158283293247223, "learning_rate": 1.3335118891013459e-05, "loss": 0.3847, "step": 92275 }, { "epoch": 3.3257649475618987, "grad_norm": 0.2629423141479492, "learning_rate": 1.3332537966015251e-05, "loss": 0.4141, "step": 92280 }, { "epoch": 3.3259451472231234, "grad_norm": 0.22925181686878204, "learning_rate": 1.3329957199983967e-05, "loss": 0.3676, "step": 92285 }, { "epoch": 3.3261253468843477, "grad_norm": 0.2732671797275543, "learning_rate": 1.3327376592954757e-05, "loss": 0.3693, "step": 92290 }, { "epoch": 3.3263055465455724, "grad_norm": 0.21771977841854095, "learning_rate": 1.3324796144962781e-05, "loss": 0.3854, "step": 92295 }, { "epoch": 3.326485746206797, "grad_norm": 0.19893218576908112, "learning_rate": 1.3322215856043196e-05, "loss": 0.3719, "step": 92300 }, { "epoch": 3.326665945868022, "grad_norm": 0.253020703792572, "learning_rate": 1.331963572623115e-05, "loss": 0.3934, "step": 92305 }, { "epoch": 3.326846145529246, "grad_norm": 0.272172749042511, "learning_rate": 1.3317055755561817e-05, "loss": 0.4027, "step": 92310 }, { "epoch": 3.327026345190471, "grad_norm": 0.22048524022102356, "learning_rate": 1.3314475944070345e-05, "loss": 0.383, "step": 92315 }, { "epoch": 3.3272065448516956, "grad_norm": 0.188111811876297, "learning_rate": 1.3311896291791859e-05, "loss": 0.3955, "step": 92320 }, { "epoch": 3.3273867445129204, "grad_norm": 0.21891506016254425, "learning_rate": 1.3309316798761529e-05, "loss": 0.3913, "step": 92325 }, { "epoch": 3.327566944174145, "grad_norm": 0.24252718687057495, "learning_rate": 1.3306737465014485e-05, "loss": 0.3619, "step": 92330 }, { "epoch": 3.3277471438353694, "grad_norm": 0.23667490482330322, "learning_rate": 1.33041582905859e-05, "loss": 0.3844, "step": 92335 }, { "epoch": 3.327927343496594, "grad_norm": 0.21754367649555206, 
"learning_rate": 1.3301579275510887e-05, "loss": 0.3438, "step": 92340 }, { "epoch": 3.328107543157819, "grad_norm": 0.2473175972700119, "learning_rate": 1.3299000419824578e-05, "loss": 0.4039, "step": 92345 }, { "epoch": 3.3282877428190436, "grad_norm": 0.2296813279390335, "learning_rate": 1.3296421723562133e-05, "loss": 0.3446, "step": 92350 }, { "epoch": 3.328467942480268, "grad_norm": 0.24328921735286713, "learning_rate": 1.3293843186758675e-05, "loss": 0.3843, "step": 92355 }, { "epoch": 3.3286481421414926, "grad_norm": 0.23065999150276184, "learning_rate": 1.329126480944934e-05, "loss": 0.3811, "step": 92360 }, { "epoch": 3.3288283418027174, "grad_norm": 0.2626056373119354, "learning_rate": 1.3288686591669255e-05, "loss": 0.3911, "step": 92365 }, { "epoch": 3.329008541463942, "grad_norm": 0.2469651997089386, "learning_rate": 1.3286108533453537e-05, "loss": 0.3647, "step": 92370 }, { "epoch": 3.329188741125167, "grad_norm": 0.23114638030529022, "learning_rate": 1.3283530634837333e-05, "loss": 0.374, "step": 92375 }, { "epoch": 3.3293689407863916, "grad_norm": 0.209586039185524, "learning_rate": 1.3280952895855753e-05, "loss": 0.3747, "step": 92380 }, { "epoch": 3.329549140447616, "grad_norm": 0.25766298174858093, "learning_rate": 1.3278375316543924e-05, "loss": 0.3928, "step": 92385 }, { "epoch": 3.3297293401088406, "grad_norm": 0.20684154331684113, "learning_rate": 1.3275797896936958e-05, "loss": 0.3196, "step": 92390 }, { "epoch": 3.3299095397700653, "grad_norm": 0.27292490005493164, "learning_rate": 1.3273220637069971e-05, "loss": 0.3889, "step": 92395 }, { "epoch": 3.33008973943129, "grad_norm": 0.2662750780582428, "learning_rate": 1.327064353697809e-05, "loss": 0.385, "step": 92400 }, { "epoch": 3.3302699390925143, "grad_norm": 0.23216348886489868, "learning_rate": 1.326806659669643e-05, "loss": 0.3605, "step": 92405 }, { "epoch": 3.330450138753739, "grad_norm": 0.2415631115436554, "learning_rate": 1.3265489816260071e-05, "loss": 0.4063, "step": 92410 }, 
{ "epoch": 3.330630338414964, "grad_norm": 0.2386670559644699, "learning_rate": 1.3262913195704152e-05, "loss": 0.405, "step": 92415 }, { "epoch": 3.3308105380761885, "grad_norm": 0.29892489314079285, "learning_rate": 1.3260336735063766e-05, "loss": 0.3817, "step": 92420 }, { "epoch": 3.3309907377374133, "grad_norm": 0.2595389485359192, "learning_rate": 1.325776043437401e-05, "loss": 0.4289, "step": 92425 }, { "epoch": 3.3311709373986376, "grad_norm": 0.23359209299087524, "learning_rate": 1.325518429367002e-05, "loss": 0.3704, "step": 92430 }, { "epoch": 3.3313511370598623, "grad_norm": 0.22734783589839935, "learning_rate": 1.3252608312986844e-05, "loss": 0.3439, "step": 92435 }, { "epoch": 3.331531336721087, "grad_norm": 0.21022412180900574, "learning_rate": 1.3250032492359618e-05, "loss": 0.3662, "step": 92440 }, { "epoch": 3.3317115363823118, "grad_norm": 0.2911469042301178, "learning_rate": 1.3247456831823423e-05, "loss": 0.3742, "step": 92445 }, { "epoch": 3.331891736043536, "grad_norm": 0.24090684950351715, "learning_rate": 1.3244881331413356e-05, "loss": 0.371, "step": 92450 }, { "epoch": 3.332071935704761, "grad_norm": 0.27764245867729187, "learning_rate": 1.32423059911645e-05, "loss": 0.3933, "step": 92455 }, { "epoch": 3.3322521353659855, "grad_norm": 0.23873978853225708, "learning_rate": 1.3239730811111952e-05, "loss": 0.3861, "step": 92460 }, { "epoch": 3.3324323350272103, "grad_norm": 0.19886447489261627, "learning_rate": 1.3237155791290784e-05, "loss": 0.3883, "step": 92465 }, { "epoch": 3.332612534688435, "grad_norm": 0.24735817313194275, "learning_rate": 1.3234580931736102e-05, "loss": 0.366, "step": 92470 }, { "epoch": 3.3327927343496593, "grad_norm": 0.21200679242610931, "learning_rate": 1.3232006232482978e-05, "loss": 0.3777, "step": 92475 }, { "epoch": 3.332972934010884, "grad_norm": 0.22038812935352325, "learning_rate": 1.3229431693566488e-05, "loss": 0.3966, "step": 92480 }, { "epoch": 3.3331531336721087, "grad_norm": 0.21706204116344452, 
"learning_rate": 1.3226857315021712e-05, "loss": 0.3784, "step": 92485 }, { "epoch": 3.3333333333333335, "grad_norm": 0.26988691091537476, "learning_rate": 1.3224283096883717e-05, "loss": 0.3762, "step": 92490 }, { "epoch": 3.3335135329945578, "grad_norm": 0.2401093691587448, "learning_rate": 1.3221709039187597e-05, "loss": 0.4154, "step": 92495 }, { "epoch": 3.3336937326557825, "grad_norm": 0.18479788303375244, "learning_rate": 1.321913514196842e-05, "loss": 0.3788, "step": 92500 }, { "epoch": 3.3336937326557825, "eval_loss": 0.42996159195899963, "eval_runtime": 3.5289, "eval_samples_per_second": 28.337, "eval_steps_per_second": 7.084, "step": 92500 }, { "epoch": 3.3338739323170072, "grad_norm": 0.28573232889175415, "learning_rate": 1.3216561405261224e-05, "loss": 0.4017, "step": 92505 }, { "epoch": 3.334054131978232, "grad_norm": 0.2534043490886688, "learning_rate": 1.3213987829101108e-05, "loss": 0.3497, "step": 92510 }, { "epoch": 3.3342343316394567, "grad_norm": 0.218218594789505, "learning_rate": 1.3211414413523115e-05, "loss": 0.4098, "step": 92515 }, { "epoch": 3.334414531300681, "grad_norm": 0.21875832974910736, "learning_rate": 1.320884115856234e-05, "loss": 0.3693, "step": 92520 }, { "epoch": 3.3345947309619057, "grad_norm": 0.2261316180229187, "learning_rate": 1.320626806425381e-05, "loss": 0.3835, "step": 92525 }, { "epoch": 3.3347749306231305, "grad_norm": 0.21817557513713837, "learning_rate": 1.3203695130632587e-05, "loss": 0.3939, "step": 92530 }, { "epoch": 3.334955130284355, "grad_norm": 0.21988247334957123, "learning_rate": 1.3201122357733742e-05, "loss": 0.4075, "step": 92535 }, { "epoch": 3.3351353299455795, "grad_norm": 0.26080963015556335, "learning_rate": 1.319854974559232e-05, "loss": 0.4094, "step": 92540 }, { "epoch": 3.335315529606804, "grad_norm": 0.2803710103034973, "learning_rate": 1.3195977294243372e-05, "loss": 0.431, "step": 92545 }, { "epoch": 3.335495729268029, "grad_norm": 0.21274006366729736, "learning_rate": 
1.3193405003721951e-05, "loss": 0.3808, "step": 92550 }, { "epoch": 3.3356759289292537, "grad_norm": 0.23850922286510468, "learning_rate": 1.3190832874063092e-05, "loss": 0.406, "step": 92555 }, { "epoch": 3.3358561285904784, "grad_norm": 0.22884266078472137, "learning_rate": 1.3188260905301852e-05, "loss": 0.373, "step": 92560 }, { "epoch": 3.3360363282517027, "grad_norm": 0.20494328439235687, "learning_rate": 1.3185689097473278e-05, "loss": 0.3745, "step": 92565 }, { "epoch": 3.3362165279129274, "grad_norm": 0.24264514446258545, "learning_rate": 1.3183117450612398e-05, "loss": 0.3718, "step": 92570 }, { "epoch": 3.336396727574152, "grad_norm": 0.1675427258014679, "learning_rate": 1.3180545964754257e-05, "loss": 0.3765, "step": 92575 }, { "epoch": 3.336576927235377, "grad_norm": 0.25368401408195496, "learning_rate": 1.3177974639933877e-05, "loss": 0.3799, "step": 92580 }, { "epoch": 3.336757126896601, "grad_norm": 0.30531957745552063, "learning_rate": 1.3175403476186315e-05, "loss": 0.4008, "step": 92585 }, { "epoch": 3.336937326557826, "grad_norm": 0.2088593989610672, "learning_rate": 1.3172832473546593e-05, "loss": 0.3525, "step": 92590 }, { "epoch": 3.3371175262190507, "grad_norm": 0.28222382068634033, "learning_rate": 1.3170261632049736e-05, "loss": 0.3578, "step": 92595 }, { "epoch": 3.3372977258802754, "grad_norm": 0.18309058248996735, "learning_rate": 1.3167690951730777e-05, "loss": 0.3696, "step": 92600 }, { "epoch": 3.3374779255415, "grad_norm": 0.22721010446548462, "learning_rate": 1.3165120432624734e-05, "loss": 0.3963, "step": 92605 }, { "epoch": 3.337658125202725, "grad_norm": 0.30004847049713135, "learning_rate": 1.3162550074766627e-05, "loss": 0.4081, "step": 92610 }, { "epoch": 3.337838324863949, "grad_norm": 0.29648730158805847, "learning_rate": 1.3159979878191503e-05, "loss": 0.3961, "step": 92615 }, { "epoch": 3.338018524525174, "grad_norm": 0.19478410482406616, "learning_rate": 1.3157409842934338e-05, "loss": 0.3629, "step": 92620 }, { "epoch": 
3.3381987241863986, "grad_norm": 0.2053857296705246, "learning_rate": 1.3154839969030186e-05, "loss": 0.3789, "step": 92625 }, { "epoch": 3.3383789238476234, "grad_norm": 0.19927579164505005, "learning_rate": 1.3152270256514044e-05, "loss": 0.3949, "step": 92630 }, { "epoch": 3.3385591235088476, "grad_norm": 0.2706872224807739, "learning_rate": 1.3149700705420923e-05, "loss": 0.3705, "step": 92635 }, { "epoch": 3.3387393231700724, "grad_norm": 0.2197032868862152, "learning_rate": 1.3147131315785837e-05, "loss": 0.3917, "step": 92640 }, { "epoch": 3.338919522831297, "grad_norm": 0.2595154643058777, "learning_rate": 1.3144562087643794e-05, "loss": 0.3981, "step": 92645 }, { "epoch": 3.339099722492522, "grad_norm": 0.20744380354881287, "learning_rate": 1.3141993021029786e-05, "loss": 0.3941, "step": 92650 }, { "epoch": 3.3392799221537466, "grad_norm": 0.2614065408706665, "learning_rate": 1.3139424115978838e-05, "loss": 0.3667, "step": 92655 }, { "epoch": 3.339460121814971, "grad_norm": 0.26152557134628296, "learning_rate": 1.3136855372525939e-05, "loss": 0.3953, "step": 92660 }, { "epoch": 3.3396403214761956, "grad_norm": 0.2180803269147873, "learning_rate": 1.313428679070609e-05, "loss": 0.3865, "step": 92665 }, { "epoch": 3.3398205211374203, "grad_norm": 0.21196460723876953, "learning_rate": 1.3131718370554286e-05, "loss": 0.3598, "step": 92670 }, { "epoch": 3.340000720798645, "grad_norm": 0.23230072855949402, "learning_rate": 1.312915011210551e-05, "loss": 0.3827, "step": 92675 }, { "epoch": 3.3401809204598694, "grad_norm": 0.24028299748897552, "learning_rate": 1.3126582015394778e-05, "loss": 0.41, "step": 92680 }, { "epoch": 3.340361120121094, "grad_norm": 0.29149261116981506, "learning_rate": 1.3124014080457076e-05, "loss": 0.3888, "step": 92685 }, { "epoch": 3.340541319782319, "grad_norm": 0.2105678766965866, "learning_rate": 1.3121446307327362e-05, "loss": 0.3682, "step": 92690 }, { "epoch": 3.3407215194435436, "grad_norm": 0.29433146119117737, "learning_rate": 
1.3118878696040655e-05, "loss": 0.3464, "step": 92695 }, { "epoch": 3.3409017191047683, "grad_norm": 0.19459545612335205, "learning_rate": 1.311631124663191e-05, "loss": 0.372, "step": 92700 }, { "epoch": 3.3410819187659926, "grad_norm": 0.22271791100502014, "learning_rate": 1.3113743959136137e-05, "loss": 0.3698, "step": 92705 }, { "epoch": 3.3412621184272173, "grad_norm": 0.24294120073318481, "learning_rate": 1.3111176833588312e-05, "loss": 0.3947, "step": 92710 }, { "epoch": 3.341442318088442, "grad_norm": 0.2713625729084015, "learning_rate": 1.3108609870023381e-05, "loss": 0.3898, "step": 92715 }, { "epoch": 3.341622517749667, "grad_norm": 0.24030660092830658, "learning_rate": 1.3106043068476348e-05, "loss": 0.3998, "step": 92720 }, { "epoch": 3.341802717410891, "grad_norm": 0.23403751850128174, "learning_rate": 1.3103476428982172e-05, "loss": 0.4276, "step": 92725 }, { "epoch": 3.341982917072116, "grad_norm": 0.23179000616073608, "learning_rate": 1.3100909951575829e-05, "loss": 0.3936, "step": 92730 }, { "epoch": 3.3421631167333405, "grad_norm": 0.2009132206439972, "learning_rate": 1.3098343636292287e-05, "loss": 0.3873, "step": 92735 }, { "epoch": 3.3423433163945653, "grad_norm": 0.22999610006809235, "learning_rate": 1.3095777483166493e-05, "loss": 0.3685, "step": 92740 }, { "epoch": 3.34252351605579, "grad_norm": 0.19379504024982452, "learning_rate": 1.3093211492233443e-05, "loss": 0.3747, "step": 92745 }, { "epoch": 3.3427037157170143, "grad_norm": 0.21033187210559845, "learning_rate": 1.3090645663528075e-05, "loss": 0.3403, "step": 92750 }, { "epoch": 3.342883915378239, "grad_norm": 0.19492049515247345, "learning_rate": 1.3088079997085356e-05, "loss": 0.3598, "step": 92755 }, { "epoch": 3.3430641150394638, "grad_norm": 0.21218262612819672, "learning_rate": 1.3085514492940243e-05, "loss": 0.3883, "step": 92760 }, { "epoch": 3.3432443147006885, "grad_norm": 0.21746402978897095, "learning_rate": 1.3082949151127688e-05, "loss": 0.3835, "step": 92765 }, { 
"epoch": 3.343424514361913, "grad_norm": 0.22117747366428375, "learning_rate": 1.308038397168263e-05, "loss": 0.3615, "step": 92770 }, { "epoch": 3.3436047140231375, "grad_norm": 0.19114363193511963, "learning_rate": 1.3077818954640048e-05, "loss": 0.3707, "step": 92775 }, { "epoch": 3.3437849136843623, "grad_norm": 0.19133096933364868, "learning_rate": 1.307525410003487e-05, "loss": 0.3634, "step": 92780 }, { "epoch": 3.343965113345587, "grad_norm": 0.21959801018238068, "learning_rate": 1.3072689407902047e-05, "loss": 0.3866, "step": 92785 }, { "epoch": 3.3441453130068117, "grad_norm": 0.26673710346221924, "learning_rate": 1.3070124878276524e-05, "loss": 0.3999, "step": 92790 }, { "epoch": 3.344325512668036, "grad_norm": 0.27120256423950195, "learning_rate": 1.3067560511193227e-05, "loss": 0.3931, "step": 92795 }, { "epoch": 3.3445057123292607, "grad_norm": 0.22337371110916138, "learning_rate": 1.3064996306687127e-05, "loss": 0.3907, "step": 92800 }, { "epoch": 3.3446859119904855, "grad_norm": 0.22518573701381683, "learning_rate": 1.3062432264793122e-05, "loss": 0.3561, "step": 92805 }, { "epoch": 3.34486611165171, "grad_norm": 0.23381398618221283, "learning_rate": 1.3059868385546181e-05, "loss": 0.3455, "step": 92810 }, { "epoch": 3.3450463113129345, "grad_norm": 0.20269441604614258, "learning_rate": 1.3057304668981218e-05, "loss": 0.3855, "step": 92815 }, { "epoch": 3.3452265109741592, "grad_norm": 0.20035311579704285, "learning_rate": 1.3054741115133168e-05, "loss": 0.3773, "step": 92820 }, { "epoch": 3.345406710635384, "grad_norm": 0.1864159256219864, "learning_rate": 1.3052177724036957e-05, "loss": 0.3679, "step": 92825 }, { "epoch": 3.3455869102966087, "grad_norm": 0.22926171123981476, "learning_rate": 1.3049614495727507e-05, "loss": 0.348, "step": 92830 }, { "epoch": 3.3457671099578334, "grad_norm": 0.27731531858444214, "learning_rate": 1.3047051430239744e-05, "loss": 0.3804, "step": 92835 }, { "epoch": 3.3459473096190577, "grad_norm": 0.2885668873786926, 
"learning_rate": 1.3044488527608595e-05, "loss": 0.397, "step": 92840 }, { "epoch": 3.3461275092802825, "grad_norm": 0.23300670087337494, "learning_rate": 1.3041925787868978e-05, "loss": 0.3717, "step": 92845 }, { "epoch": 3.346307708941507, "grad_norm": 0.2299395352602005, "learning_rate": 1.3039363211055806e-05, "loss": 0.3944, "step": 92850 }, { "epoch": 3.346487908602732, "grad_norm": 0.23562583327293396, "learning_rate": 1.3036800797203997e-05, "loss": 0.3838, "step": 92855 }, { "epoch": 3.346668108263956, "grad_norm": 0.18811137974262238, "learning_rate": 1.3034238546348449e-05, "loss": 0.4058, "step": 92860 }, { "epoch": 3.346848307925181, "grad_norm": 0.3136909306049347, "learning_rate": 1.3031676458524092e-05, "loss": 0.3802, "step": 92865 }, { "epoch": 3.3470285075864057, "grad_norm": 0.2185976505279541, "learning_rate": 1.302911453376583e-05, "loss": 0.3782, "step": 92870 }, { "epoch": 3.3472087072476304, "grad_norm": 0.2267122119665146, "learning_rate": 1.302655277210856e-05, "loss": 0.3729, "step": 92875 }, { "epoch": 3.347388906908855, "grad_norm": 0.20721164345741272, "learning_rate": 1.3023991173587193e-05, "loss": 0.3868, "step": 92880 }, { "epoch": 3.34756910657008, "grad_norm": 0.21538670361042023, "learning_rate": 1.3021429738236618e-05, "loss": 0.3675, "step": 92885 }, { "epoch": 3.347749306231304, "grad_norm": 0.24019305408000946, "learning_rate": 1.3018868466091755e-05, "loss": 0.3555, "step": 92890 }, { "epoch": 3.347929505892529, "grad_norm": 0.25520575046539307, "learning_rate": 1.3016307357187496e-05, "loss": 0.4072, "step": 92895 }, { "epoch": 3.3481097055537536, "grad_norm": 0.21573813259601593, "learning_rate": 1.3013746411558714e-05, "loss": 0.3914, "step": 92900 }, { "epoch": 3.3482899052149784, "grad_norm": 0.22230984270572662, "learning_rate": 1.3011185629240321e-05, "loss": 0.3971, "step": 92905 }, { "epoch": 3.3484701048762027, "grad_norm": 0.21699829399585724, "learning_rate": 1.3008625010267206e-05, "loss": 0.373, "step": 92910 
}, { "epoch": 3.3486503045374274, "grad_norm": 0.27921581268310547, "learning_rate": 1.3006064554674252e-05, "loss": 0.3984, "step": 92915 }, { "epoch": 3.348830504198652, "grad_norm": 0.20376195013523102, "learning_rate": 1.3003504262496347e-05, "loss": 0.3565, "step": 92920 }, { "epoch": 3.349010703859877, "grad_norm": 0.26665177941322327, "learning_rate": 1.3000944133768368e-05, "loss": 0.3716, "step": 92925 }, { "epoch": 3.3491909035211016, "grad_norm": 0.22333024442195892, "learning_rate": 1.2998384168525208e-05, "loss": 0.3586, "step": 92930 }, { "epoch": 3.349371103182326, "grad_norm": 0.26054447889328003, "learning_rate": 1.299582436680174e-05, "loss": 0.4145, "step": 92935 }, { "epoch": 3.3495513028435506, "grad_norm": 0.25818753242492676, "learning_rate": 1.299326472863284e-05, "loss": 0.3409, "step": 92940 }, { "epoch": 3.3497315025047754, "grad_norm": 0.17389363050460815, "learning_rate": 1.2990705254053385e-05, "loss": 0.3626, "step": 92945 }, { "epoch": 3.349911702166, "grad_norm": 0.24069735407829285, "learning_rate": 1.2988145943098245e-05, "loss": 0.3704, "step": 92950 }, { "epoch": 3.3500919018272244, "grad_norm": 0.2535139322280884, "learning_rate": 1.2985586795802279e-05, "loss": 0.3725, "step": 92955 }, { "epoch": 3.350272101488449, "grad_norm": 0.2423035055398941, "learning_rate": 1.2983027812200382e-05, "loss": 0.3808, "step": 92960 }, { "epoch": 3.350452301149674, "grad_norm": 0.2165379524230957, "learning_rate": 1.2980468992327402e-05, "loss": 0.3942, "step": 92965 }, { "epoch": 3.3506325008108986, "grad_norm": 0.27460017800331116, "learning_rate": 1.2977910336218204e-05, "loss": 0.3583, "step": 92970 }, { "epoch": 3.3508127004721233, "grad_norm": 0.2811183035373688, "learning_rate": 1.2975351843907652e-05, "loss": 0.369, "step": 92975 }, { "epoch": 3.3509929001333476, "grad_norm": 0.22959589958190918, "learning_rate": 1.2972793515430592e-05, "loss": 0.3604, "step": 92980 }, { "epoch": 3.3511730997945723, "grad_norm": 0.2398064136505127, 
"learning_rate": 1.2970235350821911e-05, "loss": 0.3713, "step": 92985 }, { "epoch": 3.351353299455797, "grad_norm": 0.23232856392860413, "learning_rate": 1.2967677350116425e-05, "loss": 0.3658, "step": 92990 }, { "epoch": 3.351533499117022, "grad_norm": 0.2607458829879761, "learning_rate": 1.2965119513349016e-05, "loss": 0.3745, "step": 92995 }, { "epoch": 3.351713698778246, "grad_norm": 0.17985029518604279, "learning_rate": 1.2962561840554524e-05, "loss": 0.3852, "step": 93000 }, { "epoch": 3.351713698778246, "eval_loss": 0.4302482306957245, "eval_runtime": 3.5321, "eval_samples_per_second": 28.312, "eval_steps_per_second": 7.078, "step": 93000 }, { "epoch": 3.351893898439471, "grad_norm": 0.22021250426769257, "learning_rate": 1.2960004331767786e-05, "loss": 0.36, "step": 93005 }, { "epoch": 3.3520740981006956, "grad_norm": 0.24476410448551178, "learning_rate": 1.2957446987023678e-05, "loss": 0.4055, "step": 93010 }, { "epoch": 3.3522542977619203, "grad_norm": 0.20762693881988525, "learning_rate": 1.2954889806357012e-05, "loss": 0.3946, "step": 93015 }, { "epoch": 3.352434497423145, "grad_norm": 0.30094394087791443, "learning_rate": 1.2952332789802631e-05, "loss": 0.3695, "step": 93020 }, { "epoch": 3.3526146970843693, "grad_norm": 0.27970486879348755, "learning_rate": 1.2949775937395392e-05, "loss": 0.3885, "step": 93025 }, { "epoch": 3.352794896745594, "grad_norm": 0.2970288395881653, "learning_rate": 1.2947219249170128e-05, "loss": 0.3899, "step": 93030 }, { "epoch": 3.352975096406819, "grad_norm": 0.18664143979549408, "learning_rate": 1.2944662725161663e-05, "loss": 0.3715, "step": 93035 }, { "epoch": 3.3531552960680435, "grad_norm": 0.2144254446029663, "learning_rate": 1.2942106365404839e-05, "loss": 0.3757, "step": 93040 }, { "epoch": 3.353335495729268, "grad_norm": 0.24365659058094025, "learning_rate": 1.2939550169934466e-05, "loss": 0.3741, "step": 93045 }, { "epoch": 3.3535156953904925, "grad_norm": 0.2775290012359619, "learning_rate": 
1.29369941387854e-05, "loss": 0.3854, "step": 93050 }, { "epoch": 3.3536958950517173, "grad_norm": 0.20349471271038055, "learning_rate": 1.2934438271992452e-05, "loss": 0.3809, "step": 93055 }, { "epoch": 3.353876094712942, "grad_norm": 0.27783820033073425, "learning_rate": 1.2931882569590448e-05, "loss": 0.353, "step": 93060 }, { "epoch": 3.3540562943741667, "grad_norm": 0.26392999291419983, "learning_rate": 1.2929327031614204e-05, "loss": 0.4006, "step": 93065 }, { "epoch": 3.354236494035391, "grad_norm": 0.2420482635498047, "learning_rate": 1.2926771658098535e-05, "loss": 0.3974, "step": 93070 }, { "epoch": 3.3544166936966158, "grad_norm": 0.2563861012458801, "learning_rate": 1.2924216449078274e-05, "loss": 0.3876, "step": 93075 }, { "epoch": 3.3545968933578405, "grad_norm": 0.23027673363685608, "learning_rate": 1.292166140458824e-05, "loss": 0.3777, "step": 93080 }, { "epoch": 3.3547770930190652, "grad_norm": 0.2890183925628662, "learning_rate": 1.2919106524663208e-05, "loss": 0.3627, "step": 93085 }, { "epoch": 3.3549572926802895, "grad_norm": 0.2011183500289917, "learning_rate": 1.2916551809338018e-05, "loss": 0.3793, "step": 93090 }, { "epoch": 3.3551374923415143, "grad_norm": 0.23442140221595764, "learning_rate": 1.2913997258647476e-05, "loss": 0.3532, "step": 93095 }, { "epoch": 3.355317692002739, "grad_norm": 0.18820983171463013, "learning_rate": 1.2911442872626376e-05, "loss": 0.3741, "step": 93100 }, { "epoch": 3.3554978916639637, "grad_norm": 0.17393651604652405, "learning_rate": 1.2908888651309528e-05, "loss": 0.3554, "step": 93105 }, { "epoch": 3.3556780913251885, "grad_norm": 0.2209923267364502, "learning_rate": 1.2906334594731722e-05, "loss": 0.4142, "step": 93110 }, { "epoch": 3.355858290986413, "grad_norm": 0.2804467976093292, "learning_rate": 1.2903780702927776e-05, "loss": 0.3901, "step": 93115 }, { "epoch": 3.3560384906476375, "grad_norm": 0.2660714089870453, "learning_rate": 1.2901226975932474e-05, "loss": 0.3976, "step": 93120 }, { "epoch": 
3.356218690308862, "grad_norm": 0.24791589379310608, "learning_rate": 1.2898673413780616e-05, "loss": 0.4206, "step": 93125 }, { "epoch": 3.356398889970087, "grad_norm": 0.25095242261886597, "learning_rate": 1.2896120016506985e-05, "loss": 0.373, "step": 93130 }, { "epoch": 3.3565790896313117, "grad_norm": 0.31840965151786804, "learning_rate": 1.2893566784146377e-05, "loss": 0.3655, "step": 93135 }, { "epoch": 3.356759289292536, "grad_norm": 0.20555394887924194, "learning_rate": 1.2891013716733569e-05, "loss": 0.3744, "step": 93140 }, { "epoch": 3.3569394889537607, "grad_norm": 0.24305355548858643, "learning_rate": 1.2888460814303363e-05, "loss": 0.3667, "step": 93145 }, { "epoch": 3.3571196886149854, "grad_norm": 0.22731393575668335, "learning_rate": 1.2885908076890536e-05, "loss": 0.3577, "step": 93150 }, { "epoch": 3.35729988827621, "grad_norm": 0.22640764713287354, "learning_rate": 1.2883355504529865e-05, "loss": 0.3424, "step": 93155 }, { "epoch": 3.357480087937435, "grad_norm": 0.21849682927131653, "learning_rate": 1.2880803097256128e-05, "loss": 0.3742, "step": 93160 }, { "epoch": 3.357660287598659, "grad_norm": 0.20635724067687988, "learning_rate": 1.2878250855104094e-05, "loss": 0.4043, "step": 93165 }, { "epoch": 3.357840487259884, "grad_norm": 0.17877210676670074, "learning_rate": 1.2875698778108552e-05, "loss": 0.3761, "step": 93170 }, { "epoch": 3.3580206869211087, "grad_norm": 0.19038045406341553, "learning_rate": 1.287314686630427e-05, "loss": 0.3983, "step": 93175 }, { "epoch": 3.3582008865823334, "grad_norm": 0.25942298769950867, "learning_rate": 1.2870595119726015e-05, "loss": 0.42, "step": 93180 }, { "epoch": 3.3583810862435577, "grad_norm": 0.2063370645046234, "learning_rate": 1.2868043538408553e-05, "loss": 0.4054, "step": 93185 }, { "epoch": 3.3585612859047824, "grad_norm": 0.22962912917137146, "learning_rate": 1.2865492122386636e-05, "loss": 0.3537, "step": 93190 }, { "epoch": 3.358741485566007, "grad_norm": 0.21510834991931915, 
"learning_rate": 1.2862940871695062e-05, "loss": 0.3888, "step": 93195 }, { "epoch": 3.358921685227232, "grad_norm": 0.22031091153621674, "learning_rate": 1.2860389786368559e-05, "loss": 0.3752, "step": 93200 }, { "epoch": 3.3591018848884566, "grad_norm": 0.2270020842552185, "learning_rate": 1.2857838866441885e-05, "loss": 0.3664, "step": 93205 }, { "epoch": 3.359282084549681, "grad_norm": 0.22233691811561584, "learning_rate": 1.2855288111949811e-05, "loss": 0.3757, "step": 93210 }, { "epoch": 3.3594622842109056, "grad_norm": 0.25379857420921326, "learning_rate": 1.2852737522927089e-05, "loss": 0.3823, "step": 93215 }, { "epoch": 3.3596424838721304, "grad_norm": 0.19441834092140198, "learning_rate": 1.2850187099408467e-05, "loss": 0.3814, "step": 93220 }, { "epoch": 3.359822683533355, "grad_norm": 0.21130891144275665, "learning_rate": 1.2847636841428695e-05, "loss": 0.3634, "step": 93225 }, { "epoch": 3.3600028831945794, "grad_norm": 0.22752270102500916, "learning_rate": 1.2845086749022506e-05, "loss": 0.3821, "step": 93230 }, { "epoch": 3.360183082855804, "grad_norm": 0.2163986712694168, "learning_rate": 1.2842536822224666e-05, "loss": 0.3591, "step": 93235 }, { "epoch": 3.360363282517029, "grad_norm": 0.2803928852081299, "learning_rate": 1.2839987061069908e-05, "loss": 0.356, "step": 93240 }, { "epoch": 3.3605434821782536, "grad_norm": 0.26921606063842773, "learning_rate": 1.2837437465592972e-05, "loss": 0.4204, "step": 93245 }, { "epoch": 3.3607236818394783, "grad_norm": 0.2192206233739853, "learning_rate": 1.2834888035828596e-05, "loss": 0.3889, "step": 93250 }, { "epoch": 3.3609038815007026, "grad_norm": 0.26289498805999756, "learning_rate": 1.2832338771811508e-05, "loss": 0.3843, "step": 93255 }, { "epoch": 3.3610840811619274, "grad_norm": 0.2620001435279846, "learning_rate": 1.2829789673576456e-05, "loss": 0.3524, "step": 93260 }, { "epoch": 3.361264280823152, "grad_norm": 0.2221224457025528, "learning_rate": 1.2827240741158175e-05, "loss": 0.3894, "step": 
93265 }, { "epoch": 3.361444480484377, "grad_norm": 0.21585452556610107, "learning_rate": 1.282469197459136e-05, "loss": 0.3812, "step": 93270 }, { "epoch": 3.361624680145601, "grad_norm": 0.21924617886543274, "learning_rate": 1.2822143373910771e-05, "loss": 0.3903, "step": 93275 }, { "epoch": 3.361804879806826, "grad_norm": 0.1916201114654541, "learning_rate": 1.281959493915112e-05, "loss": 0.386, "step": 93280 }, { "epoch": 3.3619850794680506, "grad_norm": 0.27522167563438416, "learning_rate": 1.2817046670347121e-05, "loss": 0.3835, "step": 93285 }, { "epoch": 3.3621652791292753, "grad_norm": 0.25006136298179626, "learning_rate": 1.2814498567533523e-05, "loss": 0.3698, "step": 93290 }, { "epoch": 3.3623454787905, "grad_norm": 0.25098854303359985, "learning_rate": 1.2811950630745e-05, "loss": 0.3341, "step": 93295 }, { "epoch": 3.3625256784517243, "grad_norm": 0.25163203477859497, "learning_rate": 1.28094028600163e-05, "loss": 0.4086, "step": 93300 }, { "epoch": 3.362705878112949, "grad_norm": 0.321817547082901, "learning_rate": 1.2806855255382127e-05, "loss": 0.3856, "step": 93305 }, { "epoch": 3.362886077774174, "grad_norm": 0.22713923454284668, "learning_rate": 1.2804307816877193e-05, "loss": 0.3978, "step": 93310 }, { "epoch": 3.3630662774353985, "grad_norm": 0.24719440937042236, "learning_rate": 1.2801760544536202e-05, "loss": 0.3796, "step": 93315 }, { "epoch": 3.363246477096623, "grad_norm": 0.24784217774868011, "learning_rate": 1.2799213438393858e-05, "loss": 0.3683, "step": 93320 }, { "epoch": 3.3634266767578476, "grad_norm": 0.24816350638866425, "learning_rate": 1.2796666498484865e-05, "loss": 0.3849, "step": 93325 }, { "epoch": 3.3636068764190723, "grad_norm": 0.2373788207769394, "learning_rate": 1.2794119724843934e-05, "loss": 0.388, "step": 93330 }, { "epoch": 3.363787076080297, "grad_norm": 0.2272450029850006, "learning_rate": 1.2791573117505761e-05, "loss": 0.3771, "step": 93335 }, { "epoch": 3.3639672757415218, "grad_norm": 0.2307075560092926, 
"learning_rate": 1.2789026676505039e-05, "loss": 0.362, "step": 93340 }, { "epoch": 3.364147475402746, "grad_norm": 0.22494341433048248, "learning_rate": 1.2786480401876466e-05, "loss": 0.3934, "step": 93345 }, { "epoch": 3.364327675063971, "grad_norm": 0.2284454107284546, "learning_rate": 1.2783934293654726e-05, "loss": 0.4008, "step": 93350 }, { "epoch": 3.3645078747251955, "grad_norm": 0.2578500807285309, "learning_rate": 1.2781388351874518e-05, "loss": 0.3791, "step": 93355 }, { "epoch": 3.3646880743864203, "grad_norm": 0.23964065313339233, "learning_rate": 1.2778842576570543e-05, "loss": 0.3465, "step": 93360 }, { "epoch": 3.3648682740476445, "grad_norm": 0.23482295870780945, "learning_rate": 1.277629696777745e-05, "loss": 0.3947, "step": 93365 }, { "epoch": 3.3650484737088693, "grad_norm": 0.27758488059043884, "learning_rate": 1.277375152552996e-05, "loss": 0.3551, "step": 93370 }, { "epoch": 3.365228673370094, "grad_norm": 0.26252129673957825, "learning_rate": 1.2771206249862722e-05, "loss": 0.3843, "step": 93375 }, { "epoch": 3.3654088730313187, "grad_norm": 0.23185403645038605, "learning_rate": 1.2768661140810454e-05, "loss": 0.4336, "step": 93380 }, { "epoch": 3.3655890726925435, "grad_norm": 0.32735198736190796, "learning_rate": 1.2766116198407798e-05, "loss": 0.3843, "step": 93385 }, { "epoch": 3.365769272353768, "grad_norm": 0.2612536549568176, "learning_rate": 1.2763571422689432e-05, "loss": 0.3721, "step": 93390 }, { "epoch": 3.3659494720149925, "grad_norm": 0.22687096893787384, "learning_rate": 1.2761026813690047e-05, "loss": 0.3867, "step": 93395 }, { "epoch": 3.3661296716762172, "grad_norm": 0.2720807194709778, "learning_rate": 1.2758482371444299e-05, "loss": 0.3975, "step": 93400 }, { "epoch": 3.366309871337442, "grad_norm": 0.2100588083267212, "learning_rate": 1.2755938095986861e-05, "loss": 0.3865, "step": 93405 }, { "epoch": 3.3664900709986667, "grad_norm": 0.23490869998931885, "learning_rate": 1.2753393987352396e-05, "loss": 0.4074, "step": 
93410 }, { "epoch": 3.366670270659891, "grad_norm": 0.25652775168418884, "learning_rate": 1.2750850045575558e-05, "loss": 0.3773, "step": 93415 }, { "epoch": 3.3668504703211157, "grad_norm": 0.25951749086380005, "learning_rate": 1.2748306270691024e-05, "loss": 0.3564, "step": 93420 }, { "epoch": 3.3670306699823405, "grad_norm": 0.2929254174232483, "learning_rate": 1.2745762662733445e-05, "loss": 0.3865, "step": 93425 }, { "epoch": 3.367210869643565, "grad_norm": 0.24950219690799713, "learning_rate": 1.274321922173748e-05, "loss": 0.3821, "step": 93430 }, { "epoch": 3.36739106930479, "grad_norm": 0.2351795881986618, "learning_rate": 1.2740675947737777e-05, "loss": 0.4021, "step": 93435 }, { "epoch": 3.367571268966014, "grad_norm": 0.267691969871521, "learning_rate": 1.273813284076898e-05, "loss": 0.3983, "step": 93440 }, { "epoch": 3.367751468627239, "grad_norm": 0.18753398954868317, "learning_rate": 1.2735589900865762e-05, "loss": 0.3566, "step": 93445 }, { "epoch": 3.3679316682884637, "grad_norm": 0.243241548538208, "learning_rate": 1.2733047128062756e-05, "loss": 0.3818, "step": 93450 }, { "epoch": 3.3681118679496884, "grad_norm": 0.28631412982940674, "learning_rate": 1.273050452239461e-05, "loss": 0.3818, "step": 93455 }, { "epoch": 3.3682920676109127, "grad_norm": 0.22822198271751404, "learning_rate": 1.272796208389596e-05, "loss": 0.3824, "step": 93460 }, { "epoch": 3.3684722672721374, "grad_norm": 0.262082040309906, "learning_rate": 1.2725419812601453e-05, "loss": 0.377, "step": 93465 }, { "epoch": 3.368652466933362, "grad_norm": 0.21219515800476074, "learning_rate": 1.2722877708545717e-05, "loss": 0.3848, "step": 93470 }, { "epoch": 3.368832666594587, "grad_norm": 0.2659531533718109, "learning_rate": 1.272033577176341e-05, "loss": 0.3382, "step": 93475 }, { "epoch": 3.3690128662558116, "grad_norm": 0.2504463791847229, "learning_rate": 1.2717794002289134e-05, "loss": 0.3823, "step": 93480 }, { "epoch": 3.369193065917036, "grad_norm": 0.27462002635002136, 
"learning_rate": 1.2715252400157548e-05, "loss": 0.4201, "step": 93485 }, { "epoch": 3.3693732655782607, "grad_norm": 0.2492084950208664, "learning_rate": 1.2712710965403269e-05, "loss": 0.3777, "step": 93490 }, { "epoch": 3.3695534652394854, "grad_norm": 0.2124946117401123, "learning_rate": 1.2710169698060922e-05, "loss": 0.3578, "step": 93495 }, { "epoch": 3.36973366490071, "grad_norm": 0.24669291079044342, "learning_rate": 1.2707628598165137e-05, "loss": 0.4265, "step": 93500 }, { "epoch": 3.36973366490071, "eval_loss": 0.4303239583969116, "eval_runtime": 3.5298, "eval_samples_per_second": 28.331, "eval_steps_per_second": 7.083, "step": 93500 }, { "epoch": 3.3699138645619344, "grad_norm": 0.22059577703475952, "learning_rate": 1.2705087665750531e-05, "loss": 0.3647, "step": 93505 }, { "epoch": 3.370094064223159, "grad_norm": 0.20390857756137848, "learning_rate": 1.2702546900851715e-05, "loss": 0.3714, "step": 93510 }, { "epoch": 3.370274263884384, "grad_norm": 0.22258242964744568, "learning_rate": 1.2700006303503325e-05, "loss": 0.3766, "step": 93515 }, { "epoch": 3.3704544635456086, "grad_norm": 0.26116088032722473, "learning_rate": 1.2697465873739966e-05, "loss": 0.403, "step": 93520 }, { "epoch": 3.3706346632068334, "grad_norm": 0.2494215965270996, "learning_rate": 1.269492561159626e-05, "loss": 0.3681, "step": 93525 }, { "epoch": 3.3708148628680576, "grad_norm": 0.24651680886745453, "learning_rate": 1.2692385517106802e-05, "loss": 0.3774, "step": 93530 }, { "epoch": 3.3709950625292824, "grad_norm": 0.22229456901550293, "learning_rate": 1.2689845590306204e-05, "loss": 0.3997, "step": 93535 }, { "epoch": 3.371175262190507, "grad_norm": 0.2545780539512634, "learning_rate": 1.2687305831229084e-05, "loss": 0.3661, "step": 93540 }, { "epoch": 3.371355461851732, "grad_norm": 0.19883494079113007, "learning_rate": 1.268476623991005e-05, "loss": 0.3843, "step": 93545 }, { "epoch": 3.371535661512956, "grad_norm": 0.19383443892002106, "learning_rate": 
1.2682226816383668e-05, "loss": 0.3768, "step": 93550 }, { "epoch": 3.371715861174181, "grad_norm": 0.2644541263580322, "learning_rate": 1.2679687560684578e-05, "loss": 0.3745, "step": 93555 }, { "epoch": 3.3718960608354056, "grad_norm": 0.24296367168426514, "learning_rate": 1.2677148472847344e-05, "loss": 0.4342, "step": 93560 }, { "epoch": 3.3720762604966303, "grad_norm": 0.23606744408607483, "learning_rate": 1.2674609552906586e-05, "loss": 0.3628, "step": 93565 }, { "epoch": 3.372256460157855, "grad_norm": 0.20120225846767426, "learning_rate": 1.2672070800896899e-05, "loss": 0.3939, "step": 93570 }, { "epoch": 3.3724366598190794, "grad_norm": 0.20473100244998932, "learning_rate": 1.2669532216852842e-05, "loss": 0.3502, "step": 93575 }, { "epoch": 3.372616859480304, "grad_norm": 0.2535666823387146, "learning_rate": 1.2666993800809026e-05, "loss": 0.3759, "step": 93580 }, { "epoch": 3.372797059141529, "grad_norm": 0.2333008199930191, "learning_rate": 1.2664455552800024e-05, "loss": 0.3961, "step": 93585 }, { "epoch": 3.3729772588027536, "grad_norm": 0.1824401170015335, "learning_rate": 1.2661917472860449e-05, "loss": 0.3831, "step": 93590 }, { "epoch": 3.373157458463978, "grad_norm": 0.26639214158058167, "learning_rate": 1.2659379561024848e-05, "loss": 0.3991, "step": 93595 }, { "epoch": 3.3733376581252026, "grad_norm": 0.2088332623243332, "learning_rate": 1.2656841817327803e-05, "loss": 0.3711, "step": 93600 }, { "epoch": 3.3735178577864273, "grad_norm": 0.24687723815441132, "learning_rate": 1.2654304241803904e-05, "loss": 0.4086, "step": 93605 }, { "epoch": 3.373698057447652, "grad_norm": 0.2668485641479492, "learning_rate": 1.2651766834487724e-05, "loss": 0.3932, "step": 93610 }, { "epoch": 3.373878257108877, "grad_norm": 0.26824715733528137, "learning_rate": 1.2649229595413831e-05, "loss": 0.4194, "step": 93615 }, { "epoch": 3.3740584567701015, "grad_norm": 0.27278998494148254, "learning_rate": 1.2646692524616788e-05, "loss": 0.3771, "step": 93620 }, { 
"epoch": 3.374238656431326, "grad_norm": 0.2048804759979248, "learning_rate": 1.2644155622131163e-05, "loss": 0.3519, "step": 93625 }, { "epoch": 3.3744188560925505, "grad_norm": 0.2662324011325836, "learning_rate": 1.2641618887991532e-05, "loss": 0.3949, "step": 93630 }, { "epoch": 3.3745990557537753, "grad_norm": 0.24462181329727173, "learning_rate": 1.2639082322232455e-05, "loss": 0.3855, "step": 93635 }, { "epoch": 3.374779255415, "grad_norm": 0.2650309205055237, "learning_rate": 1.2636545924888485e-05, "loss": 0.3813, "step": 93640 }, { "epoch": 3.3749594550762243, "grad_norm": 0.26425033807754517, "learning_rate": 1.2634009695994182e-05, "loss": 0.3943, "step": 93645 }, { "epoch": 3.375139654737449, "grad_norm": 0.21526920795440674, "learning_rate": 1.2631473635584107e-05, "loss": 0.389, "step": 93650 }, { "epoch": 3.3753198543986738, "grad_norm": 0.2526695430278778, "learning_rate": 1.2628937743692795e-05, "loss": 0.383, "step": 93655 }, { "epoch": 3.3755000540598985, "grad_norm": 0.26634764671325684, "learning_rate": 1.2626402020354832e-05, "loss": 0.3667, "step": 93660 }, { "epoch": 3.3756802537211232, "grad_norm": 0.21848170459270477, "learning_rate": 1.2623866465604727e-05, "loss": 0.3813, "step": 93665 }, { "epoch": 3.3758604533823475, "grad_norm": 0.25666651129722595, "learning_rate": 1.2621331079477056e-05, "loss": 0.3716, "step": 93670 }, { "epoch": 3.3760406530435723, "grad_norm": 0.2850119471549988, "learning_rate": 1.261879586200635e-05, "loss": 0.3825, "step": 93675 }, { "epoch": 3.376220852704797, "grad_norm": 0.22654448449611664, "learning_rate": 1.2616260813227155e-05, "loss": 0.3777, "step": 93680 }, { "epoch": 3.3764010523660217, "grad_norm": 0.2590138912200928, "learning_rate": 1.2613725933174009e-05, "loss": 0.3641, "step": 93685 }, { "epoch": 3.376581252027246, "grad_norm": 0.20931139588356018, "learning_rate": 1.2611191221881449e-05, "loss": 0.3663, "step": 93690 }, { "epoch": 3.3767614516884707, "grad_norm": 0.2435794174671173, 
"learning_rate": 1.2608656679384002e-05, "loss": 0.3863, "step": 93695 }, { "epoch": 3.3769416513496955, "grad_norm": 0.26772207021713257, "learning_rate": 1.2606122305716215e-05, "loss": 0.3818, "step": 93700 }, { "epoch": 3.37712185101092, "grad_norm": 0.21198633313179016, "learning_rate": 1.2603588100912611e-05, "loss": 0.3792, "step": 93705 }, { "epoch": 3.377302050672145, "grad_norm": 0.18529768288135529, "learning_rate": 1.2601054065007722e-05, "loss": 0.3644, "step": 93710 }, { "epoch": 3.3774822503333692, "grad_norm": 0.24963034689426422, "learning_rate": 1.2598520198036074e-05, "loss": 0.3925, "step": 93715 }, { "epoch": 3.377662449994594, "grad_norm": 0.27459418773651123, "learning_rate": 1.259598650003217e-05, "loss": 0.3957, "step": 93720 }, { "epoch": 3.3778426496558187, "grad_norm": 0.23690108954906464, "learning_rate": 1.2593452971030564e-05, "loss": 0.3768, "step": 93725 }, { "epoch": 3.3780228493170434, "grad_norm": 0.25995904207229614, "learning_rate": 1.2590919611065757e-05, "loss": 0.3882, "step": 93730 }, { "epoch": 3.3782030489782677, "grad_norm": 0.2160549908876419, "learning_rate": 1.2588386420172269e-05, "loss": 0.3617, "step": 93735 }, { "epoch": 3.3783832486394925, "grad_norm": 0.20487286150455475, "learning_rate": 1.2585853398384612e-05, "loss": 0.3748, "step": 93740 }, { "epoch": 3.378563448300717, "grad_norm": 0.23326444625854492, "learning_rate": 1.258332054573729e-05, "loss": 0.3864, "step": 93745 }, { "epoch": 3.378743647961942, "grad_norm": 0.23716379702091217, "learning_rate": 1.2580787862264832e-05, "loss": 0.3846, "step": 93750 }, { "epoch": 3.3789238476231667, "grad_norm": 0.23601892590522766, "learning_rate": 1.2578255348001741e-05, "loss": 0.371, "step": 93755 }, { "epoch": 3.379104047284391, "grad_norm": 0.2128582000732422, "learning_rate": 1.2575723002982498e-05, "loss": 0.3677, "step": 93760 }, { "epoch": 3.3792842469456157, "grad_norm": 0.24130862951278687, "learning_rate": 1.2573190827241637e-05, "loss": 0.3728, "step": 
93765 }, { "epoch": 3.3794644466068404, "grad_norm": 0.24159899353981018, "learning_rate": 1.2570658820813633e-05, "loss": 0.3939, "step": 93770 }, { "epoch": 3.379644646268065, "grad_norm": 0.2206299901008606, "learning_rate": 1.2568126983733015e-05, "loss": 0.3902, "step": 93775 }, { "epoch": 3.3798248459292894, "grad_norm": 0.20361188054084778, "learning_rate": 1.256559531603425e-05, "loss": 0.3666, "step": 93780 }, { "epoch": 3.380005045590514, "grad_norm": 0.22459423542022705, "learning_rate": 1.2563063817751834e-05, "loss": 0.3898, "step": 93785 }, { "epoch": 3.380185245251739, "grad_norm": 0.3262191414833069, "learning_rate": 1.2560532488920274e-05, "loss": 0.3945, "step": 93790 }, { "epoch": 3.3803654449129636, "grad_norm": 0.23222355544567108, "learning_rate": 1.2558001329574049e-05, "loss": 0.3799, "step": 93795 }, { "epoch": 3.3805456445741884, "grad_norm": 0.24489018321037292, "learning_rate": 1.255547033974765e-05, "loss": 0.4148, "step": 93800 }, { "epoch": 3.3807258442354127, "grad_norm": 0.246272012591362, "learning_rate": 1.2552939519475554e-05, "loss": 0.3642, "step": 93805 }, { "epoch": 3.3809060438966374, "grad_norm": 0.2430204451084137, "learning_rate": 1.255040886879224e-05, "loss": 0.3893, "step": 93810 }, { "epoch": 3.381086243557862, "grad_norm": 0.28567835688591003, "learning_rate": 1.2547878387732203e-05, "loss": 0.4011, "step": 93815 }, { "epoch": 3.381266443219087, "grad_norm": 0.28451603651046753, "learning_rate": 1.2545348076329916e-05, "loss": 0.3964, "step": 93820 }, { "epoch": 3.381446642880311, "grad_norm": 0.20606903731822968, "learning_rate": 1.254281793461985e-05, "loss": 0.3684, "step": 93825 }, { "epoch": 3.381626842541536, "grad_norm": 0.2716697156429291, "learning_rate": 1.2540287962636473e-05, "loss": 0.3818, "step": 93830 }, { "epoch": 3.3818070422027606, "grad_norm": 0.2406664490699768, "learning_rate": 1.2537758160414265e-05, "loss": 0.3719, "step": 93835 }, { "epoch": 3.3819872418639854, "grad_norm": 
0.23287048935890198, "learning_rate": 1.2535228527987674e-05, "loss": 0.373, "step": 93840 }, { "epoch": 3.38216744152521, "grad_norm": 0.20795047283172607, "learning_rate": 1.2532699065391206e-05, "loss": 0.3667, "step": 93845 }, { "epoch": 3.3823476411864344, "grad_norm": 0.23873348534107208, "learning_rate": 1.2530169772659278e-05, "loss": 0.3937, "step": 93850 }, { "epoch": 3.382527840847659, "grad_norm": 0.3395892083644867, "learning_rate": 1.2527640649826384e-05, "loss": 0.408, "step": 93855 }, { "epoch": 3.382708040508884, "grad_norm": 0.2514219582080841, "learning_rate": 1.2525111696926967e-05, "loss": 0.401, "step": 93860 }, { "epoch": 3.3828882401701086, "grad_norm": 0.23013897240161896, "learning_rate": 1.2522582913995484e-05, "loss": 0.3651, "step": 93865 }, { "epoch": 3.383068439831333, "grad_norm": 0.2501782476902008, "learning_rate": 1.2520054301066409e-05, "loss": 0.3569, "step": 93870 }, { "epoch": 3.3832486394925576, "grad_norm": 0.2740953266620636, "learning_rate": 1.2517525858174166e-05, "loss": 0.3837, "step": 93875 }, { "epoch": 3.3834288391537823, "grad_norm": 0.2235153615474701, "learning_rate": 1.251499758535321e-05, "loss": 0.344, "step": 93880 }, { "epoch": 3.383609038815007, "grad_norm": 0.21870172023773193, "learning_rate": 1.2512469482638006e-05, "loss": 0.3887, "step": 93885 }, { "epoch": 3.383789238476232, "grad_norm": 0.21588364243507385, "learning_rate": 1.2509941550062987e-05, "loss": 0.3651, "step": 93890 }, { "epoch": 3.3839694381374565, "grad_norm": 0.24432948231697083, "learning_rate": 1.2507413787662592e-05, "loss": 0.3971, "step": 93895 }, { "epoch": 3.384149637798681, "grad_norm": 0.23872052133083344, "learning_rate": 1.2504886195471272e-05, "loss": 0.3919, "step": 93900 }, { "epoch": 3.3843298374599056, "grad_norm": 0.25494420528411865, "learning_rate": 1.2502358773523443e-05, "loss": 0.3789, "step": 93905 }, { "epoch": 3.3845100371211303, "grad_norm": 0.2569286525249481, "learning_rate": 1.2499831521853567e-05, "loss": 
0.3544, "step": 93910 }, { "epoch": 3.384690236782355, "grad_norm": 0.2678029537200928, "learning_rate": 1.2497304440496068e-05, "loss": 0.4016, "step": 93915 }, { "epoch": 3.3848704364435793, "grad_norm": 0.21244113147258759, "learning_rate": 1.2494777529485374e-05, "loss": 0.3636, "step": 93920 }, { "epoch": 3.385050636104804, "grad_norm": 0.21646173298358917, "learning_rate": 1.2492250788855916e-05, "loss": 0.3687, "step": 93925 }, { "epoch": 3.385230835766029, "grad_norm": 0.214036226272583, "learning_rate": 1.248972421864211e-05, "loss": 0.3638, "step": 93930 }, { "epoch": 3.3854110354272535, "grad_norm": 0.19108587503433228, "learning_rate": 1.2487197818878399e-05, "loss": 0.3568, "step": 93935 }, { "epoch": 3.3855912350884783, "grad_norm": 0.2635771632194519, "learning_rate": 1.2484671589599204e-05, "loss": 0.3701, "step": 93940 }, { "epoch": 3.3857714347497025, "grad_norm": 0.27943238615989685, "learning_rate": 1.2482145530838918e-05, "loss": 0.3563, "step": 93945 }, { "epoch": 3.3859516344109273, "grad_norm": 0.2727324366569519, "learning_rate": 1.2479619642631985e-05, "loss": 0.3783, "step": 93950 }, { "epoch": 3.386131834072152, "grad_norm": 0.28211259841918945, "learning_rate": 1.2477093925012808e-05, "loss": 0.4027, "step": 93955 }, { "epoch": 3.3863120337333767, "grad_norm": 0.24526701867580414, "learning_rate": 1.2474568378015802e-05, "loss": 0.3788, "step": 93960 }, { "epoch": 3.386492233394601, "grad_norm": 0.2575979232788086, "learning_rate": 1.247204300167538e-05, "loss": 0.358, "step": 93965 }, { "epoch": 3.3866724330558258, "grad_norm": 0.261115700006485, "learning_rate": 1.2469517796025934e-05, "loss": 0.3946, "step": 93970 }, { "epoch": 3.3868526327170505, "grad_norm": 0.2626388370990753, "learning_rate": 1.2466992761101893e-05, "loss": 0.3957, "step": 93975 }, { "epoch": 3.3870328323782752, "grad_norm": 0.2778167724609375, "learning_rate": 1.2464467896937649e-05, "loss": 0.3642, "step": 93980 }, { "epoch": 3.3872130320395, "grad_norm": 
0.2699746787548065, "learning_rate": 1.2461943203567602e-05, "loss": 0.3913, "step": 93985 }, { "epoch": 3.3873932317007243, "grad_norm": 0.2563049793243408, "learning_rate": 1.2459418681026152e-05, "loss": 0.3911, "step": 93990 }, { "epoch": 3.387573431361949, "grad_norm": 0.27618205547332764, "learning_rate": 1.2456894329347685e-05, "loss": 0.4132, "step": 93995 }, { "epoch": 3.3877536310231737, "grad_norm": 0.19964580237865448, "learning_rate": 1.2454370148566613e-05, "loss": 0.3732, "step": 94000 }, { "epoch": 3.3877536310231737, "eval_loss": 0.43040722608566284, "eval_runtime": 3.5294, "eval_samples_per_second": 28.334, "eval_steps_per_second": 7.083, "step": 94000 }, { "epoch": 3.3879338306843985, "grad_norm": 0.2523888647556305, "learning_rate": 1.2451846138717321e-05, "loss": 0.3964, "step": 94005 }, { "epoch": 3.3881140303456228, "grad_norm": 0.2638121247291565, "learning_rate": 1.2449322299834196e-05, "loss": 0.4108, "step": 94010 }, { "epoch": 3.3882942300068475, "grad_norm": 0.2701154053211212, "learning_rate": 1.2446798631951623e-05, "loss": 0.3477, "step": 94015 }, { "epoch": 3.388474429668072, "grad_norm": 0.2638227641582489, "learning_rate": 1.2444275135103988e-05, "loss": 0.4001, "step": 94020 }, { "epoch": 3.388654629329297, "grad_norm": 0.20842356979846954, "learning_rate": 1.2441751809325666e-05, "loss": 0.3629, "step": 94025 }, { "epoch": 3.3888348289905217, "grad_norm": 0.24552541971206665, "learning_rate": 1.2439228654651053e-05, "loss": 0.381, "step": 94030 }, { "epoch": 3.389015028651746, "grad_norm": 0.22910676896572113, "learning_rate": 1.2436705671114515e-05, "loss": 0.3913, "step": 94035 }, { "epoch": 3.3891952283129707, "grad_norm": 0.1990845948457718, "learning_rate": 1.2434182858750431e-05, "loss": 0.4065, "step": 94040 }, { "epoch": 3.3893754279741954, "grad_norm": 0.24163967370986938, "learning_rate": 1.2431660217593175e-05, "loss": 0.4221, "step": 94045 }, { "epoch": 3.38955562763542, "grad_norm": 0.20850235223770142, 
"learning_rate": 1.2429137747677105e-05, "loss": 0.3812, "step": 94050 }, { "epoch": 3.3897358272966445, "grad_norm": 0.22851799428462982, "learning_rate": 1.2426615449036619e-05, "loss": 0.4011, "step": 94055 }, { "epoch": 3.389916026957869, "grad_norm": 0.24421176314353943, "learning_rate": 1.2424093321706052e-05, "loss": 0.432, "step": 94060 }, { "epoch": 3.390096226619094, "grad_norm": 0.20470160245895386, "learning_rate": 1.242157136571977e-05, "loss": 0.4326, "step": 94065 }, { "epoch": 3.3902764262803187, "grad_norm": 0.2451028823852539, "learning_rate": 1.2419049581112152e-05, "loss": 0.329, "step": 94070 }, { "epoch": 3.3904566259415434, "grad_norm": 0.20993046462535858, "learning_rate": 1.2416527967917548e-05, "loss": 0.3511, "step": 94075 }, { "epoch": 3.3906368256027677, "grad_norm": 0.2813032567501068, "learning_rate": 1.2414006526170312e-05, "loss": 0.3764, "step": 94080 }, { "epoch": 3.3908170252639924, "grad_norm": 0.22048485279083252, "learning_rate": 1.2411485255904807e-05, "loss": 0.3629, "step": 94085 }, { "epoch": 3.390997224925217, "grad_norm": 0.2146688997745514, "learning_rate": 1.2408964157155362e-05, "loss": 0.3739, "step": 94090 }, { "epoch": 3.391177424586442, "grad_norm": 0.26328352093696594, "learning_rate": 1.2406443229956355e-05, "loss": 0.4042, "step": 94095 }, { "epoch": 3.391357624247666, "grad_norm": 0.22812364995479584, "learning_rate": 1.240392247434212e-05, "loss": 0.3294, "step": 94100 }, { "epoch": 3.391537823908891, "grad_norm": 0.19867856800556183, "learning_rate": 1.2401401890347003e-05, "loss": 0.4121, "step": 94105 }, { "epoch": 3.3917180235701156, "grad_norm": 0.24459275603294373, "learning_rate": 1.2398881478005345e-05, "loss": 0.3386, "step": 94110 }, { "epoch": 3.3918982232313404, "grad_norm": 0.2276408076286316, "learning_rate": 1.2396361237351478e-05, "loss": 0.3889, "step": 94115 }, { "epoch": 3.392078422892565, "grad_norm": 0.1976245641708374, "learning_rate": 1.2393841168419759e-05, "loss": 0.4016, "step": 
94120 }, { "epoch": 3.39225862255379, "grad_norm": 0.2941203713417053, "learning_rate": 1.2391321271244524e-05, "loss": 0.3932, "step": 94125 }, { "epoch": 3.392438822215014, "grad_norm": 0.3133787214756012, "learning_rate": 1.2388801545860076e-05, "loss": 0.3873, "step": 94130 }, { "epoch": 3.392619021876239, "grad_norm": 0.2651798725128174, "learning_rate": 1.2386281992300775e-05, "loss": 0.4417, "step": 94135 }, { "epoch": 3.3927992215374636, "grad_norm": 0.23838862776756287, "learning_rate": 1.2383762610600938e-05, "loss": 0.3781, "step": 94140 }, { "epoch": 3.3929794211986883, "grad_norm": 0.2366458624601364, "learning_rate": 1.2381243400794885e-05, "loss": 0.3972, "step": 94145 }, { "epoch": 3.3931596208599126, "grad_norm": 0.2220826894044876, "learning_rate": 1.2378724362916966e-05, "loss": 0.4127, "step": 94150 }, { "epoch": 3.3933398205211374, "grad_norm": 0.20894181728363037, "learning_rate": 1.2376205497001465e-05, "loss": 0.3961, "step": 94155 }, { "epoch": 3.393520020182362, "grad_norm": 0.24846115708351135, "learning_rate": 1.2373686803082728e-05, "loss": 0.3631, "step": 94160 }, { "epoch": 3.393700219843587, "grad_norm": 0.20306502282619476, "learning_rate": 1.2371168281195067e-05, "loss": 0.3865, "step": 94165 }, { "epoch": 3.3938804195048116, "grad_norm": 0.222940593957901, "learning_rate": 1.2368649931372791e-05, "loss": 0.3733, "step": 94170 }, { "epoch": 3.394060619166036, "grad_norm": 0.2514864206314087, "learning_rate": 1.2366131753650214e-05, "loss": 0.4155, "step": 94175 }, { "epoch": 3.3942408188272606, "grad_norm": 0.2028576135635376, "learning_rate": 1.2363613748061639e-05, "loss": 0.3471, "step": 94180 }, { "epoch": 3.3944210184884853, "grad_norm": 0.21311317384243011, "learning_rate": 1.2361095914641388e-05, "loss": 0.3696, "step": 94185 }, { "epoch": 3.39460121814971, "grad_norm": 0.21538734436035156, "learning_rate": 1.2358578253423757e-05, "loss": 0.3949, "step": 94190 }, { "epoch": 3.3947814178109343, "grad_norm": 
0.25031110644340515, "learning_rate": 1.235606076444305e-05, "loss": 0.3706, "step": 94195 }, { "epoch": 3.394961617472159, "grad_norm": 0.1601521223783493, "learning_rate": 1.235354344773357e-05, "loss": 0.3664, "step": 94200 }, { "epoch": 3.395141817133384, "grad_norm": 0.24917049705982208, "learning_rate": 1.235102630332961e-05, "loss": 0.37, "step": 94205 }, { "epoch": 3.3953220167946085, "grad_norm": 0.24421679973602295, "learning_rate": 1.234850933126546e-05, "loss": 0.3709, "step": 94210 }, { "epoch": 3.3955022164558333, "grad_norm": 0.20778076350688934, "learning_rate": 1.2345992531575426e-05, "loss": 0.4043, "step": 94215 }, { "epoch": 3.3956824161170576, "grad_norm": 0.22638878226280212, "learning_rate": 1.2343475904293797e-05, "loss": 0.3479, "step": 94220 }, { "epoch": 3.3958626157782823, "grad_norm": 0.2236730009317398, "learning_rate": 1.2340959449454859e-05, "loss": 0.3658, "step": 94225 }, { "epoch": 3.396042815439507, "grad_norm": 0.22104784846305847, "learning_rate": 1.2338443167092898e-05, "loss": 0.3816, "step": 94230 }, { "epoch": 3.3962230151007318, "grad_norm": 0.26684364676475525, "learning_rate": 1.2335927057242185e-05, "loss": 0.3682, "step": 94235 }, { "epoch": 3.396403214761956, "grad_norm": 0.22945067286491394, "learning_rate": 1.2333411119937036e-05, "loss": 0.3779, "step": 94240 }, { "epoch": 3.396583414423181, "grad_norm": 0.18376512825489044, "learning_rate": 1.2330895355211697e-05, "loss": 0.3449, "step": 94245 }, { "epoch": 3.3967636140844055, "grad_norm": 0.24991095066070557, "learning_rate": 1.2328379763100445e-05, "loss": 0.4067, "step": 94250 }, { "epoch": 3.3969438137456303, "grad_norm": 0.2116982489824295, "learning_rate": 1.2325864343637577e-05, "loss": 0.3399, "step": 94255 }, { "epoch": 3.397124013406855, "grad_norm": 0.23185938596725464, "learning_rate": 1.2323349096857354e-05, "loss": 0.378, "step": 94260 }, { "epoch": 3.3973042130680793, "grad_norm": 0.21932083368301392, "learning_rate": 1.2320834022794045e-05, "loss": 
0.3541, "step": 94265 }, { "epoch": 3.397484412729304, "grad_norm": 0.20772342383861542, "learning_rate": 1.2318319121481917e-05, "loss": 0.3874, "step": 94270 }, { "epoch": 3.3976646123905287, "grad_norm": 0.2540047764778137, "learning_rate": 1.2315804392955228e-05, "loss": 0.4273, "step": 94275 }, { "epoch": 3.3978448120517535, "grad_norm": 0.25412896275520325, "learning_rate": 1.2313289837248254e-05, "loss": 0.3906, "step": 94280 }, { "epoch": 3.3980250117129778, "grad_norm": 0.2159910500049591, "learning_rate": 1.2310775454395252e-05, "loss": 0.3793, "step": 94285 }, { "epoch": 3.3982052113742025, "grad_norm": 0.23486000299453735, "learning_rate": 1.2308261244430477e-05, "loss": 0.3846, "step": 94290 }, { "epoch": 3.3983854110354272, "grad_norm": 0.2207733690738678, "learning_rate": 1.2305747207388187e-05, "loss": 0.3535, "step": 94295 }, { "epoch": 3.398565610696652, "grad_norm": 0.22273029386997223, "learning_rate": 1.2303233343302623e-05, "loss": 0.3769, "step": 94300 }, { "epoch": 3.3987458103578767, "grad_norm": 0.2068636119365692, "learning_rate": 1.2300719652208057e-05, "loss": 0.355, "step": 94305 }, { "epoch": 3.398926010019101, "grad_norm": 0.2813783884048462, "learning_rate": 1.2298206134138726e-05, "loss": 0.3902, "step": 94310 }, { "epoch": 3.3991062096803257, "grad_norm": 0.2261374592781067, "learning_rate": 1.2295692789128877e-05, "loss": 0.3624, "step": 94315 }, { "epoch": 3.3992864093415505, "grad_norm": 0.21216978132724762, "learning_rate": 1.2293179617212755e-05, "loss": 0.3872, "step": 94320 }, { "epoch": 3.399466609002775, "grad_norm": 0.22973038256168365, "learning_rate": 1.22906666184246e-05, "loss": 0.386, "step": 94325 }, { "epoch": 3.3996468086639995, "grad_norm": 0.18079350888729095, "learning_rate": 1.2288153792798642e-05, "loss": 0.3672, "step": 94330 }, { "epoch": 3.399827008325224, "grad_norm": 0.2188836932182312, "learning_rate": 1.2285641140369147e-05, "loss": 0.4015, "step": 94335 }, { "epoch": 3.400007207986449, "grad_norm": 
0.2155511975288391, "learning_rate": 1.2283128661170313e-05, "loss": 0.358, "step": 94340 }, { "epoch": 3.4001874076476737, "grad_norm": 0.2768744230270386, "learning_rate": 1.2280616355236397e-05, "loss": 0.4151, "step": 94345 }, { "epoch": 3.4003676073088984, "grad_norm": 0.20749439299106598, "learning_rate": 1.2278104222601618e-05, "loss": 0.3595, "step": 94350 }, { "epoch": 3.4005478069701227, "grad_norm": 0.28811633586883545, "learning_rate": 1.227559226330021e-05, "loss": 0.4218, "step": 94355 }, { "epoch": 3.4007280066313474, "grad_norm": 0.24326983094215393, "learning_rate": 1.2273080477366391e-05, "loss": 0.3668, "step": 94360 }, { "epoch": 3.400908206292572, "grad_norm": 0.20822405815124512, "learning_rate": 1.2270568864834378e-05, "loss": 0.3486, "step": 94365 }, { "epoch": 3.401088405953797, "grad_norm": 0.21231502294540405, "learning_rate": 1.2268057425738408e-05, "loss": 0.3724, "step": 94370 }, { "epoch": 3.401268605615021, "grad_norm": 0.21015721559524536, "learning_rate": 1.2265546160112692e-05, "loss": 0.3906, "step": 94375 }, { "epoch": 3.401448805276246, "grad_norm": 0.20890061557292938, "learning_rate": 1.2263035067991443e-05, "loss": 0.3918, "step": 94380 }, { "epoch": 3.4016290049374707, "grad_norm": 0.23352204263210297, "learning_rate": 1.2260524149408875e-05, "loss": 0.3573, "step": 94385 }, { "epoch": 3.4018092045986954, "grad_norm": 0.24678070843219757, "learning_rate": 1.2258013404399202e-05, "loss": 0.3975, "step": 94390 }, { "epoch": 3.40198940425992, "grad_norm": 0.27147865295410156, "learning_rate": 1.2255502832996619e-05, "loss": 0.3848, "step": 94395 }, { "epoch": 3.402169603921145, "grad_norm": 0.2716798484325409, "learning_rate": 1.225299243523535e-05, "loss": 0.3492, "step": 94400 }, { "epoch": 3.402349803582369, "grad_norm": 0.2307744324207306, "learning_rate": 1.2250482211149591e-05, "loss": 0.3821, "step": 94405 }, { "epoch": 3.402530003243594, "grad_norm": 0.2626960575580597, "learning_rate": 1.2247972160773544e-05, "loss": 
0.3784, "step": 94410 }, { "epoch": 3.4027102029048186, "grad_norm": 0.29123780131340027, "learning_rate": 1.224546228414141e-05, "loss": 0.3983, "step": 94415 }, { "epoch": 3.4028904025660434, "grad_norm": 0.23781827092170715, "learning_rate": 1.224295258128737e-05, "loss": 0.4109, "step": 94420 }, { "epoch": 3.4030706022272676, "grad_norm": 0.2665836811065674, "learning_rate": 1.2240443052245651e-05, "loss": 0.399, "step": 94425 }, { "epoch": 3.4032508018884924, "grad_norm": 0.27731844782829285, "learning_rate": 1.2237933697050416e-05, "loss": 0.4108, "step": 94430 }, { "epoch": 3.403431001549717, "grad_norm": 0.20404113829135895, "learning_rate": 1.2235424515735855e-05, "loss": 0.3812, "step": 94435 }, { "epoch": 3.403611201210942, "grad_norm": 0.19817714393138885, "learning_rate": 1.2232915508336173e-05, "loss": 0.3763, "step": 94440 }, { "epoch": 3.4037914008721666, "grad_norm": 0.2466011792421341, "learning_rate": 1.2230406674885536e-05, "loss": 0.3551, "step": 94445 }, { "epoch": 3.403971600533391, "grad_norm": 0.2783677577972412, "learning_rate": 1.2227898015418154e-05, "loss": 0.3643, "step": 94450 }, { "epoch": 3.4041518001946156, "grad_norm": 0.2562623620033264, "learning_rate": 1.222538952996818e-05, "loss": 0.4023, "step": 94455 }, { "epoch": 3.4043319998558403, "grad_norm": 0.2551863193511963, "learning_rate": 1.222288121856979e-05, "loss": 0.3695, "step": 94460 }, { "epoch": 3.404512199517065, "grad_norm": 0.27139002084732056, "learning_rate": 1.222037308125718e-05, "loss": 0.3763, "step": 94465 }, { "epoch": 3.4046923991782894, "grad_norm": 0.21469560265541077, "learning_rate": 1.2217865118064512e-05, "loss": 0.3879, "step": 94470 }, { "epoch": 3.404872598839514, "grad_norm": 0.20434612035751343, "learning_rate": 1.2215357329025956e-05, "loss": 0.3651, "step": 94475 }, { "epoch": 3.405052798500739, "grad_norm": 0.26074668765068054, "learning_rate": 1.2212849714175684e-05, "loss": 0.3782, "step": 94480 }, { "epoch": 3.4052329981619636, "grad_norm": 
0.22299236059188843, "learning_rate": 1.2210342273547848e-05, "loss": 0.3674, "step": 94485 }, { "epoch": 3.4054131978231883, "grad_norm": 0.2286439836025238, "learning_rate": 1.2207835007176632e-05, "loss": 0.3969, "step": 94490 }, { "epoch": 3.4055933974844126, "grad_norm": 0.21935893595218658, "learning_rate": 1.2205327915096187e-05, "loss": 0.3621, "step": 94495 }, { "epoch": 3.4057735971456373, "grad_norm": 0.2769947946071625, "learning_rate": 1.2202820997340673e-05, "loss": 0.4064, "step": 94500 }, { "epoch": 3.4057735971456373, "eval_loss": 0.4305441975593567, "eval_runtime": 3.5328, "eval_samples_per_second": 28.307, "eval_steps_per_second": 7.077, "step": 94500 }, { "epoch": 3.405953796806862, "grad_norm": 0.22937120497226715, "learning_rate": 1.2200314253944243e-05, "loss": 0.4212, "step": 94505 }, { "epoch": 3.406133996468087, "grad_norm": 0.2228795886039734, "learning_rate": 1.2197807684941054e-05, "loss": 0.3915, "step": 94510 }, { "epoch": 3.406314196129311, "grad_norm": 0.2143542468547821, "learning_rate": 1.2195301290365248e-05, "loss": 0.38, "step": 94515 }, { "epoch": 3.406494395790536, "grad_norm": 0.22707048058509827, "learning_rate": 1.2192795070251001e-05, "loss": 0.3598, "step": 94520 }, { "epoch": 3.4066745954517605, "grad_norm": 0.24929481744766235, "learning_rate": 1.2190289024632424e-05, "loss": 0.389, "step": 94525 }, { "epoch": 3.4068547951129853, "grad_norm": 0.2553023397922516, "learning_rate": 1.2187783153543689e-05, "loss": 0.3685, "step": 94530 }, { "epoch": 3.40703499477421, "grad_norm": 0.2211211770772934, "learning_rate": 1.2185277457018926e-05, "loss": 0.3369, "step": 94535 }, { "epoch": 3.4072151944354343, "grad_norm": 0.27645251154899597, "learning_rate": 1.2182771935092277e-05, "loss": 0.3792, "step": 94540 }, { "epoch": 3.407395394096659, "grad_norm": 0.23426538705825806, "learning_rate": 1.2180266587797881e-05, "loss": 0.3623, "step": 94545 }, { "epoch": 3.4075755937578838, "grad_norm": 0.204464390873909, "learning_rate": 
1.2177761415169869e-05, "loss": 0.3999, "step": 94550 }, { "epoch": 3.4077557934191085, "grad_norm": 0.22453001141548157, "learning_rate": 1.2175256417242365e-05, "loss": 0.3882, "step": 94555 }, { "epoch": 3.407935993080333, "grad_norm": 0.22461944818496704, "learning_rate": 1.217275159404952e-05, "loss": 0.4007, "step": 94560 }, { "epoch": 3.4081161927415675, "grad_norm": 0.2279561460018158, "learning_rate": 1.2170246945625451e-05, "loss": 0.3697, "step": 94565 }, { "epoch": 3.4082963924027823, "grad_norm": 0.20913352072238922, "learning_rate": 1.2167742472004284e-05, "loss": 0.3608, "step": 94570 }, { "epoch": 3.408476592064007, "grad_norm": 0.2152101993560791, "learning_rate": 1.2165238173220139e-05, "loss": 0.3632, "step": 94575 }, { "epoch": 3.4086567917252317, "grad_norm": 0.21769098937511444, "learning_rate": 1.216273404930713e-05, "loss": 0.3927, "step": 94580 }, { "epoch": 3.408836991386456, "grad_norm": 0.22281882166862488, "learning_rate": 1.2160230100299397e-05, "loss": 0.3571, "step": 94585 }, { "epoch": 3.4090171910476808, "grad_norm": 0.20765550434589386, "learning_rate": 1.2157726326231041e-05, "loss": 0.3766, "step": 94590 }, { "epoch": 3.4091973907089055, "grad_norm": 0.23559367656707764, "learning_rate": 1.215522272713618e-05, "loss": 0.3658, "step": 94595 }, { "epoch": 3.40937759037013, "grad_norm": 0.23205064237117767, "learning_rate": 1.2152719303048919e-05, "loss": 0.4183, "step": 94600 }, { "epoch": 3.4095577900313545, "grad_norm": 0.2513408958911896, "learning_rate": 1.2150216054003361e-05, "loss": 0.3678, "step": 94605 }, { "epoch": 3.4097379896925792, "grad_norm": 0.24810358881950378, "learning_rate": 1.2147712980033629e-05, "loss": 0.384, "step": 94610 }, { "epoch": 3.409918189353804, "grad_norm": null, "learning_rate": 1.2145710646935363e-05, "loss": 0.3946, "step": 94615 }, { "epoch": 3.4100983890150287, "grad_norm": 0.2287181168794632, "learning_rate": 1.2143207888188035e-05, "loss": 0.3305, "step": 94620 }, { "epoch": 
3.4102785886762534, "grad_norm": 0.28771016001701355, "learning_rate": 1.2140705304612024e-05, "loss": 0.4085, "step": 94625 }, { "epoch": 3.4104587883374777, "grad_norm": 0.23076020181179047, "learning_rate": 1.2138202896241413e-05, "loss": 0.3834, "step": 94630 }, { "epoch": 3.4106389879987025, "grad_norm": 0.25198495388031006, "learning_rate": 1.2135700663110295e-05, "loss": 0.3626, "step": 94635 }, { "epoch": 3.410819187659927, "grad_norm": 0.19510357081890106, "learning_rate": 1.2133198605252767e-05, "loss": 0.4004, "step": 94640 }, { "epoch": 3.410999387321152, "grad_norm": 0.21469490230083466, "learning_rate": 1.2130696722702917e-05, "loss": 0.3654, "step": 94645 }, { "epoch": 3.4111795869823767, "grad_norm": 0.22288212180137634, "learning_rate": 1.212819501549482e-05, "loss": 0.3701, "step": 94650 }, { "epoch": 3.411359786643601, "grad_norm": 0.25711116194725037, "learning_rate": 1.2125693483662586e-05, "loss": 0.3539, "step": 94655 }, { "epoch": 3.4115399863048257, "grad_norm": 0.30416256189346313, "learning_rate": 1.2123192127240286e-05, "loss": 0.3982, "step": 94660 }, { "epoch": 3.4117201859660504, "grad_norm": 0.23342035710811615, "learning_rate": 1.2120690946262e-05, "loss": 0.3734, "step": 94665 }, { "epoch": 3.411900385627275, "grad_norm": 0.3036738932132721, "learning_rate": 1.2118189940761807e-05, "loss": 0.3875, "step": 94670 }, { "epoch": 3.4120805852885, "grad_norm": 0.20019620656967163, "learning_rate": 1.2115689110773771e-05, "loss": 0.3689, "step": 94675 }, { "epoch": 3.412260784949724, "grad_norm": 0.23406051099300385, "learning_rate": 1.2113188456331987e-05, "loss": 0.3826, "step": 94680 }, { "epoch": 3.412440984610949, "grad_norm": 0.1972576528787613, "learning_rate": 1.2110687977470522e-05, "loss": 0.3868, "step": 94685 }, { "epoch": 3.4126211842721736, "grad_norm": 0.21116188168525696, "learning_rate": 1.210818767422342e-05, "loss": 0.3706, "step": 94690 }, { "epoch": 3.4128013839333984, "grad_norm": 0.23035746812820435, 
"learning_rate": 1.2105687546624777e-05, "loss": 0.3828, "step": 94695 }, { "epoch": 3.4129815835946227, "grad_norm": 0.25209763646125793, "learning_rate": 1.2103187594708635e-05, "loss": 0.4037, "step": 94700 }, { "epoch": 3.4131617832558474, "grad_norm": 0.23511983454227448, "learning_rate": 1.2100687818509083e-05, "loss": 0.3718, "step": 94705 }, { "epoch": 3.413341982917072, "grad_norm": 0.2819865047931671, "learning_rate": 1.2098188218060153e-05, "loss": 0.4107, "step": 94710 }, { "epoch": 3.413522182578297, "grad_norm": 0.21473956108093262, "learning_rate": 1.2095688793395898e-05, "loss": 0.4104, "step": 94715 }, { "epoch": 3.4137023822395216, "grad_norm": 0.24196425080299377, "learning_rate": 1.2093189544550398e-05, "loss": 0.4131, "step": 94720 }, { "epoch": 3.413882581900746, "grad_norm": 0.2057386040687561, "learning_rate": 1.2090690471557689e-05, "loss": 0.3766, "step": 94725 }, { "epoch": 3.4140627815619706, "grad_norm": 0.268764466047287, "learning_rate": 1.2088191574451827e-05, "loss": 0.375, "step": 94730 }, { "epoch": 3.4142429812231954, "grad_norm": 0.27153804898262024, "learning_rate": 1.2085692853266852e-05, "loss": 0.4177, "step": 94735 }, { "epoch": 3.41442318088442, "grad_norm": 0.23466871678829193, "learning_rate": 1.2083194308036803e-05, "loss": 0.4096, "step": 94740 }, { "epoch": 3.4146033805456444, "grad_norm": 0.22995726764202118, "learning_rate": 1.2080695938795739e-05, "loss": 0.359, "step": 94745 }, { "epoch": 3.414783580206869, "grad_norm": 0.2112264633178711, "learning_rate": 1.2078197745577693e-05, "loss": 0.3687, "step": 94750 }, { "epoch": 3.414963779868094, "grad_norm": 0.19448639452457428, "learning_rate": 1.20756997284167e-05, "loss": 0.3947, "step": 94755 }, { "epoch": 3.4151439795293186, "grad_norm": 0.20815762877464294, "learning_rate": 1.2073201887346797e-05, "loss": 0.4007, "step": 94760 }, { "epoch": 3.4153241791905433, "grad_norm": 0.21343150734901428, "learning_rate": 1.2070704222402016e-05, "loss": 0.3743, "step": 
94765 }, { "epoch": 3.4155043788517676, "grad_norm": 0.2795332372188568, "learning_rate": 1.2068206733616375e-05, "loss": 0.3962, "step": 94770 }, { "epoch": 3.4156845785129923, "grad_norm": 0.26403021812438965, "learning_rate": 1.2065709421023923e-05, "loss": 0.3737, "step": 94775 }, { "epoch": 3.415864778174217, "grad_norm": 0.21119317412376404, "learning_rate": 1.2063212284658679e-05, "loss": 0.363, "step": 94780 }, { "epoch": 3.416044977835442, "grad_norm": 0.23738868534564972, "learning_rate": 1.206071532455466e-05, "loss": 0.3517, "step": 94785 }, { "epoch": 3.416225177496666, "grad_norm": 0.22581474483013153, "learning_rate": 1.2058218540745891e-05, "loss": 0.3721, "step": 94790 }, { "epoch": 3.416405377157891, "grad_norm": 0.20880122482776642, "learning_rate": 1.2055721933266381e-05, "loss": 0.3756, "step": 94795 }, { "epoch": 3.4165855768191156, "grad_norm": 0.19523420929908752, "learning_rate": 1.205322550215017e-05, "loss": 0.3655, "step": 94800 }, { "epoch": 3.4167657764803403, "grad_norm": 0.26016107201576233, "learning_rate": 1.2050729247431238e-05, "loss": 0.3844, "step": 94805 }, { "epoch": 3.416945976141565, "grad_norm": 0.21247242391109467, "learning_rate": 1.2048233169143623e-05, "loss": 0.409, "step": 94810 }, { "epoch": 3.4171261758027893, "grad_norm": 0.27394580841064453, "learning_rate": 1.204573726732132e-05, "loss": 0.438, "step": 94815 }, { "epoch": 3.417306375464014, "grad_norm": 0.22982816398143768, "learning_rate": 1.204324154199834e-05, "loss": 0.3659, "step": 94820 }, { "epoch": 3.417486575125239, "grad_norm": 0.2429938167333603, "learning_rate": 1.2040745993208685e-05, "loss": 0.374, "step": 94825 }, { "epoch": 3.4176667747864635, "grad_norm": 0.2131299376487732, "learning_rate": 1.2038250620986358e-05, "loss": 0.4089, "step": 94830 }, { "epoch": 3.417846974447688, "grad_norm": 0.2698366641998291, "learning_rate": 1.2035755425365348e-05, "loss": 0.3917, "step": 94835 }, { "epoch": 3.4180271741089125, "grad_norm": 0.22712182998657227, 
"learning_rate": 1.2033260406379668e-05, "loss": 0.3908, "step": 94840 }, { "epoch": 3.4182073737701373, "grad_norm": 0.25237366557121277, "learning_rate": 1.2030765564063306e-05, "loss": 0.4028, "step": 94845 }, { "epoch": 3.418387573431362, "grad_norm": 0.22257663309574127, "learning_rate": 1.2028270898450254e-05, "loss": 0.3829, "step": 94850 }, { "epoch": 3.4185677730925867, "grad_norm": 0.22527682781219482, "learning_rate": 1.2025776409574496e-05, "loss": 0.3725, "step": 94855 }, { "epoch": 3.418747972753811, "grad_norm": 0.24640600383281708, "learning_rate": 1.2023282097470016e-05, "loss": 0.4155, "step": 94860 }, { "epoch": 3.4189281724150358, "grad_norm": 0.2200070023536682, "learning_rate": 1.202078796217081e-05, "loss": 0.3464, "step": 94865 }, { "epoch": 3.4191083720762605, "grad_norm": 0.19475413858890533, "learning_rate": 1.2018294003710867e-05, "loss": 0.3841, "step": 94870 }, { "epoch": 3.4192885717374852, "grad_norm": 0.2146916389465332, "learning_rate": 1.2015800222124137e-05, "loss": 0.3938, "step": 94875 }, { "epoch": 3.4194687713987095, "grad_norm": 0.2292756289243698, "learning_rate": 1.2013306617444625e-05, "loss": 0.3847, "step": 94880 }, { "epoch": 3.4196489710599343, "grad_norm": 0.26337769627571106, "learning_rate": 1.2010813189706285e-05, "loss": 0.4117, "step": 94885 }, { "epoch": 3.419829170721159, "grad_norm": 0.2760777175426483, "learning_rate": 1.200831993894311e-05, "loss": 0.3819, "step": 94890 }, { "epoch": 3.4200093703823837, "grad_norm": 0.22346773743629456, "learning_rate": 1.200582686518907e-05, "loss": 0.3848, "step": 94895 }, { "epoch": 3.4201895700436085, "grad_norm": 0.2624737322330475, "learning_rate": 1.2003333968478106e-05, "loss": 0.3789, "step": 94900 }, { "epoch": 3.420369769704833, "grad_norm": 0.33500173687934875, "learning_rate": 1.200084124884421e-05, "loss": 0.3858, "step": 94905 }, { "epoch": 3.4205499693660575, "grad_norm": 0.19514542818069458, "learning_rate": 1.1998348706321333e-05, "loss": 0.3719, "step": 
94910 }, { "epoch": 3.420730169027282, "grad_norm": 0.25635966658592224, "learning_rate": 1.1995856340943439e-05, "loss": 0.3643, "step": 94915 }, { "epoch": 3.420910368688507, "grad_norm": 0.1672072410583496, "learning_rate": 1.1993364152744485e-05, "loss": 0.3837, "step": 94920 }, { "epoch": 3.4210905683497317, "grad_norm": 0.23457878828048706, "learning_rate": 1.1990872141758417e-05, "loss": 0.3667, "step": 94925 }, { "epoch": 3.421270768010956, "grad_norm": 0.23055920004844666, "learning_rate": 1.1988380308019207e-05, "loss": 0.3776, "step": 94930 }, { "epoch": 3.4214509676721807, "grad_norm": 0.24517036974430084, "learning_rate": 1.1985888651560795e-05, "loss": 0.3673, "step": 94935 }, { "epoch": 3.4216311673334054, "grad_norm": 0.22914618253707886, "learning_rate": 1.1983397172417129e-05, "loss": 0.3627, "step": 94940 }, { "epoch": 3.42181136699463, "grad_norm": 0.2351149469614029, "learning_rate": 1.1980905870622159e-05, "loss": 0.3745, "step": 94945 }, { "epoch": 3.421991566655855, "grad_norm": 0.20726028084754944, "learning_rate": 1.1978414746209826e-05, "loss": 0.3554, "step": 94950 }, { "epoch": 3.422171766317079, "grad_norm": 0.2245931476354599, "learning_rate": 1.1975923799214061e-05, "loss": 0.4048, "step": 94955 }, { "epoch": 3.422351965978304, "grad_norm": 0.2029707431793213, "learning_rate": 1.1973433029668821e-05, "loss": 0.3594, "step": 94960 }, { "epoch": 3.4225321656395287, "grad_norm": 0.22745849192142487, "learning_rate": 1.1970942437608035e-05, "loss": 0.4059, "step": 94965 }, { "epoch": 3.4227123653007534, "grad_norm": 0.283438116312027, "learning_rate": 1.1968452023065635e-05, "loss": 0.4164, "step": 94970 }, { "epoch": 3.4228925649619777, "grad_norm": 0.181619331240654, "learning_rate": 1.1965961786075556e-05, "loss": 0.3374, "step": 94975 }, { "epoch": 3.4230727646232024, "grad_norm": 0.1780719757080078, "learning_rate": 1.196347172667171e-05, "loss": 0.3894, "step": 94980 }, { "epoch": 3.423252964284427, "grad_norm": 
0.24398981034755707, "learning_rate": 1.196098184488806e-05, "loss": 0.3389, "step": 94985 }, { "epoch": 3.423433163945652, "grad_norm": 0.21179978549480438, "learning_rate": 1.1958492140758497e-05, "loss": 0.3897, "step": 94990 }, { "epoch": 3.4236133636068766, "grad_norm": 0.2665993273258209, "learning_rate": 1.1956002614316946e-05, "loss": 0.3834, "step": 94995 }, { "epoch": 3.423793563268101, "grad_norm": 0.24159680306911469, "learning_rate": 1.1953513265597344e-05, "loss": 0.3963, "step": 95000 }, { "epoch": 3.423793563268101, "eval_loss": 0.4299665093421936, "eval_runtime": 3.5329, "eval_samples_per_second": 28.305, "eval_steps_per_second": 7.076, "step": 95000 }, { "epoch": 3.4239737629293256, "grad_norm": 0.2492501586675644, "learning_rate": 1.1951024094633594e-05, "loss": 0.3966, "step": 95005 }, { "epoch": 3.4241539625905504, "grad_norm": 0.2158864438533783, "learning_rate": 1.1948535101459619e-05, "loss": 0.3756, "step": 95010 }, { "epoch": 3.424334162251775, "grad_norm": 0.20892734825611115, "learning_rate": 1.1946046286109321e-05, "loss": 0.3704, "step": 95015 }, { "epoch": 3.4245143619129994, "grad_norm": 0.2723451852798462, "learning_rate": 1.194355764861661e-05, "loss": 0.3775, "step": 95020 }, { "epoch": 3.424694561574224, "grad_norm": 0.24041932821273804, "learning_rate": 1.1941069189015406e-05, "loss": 0.3849, "step": 95025 }, { "epoch": 3.424874761235449, "grad_norm": 0.23057153820991516, "learning_rate": 1.1938580907339606e-05, "loss": 0.3625, "step": 95030 }, { "epoch": 3.4250549608966736, "grad_norm": 0.23812946677207947, "learning_rate": 1.1936092803623114e-05, "loss": 0.3765, "step": 95035 }, { "epoch": 3.4252351605578983, "grad_norm": 0.18412302434444427, "learning_rate": 1.1933604877899825e-05, "loss": 0.3616, "step": 95040 }, { "epoch": 3.4254153602191226, "grad_norm": 0.20551741123199463, "learning_rate": 1.1931117130203634e-05, "loss": 0.3636, "step": 95045 }, { "epoch": 3.4255955598803474, "grad_norm": 0.2384248822927475, 
"learning_rate": 1.192862956056845e-05, "loss": 0.399, "step": 95050 }, { "epoch": 3.425775759541572, "grad_norm": 0.27354708313941956, "learning_rate": 1.1926142169028156e-05, "loss": 0.3748, "step": 95055 }, { "epoch": 3.425955959202797, "grad_norm": 0.23982170224189758, "learning_rate": 1.1923654955616645e-05, "loss": 0.3492, "step": 95060 }, { "epoch": 3.426136158864021, "grad_norm": 0.20187516510486603, "learning_rate": 1.1921167920367805e-05, "loss": 0.3737, "step": 95065 }, { "epoch": 3.426316358525246, "grad_norm": 0.2167653739452362, "learning_rate": 1.1918681063315507e-05, "loss": 0.3887, "step": 95070 }, { "epoch": 3.4264965581864706, "grad_norm": 0.2231149524450302, "learning_rate": 1.1916194384493659e-05, "loss": 0.3895, "step": 95075 }, { "epoch": 3.4266767578476953, "grad_norm": 0.2766813337802887, "learning_rate": 1.1913707883936136e-05, "loss": 0.3737, "step": 95080 }, { "epoch": 3.42685695750892, "grad_norm": 0.2061295360326767, "learning_rate": 1.1911221561676795e-05, "loss": 0.3716, "step": 95085 }, { "epoch": 3.4270371571701443, "grad_norm": 0.21205422282218933, "learning_rate": 1.1908735417749534e-05, "loss": 0.3941, "step": 95090 }, { "epoch": 3.427217356831369, "grad_norm": 0.2193949669599533, "learning_rate": 1.1906249452188217e-05, "loss": 0.3787, "step": 95095 }, { "epoch": 3.427397556492594, "grad_norm": 0.1897018849849701, "learning_rate": 1.1903763665026719e-05, "loss": 0.3418, "step": 95100 }, { "epoch": 3.4275777561538185, "grad_norm": 0.2177032083272934, "learning_rate": 1.1901278056298901e-05, "loss": 0.3671, "step": 95105 }, { "epoch": 3.427757955815043, "grad_norm": 0.29652655124664307, "learning_rate": 1.1898792626038626e-05, "loss": 0.366, "step": 95110 }, { "epoch": 3.4279381554762676, "grad_norm": 0.20908688008785248, "learning_rate": 1.1896307374279777e-05, "loss": 0.3806, "step": 95115 }, { "epoch": 3.4281183551374923, "grad_norm": 0.2735453248023987, "learning_rate": 1.18938223010562e-05, "loss": 0.3945, "step": 95120 }, { 
"epoch": 3.428298554798717, "grad_norm": 0.23692834377288818, "learning_rate": 1.1891337406401758e-05, "loss": 0.3743, "step": 95125 }, { "epoch": 3.4284787544599418, "grad_norm": 0.23678283393383026, "learning_rate": 1.1888852690350305e-05, "loss": 0.3876, "step": 95130 }, { "epoch": 3.428658954121166, "grad_norm": 0.30739691853523254, "learning_rate": 1.1886368152935698e-05, "loss": 0.3843, "step": 95135 }, { "epoch": 3.428839153782391, "grad_norm": 0.23699170351028442, "learning_rate": 1.1883883794191774e-05, "loss": 0.363, "step": 95140 }, { "epoch": 3.4290193534436155, "grad_norm": 0.2635442912578583, "learning_rate": 1.1881399614152406e-05, "loss": 0.3764, "step": 95145 }, { "epoch": 3.4291995531048403, "grad_norm": 0.23307177424430847, "learning_rate": 1.1878915612851428e-05, "loss": 0.3758, "step": 95150 }, { "epoch": 3.429379752766065, "grad_norm": 0.208553284406662, "learning_rate": 1.1876431790322682e-05, "loss": 0.3888, "step": 95155 }, { "epoch": 3.4295599524272893, "grad_norm": 0.30267196893692017, "learning_rate": 1.1873948146600012e-05, "loss": 0.4137, "step": 95160 }, { "epoch": 3.429740152088514, "grad_norm": 0.21039345860481262, "learning_rate": 1.1871464681717249e-05, "loss": 0.382, "step": 95165 }, { "epoch": 3.4299203517497387, "grad_norm": 0.21296541392803192, "learning_rate": 1.1868981395708256e-05, "loss": 0.3729, "step": 95170 }, { "epoch": 3.4301005514109635, "grad_norm": 0.2397422194480896, "learning_rate": 1.1866498288606843e-05, "loss": 0.3815, "step": 95175 }, { "epoch": 3.430280751072188, "grad_norm": 0.259847491979599, "learning_rate": 1.1864015360446834e-05, "loss": 0.3923, "step": 95180 }, { "epoch": 3.4304609507334125, "grad_norm": 0.23978331685066223, "learning_rate": 1.1861532611262086e-05, "loss": 0.3823, "step": 95185 }, { "epoch": 3.4306411503946372, "grad_norm": 0.20742113888263702, "learning_rate": 1.18590500410864e-05, "loss": 0.3816, "step": 95190 }, { "epoch": 3.430821350055862, "grad_norm": 0.24058598279953003, 
"learning_rate": 1.1856567649953635e-05, "loss": 0.3628, "step": 95195 }, { "epoch": 3.4310015497170867, "grad_norm": 0.1972092241048813, "learning_rate": 1.1854085437897578e-05, "loss": 0.3614, "step": 95200 }, { "epoch": 3.431181749378311, "grad_norm": 0.22613808512687683, "learning_rate": 1.1851603404952056e-05, "loss": 0.3998, "step": 95205 }, { "epoch": 3.4313619490395357, "grad_norm": 0.24530638754367828, "learning_rate": 1.1849121551150899e-05, "loss": 0.3455, "step": 95210 }, { "epoch": 3.4315421487007605, "grad_norm": 0.22673511505126953, "learning_rate": 1.1846639876527915e-05, "loss": 0.3772, "step": 95215 }, { "epoch": 3.431722348361985, "grad_norm": 0.24466091394424438, "learning_rate": 1.1844158381116916e-05, "loss": 0.3992, "step": 95220 }, { "epoch": 3.43190254802321, "grad_norm": 0.24469353258609772, "learning_rate": 1.1841677064951714e-05, "loss": 0.3863, "step": 95225 }, { "epoch": 3.432082747684434, "grad_norm": 0.23059558868408203, "learning_rate": 1.1839195928066102e-05, "loss": 0.4255, "step": 95230 }, { "epoch": 3.432262947345659, "grad_norm": 0.18520836532115936, "learning_rate": 1.183671497049391e-05, "loss": 0.3682, "step": 95235 }, { "epoch": 3.4324431470068837, "grad_norm": 0.2409636229276657, "learning_rate": 1.1834234192268926e-05, "loss": 0.3583, "step": 95240 }, { "epoch": 3.4326233466681084, "grad_norm": 0.1873437464237213, "learning_rate": 1.1831753593424951e-05, "loss": 0.3832, "step": 95245 }, { "epoch": 3.4328035463293327, "grad_norm": 0.27376478910446167, "learning_rate": 1.1829273173995787e-05, "loss": 0.3519, "step": 95250 }, { "epoch": 3.4329837459905574, "grad_norm": 0.24432578682899475, "learning_rate": 1.1826792934015216e-05, "loss": 0.4051, "step": 95255 }, { "epoch": 3.433163945651782, "grad_norm": 0.3113139867782593, "learning_rate": 1.182431287351705e-05, "loss": 0.41, "step": 95260 }, { "epoch": 3.433344145313007, "grad_norm": 0.24303504824638367, "learning_rate": 1.1821832992535078e-05, "loss": 0.3869, "step": 
95265 }, { "epoch": 3.4335243449742316, "grad_norm": 0.19319789111614227, "learning_rate": 1.1819353291103064e-05, "loss": 0.3631, "step": 95270 }, { "epoch": 3.433704544635456, "grad_norm": 0.24024613201618195, "learning_rate": 1.1816873769254816e-05, "loss": 0.3829, "step": 95275 }, { "epoch": 3.4338847442966807, "grad_norm": 0.23290759325027466, "learning_rate": 1.1814394427024112e-05, "loss": 0.3731, "step": 95280 }, { "epoch": 3.4340649439579054, "grad_norm": 0.21962322294712067, "learning_rate": 1.1811915264444732e-05, "loss": 0.3682, "step": 95285 }, { "epoch": 3.43424514361913, "grad_norm": 0.2735930383205414, "learning_rate": 1.1809436281550451e-05, "loss": 0.4133, "step": 95290 }, { "epoch": 3.4344253432803544, "grad_norm": 0.2372862547636032, "learning_rate": 1.1806957478375038e-05, "loss": 0.3902, "step": 95295 }, { "epoch": 3.434605542941579, "grad_norm": 0.24612626433372498, "learning_rate": 1.1804478854952287e-05, "loss": 0.4095, "step": 95300 }, { "epoch": 3.434785742602804, "grad_norm": 0.21510684490203857, "learning_rate": 1.1802000411315953e-05, "loss": 0.3714, "step": 95305 }, { "epoch": 3.4349659422640286, "grad_norm": 0.23222926259040833, "learning_rate": 1.1799522147499812e-05, "loss": 0.3823, "step": 95310 }, { "epoch": 3.4351461419252534, "grad_norm": 0.303889662027359, "learning_rate": 1.1797044063537625e-05, "loss": 0.374, "step": 95315 }, { "epoch": 3.4353263415864776, "grad_norm": 0.22583414614200592, "learning_rate": 1.1794566159463155e-05, "loss": 0.3554, "step": 95320 }, { "epoch": 3.4355065412477024, "grad_norm": 0.21528246998786926, "learning_rate": 1.1792088435310156e-05, "loss": 0.4096, "step": 95325 }, { "epoch": 3.435686740908927, "grad_norm": 0.237309068441391, "learning_rate": 1.1789610891112402e-05, "loss": 0.3965, "step": 95330 }, { "epoch": 3.435866940570152, "grad_norm": 0.22394055128097534, "learning_rate": 1.1787133526903646e-05, "loss": 0.3985, "step": 95335 }, { "epoch": 3.436047140231376, "grad_norm": 
0.23511168360710144, "learning_rate": 1.1784656342717634e-05, "loss": 0.4072, "step": 95340 }, { "epoch": 3.436227339892601, "grad_norm": 0.2214852273464203, "learning_rate": 1.1782179338588124e-05, "loss": 0.3582, "step": 95345 }, { "epoch": 3.4364075395538256, "grad_norm": 0.2489766925573349, "learning_rate": 1.1779702514548848e-05, "loss": 0.361, "step": 95350 }, { "epoch": 3.4365877392150503, "grad_norm": 0.23350292444229126, "learning_rate": 1.1777225870633579e-05, "loss": 0.3675, "step": 95355 }, { "epoch": 3.436767938876275, "grad_norm": 0.23286855220794678, "learning_rate": 1.1774749406876057e-05, "loss": 0.4119, "step": 95360 }, { "epoch": 3.4369481385374994, "grad_norm": 0.2144162505865097, "learning_rate": 1.1772273123309993e-05, "loss": 0.3873, "step": 95365 }, { "epoch": 3.437128338198724, "grad_norm": 0.24179521203041077, "learning_rate": 1.1769797019969156e-05, "loss": 0.3731, "step": 95370 }, { "epoch": 3.437308537859949, "grad_norm": 0.22237004339694977, "learning_rate": 1.1767321096887265e-05, "loss": 0.3584, "step": 95375 }, { "epoch": 3.4374887375211736, "grad_norm": 0.23531267046928406, "learning_rate": 1.1764845354098081e-05, "loss": 0.4297, "step": 95380 }, { "epoch": 3.437668937182398, "grad_norm": 0.2168639749288559, "learning_rate": 1.1762369791635305e-05, "loss": 0.4128, "step": 95385 }, { "epoch": 3.4378491368436226, "grad_norm": 0.24747677147388458, "learning_rate": 1.175989440953267e-05, "loss": 0.3953, "step": 95390 }, { "epoch": 3.4380293365048473, "grad_norm": 0.2616941034793854, "learning_rate": 1.1757419207823918e-05, "loss": 0.3724, "step": 95395 }, { "epoch": 3.438209536166072, "grad_norm": 0.21548521518707275, "learning_rate": 1.1754944186542768e-05, "loss": 0.3945, "step": 95400 }, { "epoch": 3.438389735827297, "grad_norm": 0.20728285610675812, "learning_rate": 1.1752469345722938e-05, "loss": 0.4152, "step": 95405 }, { "epoch": 3.4385699354885215, "grad_norm": 0.24086813628673553, "learning_rate": 1.1749994685398149e-05, 
"loss": 0.3667, "step": 95410 }, { "epoch": 3.438750135149746, "grad_norm": 0.21392937004566193, "learning_rate": 1.1747520205602108e-05, "loss": 0.3966, "step": 95415 }, { "epoch": 3.4389303348109705, "grad_norm": 0.21268320083618164, "learning_rate": 1.1745045906368549e-05, "loss": 0.3754, "step": 95420 }, { "epoch": 3.4391105344721953, "grad_norm": 0.2621963918209076, "learning_rate": 1.1742571787731172e-05, "loss": 0.4324, "step": 95425 }, { "epoch": 3.43929073413342, "grad_norm": 0.28907299041748047, "learning_rate": 1.1740097849723686e-05, "loss": 0.3717, "step": 95430 }, { "epoch": 3.4394709337946443, "grad_norm": 0.24965962767601013, "learning_rate": 1.17376240923798e-05, "loss": 0.3629, "step": 95435 }, { "epoch": 3.439651133455869, "grad_norm": 0.27768054604530334, "learning_rate": 1.1735150515733209e-05, "loss": 0.3604, "step": 95440 }, { "epoch": 3.4398313331170938, "grad_norm": 0.25251927971839905, "learning_rate": 1.1732677119817634e-05, "loss": 0.3643, "step": 95445 }, { "epoch": 3.4400115327783185, "grad_norm": 0.2550899386405945, "learning_rate": 1.1730203904666773e-05, "loss": 0.3879, "step": 95450 }, { "epoch": 3.4401917324395432, "grad_norm": 0.2622126638889313, "learning_rate": 1.1727730870314298e-05, "loss": 0.3498, "step": 95455 }, { "epoch": 3.4403719321007675, "grad_norm": 0.2463812530040741, "learning_rate": 1.1725258016793927e-05, "loss": 0.413, "step": 95460 }, { "epoch": 3.4405521317619923, "grad_norm": 0.2597421109676361, "learning_rate": 1.1722785344139347e-05, "loss": 0.3833, "step": 95465 }, { "epoch": 3.440732331423217, "grad_norm": 0.20518064498901367, "learning_rate": 1.1720312852384239e-05, "loss": 0.3924, "step": 95470 }, { "epoch": 3.4409125310844417, "grad_norm": 0.20788519084453583, "learning_rate": 1.1717840541562312e-05, "loss": 0.3612, "step": 95475 }, { "epoch": 3.441092730745666, "grad_norm": 0.21892179548740387, "learning_rate": 1.171536841170722e-05, "loss": 0.3725, "step": 95480 }, { "epoch": 3.4412729304068908, 
"grad_norm": 0.23169714212417603, "learning_rate": 1.1712896462852668e-05, "loss": 0.3949, "step": 95485 }, { "epoch": 3.4414531300681155, "grad_norm": 0.24189196527004242, "learning_rate": 1.1710424695032332e-05, "loss": 0.368, "step": 95490 }, { "epoch": 3.44163332972934, "grad_norm": 0.21008506417274475, "learning_rate": 1.1707953108279888e-05, "loss": 0.4096, "step": 95495 }, { "epoch": 3.441813529390565, "grad_norm": 0.23108026385307312, "learning_rate": 1.1705481702629007e-05, "loss": 0.3929, "step": 95500 }, { "epoch": 3.441813529390565, "eval_loss": 0.4303448796272278, "eval_runtime": 3.5356, "eval_samples_per_second": 28.284, "eval_steps_per_second": 7.071, "step": 95500 }, { "epoch": 3.4419937290517892, "grad_norm": 0.1974146068096161, "learning_rate": 1.1703010478113363e-05, "loss": 0.3409, "step": 95505 }, { "epoch": 3.442173928713014, "grad_norm": 0.2331085056066513, "learning_rate": 1.1700539434766619e-05, "loss": 0.3589, "step": 95510 }, { "epoch": 3.4423541283742387, "grad_norm": 0.23634552955627441, "learning_rate": 1.1698068572622458e-05, "loss": 0.3905, "step": 95515 }, { "epoch": 3.4425343280354634, "grad_norm": 0.3103722929954529, "learning_rate": 1.1695597891714539e-05, "loss": 0.3924, "step": 95520 }, { "epoch": 3.4427145276966877, "grad_norm": 0.2085963785648346, "learning_rate": 1.1693127392076522e-05, "loss": 0.3817, "step": 95525 }, { "epoch": 3.4428947273579125, "grad_norm": 0.2746676504611969, "learning_rate": 1.169065707374207e-05, "loss": 0.4077, "step": 95530 }, { "epoch": 3.443074927019137, "grad_norm": 0.2430097460746765, "learning_rate": 1.1688186936744827e-05, "loss": 0.3698, "step": 95535 }, { "epoch": 3.443255126680362, "grad_norm": 0.2820988595485687, "learning_rate": 1.1685716981118469e-05, "loss": 0.3811, "step": 95540 }, { "epoch": 3.4434353263415867, "grad_norm": 0.2863827347755432, "learning_rate": 1.168324720689665e-05, "loss": 0.4197, "step": 95545 }, { "epoch": 3.443615526002811, "grad_norm": 0.2102956622838974, 
"learning_rate": 1.1680777614112992e-05, "loss": 0.3898, "step": 95550 }, { "epoch": 3.4437957256640357, "grad_norm": 0.20017428696155548, "learning_rate": 1.1678308202801166e-05, "loss": 0.3685, "step": 95555 }, { "epoch": 3.4439759253252604, "grad_norm": 0.2171812206506729, "learning_rate": 1.1675838972994805e-05, "loss": 0.4052, "step": 95560 }, { "epoch": 3.444156124986485, "grad_norm": 0.20312188565731049, "learning_rate": 1.1673369924727576e-05, "loss": 0.3949, "step": 95565 }, { "epoch": 3.4443363246477094, "grad_norm": 0.2849917709827423, "learning_rate": 1.1670901058033093e-05, "loss": 0.3912, "step": 95570 }, { "epoch": 3.444516524308934, "grad_norm": 0.2215384542942047, "learning_rate": 1.1668432372944992e-05, "loss": 0.3403, "step": 95575 }, { "epoch": 3.444696723970159, "grad_norm": 0.26748472452163696, "learning_rate": 1.1665963869496932e-05, "loss": 0.3769, "step": 95580 }, { "epoch": 3.4448769236313836, "grad_norm": 0.20163197815418243, "learning_rate": 1.1663495547722528e-05, "loss": 0.3837, "step": 95585 }, { "epoch": 3.4450571232926084, "grad_norm": 0.23869015276432037, "learning_rate": 1.1661027407655417e-05, "loss": 0.3995, "step": 95590 }, { "epoch": 3.4452373229538327, "grad_norm": 0.25916892290115356, "learning_rate": 1.1658559449329227e-05, "loss": 0.3736, "step": 95595 }, { "epoch": 3.4454175226150574, "grad_norm": 0.2509896755218506, "learning_rate": 1.1656091672777569e-05, "loss": 0.3497, "step": 95600 }, { "epoch": 3.445597722276282, "grad_norm": 0.2731640636920929, "learning_rate": 1.165362407803409e-05, "loss": 0.3667, "step": 95605 }, { "epoch": 3.445777921937507, "grad_norm": 0.23170186579227448, "learning_rate": 1.1651156665132396e-05, "loss": 0.3817, "step": 95610 }, { "epoch": 3.445958121598731, "grad_norm": 0.21757963299751282, "learning_rate": 1.164868943410611e-05, "loss": 0.3697, "step": 95615 }, { "epoch": 3.446138321259956, "grad_norm": 0.2630380690097809, "learning_rate": 1.1646222384988847e-05, "loss": 0.3495, "step": 
95620 }, { "epoch": 3.4463185209211806, "grad_norm": 0.23066528141498566, "learning_rate": 1.164375551781421e-05, "loss": 0.3468, "step": 95625 }, { "epoch": 3.4464987205824054, "grad_norm": 0.27254176139831543, "learning_rate": 1.1641288832615823e-05, "loss": 0.4197, "step": 95630 }, { "epoch": 3.44667892024363, "grad_norm": 0.2096617966890335, "learning_rate": 1.1638822329427293e-05, "loss": 0.3646, "step": 95635 }, { "epoch": 3.4468591199048544, "grad_norm": 0.20088768005371094, "learning_rate": 1.1636356008282221e-05, "loss": 0.3663, "step": 95640 }, { "epoch": 3.447039319566079, "grad_norm": 0.2403293251991272, "learning_rate": 1.163388986921421e-05, "loss": 0.3651, "step": 95645 }, { "epoch": 3.447219519227304, "grad_norm": 0.23152384161949158, "learning_rate": 1.163142391225686e-05, "loss": 0.3991, "step": 95650 }, { "epoch": 3.4473997188885286, "grad_norm": 0.29287877678871155, "learning_rate": 1.1628958137443766e-05, "loss": 0.4082, "step": 95655 }, { "epoch": 3.4475799185497533, "grad_norm": 0.2720036208629608, "learning_rate": 1.1626492544808545e-05, "loss": 0.3914, "step": 95660 }, { "epoch": 3.4477601182109776, "grad_norm": 0.24344083666801453, "learning_rate": 1.1624027134384757e-05, "loss": 0.3794, "step": 95665 }, { "epoch": 3.4479403178722023, "grad_norm": 0.22803980112075806, "learning_rate": 1.1621561906206019e-05, "loss": 0.3875, "step": 95670 }, { "epoch": 3.448120517533427, "grad_norm": 0.23738615214824677, "learning_rate": 1.1619096860305905e-05, "loss": 0.4092, "step": 95675 }, { "epoch": 3.448300717194652, "grad_norm": 0.20066243410110474, "learning_rate": 1.161663199671801e-05, "loss": 0.3773, "step": 95680 }, { "epoch": 3.4484809168558765, "grad_norm": 0.3553462326526642, "learning_rate": 1.1614167315475913e-05, "loss": 0.3927, "step": 95685 }, { "epoch": 3.448661116517101, "grad_norm": 0.23177509009838104, "learning_rate": 1.1611702816613195e-05, "loss": 0.4163, "step": 95690 }, { "epoch": 3.4488413161783256, "grad_norm": 
0.21542638540267944, "learning_rate": 1.1609238500163424e-05, "loss": 0.4101, "step": 95695 }, { "epoch": 3.4490215158395503, "grad_norm": 0.2028144747018814, "learning_rate": 1.1606774366160194e-05, "loss": 0.3794, "step": 95700 }, { "epoch": 3.449201715500775, "grad_norm": 0.23155230283737183, "learning_rate": 1.160431041463707e-05, "loss": 0.3431, "step": 95705 }, { "epoch": 3.4493819151619993, "grad_norm": 0.2762317359447479, "learning_rate": 1.1601846645627625e-05, "loss": 0.3731, "step": 95710 }, { "epoch": 3.449562114823224, "grad_norm": 0.2540960907936096, "learning_rate": 1.1599383059165425e-05, "loss": 0.3865, "step": 95715 }, { "epoch": 3.449742314484449, "grad_norm": 0.19944801926612854, "learning_rate": 1.1596919655284027e-05, "loss": 0.3779, "step": 95720 }, { "epoch": 3.4499225141456735, "grad_norm": 0.23415467143058777, "learning_rate": 1.1594456434017013e-05, "loss": 0.3739, "step": 95725 }, { "epoch": 3.4501027138068983, "grad_norm": 0.21086935698986053, "learning_rate": 1.1591993395397944e-05, "loss": 0.3912, "step": 95730 }, { "epoch": 3.4502829134681225, "grad_norm": 0.23152147233486176, "learning_rate": 1.158953053946035e-05, "loss": 0.4073, "step": 95735 }, { "epoch": 3.4504631131293473, "grad_norm": 0.29169178009033203, "learning_rate": 1.1587067866237816e-05, "loss": 0.41, "step": 95740 }, { "epoch": 3.450643312790572, "grad_norm": 0.24295637011528015, "learning_rate": 1.1584605375763876e-05, "loss": 0.3491, "step": 95745 }, { "epoch": 3.4508235124517967, "grad_norm": 0.19027990102767944, "learning_rate": 1.158214306807211e-05, "loss": 0.385, "step": 95750 }, { "epoch": 3.451003712113021, "grad_norm": 0.21917863190174103, "learning_rate": 1.1579680943196036e-05, "loss": 0.3643, "step": 95755 }, { "epoch": 3.4511839117742458, "grad_norm": 0.22133587300777435, "learning_rate": 1.1577219001169204e-05, "loss": 0.3636, "step": 95760 }, { "epoch": 3.4513641114354705, "grad_norm": 0.22902527451515198, "learning_rate": 1.1574757242025169e-05, 
"loss": 0.3693, "step": 95765 }, { "epoch": 3.4515443110966952, "grad_norm": 0.21824954450130463, "learning_rate": 1.1572295665797472e-05, "loss": 0.3672, "step": 95770 }, { "epoch": 3.45172451075792, "grad_norm": 0.29460370540618896, "learning_rate": 1.1569834272519644e-05, "loss": 0.413, "step": 95775 }, { "epoch": 3.4519047104191443, "grad_norm": 0.259569376707077, "learning_rate": 1.1567373062225226e-05, "loss": 0.3763, "step": 95780 }, { "epoch": 3.452084910080369, "grad_norm": 0.28914976119995117, "learning_rate": 1.1564912034947739e-05, "loss": 0.3791, "step": 95785 }, { "epoch": 3.4522651097415937, "grad_norm": 0.2676786482334137, "learning_rate": 1.1562451190720733e-05, "loss": 0.3611, "step": 95790 }, { "epoch": 3.4524453094028185, "grad_norm": 0.2162621021270752, "learning_rate": 1.155999052957773e-05, "loss": 0.3395, "step": 95795 }, { "epoch": 3.4526255090640428, "grad_norm": 0.26020073890686035, "learning_rate": 1.155753005155225e-05, "loss": 0.3507, "step": 95800 }, { "epoch": 3.4528057087252675, "grad_norm": 0.2733500003814697, "learning_rate": 1.1555069756677824e-05, "loss": 0.377, "step": 95805 }, { "epoch": 3.452985908386492, "grad_norm": 0.25482839345932007, "learning_rate": 1.155260964498796e-05, "loss": 0.3767, "step": 95810 }, { "epoch": 3.453166108047717, "grad_norm": 0.2589060068130493, "learning_rate": 1.1550149716516196e-05, "loss": 0.3995, "step": 95815 }, { "epoch": 3.4533463077089417, "grad_norm": 0.26133349537849426, "learning_rate": 1.1547689971296038e-05, "loss": 0.3989, "step": 95820 }, { "epoch": 3.453526507370166, "grad_norm": 0.2330733835697174, "learning_rate": 1.1545230409360996e-05, "loss": 0.3813, "step": 95825 }, { "epoch": 3.4537067070313907, "grad_norm": 0.2710002660751343, "learning_rate": 1.154277103074459e-05, "loss": 0.3821, "step": 95830 }, { "epoch": 3.4538869066926154, "grad_norm": 0.24312447011470795, "learning_rate": 1.154031183548032e-05, "loss": 0.3863, "step": 95835 }, { "epoch": 3.45406710635384, "grad_norm": 
0.21377715468406677, "learning_rate": 1.1537852823601685e-05, "loss": 0.3886, "step": 95840 }, { "epoch": 3.4542473060150645, "grad_norm": 0.2441663295030594, "learning_rate": 1.1535393995142222e-05, "loss": 0.3983, "step": 95845 }, { "epoch": 3.454427505676289, "grad_norm": 0.21451100707054138, "learning_rate": 1.1532935350135387e-05, "loss": 0.3596, "step": 95850 }, { "epoch": 3.454607705337514, "grad_norm": 0.22826121747493744, "learning_rate": 1.1530476888614711e-05, "loss": 0.3795, "step": 95855 }, { "epoch": 3.4547879049987387, "grad_norm": 0.2341785728931427, "learning_rate": 1.1528018610613678e-05, "loss": 0.3178, "step": 95860 }, { "epoch": 3.4549681046599634, "grad_norm": 0.25270435214042664, "learning_rate": 1.1525560516165784e-05, "loss": 0.3635, "step": 95865 }, { "epoch": 3.4551483043211877, "grad_norm": 0.2552870512008667, "learning_rate": 1.152310260530452e-05, "loss": 0.3629, "step": 95870 }, { "epoch": 3.4553285039824124, "grad_norm": 0.24541820585727692, "learning_rate": 1.1520644878063374e-05, "loss": 0.3886, "step": 95875 }, { "epoch": 3.455508703643637, "grad_norm": 0.22713278234004974, "learning_rate": 1.1518187334475822e-05, "loss": 0.3915, "step": 95880 }, { "epoch": 3.455688903304862, "grad_norm": 0.2381686270236969, "learning_rate": 1.1515729974575369e-05, "loss": 0.4036, "step": 95885 }, { "epoch": 3.455869102966086, "grad_norm": 0.2205180674791336, "learning_rate": 1.151327279839548e-05, "loss": 0.4164, "step": 95890 }, { "epoch": 3.456049302627311, "grad_norm": 0.2498241364955902, "learning_rate": 1.1510815805969641e-05, "loss": 0.3929, "step": 95895 }, { "epoch": 3.4562295022885356, "grad_norm": 0.20146778225898743, "learning_rate": 1.1508358997331322e-05, "loss": 0.378, "step": 95900 }, { "epoch": 3.4564097019497604, "grad_norm": 0.22101639211177826, "learning_rate": 1.1505902372513994e-05, "loss": 0.3808, "step": 95905 }, { "epoch": 3.456589901610985, "grad_norm": 0.19309233129024506, "learning_rate": 1.150344593155114e-05, "loss": 
0.4071, "step": 95910 }, { "epoch": 3.45677010127221, "grad_norm": 0.2546674907207489, "learning_rate": 1.1500989674476234e-05, "loss": 0.3693, "step": 95915 }, { "epoch": 3.456950300933434, "grad_norm": 0.2687970697879791, "learning_rate": 1.1498533601322711e-05, "loss": 0.3548, "step": 95920 }, { "epoch": 3.457130500594659, "grad_norm": 0.22413934767246246, "learning_rate": 1.1496077712124062e-05, "loss": 0.3728, "step": 95925 }, { "epoch": 3.4573107002558836, "grad_norm": 0.20262411236763, "learning_rate": 1.1493622006913728e-05, "loss": 0.3563, "step": 95930 }, { "epoch": 3.4574908999171083, "grad_norm": 0.18897296488285065, "learning_rate": 1.1491166485725194e-05, "loss": 0.3703, "step": 95935 }, { "epoch": 3.4576710995783326, "grad_norm": 0.2627841830253601, "learning_rate": 1.1488711148591907e-05, "loss": 0.4015, "step": 95940 }, { "epoch": 3.4578512992395574, "grad_norm": 0.19592086970806122, "learning_rate": 1.1486255995547299e-05, "loss": 0.3791, "step": 95945 }, { "epoch": 3.458031498900782, "grad_norm": 0.27027082443237305, "learning_rate": 1.1483801026624844e-05, "loss": 0.3186, "step": 95950 }, { "epoch": 3.458211698562007, "grad_norm": 0.2537202537059784, "learning_rate": 1.1481346241857982e-05, "loss": 0.3907, "step": 95955 }, { "epoch": 3.4583918982232316, "grad_norm": 0.2511982023715973, "learning_rate": 1.1478891641280162e-05, "loss": 0.3686, "step": 95960 }, { "epoch": 3.458572097884456, "grad_norm": 0.2458512932062149, "learning_rate": 1.1476437224924824e-05, "loss": 0.3945, "step": 95965 }, { "epoch": 3.4587522975456806, "grad_norm": 0.2354070544242859, "learning_rate": 1.1473982992825403e-05, "loss": 0.3661, "step": 95970 }, { "epoch": 3.4589324972069053, "grad_norm": 0.22366875410079956, "learning_rate": 1.1471528945015352e-05, "loss": 0.4073, "step": 95975 }, { "epoch": 3.45911269686813, "grad_norm": 0.3030519485473633, "learning_rate": 1.1469075081528102e-05, "loss": 0.387, "step": 95980 }, { "epoch": 3.4592928965293543, "grad_norm": 
0.1697065532207489, "learning_rate": 1.1466621402397085e-05, "loss": 0.383, "step": 95985 }, { "epoch": 3.459473096190579, "grad_norm": 0.2466737926006317, "learning_rate": 1.1464167907655731e-05, "loss": 0.3841, "step": 95990 }, { "epoch": 3.459653295851804, "grad_norm": 0.2090192586183548, "learning_rate": 1.1461714597337459e-05, "loss": 0.3883, "step": 95995 }, { "epoch": 3.4598334955130285, "grad_norm": 0.20262013375759125, "learning_rate": 1.1459261471475713e-05, "loss": 0.3893, "step": 96000 }, { "epoch": 3.4598334955130285, "eval_loss": 0.4300161600112915, "eval_runtime": 3.5329, "eval_samples_per_second": 28.305, "eval_steps_per_second": 7.076, "step": 96000 }, { "epoch": 3.4600136951742533, "grad_norm": 0.23101657629013062, "learning_rate": 1.1456808530103907e-05, "loss": 0.3993, "step": 96005 }, { "epoch": 3.4601938948354776, "grad_norm": 0.2159048616886139, "learning_rate": 1.1454355773255465e-05, "loss": 0.3827, "step": 96010 }, { "epoch": 3.4603740944967023, "grad_norm": 0.26098620891571045, "learning_rate": 1.14519032009638e-05, "loss": 0.3952, "step": 96015 }, { "epoch": 3.460554294157927, "grad_norm": 0.22338443994522095, "learning_rate": 1.1449450813262336e-05, "loss": 0.366, "step": 96020 }, { "epoch": 3.4607344938191518, "grad_norm": 0.218563973903656, "learning_rate": 1.1446998610184467e-05, "loss": 0.3993, "step": 96025 }, { "epoch": 3.460914693480376, "grad_norm": 0.2379787415266037, "learning_rate": 1.1444546591763639e-05, "loss": 0.3897, "step": 96030 }, { "epoch": 3.461094893141601, "grad_norm": 0.2325468510389328, "learning_rate": 1.144209475803322e-05, "loss": 0.3757, "step": 96035 }, { "epoch": 3.4612750928028255, "grad_norm": 0.237319678068161, "learning_rate": 1.1439643109026643e-05, "loss": 0.3858, "step": 96040 }, { "epoch": 3.4614552924640503, "grad_norm": 0.19926400482654572, "learning_rate": 1.1437191644777307e-05, "loss": 0.3948, "step": 96045 }, { "epoch": 3.461635492125275, "grad_norm": 0.20922797918319702, "learning_rate": 
1.1434740365318597e-05, "loss": 0.352, "step": 96050 }, { "epoch": 3.4618156917864993, "grad_norm": 0.2545210123062134, "learning_rate": 1.1432289270683941e-05, "loss": 0.4189, "step": 96055 }, { "epoch": 3.461995891447724, "grad_norm": 0.20487937331199646, "learning_rate": 1.1429838360906708e-05, "loss": 0.398, "step": 96060 }, { "epoch": 3.4621760911089488, "grad_norm": 0.27374470233917236, "learning_rate": 1.1427387636020292e-05, "loss": 0.3865, "step": 96065 }, { "epoch": 3.4623562907701735, "grad_norm": 0.24212025105953217, "learning_rate": 1.14249370960581e-05, "loss": 0.3911, "step": 96070 }, { "epoch": 3.4625364904313978, "grad_norm": 0.24381519854068756, "learning_rate": 1.1422486741053513e-05, "loss": 0.3624, "step": 96075 }, { "epoch": 3.4627166900926225, "grad_norm": 0.23481976985931396, "learning_rate": 1.1420036571039915e-05, "loss": 0.3895, "step": 96080 }, { "epoch": 3.4628968897538472, "grad_norm": 0.27169761061668396, "learning_rate": 1.141758658605069e-05, "loss": 0.39, "step": 96085 }, { "epoch": 3.463077089415072, "grad_norm": 0.24660931527614594, "learning_rate": 1.1415136786119207e-05, "loss": 0.3867, "step": 96090 }, { "epoch": 3.4632572890762967, "grad_norm": 0.23132185637950897, "learning_rate": 1.1412687171278865e-05, "loss": 0.4086, "step": 96095 }, { "epoch": 3.463437488737521, "grad_norm": 0.266564279794693, "learning_rate": 1.1410237741563031e-05, "loss": 0.3929, "step": 96100 }, { "epoch": 3.4636176883987457, "grad_norm": 0.19676515460014343, "learning_rate": 1.1407788497005076e-05, "loss": 0.3475, "step": 96105 }, { "epoch": 3.4637978880599705, "grad_norm": 0.2622312605381012, "learning_rate": 1.140533943763837e-05, "loss": 0.3491, "step": 96110 }, { "epoch": 3.463978087721195, "grad_norm": 0.27703362703323364, "learning_rate": 1.1402890563496276e-05, "loss": 0.3801, "step": 96115 }, { "epoch": 3.4641582873824195, "grad_norm": 0.24706853926181793, "learning_rate": 1.1400441874612173e-05, "loss": 0.3824, "step": 96120 }, { "epoch": 
3.464338487043644, "grad_norm": 0.2970876693725586, "learning_rate": 1.1397993371019427e-05, "loss": 0.3695, "step": 96125 }, { "epoch": 3.464518686704869, "grad_norm": 0.24940580129623413, "learning_rate": 1.1395545052751366e-05, "loss": 0.3726, "step": 96130 }, { "epoch": 3.4646988863660937, "grad_norm": 0.21381749212741852, "learning_rate": 1.1393096919841384e-05, "loss": 0.3976, "step": 96135 }, { "epoch": 3.4648790860273184, "grad_norm": 0.2594471275806427, "learning_rate": 1.1390648972322817e-05, "loss": 0.3572, "step": 96140 }, { "epoch": 3.4650592856885427, "grad_norm": 0.22999241948127747, "learning_rate": 1.1388201210229027e-05, "loss": 0.3397, "step": 96145 }, { "epoch": 3.4652394853497674, "grad_norm": 0.28377705812454224, "learning_rate": 1.1385753633593357e-05, "loss": 0.3886, "step": 96150 }, { "epoch": 3.465419685010992, "grad_norm": 0.21517440676689148, "learning_rate": 1.1383306242449152e-05, "loss": 0.4021, "step": 96155 }, { "epoch": 3.465599884672217, "grad_norm": 0.26591652631759644, "learning_rate": 1.138085903682977e-05, "loss": 0.3855, "step": 96160 }, { "epoch": 3.4657800843334416, "grad_norm": 0.2830233871936798, "learning_rate": 1.1378412016768547e-05, "loss": 0.3945, "step": 96165 }, { "epoch": 3.465960283994666, "grad_norm": 0.2300427407026291, "learning_rate": 1.1375965182298822e-05, "loss": 0.3664, "step": 96170 }, { "epoch": 3.4661404836558907, "grad_norm": 0.23842735588550568, "learning_rate": 1.1373518533453937e-05, "loss": 0.3729, "step": 96175 }, { "epoch": 3.4663206833171154, "grad_norm": 0.2205519825220108, "learning_rate": 1.1371072070267221e-05, "loss": 0.3791, "step": 96180 }, { "epoch": 3.46650088297834, "grad_norm": 0.22283774614334106, "learning_rate": 1.1368625792772e-05, "loss": 0.3938, "step": 96185 }, { "epoch": 3.466681082639565, "grad_norm": 0.23954735696315765, "learning_rate": 1.1366179701001625e-05, "loss": 0.4172, "step": 96190 }, { "epoch": 3.466861282300789, "grad_norm": 0.20571085810661316, "learning_rate": 
1.136373379498941e-05, "loss": 0.3765, "step": 96195 }, { "epoch": 3.467041481962014, "grad_norm": 0.28632012009620667, "learning_rate": 1.1361288074768684e-05, "loss": 0.3678, "step": 96200 }, { "epoch": 3.4672216816232386, "grad_norm": 0.2133149951696396, "learning_rate": 1.1358842540372767e-05, "loss": 0.4013, "step": 96205 }, { "epoch": 3.4674018812844634, "grad_norm": 0.17300593852996826, "learning_rate": 1.1356397191834967e-05, "loss": 0.3512, "step": 96210 }, { "epoch": 3.4675820809456877, "grad_norm": 0.23370462656021118, "learning_rate": 1.1353952029188625e-05, "loss": 0.4099, "step": 96215 }, { "epoch": 3.4677622806069124, "grad_norm": 0.2420395463705063, "learning_rate": 1.1351507052467044e-05, "loss": 0.3867, "step": 96220 }, { "epoch": 3.467942480268137, "grad_norm": 0.2655438780784607, "learning_rate": 1.1349062261703538e-05, "loss": 0.3969, "step": 96225 }, { "epoch": 3.468122679929362, "grad_norm": 0.24291475117206573, "learning_rate": 1.1346617656931416e-05, "loss": 0.4008, "step": 96230 }, { "epoch": 3.4683028795905866, "grad_norm": 0.23957978188991547, "learning_rate": 1.1344173238183974e-05, "loss": 0.3572, "step": 96235 }, { "epoch": 3.468483079251811, "grad_norm": 0.23568493127822876, "learning_rate": 1.134172900549455e-05, "loss": 0.3883, "step": 96240 }, { "epoch": 3.4686632789130356, "grad_norm": 0.2132604718208313, "learning_rate": 1.1339284958896412e-05, "loss": 0.3945, "step": 96245 }, { "epoch": 3.4688434785742603, "grad_norm": 0.25484153628349304, "learning_rate": 1.1336841098422862e-05, "loss": 0.3929, "step": 96250 }, { "epoch": 3.469023678235485, "grad_norm": 0.3113678991794586, "learning_rate": 1.1334397424107218e-05, "loss": 0.4022, "step": 96255 }, { "epoch": 3.4692038778967094, "grad_norm": 0.1955181211233139, "learning_rate": 1.1331953935982761e-05, "loss": 0.3452, "step": 96260 }, { "epoch": 3.469384077557934, "grad_norm": 0.2540067732334137, "learning_rate": 1.1329510634082787e-05, "loss": 0.4174, "step": 96265 }, { "epoch": 
3.469564277219159, "grad_norm": 0.2300526648759842, "learning_rate": 1.1327067518440585e-05, "loss": 0.4029, "step": 96270 }, { "epoch": 3.4697444768803836, "grad_norm": 0.2536962330341339, "learning_rate": 1.132462458908943e-05, "loss": 0.3517, "step": 96275 }, { "epoch": 3.4699246765416083, "grad_norm": 0.24605917930603027, "learning_rate": 1.1322181846062626e-05, "loss": 0.3896, "step": 96280 }, { "epoch": 3.4701048762028326, "grad_norm": 0.21459710597991943, "learning_rate": 1.1319739289393448e-05, "loss": 0.4341, "step": 96285 }, { "epoch": 3.4702850758640573, "grad_norm": 0.2235480546951294, "learning_rate": 1.1317296919115171e-05, "loss": 0.3664, "step": 96290 }, { "epoch": 3.470465275525282, "grad_norm": 0.23965181410312653, "learning_rate": 1.1314854735261076e-05, "loss": 0.3913, "step": 96295 }, { "epoch": 3.470645475186507, "grad_norm": 0.29223349690437317, "learning_rate": 1.1312412737864429e-05, "loss": 0.4089, "step": 96300 }, { "epoch": 3.470825674847731, "grad_norm": 0.2220560908317566, "learning_rate": 1.1309970926958513e-05, "loss": 0.3817, "step": 96305 }, { "epoch": 3.471005874508956, "grad_norm": 0.26034578680992126, "learning_rate": 1.1307529302576605e-05, "loss": 0.3736, "step": 96310 }, { "epoch": 3.4711860741701805, "grad_norm": 0.23067830502986908, "learning_rate": 1.130508786475194e-05, "loss": 0.3885, "step": 96315 }, { "epoch": 3.4713662738314053, "grad_norm": 0.2320612072944641, "learning_rate": 1.1302646613517812e-05, "loss": 0.3833, "step": 96320 }, { "epoch": 3.47154647349263, "grad_norm": 0.22921355068683624, "learning_rate": 1.130020554890747e-05, "loss": 0.3706, "step": 96325 }, { "epoch": 3.4717266731538543, "grad_norm": 0.24659670889377594, "learning_rate": 1.1297764670954166e-05, "loss": 0.3904, "step": 96330 }, { "epoch": 3.471906872815079, "grad_norm": 0.2158777266740799, "learning_rate": 1.1295323979691183e-05, "loss": 0.389, "step": 96335 }, { "epoch": 3.4720870724763038, "grad_norm": 0.21258202195167542, "learning_rate": 
1.1292883475151741e-05, "loss": 0.3774, "step": 96340 }, { "epoch": 3.4722672721375285, "grad_norm": 0.20848548412322998, "learning_rate": 1.1290443157369115e-05, "loss": 0.373, "step": 96345 }, { "epoch": 3.472447471798753, "grad_norm": 0.268587201833725, "learning_rate": 1.1288003026376551e-05, "loss": 0.3654, "step": 96350 }, { "epoch": 3.4726276714599775, "grad_norm": 0.25624382495880127, "learning_rate": 1.1285563082207286e-05, "loss": 0.3601, "step": 96355 }, { "epoch": 3.4728078711212023, "grad_norm": 0.1996915340423584, "learning_rate": 1.1283123324894573e-05, "loss": 0.3577, "step": 96360 }, { "epoch": 3.472988070782427, "grad_norm": 0.32410600781440735, "learning_rate": 1.1280683754471647e-05, "loss": 0.3696, "step": 96365 }, { "epoch": 3.4731682704436517, "grad_norm": 0.20257310569286346, "learning_rate": 1.1278244370971739e-05, "loss": 0.3503, "step": 96370 }, { "epoch": 3.473348470104876, "grad_norm": 0.2658899426460266, "learning_rate": 1.1275805174428102e-05, "loss": 0.4159, "step": 96375 }, { "epoch": 3.4735286697661008, "grad_norm": 0.22585046291351318, "learning_rate": 1.1273366164873967e-05, "loss": 0.3779, "step": 96380 }, { "epoch": 3.4737088694273255, "grad_norm": 0.24556881189346313, "learning_rate": 1.1270927342342558e-05, "loss": 0.4055, "step": 96385 }, { "epoch": 3.47388906908855, "grad_norm": 0.20033571124076843, "learning_rate": 1.1268488706867105e-05, "loss": 0.3803, "step": 96390 }, { "epoch": 3.4740692687497745, "grad_norm": 0.2153259962797165, "learning_rate": 1.1266050258480829e-05, "loss": 0.4001, "step": 96395 }, { "epoch": 3.4742494684109992, "grad_norm": 0.24744583666324615, "learning_rate": 1.1263611997216966e-05, "loss": 0.3978, "step": 96400 }, { "epoch": 3.474429668072224, "grad_norm": 0.2664938271045685, "learning_rate": 1.1261173923108731e-05, "loss": 0.3655, "step": 96405 }, { "epoch": 3.4746098677334487, "grad_norm": 0.23613092303276062, "learning_rate": 1.125873603618934e-05, "loss": 0.407, "step": 96410 }, { "epoch": 
3.4747900673946734, "grad_norm": 0.2801758348941803, "learning_rate": 1.1256298336492013e-05, "loss": 0.3698, "step": 96415 }, { "epoch": 3.474970267055898, "grad_norm": 0.24601031839847565, "learning_rate": 1.1253860824049948e-05, "loss": 0.4104, "step": 96420 }, { "epoch": 3.4751504667171225, "grad_norm": 0.2248278707265854, "learning_rate": 1.1251423498896389e-05, "loss": 0.3917, "step": 96425 }, { "epoch": 3.475330666378347, "grad_norm": 0.2194819152355194, "learning_rate": 1.1248986361064511e-05, "loss": 0.3757, "step": 96430 }, { "epoch": 3.475510866039572, "grad_norm": 0.22775894403457642, "learning_rate": 1.1246549410587522e-05, "loss": 0.3658, "step": 96435 }, { "epoch": 3.4756910657007967, "grad_norm": 0.23458243906497955, "learning_rate": 1.1244112647498647e-05, "loss": 0.3502, "step": 96440 }, { "epoch": 3.475871265362021, "grad_norm": 0.24316877126693726, "learning_rate": 1.1241676071831073e-05, "loss": 0.4056, "step": 96445 }, { "epoch": 3.4760514650232457, "grad_norm": 0.25940200686454773, "learning_rate": 1.1239239683617997e-05, "loss": 0.3978, "step": 96450 }, { "epoch": 3.4762316646844704, "grad_norm": 0.24130047857761383, "learning_rate": 1.123680348289262e-05, "loss": 0.3834, "step": 96455 }, { "epoch": 3.476411864345695, "grad_norm": 0.1958744078874588, "learning_rate": 1.1234367469688118e-05, "loss": 0.3965, "step": 96460 }, { "epoch": 3.47659206400692, "grad_norm": 0.29350563883781433, "learning_rate": 1.1231931644037701e-05, "loss": 0.3916, "step": 96465 }, { "epoch": 3.476772263668144, "grad_norm": 0.21596458554267883, "learning_rate": 1.1229496005974554e-05, "loss": 0.372, "step": 96470 }, { "epoch": 3.476952463329369, "grad_norm": 0.24967950582504272, "learning_rate": 1.1227060555531858e-05, "loss": 0.3712, "step": 96475 }, { "epoch": 3.4771326629905936, "grad_norm": 0.20245712995529175, "learning_rate": 1.1224625292742794e-05, "loss": 0.344, "step": 96480 }, { "epoch": 3.4773128626518184, "grad_norm": 0.26654964685440063, 
"learning_rate": 1.1222190217640535e-05, "loss": 0.4232, "step": 96485 }, { "epoch": 3.4774930623130427, "grad_norm": 0.2164486050605774, "learning_rate": 1.1219755330258274e-05, "loss": 0.3632, "step": 96490 }, { "epoch": 3.4776732619742674, "grad_norm": 0.2758568525314331, "learning_rate": 1.1217320630629184e-05, "loss": 0.3687, "step": 96495 }, { "epoch": 3.477853461635492, "grad_norm": 0.24314354360103607, "learning_rate": 1.1214886118786425e-05, "loss": 0.3426, "step": 96500 }, { "epoch": 3.477853461635492, "eval_loss": 0.42988404631614685, "eval_runtime": 3.5336, "eval_samples_per_second": 28.3, "eval_steps_per_second": 7.075, "step": 96500 }, { "epoch": 3.478033661296717, "grad_norm": 0.19991134107112885, "learning_rate": 1.1212451794763178e-05, "loss": 0.3617, "step": 96505 }, { "epoch": 3.4782138609579416, "grad_norm": 0.23575247824192047, "learning_rate": 1.1210017658592605e-05, "loss": 0.4334, "step": 96510 }, { "epoch": 3.478394060619166, "grad_norm": 0.23354025185108185, "learning_rate": 1.1207583710307861e-05, "loss": 0.3685, "step": 96515 }, { "epoch": 3.4785742602803906, "grad_norm": 0.27086594700813293, "learning_rate": 1.1205149949942139e-05, "loss": 0.418, "step": 96520 }, { "epoch": 3.4787544599416154, "grad_norm": 0.2325955182313919, "learning_rate": 1.120271637752856e-05, "loss": 0.3802, "step": 96525 }, { "epoch": 3.47893465960284, "grad_norm": 0.25425323843955994, "learning_rate": 1.1200282993100305e-05, "loss": 0.3639, "step": 96530 }, { "epoch": 3.4791148592640644, "grad_norm": 0.21064163744449615, "learning_rate": 1.1197849796690527e-05, "loss": 0.3845, "step": 96535 }, { "epoch": 3.479295058925289, "grad_norm": 0.20380394160747528, "learning_rate": 1.1195416788332371e-05, "loss": 0.3789, "step": 96540 }, { "epoch": 3.479475258586514, "grad_norm": 0.24418340623378754, "learning_rate": 1.119298396805899e-05, "loss": 0.3595, "step": 96545 }, { "epoch": 3.4796554582477386, "grad_norm": 0.19111719727516174, "learning_rate": 
1.1190551335903526e-05, "loss": 0.3371, "step": 96550 }, { "epoch": 3.4798356579089633, "grad_norm": 0.1976500004529953, "learning_rate": 1.1188118891899122e-05, "loss": 0.3705, "step": 96555 }, { "epoch": 3.4800158575701876, "grad_norm": 0.21880120038986206, "learning_rate": 1.118568663607893e-05, "loss": 0.3656, "step": 96560 }, { "epoch": 3.4801960572314123, "grad_norm": 0.23799078166484833, "learning_rate": 1.1183254568476084e-05, "loss": 0.4213, "step": 96565 }, { "epoch": 3.480376256892637, "grad_norm": 0.2790437638759613, "learning_rate": 1.1180822689123719e-05, "loss": 0.3503, "step": 96570 }, { "epoch": 3.480556456553862, "grad_norm": 0.2113523781299591, "learning_rate": 1.1178390998054968e-05, "loss": 0.3869, "step": 96575 }, { "epoch": 3.480736656215086, "grad_norm": 0.20490826666355133, "learning_rate": 1.1175959495302957e-05, "loss": 0.3975, "step": 96580 }, { "epoch": 3.480916855876311, "grad_norm": 0.19894984364509583, "learning_rate": 1.117352818090083e-05, "loss": 0.364, "step": 96585 }, { "epoch": 3.4810970555375356, "grad_norm": 0.2029849737882614, "learning_rate": 1.1171097054881705e-05, "loss": 0.3436, "step": 96590 }, { "epoch": 3.4812772551987603, "grad_norm": 0.21581901609897614, "learning_rate": 1.1168666117278704e-05, "loss": 0.3586, "step": 96595 }, { "epoch": 3.481457454859985, "grad_norm": 0.27598705887794495, "learning_rate": 1.116623536812495e-05, "loss": 0.3926, "step": 96600 }, { "epoch": 3.4816376545212093, "grad_norm": 0.24678505957126617, "learning_rate": 1.1163804807453551e-05, "loss": 0.3838, "step": 96605 }, { "epoch": 3.481817854182434, "grad_norm": 0.1914645880460739, "learning_rate": 1.1161374435297653e-05, "loss": 0.3738, "step": 96610 }, { "epoch": 3.481998053843659, "grad_norm": 0.22241747379302979, "learning_rate": 1.1158944251690337e-05, "loss": 0.3985, "step": 96615 }, { "epoch": 3.4821782535048835, "grad_norm": 0.2281738668680191, "learning_rate": 1.1156514256664719e-05, "loss": 0.3981, "step": 96620 }, { "epoch": 
3.482358453166108, "grad_norm": 0.20726355910301208, "learning_rate": 1.1154084450253924e-05, "loss": 0.3826, "step": 96625 }, { "epoch": 3.4825386528273325, "grad_norm": 0.2736766040325165, "learning_rate": 1.1151654832491034e-05, "loss": 0.3828, "step": 96630 }, { "epoch": 3.4827188524885573, "grad_norm": 0.15977123379707336, "learning_rate": 1.1149225403409189e-05, "loss": 0.3602, "step": 96635 }, { "epoch": 3.482899052149782, "grad_norm": 0.25133785605430603, "learning_rate": 1.1146796163041456e-05, "loss": 0.3763, "step": 96640 }, { "epoch": 3.4830792518110067, "grad_norm": 0.2637987434864044, "learning_rate": 1.1144367111420934e-05, "loss": 0.3561, "step": 96645 }, { "epoch": 3.483259451472231, "grad_norm": 0.262717604637146, "learning_rate": 1.1141938248580736e-05, "loss": 0.3806, "step": 96650 }, { "epoch": 3.4834396511334558, "grad_norm": 0.21309638023376465, "learning_rate": 1.1139509574553944e-05, "loss": 0.3634, "step": 96655 }, { "epoch": 3.4836198507946805, "grad_norm": 0.22101840376853943, "learning_rate": 1.1137081089373655e-05, "loss": 0.3906, "step": 96660 }, { "epoch": 3.4838000504559052, "grad_norm": 0.25005871057510376, "learning_rate": 1.113465279307295e-05, "loss": 0.3673, "step": 96665 }, { "epoch": 3.48398025011713, "grad_norm": 0.2766312062740326, "learning_rate": 1.1132224685684906e-05, "loss": 0.3975, "step": 96670 }, { "epoch": 3.4841604497783543, "grad_norm": 0.23252266645431519, "learning_rate": 1.1129796767242625e-05, "loss": 0.3683, "step": 96675 }, { "epoch": 3.484340649439579, "grad_norm": 0.3283499777317047, "learning_rate": 1.1127369037779182e-05, "loss": 0.3911, "step": 96680 }, { "epoch": 3.4845208491008037, "grad_norm": 0.31998345255851746, "learning_rate": 1.1124941497327646e-05, "loss": 0.3783, "step": 96685 }, { "epoch": 3.4847010487620285, "grad_norm": 0.21715426445007324, "learning_rate": 1.1122514145921097e-05, "loss": 0.3533, "step": 96690 }, { "epoch": 3.484881248423253, "grad_norm": 0.23155249655246735, 
"learning_rate": 1.1120086983592606e-05, "loss": 0.3644, "step": 96695 }, { "epoch": 3.4850614480844775, "grad_norm": 0.2212996780872345, "learning_rate": 1.1117660010375233e-05, "loss": 0.3898, "step": 96700 }, { "epoch": 3.485241647745702, "grad_norm": 0.18644365668296814, "learning_rate": 1.1115233226302074e-05, "loss": 0.3834, "step": 96705 }, { "epoch": 3.485421847406927, "grad_norm": 0.21037942171096802, "learning_rate": 1.1112806631406153e-05, "loss": 0.3391, "step": 96710 }, { "epoch": 3.4856020470681517, "grad_norm": 0.25046810507774353, "learning_rate": 1.1110380225720565e-05, "loss": 0.3941, "step": 96715 }, { "epoch": 3.485782246729376, "grad_norm": 0.23813064396381378, "learning_rate": 1.1107954009278357e-05, "loss": 0.386, "step": 96720 }, { "epoch": 3.4859624463906007, "grad_norm": 0.2435433268547058, "learning_rate": 1.1105527982112584e-05, "loss": 0.3704, "step": 96725 }, { "epoch": 3.4861426460518254, "grad_norm": 0.219720259308815, "learning_rate": 1.1103102144256305e-05, "loss": 0.3791, "step": 96730 }, { "epoch": 3.48632284571305, "grad_norm": 0.24152056872844696, "learning_rate": 1.1100676495742568e-05, "loss": 0.3654, "step": 96735 }, { "epoch": 3.486503045374275, "grad_norm": 0.22711649537086487, "learning_rate": 1.1098251036604413e-05, "loss": 0.397, "step": 96740 }, { "epoch": 3.486683245035499, "grad_norm": 0.22654442489147186, "learning_rate": 1.1095825766874904e-05, "loss": 0.3741, "step": 96745 }, { "epoch": 3.486863444696724, "grad_norm": 0.20491252839565277, "learning_rate": 1.109340068658708e-05, "loss": 0.3717, "step": 96750 }, { "epoch": 3.4870436443579487, "grad_norm": 0.200746089220047, "learning_rate": 1.1090975795773979e-05, "loss": 0.3603, "step": 96755 }, { "epoch": 3.4872238440191734, "grad_norm": 0.2560223639011383, "learning_rate": 1.1088551094468636e-05, "loss": 0.3774, "step": 96760 }, { "epoch": 3.4874040436803977, "grad_norm": 0.22208933532238007, "learning_rate": 1.1086126582704085e-05, "loss": 0.3932, "step": 96765 
}, { "epoch": 3.4875842433416224, "grad_norm": 0.25349971652030945, "learning_rate": 1.1083702260513373e-05, "loss": 0.3906, "step": 96770 }, { "epoch": 3.487764443002847, "grad_norm": 0.20808152854442596, "learning_rate": 1.1081278127929534e-05, "loss": 0.3931, "step": 96775 }, { "epoch": 3.487944642664072, "grad_norm": 0.18402156233787537, "learning_rate": 1.1078854184985567e-05, "loss": 0.3678, "step": 96780 }, { "epoch": 3.4881248423252966, "grad_norm": 0.22728796303272247, "learning_rate": 1.1076430431714526e-05, "loss": 0.3698, "step": 96785 }, { "epoch": 3.488305041986521, "grad_norm": 0.2671549618244171, "learning_rate": 1.1074006868149413e-05, "loss": 0.3913, "step": 96790 }, { "epoch": 3.4884852416477456, "grad_norm": 0.2308701127767563, "learning_rate": 1.1071583494323274e-05, "loss": 0.3614, "step": 96795 }, { "epoch": 3.4886654413089704, "grad_norm": 0.22043147683143616, "learning_rate": 1.106916031026912e-05, "loss": 0.378, "step": 96800 }, { "epoch": 3.488845640970195, "grad_norm": 0.28679531812667847, "learning_rate": 1.106673731601994e-05, "loss": 0.3863, "step": 96805 }, { "epoch": 3.4890258406314194, "grad_norm": 0.25224077701568604, "learning_rate": 1.1064314511608778e-05, "loss": 0.381, "step": 96810 }, { "epoch": 3.489206040292644, "grad_norm": 0.252714604139328, "learning_rate": 1.1061891897068624e-05, "loss": 0.37, "step": 96815 }, { "epoch": 3.489386239953869, "grad_norm": 0.2400377094745636, "learning_rate": 1.105946947243251e-05, "loss": 0.3898, "step": 96820 }, { "epoch": 3.4895664396150936, "grad_norm": 0.26662567257881165, "learning_rate": 1.1057047237733417e-05, "loss": 0.3624, "step": 96825 }, { "epoch": 3.4897466392763183, "grad_norm": 0.219301238656044, "learning_rate": 1.1054625193004347e-05, "loss": 0.3836, "step": 96830 }, { "epoch": 3.4899268389375426, "grad_norm": 0.2580451965332031, "learning_rate": 1.1052203338278319e-05, "loss": 0.3923, "step": 96835 }, { "epoch": 3.4901070385987674, "grad_norm": 0.23948605358600616, 
"learning_rate": 1.104978167358832e-05, "loss": 0.3659, "step": 96840 }, { "epoch": 3.490287238259992, "grad_norm": 0.20990577340126038, "learning_rate": 1.1047360198967344e-05, "loss": 0.389, "step": 96845 }, { "epoch": 3.490467437921217, "grad_norm": 0.2310859113931656, "learning_rate": 1.1044938914448385e-05, "loss": 0.3828, "step": 96850 }, { "epoch": 3.490647637582441, "grad_norm": 0.19339106976985931, "learning_rate": 1.104251782006442e-05, "loss": 0.3703, "step": 96855 }, { "epoch": 3.490827837243666, "grad_norm": 0.23496823012828827, "learning_rate": 1.104009691584846e-05, "loss": 0.3635, "step": 96860 }, { "epoch": 3.4910080369048906, "grad_norm": 0.28526195883750916, "learning_rate": 1.1037676201833474e-05, "loss": 0.4448, "step": 96865 }, { "epoch": 3.4911882365661153, "grad_norm": 0.23180098831653595, "learning_rate": 1.103525567805245e-05, "loss": 0.3416, "step": 96870 }, { "epoch": 3.49136843622734, "grad_norm": 0.20511779189109802, "learning_rate": 1.1032835344538362e-05, "loss": 0.3618, "step": 96875 }, { "epoch": 3.4915486358885643, "grad_norm": 0.22699345648288727, "learning_rate": 1.1030415201324188e-05, "loss": 0.3726, "step": 96880 }, { "epoch": 3.491728835549789, "grad_norm": 0.24510350823402405, "learning_rate": 1.1027995248442891e-05, "loss": 0.3828, "step": 96885 }, { "epoch": 3.491909035211014, "grad_norm": 0.2351260483264923, "learning_rate": 1.1025575485927476e-05, "loss": 0.3986, "step": 96890 }, { "epoch": 3.4920892348722385, "grad_norm": 0.24812495708465576, "learning_rate": 1.1023155913810868e-05, "loss": 0.3722, "step": 96895 }, { "epoch": 3.492269434533463, "grad_norm": 0.26317039132118225, "learning_rate": 1.1020736532126063e-05, "loss": 0.3707, "step": 96900 }, { "epoch": 3.4924496341946876, "grad_norm": 0.2346991002559662, "learning_rate": 1.101831734090602e-05, "loss": 0.3692, "step": 96905 }, { "epoch": 3.4926298338559123, "grad_norm": 0.2354336977005005, "learning_rate": 1.1015898340183684e-05, "loss": 0.3982, "step": 96910 
}, { "epoch": 3.492810033517137, "grad_norm": 0.2742219567298889, "learning_rate": 1.1013479529992047e-05, "loss": 0.3985, "step": 96915 }, { "epoch": 3.4929902331783618, "grad_norm": 0.24369125068187714, "learning_rate": 1.1011060910364033e-05, "loss": 0.4218, "step": 96920 }, { "epoch": 3.4931704328395865, "grad_norm": 0.2888088822364807, "learning_rate": 1.1008642481332596e-05, "loss": 0.3919, "step": 96925 }, { "epoch": 3.493350632500811, "grad_norm": 0.2732216715812683, "learning_rate": 1.1006224242930705e-05, "loss": 0.3953, "step": 96930 }, { "epoch": 3.4935308321620355, "grad_norm": 0.2236909568309784, "learning_rate": 1.1003806195191298e-05, "loss": 0.3726, "step": 96935 }, { "epoch": 3.4937110318232603, "grad_norm": 0.21694554388523102, "learning_rate": 1.1001388338147326e-05, "loss": 0.3782, "step": 96940 }, { "epoch": 3.493891231484485, "grad_norm": 0.2243148237466812, "learning_rate": 1.0998970671831726e-05, "loss": 0.3572, "step": 96945 }, { "epoch": 3.4940714311457093, "grad_norm": 0.22870691120624542, "learning_rate": 1.099655319627743e-05, "loss": 0.3504, "step": 96950 }, { "epoch": 3.494251630806934, "grad_norm": 0.2252253144979477, "learning_rate": 1.0994135911517396e-05, "loss": 0.3817, "step": 96955 }, { "epoch": 3.4944318304681588, "grad_norm": 0.21160556375980377, "learning_rate": 1.0991718817584549e-05, "loss": 0.3751, "step": 96960 }, { "epoch": 3.4946120301293835, "grad_norm": 0.229017436504364, "learning_rate": 1.0989301914511818e-05, "loss": 0.3454, "step": 96965 }, { "epoch": 3.494792229790608, "grad_norm": 0.20936749875545502, "learning_rate": 1.0986885202332136e-05, "loss": 0.3819, "step": 96970 }, { "epoch": 3.4949724294518325, "grad_norm": 0.21880267560482025, "learning_rate": 1.0984468681078424e-05, "loss": 0.3691, "step": 96975 }, { "epoch": 3.4951526291130572, "grad_norm": 0.22391989827156067, "learning_rate": 1.098205235078362e-05, "loss": 0.4176, "step": 96980 }, { "epoch": 3.495332828774282, "grad_norm": 0.28297480940818787, 
"learning_rate": 1.0979636211480648e-05, "loss": 0.3618, "step": 96985 }, { "epoch": 3.4955130284355067, "grad_norm": 0.22317856550216675, "learning_rate": 1.09772202632024e-05, "loss": 0.3755, "step": 96990 }, { "epoch": 3.495693228096731, "grad_norm": 0.2054082453250885, "learning_rate": 1.0974804505981822e-05, "loss": 0.3781, "step": 96995 }, { "epoch": 3.4958734277579557, "grad_norm": 0.2986920177936554, "learning_rate": 1.0972388939851804e-05, "loss": 0.3982, "step": 97000 }, { "epoch": 3.4958734277579557, "eval_loss": 0.4296031594276428, "eval_runtime": 3.5247, "eval_samples_per_second": 28.371, "eval_steps_per_second": 7.093, "step": 97000 }, { "epoch": 3.4960536274191805, "grad_norm": 0.25335046648979187, "learning_rate": 1.0969973564845293e-05, "loss": 0.4044, "step": 97005 }, { "epoch": 3.496233827080405, "grad_norm": 0.23634032905101776, "learning_rate": 1.0967558380995165e-05, "loss": 0.4013, "step": 97010 }, { "epoch": 3.49641402674163, "grad_norm": 0.2781918942928314, "learning_rate": 1.0965143388334329e-05, "loss": 0.3848, "step": 97015 }, { "epoch": 3.496594226402854, "grad_norm": 0.266989141702652, "learning_rate": 1.0962728586895706e-05, "loss": 0.3611, "step": 97020 }, { "epoch": 3.496774426064079, "grad_norm": 0.2467830330133438, "learning_rate": 1.0960313976712188e-05, "loss": 0.3863, "step": 97025 }, { "epoch": 3.4969546257253037, "grad_norm": 0.2517973482608795, "learning_rate": 1.0957899557816676e-05, "loss": 0.3733, "step": 97030 }, { "epoch": 3.4971348253865284, "grad_norm": 0.2016320675611496, "learning_rate": 1.095548533024206e-05, "loss": 0.4017, "step": 97035 }, { "epoch": 3.4973150250477527, "grad_norm": 0.2199011892080307, "learning_rate": 1.095307129402123e-05, "loss": 0.3199, "step": 97040 }, { "epoch": 3.4974952247089774, "grad_norm": 0.23464830219745636, "learning_rate": 1.0950657449187094e-05, "loss": 0.3899, "step": 97045 }, { "epoch": 3.497675424370202, "grad_norm": 0.2523633539676666, "learning_rate": 1.0948243795772528e-05, 
"loss": 0.3891, "step": 97050 }, { "epoch": 3.497855624031427, "grad_norm": 0.26079466938972473, "learning_rate": 1.0945830333810423e-05, "loss": 0.3812, "step": 97055 }, { "epoch": 3.4980358236926516, "grad_norm": 0.2218736708164215, "learning_rate": 1.0943417063333655e-05, "loss": 0.3631, "step": 97060 }, { "epoch": 3.498216023353876, "grad_norm": 0.19591349363327026, "learning_rate": 1.0941003984375112e-05, "loss": 0.3689, "step": 97065 }, { "epoch": 3.4983962230151007, "grad_norm": 0.2568768858909607, "learning_rate": 1.0938591096967657e-05, "loss": 0.4084, "step": 97070 }, { "epoch": 3.4985764226763254, "grad_norm": 0.21716725826263428, "learning_rate": 1.0936178401144184e-05, "loss": 0.387, "step": 97075 }, { "epoch": 3.49875662233755, "grad_norm": 0.24818527698516846, "learning_rate": 1.0933765896937556e-05, "loss": 0.402, "step": 97080 }, { "epoch": 3.4989368219987744, "grad_norm": 0.23442403972148895, "learning_rate": 1.093135358438065e-05, "loss": 0.373, "step": 97085 }, { "epoch": 3.499117021659999, "grad_norm": 0.2103729248046875, "learning_rate": 1.0928941463506322e-05, "loss": 0.3536, "step": 97090 }, { "epoch": 3.499297221321224, "grad_norm": 0.21677030622959137, "learning_rate": 1.0926529534347435e-05, "loss": 0.3647, "step": 97095 }, { "epoch": 3.4994774209824486, "grad_norm": 0.24412192404270172, "learning_rate": 1.0924117796936878e-05, "loss": 0.4133, "step": 97100 }, { "epoch": 3.4996576206436734, "grad_norm": 0.247679203748703, "learning_rate": 1.0921706251307481e-05, "loss": 0.4191, "step": 97105 }, { "epoch": 3.4998378203048977, "grad_norm": 0.23608912527561188, "learning_rate": 1.09192948974921e-05, "loss": 0.3869, "step": 97110 }, { "epoch": 3.5000180199661224, "grad_norm": 0.23635424673557281, "learning_rate": 1.0916883735523612e-05, "loss": 0.3906, "step": 97115 }, { "epoch": 3.500198219627347, "grad_norm": 0.3293451964855194, "learning_rate": 1.0914472765434852e-05, "loss": 0.3959, "step": 97120 }, { "epoch": 3.500378419288572, 
"grad_norm": 0.21557281911373138, "learning_rate": 1.091206198725868e-05, "loss": 0.36, "step": 97125 }, { "epoch": 3.500558618949796, "grad_norm": 0.2755977511405945, "learning_rate": 1.090965140102793e-05, "loss": 0.3945, "step": 97130 }, { "epoch": 3.500738818611021, "grad_norm": 0.18358829617500305, "learning_rate": 1.0907241006775445e-05, "loss": 0.3519, "step": 97135 }, { "epoch": 3.5009190182722456, "grad_norm": 0.2956106960773468, "learning_rate": 1.090483080453408e-05, "loss": 0.3928, "step": 97140 }, { "epoch": 3.5010992179334703, "grad_norm": 0.23157207667827606, "learning_rate": 1.090242079433667e-05, "loss": 0.4038, "step": 97145 }, { "epoch": 3.501279417594695, "grad_norm": 0.1804695725440979, "learning_rate": 1.0900010976216046e-05, "loss": 0.3982, "step": 97150 }, { "epoch": 3.50145961725592, "grad_norm": 0.25014370679855347, "learning_rate": 1.0897601350205042e-05, "loss": 0.3616, "step": 97155 }, { "epoch": 3.501639816917144, "grad_norm": 0.29266318678855896, "learning_rate": 1.0895191916336477e-05, "loss": 0.3894, "step": 97160 }, { "epoch": 3.501820016578369, "grad_norm": 0.2770882248878479, "learning_rate": 1.0892782674643207e-05, "loss": 0.3823, "step": 97165 }, { "epoch": 3.5020002162395936, "grad_norm": 0.2866399586200714, "learning_rate": 1.0890373625158046e-05, "loss": 0.41, "step": 97170 }, { "epoch": 3.502180415900818, "grad_norm": 0.2779446542263031, "learning_rate": 1.0887964767913796e-05, "loss": 0.3801, "step": 97175 }, { "epoch": 3.5023606155620426, "grad_norm": 0.2382117062807083, "learning_rate": 1.0885556102943303e-05, "loss": 0.3715, "step": 97180 }, { "epoch": 3.5025408152232673, "grad_norm": 0.23480451107025146, "learning_rate": 1.0883147630279367e-05, "loss": 0.3713, "step": 97185 }, { "epoch": 3.502721014884492, "grad_norm": 0.2746983468532562, "learning_rate": 1.0880739349954829e-05, "loss": 0.3993, "step": 97190 }, { "epoch": 3.502901214545717, "grad_norm": 0.27288299798965454, "learning_rate": 1.0878331262002475e-05, 
"loss": 0.3596, "step": 97195 }, { "epoch": 3.5030814142069415, "grad_norm": 0.25127673149108887, "learning_rate": 1.0875923366455113e-05, "loss": 0.3874, "step": 97200 }, { "epoch": 3.503261613868166, "grad_norm": 0.29258519411087036, "learning_rate": 1.0873515663345572e-05, "loss": 0.411, "step": 97205 }, { "epoch": 3.5034418135293905, "grad_norm": 0.2521027624607086, "learning_rate": 1.0871108152706644e-05, "loss": 0.3625, "step": 97210 }, { "epoch": 3.5036220131906153, "grad_norm": 0.24310770630836487, "learning_rate": 1.086870083457113e-05, "loss": 0.3946, "step": 97215 }, { "epoch": 3.5038022128518396, "grad_norm": 0.23873953521251678, "learning_rate": 1.0866293708971834e-05, "loss": 0.3985, "step": 97220 }, { "epoch": 3.5039824125130643, "grad_norm": 0.25413691997528076, "learning_rate": 1.086388677594154e-05, "loss": 0.3693, "step": 97225 }, { "epoch": 3.504162612174289, "grad_norm": 0.24815772473812103, "learning_rate": 1.086148003551306e-05, "loss": 0.3974, "step": 97230 }, { "epoch": 3.5043428118355138, "grad_norm": 0.2222224920988083, "learning_rate": 1.085907348771918e-05, "loss": 0.3328, "step": 97235 }, { "epoch": 3.5045230114967385, "grad_norm": 0.18813829123973846, "learning_rate": 1.0856667132592683e-05, "loss": 0.3828, "step": 97240 }, { "epoch": 3.5047032111579632, "grad_norm": 0.2500969171524048, "learning_rate": 1.0854260970166358e-05, "loss": 0.3769, "step": 97245 }, { "epoch": 3.5048834108191875, "grad_norm": 0.3164437711238861, "learning_rate": 1.085185500047299e-05, "loss": 0.3717, "step": 97250 }, { "epoch": 3.5050636104804123, "grad_norm": 0.23495697975158691, "learning_rate": 1.084944922354535e-05, "loss": 0.3808, "step": 97255 }, { "epoch": 3.505243810141637, "grad_norm": 0.2900897264480591, "learning_rate": 1.0847043639416233e-05, "loss": 0.3814, "step": 97260 }, { "epoch": 3.5054240098028617, "grad_norm": 0.2515815794467926, "learning_rate": 1.0844638248118405e-05, "loss": 0.3928, "step": 97265 }, { "epoch": 3.505604209464086, 
"grad_norm": 0.23748047649860382, "learning_rate": 1.0842233049684642e-05, "loss": 0.4071, "step": 97270 }, { "epoch": 3.5057844091253108, "grad_norm": 0.2293088734149933, "learning_rate": 1.0839828044147712e-05, "loss": 0.4278, "step": 97275 }, { "epoch": 3.5059646087865355, "grad_norm": 0.2869160771369934, "learning_rate": 1.0837423231540375e-05, "loss": 0.3777, "step": 97280 }, { "epoch": 3.50614480844776, "grad_norm": 0.2737312912940979, "learning_rate": 1.083501861189542e-05, "loss": 0.3707, "step": 97285 }, { "epoch": 3.506325008108985, "grad_norm": 0.2278176099061966, "learning_rate": 1.0832614185245587e-05, "loss": 0.362, "step": 97290 }, { "epoch": 3.5065052077702092, "grad_norm": 0.26734185218811035, "learning_rate": 1.0830209951623635e-05, "loss": 0.3986, "step": 97295 }, { "epoch": 3.506685407431434, "grad_norm": 0.2176479548215866, "learning_rate": 1.0827805911062336e-05, "loss": 0.3585, "step": 97300 }, { "epoch": 3.5068656070926587, "grad_norm": 0.22162985801696777, "learning_rate": 1.082540206359444e-05, "loss": 0.3828, "step": 97305 }, { "epoch": 3.5070458067538834, "grad_norm": 0.21119628846645355, "learning_rate": 1.0822998409252694e-05, "loss": 0.3715, "step": 97310 }, { "epoch": 3.5072260064151077, "grad_norm": 0.25533100962638855, "learning_rate": 1.082059494806985e-05, "loss": 0.4005, "step": 97315 }, { "epoch": 3.5074062060763325, "grad_norm": 0.3065352439880371, "learning_rate": 1.0818191680078648e-05, "loss": 0.3915, "step": 97320 }, { "epoch": 3.507586405737557, "grad_norm": 0.23930329084396362, "learning_rate": 1.0815788605311846e-05, "loss": 0.3843, "step": 97325 }, { "epoch": 3.507766605398782, "grad_norm": 0.2522366940975189, "learning_rate": 1.0813385723802177e-05, "loss": 0.3506, "step": 97330 }, { "epoch": 3.5079468050600067, "grad_norm": 0.22099296748638153, "learning_rate": 1.0810983035582384e-05, "loss": 0.3578, "step": 97335 }, { "epoch": 3.5081270047212314, "grad_norm": 0.2783578038215637, "learning_rate": 
1.0808580540685198e-05, "loss": 0.409, "step": 97340 }, { "epoch": 3.5083072043824557, "grad_norm": 0.282536119222641, "learning_rate": 1.0806178239143347e-05, "loss": 0.392, "step": 97345 }, { "epoch": 3.5084874040436804, "grad_norm": 0.2658419609069824, "learning_rate": 1.0803776130989577e-05, "loss": 0.4034, "step": 97350 }, { "epoch": 3.508667603704905, "grad_norm": 0.23582491278648376, "learning_rate": 1.0801374216256619e-05, "loss": 0.3993, "step": 97355 }, { "epoch": 3.5088478033661294, "grad_norm": 0.22847378253936768, "learning_rate": 1.0798972494977167e-05, "loss": 0.3643, "step": 97360 }, { "epoch": 3.509028003027354, "grad_norm": 0.24764804542064667, "learning_rate": 1.0796570967183978e-05, "loss": 0.3575, "step": 97365 }, { "epoch": 3.509208202688579, "grad_norm": 0.2016579508781433, "learning_rate": 1.079416963290976e-05, "loss": 0.3777, "step": 97370 }, { "epoch": 3.5093884023498036, "grad_norm": 0.2673712968826294, "learning_rate": 1.0791768492187218e-05, "loss": 0.3758, "step": 97375 }, { "epoch": 3.5095686020110284, "grad_norm": 0.23791760206222534, "learning_rate": 1.07893675450491e-05, "loss": 0.3433, "step": 97380 }, { "epoch": 3.509748801672253, "grad_norm": 0.208527609705925, "learning_rate": 1.0786966791528078e-05, "loss": 0.3444, "step": 97385 }, { "epoch": 3.5099290013334774, "grad_norm": 0.2786830961704254, "learning_rate": 1.0784566231656893e-05, "loss": 0.3838, "step": 97390 }, { "epoch": 3.510109200994702, "grad_norm": 0.2595866024494171, "learning_rate": 1.078216586546824e-05, "loss": 0.3882, "step": 97395 }, { "epoch": 3.510289400655927, "grad_norm": 0.2380652129650116, "learning_rate": 1.0779765692994826e-05, "loss": 0.3906, "step": 97400 }, { "epoch": 3.510469600317151, "grad_norm": 0.2838803231716156, "learning_rate": 1.0777365714269347e-05, "loss": 0.3842, "step": 97405 }, { "epoch": 3.510649799978376, "grad_norm": 0.21511715650558472, "learning_rate": 1.0774965929324502e-05, "loss": 0.3744, "step": 97410 }, { "epoch": 
3.5108299996396006, "grad_norm": 0.22063112258911133, "learning_rate": 1.0772566338193e-05, "loss": 0.3887, "step": 97415 }, { "epoch": 3.5110101993008254, "grad_norm": 0.21338684856891632, "learning_rate": 1.0770166940907524e-05, "loss": 0.3964, "step": 97420 }, { "epoch": 3.51119039896205, "grad_norm": 0.21853932738304138, "learning_rate": 1.0767767737500772e-05, "loss": 0.3588, "step": 97425 }, { "epoch": 3.511370598623275, "grad_norm": 0.20588691532611847, "learning_rate": 1.0765368728005429e-05, "loss": 0.4019, "step": 97430 }, { "epoch": 3.511550798284499, "grad_norm": 0.27712467312812805, "learning_rate": 1.076296991245418e-05, "loss": 0.4012, "step": 97435 }, { "epoch": 3.511730997945724, "grad_norm": 0.2563231289386749, "learning_rate": 1.07605712908797e-05, "loss": 0.3952, "step": 97440 }, { "epoch": 3.5119111976069486, "grad_norm": 0.1871892213821411, "learning_rate": 1.0758172863314689e-05, "loss": 0.3994, "step": 97445 }, { "epoch": 3.512091397268173, "grad_norm": 0.24314607679843903, "learning_rate": 1.0755774629791815e-05, "loss": 0.384, "step": 97450 }, { "epoch": 3.5122715969293976, "grad_norm": 0.23912304639816284, "learning_rate": 1.0753376590343755e-05, "loss": 0.3468, "step": 97455 }, { "epoch": 3.5124517965906223, "grad_norm": 0.25849324464797974, "learning_rate": 1.0750978745003181e-05, "loss": 0.395, "step": 97460 }, { "epoch": 3.512631996251847, "grad_norm": 0.21953187882900238, "learning_rate": 1.0748581093802753e-05, "loss": 0.3912, "step": 97465 }, { "epoch": 3.512812195913072, "grad_norm": 0.231736421585083, "learning_rate": 1.0746183636775167e-05, "loss": 0.3748, "step": 97470 }, { "epoch": 3.5129923955742965, "grad_norm": 0.27500176429748535, "learning_rate": 1.0743786373953061e-05, "loss": 0.3845, "step": 97475 }, { "epoch": 3.513172595235521, "grad_norm": 0.2197420597076416, "learning_rate": 1.0741389305369093e-05, "loss": 0.3836, "step": 97480 }, { "epoch": 3.5133527948967456, "grad_norm": 0.22646686434745789, "learning_rate": 
1.0738992431055948e-05, "loss": 0.347, "step": 97485 }, { "epoch": 3.5135329945579703, "grad_norm": 0.22990435361862183, "learning_rate": 1.073659575104626e-05, "loss": 0.4269, "step": 97490 }, { "epoch": 3.5137131942191946, "grad_norm": 0.29172033071517944, "learning_rate": 1.0734678546958963e-05, "loss": 0.3727, "step": 97495 }, { "epoch": 3.5138933938804193, "grad_norm": 0.24528679251670837, "learning_rate": 1.0732282216777811e-05, "loss": 0.3703, "step": 97500 }, { "epoch": 3.5138933938804193, "eval_loss": 0.4301924407482147, "eval_runtime": 3.5317, "eval_samples_per_second": 28.315, "eval_steps_per_second": 7.079, "step": 97500 }, { "epoch": 3.514073593541644, "grad_norm": 0.20823730528354645, "learning_rate": 1.0729886080991553e-05, "loss": 0.3724, "step": 97505 }, { "epoch": 3.514253793202869, "grad_norm": 0.23355242609977722, "learning_rate": 1.0727490139632824e-05, "loss": 0.4024, "step": 97510 }, { "epoch": 3.5144339928640935, "grad_norm": 0.2617993950843811, "learning_rate": 1.0725094392734289e-05, "loss": 0.3831, "step": 97515 }, { "epoch": 3.5146141925253183, "grad_norm": 0.2531090974807739, "learning_rate": 1.0722698840328576e-05, "loss": 0.3876, "step": 97520 }, { "epoch": 3.5147943921865425, "grad_norm": 0.24823562800884247, "learning_rate": 1.0720303482448333e-05, "loss": 0.3846, "step": 97525 }, { "epoch": 3.5149745918477673, "grad_norm": 0.2218950390815735, "learning_rate": 1.0717908319126185e-05, "loss": 0.3883, "step": 97530 }, { "epoch": 3.515154791508992, "grad_norm": 0.23106147348880768, "learning_rate": 1.0715513350394762e-05, "loss": 0.4091, "step": 97535 }, { "epoch": 3.5153349911702167, "grad_norm": 0.31263554096221924, "learning_rate": 1.0713118576286716e-05, "loss": 0.3521, "step": 97540 }, { "epoch": 3.515515190831441, "grad_norm": 0.25022903084754944, "learning_rate": 1.0710723996834671e-05, "loss": 0.3958, "step": 97545 }, { "epoch": 3.5156953904926658, "grad_norm": 0.25562384724617004, "learning_rate": 1.0708329612071227e-05, 
"loss": 0.3729, "step": 97550 }, { "epoch": 3.5158755901538905, "grad_norm": 0.25449755787849426, "learning_rate": 1.0705935422029034e-05, "loss": 0.3987, "step": 97555 }, { "epoch": 3.5160557898151152, "grad_norm": 0.20006752014160156, "learning_rate": 1.0703541426740697e-05, "loss": 0.3625, "step": 97560 }, { "epoch": 3.51623598947634, "grad_norm": 0.20985792577266693, "learning_rate": 1.0701147626238856e-05, "loss": 0.3947, "step": 97565 }, { "epoch": 3.5164161891375643, "grad_norm": 0.24245929718017578, "learning_rate": 1.0698754020556101e-05, "loss": 0.3838, "step": 97570 }, { "epoch": 3.516596388798789, "grad_norm": 0.23297759890556335, "learning_rate": 1.0696360609725044e-05, "loss": 0.3709, "step": 97575 }, { "epoch": 3.5167765884600137, "grad_norm": 0.27019697427749634, "learning_rate": 1.0693967393778315e-05, "loss": 0.3778, "step": 97580 }, { "epoch": 3.5169567881212385, "grad_norm": 0.26086610555648804, "learning_rate": 1.0691574372748509e-05, "loss": 0.393, "step": 97585 }, { "epoch": 3.5171369877824628, "grad_norm": 0.20790447294712067, "learning_rate": 1.0689181546668234e-05, "loss": 0.342, "step": 97590 }, { "epoch": 3.5173171874436875, "grad_norm": 0.24701309204101562, "learning_rate": 1.0686788915570088e-05, "loss": 0.3964, "step": 97595 }, { "epoch": 3.517497387104912, "grad_norm": 0.2746676802635193, "learning_rate": 1.0684396479486664e-05, "loss": 0.3973, "step": 97600 }, { "epoch": 3.517677586766137, "grad_norm": 0.3433469831943512, "learning_rate": 1.0682004238450574e-05, "loss": 0.3645, "step": 97605 }, { "epoch": 3.5178577864273617, "grad_norm": 0.22473633289337158, "learning_rate": 1.0679612192494403e-05, "loss": 0.3702, "step": 97610 }, { "epoch": 3.5180379860885864, "grad_norm": 0.26055383682250977, "learning_rate": 1.0677220341650747e-05, "loss": 0.3831, "step": 97615 }, { "epoch": 3.5182181857498107, "grad_norm": 0.22649826109409332, "learning_rate": 1.067482868595219e-05, "loss": 0.3515, "step": 97620 }, { "epoch": 3.5183983854110354, 
"grad_norm": 0.24779903888702393, "learning_rate": 1.067243722543131e-05, "loss": 0.3627, "step": 97625 }, { "epoch": 3.51857858507226, "grad_norm": 0.2685101330280304, "learning_rate": 1.0670045960120707e-05, "loss": 0.3862, "step": 97630 }, { "epoch": 3.5187587847334845, "grad_norm": 0.2545015513896942, "learning_rate": 1.0667654890052962e-05, "loss": 0.3848, "step": 97635 }, { "epoch": 3.518938984394709, "grad_norm": 0.2047766149044037, "learning_rate": 1.0665264015260626e-05, "loss": 0.3676, "step": 97640 }, { "epoch": 3.519119184055934, "grad_norm": 0.24359673261642456, "learning_rate": 1.0662873335776302e-05, "loss": 0.3863, "step": 97645 }, { "epoch": 3.5192993837171587, "grad_norm": 0.2065386027097702, "learning_rate": 1.0660482851632553e-05, "loss": 0.402, "step": 97650 }, { "epoch": 3.5194795833783834, "grad_norm": 0.3513331711292267, "learning_rate": 1.065809256286194e-05, "loss": 0.3652, "step": 97655 }, { "epoch": 3.519659783039608, "grad_norm": 0.23848022520542145, "learning_rate": 1.0655702469497057e-05, "loss": 0.3883, "step": 97660 }, { "epoch": 3.5198399827008324, "grad_norm": 0.1824946254491806, "learning_rate": 1.0653312571570434e-05, "loss": 0.384, "step": 97665 }, { "epoch": 3.520020182362057, "grad_norm": 0.28067702054977417, "learning_rate": 1.0650922869114658e-05, "loss": 0.3883, "step": 97670 }, { "epoch": 3.520200382023282, "grad_norm": 0.23458154499530792, "learning_rate": 1.0648533362162277e-05, "loss": 0.4112, "step": 97675 }, { "epoch": 3.520380581684506, "grad_norm": 0.2893427908420563, "learning_rate": 1.0646144050745854e-05, "loss": 0.3922, "step": 97680 }, { "epoch": 3.520560781345731, "grad_norm": 0.17397251725196838, "learning_rate": 1.0643754934897937e-05, "loss": 0.3335, "step": 97685 }, { "epoch": 3.5207409810069556, "grad_norm": 0.25946879386901855, "learning_rate": 1.064136601465108e-05, "loss": 0.3578, "step": 97690 }, { "epoch": 3.5209211806681804, "grad_norm": 0.28153377771377563, "learning_rate": 1.0638977290037825e-05, 
"loss": 0.3993, "step": 97695 }, { "epoch": 3.521101380329405, "grad_norm": 0.2764267325401306, "learning_rate": 1.063658876109073e-05, "loss": 0.3849, "step": 97700 }, { "epoch": 3.52128157999063, "grad_norm": 0.26901406049728394, "learning_rate": 1.0634200427842334e-05, "loss": 0.3975, "step": 97705 }, { "epoch": 3.521461779651854, "grad_norm": 0.22816415131092072, "learning_rate": 1.0631812290325174e-05, "loss": 0.3826, "step": 97710 }, { "epoch": 3.521641979313079, "grad_norm": 0.22323520481586456, "learning_rate": 1.062942434857179e-05, "loss": 0.3701, "step": 97715 }, { "epoch": 3.5218221789743036, "grad_norm": 0.18023869395256042, "learning_rate": 1.062703660261471e-05, "loss": 0.3809, "step": 97720 }, { "epoch": 3.522002378635528, "grad_norm": 0.2793877124786377, "learning_rate": 1.062464905248648e-05, "loss": 0.3815, "step": 97725 }, { "epoch": 3.5221825782967526, "grad_norm": 0.19938796758651733, "learning_rate": 1.0622261698219634e-05, "loss": 0.3712, "step": 97730 }, { "epoch": 3.5223627779579774, "grad_norm": 0.21415963768959045, "learning_rate": 1.0619874539846673e-05, "loss": 0.3845, "step": 97735 }, { "epoch": 3.522542977619202, "grad_norm": 0.2542286515235901, "learning_rate": 1.0617487577400143e-05, "loss": 0.3956, "step": 97740 }, { "epoch": 3.522723177280427, "grad_norm": 0.25228896737098694, "learning_rate": 1.0615100810912551e-05, "loss": 0.365, "step": 97745 }, { "epoch": 3.5229033769416516, "grad_norm": 0.24575848877429962, "learning_rate": 1.0612714240416444e-05, "loss": 0.3517, "step": 97750 }, { "epoch": 3.523083576602876, "grad_norm": 0.2416459023952484, "learning_rate": 1.0610327865944311e-05, "loss": 0.3597, "step": 97755 }, { "epoch": 3.5232637762641006, "grad_norm": 0.2579323351383209, "learning_rate": 1.0607941687528669e-05, "loss": 0.4051, "step": 97760 }, { "epoch": 3.5234439759253253, "grad_norm": 0.3103504478931427, "learning_rate": 1.0605555705202041e-05, "loss": 0.3566, "step": 97765 }, { "epoch": 3.52362417558655, 
"grad_norm": 0.22384612262248993, "learning_rate": 1.060316991899693e-05, "loss": 0.3673, "step": 97770 }, { "epoch": 3.5238043752477743, "grad_norm": 0.22075200080871582, "learning_rate": 1.0600784328945843e-05, "loss": 0.3654, "step": 97775 }, { "epoch": 3.523984574908999, "grad_norm": 0.24411165714263916, "learning_rate": 1.059839893508128e-05, "loss": 0.3677, "step": 97780 }, { "epoch": 3.524164774570224, "grad_norm": 0.2727641761302948, "learning_rate": 1.0596013737435734e-05, "loss": 0.3663, "step": 97785 }, { "epoch": 3.5243449742314485, "grad_norm": 0.21201664209365845, "learning_rate": 1.0593628736041722e-05, "loss": 0.3997, "step": 97790 }, { "epoch": 3.5245251738926733, "grad_norm": 0.2514842450618744, "learning_rate": 1.0591243930931729e-05, "loss": 0.3711, "step": 97795 }, { "epoch": 3.5247053735538976, "grad_norm": 0.21594151854515076, "learning_rate": 1.0588859322138247e-05, "loss": 0.3714, "step": 97800 }, { "epoch": 3.5248855732151223, "grad_norm": 0.21471446752548218, "learning_rate": 1.058647490969377e-05, "loss": 0.3749, "step": 97805 }, { "epoch": 3.525065772876347, "grad_norm": 0.2319725900888443, "learning_rate": 1.0584090693630778e-05, "loss": 0.3581, "step": 97810 }, { "epoch": 3.5252459725375718, "grad_norm": 0.18940207362174988, "learning_rate": 1.0581706673981753e-05, "loss": 0.3634, "step": 97815 }, { "epoch": 3.525426172198796, "grad_norm": 0.22552742063999176, "learning_rate": 1.0579322850779188e-05, "loss": 0.3836, "step": 97820 }, { "epoch": 3.525606371860021, "grad_norm": 0.22318586707115173, "learning_rate": 1.0576939224055563e-05, "loss": 0.3915, "step": 97825 }, { "epoch": 3.5257865715212455, "grad_norm": 0.22033105790615082, "learning_rate": 1.0574555793843345e-05, "loss": 0.3772, "step": 97830 }, { "epoch": 3.5259667711824703, "grad_norm": 0.19937936961650848, "learning_rate": 1.0572172560175011e-05, "loss": 0.379, "step": 97835 }, { "epoch": 3.526146970843695, "grad_norm": 0.19125044345855713, "learning_rate": 
1.0569789523083026e-05, "loss": 0.4136, "step": 97840 }, { "epoch": 3.5263271705049197, "grad_norm": 0.2663952708244324, "learning_rate": 1.056740668259988e-05, "loss": 0.3793, "step": 97845 }, { "epoch": 3.526507370166144, "grad_norm": 0.21519611775875092, "learning_rate": 1.0565024038758009e-05, "loss": 0.3712, "step": 97850 }, { "epoch": 3.5266875698273688, "grad_norm": 0.2259238213300705, "learning_rate": 1.0562641591589898e-05, "loss": 0.3741, "step": 97855 }, { "epoch": 3.5268677694885935, "grad_norm": 0.22167803347110748, "learning_rate": 1.0560259341128e-05, "loss": 0.3672, "step": 97860 }, { "epoch": 3.5270479691498178, "grad_norm": 0.1899726837873459, "learning_rate": 1.0557877287404774e-05, "loss": 0.375, "step": 97865 }, { "epoch": 3.5272281688110425, "grad_norm": 0.23604774475097656, "learning_rate": 1.0555495430452673e-05, "loss": 0.355, "step": 97870 }, { "epoch": 3.5274083684722672, "grad_norm": 0.22828394174575806, "learning_rate": 1.0553113770304152e-05, "loss": 0.3715, "step": 97875 }, { "epoch": 3.527588568133492, "grad_norm": 0.1918339878320694, "learning_rate": 1.0550732306991648e-05, "loss": 0.3772, "step": 97880 }, { "epoch": 3.5277687677947167, "grad_norm": 0.21536006033420563, "learning_rate": 1.0548351040547628e-05, "loss": 0.348, "step": 97885 }, { "epoch": 3.5279489674559414, "grad_norm": 0.2499334067106247, "learning_rate": 1.0545969971004527e-05, "loss": 0.3779, "step": 97890 }, { "epoch": 3.5281291671171657, "grad_norm": 0.22823038697242737, "learning_rate": 1.0543589098394784e-05, "loss": 0.3632, "step": 97895 }, { "epoch": 3.5283093667783905, "grad_norm": 0.20469775795936584, "learning_rate": 1.0541208422750846e-05, "loss": 0.3779, "step": 97900 }, { "epoch": 3.528489566439615, "grad_norm": 0.2718513309955597, "learning_rate": 1.053882794410513e-05, "loss": 0.3781, "step": 97905 }, { "epoch": 3.5286697661008395, "grad_norm": 0.1913396418094635, "learning_rate": 1.0536447662490097e-05, "loss": 0.3742, "step": 97910 }, { "epoch": 
3.5288499657620642, "grad_norm": 0.24252444505691528, "learning_rate": 1.0534067577938172e-05, "loss": 0.3849, "step": 97915 }, { "epoch": 3.529030165423289, "grad_norm": 0.26426470279693604, "learning_rate": 1.0531687690481757e-05, "loss": 0.3961, "step": 97920 }, { "epoch": 3.5292103650845137, "grad_norm": 0.1625565141439438, "learning_rate": 1.0529308000153304e-05, "loss": 0.3535, "step": 97925 }, { "epoch": 3.5293905647457384, "grad_norm": 0.2652257978916168, "learning_rate": 1.052692850698522e-05, "loss": 0.3936, "step": 97930 }, { "epoch": 3.529570764406963, "grad_norm": 0.2854976952075958, "learning_rate": 1.052454921100995e-05, "loss": 0.382, "step": 97935 }, { "epoch": 3.5297509640681874, "grad_norm": 0.24219295382499695, "learning_rate": 1.0522170112259887e-05, "loss": 0.3635, "step": 97940 }, { "epoch": 3.529931163729412, "grad_norm": 0.19204115867614746, "learning_rate": 1.0519791210767446e-05, "loss": 0.3773, "step": 97945 }, { "epoch": 3.530111363390637, "grad_norm": 0.30104586482048035, "learning_rate": 1.0517412506565052e-05, "loss": 0.4092, "step": 97950 }, { "epoch": 3.530291563051861, "grad_norm": 0.26225852966308594, "learning_rate": 1.0515033999685109e-05, "loss": 0.3847, "step": 97955 }, { "epoch": 3.530471762713086, "grad_norm": 0.2835884690284729, "learning_rate": 1.0512655690160025e-05, "loss": 0.4016, "step": 97960 }, { "epoch": 3.5306519623743107, "grad_norm": 0.21307377517223358, "learning_rate": 1.05102775780222e-05, "loss": 0.3708, "step": 97965 }, { "epoch": 3.5308321620355354, "grad_norm": 0.22064463794231415, "learning_rate": 1.050789966330403e-05, "loss": 0.4074, "step": 97970 }, { "epoch": 3.53101236169676, "grad_norm": 0.22840745747089386, "learning_rate": 1.050552194603793e-05, "loss": 0.4148, "step": 97975 }, { "epoch": 3.531192561357985, "grad_norm": 0.22727341949939728, "learning_rate": 1.050314442625629e-05, "loss": 0.3656, "step": 97980 }, { "epoch": 3.531372761019209, "grad_norm": 0.22287560999393463, "learning_rate": 
1.0500767103991496e-05, "loss": 0.4111, "step": 97985 }, { "epoch": 3.531552960680434, "grad_norm": 0.22651442885398865, "learning_rate": 1.0498389979275947e-05, "loss": 0.3876, "step": 97990 }, { "epoch": 3.5317331603416586, "grad_norm": 0.21187719702720642, "learning_rate": 1.0496013052142027e-05, "loss": 0.3932, "step": 97995 }, { "epoch": 3.531913360002883, "grad_norm": 0.23431113362312317, "learning_rate": 1.0493636322622108e-05, "loss": 0.3845, "step": 98000 }, { "epoch": 3.531913360002883, "eval_loss": 0.42959845066070557, "eval_runtime": 3.5275, "eval_samples_per_second": 28.349, "eval_steps_per_second": 7.087, "step": 98000 }, { "epoch": 3.5320935596641077, "grad_norm": 0.28880631923675537, "learning_rate": 1.0491259790748597e-05, "loss": 0.3756, "step": 98005 }, { "epoch": 3.5322737593253324, "grad_norm": 0.2546299397945404, "learning_rate": 1.0488883456553861e-05, "loss": 0.3563, "step": 98010 }, { "epoch": 3.532453958986557, "grad_norm": 0.2502884566783905, "learning_rate": 1.0486507320070279e-05, "loss": 0.362, "step": 98015 }, { "epoch": 3.532634158647782, "grad_norm": 0.23691552877426147, "learning_rate": 1.0484131381330228e-05, "loss": 0.3751, "step": 98020 }, { "epoch": 3.5328143583090066, "grad_norm": 0.2482050359249115, "learning_rate": 1.0481755640366065e-05, "loss": 0.3846, "step": 98025 }, { "epoch": 3.532994557970231, "grad_norm": 0.24961546063423157, "learning_rate": 1.0479380097210187e-05, "loss": 0.3872, "step": 98030 }, { "epoch": 3.5331747576314556, "grad_norm": 0.17989207804203033, "learning_rate": 1.0477004751894929e-05, "loss": 0.3843, "step": 98035 }, { "epoch": 3.5333549572926803, "grad_norm": 0.2397972047328949, "learning_rate": 1.0474629604452676e-05, "loss": 0.3869, "step": 98040 }, { "epoch": 3.533535156953905, "grad_norm": 0.272879958152771, "learning_rate": 1.0472254654915784e-05, "loss": 0.407, "step": 98045 }, { "epoch": 3.5337153566151294, "grad_norm": 0.25433728098869324, "learning_rate": 1.046987990331661e-05, "loss": 
0.3856, "step": 98050 }, { "epoch": 3.533895556276354, "grad_norm": 0.2679862082004547, "learning_rate": 1.046750534968751e-05, "loss": 0.4406, "step": 98055 }, { "epoch": 3.534075755937579, "grad_norm": 0.18556173145771027, "learning_rate": 1.0465130994060831e-05, "loss": 0.3705, "step": 98060 }, { "epoch": 3.5342559555988036, "grad_norm": 0.23661454021930695, "learning_rate": 1.0462756836468924e-05, "loss": 0.3722, "step": 98065 }, { "epoch": 3.5344361552600283, "grad_norm": 0.21671940386295319, "learning_rate": 1.046038287694415e-05, "loss": 0.3898, "step": 98070 }, { "epoch": 3.5346163549212526, "grad_norm": 0.21144139766693115, "learning_rate": 1.0458009115518841e-05, "loss": 0.364, "step": 98075 }, { "epoch": 3.5347965545824773, "grad_norm": 0.2512389123439789, "learning_rate": 1.0455635552225343e-05, "loss": 0.3769, "step": 98080 }, { "epoch": 3.534976754243702, "grad_norm": 0.2403516322374344, "learning_rate": 1.0453262187095996e-05, "loss": 0.3693, "step": 98085 }, { "epoch": 3.535156953904927, "grad_norm": 0.2084726095199585, "learning_rate": 1.0450889020163126e-05, "loss": 0.3775, "step": 98090 }, { "epoch": 3.535337153566151, "grad_norm": 0.2216814011335373, "learning_rate": 1.0448516051459087e-05, "loss": 0.4085, "step": 98095 }, { "epoch": 3.535517353227376, "grad_norm": 0.2486451119184494, "learning_rate": 1.0446143281016205e-05, "loss": 0.3845, "step": 98100 }, { "epoch": 3.5356975528886005, "grad_norm": 0.22463186085224152, "learning_rate": 1.0443770708866787e-05, "loss": 0.3779, "step": 98105 }, { "epoch": 3.5358777525498253, "grad_norm": 0.232293039560318, "learning_rate": 1.0441398335043184e-05, "loss": 0.3514, "step": 98110 }, { "epoch": 3.53605795221105, "grad_norm": 0.29086434841156006, "learning_rate": 1.0439026159577703e-05, "loss": 0.3531, "step": 98115 }, { "epoch": 3.5362381518722747, "grad_norm": 0.2319757342338562, "learning_rate": 1.0436654182502678e-05, "loss": 0.4037, "step": 98120 }, { "epoch": 3.536418351533499, "grad_norm": 
0.23496223986148834, "learning_rate": 1.0434282403850429e-05, "loss": 0.3831, "step": 98125 }, { "epoch": 3.5365985511947238, "grad_norm": 0.21282298862934113, "learning_rate": 1.0431910823653244e-05, "loss": 0.3574, "step": 98130 }, { "epoch": 3.5367787508559485, "grad_norm": 0.24302206933498383, "learning_rate": 1.0429539441943464e-05, "loss": 0.3586, "step": 98135 }, { "epoch": 3.536958950517173, "grad_norm": 0.2705324590206146, "learning_rate": 1.0427168258753386e-05, "loss": 0.3779, "step": 98140 }, { "epoch": 3.5371391501783975, "grad_norm": 0.23419471085071564, "learning_rate": 1.0424797274115322e-05, "loss": 0.3901, "step": 98145 }, { "epoch": 3.5373193498396223, "grad_norm": 0.20766755938529968, "learning_rate": 1.0422426488061572e-05, "loss": 0.386, "step": 98150 }, { "epoch": 3.537499549500847, "grad_norm": 0.23583157360553741, "learning_rate": 1.042005590062443e-05, "loss": 0.3403, "step": 98155 }, { "epoch": 3.5376797491620717, "grad_norm": 0.2042224109172821, "learning_rate": 1.0417685511836212e-05, "loss": 0.3912, "step": 98160 }, { "epoch": 3.5378599488232965, "grad_norm": 0.2094784826040268, "learning_rate": 1.0415315321729205e-05, "loss": 0.3683, "step": 98165 }, { "epoch": 3.5380401484845208, "grad_norm": 0.24115018546581268, "learning_rate": 1.0412945330335705e-05, "loss": 0.3984, "step": 98170 }, { "epoch": 3.5382203481457455, "grad_norm": 0.216557577252388, "learning_rate": 1.0410575537688e-05, "loss": 0.3695, "step": 98175 }, { "epoch": 3.53840054780697, "grad_norm": 0.22672514617443085, "learning_rate": 1.0408205943818378e-05, "loss": 0.3619, "step": 98180 }, { "epoch": 3.5385807474681945, "grad_norm": 0.23298558592796326, "learning_rate": 1.0405836548759117e-05, "loss": 0.414, "step": 98185 }, { "epoch": 3.5387609471294192, "grad_norm": 0.20260320603847504, "learning_rate": 1.0403467352542515e-05, "loss": 0.3736, "step": 98190 }, { "epoch": 3.538941146790644, "grad_norm": 0.21936598420143127, "learning_rate": 1.0401098355200848e-05, "loss": 
0.3585, "step": 98195 }, { "epoch": 3.5391213464518687, "grad_norm": 0.2278914600610733, "learning_rate": 1.0398729556766385e-05, "loss": 0.3446, "step": 98200 }, { "epoch": 3.5393015461130934, "grad_norm": 0.24078984558582306, "learning_rate": 1.0396360957271405e-05, "loss": 0.4154, "step": 98205 }, { "epoch": 3.539481745774318, "grad_norm": 0.24573035538196564, "learning_rate": 1.0393992556748172e-05, "loss": 0.3924, "step": 98210 }, { "epoch": 3.5396619454355425, "grad_norm": 0.25402477383613586, "learning_rate": 1.0391624355228982e-05, "loss": 0.3632, "step": 98215 }, { "epoch": 3.539842145096767, "grad_norm": 0.19696156680583954, "learning_rate": 1.0389256352746063e-05, "loss": 0.3623, "step": 98220 }, { "epoch": 3.540022344757992, "grad_norm": 0.24171549081802368, "learning_rate": 1.0386888549331706e-05, "loss": 0.4052, "step": 98225 }, { "epoch": 3.5402025444192162, "grad_norm": 0.25626784563064575, "learning_rate": 1.0384520945018164e-05, "loss": 0.3945, "step": 98230 }, { "epoch": 3.540382744080441, "grad_norm": 0.2716904282569885, "learning_rate": 1.0382153539837686e-05, "loss": 0.3824, "step": 98235 }, { "epoch": 3.5405629437416657, "grad_norm": 0.20084236562252045, "learning_rate": 1.0379786333822552e-05, "loss": 0.3576, "step": 98240 }, { "epoch": 3.5407431434028904, "grad_norm": 0.21598944067955017, "learning_rate": 1.037741932700499e-05, "loss": 0.3622, "step": 98245 }, { "epoch": 3.540923343064115, "grad_norm": 0.2069658786058426, "learning_rate": 1.037505251941725e-05, "loss": 0.375, "step": 98250 }, { "epoch": 3.54110354272534, "grad_norm": 0.2307698279619217, "learning_rate": 1.0372685911091598e-05, "loss": 0.3512, "step": 98255 }, { "epoch": 3.541283742386564, "grad_norm": 0.23482072353363037, "learning_rate": 1.0370319502060267e-05, "loss": 0.3773, "step": 98260 }, { "epoch": 3.541463942047789, "grad_norm": 0.23283249139785767, "learning_rate": 1.03679532923555e-05, "loss": 0.4013, "step": 98265 }, { "epoch": 3.5416441417090136, "grad_norm": 
0.23621907830238342, "learning_rate": 1.0365587282009539e-05, "loss": 0.404, "step": 98270 }, { "epoch": 3.5418243413702384, "grad_norm": 0.2579342722892761, "learning_rate": 1.0363221471054607e-05, "loss": 0.4041, "step": 98275 }, { "epoch": 3.5420045410314627, "grad_norm": 0.2534416615962982, "learning_rate": 1.0360855859522958e-05, "loss": 0.3824, "step": 98280 }, { "epoch": 3.5421847406926874, "grad_norm": 0.20750831067562103, "learning_rate": 1.0358490447446815e-05, "loss": 0.3827, "step": 98285 }, { "epoch": 3.542364940353912, "grad_norm": 0.22172388434410095, "learning_rate": 1.0356125234858405e-05, "loss": 0.3838, "step": 98290 }, { "epoch": 3.542545140015137, "grad_norm": 0.2661237120628357, "learning_rate": 1.0353760221789951e-05, "loss": 0.3559, "step": 98295 }, { "epoch": 3.5427253396763616, "grad_norm": 0.24932733178138733, "learning_rate": 1.035139540827367e-05, "loss": 0.3511, "step": 98300 }, { "epoch": 3.542905539337586, "grad_norm": 0.21312791109085083, "learning_rate": 1.0349030794341802e-05, "loss": 0.3607, "step": 98305 }, { "epoch": 3.5430857389988106, "grad_norm": 0.2608811855316162, "learning_rate": 1.0346666380026559e-05, "loss": 0.3928, "step": 98310 }, { "epoch": 3.5432659386600354, "grad_norm": 0.21913045644760132, "learning_rate": 1.0344302165360134e-05, "loss": 0.4061, "step": 98315 }, { "epoch": 3.54344613832126, "grad_norm": 0.23208434879779816, "learning_rate": 1.0341938150374761e-05, "loss": 0.3448, "step": 98320 }, { "epoch": 3.5436263379824844, "grad_norm": 0.22456184029579163, "learning_rate": 1.0339574335102645e-05, "loss": 0.3587, "step": 98325 }, { "epoch": 3.543806537643709, "grad_norm": 0.23729225993156433, "learning_rate": 1.033721071957599e-05, "loss": 0.3741, "step": 98330 }, { "epoch": 3.543986737304934, "grad_norm": 0.26063045859336853, "learning_rate": 1.0334847303827e-05, "loss": 0.3996, "step": 98335 }, { "epoch": 3.5441669369661586, "grad_norm": 0.24216891825199127, "learning_rate": 1.0332484087887867e-05, "loss": 
0.4231, "step": 98340 }, { "epoch": 3.5443471366273833, "grad_norm": 0.25515010952949524, "learning_rate": 1.0330121071790808e-05, "loss": 0.3934, "step": 98345 }, { "epoch": 3.544527336288608, "grad_norm": 0.18739314377307892, "learning_rate": 1.0327758255568007e-05, "loss": 0.3672, "step": 98350 }, { "epoch": 3.5447075359498323, "grad_norm": 0.2008611410856247, "learning_rate": 1.0325395639251661e-05, "loss": 0.3601, "step": 98355 }, { "epoch": 3.544887735611057, "grad_norm": 0.20512117445468903, "learning_rate": 1.0323033222873956e-05, "loss": 0.3739, "step": 98360 }, { "epoch": 3.545067935272282, "grad_norm": 0.25009825825691223, "learning_rate": 1.0320671006467086e-05, "loss": 0.3642, "step": 98365 }, { "epoch": 3.545248134933506, "grad_norm": 0.27992385625839233, "learning_rate": 1.031830899006322e-05, "loss": 0.4017, "step": 98370 }, { "epoch": 3.545428334594731, "grad_norm": 0.2249823361635208, "learning_rate": 1.0315947173694562e-05, "loss": 0.3877, "step": 98375 }, { "epoch": 3.5456085342559556, "grad_norm": 0.26109376549720764, "learning_rate": 1.031358555739328e-05, "loss": 0.3812, "step": 98380 }, { "epoch": 3.5457887339171803, "grad_norm": 0.23642925918102264, "learning_rate": 1.031122414119155e-05, "loss": 0.3841, "step": 98385 }, { "epoch": 3.545968933578405, "grad_norm": 0.2479097694158554, "learning_rate": 1.0308862925121548e-05, "loss": 0.3603, "step": 98390 }, { "epoch": 3.5461491332396298, "grad_norm": 0.30400943756103516, "learning_rate": 1.0306501909215438e-05, "loss": 0.3651, "step": 98395 }, { "epoch": 3.546329332900854, "grad_norm": 0.1932227909564972, "learning_rate": 1.0304141093505401e-05, "loss": 0.4042, "step": 98400 }, { "epoch": 3.546509532562079, "grad_norm": 0.25808045268058777, "learning_rate": 1.0301780478023607e-05, "loss": 0.3855, "step": 98405 }, { "epoch": 3.5466897322233035, "grad_norm": 0.17466764152050018, "learning_rate": 1.0299420062802187e-05, "loss": 0.3894, "step": 98410 }, { "epoch": 3.546869931884528, "grad_norm": 
0.2541663944721222, "learning_rate": 1.0297059847873334e-05, "loss": 0.3537, "step": 98415 }, { "epoch": 3.5470501315457525, "grad_norm": 0.277024507522583, "learning_rate": 1.0294699833269186e-05, "loss": 0.3547, "step": 98420 }, { "epoch": 3.5472303312069773, "grad_norm": 0.2566564381122589, "learning_rate": 1.029234001902192e-05, "loss": 0.3823, "step": 98425 }, { "epoch": 3.547410530868202, "grad_norm": 0.1786002814769745, "learning_rate": 1.0289980405163668e-05, "loss": 0.366, "step": 98430 }, { "epoch": 3.5475907305294268, "grad_norm": 0.24739274382591248, "learning_rate": 1.0287620991726577e-05, "loss": 0.3887, "step": 98435 }, { "epoch": 3.5477709301906515, "grad_norm": 0.2730603516101837, "learning_rate": 1.0285261778742808e-05, "loss": 0.3606, "step": 98440 }, { "epoch": 3.5479511298518758, "grad_norm": 0.27662402391433716, "learning_rate": 1.0282902766244498e-05, "loss": 0.4084, "step": 98445 }, { "epoch": 3.5481313295131005, "grad_norm": 0.18735605478286743, "learning_rate": 1.0280543954263792e-05, "loss": 0.3771, "step": 98450 }, { "epoch": 3.5483115291743252, "grad_norm": 0.2412840873003006, "learning_rate": 1.0278185342832821e-05, "loss": 0.4203, "step": 98455 }, { "epoch": 3.5484917288355495, "grad_norm": 0.2625180184841156, "learning_rate": 1.0275826931983718e-05, "loss": 0.4129, "step": 98460 }, { "epoch": 3.5486719284967743, "grad_norm": 0.23540963232517242, "learning_rate": 1.0273468721748631e-05, "loss": 0.362, "step": 98465 }, { "epoch": 3.548852128157999, "grad_norm": 0.23965346813201904, "learning_rate": 1.0271110712159679e-05, "loss": 0.3762, "step": 98470 }, { "epoch": 3.5490323278192237, "grad_norm": 0.23907659947872162, "learning_rate": 1.0268752903248995e-05, "loss": 0.3859, "step": 98475 }, { "epoch": 3.5492125274804485, "grad_norm": 0.2294924557209015, "learning_rate": 1.0266395295048701e-05, "loss": 0.3549, "step": 98480 }, { "epoch": 3.549392727141673, "grad_norm": 0.25225427746772766, "learning_rate": 1.0264037887590907e-05, 
"loss": 0.3969, "step": 98485 }, { "epoch": 3.5495729268028975, "grad_norm": 0.2280511111021042, "learning_rate": 1.0261680680907754e-05, "loss": 0.3913, "step": 98490 }, { "epoch": 3.549753126464122, "grad_norm": 0.2504872977733612, "learning_rate": 1.0259323675031357e-05, "loss": 0.3917, "step": 98495 }, { "epoch": 3.549933326125347, "grad_norm": 0.21838301420211792, "learning_rate": 1.0256966869993804e-05, "loss": 0.3925, "step": 98500 }, { "epoch": 3.549933326125347, "eval_loss": 0.42997604608535767, "eval_runtime": 3.5334, "eval_samples_per_second": 28.301, "eval_steps_per_second": 7.075, "step": 98500 }, { "epoch": 3.5501135257865712, "grad_norm": 0.226282998919487, "learning_rate": 1.025461026582723e-05, "loss": 0.3838, "step": 98505 }, { "epoch": 3.550293725447796, "grad_norm": 0.2352711260318756, "learning_rate": 1.0252253862563738e-05, "loss": 0.3911, "step": 98510 }, { "epoch": 3.5504739251090207, "grad_norm": 0.20618538558483124, "learning_rate": 1.024989766023543e-05, "loss": 0.3656, "step": 98515 }, { "epoch": 3.5506541247702454, "grad_norm": 0.2552780210971832, "learning_rate": 1.0247541658874412e-05, "loss": 0.3744, "step": 98520 }, { "epoch": 3.55083432443147, "grad_norm": 0.21368472278118134, "learning_rate": 1.0245185858512777e-05, "loss": 0.3951, "step": 98525 }, { "epoch": 3.551014524092695, "grad_norm": 0.257311075925827, "learning_rate": 1.024283025918263e-05, "loss": 0.3753, "step": 98530 }, { "epoch": 3.551194723753919, "grad_norm": 0.1774069368839264, "learning_rate": 1.0240474860916068e-05, "loss": 0.3849, "step": 98535 }, { "epoch": 3.551374923415144, "grad_norm": 0.20663419365882874, "learning_rate": 1.023811966374518e-05, "loss": 0.3717, "step": 98540 }, { "epoch": 3.5515551230763687, "grad_norm": 0.23598462343215942, "learning_rate": 1.0235764667702053e-05, "loss": 0.4101, "step": 98545 }, { "epoch": 3.5517353227375934, "grad_norm": 0.25389420986175537, "learning_rate": 1.0233409872818772e-05, "loss": 0.3636, "step": 98550 }, { 
"epoch": 3.5519155223988177, "grad_norm": 0.18197034299373627, "learning_rate": 1.0231055279127414e-05, "loss": 0.3946, "step": 98555 }, { "epoch": 3.5520957220600424, "grad_norm": 0.2080104500055313, "learning_rate": 1.0228700886660078e-05, "loss": 0.361, "step": 98560 }, { "epoch": 3.552275921721267, "grad_norm": 0.2673806846141815, "learning_rate": 1.0226346695448832e-05, "loss": 0.3957, "step": 98565 }, { "epoch": 3.552456121382492, "grad_norm": 0.25703608989715576, "learning_rate": 1.0223992705525753e-05, "loss": 0.3573, "step": 98570 }, { "epoch": 3.5526363210437166, "grad_norm": 0.24846360087394714, "learning_rate": 1.0221638916922909e-05, "loss": 0.3862, "step": 98575 }, { "epoch": 3.552816520704941, "grad_norm": 0.25549542903900146, "learning_rate": 1.021928532967237e-05, "loss": 0.3834, "step": 98580 }, { "epoch": 3.5529967203661657, "grad_norm": 0.2898486256599426, "learning_rate": 1.0216931943806213e-05, "loss": 0.3971, "step": 98585 }, { "epoch": 3.5531769200273904, "grad_norm": 0.25332796573638916, "learning_rate": 1.0214578759356504e-05, "loss": 0.3667, "step": 98590 }, { "epoch": 3.553357119688615, "grad_norm": 0.225131094455719, "learning_rate": 1.021222577635528e-05, "loss": 0.3701, "step": 98595 }, { "epoch": 3.5535373193498394, "grad_norm": 0.18378715217113495, "learning_rate": 1.0209872994834627e-05, "loss": 0.3875, "step": 98600 }, { "epoch": 3.553717519011064, "grad_norm": 0.21205966174602509, "learning_rate": 1.0207520414826583e-05, "loss": 0.3717, "step": 98605 }, { "epoch": 3.553897718672289, "grad_norm": 0.21734385192394257, "learning_rate": 1.0205168036363225e-05, "loss": 0.3397, "step": 98610 }, { "epoch": 3.5540779183335136, "grad_norm": 0.22054389119148254, "learning_rate": 1.0202815859476577e-05, "loss": 0.4035, "step": 98615 }, { "epoch": 3.5542581179947383, "grad_norm": 0.24117785692214966, "learning_rate": 1.0200463884198693e-05, "loss": 0.3737, "step": 98620 }, { "epoch": 3.554438317655963, "grad_norm": 0.20071789622306824, 
"learning_rate": 1.0198112110561631e-05, "loss": 0.3685, "step": 98625 }, { "epoch": 3.5546185173171874, "grad_norm": 0.29259100556373596, "learning_rate": 1.0195760538597426e-05, "loss": 0.3778, "step": 98630 }, { "epoch": 3.554798716978412, "grad_norm": 0.24713879823684692, "learning_rate": 1.0193409168338116e-05, "loss": 0.3813, "step": 98635 }, { "epoch": 3.554978916639637, "grad_norm": 0.22376933693885803, "learning_rate": 1.0191057999815743e-05, "loss": 0.3789, "step": 98640 }, { "epoch": 3.555159116300861, "grad_norm": 0.2563563287258148, "learning_rate": 1.0188707033062325e-05, "loss": 0.3876, "step": 98645 }, { "epoch": 3.555339315962086, "grad_norm": 0.2513370215892792, "learning_rate": 1.0186356268109917e-05, "loss": 0.3808, "step": 98650 }, { "epoch": 3.5555195156233106, "grad_norm": 0.23794013261795044, "learning_rate": 1.0184005704990538e-05, "loss": 0.3962, "step": 98655 }, { "epoch": 3.5556997152845353, "grad_norm": 0.23575954139232635, "learning_rate": 1.018165534373621e-05, "loss": 0.3906, "step": 98660 }, { "epoch": 3.55587991494576, "grad_norm": 0.20875148475170135, "learning_rate": 1.0179305184378959e-05, "loss": 0.3759, "step": 98665 }, { "epoch": 3.556060114606985, "grad_norm": 0.2343914955854416, "learning_rate": 1.0176955226950799e-05, "loss": 0.3565, "step": 98670 }, { "epoch": 3.556240314268209, "grad_norm": 0.24285919964313507, "learning_rate": 1.0174605471483761e-05, "loss": 0.3683, "step": 98675 }, { "epoch": 3.556420513929434, "grad_norm": 0.24529211223125458, "learning_rate": 1.0172255918009862e-05, "loss": 0.3582, "step": 98680 }, { "epoch": 3.5566007135906585, "grad_norm": 0.23087753355503082, "learning_rate": 1.0169906566561087e-05, "loss": 0.3616, "step": 98685 }, { "epoch": 3.556780913251883, "grad_norm": 0.2296365201473236, "learning_rate": 1.0167557417169476e-05, "loss": 0.3671, "step": 98690 }, { "epoch": 3.5569611129131076, "grad_norm": 0.27826911211013794, "learning_rate": 1.0165208469867022e-05, "loss": 0.3988, "step": 
98695 }, { "epoch": 3.5571413125743323, "grad_norm": 0.21545450389385223, "learning_rate": 1.0162859724685723e-05, "loss": 0.3612, "step": 98700 }, { "epoch": 3.557321512235557, "grad_norm": 0.23605626821517944, "learning_rate": 1.0160511181657604e-05, "loss": 0.358, "step": 98705 }, { "epoch": 3.5575017118967818, "grad_norm": 0.2647882103919983, "learning_rate": 1.0158162840814627e-05, "loss": 0.4287, "step": 98710 }, { "epoch": 3.5576819115580065, "grad_norm": 0.2228686660528183, "learning_rate": 1.0155814702188818e-05, "loss": 0.414, "step": 98715 }, { "epoch": 3.557862111219231, "grad_norm": 0.23151619732379913, "learning_rate": 1.0153466765812161e-05, "loss": 0.3869, "step": 98720 }, { "epoch": 3.5580423108804555, "grad_norm": 0.24438922107219696, "learning_rate": 1.0151119031716646e-05, "loss": 0.3947, "step": 98725 }, { "epoch": 3.5582225105416803, "grad_norm": 0.2741881012916565, "learning_rate": 1.0148771499934257e-05, "loss": 0.3601, "step": 98730 }, { "epoch": 3.5584027102029046, "grad_norm": 0.3307461440563202, "learning_rate": 1.0146424170496982e-05, "loss": 0.3836, "step": 98735 }, { "epoch": 3.5585829098641293, "grad_norm": 0.2043096274137497, "learning_rate": 1.0144077043436792e-05, "loss": 0.3482, "step": 98740 }, { "epoch": 3.558763109525354, "grad_norm": 0.24508656561374664, "learning_rate": 1.0141730118785687e-05, "loss": 0.3842, "step": 98745 }, { "epoch": 3.5589433091865788, "grad_norm": 0.20151925086975098, "learning_rate": 1.013938339657563e-05, "loss": 0.3545, "step": 98750 }, { "epoch": 3.5591235088478035, "grad_norm": 0.22490449249744415, "learning_rate": 1.0137036876838598e-05, "loss": 0.3762, "step": 98755 }, { "epoch": 3.559303708509028, "grad_norm": 0.21991267800331116, "learning_rate": 1.0134690559606563e-05, "loss": 0.3916, "step": 98760 }, { "epoch": 3.5594839081702525, "grad_norm": 0.3030828833580017, "learning_rate": 1.0132344444911482e-05, "loss": 0.3619, "step": 98765 }, { "epoch": 3.5596641078314772, "grad_norm": 
0.22489933669567108, "learning_rate": 1.0129998532785337e-05, "loss": 0.3944, "step": 98770 }, { "epoch": 3.559844307492702, "grad_norm": 0.21111252903938293, "learning_rate": 1.0127652823260092e-05, "loss": 0.3725, "step": 98775 }, { "epoch": 3.5600245071539267, "grad_norm": 0.23496191203594208, "learning_rate": 1.012530731636768e-05, "loss": 0.3468, "step": 98780 }, { "epoch": 3.560204706815151, "grad_norm": 0.22039222717285156, "learning_rate": 1.0122962012140083e-05, "loss": 0.3789, "step": 98785 }, { "epoch": 3.5603849064763757, "grad_norm": 0.2323601096868515, "learning_rate": 1.0120616910609243e-05, "loss": 0.3234, "step": 98790 }, { "epoch": 3.5605651061376005, "grad_norm": 0.21386387944221497, "learning_rate": 1.0118272011807134e-05, "loss": 0.405, "step": 98795 }, { "epoch": 3.560745305798825, "grad_norm": 0.22486340999603271, "learning_rate": 1.0115927315765678e-05, "loss": 0.3728, "step": 98800 }, { "epoch": 3.56092550546005, "grad_norm": 0.22867098450660706, "learning_rate": 1.011358282251682e-05, "loss": 0.4076, "step": 98805 }, { "epoch": 3.5611057051212742, "grad_norm": 0.2371281385421753, "learning_rate": 1.0111238532092524e-05, "loss": 0.3776, "step": 98810 }, { "epoch": 3.561285904782499, "grad_norm": 0.29785680770874023, "learning_rate": 1.0108894444524713e-05, "loss": 0.4038, "step": 98815 }, { "epoch": 3.5614661044437237, "grad_norm": 0.24299155175685883, "learning_rate": 1.0106550559845348e-05, "loss": 0.3717, "step": 98820 }, { "epoch": 3.5616463041049484, "grad_norm": 0.19322209060192108, "learning_rate": 1.0104206878086343e-05, "loss": 0.3855, "step": 98825 }, { "epoch": 3.5618265037661727, "grad_norm": 0.20190706849098206, "learning_rate": 1.0101863399279621e-05, "loss": 0.3884, "step": 98830 }, { "epoch": 3.5620067034273974, "grad_norm": 0.2435564547777176, "learning_rate": 1.0099520123457138e-05, "loss": 0.4033, "step": 98835 }, { "epoch": 3.562186903088622, "grad_norm": 0.2607412040233612, "learning_rate": 1.0097177050650808e-05, 
"loss": 0.3681, "step": 98840 }, { "epoch": 3.562367102749847, "grad_norm": 0.20941239595413208, "learning_rate": 1.0094834180892554e-05, "loss": 0.3891, "step": 98845 }, { "epoch": 3.5625473024110716, "grad_norm": 0.2150138020515442, "learning_rate": 1.0092491514214301e-05, "loss": 0.3664, "step": 98850 }, { "epoch": 3.562727502072296, "grad_norm": 0.2666082978248596, "learning_rate": 1.0090149050647955e-05, "loss": 0.3944, "step": 98855 }, { "epoch": 3.5629077017335207, "grad_norm": 0.27906569838523865, "learning_rate": 1.0087806790225451e-05, "loss": 0.3951, "step": 98860 }, { "epoch": 3.5630879013947454, "grad_norm": 0.2231929451227188, "learning_rate": 1.0085464732978692e-05, "loss": 0.3646, "step": 98865 }, { "epoch": 3.56326810105597, "grad_norm": 0.22479088604450226, "learning_rate": 1.0083122878939588e-05, "loss": 0.4146, "step": 98870 }, { "epoch": 3.5634483007171944, "grad_norm": 0.2482202798128128, "learning_rate": 1.0080781228140045e-05, "loss": 0.3679, "step": 98875 }, { "epoch": 3.563628500378419, "grad_norm": 0.3211915194988251, "learning_rate": 1.0078439780611973e-05, "loss": 0.3653, "step": 98880 }, { "epoch": 3.563808700039644, "grad_norm": 0.2703056037425995, "learning_rate": 1.007609853638726e-05, "loss": 0.3871, "step": 98885 }, { "epoch": 3.5639888997008686, "grad_norm": 0.21698278188705444, "learning_rate": 1.0073757495497832e-05, "loss": 0.3286, "step": 98890 }, { "epoch": 3.5641690993620934, "grad_norm": 0.2787526547908783, "learning_rate": 1.007141665797555e-05, "loss": 0.3737, "step": 98895 }, { "epoch": 3.564349299023318, "grad_norm": 0.23529410362243652, "learning_rate": 1.0069076023852337e-05, "loss": 0.4027, "step": 98900 }, { "epoch": 3.5645294986845424, "grad_norm": 0.22988703846931458, "learning_rate": 1.006673559316007e-05, "loss": 0.3583, "step": 98905 }, { "epoch": 3.564709698345767, "grad_norm": 0.25721538066864014, "learning_rate": 1.006439536593064e-05, "loss": 0.3557, "step": 98910 }, { "epoch": 3.564889898006992, 
"grad_norm": 0.3110162317752838, "learning_rate": 1.006205534219593e-05, "loss": 0.3782, "step": 98915 }, { "epoch": 3.565070097668216, "grad_norm": 0.2260642945766449, "learning_rate": 1.0059715521987829e-05, "loss": 0.3644, "step": 98920 }, { "epoch": 3.565250297329441, "grad_norm": 0.2566840350627899, "learning_rate": 1.0057375905338199e-05, "loss": 0.3579, "step": 98925 }, { "epoch": 3.5654304969906656, "grad_norm": 0.19224494695663452, "learning_rate": 1.0055036492278938e-05, "loss": 0.3589, "step": 98930 }, { "epoch": 3.5656106966518903, "grad_norm": 0.24910667538642883, "learning_rate": 1.005269728284191e-05, "loss": 0.3914, "step": 98935 }, { "epoch": 3.565790896313115, "grad_norm": 0.23995643854141235, "learning_rate": 1.0050358277058991e-05, "loss": 0.3903, "step": 98940 }, { "epoch": 3.56597109597434, "grad_norm": 0.29442286491394043, "learning_rate": 1.0048019474962044e-05, "loss": 0.3919, "step": 98945 }, { "epoch": 3.566151295635564, "grad_norm": 0.22082147002220154, "learning_rate": 1.004568087658293e-05, "loss": 0.346, "step": 98950 }, { "epoch": 3.566331495296789, "grad_norm": 0.24365797638893127, "learning_rate": 1.0043342481953525e-05, "loss": 0.3524, "step": 98955 }, { "epoch": 3.5665116949580136, "grad_norm": 0.21891754865646362, "learning_rate": 1.0041004291105693e-05, "loss": 0.3475, "step": 98960 }, { "epoch": 3.566691894619238, "grad_norm": 0.22469094395637512, "learning_rate": 1.0038666304071265e-05, "loss": 0.4066, "step": 98965 }, { "epoch": 3.5668720942804626, "grad_norm": 0.2404364049434662, "learning_rate": 1.0036328520882119e-05, "loss": 0.4111, "step": 98970 }, { "epoch": 3.5670522939416873, "grad_norm": 0.22921812534332275, "learning_rate": 1.0033990941570093e-05, "loss": 0.3701, "step": 98975 }, { "epoch": 3.567232493602912, "grad_norm": 0.20173293352127075, "learning_rate": 1.0031653566167048e-05, "loss": 0.373, "step": 98980 }, { "epoch": 3.567412693264137, "grad_norm": 0.23841802775859833, "learning_rate": 
1.0029316394704839e-05, "loss": 0.3717, "step": 98985 }, { "epoch": 3.5675928929253615, "grad_norm": 0.25082719326019287, "learning_rate": 1.0026979427215275e-05, "loss": 0.4165, "step": 98990 }, { "epoch": 3.567773092586586, "grad_norm": 0.23590582609176636, "learning_rate": 1.0024642663730227e-05, "loss": 0.3838, "step": 98995 }, { "epoch": 3.5679532922478105, "grad_norm": 0.25584080815315247, "learning_rate": 1.0022306104281523e-05, "loss": 0.3765, "step": 99000 }, { "epoch": 3.5679532922478105, "eval_loss": 0.43000587821006775, "eval_runtime": 3.5299, "eval_samples_per_second": 28.33, "eval_steps_per_second": 7.082, "step": 99000 }, { "epoch": 3.5681334919090353, "grad_norm": 0.22017063200473785, "learning_rate": 1.0019969748900998e-05, "loss": 0.3714, "step": 99005 }, { "epoch": 3.5683136915702596, "grad_norm": 0.25801563262939453, "learning_rate": 1.0017633597620485e-05, "loss": 0.3674, "step": 99010 }, { "epoch": 3.5684938912314843, "grad_norm": 0.2582071125507355, "learning_rate": 1.0015297650471805e-05, "loss": 0.3786, "step": 99015 }, { "epoch": 3.568674090892709, "grad_norm": 0.2171230912208557, "learning_rate": 1.0012961907486804e-05, "loss": 0.3997, "step": 99020 }, { "epoch": 3.5688542905539338, "grad_norm": 0.20874454081058502, "learning_rate": 1.0010626368697292e-05, "loss": 0.4006, "step": 99025 }, { "epoch": 3.5690344902151585, "grad_norm": 0.21824629604816437, "learning_rate": 1.0008291034135098e-05, "loss": 0.3733, "step": 99030 }, { "epoch": 3.5692146898763832, "grad_norm": 0.3257184326648712, "learning_rate": 1.0005955903832031e-05, "loss": 0.3853, "step": 99035 }, { "epoch": 3.5693948895376075, "grad_norm": 0.21955911815166473, "learning_rate": 1.0003620977819908e-05, "loss": 0.3398, "step": 99040 }, { "epoch": 3.5695750891988323, "grad_norm": 0.2739407420158386, "learning_rate": 1.0001286256130551e-05, "loss": 0.3881, "step": 99045 }, { "epoch": 3.569755288860057, "grad_norm": 0.2548768222332001, "learning_rate": 9.998951738795767e-06, 
"loss": 0.3858, "step": 99050 }, { "epoch": 3.5699354885212817, "grad_norm": 0.22001412510871887, "learning_rate": 9.996617425847363e-06, "loss": 0.3921, "step": 99055 }, { "epoch": 3.570115688182506, "grad_norm": 0.2573753595352173, "learning_rate": 9.994283317317138e-06, "loss": 0.3646, "step": 99060 }, { "epoch": 3.5702958878437308, "grad_norm": 0.30579379200935364, "learning_rate": 9.991949413236898e-06, "loss": 0.4174, "step": 99065 }, { "epoch": 3.5704760875049555, "grad_norm": 0.22438612580299377, "learning_rate": 9.989615713638434e-06, "loss": 0.3636, "step": 99070 }, { "epoch": 3.57065628716618, "grad_norm": 0.25755026936531067, "learning_rate": 9.987282218553568e-06, "loss": 0.3919, "step": 99075 }, { "epoch": 3.570836486827405, "grad_norm": 0.20849241316318512, "learning_rate": 9.984948928014057e-06, "loss": 0.3864, "step": 99080 }, { "epoch": 3.5710166864886292, "grad_norm": 0.2606984078884125, "learning_rate": 9.982615842051719e-06, "loss": 0.3855, "step": 99085 }, { "epoch": 3.571196886149854, "grad_norm": 0.224889874458313, "learning_rate": 9.98028296069833e-06, "loss": 0.3739, "step": 99090 }, { "epoch": 3.5713770858110787, "grad_norm": 0.27115222811698914, "learning_rate": 9.977950283985673e-06, "loss": 0.3709, "step": 99095 }, { "epoch": 3.5715572854723034, "grad_norm": 0.22326913475990295, "learning_rate": 9.97561781194555e-06, "loss": 0.3937, "step": 99100 }, { "epoch": 3.5717374851335277, "grad_norm": 0.22230251133441925, "learning_rate": 9.97328554460972e-06, "loss": 0.362, "step": 99105 }, { "epoch": 3.5719176847947525, "grad_norm": 0.252175509929657, "learning_rate": 9.970953482009953e-06, "loss": 0.3838, "step": 99110 }, { "epoch": 3.572097884455977, "grad_norm": 0.24773818254470825, "learning_rate": 9.968621624178046e-06, "loss": 0.3791, "step": 99115 }, { "epoch": 3.572278084117202, "grad_norm": 0.2345571517944336, "learning_rate": 9.966289971145756e-06, "loss": 0.3671, "step": 99120 }, { "epoch": 3.5724582837784267, "grad_norm": 
0.2684936821460724, "learning_rate": 9.963958522944858e-06, "loss": 0.3666, "step": 99125 }, { "epoch": 3.5726384834396514, "grad_norm": 0.2345341295003891, "learning_rate": 9.961627279607111e-06, "loss": 0.3676, "step": 99130 }, { "epoch": 3.5728186831008757, "grad_norm": 0.22453191876411438, "learning_rate": 9.959296241164273e-06, "loss": 0.3522, "step": 99135 }, { "epoch": 3.5729988827621004, "grad_norm": 0.26160648465156555, "learning_rate": 9.956965407648122e-06, "loss": 0.3757, "step": 99140 }, { "epoch": 3.573179082423325, "grad_norm": 0.24852387607097626, "learning_rate": 9.954634779090404e-06, "loss": 0.4034, "step": 99145 }, { "epoch": 3.5733592820845494, "grad_norm": 0.22410407662391663, "learning_rate": 9.952304355522876e-06, "loss": 0.4013, "step": 99150 }, { "epoch": 3.573539481745774, "grad_norm": 0.2551601529121399, "learning_rate": 9.949974136977286e-06, "loss": 0.3943, "step": 99155 }, { "epoch": 3.573719681406999, "grad_norm": 0.20935653150081635, "learning_rate": 9.947644123485376e-06, "loss": 0.3551, "step": 99160 }, { "epoch": 3.5738998810682236, "grad_norm": 0.24979546666145325, "learning_rate": 9.945314315078907e-06, "loss": 0.3857, "step": 99165 }, { "epoch": 3.5740800807294484, "grad_norm": 0.2232057750225067, "learning_rate": 9.94298471178963e-06, "loss": 0.3694, "step": 99170 }, { "epoch": 3.574260280390673, "grad_norm": 0.20639793574810028, "learning_rate": 9.94065531364925e-06, "loss": 0.3668, "step": 99175 }, { "epoch": 3.5744404800518974, "grad_norm": 0.2616964876651764, "learning_rate": 9.938326120689534e-06, "loss": 0.3901, "step": 99180 }, { "epoch": 3.574620679713122, "grad_norm": 0.2324073314666748, "learning_rate": 9.93599713294221e-06, "loss": 0.3556, "step": 99185 }, { "epoch": 3.574800879374347, "grad_norm": 0.23615048825740814, "learning_rate": 9.933668350439008e-06, "loss": 0.3786, "step": 99190 }, { "epoch": 3.574981079035571, "grad_norm": 0.21338878571987152, "learning_rate": 9.931339773211657e-06, "loss": 0.3847, 
"step": 99195 }, { "epoch": 3.575161278696796, "grad_norm": 0.2527485489845276, "learning_rate": 9.929011401291877e-06, "loss": 0.384, "step": 99200 }, { "epoch": 3.5753414783580206, "grad_norm": 0.20677971839904785, "learning_rate": 9.926683234711406e-06, "loss": 0.4078, "step": 99205 }, { "epoch": 3.5755216780192454, "grad_norm": 0.2947938144207001, "learning_rate": 9.92435527350196e-06, "loss": 0.3631, "step": 99210 }, { "epoch": 3.57570187768047, "grad_norm": 0.21625371277332306, "learning_rate": 9.922027517695253e-06, "loss": 0.3614, "step": 99215 }, { "epoch": 3.575882077341695, "grad_norm": 0.21706773340702057, "learning_rate": 9.919699967323001e-06, "loss": 0.3756, "step": 99220 }, { "epoch": 3.576062277002919, "grad_norm": 0.24126207828521729, "learning_rate": 9.917372622416912e-06, "loss": 0.3679, "step": 99225 }, { "epoch": 3.576242476664144, "grad_norm": 0.20370317995548248, "learning_rate": 9.915045483008706e-06, "loss": 0.3477, "step": 99230 }, { "epoch": 3.5764226763253686, "grad_norm": 0.2700237035751343, "learning_rate": 9.912718549130088e-06, "loss": 0.3631, "step": 99235 }, { "epoch": 3.576602875986593, "grad_norm": 0.21943305432796478, "learning_rate": 9.910391820812756e-06, "loss": 0.3704, "step": 99240 }, { "epoch": 3.5767830756478176, "grad_norm": 0.22363239526748657, "learning_rate": 9.908065298088414e-06, "loss": 0.3915, "step": 99245 }, { "epoch": 3.5769632753090423, "grad_norm": 0.2681196928024292, "learning_rate": 9.905738980988763e-06, "loss": 0.3785, "step": 99250 }, { "epoch": 3.577143474970267, "grad_norm": 0.2544713318347931, "learning_rate": 9.903412869545484e-06, "loss": 0.3965, "step": 99255 }, { "epoch": 3.577323674631492, "grad_norm": 0.254402220249176, "learning_rate": 9.901086963790294e-06, "loss": 0.4389, "step": 99260 }, { "epoch": 3.5775038742927165, "grad_norm": 0.2791866064071655, "learning_rate": 9.89876126375487e-06, "loss": 0.4082, "step": 99265 }, { "epoch": 3.577684073953941, "grad_norm": 0.2096082866191864, 
"learning_rate": 9.896435769470897e-06, "loss": 0.3456, "step": 99270 }, { "epoch": 3.5778642736151656, "grad_norm": 0.20750059187412262, "learning_rate": 9.894110480970064e-06, "loss": 0.3667, "step": 99275 }, { "epoch": 3.5780444732763903, "grad_norm": 0.22872807085514069, "learning_rate": 9.891785398284045e-06, "loss": 0.348, "step": 99280 }, { "epoch": 3.578224672937615, "grad_norm": 0.2133398801088333, "learning_rate": 9.889460521444541e-06, "loss": 0.3775, "step": 99285 }, { "epoch": 3.5784048725988393, "grad_norm": 0.24460799992084503, "learning_rate": 9.8871358504832e-06, "loss": 0.3606, "step": 99290 }, { "epoch": 3.578585072260064, "grad_norm": 0.25514861941337585, "learning_rate": 9.884811385431703e-06, "loss": 0.3779, "step": 99295 }, { "epoch": 3.578765271921289, "grad_norm": 0.2582806944847107, "learning_rate": 9.88248712632173e-06, "loss": 0.3921, "step": 99300 }, { "epoch": 3.5789454715825135, "grad_norm": 0.21507005393505096, "learning_rate": 9.880163073184945e-06, "loss": 0.3696, "step": 99305 }, { "epoch": 3.5791256712437383, "grad_norm": 0.2389105260372162, "learning_rate": 9.87783922605301e-06, "loss": 0.3775, "step": 99310 }, { "epoch": 3.5793058709049625, "grad_norm": 0.24397464096546173, "learning_rate": 9.875515584957587e-06, "loss": 0.4102, "step": 99315 }, { "epoch": 3.5794860705661873, "grad_norm": 0.262635201215744, "learning_rate": 9.87319214993033e-06, "loss": 0.3625, "step": 99320 }, { "epoch": 3.579666270227412, "grad_norm": 0.23571620881557465, "learning_rate": 9.870868921002908e-06, "loss": 0.4029, "step": 99325 }, { "epoch": 3.5798464698886368, "grad_norm": 0.20199549198150635, "learning_rate": 9.868545898206968e-06, "loss": 0.3573, "step": 99330 }, { "epoch": 3.580026669549861, "grad_norm": 0.262723445892334, "learning_rate": 9.86622308157416e-06, "loss": 0.3837, "step": 99335 }, { "epoch": 3.5802068692110858, "grad_norm": 0.2516125440597534, "learning_rate": 9.863900471136135e-06, "loss": 0.367, "step": 99340 }, { "epoch": 
3.5803870688723105, "grad_norm": 0.2330031394958496, "learning_rate": 9.861578066924524e-06, "loss": 0.3491, "step": 99345 }, { "epoch": 3.5805672685335352, "grad_norm": 0.2417108565568924, "learning_rate": 9.85925586897099e-06, "loss": 0.4232, "step": 99350 }, { "epoch": 3.58074746819476, "grad_norm": 0.3049721121788025, "learning_rate": 9.856933877307173e-06, "loss": 0.3506, "step": 99355 }, { "epoch": 3.5809276678559843, "grad_norm": 0.20108012855052948, "learning_rate": 9.854612091964683e-06, "loss": 0.3765, "step": 99360 }, { "epoch": 3.581107867517209, "grad_norm": 0.18804548680782318, "learning_rate": 9.852290512975179e-06, "loss": 0.3871, "step": 99365 }, { "epoch": 3.5812880671784337, "grad_norm": 0.24155212938785553, "learning_rate": 9.849969140370286e-06, "loss": 0.3778, "step": 99370 }, { "epoch": 3.5814682668396585, "grad_norm": 0.2706049680709839, "learning_rate": 9.847647974181626e-06, "loss": 0.3831, "step": 99375 }, { "epoch": 3.5816484665008828, "grad_norm": 0.26998743414878845, "learning_rate": 9.845327014440834e-06, "loss": 0.4087, "step": 99380 }, { "epoch": 3.5818286661621075, "grad_norm": 0.21241755783557892, "learning_rate": 9.843006261179513e-06, "loss": 0.388, "step": 99385 }, { "epoch": 3.5820088658233322, "grad_norm": 0.261828750371933, "learning_rate": 9.84068571442931e-06, "loss": 0.3801, "step": 99390 }, { "epoch": 3.582189065484557, "grad_norm": 0.26328206062316895, "learning_rate": 9.838365374221827e-06, "loss": 0.4188, "step": 99395 }, { "epoch": 3.5823692651457817, "grad_norm": 0.26441022753715515, "learning_rate": 9.836045240588684e-06, "loss": 0.3694, "step": 99400 }, { "epoch": 3.5825494648070064, "grad_norm": 0.21081748604774475, "learning_rate": 9.833725313561487e-06, "loss": 0.33, "step": 99405 }, { "epoch": 3.5827296644682307, "grad_norm": 0.23671510815620422, "learning_rate": 9.831405593171836e-06, "loss": 0.4302, "step": 99410 }, { "epoch": 3.5829098641294554, "grad_norm": 0.2610369920730591, "learning_rate": 
9.829086079451357e-06, "loss": 0.368, "step": 99415 }, { "epoch": 3.58309006379068, "grad_norm": 0.22345009446144104, "learning_rate": 9.826766772431643e-06, "loss": 0.3939, "step": 99420 }, { "epoch": 3.5832702634519045, "grad_norm": 0.28314733505249023, "learning_rate": 9.824447672144293e-06, "loss": 0.3636, "step": 99425 }, { "epoch": 3.583450463113129, "grad_norm": 0.26532143354415894, "learning_rate": 9.822128778620907e-06, "loss": 0.386, "step": 99430 }, { "epoch": 3.583630662774354, "grad_norm": 0.23965269327163696, "learning_rate": 9.819810091893078e-06, "loss": 0.4179, "step": 99435 }, { "epoch": 3.5838108624355787, "grad_norm": 0.23256781697273254, "learning_rate": 9.817491611992386e-06, "loss": 0.3622, "step": 99440 }, { "epoch": 3.5839910620968034, "grad_norm": 0.2113410234451294, "learning_rate": 9.815173338950442e-06, "loss": 0.3808, "step": 99445 }, { "epoch": 3.584171261758028, "grad_norm": 0.21958471834659576, "learning_rate": 9.812855272798822e-06, "loss": 0.3558, "step": 99450 }, { "epoch": 3.5843514614192524, "grad_norm": 0.21652553975582123, "learning_rate": 9.810537413569107e-06, "loss": 0.3874, "step": 99455 }, { "epoch": 3.584531661080477, "grad_norm": 0.23006175458431244, "learning_rate": 9.80821976129288e-06, "loss": 0.3678, "step": 99460 }, { "epoch": 3.584711860741702, "grad_norm": 0.22379863262176514, "learning_rate": 9.805902316001712e-06, "loss": 0.3814, "step": 99465 }, { "epoch": 3.584892060402926, "grad_norm": 0.26252281665802, "learning_rate": 9.803585077727195e-06, "loss": 0.3827, "step": 99470 }, { "epoch": 3.585072260064151, "grad_norm": 0.2835235893726349, "learning_rate": 9.801268046500884e-06, "loss": 0.3678, "step": 99475 }, { "epoch": 3.5852524597253757, "grad_norm": 0.25666457414627075, "learning_rate": 9.798951222354344e-06, "loss": 0.3539, "step": 99480 }, { "epoch": 3.5854326593866004, "grad_norm": 0.26046478748321533, "learning_rate": 9.796634605319158e-06, "loss": 0.3828, "step": 99485 }, { "epoch": 
3.585612859047825, "grad_norm": 0.23460224270820618, "learning_rate": 9.794318195426882e-06, "loss": 0.3791, "step": 99490 }, { "epoch": 3.58579305870905, "grad_norm": 0.28312990069389343, "learning_rate": 9.79200199270908e-06, "loss": 0.4117, "step": 99495 }, { "epoch": 3.585973258370274, "grad_norm": 0.2613278925418854, "learning_rate": 9.7896859971973e-06, "loss": 0.3978, "step": 99500 }, { "epoch": 3.585973258370274, "eval_loss": 0.4298833906650543, "eval_runtime": 3.5272, "eval_samples_per_second": 28.351, "eval_steps_per_second": 7.088, "step": 99500 }, { "epoch": 3.586153458031499, "grad_norm": 0.2599956691265106, "learning_rate": 9.787370208923099e-06, "loss": 0.393, "step": 99505 }, { "epoch": 3.5863336576927236, "grad_norm": 0.2776644229888916, "learning_rate": 9.785054627918044e-06, "loss": 0.4205, "step": 99510 }, { "epoch": 3.586513857353948, "grad_norm": 0.27284055948257446, "learning_rate": 9.782739254213672e-06, "loss": 0.3999, "step": 99515 }, { "epoch": 3.5866940570151726, "grad_norm": 0.24026501178741455, "learning_rate": 9.780424087841533e-06, "loss": 0.378, "step": 99520 }, { "epoch": 3.5868742566763974, "grad_norm": 0.26283571124076843, "learning_rate": 9.778109128833166e-06, "loss": 0.3963, "step": 99525 }, { "epoch": 3.587054456337622, "grad_norm": 0.22547613084316254, "learning_rate": 9.77579437722011e-06, "loss": 0.3896, "step": 99530 }, { "epoch": 3.587234655998847, "grad_norm": 0.2306743711233139, "learning_rate": 9.773479833033913e-06, "loss": 0.4029, "step": 99535 }, { "epoch": 3.5874148556600716, "grad_norm": 0.24043156206607819, "learning_rate": 9.771165496306118e-06, "loss": 0.3885, "step": 99540 }, { "epoch": 3.587595055321296, "grad_norm": 0.18582390248775482, "learning_rate": 9.768851367068224e-06, "loss": 0.3691, "step": 99545 }, { "epoch": 3.5877752549825206, "grad_norm": 0.24359643459320068, "learning_rate": 9.766537445351792e-06, "loss": 0.402, "step": 99550 }, { "epoch": 3.5879554546437453, "grad_norm": 0.16479088366031647, 
"learning_rate": 9.764223731188337e-06, "loss": 0.398, "step": 99555 }, { "epoch": 3.58813565430497, "grad_norm": 0.23543262481689453, "learning_rate": 9.761910224609374e-06, "loss": 0.3722, "step": 99560 }, { "epoch": 3.5883158539661943, "grad_norm": 0.26243576407432556, "learning_rate": 9.759596925646456e-06, "loss": 0.3961, "step": 99565 }, { "epoch": 3.588496053627419, "grad_norm": 0.28197166323661804, "learning_rate": 9.757283834331057e-06, "loss": 0.3743, "step": 99570 }, { "epoch": 3.588676253288644, "grad_norm": 0.24312157928943634, "learning_rate": 9.754970950694725e-06, "loss": 0.4079, "step": 99575 }, { "epoch": 3.5888564529498685, "grad_norm": 0.21838116645812988, "learning_rate": 9.752658274768964e-06, "loss": 0.365, "step": 99580 }, { "epoch": 3.5890366526110933, "grad_norm": 0.2255815863609314, "learning_rate": 9.75034580658528e-06, "loss": 0.3944, "step": 99585 }, { "epoch": 3.5892168522723176, "grad_norm": 0.2602185308933258, "learning_rate": 9.748033546175182e-06, "loss": 0.3865, "step": 99590 }, { "epoch": 3.5893970519335423, "grad_norm": 0.1989058256149292, "learning_rate": 9.745721493570176e-06, "loss": 0.3411, "step": 99595 }, { "epoch": 3.589577251594767, "grad_norm": 0.25254446268081665, "learning_rate": 9.743409648801749e-06, "loss": 0.389, "step": 99600 }, { "epoch": 3.5897574512559918, "grad_norm": 0.2625841498374939, "learning_rate": 9.741098011901423e-06, "loss": 0.379, "step": 99605 }, { "epoch": 3.589937650917216, "grad_norm": 0.215394988656044, "learning_rate": 9.738786582900684e-06, "loss": 0.3997, "step": 99610 }, { "epoch": 3.590117850578441, "grad_norm": 0.2351488173007965, "learning_rate": 9.736475361831019e-06, "loss": 0.3684, "step": 99615 }, { "epoch": 3.5902980502396655, "grad_norm": 0.2385423183441162, "learning_rate": 9.734164348723922e-06, "loss": 0.3604, "step": 99620 }, { "epoch": 3.5904782499008903, "grad_norm": 0.17893251776695251, "learning_rate": 9.731853543610876e-06, "loss": 0.3638, "step": 99625 }, { "epoch": 
3.590658449562115, "grad_norm": 0.23732203245162964, "learning_rate": 9.729542946523376e-06, "loss": 0.3772, "step": 99630 }, { "epoch": 3.5908386492233397, "grad_norm": 0.21697406470775604, "learning_rate": 9.727232557492896e-06, "loss": 0.3953, "step": 99635 }, { "epoch": 3.591018848884564, "grad_norm": 0.21853552758693695, "learning_rate": 9.724922376550915e-06, "loss": 0.3793, "step": 99640 }, { "epoch": 3.5911990485457888, "grad_norm": 0.23639477789402008, "learning_rate": 9.722612403728912e-06, "loss": 0.4109, "step": 99645 }, { "epoch": 3.5913792482070135, "grad_norm": 0.26770761609077454, "learning_rate": 9.720302639058349e-06, "loss": 0.3729, "step": 99650 }, { "epoch": 3.5915594478682378, "grad_norm": 0.21606463193893433, "learning_rate": 9.717993082570716e-06, "loss": 0.3727, "step": 99655 }, { "epoch": 3.5917396475294625, "grad_norm": 0.24439482390880585, "learning_rate": 9.715683734297466e-06, "loss": 0.3812, "step": 99660 }, { "epoch": 3.5919198471906872, "grad_norm": 0.24273943901062012, "learning_rate": 9.713374594270055e-06, "loss": 0.4021, "step": 99665 }, { "epoch": 3.592100046851912, "grad_norm": 0.2814248502254486, "learning_rate": 9.711065662519964e-06, "loss": 0.385, "step": 99670 }, { "epoch": 3.5922802465131367, "grad_norm": 0.23897801339626312, "learning_rate": 9.708756939078634e-06, "loss": 0.3566, "step": 99675 }, { "epoch": 3.5924604461743614, "grad_norm": 0.24239744246006012, "learning_rate": 9.70644842397755e-06, "loss": 0.4005, "step": 99680 }, { "epoch": 3.5926406458355857, "grad_norm": 0.2517867684364319, "learning_rate": 9.704140117248134e-06, "loss": 0.3916, "step": 99685 }, { "epoch": 3.5928208454968105, "grad_norm": 0.315687894821167, "learning_rate": 9.701832018921839e-06, "loss": 0.3683, "step": 99690 }, { "epoch": 3.593001045158035, "grad_norm": 0.29640886187553406, "learning_rate": 9.69952412903013e-06, "loss": 0.375, "step": 99695 }, { "epoch": 3.5931812448192595, "grad_norm": 0.22682395577430725, "learning_rate": 
9.697216447604444e-06, "loss": 0.3987, "step": 99700 }, { "epoch": 3.5933614444804842, "grad_norm": 0.30215203762054443, "learning_rate": 9.69490897467622e-06, "loss": 0.3749, "step": 99705 }, { "epoch": 3.593541644141709, "grad_norm": 0.23141373693943024, "learning_rate": 9.692601710276897e-06, "loss": 0.3501, "step": 99710 }, { "epoch": 3.5937218438029337, "grad_norm": 0.238219752907753, "learning_rate": 9.690294654437907e-06, "loss": 0.3775, "step": 99715 }, { "epoch": 3.5939020434641584, "grad_norm": 0.1689501702785492, "learning_rate": 9.687987807190693e-06, "loss": 0.3641, "step": 99720 }, { "epoch": 3.594082243125383, "grad_norm": 0.21029123663902283, "learning_rate": 9.685681168566683e-06, "loss": 0.3353, "step": 99725 }, { "epoch": 3.5942624427866074, "grad_norm": 0.22430087625980377, "learning_rate": 9.6833747385973e-06, "loss": 0.3909, "step": 99730 }, { "epoch": 3.594442642447832, "grad_norm": 0.21610116958618164, "learning_rate": 9.681068517313973e-06, "loss": 0.3581, "step": 99735 }, { "epoch": 3.594622842109057, "grad_norm": 0.2914960980415344, "learning_rate": 9.678762504748121e-06, "loss": 0.3932, "step": 99740 }, { "epoch": 3.594803041770281, "grad_norm": 0.24280034005641937, "learning_rate": 9.676456700931152e-06, "loss": 0.3773, "step": 99745 }, { "epoch": 3.594983241431506, "grad_norm": 0.2517662048339844, "learning_rate": 9.674151105894516e-06, "loss": 0.3723, "step": 99750 }, { "epoch": 3.5951634410927307, "grad_norm": 0.23000794649124146, "learning_rate": 9.671845719669584e-06, "loss": 0.3735, "step": 99755 }, { "epoch": 3.5953436407539554, "grad_norm": 0.2036644071340561, "learning_rate": 9.669540542287795e-06, "loss": 0.3355, "step": 99760 }, { "epoch": 3.59552384041518, "grad_norm": 0.210072860121727, "learning_rate": 9.667235573780547e-06, "loss": 0.3654, "step": 99765 }, { "epoch": 3.595704040076405, "grad_norm": 0.23076112568378448, "learning_rate": 9.664930814179248e-06, "loss": 0.381, "step": 99770 }, { "epoch": 3.595884239737629, 
"grad_norm": 0.22967948019504547, "learning_rate": 9.662626263515298e-06, "loss": 0.3657, "step": 99775 }, { "epoch": 3.596064439398854, "grad_norm": 0.21021999418735504, "learning_rate": 9.660321921820095e-06, "loss": 0.3699, "step": 99780 }, { "epoch": 3.5962446390600786, "grad_norm": 0.2712535858154297, "learning_rate": 9.658017789125026e-06, "loss": 0.4032, "step": 99785 }, { "epoch": 3.5964248387213034, "grad_norm": 0.19251175224781036, "learning_rate": 9.655713865461505e-06, "loss": 0.3645, "step": 99790 }, { "epoch": 3.5966050383825277, "grad_norm": 0.22021165490150452, "learning_rate": 9.653410150860912e-06, "loss": 0.3683, "step": 99795 }, { "epoch": 3.5967852380437524, "grad_norm": 0.20990362763404846, "learning_rate": 9.651106645354632e-06, "loss": 0.3702, "step": 99800 }, { "epoch": 3.596965437704977, "grad_norm": 0.25066566467285156, "learning_rate": 9.648803348974054e-06, "loss": 0.3669, "step": 99805 }, { "epoch": 3.597145637366202, "grad_norm": 0.21980679035186768, "learning_rate": 9.64650026175055e-06, "loss": 0.3826, "step": 99810 }, { "epoch": 3.5973258370274266, "grad_norm": 0.20478756725788116, "learning_rate": 9.644197383715514e-06, "loss": 0.3417, "step": 99815 }, { "epoch": 3.597506036688651, "grad_norm": 0.2611001431941986, "learning_rate": 9.641894714900316e-06, "loss": 0.3901, "step": 99820 }, { "epoch": 3.5976862363498756, "grad_norm": 0.2687753140926361, "learning_rate": 9.63959225533633e-06, "loss": 0.3791, "step": 99825 }, { "epoch": 3.5978664360111003, "grad_norm": 0.20144647359848022, "learning_rate": 9.637290005054928e-06, "loss": 0.3827, "step": 99830 }, { "epoch": 3.598046635672325, "grad_norm": 0.2408836930990219, "learning_rate": 9.634987964087464e-06, "loss": 0.3959, "step": 99835 }, { "epoch": 3.5982268353335494, "grad_norm": 0.23702372610569, "learning_rate": 9.632686132465321e-06, "loss": 0.404, "step": 99840 }, { "epoch": 3.598407034994774, "grad_norm": 0.2729646563529968, "learning_rate": 9.630384510219867e-06, "loss": 
0.3612, "step": 99845 }, { "epoch": 3.598587234655999, "grad_norm": 0.25669410824775696, "learning_rate": 9.628083097382428e-06, "loss": 0.373, "step": 99850 }, { "epoch": 3.5987674343172236, "grad_norm": 0.18720833957195282, "learning_rate": 9.625781893984392e-06, "loss": 0.3708, "step": 99855 }, { "epoch": 3.5989476339784483, "grad_norm": 0.2443101853132248, "learning_rate": 9.623480900057092e-06, "loss": 0.38, "step": 99860 }, { "epoch": 3.5991278336396726, "grad_norm": 0.24887430667877197, "learning_rate": 9.621180115631904e-06, "loss": 0.3652, "step": 99865 }, { "epoch": 3.5993080333008973, "grad_norm": 0.22554521262645721, "learning_rate": 9.61887954074015e-06, "loss": 0.3778, "step": 99870 }, { "epoch": 3.599488232962122, "grad_norm": 0.2344188541173935, "learning_rate": 9.616579175413176e-06, "loss": 0.3619, "step": 99875 }, { "epoch": 3.599668432623347, "grad_norm": 0.30154213309288025, "learning_rate": 9.614279019682343e-06, "loss": 0.4012, "step": 99880 }, { "epoch": 3.599848632284571, "grad_norm": 0.2288644164800644, "learning_rate": 9.611979073578977e-06, "loss": 0.3882, "step": 99885 }, { "epoch": 3.600028831945796, "grad_norm": 0.20435230433940887, "learning_rate": 9.60967933713442e-06, "loss": 0.408, "step": 99890 }, { "epoch": 3.6002090316070205, "grad_norm": 0.24859078228473663, "learning_rate": 9.607379810379998e-06, "loss": 0.3833, "step": 99895 }, { "epoch": 3.6003892312682453, "grad_norm": 0.28969287872314453, "learning_rate": 9.605080493347041e-06, "loss": 0.3902, "step": 99900 }, { "epoch": 3.60056943092947, "grad_norm": 0.2676686644554138, "learning_rate": 9.602781386066889e-06, "loss": 0.3858, "step": 99905 }, { "epoch": 3.6007496305906947, "grad_norm": 0.2682179808616638, "learning_rate": 9.600482488570862e-06, "loss": 0.3932, "step": 99910 }, { "epoch": 3.600929830251919, "grad_norm": 0.21893753111362457, "learning_rate": 9.598183800890276e-06, "loss": 0.3843, "step": 99915 }, { "epoch": 3.6011100299131438, "grad_norm": 
0.2323058843612671, "learning_rate": 9.595885323056455e-06, "loss": 0.3797, "step": 99920 }, { "epoch": 3.6012902295743685, "grad_norm": 0.23504719138145447, "learning_rate": 9.593587055100717e-06, "loss": 0.3858, "step": 99925 }, { "epoch": 3.601470429235593, "grad_norm": 0.2661448121070862, "learning_rate": 9.591288997054359e-06, "loss": 0.3477, "step": 99930 }, { "epoch": 3.6016506288968175, "grad_norm": 0.261135458946228, "learning_rate": 9.588991148948726e-06, "loss": 0.3728, "step": 99935 }, { "epoch": 3.6018308285580423, "grad_norm": 0.2633829712867737, "learning_rate": 9.586693510815088e-06, "loss": 0.3891, "step": 99940 }, { "epoch": 3.602011028219267, "grad_norm": 0.23481690883636475, "learning_rate": 9.584396082684775e-06, "loss": 0.3821, "step": 99945 }, { "epoch": 3.6021912278804917, "grad_norm": 0.23687255382537842, "learning_rate": 9.582098864589078e-06, "loss": 0.3825, "step": 99950 }, { "epoch": 3.6023714275417165, "grad_norm": 0.24096225202083588, "learning_rate": 9.579801856559292e-06, "loss": 0.35, "step": 99955 }, { "epoch": 3.6025516272029408, "grad_norm": 0.23858147859573364, "learning_rate": 9.577505058626738e-06, "loss": 0.3736, "step": 99960 }, { "epoch": 3.6027318268641655, "grad_norm": 0.24173545837402344, "learning_rate": 9.575208470822682e-06, "loss": 0.3797, "step": 99965 }, { "epoch": 3.60291202652539, "grad_norm": 0.19363081455230713, "learning_rate": 9.572912093178419e-06, "loss": 0.3779, "step": 99970 }, { "epoch": 3.6030922261866145, "grad_norm": 0.2636258006095886, "learning_rate": 9.570615925725246e-06, "loss": 0.4388, "step": 99975 }, { "epoch": 3.6032724258478392, "grad_norm": 0.2344840168952942, "learning_rate": 9.568319968494446e-06, "loss": 0.3784, "step": 99980 }, { "epoch": 3.603452625509064, "grad_norm": 0.22249801456928253, "learning_rate": 9.5660242215173e-06, "loss": 0.3607, "step": 99985 }, { "epoch": 3.6036328251702887, "grad_norm": 0.19639311730861664, "learning_rate": 9.563728684825082e-06, "loss": 0.3788, 
"step": 99990 }, { "epoch": 3.6038130248315134, "grad_norm": 0.2106081247329712, "learning_rate": 9.561433358449068e-06, "loss": 0.3919, "step": 99995 }, { "epoch": 3.603993224492738, "grad_norm": 0.2347351461648941, "learning_rate": 9.559138242420544e-06, "loss": 0.3608, "step": 100000 }, { "epoch": 3.603993224492738, "eval_loss": 0.42966705560684204, "eval_runtime": 3.5294, "eval_samples_per_second": 28.334, "eval_steps_per_second": 7.083, "step": 100000 }, { "epoch": 3.6041734241539625, "grad_norm": 0.23393255472183228, "learning_rate": 9.556843336770768e-06, "loss": 0.3584, "step": 100005 }, { "epoch": 3.604353623815187, "grad_norm": 0.22265978157520294, "learning_rate": 9.554548641531014e-06, "loss": 0.355, "step": 100010 }, { "epoch": 3.604533823476412, "grad_norm": 0.2124675214290619, "learning_rate": 9.552254156732543e-06, "loss": 0.3352, "step": 100015 }, { "epoch": 3.6047140231376362, "grad_norm": 0.20015810430049896, "learning_rate": 9.549959882406611e-06, "loss": 0.3809, "step": 100020 }, { "epoch": 3.604894222798861, "grad_norm": 0.24774090945720673, "learning_rate": 9.547665818584491e-06, "loss": 0.3974, "step": 100025 }, { "epoch": 3.6050744224600857, "grad_norm": 0.23569168150424957, "learning_rate": 9.545371965297445e-06, "loss": 0.3889, "step": 100030 }, { "epoch": 3.6052546221213104, "grad_norm": 0.25077682733535767, "learning_rate": 9.543078322576696e-06, "loss": 0.3749, "step": 100035 }, { "epoch": 3.605434821782535, "grad_norm": 0.30293580889701843, "learning_rate": 9.54078489045352e-06, "loss": 0.3622, "step": 100040 }, { "epoch": 3.60561502144376, "grad_norm": 0.23128095269203186, "learning_rate": 9.538491668959146e-06, "loss": 0.3876, "step": 100045 }, { "epoch": 3.605795221104984, "grad_norm": 0.20295816659927368, "learning_rate": 9.536198658124849e-06, "loss": 0.3585, "step": 100050 }, { "epoch": 3.605975420766209, "grad_norm": 0.23228393495082855, "learning_rate": 9.533905857981843e-06, "loss": 0.4119, "step": 100055 }, { "epoch": 
3.6061556204274337, "grad_norm": 0.2623964548110962, "learning_rate": 9.531613268561365e-06, "loss": 0.3794, "step": 100060 }, { "epoch": 3.6063358200886584, "grad_norm": 0.21798618137836456, "learning_rate": 9.529320889894671e-06, "loss": 0.391, "step": 100065 }, { "epoch": 3.6065160197498827, "grad_norm": 0.21800731122493744, "learning_rate": 9.527028722012985e-06, "loss": 0.3605, "step": 100070 }, { "epoch": 3.6066962194111074, "grad_norm": 0.2361879199743271, "learning_rate": 9.524736764947537e-06, "loss": 0.3766, "step": 100075 }, { "epoch": 3.606876419072332, "grad_norm": 0.23150278627872467, "learning_rate": 9.522445018729556e-06, "loss": 0.3902, "step": 100080 }, { "epoch": 3.607056618733557, "grad_norm": 0.25107669830322266, "learning_rate": 9.520153483390253e-06, "loss": 0.3765, "step": 100085 }, { "epoch": 3.6072368183947816, "grad_norm": 0.22123172879219055, "learning_rate": 9.517862158960873e-06, "loss": 0.3759, "step": 100090 }, { "epoch": 3.607417018056006, "grad_norm": 0.2164187878370285, "learning_rate": 9.515571045472623e-06, "loss": 0.3721, "step": 100095 }, { "epoch": 3.6075972177172306, "grad_norm": 0.23865751922130585, "learning_rate": 9.513280142956718e-06, "loss": 0.3667, "step": 100100 }, { "epoch": 3.6077774173784554, "grad_norm": 0.22169511020183563, "learning_rate": 9.510989451444374e-06, "loss": 0.3906, "step": 100105 }, { "epoch": 3.60795761703968, "grad_norm": 0.2418065220117569, "learning_rate": 9.5086989709668e-06, "loss": 0.3893, "step": 100110 }, { "epoch": 3.6081378167009044, "grad_norm": 0.274650514125824, "learning_rate": 9.506408701555197e-06, "loss": 0.3756, "step": 100115 }, { "epoch": 3.608318016362129, "grad_norm": 0.25587350130081177, "learning_rate": 9.504118643240781e-06, "loss": 0.3975, "step": 100120 }, { "epoch": 3.608498216023354, "grad_norm": 0.35312119126319885, "learning_rate": 9.501828796054751e-06, "loss": 0.3939, "step": 100125 }, { "epoch": 3.6086784156845786, "grad_norm": 0.24378515779972076, 
"learning_rate": 9.499539160028301e-06, "loss": 0.4169, "step": 100130 }, { "epoch": 3.6088586153458033, "grad_norm": 0.2463681548833847, "learning_rate": 9.497249735192628e-06, "loss": 0.3423, "step": 100135 }, { "epoch": 3.609038815007028, "grad_norm": 0.2869650423526764, "learning_rate": 9.494960521578922e-06, "loss": 0.3543, "step": 100140 }, { "epoch": 3.6092190146682523, "grad_norm": 0.29217445850372314, "learning_rate": 9.49267151921839e-06, "loss": 0.37, "step": 100145 }, { "epoch": 3.609399214329477, "grad_norm": 0.21554695069789886, "learning_rate": 9.4903827281422e-06, "loss": 0.3741, "step": 100150 }, { "epoch": 3.609579413990702, "grad_norm": 0.17673182487487793, "learning_rate": 9.488094148381533e-06, "loss": 0.3679, "step": 100155 }, { "epoch": 3.609759613651926, "grad_norm": 0.2148004174232483, "learning_rate": 9.485805779967588e-06, "loss": 0.3473, "step": 100160 }, { "epoch": 3.609939813313151, "grad_norm": 0.33437803387641907, "learning_rate": 9.483517622931534e-06, "loss": 0.3883, "step": 100165 }, { "epoch": 3.6101200129743756, "grad_norm": 0.31750041246414185, "learning_rate": 9.48122967730455e-06, "loss": 0.3693, "step": 100170 }, { "epoch": 3.6103002126356003, "grad_norm": 0.2354346662759781, "learning_rate": 9.478941943117809e-06, "loss": 0.388, "step": 100175 }, { "epoch": 3.610480412296825, "grad_norm": 0.2933277189731598, "learning_rate": 9.47665442040247e-06, "loss": 0.4218, "step": 100180 }, { "epoch": 3.6106606119580498, "grad_norm": 0.2739449441432953, "learning_rate": 9.474367109189716e-06, "loss": 0.3977, "step": 100185 }, { "epoch": 3.610840811619274, "grad_norm": 0.2737182676792145, "learning_rate": 9.472080009510707e-06, "loss": 0.3892, "step": 100190 }, { "epoch": 3.611021011280499, "grad_norm": 0.2394844889640808, "learning_rate": 9.469793121396597e-06, "loss": 0.366, "step": 100195 }, { "epoch": 3.6112012109417235, "grad_norm": 0.20772317051887512, "learning_rate": 9.46796376325298e-06, "loss": 0.333, "step": 100200 }, { 
"epoch": 3.611381410602948, "grad_norm": 0.19267010688781738, "learning_rate": 9.465677256034215e-06, "loss": 0.3884, "step": 100205 }, { "epoch": 3.6115616102641726, "grad_norm": 0.28297874331474304, "learning_rate": 9.463390960467583e-06, "loss": 0.3827, "step": 100210 }, { "epoch": 3.6117418099253973, "grad_norm": 0.25464269518852234, "learning_rate": 9.461104876584256e-06, "loss": 0.3932, "step": 100215 }, { "epoch": 3.611922009586622, "grad_norm": 0.22311927378177643, "learning_rate": 9.458819004415361e-06, "loss": 0.37, "step": 100220 }, { "epoch": 3.6121022092478468, "grad_norm": 0.22702719271183014, "learning_rate": 9.456533343992039e-06, "loss": 0.4145, "step": 100225 }, { "epoch": 3.6122824089090715, "grad_norm": 0.20826251804828644, "learning_rate": 9.454247895345447e-06, "loss": 0.3434, "step": 100230 }, { "epoch": 3.6124626085702958, "grad_norm": 0.2535870671272278, "learning_rate": 9.451962658506722e-06, "loss": 0.3997, "step": 100235 }, { "epoch": 3.6126428082315205, "grad_norm": 0.27190902829170227, "learning_rate": 9.449677633506992e-06, "loss": 0.3874, "step": 100240 }, { "epoch": 3.6128230078927452, "grad_norm": 0.2581278383731842, "learning_rate": 9.447392820377397e-06, "loss": 0.3805, "step": 100245 }, { "epoch": 3.6130032075539695, "grad_norm": 0.2271093875169754, "learning_rate": 9.445108219149052e-06, "loss": 0.3962, "step": 100250 }, { "epoch": 3.6131834072151943, "grad_norm": 0.251756489276886, "learning_rate": 9.442823829853103e-06, "loss": 0.3868, "step": 100255 }, { "epoch": 3.613363606876419, "grad_norm": 0.20969605445861816, "learning_rate": 9.440539652520671e-06, "loss": 0.3637, "step": 100260 }, { "epoch": 3.6135438065376437, "grad_norm": 0.2561705410480499, "learning_rate": 9.438255687182873e-06, "loss": 0.3433, "step": 100265 }, { "epoch": 3.6137240061988685, "grad_norm": 0.19795164465904236, "learning_rate": 9.435971933870827e-06, "loss": 0.3794, "step": 100270 }, { "epoch": 3.613904205860093, "grad_norm": 0.267392098903656, 
"learning_rate": 9.433688392615644e-06, "loss": 0.3855, "step": 100275 }, { "epoch": 3.6140844055213175, "grad_norm": 0.27009448409080505, "learning_rate": 9.431405063448451e-06, "loss": 0.3847, "step": 100280 }, { "epoch": 3.6142646051825422, "grad_norm": 0.22548210620880127, "learning_rate": 9.429121946400358e-06, "loss": 0.3959, "step": 100285 }, { "epoch": 3.614444804843767, "grad_norm": 0.2359258383512497, "learning_rate": 9.426839041502444e-06, "loss": 0.3798, "step": 100290 }, { "epoch": 3.6146250045049917, "grad_norm": 0.20709139108657837, "learning_rate": 9.424556348785846e-06, "loss": 0.3797, "step": 100295 }, { "epoch": 3.614805204166216, "grad_norm": 0.22372165322303772, "learning_rate": 9.422273868281639e-06, "loss": 0.3794, "step": 100300 }, { "epoch": 3.6149854038274407, "grad_norm": 0.17889077961444855, "learning_rate": 9.419991600020947e-06, "loss": 0.3699, "step": 100305 }, { "epoch": 3.6151656034886654, "grad_norm": 0.17842791974544525, "learning_rate": 9.417709544034862e-06, "loss": 0.3769, "step": 100310 }, { "epoch": 3.61534580314989, "grad_norm": 0.2319183498620987, "learning_rate": 9.415427700354446e-06, "loss": 0.3625, "step": 100315 }, { "epoch": 3.615526002811115, "grad_norm": 0.22687672078609467, "learning_rate": 9.413146069010822e-06, "loss": 0.387, "step": 100320 }, { "epoch": 3.615706202472339, "grad_norm": 0.25115665793418884, "learning_rate": 9.410864650035064e-06, "loss": 0.3584, "step": 100325 }, { "epoch": 3.615886402133564, "grad_norm": 0.23549871146678925, "learning_rate": 9.408583443458257e-06, "loss": 0.3989, "step": 100330 }, { "epoch": 3.6160666017947887, "grad_norm": 0.19801755249500275, "learning_rate": 9.406302449311485e-06, "loss": 0.3762, "step": 100335 }, { "epoch": 3.6162468014560134, "grad_norm": 0.25041335821151733, "learning_rate": 9.404021667625812e-06, "loss": 0.3706, "step": 100340 }, { "epoch": 3.6164270011172377, "grad_norm": 0.2323254495859146, "learning_rate": 9.401741098432332e-06, "loss": 0.3995, "step": 
100345 }, { "epoch": 3.6166072007784624, "grad_norm": 0.20460768043994904, "learning_rate": 9.399460741762111e-06, "loss": 0.3853, "step": 100350 }, { "epoch": 3.616787400439687, "grad_norm": 0.30746299028396606, "learning_rate": 9.397180597646218e-06, "loss": 0.387, "step": 100355 }, { "epoch": 3.616967600100912, "grad_norm": 0.24463587999343872, "learning_rate": 9.394900666115716e-06, "loss": 0.3957, "step": 100360 }, { "epoch": 3.6171477997621366, "grad_norm": 0.19743074476718903, "learning_rate": 9.392620947201675e-06, "loss": 0.3711, "step": 100365 }, { "epoch": 3.617327999423361, "grad_norm": 0.23998847603797913, "learning_rate": 9.390341440935138e-06, "loss": 0.3986, "step": 100370 }, { "epoch": 3.6175081990845857, "grad_norm": 0.2761986553668976, "learning_rate": 9.38806214734719e-06, "loss": 0.3685, "step": 100375 }, { "epoch": 3.6176883987458104, "grad_norm": 0.209417924284935, "learning_rate": 9.38578306646887e-06, "loss": 0.3551, "step": 100380 }, { "epoch": 3.617868598407035, "grad_norm": 0.2800207734107971, "learning_rate": 9.383504198331233e-06, "loss": 0.38, "step": 100385 }, { "epoch": 3.6180487980682594, "grad_norm": 0.2673444151878357, "learning_rate": 9.38122554296533e-06, "loss": 0.3892, "step": 100390 }, { "epoch": 3.618228997729484, "grad_norm": 0.26103273034095764, "learning_rate": 9.378947100402194e-06, "loss": 0.3775, "step": 100395 }, { "epoch": 3.618409197390709, "grad_norm": 0.20183852314949036, "learning_rate": 9.376668870672897e-06, "loss": 0.3525, "step": 100400 }, { "epoch": 3.6185893970519336, "grad_norm": 0.22087271511554718, "learning_rate": 9.374390853808454e-06, "loss": 0.3442, "step": 100405 }, { "epoch": 3.6187695967131583, "grad_norm": 0.23035688698291779, "learning_rate": 9.372113049839903e-06, "loss": 0.3829, "step": 100410 }, { "epoch": 3.618949796374383, "grad_norm": 0.199892058968544, "learning_rate": 9.369835458798293e-06, "loss": 0.3724, "step": 100415 }, { "epoch": 3.6191299960356074, "grad_norm": 0.2462371587753296, 
"learning_rate": 9.367558080714639e-06, "loss": 0.3667, "step": 100420 }, { "epoch": 3.619310195696832, "grad_norm": 0.21908694505691528, "learning_rate": 9.36528091562e-06, "loss": 0.3298, "step": 100425 }, { "epoch": 3.619490395358057, "grad_norm": 0.22547689080238342, "learning_rate": 9.36300396354537e-06, "loss": 0.3721, "step": 100430 }, { "epoch": 3.619670595019281, "grad_norm": 0.28035834431648254, "learning_rate": 9.360727224521775e-06, "loss": 0.3714, "step": 100435 }, { "epoch": 3.619850794680506, "grad_norm": 0.2472325563430786, "learning_rate": 9.358450698580254e-06, "loss": 0.3513, "step": 100440 }, { "epoch": 3.6200309943417306, "grad_norm": 0.24515065550804138, "learning_rate": 9.356174385751815e-06, "loss": 0.3279, "step": 100445 }, { "epoch": 3.6202111940029553, "grad_norm": 0.29932889342308044, "learning_rate": 9.35389828606747e-06, "loss": 0.3565, "step": 100450 }, { "epoch": 3.62039139366418, "grad_norm": 0.2718527913093567, "learning_rate": 9.35162239955823e-06, "loss": 0.3859, "step": 100455 }, { "epoch": 3.620571593325405, "grad_norm": 0.21936644613742828, "learning_rate": 9.349346726255098e-06, "loss": 0.3857, "step": 100460 }, { "epoch": 3.620751792986629, "grad_norm": 0.2328893095254898, "learning_rate": 9.347071266189095e-06, "loss": 0.333, "step": 100465 }, { "epoch": 3.620931992647854, "grad_norm": 0.22715623676776886, "learning_rate": 9.344796019391217e-06, "loss": 0.3534, "step": 100470 }, { "epoch": 3.6211121923090785, "grad_norm": 0.2767705023288727, "learning_rate": 9.342520985892461e-06, "loss": 0.3693, "step": 100475 }, { "epoch": 3.621292391970303, "grad_norm": 0.22328001260757446, "learning_rate": 9.340246165723826e-06, "loss": 0.4045, "step": 100480 }, { "epoch": 3.6214725916315276, "grad_norm": 0.25353187322616577, "learning_rate": 9.337971558916295e-06, "loss": 0.3343, "step": 100485 }, { "epoch": 3.6216527912927523, "grad_norm": 0.2742082178592682, "learning_rate": 9.335697165500878e-06, "loss": 0.3792, "step": 100490 }, { 
"epoch": 3.621832990953977, "grad_norm": 0.28070586919784546, "learning_rate": 9.333422985508563e-06, "loss": 0.3781, "step": 100495 }, { "epoch": 3.6220131906152018, "grad_norm": 0.20703862607479095, "learning_rate": 9.331149018970311e-06, "loss": 0.3754, "step": 100500 }, { "epoch": 3.6220131906152018, "eval_loss": 0.42909446358680725, "eval_runtime": 3.5291, "eval_samples_per_second": 28.336, "eval_steps_per_second": 7.084, "step": 100500 }, { "epoch": 3.6221933902764265, "grad_norm": 0.23609666526317596, "learning_rate": 9.328875265917128e-06, "loss": 0.3835, "step": 100505 }, { "epoch": 3.622373589937651, "grad_norm": 0.22293473780155182, "learning_rate": 9.326601726379986e-06, "loss": 0.4105, "step": 100510 }, { "epoch": 3.6225537895988755, "grad_norm": 0.251966267824173, "learning_rate": 9.324328400389858e-06, "loss": 0.3797, "step": 100515 }, { "epoch": 3.6227339892601003, "grad_norm": 0.22561317682266235, "learning_rate": 9.322055287977724e-06, "loss": 0.3619, "step": 100520 }, { "epoch": 3.6229141889213246, "grad_norm": 0.22810781002044678, "learning_rate": 9.319782389174542e-06, "loss": 0.3784, "step": 100525 }, { "epoch": 3.6230943885825493, "grad_norm": 0.20727618038654327, "learning_rate": 9.317509704011298e-06, "loss": 0.3684, "step": 100530 }, { "epoch": 3.623274588243774, "grad_norm": 0.3055703341960907, "learning_rate": 9.315237232518948e-06, "loss": 0.3809, "step": 100535 }, { "epoch": 3.6234547879049988, "grad_norm": 0.2843535840511322, "learning_rate": 9.312964974728453e-06, "loss": 0.3996, "step": 100540 }, { "epoch": 3.6236349875662235, "grad_norm": 0.22682887315750122, "learning_rate": 9.310692930670773e-06, "loss": 0.3469, "step": 100545 }, { "epoch": 3.623815187227448, "grad_norm": 0.19296708703041077, "learning_rate": 9.308421100376865e-06, "loss": 0.3504, "step": 100550 }, { "epoch": 3.6239953868886725, "grad_norm": 0.3073079586029053, "learning_rate": 9.306149483877674e-06, "loss": 0.4082, "step": 100555 }, { "epoch": 
3.6241755865498972, "grad_norm": 0.2964288294315338, "learning_rate": 9.303878081204165e-06, "loss": 0.4123, "step": 100560 }, { "epoch": 3.624355786211122, "grad_norm": 0.2711564600467682, "learning_rate": 9.301606892387276e-06, "loss": 0.3638, "step": 100565 }, { "epoch": 3.6245359858723467, "grad_norm": 0.2122730016708374, "learning_rate": 9.299335917457958e-06, "loss": 0.376, "step": 100570 }, { "epoch": 3.624716185533571, "grad_norm": 0.21361200511455536, "learning_rate": 9.297065156447147e-06, "loss": 0.3711, "step": 100575 }, { "epoch": 3.6248963851947957, "grad_norm": 0.24864503741264343, "learning_rate": 9.294794609385773e-06, "loss": 0.3661, "step": 100580 }, { "epoch": 3.6250765848560205, "grad_norm": 0.23032629489898682, "learning_rate": 9.292524276304792e-06, "loss": 0.3687, "step": 100585 }, { "epoch": 3.625256784517245, "grad_norm": 0.31852540373802185, "learning_rate": 9.290254157235135e-06, "loss": 0.3777, "step": 100590 }, { "epoch": 3.62543698417847, "grad_norm": 0.196120947599411, "learning_rate": 9.287984252207707e-06, "loss": 0.3826, "step": 100595 }, { "epoch": 3.6256171838396942, "grad_norm": 0.17373962700366974, "learning_rate": 9.285714561253458e-06, "loss": 0.3801, "step": 100600 }, { "epoch": 3.625797383500919, "grad_norm": 0.21692752838134766, "learning_rate": 9.2834450844033e-06, "loss": 0.3712, "step": 100605 }, { "epoch": 3.6259775831621437, "grad_norm": 0.23615795373916626, "learning_rate": 9.281175821688177e-06, "loss": 0.3827, "step": 100610 }, { "epoch": 3.6261577828233684, "grad_norm": 0.21718338131904602, "learning_rate": 9.278906773138979e-06, "loss": 0.3901, "step": 100615 }, { "epoch": 3.6263379824845927, "grad_norm": 0.2522786557674408, "learning_rate": 9.276637938786626e-06, "loss": 0.342, "step": 100620 }, { "epoch": 3.6265181821458174, "grad_norm": 0.31845128536224365, "learning_rate": 9.274369318662044e-06, "loss": 0.3978, "step": 100625 }, { "epoch": 3.626698381807042, "grad_norm": 0.2796033024787903, "learning_rate": 
9.272100912796137e-06, "loss": 0.3601, "step": 100630 }, { "epoch": 3.626878581468267, "grad_norm": 0.22418884932994843, "learning_rate": 9.26983272121981e-06, "loss": 0.3808, "step": 100635 }, { "epoch": 3.6270587811294916, "grad_norm": 0.2793503403663635, "learning_rate": 9.267564743963963e-06, "loss": 0.4101, "step": 100640 }, { "epoch": 3.6272389807907164, "grad_norm": 0.23813951015472412, "learning_rate": 9.265296981059496e-06, "loss": 0.4087, "step": 100645 }, { "epoch": 3.6274191804519407, "grad_norm": 0.27661383152008057, "learning_rate": 9.263029432537317e-06, "loss": 0.3816, "step": 100650 }, { "epoch": 3.6275993801131654, "grad_norm": 0.25448471307754517, "learning_rate": 9.260762098428319e-06, "loss": 0.3711, "step": 100655 }, { "epoch": 3.62777957977439, "grad_norm": 0.18283936381340027, "learning_rate": 9.258494978763387e-06, "loss": 0.3951, "step": 100660 }, { "epoch": 3.6279597794356144, "grad_norm": 0.22228430211544037, "learning_rate": 9.256228073573413e-06, "loss": 0.3452, "step": 100665 }, { "epoch": 3.628139979096839, "grad_norm": 0.185472771525383, "learning_rate": 9.253961382889278e-06, "loss": 0.4253, "step": 100670 }, { "epoch": 3.628320178758064, "grad_norm": 0.18725287914276123, "learning_rate": 9.251694906741879e-06, "loss": 0.3513, "step": 100675 }, { "epoch": 3.6285003784192886, "grad_norm": 0.22151315212249756, "learning_rate": 9.249428645162095e-06, "loss": 0.3352, "step": 100680 }, { "epoch": 3.6286805780805134, "grad_norm": 0.22003361582756042, "learning_rate": 9.247162598180777e-06, "loss": 0.3763, "step": 100685 }, { "epoch": 3.628860777741738, "grad_norm": 0.280362993478775, "learning_rate": 9.244896765828831e-06, "loss": 0.3839, "step": 100690 }, { "epoch": 3.6290409774029624, "grad_norm": 0.24455983936786652, "learning_rate": 9.242631148137114e-06, "loss": 0.3854, "step": 100695 }, { "epoch": 3.629221177064187, "grad_norm": 0.28553998470306396, "learning_rate": 9.240365745136498e-06, "loss": 0.4387, "step": 100700 }, { 
"epoch": 3.629401376725412, "grad_norm": 0.22999370098114014, "learning_rate": 9.238100556857847e-06, "loss": 0.3486, "step": 100705 }, { "epoch": 3.629581576386636, "grad_norm": 0.24775420129299164, "learning_rate": 9.235835583332017e-06, "loss": 0.4171, "step": 100710 }, { "epoch": 3.629761776047861, "grad_norm": 0.30555811524391174, "learning_rate": 9.233570824589881e-06, "loss": 0.4051, "step": 100715 }, { "epoch": 3.6299419757090856, "grad_norm": 0.22852908074855804, "learning_rate": 9.23130628066229e-06, "loss": 0.3686, "step": 100720 }, { "epoch": 3.6301221753703103, "grad_norm": 0.20418477058410645, "learning_rate": 9.229041951580101e-06, "loss": 0.3467, "step": 100725 }, { "epoch": 3.630302375031535, "grad_norm": 0.2210427224636078, "learning_rate": 9.226777837374163e-06, "loss": 0.3732, "step": 100730 }, { "epoch": 3.63048257469276, "grad_norm": 0.2393556535243988, "learning_rate": 9.224513938075318e-06, "loss": 0.4281, "step": 100735 }, { "epoch": 3.630662774353984, "grad_norm": 0.2593443691730499, "learning_rate": 9.222250253714413e-06, "loss": 0.3624, "step": 100740 }, { "epoch": 3.630842974015209, "grad_norm": 0.2505268454551697, "learning_rate": 9.219986784322299e-06, "loss": 0.3829, "step": 100745 }, { "epoch": 3.6310231736764336, "grad_norm": 0.2201002836227417, "learning_rate": 9.217723529929812e-06, "loss": 0.405, "step": 100750 }, { "epoch": 3.631203373337658, "grad_norm": 0.2460431605577469, "learning_rate": 9.215460490567784e-06, "loss": 0.3749, "step": 100755 }, { "epoch": 3.6313835729988826, "grad_norm": 0.2341911494731903, "learning_rate": 9.213197666267052e-06, "loss": 0.4055, "step": 100760 }, { "epoch": 3.6315637726601073, "grad_norm": 0.25453901290893555, "learning_rate": 9.210935057058437e-06, "loss": 0.4276, "step": 100765 }, { "epoch": 3.631743972321332, "grad_norm": 0.22719573974609375, "learning_rate": 9.208672662972783e-06, "loss": 0.3768, "step": 100770 }, { "epoch": 3.631924171982557, "grad_norm": 0.22030307352542877, 
"learning_rate": 9.206410484040914e-06, "loss": 0.3629, "step": 100775 }, { "epoch": 3.6321043716437815, "grad_norm": 0.27278268337249756, "learning_rate": 9.204148520293629e-06, "loss": 0.3757, "step": 100780 }, { "epoch": 3.632284571305006, "grad_norm": 0.2132975459098816, "learning_rate": 9.201886771761772e-06, "loss": 0.3837, "step": 100785 }, { "epoch": 3.6324647709662305, "grad_norm": 0.2718239724636078, "learning_rate": 9.199625238476139e-06, "loss": 0.3942, "step": 100790 }, { "epoch": 3.6326449706274553, "grad_norm": 0.2489858865737915, "learning_rate": 9.19736392046757e-06, "loss": 0.3506, "step": 100795 }, { "epoch": 3.63282517028868, "grad_norm": 0.2690345048904419, "learning_rate": 9.195102817766851e-06, "loss": 0.365, "step": 100800 }, { "epoch": 3.6330053699499043, "grad_norm": 0.25727832317352295, "learning_rate": 9.192841930404786e-06, "loss": 0.3702, "step": 100805 }, { "epoch": 3.633185569611129, "grad_norm": 0.28264495730400085, "learning_rate": 9.190581258412198e-06, "loss": 0.3825, "step": 100810 }, { "epoch": 3.6333657692723538, "grad_norm": 0.2584122121334076, "learning_rate": 9.18832080181988e-06, "loss": 0.4279, "step": 100815 }, { "epoch": 3.6335459689335785, "grad_norm": 0.2514478862285614, "learning_rate": 9.18606056065863e-06, "loss": 0.3661, "step": 100820 }, { "epoch": 3.6337261685948032, "grad_norm": 0.2434535175561905, "learning_rate": 9.183800534959245e-06, "loss": 0.3543, "step": 100825 }, { "epoch": 3.6339063682560275, "grad_norm": 0.225295290350914, "learning_rate": 9.181540724752504e-06, "loss": 0.3573, "step": 100830 }, { "epoch": 3.6340865679172523, "grad_norm": 0.24885191023349762, "learning_rate": 9.179281130069217e-06, "loss": 0.3833, "step": 100835 }, { "epoch": 3.634266767578477, "grad_norm": 0.2610785961151123, "learning_rate": 9.177021750940162e-06, "loss": 0.3577, "step": 100840 }, { "epoch": 3.6344469672397017, "grad_norm": 0.2569064497947693, "learning_rate": 9.174762587396124e-06, "loss": 0.3981, "step": 100845 }, 
{ "epoch": 3.634627166900926, "grad_norm": 0.308135986328125, "learning_rate": 9.17250363946788e-06, "loss": 0.4227, "step": 100850 }, { "epoch": 3.6348073665621508, "grad_norm": 0.2591281235218048, "learning_rate": 9.170244907186201e-06, "loss": 0.3902, "step": 100855 }, { "epoch": 3.6349875662233755, "grad_norm": 0.24686266481876373, "learning_rate": 9.167986390581879e-06, "loss": 0.3907, "step": 100860 }, { "epoch": 3.6351677658846, "grad_norm": 0.21753914654254913, "learning_rate": 9.165728089685687e-06, "loss": 0.398, "step": 100865 }, { "epoch": 3.635347965545825, "grad_norm": 0.23476707935333252, "learning_rate": 9.163470004528366e-06, "loss": 0.3879, "step": 100870 }, { "epoch": 3.6355281652070492, "grad_norm": 0.2765940725803375, "learning_rate": 9.16121213514071e-06, "loss": 0.4109, "step": 100875 }, { "epoch": 3.635708364868274, "grad_norm": 0.2919420301914215, "learning_rate": 9.15895448155347e-06, "loss": 0.3816, "step": 100880 }, { "epoch": 3.6358885645294987, "grad_norm": 0.2056131362915039, "learning_rate": 9.156697043797402e-06, "loss": 0.3565, "step": 100885 }, { "epoch": 3.6360687641907234, "grad_norm": 0.29402926564216614, "learning_rate": 9.154439821903286e-06, "loss": 0.4164, "step": 100890 }, { "epoch": 3.6362489638519477, "grad_norm": 0.2530692517757416, "learning_rate": 9.152182815901841e-06, "loss": 0.3703, "step": 100895 }, { "epoch": 3.6364291635131725, "grad_norm": 0.24608898162841797, "learning_rate": 9.149926025823846e-06, "loss": 0.3821, "step": 100900 }, { "epoch": 3.636609363174397, "grad_norm": 0.1967172771692276, "learning_rate": 9.147669451700042e-06, "loss": 0.3682, "step": 100905 }, { "epoch": 3.636789562835622, "grad_norm": 0.2919192910194397, "learning_rate": 9.14541309356117e-06, "loss": 0.379, "step": 100910 }, { "epoch": 3.6369697624968467, "grad_norm": 0.22142526507377625, "learning_rate": 9.143156951437976e-06, "loss": 0.3938, "step": 100915 }, { "epoch": 3.6371499621580714, "grad_norm": 0.22058270871639252, 
"learning_rate": 9.140901025361198e-06, "loss": 0.3691, "step": 100920 }, { "epoch": 3.6373301618192957, "grad_norm": 0.2393786460161209, "learning_rate": 9.138645315361566e-06, "loss": 0.3929, "step": 100925 }, { "epoch": 3.6375103614805204, "grad_norm": 0.26111987233161926, "learning_rate": 9.13638982146983e-06, "loss": 0.4086, "step": 100930 }, { "epoch": 3.637690561141745, "grad_norm": 0.2450508177280426, "learning_rate": 9.134134543716711e-06, "loss": 0.3776, "step": 100935 }, { "epoch": 3.6378707608029694, "grad_norm": 0.26415956020355225, "learning_rate": 9.131879482132935e-06, "loss": 0.4035, "step": 100940 }, { "epoch": 3.638050960464194, "grad_norm": 0.21556943655014038, "learning_rate": 9.12962463674923e-06, "loss": 0.3617, "step": 100945 }, { "epoch": 3.638231160125419, "grad_norm": 0.26129695773124695, "learning_rate": 9.127370007596308e-06, "loss": 0.3823, "step": 100950 }, { "epoch": 3.6384113597866437, "grad_norm": 0.2311808317899704, "learning_rate": 9.125115594704905e-06, "loss": 0.3735, "step": 100955 }, { "epoch": 3.6385915594478684, "grad_norm": 0.24276402592658997, "learning_rate": 9.122861398105736e-06, "loss": 0.3719, "step": 100960 }, { "epoch": 3.638771759109093, "grad_norm": 0.2854870557785034, "learning_rate": 9.12060741782949e-06, "loss": 0.3717, "step": 100965 }, { "epoch": 3.6389519587703174, "grad_norm": 0.2724928557872772, "learning_rate": 9.118353653906905e-06, "loss": 0.4111, "step": 100970 }, { "epoch": 3.639132158431542, "grad_norm": 0.22794464230537415, "learning_rate": 9.116100106368667e-06, "loss": 0.3872, "step": 100975 }, { "epoch": 3.639312358092767, "grad_norm": 0.2465512901544571, "learning_rate": 9.113846775245506e-06, "loss": 0.3988, "step": 100980 }, { "epoch": 3.639492557753991, "grad_norm": 0.24606558680534363, "learning_rate": 9.111593660568099e-06, "loss": 0.3761, "step": 100985 }, { "epoch": 3.639672757415216, "grad_norm": 0.1907157152891159, "learning_rate": 9.109340762367141e-06, "loss": 0.3817, "step": 100990 
}, { "epoch": 3.6398529570764406, "grad_norm": 0.2616572976112366, "learning_rate": 9.107088080673351e-06, "loss": 0.4135, "step": 100995 }, { "epoch": 3.6400331567376654, "grad_norm": 0.2173600196838379, "learning_rate": 9.104835615517406e-06, "loss": 0.3634, "step": 101000 }, { "epoch": 3.6400331567376654, "eval_loss": 0.4288954436779022, "eval_runtime": 3.5295, "eval_samples_per_second": 28.333, "eval_steps_per_second": 7.083, "step": 101000 }, { "epoch": 3.64021335639889, "grad_norm": 0.2579386234283447, "learning_rate": 9.10258336693e-06, "loss": 0.3848, "step": 101005 }, { "epoch": 3.640393556060115, "grad_norm": 0.21844492852687836, "learning_rate": 9.10033133494182e-06, "loss": 0.4074, "step": 101010 }, { "epoch": 3.640573755721339, "grad_norm": 0.25436314940452576, "learning_rate": 9.098079519583536e-06, "loss": 0.3451, "step": 101015 }, { "epoch": 3.640753955382564, "grad_norm": 0.237601637840271, "learning_rate": 9.095827920885847e-06, "loss": 0.3692, "step": 101020 }, { "epoch": 3.6409341550437886, "grad_norm": 0.1813516467809677, "learning_rate": 9.093576538879425e-06, "loss": 0.408, "step": 101025 }, { "epoch": 3.641114354705013, "grad_norm": 0.23129382729530334, "learning_rate": 9.091325373594944e-06, "loss": 0.3885, "step": 101030 }, { "epoch": 3.6412945543662376, "grad_norm": 0.24837276339530945, "learning_rate": 9.089074425063074e-06, "loss": 0.4024, "step": 101035 }, { "epoch": 3.6414747540274623, "grad_norm": 0.23222190141677856, "learning_rate": 9.086823693314476e-06, "loss": 0.3736, "step": 101040 }, { "epoch": 3.641654953688687, "grad_norm": 0.2549395263195038, "learning_rate": 9.084573178379832e-06, "loss": 0.3688, "step": 101045 }, { "epoch": 3.641835153349912, "grad_norm": 0.3221529722213745, "learning_rate": 9.082322880289798e-06, "loss": 0.4044, "step": 101050 }, { "epoch": 3.6420153530111365, "grad_norm": 0.19385772943496704, "learning_rate": 9.080072799075033e-06, "loss": 0.3395, "step": 101055 }, { "epoch": 3.642195552672361, 
"grad_norm": 0.21461910009384155, "learning_rate": 9.077822934766194e-06, "loss": 0.3626, "step": 101060 }, { "epoch": 3.6423757523335856, "grad_norm": 0.2746890187263489, "learning_rate": 9.075573287393935e-06, "loss": 0.3974, "step": 101065 }, { "epoch": 3.6425559519948103, "grad_norm": 0.22592508792877197, "learning_rate": 9.073323856988898e-06, "loss": 0.3628, "step": 101070 }, { "epoch": 3.642736151656035, "grad_norm": 0.17644916474819183, "learning_rate": 9.071074643581757e-06, "loss": 0.3879, "step": 101075 }, { "epoch": 3.6429163513172593, "grad_norm": 0.20565937459468842, "learning_rate": 9.068825647203122e-06, "loss": 0.3596, "step": 101080 }, { "epoch": 3.643096550978484, "grad_norm": 0.2097647488117218, "learning_rate": 9.066576867883664e-06, "loss": 0.3634, "step": 101085 }, { "epoch": 3.643276750639709, "grad_norm": 0.24162964522838593, "learning_rate": 9.06432830565401e-06, "loss": 0.3715, "step": 101090 }, { "epoch": 3.6434569503009335, "grad_norm": 0.2646183371543884, "learning_rate": 9.062079960544798e-06, "loss": 0.3756, "step": 101095 }, { "epoch": 3.6436371499621583, "grad_norm": 0.20252090692520142, "learning_rate": 9.05983183258666e-06, "loss": 0.3804, "step": 101100 }, { "epoch": 3.6438173496233826, "grad_norm": 0.2239752858877182, "learning_rate": 9.057583921810225e-06, "loss": 0.34, "step": 101105 }, { "epoch": 3.6439975492846073, "grad_norm": 0.23255230486392975, "learning_rate": 9.055336228246119e-06, "loss": 0.3719, "step": 101110 }, { "epoch": 3.644177748945832, "grad_norm": 0.23270903527736664, "learning_rate": 9.053088751924976e-06, "loss": 0.3866, "step": 101115 }, { "epoch": 3.6443579486070568, "grad_norm": 0.27508658170700073, "learning_rate": 9.05084149287741e-06, "loss": 0.4123, "step": 101120 }, { "epoch": 3.644538148268281, "grad_norm": 0.19277596473693848, "learning_rate": 9.048594451134042e-06, "loss": 0.3398, "step": 101125 }, { "epoch": 3.6447183479295058, "grad_norm": 0.28608769178390503, "learning_rate": 
9.046347626725487e-06, "loss": 0.4222, "step": 101130 }, { "epoch": 3.6448985475907305, "grad_norm": 0.21985885500907898, "learning_rate": 9.04410101968235e-06, "loss": 0.3681, "step": 101135 }, { "epoch": 3.6450787472519552, "grad_norm": 0.2108525186777115, "learning_rate": 9.04185463003525e-06, "loss": 0.338, "step": 101140 }, { "epoch": 3.64525894691318, "grad_norm": 0.28811293840408325, "learning_rate": 9.039608457814805e-06, "loss": 0.4011, "step": 101145 }, { "epoch": 3.6454391465744047, "grad_norm": 0.2849256992340088, "learning_rate": 9.037362503051585e-06, "loss": 0.4098, "step": 101150 }, { "epoch": 3.645619346235629, "grad_norm": 0.2216586172580719, "learning_rate": 9.035116765776223e-06, "loss": 0.355, "step": 101155 }, { "epoch": 3.6457995458968537, "grad_norm": 0.22768335044384003, "learning_rate": 9.032871246019292e-06, "loss": 0.368, "step": 101160 }, { "epoch": 3.6459797455580785, "grad_norm": 0.23428839445114136, "learning_rate": 9.030625943811408e-06, "loss": 0.3678, "step": 101165 }, { "epoch": 3.6461599452193028, "grad_norm": 0.2779921889305115, "learning_rate": 9.028380859183164e-06, "loss": 0.3978, "step": 101170 }, { "epoch": 3.6463401448805275, "grad_norm": 0.30054476857185364, "learning_rate": 9.02613599216512e-06, "loss": 0.3748, "step": 101175 }, { "epoch": 3.6465203445417522, "grad_norm": 0.27546700835227966, "learning_rate": 9.023891342787888e-06, "loss": 0.3651, "step": 101180 }, { "epoch": 3.646700544202977, "grad_norm": 0.23770831525325775, "learning_rate": 9.021646911082044e-06, "loss": 0.3828, "step": 101185 }, { "epoch": 3.6468807438642017, "grad_norm": 0.2967265248298645, "learning_rate": 9.019402697078167e-06, "loss": 0.3796, "step": 101190 }, { "epoch": 3.6470609435254264, "grad_norm": 0.22645854949951172, "learning_rate": 9.017158700806835e-06, "loss": 0.3777, "step": 101195 }, { "epoch": 3.6472411431866507, "grad_norm": 0.21501436829566956, "learning_rate": 9.014914922298612e-06, "loss": 0.3483, "step": 101200 }, { "epoch": 
3.6474213428478754, "grad_norm": 0.25435328483581543, "learning_rate": 9.012671361584088e-06, "loss": 0.4062, "step": 101205 }, { "epoch": 3.6476015425091, "grad_norm": 0.21039626002311707, "learning_rate": 9.010428018693823e-06, "loss": 0.3534, "step": 101210 }, { "epoch": 3.6477817421703245, "grad_norm": 0.24590708315372467, "learning_rate": 9.008184893658378e-06, "loss": 0.3897, "step": 101215 }, { "epoch": 3.647961941831549, "grad_norm": 0.23743712902069092, "learning_rate": 9.005941986508318e-06, "loss": 0.375, "step": 101220 }, { "epoch": 3.648142141492774, "grad_norm": 0.23340876400470734, "learning_rate": 9.0036992972742e-06, "loss": 0.368, "step": 101225 }, { "epoch": 3.6483223411539987, "grad_norm": 0.2544988691806793, "learning_rate": 9.001456825986579e-06, "loss": 0.3809, "step": 101230 }, { "epoch": 3.6485025408152234, "grad_norm": 0.24296405911445618, "learning_rate": 8.999214572676015e-06, "loss": 0.3803, "step": 101235 }, { "epoch": 3.648682740476448, "grad_norm": 0.2581973969936371, "learning_rate": 8.996972537373057e-06, "loss": 0.364, "step": 101240 }, { "epoch": 3.6488629401376724, "grad_norm": 0.22508592903614044, "learning_rate": 8.99473072010825e-06, "loss": 0.3848, "step": 101245 }, { "epoch": 3.649043139798897, "grad_norm": 0.2491467446088791, "learning_rate": 8.992489120912138e-06, "loss": 0.3736, "step": 101250 }, { "epoch": 3.649223339460122, "grad_norm": 0.24909654259681702, "learning_rate": 8.990247739815252e-06, "loss": 0.3896, "step": 101255 }, { "epoch": 3.649403539121346, "grad_norm": 0.25092822313308716, "learning_rate": 8.988006576848159e-06, "loss": 0.3498, "step": 101260 }, { "epoch": 3.649583738782571, "grad_norm": 0.2587062120437622, "learning_rate": 8.98576563204136e-06, "loss": 0.3853, "step": 101265 }, { "epoch": 3.6497639384437957, "grad_norm": 0.27524474263191223, "learning_rate": 8.983524905425413e-06, "loss": 0.3667, "step": 101270 }, { "epoch": 3.6499441381050204, "grad_norm": 0.2666233479976654, "learning_rate": 
8.981284397030837e-06, "loss": 0.37, "step": 101275 }, { "epoch": 3.650124337766245, "grad_norm": 0.21499104797840118, "learning_rate": 8.979044106888152e-06, "loss": 0.3844, "step": 101280 }, { "epoch": 3.65030453742747, "grad_norm": 0.22629836201667786, "learning_rate": 8.976804035027905e-06, "loss": 0.3803, "step": 101285 }, { "epoch": 3.650484737088694, "grad_norm": 0.24577264487743378, "learning_rate": 8.974564181480594e-06, "loss": 0.3886, "step": 101290 }, { "epoch": 3.650664936749919, "grad_norm": 0.22071078419685364, "learning_rate": 8.972324546276733e-06, "loss": 0.402, "step": 101295 }, { "epoch": 3.6508451364111436, "grad_norm": 0.2849634289741516, "learning_rate": 8.97008512944686e-06, "loss": 0.3814, "step": 101300 }, { "epoch": 3.6510253360723683, "grad_norm": 0.22348150610923767, "learning_rate": 8.967845931021469e-06, "loss": 0.375, "step": 101305 }, { "epoch": 3.6512055357335926, "grad_norm": 0.25602954626083374, "learning_rate": 8.965606951031074e-06, "loss": 0.3992, "step": 101310 }, { "epoch": 3.6513857353948174, "grad_norm": 0.270134836435318, "learning_rate": 8.96336818950618e-06, "loss": 0.3896, "step": 101315 }, { "epoch": 3.651565935056042, "grad_norm": 0.2468041628599167, "learning_rate": 8.961129646477281e-06, "loss": 0.3846, "step": 101320 }, { "epoch": 3.651746134717267, "grad_norm": 0.22694005072116852, "learning_rate": 8.958891321974896e-06, "loss": 0.3719, "step": 101325 }, { "epoch": 3.6519263343784916, "grad_norm": 0.3231867253780365, "learning_rate": 8.956653216029509e-06, "loss": 0.4239, "step": 101330 }, { "epoch": 3.652106534039716, "grad_norm": 0.23950925469398499, "learning_rate": 8.954415328671617e-06, "loss": 0.3259, "step": 101335 }, { "epoch": 3.6522867337009406, "grad_norm": 0.30801981687545776, "learning_rate": 8.95217765993171e-06, "loss": 0.3592, "step": 101340 }, { "epoch": 3.6524669333621653, "grad_norm": 0.23010605573654175, "learning_rate": 8.949940209840266e-06, "loss": 0.3641, "step": 101345 }, { "epoch": 
3.65264713302339, "grad_norm": 0.2432517111301422, "learning_rate": 8.947702978427786e-06, "loss": 0.3871, "step": 101350 }, { "epoch": 3.6528273326846143, "grad_norm": 0.21140114963054657, "learning_rate": 8.945465965724756e-06, "loss": 0.3849, "step": 101355 }, { "epoch": 3.653007532345839, "grad_norm": 0.263443261384964, "learning_rate": 8.943229171761628e-06, "loss": 0.4131, "step": 101360 }, { "epoch": 3.653187732007064, "grad_norm": 0.19522389769554138, "learning_rate": 8.940992596568897e-06, "loss": 0.3953, "step": 101365 }, { "epoch": 3.6533679316682885, "grad_norm": 0.26923754811286926, "learning_rate": 8.938756240177037e-06, "loss": 0.3825, "step": 101370 }, { "epoch": 3.6535481313295133, "grad_norm": 0.24230341613292694, "learning_rate": 8.936520102616513e-06, "loss": 0.3723, "step": 101375 }, { "epoch": 3.6537283309907376, "grad_norm": 0.27861183881759644, "learning_rate": 8.934284183917793e-06, "loss": 0.3888, "step": 101380 }, { "epoch": 3.6539085306519623, "grad_norm": 0.18770521879196167, "learning_rate": 8.932048484111333e-06, "loss": 0.3635, "step": 101385 }, { "epoch": 3.654088730313187, "grad_norm": 0.2484419047832489, "learning_rate": 8.929813003227608e-06, "loss": 0.3887, "step": 101390 }, { "epoch": 3.6542689299744118, "grad_norm": 0.21241828799247742, "learning_rate": 8.927577741297072e-06, "loss": 0.3797, "step": 101395 }, { "epoch": 3.654449129635636, "grad_norm": 0.2524920701980591, "learning_rate": 8.925342698350175e-06, "loss": 0.3909, "step": 101400 }, { "epoch": 3.654629329296861, "grad_norm": 0.2676452696323395, "learning_rate": 8.923107874417372e-06, "loss": 0.3485, "step": 101405 }, { "epoch": 3.6548095289580855, "grad_norm": 0.21999165415763855, "learning_rate": 8.920873269529112e-06, "loss": 0.3712, "step": 101410 }, { "epoch": 3.6549897286193103, "grad_norm": 0.23482544720172882, "learning_rate": 8.918638883715832e-06, "loss": 0.3802, "step": 101415 }, { "epoch": 3.655169928280535, "grad_norm": 0.2594209313392639, 
"learning_rate": 8.916404717007992e-06, "loss": 0.3737, "step": 101420 }, { "epoch": 3.6553501279417597, "grad_norm": 0.20219695568084717, "learning_rate": 8.914170769436025e-06, "loss": 0.3846, "step": 101425 }, { "epoch": 3.655530327602984, "grad_norm": 0.24486181139945984, "learning_rate": 8.911937041030365e-06, "loss": 0.3891, "step": 101430 }, { "epoch": 3.6557105272642088, "grad_norm": 0.19330917298793793, "learning_rate": 8.90970353182145e-06, "loss": 0.3704, "step": 101435 }, { "epoch": 3.6558907269254335, "grad_norm": 0.21298684179782867, "learning_rate": 8.907470241839703e-06, "loss": 0.3495, "step": 101440 }, { "epoch": 3.6560709265866578, "grad_norm": 0.25250443816185, "learning_rate": 8.905237171115563e-06, "loss": 0.4116, "step": 101445 }, { "epoch": 3.6562511262478825, "grad_norm": 0.20300345122814178, "learning_rate": 8.90300431967945e-06, "loss": 0.3616, "step": 101450 }, { "epoch": 3.6564313259091072, "grad_norm": 0.2571481466293335, "learning_rate": 8.900771687561787e-06, "loss": 0.3844, "step": 101455 }, { "epoch": 3.656611525570332, "grad_norm": 0.18634074926376343, "learning_rate": 8.898539274792997e-06, "loss": 0.3538, "step": 101460 }, { "epoch": 3.6567917252315567, "grad_norm": 0.2489531934261322, "learning_rate": 8.896307081403479e-06, "loss": 0.3721, "step": 101465 }, { "epoch": 3.6569719248927814, "grad_norm": 0.2489020973443985, "learning_rate": 8.894075107423678e-06, "loss": 0.3765, "step": 101470 }, { "epoch": 3.6571521245540057, "grad_norm": 0.19710054993629456, "learning_rate": 8.891843352883978e-06, "loss": 0.3843, "step": 101475 }, { "epoch": 3.6573323242152305, "grad_norm": 0.22611059248447418, "learning_rate": 8.889611817814783e-06, "loss": 0.3832, "step": 101480 }, { "epoch": 3.657512523876455, "grad_norm": 0.2789013683795929, "learning_rate": 8.887380502246517e-06, "loss": 0.3685, "step": 101485 }, { "epoch": 3.6576927235376795, "grad_norm": 0.17864690721035004, "learning_rate": 8.885149406209573e-06, "loss": 0.3661, "step": 
101490 }, { "epoch": 3.6578729231989042, "grad_norm": 0.23747530579566956, "learning_rate": 8.882918529734346e-06, "loss": 0.4134, "step": 101495 }, { "epoch": 3.658053122860129, "grad_norm": 0.25165247917175293, "learning_rate": 8.880687872851237e-06, "loss": 0.3606, "step": 101500 }, { "epoch": 3.658053122860129, "eval_loss": 0.4290784001350403, "eval_runtime": 3.529, "eval_samples_per_second": 28.337, "eval_steps_per_second": 7.084, "step": 101500 }, { "epoch": 3.6582333225213537, "grad_norm": 0.21503256261348724, "learning_rate": 8.878457435590626e-06, "loss": 0.3689, "step": 101505 }, { "epoch": 3.6584135221825784, "grad_norm": 0.1987353265285492, "learning_rate": 8.876227217982916e-06, "loss": 0.3373, "step": 101510 }, { "epoch": 3.658593721843803, "grad_norm": 0.29664599895477295, "learning_rate": 8.87399722005849e-06, "loss": 0.3837, "step": 101515 }, { "epoch": 3.6587739215050274, "grad_norm": 0.22414027154445648, "learning_rate": 8.871767441847734e-06, "loss": 0.3662, "step": 101520 }, { "epoch": 3.658954121166252, "grad_norm": 0.2331104874610901, "learning_rate": 8.869537883381022e-06, "loss": 0.3745, "step": 101525 }, { "epoch": 3.659134320827477, "grad_norm": 0.2712317407131195, "learning_rate": 8.86730854468872e-06, "loss": 0.3619, "step": 101530 }, { "epoch": 3.659314520488701, "grad_norm": 0.22348475456237793, "learning_rate": 8.865079425801228e-06, "loss": 0.3921, "step": 101535 }, { "epoch": 3.659494720149926, "grad_norm": 0.2932550311088562, "learning_rate": 8.862850526748917e-06, "loss": 0.3714, "step": 101540 }, { "epoch": 3.6596749198111507, "grad_norm": 0.26502054929733276, "learning_rate": 8.860621847562123e-06, "loss": 0.3736, "step": 101545 }, { "epoch": 3.6598551194723754, "grad_norm": 0.25092580914497375, "learning_rate": 8.858393388271238e-06, "loss": 0.3978, "step": 101550 }, { "epoch": 3.6600353191336, "grad_norm": 0.2842051684856415, "learning_rate": 8.85616514890662e-06, "loss": 0.3674, "step": 101555 }, { "epoch": 
3.660215518794825, "grad_norm": 0.2661273777484894, "learning_rate": 8.853937129498627e-06, "loss": 0.3835, "step": 101560 }, { "epoch": 3.660395718456049, "grad_norm": 0.23841172456741333, "learning_rate": 8.851709330077615e-06, "loss": 0.3654, "step": 101565 }, { "epoch": 3.660575918117274, "grad_norm": 0.20587527751922607, "learning_rate": 8.849481750673927e-06, "loss": 0.383, "step": 101570 }, { "epoch": 3.6607561177784986, "grad_norm": 0.2173176258802414, "learning_rate": 8.84725439131793e-06, "loss": 0.4062, "step": 101575 }, { "epoch": 3.6609363174397234, "grad_norm": 0.201410710811615, "learning_rate": 8.845027252039969e-06, "loss": 0.3691, "step": 101580 }, { "epoch": 3.6611165171009477, "grad_norm": 0.24154742062091827, "learning_rate": 8.842800332870382e-06, "loss": 0.3661, "step": 101585 }, { "epoch": 3.6612967167621724, "grad_norm": 0.18455544114112854, "learning_rate": 8.84057363383951e-06, "loss": 0.3598, "step": 101590 }, { "epoch": 3.661476916423397, "grad_norm": 0.24699454009532928, "learning_rate": 8.838347154977697e-06, "loss": 0.3765, "step": 101595 }, { "epoch": 3.661657116084622, "grad_norm": 0.20563150942325592, "learning_rate": 8.836120896315267e-06, "loss": 0.3507, "step": 101600 }, { "epoch": 3.6618373157458466, "grad_norm": 0.21455486118793488, "learning_rate": 8.833894857882566e-06, "loss": 0.365, "step": 101605 }, { "epoch": 3.662017515407071, "grad_norm": 0.237900048494339, "learning_rate": 8.83166903970992e-06, "loss": 0.3663, "step": 101610 }, { "epoch": 3.6621977150682956, "grad_norm": 0.24725571274757385, "learning_rate": 8.82944344182765e-06, "loss": 0.382, "step": 101615 }, { "epoch": 3.6623779147295203, "grad_norm": 0.28484535217285156, "learning_rate": 8.827218064266085e-06, "loss": 0.4055, "step": 101620 }, { "epoch": 3.662558114390745, "grad_norm": 0.2544311583042145, "learning_rate": 8.824992907055534e-06, "loss": 0.3613, "step": 101625 }, { "epoch": 3.6627383140519694, "grad_norm": 0.21593233942985535, "learning_rate": 
8.822767970226332e-06, "loss": 0.3756, "step": 101630 }, { "epoch": 3.662918513713194, "grad_norm": 0.2815852463245392, "learning_rate": 8.820543253808783e-06, "loss": 0.4064, "step": 101635 }, { "epoch": 3.663098713374419, "grad_norm": 0.299152135848999, "learning_rate": 8.8183187578332e-06, "loss": 0.4224, "step": 101640 }, { "epoch": 3.6632789130356436, "grad_norm": 0.24634763598442078, "learning_rate": 8.81609448232989e-06, "loss": 0.3834, "step": 101645 }, { "epoch": 3.6634591126968683, "grad_norm": 0.22788767516613007, "learning_rate": 8.813870427329155e-06, "loss": 0.3795, "step": 101650 }, { "epoch": 3.663639312358093, "grad_norm": 0.26291418075561523, "learning_rate": 8.811646592861314e-06, "loss": 0.376, "step": 101655 }, { "epoch": 3.6638195120193173, "grad_norm": 0.191041499376297, "learning_rate": 8.809422978956647e-06, "loss": 0.3674, "step": 101660 }, { "epoch": 3.663999711680542, "grad_norm": 0.28010818362236023, "learning_rate": 8.80719958564545e-06, "loss": 0.4071, "step": 101665 }, { "epoch": 3.664179911341767, "grad_norm": 0.21684035658836365, "learning_rate": 8.804976412958029e-06, "loss": 0.3826, "step": 101670 }, { "epoch": 3.664360111002991, "grad_norm": 0.26138171553611755, "learning_rate": 8.802753460924674e-06, "loss": 0.3812, "step": 101675 }, { "epoch": 3.664540310664216, "grad_norm": 0.20288319885730743, "learning_rate": 8.800530729575665e-06, "loss": 0.3448, "step": 101680 }, { "epoch": 3.6647205103254406, "grad_norm": 0.2686285674571991, "learning_rate": 8.798308218941287e-06, "loss": 0.414, "step": 101685 }, { "epoch": 3.6649007099866653, "grad_norm": 0.26645994186401367, "learning_rate": 8.796085929051814e-06, "loss": 0.3848, "step": 101690 }, { "epoch": 3.66508090964789, "grad_norm": 0.22347326576709747, "learning_rate": 8.793863859937543e-06, "loss": 0.3875, "step": 101695 }, { "epoch": 3.6652611093091148, "grad_norm": 0.21688346564769745, "learning_rate": 8.79164201162874e-06, "loss": 0.3964, "step": 101700 }, { "epoch": 
3.665441308970339, "grad_norm": 0.2277715802192688, "learning_rate": 8.789420384155675e-06, "loss": 0.388, "step": 101705 }, { "epoch": 3.6656215086315638, "grad_norm": 0.23789222538471222, "learning_rate": 8.78719897754862e-06, "loss": 0.3821, "step": 101710 }, { "epoch": 3.6658017082927885, "grad_norm": 0.24796278774738312, "learning_rate": 8.784977791837831e-06, "loss": 0.3832, "step": 101715 }, { "epoch": 3.665981907954013, "grad_norm": 0.28839823603630066, "learning_rate": 8.782756827053588e-06, "loss": 0.3859, "step": 101720 }, { "epoch": 3.6661621076152375, "grad_norm": 0.2025182694196701, "learning_rate": 8.780536083226154e-06, "loss": 0.3639, "step": 101725 }, { "epoch": 3.6663423072764623, "grad_norm": 0.24186572432518005, "learning_rate": 8.778315560385756e-06, "loss": 0.3622, "step": 101730 }, { "epoch": 3.666522506937687, "grad_norm": 0.19818326830863953, "learning_rate": 8.776095258562677e-06, "loss": 0.4044, "step": 101735 }, { "epoch": 3.6667027065989117, "grad_norm": 0.2045317143201828, "learning_rate": 8.773875177787161e-06, "loss": 0.3493, "step": 101740 }, { "epoch": 3.6668829062601365, "grad_norm": 0.2028067409992218, "learning_rate": 8.771655318089445e-06, "loss": 0.3869, "step": 101745 }, { "epoch": 3.6670631059213608, "grad_norm": 0.24124419689178467, "learning_rate": 8.769435679499798e-06, "loss": 0.3822, "step": 101750 }, { "epoch": 3.6672433055825855, "grad_norm": 0.2072051465511322, "learning_rate": 8.767216262048433e-06, "loss": 0.3652, "step": 101755 }, { "epoch": 3.6674235052438102, "grad_norm": 0.22216108441352844, "learning_rate": 8.764997065765612e-06, "loss": 0.404, "step": 101760 }, { "epoch": 3.6676037049050345, "grad_norm": 0.23680134117603302, "learning_rate": 8.762778090681562e-06, "loss": 0.3964, "step": 101765 }, { "epoch": 3.6677839045662592, "grad_norm": 0.29067909717559814, "learning_rate": 8.760559336826519e-06, "loss": 0.3897, "step": 101770 }, { "epoch": 3.667964104227484, "grad_norm": 0.244569793343544, 
"learning_rate": 8.758340804230709e-06, "loss": 0.3949, "step": 101775 }, { "epoch": 3.6681443038887087, "grad_norm": 0.22354869544506073, "learning_rate": 8.75612249292436e-06, "loss": 0.3871, "step": 101780 }, { "epoch": 3.6683245035499334, "grad_norm": 0.2619014084339142, "learning_rate": 8.75390440293769e-06, "loss": 0.4095, "step": 101785 }, { "epoch": 3.668504703211158, "grad_norm": 0.2473766952753067, "learning_rate": 8.751686534300934e-06, "loss": 0.37, "step": 101790 }, { "epoch": 3.6686849028723825, "grad_norm": 0.24038511514663696, "learning_rate": 8.749468887044308e-06, "loss": 0.3576, "step": 101795 }, { "epoch": 3.668865102533607, "grad_norm": 0.25563234090805054, "learning_rate": 8.747251461198016e-06, "loss": 0.4066, "step": 101800 }, { "epoch": 3.669045302194832, "grad_norm": 0.2780168354511261, "learning_rate": 8.745034256792281e-06, "loss": 0.3688, "step": 101805 }, { "epoch": 3.6692255018560567, "grad_norm": 0.2796255350112915, "learning_rate": 8.742817273857295e-06, "loss": 0.3828, "step": 101810 }, { "epoch": 3.669405701517281, "grad_norm": 0.2719496190547943, "learning_rate": 8.740600512423289e-06, "loss": 0.3553, "step": 101815 }, { "epoch": 3.6695859011785057, "grad_norm": 0.22607314586639404, "learning_rate": 8.73838397252046e-06, "loss": 0.368, "step": 101820 }, { "epoch": 3.6697661008397304, "grad_norm": 0.20450633764266968, "learning_rate": 8.73616765417898e-06, "loss": 0.3893, "step": 101825 }, { "epoch": 3.669946300500955, "grad_norm": 0.20725052058696747, "learning_rate": 8.73395155742908e-06, "loss": 0.3655, "step": 101830 }, { "epoch": 3.67012650016218, "grad_norm": 0.2661026418209076, "learning_rate": 8.731735682300932e-06, "loss": 0.3748, "step": 101835 }, { "epoch": 3.670306699823404, "grad_norm": 0.20922096073627472, "learning_rate": 8.72952002882475e-06, "loss": 0.3746, "step": 101840 }, { "epoch": 3.670486899484629, "grad_norm": 0.2124052345752716, "learning_rate": 8.7273045970307e-06, "loss": 0.3466, "step": 101845 }, { 
"epoch": 3.6706670991458537, "grad_norm": 0.24056388437747955, "learning_rate": 8.725089386948967e-06, "loss": 0.3639, "step": 101850 }, { "epoch": 3.6708472988070784, "grad_norm": 0.21469198167324066, "learning_rate": 8.722874398609749e-06, "loss": 0.4017, "step": 101855 }, { "epoch": 3.6710274984683027, "grad_norm": 0.23535849153995514, "learning_rate": 8.720659632043207e-06, "loss": 0.4005, "step": 101860 }, { "epoch": 3.6712076981295274, "grad_norm": 0.2353014349937439, "learning_rate": 8.718445087279541e-06, "loss": 0.341, "step": 101865 }, { "epoch": 3.671387897790752, "grad_norm": 0.24978400766849518, "learning_rate": 8.716230764348901e-06, "loss": 0.3621, "step": 101870 }, { "epoch": 3.671568097451977, "grad_norm": 0.20733198523521423, "learning_rate": 8.714016663281457e-06, "loss": 0.3481, "step": 101875 }, { "epoch": 3.6717482971132016, "grad_norm": 0.23899643123149872, "learning_rate": 8.71180278410739e-06, "loss": 0.3813, "step": 101880 }, { "epoch": 3.671928496774426, "grad_norm": 0.27464571595191956, "learning_rate": 8.709589126856857e-06, "loss": 0.4055, "step": 101885 }, { "epoch": 3.6721086964356506, "grad_norm": 0.27018100023269653, "learning_rate": 8.707375691560018e-06, "loss": 0.3866, "step": 101890 }, { "epoch": 3.6722888960968754, "grad_norm": 0.22291234135627747, "learning_rate": 8.70516247824703e-06, "loss": 0.3894, "step": 101895 }, { "epoch": 3.6724690957581, "grad_norm": 0.24809099733829498, "learning_rate": 8.702949486948042e-06, "loss": 0.4137, "step": 101900 }, { "epoch": 3.6726492954193244, "grad_norm": 0.27931684255599976, "learning_rate": 8.70073671769322e-06, "loss": 0.3755, "step": 101905 }, { "epoch": 3.672829495080549, "grad_norm": 0.2072281837463379, "learning_rate": 8.698524170512703e-06, "loss": 0.4101, "step": 101910 }, { "epoch": 3.673009694741774, "grad_norm": 0.20641744136810303, "learning_rate": 8.696311845436641e-06, "loss": 0.3667, "step": 101915 }, { "epoch": 3.6731898944029986, "grad_norm": 0.28100159764289856, 
"learning_rate": 8.69409974249517e-06, "loss": 0.3929, "step": 101920 }, { "epoch": 3.6733700940642233, "grad_norm": 0.2308221310377121, "learning_rate": 8.691887861718437e-06, "loss": 0.3521, "step": 101925 }, { "epoch": 3.673550293725448, "grad_norm": 0.23538222908973694, "learning_rate": 8.689676203136565e-06, "loss": 0.3795, "step": 101930 }, { "epoch": 3.6737304933866723, "grad_norm": 0.25849369168281555, "learning_rate": 8.687464766779712e-06, "loss": 0.3653, "step": 101935 }, { "epoch": 3.673910693047897, "grad_norm": 0.19717663526535034, "learning_rate": 8.685253552677978e-06, "loss": 0.3281, "step": 101940 }, { "epoch": 3.674090892709122, "grad_norm": 0.21411502361297607, "learning_rate": 8.683042560861512e-06, "loss": 0.367, "step": 101945 }, { "epoch": 3.674271092370346, "grad_norm": 0.23891699314117432, "learning_rate": 8.680831791360433e-06, "loss": 0.3794, "step": 101950 }, { "epoch": 3.674451292031571, "grad_norm": 0.2534111440181732, "learning_rate": 8.678621244204863e-06, "loss": 0.4035, "step": 101955 }, { "epoch": 3.6746314916927956, "grad_norm": 0.21654155850410461, "learning_rate": 8.676410919424917e-06, "loss": 0.3668, "step": 101960 }, { "epoch": 3.6748116913540203, "grad_norm": 0.2257722020149231, "learning_rate": 8.674200817050712e-06, "loss": 0.3829, "step": 101965 }, { "epoch": 3.674991891015245, "grad_norm": 0.20194071531295776, "learning_rate": 8.671990937112354e-06, "loss": 0.3617, "step": 101970 }, { "epoch": 3.6751720906764698, "grad_norm": 0.2552418112754822, "learning_rate": 8.669781279639963e-06, "loss": 0.3767, "step": 101975 }, { "epoch": 3.675352290337694, "grad_norm": 0.1989719569683075, "learning_rate": 8.667571844663644e-06, "loss": 0.3989, "step": 101980 }, { "epoch": 3.675532489998919, "grad_norm": 0.2214037925004959, "learning_rate": 8.665362632213494e-06, "loss": 0.3783, "step": 101985 }, { "epoch": 3.6757126896601435, "grad_norm": 0.23372018337249756, "learning_rate": 8.663153642319616e-06, "loss": 0.3871, "step": 
101990 }, { "epoch": 3.675892889321368, "grad_norm": 0.20797449350357056, "learning_rate": 8.6609448750121e-06, "loss": 0.3736, "step": 101995 }, { "epoch": 3.6760730889825926, "grad_norm": 0.2792084515094757, "learning_rate": 8.658736330321051e-06, "loss": 0.3913, "step": 102000 }, { "epoch": 3.6760730889825926, "eval_loss": 0.4288567304611206, "eval_runtime": 3.5476, "eval_samples_per_second": 28.188, "eval_steps_per_second": 7.047, "step": 102000 }, { "epoch": 3.6762532886438173, "grad_norm": 0.2272174209356308, "learning_rate": 8.656528008276568e-06, "loss": 0.3661, "step": 102005 }, { "epoch": 3.676433488305042, "grad_norm": 0.21868175268173218, "learning_rate": 8.654319908908709e-06, "loss": 0.3666, "step": 102010 }, { "epoch": 3.6766136879662668, "grad_norm": 0.23744815587997437, "learning_rate": 8.652112032247587e-06, "loss": 0.369, "step": 102015 }, { "epoch": 3.6767938876274915, "grad_norm": 0.26380473375320435, "learning_rate": 8.649904378323262e-06, "loss": 0.3666, "step": 102020 }, { "epoch": 3.6769740872887158, "grad_norm": 0.2755199372768402, "learning_rate": 8.647696947165834e-06, "loss": 0.3548, "step": 102025 }, { "epoch": 3.6771542869499405, "grad_norm": 0.2914251685142517, "learning_rate": 8.645489738805376e-06, "loss": 0.3747, "step": 102030 }, { "epoch": 3.6773344866111652, "grad_norm": 0.18029101192951202, "learning_rate": 8.64328275327194e-06, "loss": 0.3884, "step": 102035 }, { "epoch": 3.6775146862723895, "grad_norm": 0.22522252798080444, "learning_rate": 8.641075990595615e-06, "loss": 0.3904, "step": 102040 }, { "epoch": 3.6776948859336143, "grad_norm": 0.2504473924636841, "learning_rate": 8.638869450806455e-06, "loss": 0.3892, "step": 102045 }, { "epoch": 3.677875085594839, "grad_norm": 0.28689128160476685, "learning_rate": 8.63666313393455e-06, "loss": 0.3805, "step": 102050 }, { "epoch": 3.6780552852560637, "grad_norm": 0.3135242164134979, "learning_rate": 8.634457040009932e-06, "loss": 0.373, "step": 102055 }, { "epoch": 
3.6782354849172885, "grad_norm": 0.2535361051559448, "learning_rate": 8.63225116906266e-06, "loss": 0.3851, "step": 102060 }, { "epoch": 3.678415684578513, "grad_norm": 0.27366381883621216, "learning_rate": 8.630045521122806e-06, "loss": 0.3767, "step": 102065 }, { "epoch": 3.6785958842397375, "grad_norm": 0.2666814923286438, "learning_rate": 8.62784009622041e-06, "loss": 0.3867, "step": 102070 }, { "epoch": 3.6787760839009622, "grad_norm": 0.27532005310058594, "learning_rate": 8.625634894385525e-06, "loss": 0.3659, "step": 102075 }, { "epoch": 3.678956283562187, "grad_norm": 0.2169187366962433, "learning_rate": 8.623429915648195e-06, "loss": 0.3966, "step": 102080 }, { "epoch": 3.6791364832234117, "grad_norm": 0.20126007497310638, "learning_rate": 8.62122516003845e-06, "loss": 0.3569, "step": 102085 }, { "epoch": 3.679316682884636, "grad_norm": 0.22194866836071014, "learning_rate": 8.619020627586355e-06, "loss": 0.3845, "step": 102090 }, { "epoch": 3.6794968825458607, "grad_norm": 0.2033412605524063, "learning_rate": 8.61681631832193e-06, "loss": 0.3834, "step": 102095 }, { "epoch": 3.6796770822070854, "grad_norm": 0.23441757261753082, "learning_rate": 8.61461223227521e-06, "loss": 0.4229, "step": 102100 }, { "epoch": 3.67985728186831, "grad_norm": 0.2877858877182007, "learning_rate": 8.612408369476225e-06, "loss": 0.3905, "step": 102105 }, { "epoch": 3.680037481529535, "grad_norm": 0.2478102594614029, "learning_rate": 8.610204729955005e-06, "loss": 0.3757, "step": 102110 }, { "epoch": 3.680217681190759, "grad_norm": 0.2399631142616272, "learning_rate": 8.608001313741562e-06, "loss": 0.3803, "step": 102115 }, { "epoch": 3.680397880851984, "grad_norm": 0.2504538595676422, "learning_rate": 8.605798120865946e-06, "loss": 0.381, "step": 102120 }, { "epoch": 3.6805780805132087, "grad_norm": 0.21540558338165283, "learning_rate": 8.60359515135814e-06, "loss": 0.4004, "step": 102125 }, { "epoch": 3.6807582801744334, "grad_norm": 0.28474679589271545, "learning_rate": 
8.601392405248186e-06, "loss": 0.4054, "step": 102130 }, { "epoch": 3.6809384798356577, "grad_norm": 0.24228844046592712, "learning_rate": 8.59918988256608e-06, "loss": 0.4117, "step": 102135 }, { "epoch": 3.6811186794968824, "grad_norm": 0.2137162685394287, "learning_rate": 8.596987583341842e-06, "loss": 0.3891, "step": 102140 }, { "epoch": 3.681298879158107, "grad_norm": 0.22238872945308685, "learning_rate": 8.594785507605468e-06, "loss": 0.3851, "step": 102145 }, { "epoch": 3.681479078819332, "grad_norm": 0.21729840338230133, "learning_rate": 8.592583655386969e-06, "loss": 0.3649, "step": 102150 }, { "epoch": 3.6816592784805566, "grad_norm": 0.205953449010849, "learning_rate": 8.590382026716331e-06, "loss": 0.3353, "step": 102155 }, { "epoch": 3.6818394781417814, "grad_norm": 0.2748815715312958, "learning_rate": 8.58818062162357e-06, "loss": 0.425, "step": 102160 }, { "epoch": 3.6820196778030057, "grad_norm": 0.29391714930534363, "learning_rate": 8.58597944013867e-06, "loss": 0.4082, "step": 102165 }, { "epoch": 3.6821998774642304, "grad_norm": 0.24622267484664917, "learning_rate": 8.583778482291621e-06, "loss": 0.3797, "step": 102170 }, { "epoch": 3.682380077125455, "grad_norm": 0.27090001106262207, "learning_rate": 8.581577748112416e-06, "loss": 0.37, "step": 102175 }, { "epoch": 3.6825602767866794, "grad_norm": 0.24336892366409302, "learning_rate": 8.579377237631022e-06, "loss": 0.3865, "step": 102180 }, { "epoch": 3.682740476447904, "grad_norm": 0.18202580511569977, "learning_rate": 8.577176950877444e-06, "loss": 0.3838, "step": 102185 }, { "epoch": 3.682920676109129, "grad_norm": 0.25919443368911743, "learning_rate": 8.574976887881653e-06, "loss": 0.379, "step": 102190 }, { "epoch": 3.6831008757703536, "grad_norm": 0.2314002364873886, "learning_rate": 8.572777048673619e-06, "loss": 0.3772, "step": 102195 }, { "epoch": 3.6832810754315783, "grad_norm": 0.22203220427036285, "learning_rate": 8.57057743328332e-06, "loss": 0.3834, "step": 102200 }, { "epoch": 
3.683461275092803, "grad_norm": 0.24144263565540314, "learning_rate": 8.568378041740712e-06, "loss": 0.3648, "step": 102205 }, { "epoch": 3.6836414747540274, "grad_norm": 0.22700710594654083, "learning_rate": 8.566178874075781e-06, "loss": 0.4007, "step": 102210 }, { "epoch": 3.683821674415252, "grad_norm": 0.22427678108215332, "learning_rate": 8.563979930318489e-06, "loss": 0.3775, "step": 102215 }, { "epoch": 3.684001874076477, "grad_norm": 0.2833479344844818, "learning_rate": 8.561781210498773e-06, "loss": 0.3957, "step": 102220 }, { "epoch": 3.684182073737701, "grad_norm": 0.25942087173461914, "learning_rate": 8.559582714646613e-06, "loss": 0.3475, "step": 102225 }, { "epoch": 3.684362273398926, "grad_norm": 0.2398940771818161, "learning_rate": 8.557384442791947e-06, "loss": 0.3867, "step": 102230 }, { "epoch": 3.6845424730601506, "grad_norm": 0.25691062211990356, "learning_rate": 8.55518639496475e-06, "loss": 0.4184, "step": 102235 }, { "epoch": 3.6847226727213753, "grad_norm": 0.3172163665294647, "learning_rate": 8.552988571194948e-06, "loss": 0.4036, "step": 102240 }, { "epoch": 3.6849028723826, "grad_norm": 0.2886956036090851, "learning_rate": 8.550790971512484e-06, "loss": 0.3484, "step": 102245 }, { "epoch": 3.685083072043825, "grad_norm": 0.26184308528900146, "learning_rate": 8.548593595947315e-06, "loss": 0.3872, "step": 102250 }, { "epoch": 3.685263271705049, "grad_norm": 0.21521615982055664, "learning_rate": 8.546396444529375e-06, "loss": 0.3831, "step": 102255 }, { "epoch": 3.685443471366274, "grad_norm": 0.2815273404121399, "learning_rate": 8.544199517288599e-06, "loss": 0.3915, "step": 102260 }, { "epoch": 3.6856236710274985, "grad_norm": 0.24685360491275787, "learning_rate": 8.542002814254918e-06, "loss": 0.3755, "step": 102265 }, { "epoch": 3.685803870688723, "grad_norm": 0.21425403654575348, "learning_rate": 8.539806335458253e-06, "loss": 0.3594, "step": 102270 }, { "epoch": 3.6859840703499476, "grad_norm": 0.20560602843761444, "learning_rate": 
8.537610080928548e-06, "loss": 0.3628, "step": 102275 }, { "epoch": 3.6861642700111723, "grad_norm": 0.2760855555534363, "learning_rate": 8.53541405069572e-06, "loss": 0.4007, "step": 102280 }, { "epoch": 3.686344469672397, "grad_norm": 0.24321669340133667, "learning_rate": 8.533218244789683e-06, "loss": 0.3586, "step": 102285 }, { "epoch": 3.6865246693336218, "grad_norm": 0.23994041979312897, "learning_rate": 8.531022663240366e-06, "loss": 0.3742, "step": 102290 }, { "epoch": 3.6867048689948465, "grad_norm": 0.23098506033420563, "learning_rate": 8.528827306077672e-06, "loss": 0.3835, "step": 102295 }, { "epoch": 3.686885068656071, "grad_norm": 0.21299470961093903, "learning_rate": 8.526632173331511e-06, "loss": 0.3864, "step": 102300 }, { "epoch": 3.6870652683172955, "grad_norm": 0.20803943276405334, "learning_rate": 8.524437265031815e-06, "loss": 0.4014, "step": 102305 }, { "epoch": 3.6872454679785203, "grad_norm": 0.26612281799316406, "learning_rate": 8.522242581208451e-06, "loss": 0.3805, "step": 102310 }, { "epoch": 3.687425667639745, "grad_norm": 0.26680296659469604, "learning_rate": 8.520048121891352e-06, "loss": 0.3989, "step": 102315 }, { "epoch": 3.6876058673009693, "grad_norm": 0.26344892382621765, "learning_rate": 8.517853887110408e-06, "loss": 0.3972, "step": 102320 }, { "epoch": 3.687786066962194, "grad_norm": 0.2039593607187271, "learning_rate": 8.515659876895504e-06, "loss": 0.3739, "step": 102325 }, { "epoch": 3.6879662666234188, "grad_norm": 0.24437233805656433, "learning_rate": 8.51346609127656e-06, "loss": 0.3616, "step": 102330 }, { "epoch": 3.6881464662846435, "grad_norm": 0.23033076524734497, "learning_rate": 8.51127253028344e-06, "loss": 0.4245, "step": 102335 }, { "epoch": 3.688326665945868, "grad_norm": 0.2611679136753082, "learning_rate": 8.509079193946032e-06, "loss": 0.3759, "step": 102340 }, { "epoch": 3.6885068656070925, "grad_norm": 0.19574002921581268, "learning_rate": 8.506886082294233e-06, "loss": 0.3695, "step": 102345 }, { 
"epoch": 3.6886870652683172, "grad_norm": 0.2866314947605133, "learning_rate": 8.504693195357921e-06, "loss": 0.3685, "step": 102350 }, { "epoch": 3.688867264929542, "grad_norm": 0.23495450615882874, "learning_rate": 8.502500533166968e-06, "loss": 0.4131, "step": 102355 }, { "epoch": 3.6890474645907667, "grad_norm": 0.2283242791891098, "learning_rate": 8.500308095751253e-06, "loss": 0.3596, "step": 102360 }, { "epoch": 3.689227664251991, "grad_norm": 0.3276098370552063, "learning_rate": 8.498115883140637e-06, "loss": 0.3671, "step": 102365 }, { "epoch": 3.6894078639132157, "grad_norm": 0.2699391543865204, "learning_rate": 8.495923895365004e-06, "loss": 0.3776, "step": 102370 }, { "epoch": 3.6895880635744405, "grad_norm": 0.2524622082710266, "learning_rate": 8.493732132454215e-06, "loss": 0.3558, "step": 102375 }, { "epoch": 3.689768263235665, "grad_norm": 0.26742005348205566, "learning_rate": 8.49154059443813e-06, "loss": 0.3457, "step": 102380 }, { "epoch": 3.68994846289689, "grad_norm": 0.2498975545167923, "learning_rate": 8.489349281346606e-06, "loss": 0.3697, "step": 102385 }, { "epoch": 3.6901286625581142, "grad_norm": 0.20344579219818115, "learning_rate": 8.487158193209497e-06, "loss": 0.3884, "step": 102390 }, { "epoch": 3.690308862219339, "grad_norm": 0.2646947205066681, "learning_rate": 8.484967330056665e-06, "loss": 0.3891, "step": 102395 }, { "epoch": 3.6904890618805637, "grad_norm": 0.23882755637168884, "learning_rate": 8.482776691917966e-06, "loss": 0.3553, "step": 102400 }, { "epoch": 3.6906692615417884, "grad_norm": 0.20374254882335663, "learning_rate": 8.480586278823219e-06, "loss": 0.3649, "step": 102405 }, { "epoch": 3.6908494612030127, "grad_norm": 0.18348126113414764, "learning_rate": 8.478396090802294e-06, "loss": 0.338, "step": 102410 }, { "epoch": 3.6910296608642374, "grad_norm": 0.22053396701812744, "learning_rate": 8.476206127885026e-06, "loss": 0.3812, "step": 102415 }, { "epoch": 3.691209860525462, "grad_norm": 0.19936566054821014, 
"learning_rate": 8.474016390101247e-06, "loss": 0.3908, "step": 102420 }, { "epoch": 3.691390060186687, "grad_norm": 0.20026947557926178, "learning_rate": 8.471826877480795e-06, "loss": 0.3673, "step": 102425 }, { "epoch": 3.6915702598479117, "grad_norm": 0.24228374660015106, "learning_rate": 8.469637590053497e-06, "loss": 0.3657, "step": 102430 }, { "epoch": 3.6917504595091364, "grad_norm": 0.2570672035217285, "learning_rate": 8.467448527849192e-06, "loss": 0.344, "step": 102435 }, { "epoch": 3.6919306591703607, "grad_norm": 0.255535364151001, "learning_rate": 8.4652596908977e-06, "loss": 0.377, "step": 102440 }, { "epoch": 3.6921108588315854, "grad_norm": 0.2862887680530548, "learning_rate": 8.463071079228846e-06, "loss": 0.3874, "step": 102445 }, { "epoch": 3.69229105849281, "grad_norm": 0.2560741901397705, "learning_rate": 8.460882692872446e-06, "loss": 0.3996, "step": 102450 }, { "epoch": 3.6924712581540344, "grad_norm": 0.1878066509962082, "learning_rate": 8.458694531858308e-06, "loss": 0.3576, "step": 102455 }, { "epoch": 3.692651457815259, "grad_norm": 0.2039957493543625, "learning_rate": 8.456506596216262e-06, "loss": 0.3603, "step": 102460 }, { "epoch": 3.692831657476484, "grad_norm": 0.2823259234428406, "learning_rate": 8.454318885976113e-06, "loss": 0.3855, "step": 102465 }, { "epoch": 3.6930118571377086, "grad_norm": 0.20816364884376526, "learning_rate": 8.452131401167665e-06, "loss": 0.3641, "step": 102470 }, { "epoch": 3.6931920567989334, "grad_norm": 0.248849555850029, "learning_rate": 8.449944141820723e-06, "loss": 0.3656, "step": 102475 }, { "epoch": 3.693372256460158, "grad_norm": 0.21117424964904785, "learning_rate": 8.447757107965088e-06, "loss": 0.3622, "step": 102480 }, { "epoch": 3.6935524561213824, "grad_norm": 0.21693001687526703, "learning_rate": 8.445570299630548e-06, "loss": 0.3607, "step": 102485 }, { "epoch": 3.693732655782607, "grad_norm": 0.24611179530620575, "learning_rate": 8.443383716846917e-06, "loss": 0.3802, "step": 102490 }, 
{ "epoch": 3.693912855443832, "grad_norm": 0.199144646525383, "learning_rate": 8.441197359643977e-06, "loss": 0.3146, "step": 102495 }, { "epoch": 3.694093055105056, "grad_norm": 0.2069302648305893, "learning_rate": 8.439011228051515e-06, "loss": 0.3344, "step": 102500 }, { "epoch": 3.694093055105056, "eval_loss": 0.4291780889034271, "eval_runtime": 3.5329, "eval_samples_per_second": 28.305, "eval_steps_per_second": 7.076, "step": 102500 }, { "epoch": 3.694273254766281, "grad_norm": 0.22869972884655, "learning_rate": 8.436825322099324e-06, "loss": 0.402, "step": 102505 }, { "epoch": 3.6944534544275056, "grad_norm": 0.21545152366161346, "learning_rate": 8.434639641817168e-06, "loss": 0.3588, "step": 102510 }, { "epoch": 3.6946336540887303, "grad_norm": 0.20983858406543732, "learning_rate": 8.43245418723486e-06, "loss": 0.3756, "step": 102515 }, { "epoch": 3.694813853749955, "grad_norm": 0.23540320992469788, "learning_rate": 8.430268958382146e-06, "loss": 0.3386, "step": 102520 }, { "epoch": 3.69499405341118, "grad_norm": 0.2099265158176422, "learning_rate": 8.428083955288801e-06, "loss": 0.3688, "step": 102525 }, { "epoch": 3.695174253072404, "grad_norm": 0.2085736095905304, "learning_rate": 8.425899177984611e-06, "loss": 0.3659, "step": 102530 }, { "epoch": 3.695354452733629, "grad_norm": 0.23389971256256104, "learning_rate": 8.423714626499338e-06, "loss": 0.355, "step": 102535 }, { "epoch": 3.6955346523948536, "grad_norm": 0.23065683245658875, "learning_rate": 8.421530300862743e-06, "loss": 0.3645, "step": 102540 }, { "epoch": 3.695714852056078, "grad_norm": 0.21496419608592987, "learning_rate": 8.419346201104588e-06, "loss": 0.3849, "step": 102545 }, { "epoch": 3.6958950517173026, "grad_norm": 0.22192232310771942, "learning_rate": 8.41716232725462e-06, "loss": 0.3783, "step": 102550 }, { "epoch": 3.6960752513785273, "grad_norm": 0.21217724680900574, "learning_rate": 8.414978679342617e-06, "loss": 0.3721, "step": 102555 }, { "epoch": 3.696255451039752, 
"grad_norm": 0.27816903591156006, "learning_rate": 8.412795257398318e-06, "loss": 0.3908, "step": 102560 }, { "epoch": 3.696435650700977, "grad_norm": 0.25314852595329285, "learning_rate": 8.410612061451473e-06, "loss": 0.3816, "step": 102565 }, { "epoch": 3.6966158503622015, "grad_norm": 0.25173550844192505, "learning_rate": 8.408429091531825e-06, "loss": 0.3694, "step": 102570 }, { "epoch": 3.696796050023426, "grad_norm": 0.2536860704421997, "learning_rate": 8.406246347669108e-06, "loss": 0.3842, "step": 102575 }, { "epoch": 3.6969762496846506, "grad_norm": 0.22247686982154846, "learning_rate": 8.404063829893083e-06, "loss": 0.4039, "step": 102580 }, { "epoch": 3.6971564493458753, "grad_norm": 0.20821170508861542, "learning_rate": 8.401881538233483e-06, "loss": 0.3945, "step": 102585 }, { "epoch": 3.6973366490071, "grad_norm": 0.252389132976532, "learning_rate": 8.399699472720019e-06, "loss": 0.3411, "step": 102590 }, { "epoch": 3.6975168486683243, "grad_norm": 0.2539133131504059, "learning_rate": 8.397517633382441e-06, "loss": 0.3972, "step": 102595 }, { "epoch": 3.697697048329549, "grad_norm": 0.17788057029247284, "learning_rate": 8.395336020250472e-06, "loss": 0.3556, "step": 102600 }, { "epoch": 3.6978772479907738, "grad_norm": 0.20301634073257446, "learning_rate": 8.393154633353825e-06, "loss": 0.3843, "step": 102605 }, { "epoch": 3.6980574476519985, "grad_norm": 0.2251247763633728, "learning_rate": 8.39097347272225e-06, "loss": 0.3663, "step": 102610 }, { "epoch": 3.6982376473132232, "grad_norm": 0.285604327917099, "learning_rate": 8.388792538385429e-06, "loss": 0.3637, "step": 102615 }, { "epoch": 3.6984178469744475, "grad_norm": 0.2661183178424835, "learning_rate": 8.3866118303731e-06, "loss": 0.4143, "step": 102620 }, { "epoch": 3.6985980466356723, "grad_norm": 0.2364463359117508, "learning_rate": 8.384431348714972e-06, "loss": 0.3785, "step": 102625 }, { "epoch": 3.698778246296897, "grad_norm": 0.21531599760055542, "learning_rate": 
8.382251093440747e-06, "loss": 0.3636, "step": 102630 }, { "epoch": 3.6989584459581217, "grad_norm": 0.23172800242900848, "learning_rate": 8.380071064580133e-06, "loss": 0.3799, "step": 102635 }, { "epoch": 3.699138645619346, "grad_norm": 0.2571316063404083, "learning_rate": 8.377891262162827e-06, "loss": 0.3758, "step": 102640 }, { "epoch": 3.6993188452805708, "grad_norm": 0.22325757145881653, "learning_rate": 8.37571168621854e-06, "loss": 0.3447, "step": 102645 }, { "epoch": 3.6994990449417955, "grad_norm": 0.2073674499988556, "learning_rate": 8.373532336776965e-06, "loss": 0.4042, "step": 102650 }, { "epoch": 3.6996792446030202, "grad_norm": 0.22034046053886414, "learning_rate": 8.371353213867792e-06, "loss": 0.386, "step": 102655 }, { "epoch": 3.699859444264245, "grad_norm": 0.23101651668548584, "learning_rate": 8.369174317520714e-06, "loss": 0.4023, "step": 102660 }, { "epoch": 3.7000396439254697, "grad_norm": 0.2687579095363617, "learning_rate": 8.366995647765413e-06, "loss": 0.3909, "step": 102665 }, { "epoch": 3.700219843586694, "grad_norm": 0.24219973385334015, "learning_rate": 8.364817204631569e-06, "loss": 0.3828, "step": 102670 }, { "epoch": 3.7004000432479187, "grad_norm": 0.2693396508693695, "learning_rate": 8.36263898814888e-06, "loss": 0.3629, "step": 102675 }, { "epoch": 3.7005802429091434, "grad_norm": 0.20992809534072876, "learning_rate": 8.36046099834701e-06, "loss": 0.3739, "step": 102680 }, { "epoch": 3.7007604425703677, "grad_norm": 0.22777998447418213, "learning_rate": 8.35828323525564e-06, "loss": 0.3974, "step": 102685 }, { "epoch": 3.7009406422315925, "grad_norm": 0.25267812609672546, "learning_rate": 8.35610569890444e-06, "loss": 0.3982, "step": 102690 }, { "epoch": 3.701120841892817, "grad_norm": 0.24532164633274078, "learning_rate": 8.353928389323064e-06, "loss": 0.3529, "step": 102695 }, { "epoch": 3.701301041554042, "grad_norm": 0.23484160006046295, "learning_rate": 8.351751306541215e-06, "loss": 0.3878, "step": 102700 }, { "epoch": 
3.7014812412152667, "grad_norm": 0.2185060977935791, "learning_rate": 8.349574450588518e-06, "loss": 0.3754, "step": 102705 }, { "epoch": 3.7016614408764914, "grad_norm": 0.28905272483825684, "learning_rate": 8.347397821494637e-06, "loss": 0.3616, "step": 102710 }, { "epoch": 3.7018416405377157, "grad_norm": 0.25226137042045593, "learning_rate": 8.345221419289247e-06, "loss": 0.387, "step": 102715 }, { "epoch": 3.7020218401989404, "grad_norm": 0.22827744483947754, "learning_rate": 8.343045244001982e-06, "loss": 0.3774, "step": 102720 }, { "epoch": 3.702202039860165, "grad_norm": 0.2841804623603821, "learning_rate": 8.340869295662517e-06, "loss": 0.4008, "step": 102725 }, { "epoch": 3.7023822395213895, "grad_norm": 0.2542993724346161, "learning_rate": 8.338693574300474e-06, "loss": 0.3553, "step": 102730 }, { "epoch": 3.702562439182614, "grad_norm": 0.21926385164260864, "learning_rate": 8.336518079945497e-06, "loss": 0.4037, "step": 102735 }, { "epoch": 3.702742638843839, "grad_norm": 0.2608276903629303, "learning_rate": 8.334342812627244e-06, "loss": 0.3645, "step": 102740 }, { "epoch": 3.7029228385050637, "grad_norm": 0.2661699950695038, "learning_rate": 8.332167772375344e-06, "loss": 0.3456, "step": 102745 }, { "epoch": 3.7031030381662884, "grad_norm": 0.18789133429527283, "learning_rate": 8.32999295921943e-06, "loss": 0.3741, "step": 102750 }, { "epoch": 3.703283237827513, "grad_norm": 0.276394248008728, "learning_rate": 8.327818373189133e-06, "loss": 0.3662, "step": 102755 }, { "epoch": 3.7034634374887374, "grad_norm": 0.20188158750534058, "learning_rate": 8.325644014314077e-06, "loss": 0.3959, "step": 102760 }, { "epoch": 3.703643637149962, "grad_norm": 0.22388440370559692, "learning_rate": 8.323469882623899e-06, "loss": 0.3678, "step": 102765 }, { "epoch": 3.703823836811187, "grad_norm": 0.2965695559978485, "learning_rate": 8.321295978148217e-06, "loss": 0.3772, "step": 102770 }, { "epoch": 3.704004036472411, "grad_norm": 0.2891329824924469, "learning_rate": 
8.319122300916649e-06, "loss": 0.3989, "step": 102775 }, { "epoch": 3.704184236133636, "grad_norm": 0.2619994580745697, "learning_rate": 8.316948850958809e-06, "loss": 0.3902, "step": 102780 }, { "epoch": 3.7043644357948606, "grad_norm": 0.24362410604953766, "learning_rate": 8.31477562830431e-06, "loss": 0.3854, "step": 102785 }, { "epoch": 3.7045446354560854, "grad_norm": 0.27109575271606445, "learning_rate": 8.312602632982756e-06, "loss": 0.373, "step": 102790 }, { "epoch": 3.70472483511731, "grad_norm": 0.24202807247638702, "learning_rate": 8.310429865023775e-06, "loss": 0.3879, "step": 102795 }, { "epoch": 3.704905034778535, "grad_norm": 0.245781809091568, "learning_rate": 8.308257324456942e-06, "loss": 0.3847, "step": 102800 }, { "epoch": 3.705085234439759, "grad_norm": 0.2280445694923401, "learning_rate": 8.306085011311878e-06, "loss": 0.3645, "step": 102805 }, { "epoch": 3.705265434100984, "grad_norm": 0.2140251249074936, "learning_rate": 8.303912925618174e-06, "loss": 0.3751, "step": 102810 }, { "epoch": 3.7054456337622086, "grad_norm": 0.3311702311038971, "learning_rate": 8.301741067405424e-06, "loss": 0.3822, "step": 102815 }, { "epoch": 3.7056258334234333, "grad_norm": 0.2583707273006439, "learning_rate": 8.29956943670322e-06, "loss": 0.3744, "step": 102820 }, { "epoch": 3.7058060330846576, "grad_norm": 0.2845836877822876, "learning_rate": 8.297398033541137e-06, "loss": 0.389, "step": 102825 }, { "epoch": 3.7059862327458823, "grad_norm": 0.23895393311977386, "learning_rate": 8.295226857948785e-06, "loss": 0.3877, "step": 102830 }, { "epoch": 3.706166432407107, "grad_norm": 0.25052353739738464, "learning_rate": 8.29305590995573e-06, "loss": 0.4122, "step": 102835 }, { "epoch": 3.706346632068332, "grad_norm": 0.23934602737426758, "learning_rate": 8.290885189591555e-06, "loss": 0.4222, "step": 102840 }, { "epoch": 3.7065268317295565, "grad_norm": 0.24071356654167175, "learning_rate": 8.288714696885835e-06, "loss": 0.3534, "step": 102845 }, { "epoch": 
3.706707031390781, "grad_norm": 0.250177264213562, "learning_rate": 8.28654443186814e-06, "loss": 0.3517, "step": 102850 }, { "epoch": 3.7068872310520056, "grad_norm": 0.25618675351142883, "learning_rate": 8.284374394568034e-06, "loss": 0.3571, "step": 102855 }, { "epoch": 3.7070674307132303, "grad_norm": 0.2533912658691406, "learning_rate": 8.282204585015098e-06, "loss": 0.384, "step": 102860 }, { "epoch": 3.707247630374455, "grad_norm": 0.24974510073661804, "learning_rate": 8.280035003238889e-06, "loss": 0.397, "step": 102865 }, { "epoch": 3.7074278300356793, "grad_norm": 0.2762242555618286, "learning_rate": 8.277865649268965e-06, "loss": 0.3575, "step": 102870 }, { "epoch": 3.707608029696904, "grad_norm": 0.28377920389175415, "learning_rate": 8.275696523134885e-06, "loss": 0.3622, "step": 102875 }, { "epoch": 3.707788229358129, "grad_norm": 0.2506207525730133, "learning_rate": 8.273527624866192e-06, "loss": 0.4064, "step": 102880 }, { "epoch": 3.7079684290193535, "grad_norm": 0.27845442295074463, "learning_rate": 8.271358954492458e-06, "loss": 0.3983, "step": 102885 }, { "epoch": 3.7081486286805783, "grad_norm": 0.1891564130783081, "learning_rate": 8.269190512043226e-06, "loss": 0.3888, "step": 102890 }, { "epoch": 3.7083288283418026, "grad_norm": 0.21435974538326263, "learning_rate": 8.267022297548016e-06, "loss": 0.3931, "step": 102895 }, { "epoch": 3.7085090280030273, "grad_norm": 0.2660645842552185, "learning_rate": 8.264854311036399e-06, "loss": 0.3827, "step": 102900 }, { "epoch": 3.708689227664252, "grad_norm": 0.23603610694408417, "learning_rate": 8.262686552537894e-06, "loss": 0.3614, "step": 102905 }, { "epoch": 3.7088694273254768, "grad_norm": 0.24526463449001312, "learning_rate": 8.260519022082058e-06, "loss": 0.3742, "step": 102910 }, { "epoch": 3.709049626986701, "grad_norm": 0.24888326227664948, "learning_rate": 8.258351719698401e-06, "loss": 0.3826, "step": 102915 }, { "epoch": 3.7092298266479258, "grad_norm": 0.2145439237356186, "learning_rate": 
8.256184645416453e-06, "loss": 0.3911, "step": 102920 }, { "epoch": 3.7094100263091505, "grad_norm": 0.2232615351676941, "learning_rate": 8.254017799265757e-06, "loss": 0.3904, "step": 102925 }, { "epoch": 3.7095902259703752, "grad_norm": 0.2325001209974289, "learning_rate": 8.251851181275824e-06, "loss": 0.3509, "step": 102930 }, { "epoch": 3.7097704256316, "grad_norm": 0.2643667459487915, "learning_rate": 8.249684791476178e-06, "loss": 0.3894, "step": 102935 }, { "epoch": 3.7099506252928247, "grad_norm": 0.22574815154075623, "learning_rate": 8.247518629896334e-06, "loss": 0.3611, "step": 102940 }, { "epoch": 3.710130824954049, "grad_norm": 0.26316380500793457, "learning_rate": 8.245352696565797e-06, "loss": 0.3905, "step": 102945 }, { "epoch": 3.7103110246152737, "grad_norm": 0.20727650821208954, "learning_rate": 8.243186991514092e-06, "loss": 0.3788, "step": 102950 }, { "epoch": 3.7104912242764985, "grad_norm": 0.24111421406269073, "learning_rate": 8.241021514770721e-06, "loss": 0.364, "step": 102955 }, { "epoch": 3.7106714239377228, "grad_norm": 0.21225392818450928, "learning_rate": 8.238856266365188e-06, "loss": 0.3928, "step": 102960 }, { "epoch": 3.7108516235989475, "grad_norm": 0.27659305930137634, "learning_rate": 8.236691246326994e-06, "loss": 0.3714, "step": 102965 }, { "epoch": 3.7110318232601722, "grad_norm": 0.26336660981178284, "learning_rate": 8.234526454685634e-06, "loss": 0.3199, "step": 102970 }, { "epoch": 3.711212022921397, "grad_norm": 0.2715068459510803, "learning_rate": 8.232361891470599e-06, "loss": 0.3795, "step": 102975 }, { "epoch": 3.7113922225826217, "grad_norm": 0.2276494801044464, "learning_rate": 8.230197556711403e-06, "loss": 0.3569, "step": 102980 }, { "epoch": 3.7115724222438464, "grad_norm": 0.25353577733039856, "learning_rate": 8.228033450437503e-06, "loss": 0.406, "step": 102985 }, { "epoch": 3.7117526219050707, "grad_norm": 0.20923548936843872, "learning_rate": 8.225869572678405e-06, "loss": 0.3721, "step": 102990 }, { 
"epoch": 3.7119328215662954, "grad_norm": 0.2902141809463501, "learning_rate": 8.22370592346359e-06, "loss": 0.4074, "step": 102995 }, { "epoch": 3.71211302122752, "grad_norm": 0.26280632615089417, "learning_rate": 8.221542502822533e-06, "loss": 0.3569, "step": 103000 }, { "epoch": 3.71211302122752, "eval_loss": 0.42930299043655396, "eval_runtime": 3.5329, "eval_samples_per_second": 28.306, "eval_steps_per_second": 7.076, "step": 103000 }, { "epoch": 3.7122932208887445, "grad_norm": 0.24260425567626953, "learning_rate": 8.219379310784708e-06, "loss": 0.3687, "step": 103005 }, { "epoch": 3.712473420549969, "grad_norm": 0.2277086079120636, "learning_rate": 8.217216347379594e-06, "loss": 0.392, "step": 103010 }, { "epoch": 3.712653620211194, "grad_norm": 0.28349313139915466, "learning_rate": 8.21505361263665e-06, "loss": 0.3905, "step": 103015 }, { "epoch": 3.7128338198724187, "grad_norm": 0.2919948399066925, "learning_rate": 8.212891106585357e-06, "loss": 0.3954, "step": 103020 }, { "epoch": 3.7130140195336434, "grad_norm": 0.20646890997886658, "learning_rate": 8.210728829255173e-06, "loss": 0.3824, "step": 103025 }, { "epoch": 3.713194219194868, "grad_norm": 0.2770639657974243, "learning_rate": 8.208566780675561e-06, "loss": 0.3688, "step": 103030 }, { "epoch": 3.7133744188560924, "grad_norm": 0.31089577078819275, "learning_rate": 8.206404960875972e-06, "loss": 0.3768, "step": 103035 }, { "epoch": 3.713554618517317, "grad_norm": 0.20020338892936707, "learning_rate": 8.204243369885859e-06, "loss": 0.3446, "step": 103040 }, { "epoch": 3.713734818178542, "grad_norm": 0.25456637144088745, "learning_rate": 8.202082007734684e-06, "loss": 0.3762, "step": 103045 }, { "epoch": 3.713915017839766, "grad_norm": 0.23088417947292328, "learning_rate": 8.19992087445189e-06, "loss": 0.3943, "step": 103050 }, { "epoch": 3.714095217500991, "grad_norm": 0.26196154952049255, "learning_rate": 8.197759970066923e-06, "loss": 0.4015, "step": 103055 }, { "epoch": 3.7142754171622157, 
"grad_norm": 0.21619191765785217, "learning_rate": 8.195599294609222e-06, "loss": 0.3914, "step": 103060 }, { "epoch": 3.7144556168234404, "grad_norm": 0.29650604724884033, "learning_rate": 8.19343884810822e-06, "loss": 0.3818, "step": 103065 }, { "epoch": 3.714635816484665, "grad_norm": 0.26321521401405334, "learning_rate": 8.191278630593367e-06, "loss": 0.3734, "step": 103070 }, { "epoch": 3.71481601614589, "grad_norm": 0.2702058255672455, "learning_rate": 8.189118642094095e-06, "loss": 0.3759, "step": 103075 }, { "epoch": 3.714996215807114, "grad_norm": 0.2815110683441162, "learning_rate": 8.186958882639813e-06, "loss": 0.3757, "step": 103080 }, { "epoch": 3.715176415468339, "grad_norm": 0.28946641087532043, "learning_rate": 8.18479935225997e-06, "loss": 0.3692, "step": 103085 }, { "epoch": 3.7153566151295636, "grad_norm": 0.3027191758155823, "learning_rate": 8.18264005098397e-06, "loss": 0.3738, "step": 103090 }, { "epoch": 3.7155368147907883, "grad_norm": 0.22006292641162872, "learning_rate": 8.18048097884126e-06, "loss": 0.398, "step": 103095 }, { "epoch": 3.7157170144520126, "grad_norm": 0.27258020639419556, "learning_rate": 8.178322135861233e-06, "loss": 0.3971, "step": 103100 }, { "epoch": 3.7158972141132374, "grad_norm": 0.22416990995407104, "learning_rate": 8.176163522073302e-06, "loss": 0.3712, "step": 103105 }, { "epoch": 3.716077413774462, "grad_norm": 0.22454127669334412, "learning_rate": 8.174005137506894e-06, "loss": 0.3709, "step": 103110 }, { "epoch": 3.716257613435687, "grad_norm": 0.21970023214817047, "learning_rate": 8.171846982191409e-06, "loss": 0.3468, "step": 103115 }, { "epoch": 3.7164378130969116, "grad_norm": 0.23587019741535187, "learning_rate": 8.169689056156249e-06, "loss": 0.3688, "step": 103120 }, { "epoch": 3.716618012758136, "grad_norm": 0.2582648992538452, "learning_rate": 8.167531359430815e-06, "loss": 0.3482, "step": 103125 }, { "epoch": 3.7167982124193606, "grad_norm": 0.23627053201198578, "learning_rate": 
8.165373892044503e-06, "loss": 0.3601, "step": 103130 }, { "epoch": 3.7169784120805853, "grad_norm": 0.2360253483057022, "learning_rate": 8.163216654026721e-06, "loss": 0.3759, "step": 103135 }, { "epoch": 3.71715861174181, "grad_norm": 0.32699692249298096, "learning_rate": 8.16105964540685e-06, "loss": 0.4162, "step": 103140 }, { "epoch": 3.7173388114030343, "grad_norm": 0.2977791726589203, "learning_rate": 8.158902866214282e-06, "loss": 0.3725, "step": 103145 }, { "epoch": 3.717519011064259, "grad_norm": 0.29648059606552124, "learning_rate": 8.156746316478403e-06, "loss": 0.412, "step": 103150 }, { "epoch": 3.717699210725484, "grad_norm": 0.2452070415019989, "learning_rate": 8.154589996228595e-06, "loss": 0.3724, "step": 103155 }, { "epoch": 3.7178794103867085, "grad_norm": 0.23447571694850922, "learning_rate": 8.152433905494228e-06, "loss": 0.3573, "step": 103160 }, { "epoch": 3.7180596100479333, "grad_norm": 0.22790288925170898, "learning_rate": 8.150278044304702e-06, "loss": 0.3927, "step": 103165 }, { "epoch": 3.718239809709158, "grad_norm": 0.24579019844532013, "learning_rate": 8.14812241268936e-06, "loss": 0.3558, "step": 103170 }, { "epoch": 3.7184200093703823, "grad_norm": 0.2221144735813141, "learning_rate": 8.145967010677597e-06, "loss": 0.3759, "step": 103175 }, { "epoch": 3.718600209031607, "grad_norm": 0.2838151454925537, "learning_rate": 8.143811838298767e-06, "loss": 0.3491, "step": 103180 }, { "epoch": 3.7187804086928318, "grad_norm": 0.20898236334323883, "learning_rate": 8.141656895582233e-06, "loss": 0.3515, "step": 103185 }, { "epoch": 3.718960608354056, "grad_norm": 0.2016228437423706, "learning_rate": 8.139502182557373e-06, "loss": 0.3518, "step": 103190 }, { "epoch": 3.719140808015281, "grad_norm": 0.29854917526245117, "learning_rate": 8.137347699253526e-06, "loss": 0.3839, "step": 103195 }, { "epoch": 3.7193210076765055, "grad_norm": 0.24756811559200287, "learning_rate": 8.135193445700043e-06, "loss": 0.3774, "step": 103200 }, { "epoch": 
3.7195012073377303, "grad_norm": 0.25717464089393616, "learning_rate": 8.133039421926291e-06, "loss": 0.3913, "step": 103205 }, { "epoch": 3.719681406998955, "grad_norm": 0.216623917222023, "learning_rate": 8.130885627961612e-06, "loss": 0.401, "step": 103210 }, { "epoch": 3.7198616066601797, "grad_norm": 0.2196447253227234, "learning_rate": 8.12873206383535e-06, "loss": 0.3996, "step": 103215 }, { "epoch": 3.720041806321404, "grad_norm": 0.17287389934062958, "learning_rate": 8.126578729576851e-06, "loss": 0.3962, "step": 103220 }, { "epoch": 3.7202220059826288, "grad_norm": 0.23201312124729156, "learning_rate": 8.124425625215437e-06, "loss": 0.3583, "step": 103225 }, { "epoch": 3.7204022056438535, "grad_norm": 0.21326382458209991, "learning_rate": 8.122272750780465e-06, "loss": 0.38, "step": 103230 }, { "epoch": 3.7205824053050778, "grad_norm": 0.2922148108482361, "learning_rate": 8.120120106301263e-06, "loss": 0.3748, "step": 103235 }, { "epoch": 3.7207626049663025, "grad_norm": 0.22092339396476746, "learning_rate": 8.117967691807155e-06, "loss": 0.4146, "step": 103240 }, { "epoch": 3.7209428046275272, "grad_norm": 0.2587362825870514, "learning_rate": 8.115815507327465e-06, "loss": 0.3996, "step": 103245 }, { "epoch": 3.721123004288752, "grad_norm": 0.2722143232822418, "learning_rate": 8.113663552891516e-06, "loss": 0.4322, "step": 103250 }, { "epoch": 3.7213032039499767, "grad_norm": 0.21530699729919434, "learning_rate": 8.11151182852864e-06, "loss": 0.3856, "step": 103255 }, { "epoch": 3.7214834036112014, "grad_norm": 0.22060318291187286, "learning_rate": 8.10936033426815e-06, "loss": 0.3657, "step": 103260 }, { "epoch": 3.7216636032724257, "grad_norm": 0.36303722858428955, "learning_rate": 8.107209070139341e-06, "loss": 0.4083, "step": 103265 }, { "epoch": 3.7218438029336505, "grad_norm": 0.189983531832695, "learning_rate": 8.105058036171547e-06, "loss": 0.3606, "step": 103270 }, { "epoch": 3.722024002594875, "grad_norm": 0.2879542410373688, "learning_rate": 
8.102907232394057e-06, "loss": 0.3928, "step": 103275 }, { "epoch": 3.7222042022560995, "grad_norm": 0.2692555785179138, "learning_rate": 8.100756658836202e-06, "loss": 0.3752, "step": 103280 }, { "epoch": 3.7223844019173242, "grad_norm": 0.298684298992157, "learning_rate": 8.098606315527258e-06, "loss": 0.3915, "step": 103285 }, { "epoch": 3.722564601578549, "grad_norm": 0.2624755799770355, "learning_rate": 8.096456202496518e-06, "loss": 0.3932, "step": 103290 }, { "epoch": 3.7227448012397737, "grad_norm": 0.25200989842414856, "learning_rate": 8.094306319773304e-06, "loss": 0.3696, "step": 103295 }, { "epoch": 3.7229250009009984, "grad_norm": 0.26927831768989563, "learning_rate": 8.092156667386891e-06, "loss": 0.3891, "step": 103300 }, { "epoch": 3.723105200562223, "grad_norm": 0.21889813244342804, "learning_rate": 8.090007245366567e-06, "loss": 0.3829, "step": 103305 }, { "epoch": 3.7232854002234474, "grad_norm": 0.24276183545589447, "learning_rate": 8.087858053741626e-06, "loss": 0.3566, "step": 103310 }, { "epoch": 3.723465599884672, "grad_norm": 0.2647343873977661, "learning_rate": 8.085709092541332e-06, "loss": 0.3916, "step": 103315 }, { "epoch": 3.723645799545897, "grad_norm": 0.23813170194625854, "learning_rate": 8.083560361794988e-06, "loss": 0.3831, "step": 103320 }, { "epoch": 3.7238259992071217, "grad_norm": 0.21774239838123322, "learning_rate": 8.081411861531856e-06, "loss": 0.3904, "step": 103325 }, { "epoch": 3.724006198868346, "grad_norm": 0.2522357106208801, "learning_rate": 8.079263591781213e-06, "loss": 0.3394, "step": 103330 }, { "epoch": 3.7241863985295707, "grad_norm": 0.2412281036376953, "learning_rate": 8.077115552572328e-06, "loss": 0.3927, "step": 103335 }, { "epoch": 3.7243665981907954, "grad_norm": 0.20674298703670502, "learning_rate": 8.074967743934466e-06, "loss": 0.3638, "step": 103340 }, { "epoch": 3.72454679785202, "grad_norm": 0.2086464762687683, "learning_rate": 8.072820165896886e-06, "loss": 0.3662, "step": 103345 }, { "epoch": 
3.724726997513245, "grad_norm": 0.24110788106918335, "learning_rate": 8.07067281848886e-06, "loss": 0.4261, "step": 103350 }, { "epoch": 3.724907197174469, "grad_norm": 0.2587282955646515, "learning_rate": 8.06852570173964e-06, "loss": 0.3886, "step": 103355 }, { "epoch": 3.725087396835694, "grad_norm": 0.20811423659324646, "learning_rate": 8.06637881567848e-06, "loss": 0.3289, "step": 103360 }, { "epoch": 3.7252675964969186, "grad_norm": 0.19246096909046173, "learning_rate": 8.064232160334632e-06, "loss": 0.351, "step": 103365 }, { "epoch": 3.7254477961581434, "grad_norm": 0.2847633361816406, "learning_rate": 8.062085735737329e-06, "loss": 0.3723, "step": 103370 }, { "epoch": 3.7256279958193677, "grad_norm": 0.21437424421310425, "learning_rate": 8.059939541915847e-06, "loss": 0.3805, "step": 103375 }, { "epoch": 3.7258081954805924, "grad_norm": 0.23143932223320007, "learning_rate": 8.057793578899403e-06, "loss": 0.3444, "step": 103380 }, { "epoch": 3.725988395141817, "grad_norm": 0.18867234885692596, "learning_rate": 8.055647846717232e-06, "loss": 0.367, "step": 103385 }, { "epoch": 3.726168594803042, "grad_norm": 0.22278887033462524, "learning_rate": 8.053502345398583e-06, "loss": 0.3791, "step": 103390 }, { "epoch": 3.7263487944642666, "grad_norm": 0.2264564037322998, "learning_rate": 8.051357074972688e-06, "loss": 0.3769, "step": 103395 }, { "epoch": 3.726528994125491, "grad_norm": 0.3085385262966156, "learning_rate": 8.04921203546877e-06, "loss": 0.3945, "step": 103400 }, { "epoch": 3.7267091937867156, "grad_norm": 0.1784570962190628, "learning_rate": 8.047067226916058e-06, "loss": 0.3429, "step": 103405 }, { "epoch": 3.7268893934479403, "grad_norm": 0.1708642989397049, "learning_rate": 8.044922649343762e-06, "loss": 0.3599, "step": 103410 }, { "epoch": 3.727069593109165, "grad_norm": 0.30440521240234375, "learning_rate": 8.042778302781123e-06, "loss": 0.4162, "step": 103415 }, { "epoch": 3.7272497927703894, "grad_norm": 0.28747257590293884, "learning_rate": 
8.040634187257345e-06, "loss": 0.3721, "step": 103420 }, { "epoch": 3.727429992431614, "grad_norm": 0.24325698614120483, "learning_rate": 8.038490302801641e-06, "loss": 0.3838, "step": 103425 }, { "epoch": 3.727610192092839, "grad_norm": 0.22188496589660645, "learning_rate": 8.036346649443224e-06, "loss": 0.3684, "step": 103430 }, { "epoch": 3.7277903917540636, "grad_norm": 0.23214270174503326, "learning_rate": 8.034203227211292e-06, "loss": 0.3609, "step": 103435 }, { "epoch": 3.7279705914152883, "grad_norm": 0.2736099660396576, "learning_rate": 8.032060036135067e-06, "loss": 0.3663, "step": 103440 }, { "epoch": 3.728150791076513, "grad_norm": 0.26921337842941284, "learning_rate": 8.029917076243745e-06, "loss": 0.3804, "step": 103445 }, { "epoch": 3.7283309907377373, "grad_norm": 0.25673896074295044, "learning_rate": 8.027774347566499e-06, "loss": 0.3767, "step": 103450 }, { "epoch": 3.728511190398962, "grad_norm": 0.288904070854187, "learning_rate": 8.025631850132554e-06, "loss": 0.3969, "step": 103455 }, { "epoch": 3.728691390060187, "grad_norm": 0.23645661771297455, "learning_rate": 8.023489583971077e-06, "loss": 0.3695, "step": 103460 }, { "epoch": 3.728871589721411, "grad_norm": 0.24535304307937622, "learning_rate": 8.021347549111277e-06, "loss": 0.4008, "step": 103465 }, { "epoch": 3.729051789382636, "grad_norm": 0.25152429938316345, "learning_rate": 8.019205745582337e-06, "loss": 0.3894, "step": 103470 }, { "epoch": 3.7292319890438606, "grad_norm": 0.2640352249145508, "learning_rate": 8.017064173413416e-06, "loss": 0.3442, "step": 103475 }, { "epoch": 3.7294121887050853, "grad_norm": 0.22095341980457306, "learning_rate": 8.015351082277115e-06, "loss": 0.3711, "step": 103480 }, { "epoch": 3.72959238836631, "grad_norm": 0.23479041457176208, "learning_rate": 8.013209926629791e-06, "loss": 0.3823, "step": 103485 }, { "epoch": 3.7297725880275348, "grad_norm": 0.2575128972530365, "learning_rate": 8.011069002424194e-06, "loss": 0.3876, "step": 103490 }, { "epoch": 
3.729952787688759, "grad_norm": 0.25867316126823425, "learning_rate": 8.00892830968949e-06, "loss": 0.3505, "step": 103495 }, { "epoch": 3.7301329873499838, "grad_norm": 0.24454909563064575, "learning_rate": 8.00678784845485e-06, "loss": 0.4008, "step": 103500 }, { "epoch": 3.7301329873499838, "eval_loss": 0.4287916421890259, "eval_runtime": 3.5358, "eval_samples_per_second": 28.282, "eval_steps_per_second": 7.071, "step": 103500 }, { "epoch": 3.7303131870112085, "grad_norm": 0.2569969892501831, "learning_rate": 8.004647618749424e-06, "loss": 0.3734, "step": 103505 }, { "epoch": 3.730493386672433, "grad_norm": 0.26036691665649414, "learning_rate": 8.002507620602398e-06, "loss": 0.3745, "step": 103510 }, { "epoch": 3.7306735863336575, "grad_norm": 0.2165229618549347, "learning_rate": 8.000367854042911e-06, "loss": 0.3696, "step": 103515 }, { "epoch": 3.7308537859948823, "grad_norm": 0.20541858673095703, "learning_rate": 7.99822831910012e-06, "loss": 0.3753, "step": 103520 }, { "epoch": 3.731033985656107, "grad_norm": 0.23357829451560974, "learning_rate": 7.99608901580318e-06, "loss": 0.3534, "step": 103525 }, { "epoch": 3.7312141853173317, "grad_norm": 0.2972395420074463, "learning_rate": 7.993949944181223e-06, "loss": 0.3948, "step": 103530 }, { "epoch": 3.7313943849785565, "grad_norm": 0.22406528890132904, "learning_rate": 7.991811104263417e-06, "loss": 0.3747, "step": 103535 }, { "epoch": 3.7315745846397808, "grad_norm": 0.21077711880207062, "learning_rate": 7.989672496078899e-06, "loss": 0.3868, "step": 103540 }, { "epoch": 3.7317547843010055, "grad_norm": 0.2398710399866104, "learning_rate": 7.98753411965678e-06, "loss": 0.3643, "step": 103545 }, { "epoch": 3.7319349839622302, "grad_norm": 0.29205483198165894, "learning_rate": 7.985395975026227e-06, "loss": 0.3413, "step": 103550 }, { "epoch": 3.7321151836234545, "grad_norm": 0.24142169952392578, "learning_rate": 7.983258062216361e-06, "loss": 0.3876, "step": 103555 }, { "epoch": 3.7322953832846792, 
"grad_norm": 0.2126045972108841, "learning_rate": 7.981120381256308e-06, "loss": 0.3878, "step": 103560 }, { "epoch": 3.732475582945904, "grad_norm": 0.21912820637226105, "learning_rate": 7.978982932175195e-06, "loss": 0.3782, "step": 103565 }, { "epoch": 3.7326557826071287, "grad_norm": 0.3162360191345215, "learning_rate": 7.976845715002131e-06, "loss": 0.4018, "step": 103570 }, { "epoch": 3.7328359822683534, "grad_norm": 0.20726588368415833, "learning_rate": 7.974708729766262e-06, "loss": 0.3311, "step": 103575 }, { "epoch": 3.733016181929578, "grad_norm": 0.30441704392433167, "learning_rate": 7.97257197649669e-06, "loss": 0.3811, "step": 103580 }, { "epoch": 3.7331963815908025, "grad_norm": 0.23164492845535278, "learning_rate": 7.970435455222528e-06, "loss": 0.3566, "step": 103585 }, { "epoch": 3.733376581252027, "grad_norm": 0.26288750767707825, "learning_rate": 7.968299165972884e-06, "loss": 0.3842, "step": 103590 }, { "epoch": 3.733556780913252, "grad_norm": 0.23692747950553894, "learning_rate": 7.966163108776866e-06, "loss": 0.3784, "step": 103595 }, { "epoch": 3.7337369805744767, "grad_norm": 0.19637106359004974, "learning_rate": 7.964027283663573e-06, "loss": 0.3741, "step": 103600 }, { "epoch": 3.733917180235701, "grad_norm": 0.2366388440132141, "learning_rate": 7.961891690662115e-06, "loss": 0.3927, "step": 103605 }, { "epoch": 3.7340973798969257, "grad_norm": 0.24893350899219513, "learning_rate": 7.959756329801588e-06, "loss": 0.3664, "step": 103610 }, { "epoch": 3.7342775795581504, "grad_norm": 0.26749271154403687, "learning_rate": 7.957621201111079e-06, "loss": 0.3876, "step": 103615 }, { "epoch": 3.734457779219375, "grad_norm": 0.23498766124248505, "learning_rate": 7.95548630461968e-06, "loss": 0.3765, "step": 103620 }, { "epoch": 3.7346379788806, "grad_norm": 0.23466086387634277, "learning_rate": 7.953351640356475e-06, "loss": 0.3897, "step": 103625 }, { "epoch": 3.734818178541824, "grad_norm": 0.27381378412246704, "learning_rate": 
7.951217208350567e-06, "loss": 0.3869, "step": 103630 }, { "epoch": 3.734998378203049, "grad_norm": 0.23438720405101776, "learning_rate": 7.94908300863102e-06, "loss": 0.3729, "step": 103635 }, { "epoch": 3.7351785778642737, "grad_norm": 0.2471228539943695, "learning_rate": 7.946949041226904e-06, "loss": 0.4086, "step": 103640 }, { "epoch": 3.7353587775254984, "grad_norm": 0.2612572908401489, "learning_rate": 7.944815306167314e-06, "loss": 0.3758, "step": 103645 }, { "epoch": 3.7355389771867227, "grad_norm": 0.31286609172821045, "learning_rate": 7.942681803481309e-06, "loss": 0.3795, "step": 103650 }, { "epoch": 3.7357191768479474, "grad_norm": 0.2046850472688675, "learning_rate": 7.940548533197973e-06, "loss": 0.3579, "step": 103655 }, { "epoch": 3.735899376509172, "grad_norm": 0.22326718270778656, "learning_rate": 7.938415495346354e-06, "loss": 0.3635, "step": 103660 }, { "epoch": 3.736079576170397, "grad_norm": 0.27218303084373474, "learning_rate": 7.936282689955515e-06, "loss": 0.3704, "step": 103665 }, { "epoch": 3.7362597758316216, "grad_norm": 0.2567422688007355, "learning_rate": 7.934150117054528e-06, "loss": 0.3358, "step": 103670 }, { "epoch": 3.7364399754928463, "grad_norm": 0.21654558181762695, "learning_rate": 7.93201777667244e-06, "loss": 0.3797, "step": 103675 }, { "epoch": 3.7366201751540706, "grad_norm": 0.26781266927719116, "learning_rate": 7.929885668838305e-06, "loss": 0.3711, "step": 103680 }, { "epoch": 3.7368003748152954, "grad_norm": 0.23572920262813568, "learning_rate": 7.927753793581174e-06, "loss": 0.3858, "step": 103685 }, { "epoch": 3.73698057447652, "grad_norm": 0.2272387593984604, "learning_rate": 7.925622150930085e-06, "loss": 0.4051, "step": 103690 }, { "epoch": 3.7371607741377444, "grad_norm": 0.24096395075321198, "learning_rate": 7.923490740914097e-06, "loss": 0.3699, "step": 103695 }, { "epoch": 3.737340973798969, "grad_norm": 0.2557925581932068, "learning_rate": 7.92135956356224e-06, "loss": 0.4079, "step": 103700 }, { "epoch": 
3.737521173460194, "grad_norm": 0.2199949324131012, "learning_rate": 7.919228618903551e-06, "loss": 0.3751, "step": 103705 }, { "epoch": 3.7377013731214186, "grad_norm": 0.2655465602874756, "learning_rate": 7.91709790696707e-06, "loss": 0.4015, "step": 103710 }, { "epoch": 3.7378815727826433, "grad_norm": 0.25117290019989014, "learning_rate": 7.91496742778181e-06, "loss": 0.3899, "step": 103715 }, { "epoch": 3.738061772443868, "grad_norm": 0.30154553055763245, "learning_rate": 7.91283718137682e-06, "loss": 0.424, "step": 103720 }, { "epoch": 3.7382419721050923, "grad_norm": 0.27134832739830017, "learning_rate": 7.910707167781126e-06, "loss": 0.3404, "step": 103725 }, { "epoch": 3.738422171766317, "grad_norm": 0.2242002934217453, "learning_rate": 7.908577387023719e-06, "loss": 0.38, "step": 103730 }, { "epoch": 3.738602371427542, "grad_norm": 0.2798958122730255, "learning_rate": 7.906447839133643e-06, "loss": 0.3884, "step": 103735 }, { "epoch": 3.738782571088766, "grad_norm": 0.2313423752784729, "learning_rate": 7.90431852413991e-06, "loss": 0.3852, "step": 103740 }, { "epoch": 3.738962770749991, "grad_norm": 0.24077194929122925, "learning_rate": 7.902189442071525e-06, "loss": 0.3847, "step": 103745 }, { "epoch": 3.7391429704112156, "grad_norm": 0.24760626256465912, "learning_rate": 7.900060592957498e-06, "loss": 0.3997, "step": 103750 }, { "epoch": 3.7393231700724403, "grad_norm": 0.19511263072490692, "learning_rate": 7.897931976826827e-06, "loss": 0.3953, "step": 103755 }, { "epoch": 3.739503369733665, "grad_norm": 0.1964816004037857, "learning_rate": 7.895803593708532e-06, "loss": 0.3628, "step": 103760 }, { "epoch": 3.7396835693948898, "grad_norm": 0.2270769327878952, "learning_rate": 7.893675443631599e-06, "loss": 0.3912, "step": 103765 }, { "epoch": 3.739863769056114, "grad_norm": 0.21808400750160217, "learning_rate": 7.891547526625026e-06, "loss": 0.3762, "step": 103770 }, { "epoch": 3.740043968717339, "grad_norm": 0.29697415232658386, "learning_rate": 
7.889419842717807e-06, "loss": 0.364, "step": 103775 }, { "epoch": 3.7402241683785635, "grad_norm": 0.26142972707748413, "learning_rate": 7.88729239193893e-06, "loss": 0.3906, "step": 103780 }, { "epoch": 3.740404368039788, "grad_norm": 0.3567783534526825, "learning_rate": 7.885165174317374e-06, "loss": 0.3906, "step": 103785 }, { "epoch": 3.7405845677010126, "grad_norm": 0.2854664921760559, "learning_rate": 7.883038189882137e-06, "loss": 0.4332, "step": 103790 }, { "epoch": 3.7407647673622373, "grad_norm": 0.2237059473991394, "learning_rate": 7.880911438662186e-06, "loss": 0.38, "step": 103795 }, { "epoch": 3.740944967023462, "grad_norm": 0.1987508237361908, "learning_rate": 7.878784920686509e-06, "loss": 0.4177, "step": 103800 }, { "epoch": 3.7411251666846868, "grad_norm": 0.2105659693479538, "learning_rate": 7.876658635984065e-06, "loss": 0.3688, "step": 103805 }, { "epoch": 3.7413053663459115, "grad_norm": 0.2706106901168823, "learning_rate": 7.87453258458383e-06, "loss": 0.3544, "step": 103810 }, { "epoch": 3.7414855660071358, "grad_norm": 0.2130729705095291, "learning_rate": 7.872406766514779e-06, "loss": 0.4017, "step": 103815 }, { "epoch": 3.7416657656683605, "grad_norm": 0.191060870885849, "learning_rate": 7.870281181805877e-06, "loss": 0.3528, "step": 103820 }, { "epoch": 3.7418459653295852, "grad_norm": 0.26372456550598145, "learning_rate": 7.868155830486063e-06, "loss": 0.363, "step": 103825 }, { "epoch": 3.74202616499081, "grad_norm": 0.2443712055683136, "learning_rate": 7.866030712584318e-06, "loss": 0.3847, "step": 103830 }, { "epoch": 3.7422063646520343, "grad_norm": 0.30589404702186584, "learning_rate": 7.863905828129577e-06, "loss": 0.3667, "step": 103835 }, { "epoch": 3.742386564313259, "grad_norm": 0.3021865785121918, "learning_rate": 7.861781177150818e-06, "loss": 0.4225, "step": 103840 }, { "epoch": 3.7425667639744837, "grad_norm": 0.21095475554466248, "learning_rate": 7.859656759676964e-06, "loss": 0.3872, "step": 103845 }, { "epoch": 
3.7427469636357085, "grad_norm": 0.25870993733406067, "learning_rate": 7.857532575736961e-06, "loss": 0.3782, "step": 103850 }, { "epoch": 3.742927163296933, "grad_norm": 0.2628803849220276, "learning_rate": 7.855408625359768e-06, "loss": 0.3724, "step": 103855 }, { "epoch": 3.7431073629581575, "grad_norm": 0.2707422375679016, "learning_rate": 7.85328490857431e-06, "loss": 0.3451, "step": 103860 }, { "epoch": 3.7432875626193822, "grad_norm": 0.21672846376895905, "learning_rate": 7.851161425409525e-06, "loss": 0.3629, "step": 103865 }, { "epoch": 3.743467762280607, "grad_norm": 0.2119687795639038, "learning_rate": 7.849038175894346e-06, "loss": 0.3829, "step": 103870 }, { "epoch": 3.7436479619418317, "grad_norm": 0.3262704610824585, "learning_rate": 7.846915160057694e-06, "loss": 0.3918, "step": 103875 }, { "epoch": 3.743828161603056, "grad_norm": 0.2228257954120636, "learning_rate": 7.84479237792851e-06, "loss": 0.3837, "step": 103880 }, { "epoch": 3.7440083612642807, "grad_norm": 0.24818168580532074, "learning_rate": 7.842669829535709e-06, "loss": 0.3801, "step": 103885 }, { "epoch": 3.7441885609255054, "grad_norm": 0.18585266172885895, "learning_rate": 7.840547514908209e-06, "loss": 0.3744, "step": 103890 }, { "epoch": 3.74436876058673, "grad_norm": 0.25262919068336487, "learning_rate": 7.838425434074925e-06, "loss": 0.3952, "step": 103895 }, { "epoch": 3.744548960247955, "grad_norm": 0.21435338258743286, "learning_rate": 7.836303587064766e-06, "loss": 0.3595, "step": 103900 }, { "epoch": 3.744729159909179, "grad_norm": 0.23021239042282104, "learning_rate": 7.834181973906656e-06, "loss": 0.4104, "step": 103905 }, { "epoch": 3.744909359570404, "grad_norm": 0.2677323818206787, "learning_rate": 7.832060594629504e-06, "loss": 0.3823, "step": 103910 }, { "epoch": 3.7450895592316287, "grad_norm": 0.22377659380435944, "learning_rate": 7.82993944926218e-06, "loss": 0.3476, "step": 103915 }, { "epoch": 3.7452697588928534, "grad_norm": 0.2559918761253357, "learning_rate": 
7.82781853783362e-06, "loss": 0.3891, "step": 103920 }, { "epoch": 3.7454499585540777, "grad_norm": 0.201356440782547, "learning_rate": 7.825697860372705e-06, "loss": 0.3572, "step": 103925 }, { "epoch": 3.7456301582153024, "grad_norm": 0.26732808351516724, "learning_rate": 7.823577416908325e-06, "loss": 0.3886, "step": 103930 }, { "epoch": 3.745810357876527, "grad_norm": 0.22887802124023438, "learning_rate": 7.821457207469392e-06, "loss": 0.378, "step": 103935 }, { "epoch": 3.745990557537752, "grad_norm": 0.26050490140914917, "learning_rate": 7.819337232084764e-06, "loss": 0.3839, "step": 103940 }, { "epoch": 3.7461707571989766, "grad_norm": 0.23908868432044983, "learning_rate": 7.817217490783346e-06, "loss": 0.4197, "step": 103945 }, { "epoch": 3.7463509568602014, "grad_norm": 0.23192131519317627, "learning_rate": 7.815097983594016e-06, "loss": 0.3885, "step": 103950 }, { "epoch": 3.7465311565214257, "grad_norm": 0.31525352597236633, "learning_rate": 7.812978710545646e-06, "loss": 0.3664, "step": 103955 }, { "epoch": 3.7467113561826504, "grad_norm": 0.2983095347881317, "learning_rate": 7.810859671667118e-06, "loss": 0.4093, "step": 103960 }, { "epoch": 3.746891555843875, "grad_norm": 0.30987173318862915, "learning_rate": 7.808740866987293e-06, "loss": 0.4, "step": 103965 }, { "epoch": 3.7470717555050994, "grad_norm": 0.2129722535610199, "learning_rate": 7.80662229653504e-06, "loss": 0.3731, "step": 103970 }, { "epoch": 3.747251955166324, "grad_norm": 0.2828996181488037, "learning_rate": 7.804503960339238e-06, "loss": 0.3715, "step": 103975 }, { "epoch": 3.747432154827549, "grad_norm": 0.26484811305999756, "learning_rate": 7.80238585842874e-06, "loss": 0.3518, "step": 103980 }, { "epoch": 3.7476123544887736, "grad_norm": 0.23784774541854858, "learning_rate": 7.800267990832404e-06, "loss": 0.4227, "step": 103985 }, { "epoch": 3.7477925541499983, "grad_norm": 0.22352570295333862, "learning_rate": 7.79815035757909e-06, "loss": 0.3722, "step": 103990 }, { "epoch": 
3.747972753811223, "grad_norm": 0.1883036494255066, "learning_rate": 7.796032958697635e-06, "loss": 0.3852, "step": 103995 }, { "epoch": 3.7481529534724474, "grad_norm": 0.29498735070228577, "learning_rate": 7.79391579421691e-06, "loss": 0.3987, "step": 104000 }, { "epoch": 3.7481529534724474, "eval_loss": 0.4285476803779602, "eval_runtime": 3.5328, "eval_samples_per_second": 28.306, "eval_steps_per_second": 7.076, "step": 104000 }, { "epoch": 3.748333153133672, "grad_norm": 0.2650872766971588, "learning_rate": 7.79179886416576e-06, "loss": 0.3802, "step": 104005 }, { "epoch": 3.748513352794897, "grad_norm": 0.19966596364974976, "learning_rate": 7.789682168573004e-06, "loss": 0.3448, "step": 104010 }, { "epoch": 3.748693552456121, "grad_norm": 0.2707030177116394, "learning_rate": 7.7875657074675e-06, "loss": 0.3787, "step": 104015 }, { "epoch": 3.748873752117346, "grad_norm": 0.21822407841682434, "learning_rate": 7.785449480878076e-06, "loss": 0.4043, "step": 104020 }, { "epoch": 3.7490539517785706, "grad_norm": 0.23433195054531097, "learning_rate": 7.783333488833585e-06, "loss": 0.3704, "step": 104025 }, { "epoch": 3.7492341514397953, "grad_norm": 0.26820576190948486, "learning_rate": 7.781217731362834e-06, "loss": 0.372, "step": 104030 }, { "epoch": 3.74941435110102, "grad_norm": 0.27072396874427795, "learning_rate": 7.779102208494648e-06, "loss": 0.3665, "step": 104035 }, { "epoch": 3.749594550762245, "grad_norm": 0.20719735324382782, "learning_rate": 7.776986920257873e-06, "loss": 0.3721, "step": 104040 }, { "epoch": 3.749774750423469, "grad_norm": 0.2387949824333191, "learning_rate": 7.774871866681313e-06, "loss": 0.3456, "step": 104045 }, { "epoch": 3.749954950084694, "grad_norm": 0.23849116265773773, "learning_rate": 7.772757047793792e-06, "loss": 0.3932, "step": 104050 }, { "epoch": 3.7501351497459186, "grad_norm": 0.25094231963157654, "learning_rate": 7.770642463624117e-06, "loss": 0.3632, "step": 104055 }, { "epoch": 3.750315349407143, "grad_norm": 
0.21101891994476318, "learning_rate": 7.768528114201095e-06, "loss": 0.3791, "step": 104060 }, { "epoch": 3.7504955490683676, "grad_norm": 0.2257104218006134, "learning_rate": 7.766413999553552e-06, "loss": 0.3644, "step": 104065 }, { "epoch": 3.7506757487295923, "grad_norm": 0.28515586256980896, "learning_rate": 7.76430011971028e-06, "loss": 0.3604, "step": 104070 }, { "epoch": 3.750855948390817, "grad_norm": 0.23402278125286102, "learning_rate": 7.762186474700084e-06, "loss": 0.3458, "step": 104075 }, { "epoch": 3.7510361480520418, "grad_norm": 0.239648699760437, "learning_rate": 7.760073064551757e-06, "loss": 0.3902, "step": 104080 }, { "epoch": 3.7512163477132665, "grad_norm": 0.19721876084804535, "learning_rate": 7.75795988929409e-06, "loss": 0.3715, "step": 104085 }, { "epoch": 3.751396547374491, "grad_norm": 0.29029005765914917, "learning_rate": 7.755846948955889e-06, "loss": 0.3953, "step": 104090 }, { "epoch": 3.7515767470357155, "grad_norm": 0.23127782344818115, "learning_rate": 7.753734243565935e-06, "loss": 0.3737, "step": 104095 }, { "epoch": 3.7517569466969403, "grad_norm": 0.2870927155017853, "learning_rate": 7.751621773153014e-06, "loss": 0.415, "step": 104100 }, { "epoch": 3.751937146358165, "grad_norm": 0.2417393922805786, "learning_rate": 7.749509537745906e-06, "loss": 0.3357, "step": 104105 }, { "epoch": 3.7521173460193893, "grad_norm": 0.226087749004364, "learning_rate": 7.74739753737339e-06, "loss": 0.3768, "step": 104110 }, { "epoch": 3.752297545680614, "grad_norm": 0.25405430793762207, "learning_rate": 7.745285772064237e-06, "loss": 0.3824, "step": 104115 }, { "epoch": 3.7524777453418388, "grad_norm": 0.24911725521087646, "learning_rate": 7.743174241847237e-06, "loss": 0.3721, "step": 104120 }, { "epoch": 3.7526579450030635, "grad_norm": 0.2514585852622986, "learning_rate": 7.741062946751135e-06, "loss": 0.4184, "step": 104125 }, { "epoch": 3.7528381446642882, "grad_norm": 0.2331729680299759, "learning_rate": 7.738951886804713e-06, "loss": 
0.4288, "step": 104130 }, { "epoch": 3.7530183443255125, "grad_norm": 0.2459155172109604, "learning_rate": 7.736841062036731e-06, "loss": 0.3804, "step": 104135 }, { "epoch": 3.7531985439867372, "grad_norm": 0.2611176073551178, "learning_rate": 7.734730472475948e-06, "loss": 0.3821, "step": 104140 }, { "epoch": 3.753378743647962, "grad_norm": 0.29315176606178284, "learning_rate": 7.73262011815112e-06, "loss": 0.3522, "step": 104145 }, { "epoch": 3.7535589433091867, "grad_norm": 0.20626281201839447, "learning_rate": 7.730509999090999e-06, "loss": 0.3707, "step": 104150 }, { "epoch": 3.753739142970411, "grad_norm": 0.19073475897312164, "learning_rate": 7.72840011532433e-06, "loss": 0.3713, "step": 104155 }, { "epoch": 3.7539193426316357, "grad_norm": 0.2597355246543884, "learning_rate": 7.726290466879872e-06, "loss": 0.3736, "step": 104160 }, { "epoch": 3.7540995422928605, "grad_norm": 0.19545309245586395, "learning_rate": 7.724181053786361e-06, "loss": 0.3695, "step": 104165 }, { "epoch": 3.754279741954085, "grad_norm": 0.2396879643201828, "learning_rate": 7.722071876072538e-06, "loss": 0.4125, "step": 104170 }, { "epoch": 3.75445994161531, "grad_norm": 0.22774870693683624, "learning_rate": 7.71996293376714e-06, "loss": 0.3802, "step": 104175 }, { "epoch": 3.7546401412765347, "grad_norm": 0.20402881503105164, "learning_rate": 7.717854226898897e-06, "loss": 0.34, "step": 104180 }, { "epoch": 3.754820340937759, "grad_norm": 0.22189156711101532, "learning_rate": 7.71574575549655e-06, "loss": 0.3903, "step": 104185 }, { "epoch": 3.7550005405989837, "grad_norm": 0.19797289371490479, "learning_rate": 7.713637519588829e-06, "loss": 0.3864, "step": 104190 }, { "epoch": 3.7551807402602084, "grad_norm": 0.24208073318004608, "learning_rate": 7.711529519204435e-06, "loss": 0.3533, "step": 104195 }, { "epoch": 3.7553609399214327, "grad_norm": 0.24616724252700806, "learning_rate": 7.709421754372112e-06, "loss": 0.3659, "step": 104200 }, { "epoch": 3.7555411395826575, "grad_norm": 
0.2978845238685608, "learning_rate": 7.70731422512056e-06, "loss": 0.4112, "step": 104205 }, { "epoch": 3.755721339243882, "grad_norm": 0.22275815904140472, "learning_rate": 7.705206931478512e-06, "loss": 0.3825, "step": 104210 }, { "epoch": 3.755901538905107, "grad_norm": 0.3553224205970764, "learning_rate": 7.703099873474681e-06, "loss": 0.3811, "step": 104215 }, { "epoch": 3.7560817385663317, "grad_norm": 0.2287319004535675, "learning_rate": 7.700993051137751e-06, "loss": 0.3758, "step": 104220 }, { "epoch": 3.7562619382275564, "grad_norm": 0.22735942900180817, "learning_rate": 7.698886464496446e-06, "loss": 0.4105, "step": 104225 }, { "epoch": 3.7564421378887807, "grad_norm": 0.18838316202163696, "learning_rate": 7.696780113579464e-06, "loss": 0.3754, "step": 104230 }, { "epoch": 3.7566223375500054, "grad_norm": 0.2099224328994751, "learning_rate": 7.694673998415503e-06, "loss": 0.3815, "step": 104235 }, { "epoch": 3.75680253721123, "grad_norm": 0.3171490728855133, "learning_rate": 7.692568119033258e-06, "loss": 0.3913, "step": 104240 }, { "epoch": 3.7569827368724544, "grad_norm": 0.25509047508239746, "learning_rate": 7.690462475461416e-06, "loss": 0.4001, "step": 104245 }, { "epoch": 3.757162936533679, "grad_norm": 0.2789323627948761, "learning_rate": 7.688357067728676e-06, "loss": 0.3647, "step": 104250 }, { "epoch": 3.757343136194904, "grad_norm": 0.21449807286262512, "learning_rate": 7.686251895863721e-06, "loss": 0.3643, "step": 104255 }, { "epoch": 3.7575233358561286, "grad_norm": 0.25625357031822205, "learning_rate": 7.684146959895233e-06, "loss": 0.3751, "step": 104260 }, { "epoch": 3.7577035355173534, "grad_norm": 0.25232619047164917, "learning_rate": 7.68204225985189e-06, "loss": 0.415, "step": 104265 }, { "epoch": 3.757883735178578, "grad_norm": 0.24988026916980743, "learning_rate": 7.67993779576236e-06, "loss": 0.3681, "step": 104270 }, { "epoch": 3.7580639348398024, "grad_norm": 0.21227630972862244, "learning_rate": 7.677833567655331e-06, "loss": 
0.3763, "step": 104275 }, { "epoch": 3.758244134501027, "grad_norm": 0.2227819859981537, "learning_rate": 7.675729575559468e-06, "loss": 0.3833, "step": 104280 }, { "epoch": 3.758424334162252, "grad_norm": 0.3054330348968506, "learning_rate": 7.673625819503433e-06, "loss": 0.3833, "step": 104285 }, { "epoch": 3.758604533823476, "grad_norm": 0.2734309136867523, "learning_rate": 7.671522299515893e-06, "loss": 0.3916, "step": 104290 }, { "epoch": 3.758784733484701, "grad_norm": 0.24472157657146454, "learning_rate": 7.669419015625507e-06, "loss": 0.3786, "step": 104295 }, { "epoch": 3.7589649331459256, "grad_norm": 0.21005383133888245, "learning_rate": 7.667315967860925e-06, "loss": 0.38, "step": 104300 }, { "epoch": 3.7591451328071503, "grad_norm": 0.18134789168834686, "learning_rate": 7.665213156250819e-06, "loss": 0.3707, "step": 104305 }, { "epoch": 3.759325332468375, "grad_norm": 0.26828300952911377, "learning_rate": 7.663110580823816e-06, "loss": 0.3727, "step": 104310 }, { "epoch": 3.7595055321296, "grad_norm": 0.28588294982910156, "learning_rate": 7.661008241608581e-06, "loss": 0.3755, "step": 104315 }, { "epoch": 3.759685731790824, "grad_norm": 0.27312424778938293, "learning_rate": 7.65890613863375e-06, "loss": 0.4112, "step": 104320 }, { "epoch": 3.759865931452049, "grad_norm": 0.2866198420524597, "learning_rate": 7.65680427192797e-06, "loss": 0.4028, "step": 104325 }, { "epoch": 3.7600461311132736, "grad_norm": 0.2471124827861786, "learning_rate": 7.654702641519871e-06, "loss": 0.3696, "step": 104330 }, { "epoch": 3.7602263307744983, "grad_norm": 0.24191972613334656, "learning_rate": 7.65260124743809e-06, "loss": 0.3817, "step": 104335 }, { "epoch": 3.7604065304357226, "grad_norm": 0.22872976958751678, "learning_rate": 7.650500089711252e-06, "loss": 0.4256, "step": 104340 }, { "epoch": 3.7605867300969473, "grad_norm": 0.3062474727630615, "learning_rate": 7.648399168367998e-06, "loss": 0.3657, "step": 104345 }, { "epoch": 3.760766929758172, "grad_norm": 
0.2952815592288971, "learning_rate": 7.646298483436946e-06, "loss": 0.3974, "step": 104350 }, { "epoch": 3.760947129419397, "grad_norm": 0.30336183309555054, "learning_rate": 7.644198034946718e-06, "loss": 0.369, "step": 104355 }, { "epoch": 3.7611273290806215, "grad_norm": 0.24275535345077515, "learning_rate": 7.642097822925932e-06, "loss": 0.3679, "step": 104360 }, { "epoch": 3.761307528741846, "grad_norm": 0.20958095788955688, "learning_rate": 7.639997847403194e-06, "loss": 0.4052, "step": 104365 }, { "epoch": 3.7614877284030706, "grad_norm": 0.27302390336990356, "learning_rate": 7.637898108407132e-06, "loss": 0.3837, "step": 104370 }, { "epoch": 3.7616679280642953, "grad_norm": 0.3011491894721985, "learning_rate": 7.635798605966346e-06, "loss": 0.4018, "step": 104375 }, { "epoch": 3.76184812772552, "grad_norm": 0.2879173159599304, "learning_rate": 7.633699340109443e-06, "loss": 0.4226, "step": 104380 }, { "epoch": 3.7620283273867443, "grad_norm": 0.2275601178407669, "learning_rate": 7.631600310865025e-06, "loss": 0.4083, "step": 104385 }, { "epoch": 3.762208527047969, "grad_norm": 0.208656445145607, "learning_rate": 7.62950151826168e-06, "loss": 0.3667, "step": 104390 }, { "epoch": 3.7623887267091938, "grad_norm": 0.2310076206922531, "learning_rate": 7.627402962328026e-06, "loss": 0.3664, "step": 104395 }, { "epoch": 3.7625689263704185, "grad_norm": 0.2891443073749542, "learning_rate": 7.625304643092648e-06, "loss": 0.3927, "step": 104400 }, { "epoch": 3.7627491260316432, "grad_norm": 0.254215806722641, "learning_rate": 7.623206560584115e-06, "loss": 0.3493, "step": 104405 }, { "epoch": 3.7629293256928675, "grad_norm": 0.19437402486801147, "learning_rate": 7.621108714831038e-06, "loss": 0.3739, "step": 104410 }, { "epoch": 3.7631095253540923, "grad_norm": 0.23699922859668732, "learning_rate": 7.619011105861987e-06, "loss": 0.4086, "step": 104415 }, { "epoch": 3.763289725015317, "grad_norm": 0.21388274431228638, "learning_rate": 7.616913733705547e-06, "loss": 
0.3858, "step": 104420 }, { "epoch": 3.7634699246765417, "grad_norm": 0.22161832451820374, "learning_rate": 7.61481659839029e-06, "loss": 0.3757, "step": 104425 }, { "epoch": 3.763650124337766, "grad_norm": 0.2681003212928772, "learning_rate": 7.612719699944784e-06, "loss": 0.4036, "step": 104430 }, { "epoch": 3.7638303239989908, "grad_norm": 0.2696530818939209, "learning_rate": 7.610623038397613e-06, "loss": 0.408, "step": 104435 }, { "epoch": 3.7640105236602155, "grad_norm": 0.25873225927352905, "learning_rate": 7.608526613777339e-06, "loss": 0.4039, "step": 104440 }, { "epoch": 3.7641907233214402, "grad_norm": 0.1770642250776291, "learning_rate": 7.6064304261125205e-06, "loss": 0.3661, "step": 104445 }, { "epoch": 3.764370922982665, "grad_norm": 0.25411689281463623, "learning_rate": 7.604334475431721e-06, "loss": 0.3954, "step": 104450 }, { "epoch": 3.7645511226438897, "grad_norm": 0.19758695363998413, "learning_rate": 7.602238761763486e-06, "loss": 0.3648, "step": 104455 }, { "epoch": 3.764731322305114, "grad_norm": 0.24316218495368958, "learning_rate": 7.600143285136391e-06, "loss": 0.4164, "step": 104460 }, { "epoch": 3.7649115219663387, "grad_norm": 0.23017525672912598, "learning_rate": 7.598048045578973e-06, "loss": 0.3761, "step": 104465 }, { "epoch": 3.7650917216275634, "grad_norm": 0.2317330241203308, "learning_rate": 7.595953043119783e-06, "loss": 0.3907, "step": 104470 }, { "epoch": 3.7652719212887877, "grad_norm": 0.21798720955848694, "learning_rate": 7.593858277787361e-06, "loss": 0.3797, "step": 104475 }, { "epoch": 3.7654521209500125, "grad_norm": 0.18874719738960266, "learning_rate": 7.59176374961025e-06, "loss": 0.3733, "step": 104480 }, { "epoch": 3.765632320611237, "grad_norm": 0.24172750115394592, "learning_rate": 7.58966945861698e-06, "loss": 0.3748, "step": 104485 }, { "epoch": 3.765812520272462, "grad_norm": 0.32291868329048157, "learning_rate": 7.5875754048361114e-06, "loss": 0.4205, "step": 104490 }, { "epoch": 3.7659927199336867, 
"grad_norm": 0.23808036744594574, "learning_rate": 7.5854815882961364e-06, "loss": 0.3663, "step": 104495 }, { "epoch": 3.7661729195949114, "grad_norm": 0.2094922959804535, "learning_rate": 7.583388009025616e-06, "loss": 0.3745, "step": 104500 }, { "epoch": 3.7661729195949114, "eval_loss": 0.4286915063858032, "eval_runtime": 3.5312, "eval_samples_per_second": 28.319, "eval_steps_per_second": 7.08, "step": 104500 }, { "epoch": 3.7663531192561357, "grad_norm": 0.25394493341445923, "learning_rate": 7.581294667053057e-06, "loss": 0.3741, "step": 104505 }, { "epoch": 3.7665333189173604, "grad_norm": 0.22518283128738403, "learning_rate": 7.579201562406982e-06, "loss": 0.372, "step": 104510 }, { "epoch": 3.766713518578585, "grad_norm": 0.24221821129322052, "learning_rate": 7.577108695115928e-06, "loss": 0.4004, "step": 104515 }, { "epoch": 3.7668937182398095, "grad_norm": 0.25630852580070496, "learning_rate": 7.575016065208385e-06, "loss": 0.3834, "step": 104520 }, { "epoch": 3.767073917901034, "grad_norm": 0.21267221868038177, "learning_rate": 7.5729236727128674e-06, "loss": 0.3436, "step": 104525 }, { "epoch": 3.767254117562259, "grad_norm": 0.24416276812553406, "learning_rate": 7.5708315176579e-06, "loss": 0.4102, "step": 104530 }, { "epoch": 3.7674343172234837, "grad_norm": 0.2707764506340027, "learning_rate": 7.568739600071978e-06, "loss": 0.3707, "step": 104535 }, { "epoch": 3.7676145168847084, "grad_norm": 0.2370613068342209, "learning_rate": 7.566647919983602e-06, "loss": 0.3701, "step": 104540 }, { "epoch": 3.767794716545933, "grad_norm": 0.2817910611629486, "learning_rate": 7.564556477421275e-06, "loss": 0.3784, "step": 104545 }, { "epoch": 3.7679749162071574, "grad_norm": 0.27115681767463684, "learning_rate": 7.562465272413483e-06, "loss": 0.4333, "step": 104550 }, { "epoch": 3.768155115868382, "grad_norm": 0.23443850874900818, "learning_rate": 7.560374304988732e-06, "loss": 0.4106, "step": 104555 }, { "epoch": 3.768335315529607, "grad_norm": 
0.2954671382904053, "learning_rate": 7.5582835751755064e-06, "loss": 0.3693, "step": 104560 }, { "epoch": 3.768515515190831, "grad_norm": 0.2391144335269928, "learning_rate": 7.556193083002291e-06, "loss": 0.3376, "step": 104565 }, { "epoch": 3.768695714852056, "grad_norm": 0.29020795226097107, "learning_rate": 7.554102828497564e-06, "loss": 0.4022, "step": 104570 }, { "epoch": 3.7688759145132806, "grad_norm": 0.24601967632770538, "learning_rate": 7.552012811689804e-06, "loss": 0.3811, "step": 104575 }, { "epoch": 3.7690561141745054, "grad_norm": 0.23416286706924438, "learning_rate": 7.549923032607498e-06, "loss": 0.3815, "step": 104580 }, { "epoch": 3.76923631383573, "grad_norm": 0.2573948800563812, "learning_rate": 7.547833491279119e-06, "loss": 0.3997, "step": 104585 }, { "epoch": 3.769416513496955, "grad_norm": 0.23901304602622986, "learning_rate": 7.5457441877331145e-06, "loss": 0.3577, "step": 104590 }, { "epoch": 3.769596713158179, "grad_norm": 0.1927175372838974, "learning_rate": 7.543655121997975e-06, "loss": 0.3581, "step": 104595 }, { "epoch": 3.769776912819404, "grad_norm": 0.30516329407691956, "learning_rate": 7.541566294102154e-06, "loss": 0.3853, "step": 104600 }, { "epoch": 3.7699571124806286, "grad_norm": 0.22798460721969604, "learning_rate": 7.539895403048933e-06, "loss": 0.3772, "step": 104605 }, { "epoch": 3.7701373121418533, "grad_norm": 0.24452564120292664, "learning_rate": 7.537807003335604e-06, "loss": 0.3669, "step": 104610 }, { "epoch": 3.7703175118030776, "grad_norm": 0.22427910566329956, "learning_rate": 7.5357188415412724e-06, "loss": 0.3694, "step": 104615 }, { "epoch": 3.7704977114643023, "grad_norm": 0.2747547924518585, "learning_rate": 7.5336309176943845e-06, "loss": 0.3733, "step": 104620 }, { "epoch": 3.770677911125527, "grad_norm": 0.22400154173374176, "learning_rate": 7.531543231823399e-06, "loss": 0.3961, "step": 104625 }, { "epoch": 3.770858110786752, "grad_norm": 0.2119477391242981, "learning_rate": 7.529455783956757e-06, 
"loss": 0.3566, "step": 104630 }, { "epoch": 3.7710383104479765, "grad_norm": 0.22162562608718872, "learning_rate": 7.5273685741228976e-06, "loss": 0.3925, "step": 104635 }, { "epoch": 3.771218510109201, "grad_norm": 0.20573687553405762, "learning_rate": 7.525281602350259e-06, "loss": 0.3899, "step": 104640 }, { "epoch": 3.7713987097704256, "grad_norm": 0.26889345049858093, "learning_rate": 7.523194868667266e-06, "loss": 0.407, "step": 104645 }, { "epoch": 3.7715789094316503, "grad_norm": 0.23474839329719543, "learning_rate": 7.521108373102367e-06, "loss": 0.3577, "step": 104650 }, { "epoch": 3.771759109092875, "grad_norm": 0.2511729896068573, "learning_rate": 7.519022115683994e-06, "loss": 0.3994, "step": 104655 }, { "epoch": 3.7719393087540993, "grad_norm": 0.24130114912986755, "learning_rate": 7.5169360964405415e-06, "loss": 0.3788, "step": 104660 }, { "epoch": 3.772119508415324, "grad_norm": 0.2123662829399109, "learning_rate": 7.514850315400457e-06, "loss": 0.3786, "step": 104665 }, { "epoch": 3.772299708076549, "grad_norm": 0.21604010462760925, "learning_rate": 7.512764772592151e-06, "loss": 0.3807, "step": 104670 }, { "epoch": 3.7724799077377735, "grad_norm": 0.26000019907951355, "learning_rate": 7.510679468044035e-06, "loss": 0.3552, "step": 104675 }, { "epoch": 3.7726601073989983, "grad_norm": 0.2929229140281677, "learning_rate": 7.508594401784538e-06, "loss": 0.3689, "step": 104680 }, { "epoch": 3.772840307060223, "grad_norm": 0.26968440413475037, "learning_rate": 7.506509573842041e-06, "loss": 0.3501, "step": 104685 }, { "epoch": 3.7730205067214473, "grad_norm": 0.2252168208360672, "learning_rate": 7.5044249842449735e-06, "loss": 0.3709, "step": 104690 }, { "epoch": 3.773200706382672, "grad_norm": 0.20150010287761688, "learning_rate": 7.502340633021726e-06, "loss": 0.3637, "step": 104695 }, { "epoch": 3.7733809060438968, "grad_norm": 0.2643103003501892, "learning_rate": 7.500256520200702e-06, "loss": 0.4388, "step": 104700 }, { "epoch": 
3.773561105705121, "grad_norm": 0.2742420732975006, "learning_rate": 7.498172645810292e-06, "loss": 0.3994, "step": 104705 }, { "epoch": 3.7737413053663458, "grad_norm": 0.3029593229293823, "learning_rate": 7.496089009878884e-06, "loss": 0.3941, "step": 104710 }, { "epoch": 3.7739215050275705, "grad_norm": 0.23510831594467163, "learning_rate": 7.494005612434885e-06, "loss": 0.3873, "step": 104715 }, { "epoch": 3.7741017046887952, "grad_norm": 0.2507026195526123, "learning_rate": 7.49192245350667e-06, "loss": 0.3756, "step": 104720 }, { "epoch": 3.77428190435002, "grad_norm": 0.2193816602230072, "learning_rate": 7.48983953312262e-06, "loss": 0.3564, "step": 104725 }, { "epoch": 3.7744621040112447, "grad_norm": 0.2253447026014328, "learning_rate": 7.487756851311114e-06, "loss": 0.3607, "step": 104730 }, { "epoch": 3.774642303672469, "grad_norm": 0.27351540327072144, "learning_rate": 7.485674408100535e-06, "loss": 0.3961, "step": 104735 }, { "epoch": 3.7748225033336937, "grad_norm": 0.20876996219158173, "learning_rate": 7.483592203519241e-06, "loss": 0.3839, "step": 104740 }, { "epoch": 3.7750027029949185, "grad_norm": 0.23797959089279175, "learning_rate": 7.481510237595621e-06, "loss": 0.3904, "step": 104745 }, { "epoch": 3.7751829026561428, "grad_norm": 0.21280419826507568, "learning_rate": 7.47942851035803e-06, "loss": 0.3748, "step": 104750 }, { "epoch": 3.7753631023173675, "grad_norm": 0.22197531163692474, "learning_rate": 7.477347021834838e-06, "loss": 0.3514, "step": 104755 }, { "epoch": 3.7755433019785922, "grad_norm": 0.2479541003704071, "learning_rate": 7.475265772054396e-06, "loss": 0.3469, "step": 104760 }, { "epoch": 3.775723501639817, "grad_norm": 0.2767854630947113, "learning_rate": 7.4731847610450604e-06, "loss": 0.3672, "step": 104765 }, { "epoch": 3.7759037013010417, "grad_norm": 0.21771368384361267, "learning_rate": 7.471103988835202e-06, "loss": 0.3716, "step": 104770 }, { "epoch": 3.7760839009622664, "grad_norm": 0.25155210494995117, 
"learning_rate": 7.46902345545315e-06, "loss": 0.3624, "step": 104775 }, { "epoch": 3.7762641006234907, "grad_norm": 0.27159664034843445, "learning_rate": 7.466943160927253e-06, "loss": 0.3326, "step": 104780 }, { "epoch": 3.7764443002847154, "grad_norm": 0.22621986269950867, "learning_rate": 7.464863105285868e-06, "loss": 0.3833, "step": 104785 }, { "epoch": 3.77662449994594, "grad_norm": 0.21097923815250397, "learning_rate": 7.462783288557329e-06, "loss": 0.378, "step": 104790 }, { "epoch": 3.7768046996071645, "grad_norm": 0.21815836429595947, "learning_rate": 7.460703710769973e-06, "loss": 0.364, "step": 104795 }, { "epoch": 3.776984899268389, "grad_norm": 0.22789250314235687, "learning_rate": 7.458624371952133e-06, "loss": 0.3781, "step": 104800 }, { "epoch": 3.777165098929614, "grad_norm": 0.28179875016212463, "learning_rate": 7.456545272132132e-06, "loss": 0.417, "step": 104805 }, { "epoch": 3.7773452985908387, "grad_norm": 0.21157479286193848, "learning_rate": 7.45446641133831e-06, "loss": 0.3719, "step": 104810 }, { "epoch": 3.7775254982520634, "grad_norm": 0.24585333466529846, "learning_rate": 7.452387789598988e-06, "loss": 0.399, "step": 104815 }, { "epoch": 3.777705697913288, "grad_norm": 0.2581721842288971, "learning_rate": 7.450309406942488e-06, "loss": 0.4285, "step": 104820 }, { "epoch": 3.7778858975745124, "grad_norm": 0.2738182842731476, "learning_rate": 7.448231263397121e-06, "loss": 0.4033, "step": 104825 }, { "epoch": 3.778066097235737, "grad_norm": 0.22549976408481598, "learning_rate": 7.446153358991198e-06, "loss": 0.4013, "step": 104830 }, { "epoch": 3.778246296896962, "grad_norm": 0.20595544576644897, "learning_rate": 7.444075693753044e-06, "loss": 0.3896, "step": 104835 }, { "epoch": 3.7784264965581866, "grad_norm": 0.24291178584098816, "learning_rate": 7.441998267710962e-06, "loss": 0.3715, "step": 104840 }, { "epoch": 3.778606696219411, "grad_norm": 0.248873770236969, "learning_rate": 7.439921080893253e-06, "loss": 0.3732, "step": 104845 
}, { "epoch": 3.7787868958806357, "grad_norm": 0.28607362508773804, "learning_rate": 7.43784413332822e-06, "loss": 0.3769, "step": 104850 }, { "epoch": 3.7789670955418604, "grad_norm": 0.254102498292923, "learning_rate": 7.43576742504416e-06, "loss": 0.3671, "step": 104855 }, { "epoch": 3.779147295203085, "grad_norm": 0.21698035299777985, "learning_rate": 7.433690956069361e-06, "loss": 0.3563, "step": 104860 }, { "epoch": 3.77932749486431, "grad_norm": 0.25203150510787964, "learning_rate": 7.431614726432137e-06, "loss": 0.3384, "step": 104865 }, { "epoch": 3.779507694525534, "grad_norm": 0.24114732444286346, "learning_rate": 7.429538736160746e-06, "loss": 0.4012, "step": 104870 }, { "epoch": 3.779687894186759, "grad_norm": 0.26797887682914734, "learning_rate": 7.4274629852834955e-06, "loss": 0.4196, "step": 104875 }, { "epoch": 3.7798680938479836, "grad_norm": 0.22066861391067505, "learning_rate": 7.425387473828657e-06, "loss": 0.3429, "step": 104880 }, { "epoch": 3.7800482935092083, "grad_norm": 0.23529654741287231, "learning_rate": 7.423312201824514e-06, "loss": 0.3881, "step": 104885 }, { "epoch": 3.7802284931704326, "grad_norm": 0.2249959409236908, "learning_rate": 7.421237169299341e-06, "loss": 0.3611, "step": 104890 }, { "epoch": 3.7804086928316574, "grad_norm": 0.24566762149333954, "learning_rate": 7.419162376281397e-06, "loss": 0.4263, "step": 104895 }, { "epoch": 3.780588892492882, "grad_norm": 0.18927571177482605, "learning_rate": 7.417087822798971e-06, "loss": 0.3778, "step": 104900 }, { "epoch": 3.780769092154107, "grad_norm": 0.2276502400636673, "learning_rate": 7.415013508880319e-06, "loss": 0.3857, "step": 104905 }, { "epoch": 3.7809492918153316, "grad_norm": 0.2894652485847473, "learning_rate": 7.412939434553707e-06, "loss": 0.3817, "step": 104910 }, { "epoch": 3.781129491476556, "grad_norm": 0.2344924807548523, "learning_rate": 7.410865599847386e-06, "loss": 0.3711, "step": 104915 }, { "epoch": 3.7813096911377806, "grad_norm": 0.2306702584028244, 
"learning_rate": 7.408792004789616e-06, "loss": 0.3803, "step": 104920 }, { "epoch": 3.7814898907990053, "grad_norm": 0.25984832644462585, "learning_rate": 7.4067186494086425e-06, "loss": 0.4154, "step": 104925 }, { "epoch": 3.78167009046023, "grad_norm": 0.23578700423240662, "learning_rate": 7.404645533732729e-06, "loss": 0.3523, "step": 104930 }, { "epoch": 3.7818502901214543, "grad_norm": 0.2648398280143738, "learning_rate": 7.4025726577901135e-06, "loss": 0.3804, "step": 104935 }, { "epoch": 3.782030489782679, "grad_norm": 0.22717751562595367, "learning_rate": 7.400500021609038e-06, "loss": 0.376, "step": 104940 }, { "epoch": 3.782210689443904, "grad_norm": 0.2581784129142761, "learning_rate": 7.398427625217743e-06, "loss": 0.3915, "step": 104945 }, { "epoch": 3.7823908891051286, "grad_norm": 0.263120174407959, "learning_rate": 7.3963554686444556e-06, "loss": 0.4054, "step": 104950 }, { "epoch": 3.7825710887663533, "grad_norm": 0.24363566935062408, "learning_rate": 7.394283551917433e-06, "loss": 0.3712, "step": 104955 }, { "epoch": 3.782751288427578, "grad_norm": 0.2304638773202896, "learning_rate": 7.39221187506488e-06, "loss": 0.3958, "step": 104960 }, { "epoch": 3.7829314880888023, "grad_norm": 0.3181205093860626, "learning_rate": 7.3901404381150255e-06, "loss": 0.405, "step": 104965 }, { "epoch": 3.783111687750027, "grad_norm": 0.29112708568573, "learning_rate": 7.3880692410961045e-06, "loss": 0.38, "step": 104970 }, { "epoch": 3.7832918874112518, "grad_norm": 0.25521332025527954, "learning_rate": 7.385998284036322e-06, "loss": 0.4297, "step": 104975 }, { "epoch": 3.783472087072476, "grad_norm": 0.2132871150970459, "learning_rate": 7.383927566963919e-06, "loss": 0.3451, "step": 104980 }, { "epoch": 3.783652286733701, "grad_norm": 0.25532466173171997, "learning_rate": 7.381857089907082e-06, "loss": 0.4008, "step": 104985 }, { "epoch": 3.7838324863949255, "grad_norm": 0.28475886583328247, "learning_rate": 7.379786852894027e-06, "loss": 0.3948, "step": 104990 
}, { "epoch": 3.7840126860561503, "grad_norm": 0.24241603910923004, "learning_rate": 7.377716855952971e-06, "loss": 0.3718, "step": 104995 }, { "epoch": 3.784192885717375, "grad_norm": 0.22656336426734924, "learning_rate": 7.37564709911211e-06, "loss": 0.3624, "step": 105000 }, { "epoch": 3.784192885717375, "eval_loss": 0.4286513924598694, "eval_runtime": 3.5365, "eval_samples_per_second": 28.277, "eval_steps_per_second": 7.069, "step": 105000 }, { "epoch": 3.7843730853785997, "grad_norm": 0.2644498944282532, "learning_rate": 7.3735775823996465e-06, "loss": 0.387, "step": 105005 }, { "epoch": 3.784553285039824, "grad_norm": 0.2483258843421936, "learning_rate": 7.371508305843775e-06, "loss": 0.3835, "step": 105010 }, { "epoch": 3.7847334847010488, "grad_norm": 0.22739523649215698, "learning_rate": 7.3694392694726835e-06, "loss": 0.3869, "step": 105015 }, { "epoch": 3.7849136843622735, "grad_norm": 0.306366503238678, "learning_rate": 7.367370473314575e-06, "loss": 0.3759, "step": 105020 }, { "epoch": 3.785093884023498, "grad_norm": 0.21845044195652008, "learning_rate": 7.365301917397629e-06, "loss": 0.3692, "step": 105025 }, { "epoch": 3.7852740836847225, "grad_norm": 0.2691507041454315, "learning_rate": 7.363233601750033e-06, "loss": 0.3929, "step": 105030 }, { "epoch": 3.7854542833459472, "grad_norm": 0.23008035123348236, "learning_rate": 7.361165526399963e-06, "loss": 0.3841, "step": 105035 }, { "epoch": 3.785634483007172, "grad_norm": 0.21267521381378174, "learning_rate": 7.359097691375596e-06, "loss": 0.3342, "step": 105040 }, { "epoch": 3.7858146826683967, "grad_norm": 0.25716471672058105, "learning_rate": 7.357030096705103e-06, "loss": 0.4005, "step": 105045 }, { "epoch": 3.7859948823296214, "grad_norm": 0.25578397512435913, "learning_rate": 7.354962742416674e-06, "loss": 0.3783, "step": 105050 }, { "epoch": 3.7861750819908457, "grad_norm": 0.25349554419517517, "learning_rate": 7.352895628538445e-06, "loss": 0.3775, "step": 105055 }, { "epoch": 
3.7863552816520705, "grad_norm": 0.22552327811717987, "learning_rate": 7.350828755098604e-06, "loss": 0.3725, "step": 105060 }, { "epoch": 3.786535481313295, "grad_norm": 0.20090581476688385, "learning_rate": 7.348762122125305e-06, "loss": 0.3946, "step": 105065 }, { "epoch": 3.7867156809745195, "grad_norm": 0.24854734539985657, "learning_rate": 7.346695729646705e-06, "loss": 0.402, "step": 105070 }, { "epoch": 3.7868958806357442, "grad_norm": 0.2817821502685547, "learning_rate": 7.344629577690956e-06, "loss": 0.3533, "step": 105075 }, { "epoch": 3.787076080296969, "grad_norm": 0.24129050970077515, "learning_rate": 7.3425636662862115e-06, "loss": 0.3783, "step": 105080 }, { "epoch": 3.7872562799581937, "grad_norm": 0.2622377574443817, "learning_rate": 7.340497995460613e-06, "loss": 0.4121, "step": 105085 }, { "epoch": 3.7874364796194184, "grad_norm": 0.22572429478168488, "learning_rate": 7.338432565242314e-06, "loss": 0.3947, "step": 105090 }, { "epoch": 3.787616679280643, "grad_norm": 0.23703832924365997, "learning_rate": 7.3363673756594555e-06, "loss": 0.3583, "step": 105095 }, { "epoch": 3.7877968789418675, "grad_norm": 0.24465423822402954, "learning_rate": 7.3343024267401725e-06, "loss": 0.4008, "step": 105100 }, { "epoch": 3.787977078603092, "grad_norm": 0.22674879431724548, "learning_rate": 7.332237718512594e-06, "loss": 0.3801, "step": 105105 }, { "epoch": 3.788157278264317, "grad_norm": 0.24080802500247955, "learning_rate": 7.330173251004851e-06, "loss": 0.3732, "step": 105110 }, { "epoch": 3.7883374779255417, "grad_norm": 0.28059399127960205, "learning_rate": 7.328109024245086e-06, "loss": 0.4045, "step": 105115 }, { "epoch": 3.788517677586766, "grad_norm": 0.19681565463542938, "learning_rate": 7.326045038261411e-06, "loss": 0.3517, "step": 105120 }, { "epoch": 3.7886978772479907, "grad_norm": 0.2852540612220764, "learning_rate": 7.3239812930819524e-06, "loss": 0.4153, "step": 105125 }, { "epoch": 3.7888780769092154, "grad_norm": 0.28406253457069397, 
"learning_rate": 7.321917788734825e-06, "loss": 0.3899, "step": 105130 }, { "epoch": 3.78905827657044, "grad_norm": 0.2611198127269745, "learning_rate": 7.31985452524814e-06, "loss": 0.3949, "step": 105135 }, { "epoch": 3.789238476231665, "grad_norm": 0.24562934041023254, "learning_rate": 7.31779150265002e-06, "loss": 0.3793, "step": 105140 }, { "epoch": 3.789418675892889, "grad_norm": 0.27355340123176575, "learning_rate": 7.315728720968576e-06, "loss": 0.3941, "step": 105145 }, { "epoch": 3.789598875554114, "grad_norm": 0.20609021186828613, "learning_rate": 7.313666180231888e-06, "loss": 0.3649, "step": 105150 }, { "epoch": 3.7897790752153386, "grad_norm": 0.21863946318626404, "learning_rate": 7.311603880468082e-06, "loss": 0.4042, "step": 105155 }, { "epoch": 3.7899592748765634, "grad_norm": 0.22044524550437927, "learning_rate": 7.3095418217052405e-06, "loss": 0.3829, "step": 105160 }, { "epoch": 3.7901394745377877, "grad_norm": 0.23821088671684265, "learning_rate": 7.3074800039714844e-06, "loss": 0.3746, "step": 105165 }, { "epoch": 3.7903196741990124, "grad_norm": 0.21885710954666138, "learning_rate": 7.305418427294877e-06, "loss": 0.342, "step": 105170 }, { "epoch": 3.790499873860237, "grad_norm": 0.26055842638015747, "learning_rate": 7.303357091703511e-06, "loss": 0.3668, "step": 105175 }, { "epoch": 3.790680073521462, "grad_norm": 0.21390856802463531, "learning_rate": 7.301295997225488e-06, "loss": 0.3783, "step": 105180 }, { "epoch": 3.7908602731826866, "grad_norm": 0.20063291490077972, "learning_rate": 7.299235143888878e-06, "loss": 0.373, "step": 105185 }, { "epoch": 3.7910404728439113, "grad_norm": 0.19434435665607452, "learning_rate": 7.297174531721762e-06, "loss": 0.4074, "step": 105190 }, { "epoch": 3.7912206725051356, "grad_norm": 0.2075873613357544, "learning_rate": 7.295114160752217e-06, "loss": 0.3656, "step": 105195 }, { "epoch": 3.7914008721663603, "grad_norm": 0.2969796657562256, "learning_rate": 7.293054031008306e-06, "loss": 0.4043, "step": 
105200 }, { "epoch": 3.791581071827585, "grad_norm": 0.3087260127067566, "learning_rate": 7.290994142518115e-06, "loss": 0.3599, "step": 105205 }, { "epoch": 3.7917612714888094, "grad_norm": 0.2378859966993332, "learning_rate": 7.288934495309699e-06, "loss": 0.3884, "step": 105210 }, { "epoch": 3.791941471150034, "grad_norm": 0.292811781167984, "learning_rate": 7.286875089411119e-06, "loss": 0.3768, "step": 105215 }, { "epoch": 3.792121670811259, "grad_norm": 0.2711928188800812, "learning_rate": 7.284815924850441e-06, "loss": 0.3985, "step": 105220 }, { "epoch": 3.7923018704724836, "grad_norm": 0.23554451763629913, "learning_rate": 7.282757001655713e-06, "loss": 0.3652, "step": 105225 }, { "epoch": 3.7924820701337083, "grad_norm": 0.24152545630931854, "learning_rate": 7.280698319854984e-06, "loss": 0.4081, "step": 105230 }, { "epoch": 3.792662269794933, "grad_norm": 0.30207541584968567, "learning_rate": 7.2786398794763235e-06, "loss": 0.4096, "step": 105235 }, { "epoch": 3.7928424694561573, "grad_norm": 0.24256369471549988, "learning_rate": 7.27658168054775e-06, "loss": 0.3791, "step": 105240 }, { "epoch": 3.793022669117382, "grad_norm": 0.221241295337677, "learning_rate": 7.274523723097329e-06, "loss": 0.3753, "step": 105245 }, { "epoch": 3.793202868778607, "grad_norm": 0.2517562508583069, "learning_rate": 7.272466007153086e-06, "loss": 0.4043, "step": 105250 }, { "epoch": 3.793383068439831, "grad_norm": 0.2802235782146454, "learning_rate": 7.270408532743059e-06, "loss": 0.3501, "step": 105255 }, { "epoch": 3.793563268101056, "grad_norm": 0.28124722838401794, "learning_rate": 7.268351299895295e-06, "loss": 0.3683, "step": 105260 }, { "epoch": 3.7937434677622806, "grad_norm": 0.21457895636558533, "learning_rate": 7.266294308637805e-06, "loss": 0.3829, "step": 105265 }, { "epoch": 3.7939236674235053, "grad_norm": 0.2182459533214569, "learning_rate": 7.264237558998615e-06, "loss": 0.3997, "step": 105270 }, { "epoch": 3.79410386708473, "grad_norm": 
0.22549273073673248, "learning_rate": 7.262181051005762e-06, "loss": 0.3851, "step": 105275 }, { "epoch": 3.7942840667459548, "grad_norm": 0.23273131251335144, "learning_rate": 7.260124784687256e-06, "loss": 0.4087, "step": 105280 }, { "epoch": 3.794464266407179, "grad_norm": 0.22543184459209442, "learning_rate": 7.258068760071115e-06, "loss": 0.3824, "step": 105285 }, { "epoch": 3.7946444660684038, "grad_norm": 0.39207062125205994, "learning_rate": 7.256012977185356e-06, "loss": 0.4229, "step": 105290 }, { "epoch": 3.7948246657296285, "grad_norm": 0.24338999390602112, "learning_rate": 7.253957436057973e-06, "loss": 0.3539, "step": 105295 }, { "epoch": 3.795004865390853, "grad_norm": 0.20618541538715363, "learning_rate": 7.251902136716996e-06, "loss": 0.3645, "step": 105300 }, { "epoch": 3.7951850650520775, "grad_norm": 0.24109040200710297, "learning_rate": 7.249847079190414e-06, "loss": 0.3805, "step": 105305 }, { "epoch": 3.7953652647133023, "grad_norm": 0.2463836818933487, "learning_rate": 7.247792263506228e-06, "loss": 0.3682, "step": 105310 }, { "epoch": 3.795545464374527, "grad_norm": 0.23828503489494324, "learning_rate": 7.2457376896924365e-06, "loss": 0.3785, "step": 105315 }, { "epoch": 3.7957256640357517, "grad_norm": 0.2362687587738037, "learning_rate": 7.243683357777023e-06, "loss": 0.3743, "step": 105320 }, { "epoch": 3.7959058636969765, "grad_norm": 0.2598309814929962, "learning_rate": 7.2416292677879946e-06, "loss": 0.3675, "step": 105325 }, { "epoch": 3.7960860633582008, "grad_norm": 0.2267717868089676, "learning_rate": 7.239575419753339e-06, "loss": 0.3687, "step": 105330 }, { "epoch": 3.7962662630194255, "grad_norm": 0.20179912447929382, "learning_rate": 7.237521813701012e-06, "loss": 0.3649, "step": 105335 }, { "epoch": 3.7964464626806502, "grad_norm": 0.29818323254585266, "learning_rate": 7.235468449659019e-06, "loss": 0.3729, "step": 105340 }, { "epoch": 3.796626662341875, "grad_norm": 0.26517441868782043, "learning_rate": 
7.233415327655321e-06, "loss": 0.3762, "step": 105345 }, { "epoch": 3.7968068620030992, "grad_norm": 0.22521330416202545, "learning_rate": 7.231362447717915e-06, "loss": 0.3459, "step": 105350 }, { "epoch": 3.796987061664324, "grad_norm": 0.20847706496715546, "learning_rate": 7.229309809874749e-06, "loss": 0.3975, "step": 105355 }, { "epoch": 3.7971672613255487, "grad_norm": 0.22983822226524353, "learning_rate": 7.22725741415379e-06, "loss": 0.3864, "step": 105360 }, { "epoch": 3.7973474609867734, "grad_norm": 0.24204209446907043, "learning_rate": 7.225205260583013e-06, "loss": 0.3853, "step": 105365 }, { "epoch": 3.797527660647998, "grad_norm": 0.3181317448616028, "learning_rate": 7.223153349190373e-06, "loss": 0.388, "step": 105370 }, { "epoch": 3.7977078603092225, "grad_norm": 0.22117455303668976, "learning_rate": 7.221101680003828e-06, "loss": 0.4012, "step": 105375 }, { "epoch": 3.797888059970447, "grad_norm": 0.1853034645318985, "learning_rate": 7.219050253051329e-06, "loss": 0.3584, "step": 105380 }, { "epoch": 3.798068259631672, "grad_norm": 0.20408424735069275, "learning_rate": 7.216999068360822e-06, "loss": 0.3564, "step": 105385 }, { "epoch": 3.7982484592928967, "grad_norm": 0.2169559895992279, "learning_rate": 7.214948125960266e-06, "loss": 0.3362, "step": 105390 }, { "epoch": 3.798428658954121, "grad_norm": 0.2969491481781006, "learning_rate": 7.212897425877599e-06, "loss": 0.3837, "step": 105395 }, { "epoch": 3.7986088586153457, "grad_norm": 0.2599042057991028, "learning_rate": 7.2108469681407605e-06, "loss": 0.3692, "step": 105400 }, { "epoch": 3.7987890582765704, "grad_norm": 0.19389642775058746, "learning_rate": 7.208796752777691e-06, "loss": 0.356, "step": 105405 }, { "epoch": 3.798969257937795, "grad_norm": 0.22755640745162964, "learning_rate": 7.206746779816317e-06, "loss": 0.4012, "step": 105410 }, { "epoch": 3.79914945759902, "grad_norm": 0.25583744049072266, "learning_rate": 7.204697049284567e-06, "loss": 0.3835, "step": 105415 }, { "epoch": 
3.799329657260244, "grad_norm": 0.26095953583717346, "learning_rate": 7.202647561210382e-06, "loss": 0.373, "step": 105420 }, { "epoch": 3.799509856921469, "grad_norm": 0.26644977927207947, "learning_rate": 7.200598315621679e-06, "loss": 0.4386, "step": 105425 }, { "epoch": 3.7996900565826937, "grad_norm": 0.22700461745262146, "learning_rate": 7.198549312546379e-06, "loss": 0.4308, "step": 105430 }, { "epoch": 3.7998702562439184, "grad_norm": 0.20378275215625763, "learning_rate": 7.196500552012397e-06, "loss": 0.3812, "step": 105435 }, { "epoch": 3.8000504559051427, "grad_norm": 0.2918340861797333, "learning_rate": 7.194452034047639e-06, "loss": 0.3411, "step": 105440 }, { "epoch": 3.8002306555663674, "grad_norm": 0.19135360419750214, "learning_rate": 7.1924037586800415e-06, "loss": 0.3652, "step": 105445 }, { "epoch": 3.800410855227592, "grad_norm": 0.24061061441898346, "learning_rate": 7.190355725937487e-06, "loss": 0.3725, "step": 105450 }, { "epoch": 3.800591054888817, "grad_norm": 0.27103692293167114, "learning_rate": 7.1883079358478825e-06, "loss": 0.3687, "step": 105455 }, { "epoch": 3.8007712545500416, "grad_norm": 0.2294936627149582, "learning_rate": 7.186260388439137e-06, "loss": 0.3556, "step": 105460 }, { "epoch": 3.8009514542112663, "grad_norm": 0.2741897702217102, "learning_rate": 7.184213083739147e-06, "loss": 0.3703, "step": 105465 }, { "epoch": 3.8011316538724906, "grad_norm": 0.2626726031303406, "learning_rate": 7.182166021775805e-06, "loss": 0.379, "step": 105470 }, { "epoch": 3.8013118535337154, "grad_norm": 0.292074054479599, "learning_rate": 7.1801192025770015e-06, "loss": 0.3627, "step": 105475 }, { "epoch": 3.80149205319494, "grad_norm": 0.25104910135269165, "learning_rate": 7.178072626170615e-06, "loss": 0.3784, "step": 105480 }, { "epoch": 3.8016722528561644, "grad_norm": 0.24408739805221558, "learning_rate": 7.176026292584548e-06, "loss": 0.3792, "step": 105485 }, { "epoch": 3.801852452517389, "grad_norm": 0.238882914185524, 
"learning_rate": 7.1739802018466695e-06, "loss": 0.3824, "step": 105490 }, { "epoch": 3.802032652178614, "grad_norm": 0.22461140155792236, "learning_rate": 7.171934353984863e-06, "loss": 0.3616, "step": 105495 }, { "epoch": 3.8022128518398386, "grad_norm": 0.2754288911819458, "learning_rate": 7.169888749026995e-06, "loss": 0.3749, "step": 105500 }, { "epoch": 3.8022128518398386, "eval_loss": 0.4286453127861023, "eval_runtime": 3.5307, "eval_samples_per_second": 28.323, "eval_steps_per_second": 7.081, "step": 105500 }, { "epoch": 3.8023930515010633, "grad_norm": 0.1857483983039856, "learning_rate": 7.167843387000936e-06, "loss": 0.4028, "step": 105505 }, { "epoch": 3.802573251162288, "grad_norm": 0.18854497373104095, "learning_rate": 7.165798267934565e-06, "loss": 0.3894, "step": 105510 }, { "epoch": 3.8027534508235123, "grad_norm": 0.2537713050842285, "learning_rate": 7.163753391855749e-06, "loss": 0.3789, "step": 105515 }, { "epoch": 3.802933650484737, "grad_norm": 0.2627655565738678, "learning_rate": 7.161708758792324e-06, "loss": 0.3729, "step": 105520 }, { "epoch": 3.803113850145962, "grad_norm": 0.2551818788051605, "learning_rate": 7.15966436877217e-06, "loss": 0.3955, "step": 105525 }, { "epoch": 3.803294049807186, "grad_norm": 0.2461671382188797, "learning_rate": 7.157620221823127e-06, "loss": 0.3978, "step": 105530 }, { "epoch": 3.803474249468411, "grad_norm": 0.29957154393196106, "learning_rate": 7.155576317973061e-06, "loss": 0.4128, "step": 105535 }, { "epoch": 3.8036544491296356, "grad_norm": 0.2094818353652954, "learning_rate": 7.153532657249823e-06, "loss": 0.3306, "step": 105540 }, { "epoch": 3.8038346487908603, "grad_norm": 0.21276958286762238, "learning_rate": 7.1514892396812335e-06, "loss": 0.3973, "step": 105545 }, { "epoch": 3.804014848452085, "grad_norm": 0.2489434778690338, "learning_rate": 7.149446065295151e-06, "loss": 0.3623, "step": 105550 }, { "epoch": 3.8041950481133098, "grad_norm": 0.2391531616449356, "learning_rate": 
7.147403134119412e-06, "loss": 0.3777, "step": 105555 }, { "epoch": 3.804375247774534, "grad_norm": 0.2393515408039093, "learning_rate": 7.145360446181848e-06, "loss": 0.3775, "step": 105560 }, { "epoch": 3.804555447435759, "grad_norm": 0.26830413937568665, "learning_rate": 7.1433180015102936e-06, "loss": 0.3694, "step": 105565 }, { "epoch": 3.8047356470969835, "grad_norm": 0.3024479150772095, "learning_rate": 7.141275800132563e-06, "loss": 0.3865, "step": 105570 }, { "epoch": 3.804915846758208, "grad_norm": 0.21776083111763, "learning_rate": 7.1392338420765005e-06, "loss": 0.3866, "step": 105575 }, { "epoch": 3.8050960464194326, "grad_norm": 0.24332380294799805, "learning_rate": 7.137192127369921e-06, "loss": 0.3977, "step": 105580 }, { "epoch": 3.8052762460806573, "grad_norm": 0.1978878378868103, "learning_rate": 7.13515065604064e-06, "loss": 0.3775, "step": 105585 }, { "epoch": 3.805456445741882, "grad_norm": 0.22366763651371002, "learning_rate": 7.1331094281164715e-06, "loss": 0.3749, "step": 105590 }, { "epoch": 3.8056366454031068, "grad_norm": 0.20870821177959442, "learning_rate": 7.13106844362523e-06, "loss": 0.4025, "step": 105595 }, { "epoch": 3.8058168450643315, "grad_norm": 0.2318304181098938, "learning_rate": 7.129027702594713e-06, "loss": 0.4165, "step": 105600 }, { "epoch": 3.8059970447255558, "grad_norm": 0.22088027000427246, "learning_rate": 7.126987205052738e-06, "loss": 0.3893, "step": 105605 }, { "epoch": 3.8061772443867805, "grad_norm": 0.27171891927719116, "learning_rate": 7.124946951027103e-06, "loss": 0.3769, "step": 105610 }, { "epoch": 3.8063574440480052, "grad_norm": 0.21456721425056458, "learning_rate": 7.12290694054561e-06, "loss": 0.3962, "step": 105615 }, { "epoch": 3.80653764370923, "grad_norm": 0.2533538043498993, "learning_rate": 7.120867173636042e-06, "loss": 0.3834, "step": 105620 }, { "epoch": 3.8067178433704543, "grad_norm": 0.2441474199295044, "learning_rate": 7.118827650326193e-06, "loss": 0.3718, "step": 105625 }, { "epoch": 
3.806898043031679, "grad_norm": 0.2645309865474701, "learning_rate": 7.116788370643873e-06, "loss": 0.3572, "step": 105630 }, { "epoch": 3.8070782426929037, "grad_norm": 0.2198515087366104, "learning_rate": 7.1147493346168385e-06, "loss": 0.3757, "step": 105635 }, { "epoch": 3.8072584423541285, "grad_norm": 0.23640519380569458, "learning_rate": 7.112710542272874e-06, "loss": 0.3703, "step": 105640 }, { "epoch": 3.807438642015353, "grad_norm": 0.28243306279182434, "learning_rate": 7.110671993639772e-06, "loss": 0.427, "step": 105645 }, { "epoch": 3.8076188416765775, "grad_norm": 0.2905183434486389, "learning_rate": 7.108633688745303e-06, "loss": 0.4278, "step": 105650 }, { "epoch": 3.8077990413378022, "grad_norm": 0.2647572457790375, "learning_rate": 7.106595627617235e-06, "loss": 0.3962, "step": 105655 }, { "epoch": 3.807979240999027, "grad_norm": 0.22848838567733765, "learning_rate": 7.104557810283338e-06, "loss": 0.3891, "step": 105660 }, { "epoch": 3.8081594406602517, "grad_norm": 0.23743629455566406, "learning_rate": 7.102520236771368e-06, "loss": 0.3594, "step": 105665 }, { "epoch": 3.808339640321476, "grad_norm": 0.2258959710597992, "learning_rate": 7.100482907109102e-06, "loss": 0.3635, "step": 105670 }, { "epoch": 3.8085198399827007, "grad_norm": 0.2394886165857315, "learning_rate": 7.098445821324293e-06, "loss": 0.3791, "step": 105675 }, { "epoch": 3.8087000396439255, "grad_norm": 0.2250211089849472, "learning_rate": 7.096408979444691e-06, "loss": 0.403, "step": 105680 }, { "epoch": 3.80888023930515, "grad_norm": 0.1884530633687973, "learning_rate": 7.094372381498052e-06, "loss": 0.3308, "step": 105685 }, { "epoch": 3.809060438966375, "grad_norm": 0.24882015585899353, "learning_rate": 7.092336027512115e-06, "loss": 0.3892, "step": 105690 }, { "epoch": 3.8092406386275997, "grad_norm": 0.21837691962718964, "learning_rate": 7.0902999175146396e-06, "loss": 0.3885, "step": 105695 }, { "epoch": 3.809420838288824, "grad_norm": 0.27154818177223206, 
"learning_rate": 7.0882640515333585e-06, "loss": 0.3667, "step": 105700 }, { "epoch": 3.8096010379500487, "grad_norm": 0.19823803007602692, "learning_rate": 7.0862284295960144e-06, "loss": 0.3472, "step": 105705 }, { "epoch": 3.8097812376112734, "grad_norm": 0.2487536519765854, "learning_rate": 7.08419305173034e-06, "loss": 0.3867, "step": 105710 }, { "epoch": 3.8099614372724977, "grad_norm": 0.2396322339773178, "learning_rate": 7.082157917964058e-06, "loss": 0.3602, "step": 105715 }, { "epoch": 3.8101416369337224, "grad_norm": 0.22832445800304413, "learning_rate": 7.0801230283249145e-06, "loss": 0.3922, "step": 105720 }, { "epoch": 3.810321836594947, "grad_norm": 0.22182156145572662, "learning_rate": 7.078088382840631e-06, "loss": 0.3613, "step": 105725 }, { "epoch": 3.810502036256172, "grad_norm": 0.26968759298324585, "learning_rate": 7.0760539815389075e-06, "loss": 0.4139, "step": 105730 }, { "epoch": 3.8106822359173966, "grad_norm": 0.19509482383728027, "learning_rate": 7.0740198244474895e-06, "loss": 0.356, "step": 105735 }, { "epoch": 3.8108624355786214, "grad_norm": 0.24696923792362213, "learning_rate": 7.071985911594078e-06, "loss": 0.3766, "step": 105740 }, { "epoch": 3.8110426352398457, "grad_norm": 0.2208133488893509, "learning_rate": 7.069952243006389e-06, "loss": 0.3709, "step": 105745 }, { "epoch": 3.8112228349010704, "grad_norm": 0.2347523719072342, "learning_rate": 7.067918818712127e-06, "loss": 0.3837, "step": 105750 }, { "epoch": 3.811403034562295, "grad_norm": 0.25173941254615784, "learning_rate": 7.065885638738995e-06, "loss": 0.4095, "step": 105755 }, { "epoch": 3.8115832342235194, "grad_norm": 0.19811011850833893, "learning_rate": 7.063852703114704e-06, "loss": 0.4196, "step": 105760 }, { "epoch": 3.811763433884744, "grad_norm": 0.22163262963294983, "learning_rate": 7.061820011866949e-06, "loss": 0.3861, "step": 105765 }, { "epoch": 3.811943633545969, "grad_norm": 0.21935562789440155, "learning_rate": 7.059787565023421e-06, "loss": 0.3796, 
"step": 105770 }, { "epoch": 3.8121238332071936, "grad_norm": 0.1883041262626648, "learning_rate": 7.0577553626118145e-06, "loss": 0.3783, "step": 105775 }, { "epoch": 3.8123040328684183, "grad_norm": 0.2626688778400421, "learning_rate": 7.05572340465982e-06, "loss": 0.4043, "step": 105780 }, { "epoch": 3.812484232529643, "grad_norm": 0.2265564501285553, "learning_rate": 7.053691691195111e-06, "loss": 0.3838, "step": 105785 }, { "epoch": 3.8126644321908674, "grad_norm": 0.3275195360183716, "learning_rate": 7.051660222245388e-06, "loss": 0.3578, "step": 105790 }, { "epoch": 3.812844631852092, "grad_norm": 0.24849045276641846, "learning_rate": 7.049628997838315e-06, "loss": 0.3943, "step": 105795 }, { "epoch": 3.813024831513317, "grad_norm": 0.22452738881111145, "learning_rate": 7.047598018001575e-06, "loss": 0.3827, "step": 105800 }, { "epoch": 3.813205031174541, "grad_norm": 0.2721640467643738, "learning_rate": 7.045567282762836e-06, "loss": 0.3972, "step": 105805 }, { "epoch": 3.813385230835766, "grad_norm": 0.22152206301689148, "learning_rate": 7.043536792149757e-06, "loss": 0.354, "step": 105810 }, { "epoch": 3.8135654304969906, "grad_norm": 0.19255799055099487, "learning_rate": 7.04150654619003e-06, "loss": 0.3711, "step": 105815 }, { "epoch": 3.8137456301582153, "grad_norm": 0.2830601930618286, "learning_rate": 7.039476544911291e-06, "loss": 0.38, "step": 105820 }, { "epoch": 3.81392582981944, "grad_norm": 0.20766660571098328, "learning_rate": 7.037446788341198e-06, "loss": 0.3799, "step": 105825 }, { "epoch": 3.814106029480665, "grad_norm": 0.2430235594511032, "learning_rate": 7.035417276507425e-06, "loss": 0.3873, "step": 105830 }, { "epoch": 3.814286229141889, "grad_norm": 0.29118457436561584, "learning_rate": 7.0333880094376055e-06, "loss": 0.3836, "step": 105835 }, { "epoch": 3.814466428803114, "grad_norm": 0.20183739066123962, "learning_rate": 7.031358987159409e-06, "loss": 0.3727, "step": 105840 }, { "epoch": 3.8146466284643386, "grad_norm": 
0.22086220979690552, "learning_rate": 7.029330209700463e-06, "loss": 0.3697, "step": 105845 }, { "epoch": 3.8148268281255633, "grad_norm": 0.24414370954036713, "learning_rate": 7.0273016770884045e-06, "loss": 0.3763, "step": 105850 }, { "epoch": 3.8150070277867876, "grad_norm": 0.20783692598342896, "learning_rate": 7.025273389350886e-06, "loss": 0.3389, "step": 105855 }, { "epoch": 3.8151872274480123, "grad_norm": 0.29034414887428284, "learning_rate": 7.023245346515541e-06, "loss": 0.4052, "step": 105860 }, { "epoch": 3.815367427109237, "grad_norm": 0.27145785093307495, "learning_rate": 7.021217548609999e-06, "loss": 0.3902, "step": 105865 }, { "epoch": 3.8155476267704618, "grad_norm": 0.2717854976654053, "learning_rate": 7.019189995661884e-06, "loss": 0.374, "step": 105870 }, { "epoch": 3.8157278264316865, "grad_norm": 0.30218860507011414, "learning_rate": 7.017162687698817e-06, "loss": 0.3719, "step": 105875 }, { "epoch": 3.815908026092911, "grad_norm": 0.2221565693616867, "learning_rate": 7.015135624748434e-06, "loss": 0.3678, "step": 105880 }, { "epoch": 3.8160882257541355, "grad_norm": 0.2444906085729599, "learning_rate": 7.013108806838348e-06, "loss": 0.4113, "step": 105885 }, { "epoch": 3.8162684254153603, "grad_norm": 0.2825545370578766, "learning_rate": 7.011082233996169e-06, "loss": 0.3969, "step": 105890 }, { "epoch": 3.816448625076585, "grad_norm": 0.226121723651886, "learning_rate": 7.009055906249515e-06, "loss": 0.3798, "step": 105895 }, { "epoch": 3.8166288247378093, "grad_norm": 0.2496255487203598, "learning_rate": 7.007029823625982e-06, "loss": 0.3724, "step": 105900 }, { "epoch": 3.816809024399034, "grad_norm": 0.2551683187484741, "learning_rate": 7.0050039861531915e-06, "loss": 0.3912, "step": 105905 }, { "epoch": 3.8169892240602588, "grad_norm": 0.24302713572978973, "learning_rate": 7.002978393858747e-06, "loss": 0.376, "step": 105910 }, { "epoch": 3.8171694237214835, "grad_norm": 0.25622105598449707, "learning_rate": 7.00095304677022e-06, 
"loss": 0.374, "step": 105915 }, { "epoch": 3.8173496233827082, "grad_norm": 0.23344071209430695, "learning_rate": 6.9989279449152316e-06, "loss": 0.3773, "step": 105920 }, { "epoch": 3.8175298230439325, "grad_norm": 0.24558846652507782, "learning_rate": 6.996903088321366e-06, "loss": 0.3956, "step": 105925 }, { "epoch": 3.8177100227051572, "grad_norm": 0.2149125039577484, "learning_rate": 6.994878477016209e-06, "loss": 0.3839, "step": 105930 }, { "epoch": 3.817890222366382, "grad_norm": 0.2317209243774414, "learning_rate": 6.992854111027347e-06, "loss": 0.403, "step": 105935 }, { "epoch": 3.8180704220276067, "grad_norm": 0.29228413105010986, "learning_rate": 6.9908299903823555e-06, "loss": 0.3927, "step": 105940 }, { "epoch": 3.818250621688831, "grad_norm": 0.20377010107040405, "learning_rate": 6.988806115108826e-06, "loss": 0.3415, "step": 105945 }, { "epoch": 3.8184308213500557, "grad_norm": 0.24830646812915802, "learning_rate": 6.986782485234322e-06, "loss": 0.38, "step": 105950 }, { "epoch": 3.8186110210112805, "grad_norm": 0.2752242982387543, "learning_rate": 6.9847591007864225e-06, "loss": 0.3827, "step": 105955 }, { "epoch": 3.818791220672505, "grad_norm": 0.2355724275112152, "learning_rate": 6.9827359617926945e-06, "loss": 0.3878, "step": 105960 }, { "epoch": 3.81897142033373, "grad_norm": 0.21855174005031586, "learning_rate": 6.980713068280698e-06, "loss": 0.3636, "step": 105965 }, { "epoch": 3.8191516199949547, "grad_norm": 0.24916981160640717, "learning_rate": 6.978690420277989e-06, "loss": 0.3912, "step": 105970 }, { "epoch": 3.819331819656179, "grad_norm": 0.25882112979888916, "learning_rate": 6.976668017812144e-06, "loss": 0.4247, "step": 105975 }, { "epoch": 3.8195120193174037, "grad_norm": 0.24076975882053375, "learning_rate": 6.974645860910706e-06, "loss": 0.3626, "step": 105980 }, { "epoch": 3.8196922189786284, "grad_norm": 0.25258931517601013, "learning_rate": 6.9726239496012285e-06, "loss": 0.3866, "step": 105985 }, { "epoch": 
3.8198724186398527, "grad_norm": 0.24138487875461578, "learning_rate": 6.97060228391126e-06, "loss": 0.3732, "step": 105990 }, { "epoch": 3.8200526183010775, "grad_norm": 0.23699279129505157, "learning_rate": 6.968580863868334e-06, "loss": 0.3936, "step": 105995 }, { "epoch": 3.820232817962302, "grad_norm": 0.28243473172187805, "learning_rate": 6.9665596895000155e-06, "loss": 0.3651, "step": 106000 }, { "epoch": 3.820232817962302, "eval_loss": 0.42869871854782104, "eval_runtime": 3.5415, "eval_samples_per_second": 28.237, "eval_steps_per_second": 7.059, "step": 106000 }, { "epoch": 3.820413017623527, "grad_norm": 0.2416362762451172, "learning_rate": 6.964538760833836e-06, "loss": 0.3882, "step": 106005 }, { "epoch": 3.8205932172847517, "grad_norm": 0.2267039567232132, "learning_rate": 6.962518077897306e-06, "loss": 0.392, "step": 106010 }, { "epoch": 3.8207734169459764, "grad_norm": 0.24580471217632294, "learning_rate": 6.960497640717986e-06, "loss": 0.3695, "step": 106015 }, { "epoch": 3.8209536166072007, "grad_norm": 0.23100297152996063, "learning_rate": 6.958477449323384e-06, "loss": 0.389, "step": 106020 }, { "epoch": 3.8211338162684254, "grad_norm": 0.25452324748039246, "learning_rate": 6.956457503741049e-06, "loss": 0.3714, "step": 106025 }, { "epoch": 3.82131401592965, "grad_norm": 0.20147554576396942, "learning_rate": 6.954437803998479e-06, "loss": 0.3673, "step": 106030 }, { "epoch": 3.8214942155908744, "grad_norm": 0.26207849383354187, "learning_rate": 6.952418350123194e-06, "loss": 0.3893, "step": 106035 }, { "epoch": 3.821674415252099, "grad_norm": 0.25192660093307495, "learning_rate": 6.950399142142722e-06, "loss": 0.4313, "step": 106040 }, { "epoch": 3.821854614913324, "grad_norm": 0.2420777529478073, "learning_rate": 6.9483801800845685e-06, "loss": 0.3608, "step": 106045 }, { "epoch": 3.8220348145745486, "grad_norm": 0.19895605742931366, "learning_rate": 6.9463614639762366e-06, "loss": 0.3602, "step": 106050 }, { "epoch": 3.8222150142357734, 
"grad_norm": 0.3234860599040985, "learning_rate": 6.944342993845237e-06, "loss": 0.3955, "step": 106055 }, { "epoch": 3.822395213896998, "grad_norm": 0.23543211817741394, "learning_rate": 6.942324769719061e-06, "loss": 0.3795, "step": 106060 }, { "epoch": 3.8225754135582224, "grad_norm": 0.18973630666732788, "learning_rate": 6.9403067916252205e-06, "loss": 0.3613, "step": 106065 }, { "epoch": 3.822755613219447, "grad_norm": 0.242222398519516, "learning_rate": 6.938289059591205e-06, "loss": 0.3813, "step": 106070 }, { "epoch": 3.822935812880672, "grad_norm": 0.23931699991226196, "learning_rate": 6.936271573644501e-06, "loss": 0.3862, "step": 106075 }, { "epoch": 3.823116012541896, "grad_norm": 0.23971091210842133, "learning_rate": 6.934254333812601e-06, "loss": 0.4044, "step": 106080 }, { "epoch": 3.823296212203121, "grad_norm": 0.26793283224105835, "learning_rate": 6.932237340122982e-06, "loss": 0.4126, "step": 106085 }, { "epoch": 3.8234764118643456, "grad_norm": 0.24026966094970703, "learning_rate": 6.930220592603137e-06, "loss": 0.4094, "step": 106090 }, { "epoch": 3.8236566115255703, "grad_norm": 0.24495449662208557, "learning_rate": 6.928204091280549e-06, "loss": 0.4011, "step": 106095 }, { "epoch": 3.823836811186795, "grad_norm": 0.20799663662910461, "learning_rate": 6.926187836182663e-06, "loss": 0.3974, "step": 106100 }, { "epoch": 3.82401701084802, "grad_norm": 0.2459585964679718, "learning_rate": 6.924171827336975e-06, "loss": 0.3619, "step": 106105 }, { "epoch": 3.824197210509244, "grad_norm": 0.2440251260995865, "learning_rate": 6.9221560647709485e-06, "loss": 0.3862, "step": 106110 }, { "epoch": 3.824377410170469, "grad_norm": 0.29908034205436707, "learning_rate": 6.920140548512038e-06, "loss": 0.4097, "step": 106115 }, { "epoch": 3.8245576098316936, "grad_norm": 0.304340660572052, "learning_rate": 6.9181252785877285e-06, "loss": 0.3573, "step": 106120 }, { "epoch": 3.8247378094929183, "grad_norm": 0.20545925199985504, "learning_rate": 
6.916110255025443e-06, "loss": 0.365, "step": 106125 }, { "epoch": 3.8249180091541426, "grad_norm": 0.20274123549461365, "learning_rate": 6.914095477852664e-06, "loss": 0.3642, "step": 106130 }, { "epoch": 3.8250982088153673, "grad_norm": 0.21836356818675995, "learning_rate": 6.912080947096833e-06, "loss": 0.3782, "step": 106135 }, { "epoch": 3.825278408476592, "grad_norm": 0.21938830614089966, "learning_rate": 6.910066662785394e-06, "loss": 0.389, "step": 106140 }, { "epoch": 3.825458608137817, "grad_norm": 0.22280332446098328, "learning_rate": 6.908052624945796e-06, "loss": 0.3447, "step": 106145 }, { "epoch": 3.8256388077990415, "grad_norm": 0.25814318656921387, "learning_rate": 6.90603883360548e-06, "loss": 0.3924, "step": 106150 }, { "epoch": 3.825819007460266, "grad_norm": 0.28167471289634705, "learning_rate": 6.904025288791874e-06, "loss": 0.3889, "step": 106155 }, { "epoch": 3.8259992071214906, "grad_norm": 0.21864792704582214, "learning_rate": 6.902011990532425e-06, "loss": 0.3953, "step": 106160 }, { "epoch": 3.8261794067827153, "grad_norm": 0.23761038482189178, "learning_rate": 6.89999893885456e-06, "loss": 0.3828, "step": 106165 }, { "epoch": 3.82635960644394, "grad_norm": 0.23446708917617798, "learning_rate": 6.8979861337857055e-06, "loss": 0.3481, "step": 106170 }, { "epoch": 3.8265398061051643, "grad_norm": 0.25455090403556824, "learning_rate": 6.895973575353287e-06, "loss": 0.38, "step": 106175 }, { "epoch": 3.826720005766389, "grad_norm": 0.2646729648113251, "learning_rate": 6.893961263584714e-06, "loss": 0.3828, "step": 106180 }, { "epoch": 3.8269002054276138, "grad_norm": 0.25168514251708984, "learning_rate": 6.891949198507419e-06, "loss": 0.376, "step": 106185 }, { "epoch": 3.8270804050888385, "grad_norm": 0.24938587844371796, "learning_rate": 6.88993738014882e-06, "loss": 0.3898, "step": 106190 }, { "epoch": 3.8272606047500632, "grad_norm": 0.24917519092559814, "learning_rate": 6.8879258085363025e-06, "loss": 0.3947, "step": 106195 }, { 
"epoch": 3.827440804411288, "grad_norm": 0.2233772575855255, "learning_rate": 6.8859144836972976e-06, "loss": 0.3719, "step": 106200 }, { "epoch": 3.8276210040725123, "grad_norm": 0.2717728018760681, "learning_rate": 6.883903405659192e-06, "loss": 0.3885, "step": 106205 }, { "epoch": 3.827801203733737, "grad_norm": 0.24201686680316925, "learning_rate": 6.881892574449411e-06, "loss": 0.3567, "step": 106210 }, { "epoch": 3.8279814033949617, "grad_norm": 0.22028078138828278, "learning_rate": 6.87988199009533e-06, "loss": 0.3652, "step": 106215 }, { "epoch": 3.828161603056186, "grad_norm": 0.21828703582286835, "learning_rate": 6.87787165262434e-06, "loss": 0.3752, "step": 106220 }, { "epoch": 3.8283418027174108, "grad_norm": 0.22384034097194672, "learning_rate": 6.875861562063846e-06, "loss": 0.387, "step": 106225 }, { "epoch": 3.8285220023786355, "grad_norm": 0.38978999853134155, "learning_rate": 6.873851718441232e-06, "loss": 0.3725, "step": 106230 }, { "epoch": 3.8287022020398602, "grad_norm": 0.30686643719673157, "learning_rate": 6.8718421217838754e-06, "loss": 0.3894, "step": 106235 }, { "epoch": 3.828882401701085, "grad_norm": 0.24971897900104523, "learning_rate": 6.869832772119164e-06, "loss": 0.3785, "step": 106240 }, { "epoch": 3.8290626013623097, "grad_norm": 0.26769372820854187, "learning_rate": 6.86782366947446e-06, "loss": 0.3451, "step": 106245 }, { "epoch": 3.829242801023534, "grad_norm": 0.30026867985725403, "learning_rate": 6.865814813877158e-06, "loss": 0.4113, "step": 106250 }, { "epoch": 3.8294230006847587, "grad_norm": 0.2826428711414337, "learning_rate": 6.863806205354617e-06, "loss": 0.4001, "step": 106255 }, { "epoch": 3.8296032003459834, "grad_norm": 0.24307632446289062, "learning_rate": 6.861797843934206e-06, "loss": 0.3537, "step": 106260 }, { "epoch": 3.8297834000072077, "grad_norm": 0.277296245098114, "learning_rate": 6.859789729643287e-06, "loss": 0.3807, "step": 106265 }, { "epoch": 3.8299635996684325, "grad_norm": 0.2420312762260437, 
"learning_rate": 6.857781862509221e-06, "loss": 0.3795, "step": 106270 }, { "epoch": 3.830143799329657, "grad_norm": 0.17696630954742432, "learning_rate": 6.855774242559359e-06, "loss": 0.3511, "step": 106275 }, { "epoch": 3.830323998990882, "grad_norm": 0.26137152314186096, "learning_rate": 6.853766869821066e-06, "loss": 0.3418, "step": 106280 }, { "epoch": 3.8305041986521067, "grad_norm": 0.21335354447364807, "learning_rate": 6.851759744321687e-06, "loss": 0.3742, "step": 106285 }, { "epoch": 3.8306843983133314, "grad_norm": 0.23139753937721252, "learning_rate": 6.849752866088566e-06, "loss": 0.3503, "step": 106290 }, { "epoch": 3.8308645979745557, "grad_norm": 0.21770185232162476, "learning_rate": 6.84774623514905e-06, "loss": 0.3839, "step": 106295 }, { "epoch": 3.8310447976357804, "grad_norm": 0.2551557719707489, "learning_rate": 6.845739851530469e-06, "loss": 0.3913, "step": 106300 }, { "epoch": 3.831224997297005, "grad_norm": 0.2286597192287445, "learning_rate": 6.843733715260181e-06, "loss": 0.413, "step": 106305 }, { "epoch": 3.8314051969582295, "grad_norm": 0.22169090807437897, "learning_rate": 6.841727826365493e-06, "loss": 0.379, "step": 106310 }, { "epoch": 3.831585396619454, "grad_norm": 0.240523561835289, "learning_rate": 6.839722184873757e-06, "loss": 0.3675, "step": 106315 }, { "epoch": 3.831765596280679, "grad_norm": 0.2571893632411957, "learning_rate": 6.8377167908122876e-06, "loss": 0.382, "step": 106320 }, { "epoch": 3.8319457959419037, "grad_norm": 0.24433764815330505, "learning_rate": 6.8357116442084115e-06, "loss": 0.3755, "step": 106325 }, { "epoch": 3.8321259956031284, "grad_norm": 0.1973889172077179, "learning_rate": 6.833706745089446e-06, "loss": 0.3574, "step": 106330 }, { "epoch": 3.832306195264353, "grad_norm": 0.28018200397491455, "learning_rate": 6.831702093482711e-06, "loss": 0.3909, "step": 106335 }, { "epoch": 3.8324863949255774, "grad_norm": 0.2281164973974228, "learning_rate": 6.82969768941551e-06, "loss": 0.368, "step": 106340 
}, { "epoch": 3.832666594586802, "grad_norm": 0.25604715943336487, "learning_rate": 6.827693532915166e-06, "loss": 0.3783, "step": 106345 }, { "epoch": 3.832846794248027, "grad_norm": 0.21953798830509186, "learning_rate": 6.82568962400898e-06, "loss": 0.377, "step": 106350 }, { "epoch": 3.8330269939092516, "grad_norm": 0.22131898999214172, "learning_rate": 6.823685962724255e-06, "loss": 0.3739, "step": 106355 }, { "epoch": 3.833207193570476, "grad_norm": 0.3169558048248291, "learning_rate": 6.8216825490882875e-06, "loss": 0.4028, "step": 106360 }, { "epoch": 3.8333873932317006, "grad_norm": 0.2238478660583496, "learning_rate": 6.819679383128372e-06, "loss": 0.4006, "step": 106365 }, { "epoch": 3.8335675928929254, "grad_norm": 0.28951504826545715, "learning_rate": 6.817676464871808e-06, "loss": 0.361, "step": 106370 }, { "epoch": 3.83374779255415, "grad_norm": 0.2536328136920929, "learning_rate": 6.815673794345895e-06, "loss": 0.3859, "step": 106375 }, { "epoch": 3.833927992215375, "grad_norm": 0.2552483379840851, "learning_rate": 6.8136713715778875e-06, "loss": 0.4012, "step": 106380 }, { "epoch": 3.834108191876599, "grad_norm": 0.2368234395980835, "learning_rate": 6.811669196595094e-06, "loss": 0.3601, "step": 106385 }, { "epoch": 3.834288391537824, "grad_norm": 0.2486155778169632, "learning_rate": 6.80966726942478e-06, "loss": 0.3964, "step": 106390 }, { "epoch": 3.8344685911990486, "grad_norm": 0.2343214601278305, "learning_rate": 6.807665590094242e-06, "loss": 0.408, "step": 106395 }, { "epoch": 3.8346487908602733, "grad_norm": 0.22257140278816223, "learning_rate": 6.805664158630728e-06, "loss": 0.3616, "step": 106400 }, { "epoch": 3.8348289905214976, "grad_norm": 0.26378926634788513, "learning_rate": 6.803662975061515e-06, "loss": 0.4036, "step": 106405 }, { "epoch": 3.8350091901827223, "grad_norm": 0.2561560273170471, "learning_rate": 6.801662039413875e-06, "loss": 0.3486, "step": 106410 }, { "epoch": 3.835189389843947, "grad_norm": 0.24770770967006683, 
"learning_rate": 6.799661351715067e-06, "loss": 0.417, "step": 106415 }, { "epoch": 3.835369589505172, "grad_norm": 0.27001357078552246, "learning_rate": 6.797660911992351e-06, "loss": 0.398, "step": 106420 }, { "epoch": 3.8355497891663966, "grad_norm": 0.2153816670179367, "learning_rate": 6.795660720272978e-06, "loss": 0.3642, "step": 106425 }, { "epoch": 3.835729988827621, "grad_norm": 0.2308131456375122, "learning_rate": 6.7936607765841985e-06, "loss": 0.4077, "step": 106430 }, { "epoch": 3.8359101884888456, "grad_norm": 0.19024352729320526, "learning_rate": 6.791661080953274e-06, "loss": 0.3708, "step": 106435 }, { "epoch": 3.8360903881500703, "grad_norm": 0.15937311947345734, "learning_rate": 6.78966163340744e-06, "loss": 0.3621, "step": 106440 }, { "epoch": 3.836270587811295, "grad_norm": 0.29799744486808777, "learning_rate": 6.78766243397394e-06, "loss": 0.371, "step": 106445 }, { "epoch": 3.8364507874725193, "grad_norm": 0.20933188498020172, "learning_rate": 6.785663482680016e-06, "loss": 0.3618, "step": 106450 }, { "epoch": 3.836630987133744, "grad_norm": 0.25528839230537415, "learning_rate": 6.7836647795528976e-06, "loss": 0.3922, "step": 106455 }, { "epoch": 3.836811186794969, "grad_norm": 0.231916606426239, "learning_rate": 6.781666324619815e-06, "loss": 0.379, "step": 106460 }, { "epoch": 3.8369913864561935, "grad_norm": 0.21450571715831757, "learning_rate": 6.779668117908008e-06, "loss": 0.3953, "step": 106465 }, { "epoch": 3.8371715861174183, "grad_norm": 0.23598310351371765, "learning_rate": 6.777670159444696e-06, "loss": 0.367, "step": 106470 }, { "epoch": 3.837351785778643, "grad_norm": 0.18300804495811462, "learning_rate": 6.775672449257098e-06, "loss": 0.3378, "step": 106475 }, { "epoch": 3.8375319854398673, "grad_norm": 0.27081814408302307, "learning_rate": 6.773674987372436e-06, "loss": 0.389, "step": 106480 }, { "epoch": 3.837712185101092, "grad_norm": 0.22525940835475922, "learning_rate": 6.771677773817917e-06, "loss": 0.398, "step": 106485 
}, { "epoch": 3.8378923847623168, "grad_norm": 0.23411720991134644, "learning_rate": 6.769680808620774e-06, "loss": 0.4028, "step": 106490 }, { "epoch": 3.838072584423541, "grad_norm": 0.2922147214412689, "learning_rate": 6.7676840918081825e-06, "loss": 0.4001, "step": 106495 }, { "epoch": 3.838252784084766, "grad_norm": 0.23292958736419678, "learning_rate": 6.765687623407377e-06, "loss": 0.3461, "step": 106500 }, { "epoch": 3.838252784084766, "eval_loss": 0.4283367097377777, "eval_runtime": 3.5256, "eval_samples_per_second": 28.364, "eval_steps_per_second": 7.091, "step": 106500 }, { "epoch": 3.8384329837459905, "grad_norm": 0.2138739675283432, "learning_rate": 6.763691403445543e-06, "loss": 0.3657, "step": 106505 }, { "epoch": 3.8386131834072152, "grad_norm": 0.20448392629623413, "learning_rate": 6.761695431949888e-06, "loss": 0.3634, "step": 106510 }, { "epoch": 3.83879338306844, "grad_norm": 0.269741415977478, "learning_rate": 6.759699708947598e-06, "loss": 0.3845, "step": 106515 }, { "epoch": 3.8389735827296647, "grad_norm": 0.20338128507137299, "learning_rate": 6.757704234465869e-06, "loss": 0.3784, "step": 106520 }, { "epoch": 3.839153782390889, "grad_norm": 0.2528837025165558, "learning_rate": 6.75570900853188e-06, "loss": 0.4003, "step": 106525 }, { "epoch": 3.8393339820521137, "grad_norm": 0.22932448983192444, "learning_rate": 6.753714031172834e-06, "loss": 0.3484, "step": 106530 }, { "epoch": 3.8395141817133385, "grad_norm": 0.2735101282596588, "learning_rate": 6.751719302415898e-06, "loss": 0.4048, "step": 106535 }, { "epoch": 3.8396943813745628, "grad_norm": 0.2353210300207138, "learning_rate": 6.749724822288256e-06, "loss": 0.3939, "step": 106540 }, { "epoch": 3.8398745810357875, "grad_norm": 0.2255113422870636, "learning_rate": 6.747730590817078e-06, "loss": 0.3748, "step": 106545 }, { "epoch": 3.8400547806970122, "grad_norm": 0.2259177565574646, "learning_rate": 6.7457366080295296e-06, "loss": 0.3593, "step": 106550 }, { "epoch": 3.840234980358237, 
"grad_norm": 0.27952998876571655, "learning_rate": 6.743742873952794e-06, "loss": 0.3721, "step": 106555 }, { "epoch": 3.8404151800194617, "grad_norm": 0.23785199224948883, "learning_rate": 6.741749388614027e-06, "loss": 0.3657, "step": 106560 }, { "epoch": 3.8405953796806864, "grad_norm": 0.19771380722522736, "learning_rate": 6.739756152040391e-06, "loss": 0.3827, "step": 106565 }, { "epoch": 3.8407755793419107, "grad_norm": 0.23672428727149963, "learning_rate": 6.7377631642590395e-06, "loss": 0.4001, "step": 106570 }, { "epoch": 3.8409557790031355, "grad_norm": 0.28637582063674927, "learning_rate": 6.7357704252971245e-06, "loss": 0.381, "step": 106575 }, { "epoch": 3.84113597866436, "grad_norm": 0.23448318243026733, "learning_rate": 6.733777935181806e-06, "loss": 0.4347, "step": 106580 }, { "epoch": 3.8413161783255845, "grad_norm": 0.24527902901172638, "learning_rate": 6.731785693940237e-06, "loss": 0.405, "step": 106585 }, { "epoch": 3.841496377986809, "grad_norm": 0.2648589015007019, "learning_rate": 6.729793701599535e-06, "loss": 0.3911, "step": 106590 }, { "epoch": 3.841676577648034, "grad_norm": 0.28334563970565796, "learning_rate": 6.727801958186864e-06, "loss": 0.4194, "step": 106595 }, { "epoch": 3.8418567773092587, "grad_norm": 0.23570017516613007, "learning_rate": 6.725810463729354e-06, "loss": 0.3873, "step": 106600 }, { "epoch": 3.8420369769704834, "grad_norm": 0.254119873046875, "learning_rate": 6.723819218254138e-06, "loss": 0.3425, "step": 106605 }, { "epoch": 3.842217176631708, "grad_norm": 0.23003673553466797, "learning_rate": 6.721828221788346e-06, "loss": 0.3899, "step": 106610 }, { "epoch": 3.8423973762929324, "grad_norm": 0.21980217099189758, "learning_rate": 6.719837474359098e-06, "loss": 0.372, "step": 106615 }, { "epoch": 3.842577575954157, "grad_norm": 0.25185590982437134, "learning_rate": 6.717846975993536e-06, "loss": 0.4302, "step": 106620 }, { "epoch": 3.842757775615382, "grad_norm": 0.23943038284778595, "learning_rate": 
6.715856726718767e-06, "loss": 0.4065, "step": 106625 }, { "epoch": 3.8429379752766066, "grad_norm": 0.2441278100013733, "learning_rate": 6.71386672656191e-06, "loss": 0.3898, "step": 106630 }, { "epoch": 3.843118174937831, "grad_norm": 0.2758260667324066, "learning_rate": 6.711876975550077e-06, "loss": 0.3623, "step": 106635 }, { "epoch": 3.8432983745990557, "grad_norm": 0.20836462080478668, "learning_rate": 6.7098874737103825e-06, "loss": 0.372, "step": 106640 }, { "epoch": 3.8434785742602804, "grad_norm": 0.25424110889434814, "learning_rate": 6.707898221069922e-06, "loss": 0.3941, "step": 106645 }, { "epoch": 3.843658773921505, "grad_norm": 0.25770169496536255, "learning_rate": 6.705909217655815e-06, "loss": 0.3841, "step": 106650 }, { "epoch": 3.84383897358273, "grad_norm": 0.296792596578598, "learning_rate": 6.703920463495151e-06, "loss": 0.3746, "step": 106655 }, { "epoch": 3.844019173243954, "grad_norm": 0.2855359613895416, "learning_rate": 6.701931958615029e-06, "loss": 0.4001, "step": 106660 }, { "epoch": 3.844199372905179, "grad_norm": 0.2668587267398834, "learning_rate": 6.699943703042541e-06, "loss": 0.3571, "step": 106665 }, { "epoch": 3.8443795725664036, "grad_norm": 0.2563954293727875, "learning_rate": 6.69795569680477e-06, "loss": 0.3489, "step": 106670 }, { "epoch": 3.8445597722276283, "grad_norm": 0.3133787214756012, "learning_rate": 6.6959679399288265e-06, "loss": 0.3753, "step": 106675 }, { "epoch": 3.8447399718888526, "grad_norm": 0.22209054231643677, "learning_rate": 6.693980432441757e-06, "loss": 0.3987, "step": 106680 }, { "epoch": 3.8449201715500774, "grad_norm": 0.22669175267219543, "learning_rate": 6.69199317437067e-06, "loss": 0.3612, "step": 106685 }, { "epoch": 3.845100371211302, "grad_norm": 0.24316006898880005, "learning_rate": 6.6900061657426325e-06, "loss": 0.3768, "step": 106690 }, { "epoch": 3.845280570872527, "grad_norm": 0.26429635286331177, "learning_rate": 6.688019406584706e-06, "loss": 0.3953, "step": 106695 }, { "epoch": 
3.8454607705337516, "grad_norm": 0.24208183586597443, "learning_rate": 6.686032896923986e-06, "loss": 0.3652, "step": 106700 }, { "epoch": 3.8456409701949763, "grad_norm": 0.21602210402488708, "learning_rate": 6.684046636787514e-06, "loss": 0.3948, "step": 106705 }, { "epoch": 3.8458211698562006, "grad_norm": 0.21090590953826904, "learning_rate": 6.682060626202355e-06, "loss": 0.3688, "step": 106710 }, { "epoch": 3.8460013695174253, "grad_norm": 0.23994767665863037, "learning_rate": 6.6800748651955795e-06, "loss": 0.3735, "step": 106715 }, { "epoch": 3.84618156917865, "grad_norm": 0.24536341428756714, "learning_rate": 6.678089353794237e-06, "loss": 0.3965, "step": 106720 }, { "epoch": 3.8463617688398744, "grad_norm": 0.24873821437358856, "learning_rate": 6.676104092025378e-06, "loss": 0.3989, "step": 106725 }, { "epoch": 3.846541968501099, "grad_norm": 0.2966354191303253, "learning_rate": 6.674119079916056e-06, "loss": 0.404, "step": 106730 }, { "epoch": 3.846722168162324, "grad_norm": 0.24333995580673218, "learning_rate": 6.672134317493303e-06, "loss": 0.3602, "step": 106735 }, { "epoch": 3.8469023678235486, "grad_norm": 0.2882740795612335, "learning_rate": 6.670149804784181e-06, "loss": 0.3889, "step": 106740 }, { "epoch": 3.8470825674847733, "grad_norm": 0.2220769077539444, "learning_rate": 6.668165541815721e-06, "loss": 0.3715, "step": 106745 }, { "epoch": 3.847262767145998, "grad_norm": 0.26409411430358887, "learning_rate": 6.666181528614954e-06, "loss": 0.356, "step": 106750 }, { "epoch": 3.8474429668072223, "grad_norm": 0.24379925429821014, "learning_rate": 6.6641977652089155e-06, "loss": 0.3866, "step": 106755 }, { "epoch": 3.847623166468447, "grad_norm": 0.2612653076648712, "learning_rate": 6.662214251624624e-06, "loss": 0.3836, "step": 106760 }, { "epoch": 3.8478033661296718, "grad_norm": 0.20070244371891022, "learning_rate": 6.660230987889121e-06, "loss": 0.3629, "step": 106765 }, { "epoch": 3.847983565790896, "grad_norm": 0.1985878348350525, 
"learning_rate": 6.658247974029427e-06, "loss": 0.3356, "step": 106770 }, { "epoch": 3.848163765452121, "grad_norm": 0.24218730628490448, "learning_rate": 6.656265210072537e-06, "loss": 0.396, "step": 106775 }, { "epoch": 3.8483439651133455, "grad_norm": 0.29759272933006287, "learning_rate": 6.6542826960454915e-06, "loss": 0.3804, "step": 106780 }, { "epoch": 3.8485241647745703, "grad_norm": 0.340660035610199, "learning_rate": 6.652300431975292e-06, "loss": 0.4202, "step": 106785 }, { "epoch": 3.848704364435795, "grad_norm": 0.24566273391246796, "learning_rate": 6.650318417888948e-06, "loss": 0.396, "step": 106790 }, { "epoch": 3.8488845640970197, "grad_norm": 0.2540030777454376, "learning_rate": 6.648336653813461e-06, "loss": 0.3947, "step": 106795 }, { "epoch": 3.849064763758244, "grad_norm": 0.27518022060394287, "learning_rate": 6.646355139775828e-06, "loss": 0.3916, "step": 106800 }, { "epoch": 3.8492449634194688, "grad_norm": 0.19195982813835144, "learning_rate": 6.644373875803059e-06, "loss": 0.3582, "step": 106805 }, { "epoch": 3.8494251630806935, "grad_norm": 0.17849093675613403, "learning_rate": 6.642392861922145e-06, "loss": 0.4018, "step": 106810 }, { "epoch": 3.849605362741918, "grad_norm": 0.20975105464458466, "learning_rate": 6.64041209816007e-06, "loss": 0.3944, "step": 106815 }, { "epoch": 3.8497855624031425, "grad_norm": 0.20660048723220825, "learning_rate": 6.638431584543827e-06, "loss": 0.3759, "step": 106820 }, { "epoch": 3.8499657620643672, "grad_norm": 0.20970280468463898, "learning_rate": 6.636451321100401e-06, "loss": 0.3418, "step": 106825 }, { "epoch": 3.850145961725592, "grad_norm": 0.20445165038108826, "learning_rate": 6.634471307856763e-06, "loss": 0.3869, "step": 106830 }, { "epoch": 3.8503261613868167, "grad_norm": 0.2625582814216614, "learning_rate": 6.632491544839903e-06, "loss": 0.4001, "step": 106835 }, { "epoch": 3.8505063610480414, "grad_norm": 0.2839062213897705, "learning_rate": 6.6305120320767885e-06, "loss": 0.3926, "step": 
106840 }, { "epoch": 3.8506865607092657, "grad_norm": 0.22392812371253967, "learning_rate": 6.628532769594395e-06, "loss": 0.4024, "step": 106845 }, { "epoch": 3.8508667603704905, "grad_norm": 0.2890661060810089, "learning_rate": 6.626553757419682e-06, "loss": 0.3862, "step": 106850 }, { "epoch": 3.851046960031715, "grad_norm": 0.29909321665763855, "learning_rate": 6.62457499557961e-06, "loss": 0.3887, "step": 106855 }, { "epoch": 3.85122715969294, "grad_norm": 0.22355806827545166, "learning_rate": 6.622596484101156e-06, "loss": 0.388, "step": 106860 }, { "epoch": 3.8514073593541642, "grad_norm": 0.2812059223651886, "learning_rate": 6.620618223011274e-06, "loss": 0.3845, "step": 106865 }, { "epoch": 3.851587559015389, "grad_norm": 0.28787899017333984, "learning_rate": 6.618640212336893e-06, "loss": 0.3785, "step": 106870 }, { "epoch": 3.8517677586766137, "grad_norm": 0.21656519174575806, "learning_rate": 6.616662452104991e-06, "loss": 0.393, "step": 106875 }, { "epoch": 3.8519479583378384, "grad_norm": 0.2218136340379715, "learning_rate": 6.6146849423424946e-06, "loss": 0.3646, "step": 106880 }, { "epoch": 3.852128157999063, "grad_norm": 0.2481302171945572, "learning_rate": 6.612707683076369e-06, "loss": 0.3755, "step": 106885 }, { "epoch": 3.8523083576602875, "grad_norm": 0.23201486468315125, "learning_rate": 6.610730674333537e-06, "loss": 0.3652, "step": 106890 }, { "epoch": 3.852488557321512, "grad_norm": 0.2411048710346222, "learning_rate": 6.6087539161409305e-06, "loss": 0.3625, "step": 106895 }, { "epoch": 3.852668756982737, "grad_norm": 0.24224650859832764, "learning_rate": 6.6067774085254995e-06, "loss": 0.354, "step": 106900 }, { "epoch": 3.8528489566439617, "grad_norm": 0.25247785449028015, "learning_rate": 6.6048011515141646e-06, "loss": 0.3811, "step": 106905 }, { "epoch": 3.853029156305186, "grad_norm": 0.2523113191127777, "learning_rate": 6.602825145133854e-06, "loss": 0.4067, "step": 106910 }, { "epoch": 3.8532093559664107, "grad_norm": 
0.21480904519557953, "learning_rate": 6.600849389411487e-06, "loss": 0.388, "step": 106915 }, { "epoch": 3.8533895556276354, "grad_norm": 0.2217121422290802, "learning_rate": 6.59887388437398e-06, "loss": 0.3805, "step": 106920 }, { "epoch": 3.85356975528886, "grad_norm": 0.2531042695045471, "learning_rate": 6.5968986300482595e-06, "loss": 0.3892, "step": 106925 }, { "epoch": 3.853749954950085, "grad_norm": 0.27398139238357544, "learning_rate": 6.594923626461233e-06, "loss": 0.3802, "step": 106930 }, { "epoch": 3.853930154611309, "grad_norm": 0.24913030862808228, "learning_rate": 6.592948873639807e-06, "loss": 0.3898, "step": 106935 }, { "epoch": 3.854110354272534, "grad_norm": 0.23287741839885712, "learning_rate": 6.590974371610889e-06, "loss": 0.3705, "step": 106940 }, { "epoch": 3.8542905539337586, "grad_norm": 0.23462989926338196, "learning_rate": 6.589000120401375e-06, "loss": 0.3583, "step": 106945 }, { "epoch": 3.8544707535949834, "grad_norm": 0.2538296580314636, "learning_rate": 6.587026120038178e-06, "loss": 0.393, "step": 106950 }, { "epoch": 3.8546509532562077, "grad_norm": 0.23619996011257172, "learning_rate": 6.585052370548192e-06, "loss": 0.3613, "step": 106955 }, { "epoch": 3.8548311529174324, "grad_norm": 0.20444704592227936, "learning_rate": 6.583078871958287e-06, "loss": 0.3983, "step": 106960 }, { "epoch": 3.855011352578657, "grad_norm": 0.2089342325925827, "learning_rate": 6.581105624295372e-06, "loss": 0.3489, "step": 106965 }, { "epoch": 3.855191552239882, "grad_norm": 0.2452775537967682, "learning_rate": 6.579132627586329e-06, "loss": 0.3685, "step": 106970 }, { "epoch": 3.8553717519011066, "grad_norm": 0.23251672089099884, "learning_rate": 6.577159881858031e-06, "loss": 0.3592, "step": 106975 }, { "epoch": 3.8555519515623313, "grad_norm": 0.21264806389808655, "learning_rate": 6.575187387137377e-06, "loss": 0.3709, "step": 106980 }, { "epoch": 3.8557321512235556, "grad_norm": 0.20880858600139618, "learning_rate": 6.5732151434512115e-06, 
"loss": 0.36, "step": 106985 }, { "epoch": 3.8559123508847803, "grad_norm": 0.2800189256668091, "learning_rate": 6.571243150826431e-06, "loss": 0.3584, "step": 106990 }, { "epoch": 3.856092550546005, "grad_norm": 0.27210569381713867, "learning_rate": 6.569271409289895e-06, "loss": 0.3647, "step": 106995 }, { "epoch": 3.8562727502072294, "grad_norm": 0.22882099449634552, "learning_rate": 6.567299918868467e-06, "loss": 0.3507, "step": 107000 }, { "epoch": 3.8562727502072294, "eval_loss": 0.42817628383636475, "eval_runtime": 3.5212, "eval_samples_per_second": 28.399, "eval_steps_per_second": 7.1, "step": 107000 }, { "epoch": 3.856452949868454, "grad_norm": 0.22581203281879425, "learning_rate": 6.565328679589008e-06, "loss": 0.3751, "step": 107005 }, { "epoch": 3.856633149529679, "grad_norm": 0.23361347615718842, "learning_rate": 6.563357691478378e-06, "loss": 0.3699, "step": 107010 }, { "epoch": 3.8568133491909036, "grad_norm": 0.2552787959575653, "learning_rate": 6.561386954563423e-06, "loss": 0.368, "step": 107015 }, { "epoch": 3.8569935488521283, "grad_norm": 0.19680064916610718, "learning_rate": 6.559416468871008e-06, "loss": 0.4097, "step": 107020 }, { "epoch": 3.857173748513353, "grad_norm": 0.300270676612854, "learning_rate": 6.557446234427972e-06, "loss": 0.3709, "step": 107025 }, { "epoch": 3.8573539481745773, "grad_norm": 0.2358800619840622, "learning_rate": 6.555476251261161e-06, "loss": 0.3658, "step": 107030 }, { "epoch": 3.857534147835802, "grad_norm": 0.2335239201784134, "learning_rate": 6.553506519397417e-06, "loss": 0.3828, "step": 107035 }, { "epoch": 3.857714347497027, "grad_norm": 0.3097918629646301, "learning_rate": 6.551537038863567e-06, "loss": 0.3893, "step": 107040 }, { "epoch": 3.857894547158251, "grad_norm": 0.22679081559181213, "learning_rate": 6.549567809686458e-06, "loss": 0.3852, "step": 107045 }, { "epoch": 3.858074746819476, "grad_norm": 0.23170988261699677, "learning_rate": 6.547598831892926e-06, "loss": 0.3755, "step": 107050 }, { 
"epoch": 3.8582549464807006, "grad_norm": 0.24494971334934235, "learning_rate": 6.545630105509771e-06, "loss": 0.3614, "step": 107055 }, { "epoch": 3.8584351461419253, "grad_norm": 0.22469878196716309, "learning_rate": 6.543661630563841e-06, "loss": 0.37, "step": 107060 }, { "epoch": 3.85861534580315, "grad_norm": 0.27316057682037354, "learning_rate": 6.541693407081939e-06, "loss": 0.3548, "step": 107065 }, { "epoch": 3.8587955454643748, "grad_norm": 0.2721122205257416, "learning_rate": 6.53972543509091e-06, "loss": 0.3677, "step": 107070 }, { "epoch": 3.858975745125599, "grad_norm": 0.2867799699306488, "learning_rate": 6.537757714617537e-06, "loss": 0.367, "step": 107075 }, { "epoch": 3.8591559447868238, "grad_norm": 0.2439620941877365, "learning_rate": 6.535790245688633e-06, "loss": 0.3661, "step": 107080 }, { "epoch": 3.8593361444480485, "grad_norm": 0.21324770152568817, "learning_rate": 6.533823028331021e-06, "loss": 0.3907, "step": 107085 }, { "epoch": 3.859516344109273, "grad_norm": 0.24692684412002563, "learning_rate": 6.5318560625714924e-06, "loss": 0.3767, "step": 107090 }, { "epoch": 3.8596965437704975, "grad_norm": 0.2432447373867035, "learning_rate": 6.529889348436852e-06, "loss": 0.3643, "step": 107095 }, { "epoch": 3.8598767434317223, "grad_norm": 0.2154054492712021, "learning_rate": 6.5279228859538935e-06, "loss": 0.3863, "step": 107100 }, { "epoch": 3.860056943092947, "grad_norm": 0.23707520961761475, "learning_rate": 6.5259566751494e-06, "loss": 0.3823, "step": 107105 }, { "epoch": 3.8602371427541717, "grad_norm": 0.23373231291770935, "learning_rate": 6.523990716050179e-06, "loss": 0.393, "step": 107110 }, { "epoch": 3.8604173424153965, "grad_norm": 0.26256629824638367, "learning_rate": 6.522025008683008e-06, "loss": 0.3853, "step": 107115 }, { "epoch": 3.8605975420766208, "grad_norm": 0.23125460743904114, "learning_rate": 6.520059553074667e-06, "loss": 0.3477, "step": 107120 }, { "epoch": 3.8607777417378455, "grad_norm": 0.23365911841392517, 
"learning_rate": 6.518094349251938e-06, "loss": 0.3883, "step": 107125 }, { "epoch": 3.8609579413990702, "grad_norm": 0.18686018884181976, "learning_rate": 6.516129397241588e-06, "loss": 0.376, "step": 107130 }, { "epoch": 3.861138141060295, "grad_norm": 0.23002989590168, "learning_rate": 6.5141646970704015e-06, "loss": 0.3877, "step": 107135 }, { "epoch": 3.8613183407215192, "grad_norm": 0.3117145299911499, "learning_rate": 6.512200248765146e-06, "loss": 0.4153, "step": 107140 }, { "epoch": 3.861498540382744, "grad_norm": 0.2764013111591339, "learning_rate": 6.510236052352581e-06, "loss": 0.398, "step": 107145 }, { "epoch": 3.8616787400439687, "grad_norm": 0.3027382493019104, "learning_rate": 6.508272107859467e-06, "loss": 0.3804, "step": 107150 }, { "epoch": 3.8618589397051934, "grad_norm": 0.20171962678432465, "learning_rate": 6.50630841531257e-06, "loss": 0.3427, "step": 107155 }, { "epoch": 3.862039139366418, "grad_norm": 0.2870292663574219, "learning_rate": 6.50434497473863e-06, "loss": 0.3651, "step": 107160 }, { "epoch": 3.8622193390276425, "grad_norm": 0.2380640059709549, "learning_rate": 6.502381786164424e-06, "loss": 0.3692, "step": 107165 }, { "epoch": 3.862399538688867, "grad_norm": 0.2691294550895691, "learning_rate": 6.500418849616671e-06, "loss": 0.3916, "step": 107170 }, { "epoch": 3.862579738350092, "grad_norm": 0.27460694313049316, "learning_rate": 6.498456165122139e-06, "loss": 0.379, "step": 107175 }, { "epoch": 3.8627599380113167, "grad_norm": 0.24331451952457428, "learning_rate": 6.496493732707556e-06, "loss": 0.3881, "step": 107180 }, { "epoch": 3.862940137672541, "grad_norm": 0.2511681020259857, "learning_rate": 6.494531552399666e-06, "loss": 0.371, "step": 107185 }, { "epoch": 3.8631203373337657, "grad_norm": 0.23705346882343292, "learning_rate": 6.4925696242251975e-06, "loss": 0.3931, "step": 107190 }, { "epoch": 3.8633005369949904, "grad_norm": 0.24382388591766357, "learning_rate": 6.490607948210889e-06, "loss": 0.3663, "step": 107195 }, 
{ "epoch": 3.863480736656215, "grad_norm": 0.29807329177856445, "learning_rate": 6.488646524383454e-06, "loss": 0.3872, "step": 107200 }, { "epoch": 3.86366093631744, "grad_norm": 0.18684354424476624, "learning_rate": 6.486685352769634e-06, "loss": 0.363, "step": 107205 }, { "epoch": 3.8638411359786646, "grad_norm": 0.20785370469093323, "learning_rate": 6.484724433396141e-06, "loss": 0.3894, "step": 107210 }, { "epoch": 3.864021335639889, "grad_norm": 0.2543993890285492, "learning_rate": 6.482763766289693e-06, "loss": 0.3472, "step": 107215 }, { "epoch": 3.8642015353011137, "grad_norm": 0.31936973333358765, "learning_rate": 6.480803351477005e-06, "loss": 0.3948, "step": 107220 }, { "epoch": 3.8643817349623384, "grad_norm": 0.33362480998039246, "learning_rate": 6.478843188984776e-06, "loss": 0.3973, "step": 107225 }, { "epoch": 3.8645619346235627, "grad_norm": 0.2270936667919159, "learning_rate": 6.476883278839732e-06, "loss": 0.3745, "step": 107230 }, { "epoch": 3.8647421342847874, "grad_norm": 0.22412335872650146, "learning_rate": 6.474923621068574e-06, "loss": 0.3999, "step": 107235 }, { "epoch": 3.864922333946012, "grad_norm": 0.23207208514213562, "learning_rate": 6.472964215697982e-06, "loss": 0.4043, "step": 107240 }, { "epoch": 3.865102533607237, "grad_norm": 0.23929710686206818, "learning_rate": 6.47100506275467e-06, "loss": 0.3865, "step": 107245 }, { "epoch": 3.8652827332684616, "grad_norm": 0.22255569696426392, "learning_rate": 6.469046162265322e-06, "loss": 0.3684, "step": 107250 }, { "epoch": 3.8654629329296863, "grad_norm": 0.2753349840641022, "learning_rate": 6.467087514256645e-06, "loss": 0.3796, "step": 107255 }, { "epoch": 3.8656431325909106, "grad_norm": 0.2791849374771118, "learning_rate": 6.465129118755309e-06, "loss": 0.3707, "step": 107260 }, { "epoch": 3.8658233322521354, "grad_norm": 0.2520217299461365, "learning_rate": 6.4631709757879885e-06, "loss": 0.3908, "step": 107265 }, { "epoch": 3.86600353191336, "grad_norm": 0.2589038014411926, 
"learning_rate": 6.461213085381384e-06, "loss": 0.3644, "step": 107270 }, { "epoch": 3.8661837315745844, "grad_norm": 0.21371279656887054, "learning_rate": 6.459255447562155e-06, "loss": 0.3742, "step": 107275 }, { "epoch": 3.866363931235809, "grad_norm": 0.24295492470264435, "learning_rate": 6.457298062356995e-06, "loss": 0.3742, "step": 107280 }, { "epoch": 3.866544130897034, "grad_norm": 0.2283410131931305, "learning_rate": 6.45534092979255e-06, "loss": 0.3884, "step": 107285 }, { "epoch": 3.8667243305582586, "grad_norm": 0.20489716529846191, "learning_rate": 6.453384049895489e-06, "loss": 0.3688, "step": 107290 }, { "epoch": 3.8669045302194833, "grad_norm": 0.3497338891029358, "learning_rate": 6.451427422692485e-06, "loss": 0.3847, "step": 107295 }, { "epoch": 3.867084729880708, "grad_norm": 0.29012593626976013, "learning_rate": 6.449471048210193e-06, "loss": 0.3362, "step": 107300 }, { "epoch": 3.8672649295419324, "grad_norm": 0.24133215844631195, "learning_rate": 6.4475149264752675e-06, "loss": 0.3658, "step": 107305 }, { "epoch": 3.867445129203157, "grad_norm": 0.26846662163734436, "learning_rate": 6.445559057514358e-06, "loss": 0.3976, "step": 107310 }, { "epoch": 3.867625328864382, "grad_norm": 0.2922466993331909, "learning_rate": 6.443603441354107e-06, "loss": 0.404, "step": 107315 }, { "epoch": 3.867805528525606, "grad_norm": 0.24670326709747314, "learning_rate": 6.441648078021173e-06, "loss": 0.3723, "step": 107320 }, { "epoch": 3.867985728186831, "grad_norm": 0.29100772738456726, "learning_rate": 6.439692967542191e-06, "loss": 0.4027, "step": 107325 }, { "epoch": 3.8681659278480556, "grad_norm": 0.21996453404426575, "learning_rate": 6.4377381099438e-06, "loss": 0.4048, "step": 107330 }, { "epoch": 3.8683461275092803, "grad_norm": 0.2679681181907654, "learning_rate": 6.435783505252632e-06, "loss": 0.3685, "step": 107335 }, { "epoch": 3.868526327170505, "grad_norm": 0.2263759821653366, "learning_rate": 6.4338291534953215e-06, "loss": 0.3657, "step": 
107340 }, { "epoch": 3.8687065268317298, "grad_norm": 0.2599135637283325, "learning_rate": 6.431875054698486e-06, "loss": 0.3653, "step": 107345 }, { "epoch": 3.868886726492954, "grad_norm": 0.28875550627708435, "learning_rate": 6.429921208888773e-06, "loss": 0.3754, "step": 107350 }, { "epoch": 3.869066926154179, "grad_norm": 0.23925039172172546, "learning_rate": 6.4279676160927725e-06, "loss": 0.4144, "step": 107355 }, { "epoch": 3.8692471258154035, "grad_norm": 0.3068399727344513, "learning_rate": 6.426014276337125e-06, "loss": 0.3856, "step": 107360 }, { "epoch": 3.8694273254766283, "grad_norm": 0.18254274129867554, "learning_rate": 6.4240611896484365e-06, "loss": 0.3931, "step": 107365 }, { "epoch": 3.8696075251378526, "grad_norm": 0.20957380533218384, "learning_rate": 6.422108356053319e-06, "loss": 0.3773, "step": 107370 }, { "epoch": 3.8697877247990773, "grad_norm": 0.20488250255584717, "learning_rate": 6.420155775578379e-06, "loss": 0.3621, "step": 107375 }, { "epoch": 3.869967924460302, "grad_norm": 0.30153316259384155, "learning_rate": 6.418203448250218e-06, "loss": 0.4062, "step": 107380 }, { "epoch": 3.8701481241215268, "grad_norm": 0.27384403347969055, "learning_rate": 6.416251374095431e-06, "loss": 0.4122, "step": 107385 }, { "epoch": 3.8703283237827515, "grad_norm": 0.317130446434021, "learning_rate": 6.414299553140629e-06, "loss": 0.4039, "step": 107390 }, { "epoch": 3.870508523443976, "grad_norm": 0.21292860805988312, "learning_rate": 6.412347985412395e-06, "loss": 0.3729, "step": 107395 }, { "epoch": 3.8706887231052005, "grad_norm": 0.2204897701740265, "learning_rate": 6.410396670937325e-06, "loss": 0.3625, "step": 107400 }, { "epoch": 3.8708689227664252, "grad_norm": 0.2745380997657776, "learning_rate": 6.4084456097419976e-06, "loss": 0.4188, "step": 107405 }, { "epoch": 3.87104912242765, "grad_norm": 0.28390398621559143, "learning_rate": 6.4064948018529915e-06, "loss": 0.39, "step": 107410 }, { "epoch": 3.8712293220888743, "grad_norm": 
0.2293710857629776, "learning_rate": 6.404544247296903e-06, "loss": 0.3506, "step": 107415 }, { "epoch": 3.871409521750099, "grad_norm": 0.22023165225982666, "learning_rate": 6.4025939461003075e-06, "loss": 0.3471, "step": 107420 }, { "epoch": 3.8715897214113237, "grad_norm": 0.25889191031455994, "learning_rate": 6.400643898289751e-06, "loss": 0.3967, "step": 107425 }, { "epoch": 3.8717699210725485, "grad_norm": 0.2643846273422241, "learning_rate": 6.3986941038918305e-06, "loss": 0.394, "step": 107430 }, { "epoch": 3.871950120733773, "grad_norm": 0.205980584025383, "learning_rate": 6.396744562933094e-06, "loss": 0.3505, "step": 107435 }, { "epoch": 3.8721303203949975, "grad_norm": 0.24297955632209778, "learning_rate": 6.394795275440118e-06, "loss": 0.3931, "step": 107440 }, { "epoch": 3.8723105200562222, "grad_norm": 0.22801950573921204, "learning_rate": 6.392846241439462e-06, "loss": 0.3853, "step": 107445 }, { "epoch": 3.872490719717447, "grad_norm": 0.20344804227352142, "learning_rate": 6.390897460957657e-06, "loss": 0.3943, "step": 107450 }, { "epoch": 3.8726709193786717, "grad_norm": 0.24186420440673828, "learning_rate": 6.38894893402128e-06, "loss": 0.368, "step": 107455 }, { "epoch": 3.872851119039896, "grad_norm": 0.28862589597702026, "learning_rate": 6.387000660656869e-06, "loss": 0.4025, "step": 107460 }, { "epoch": 3.8730313187011207, "grad_norm": 0.2653532922267914, "learning_rate": 6.385052640890973e-06, "loss": 0.4228, "step": 107465 }, { "epoch": 3.8732115183623455, "grad_norm": 0.20793728530406952, "learning_rate": 6.383104874750129e-06, "loss": 0.3892, "step": 107470 }, { "epoch": 3.87339171802357, "grad_norm": 0.20338398218154907, "learning_rate": 6.38115736226087e-06, "loss": 0.3623, "step": 107475 }, { "epoch": 3.873571917684795, "grad_norm": 0.27274417877197266, "learning_rate": 6.3792101034497454e-06, "loss": 0.4013, "step": 107480 }, { "epoch": 3.8737521173460197, "grad_norm": 0.21529635787010193, "learning_rate": 6.3772630983432776e-06, 
"loss": 0.3646, "step": 107485 }, { "epoch": 3.873932317007244, "grad_norm": 0.24685950577259064, "learning_rate": 6.375316346967994e-06, "loss": 0.3973, "step": 107490 }, { "epoch": 3.8741125166684687, "grad_norm": 0.1983158439397812, "learning_rate": 6.373369849350419e-06, "loss": 0.368, "step": 107495 }, { "epoch": 3.8742927163296934, "grad_norm": 0.27724209427833557, "learning_rate": 6.371423605517066e-06, "loss": 0.3422, "step": 107500 }, { "epoch": 3.8742927163296934, "eval_loss": 0.4283193349838257, "eval_runtime": 3.5283, "eval_samples_per_second": 28.342, "eval_steps_per_second": 7.086, "step": 107500 }, { "epoch": 3.8744729159909177, "grad_norm": 0.25930055975914, "learning_rate": 6.369477615494468e-06, "loss": 0.3701, "step": 107505 }, { "epoch": 3.8746531156521424, "grad_norm": 0.25892987847328186, "learning_rate": 6.3675318793091295e-06, "loss": 0.3995, "step": 107510 }, { "epoch": 3.874833315313367, "grad_norm": 0.22906829416751862, "learning_rate": 6.365586396987563e-06, "loss": 0.3731, "step": 107515 }, { "epoch": 3.875013514974592, "grad_norm": 0.2710282802581787, "learning_rate": 6.363641168556275e-06, "loss": 0.4196, "step": 107520 }, { "epoch": 3.8751937146358166, "grad_norm": 0.24969340860843658, "learning_rate": 6.361696194041766e-06, "loss": 0.4083, "step": 107525 }, { "epoch": 3.8753739142970414, "grad_norm": 0.28726011514663696, "learning_rate": 6.359751473470532e-06, "loss": 0.4036, "step": 107530 }, { "epoch": 3.8755541139582657, "grad_norm": 0.2590154707431793, "learning_rate": 6.35780700686909e-06, "loss": 0.3776, "step": 107535 }, { "epoch": 3.8757343136194904, "grad_norm": 0.22933447360992432, "learning_rate": 6.355862794263903e-06, "loss": 0.3781, "step": 107540 }, { "epoch": 3.875914513280715, "grad_norm": 0.24526602029800415, "learning_rate": 6.353918835681483e-06, "loss": 0.3529, "step": 107545 }, { "epoch": 3.8760947129419394, "grad_norm": 0.24101245403289795, "learning_rate": 6.351975131148308e-06, "loss": 0.3841, "step": 107550 
}, { "epoch": 3.876274912603164, "grad_norm": 0.23179291188716888, "learning_rate": 6.350031680690854e-06, "loss": 0.3638, "step": 107555 }, { "epoch": 3.876455112264389, "grad_norm": 0.25088703632354736, "learning_rate": 6.348088484335624e-06, "loss": 0.3617, "step": 107560 }, { "epoch": 3.8766353119256136, "grad_norm": 0.2836175262928009, "learning_rate": 6.346145542109069e-06, "loss": 0.3649, "step": 107565 }, { "epoch": 3.8768155115868383, "grad_norm": 0.22787512838840485, "learning_rate": 6.344202854037662e-06, "loss": 0.3655, "step": 107570 }, { "epoch": 3.876995711248063, "grad_norm": 0.24610376358032227, "learning_rate": 6.342260420147889e-06, "loss": 0.3755, "step": 107575 }, { "epoch": 3.8771759109092874, "grad_norm": 0.270090788602829, "learning_rate": 6.340318240466203e-06, "loss": 0.3885, "step": 107580 }, { "epoch": 3.877356110570512, "grad_norm": 0.24736785888671875, "learning_rate": 6.3383763150190675e-06, "loss": 0.3802, "step": 107585 }, { "epoch": 3.877536310231737, "grad_norm": 0.22846059501171112, "learning_rate": 6.336434643832942e-06, "loss": 0.3796, "step": 107590 }, { "epoch": 3.877716509892961, "grad_norm": 0.20484845340251923, "learning_rate": 6.334493226934276e-06, "loss": 0.3389, "step": 107595 }, { "epoch": 3.877896709554186, "grad_norm": 0.2559860050678253, "learning_rate": 6.3325520643495314e-06, "loss": 0.3823, "step": 107600 }, { "epoch": 3.8780769092154106, "grad_norm": 0.20136897265911102, "learning_rate": 6.330611156105151e-06, "loss": 0.38, "step": 107605 }, { "epoch": 3.8782571088766353, "grad_norm": 0.20288234949111938, "learning_rate": 6.328670502227579e-06, "loss": 0.3778, "step": 107610 }, { "epoch": 3.87843730853786, "grad_norm": 0.2542252838611603, "learning_rate": 6.326730102743259e-06, "loss": 0.3886, "step": 107615 }, { "epoch": 3.878617508199085, "grad_norm": 0.2526249587535858, "learning_rate": 6.324789957678617e-06, "loss": 0.3643, "step": 107620 }, { "epoch": 3.878797707860309, "grad_norm": 0.20341147482395172, 
"learning_rate": 6.3228500670601015e-06, "loss": 0.389, "step": 107625 }, { "epoch": 3.878977907521534, "grad_norm": 0.2713795602321625, "learning_rate": 6.320910430914148e-06, "loss": 0.4011, "step": 107630 }, { "epoch": 3.8791581071827586, "grad_norm": 0.2428542822599411, "learning_rate": 6.318971049267159e-06, "loss": 0.3911, "step": 107635 }, { "epoch": 3.8793383068439833, "grad_norm": 0.2431861311197281, "learning_rate": 6.317031922145577e-06, "loss": 0.4293, "step": 107640 }, { "epoch": 3.8795185065052076, "grad_norm": 0.23291923105716705, "learning_rate": 6.315093049575821e-06, "loss": 0.3808, "step": 107645 }, { "epoch": 3.8796987061664323, "grad_norm": 0.287264347076416, "learning_rate": 6.313154431584303e-06, "loss": 0.4031, "step": 107650 }, { "epoch": 3.879878905827657, "grad_norm": 0.24084652960300446, "learning_rate": 6.31121606819744e-06, "loss": 0.3864, "step": 107655 }, { "epoch": 3.8800591054888818, "grad_norm": 0.25108230113983154, "learning_rate": 6.309277959441629e-06, "loss": 0.4094, "step": 107660 }, { "epoch": 3.8802393051501065, "grad_norm": 0.24399013817310333, "learning_rate": 6.307340105343298e-06, "loss": 0.3851, "step": 107665 }, { "epoch": 3.880419504811331, "grad_norm": 0.20561757683753967, "learning_rate": 6.305402505928837e-06, "loss": 0.3682, "step": 107670 }, { "epoch": 3.8805997044725555, "grad_norm": 0.2230025976896286, "learning_rate": 6.3034651612246476e-06, "loss": 0.4135, "step": 107675 }, { "epoch": 3.8807799041337803, "grad_norm": 0.2320537567138672, "learning_rate": 6.301528071257127e-06, "loss": 0.3782, "step": 107680 }, { "epoch": 3.880960103795005, "grad_norm": 0.28236204385757446, "learning_rate": 6.299591236052657e-06, "loss": 0.4315, "step": 107685 }, { "epoch": 3.8811403034562293, "grad_norm": 0.24441473186016083, "learning_rate": 6.297654655637644e-06, "loss": 0.3819, "step": 107690 }, { "epoch": 3.881320503117454, "grad_norm": 0.2429056018590927, "learning_rate": 6.295718330038466e-06, "loss": 0.4267, "step": 
107695 }, { "epoch": 3.8815007027786788, "grad_norm": 0.26638370752334595, "learning_rate": 6.293782259281503e-06, "loss": 0.3785, "step": 107700 }, { "epoch": 3.8816809024399035, "grad_norm": 0.23976294696331024, "learning_rate": 6.291846443393137e-06, "loss": 0.3971, "step": 107705 }, { "epoch": 3.8818611021011282, "grad_norm": 0.2684692144393921, "learning_rate": 6.289910882399741e-06, "loss": 0.4069, "step": 107710 }, { "epoch": 3.882041301762353, "grad_norm": 0.2549930512905121, "learning_rate": 6.287975576327679e-06, "loss": 0.379, "step": 107715 }, { "epoch": 3.8822215014235772, "grad_norm": 0.24471694231033325, "learning_rate": 6.286040525203332e-06, "loss": 0.3727, "step": 107720 }, { "epoch": 3.882401701084802, "grad_norm": 0.2403818666934967, "learning_rate": 6.284105729053064e-06, "loss": 0.3695, "step": 107725 }, { "epoch": 3.8825819007460267, "grad_norm": 0.24353429675102234, "learning_rate": 6.28217118790323e-06, "loss": 0.394, "step": 107730 }, { "epoch": 3.882762100407251, "grad_norm": 0.17580963671207428, "learning_rate": 6.28023690178019e-06, "loss": 0.3487, "step": 107735 }, { "epoch": 3.8829423000684757, "grad_norm": 0.21670427918434143, "learning_rate": 6.27830287071029e-06, "loss": 0.3557, "step": 107740 }, { "epoch": 3.8831224997297005, "grad_norm": 0.26354971528053284, "learning_rate": 6.276369094719903e-06, "loss": 0.3939, "step": 107745 }, { "epoch": 3.883302699390925, "grad_norm": 0.24392694234848022, "learning_rate": 6.274435573835355e-06, "loss": 0.3638, "step": 107750 }, { "epoch": 3.88348289905215, "grad_norm": 0.2409679889678955, "learning_rate": 6.272502308082989e-06, "loss": 0.3691, "step": 107755 }, { "epoch": 3.8836630987133747, "grad_norm": 0.24243175983428955, "learning_rate": 6.270569297489162e-06, "loss": 0.3579, "step": 107760 }, { "epoch": 3.883843298374599, "grad_norm": 0.19919145107269287, "learning_rate": 6.2686365420802e-06, "loss": 0.3814, "step": 107765 }, { "epoch": 3.8840234980358237, "grad_norm": 
0.2675543427467346, "learning_rate": 6.266704041882443e-06, "loss": 0.3378, "step": 107770 }, { "epoch": 3.8842036976970484, "grad_norm": 0.2584518492221832, "learning_rate": 6.264771796922212e-06, "loss": 0.361, "step": 107775 }, { "epoch": 3.8843838973582727, "grad_norm": 0.23442451655864716, "learning_rate": 6.262839807225834e-06, "loss": 0.3821, "step": 107780 }, { "epoch": 3.8845640970194975, "grad_norm": 0.24751421809196472, "learning_rate": 6.260908072819641e-06, "loss": 0.3798, "step": 107785 }, { "epoch": 3.884744296680722, "grad_norm": 0.2060132473707199, "learning_rate": 6.258976593729948e-06, "loss": 0.3664, "step": 107790 }, { "epoch": 3.884924496341947, "grad_norm": 0.18985074758529663, "learning_rate": 6.2570453699830725e-06, "loss": 0.417, "step": 107795 }, { "epoch": 3.8851046960031717, "grad_norm": 0.21168529987335205, "learning_rate": 6.2551144016053224e-06, "loss": 0.371, "step": 107800 }, { "epoch": 3.8852848956643964, "grad_norm": 0.21965354681015015, "learning_rate": 6.253183688623002e-06, "loss": 0.3586, "step": 107805 }, { "epoch": 3.8854650953256207, "grad_norm": 0.2647111117839813, "learning_rate": 6.251253231062435e-06, "loss": 0.4069, "step": 107810 }, { "epoch": 3.8856452949868454, "grad_norm": 0.2614315152168274, "learning_rate": 6.249323028949916e-06, "loss": 0.4049, "step": 107815 }, { "epoch": 3.88582549464807, "grad_norm": 0.24572202563285828, "learning_rate": 6.247393082311728e-06, "loss": 0.4089, "step": 107820 }, { "epoch": 3.8860056943092944, "grad_norm": 0.21743349730968475, "learning_rate": 6.245463391174186e-06, "loss": 0.3464, "step": 107825 }, { "epoch": 3.886185893970519, "grad_norm": 0.21934011578559875, "learning_rate": 6.243533955563574e-06, "loss": 0.3717, "step": 107830 }, { "epoch": 3.886366093631744, "grad_norm": 0.2820332646369934, "learning_rate": 6.241604775506174e-06, "loss": 0.3887, "step": 107835 }, { "epoch": 3.8865462932929686, "grad_norm": 0.2698332965373993, "learning_rate": 6.2396758510282895e-06, 
"loss": 0.3956, "step": 107840 }, { "epoch": 3.8867264929541934, "grad_norm": 0.2532370090484619, "learning_rate": 6.237747182156178e-06, "loss": 0.3619, "step": 107845 }, { "epoch": 3.886906692615418, "grad_norm": 0.275337278842926, "learning_rate": 6.2358187689161356e-06, "loss": 0.3903, "step": 107850 }, { "epoch": 3.8870868922766424, "grad_norm": 0.2451581060886383, "learning_rate": 6.233890611334428e-06, "loss": 0.3933, "step": 107855 }, { "epoch": 3.887267091937867, "grad_norm": 0.2677507698535919, "learning_rate": 6.231962709437328e-06, "loss": 0.3806, "step": 107860 }, { "epoch": 3.887447291599092, "grad_norm": 0.26012924313545227, "learning_rate": 6.230035063251102e-06, "loss": 0.3458, "step": 107865 }, { "epoch": 3.8876274912603166, "grad_norm": 0.2477564662694931, "learning_rate": 6.2281076728020085e-06, "loss": 0.394, "step": 107870 }, { "epoch": 3.887807690921541, "grad_norm": 0.2495788335800171, "learning_rate": 6.226180538116319e-06, "loss": 0.3995, "step": 107875 }, { "epoch": 3.8879878905827656, "grad_norm": 0.2793433666229248, "learning_rate": 6.224253659220286e-06, "loss": 0.3702, "step": 107880 }, { "epoch": 3.8881680902439903, "grad_norm": 0.19013087451457977, "learning_rate": 6.22232703614016e-06, "loss": 0.3745, "step": 107885 }, { "epoch": 3.888348289905215, "grad_norm": 0.2680392265319824, "learning_rate": 6.220400668902196e-06, "loss": 0.3855, "step": 107890 }, { "epoch": 3.88852848956644, "grad_norm": 0.29071950912475586, "learning_rate": 6.218474557532633e-06, "loss": 0.4051, "step": 107895 }, { "epoch": 3.888708689227664, "grad_norm": 0.27325236797332764, "learning_rate": 6.216548702057715e-06, "loss": 0.3856, "step": 107900 }, { "epoch": 3.888888888888889, "grad_norm": 0.23505853116512299, "learning_rate": 6.214623102503689e-06, "loss": 0.3783, "step": 107905 }, { "epoch": 3.8890690885501136, "grad_norm": 0.23786808550357819, "learning_rate": 6.212697758896788e-06, "loss": 0.3829, "step": 107910 }, { "epoch": 3.8892492882113383, 
"grad_norm": 0.4009203612804413, "learning_rate": 6.2107726712632404e-06, "loss": 0.3708, "step": 107915 }, { "epoch": 3.8894294878725626, "grad_norm": 0.20991608500480652, "learning_rate": 6.208847839629278e-06, "loss": 0.3835, "step": 107920 }, { "epoch": 3.8896096875337873, "grad_norm": 0.22346235811710358, "learning_rate": 6.206923264021119e-06, "loss": 0.3753, "step": 107925 }, { "epoch": 3.889789887195012, "grad_norm": 0.22620312869548798, "learning_rate": 6.204998944465007e-06, "loss": 0.3604, "step": 107930 }, { "epoch": 3.889970086856237, "grad_norm": 0.2643108665943146, "learning_rate": 6.203074880987137e-06, "loss": 0.3826, "step": 107935 }, { "epoch": 3.8901502865174615, "grad_norm": 0.25667524337768555, "learning_rate": 6.201151073613726e-06, "loss": 0.3917, "step": 107940 }, { "epoch": 3.890330486178686, "grad_norm": 0.20191670954227448, "learning_rate": 6.199227522371001e-06, "loss": 0.3856, "step": 107945 }, { "epoch": 3.8905106858399106, "grad_norm": 0.25496187806129456, "learning_rate": 6.197304227285158e-06, "loss": 0.3649, "step": 107950 }, { "epoch": 3.8906908855011353, "grad_norm": 0.2135312408208847, "learning_rate": 6.195381188382407e-06, "loss": 0.3425, "step": 107955 }, { "epoch": 3.89087108516236, "grad_norm": 0.20012623071670532, "learning_rate": 6.193458405688945e-06, "loss": 0.3803, "step": 107960 }, { "epoch": 3.8910512848235843, "grad_norm": 0.2358315885066986, "learning_rate": 6.191535879230964e-06, "loss": 0.3862, "step": 107965 }, { "epoch": 3.891231484484809, "grad_norm": 0.2576768398284912, "learning_rate": 6.1896136090346755e-06, "loss": 0.407, "step": 107970 }, { "epoch": 3.891411684146034, "grad_norm": 0.23550166189670563, "learning_rate": 6.187691595126255e-06, "loss": 0.3778, "step": 107975 }, { "epoch": 3.8915918838072585, "grad_norm": 0.22159500420093536, "learning_rate": 6.185769837531899e-06, "loss": 0.3908, "step": 107980 }, { "epoch": 3.8917720834684832, "grad_norm": 0.27781954407691956, "learning_rate": 
6.183848336277784e-06, "loss": 0.3946, "step": 107985 }, { "epoch": 3.891952283129708, "grad_norm": 0.23066557943820953, "learning_rate": 6.1819270913900826e-06, "loss": 0.394, "step": 107990 }, { "epoch": 3.8921324827909323, "grad_norm": 0.26918134093284607, "learning_rate": 6.1800061028949916e-06, "loss": 0.3916, "step": 107995 }, { "epoch": 3.892312682452157, "grad_norm": 0.24214857816696167, "learning_rate": 6.178085370818676e-06, "loss": 0.3739, "step": 108000 }, { "epoch": 3.892312682452157, "eval_loss": 0.42808079719543457, "eval_runtime": 3.5205, "eval_samples_per_second": 28.405, "eval_steps_per_second": 7.101, "step": 108000 }, { "epoch": 3.8924928821133817, "grad_norm": 0.18488271534442902, "learning_rate": 6.1761648951873e-06, "loss": 0.3708, "step": 108005 }, { "epoch": 3.892673081774606, "grad_norm": 0.3259855806827545, "learning_rate": 6.174244676027033e-06, "loss": 0.36, "step": 108010 }, { "epoch": 3.8928532814358308, "grad_norm": 0.25019964575767517, "learning_rate": 6.172324713364039e-06, "loss": 0.3678, "step": 108015 }, { "epoch": 3.8930334810970555, "grad_norm": 0.24694189429283142, "learning_rate": 6.170405007224467e-06, "loss": 0.3751, "step": 108020 }, { "epoch": 3.8932136807582802, "grad_norm": 0.1833256483078003, "learning_rate": 6.168485557634496e-06, "loss": 0.3654, "step": 108025 }, { "epoch": 3.893393880419505, "grad_norm": 0.2395724505186081, "learning_rate": 6.166566364620249e-06, "loss": 0.3517, "step": 108030 }, { "epoch": 3.8935740800807297, "grad_norm": 0.2357412576675415, "learning_rate": 6.164647428207896e-06, "loss": 0.3885, "step": 108035 }, { "epoch": 3.893754279741954, "grad_norm": 0.25304263830184937, "learning_rate": 6.162728748423577e-06, "loss": 0.3779, "step": 108040 }, { "epoch": 3.8939344794031787, "grad_norm": 0.23290584981441498, "learning_rate": 6.160810325293429e-06, "loss": 0.3866, "step": 108045 }, { "epoch": 3.8941146790644035, "grad_norm": 0.25585252046585083, "learning_rate": 6.158892158843593e-06, "loss": 
0.3682, "step": 108050 }, { "epoch": 3.8942948787256277, "grad_norm": 0.24071577191352844, "learning_rate": 6.1569742491002054e-06, "loss": 0.3762, "step": 108055 }, { "epoch": 3.8944750783868525, "grad_norm": 0.21371334791183472, "learning_rate": 6.155056596089387e-06, "loss": 0.3549, "step": 108060 }, { "epoch": 3.894655278048077, "grad_norm": 0.23668819665908813, "learning_rate": 6.153139199837282e-06, "loss": 0.383, "step": 108065 }, { "epoch": 3.894835477709302, "grad_norm": 0.23611487448215485, "learning_rate": 6.151222060370007e-06, "loss": 0.3706, "step": 108070 }, { "epoch": 3.8950156773705267, "grad_norm": 0.22055992484092712, "learning_rate": 6.149305177713682e-06, "loss": 0.4095, "step": 108075 }, { "epoch": 3.8951958770317514, "grad_norm": 0.20526739954948425, "learning_rate": 6.147388551894423e-06, "loss": 0.3806, "step": 108080 }, { "epoch": 3.8953760766929757, "grad_norm": 0.27778366208076477, "learning_rate": 6.145472182938339e-06, "loss": 0.4111, "step": 108085 }, { "epoch": 3.8955562763542004, "grad_norm": 0.2274710088968277, "learning_rate": 6.143556070871554e-06, "loss": 0.3905, "step": 108090 }, { "epoch": 3.895736476015425, "grad_norm": 0.29294195771217346, "learning_rate": 6.141640215720165e-06, "loss": 0.3839, "step": 108095 }, { "epoch": 3.8959166756766495, "grad_norm": 0.2358580380678177, "learning_rate": 6.139724617510279e-06, "loss": 0.3886, "step": 108100 }, { "epoch": 3.896096875337874, "grad_norm": 0.2260809987783432, "learning_rate": 6.137809276267992e-06, "loss": 0.348, "step": 108105 }, { "epoch": 3.896277074999099, "grad_norm": 0.22863958775997162, "learning_rate": 6.1358941920193955e-06, "loss": 0.368, "step": 108110 }, { "epoch": 3.8964572746603237, "grad_norm": 0.22534294426441193, "learning_rate": 6.133979364790601e-06, "loss": 0.3866, "step": 108115 }, { "epoch": 3.8966374743215484, "grad_norm": 0.22448866069316864, "learning_rate": 6.132064794607681e-06, "loss": 0.3379, "step": 108120 }, { "epoch": 3.896817673982773, 
"grad_norm": 0.3200676143169403, "learning_rate": 6.130150481496716e-06, "loss": 0.3729, "step": 108125 }, { "epoch": 3.8969978736439974, "grad_norm": 0.2216705083847046, "learning_rate": 6.128236425483805e-06, "loss": 0.3849, "step": 108130 }, { "epoch": 3.897178073305222, "grad_norm": 0.3131025433540344, "learning_rate": 6.12632262659501e-06, "loss": 0.358, "step": 108135 }, { "epoch": 3.897358272966447, "grad_norm": 0.23903706669807434, "learning_rate": 6.124409084856433e-06, "loss": 0.3908, "step": 108140 }, { "epoch": 3.8975384726276716, "grad_norm": 0.3344826400279999, "learning_rate": 6.122495800294117e-06, "loss": 0.4013, "step": 108145 }, { "epoch": 3.897718672288896, "grad_norm": 0.248307466506958, "learning_rate": 6.120582772934136e-06, "loss": 0.377, "step": 108150 }, { "epoch": 3.8978988719501206, "grad_norm": 0.21172760426998138, "learning_rate": 6.118670002802568e-06, "loss": 0.3875, "step": 108155 }, { "epoch": 3.8980790716113454, "grad_norm": 0.24647603929042816, "learning_rate": 6.116757489925462e-06, "loss": 0.3586, "step": 108160 }, { "epoch": 3.89825927127257, "grad_norm": 0.20428353548049927, "learning_rate": 6.114845234328881e-06, "loss": 0.3424, "step": 108165 }, { "epoch": 3.898439470933795, "grad_norm": 0.2739085257053375, "learning_rate": 6.112933236038878e-06, "loss": 0.3761, "step": 108170 }, { "epoch": 3.898619670595019, "grad_norm": 0.21551024913787842, "learning_rate": 6.111021495081496e-06, "loss": 0.3632, "step": 108175 }, { "epoch": 3.898799870256244, "grad_norm": 0.2526455819606781, "learning_rate": 6.109110011482797e-06, "loss": 0.3779, "step": 108180 }, { "epoch": 3.8989800699174686, "grad_norm": 0.2527839243412018, "learning_rate": 6.107198785268814e-06, "loss": 0.3568, "step": 108185 }, { "epoch": 3.8991602695786933, "grad_norm": 0.24524226784706116, "learning_rate": 6.105287816465591e-06, "loss": 0.3872, "step": 108190 }, { "epoch": 3.8993404692399176, "grad_norm": 0.20940566062927246, "learning_rate": 6.103377105099165e-06, 
"loss": 0.3772, "step": 108195 }, { "epoch": 3.8995206689011424, "grad_norm": 0.2865338921546936, "learning_rate": 6.101466651195564e-06, "loss": 0.4084, "step": 108200 }, { "epoch": 3.899700868562367, "grad_norm": 0.2509433925151825, "learning_rate": 6.099556454780817e-06, "loss": 0.3579, "step": 108205 }, { "epoch": 3.899881068223592, "grad_norm": 0.271535187959671, "learning_rate": 6.097646515880967e-06, "loss": 0.4306, "step": 108210 }, { "epoch": 3.9000612678848166, "grad_norm": 0.2970694899559021, "learning_rate": 6.095736834522009e-06, "loss": 0.3645, "step": 108215 }, { "epoch": 3.9002414675460413, "grad_norm": 0.2571132779121399, "learning_rate": 6.093827410729985e-06, "loss": 0.4004, "step": 108220 }, { "epoch": 3.9004216672072656, "grad_norm": 0.23718644678592682, "learning_rate": 6.091918244530903e-06, "loss": 0.3924, "step": 108225 }, { "epoch": 3.9006018668684903, "grad_norm": 0.24922458827495575, "learning_rate": 6.090009335950772e-06, "loss": 0.3443, "step": 108230 }, { "epoch": 3.900782066529715, "grad_norm": 0.2874678075313568, "learning_rate": 6.088100685015605e-06, "loss": 0.3818, "step": 108235 }, { "epoch": 3.9009622661909393, "grad_norm": 0.23355595767498016, "learning_rate": 6.0861922917514024e-06, "loss": 0.391, "step": 108240 }, { "epoch": 3.901142465852164, "grad_norm": 0.2503456473350525, "learning_rate": 6.084284156184161e-06, "loss": 0.3889, "step": 108245 }, { "epoch": 3.901322665513389, "grad_norm": 0.2787312865257263, "learning_rate": 6.082376278339893e-06, "loss": 0.4176, "step": 108250 }, { "epoch": 3.9015028651746135, "grad_norm": 0.24109527468681335, "learning_rate": 6.080468658244587e-06, "loss": 0.3879, "step": 108255 }, { "epoch": 3.9016830648358383, "grad_norm": 0.24064235389232635, "learning_rate": 6.078561295924232e-06, "loss": 0.4142, "step": 108260 }, { "epoch": 3.901863264497063, "grad_norm": 0.20909632742404938, "learning_rate": 6.076654191404815e-06, "loss": 0.3915, "step": 108265 }, { "epoch": 3.9020434641582873, 
"grad_norm": 0.21806110441684723, "learning_rate": 6.074747344712314e-06, "loss": 0.3774, "step": 108270 }, { "epoch": 3.902223663819512, "grad_norm": 0.21959331631660461, "learning_rate": 6.0728407558727246e-06, "loss": 0.375, "step": 108275 }, { "epoch": 3.9024038634807368, "grad_norm": 0.20421810448169708, "learning_rate": 6.070934424912014e-06, "loss": 0.3855, "step": 108280 }, { "epoch": 3.902584063141961, "grad_norm": 0.18650266528129578, "learning_rate": 6.06902835185616e-06, "loss": 0.3661, "step": 108285 }, { "epoch": 3.902764262803186, "grad_norm": 0.23381759226322174, "learning_rate": 6.067122536731126e-06, "loss": 0.4026, "step": 108290 }, { "epoch": 3.9029444624644105, "grad_norm": 0.19069373607635498, "learning_rate": 6.065216979562877e-06, "loss": 0.3993, "step": 108295 }, { "epoch": 3.9031246621256352, "grad_norm": 0.22146674990653992, "learning_rate": 6.063311680377387e-06, "loss": 0.3903, "step": 108300 }, { "epoch": 3.90330486178686, "grad_norm": 0.24637946486473083, "learning_rate": 6.061406639200617e-06, "loss": 0.3661, "step": 108305 }, { "epoch": 3.9034850614480847, "grad_norm": 0.2656784653663635, "learning_rate": 6.0595018560585e-06, "loss": 0.3357, "step": 108310 }, { "epoch": 3.903665261109309, "grad_norm": 0.21832560002803802, "learning_rate": 6.057597330977011e-06, "loss": 0.348, "step": 108315 }, { "epoch": 3.9038454607705337, "grad_norm": 0.215951070189476, "learning_rate": 6.055693063982082e-06, "loss": 0.3295, "step": 108320 }, { "epoch": 3.9040256604317585, "grad_norm": 0.23906680941581726, "learning_rate": 6.053789055099685e-06, "loss": 0.3876, "step": 108325 }, { "epoch": 3.9042058600929828, "grad_norm": 0.2423383742570877, "learning_rate": 6.051885304355734e-06, "loss": 0.3528, "step": 108330 }, { "epoch": 3.9043860597542075, "grad_norm": 0.20754733681678772, "learning_rate": 6.04998181177617e-06, "loss": 0.3529, "step": 108335 }, { "epoch": 3.9045662594154322, "grad_norm": 0.23662780225276947, "learning_rate": 
6.048078577386945e-06, "loss": 0.3659, "step": 108340 }, { "epoch": 3.904746459076657, "grad_norm": 0.3288610577583313, "learning_rate": 6.046175601213977e-06, "loss": 0.4009, "step": 108345 }, { "epoch": 3.9049266587378817, "grad_norm": 0.30411192774772644, "learning_rate": 6.044272883283198e-06, "loss": 0.3728, "step": 108350 }, { "epoch": 3.9051068583991064, "grad_norm": 0.2672629654407501, "learning_rate": 6.04237042362053e-06, "loss": 0.3629, "step": 108355 }, { "epoch": 3.9052870580603307, "grad_norm": 0.22953931987285614, "learning_rate": 6.040468222251891e-06, "loss": 0.3809, "step": 108360 }, { "epoch": 3.9054672577215555, "grad_norm": 0.22573940455913544, "learning_rate": 6.038566279203206e-06, "loss": 0.3482, "step": 108365 }, { "epoch": 3.90564745738278, "grad_norm": 0.29565203189849854, "learning_rate": 6.036664594500385e-06, "loss": 0.3928, "step": 108370 }, { "epoch": 3.905827657044005, "grad_norm": 0.2615247070789337, "learning_rate": 6.0347631681693384e-06, "loss": 0.3485, "step": 108375 }, { "epoch": 3.906007856705229, "grad_norm": 0.3057554364204407, "learning_rate": 6.0328620002359695e-06, "loss": 0.3922, "step": 108380 }, { "epoch": 3.906188056366454, "grad_norm": 0.23635023832321167, "learning_rate": 6.030961090726186e-06, "loss": 0.4425, "step": 108385 }, { "epoch": 3.9063682560276787, "grad_norm": 0.16913343966007233, "learning_rate": 6.029060439665876e-06, "loss": 0.3574, "step": 108390 }, { "epoch": 3.9065484556889034, "grad_norm": 0.20368297398090363, "learning_rate": 6.0271600470809595e-06, "loss": 0.3818, "step": 108395 }, { "epoch": 3.906728655350128, "grad_norm": 0.27825161814689636, "learning_rate": 6.0252599129973e-06, "loss": 0.3998, "step": 108400 }, { "epoch": 3.9069088550113524, "grad_norm": 0.2604582905769348, "learning_rate": 6.023360037440809e-06, "loss": 0.4151, "step": 108405 }, { "epoch": 3.907089054672577, "grad_norm": 0.27497240900993347, "learning_rate": 6.021460420437364e-06, "loss": 0.382, "step": 108410 }, { "epoch": 
3.907269254333802, "grad_norm": 0.18511195480823517, "learning_rate": 6.0195610620128354e-06, "loss": 0.389, "step": 108415 }, { "epoch": 3.9074494539950266, "grad_norm": 0.24932751059532166, "learning_rate": 6.01766196219313e-06, "loss": 0.3824, "step": 108420 }, { "epoch": 3.907629653656251, "grad_norm": 0.24645911157131195, "learning_rate": 6.0157631210040975e-06, "loss": 0.3741, "step": 108425 }, { "epoch": 3.9078098533174757, "grad_norm": 0.24892014265060425, "learning_rate": 6.0138645384716085e-06, "loss": 0.368, "step": 108430 }, { "epoch": 3.9079900529787004, "grad_norm": 0.22853389382362366, "learning_rate": 6.0119662146215475e-06, "loss": 0.3671, "step": 108435 }, { "epoch": 3.908170252639925, "grad_norm": 0.2502167820930481, "learning_rate": 6.010068149479772e-06, "loss": 0.4101, "step": 108440 }, { "epoch": 3.90835045230115, "grad_norm": 0.25325870513916016, "learning_rate": 6.008170343072139e-06, "loss": 0.3676, "step": 108445 }, { "epoch": 3.908530651962374, "grad_norm": 0.269979864358902, "learning_rate": 6.006272795424511e-06, "loss": 0.3602, "step": 108450 }, { "epoch": 3.908710851623599, "grad_norm": 0.21623685956001282, "learning_rate": 6.004375506562729e-06, "loss": 0.3969, "step": 108455 }, { "epoch": 3.9088910512848236, "grad_norm": 0.20743024349212646, "learning_rate": 6.002478476512663e-06, "loss": 0.3587, "step": 108460 }, { "epoch": 3.9090712509460483, "grad_norm": 0.26383909583091736, "learning_rate": 6.0005817053001485e-06, "loss": 0.3759, "step": 108465 }, { "epoch": 3.9092514506072726, "grad_norm": 0.22323699295520782, "learning_rate": 5.998685192951029e-06, "loss": 0.371, "step": 108470 }, { "epoch": 3.9094316502684974, "grad_norm": 0.2346304953098297, "learning_rate": 5.996788939491143e-06, "loss": 0.3776, "step": 108475 }, { "epoch": 3.909611849929722, "grad_norm": 0.25308549404144287, "learning_rate": 5.994892944946326e-06, "loss": 0.3947, "step": 108480 }, { "epoch": 3.909792049590947, "grad_norm": 0.24598398804664612, 
"learning_rate": 5.992997209342416e-06, "loss": 0.3543, "step": 108485 }, { "epoch": 3.9099722492521716, "grad_norm": 0.21500460803508759, "learning_rate": 5.991101732705248e-06, "loss": 0.3669, "step": 108490 }, { "epoch": 3.9101524489133963, "grad_norm": 0.2395174652338028, "learning_rate": 5.989206515060625e-06, "loss": 0.3834, "step": 108495 }, { "epoch": 3.9103326485746206, "grad_norm": 0.20296072959899902, "learning_rate": 5.987311556434391e-06, "loss": 0.4221, "step": 108500 }, { "epoch": 3.9103326485746206, "eval_loss": 0.4281059205532074, "eval_runtime": 3.5316, "eval_samples_per_second": 28.316, "eval_steps_per_second": 7.079, "step": 108500 }, { "epoch": 3.9105128482358453, "grad_norm": 0.22557717561721802, "learning_rate": 5.985416856852347e-06, "loss": 0.3589, "step": 108505 }, { "epoch": 3.91069304789707, "grad_norm": 0.22067588567733765, "learning_rate": 5.9835224163403315e-06, "loss": 0.3639, "step": 108510 }, { "epoch": 3.9108732475582944, "grad_norm": 0.2571333050727844, "learning_rate": 5.9816282349241335e-06, "loss": 0.3619, "step": 108515 }, { "epoch": 3.911053447219519, "grad_norm": 0.24989810585975647, "learning_rate": 5.979734312629562e-06, "loss": 0.4091, "step": 108520 }, { "epoch": 3.911233646880744, "grad_norm": 0.2283269464969635, "learning_rate": 5.977840649482435e-06, "loss": 0.3804, "step": 108525 }, { "epoch": 3.9114138465419686, "grad_norm": 0.2447991967201233, "learning_rate": 5.975947245508548e-06, "loss": 0.3941, "step": 108530 }, { "epoch": 3.9115940462031933, "grad_norm": 0.27993232011795044, "learning_rate": 5.9740541007336974e-06, "loss": 0.3952, "step": 108535 }, { "epoch": 3.911774245864418, "grad_norm": 0.21602769196033478, "learning_rate": 5.972161215183672e-06, "loss": 0.3507, "step": 108540 }, { "epoch": 3.9119544455256423, "grad_norm": 0.2476750910282135, "learning_rate": 5.970268588884262e-06, "loss": 0.3867, "step": 108545 }, { "epoch": 3.912134645186867, "grad_norm": 0.2227691113948822, "learning_rate": 
5.968376221861266e-06, "loss": 0.3658, "step": 108550 }, { "epoch": 3.9123148448480918, "grad_norm": 0.19756188988685608, "learning_rate": 5.9664841141404585e-06, "loss": 0.3666, "step": 108555 }, { "epoch": 3.912495044509316, "grad_norm": 0.2740320563316345, "learning_rate": 5.964592265747617e-06, "loss": 0.3738, "step": 108560 }, { "epoch": 3.912675244170541, "grad_norm": 0.28433406352996826, "learning_rate": 5.9627006767085244e-06, "loss": 0.3794, "step": 108565 }, { "epoch": 3.9128554438317655, "grad_norm": 0.3014092445373535, "learning_rate": 5.960809347048948e-06, "loss": 0.3592, "step": 108570 }, { "epoch": 3.9130356434929903, "grad_norm": 0.2842918038368225, "learning_rate": 5.95891827679465e-06, "loss": 0.3823, "step": 108575 }, { "epoch": 3.913215843154215, "grad_norm": 0.27905863523483276, "learning_rate": 5.957027465971413e-06, "loss": 0.3583, "step": 108580 }, { "epoch": 3.9133960428154397, "grad_norm": 0.27240729331970215, "learning_rate": 5.955136914604989e-06, "loss": 0.3527, "step": 108585 }, { "epoch": 3.913576242476664, "grad_norm": 0.2016393542289734, "learning_rate": 5.953246622721137e-06, "loss": 0.3709, "step": 108590 }, { "epoch": 3.9137564421378888, "grad_norm": 0.24369855225086212, "learning_rate": 5.951356590345611e-06, "loss": 0.3883, "step": 108595 }, { "epoch": 3.9139366417991135, "grad_norm": 0.20250004529953003, "learning_rate": 5.94946681750416e-06, "loss": 0.3649, "step": 108600 }, { "epoch": 3.914116841460338, "grad_norm": 0.26689743995666504, "learning_rate": 5.9475773042225455e-06, "loss": 0.3699, "step": 108605 }, { "epoch": 3.9142970411215625, "grad_norm": 0.24879872798919678, "learning_rate": 5.945688050526496e-06, "loss": 0.3921, "step": 108610 }, { "epoch": 3.9144772407827872, "grad_norm": 0.20798632502555847, "learning_rate": 5.943799056441751e-06, "loss": 0.3478, "step": 108615 }, { "epoch": 3.914657440444012, "grad_norm": 0.2264632135629654, "learning_rate": 5.941910321994063e-06, "loss": 0.3668, "step": 108620 }, { 
"epoch": 3.9148376401052367, "grad_norm": 0.26185837388038635, "learning_rate": 5.940021847209157e-06, "loss": 0.3667, "step": 108625 }, { "epoch": 3.9150178397664614, "grad_norm": 0.20407342910766602, "learning_rate": 5.93813363211276e-06, "loss": 0.3491, "step": 108630 }, { "epoch": 3.9151980394276857, "grad_norm": 0.21757426857948303, "learning_rate": 5.936245676730603e-06, "loss": 0.3534, "step": 108635 }, { "epoch": 3.9153782390889105, "grad_norm": 0.23258854448795319, "learning_rate": 5.934357981088403e-06, "loss": 0.4074, "step": 108640 }, { "epoch": 3.915558438750135, "grad_norm": 0.31023046374320984, "learning_rate": 5.932470545211891e-06, "loss": 0.4156, "step": 108645 }, { "epoch": 3.91573863841136, "grad_norm": 0.2955925166606903, "learning_rate": 5.930583369126774e-06, "loss": 0.4184, "step": 108650 }, { "epoch": 3.9159188380725842, "grad_norm": 0.25111132860183716, "learning_rate": 5.928696452858768e-06, "loss": 0.3435, "step": 108655 }, { "epoch": 3.916099037733809, "grad_norm": 0.25099194049835205, "learning_rate": 5.92680979643358e-06, "loss": 0.3505, "step": 108660 }, { "epoch": 3.9162792373950337, "grad_norm": 0.23228740692138672, "learning_rate": 5.924923399876908e-06, "loss": 0.3887, "step": 108665 }, { "epoch": 3.9164594370562584, "grad_norm": 0.26973646879196167, "learning_rate": 5.9230372632144705e-06, "loss": 0.3786, "step": 108670 }, { "epoch": 3.916639636717483, "grad_norm": 0.2194148302078247, "learning_rate": 5.921151386471962e-06, "loss": 0.3936, "step": 108675 }, { "epoch": 3.9168198363787075, "grad_norm": 0.21613870561122894, "learning_rate": 5.919265769675059e-06, "loss": 0.3644, "step": 108680 }, { "epoch": 3.917000036039932, "grad_norm": 0.2264699786901474, "learning_rate": 5.917380412849474e-06, "loss": 0.3825, "step": 108685 }, { "epoch": 3.917180235701157, "grad_norm": 0.21265465021133423, "learning_rate": 5.9154953160208776e-06, "loss": 0.3764, "step": 108690 }, { "epoch": 3.9173604353623817, "grad_norm": 0.2905406653881073, 
"learning_rate": 5.913610479214976e-06, "loss": 0.3926, "step": 108695 }, { "epoch": 3.917540635023606, "grad_norm": 0.30452674627304077, "learning_rate": 5.911725902457432e-06, "loss": 0.3959, "step": 108700 }, { "epoch": 3.9177208346848307, "grad_norm": 0.20968550443649292, "learning_rate": 5.909841585773917e-06, "loss": 0.3671, "step": 108705 }, { "epoch": 3.9179010343460554, "grad_norm": 0.20306989550590515, "learning_rate": 5.9079575291901254e-06, "loss": 0.3766, "step": 108710 }, { "epoch": 3.91808123400728, "grad_norm": 0.20528307557106018, "learning_rate": 5.906073732731712e-06, "loss": 0.3609, "step": 108715 }, { "epoch": 3.918261433668505, "grad_norm": 0.32778772711753845, "learning_rate": 5.90419019642435e-06, "loss": 0.3861, "step": 108720 }, { "epoch": 3.9184416333297296, "grad_norm": 0.25821900367736816, "learning_rate": 5.9023069202936984e-06, "loss": 0.3839, "step": 108725 }, { "epoch": 3.918621832990954, "grad_norm": 0.2765897810459137, "learning_rate": 5.90042390436541e-06, "loss": 0.3741, "step": 108730 }, { "epoch": 3.9188020326521786, "grad_norm": 0.22229650616645813, "learning_rate": 5.898541148665154e-06, "loss": 0.3783, "step": 108735 }, { "epoch": 3.9189822323134034, "grad_norm": 0.32153648138046265, "learning_rate": 5.896658653218578e-06, "loss": 0.3865, "step": 108740 }, { "epoch": 3.9191624319746277, "grad_norm": 0.23668645322322845, "learning_rate": 5.8947764180513316e-06, "loss": 0.3962, "step": 108745 }, { "epoch": 3.9193426316358524, "grad_norm": 0.26345136761665344, "learning_rate": 5.892894443189056e-06, "loss": 0.3455, "step": 108750 }, { "epoch": 3.919522831297077, "grad_norm": 0.23843061923980713, "learning_rate": 5.891012728657394e-06, "loss": 0.3981, "step": 108755 }, { "epoch": 3.919703030958302, "grad_norm": 0.25555703043937683, "learning_rate": 5.889131274481977e-06, "loss": 0.3639, "step": 108760 }, { "epoch": 3.9198832306195266, "grad_norm": 0.18954876065254211, "learning_rate": 5.887250080688453e-06, "loss": 0.3554, 
"step": 108765 }, { "epoch": 3.9200634302807513, "grad_norm": 0.2582615613937378, "learning_rate": 5.885369147302447e-06, "loss": 0.4017, "step": 108770 }, { "epoch": 3.9202436299419756, "grad_norm": 0.20174746215343475, "learning_rate": 5.883488474349586e-06, "loss": 0.381, "step": 108775 }, { "epoch": 3.9204238296032003, "grad_norm": 0.2733262777328491, "learning_rate": 5.881608061855492e-06, "loss": 0.39, "step": 108780 }, { "epoch": 3.920604029264425, "grad_norm": 0.21888548135757446, "learning_rate": 5.879727909845781e-06, "loss": 0.3547, "step": 108785 }, { "epoch": 3.9207842289256494, "grad_norm": 0.26759272813796997, "learning_rate": 5.8778480183460885e-06, "loss": 0.4001, "step": 108790 }, { "epoch": 3.920964428586874, "grad_norm": 0.26541879773139954, "learning_rate": 5.875968387382008e-06, "loss": 0.342, "step": 108795 }, { "epoch": 3.921144628248099, "grad_norm": 0.2672159671783447, "learning_rate": 5.874089016979151e-06, "loss": 0.3564, "step": 108800 }, { "epoch": 3.9213248279093236, "grad_norm": 0.18863865733146667, "learning_rate": 5.872209907163132e-06, "loss": 0.3994, "step": 108805 }, { "epoch": 3.9215050275705483, "grad_norm": 0.2630614638328552, "learning_rate": 5.870331057959552e-06, "loss": 0.3765, "step": 108810 }, { "epoch": 3.921685227231773, "grad_norm": 0.2330998033285141, "learning_rate": 5.868452469394006e-06, "loss": 0.3701, "step": 108815 }, { "epoch": 3.9218654268929973, "grad_norm": 0.20276689529418945, "learning_rate": 5.86657414149209e-06, "loss": 0.3582, "step": 108820 }, { "epoch": 3.922045626554222, "grad_norm": 0.2714731991291046, "learning_rate": 5.864696074279394e-06, "loss": 0.3724, "step": 108825 }, { "epoch": 3.922225826215447, "grad_norm": 0.2654849886894226, "learning_rate": 5.862818267781514e-06, "loss": 0.3798, "step": 108830 }, { "epoch": 3.922406025876671, "grad_norm": 0.18133863806724548, "learning_rate": 5.860940722024027e-06, "loss": 0.4061, "step": 108835 }, { "epoch": 3.922586225537896, "grad_norm": 
0.2509717345237732, "learning_rate": 5.8590634370325224e-06, "loss": 0.4112, "step": 108840 }, { "epoch": 3.9227664251991206, "grad_norm": 0.23945482075214386, "learning_rate": 5.857186412832569e-06, "loss": 0.3412, "step": 108845 }, { "epoch": 3.9229466248603453, "grad_norm": 0.30383405089378357, "learning_rate": 5.855309649449739e-06, "loss": 0.4108, "step": 108850 }, { "epoch": 3.92312682452157, "grad_norm": 0.20874951779842377, "learning_rate": 5.853433146909618e-06, "loss": 0.4059, "step": 108855 }, { "epoch": 3.9233070241827948, "grad_norm": 0.3230457901954651, "learning_rate": 5.851556905237768e-06, "loss": 0.4288, "step": 108860 }, { "epoch": 3.923487223844019, "grad_norm": 0.28123703598976135, "learning_rate": 5.849680924459733e-06, "loss": 0.4122, "step": 108865 }, { "epoch": 3.923667423505244, "grad_norm": 0.2393689751625061, "learning_rate": 5.847805204601095e-06, "loss": 0.3825, "step": 108870 }, { "epoch": 3.9238476231664685, "grad_norm": 0.23288077116012573, "learning_rate": 5.845929745687398e-06, "loss": 0.3605, "step": 108875 }, { "epoch": 3.9240278228276932, "grad_norm": 0.23712435364723206, "learning_rate": 5.844054547744204e-06, "loss": 0.3832, "step": 108880 }, { "epoch": 3.9242080224889175, "grad_norm": 0.23456718027591705, "learning_rate": 5.842179610797066e-06, "loss": 0.3643, "step": 108885 }, { "epoch": 3.9243882221501423, "grad_norm": 0.2397494912147522, "learning_rate": 5.84030493487151e-06, "loss": 0.3329, "step": 108890 }, { "epoch": 3.924568421811367, "grad_norm": 0.22742457687854767, "learning_rate": 5.838430519993096e-06, "loss": 0.3847, "step": 108895 }, { "epoch": 3.9247486214725917, "grad_norm": 0.21573381125926971, "learning_rate": 5.836556366187354e-06, "loss": 0.3471, "step": 108900 }, { "epoch": 3.9249288211338165, "grad_norm": 0.2524566650390625, "learning_rate": 5.834682473479824e-06, "loss": 0.35, "step": 108905 }, { "epoch": 3.9251090207950408, "grad_norm": 0.22824294865131378, "learning_rate": 5.832808841896034e-06, 
"loss": 0.3783, "step": 108910 }, { "epoch": 3.9252892204562655, "grad_norm": 0.2146436721086502, "learning_rate": 5.830935471461504e-06, "loss": 0.3896, "step": 108915 }, { "epoch": 3.9254694201174902, "grad_norm": 0.2600581645965576, "learning_rate": 5.829062362201776e-06, "loss": 0.3775, "step": 108920 }, { "epoch": 3.925649619778715, "grad_norm": 0.3017957806587219, "learning_rate": 5.827189514142361e-06, "loss": 0.4071, "step": 108925 }, { "epoch": 3.9258298194399393, "grad_norm": 0.2875199615955353, "learning_rate": 5.8253169273087774e-06, "loss": 0.4051, "step": 108930 }, { "epoch": 3.926010019101164, "grad_norm": 0.22084161639213562, "learning_rate": 5.823444601726538e-06, "loss": 0.3576, "step": 108935 }, { "epoch": 3.9261902187623887, "grad_norm": 0.19947287440299988, "learning_rate": 5.8215725374211525e-06, "loss": 0.3679, "step": 108940 }, { "epoch": 3.9263704184236135, "grad_norm": 0.2296856790781021, "learning_rate": 5.819700734418121e-06, "loss": 0.4062, "step": 108945 }, { "epoch": 3.926550618084838, "grad_norm": 0.258223295211792, "learning_rate": 5.817829192742963e-06, "loss": 0.3719, "step": 108950 }, { "epoch": 3.9267308177460625, "grad_norm": 0.27443790435791016, "learning_rate": 5.8159579124211665e-06, "loss": 0.3654, "step": 108955 }, { "epoch": 3.926911017407287, "grad_norm": 0.23533061146736145, "learning_rate": 5.8140868934782295e-06, "loss": 0.3518, "step": 108960 }, { "epoch": 3.927091217068512, "grad_norm": 0.21623454988002777, "learning_rate": 5.812216135939644e-06, "loss": 0.3867, "step": 108965 }, { "epoch": 3.9272714167297367, "grad_norm": 0.2719837725162506, "learning_rate": 5.810345639830891e-06, "loss": 0.3659, "step": 108970 }, { "epoch": 3.927451616390961, "grad_norm": 0.2205159217119217, "learning_rate": 5.80847540517748e-06, "loss": 0.3625, "step": 108975 }, { "epoch": 3.9276318160521857, "grad_norm": 0.23744551837444305, "learning_rate": 5.806605432004866e-06, "loss": 0.3772, "step": 108980 }, { "epoch": 3.9278120157134104, 
"grad_norm": 0.235315203666687, "learning_rate": 5.804735720338531e-06, "loss": 0.3935, "step": 108985 }, { "epoch": 3.927992215374635, "grad_norm": 0.26385051012039185, "learning_rate": 5.802866270203964e-06, "loss": 0.3864, "step": 108990 }, { "epoch": 3.92817241503586, "grad_norm": 0.234380304813385, "learning_rate": 5.800997081626619e-06, "loss": 0.3736, "step": 108995 }, { "epoch": 3.9283526146970846, "grad_norm": 0.21200546622276306, "learning_rate": 5.799128154631986e-06, "loss": 0.3632, "step": 109000 }, { "epoch": 3.9283526146970846, "eval_loss": 0.42800185084342957, "eval_runtime": 3.5264, "eval_samples_per_second": 28.357, "eval_steps_per_second": 7.089, "step": 109000 }, { "epoch": 3.928532814358309, "grad_norm": 0.24245581030845642, "learning_rate": 5.797259489245508e-06, "loss": 0.3871, "step": 109005 }, { "epoch": 3.9287130140195337, "grad_norm": 0.29158252477645874, "learning_rate": 5.795391085492644e-06, "loss": 0.4042, "step": 109010 }, { "epoch": 3.9288932136807584, "grad_norm": 0.2737109959125519, "learning_rate": 5.793522943398863e-06, "loss": 0.4132, "step": 109015 }, { "epoch": 3.9290734133419827, "grad_norm": 0.255136638879776, "learning_rate": 5.791655062989618e-06, "loss": 0.3969, "step": 109020 }, { "epoch": 3.9292536130032074, "grad_norm": 0.2431129515171051, "learning_rate": 5.789787444290351e-06, "loss": 0.3924, "step": 109025 }, { "epoch": 3.929433812664432, "grad_norm": 0.27715519070625305, "learning_rate": 5.787920087326512e-06, "loss": 0.3818, "step": 109030 }, { "epoch": 3.929614012325657, "grad_norm": 0.24491608142852783, "learning_rate": 5.786052992123533e-06, "loss": 0.3742, "step": 109035 }, { "epoch": 3.9297942119868816, "grad_norm": 0.21611642837524414, "learning_rate": 5.784186158706872e-06, "loss": 0.3943, "step": 109040 }, { "epoch": 3.9299744116481063, "grad_norm": 0.2787199914455414, "learning_rate": 5.782319587101953e-06, "loss": 0.3888, "step": 109045 }, { "epoch": 3.9301546113093306, "grad_norm": 0.26569807529449463, 
"learning_rate": 5.780453277334208e-06, "loss": 0.3812, "step": 109050 }, { "epoch": 3.9303348109705554, "grad_norm": 0.25157374143600464, "learning_rate": 5.778587229429069e-06, "loss": 0.4124, "step": 109055 }, { "epoch": 3.93051501063178, "grad_norm": 0.21944718062877655, "learning_rate": 5.776721443411956e-06, "loss": 0.3392, "step": 109060 }, { "epoch": 3.9306952102930044, "grad_norm": 0.25366106629371643, "learning_rate": 5.774855919308284e-06, "loss": 0.4012, "step": 109065 }, { "epoch": 3.930875409954229, "grad_norm": 0.24010884761810303, "learning_rate": 5.772990657143492e-06, "loss": 0.3748, "step": 109070 }, { "epoch": 3.931055609615454, "grad_norm": 0.2395809143781662, "learning_rate": 5.771125656942966e-06, "loss": 0.3855, "step": 109075 }, { "epoch": 3.9312358092766786, "grad_norm": 0.22961130738258362, "learning_rate": 5.769260918732139e-06, "loss": 0.3671, "step": 109080 }, { "epoch": 3.9314160089379033, "grad_norm": 0.21910971403121948, "learning_rate": 5.767396442536404e-06, "loss": 0.3404, "step": 109085 }, { "epoch": 3.931596208599128, "grad_norm": 0.2396189421415329, "learning_rate": 5.765532228381174e-06, "loss": 0.404, "step": 109090 }, { "epoch": 3.9317764082603524, "grad_norm": 0.25804564356803894, "learning_rate": 5.763668276291842e-06, "loss": 0.3575, "step": 109095 }, { "epoch": 3.931956607921577, "grad_norm": 0.2125440388917923, "learning_rate": 5.761804586293798e-06, "loss": 0.3658, "step": 109100 }, { "epoch": 3.932136807582802, "grad_norm": 0.20259714126586914, "learning_rate": 5.759941158412449e-06, "loss": 0.3936, "step": 109105 }, { "epoch": 3.932317007244026, "grad_norm": 0.26884329319000244, "learning_rate": 5.758077992673175e-06, "loss": 0.3642, "step": 109110 }, { "epoch": 3.932497206905251, "grad_norm": 0.22424453496932983, "learning_rate": 5.756215089101366e-06, "loss": 0.404, "step": 109115 }, { "epoch": 3.9326774065664756, "grad_norm": 0.3316144645214081, "learning_rate": 5.7543524477224e-06, "loss": 0.3845, "step": 109120 
}, { "epoch": 3.9328576062277003, "grad_norm": 0.2477588653564453, "learning_rate": 5.752490068561653e-06, "loss": 0.3847, "step": 109125 }, { "epoch": 3.933037805888925, "grad_norm": 0.21736137568950653, "learning_rate": 5.750627951644496e-06, "loss": 0.3864, "step": 109130 }, { "epoch": 3.9332180055501498, "grad_norm": 0.253451406955719, "learning_rate": 5.748766096996316e-06, "loss": 0.3711, "step": 109135 }, { "epoch": 3.933398205211374, "grad_norm": 0.2794763147830963, "learning_rate": 5.74690450464247e-06, "loss": 0.3803, "step": 109140 }, { "epoch": 3.933578404872599, "grad_norm": 0.24337390065193176, "learning_rate": 5.745043174608322e-06, "loss": 0.3818, "step": 109145 }, { "epoch": 3.9337586045338235, "grad_norm": 0.2820989489555359, "learning_rate": 5.7431821069192345e-06, "loss": 0.4063, "step": 109150 }, { "epoch": 3.9339388041950483, "grad_norm": 0.27026164531707764, "learning_rate": 5.741321301600553e-06, "loss": 0.3897, "step": 109155 }, { "epoch": 3.9341190038562726, "grad_norm": 0.24505634605884552, "learning_rate": 5.7394607586776475e-06, "loss": 0.3734, "step": 109160 }, { "epoch": 3.9342992035174973, "grad_norm": 0.2610364258289337, "learning_rate": 5.73760047817587e-06, "loss": 0.3896, "step": 109165 }, { "epoch": 3.934479403178722, "grad_norm": 0.2157258540391922, "learning_rate": 5.735740460120539e-06, "loss": 0.3791, "step": 109170 }, { "epoch": 3.9346596028399468, "grad_norm": 0.2157232165336609, "learning_rate": 5.733880704537023e-06, "loss": 0.3816, "step": 109175 }, { "epoch": 3.9348398025011715, "grad_norm": 0.22246497869491577, "learning_rate": 5.732021211450647e-06, "loss": 0.3887, "step": 109180 }, { "epoch": 3.935020002162396, "grad_norm": 0.1965298354625702, "learning_rate": 5.730161980886764e-06, "loss": 0.373, "step": 109185 }, { "epoch": 3.9352002018236205, "grad_norm": 0.2733939588069916, "learning_rate": 5.7283030128706865e-06, "loss": 0.3991, "step": 109190 }, { "epoch": 3.9353804014848452, "grad_norm": 0.22759440541267395, 
"learning_rate": 5.726444307427742e-06, "loss": 0.3825, "step": 109195 }, { "epoch": 3.93556060114607, "grad_norm": 0.24440236389636993, "learning_rate": 5.724585864583271e-06, "loss": 0.3675, "step": 109200 }, { "epoch": 3.9357408008072943, "grad_norm": 0.24098166823387146, "learning_rate": 5.722727684362583e-06, "loss": 0.3957, "step": 109205 }, { "epoch": 3.935921000468519, "grad_norm": 0.1858464926481247, "learning_rate": 5.720869766790999e-06, "loss": 0.3768, "step": 109210 }, { "epoch": 3.9361012001297437, "grad_norm": 0.2523440420627594, "learning_rate": 5.719012111893832e-06, "loss": 0.3604, "step": 109215 }, { "epoch": 3.9362813997909685, "grad_norm": 0.3191049098968506, "learning_rate": 5.7171547196963854e-06, "loss": 0.3716, "step": 109220 }, { "epoch": 3.936461599452193, "grad_norm": 0.22533096373081207, "learning_rate": 5.715297590223981e-06, "loss": 0.3648, "step": 109225 }, { "epoch": 3.936641799113418, "grad_norm": 0.23449839651584625, "learning_rate": 5.713440723501912e-06, "loss": 0.4066, "step": 109230 }, { "epoch": 3.9368219987746422, "grad_norm": 0.2854231894016266, "learning_rate": 5.711584119555477e-06, "loss": 0.3489, "step": 109235 }, { "epoch": 3.937002198435867, "grad_norm": 0.21907241642475128, "learning_rate": 5.709727778409976e-06, "loss": 0.378, "step": 109240 }, { "epoch": 3.9371823980970917, "grad_norm": 0.2457626909017563, "learning_rate": 5.7078717000907e-06, "loss": 0.3533, "step": 109245 }, { "epoch": 3.937362597758316, "grad_norm": 0.24565333127975464, "learning_rate": 5.706015884622928e-06, "loss": 0.3805, "step": 109250 }, { "epoch": 3.9375427974195407, "grad_norm": 0.23259061574935913, "learning_rate": 5.704160332031969e-06, "loss": 0.373, "step": 109255 }, { "epoch": 3.9377229970807655, "grad_norm": 0.2456471472978592, "learning_rate": 5.702305042343075e-06, "loss": 0.357, "step": 109260 }, { "epoch": 3.93790319674199, "grad_norm": 0.27309781312942505, "learning_rate": 5.700450015581549e-06, "loss": 0.3559, "step": 109265 
}, { "epoch": 3.938083396403215, "grad_norm": 0.2628590166568756, "learning_rate": 5.698595251772651e-06, "loss": 0.3645, "step": 109270 }, { "epoch": 3.9382635960644397, "grad_norm": 0.19301342964172363, "learning_rate": 5.696740750941651e-06, "loss": 0.3669, "step": 109275 }, { "epoch": 3.938443795725664, "grad_norm": 0.2241153120994568, "learning_rate": 5.6948865131138344e-06, "loss": 0.3542, "step": 109280 }, { "epoch": 3.9386239953868887, "grad_norm": 0.21480773389339447, "learning_rate": 5.6930325383144386e-06, "loss": 0.3573, "step": 109285 }, { "epoch": 3.9388041950481134, "grad_norm": 0.27999842166900635, "learning_rate": 5.6911788265687435e-06, "loss": 0.4112, "step": 109290 }, { "epoch": 3.9389843947093377, "grad_norm": 0.2639464735984802, "learning_rate": 5.689325377901997e-06, "loss": 0.3861, "step": 109295 }, { "epoch": 3.9391645943705624, "grad_norm": 0.21820655465126038, "learning_rate": 5.6874721923394545e-06, "loss": 0.3692, "step": 109300 }, { "epoch": 3.939344794031787, "grad_norm": 0.2508092522621155, "learning_rate": 5.685619269906364e-06, "loss": 0.3893, "step": 109305 }, { "epoch": 3.939524993693012, "grad_norm": 0.22443386912345886, "learning_rate": 5.683766610627972e-06, "loss": 0.3788, "step": 109310 }, { "epoch": 3.9397051933542366, "grad_norm": 0.23551128804683685, "learning_rate": 5.681914214529513e-06, "loss": 0.3793, "step": 109315 }, { "epoch": 3.9398853930154614, "grad_norm": 0.2493138313293457, "learning_rate": 5.6800620816362395e-06, "loss": 0.3987, "step": 109320 }, { "epoch": 3.9400655926766857, "grad_norm": 0.27576449513435364, "learning_rate": 5.678210211973378e-06, "loss": 0.3981, "step": 109325 }, { "epoch": 3.9402457923379104, "grad_norm": 0.2820398509502411, "learning_rate": 5.676358605566165e-06, "loss": 0.346, "step": 109330 }, { "epoch": 3.940425991999135, "grad_norm": 0.21052910387516022, "learning_rate": 5.674507262439821e-06, "loss": 0.3848, "step": 109335 }, { "epoch": 3.9406061916603594, "grad_norm": 
0.2435581535100937, "learning_rate": 5.67265618261957e-06, "loss": 0.3897, "step": 109340 }, { "epoch": 3.940786391321584, "grad_norm": 0.23726899921894073, "learning_rate": 5.670805366130644e-06, "loss": 0.3792, "step": 109345 }, { "epoch": 3.940966590982809, "grad_norm": 0.26930510997772217, "learning_rate": 5.6689548129982565e-06, "loss": 0.3823, "step": 109350 }, { "epoch": 3.9411467906440336, "grad_norm": 0.2104661613702774, "learning_rate": 5.667104523247605e-06, "loss": 0.4122, "step": 109355 }, { "epoch": 3.9413269903052583, "grad_norm": 0.2654925286769867, "learning_rate": 5.665254496903919e-06, "loss": 0.3602, "step": 109360 }, { "epoch": 3.941507189966483, "grad_norm": 0.26603254675865173, "learning_rate": 5.663404733992389e-06, "loss": 0.3944, "step": 109365 }, { "epoch": 3.9416873896277074, "grad_norm": 0.25029662251472473, "learning_rate": 5.6615552345382405e-06, "loss": 0.3695, "step": 109370 }, { "epoch": 3.941867589288932, "grad_norm": 0.28742098808288574, "learning_rate": 5.65970599856665e-06, "loss": 0.3896, "step": 109375 }, { "epoch": 3.942047788950157, "grad_norm": 0.24512720108032227, "learning_rate": 5.657857026102814e-06, "loss": 0.3886, "step": 109380 }, { "epoch": 3.9422279886113816, "grad_norm": 0.23668235540390015, "learning_rate": 5.65600831717194e-06, "loss": 0.3648, "step": 109385 }, { "epoch": 3.942408188272606, "grad_norm": 0.22627753019332886, "learning_rate": 5.654159871799206e-06, "loss": 0.3937, "step": 109390 }, { "epoch": 3.9425883879338306, "grad_norm": 0.2681983709335327, "learning_rate": 5.652311690009798e-06, "loss": 0.365, "step": 109395 }, { "epoch": 3.9427685875950553, "grad_norm": 0.19054733216762543, "learning_rate": 5.650463771828898e-06, "loss": 0.3927, "step": 109400 }, { "epoch": 3.94294878725628, "grad_norm": 0.279351145029068, "learning_rate": 5.648616117281677e-06, "loss": 0.3602, "step": 109405 }, { "epoch": 3.943128986917505, "grad_norm": 0.20294272899627686, "learning_rate": 5.64676872639332e-06, "loss": 
0.365, "step": 109410 }, { "epoch": 3.943309186578729, "grad_norm": 0.3039834201335907, "learning_rate": 5.6449215991889934e-06, "loss": 0.3659, "step": 109415 }, { "epoch": 3.943489386239954, "grad_norm": 0.3200457990169525, "learning_rate": 5.643074735693862e-06, "loss": 0.3948, "step": 109420 }, { "epoch": 3.9436695859011786, "grad_norm": 0.2976410984992981, "learning_rate": 5.641228135933091e-06, "loss": 0.353, "step": 109425 }, { "epoch": 3.9438497855624033, "grad_norm": 0.23301257193088531, "learning_rate": 5.6393817999318365e-06, "loss": 0.3733, "step": 109430 }, { "epoch": 3.9440299852236276, "grad_norm": 0.2382785677909851, "learning_rate": 5.637535727715251e-06, "loss": 0.3598, "step": 109435 }, { "epoch": 3.9442101848848523, "grad_norm": 0.2509336471557617, "learning_rate": 5.635689919308498e-06, "loss": 0.3935, "step": 109440 }, { "epoch": 3.944390384546077, "grad_norm": 0.2762444317340851, "learning_rate": 5.633844374736724e-06, "loss": 0.4127, "step": 109445 }, { "epoch": 3.9445705842073018, "grad_norm": 0.2096366137266159, "learning_rate": 5.6319990940250696e-06, "loss": 0.393, "step": 109450 }, { "epoch": 3.9447507838685265, "grad_norm": 0.23750926554203033, "learning_rate": 5.630154077198677e-06, "loss": 0.371, "step": 109455 }, { "epoch": 3.944930983529751, "grad_norm": 0.22931966185569763, "learning_rate": 5.628309324282676e-06, "loss": 0.3814, "step": 109460 }, { "epoch": 3.9451111831909755, "grad_norm": 0.23210839927196503, "learning_rate": 5.626464835302228e-06, "loss": 0.3851, "step": 109465 }, { "epoch": 3.9452913828522003, "grad_norm": 0.3315199911594391, "learning_rate": 5.62462061028243e-06, "loss": 0.3627, "step": 109470 }, { "epoch": 3.945471582513425, "grad_norm": 0.20502960681915283, "learning_rate": 5.6227766492484315e-06, "loss": 0.391, "step": 109475 }, { "epoch": 3.9456517821746493, "grad_norm": 0.23437687754631042, "learning_rate": 5.62093295222535e-06, "loss": 0.3435, "step": 109480 }, { "epoch": 3.945831981835874, "grad_norm": 
0.2661973237991333, "learning_rate": 5.619089519238305e-06, "loss": 0.3835, "step": 109485 }, { "epoch": 3.9460121814970988, "grad_norm": 0.2659934163093567, "learning_rate": 5.617246350312414e-06, "loss": 0.4016, "step": 109490 }, { "epoch": 3.9461923811583235, "grad_norm": 0.24436159431934357, "learning_rate": 5.6154034454727885e-06, "loss": 0.3518, "step": 109495 }, { "epoch": 3.9463725808195482, "grad_norm": 0.23105373978614807, "learning_rate": 5.6135608047445306e-06, "loss": 0.3574, "step": 109500 }, { "epoch": 3.9463725808195482, "eval_loss": 0.4281553030014038, "eval_runtime": 3.5325, "eval_samples_per_second": 28.309, "eval_steps_per_second": 7.077, "step": 109500 }, { "epoch": 3.946552780480773, "grad_norm": 0.23463231325149536, "learning_rate": 5.611718428152759e-06, "loss": 0.372, "step": 109505 }, { "epoch": 3.9467329801419972, "grad_norm": 0.19474101066589355, "learning_rate": 5.609876315722573e-06, "loss": 0.3851, "step": 109510 }, { "epoch": 3.946913179803222, "grad_norm": 0.27828896045684814, "learning_rate": 5.6080344674790676e-06, "loss": 0.389, "step": 109515 }, { "epoch": 3.9470933794644467, "grad_norm": 0.22116512060165405, "learning_rate": 5.606192883447336e-06, "loss": 0.3621, "step": 109520 }, { "epoch": 3.947273579125671, "grad_norm": 0.27116748690605164, "learning_rate": 5.604351563652466e-06, "loss": 0.3818, "step": 109525 }, { "epoch": 3.9474537787868957, "grad_norm": 0.2014303356409073, "learning_rate": 5.60251050811956e-06, "loss": 0.3935, "step": 109530 }, { "epoch": 3.9476339784481205, "grad_norm": 0.30118808150291443, "learning_rate": 5.600669716873697e-06, "loss": 0.4039, "step": 109535 }, { "epoch": 3.947814178109345, "grad_norm": 0.23722492158412933, "learning_rate": 5.5988291899399385e-06, "loss": 0.4329, "step": 109540 }, { "epoch": 3.94799437777057, "grad_norm": 0.25418469309806824, "learning_rate": 5.596988927343386e-06, "loss": 0.3885, "step": 109545 }, { "epoch": 3.9481745774317947, "grad_norm": 0.21690379083156586, 
"learning_rate": 5.595148929109095e-06, "loss": 0.3706, "step": 109550 }, { "epoch": 3.948354777093019, "grad_norm": 0.2371807098388672, "learning_rate": 5.593309195262156e-06, "loss": 0.401, "step": 109555 }, { "epoch": 3.9485349767542437, "grad_norm": 0.2515186369419098, "learning_rate": 5.591469725827614e-06, "loss": 0.3795, "step": 109560 }, { "epoch": 3.9487151764154684, "grad_norm": 0.22983525693416595, "learning_rate": 5.589630520830536e-06, "loss": 0.4015, "step": 109565 }, { "epoch": 3.9488953760766927, "grad_norm": 0.2533568739891052, "learning_rate": 5.587791580295987e-06, "loss": 0.3962, "step": 109570 }, { "epoch": 3.9490755757379175, "grad_norm": 0.24301256239414215, "learning_rate": 5.58595290424902e-06, "loss": 0.3958, "step": 109575 }, { "epoch": 3.949255775399142, "grad_norm": 0.2538679838180542, "learning_rate": 5.5841144927146875e-06, "loss": 0.4044, "step": 109580 }, { "epoch": 3.949435975060367, "grad_norm": 0.2607662081718445, "learning_rate": 5.582276345718037e-06, "loss": 0.3572, "step": 109585 }, { "epoch": 3.9496161747215917, "grad_norm": 0.23929943144321442, "learning_rate": 5.5804384632841044e-06, "loss": 0.3594, "step": 109590 }, { "epoch": 3.9497963743828164, "grad_norm": 0.29421868920326233, "learning_rate": 5.578600845437942e-06, "loss": 0.3578, "step": 109595 }, { "epoch": 3.9499765740440407, "grad_norm": 0.19604270160198212, "learning_rate": 5.576763492204587e-06, "loss": 0.3804, "step": 109600 }, { "epoch": 3.9501567737052654, "grad_norm": 0.2703362703323364, "learning_rate": 5.574926403609066e-06, "loss": 0.41, "step": 109605 }, { "epoch": 3.95033697336649, "grad_norm": 0.336032509803772, "learning_rate": 5.573089579676413e-06, "loss": 0.3946, "step": 109610 }, { "epoch": 3.9505171730277144, "grad_norm": 0.3432498574256897, "learning_rate": 5.5712530204316545e-06, "loss": 0.3912, "step": 109615 }, { "epoch": 3.950697372688939, "grad_norm": 0.24581557512283325, "learning_rate": 5.569416725899804e-06, "loss": 0.3591, "step": 
109620 }, { "epoch": 3.950877572350164, "grad_norm": 0.2500128448009491, "learning_rate": 5.567580696105895e-06, "loss": 0.4141, "step": 109625 }, { "epoch": 3.9510577720113886, "grad_norm": 0.2446908950805664, "learning_rate": 5.565744931074937e-06, "loss": 0.3828, "step": 109630 }, { "epoch": 3.9512379716726134, "grad_norm": 0.22978517413139343, "learning_rate": 5.563909430831943e-06, "loss": 0.3831, "step": 109635 }, { "epoch": 3.951418171333838, "grad_norm": 0.2268865555524826, "learning_rate": 5.56207419540192e-06, "loss": 0.3924, "step": 109640 }, { "epoch": 3.9515983709950624, "grad_norm": 0.22920677065849304, "learning_rate": 5.560239224809866e-06, "loss": 0.3564, "step": 109645 }, { "epoch": 3.951778570656287, "grad_norm": 0.24135887622833252, "learning_rate": 5.558404519080801e-06, "loss": 0.3767, "step": 109650 }, { "epoch": 3.951958770317512, "grad_norm": 0.27055665850639343, "learning_rate": 5.556570078239703e-06, "loss": 0.4004, "step": 109655 }, { "epoch": 3.9521389699787366, "grad_norm": 0.19351933896541595, "learning_rate": 5.554735902311567e-06, "loss": 0.3738, "step": 109660 }, { "epoch": 3.952319169639961, "grad_norm": 0.2451026439666748, "learning_rate": 5.552901991321399e-06, "loss": 0.3815, "step": 109665 }, { "epoch": 3.9524993693011856, "grad_norm": 0.2261337786912918, "learning_rate": 5.5510683452941745e-06, "loss": 0.3699, "step": 109670 }, { "epoch": 3.9526795689624104, "grad_norm": 0.22415819764137268, "learning_rate": 5.549234964254879e-06, "loss": 0.383, "step": 109675 }, { "epoch": 3.952859768623635, "grad_norm": 0.23757857084274292, "learning_rate": 5.547401848228489e-06, "loss": 0.3905, "step": 109680 }, { "epoch": 3.95303996828486, "grad_norm": 0.22890903055667877, "learning_rate": 5.545568997239978e-06, "loss": 0.3978, "step": 109685 }, { "epoch": 3.953220167946084, "grad_norm": 0.24089115858078003, "learning_rate": 5.543736411314329e-06, "loss": 0.3726, "step": 109690 }, { "epoch": 3.953400367607309, "grad_norm": 
0.2351144254207611, "learning_rate": 5.541904090476505e-06, "loss": 0.3726, "step": 109695 }, { "epoch": 3.9535805672685336, "grad_norm": 0.20729920268058777, "learning_rate": 5.54007203475147e-06, "loss": 0.3672, "step": 109700 }, { "epoch": 3.9537607669297583, "grad_norm": 0.231421560049057, "learning_rate": 5.538240244164186e-06, "loss": 0.3376, "step": 109705 }, { "epoch": 3.9539409665909826, "grad_norm": 0.2679100036621094, "learning_rate": 5.536408718739605e-06, "loss": 0.3995, "step": 109710 }, { "epoch": 3.9541211662522073, "grad_norm": 0.2754489481449127, "learning_rate": 5.534577458502693e-06, "loss": 0.3953, "step": 109715 }, { "epoch": 3.954301365913432, "grad_norm": 0.21285389363765717, "learning_rate": 5.532746463478403e-06, "loss": 0.3766, "step": 109720 }, { "epoch": 3.954481565574657, "grad_norm": 0.24366551637649536, "learning_rate": 5.5309157336916575e-06, "loss": 0.3781, "step": 109725 }, { "epoch": 3.9546617652358815, "grad_norm": 0.20494620501995087, "learning_rate": 5.529085269167422e-06, "loss": 0.3657, "step": 109730 }, { "epoch": 3.9548419648971063, "grad_norm": 0.268721342086792, "learning_rate": 5.527255069930626e-06, "loss": 0.3734, "step": 109735 }, { "epoch": 3.9550221645583306, "grad_norm": 0.22930163145065308, "learning_rate": 5.525425136006216e-06, "loss": 0.368, "step": 109740 }, { "epoch": 3.9552023642195553, "grad_norm": 0.25767385959625244, "learning_rate": 5.5235954674191235e-06, "loss": 0.3905, "step": 109745 }, { "epoch": 3.95538256388078, "grad_norm": 0.24452823400497437, "learning_rate": 5.5217660641942585e-06, "loss": 0.38, "step": 109750 }, { "epoch": 3.9555627635420043, "grad_norm": 0.2989307940006256, "learning_rate": 5.519936926356567e-06, "loss": 0.3936, "step": 109755 }, { "epoch": 3.955742963203229, "grad_norm": 0.23451735079288483, "learning_rate": 5.518108053930962e-06, "loss": 0.3972, "step": 109760 }, { "epoch": 3.955923162864454, "grad_norm": 0.27269425988197327, "learning_rate": 5.516279446942366e-06, "loss": 
0.379, "step": 109765 }, { "epoch": 3.9561033625256785, "grad_norm": 0.2326863706111908, "learning_rate": 5.5144511054156885e-06, "loss": 0.3866, "step": 109770 }, { "epoch": 3.9562835621869032, "grad_norm": 0.2507339119911194, "learning_rate": 5.512623029375835e-06, "loss": 0.3986, "step": 109775 }, { "epoch": 3.956463761848128, "grad_norm": 0.23485083878040314, "learning_rate": 5.510795218847725e-06, "loss": 0.4092, "step": 109780 }, { "epoch": 3.9566439615093523, "grad_norm": 0.22537368535995483, "learning_rate": 5.5089676738562606e-06, "loss": 0.3579, "step": 109785 }, { "epoch": 3.956824161170577, "grad_norm": 0.2544284760951996, "learning_rate": 5.507140394426335e-06, "loss": 0.3826, "step": 109790 }, { "epoch": 3.9570043608318017, "grad_norm": 0.2154984176158905, "learning_rate": 5.50531338058285e-06, "loss": 0.3775, "step": 109795 }, { "epoch": 3.957184560493026, "grad_norm": 0.29244160652160645, "learning_rate": 5.503486632350696e-06, "loss": 0.3983, "step": 109800 }, { "epoch": 3.9573647601542508, "grad_norm": 0.21299979090690613, "learning_rate": 5.501660149754753e-06, "loss": 0.36, "step": 109805 }, { "epoch": 3.9575449598154755, "grad_norm": 0.21175287663936615, "learning_rate": 5.4998339328199255e-06, "loss": 0.3463, "step": 109810 }, { "epoch": 3.9577251594767002, "grad_norm": 0.19887837767601013, "learning_rate": 5.498007981571082e-06, "loss": 0.3646, "step": 109815 }, { "epoch": 3.957905359137925, "grad_norm": 0.22215895354747772, "learning_rate": 5.4961822960331064e-06, "loss": 0.3875, "step": 109820 }, { "epoch": 3.9580855587991497, "grad_norm": 0.2197951376438141, "learning_rate": 5.494356876230869e-06, "loss": 0.3831, "step": 109825 }, { "epoch": 3.958265758460374, "grad_norm": 0.27231666445732117, "learning_rate": 5.492531722189237e-06, "loss": 0.3833, "step": 109830 }, { "epoch": 3.9584459581215987, "grad_norm": 0.22005848586559296, "learning_rate": 5.490706833933096e-06, "loss": 0.3738, "step": 109835 }, { "epoch": 3.9586261577828235, 
"grad_norm": 0.19976355135440826, "learning_rate": 5.488882211487292e-06, "loss": 0.4119, "step": 109840 }, { "epoch": 3.9588063574440477, "grad_norm": 0.24296000599861145, "learning_rate": 5.487057854876682e-06, "loss": 0.3737, "step": 109845 }, { "epoch": 3.9589865571052725, "grad_norm": 0.32272353768348694, "learning_rate": 5.485233764126138e-06, "loss": 0.421, "step": 109850 }, { "epoch": 3.959166756766497, "grad_norm": 0.23841454088687897, "learning_rate": 5.4834099392605e-06, "loss": 0.3983, "step": 109855 }, { "epoch": 3.959346956427722, "grad_norm": 0.2716634273529053, "learning_rate": 5.481586380304637e-06, "loss": 0.3676, "step": 109860 }, { "epoch": 3.9595271560889467, "grad_norm": 0.2504705488681793, "learning_rate": 5.479763087283374e-06, "loss": 0.3868, "step": 109865 }, { "epoch": 3.9597073557501714, "grad_norm": 0.2552550137042999, "learning_rate": 5.4779400602215505e-06, "loss": 0.3778, "step": 109870 }, { "epoch": 3.9598875554113957, "grad_norm": 0.24792692065238953, "learning_rate": 5.476117299144024e-06, "loss": 0.3571, "step": 109875 }, { "epoch": 3.9600677550726204, "grad_norm": 0.2851993441581726, "learning_rate": 5.474294804075617e-06, "loss": 0.3578, "step": 109880 }, { "epoch": 3.960247954733845, "grad_norm": 0.28467345237731934, "learning_rate": 5.472472575041166e-06, "loss": 0.3829, "step": 109885 }, { "epoch": 3.96042815439507, "grad_norm": 0.26601019501686096, "learning_rate": 5.470650612065492e-06, "loss": 0.3478, "step": 109890 }, { "epoch": 3.960608354056294, "grad_norm": 0.24324598908424377, "learning_rate": 5.468828915173416e-06, "loss": 0.3622, "step": 109895 }, { "epoch": 3.960788553717519, "grad_norm": 0.21530494093894958, "learning_rate": 5.4670074843897715e-06, "loss": 0.3667, "step": 109900 }, { "epoch": 3.9609687533787437, "grad_norm": 0.2549532651901245, "learning_rate": 5.465186319739371e-06, "loss": 0.4011, "step": 109905 }, { "epoch": 3.9611489530399684, "grad_norm": 0.26102617383003235, "learning_rate": 
5.463365421247024e-06, "loss": 0.3918, "step": 109910 }, { "epoch": 3.961329152701193, "grad_norm": 0.24327853322029114, "learning_rate": 5.46154478893754e-06, "loss": 0.3286, "step": 109915 }, { "epoch": 3.9615093523624174, "grad_norm": 0.26538389921188354, "learning_rate": 5.459724422835716e-06, "loss": 0.3881, "step": 109920 }, { "epoch": 3.961689552023642, "grad_norm": 0.18152877688407898, "learning_rate": 5.457904322966373e-06, "loss": 0.4096, "step": 109925 }, { "epoch": 3.961869751684867, "grad_norm": 0.2658418118953705, "learning_rate": 5.456084489354307e-06, "loss": 0.3956, "step": 109930 }, { "epoch": 3.9620499513460916, "grad_norm": 0.23971152305603027, "learning_rate": 5.454264922024294e-06, "loss": 0.4017, "step": 109935 }, { "epoch": 3.962230151007316, "grad_norm": 0.2468896061182022, "learning_rate": 5.4524456210011425e-06, "loss": 0.3612, "step": 109940 }, { "epoch": 3.9624103506685406, "grad_norm": 0.29942211508750916, "learning_rate": 5.450626586309634e-06, "loss": 0.3639, "step": 109945 }, { "epoch": 3.9625905503297654, "grad_norm": 0.22130388021469116, "learning_rate": 5.448807817974555e-06, "loss": 0.3656, "step": 109950 }, { "epoch": 3.96277074999099, "grad_norm": 0.24621397256851196, "learning_rate": 5.4469893160206815e-06, "loss": 0.3605, "step": 109955 }, { "epoch": 3.962950949652215, "grad_norm": 0.2520377039909363, "learning_rate": 5.4451710804727885e-06, "loss": 0.3968, "step": 109960 }, { "epoch": 3.963131149313439, "grad_norm": 0.20365937054157257, "learning_rate": 5.443353111355659e-06, "loss": 0.3776, "step": 109965 }, { "epoch": 3.963311348974664, "grad_norm": 0.2284347116947174, "learning_rate": 5.44153540869406e-06, "loss": 0.3964, "step": 109970 }, { "epoch": 3.9634915486358886, "grad_norm": 0.2807764708995819, "learning_rate": 5.439717972512751e-06, "loss": 0.3695, "step": 109975 }, { "epoch": 3.9636717482971133, "grad_norm": 0.2894369959831238, "learning_rate": 5.437900802836499e-06, "loss": 0.377, "step": 109980 }, { "epoch": 
3.9638519479583376, "grad_norm": 0.2694891095161438, "learning_rate": 5.436083899690061e-06, "loss": 0.3832, "step": 109985 }, { "epoch": 3.9640321476195624, "grad_norm": 0.2883671224117279, "learning_rate": 5.4342672630981865e-06, "loss": 0.3855, "step": 109990 }, { "epoch": 3.964212347280787, "grad_norm": 0.2609039545059204, "learning_rate": 5.432450893085639e-06, "loss": 0.3817, "step": 109995 }, { "epoch": 3.964392546942012, "grad_norm": 0.20613409578800201, "learning_rate": 5.430634789677158e-06, "loss": 0.3783, "step": 110000 }, { "epoch": 3.964392546942012, "eval_loss": 0.427830308675766, "eval_runtime": 3.5225, "eval_samples_per_second": 28.389, "eval_steps_per_second": 7.097, "step": 110000 }, { "epoch": 3.9645727466032366, "grad_norm": 0.23606936633586884, "learning_rate": 5.428818952897491e-06, "loss": 0.3744, "step": 110005 }, { "epoch": 3.9647529462644613, "grad_norm": 0.2058301568031311, "learning_rate": 5.427003382771376e-06, "loss": 0.3679, "step": 110010 }, { "epoch": 3.9649331459256856, "grad_norm": 0.2392200529575348, "learning_rate": 5.425188079323543e-06, "loss": 0.3732, "step": 110015 }, { "epoch": 3.9651133455869103, "grad_norm": 0.22312918305397034, "learning_rate": 5.423373042578742e-06, "loss": 0.3765, "step": 110020 }, { "epoch": 3.965293545248135, "grad_norm": 0.2611244320869446, "learning_rate": 5.421558272561697e-06, "loss": 0.3729, "step": 110025 }, { "epoch": 3.9654737449093593, "grad_norm": 0.22937054932117462, "learning_rate": 5.419743769297117e-06, "loss": 0.3881, "step": 110030 }, { "epoch": 3.965653944570584, "grad_norm": 0.2127593755722046, "learning_rate": 5.417929532809746e-06, "loss": 0.3301, "step": 110035 }, { "epoch": 3.965834144231809, "grad_norm": 0.25653398036956787, "learning_rate": 5.416115563124283e-06, "loss": 0.3757, "step": 110040 }, { "epoch": 3.9660143438930335, "grad_norm": 0.23030655086040497, "learning_rate": 5.414301860265472e-06, "loss": 0.3658, "step": 110045 }, { "epoch": 3.9661945435542583, "grad_norm": 
0.28496742248535156, "learning_rate": 5.412488424257997e-06, "loss": 0.3403, "step": 110050 }, { "epoch": 3.966374743215483, "grad_norm": 0.21361872553825378, "learning_rate": 5.410675255126568e-06, "loss": 0.3534, "step": 110055 }, { "epoch": 3.9665549428767073, "grad_norm": 0.22324657440185547, "learning_rate": 5.408862352895905e-06, "loss": 0.3898, "step": 110060 }, { "epoch": 3.966735142537932, "grad_norm": 0.2588847875595093, "learning_rate": 5.407049717590698e-06, "loss": 0.3481, "step": 110065 }, { "epoch": 3.9669153421991568, "grad_norm": 0.23955267667770386, "learning_rate": 5.405237349235645e-06, "loss": 0.3918, "step": 110070 }, { "epoch": 3.967095541860381, "grad_norm": 0.2915831208229065, "learning_rate": 5.40342524785544e-06, "loss": 0.3471, "step": 110075 }, { "epoch": 3.967275741521606, "grad_norm": 0.24253830313682556, "learning_rate": 5.401613413474762e-06, "loss": 0.371, "step": 110080 }, { "epoch": 3.9674559411828305, "grad_norm": 0.25190943479537964, "learning_rate": 5.399801846118318e-06, "loss": 0.3664, "step": 110085 }, { "epoch": 3.9676361408440552, "grad_norm": 0.2514355480670929, "learning_rate": 5.397990545810777e-06, "loss": 0.3753, "step": 110090 }, { "epoch": 3.96781634050528, "grad_norm": 0.2575605809688568, "learning_rate": 5.396179512576821e-06, "loss": 0.3553, "step": 110095 }, { "epoch": 3.9679965401665047, "grad_norm": 0.24077442288398743, "learning_rate": 5.394368746441125e-06, "loss": 0.3611, "step": 110100 }, { "epoch": 3.968176739827729, "grad_norm": 0.2745268940925598, "learning_rate": 5.39255824742835e-06, "loss": 0.3721, "step": 110105 }, { "epoch": 3.9683569394889537, "grad_norm": 0.22541183233261108, "learning_rate": 5.39074801556318e-06, "loss": 0.373, "step": 110110 }, { "epoch": 3.9685371391501785, "grad_norm": 0.21690218150615692, "learning_rate": 5.388938050870279e-06, "loss": 0.3838, "step": 110115 }, { "epoch": 3.9687173388114028, "grad_norm": 0.26479387283325195, "learning_rate": 5.387128353374285e-06, "loss": 
0.3713, "step": 110120 }, { "epoch": 3.9688975384726275, "grad_norm": 0.2510485351085663, "learning_rate": 5.385318923099877e-06, "loss": 0.3695, "step": 110125 }, { "epoch": 3.9690777381338522, "grad_norm": 0.23300889134407043, "learning_rate": 5.383509760071703e-06, "loss": 0.3741, "step": 110130 }, { "epoch": 3.969257937795077, "grad_norm": 0.21996130049228668, "learning_rate": 5.3817008643144096e-06, "loss": 0.3319, "step": 110135 }, { "epoch": 3.9694381374563017, "grad_norm": 0.24466222524642944, "learning_rate": 5.379892235852646e-06, "loss": 0.3745, "step": 110140 }, { "epoch": 3.9696183371175264, "grad_norm": 0.2463037371635437, "learning_rate": 5.378083874711043e-06, "loss": 0.3904, "step": 110145 }, { "epoch": 3.9697985367787507, "grad_norm": 0.23316043615341187, "learning_rate": 5.3762757809142555e-06, "loss": 0.3767, "step": 110150 }, { "epoch": 3.9699787364399755, "grad_norm": 0.24817629158496857, "learning_rate": 5.374467954486911e-06, "loss": 0.3823, "step": 110155 }, { "epoch": 3.9701589361012, "grad_norm": 0.26238492131233215, "learning_rate": 5.372660395453641e-06, "loss": 0.4208, "step": 110160 }, { "epoch": 3.970339135762425, "grad_norm": 0.2136494368314743, "learning_rate": 5.370853103839071e-06, "loss": 0.3898, "step": 110165 }, { "epoch": 3.970519335423649, "grad_norm": 0.28680065274238586, "learning_rate": 5.36904607966783e-06, "loss": 0.4224, "step": 110170 }, { "epoch": 3.970699535084874, "grad_norm": 0.35283222794532776, "learning_rate": 5.367239322964529e-06, "loss": 0.3185, "step": 110175 }, { "epoch": 3.9708797347460987, "grad_norm": 0.2509290277957916, "learning_rate": 5.365432833753797e-06, "loss": 0.3933, "step": 110180 }, { "epoch": 3.9710599344073234, "grad_norm": 0.2476135492324829, "learning_rate": 5.363626612060241e-06, "loss": 0.3734, "step": 110185 }, { "epoch": 3.971240134068548, "grad_norm": 0.2526455223560333, "learning_rate": 5.36182065790847e-06, "loss": 0.3917, "step": 110190 }, { "epoch": 3.9714203337297724, 
"grad_norm": 0.2520613670349121, "learning_rate": 5.360014971323094e-06, "loss": 0.413, "step": 110195 }, { "epoch": 3.971600533390997, "grad_norm": 0.24791032075881958, "learning_rate": 5.358209552328699e-06, "loss": 0.3903, "step": 110200 }, { "epoch": 3.971780733052222, "grad_norm": 0.22999384999275208, "learning_rate": 5.356404400949908e-06, "loss": 0.3685, "step": 110205 }, { "epoch": 3.9719609327134466, "grad_norm": 0.25010523200035095, "learning_rate": 5.354599517211309e-06, "loss": 0.3803, "step": 110210 }, { "epoch": 3.972141132374671, "grad_norm": 0.2954365313053131, "learning_rate": 5.3527949011374724e-06, "loss": 0.4006, "step": 110215 }, { "epoch": 3.9723213320358957, "grad_norm": 0.23827119171619415, "learning_rate": 5.350990552753013e-06, "loss": 0.347, "step": 110220 }, { "epoch": 3.9725015316971204, "grad_norm": 0.23179586231708527, "learning_rate": 5.349186472082493e-06, "loss": 0.3856, "step": 110225 }, { "epoch": 3.972681731358345, "grad_norm": 0.21170353889465332, "learning_rate": 5.347382659150516e-06, "loss": 0.3922, "step": 110230 }, { "epoch": 3.97286193101957, "grad_norm": 0.2426007241010666, "learning_rate": 5.345579113981641e-06, "loss": 0.3793, "step": 110235 }, { "epoch": 3.9730421306807946, "grad_norm": 0.28864866495132446, "learning_rate": 5.343775836600437e-06, "loss": 0.3969, "step": 110240 }, { "epoch": 3.973222330342019, "grad_norm": 0.27137455344200134, "learning_rate": 5.341972827031488e-06, "loss": 0.4164, "step": 110245 }, { "epoch": 3.9734025300032436, "grad_norm": 0.286211758852005, "learning_rate": 5.340170085299353e-06, "loss": 0.393, "step": 110250 }, { "epoch": 3.9735827296644683, "grad_norm": 0.21608895063400269, "learning_rate": 5.338367611428596e-06, "loss": 0.3818, "step": 110255 }, { "epoch": 3.9737629293256926, "grad_norm": 0.24834184348583221, "learning_rate": 5.336565405443772e-06, "loss": 0.3731, "step": 110260 }, { "epoch": 3.9739431289869174, "grad_norm": 0.24085558950901031, "learning_rate": 
5.334763467369433e-06, "loss": 0.3548, "step": 110265 }, { "epoch": 3.974123328648142, "grad_norm": 0.28163549304008484, "learning_rate": 5.332961797230138e-06, "loss": 0.3742, "step": 110270 }, { "epoch": 3.974303528309367, "grad_norm": 0.26481056213378906, "learning_rate": 5.331160395050433e-06, "loss": 0.4114, "step": 110275 }, { "epoch": 3.9744837279705916, "grad_norm": 0.252930223941803, "learning_rate": 5.3293592608548606e-06, "loss": 0.3308, "step": 110280 }, { "epoch": 3.9746639276318163, "grad_norm": 0.2916033864021301, "learning_rate": 5.327558394667958e-06, "loss": 0.3712, "step": 110285 }, { "epoch": 3.9748441272930406, "grad_norm": 0.27334362268447876, "learning_rate": 5.325757796514258e-06, "loss": 0.3883, "step": 110290 }, { "epoch": 3.9750243269542653, "grad_norm": 0.2750318944454193, "learning_rate": 5.323957466418303e-06, "loss": 0.3864, "step": 110295 }, { "epoch": 3.97520452661549, "grad_norm": 0.2912689447402954, "learning_rate": 5.322157404404629e-06, "loss": 0.3845, "step": 110300 }, { "epoch": 3.9753847262767144, "grad_norm": 0.26538118720054626, "learning_rate": 5.320357610497734e-06, "loss": 0.3879, "step": 110305 }, { "epoch": 3.975564925937939, "grad_norm": 0.20100893080234528, "learning_rate": 5.3185580847221635e-06, "loss": 0.4083, "step": 110310 }, { "epoch": 3.975745125599164, "grad_norm": 0.2257593870162964, "learning_rate": 5.31675882710243e-06, "loss": 0.3935, "step": 110315 }, { "epoch": 3.9759253252603886, "grad_norm": 0.21260130405426025, "learning_rate": 5.314959837663039e-06, "loss": 0.3592, "step": 110320 }, { "epoch": 3.9761055249216133, "grad_norm": 0.2437485307455063, "learning_rate": 5.313161116428522e-06, "loss": 0.3784, "step": 110325 }, { "epoch": 3.976285724582838, "grad_norm": 0.25405043363571167, "learning_rate": 5.311362663423361e-06, "loss": 0.3999, "step": 110330 }, { "epoch": 3.9764659242440623, "grad_norm": 0.20990101993083954, "learning_rate": 5.30956447867208e-06, "loss": 0.351, "step": 110335 }, { "epoch": 
3.976646123905287, "grad_norm": 0.22888407111167908, "learning_rate": 5.307766562199168e-06, "loss": 0.4054, "step": 110340 }, { "epoch": 3.976826323566512, "grad_norm": 0.19908618927001953, "learning_rate": 5.305968914029125e-06, "loss": 0.3972, "step": 110345 }, { "epoch": 3.977006523227736, "grad_norm": 0.26034486293792725, "learning_rate": 5.3041715341864415e-06, "loss": 0.345, "step": 110350 }, { "epoch": 3.977186722888961, "grad_norm": 0.2160859853029251, "learning_rate": 5.3023744226956106e-06, "loss": 0.3957, "step": 110355 }, { "epoch": 3.9773669225501855, "grad_norm": 0.23834609985351562, "learning_rate": 5.300577579581107e-06, "loss": 0.387, "step": 110360 }, { "epoch": 3.9775471222114103, "grad_norm": 0.2771296203136444, "learning_rate": 5.298781004867425e-06, "loss": 0.383, "step": 110365 }, { "epoch": 3.977727321872635, "grad_norm": 0.2532643973827362, "learning_rate": 5.296984698579038e-06, "loss": 0.3822, "step": 110370 }, { "epoch": 3.9779075215338597, "grad_norm": 0.24019809067249298, "learning_rate": 5.295188660740422e-06, "loss": 0.3729, "step": 110375 }, { "epoch": 3.978087721195084, "grad_norm": 0.20523947477340698, "learning_rate": 5.2933928913760426e-06, "loss": 0.3622, "step": 110380 }, { "epoch": 3.9782679208563088, "grad_norm": 0.21291251480579376, "learning_rate": 5.291597390510364e-06, "loss": 0.3355, "step": 110385 }, { "epoch": 3.9784481205175335, "grad_norm": 0.18715035915374756, "learning_rate": 5.289802158167862e-06, "loss": 0.3336, "step": 110390 }, { "epoch": 3.9786283201787582, "grad_norm": 0.24453963339328766, "learning_rate": 5.288007194372996e-06, "loss": 0.4144, "step": 110395 }, { "epoch": 3.9788085198399825, "grad_norm": 0.24042107164859772, "learning_rate": 5.2862124991502035e-06, "loss": 0.3996, "step": 110400 }, { "epoch": 3.9789887195012072, "grad_norm": 0.2552798390388489, "learning_rate": 5.284418072523953e-06, "loss": 0.3834, "step": 110405 }, { "epoch": 3.979168919162432, "grad_norm": 0.273781418800354, 
"learning_rate": 5.282623914518686e-06, "loss": 0.3961, "step": 110410 }, { "epoch": 3.9793491188236567, "grad_norm": 0.2784363329410553, "learning_rate": 5.280830025158861e-06, "loss": 0.3527, "step": 110415 }, { "epoch": 3.9795293184848815, "grad_norm": 0.28789547085762024, "learning_rate": 5.279036404468904e-06, "loss": 0.3491, "step": 110420 }, { "epoch": 3.9797095181461057, "grad_norm": 0.20534764230251312, "learning_rate": 5.277243052473252e-06, "loss": 0.3759, "step": 110425 }, { "epoch": 3.9798897178073305, "grad_norm": 0.29807037115097046, "learning_rate": 5.27544996919635e-06, "loss": 0.3851, "step": 110430 }, { "epoch": 3.980069917468555, "grad_norm": 0.2329607754945755, "learning_rate": 5.273657154662626e-06, "loss": 0.3944, "step": 110435 }, { "epoch": 3.98025011712978, "grad_norm": 0.2554316520690918, "learning_rate": 5.271864608896501e-06, "loss": 0.4016, "step": 110440 }, { "epoch": 3.9804303167910042, "grad_norm": 0.2084946185350418, "learning_rate": 5.270072331922405e-06, "loss": 0.3763, "step": 110445 }, { "epoch": 3.980610516452229, "grad_norm": 0.2905483841896057, "learning_rate": 5.268280323764744e-06, "loss": 0.3805, "step": 110450 }, { "epoch": 3.9807907161134537, "grad_norm": 0.2571609616279602, "learning_rate": 5.266488584447949e-06, "loss": 0.3814, "step": 110455 }, { "epoch": 3.9809709157746784, "grad_norm": 0.25238341093063354, "learning_rate": 5.264697113996428e-06, "loss": 0.396, "step": 110460 }, { "epoch": 3.981151115435903, "grad_norm": 0.21064473688602448, "learning_rate": 5.2629059124345875e-06, "loss": 0.3894, "step": 110465 }, { "epoch": 3.9813313150971275, "grad_norm": 0.19198505580425262, "learning_rate": 5.261114979786832e-06, "loss": 0.3529, "step": 110470 }, { "epoch": 3.981511514758352, "grad_norm": 0.20793737471103668, "learning_rate": 5.259324316077561e-06, "loss": 0.3754, "step": 110475 }, { "epoch": 3.981691714419577, "grad_norm": 0.2338850349187851, "learning_rate": 5.257533921331176e-06, "loss": 0.3651, "step": 
110480 }, { "epoch": 3.9818719140808017, "grad_norm": 0.2929813265800476, "learning_rate": 5.255743795572071e-06, "loss": 0.3788, "step": 110485 }, { "epoch": 3.982052113742026, "grad_norm": 0.22587592899799347, "learning_rate": 5.253953938824635e-06, "loss": 0.3693, "step": 110490 }, { "epoch": 3.9822323134032507, "grad_norm": 0.25955894589424133, "learning_rate": 5.252164351113254e-06, "loss": 0.3895, "step": 110495 }, { "epoch": 3.9824125130644754, "grad_norm": 0.22721461951732635, "learning_rate": 5.250375032462307e-06, "loss": 0.3641, "step": 110500 }, { "epoch": 3.9824125130644754, "eval_loss": 0.4280896484851837, "eval_runtime": 3.5308, "eval_samples_per_second": 28.322, "eval_steps_per_second": 7.081, "step": 110500 }, { "epoch": 3.9825927127257, "grad_norm": 0.21194064617156982, "learning_rate": 5.248585982896173e-06, "loss": 0.3549, "step": 110505 }, { "epoch": 3.982772912386925, "grad_norm": 0.24252504110336304, "learning_rate": 5.2467972024392435e-06, "loss": 0.3785, "step": 110510 }, { "epoch": 3.9829531120481496, "grad_norm": 0.28674301505088806, "learning_rate": 5.245008691115863e-06, "loss": 0.3665, "step": 110515 }, { "epoch": 3.983133311709374, "grad_norm": 0.25369998812675476, "learning_rate": 5.243220448950423e-06, "loss": 0.3448, "step": 110520 }, { "epoch": 3.9833135113705986, "grad_norm": 0.19632992148399353, "learning_rate": 5.241432475967278e-06, "loss": 0.3655, "step": 110525 }, { "epoch": 3.9834937110318234, "grad_norm": 0.2441524714231491, "learning_rate": 5.239644772190791e-06, "loss": 0.3927, "step": 110530 }, { "epoch": 3.9836739106930477, "grad_norm": 0.2212579846382141, "learning_rate": 5.237857337645319e-06, "loss": 0.3874, "step": 110535 }, { "epoch": 3.9838541103542724, "grad_norm": 0.2771473228931427, "learning_rate": 5.236070172355212e-06, "loss": 0.3673, "step": 110540 }, { "epoch": 3.984034310015497, "grad_norm": 0.2671329379081726, "learning_rate": 5.234283276344818e-06, "loss": 0.3517, "step": 110545 }, { "epoch": 
3.984214509676722, "grad_norm": 0.20370586216449738, "learning_rate": 5.232496649638494e-06, "loss": 0.3667, "step": 110550 }, { "epoch": 3.9843947093379466, "grad_norm": 0.25102975964546204, "learning_rate": 5.230710292260576e-06, "loss": 0.382, "step": 110555 }, { "epoch": 3.9845749089991713, "grad_norm": 0.2473146915435791, "learning_rate": 5.2289242042354025e-06, "loss": 0.4072, "step": 110560 }, { "epoch": 3.9847551086603956, "grad_norm": 0.2885691523551941, "learning_rate": 5.22713838558731e-06, "loss": 0.3524, "step": 110565 }, { "epoch": 3.9849353083216204, "grad_norm": 0.2779085338115692, "learning_rate": 5.225352836340622e-06, "loss": 0.3521, "step": 110570 }, { "epoch": 3.985115507982845, "grad_norm": 0.2686401903629303, "learning_rate": 5.223567556519679e-06, "loss": 0.3986, "step": 110575 }, { "epoch": 3.9852957076440694, "grad_norm": 0.20243895053863525, "learning_rate": 5.221782546148804e-06, "loss": 0.358, "step": 110580 }, { "epoch": 3.985475907305294, "grad_norm": 0.2190602421760559, "learning_rate": 5.219997805252303e-06, "loss": 0.3865, "step": 110585 }, { "epoch": 3.985656106966519, "grad_norm": 0.2563530504703522, "learning_rate": 5.2182133338545074e-06, "loss": 0.3668, "step": 110590 }, { "epoch": 3.9858363066277436, "grad_norm": 0.26702284812927246, "learning_rate": 5.216429131979717e-06, "loss": 0.3666, "step": 110595 }, { "epoch": 3.9860165062889683, "grad_norm": 0.243973970413208, "learning_rate": 5.214645199652257e-06, "loss": 0.3657, "step": 110600 }, { "epoch": 3.986196705950193, "grad_norm": 0.22171549499034882, "learning_rate": 5.212861536896435e-06, "loss": 0.4154, "step": 110605 }, { "epoch": 3.9863769056114173, "grad_norm": 0.224653422832489, "learning_rate": 5.211078143736528e-06, "loss": 0.3901, "step": 110610 }, { "epoch": 3.986557105272642, "grad_norm": 0.2409631460905075, "learning_rate": 5.209295020196855e-06, "loss": 0.3659, "step": 110615 }, { "epoch": 3.986737304933867, "grad_norm": 0.21369391679763794, "learning_rate": 
5.207512166301709e-06, "loss": 0.3474, "step": 110620 }, { "epoch": 3.986917504595091, "grad_norm": 0.24725468456745148, "learning_rate": 5.205729582075375e-06, "loss": 0.3711, "step": 110625 }, { "epoch": 3.987097704256316, "grad_norm": 0.23448602855205536, "learning_rate": 5.203947267542145e-06, "loss": 0.3823, "step": 110630 }, { "epoch": 3.9872779039175406, "grad_norm": 0.20930229127407074, "learning_rate": 5.202165222726294e-06, "loss": 0.3424, "step": 110635 }, { "epoch": 3.9874581035787653, "grad_norm": 0.2615196406841278, "learning_rate": 5.200383447652115e-06, "loss": 0.3677, "step": 110640 }, { "epoch": 3.98763830323999, "grad_norm": 0.27675536274909973, "learning_rate": 5.198601942343878e-06, "loss": 0.3756, "step": 110645 }, { "epoch": 3.9878185029012148, "grad_norm": 0.2948794364929199, "learning_rate": 5.196820706825856e-06, "loss": 0.4064, "step": 110650 }, { "epoch": 3.987998702562439, "grad_norm": 0.2301834374666214, "learning_rate": 5.195039741122318e-06, "loss": 0.337, "step": 110655 }, { "epoch": 3.988178902223664, "grad_norm": 0.27218741178512573, "learning_rate": 5.193259045257523e-06, "loss": 0.4075, "step": 110660 }, { "epoch": 3.9883591018848885, "grad_norm": 0.2404523491859436, "learning_rate": 5.191478619255746e-06, "loss": 0.3845, "step": 110665 }, { "epoch": 3.9885393015461132, "grad_norm": 0.2734452486038208, "learning_rate": 5.189698463141237e-06, "loss": 0.3988, "step": 110670 }, { "epoch": 3.9887195012073375, "grad_norm": 0.2366897016763687, "learning_rate": 5.18791857693825e-06, "loss": 0.3859, "step": 110675 }, { "epoch": 3.9888997008685623, "grad_norm": 0.22594159841537476, "learning_rate": 5.186138960671039e-06, "loss": 0.3872, "step": 110680 }, { "epoch": 3.989079900529787, "grad_norm": 0.24368955194950104, "learning_rate": 5.184359614363846e-06, "loss": 0.3196, "step": 110685 }, { "epoch": 3.9892601001910117, "grad_norm": 0.25890570878982544, "learning_rate": 5.18258053804091e-06, "loss": 0.3746, "step": 110690 }, { "epoch": 
3.9894402998522365, "grad_norm": 0.23485039174556732, "learning_rate": 5.180801731726493e-06, "loss": 0.364, "step": 110695 }, { "epoch": 3.9896204995134608, "grad_norm": 0.18861232697963715, "learning_rate": 5.179023195444802e-06, "loss": 0.3839, "step": 110700 }, { "epoch": 3.9898006991746855, "grad_norm": 0.23490877449512482, "learning_rate": 5.177244929220088e-06, "loss": 0.3705, "step": 110705 }, { "epoch": 3.9899808988359102, "grad_norm": 0.2072230577468872, "learning_rate": 5.175466933076573e-06, "loss": 0.3797, "step": 110710 }, { "epoch": 3.990161098497135, "grad_norm": 0.2669602334499359, "learning_rate": 5.173689207038479e-06, "loss": 0.3559, "step": 110715 }, { "epoch": 3.9903412981583593, "grad_norm": 0.2188437432050705, "learning_rate": 5.17191175113004e-06, "loss": 0.3732, "step": 110720 }, { "epoch": 3.990521497819584, "grad_norm": 0.2595865726470947, "learning_rate": 5.1701345653754615e-06, "loss": 0.3975, "step": 110725 }, { "epoch": 3.9907016974808087, "grad_norm": 0.22355596721172333, "learning_rate": 5.168357649798952e-06, "loss": 0.3346, "step": 110730 }, { "epoch": 3.9908818971420335, "grad_norm": 0.19183094799518585, "learning_rate": 5.1665810044247395e-06, "loss": 0.3851, "step": 110735 }, { "epoch": 3.991062096803258, "grad_norm": 0.2600308060646057, "learning_rate": 5.164804629277018e-06, "loss": 0.3942, "step": 110740 }, { "epoch": 3.991242296464483, "grad_norm": 0.2572632431983948, "learning_rate": 5.163028524379995e-06, "loss": 0.4191, "step": 110745 }, { "epoch": 3.991422496125707, "grad_norm": 0.21313636004924774, "learning_rate": 5.161252689757867e-06, "loss": 0.3822, "step": 110750 }, { "epoch": 3.991602695786932, "grad_norm": 0.24319961667060852, "learning_rate": 5.159477125434822e-06, "loss": 0.3934, "step": 110755 }, { "epoch": 3.9917828954481567, "grad_norm": 0.2313625067472458, "learning_rate": 5.157701831435069e-06, "loss": 0.3736, "step": 110760 }, { "epoch": 3.991963095109381, "grad_norm": 0.2143564522266388, 
"learning_rate": 5.155926807782785e-06, "loss": 0.3559, "step": 110765 }, { "epoch": 3.9921432947706057, "grad_norm": 0.28823596239089966, "learning_rate": 5.15415205450216e-06, "loss": 0.3423, "step": 110770 }, { "epoch": 3.9923234944318304, "grad_norm": 0.21266886591911316, "learning_rate": 5.152377571617368e-06, "loss": 0.367, "step": 110775 }, { "epoch": 3.992503694093055, "grad_norm": 0.25234442949295044, "learning_rate": 5.150603359152581e-06, "loss": 0.3763, "step": 110780 }, { "epoch": 3.99268389375428, "grad_norm": 0.2379533350467682, "learning_rate": 5.14882941713199e-06, "loss": 0.3801, "step": 110785 }, { "epoch": 3.9928640934155046, "grad_norm": 0.26140525937080383, "learning_rate": 5.147410458251575e-06, "loss": 0.3709, "step": 110790 }, { "epoch": 3.993044293076729, "grad_norm": 0.24928440153598785, "learning_rate": 5.1456370030914245e-06, "loss": 0.3754, "step": 110795 }, { "epoch": 3.9932244927379537, "grad_norm": 0.2500706613063812, "learning_rate": 5.1438638184431175e-06, "loss": 0.3691, "step": 110800 }, { "epoch": 3.9934046923991784, "grad_norm": 0.2307901233434677, "learning_rate": 5.142090904330829e-06, "loss": 0.3391, "step": 110805 }, { "epoch": 3.9935848920604027, "grad_norm": 0.2293684333562851, "learning_rate": 5.1403182607787065e-06, "loss": 0.3591, "step": 110810 }, { "epoch": 3.9937650917216274, "grad_norm": 0.21017733216285706, "learning_rate": 5.138545887810903e-06, "loss": 0.3675, "step": 110815 }, { "epoch": 3.993945291382852, "grad_norm": 0.26224109530448914, "learning_rate": 5.136773785451565e-06, "loss": 0.3531, "step": 110820 }, { "epoch": 3.994125491044077, "grad_norm": 0.20869319140911102, "learning_rate": 5.135001953724833e-06, "loss": 0.3558, "step": 110825 }, { "epoch": 3.9943056907053016, "grad_norm": 0.3143329918384552, "learning_rate": 5.133230392654861e-06, "loss": 0.3956, "step": 110830 }, { "epoch": 3.9944858903665263, "grad_norm": 0.25546544790267944, "learning_rate": 5.131459102265779e-06, "loss": 0.3808, "step": 
110835 }, { "epoch": 3.9946660900277506, "grad_norm": 0.20834296941757202, "learning_rate": 5.1296880825817185e-06, "loss": 0.3834, "step": 110840 }, { "epoch": 3.9948462896889754, "grad_norm": 0.2101668119430542, "learning_rate": 5.127917333626811e-06, "loss": 0.3633, "step": 110845 }, { "epoch": 3.9950264893502, "grad_norm": 0.2534985840320587, "learning_rate": 5.126146855425176e-06, "loss": 0.3666, "step": 110850 }, { "epoch": 3.9952066890114244, "grad_norm": 0.1858637034893036, "learning_rate": 5.12437664800095e-06, "loss": 0.3767, "step": 110855 }, { "epoch": 3.995386888672649, "grad_norm": 0.25924816727638245, "learning_rate": 5.12260671137825e-06, "loss": 0.3932, "step": 110860 }, { "epoch": 3.995567088333874, "grad_norm": 0.2486696094274521, "learning_rate": 5.12083704558117e-06, "loss": 0.3783, "step": 110865 }, { "epoch": 3.9957472879950986, "grad_norm": 0.24414058029651642, "learning_rate": 5.119067650633847e-06, "loss": 0.3898, "step": 110870 }, { "epoch": 3.9959274876563233, "grad_norm": 0.22701597213745117, "learning_rate": 5.117298526560374e-06, "loss": 0.3538, "step": 110875 }, { "epoch": 3.996107687317548, "grad_norm": 0.2523609697818756, "learning_rate": 5.115529673384861e-06, "loss": 0.3745, "step": 110880 }, { "epoch": 3.9962878869787724, "grad_norm": 0.21400229632854462, "learning_rate": 5.113761091131408e-06, "loss": 0.3759, "step": 110885 }, { "epoch": 3.996468086639997, "grad_norm": 0.2355312556028366, "learning_rate": 5.111992779824101e-06, "loss": 0.3256, "step": 110890 }, { "epoch": 3.996648286301222, "grad_norm": 0.231663778424263, "learning_rate": 5.110224739487051e-06, "loss": 0.4152, "step": 110895 }, { "epoch": 3.9968284859624466, "grad_norm": 0.22095178067684174, "learning_rate": 5.108456970144338e-06, "loss": 0.3665, "step": 110900 }, { "epoch": 3.997008685623671, "grad_norm": 0.31155142188072205, "learning_rate": 5.106689471820045e-06, "loss": 0.3996, "step": 110905 }, { "epoch": 3.9971888852848956, "grad_norm": 
0.18749769032001495, "learning_rate": 5.10492224453826e-06, "loss": 0.3583, "step": 110910 }, { "epoch": 3.9973690849461203, "grad_norm": 0.28036782145500183, "learning_rate": 5.1031552883230475e-06, "loss": 0.3738, "step": 110915 }, { "epoch": 3.997549284607345, "grad_norm": 0.2641852796077728, "learning_rate": 5.101388603198501e-06, "loss": 0.3823, "step": 110920 }, { "epoch": 3.9977294842685698, "grad_norm": 0.22281073033809662, "learning_rate": 5.099622189188679e-06, "loss": 0.3982, "step": 110925 }, { "epoch": 3.997909683929794, "grad_norm": 0.2702135741710663, "learning_rate": 5.097856046317656e-06, "loss": 0.3821, "step": 110930 }, { "epoch": 3.998089883591019, "grad_norm": 0.25473445653915405, "learning_rate": 5.096090174609489e-06, "loss": 0.3727, "step": 110935 }, { "epoch": 3.9982700832522435, "grad_norm": 0.2873947024345398, "learning_rate": 5.0943245740882414e-06, "loss": 0.3805, "step": 110940 }, { "epoch": 3.9984502829134683, "grad_norm": 0.211199089884758, "learning_rate": 5.092559244777958e-06, "loss": 0.358, "step": 110945 }, { "epoch": 3.9986304825746926, "grad_norm": 0.256854772567749, "learning_rate": 5.090794186702711e-06, "loss": 0.3848, "step": 110950 }, { "epoch": 3.9988106822359173, "grad_norm": 0.22104725241661072, "learning_rate": 5.0890293998865355e-06, "loss": 0.3522, "step": 110955 }, { "epoch": 3.998990881897142, "grad_norm": 0.2235444188117981, "learning_rate": 5.087264884353482e-06, "loss": 0.3764, "step": 110960 }, { "epoch": 3.9991710815583668, "grad_norm": 0.21866776049137115, "learning_rate": 5.085500640127588e-06, "loss": 0.366, "step": 110965 }, { "epoch": 3.9993512812195915, "grad_norm": 0.2265380173921585, "learning_rate": 5.083736667232886e-06, "loss": 0.3968, "step": 110970 }, { "epoch": 3.999531480880816, "grad_norm": 0.19983336329460144, "learning_rate": 5.081972965693429e-06, "loss": 0.3938, "step": 110975 }, { "epoch": 3.9997116805420405, "grad_norm": 0.2481551617383957, "learning_rate": 5.080209535533229e-06, "loss": 
0.3913, "step": 110980 }, { "epoch": 3.9998918802032652, "grad_norm": 0.20951221883296967, "learning_rate": 5.078446376776308e-06, "loss": 0.3391, "step": 110985 }, { "epoch": 4.0000720798644895, "grad_norm": 0.25803592801094055, "learning_rate": 5.076683489446707e-06, "loss": 0.3555, "step": 110990 }, { "epoch": 4.000252279525714, "grad_norm": 0.17808286845684052, "learning_rate": 5.074920873568434e-06, "loss": 0.3527, "step": 110995 }, { "epoch": 4.000432479186939, "grad_norm": 0.2654048502445221, "learning_rate": 5.073158529165508e-06, "loss": 0.3552, "step": 111000 }, { "epoch": 4.000432479186939, "eval_loss": 0.42817291617393494, "eval_runtime": 3.521, "eval_samples_per_second": 28.401, "eval_steps_per_second": 7.1, "step": 111000 }, { "epoch": 4.000612678848164, "grad_norm": 0.34447306394577026, "learning_rate": 5.07139645626194e-06, "loss": 0.3778, "step": 111005 }, { "epoch": 4.0007928785093885, "grad_norm": 0.22575746476650238, "learning_rate": 5.069634654881728e-06, "loss": 0.4257, "step": 111010 }, { "epoch": 4.000973078170613, "grad_norm": 0.23802292346954346, "learning_rate": 5.067873125048894e-06, "loss": 0.3794, "step": 111015 }, { "epoch": 4.001153277831838, "grad_norm": 0.20055340230464935, "learning_rate": 5.066111866787429e-06, "loss": 0.3623, "step": 111020 }, { "epoch": 4.001333477493063, "grad_norm": 0.23983781039714813, "learning_rate": 5.06435088012133e-06, "loss": 0.3751, "step": 111025 }, { "epoch": 4.0015136771542865, "grad_norm": 0.27245986461639404, "learning_rate": 5.062590165074591e-06, "loss": 0.3687, "step": 111030 }, { "epoch": 4.001693876815511, "grad_norm": 0.20769277215003967, "learning_rate": 5.060829721671193e-06, "loss": 0.365, "step": 111035 }, { "epoch": 4.001874076476736, "grad_norm": 0.2103443592786789, "learning_rate": 5.059069549935139e-06, "loss": 0.349, "step": 111040 }, { "epoch": 4.002054276137961, "grad_norm": 0.26758506894111633, "learning_rate": 5.057309649890407e-06, "loss": 0.3724, "step": 111045 }, { "epoch": 
4.0022344757991855, "grad_norm": 0.22477580606937408, "learning_rate": 5.055550021560956e-06, "loss": 0.3396, "step": 111050 }, { "epoch": 4.00241467546041, "grad_norm": 0.2671130299568176, "learning_rate": 5.053790664970781e-06, "loss": 0.3947, "step": 111055 }, { "epoch": 4.002594875121635, "grad_norm": 0.24974533915519714, "learning_rate": 5.052031580143848e-06, "loss": 0.3721, "step": 111060 }, { "epoch": 4.00277507478286, "grad_norm": 0.2837058901786804, "learning_rate": 5.050272767104114e-06, "loss": 0.3729, "step": 111065 }, { "epoch": 4.002955274444084, "grad_norm": 0.29399046301841736, "learning_rate": 5.048514225875567e-06, "loss": 0.4003, "step": 111070 }, { "epoch": 4.003135474105309, "grad_norm": 0.2518676221370697, "learning_rate": 5.046755956482135e-06, "loss": 0.386, "step": 111075 }, { "epoch": 4.003315673766533, "grad_norm": 0.3091764748096466, "learning_rate": 5.044997958947801e-06, "loss": 0.3661, "step": 111080 }, { "epoch": 4.003495873427758, "grad_norm": 0.24037063121795654, "learning_rate": 5.0432402332965005e-06, "loss": 0.3483, "step": 111085 }, { "epoch": 4.003676073088982, "grad_norm": 0.2921180725097656, "learning_rate": 5.041482779552192e-06, "loss": 0.3889, "step": 111090 }, { "epoch": 4.003856272750207, "grad_norm": 0.20005546510219574, "learning_rate": 5.039725597738815e-06, "loss": 0.3702, "step": 111095 }, { "epoch": 4.004036472411432, "grad_norm": 0.25480973720550537, "learning_rate": 5.037968687880306e-06, "loss": 0.363, "step": 111100 }, { "epoch": 4.004216672072657, "grad_norm": 0.2529853284358978, "learning_rate": 5.036212050000616e-06, "loss": 0.3698, "step": 111105 }, { "epoch": 4.004396871733881, "grad_norm": 0.2999667525291443, "learning_rate": 5.034455684123673e-06, "loss": 0.3507, "step": 111110 }, { "epoch": 4.004577071395106, "grad_norm": 0.21827971935272217, "learning_rate": 5.032699590273404e-06, "loss": 0.3973, "step": 111115 }, { "epoch": 4.004757271056331, "grad_norm": 0.2813790440559387, "learning_rate": 
5.030943768473736e-06, "loss": 0.3708, "step": 111120 }, { "epoch": 4.004937470717555, "grad_norm": 0.28172221779823303, "learning_rate": 5.029188218748595e-06, "loss": 0.38, "step": 111125 }, { "epoch": 4.005117670378779, "grad_norm": 0.21738965809345245, "learning_rate": 5.027432941121893e-06, "loss": 0.3671, "step": 111130 }, { "epoch": 4.005297870040004, "grad_norm": 0.24970416724681854, "learning_rate": 5.025677935617554e-06, "loss": 0.3419, "step": 111135 }, { "epoch": 4.005478069701229, "grad_norm": null, "learning_rate": 5.024274127158257e-06, "loss": 0.3606, "step": 111140 }, { "epoch": 4.005658269362454, "grad_norm": 0.289569228887558, "learning_rate": 5.0225196115344124e-06, "loss": 0.3755, "step": 111145 }, { "epoch": 4.005838469023678, "grad_norm": 0.2360028475522995, "learning_rate": 5.020765368099878e-06, "loss": 0.3216, "step": 111150 }, { "epoch": 4.006018668684903, "grad_norm": 0.26487815380096436, "learning_rate": 5.019011396878548e-06, "loss": 0.3865, "step": 111155 }, { "epoch": 4.006198868346128, "grad_norm": 0.19243910908699036, "learning_rate": 5.017257697894321e-06, "loss": 0.3547, "step": 111160 }, { "epoch": 4.0063790680073526, "grad_norm": 0.2855333983898163, "learning_rate": 5.015504271171087e-06, "loss": 0.3976, "step": 111165 }, { "epoch": 4.006559267668576, "grad_norm": 0.2589762508869171, "learning_rate": 5.013751116732732e-06, "loss": 0.3747, "step": 111170 }, { "epoch": 4.006739467329801, "grad_norm": 0.22709418833255768, "learning_rate": 5.011998234603155e-06, "loss": 0.357, "step": 111175 }, { "epoch": 4.006919666991026, "grad_norm": 0.2505840063095093, "learning_rate": 5.010245624806232e-06, "loss": 0.3812, "step": 111180 }, { "epoch": 4.007099866652251, "grad_norm": 0.23559889197349548, "learning_rate": 5.008493287365842e-06, "loss": 0.3925, "step": 111185 }, { "epoch": 4.007280066313475, "grad_norm": 0.22356297075748444, "learning_rate": 5.006741222305861e-06, "loss": 0.3622, "step": 111190 }, { "epoch": 4.0074602659747, 
"grad_norm": 0.1752263903617859, "learning_rate": 5.004989429650161e-06, "loss": 0.3287, "step": 111195 }, { "epoch": 4.007640465635925, "grad_norm": 0.21405918896198273, "learning_rate": 5.003237909422601e-06, "loss": 0.3745, "step": 111200 }, { "epoch": 4.0078206652971495, "grad_norm": 0.27426615357398987, "learning_rate": 5.001486661647059e-06, "loss": 0.401, "step": 111205 }, { "epoch": 4.008000864958374, "grad_norm": 0.21843719482421875, "learning_rate": 4.99973568634739e-06, "loss": 0.3893, "step": 111210 }, { "epoch": 4.008181064619598, "grad_norm": 0.20329011976718903, "learning_rate": 4.9979849835474515e-06, "loss": 0.3976, "step": 111215 }, { "epoch": 4.008361264280823, "grad_norm": 0.21530331671237946, "learning_rate": 4.996234553271092e-06, "loss": 0.3503, "step": 111220 }, { "epoch": 4.008541463942048, "grad_norm": 0.23485726118087769, "learning_rate": 4.994484395542159e-06, "loss": 0.3453, "step": 111225 }, { "epoch": 4.008721663603272, "grad_norm": 0.22897350788116455, "learning_rate": 4.992734510384511e-06, "loss": 0.3664, "step": 111230 }, { "epoch": 4.008901863264497, "grad_norm": 0.2363205850124359, "learning_rate": 4.990984897821988e-06, "loss": 0.3858, "step": 111235 }, { "epoch": 4.009082062925722, "grad_norm": 0.2695505917072296, "learning_rate": 4.989235557878408e-06, "loss": 0.3598, "step": 111240 }, { "epoch": 4.0092622625869465, "grad_norm": 0.22455400228500366, "learning_rate": 4.987486490577626e-06, "loss": 0.3521, "step": 111245 }, { "epoch": 4.009442462248171, "grad_norm": 0.23414896428585052, "learning_rate": 4.985737695943457e-06, "loss": 0.3757, "step": 111250 }, { "epoch": 4.009622661909396, "grad_norm": 0.26559486985206604, "learning_rate": 4.9839891739997535e-06, "loss": 0.3847, "step": 111255 }, { "epoch": 4.00980286157062, "grad_norm": 0.2793545126914978, "learning_rate": 4.982240924770315e-06, "loss": 0.4138, "step": 111260 }, { "epoch": 4.009983061231845, "grad_norm": 0.28454217314720154, "learning_rate": 
4.980492948278961e-06, "loss": 0.3402, "step": 111265 }, { "epoch": 4.010163260893069, "grad_norm": 0.1981167495250702, "learning_rate": 4.978745244549521e-06, "loss": 0.3759, "step": 111270 }, { "epoch": 4.010343460554294, "grad_norm": 0.2657647132873535, "learning_rate": 4.976997813605802e-06, "loss": 0.3501, "step": 111275 }, { "epoch": 4.010523660215519, "grad_norm": 0.20731034874916077, "learning_rate": 4.975250655471611e-06, "loss": 0.3307, "step": 111280 }, { "epoch": 4.0107038598767435, "grad_norm": 0.3427305221557617, "learning_rate": 4.973503770170751e-06, "loss": 0.3818, "step": 111285 }, { "epoch": 4.010884059537968, "grad_norm": 0.2292642891407013, "learning_rate": 4.971757157727019e-06, "loss": 0.3537, "step": 111290 }, { "epoch": 4.011064259199193, "grad_norm": 0.2728843092918396, "learning_rate": 4.970010818164225e-06, "loss": 0.3926, "step": 111295 }, { "epoch": 4.011244458860418, "grad_norm": 0.2139047533273697, "learning_rate": 4.968264751506157e-06, "loss": 0.3357, "step": 111300 }, { "epoch": 4.0114246585216415, "grad_norm": 0.2937036156654358, "learning_rate": 4.966518957776603e-06, "loss": 0.3816, "step": 111305 }, { "epoch": 4.011604858182866, "grad_norm": 0.31727510690689087, "learning_rate": 4.964773436999348e-06, "loss": 0.3726, "step": 111310 }, { "epoch": 4.011785057844091, "grad_norm": 0.2226351499557495, "learning_rate": 4.96302818919818e-06, "loss": 0.3629, "step": 111315 }, { "epoch": 4.011965257505316, "grad_norm": 0.28360915184020996, "learning_rate": 4.961283214396864e-06, "loss": 0.3353, "step": 111320 }, { "epoch": 4.0121454571665405, "grad_norm": 0.25234130024909973, "learning_rate": 4.959538512619197e-06, "loss": 0.3666, "step": 111325 }, { "epoch": 4.012325656827765, "grad_norm": 0.357035368680954, "learning_rate": 4.9577940838889255e-06, "loss": 0.3633, "step": 111330 }, { "epoch": 4.01250585648899, "grad_norm": 0.26695460081100464, "learning_rate": 4.956049928229836e-06, "loss": 0.371, "step": 111335 }, { "epoch": 
4.012686056150215, "grad_norm": 0.24045346677303314, "learning_rate": 4.954306045665686e-06, "loss": 0.384, "step": 111340 }, { "epoch": 4.012866255811439, "grad_norm": 0.22014078497886658, "learning_rate": 4.952562436220234e-06, "loss": 0.3773, "step": 111345 }, { "epoch": 4.013046455472664, "grad_norm": 0.22839348018169403, "learning_rate": 4.950819099917239e-06, "loss": 0.3454, "step": 111350 }, { "epoch": 4.013226655133888, "grad_norm": 0.2525436282157898, "learning_rate": 4.949076036780445e-06, "loss": 0.3878, "step": 111355 }, { "epoch": 4.013406854795113, "grad_norm": 0.23207542300224304, "learning_rate": 4.947333246833613e-06, "loss": 0.3541, "step": 111360 }, { "epoch": 4.0135870544563375, "grad_norm": 0.2273051142692566, "learning_rate": 4.945590730100486e-06, "loss": 0.3643, "step": 111365 }, { "epoch": 4.013767254117562, "grad_norm": 0.31350627541542053, "learning_rate": 4.943848486604802e-06, "loss": 0.3552, "step": 111370 }, { "epoch": 4.013947453778787, "grad_norm": 0.26370009779930115, "learning_rate": 4.942106516370298e-06, "loss": 0.3816, "step": 111375 }, { "epoch": 4.014127653440012, "grad_norm": 0.24444960057735443, "learning_rate": 4.940364819420709e-06, "loss": 0.38, "step": 111380 }, { "epoch": 4.014307853101236, "grad_norm": 0.1919577717781067, "learning_rate": 4.93862339577976e-06, "loss": 0.3385, "step": 111385 }, { "epoch": 4.014488052762461, "grad_norm": 0.2629943788051605, "learning_rate": 4.936882245471192e-06, "loss": 0.3585, "step": 111390 }, { "epoch": 4.014668252423686, "grad_norm": 0.2678518295288086, "learning_rate": 4.935141368518717e-06, "loss": 0.3823, "step": 111395 }, { "epoch": 4.01484845208491, "grad_norm": 0.2841361165046692, "learning_rate": 4.933400764946056e-06, "loss": 0.3928, "step": 111400 }, { "epoch": 4.015028651746134, "grad_norm": 0.2338554561138153, "learning_rate": 4.931660434776925e-06, "loss": 0.3745, "step": 111405 }, { "epoch": 4.015208851407359, "grad_norm": 0.2238939106464386, "learning_rate": 
4.929920378035027e-06, "loss": 0.3918, "step": 111410 }, { "epoch": 4.015389051068584, "grad_norm": 0.28193482756614685, "learning_rate": 4.9281805947440865e-06, "loss": 0.3874, "step": 111415 }, { "epoch": 4.015569250729809, "grad_norm": 0.216234028339386, "learning_rate": 4.926441084927805e-06, "loss": 0.3888, "step": 111420 }, { "epoch": 4.015749450391033, "grad_norm": 0.22301048040390015, "learning_rate": 4.924701848609864e-06, "loss": 0.3606, "step": 111425 }, { "epoch": 4.015929650052258, "grad_norm": 0.24179665744304657, "learning_rate": 4.922962885813981e-06, "loss": 0.3597, "step": 111430 }, { "epoch": 4.016109849713483, "grad_norm": 0.24188639223575592, "learning_rate": 4.9212241965638365e-06, "loss": 0.3479, "step": 111435 }, { "epoch": 4.016290049374708, "grad_norm": 0.26836642622947693, "learning_rate": 4.919485780883135e-06, "loss": 0.3606, "step": 111440 }, { "epoch": 4.016470249035931, "grad_norm": 0.2224971503019333, "learning_rate": 4.9177476387955475e-06, "loss": 0.3699, "step": 111445 }, { "epoch": 4.016650448697156, "grad_norm": 0.27810531854629517, "learning_rate": 4.916009770324753e-06, "loss": 0.3726, "step": 111450 }, { "epoch": 4.016830648358381, "grad_norm": 0.21480928361415863, "learning_rate": 4.9142721754944446e-06, "loss": 0.3529, "step": 111455 }, { "epoch": 4.017010848019606, "grad_norm": 0.24362391233444214, "learning_rate": 4.9125348543282875e-06, "loss": 0.3412, "step": 111460 }, { "epoch": 4.01719104768083, "grad_norm": 0.2782408893108368, "learning_rate": 4.910797806849956e-06, "loss": 0.3839, "step": 111465 }, { "epoch": 4.017371247342055, "grad_norm": 0.22316765785217285, "learning_rate": 4.909061033083112e-06, "loss": 0.3899, "step": 111470 }, { "epoch": 4.01755144700328, "grad_norm": 0.2562723457813263, "learning_rate": 4.907324533051419e-06, "loss": 0.3515, "step": 111475 }, { "epoch": 4.0177316466645046, "grad_norm": 0.2701679468154907, "learning_rate": 4.905588306778544e-06, "loss": 0.3532, "step": 111480 }, { "epoch": 
4.017911846325729, "grad_norm": 0.20503626763820648, "learning_rate": 4.9038523542881355e-06, "loss": 0.3923, "step": 111485 }, { "epoch": 4.018092045986953, "grad_norm": 0.2625948488712311, "learning_rate": 4.902116675603852e-06, "loss": 0.3843, "step": 111490 }, { "epoch": 4.018272245648178, "grad_norm": 0.24299366772174835, "learning_rate": 4.900381270749335e-06, "loss": 0.3681, "step": 111495 }, { "epoch": 4.018452445309403, "grad_norm": 0.19307754933834076, "learning_rate": 4.898646139748234e-06, "loss": 0.3657, "step": 111500 }, { "epoch": 4.018452445309403, "eval_loss": 0.42932766675949097, "eval_runtime": 3.5358, "eval_samples_per_second": 28.283, "eval_steps_per_second": 7.071, "step": 111500 }, { "epoch": 4.018632644970627, "grad_norm": 0.22898030281066895, "learning_rate": 4.896911282624178e-06, "loss": 0.3802, "step": 111505 }, { "epoch": 4.018812844631852, "grad_norm": 0.21475541591644287, "learning_rate": 4.8951766994008274e-06, "loss": 0.3608, "step": 111510 }, { "epoch": 4.018993044293077, "grad_norm": 0.2441704273223877, "learning_rate": 4.893442390101791e-06, "loss": 0.3579, "step": 111515 }, { "epoch": 4.0191732439543015, "grad_norm": 0.24041332304477692, "learning_rate": 4.891708354750716e-06, "loss": 0.3862, "step": 111520 }, { "epoch": 4.019353443615526, "grad_norm": 0.262669175863266, "learning_rate": 4.889974593371218e-06, "loss": 0.3809, "step": 111525 }, { "epoch": 4.019533643276751, "grad_norm": 0.3033019006252289, "learning_rate": 4.888241105986918e-06, "loss": 0.401, "step": 111530 }, { "epoch": 4.019713842937975, "grad_norm": 0.22395041584968567, "learning_rate": 4.886507892621453e-06, "loss": 0.3718, "step": 111535 }, { "epoch": 4.0198940425992, "grad_norm": 0.2800545394420624, "learning_rate": 4.88477495329841e-06, "loss": 0.3998, "step": 111540 }, { "epoch": 4.020074242260424, "grad_norm": 0.2535509765148163, "learning_rate": 4.883042288041423e-06, "loss": 0.3666, "step": 111545 }, { "epoch": 4.020254441921649, "grad_norm": 
0.26017653942108154, "learning_rate": 4.881309896874087e-06, "loss": 0.4077, "step": 111550 }, { "epoch": 4.020434641582874, "grad_norm": 0.2357221096754074, "learning_rate": 4.879577779820007e-06, "loss": 0.3764, "step": 111555 }, { "epoch": 4.0206148412440985, "grad_norm": 0.24376040697097778, "learning_rate": 4.877845936902789e-06, "loss": 0.3291, "step": 111560 }, { "epoch": 4.020795040905323, "grad_norm": 0.22934941947460175, "learning_rate": 4.87611436814602e-06, "loss": 0.3561, "step": 111565 }, { "epoch": 4.020975240566548, "grad_norm": 0.25413674116134644, "learning_rate": 4.874383073573294e-06, "loss": 0.3447, "step": 111570 }, { "epoch": 4.021155440227773, "grad_norm": 0.23448479175567627, "learning_rate": 4.872652053208207e-06, "loss": 0.3587, "step": 111575 }, { "epoch": 4.0213356398889974, "grad_norm": 0.2433057725429535, "learning_rate": 4.870921307074339e-06, "loss": 0.3762, "step": 111580 }, { "epoch": 4.021515839550221, "grad_norm": 0.22767940163612366, "learning_rate": 4.86919083519527e-06, "loss": 0.383, "step": 111585 }, { "epoch": 4.021696039211446, "grad_norm": 0.24196745455265045, "learning_rate": 4.867460637594579e-06, "loss": 0.3517, "step": 111590 }, { "epoch": 4.021876238872671, "grad_norm": 0.27073025703430176, "learning_rate": 4.865730714295832e-06, "loss": 0.3839, "step": 111595 }, { "epoch": 4.0220564385338955, "grad_norm": 0.30549201369285583, "learning_rate": 4.864001065322616e-06, "loss": 0.377, "step": 111600 }, { "epoch": 4.02223663819512, "grad_norm": 0.23959122598171234, "learning_rate": 4.862271690698489e-06, "loss": 0.3811, "step": 111605 }, { "epoch": 4.022416837856345, "grad_norm": 0.2952626943588257, "learning_rate": 4.8605425904470005e-06, "loss": 0.3632, "step": 111610 }, { "epoch": 4.02259703751757, "grad_norm": 0.24323879182338715, "learning_rate": 4.858813764591727e-06, "loss": 0.4091, "step": 111615 }, { "epoch": 4.022777237178794, "grad_norm": 0.23510761559009552, "learning_rate": 4.857085213156209e-06, "loss": 
0.3739, "step": 111620 }, { "epoch": 4.022957436840019, "grad_norm": 0.2673884630203247, "learning_rate": 4.855356936164018e-06, "loss": 0.3963, "step": 111625 }, { "epoch": 4.023137636501243, "grad_norm": 0.25060799717903137, "learning_rate": 4.853628933638682e-06, "loss": 0.3889, "step": 111630 }, { "epoch": 4.023317836162468, "grad_norm": 0.2960754930973053, "learning_rate": 4.851901205603746e-06, "loss": 0.3883, "step": 111635 }, { "epoch": 4.0234980358236925, "grad_norm": 0.2322222739458084, "learning_rate": 4.85017375208276e-06, "loss": 0.3718, "step": 111640 }, { "epoch": 4.023678235484917, "grad_norm": 0.2459927797317505, "learning_rate": 4.848446573099258e-06, "loss": 0.3736, "step": 111645 }, { "epoch": 4.023858435146142, "grad_norm": 0.26722821593284607, "learning_rate": 4.8467196686767695e-06, "loss": 0.3832, "step": 111650 }, { "epoch": 4.024038634807367, "grad_norm": 0.2405874878168106, "learning_rate": 4.8449930388388215e-06, "loss": 0.3748, "step": 111655 }, { "epoch": 4.024218834468591, "grad_norm": 0.2542579472064972, "learning_rate": 4.843266683608936e-06, "loss": 0.395, "step": 111660 }, { "epoch": 4.024399034129816, "grad_norm": 0.24059006571769714, "learning_rate": 4.8415406030106465e-06, "loss": 0.381, "step": 111665 }, { "epoch": 4.024579233791041, "grad_norm": 0.2607281804084778, "learning_rate": 4.839814797067463e-06, "loss": 0.3987, "step": 111670 }, { "epoch": 4.024759433452265, "grad_norm": 0.2265724539756775, "learning_rate": 4.838089265802901e-06, "loss": 0.3876, "step": 111675 }, { "epoch": 4.0249396331134895, "grad_norm": 0.28544244170188904, "learning_rate": 4.836364009240465e-06, "loss": 0.3896, "step": 111680 }, { "epoch": 4.025119832774714, "grad_norm": 0.21306416392326355, "learning_rate": 4.83463902740367e-06, "loss": 0.3687, "step": 111685 }, { "epoch": 4.025300032435939, "grad_norm": 0.22182679176330566, "learning_rate": 4.832914320316006e-06, "loss": 0.3602, "step": 111690 }, { "epoch": 4.025480232097164, "grad_norm": 
0.2823246121406555, "learning_rate": 4.831189888000986e-06, "loss": 0.3927, "step": 111695 }, { "epoch": 4.025660431758388, "grad_norm": 0.2230641394853592, "learning_rate": 4.8294657304821e-06, "loss": 0.3698, "step": 111700 }, { "epoch": 4.025840631419613, "grad_norm": 0.2776728570461273, "learning_rate": 4.827741847782838e-06, "loss": 0.3692, "step": 111705 }, { "epoch": 4.026020831080838, "grad_norm": 0.2971155643463135, "learning_rate": 4.826018239926689e-06, "loss": 0.3711, "step": 111710 }, { "epoch": 4.026201030742063, "grad_norm": 0.23264330625534058, "learning_rate": 4.824294906937126e-06, "loss": 0.4002, "step": 111715 }, { "epoch": 4.026381230403286, "grad_norm": 0.20618966221809387, "learning_rate": 4.8225718488376535e-06, "loss": 0.3391, "step": 111720 }, { "epoch": 4.026561430064511, "grad_norm": 0.23295053839683533, "learning_rate": 4.8208490656517225e-06, "loss": 0.3322, "step": 111725 }, { "epoch": 4.026741629725736, "grad_norm": 0.19821318984031677, "learning_rate": 4.81912655740282e-06, "loss": 0.351, "step": 111730 }, { "epoch": 4.026921829386961, "grad_norm": 0.3166259825229645, "learning_rate": 4.81740432411441e-06, "loss": 0.3433, "step": 111735 }, { "epoch": 4.027102029048185, "grad_norm": 0.2291676253080368, "learning_rate": 4.815682365809959e-06, "loss": 0.3965, "step": 111740 }, { "epoch": 4.02728222870941, "grad_norm": 0.2493552565574646, "learning_rate": 4.81396068251293e-06, "loss": 0.3637, "step": 111745 }, { "epoch": 4.027462428370635, "grad_norm": 0.2596898376941681, "learning_rate": 4.812239274246777e-06, "loss": 0.3494, "step": 111750 }, { "epoch": 4.02764262803186, "grad_norm": 0.246169313788414, "learning_rate": 4.810518141034951e-06, "loss": 0.4282, "step": 111755 }, { "epoch": 4.027822827693084, "grad_norm": 0.29572200775146484, "learning_rate": 4.808797282900912e-06, "loss": 0.3703, "step": 111760 }, { "epoch": 4.028003027354308, "grad_norm": 0.23195022344589233, "learning_rate": 4.807076699868099e-06, "loss": 0.3474, 
"step": 111765 }, { "epoch": 4.028183227015533, "grad_norm": 0.21716444194316864, "learning_rate": 4.805356391959959e-06, "loss": 0.3792, "step": 111770 }, { "epoch": 4.028363426676758, "grad_norm": 0.2253197580575943, "learning_rate": 4.8036363591999255e-06, "loss": 0.3733, "step": 111775 }, { "epoch": 4.028543626337982, "grad_norm": 0.26673805713653564, "learning_rate": 4.801916601611433e-06, "loss": 0.3862, "step": 111780 }, { "epoch": 4.028723825999207, "grad_norm": 0.2358836978673935, "learning_rate": 4.8001971192179226e-06, "loss": 0.3647, "step": 111785 }, { "epoch": 4.028904025660432, "grad_norm": 0.2774098813533783, "learning_rate": 4.79847791204282e-06, "loss": 0.3863, "step": 111790 }, { "epoch": 4.0290842253216566, "grad_norm": 0.287156879901886, "learning_rate": 4.7967589801095335e-06, "loss": 0.3558, "step": 111795 }, { "epoch": 4.029264424982881, "grad_norm": 0.24171940982341766, "learning_rate": 4.7950403234414995e-06, "loss": 0.358, "step": 111800 }, { "epoch": 4.029444624644106, "grad_norm": 0.23957575857639313, "learning_rate": 4.793321942062124e-06, "loss": 0.3846, "step": 111805 }, { "epoch": 4.02962482430533, "grad_norm": 0.28689080476760864, "learning_rate": 4.791603835994829e-06, "loss": 0.3987, "step": 111810 }, { "epoch": 4.029805023966555, "grad_norm": 0.28590720891952515, "learning_rate": 4.789886005263028e-06, "loss": 0.3648, "step": 111815 }, { "epoch": 4.029985223627779, "grad_norm": 0.22683300077915192, "learning_rate": 4.788168449890104e-06, "loss": 0.3772, "step": 111820 }, { "epoch": 4.030165423289004, "grad_norm": 0.2961829900741577, "learning_rate": 4.786451169899478e-06, "loss": 0.3508, "step": 111825 }, { "epoch": 4.030345622950229, "grad_norm": 0.2695339322090149, "learning_rate": 4.784734165314542e-06, "loss": 0.3903, "step": 111830 }, { "epoch": 4.0305258226114535, "grad_norm": 0.1870175302028656, "learning_rate": 4.783017436158691e-06, "loss": 0.4212, "step": 111835 }, { "epoch": 4.030706022272678, "grad_norm": 
0.26787975430488586, "learning_rate": 4.781300982455309e-06, "loss": 0.3925, "step": 111840 }, { "epoch": 4.030886221933903, "grad_norm": 0.2814364731311798, "learning_rate": 4.779584804227783e-06, "loss": 0.348, "step": 111845 }, { "epoch": 4.031066421595128, "grad_norm": 0.3020128309726715, "learning_rate": 4.777868901499505e-06, "loss": 0.3637, "step": 111850 }, { "epoch": 4.0312466212563525, "grad_norm": 0.22931928932666779, "learning_rate": 4.776153274293848e-06, "loss": 0.3362, "step": 111855 }, { "epoch": 4.031426820917576, "grad_norm": 0.24356307089328766, "learning_rate": 4.774437922634187e-06, "loss": 0.3895, "step": 111860 }, { "epoch": 4.031607020578801, "grad_norm": 0.2754194438457489, "learning_rate": 4.772722846543892e-06, "loss": 0.3886, "step": 111865 }, { "epoch": 4.031787220240026, "grad_norm": 0.2473660260438919, "learning_rate": 4.7710080460463356e-06, "loss": 0.3681, "step": 111870 }, { "epoch": 4.0319674199012505, "grad_norm": 0.24248690903186798, "learning_rate": 4.769293521164869e-06, "loss": 0.3893, "step": 111875 }, { "epoch": 4.032147619562475, "grad_norm": 0.27788907289505005, "learning_rate": 4.76757927192287e-06, "loss": 0.3653, "step": 111880 }, { "epoch": 4.0323278192237, "grad_norm": 0.22080229222774506, "learning_rate": 4.765865298343686e-06, "loss": 0.4033, "step": 111885 }, { "epoch": 4.032508018884925, "grad_norm": 0.26164865493774414, "learning_rate": 4.764151600450667e-06, "loss": 0.3896, "step": 111890 }, { "epoch": 4.0326882185461495, "grad_norm": 0.24852848052978516, "learning_rate": 4.7624381782671675e-06, "loss": 0.3699, "step": 111895 }, { "epoch": 4.032868418207374, "grad_norm": 0.18736465275287628, "learning_rate": 4.760725031816521e-06, "loss": 0.3585, "step": 111900 }, { "epoch": 4.033048617868598, "grad_norm": 0.3258678913116455, "learning_rate": 4.759012161122092e-06, "loss": 0.4135, "step": 111905 }, { "epoch": 4.033228817529823, "grad_norm": 0.23651060461997986, "learning_rate": 4.757299566207196e-06, "loss": 
0.3723, "step": 111910 }, { "epoch": 4.0334090171910475, "grad_norm": 0.2316112071275711, "learning_rate": 4.75558724709517e-06, "loss": 0.3634, "step": 111915 }, { "epoch": 4.033589216852272, "grad_norm": 0.22371111810207367, "learning_rate": 4.753875203809352e-06, "loss": 0.3619, "step": 111920 }, { "epoch": 4.033769416513497, "grad_norm": 0.2516692578792572, "learning_rate": 4.752163436373061e-06, "loss": 0.3887, "step": 111925 }, { "epoch": 4.033949616174722, "grad_norm": 0.24128705263137817, "learning_rate": 4.750451944809634e-06, "loss": 0.3456, "step": 111930 }, { "epoch": 4.034129815835946, "grad_norm": 0.24484743177890778, "learning_rate": 4.748740729142373e-06, "loss": 0.3982, "step": 111935 }, { "epoch": 4.034310015497171, "grad_norm": 0.21860742568969727, "learning_rate": 4.747029789394591e-06, "loss": 0.3646, "step": 111940 }, { "epoch": 4.034490215158396, "grad_norm": 0.24374550580978394, "learning_rate": 4.7453191255896145e-06, "loss": 0.3578, "step": 111945 }, { "epoch": 4.03467041481962, "grad_norm": 0.29540616273880005, "learning_rate": 4.743608737750746e-06, "loss": 0.3785, "step": 111950 }, { "epoch": 4.0348506144808445, "grad_norm": 0.19289089739322662, "learning_rate": 4.7418986259012836e-06, "loss": 0.3744, "step": 111955 }, { "epoch": 4.035030814142069, "grad_norm": 0.2452259063720703, "learning_rate": 4.740188790064531e-06, "loss": 0.3566, "step": 111960 }, { "epoch": 4.035211013803294, "grad_norm": 0.2256636768579483, "learning_rate": 4.738479230263776e-06, "loss": 0.3578, "step": 111965 }, { "epoch": 4.035391213464519, "grad_norm": 0.21084842085838318, "learning_rate": 4.736769946522326e-06, "loss": 0.359, "step": 111970 }, { "epoch": 4.035571413125743, "grad_norm": 0.22610744833946228, "learning_rate": 4.735060938863464e-06, "loss": 0.3715, "step": 111975 }, { "epoch": 4.035751612786968, "grad_norm": 0.2904773950576782, "learning_rate": 4.733352207310473e-06, "loss": 0.3575, "step": 111980 }, { "epoch": 4.035931812448193, "grad_norm": 
0.2337644398212433, "learning_rate": 4.731643751886633e-06, "loss": 0.3408, "step": 111985 }, { "epoch": 4.036112012109418, "grad_norm": 0.2608318030834198, "learning_rate": 4.729935572615219e-06, "loss": 0.3671, "step": 111990 }, { "epoch": 4.0362922117706415, "grad_norm": 0.2300008088350296, "learning_rate": 4.728227669519511e-06, "loss": 0.3985, "step": 111995 }, { "epoch": 4.036472411431866, "grad_norm": 0.24639463424682617, "learning_rate": 4.726520042622784e-06, "loss": 0.364, "step": 112000 }, { "epoch": 4.036472411431866, "eval_loss": 0.4291331470012665, "eval_runtime": 3.5175, "eval_samples_per_second": 28.429, "eval_steps_per_second": 7.107, "step": 112000 }, { "epoch": 4.036652611093091, "grad_norm": 0.2528611123561859, "learning_rate": 4.724812691948285e-06, "loss": 0.3803, "step": 112005 }, { "epoch": 4.036832810754316, "grad_norm": 0.21292369067668915, "learning_rate": 4.723105617519294e-06, "loss": 0.3711, "step": 112010 }, { "epoch": 4.03701301041554, "grad_norm": 0.238296777009964, "learning_rate": 4.72139881935906e-06, "loss": 0.4014, "step": 112015 }, { "epoch": 4.037193210076765, "grad_norm": 0.24008990824222565, "learning_rate": 4.719692297490844e-06, "loss": 0.3731, "step": 112020 }, { "epoch": 4.03737340973799, "grad_norm": 0.2941230535507202, "learning_rate": 4.717986051937895e-06, "loss": 0.362, "step": 112025 }, { "epoch": 4.037553609399215, "grad_norm": 0.2386900782585144, "learning_rate": 4.716280082723451e-06, "loss": 0.3563, "step": 112030 }, { "epoch": 4.037733809060439, "grad_norm": 0.22380246222019196, "learning_rate": 4.714574389870768e-06, "loss": 0.3797, "step": 112035 }, { "epoch": 4.037914008721663, "grad_norm": 0.22900426387786865, "learning_rate": 4.712868973403087e-06, "loss": 0.3573, "step": 112040 }, { "epoch": 4.038094208382888, "grad_norm": 0.21817940473556519, "learning_rate": 4.711163833343635e-06, "loss": 0.3323, "step": 112045 }, { "epoch": 4.038274408044113, "grad_norm": 0.2606887221336365, "learning_rate": 
4.709458969715646e-06, "loss": 0.3829, "step": 112050 }, { "epoch": 4.038454607705337, "grad_norm": 0.17615626752376556, "learning_rate": 4.7077543825423535e-06, "loss": 0.3649, "step": 112055 }, { "epoch": 4.038634807366562, "grad_norm": 0.3894023299217224, "learning_rate": 4.706050071846971e-06, "loss": 0.3597, "step": 112060 }, { "epoch": 4.038815007027787, "grad_norm": 0.1984912008047104, "learning_rate": 4.704346037652735e-06, "loss": 0.3796, "step": 112065 }, { "epoch": 4.038995206689012, "grad_norm": 0.3245207667350769, "learning_rate": 4.702642279982855e-06, "loss": 0.4189, "step": 112070 }, { "epoch": 4.039175406350236, "grad_norm": 0.2791898548603058, "learning_rate": 4.700938798860544e-06, "loss": 0.3301, "step": 112075 }, { "epoch": 4.039355606011461, "grad_norm": 0.20591488480567932, "learning_rate": 4.6992355943090095e-06, "loss": 0.3515, "step": 112080 }, { "epoch": 4.039535805672686, "grad_norm": 0.20929506421089172, "learning_rate": 4.6975326663514535e-06, "loss": 0.349, "step": 112085 }, { "epoch": 4.03971600533391, "grad_norm": 0.27950236201286316, "learning_rate": 4.695830015011094e-06, "loss": 0.3617, "step": 112090 }, { "epoch": 4.039896204995134, "grad_norm": 0.21270787715911865, "learning_rate": 4.6941276403111245e-06, "loss": 0.3668, "step": 112095 }, { "epoch": 4.040076404656359, "grad_norm": 0.24949444830417633, "learning_rate": 4.69242554227472e-06, "loss": 0.3616, "step": 112100 }, { "epoch": 4.040256604317584, "grad_norm": 0.20137085020542145, "learning_rate": 4.690723720925094e-06, "loss": 0.347, "step": 112105 }, { "epoch": 4.0404368039788086, "grad_norm": 0.2264707386493683, "learning_rate": 4.689022176285418e-06, "loss": 0.4139, "step": 112110 }, { "epoch": 4.040617003640033, "grad_norm": 0.25556209683418274, "learning_rate": 4.687320908378895e-06, "loss": 0.345, "step": 112115 }, { "epoch": 4.040797203301258, "grad_norm": 0.2599429488182068, "learning_rate": 4.685619917228687e-06, "loss": 0.3659, "step": 112120 }, { "epoch": 
4.040977402962483, "grad_norm": 0.19548028707504272, "learning_rate": 4.6839192028579674e-06, "loss": 0.3567, "step": 112125 }, { "epoch": 4.0411576026237075, "grad_norm": 0.23381386697292328, "learning_rate": 4.68221876528992e-06, "loss": 0.3942, "step": 112130 }, { "epoch": 4.041337802284931, "grad_norm": 0.21480177342891693, "learning_rate": 4.68051860454771e-06, "loss": 0.3674, "step": 112135 }, { "epoch": 4.041518001946156, "grad_norm": 0.20809517800807953, "learning_rate": 4.678818720654502e-06, "loss": 0.3405, "step": 112140 }, { "epoch": 4.041698201607381, "grad_norm": 0.24788372218608856, "learning_rate": 4.677119113633452e-06, "loss": 0.4156, "step": 112145 }, { "epoch": 4.0418784012686055, "grad_norm": 0.24612867832183838, "learning_rate": 4.675419783507712e-06, "loss": 0.3738, "step": 112150 }, { "epoch": 4.04205860092983, "grad_norm": 0.2980947494506836, "learning_rate": 4.673720730300451e-06, "loss": 0.4167, "step": 112155 }, { "epoch": 4.042238800591055, "grad_norm": 0.2769397497177124, "learning_rate": 4.672021954034811e-06, "loss": 0.394, "step": 112160 }, { "epoch": 4.04241900025228, "grad_norm": 0.24730835855007172, "learning_rate": 4.670323454733933e-06, "loss": 0.3496, "step": 112165 }, { "epoch": 4.0425991999135045, "grad_norm": 0.2504551410675049, "learning_rate": 4.668625232420965e-06, "loss": 0.3607, "step": 112170 }, { "epoch": 4.042779399574729, "grad_norm": 0.2242858111858368, "learning_rate": 4.666927287119032e-06, "loss": 0.3641, "step": 112175 }, { "epoch": 4.042959599235953, "grad_norm": 0.22944757342338562, "learning_rate": 4.665229618851289e-06, "loss": 0.3853, "step": 112180 }, { "epoch": 4.043139798897178, "grad_norm": 0.24846546351909637, "learning_rate": 4.663532227640857e-06, "loss": 0.3888, "step": 112185 }, { "epoch": 4.0433199985584025, "grad_norm": 0.2676353454589844, "learning_rate": 4.661835113510851e-06, "loss": 0.3324, "step": 112190 }, { "epoch": 4.043500198219627, "grad_norm": 0.39060455560684204, "learning_rate": 
4.6601382764844105e-06, "loss": 0.3952, "step": 112195 }, { "epoch": 4.043680397880852, "grad_norm": 0.2727122902870178, "learning_rate": 4.6584417165846495e-06, "loss": 0.3762, "step": 112200 }, { "epoch": 4.043860597542077, "grad_norm": 0.19775696098804474, "learning_rate": 4.6567454338346804e-06, "loss": 0.3758, "step": 112205 }, { "epoch": 4.0440407972033015, "grad_norm": 0.25784486532211304, "learning_rate": 4.655049428257615e-06, "loss": 0.3843, "step": 112210 }, { "epoch": 4.044220996864526, "grad_norm": 0.2990073263645172, "learning_rate": 4.6533536998765555e-06, "loss": 0.3875, "step": 112215 }, { "epoch": 4.044401196525751, "grad_norm": 0.24339328706264496, "learning_rate": 4.651658248714621e-06, "loss": 0.3816, "step": 112220 }, { "epoch": 4.044581396186975, "grad_norm": 0.2621822953224182, "learning_rate": 4.649963074794902e-06, "loss": 0.3381, "step": 112225 }, { "epoch": 4.0447615958481995, "grad_norm": 0.25351327657699585, "learning_rate": 4.648268178140497e-06, "loss": 0.3854, "step": 112230 }, { "epoch": 4.044941795509424, "grad_norm": 0.3043707013130188, "learning_rate": 4.646573558774497e-06, "loss": 0.3838, "step": 112235 }, { "epoch": 4.045121995170649, "grad_norm": 0.24836041033267975, "learning_rate": 4.644879216719994e-06, "loss": 0.3588, "step": 112240 }, { "epoch": 4.045302194831874, "grad_norm": 0.2699911594390869, "learning_rate": 4.643185152000062e-06, "loss": 0.3773, "step": 112245 }, { "epoch": 4.045482394493098, "grad_norm": 0.25113409757614136, "learning_rate": 4.641491364637798e-06, "loss": 0.3652, "step": 112250 }, { "epoch": 4.045662594154323, "grad_norm": 0.24034260213375092, "learning_rate": 4.6397978546562744e-06, "loss": 0.3806, "step": 112255 }, { "epoch": 4.045842793815548, "grad_norm": 0.2835821211338043, "learning_rate": 4.6381046220785595e-06, "loss": 0.3634, "step": 112260 }, { "epoch": 4.046022993476773, "grad_norm": 0.20180295407772064, "learning_rate": 4.636411666927731e-06, "loss": 0.3606, "step": 112265 }, { 
"epoch": 4.0462031931379965, "grad_norm": 0.24634915590286255, "learning_rate": 4.634718989226841e-06, "loss": 0.3853, "step": 112270 }, { "epoch": 4.046383392799221, "grad_norm": 0.23739095032215118, "learning_rate": 4.63302658899897e-06, "loss": 0.3726, "step": 112275 }, { "epoch": 4.046563592460446, "grad_norm": 0.32305994629859924, "learning_rate": 4.631334466267176e-06, "loss": 0.4008, "step": 112280 }, { "epoch": 4.046743792121671, "grad_norm": 0.2640374004840851, "learning_rate": 4.629642621054492e-06, "loss": 0.3762, "step": 112285 }, { "epoch": 4.046923991782895, "grad_norm": 0.25261178612709045, "learning_rate": 4.627951053383991e-06, "loss": 0.3538, "step": 112290 }, { "epoch": 4.04710419144412, "grad_norm": 0.25557366013526917, "learning_rate": 4.626259763278707e-06, "loss": 0.4039, "step": 112295 }, { "epoch": 4.047284391105345, "grad_norm": 0.23059487342834473, "learning_rate": 4.624568750761699e-06, "loss": 0.3653, "step": 112300 }, { "epoch": 4.04746459076657, "grad_norm": 0.1989499032497406, "learning_rate": 4.622878015855994e-06, "loss": 0.3872, "step": 112305 }, { "epoch": 4.047644790427794, "grad_norm": 0.2884153127670288, "learning_rate": 4.621187558584622e-06, "loss": 0.3838, "step": 112310 }, { "epoch": 4.047824990089018, "grad_norm": 0.26490432024002075, "learning_rate": 4.61949737897063e-06, "loss": 0.3588, "step": 112315 }, { "epoch": 4.048005189750243, "grad_norm": 0.2709238827228546, "learning_rate": 4.6178074770370424e-06, "loss": 0.4018, "step": 112320 }, { "epoch": 4.048185389411468, "grad_norm": 0.2417963743209839, "learning_rate": 4.616117852806881e-06, "loss": 0.3779, "step": 112325 }, { "epoch": 4.048365589072692, "grad_norm": 0.257938414812088, "learning_rate": 4.614428506303168e-06, "loss": 0.3659, "step": 112330 }, { "epoch": 4.048545788733917, "grad_norm": 0.2447778433561325, "learning_rate": 4.612739437548913e-06, "loss": 0.363, "step": 112335 }, { "epoch": 4.048725988395142, "grad_norm": 0.26171186566352844, "learning_rate": 
4.611050646567142e-06, "loss": 0.4081, "step": 112340 }, { "epoch": 4.048906188056367, "grad_norm": 0.27486851811408997, "learning_rate": 4.60936213338086e-06, "loss": 0.3324, "step": 112345 }, { "epoch": 4.049086387717591, "grad_norm": 0.24321185052394867, "learning_rate": 4.6076738980130705e-06, "loss": 0.3454, "step": 112350 }, { "epoch": 4.049266587378816, "grad_norm": 0.334555059671402, "learning_rate": 4.605985940486776e-06, "loss": 0.3937, "step": 112355 }, { "epoch": 4.049446787040041, "grad_norm": 0.30166709423065186, "learning_rate": 4.604298260824965e-06, "loss": 0.3657, "step": 112360 }, { "epoch": 4.049626986701265, "grad_norm": 0.2711981236934662, "learning_rate": 4.602610859050652e-06, "loss": 0.3539, "step": 112365 }, { "epoch": 4.049807186362489, "grad_norm": 0.30052125453948975, "learning_rate": 4.600923735186824e-06, "loss": 0.3794, "step": 112370 }, { "epoch": 4.049987386023714, "grad_norm": 0.24002404510974884, "learning_rate": 4.5992368892564446e-06, "loss": 0.3768, "step": 112375 }, { "epoch": 4.050167585684939, "grad_norm": 0.26016783714294434, "learning_rate": 4.5975503212825206e-06, "loss": 0.3781, "step": 112380 }, { "epoch": 4.050347785346164, "grad_norm": 0.2607904374599457, "learning_rate": 4.595864031288025e-06, "loss": 0.3555, "step": 112385 }, { "epoch": 4.050527985007388, "grad_norm": 0.2204258143901825, "learning_rate": 4.594178019295922e-06, "loss": 0.3897, "step": 112390 }, { "epoch": 4.050708184668613, "grad_norm": 0.2713705003261566, "learning_rate": 4.592492285329208e-06, "loss": 0.3818, "step": 112395 }, { "epoch": 4.050888384329838, "grad_norm": 0.271567165851593, "learning_rate": 4.590806829410824e-06, "loss": 0.3609, "step": 112400 }, { "epoch": 4.0510685839910625, "grad_norm": 0.24618300795555115, "learning_rate": 4.589121651563749e-06, "loss": 0.3876, "step": 112405 }, { "epoch": 4.051248783652286, "grad_norm": 0.29348066449165344, "learning_rate": 4.587436751810942e-06, "loss": 0.3868, "step": 112410 }, { "epoch": 
4.051428983313511, "grad_norm": 0.21088933944702148, "learning_rate": 4.585752130175355e-06, "loss": 0.3823, "step": 112415 }, { "epoch": 4.051609182974736, "grad_norm": 0.23839104175567627, "learning_rate": 4.5840677866799435e-06, "loss": 0.3536, "step": 112420 }, { "epoch": 4.051789382635961, "grad_norm": 0.2764272391796112, "learning_rate": 4.582383721347658e-06, "loss": 0.3414, "step": 112425 }, { "epoch": 4.051969582297185, "grad_norm": 0.2566319406032562, "learning_rate": 4.580699934201432e-06, "loss": 0.374, "step": 112430 }, { "epoch": 4.05214978195841, "grad_norm": 0.24363850057125092, "learning_rate": 4.579016425264224e-06, "loss": 0.3331, "step": 112435 }, { "epoch": 4.052329981619635, "grad_norm": 0.25705766677856445, "learning_rate": 4.577333194558964e-06, "loss": 0.3527, "step": 112440 }, { "epoch": 4.0525101812808595, "grad_norm": 0.23861709237098694, "learning_rate": 4.575650242108584e-06, "loss": 0.3601, "step": 112445 }, { "epoch": 4.052690380942084, "grad_norm": 0.23494866490364075, "learning_rate": 4.573967567936016e-06, "loss": 0.3682, "step": 112450 }, { "epoch": 4.052870580603308, "grad_norm": 0.29604148864746094, "learning_rate": 4.572285172064181e-06, "loss": 0.3502, "step": 112455 }, { "epoch": 4.053050780264533, "grad_norm": 0.2703550457954407, "learning_rate": 4.570603054516009e-06, "loss": 0.372, "step": 112460 }, { "epoch": 4.0532309799257575, "grad_norm": 0.20241078734397888, "learning_rate": 4.568921215314423e-06, "loss": 0.3367, "step": 112465 }, { "epoch": 4.053411179586982, "grad_norm": 0.2732201814651489, "learning_rate": 4.567239654482313e-06, "loss": 0.3518, "step": 112470 }, { "epoch": 4.053591379248207, "grad_norm": 0.2432149201631546, "learning_rate": 4.5655583720426195e-06, "loss": 0.3639, "step": 112475 }, { "epoch": 4.053771578909432, "grad_norm": 0.28690987825393677, "learning_rate": 4.563877368018227e-06, "loss": 0.395, "step": 112480 }, { "epoch": 4.0539517785706565, "grad_norm": 0.24714258313179016, "learning_rate": 
4.562196642432063e-06, "loss": 0.3719, "step": 112485 }, { "epoch": 4.054131978231881, "grad_norm": 0.22608081996440887, "learning_rate": 4.560516195307005e-06, "loss": 0.3674, "step": 112490 }, { "epoch": 4.054312177893106, "grad_norm": 0.26828816533088684, "learning_rate": 4.5588360266659495e-06, "loss": 0.3886, "step": 112495 }, { "epoch": 4.05449237755433, "grad_norm": 0.24400553107261658, "learning_rate": 4.557156136531804e-06, "loss": 0.3551, "step": 112500 }, { "epoch": 4.05449237755433, "eval_loss": 0.42917296290397644, "eval_runtime": 3.5255, "eval_samples_per_second": 28.365, "eval_steps_per_second": 7.091, "step": 112500 }, { "epoch": 4.0546725772155545, "grad_norm": 0.27448979020118713, "learning_rate": 4.555476524927449e-06, "loss": 0.378, "step": 112505 }, { "epoch": 4.054852776876779, "grad_norm": 0.24059316515922546, "learning_rate": 4.553797191875767e-06, "loss": 0.3708, "step": 112510 }, { "epoch": 4.055032976538004, "grad_norm": 0.26339319348335266, "learning_rate": 4.552118137399639e-06, "loss": 0.3886, "step": 112515 }, { "epoch": 4.055213176199229, "grad_norm": 0.2619602084159851, "learning_rate": 4.550439361521935e-06, "loss": 0.3898, "step": 112520 }, { "epoch": 4.0553933758604535, "grad_norm": 0.25006812810897827, "learning_rate": 4.548760864265545e-06, "loss": 0.3685, "step": 112525 }, { "epoch": 4.055573575521678, "grad_norm": 0.2471635937690735, "learning_rate": 4.547082645653328e-06, "loss": 0.3887, "step": 112530 }, { "epoch": 4.055753775182903, "grad_norm": 0.2514253258705139, "learning_rate": 4.545404705708148e-06, "loss": 0.3463, "step": 112535 }, { "epoch": 4.055933974844128, "grad_norm": 0.21959030628204346, "learning_rate": 4.543727044452873e-06, "loss": 0.4068, "step": 112540 }, { "epoch": 4.0561141745053515, "grad_norm": 0.21514563262462616, "learning_rate": 4.542049661910344e-06, "loss": 0.337, "step": 112545 }, { "epoch": 4.056294374166576, "grad_norm": 0.1881556361913681, "learning_rate": 4.540372558103439e-06, "loss": 
0.3604, "step": 112550 }, { "epoch": 4.056474573827801, "grad_norm": 0.2358875721693039, "learning_rate": 4.538695733054995e-06, "loss": 0.3601, "step": 112555 }, { "epoch": 4.056654773489026, "grad_norm": 0.2567630410194397, "learning_rate": 4.537019186787861e-06, "loss": 0.3783, "step": 112560 }, { "epoch": 4.05683497315025, "grad_norm": 0.26848697662353516, "learning_rate": 4.535342919324878e-06, "loss": 0.4059, "step": 112565 }, { "epoch": 4.057015172811475, "grad_norm": 0.22706463932991028, "learning_rate": 4.533666930688885e-06, "loss": 0.368, "step": 112570 }, { "epoch": 4.0571953724727, "grad_norm": 0.26052287220954895, "learning_rate": 4.531991220902712e-06, "loss": 0.3455, "step": 112575 }, { "epoch": 4.057375572133925, "grad_norm": 0.2673739194869995, "learning_rate": 4.530315789989209e-06, "loss": 0.3275, "step": 112580 }, { "epoch": 4.057555771795149, "grad_norm": 0.22586947679519653, "learning_rate": 4.528640637971177e-06, "loss": 0.3832, "step": 112585 }, { "epoch": 4.057735971456374, "grad_norm": 0.28457507491111755, "learning_rate": 4.5269657648714605e-06, "loss": 0.3725, "step": 112590 }, { "epoch": 4.057916171117598, "grad_norm": 0.27478164434432983, "learning_rate": 4.52529117071287e-06, "loss": 0.3783, "step": 112595 }, { "epoch": 4.058096370778823, "grad_norm": 0.22563715279102325, "learning_rate": 4.523616855518226e-06, "loss": 0.3952, "step": 112600 }, { "epoch": 4.058276570440047, "grad_norm": 0.26424163579940796, "learning_rate": 4.5219428193103345e-06, "loss": 0.3737, "step": 112605 }, { "epoch": 4.058456770101272, "grad_norm": 0.2882944643497467, "learning_rate": 4.5202690621120095e-06, "loss": 0.3523, "step": 112610 }, { "epoch": 4.058636969762497, "grad_norm": 0.2373085916042328, "learning_rate": 4.518595583946048e-06, "loss": 0.3757, "step": 112615 }, { "epoch": 4.058817169423722, "grad_norm": 0.2473360151052475, "learning_rate": 4.516922384835259e-06, "loss": 0.3808, "step": 112620 }, { "epoch": 4.058997369084946, "grad_norm": 
0.19893379509449005, "learning_rate": 4.51524946480244e-06, "loss": 0.3353, "step": 112625 }, { "epoch": 4.059177568746171, "grad_norm": 0.3020991086959839, "learning_rate": 4.51357682387038e-06, "loss": 0.3551, "step": 112630 }, { "epoch": 4.059357768407396, "grad_norm": 0.2574337422847748, "learning_rate": 4.511904462061873e-06, "loss": 0.3703, "step": 112635 }, { "epoch": 4.05953796806862, "grad_norm": 0.2121739387512207, "learning_rate": 4.51023237939969e-06, "loss": 0.3629, "step": 112640 }, { "epoch": 4.059718167729844, "grad_norm": 0.2671159505844116, "learning_rate": 4.508560575906631e-06, "loss": 0.3642, "step": 112645 }, { "epoch": 4.059898367391069, "grad_norm": 0.18342912197113037, "learning_rate": 4.506889051605473e-06, "loss": 0.3798, "step": 112650 }, { "epoch": 4.060078567052294, "grad_norm": 0.22590021789073944, "learning_rate": 4.505217806518975e-06, "loss": 0.3921, "step": 112655 }, { "epoch": 4.060258766713519, "grad_norm": 0.24707923829555511, "learning_rate": 4.50354684066992e-06, "loss": 0.3438, "step": 112660 }, { "epoch": 4.060438966374743, "grad_norm": 0.22925209999084473, "learning_rate": 4.501876154081064e-06, "loss": 0.3645, "step": 112665 }, { "epoch": 4.060619166035968, "grad_norm": 0.21116822957992554, "learning_rate": 4.500205746775185e-06, "loss": 0.3326, "step": 112670 }, { "epoch": 4.060799365697193, "grad_norm": 0.25304093956947327, "learning_rate": 4.498535618775037e-06, "loss": 0.3963, "step": 112675 }, { "epoch": 4.0609795653584175, "grad_norm": 0.25455543398857117, "learning_rate": 4.496865770103362e-06, "loss": 0.3797, "step": 112680 }, { "epoch": 4.061159765019641, "grad_norm": 0.24712957441806793, "learning_rate": 4.495196200782928e-06, "loss": 0.3822, "step": 112685 }, { "epoch": 4.061339964680866, "grad_norm": 0.21119093894958496, "learning_rate": 4.493526910836476e-06, "loss": 0.3993, "step": 112690 }, { "epoch": 4.061520164342091, "grad_norm": 0.2761557698249817, "learning_rate": 4.491857900286747e-06, "loss": 0.3782, 
"step": 112695 }, { "epoch": 4.061700364003316, "grad_norm": 0.2396336793899536, "learning_rate": 4.490189169156487e-06, "loss": 0.3892, "step": 112700 }, { "epoch": 4.06188056366454, "grad_norm": 0.3060736656188965, "learning_rate": 4.488520717468419e-06, "loss": 0.3525, "step": 112705 }, { "epoch": 4.062060763325765, "grad_norm": 0.2690458595752716, "learning_rate": 4.486852545245296e-06, "loss": 0.3654, "step": 112710 }, { "epoch": 4.06224096298699, "grad_norm": 0.25593844056129456, "learning_rate": 4.4851846525098305e-06, "loss": 0.3875, "step": 112715 }, { "epoch": 4.0624211626482145, "grad_norm": 0.276349812746048, "learning_rate": 4.483517039284754e-06, "loss": 0.367, "step": 112720 }, { "epoch": 4.062601362309439, "grad_norm": 0.3029731810092926, "learning_rate": 4.481849705592789e-06, "loss": 0.3476, "step": 112725 }, { "epoch": 4.062781561970663, "grad_norm": 0.2233833223581314, "learning_rate": 4.480182651456638e-06, "loss": 0.3712, "step": 112730 }, { "epoch": 4.062961761631888, "grad_norm": 0.29494714736938477, "learning_rate": 4.4785158768990365e-06, "loss": 0.4097, "step": 112735 }, { "epoch": 4.063141961293113, "grad_norm": 0.23293974995613098, "learning_rate": 4.47684938194268e-06, "loss": 0.362, "step": 112740 }, { "epoch": 4.063322160954337, "grad_norm": 0.2608863115310669, "learning_rate": 4.475183166610278e-06, "loss": 0.3822, "step": 112745 }, { "epoch": 4.063502360615562, "grad_norm": 0.2746981978416443, "learning_rate": 4.473517230924532e-06, "loss": 0.369, "step": 112750 }, { "epoch": 4.063682560276787, "grad_norm": 0.24028368294239044, "learning_rate": 4.47185157490814e-06, "loss": 0.402, "step": 112755 }, { "epoch": 4.0638627599380115, "grad_norm": 0.2710413634777069, "learning_rate": 4.47018619858379e-06, "loss": 0.3931, "step": 112760 }, { "epoch": 4.064042959599236, "grad_norm": 0.22267523407936096, "learning_rate": 4.468521101974188e-06, "loss": 0.3561, "step": 112765 }, { "epoch": 4.064223159260461, "grad_norm": 0.22819946706295013, 
"learning_rate": 4.466856285102e-06, "loss": 0.3752, "step": 112770 }, { "epoch": 4.064403358921685, "grad_norm": 0.2634032964706421, "learning_rate": 4.465191747989927e-06, "loss": 0.3828, "step": 112775 }, { "epoch": 4.0645835585829095, "grad_norm": 0.2825946807861328, "learning_rate": 4.463527490660641e-06, "loss": 0.3642, "step": 112780 }, { "epoch": 4.064763758244134, "grad_norm": 0.24241934716701508, "learning_rate": 4.461863513136816e-06, "loss": 0.3458, "step": 112785 }, { "epoch": 4.064943957905359, "grad_norm": 0.27716174721717834, "learning_rate": 4.460199815441124e-06, "loss": 0.3882, "step": 112790 }, { "epoch": 4.065124157566584, "grad_norm": 0.24334949254989624, "learning_rate": 4.458536397596233e-06, "loss": 0.3789, "step": 112795 }, { "epoch": 4.0653043572278085, "grad_norm": 0.26515430212020874, "learning_rate": 4.456873259624802e-06, "loss": 0.4021, "step": 112800 }, { "epoch": 4.065484556889033, "grad_norm": 0.28932276368141174, "learning_rate": 4.455210401549501e-06, "loss": 0.3874, "step": 112805 }, { "epoch": 4.065664756550258, "grad_norm": 0.19949932396411896, "learning_rate": 4.4535478233929786e-06, "loss": 0.3483, "step": 112810 }, { "epoch": 4.065844956211483, "grad_norm": 0.2426164299249649, "learning_rate": 4.45188552517789e-06, "loss": 0.3642, "step": 112815 }, { "epoch": 4.0660251558727065, "grad_norm": 0.24339383840560913, "learning_rate": 4.450223506926884e-06, "loss": 0.3921, "step": 112820 }, { "epoch": 4.066205355533931, "grad_norm": 0.2848939001560211, "learning_rate": 4.448561768662599e-06, "loss": 0.3685, "step": 112825 }, { "epoch": 4.066385555195156, "grad_norm": 0.23407739400863647, "learning_rate": 4.4469003104076865e-06, "loss": 0.3936, "step": 112830 }, { "epoch": 4.066565754856381, "grad_norm": 0.34121012687683105, "learning_rate": 4.445239132184778e-06, "loss": 0.3816, "step": 112835 }, { "epoch": 4.0667459545176055, "grad_norm": 0.2433071732521057, "learning_rate": 4.443578234016504e-06, "loss": 0.3895, "step": 112840 
}, { "epoch": 4.06692615417883, "grad_norm": 0.24688871204853058, "learning_rate": 4.4419176159255e-06, "loss": 0.4009, "step": 112845 }, { "epoch": 4.067106353840055, "grad_norm": 0.255391389131546, "learning_rate": 4.440257277934381e-06, "loss": 0.3783, "step": 112850 }, { "epoch": 4.06728655350128, "grad_norm": 0.323646605014801, "learning_rate": 4.438597220065782e-06, "loss": 0.3655, "step": 112855 }, { "epoch": 4.067466753162504, "grad_norm": 0.3119712769985199, "learning_rate": 4.436937442342323e-06, "loss": 0.357, "step": 112860 }, { "epoch": 4.067646952823729, "grad_norm": 0.2642781138420105, "learning_rate": 4.435277944786595e-06, "loss": 0.3654, "step": 112865 }, { "epoch": 4.067827152484953, "grad_norm": 0.19354425370693207, "learning_rate": 4.433618727421232e-06, "loss": 0.3937, "step": 112870 }, { "epoch": 4.068007352146178, "grad_norm": 0.227387472987175, "learning_rate": 4.431959790268831e-06, "loss": 0.3485, "step": 112875 }, { "epoch": 4.068187551807402, "grad_norm": 0.256632000207901, "learning_rate": 4.430301133351997e-06, "loss": 0.3545, "step": 112880 }, { "epoch": 4.068367751468627, "grad_norm": 0.21638993918895721, "learning_rate": 4.428642756693324e-06, "loss": 0.3527, "step": 112885 }, { "epoch": 4.068547951129852, "grad_norm": 0.30779585242271423, "learning_rate": 4.426984660315409e-06, "loss": 0.3595, "step": 112890 }, { "epoch": 4.068728150791077, "grad_norm": 0.2729836702346802, "learning_rate": 4.425326844240849e-06, "loss": 0.419, "step": 112895 }, { "epoch": 4.068908350452301, "grad_norm": 0.26512590050697327, "learning_rate": 4.423669308492228e-06, "loss": 0.3389, "step": 112900 }, { "epoch": 4.069088550113526, "grad_norm": 0.27653393149375916, "learning_rate": 4.422012053092128e-06, "loss": 0.3723, "step": 112905 }, { "epoch": 4.069268749774751, "grad_norm": 0.22129282355308533, "learning_rate": 4.420355078063129e-06, "loss": 0.3573, "step": 112910 }, { "epoch": 4.069448949435975, "grad_norm": 0.22182343900203705, "learning_rate": 
4.41869838342781e-06, "loss": 0.3635, "step": 112915 }, { "epoch": 4.069629149097199, "grad_norm": 0.24553145468235016, "learning_rate": 4.417041969208735e-06, "loss": 0.3854, "step": 112920 }, { "epoch": 4.069809348758424, "grad_norm": 0.24355679750442505, "learning_rate": 4.4153858354284814e-06, "loss": 0.3762, "step": 112925 }, { "epoch": 4.069989548419649, "grad_norm": 0.24160782992839813, "learning_rate": 4.413729982109613e-06, "loss": 0.3698, "step": 112930 }, { "epoch": 4.070169748080874, "grad_norm": 0.23975780606269836, "learning_rate": 4.412074409274689e-06, "loss": 0.3852, "step": 112935 }, { "epoch": 4.070349947742098, "grad_norm": 0.19808252155780792, "learning_rate": 4.4104191169462625e-06, "loss": 0.341, "step": 112940 }, { "epoch": 4.070530147403323, "grad_norm": 0.2225724160671234, "learning_rate": 4.4087641051468846e-06, "loss": 0.3836, "step": 112945 }, { "epoch": 4.070710347064548, "grad_norm": 0.2074027806520462, "learning_rate": 4.4071093738991125e-06, "loss": 0.3487, "step": 112950 }, { "epoch": 4.0708905467257726, "grad_norm": 0.271662175655365, "learning_rate": 4.405454923225491e-06, "loss": 0.3715, "step": 112955 }, { "epoch": 4.071070746386996, "grad_norm": 0.23517721891403198, "learning_rate": 4.40380075314856e-06, "loss": 0.3949, "step": 112960 }, { "epoch": 4.071250946048221, "grad_norm": 0.24322974681854248, "learning_rate": 4.402146863690854e-06, "loss": 0.384, "step": 112965 }, { "epoch": 4.071431145709446, "grad_norm": 0.2979622781276703, "learning_rate": 4.400493254874902e-06, "loss": 0.3898, "step": 112970 }, { "epoch": 4.071611345370671, "grad_norm": 0.19745075702667236, "learning_rate": 4.3988399267232555e-06, "loss": 0.3579, "step": 112975 }, { "epoch": 4.071791545031895, "grad_norm": 0.23466679453849792, "learning_rate": 4.397186879258419e-06, "loss": 0.3682, "step": 112980 }, { "epoch": 4.07197174469312, "grad_norm": 0.18801245093345642, "learning_rate": 4.39553411250292e-06, "loss": 0.3595, "step": 112985 }, { "epoch": 
4.072151944354345, "grad_norm": 0.23627303540706635, "learning_rate": 4.393881626479282e-06, "loss": 0.3774, "step": 112990 }, { "epoch": 4.0723321440155695, "grad_norm": 0.28023478388786316, "learning_rate": 4.3922294212100175e-06, "loss": 0.3824, "step": 112995 }, { "epoch": 4.072512343676794, "grad_norm": 0.2806805372238159, "learning_rate": 4.390577496717638e-06, "loss": 0.4054, "step": 113000 }, { "epoch": 4.072512343676794, "eval_loss": 0.4291308522224426, "eval_runtime": 3.5302, "eval_samples_per_second": 28.327, "eval_steps_per_second": 7.082, "step": 113000 }, { "epoch": 4.072692543338018, "grad_norm": 0.30133917927742004, "learning_rate": 4.3889258530246505e-06, "loss": 0.3692, "step": 113005 }, { "epoch": 4.072872742999243, "grad_norm": 0.22192399203777313, "learning_rate": 4.387274490153551e-06, "loss": 0.3635, "step": 113010 }, { "epoch": 4.073052942660468, "grad_norm": 0.23747487366199493, "learning_rate": 4.385623408126852e-06, "loss": 0.3788, "step": 113015 }, { "epoch": 4.073233142321692, "grad_norm": 0.25482213497161865, "learning_rate": 4.383972606967044e-06, "loss": 0.3497, "step": 113020 }, { "epoch": 4.073413341982917, "grad_norm": 0.2351936399936676, "learning_rate": 4.382322086696616e-06, "loss": 0.3412, "step": 113025 }, { "epoch": 4.073593541644142, "grad_norm": 0.21274252235889435, "learning_rate": 4.380671847338061e-06, "loss": 0.3827, "step": 113030 }, { "epoch": 4.0737737413053665, "grad_norm": 0.26760604977607727, "learning_rate": 4.379021888913851e-06, "loss": 0.4002, "step": 113035 }, { "epoch": 4.073953940966591, "grad_norm": 0.28649207949638367, "learning_rate": 4.377372211446482e-06, "loss": 0.3985, "step": 113040 }, { "epoch": 4.074134140627816, "grad_norm": 0.23005342483520508, "learning_rate": 4.37572281495843e-06, "loss": 0.3485, "step": 113045 }, { "epoch": 4.07431434028904, "grad_norm": 0.24847397208213806, "learning_rate": 4.37407369947215e-06, "loss": 0.3606, "step": 113050 }, { "epoch": 4.074494539950265, "grad_norm": 
0.2495976686477661, "learning_rate": 4.37242486501013e-06, "loss": 0.4039, "step": 113055 }, { "epoch": 4.074674739611489, "grad_norm": 0.25559067726135254, "learning_rate": 4.370776311594826e-06, "loss": 0.3642, "step": 113060 }, { "epoch": 4.074854939272714, "grad_norm": 0.22897782921791077, "learning_rate": 4.369128039248699e-06, "loss": 0.3976, "step": 113065 }, { "epoch": 4.075035138933939, "grad_norm": 0.24675941467285156, "learning_rate": 4.36748004799421e-06, "loss": 0.4089, "step": 113070 }, { "epoch": 4.0752153385951635, "grad_norm": 0.30933111906051636, "learning_rate": 4.365832337853806e-06, "loss": 0.3718, "step": 113075 }, { "epoch": 4.075395538256388, "grad_norm": 0.2843681871891022, "learning_rate": 4.364184908849947e-06, "loss": 0.3674, "step": 113080 }, { "epoch": 4.075575737917613, "grad_norm": 0.20400385558605194, "learning_rate": 4.362537761005073e-06, "loss": 0.4067, "step": 113085 }, { "epoch": 4.075755937578838, "grad_norm": 0.2532506585121155, "learning_rate": 4.3608908943416264e-06, "loss": 0.3598, "step": 113090 }, { "epoch": 4.0759361372400615, "grad_norm": 0.23435381054878235, "learning_rate": 4.359244308882043e-06, "loss": 0.3748, "step": 113095 }, { "epoch": 4.076116336901286, "grad_norm": 0.2436995804309845, "learning_rate": 4.357598004648763e-06, "loss": 0.3937, "step": 113100 }, { "epoch": 4.076296536562511, "grad_norm": 0.26135680079460144, "learning_rate": 4.355951981664208e-06, "loss": 0.3677, "step": 113105 }, { "epoch": 4.076476736223736, "grad_norm": 0.3116925060749054, "learning_rate": 4.354306239950814e-06, "loss": 0.3898, "step": 113110 }, { "epoch": 4.0766569358849605, "grad_norm": 0.25382331013679504, "learning_rate": 4.352660779531001e-06, "loss": 0.4214, "step": 113115 }, { "epoch": 4.076837135546185, "grad_norm": 0.2189731001853943, "learning_rate": 4.3510156004271886e-06, "loss": 0.3645, "step": 113120 }, { "epoch": 4.07701733520741, "grad_norm": 0.26065686345100403, "learning_rate": 4.3493707026617894e-06, "loss": 
0.3576, "step": 113125 }, { "epoch": 4.077197534868635, "grad_norm": 0.25834450125694275, "learning_rate": 4.347726086257212e-06, "loss": 0.3675, "step": 113130 }, { "epoch": 4.077377734529859, "grad_norm": 0.28615930676460266, "learning_rate": 4.346081751235873e-06, "loss": 0.3697, "step": 113135 }, { "epoch": 4.077557934191084, "grad_norm": 0.27379459142684937, "learning_rate": 4.344437697620174e-06, "loss": 0.3802, "step": 113140 }, { "epoch": 4.077738133852308, "grad_norm": 0.27598029375076294, "learning_rate": 4.342793925432509e-06, "loss": 0.3636, "step": 113145 }, { "epoch": 4.077918333513533, "grad_norm": 0.2820260226726532, "learning_rate": 4.341150434695279e-06, "loss": 0.3939, "step": 113150 }, { "epoch": 4.0780985331747575, "grad_norm": 0.25555455684661865, "learning_rate": 4.339507225430867e-06, "loss": 0.3348, "step": 113155 }, { "epoch": 4.078278732835982, "grad_norm": 0.2568057179450989, "learning_rate": 4.337864297661684e-06, "loss": 0.3276, "step": 113160 }, { "epoch": 4.078458932497207, "grad_norm": 0.2430773675441742, "learning_rate": 4.336221651410091e-06, "loss": 0.3626, "step": 113165 }, { "epoch": 4.078639132158432, "grad_norm": 0.26786041259765625, "learning_rate": 4.334579286698473e-06, "loss": 0.3538, "step": 113170 }, { "epoch": 4.078819331819656, "grad_norm": 0.23630788922309875, "learning_rate": 4.3329372035492145e-06, "loss": 0.3774, "step": 113175 }, { "epoch": 4.078999531480881, "grad_norm": 0.2112296223640442, "learning_rate": 4.3312954019846894e-06, "loss": 0.3677, "step": 113180 }, { "epoch": 4.079179731142106, "grad_norm": 0.20694564282894135, "learning_rate": 4.32965388202726e-06, "loss": 0.3376, "step": 113185 }, { "epoch": 4.07935993080333, "grad_norm": 0.19612739980220795, "learning_rate": 4.3280126436992944e-06, "loss": 0.3649, "step": 113190 }, { "epoch": 4.079540130464554, "grad_norm": 0.2163010984659195, "learning_rate": 4.32637168702315e-06, "loss": 0.3567, "step": 113195 }, { "epoch": 4.079720330125779, "grad_norm": 
0.27228453755378723, "learning_rate": 4.324731012021193e-06, "loss": 0.3773, "step": 113200 }, { "epoch": 4.079900529787004, "grad_norm": 0.28055036067962646, "learning_rate": 4.323090618715775e-06, "loss": 0.3608, "step": 113205 }, { "epoch": 4.080080729448229, "grad_norm": 0.2570863366127014, "learning_rate": 4.321450507129243e-06, "loss": 0.3704, "step": 113210 }, { "epoch": 4.080260929109453, "grad_norm": 0.262403666973114, "learning_rate": 4.319810677283945e-06, "loss": 0.3805, "step": 113215 }, { "epoch": 4.080441128770678, "grad_norm": 0.31988710165023804, "learning_rate": 4.318171129202217e-06, "loss": 0.3807, "step": 113220 }, { "epoch": 4.080621328431903, "grad_norm": 0.2832499146461487, "learning_rate": 4.316531862906409e-06, "loss": 0.3679, "step": 113225 }, { "epoch": 4.080801528093128, "grad_norm": 0.3199155330657959, "learning_rate": 4.314892878418855e-06, "loss": 0.3587, "step": 113230 }, { "epoch": 4.080981727754351, "grad_norm": 0.3142116069793701, "learning_rate": 4.313254175761869e-06, "loss": 0.3666, "step": 113235 }, { "epoch": 4.081161927415576, "grad_norm": 0.2442038655281067, "learning_rate": 4.311615754957796e-06, "loss": 0.3704, "step": 113240 }, { "epoch": 4.081342127076801, "grad_norm": 0.2437644749879837, "learning_rate": 4.309977616028954e-06, "loss": 0.3603, "step": 113245 }, { "epoch": 4.081522326738026, "grad_norm": 0.24333834648132324, "learning_rate": 4.3083397589976535e-06, "loss": 0.3627, "step": 113250 }, { "epoch": 4.08170252639925, "grad_norm": 0.2943533957004547, "learning_rate": 4.30670218388623e-06, "loss": 0.3846, "step": 113255 }, { "epoch": 4.081882726060475, "grad_norm": 0.24343356490135193, "learning_rate": 4.305064890716973e-06, "loss": 0.3757, "step": 113260 }, { "epoch": 4.0820629257217, "grad_norm": 0.22839723527431488, "learning_rate": 4.3034278795122035e-06, "loss": 0.3877, "step": 113265 }, { "epoch": 4.0822431253829246, "grad_norm": 0.3169880211353302, "learning_rate": 4.301791150294224e-06, "loss": 0.4014, 
"step": 113270 }, { "epoch": 4.082423325044149, "grad_norm": 0.24656805396080017, "learning_rate": 4.300154703085332e-06, "loss": 0.3673, "step": 113275 }, { "epoch": 4.082603524705373, "grad_norm": 0.2953994870185852, "learning_rate": 4.298518537907826e-06, "loss": 0.3797, "step": 113280 }, { "epoch": 4.082783724366598, "grad_norm": 0.2698920965194702, "learning_rate": 4.296882654783993e-06, "loss": 0.3937, "step": 113285 }, { "epoch": 4.082963924027823, "grad_norm": 0.27484050393104553, "learning_rate": 4.295247053736124e-06, "loss": 0.3645, "step": 113290 }, { "epoch": 4.083144123689047, "grad_norm": 0.22406524419784546, "learning_rate": 4.293611734786507e-06, "loss": 0.3469, "step": 113295 }, { "epoch": 4.083324323350272, "grad_norm": 0.22183670103549957, "learning_rate": 4.291976697957425e-06, "loss": 0.3374, "step": 113300 }, { "epoch": 4.083504523011497, "grad_norm": 0.25425711274147034, "learning_rate": 4.290341943271151e-06, "loss": 0.3478, "step": 113305 }, { "epoch": 4.0836847226727215, "grad_norm": 0.26116448640823364, "learning_rate": 4.288707470749956e-06, "loss": 0.3491, "step": 113310 }, { "epoch": 4.083864922333946, "grad_norm": 0.21055485308170319, "learning_rate": 4.287073280416107e-06, "loss": 0.3569, "step": 113315 }, { "epoch": 4.084045121995171, "grad_norm": 0.24054095149040222, "learning_rate": 4.28543937229188e-06, "loss": 0.3747, "step": 113320 }, { "epoch": 4.084225321656396, "grad_norm": 0.2076614648103714, "learning_rate": 4.283805746399533e-06, "loss": 0.374, "step": 113325 }, { "epoch": 4.08440552131762, "grad_norm": 0.2705661356449127, "learning_rate": 4.2821724027613224e-06, "loss": 0.3778, "step": 113330 }, { "epoch": 4.084585720978844, "grad_norm": 0.20731863379478455, "learning_rate": 4.280539341399497e-06, "loss": 0.3833, "step": 113335 }, { "epoch": 4.084765920640069, "grad_norm": 0.2571890950202942, "learning_rate": 4.27890656233631e-06, "loss": 0.3689, "step": 113340 }, { "epoch": 4.084946120301294, "grad_norm": 
0.2876233756542206, "learning_rate": 4.27727406559402e-06, "loss": 0.3643, "step": 113345 }, { "epoch": 4.0851263199625185, "grad_norm": 0.21950480341911316, "learning_rate": 4.275641851194853e-06, "loss": 0.3595, "step": 113350 }, { "epoch": 4.085306519623743, "grad_norm": 0.2071562260389328, "learning_rate": 4.274009919161048e-06, "loss": 0.3788, "step": 113355 }, { "epoch": 4.085486719284968, "grad_norm": 0.2707284688949585, "learning_rate": 4.272378269514851e-06, "loss": 0.3466, "step": 113360 }, { "epoch": 4.085666918946193, "grad_norm": 0.22272071242332458, "learning_rate": 4.2707469022784804e-06, "loss": 0.3734, "step": 113365 }, { "epoch": 4.0858471186074174, "grad_norm": 0.26984214782714844, "learning_rate": 4.269115817474181e-06, "loss": 0.3943, "step": 113370 }, { "epoch": 4.086027318268641, "grad_norm": 0.23475557565689087, "learning_rate": 4.267485015124162e-06, "loss": 0.4002, "step": 113375 }, { "epoch": 4.086207517929866, "grad_norm": 0.2445220947265625, "learning_rate": 4.265854495250638e-06, "loss": 0.3855, "step": 113380 }, { "epoch": 4.086387717591091, "grad_norm": 0.3065510392189026, "learning_rate": 4.264224257875837e-06, "loss": 0.3648, "step": 113385 }, { "epoch": 4.0865679172523155, "grad_norm": 0.25749513506889343, "learning_rate": 4.262594303021966e-06, "loss": 0.3851, "step": 113390 }, { "epoch": 4.08674811691354, "grad_norm": 0.307963103055954, "learning_rate": 4.260964630711234e-06, "loss": 0.3856, "step": 113395 }, { "epoch": 4.086928316574765, "grad_norm": 0.2993742525577545, "learning_rate": 4.2593352409658445e-06, "loss": 0.4103, "step": 113400 }, { "epoch": 4.08710851623599, "grad_norm": 0.28788864612579346, "learning_rate": 4.25770613380799e-06, "loss": 0.3724, "step": 113405 }, { "epoch": 4.087288715897214, "grad_norm": 0.22574250400066376, "learning_rate": 4.2560773092598786e-06, "loss": 0.3732, "step": 113410 }, { "epoch": 4.087468915558439, "grad_norm": 0.21905584633350372, "learning_rate": 4.2544487673436996e-06, "loss": 
0.368, "step": 113415 }, { "epoch": 4.087649115219663, "grad_norm": 0.20254629850387573, "learning_rate": 4.252820508081637e-06, "loss": 0.3668, "step": 113420 }, { "epoch": 4.087829314880888, "grad_norm": 0.24264578521251678, "learning_rate": 4.251192531495879e-06, "loss": 0.3633, "step": 113425 }, { "epoch": 4.0880095145421125, "grad_norm": 0.25658249855041504, "learning_rate": 4.249564837608608e-06, "loss": 0.3669, "step": 113430 }, { "epoch": 4.088189714203337, "grad_norm": 0.27399855852127075, "learning_rate": 4.247937426441989e-06, "loss": 0.3615, "step": 113435 }, { "epoch": 4.088369913864562, "grad_norm": 0.22151198983192444, "learning_rate": 4.2463102980182216e-06, "loss": 0.3419, "step": 113440 }, { "epoch": 4.088550113525787, "grad_norm": 0.23478932678699493, "learning_rate": 4.244683452359443e-06, "loss": 0.3594, "step": 113445 }, { "epoch": 4.088730313187011, "grad_norm": 0.19280430674552917, "learning_rate": 4.24305688948784e-06, "loss": 0.3518, "step": 113450 }, { "epoch": 4.088910512848236, "grad_norm": 0.2612258195877075, "learning_rate": 4.24143060942557e-06, "loss": 0.3779, "step": 113455 }, { "epoch": 4.089090712509461, "grad_norm": 0.24851427972316742, "learning_rate": 4.239804612194787e-06, "loss": 0.3803, "step": 113460 }, { "epoch": 4.089270912170685, "grad_norm": 0.24412818253040314, "learning_rate": 4.238178897817646e-06, "loss": 0.3736, "step": 113465 }, { "epoch": 4.0894511118319095, "grad_norm": 0.2470037341117859, "learning_rate": 4.2365534663163005e-06, "loss": 0.3824, "step": 113470 }, { "epoch": 4.089631311493134, "grad_norm": 0.27560508251190186, "learning_rate": 4.234928317712888e-06, "loss": 0.3997, "step": 113475 }, { "epoch": 4.089811511154359, "grad_norm": 0.29321983456611633, "learning_rate": 4.233303452029561e-06, "loss": 0.3753, "step": 113480 }, { "epoch": 4.089991710815584, "grad_norm": 0.26840469241142273, "learning_rate": 4.231678869288455e-06, "loss": 0.3921, "step": 113485 }, { "epoch": 4.090171910476808, "grad_norm": 
0.18927091360092163, "learning_rate": 4.230054569511705e-06, "loss": 0.3568, "step": 113490 }, { "epoch": 4.090352110138033, "grad_norm": 0.2522876262664795, "learning_rate": 4.228430552721438e-06, "loss": 0.351, "step": 113495 }, { "epoch": 4.090532309799258, "grad_norm": 0.30679717659950256, "learning_rate": 4.226806818939777e-06, "loss": 0.3625, "step": 113500 }, { "epoch": 4.090532309799258, "eval_loss": 0.4292847514152527, "eval_runtime": 3.5606, "eval_samples_per_second": 28.085, "eval_steps_per_second": 7.021, "step": 113500 }, { "epoch": 4.090712509460483, "grad_norm": 0.28912264108657837, "learning_rate": 4.225183368188859e-06, "loss": 0.3747, "step": 113505 }, { "epoch": 4.090892709121706, "grad_norm": 0.2808322012424469, "learning_rate": 4.223560200490801e-06, "loss": 0.355, "step": 113510 }, { "epoch": 4.091072908782931, "grad_norm": 0.27322959899902344, "learning_rate": 4.221937315867702e-06, "loss": 0.3809, "step": 113515 }, { "epoch": 4.091253108444156, "grad_norm": 0.22654719650745392, "learning_rate": 4.220314714341689e-06, "loss": 0.3452, "step": 113520 }, { "epoch": 4.091433308105381, "grad_norm": 0.28565314412117004, "learning_rate": 4.218692395934856e-06, "loss": 0.3838, "step": 113525 }, { "epoch": 4.091613507766605, "grad_norm": 0.24002893269062042, "learning_rate": 4.217070360669325e-06, "loss": 0.3882, "step": 113530 }, { "epoch": 4.09179370742783, "grad_norm": 0.28343817591667175, "learning_rate": 4.215448608567193e-06, "loss": 0.3902, "step": 113535 }, { "epoch": 4.091973907089055, "grad_norm": 0.2013174593448639, "learning_rate": 4.213827139650536e-06, "loss": 0.3459, "step": 113540 }, { "epoch": 4.09215410675028, "grad_norm": 0.3252928555011749, "learning_rate": 4.212205953941467e-06, "loss": 0.3473, "step": 113545 }, { "epoch": 4.092334306411504, "grad_norm": 0.20931507647037506, "learning_rate": 4.2105850514620625e-06, "loss": 0.3574, "step": 113550 }, { "epoch": 4.092514506072728, "grad_norm": 0.21451285481452942, "learning_rate": 
4.208964432234422e-06, "loss": 0.4097, "step": 113555 }, { "epoch": 4.092694705733953, "grad_norm": 0.23355019092559814, "learning_rate": 4.20734409628061e-06, "loss": 0.348, "step": 113560 }, { "epoch": 4.092874905395178, "grad_norm": 0.21140533685684204, "learning_rate": 4.2057240436227025e-06, "loss": 0.3602, "step": 113565 }, { "epoch": 4.093055105056402, "grad_norm": 0.22797791659832, "learning_rate": 4.2041042742827855e-06, "loss": 0.3894, "step": 113570 }, { "epoch": 4.093235304717627, "grad_norm": 0.2819134593009949, "learning_rate": 4.202484788282923e-06, "loss": 0.3677, "step": 113575 }, { "epoch": 4.093415504378852, "grad_norm": 0.27418237924575806, "learning_rate": 4.200865585645178e-06, "loss": 0.3588, "step": 113580 }, { "epoch": 4.0935957040400766, "grad_norm": 0.258755087852478, "learning_rate": 4.19924666639161e-06, "loss": 0.3634, "step": 113585 }, { "epoch": 4.093775903701301, "grad_norm": 0.2556750476360321, "learning_rate": 4.197628030544276e-06, "loss": 0.3824, "step": 113590 }, { "epoch": 4.093956103362526, "grad_norm": 0.19411814212799072, "learning_rate": 4.196009678125237e-06, "loss": 0.3567, "step": 113595 }, { "epoch": 4.094136303023751, "grad_norm": 0.23082207143306732, "learning_rate": 4.19439160915654e-06, "loss": 0.3516, "step": 113600 }, { "epoch": 4.094316502684975, "grad_norm": 0.26558494567871094, "learning_rate": 4.192773823660229e-06, "loss": 0.4062, "step": 113605 }, { "epoch": 4.094496702346199, "grad_norm": 0.2534783184528351, "learning_rate": 4.191156321658343e-06, "loss": 0.3816, "step": 113610 }, { "epoch": 4.094676902007424, "grad_norm": 0.2528476417064667, "learning_rate": 4.189539103172926e-06, "loss": 0.355, "step": 113615 }, { "epoch": 4.094857101668649, "grad_norm": 0.29152292013168335, "learning_rate": 4.1879221682260054e-06, "loss": 0.3748, "step": 113620 }, { "epoch": 4.0950373013298735, "grad_norm": 0.22224609553813934, "learning_rate": 4.186305516839625e-06, "loss": 0.3615, "step": 113625 }, { "epoch": 
4.095217500991098, "grad_norm": 0.2683086395263672, "learning_rate": 4.184689149035792e-06, "loss": 0.4004, "step": 113630 }, { "epoch": 4.095397700652323, "grad_norm": 0.2543949782848358, "learning_rate": 4.183073064836545e-06, "loss": 0.3536, "step": 113635 }, { "epoch": 4.095577900313548, "grad_norm": 0.28799569606781006, "learning_rate": 4.181457264263897e-06, "loss": 0.3799, "step": 113640 }, { "epoch": 4.0957580999747725, "grad_norm": 0.21432629227638245, "learning_rate": 4.179841747339864e-06, "loss": 0.3454, "step": 113645 }, { "epoch": 4.095938299635996, "grad_norm": 0.256199449300766, "learning_rate": 4.178226514086453e-06, "loss": 0.3815, "step": 113650 }, { "epoch": 4.096118499297221, "grad_norm": 0.19477510452270508, "learning_rate": 4.176611564525679e-06, "loss": 0.3582, "step": 113655 }, { "epoch": 4.096298698958446, "grad_norm": 0.2505654990673065, "learning_rate": 4.1749968986795316e-06, "loss": 0.3512, "step": 113660 }, { "epoch": 4.0964788986196705, "grad_norm": 0.26357826590538025, "learning_rate": 4.173382516570026e-06, "loss": 0.361, "step": 113665 }, { "epoch": 4.096659098280895, "grad_norm": 0.22375109791755676, "learning_rate": 4.171768418219152e-06, "loss": 0.3643, "step": 113670 }, { "epoch": 4.09683929794212, "grad_norm": 0.2819434404373169, "learning_rate": 4.170154603648901e-06, "loss": 0.3794, "step": 113675 }, { "epoch": 4.097019497603345, "grad_norm": 0.24838387966156006, "learning_rate": 4.168541072881263e-06, "loss": 0.3224, "step": 113680 }, { "epoch": 4.0971996972645695, "grad_norm": 0.2065441608428955, "learning_rate": 4.1669278259382104e-06, "loss": 0.343, "step": 113685 }, { "epoch": 4.097379896925794, "grad_norm": 0.2739092707633972, "learning_rate": 4.16531486284174e-06, "loss": 0.4081, "step": 113690 }, { "epoch": 4.097560096587018, "grad_norm": 0.2650372385978699, "learning_rate": 4.1637021836138215e-06, "loss": 0.3675, "step": 113695 }, { "epoch": 4.097740296248243, "grad_norm": 0.23538993299007416, "learning_rate": 
4.162089788276424e-06, "loss": 0.397, "step": 113700 }, { "epoch": 4.0979204959094675, "grad_norm": 0.2418733686208725, "learning_rate": 4.160477676851523e-06, "loss": 0.3946, "step": 113705 }, { "epoch": 4.098100695570692, "grad_norm": 0.2708457112312317, "learning_rate": 4.158865849361071e-06, "loss": 0.3647, "step": 113710 }, { "epoch": 4.098280895231917, "grad_norm": 0.24828830361366272, "learning_rate": 4.157254305827041e-06, "loss": 0.3341, "step": 113715 }, { "epoch": 4.098461094893142, "grad_norm": 0.27252304553985596, "learning_rate": 4.1556430462713954e-06, "loss": 0.3764, "step": 113720 }, { "epoch": 4.098641294554366, "grad_norm": 0.21500465273857117, "learning_rate": 4.154032070716063e-06, "loss": 0.3764, "step": 113725 }, { "epoch": 4.098821494215591, "grad_norm": 0.2509201467037201, "learning_rate": 4.152421379183013e-06, "loss": 0.3784, "step": 113730 }, { "epoch": 4.099001693876816, "grad_norm": 0.24733483791351318, "learning_rate": 4.150810971694183e-06, "loss": 0.3653, "step": 113735 }, { "epoch": 4.09918189353804, "grad_norm": 0.22112403810024261, "learning_rate": 4.1492008482715254e-06, "loss": 0.3734, "step": 113740 }, { "epoch": 4.0993620931992645, "grad_norm": 0.22997091710567474, "learning_rate": 4.147591008936966e-06, "loss": 0.3426, "step": 113745 }, { "epoch": 4.099542292860489, "grad_norm": 0.21955984830856323, "learning_rate": 4.145981453712436e-06, "loss": 0.3695, "step": 113750 }, { "epoch": 4.099722492521714, "grad_norm": 0.2155844122171402, "learning_rate": 4.144372182619874e-06, "loss": 0.3533, "step": 113755 }, { "epoch": 4.099902692182939, "grad_norm": 0.2931585907936096, "learning_rate": 4.1430849703355755e-06, "loss": 0.4126, "step": 113760 }, { "epoch": 4.100082891844163, "grad_norm": 0.23086297512054443, "learning_rate": 4.141476210735803e-06, "loss": 0.3405, "step": 113765 }, { "epoch": 4.100263091505388, "grad_norm": 0.259778767824173, "learning_rate": 4.139867735329381e-06, "loss": 0.3706, "step": 113770 }, { "epoch": 
4.100443291166613, "grad_norm": 0.22859430313110352, "learning_rate": 4.13825954413822e-06, "loss": 0.3301, "step": 113775 }, { "epoch": 4.100623490827838, "grad_norm": 0.25174832344055176, "learning_rate": 4.13665163718423e-06, "loss": 0.3392, "step": 113780 }, { "epoch": 4.1008036904890615, "grad_norm": 0.2478947937488556, "learning_rate": 4.135044014489326e-06, "loss": 0.3599, "step": 113785 }, { "epoch": 4.100983890150286, "grad_norm": 0.2401927262544632, "learning_rate": 4.133436676075414e-06, "loss": 0.3812, "step": 113790 }, { "epoch": 4.101164089811511, "grad_norm": 0.2504352331161499, "learning_rate": 4.131829621964378e-06, "loss": 0.4121, "step": 113795 }, { "epoch": 4.101344289472736, "grad_norm": 0.21674078702926636, "learning_rate": 4.130222852178129e-06, "loss": 0.323, "step": 113800 }, { "epoch": 4.10152448913396, "grad_norm": 0.2414158135652542, "learning_rate": 4.128616366738544e-06, "loss": 0.3697, "step": 113805 }, { "epoch": 4.101704688795185, "grad_norm": 0.2412973940372467, "learning_rate": 4.127010165667533e-06, "loss": 0.3529, "step": 113810 }, { "epoch": 4.10188488845641, "grad_norm": 0.24448582530021667, "learning_rate": 4.125404248986961e-06, "loss": 0.3587, "step": 113815 }, { "epoch": 4.102065088117635, "grad_norm": 0.2748193144798279, "learning_rate": 4.12379861671871e-06, "loss": 0.374, "step": 113820 }, { "epoch": 4.102245287778859, "grad_norm": 0.28702595829963684, "learning_rate": 4.1221932688846664e-06, "loss": 0.3794, "step": 113825 }, { "epoch": 4.102425487440083, "grad_norm": 0.23407040536403656, "learning_rate": 4.120588205506698e-06, "loss": 0.3652, "step": 113830 }, { "epoch": 4.102605687101308, "grad_norm": 0.28252798318862915, "learning_rate": 4.1189834266066735e-06, "loss": 0.3586, "step": 113835 }, { "epoch": 4.102785886762533, "grad_norm": 0.2878764569759369, "learning_rate": 4.117378932206456e-06, "loss": 0.4031, "step": 113840 }, { "epoch": 4.102966086423757, "grad_norm": 0.21540088951587677, "learning_rate": 
4.1157747223279034e-06, "loss": 0.3806, "step": 113845 }, { "epoch": 4.103146286084982, "grad_norm": 0.24554520845413208, "learning_rate": 4.114170796992881e-06, "loss": 0.3937, "step": 113850 }, { "epoch": 4.103326485746207, "grad_norm": 0.2406504899263382, "learning_rate": 4.112567156223237e-06, "loss": 0.3786, "step": 113855 }, { "epoch": 4.103506685407432, "grad_norm": 0.25493624806404114, "learning_rate": 4.110963800040824e-06, "loss": 0.3673, "step": 113860 }, { "epoch": 4.103686885068656, "grad_norm": 0.20379377901554108, "learning_rate": 4.109360728467485e-06, "loss": 0.3536, "step": 113865 }, { "epoch": 4.103867084729881, "grad_norm": 0.25673457980155945, "learning_rate": 4.107757941525059e-06, "loss": 0.3743, "step": 113870 }, { "epoch": 4.104047284391106, "grad_norm": 0.2972078025341034, "learning_rate": 4.1061554392353816e-06, "loss": 0.3751, "step": 113875 }, { "epoch": 4.10422748405233, "grad_norm": 0.2244071513414383, "learning_rate": 4.104553221620297e-06, "loss": 0.3879, "step": 113880 }, { "epoch": 4.104407683713554, "grad_norm": 0.20610791444778442, "learning_rate": 4.1029512887016286e-06, "loss": 0.3761, "step": 113885 }, { "epoch": 4.104587883374779, "grad_norm": 0.23367708921432495, "learning_rate": 4.101349640501206e-06, "loss": 0.3863, "step": 113890 }, { "epoch": 4.104768083036004, "grad_norm": 0.26050618290901184, "learning_rate": 4.099748277040846e-06, "loss": 0.4031, "step": 113895 }, { "epoch": 4.104948282697229, "grad_norm": 0.2746144235134125, "learning_rate": 4.098147198342364e-06, "loss": 0.3751, "step": 113900 }, { "epoch": 4.105128482358453, "grad_norm": 0.22158581018447876, "learning_rate": 4.09654640442759e-06, "loss": 0.3932, "step": 113905 }, { "epoch": 4.105308682019678, "grad_norm": 0.21861512959003448, "learning_rate": 4.09494589531832e-06, "loss": 0.3805, "step": 113910 }, { "epoch": 4.105488881680903, "grad_norm": 0.2370881885290146, "learning_rate": 4.0933456710363585e-06, "loss": 0.3719, "step": 113915 }, { "epoch": 
4.1056690813421275, "grad_norm": 0.23769058287143707, "learning_rate": 4.091745731603519e-06, "loss": 0.3728, "step": 113920 }, { "epoch": 4.105849281003351, "grad_norm": 0.19477471709251404, "learning_rate": 4.090146077041598e-06, "loss": 0.3491, "step": 113925 }, { "epoch": 4.106029480664576, "grad_norm": 0.310822457075119, "learning_rate": 4.088546707372387e-06, "loss": 0.3381, "step": 113930 }, { "epoch": 4.106209680325801, "grad_norm": 0.25079676508903503, "learning_rate": 4.086947622617682e-06, "loss": 0.3781, "step": 113935 }, { "epoch": 4.1063898799870255, "grad_norm": 0.2638763189315796, "learning_rate": 4.085348822799257e-06, "loss": 0.4023, "step": 113940 }, { "epoch": 4.10657007964825, "grad_norm": 0.25837767124176025, "learning_rate": 4.083750307938911e-06, "loss": 0.3652, "step": 113945 }, { "epoch": 4.106750279309475, "grad_norm": 0.2736017405986786, "learning_rate": 4.082152078058419e-06, "loss": 0.3973, "step": 113950 }, { "epoch": 4.1069304789707, "grad_norm": 0.2712678611278534, "learning_rate": 4.080554133179554e-06, "loss": 0.381, "step": 113955 }, { "epoch": 4.1071106786319245, "grad_norm": 0.26710009574890137, "learning_rate": 4.078956473324091e-06, "loss": 0.3954, "step": 113960 }, { "epoch": 4.107290878293149, "grad_norm": 0.2664423882961273, "learning_rate": 4.077359098513789e-06, "loss": 0.3671, "step": 113965 }, { "epoch": 4.107471077954373, "grad_norm": 0.2606585919857025, "learning_rate": 4.075762008770423e-06, "loss": 0.4075, "step": 113970 }, { "epoch": 4.107651277615598, "grad_norm": 0.26195093989372253, "learning_rate": 4.074165204115754e-06, "loss": 0.3666, "step": 113975 }, { "epoch": 4.1078314772768225, "grad_norm": 0.23841117322444916, "learning_rate": 4.072568684571524e-06, "loss": 0.3521, "step": 113980 }, { "epoch": 4.108011676938047, "grad_norm": 0.24487237632274628, "learning_rate": 4.070972450159497e-06, "loss": 0.3861, "step": 113985 }, { "epoch": 4.108191876599272, "grad_norm": 0.22291886806488037, "learning_rate": 
4.069376500901414e-06, "loss": 0.3597, "step": 113990 }, { "epoch": 4.108372076260497, "grad_norm": 0.2304575890302658, "learning_rate": 4.06778083681903e-06, "loss": 0.362, "step": 113995 }, { "epoch": 4.1085522759217215, "grad_norm": 0.2351878434419632, "learning_rate": 4.066185457934083e-06, "loss": 0.3816, "step": 114000 }, { "epoch": 4.1085522759217215, "eval_loss": 0.4291282296180725, "eval_runtime": 3.5373, "eval_samples_per_second": 28.27, "eval_steps_per_second": 7.068, "step": 114000 }, { "epoch": 4.108732475582946, "grad_norm": 0.25444066524505615, "learning_rate": 4.064590364268298e-06, "loss": 0.3812, "step": 114005 }, { "epoch": 4.108912675244171, "grad_norm": 0.23914211988449097, "learning_rate": 4.062995555843419e-06, "loss": 0.3443, "step": 114010 }, { "epoch": 4.109092874905395, "grad_norm": 0.25930920243263245, "learning_rate": 4.061401032681172e-06, "loss": 0.3715, "step": 114015 }, { "epoch": 4.1092730745666195, "grad_norm": 0.37500235438346863, "learning_rate": 4.059806794803283e-06, "loss": 0.4553, "step": 114020 }, { "epoch": 4.109453274227844, "grad_norm": 0.26621177792549133, "learning_rate": 4.058212842231474e-06, "loss": 0.3756, "step": 114025 }, { "epoch": 4.109633473889069, "grad_norm": 0.22278466820716858, "learning_rate": 4.056619174987453e-06, "loss": 0.3579, "step": 114030 }, { "epoch": 4.109813673550294, "grad_norm": 0.2507738769054413, "learning_rate": 4.055025793092945e-06, "loss": 0.3704, "step": 114035 }, { "epoch": 4.109993873211518, "grad_norm": 0.29194319248199463, "learning_rate": 4.053432696569659e-06, "loss": 0.3727, "step": 114040 }, { "epoch": 4.110174072872743, "grad_norm": 0.22868956625461578, "learning_rate": 4.0518398854392955e-06, "loss": 0.3543, "step": 114045 }, { "epoch": 4.110354272533968, "grad_norm": 0.24495558440685272, "learning_rate": 4.050247359723558e-06, "loss": 0.3589, "step": 114050 }, { "epoch": 4.110534472195193, "grad_norm": 0.22219400107860565, "learning_rate": 4.0486551194441445e-06, "loss": 
0.3735, "step": 114055 }, { "epoch": 4.1107146718564165, "grad_norm": 0.240044966340065, "learning_rate": 4.0470631646227416e-06, "loss": 0.3688, "step": 114060 }, { "epoch": 4.110894871517641, "grad_norm": 0.2267971634864807, "learning_rate": 4.045471495281053e-06, "loss": 0.3558, "step": 114065 }, { "epoch": 4.111075071178866, "grad_norm": 0.24570490419864655, "learning_rate": 4.043880111440759e-06, "loss": 0.3887, "step": 114070 }, { "epoch": 4.111255270840091, "grad_norm": 0.24875159561634064, "learning_rate": 4.04228901312354e-06, "loss": 0.3644, "step": 114075 }, { "epoch": 4.111435470501315, "grad_norm": 0.2332112193107605, "learning_rate": 4.040698200351076e-06, "loss": 0.3569, "step": 114080 }, { "epoch": 4.11161567016254, "grad_norm": 0.2375115007162094, "learning_rate": 4.039107673145038e-06, "loss": 0.3451, "step": 114085 }, { "epoch": 4.111795869823765, "grad_norm": 0.26559481024742126, "learning_rate": 4.037517431527108e-06, "loss": 0.3891, "step": 114090 }, { "epoch": 4.11197606948499, "grad_norm": 0.26684683561325073, "learning_rate": 4.0359274755189386e-06, "loss": 0.3918, "step": 114095 }, { "epoch": 4.112156269146214, "grad_norm": 0.288522332906723, "learning_rate": 4.034337805142193e-06, "loss": 0.3604, "step": 114100 }, { "epoch": 4.112336468807438, "grad_norm": 0.23801618814468384, "learning_rate": 4.032748420418545e-06, "loss": 0.3558, "step": 114105 }, { "epoch": 4.112516668468663, "grad_norm": 0.26979947090148926, "learning_rate": 4.031159321369637e-06, "loss": 0.3942, "step": 114110 }, { "epoch": 4.112696868129888, "grad_norm": 0.24331551790237427, "learning_rate": 4.029570508017125e-06, "loss": 0.4005, "step": 114115 }, { "epoch": 4.112877067791112, "grad_norm": 0.24675403535366058, "learning_rate": 4.027981980382656e-06, "loss": 0.3982, "step": 114120 }, { "epoch": 4.113057267452337, "grad_norm": 0.18268905580043793, "learning_rate": 4.026393738487863e-06, "loss": 0.3782, "step": 114125 }, { "epoch": 4.113237467113562, "grad_norm": 
0.24003072082996368, "learning_rate": 4.024805782354402e-06, "loss": 0.3864, "step": 114130 }, { "epoch": 4.113417666774787, "grad_norm": 0.26609840989112854, "learning_rate": 4.023218112003902e-06, "loss": 0.3561, "step": 114135 }, { "epoch": 4.113597866436011, "grad_norm": 0.29449740052223206, "learning_rate": 4.021630727457995e-06, "loss": 0.4234, "step": 114140 }, { "epoch": 4.113778066097236, "grad_norm": 0.20891326665878296, "learning_rate": 4.020043628738304e-06, "loss": 0.3911, "step": 114145 }, { "epoch": 4.113958265758461, "grad_norm": 0.20913873612880707, "learning_rate": 4.018456815866453e-06, "loss": 0.3592, "step": 114150 }, { "epoch": 4.114138465419685, "grad_norm": 0.22680309414863586, "learning_rate": 4.016870288864072e-06, "loss": 0.367, "step": 114155 }, { "epoch": 4.114318665080909, "grad_norm": 0.2160423845052719, "learning_rate": 4.015284047752771e-06, "loss": 0.3595, "step": 114160 }, { "epoch": 4.114498864742134, "grad_norm": 0.2502204179763794, "learning_rate": 4.013698092554163e-06, "loss": 0.3754, "step": 114165 }, { "epoch": 4.114679064403359, "grad_norm": 0.2723202109336853, "learning_rate": 4.012112423289855e-06, "loss": 0.3667, "step": 114170 }, { "epoch": 4.114859264064584, "grad_norm": 0.279547780752182, "learning_rate": 4.010527039981443e-06, "loss": 0.3663, "step": 114175 }, { "epoch": 4.115039463725808, "grad_norm": 0.2516183853149414, "learning_rate": 4.008941942650546e-06, "loss": 0.368, "step": 114180 }, { "epoch": 4.115219663387033, "grad_norm": 0.277414470911026, "learning_rate": 4.007357131318753e-06, "loss": 0.3524, "step": 114185 }, { "epoch": 4.115399863048258, "grad_norm": 0.237200066447258, "learning_rate": 4.005772606007646e-06, "loss": 0.3486, "step": 114190 }, { "epoch": 4.1155800627094825, "grad_norm": 0.2728269100189209, "learning_rate": 4.004188366738829e-06, "loss": 0.3794, "step": 114195 }, { "epoch": 4.115760262370706, "grad_norm": 0.2168535739183426, "learning_rate": 4.002604413533878e-06, "loss": 0.3768, 
"step": 114200 }, { "epoch": 4.115940462031931, "grad_norm": 0.22027429938316345, "learning_rate": 4.00102074641438e-06, "loss": 0.3745, "step": 114205 }, { "epoch": 4.116120661693156, "grad_norm": 0.23675896227359772, "learning_rate": 3.999437365401906e-06, "loss": 0.3777, "step": 114210 }, { "epoch": 4.116300861354381, "grad_norm": 0.24391767382621765, "learning_rate": 3.997854270518026e-06, "loss": 0.3857, "step": 114215 }, { "epoch": 4.116481061015605, "grad_norm": 0.239417165517807, "learning_rate": 3.996271461784324e-06, "loss": 0.3692, "step": 114220 }, { "epoch": 4.11666126067683, "grad_norm": 0.2657260596752167, "learning_rate": 3.9946889392223545e-06, "loss": 0.3556, "step": 114225 }, { "epoch": 4.116841460338055, "grad_norm": 0.2049977332353592, "learning_rate": 3.993106702853683e-06, "loss": 0.3461, "step": 114230 }, { "epoch": 4.1170216599992795, "grad_norm": 0.24377499520778656, "learning_rate": 3.9915247526998625e-06, "loss": 0.3724, "step": 114235 }, { "epoch": 4.117201859660504, "grad_norm": 0.23424182832241058, "learning_rate": 3.989943088782453e-06, "loss": 0.3779, "step": 114240 }, { "epoch": 4.117382059321728, "grad_norm": 0.21818037331104279, "learning_rate": 3.9883617111229955e-06, "loss": 0.3498, "step": 114245 }, { "epoch": 4.117562258982953, "grad_norm": 0.25411292910575867, "learning_rate": 3.986780619743047e-06, "loss": 0.3329, "step": 114250 }, { "epoch": 4.1177424586441775, "grad_norm": 0.2751554846763611, "learning_rate": 3.985199814664142e-06, "loss": 0.3821, "step": 114255 }, { "epoch": 4.117922658305402, "grad_norm": 0.20654655992984772, "learning_rate": 3.9836192959078225e-06, "loss": 0.3981, "step": 114260 }, { "epoch": 4.118102857966627, "grad_norm": 0.21761824190616608, "learning_rate": 3.982039063495621e-06, "loss": 0.3587, "step": 114265 }, { "epoch": 4.118283057627852, "grad_norm": 0.3250308334827423, "learning_rate": 3.980459117449065e-06, "loss": 0.3831, "step": 114270 }, { "epoch": 4.1184632572890765, "grad_norm": 
0.22311455011367798, "learning_rate": 3.978879457789686e-06, "loss": 0.3779, "step": 114275 }, { "epoch": 4.118643456950301, "grad_norm": 0.28070372343063354, "learning_rate": 3.977300084539013e-06, "loss": 0.3387, "step": 114280 }, { "epoch": 4.118823656611526, "grad_norm": 0.19799430668354034, "learning_rate": 3.975720997718544e-06, "loss": 0.3412, "step": 114285 }, { "epoch": 4.11900385627275, "grad_norm": 0.24587881565093994, "learning_rate": 3.974142197349809e-06, "loss": 0.3441, "step": 114290 }, { "epoch": 4.1191840559339745, "grad_norm": 0.26680803298950195, "learning_rate": 3.972563683454314e-06, "loss": 0.3738, "step": 114295 }, { "epoch": 4.119364255595199, "grad_norm": 0.22729459404945374, "learning_rate": 3.970985456053578e-06, "loss": 0.38, "step": 114300 }, { "epoch": 4.119544455256424, "grad_norm": 0.28258028626441956, "learning_rate": 3.969407515169088e-06, "loss": 0.3575, "step": 114305 }, { "epoch": 4.119724654917649, "grad_norm": 0.2133554369211197, "learning_rate": 3.967829860822342e-06, "loss": 0.3429, "step": 114310 }, { "epoch": 4.1199048545788735, "grad_norm": 0.2711166441440582, "learning_rate": 3.966252493034847e-06, "loss": 0.3196, "step": 114315 }, { "epoch": 4.120085054240098, "grad_norm": 0.2520091235637665, "learning_rate": 3.964675411828092e-06, "loss": 0.387, "step": 114320 }, { "epoch": 4.120265253901323, "grad_norm": 0.2586155831813812, "learning_rate": 3.963098617223562e-06, "loss": 0.3943, "step": 114325 }, { "epoch": 4.120445453562548, "grad_norm": 0.23721350729465485, "learning_rate": 3.961522109242741e-06, "loss": 0.3705, "step": 114330 }, { "epoch": 4.120625653223772, "grad_norm": 0.24257147312164307, "learning_rate": 3.959945887907099e-06, "loss": 0.3615, "step": 114335 }, { "epoch": 4.120805852884996, "grad_norm": 0.2581164240837097, "learning_rate": 3.9583699532381304e-06, "loss": 0.3599, "step": 114340 }, { "epoch": 4.120986052546221, "grad_norm": 0.2084408849477768, "learning_rate": 3.956794305257294e-06, "loss": 
0.3728, "step": 114345 }, { "epoch": 4.121166252207446, "grad_norm": 0.22756318747997284, "learning_rate": 3.955218943986064e-06, "loss": 0.3401, "step": 114350 }, { "epoch": 4.12134645186867, "grad_norm": 0.21954157948493958, "learning_rate": 3.9536438694459e-06, "loss": 0.394, "step": 114355 }, { "epoch": 4.121526651529895, "grad_norm": 0.24439333379268646, "learning_rate": 3.952069081658258e-06, "loss": 0.3647, "step": 114360 }, { "epoch": 4.12170685119112, "grad_norm": 0.2532067596912384, "learning_rate": 3.950494580644606e-06, "loss": 0.3597, "step": 114365 }, { "epoch": 4.121887050852345, "grad_norm": 0.23494374752044678, "learning_rate": 3.9489203664263955e-06, "loss": 0.3872, "step": 114370 }, { "epoch": 4.122067250513569, "grad_norm": 0.2250523865222931, "learning_rate": 3.947346439025057e-06, "loss": 0.3852, "step": 114375 }, { "epoch": 4.122247450174794, "grad_norm": 0.21566912531852722, "learning_rate": 3.9457727984620524e-06, "loss": 0.3862, "step": 114380 }, { "epoch": 4.122427649836018, "grad_norm": 0.33429858088493347, "learning_rate": 3.94419944475882e-06, "loss": 0.3911, "step": 114385 }, { "epoch": 4.122607849497243, "grad_norm": 0.28775933384895325, "learning_rate": 3.942626377936793e-06, "loss": 0.3773, "step": 114390 }, { "epoch": 4.122788049158467, "grad_norm": 0.23396413028240204, "learning_rate": 3.941053598017402e-06, "loss": 0.3944, "step": 114395 }, { "epoch": 4.122968248819692, "grad_norm": 0.2518860101699829, "learning_rate": 3.939481105022075e-06, "loss": 0.3333, "step": 114400 }, { "epoch": 4.123148448480917, "grad_norm": 0.2610318064689636, "learning_rate": 3.937908898972248e-06, "loss": 0.3537, "step": 114405 }, { "epoch": 4.123328648142142, "grad_norm": 0.2574523687362671, "learning_rate": 3.936336979889332e-06, "loss": 0.3709, "step": 114410 }, { "epoch": 4.123508847803366, "grad_norm": 0.2853260040283203, "learning_rate": 3.934765347794747e-06, "loss": 0.372, "step": 114415 }, { "epoch": 4.123689047464591, "grad_norm": 
0.22084416449069977, "learning_rate": 3.933194002709906e-06, "loss": 0.3494, "step": 114420 }, { "epoch": 4.123869247125816, "grad_norm": 0.23949111998081207, "learning_rate": 3.9316229446562184e-06, "loss": 0.3449, "step": 114425 }, { "epoch": 4.12404944678704, "grad_norm": 0.2440653145313263, "learning_rate": 3.9300521736550825e-06, "loss": 0.358, "step": 114430 }, { "epoch": 4.124229646448264, "grad_norm": 0.2716491222381592, "learning_rate": 3.928481689727911e-06, "loss": 0.3932, "step": 114435 }, { "epoch": 4.124409846109489, "grad_norm": 0.26395681500434875, "learning_rate": 3.926911492896098e-06, "loss": 0.3841, "step": 114440 }, { "epoch": 4.124590045770714, "grad_norm": 0.2175927311182022, "learning_rate": 3.925341583181039e-06, "loss": 0.3782, "step": 114445 }, { "epoch": 4.124770245431939, "grad_norm": 0.26144078373908997, "learning_rate": 3.923771960604117e-06, "loss": 0.3954, "step": 114450 }, { "epoch": 4.124950445093163, "grad_norm": 0.24796469509601593, "learning_rate": 3.922202625186719e-06, "loss": 0.3914, "step": 114455 }, { "epoch": 4.125130644754388, "grad_norm": 0.24708044528961182, "learning_rate": 3.920633576950234e-06, "loss": 0.391, "step": 114460 }, { "epoch": 4.125310844415613, "grad_norm": 0.23979832231998444, "learning_rate": 3.91906481591604e-06, "loss": 0.3627, "step": 114465 }, { "epoch": 4.1254910440768375, "grad_norm": 0.2370292693376541, "learning_rate": 3.917496342105495e-06, "loss": 0.3845, "step": 114470 }, { "epoch": 4.125671243738061, "grad_norm": 0.2842269837856293, "learning_rate": 3.915928155539986e-06, "loss": 0.3571, "step": 114475 }, { "epoch": 4.125851443399286, "grad_norm": 0.23658975958824158, "learning_rate": 3.914360256240871e-06, "loss": 0.357, "step": 114480 }, { "epoch": 4.126031643060511, "grad_norm": 0.24424517154693604, "learning_rate": 3.912792644229524e-06, "loss": 0.3581, "step": 114485 }, { "epoch": 4.126211842721736, "grad_norm": 0.3009839951992035, "learning_rate": 3.91122531952729e-06, "loss": 0.3738, 
"step": 114490 }, { "epoch": 4.12639204238296, "grad_norm": 0.1834869086742401, "learning_rate": 3.9096582821555204e-06, "loss": 0.3381, "step": 114495 }, { "epoch": 4.126572242044185, "grad_norm": 0.19505488872528076, "learning_rate": 3.908091532135583e-06, "loss": 0.3742, "step": 114500 }, { "epoch": 4.126572242044185, "eval_loss": 0.42937061190605164, "eval_runtime": 3.5336, "eval_samples_per_second": 28.3, "eval_steps_per_second": 7.075, "step": 114500 }, { "epoch": 4.12675244170541, "grad_norm": 0.21231359243392944, "learning_rate": 3.906525069488812e-06, "loss": 0.3716, "step": 114505 }, { "epoch": 4.1269326413666345, "grad_norm": 0.255136638879776, "learning_rate": 3.904958894236554e-06, "loss": 0.3851, "step": 114510 }, { "epoch": 4.127112841027859, "grad_norm": 0.22166618704795837, "learning_rate": 3.903393006400147e-06, "loss": 0.3521, "step": 114515 }, { "epoch": 4.127293040689083, "grad_norm": 0.23030775785446167, "learning_rate": 3.901827406000918e-06, "loss": 0.348, "step": 114520 }, { "epoch": 4.127473240350308, "grad_norm": 0.20609252154827118, "learning_rate": 3.900262093060214e-06, "loss": 0.3331, "step": 114525 }, { "epoch": 4.127653440011533, "grad_norm": 0.2601858377456665, "learning_rate": 3.898697067599355e-06, "loss": 0.3415, "step": 114530 }, { "epoch": 4.127833639672757, "grad_norm": 0.20611776411533356, "learning_rate": 3.897132329639661e-06, "loss": 0.3781, "step": 114535 }, { "epoch": 4.128013839333982, "grad_norm": 0.2544640898704529, "learning_rate": 3.895567879202452e-06, "loss": 0.3592, "step": 114540 }, { "epoch": 4.128194038995207, "grad_norm": 0.2738896608352661, "learning_rate": 3.894003716309047e-06, "loss": 0.3508, "step": 114545 }, { "epoch": 4.1283742386564315, "grad_norm": 0.20672179758548737, "learning_rate": 3.89243984098075e-06, "loss": 0.3829, "step": 114550 }, { "epoch": 4.128554438317656, "grad_norm": 0.31415534019470215, "learning_rate": 3.890876253238884e-06, "loss": 0.3825, "step": 114555 }, { "epoch": 
4.128734637978881, "grad_norm": 0.2770949900150299, "learning_rate": 3.8893129531047314e-06, "loss": 0.3637, "step": 114560 }, { "epoch": 4.128914837640105, "grad_norm": 0.20876848697662354, "learning_rate": 3.887749940599608e-06, "loss": 0.3571, "step": 114565 }, { "epoch": 4.1290950373013295, "grad_norm": 0.2529737055301666, "learning_rate": 3.886187215744805e-06, "loss": 0.3665, "step": 114570 }, { "epoch": 4.129275236962554, "grad_norm": 0.27046170830726624, "learning_rate": 3.884624778561605e-06, "loss": 0.368, "step": 114575 }, { "epoch": 4.129455436623779, "grad_norm": 0.2750345766544342, "learning_rate": 3.8830626290713185e-06, "loss": 0.3764, "step": 114580 }, { "epoch": 4.129635636285004, "grad_norm": 0.24169829487800598, "learning_rate": 3.881500767295201e-06, "loss": 0.363, "step": 114585 }, { "epoch": 4.1298158359462285, "grad_norm": 0.24228541553020477, "learning_rate": 3.8799391932545555e-06, "loss": 0.3773, "step": 114590 }, { "epoch": 4.129996035607453, "grad_norm": 0.21988023817539215, "learning_rate": 3.878377906970648e-06, "loss": 0.3664, "step": 114595 }, { "epoch": 4.130176235268678, "grad_norm": 0.2830277979373932, "learning_rate": 3.876816908464753e-06, "loss": 0.4126, "step": 114600 }, { "epoch": 4.130356434929903, "grad_norm": 0.2855124771595001, "learning_rate": 3.875256197758137e-06, "loss": 0.3822, "step": 114605 }, { "epoch": 4.130536634591127, "grad_norm": 0.314847856760025, "learning_rate": 3.873695774872066e-06, "loss": 0.3807, "step": 114610 }, { "epoch": 4.130716834252351, "grad_norm": 0.26585081219673157, "learning_rate": 3.872135639827795e-06, "loss": 0.3741, "step": 114615 }, { "epoch": 4.130897033913576, "grad_norm": 0.2731850743293762, "learning_rate": 3.870575792646591e-06, "loss": 0.3896, "step": 114620 }, { "epoch": 4.131077233574801, "grad_norm": 0.2461189180612564, "learning_rate": 3.8690162333497006e-06, "loss": 0.4125, "step": 114625 }, { "epoch": 4.1312574332360255, "grad_norm": 0.2130453884601593, "learning_rate": 
3.867456961958374e-06, "loss": 0.3652, "step": 114630 }, { "epoch": 4.13143763289725, "grad_norm": 0.2346867322921753, "learning_rate": 3.865897978493851e-06, "loss": 0.3495, "step": 114635 }, { "epoch": 4.131617832558475, "grad_norm": 0.23216021060943604, "learning_rate": 3.864339282977375e-06, "loss": 0.367, "step": 114640 }, { "epoch": 4.1317980322197, "grad_norm": 0.28052276372909546, "learning_rate": 3.862780875430188e-06, "loss": 0.3296, "step": 114645 }, { "epoch": 4.131978231880924, "grad_norm": 0.26213961839675903, "learning_rate": 3.861222755873526e-06, "loss": 0.4356, "step": 114650 }, { "epoch": 4.132158431542149, "grad_norm": 0.3032136857509613, "learning_rate": 3.859664924328599e-06, "loss": 0.3864, "step": 114655 }, { "epoch": 4.132338631203373, "grad_norm": 0.297122061252594, "learning_rate": 3.85810738081665e-06, "loss": 0.3805, "step": 114660 }, { "epoch": 4.132518830864598, "grad_norm": 0.1998542994260788, "learning_rate": 3.85655012535889e-06, "loss": 0.3474, "step": 114665 }, { "epoch": 4.132699030525822, "grad_norm": 0.21667265892028809, "learning_rate": 3.854993157976553e-06, "loss": 0.3602, "step": 114670 }, { "epoch": 4.132879230187047, "grad_norm": 0.2543310523033142, "learning_rate": 3.853436478690833e-06, "loss": 0.3556, "step": 114675 }, { "epoch": 4.133059429848272, "grad_norm": 0.2114337682723999, "learning_rate": 3.851880087522941e-06, "loss": 0.3946, "step": 114680 }, { "epoch": 4.133239629509497, "grad_norm": 0.2727842330932617, "learning_rate": 3.850323984494095e-06, "loss": 0.3585, "step": 114685 }, { "epoch": 4.133419829170721, "grad_norm": 0.27654939889907837, "learning_rate": 3.848768169625491e-06, "loss": 0.362, "step": 114690 }, { "epoch": 4.133600028831946, "grad_norm": 0.26321300864219666, "learning_rate": 3.8472126429383246e-06, "loss": 0.3651, "step": 114695 }, { "epoch": 4.133780228493171, "grad_norm": 0.23769168555736542, "learning_rate": 3.84565740445379e-06, "loss": 0.3345, "step": 114700 }, { "epoch": 
4.133960428154395, "grad_norm": 0.2931036353111267, "learning_rate": 3.844102454193072e-06, "loss": 0.3837, "step": 114705 }, { "epoch": 4.134140627815619, "grad_norm": 0.24765877425670624, "learning_rate": 3.84254779217737e-06, "loss": 0.3692, "step": 114710 }, { "epoch": 4.134320827476844, "grad_norm": 0.23711945116519928, "learning_rate": 3.840993418427855e-06, "loss": 0.3612, "step": 114715 }, { "epoch": 4.134501027138069, "grad_norm": 0.23325876891613007, "learning_rate": 3.839439332965711e-06, "loss": 0.3809, "step": 114720 }, { "epoch": 4.134681226799294, "grad_norm": 0.2400001585483551, "learning_rate": 3.837885535812108e-06, "loss": 0.3972, "step": 114725 }, { "epoch": 4.134861426460518, "grad_norm": 0.24221131205558777, "learning_rate": 3.836332026988218e-06, "loss": 0.3731, "step": 114730 }, { "epoch": 4.135041626121743, "grad_norm": 0.22964808344841003, "learning_rate": 3.834778806515199e-06, "loss": 0.3772, "step": 114735 }, { "epoch": 4.135221825782968, "grad_norm": 0.2366577684879303, "learning_rate": 3.833225874414228e-06, "loss": 0.3839, "step": 114740 }, { "epoch": 4.1354020254441926, "grad_norm": 0.281572550535202, "learning_rate": 3.8316732307064566e-06, "loss": 0.3524, "step": 114745 }, { "epoch": 4.135582225105416, "grad_norm": 0.24455080926418304, "learning_rate": 3.83012087541304e-06, "loss": 0.3965, "step": 114750 }, { "epoch": 4.135762424766641, "grad_norm": 0.2673299312591553, "learning_rate": 3.828568808555127e-06, "loss": 0.3553, "step": 114755 }, { "epoch": 4.135942624427866, "grad_norm": 0.29149943590164185, "learning_rate": 3.827017030153859e-06, "loss": 0.3808, "step": 114760 }, { "epoch": 4.136122824089091, "grad_norm": 0.2999207377433777, "learning_rate": 3.825465540230397e-06, "loss": 0.4054, "step": 114765 }, { "epoch": 4.136303023750315, "grad_norm": 0.2576257288455963, "learning_rate": 3.823914338805856e-06, "loss": 0.3723, "step": 114770 }, { "epoch": 4.13648322341154, "grad_norm": 0.2315170168876648, "learning_rate": 
3.822363425901388e-06, "loss": 0.3923, "step": 114775 }, { "epoch": 4.136663423072765, "grad_norm": 0.24673189222812653, "learning_rate": 3.8208128015381176e-06, "loss": 0.3754, "step": 114780 }, { "epoch": 4.1368436227339895, "grad_norm": 0.24493588507175446, "learning_rate": 3.8192624657371725e-06, "loss": 0.3749, "step": 114785 }, { "epoch": 4.137023822395214, "grad_norm": 0.2173871397972107, "learning_rate": 3.817712418519676e-06, "loss": 0.3839, "step": 114790 }, { "epoch": 4.137204022056438, "grad_norm": 0.3121466040611267, "learning_rate": 3.816162659906747e-06, "loss": 0.3741, "step": 114795 }, { "epoch": 4.137384221717663, "grad_norm": 0.2459418624639511, "learning_rate": 3.8146131899194942e-06, "loss": 0.3915, "step": 114800 }, { "epoch": 4.137564421378888, "grad_norm": 0.2291506677865982, "learning_rate": 3.8130640085790396e-06, "loss": 0.3682, "step": 114805 }, { "epoch": 4.137744621040112, "grad_norm": 0.2138427197933197, "learning_rate": 3.811515115906489e-06, "loss": 0.3368, "step": 114810 }, { "epoch": 4.137924820701337, "grad_norm": 0.28790679574012756, "learning_rate": 3.8099665119229382e-06, "loss": 0.3658, "step": 114815 }, { "epoch": 4.138105020362562, "grad_norm": 0.2095593363046646, "learning_rate": 3.8084181966494966e-06, "loss": 0.3348, "step": 114820 }, { "epoch": 4.1382852200237865, "grad_norm": 0.28139370679855347, "learning_rate": 3.8068701701072436e-06, "loss": 0.3597, "step": 114825 }, { "epoch": 4.138465419685011, "grad_norm": 0.2298731952905655, "learning_rate": 3.8053224323172904e-06, "loss": 0.3687, "step": 114830 }, { "epoch": 4.138645619346236, "grad_norm": 0.2664845287799835, "learning_rate": 3.8037749833007203e-06, "loss": 0.3159, "step": 114835 }, { "epoch": 4.13882581900746, "grad_norm": 0.21755090355873108, "learning_rate": 3.8022278230785996e-06, "loss": 0.3509, "step": 114840 }, { "epoch": 4.139006018668685, "grad_norm": 0.29724082350730896, "learning_rate": 3.8006809516720282e-06, "loss": 0.3392, "step": 114845 }, { 
"epoch": 4.139186218329909, "grad_norm": 0.23176833987236023, "learning_rate": 3.79913436910207e-06, "loss": 0.3852, "step": 114850 }, { "epoch": 4.139366417991134, "grad_norm": 0.2422746866941452, "learning_rate": 3.7975880753898045e-06, "loss": 0.3726, "step": 114855 }, { "epoch": 4.139546617652359, "grad_norm": 0.23337143659591675, "learning_rate": 3.796042070556302e-06, "loss": 0.3838, "step": 114860 }, { "epoch": 4.1397268173135835, "grad_norm": 0.2680877447128296, "learning_rate": 3.794496354622612e-06, "loss": 0.4008, "step": 114865 }, { "epoch": 4.139907016974808, "grad_norm": 0.26528939604759216, "learning_rate": 3.792950927609809e-06, "loss": 0.389, "step": 114870 }, { "epoch": 4.140087216636033, "grad_norm": 0.2818238139152527, "learning_rate": 3.7914057895389433e-06, "loss": 0.4002, "step": 114875 }, { "epoch": 4.140267416297258, "grad_norm": 0.2836932837963104, "learning_rate": 3.789860940431067e-06, "loss": 0.3686, "step": 114880 }, { "epoch": 4.140447615958482, "grad_norm": 0.22321908175945282, "learning_rate": 3.7883163803072277e-06, "loss": 0.3532, "step": 114885 }, { "epoch": 4.140627815619706, "grad_norm": 0.18690916895866394, "learning_rate": 3.786772109188466e-06, "loss": 0.3564, "step": 114890 }, { "epoch": 4.140808015280931, "grad_norm": 0.21489687263965607, "learning_rate": 3.7852281270958357e-06, "loss": 0.3558, "step": 114895 }, { "epoch": 4.140988214942156, "grad_norm": 0.2863061726093292, "learning_rate": 3.7836844340503637e-06, "loss": 0.3493, "step": 114900 }, { "epoch": 4.1411684146033805, "grad_norm": 0.22299538552761078, "learning_rate": 3.782141030073083e-06, "loss": 0.356, "step": 114905 }, { "epoch": 4.141348614264605, "grad_norm": 0.2980153262615204, "learning_rate": 3.7805979151850217e-06, "loss": 0.3812, "step": 114910 }, { "epoch": 4.14152881392583, "grad_norm": 0.20422306656837463, "learning_rate": 3.7790550894072073e-06, "loss": 0.362, "step": 114915 }, { "epoch": 4.141709013587055, "grad_norm": 0.2799051105976105, 
"learning_rate": 3.7775125527606534e-06, "loss": 0.37, "step": 114920 }, { "epoch": 4.141889213248279, "grad_norm": 0.2169763743877411, "learning_rate": 3.775970305266385e-06, "loss": 0.3863, "step": 114925 }, { "epoch": 4.142069412909504, "grad_norm": 0.27121099829673767, "learning_rate": 3.774428346945416e-06, "loss": 0.3563, "step": 114930 }, { "epoch": 4.142249612570728, "grad_norm": 0.28678664565086365, "learning_rate": 3.772886677818749e-06, "loss": 0.4166, "step": 114935 }, { "epoch": 4.142429812231953, "grad_norm": 0.24329231679439545, "learning_rate": 3.7713452979073913e-06, "loss": 0.372, "step": 114940 }, { "epoch": 4.1426100118931775, "grad_norm": 0.25663426518440247, "learning_rate": 3.7698042072323386e-06, "loss": 0.3657, "step": 114945 }, { "epoch": 4.142790211554402, "grad_norm": 0.29921120405197144, "learning_rate": 3.7682634058146065e-06, "loss": 0.3919, "step": 114950 }, { "epoch": 4.142970411215627, "grad_norm": 0.25063279271125793, "learning_rate": 3.766722893675162e-06, "loss": 0.3886, "step": 114955 }, { "epoch": 4.143150610876852, "grad_norm": 0.24971787631511688, "learning_rate": 3.76518267083501e-06, "loss": 0.3546, "step": 114960 }, { "epoch": 4.143330810538076, "grad_norm": 0.25325024127960205, "learning_rate": 3.763642737315137e-06, "loss": 0.3863, "step": 114965 }, { "epoch": 4.143511010199301, "grad_norm": 0.24295130372047424, "learning_rate": 3.762103093136518e-06, "loss": 0.3941, "step": 114970 }, { "epoch": 4.143691209860526, "grad_norm": 0.19150157272815704, "learning_rate": 3.7605637383201357e-06, "loss": 0.3463, "step": 114975 }, { "epoch": 4.14387140952175, "grad_norm": 0.24536427855491638, "learning_rate": 3.759024672886957e-06, "loss": 0.3587, "step": 114980 }, { "epoch": 4.144051609182974, "grad_norm": 0.3299379348754883, "learning_rate": 3.7574858968579513e-06, "loss": 0.3832, "step": 114985 }, { "epoch": 4.144231808844199, "grad_norm": 0.2293405532836914, "learning_rate": 3.7559474102540905e-06, "loss": 0.3422, "step": 
114990 }, { "epoch": 4.144412008505424, "grad_norm": 0.29431861639022827, "learning_rate": 3.7544092130963355e-06, "loss": 0.3949, "step": 114995 }, { "epoch": 4.144592208166649, "grad_norm": 0.23299720883369446, "learning_rate": 3.752871305405642e-06, "loss": 0.3659, "step": 115000 }, { "epoch": 4.144592208166649, "eval_loss": 0.4293991029262543, "eval_runtime": 3.5316, "eval_samples_per_second": 28.316, "eval_steps_per_second": 7.079, "step": 115000 }, { "epoch": 4.144772407827873, "grad_norm": 0.23517471551895142, "learning_rate": 3.7513336872029625e-06, "loss": 0.381, "step": 115005 }, { "epoch": 4.144952607489098, "grad_norm": 0.24651336669921875, "learning_rate": 3.7497963585092415e-06, "loss": 0.4108, "step": 115010 }, { "epoch": 4.145132807150323, "grad_norm": 0.2341834455728531, "learning_rate": 3.7482593193454375e-06, "loss": 0.3443, "step": 115015 }, { "epoch": 4.145313006811548, "grad_norm": 0.2706274092197418, "learning_rate": 3.7467225697324833e-06, "loss": 0.3762, "step": 115020 }, { "epoch": 4.145493206472771, "grad_norm": 0.2560116946697235, "learning_rate": 3.7451861096913234e-06, "loss": 0.3493, "step": 115025 }, { "epoch": 4.145673406133996, "grad_norm": 0.23122893273830414, "learning_rate": 3.7436499392428855e-06, "loss": 0.3732, "step": 115030 }, { "epoch": 4.145853605795221, "grad_norm": 0.2816644012928009, "learning_rate": 3.7421140584080942e-06, "loss": 0.378, "step": 115035 }, { "epoch": 4.146033805456446, "grad_norm": 0.3167472183704376, "learning_rate": 3.740578467207892e-06, "loss": 0.4059, "step": 115040 }, { "epoch": 4.14621400511767, "grad_norm": 0.22045278549194336, "learning_rate": 3.7390431656631975e-06, "loss": 0.3764, "step": 115045 }, { "epoch": 4.146394204778895, "grad_norm": 0.25917506217956543, "learning_rate": 3.737508153794911e-06, "loss": 0.3721, "step": 115050 }, { "epoch": 4.14657440444012, "grad_norm": 0.2379886656999588, "learning_rate": 3.7359734316239653e-06, "loss": 0.3326, "step": 115055 }, { "epoch": 
4.1467546041013446, "grad_norm": 0.3082965612411499, "learning_rate": 3.7344389991712666e-06, "loss": 0.339, "step": 115060 }, { "epoch": 4.146934803762569, "grad_norm": 0.25842636823654175, "learning_rate": 3.732904856457717e-06, "loss": 0.3674, "step": 115065 }, { "epoch": 4.147115003423793, "grad_norm": 0.2803257405757904, "learning_rate": 3.731371003504222e-06, "loss": 0.3743, "step": 115070 }, { "epoch": 4.147295203085018, "grad_norm": 0.1919623762369156, "learning_rate": 3.7298374403316737e-06, "loss": 0.3497, "step": 115075 }, { "epoch": 4.147475402746243, "grad_norm": 0.25834715366363525, "learning_rate": 3.7283041669609775e-06, "loss": 0.3668, "step": 115080 }, { "epoch": 4.147655602407467, "grad_norm": 0.2559783458709717, "learning_rate": 3.7267711834130194e-06, "loss": 0.3553, "step": 115085 }, { "epoch": 4.147835802068692, "grad_norm": 0.30941301584243774, "learning_rate": 3.725238489708685e-06, "loss": 0.3845, "step": 115090 }, { "epoch": 4.148016001729917, "grad_norm": 0.22858378291130066, "learning_rate": 3.7237060858688584e-06, "loss": 0.3625, "step": 115095 }, { "epoch": 4.1481962013911415, "grad_norm": 0.2147953361272812, "learning_rate": 3.722173971914417e-06, "loss": 0.3643, "step": 115100 }, { "epoch": 4.148376401052366, "grad_norm": 0.3191777169704437, "learning_rate": 3.7206421478662294e-06, "loss": 0.3437, "step": 115105 }, { "epoch": 4.148556600713591, "grad_norm": 0.2847544848918915, "learning_rate": 3.7191106137451776e-06, "loss": 0.3535, "step": 115110 }, { "epoch": 4.148736800374815, "grad_norm": 0.3304956555366516, "learning_rate": 3.7175793695721244e-06, "loss": 0.3658, "step": 115115 }, { "epoch": 4.14891700003604, "grad_norm": 0.2144591212272644, "learning_rate": 3.7160484153679313e-06, "loss": 0.3673, "step": 115120 }, { "epoch": 4.149097199697264, "grad_norm": 0.2552328407764435, "learning_rate": 3.714517751153457e-06, "loss": 0.3804, "step": 115125 }, { "epoch": 4.149277399358489, "grad_norm": 0.19830626249313354, 
"learning_rate": 3.7129873769495534e-06, "loss": 0.3935, "step": 115130 }, { "epoch": 4.149457599019714, "grad_norm": 0.22259441018104553, "learning_rate": 3.7114572927770875e-06, "loss": 0.3558, "step": 115135 }, { "epoch": 4.1496377986809385, "grad_norm": 0.26004859805107117, "learning_rate": 3.709927498656887e-06, "loss": 0.3926, "step": 115140 }, { "epoch": 4.149817998342163, "grad_norm": 0.2216162085533142, "learning_rate": 3.708397994609797e-06, "loss": 0.3725, "step": 115145 }, { "epoch": 4.149998198003388, "grad_norm": 0.23738591372966766, "learning_rate": 3.7068687806566694e-06, "loss": 0.3529, "step": 115150 }, { "epoch": 4.150178397664613, "grad_norm": 0.2530551254749298, "learning_rate": 3.705339856818324e-06, "loss": 0.3268, "step": 115155 }, { "epoch": 4.1503585973258375, "grad_norm": 0.20809151232242584, "learning_rate": 3.7038112231156134e-06, "loss": 0.3447, "step": 115160 }, { "epoch": 4.150538796987061, "grad_norm": 0.22564588487148285, "learning_rate": 3.7022828795693467e-06, "loss": 0.3774, "step": 115165 }, { "epoch": 4.150718996648286, "grad_norm": 0.2013123780488968, "learning_rate": 3.700754826200345e-06, "loss": 0.3752, "step": 115170 }, { "epoch": 4.150899196309511, "grad_norm": 0.29589053988456726, "learning_rate": 3.6992270630294396e-06, "loss": 0.345, "step": 115175 }, { "epoch": 4.1510793959707355, "grad_norm": 0.2514277994632721, "learning_rate": 3.697699590077444e-06, "loss": 0.4192, "step": 115180 }, { "epoch": 4.15125959563196, "grad_norm": 0.23917201161384583, "learning_rate": 3.696172407365167e-06, "loss": 0.3958, "step": 115185 }, { "epoch": 4.151439795293185, "grad_norm": 0.3023105561733246, "learning_rate": 3.694645514913417e-06, "loss": 0.3855, "step": 115190 }, { "epoch": 4.15161999495441, "grad_norm": 0.20139765739440918, "learning_rate": 3.693118912742988e-06, "loss": 0.3553, "step": 115195 }, { "epoch": 4.151800194615634, "grad_norm": 0.25707682967185974, "learning_rate": 3.6915926008746964e-06, "loss": 0.3776, "step": 
115200 }, { "epoch": 4.151980394276859, "grad_norm": 0.2819069027900696, "learning_rate": 3.690066579329332e-06, "loss": 0.4, "step": 115205 }, { "epoch": 4.152160593938083, "grad_norm": 0.24341073632240295, "learning_rate": 3.6885408481276833e-06, "loss": 0.3162, "step": 115210 }, { "epoch": 4.152340793599308, "grad_norm": 0.262385755777359, "learning_rate": 3.6870154072905396e-06, "loss": 0.4045, "step": 115215 }, { "epoch": 4.1525209932605325, "grad_norm": 0.2288028746843338, "learning_rate": 3.6854902568386777e-06, "loss": 0.3662, "step": 115220 }, { "epoch": 4.152701192921757, "grad_norm": 0.3026867210865021, "learning_rate": 3.6839653967928907e-06, "loss": 0.3821, "step": 115225 }, { "epoch": 4.152881392582982, "grad_norm": 0.2670912742614746, "learning_rate": 3.6824408271739552e-06, "loss": 0.3924, "step": 115230 }, { "epoch": 4.153061592244207, "grad_norm": 0.2640323340892792, "learning_rate": 3.6809165480026246e-06, "loss": 0.3696, "step": 115235 }, { "epoch": 4.153241791905431, "grad_norm": 0.272617369890213, "learning_rate": 3.6793925592996825e-06, "loss": 0.3597, "step": 115240 }, { "epoch": 4.153421991566656, "grad_norm": 0.2459690123796463, "learning_rate": 3.6778688610858896e-06, "loss": 0.3587, "step": 115245 }, { "epoch": 4.153602191227881, "grad_norm": 0.22347575426101685, "learning_rate": 3.6763454533820075e-06, "loss": 0.3799, "step": 115250 }, { "epoch": 4.153782390889105, "grad_norm": 0.2681044042110443, "learning_rate": 3.6748223362087886e-06, "loss": 0.3968, "step": 115255 }, { "epoch": 4.1539625905503295, "grad_norm": 0.2700466215610504, "learning_rate": 3.6732995095869806e-06, "loss": 0.3832, "step": 115260 }, { "epoch": 4.154142790211554, "grad_norm": 0.24011053144931793, "learning_rate": 3.671776973537344e-06, "loss": 0.3449, "step": 115265 }, { "epoch": 4.154322989872779, "grad_norm": 0.26308053731918335, "learning_rate": 3.670254728080616e-06, "loss": 0.3731, "step": 115270 }, { "epoch": 4.154503189534004, "grad_norm": 
0.19285693764686584, "learning_rate": 3.6687327732375376e-06, "loss": 0.3644, "step": 115275 }, { "epoch": 4.154683389195228, "grad_norm": 0.327217698097229, "learning_rate": 3.667211109028848e-06, "loss": 0.4314, "step": 115280 }, { "epoch": 4.154863588856453, "grad_norm": 0.22068528831005096, "learning_rate": 3.665689735475275e-06, "loss": 0.3825, "step": 115285 }, { "epoch": 4.155043788517678, "grad_norm": 0.23468096554279327, "learning_rate": 3.664168652597541e-06, "loss": 0.3646, "step": 115290 }, { "epoch": 4.155223988178903, "grad_norm": 0.2511104941368103, "learning_rate": 3.6626478604163877e-06, "loss": 0.3713, "step": 115295 }, { "epoch": 4.155404187840126, "grad_norm": 0.2715943157672882, "learning_rate": 3.6611273589525236e-06, "loss": 0.3386, "step": 115300 }, { "epoch": 4.155584387501351, "grad_norm": 0.2659306526184082, "learning_rate": 3.659607148226671e-06, "loss": 0.3665, "step": 115305 }, { "epoch": 4.155764587162576, "grad_norm": 0.2604070007801056, "learning_rate": 3.658087228259538e-06, "loss": 0.4022, "step": 115310 }, { "epoch": 4.155944786823801, "grad_norm": 0.25114676356315613, "learning_rate": 3.6565675990718277e-06, "loss": 0.3772, "step": 115315 }, { "epoch": 4.156124986485025, "grad_norm": 0.25636595487594604, "learning_rate": 3.6550482606842572e-06, "loss": 0.3437, "step": 115320 }, { "epoch": 4.15630518614625, "grad_norm": 0.23536555469036102, "learning_rate": 3.653529213117529e-06, "loss": 0.391, "step": 115325 }, { "epoch": 4.156485385807475, "grad_norm": 0.26349982619285583, "learning_rate": 3.6520104563923213e-06, "loss": 0.3576, "step": 115330 }, { "epoch": 4.1566655854687, "grad_norm": 0.26209887862205505, "learning_rate": 3.6504919905293422e-06, "loss": 0.3723, "step": 115335 }, { "epoch": 4.156845785129924, "grad_norm": 0.21833176910877228, "learning_rate": 3.6489738155492696e-06, "loss": 0.3723, "step": 115340 }, { "epoch": 4.157025984791149, "grad_norm": 0.29597973823547363, "learning_rate": 3.6474559314728097e-06, "loss": 
0.3672, "step": 115345 }, { "epoch": 4.157206184452373, "grad_norm": 0.24725015461444855, "learning_rate": 3.6459383383206202e-06, "loss": 0.3619, "step": 115350 }, { "epoch": 4.157386384113598, "grad_norm": 0.29851892590522766, "learning_rate": 3.6444210361133824e-06, "loss": 0.3806, "step": 115355 }, { "epoch": 4.157566583774822, "grad_norm": 0.2327483892440796, "learning_rate": 3.6429040248717795e-06, "loss": 0.3437, "step": 115360 }, { "epoch": 4.157746783436047, "grad_norm": 0.23190578818321228, "learning_rate": 3.6413873046164728e-06, "loss": 0.3671, "step": 115365 }, { "epoch": 4.157926983097272, "grad_norm": 0.21011842787265778, "learning_rate": 3.6398708753681286e-06, "loss": 0.3481, "step": 115370 }, { "epoch": 4.158107182758497, "grad_norm": 0.3232230544090271, "learning_rate": 3.638354737147412e-06, "loss": 0.4116, "step": 115375 }, { "epoch": 4.158287382419721, "grad_norm": 0.23676584661006927, "learning_rate": 3.6368388899749667e-06, "loss": 0.3751, "step": 115380 }, { "epoch": 4.158467582080946, "grad_norm": 0.2382199913263321, "learning_rate": 3.6353233338714626e-06, "loss": 0.3765, "step": 115385 }, { "epoch": 4.15864778174217, "grad_norm": 0.2705291509628296, "learning_rate": 3.6338080688575415e-06, "loss": 0.3765, "step": 115390 }, { "epoch": 4.158827981403395, "grad_norm": 0.25305843353271484, "learning_rate": 3.632293094953848e-06, "loss": 0.3927, "step": 115395 }, { "epoch": 4.159008181064619, "grad_norm": 0.20932044088840485, "learning_rate": 3.6307784121810267e-06, "loss": 0.3489, "step": 115400 }, { "epoch": 4.159188380725844, "grad_norm": 0.22791607677936554, "learning_rate": 3.6292640205597055e-06, "loss": 0.3658, "step": 115405 }, { "epoch": 4.159368580387069, "grad_norm": 0.24676699936389923, "learning_rate": 3.627749920110529e-06, "loss": 0.3737, "step": 115410 }, { "epoch": 4.1595487800482935, "grad_norm": 0.26706597208976746, "learning_rate": 3.626236110854128e-06, "loss": 0.3784, "step": 115415 }, { "epoch": 4.159728979709518, 
"grad_norm": 0.22861981391906738, "learning_rate": 3.624722592811111e-06, "loss": 0.3555, "step": 115420 }, { "epoch": 4.159909179370743, "grad_norm": 0.24083659052848816, "learning_rate": 3.623209366002117e-06, "loss": 0.3431, "step": 115425 }, { "epoch": 4.160089379031968, "grad_norm": 0.19510973989963531, "learning_rate": 3.6216964304477547e-06, "loss": 0.3485, "step": 115430 }, { "epoch": 4.1602695786931925, "grad_norm": 0.27537184953689575, "learning_rate": 3.620183786168635e-06, "loss": 0.3949, "step": 115435 }, { "epoch": 4.160449778354416, "grad_norm": 0.23948287963867188, "learning_rate": 3.618671433185383e-06, "loss": 0.3744, "step": 115440 }, { "epoch": 4.160629978015641, "grad_norm": 0.2727479040622711, "learning_rate": 3.617159371518583e-06, "loss": 0.3983, "step": 115445 }, { "epoch": 4.160810177676866, "grad_norm": 0.2685748040676117, "learning_rate": 3.6156476011888514e-06, "loss": 0.3499, "step": 115450 }, { "epoch": 4.1609903773380905, "grad_norm": 0.24433422088623047, "learning_rate": 3.6141361222167826e-06, "loss": 0.3799, "step": 115455 }, { "epoch": 4.161170576999315, "grad_norm": 0.24008378386497498, "learning_rate": 3.6126249346229656e-06, "loss": 0.3337, "step": 115460 }, { "epoch": 4.16135077666054, "grad_norm": 0.28376707434654236, "learning_rate": 3.6111140384279955e-06, "loss": 0.3982, "step": 115465 }, { "epoch": 4.161530976321765, "grad_norm": 0.22135163843631744, "learning_rate": 3.609603433652453e-06, "loss": 0.3829, "step": 115470 }, { "epoch": 4.1617111759829895, "grad_norm": 0.2689472436904907, "learning_rate": 3.608093120316919e-06, "loss": 0.366, "step": 115475 }, { "epoch": 4.161891375644214, "grad_norm": 0.24480800330638885, "learning_rate": 3.606583098441982e-06, "loss": 0.3782, "step": 115480 }, { "epoch": 4.162071575305438, "grad_norm": 0.27388742566108704, "learning_rate": 3.6050733680482063e-06, "loss": 0.3936, "step": 115485 }, { "epoch": 4.162251774966663, "grad_norm": 0.2704862356185913, "learning_rate": 
3.603563929156162e-06, "loss": 0.3431, "step": 115490 }, { "epoch": 4.1624319746278875, "grad_norm": 0.6965771913528442, "learning_rate": 3.602054781786418e-06, "loss": 0.4249, "step": 115495 }, { "epoch": 4.162612174289112, "grad_norm": 0.32166483998298645, "learning_rate": 3.600545925959531e-06, "loss": 0.3667, "step": 115500 }, { "epoch": 4.162612174289112, "eval_loss": 0.42964106798171997, "eval_runtime": 3.5337, "eval_samples_per_second": 28.299, "eval_steps_per_second": 7.075, "step": 115500 }, { "epoch": 4.162792373950337, "grad_norm": 0.19025029242038727, "learning_rate": 3.599037361696067e-06, "loss": 0.3388, "step": 115505 }, { "epoch": 4.162972573611562, "grad_norm": 0.27333715558052063, "learning_rate": 3.5975290890165796e-06, "loss": 0.3742, "step": 115510 }, { "epoch": 4.163152773272786, "grad_norm": 0.28083357214927673, "learning_rate": 3.596021107941605e-06, "loss": 0.3722, "step": 115515 }, { "epoch": 4.163332972934011, "grad_norm": 0.26710939407348633, "learning_rate": 3.5945134184917046e-06, "loss": 0.3906, "step": 115520 }, { "epoch": 4.163513172595236, "grad_norm": 0.23718206584453583, "learning_rate": 3.5930060206874087e-06, "loss": 0.3612, "step": 115525 }, { "epoch": 4.16369337225646, "grad_norm": 0.23605412244796753, "learning_rate": 3.591498914549274e-06, "loss": 0.359, "step": 115530 }, { "epoch": 4.1638735719176845, "grad_norm": 0.2375323325395584, "learning_rate": 3.5899921000978137e-06, "loss": 0.4013, "step": 115535 }, { "epoch": 4.164053771578909, "grad_norm": 0.19733993709087372, "learning_rate": 3.588485577353562e-06, "loss": 0.3864, "step": 115540 }, { "epoch": 4.164233971240134, "grad_norm": 0.28855153918266296, "learning_rate": 3.5869793463370547e-06, "loss": 0.3374, "step": 115545 }, { "epoch": 4.164414170901359, "grad_norm": 0.24822647869586945, "learning_rate": 3.585473407068801e-06, "loss": 0.3587, "step": 115550 }, { "epoch": 4.164594370562583, "grad_norm": 0.26548629999160767, "learning_rate": 3.58396775956934e-06, "loss": 
0.3545, "step": 115555 }, { "epoch": 4.164774570223808, "grad_norm": 0.20807038247585297, "learning_rate": 3.582462403859163e-06, "loss": 0.3993, "step": 115560 }, { "epoch": 4.164954769885033, "grad_norm": 0.20517727732658386, "learning_rate": 3.5809573399587846e-06, "loss": 0.3603, "step": 115565 }, { "epoch": 4.165134969546258, "grad_norm": 0.27609434723854065, "learning_rate": 3.579452567888722e-06, "loss": 0.387, "step": 115570 }, { "epoch": 4.1653151692074815, "grad_norm": 0.24407240748405457, "learning_rate": 3.577948087669472e-06, "loss": 0.4059, "step": 115575 }, { "epoch": 4.165495368868706, "grad_norm": 0.2198261171579361, "learning_rate": 3.5764438993215298e-06, "loss": 0.3422, "step": 115580 }, { "epoch": 4.165675568529931, "grad_norm": 0.2635994255542755, "learning_rate": 3.5749400028653927e-06, "loss": 0.3966, "step": 115585 }, { "epoch": 4.165855768191156, "grad_norm": 0.2824068069458008, "learning_rate": 3.5734363983215448e-06, "loss": 0.36, "step": 115590 }, { "epoch": 4.16603596785238, "grad_norm": 0.3001638650894165, "learning_rate": 3.571933085710483e-06, "loss": 0.3725, "step": 115595 }, { "epoch": 4.166216167513605, "grad_norm": 0.27790218591690063, "learning_rate": 3.5704300650526823e-06, "loss": 0.3894, "step": 115600 }, { "epoch": 4.16639636717483, "grad_norm": 0.2569793462753296, "learning_rate": 3.568927336368627e-06, "loss": 0.3765, "step": 115605 }, { "epoch": 4.166576566836055, "grad_norm": 0.25097736716270447, "learning_rate": 3.567424899678784e-06, "loss": 0.3612, "step": 115610 }, { "epoch": 4.166756766497279, "grad_norm": 0.2786616086959839, "learning_rate": 3.5659227550036284e-06, "loss": 0.3388, "step": 115615 }, { "epoch": 4.166936966158504, "grad_norm": 0.22307883203029633, "learning_rate": 3.5644209023636214e-06, "loss": 0.3804, "step": 115620 }, { "epoch": 4.167117165819728, "grad_norm": 0.27132895588874817, "learning_rate": 3.562919341779239e-06, "loss": 0.3481, "step": 115625 }, { "epoch": 4.167297365480953, "grad_norm": 
0.2521323263645172, "learning_rate": 3.5614180732709194e-06, "loss": 0.409, "step": 115630 }, { "epoch": 4.167477565142177, "grad_norm": 0.2317046821117401, "learning_rate": 3.5599170968591334e-06, "loss": 0.3321, "step": 115635 }, { "epoch": 4.167657764803402, "grad_norm": 0.2945384979248047, "learning_rate": 3.558416412564325e-06, "loss": 0.347, "step": 115640 }, { "epoch": 4.167837964464627, "grad_norm": 0.26886409521102905, "learning_rate": 3.5569160204069446e-06, "loss": 0.391, "step": 115645 }, { "epoch": 4.168018164125852, "grad_norm": 0.25826171040534973, "learning_rate": 3.555415920407429e-06, "loss": 0.3669, "step": 115650 }, { "epoch": 4.168198363787076, "grad_norm": 0.3123614192008972, "learning_rate": 3.5539161125862226e-06, "loss": 0.3831, "step": 115655 }, { "epoch": 4.168378563448301, "grad_norm": 0.21039628982543945, "learning_rate": 3.552416596963748e-06, "loss": 0.3678, "step": 115660 }, { "epoch": 4.168558763109526, "grad_norm": 0.21605625748634338, "learning_rate": 3.550917373560453e-06, "loss": 0.3595, "step": 115665 }, { "epoch": 4.16873896277075, "grad_norm": 0.1955236792564392, "learning_rate": 3.549418442396757e-06, "loss": 0.3606, "step": 115670 }, { "epoch": 4.168919162431974, "grad_norm": 0.2682759165763855, "learning_rate": 3.54791980349308e-06, "loss": 0.4046, "step": 115675 }, { "epoch": 4.169099362093199, "grad_norm": 0.26579999923706055, "learning_rate": 3.546421456869842e-06, "loss": 0.3668, "step": 115680 }, { "epoch": 4.169279561754424, "grad_norm": 0.2342423051595688, "learning_rate": 3.5449234025474536e-06, "loss": 0.3764, "step": 115685 }, { "epoch": 4.169459761415649, "grad_norm": 0.24451355636119843, "learning_rate": 3.543425640546333e-06, "loss": 0.4006, "step": 115690 }, { "epoch": 4.169639961076873, "grad_norm": 0.26802754402160645, "learning_rate": 3.5419281708868933e-06, "loss": 0.3725, "step": 115695 }, { "epoch": 4.169820160738098, "grad_norm": 0.21809948980808258, "learning_rate": 3.5404309935895134e-06, "loss": 
0.3756, "step": 115700 }, { "epoch": 4.170000360399323, "grad_norm": 0.23009651899337769, "learning_rate": 3.5389341086746158e-06, "loss": 0.3687, "step": 115705 }, { "epoch": 4.1701805600605475, "grad_norm": 0.2777594327926636, "learning_rate": 3.537437516162578e-06, "loss": 0.3974, "step": 115710 }, { "epoch": 4.170360759721771, "grad_norm": 0.22270822525024414, "learning_rate": 3.5359412160738044e-06, "loss": 0.3573, "step": 115715 }, { "epoch": 4.170540959382996, "grad_norm": 0.2586876451969147, "learning_rate": 3.5344452084286826e-06, "loss": 0.3584, "step": 115720 }, { "epoch": 4.170721159044221, "grad_norm": 0.2567586600780487, "learning_rate": 3.5329494932475754e-06, "loss": 0.3843, "step": 115725 }, { "epoch": 4.1709013587054455, "grad_norm": 0.24362897872924805, "learning_rate": 3.5314540705508823e-06, "loss": 0.3739, "step": 115730 }, { "epoch": 4.17108155836667, "grad_norm": 0.3008570373058319, "learning_rate": 3.529958940358971e-06, "loss": 0.3583, "step": 115735 }, { "epoch": 4.171261758027895, "grad_norm": 0.272904634475708, "learning_rate": 3.5284641026922137e-06, "loss": 0.3998, "step": 115740 }, { "epoch": 4.17144195768912, "grad_norm": 0.27428102493286133, "learning_rate": 3.526969557570975e-06, "loss": 0.3883, "step": 115745 }, { "epoch": 4.1716221573503445, "grad_norm": 0.25134575366973877, "learning_rate": 3.525475305015613e-06, "loss": 0.3813, "step": 115750 }, { "epoch": 4.171802357011569, "grad_norm": 0.2150566279888153, "learning_rate": 3.5239813450464983e-06, "loss": 0.3792, "step": 115755 }, { "epoch": 4.171982556672793, "grad_norm": 0.2204793393611908, "learning_rate": 3.5224876776839806e-06, "loss": 0.3873, "step": 115760 }, { "epoch": 4.172162756334018, "grad_norm": 0.2468349039554596, "learning_rate": 3.5209943029484077e-06, "loss": 0.4102, "step": 115765 }, { "epoch": 4.1723429559952425, "grad_norm": 0.2205950766801834, "learning_rate": 3.5195012208601303e-06, "loss": 0.3288, "step": 115770 }, { "epoch": 4.172523155656467, 
"grad_norm": 0.28197482228279114, "learning_rate": 3.5180084314394845e-06, "loss": 0.3871, "step": 115775 }, { "epoch": 4.172703355317692, "grad_norm": 0.28598877787590027, "learning_rate": 3.516515934706821e-06, "loss": 0.379, "step": 115780 }, { "epoch": 4.172883554978917, "grad_norm": 0.22734612226486206, "learning_rate": 3.515023730682468e-06, "loss": 0.3956, "step": 115785 }, { "epoch": 4.1730637546401415, "grad_norm": 0.2860606610774994, "learning_rate": 3.5135318193867563e-06, "loss": 0.4039, "step": 115790 }, { "epoch": 4.173243954301366, "grad_norm": 0.263313889503479, "learning_rate": 3.5120402008400167e-06, "loss": 0.3714, "step": 115795 }, { "epoch": 4.173424153962591, "grad_norm": 0.28966277837753296, "learning_rate": 3.5105488750625666e-06, "loss": 0.3525, "step": 115800 }, { "epoch": 4.173604353623815, "grad_norm": 0.27305835485458374, "learning_rate": 3.50905784207472e-06, "loss": 0.361, "step": 115805 }, { "epoch": 4.1737845532850395, "grad_norm": 0.28714773058891296, "learning_rate": 3.5075671018968136e-06, "loss": 0.3572, "step": 115810 }, { "epoch": 4.173964752946264, "grad_norm": 0.23891617357730865, "learning_rate": 3.506076654549134e-06, "loss": 0.3833, "step": 115815 }, { "epoch": 4.174144952607489, "grad_norm": 0.2725772261619568, "learning_rate": 3.504586500052001e-06, "loss": 0.3409, "step": 115820 }, { "epoch": 4.174325152268714, "grad_norm": 0.2952682375907898, "learning_rate": 3.503096638425718e-06, "loss": 0.3602, "step": 115825 }, { "epoch": 4.174505351929938, "grad_norm": 0.2583956718444824, "learning_rate": 3.5016070696905794e-06, "loss": 0.3786, "step": 115830 }, { "epoch": 4.174685551591163, "grad_norm": 0.2635524868965149, "learning_rate": 3.5001177938668834e-06, "loss": 0.3726, "step": 115835 }, { "epoch": 4.174865751252388, "grad_norm": 0.3067127764225006, "learning_rate": 3.4986288109749217e-06, "loss": 0.41, "step": 115840 }, { "epoch": 4.175045950913613, "grad_norm": 0.2670503556728363, "learning_rate": 
3.4971401210349724e-06, "loss": 0.3842, "step": 115845 }, { "epoch": 4.1752261505748365, "grad_norm": 0.2181471437215805, "learning_rate": 3.4956517240673307e-06, "loss": 0.4022, "step": 115850 }, { "epoch": 4.175406350236061, "grad_norm": 0.21090255677700043, "learning_rate": 3.4941636200922695e-06, "loss": 0.3688, "step": 115855 }, { "epoch": 4.175586549897286, "grad_norm": 0.2528001368045807, "learning_rate": 3.4926758091300694e-06, "loss": 0.3877, "step": 115860 }, { "epoch": 4.175766749558511, "grad_norm": 0.2146507054567337, "learning_rate": 3.4911882912009947e-06, "loss": 0.392, "step": 115865 }, { "epoch": 4.175946949219735, "grad_norm": 0.2687932848930359, "learning_rate": 3.48970106632531e-06, "loss": 0.3346, "step": 115870 }, { "epoch": 4.17612714888096, "grad_norm": 0.33123210072517395, "learning_rate": 3.4882141345232903e-06, "loss": 0.3685, "step": 115875 }, { "epoch": 4.176307348542185, "grad_norm": 0.2785130739212036, "learning_rate": 3.486727495815187e-06, "loss": 0.3529, "step": 115880 }, { "epoch": 4.17648754820341, "grad_norm": 0.31324052810668945, "learning_rate": 3.485241150221258e-06, "loss": 0.3905, "step": 115885 }, { "epoch": 4.176667747864634, "grad_norm": 0.23870332539081573, "learning_rate": 3.483755097761751e-06, "loss": 0.3257, "step": 115890 }, { "epoch": 4.176847947525859, "grad_norm": 0.296524316072464, "learning_rate": 3.4822693384569087e-06, "loss": 0.3858, "step": 115895 }, { "epoch": 4.177028147187083, "grad_norm": 0.22270824015140533, "learning_rate": 3.4807838723269896e-06, "loss": 0.3931, "step": 115900 }, { "epoch": 4.177208346848308, "grad_norm": 0.25900474190711975, "learning_rate": 3.479298699392228e-06, "loss": 0.3714, "step": 115905 }, { "epoch": 4.177388546509532, "grad_norm": 0.28385141491889954, "learning_rate": 3.4778138196728425e-06, "loss": 0.3734, "step": 115910 }, { "epoch": 4.177568746170757, "grad_norm": 0.2589219808578491, "learning_rate": 3.476329233189085e-06, "loss": 0.3638, "step": 115915 }, { "epoch": 
4.177748945831982, "grad_norm": 0.23942036926746368, "learning_rate": 3.4748449399611747e-06, "loss": 0.3584, "step": 115920 }, { "epoch": 4.177929145493207, "grad_norm": 0.2529667317867279, "learning_rate": 3.473360940009332e-06, "loss": 0.3587, "step": 115925 }, { "epoch": 4.178109345154431, "grad_norm": 0.22213000059127808, "learning_rate": 3.471877233353782e-06, "loss": 0.3889, "step": 115930 }, { "epoch": 4.178289544815656, "grad_norm": 0.2727386951446533, "learning_rate": 3.470393820014728e-06, "loss": 0.4031, "step": 115935 }, { "epoch": 4.178469744476881, "grad_norm": 0.20552094280719757, "learning_rate": 3.4689107000123982e-06, "loss": 0.3649, "step": 115940 }, { "epoch": 4.178649944138105, "grad_norm": 0.25130918622016907, "learning_rate": 3.467427873366988e-06, "loss": 0.3563, "step": 115945 }, { "epoch": 4.178830143799329, "grad_norm": 0.2697368562221527, "learning_rate": 3.4659453400987084e-06, "loss": 0.388, "step": 115950 }, { "epoch": 4.179010343460554, "grad_norm": 0.2934541404247284, "learning_rate": 3.4644631002277523e-06, "loss": 0.3953, "step": 115955 }, { "epoch": 4.179190543121779, "grad_norm": 0.20682430267333984, "learning_rate": 3.462981153774311e-06, "loss": 0.3416, "step": 115960 }, { "epoch": 4.179370742783004, "grad_norm": 0.22139784693717957, "learning_rate": 3.461499500758589e-06, "loss": 0.3593, "step": 115965 }, { "epoch": 4.179550942444228, "grad_norm": 0.26105570793151855, "learning_rate": 3.4600181412007633e-06, "loss": 0.3664, "step": 115970 }, { "epoch": 4.179731142105453, "grad_norm": 0.25991979241371155, "learning_rate": 3.458537075121024e-06, "loss": 0.394, "step": 115975 }, { "epoch": 4.179911341766678, "grad_norm": 0.24015338718891144, "learning_rate": 3.4570563025395435e-06, "loss": 0.3873, "step": 115980 }, { "epoch": 4.1800915414279025, "grad_norm": 0.24884821474552155, "learning_rate": 3.4555758234765006e-06, "loss": 0.3393, "step": 115985 }, { "epoch": 4.180271741089126, "grad_norm": 0.211832195520401, 
"learning_rate": 3.4540956379520623e-06, "loss": 0.3627, "step": 115990 }, { "epoch": 4.180451940750351, "grad_norm": 0.28997382521629333, "learning_rate": 3.4526157459864093e-06, "loss": 0.3957, "step": 115995 }, { "epoch": 4.180632140411576, "grad_norm": 0.23022478818893433, "learning_rate": 3.4511361475996843e-06, "loss": 0.3877, "step": 116000 }, { "epoch": 4.180632140411576, "eval_loss": 0.42928194999694824, "eval_runtime": 3.5376, "eval_samples_per_second": 28.268, "eval_steps_per_second": 7.067, "step": 116000 }, { "epoch": 4.180812340072801, "grad_norm": 0.23664060235023499, "learning_rate": 3.449656842812063e-06, "loss": 0.3887, "step": 116005 }, { "epoch": 4.180992539734025, "grad_norm": 0.31014373898506165, "learning_rate": 3.4481778316436925e-06, "loss": 0.3901, "step": 116010 }, { "epoch": 4.18117273939525, "grad_norm": 0.26111266016960144, "learning_rate": 3.4466991141147238e-06, "loss": 0.4023, "step": 116015 }, { "epoch": 4.181352939056475, "grad_norm": 0.20176957547664642, "learning_rate": 3.445220690245318e-06, "loss": 0.3669, "step": 116020 }, { "epoch": 4.1815331387176995, "grad_norm": 0.23669221997261047, "learning_rate": 3.4437425600555988e-06, "loss": 0.3726, "step": 116025 }, { "epoch": 4.181713338378924, "grad_norm": 0.22191861271858215, "learning_rate": 3.4422647235657105e-06, "loss": 0.3698, "step": 116030 }, { "epoch": 4.181893538040148, "grad_norm": 0.24171018600463867, "learning_rate": 3.4407871807957957e-06, "loss": 0.3699, "step": 116035 }, { "epoch": 4.182073737701373, "grad_norm": 0.22796885669231415, "learning_rate": 3.439309931765983e-06, "loss": 0.3784, "step": 116040 }, { "epoch": 4.1822539373625975, "grad_norm": 0.344339519739151, "learning_rate": 3.4378329764963995e-06, "loss": 0.3725, "step": 116045 }, { "epoch": 4.182434137023822, "grad_norm": 0.2166433185338974, "learning_rate": 3.4363563150071636e-06, "loss": 0.3542, "step": 116050 }, { "epoch": 4.182614336685047, "grad_norm": 0.2705228924751282, "learning_rate": 
3.434879947318395e-06, "loss": 0.3529, "step": 116055 }, { "epoch": 4.182794536346272, "grad_norm": 0.2719237506389618, "learning_rate": 3.4334038734502166e-06, "loss": 0.3851, "step": 116060 }, { "epoch": 4.1829747360074965, "grad_norm": 0.2510291635990143, "learning_rate": 3.431928093422737e-06, "loss": 0.3846, "step": 116065 }, { "epoch": 4.183154935668721, "grad_norm": 0.2954877018928528, "learning_rate": 3.43045260725606e-06, "loss": 0.3558, "step": 116070 }, { "epoch": 4.183335135329946, "grad_norm": 0.2507972717285156, "learning_rate": 3.4289774149702887e-06, "loss": 0.3594, "step": 116075 }, { "epoch": 4.18351533499117, "grad_norm": 0.24110138416290283, "learning_rate": 3.427502516585521e-06, "loss": 0.3744, "step": 116080 }, { "epoch": 4.1836955346523945, "grad_norm": 0.25332164764404297, "learning_rate": 3.426027912121857e-06, "loss": 0.4066, "step": 116085 }, { "epoch": 4.183875734313619, "grad_norm": 0.22243022918701172, "learning_rate": 3.424553601599395e-06, "loss": 0.3625, "step": 116090 }, { "epoch": 4.184055933974844, "grad_norm": 0.23225410282611847, "learning_rate": 3.4230795850381998e-06, "loss": 0.3428, "step": 116095 }, { "epoch": 4.184236133636069, "grad_norm": 0.28310906887054443, "learning_rate": 3.4216058624583743e-06, "loss": 0.3719, "step": 116100 }, { "epoch": 4.1844163332972935, "grad_norm": 0.2678923010826111, "learning_rate": 3.4201324338799884e-06, "loss": 0.3407, "step": 116105 }, { "epoch": 4.184596532958518, "grad_norm": 0.2929144501686096, "learning_rate": 3.418659299323124e-06, "loss": 0.3569, "step": 116110 }, { "epoch": 4.184776732619743, "grad_norm": 0.2045121192932129, "learning_rate": 3.4171864588078474e-06, "loss": 0.3586, "step": 116115 }, { "epoch": 4.184956932280968, "grad_norm": 0.21932172775268555, "learning_rate": 3.415713912354221e-06, "loss": 0.3525, "step": 116120 }, { "epoch": 4.1851371319421915, "grad_norm": 0.21695736050605774, "learning_rate": 3.4142416599823175e-06, "loss": 0.3583, "step": 116125 }, { 
"epoch": 4.185317331603416, "grad_norm": 0.2124013900756836, "learning_rate": 3.4127697017121928e-06, "loss": 0.3525, "step": 116130 }, { "epoch": 4.185497531264641, "grad_norm": 0.2589491307735443, "learning_rate": 3.4112980375639035e-06, "loss": 0.3682, "step": 116135 }, { "epoch": 4.185677730925866, "grad_norm": 0.2505156099796295, "learning_rate": 3.4098266675574974e-06, "loss": 0.3868, "step": 116140 }, { "epoch": 4.18585793058709, "grad_norm": 0.2609826326370239, "learning_rate": 3.4083555917130165e-06, "loss": 0.3701, "step": 116145 }, { "epoch": 4.186038130248315, "grad_norm": 0.2588742971420288, "learning_rate": 3.4068848100505176e-06, "loss": 0.3641, "step": 116150 }, { "epoch": 4.18621832990954, "grad_norm": 0.20735150575637817, "learning_rate": 3.4054143225900335e-06, "loss": 0.3315, "step": 116155 }, { "epoch": 4.186398529570765, "grad_norm": 0.23937851190567017, "learning_rate": 3.4039441293515966e-06, "loss": 0.3675, "step": 116160 }, { "epoch": 4.186578729231989, "grad_norm": 0.24689562618732452, "learning_rate": 3.4024742303552405e-06, "loss": 0.3567, "step": 116165 }, { "epoch": 4.186758928893214, "grad_norm": 0.24008159339427948, "learning_rate": 3.4010046256209906e-06, "loss": 0.3822, "step": 116170 }, { "epoch": 4.186939128554438, "grad_norm": 0.1905672252178192, "learning_rate": 3.3995353151688667e-06, "loss": 0.3537, "step": 116175 }, { "epoch": 4.187119328215663, "grad_norm": 0.20124223828315735, "learning_rate": 3.3980662990188956e-06, "loss": 0.3373, "step": 116180 }, { "epoch": 4.187299527876887, "grad_norm": 0.29924681782722473, "learning_rate": 3.396597577191091e-06, "loss": 0.3908, "step": 116185 }, { "epoch": 4.187479727538112, "grad_norm": 0.2561193108558655, "learning_rate": 3.3951291497054616e-06, "loss": 0.364, "step": 116190 }, { "epoch": 4.187659927199337, "grad_norm": 0.23734471201896667, "learning_rate": 3.3936610165820144e-06, "loss": 0.4054, "step": 116195 }, { "epoch": 4.187840126860562, "grad_norm": 0.21448448300361633, 
"learning_rate": 3.3921931778407467e-06, "loss": 0.3877, "step": 116200 }, { "epoch": 4.188020326521786, "grad_norm": 0.25979727506637573, "learning_rate": 3.390725633501676e-06, "loss": 0.3934, "step": 116205 }, { "epoch": 4.188200526183011, "grad_norm": 0.25088486075401306, "learning_rate": 3.3892583835847786e-06, "loss": 0.3796, "step": 116210 }, { "epoch": 4.188380725844236, "grad_norm": 0.24051101505756378, "learning_rate": 3.387791428110046e-06, "loss": 0.3435, "step": 116215 }, { "epoch": 4.18856092550546, "grad_norm": 0.265553742647171, "learning_rate": 3.3863247670974764e-06, "loss": 0.3918, "step": 116220 }, { "epoch": 4.188741125166684, "grad_norm": 0.22547340393066406, "learning_rate": 3.384858400567045e-06, "loss": 0.3579, "step": 116225 }, { "epoch": 4.188921324827909, "grad_norm": 0.2896125614643097, "learning_rate": 3.383392328538737e-06, "loss": 0.3668, "step": 116230 }, { "epoch": 4.189101524489134, "grad_norm": 0.24138803780078888, "learning_rate": 3.3819265510325214e-06, "loss": 0.3922, "step": 116235 }, { "epoch": 4.189281724150359, "grad_norm": 0.29337865114212036, "learning_rate": 3.3804610680683665e-06, "loss": 0.4072, "step": 116240 }, { "epoch": 4.189461923811583, "grad_norm": 0.29773908853530884, "learning_rate": 3.3789958796662496e-06, "loss": 0.3651, "step": 116245 }, { "epoch": 4.189642123472808, "grad_norm": 0.25638502836227417, "learning_rate": 3.3775309858461253e-06, "loss": 0.3669, "step": 116250 }, { "epoch": 4.189822323134033, "grad_norm": 0.23928916454315186, "learning_rate": 3.376066386627957e-06, "loss": 0.3569, "step": 116255 }, { "epoch": 4.1900025227952575, "grad_norm": 0.24761387705802917, "learning_rate": 3.3746020820316972e-06, "loss": 0.4, "step": 116260 }, { "epoch": 4.190182722456481, "grad_norm": 0.2518233358860016, "learning_rate": 3.373138072077292e-06, "loss": 0.3886, "step": 116265 }, { "epoch": 4.190362922117706, "grad_norm": 0.2131795734167099, "learning_rate": 3.3716743567847015e-06, "loss": 0.3559, "step": 
116270 }, { "epoch": 4.190543121778931, "grad_norm": 0.27569085359573364, "learning_rate": 3.370210936173862e-06, "loss": 0.3568, "step": 116275 }, { "epoch": 4.190723321440156, "grad_norm": 0.24571947753429413, "learning_rate": 3.3687478102647025e-06, "loss": 0.3836, "step": 116280 }, { "epoch": 4.19090352110138, "grad_norm": 0.25763317942619324, "learning_rate": 3.3672849790771704e-06, "loss": 0.3327, "step": 116285 }, { "epoch": 4.191083720762605, "grad_norm": 0.21428771317005157, "learning_rate": 3.365822442631192e-06, "loss": 0.3846, "step": 116290 }, { "epoch": 4.19126392042383, "grad_norm": 0.24545690417289734, "learning_rate": 3.3643602009466897e-06, "loss": 0.3348, "step": 116295 }, { "epoch": 4.1914441200850545, "grad_norm": 0.23609334230422974, "learning_rate": 3.3628982540436004e-06, "loss": 0.3521, "step": 116300 }, { "epoch": 4.191624319746279, "grad_norm": 0.28154176473617554, "learning_rate": 3.3614366019418255e-06, "loss": 0.3723, "step": 116305 }, { "epoch": 4.191804519407503, "grad_norm": 0.3257730007171631, "learning_rate": 3.3599752446612896e-06, "loss": 0.3708, "step": 116310 }, { "epoch": 4.191984719068728, "grad_norm": 0.2922912836074829, "learning_rate": 3.358514182221903e-06, "loss": 0.3717, "step": 116315 }, { "epoch": 4.192164918729953, "grad_norm": 0.2203517109155655, "learning_rate": 3.357053414643571e-06, "loss": 0.3921, "step": 116320 }, { "epoch": 4.192345118391177, "grad_norm": 0.27214643359184265, "learning_rate": 3.355592941946195e-06, "loss": 0.3657, "step": 116325 }, { "epoch": 4.192525318052402, "grad_norm": 0.22760187089443207, "learning_rate": 3.3541327641496754e-06, "loss": 0.3207, "step": 116330 }, { "epoch": 4.192705517713627, "grad_norm": 0.2242773473262787, "learning_rate": 3.3526728812738994e-06, "loss": 0.3882, "step": 116335 }, { "epoch": 4.1928857173748515, "grad_norm": 0.2846883237361908, "learning_rate": 3.35121329333877e-06, "loss": 0.3775, "step": 116340 }, { "epoch": 4.193065917036076, "grad_norm": 
0.2596665918827057, "learning_rate": 3.349754000364169e-06, "loss": 0.3672, "step": 116345 }, { "epoch": 4.193246116697301, "grad_norm": 0.2771369516849518, "learning_rate": 3.3482950023699777e-06, "loss": 0.3545, "step": 116350 }, { "epoch": 4.193426316358526, "grad_norm": 0.2566249966621399, "learning_rate": 3.346836299376077e-06, "loss": 0.3985, "step": 116355 }, { "epoch": 4.1936065160197495, "grad_norm": 0.24861952662467957, "learning_rate": 3.345377891402332e-06, "loss": 0.3828, "step": 116360 }, { "epoch": 4.193786715680974, "grad_norm": 0.2329019010066986, "learning_rate": 3.3439197784686267e-06, "loss": 0.4036, "step": 116365 }, { "epoch": 4.193966915342199, "grad_norm": 0.20926065742969513, "learning_rate": 3.342461960594823e-06, "loss": 0.3728, "step": 116370 }, { "epoch": 4.194147115003424, "grad_norm": 0.29187721014022827, "learning_rate": 3.341004437800779e-06, "loss": 0.3899, "step": 116375 }, { "epoch": 4.1943273146646485, "grad_norm": 0.25245538353919983, "learning_rate": 3.3395472101063584e-06, "loss": 0.3589, "step": 116380 }, { "epoch": 4.194507514325873, "grad_norm": 0.24126659333705902, "learning_rate": 3.338090277531408e-06, "loss": 0.38, "step": 116385 }, { "epoch": 4.194687713987098, "grad_norm": 0.31214639544487, "learning_rate": 3.336633640095793e-06, "loss": 0.3653, "step": 116390 }, { "epoch": 4.194867913648323, "grad_norm": 0.2557271122932434, "learning_rate": 3.3351772978193472e-06, "loss": 0.3487, "step": 116395 }, { "epoch": 4.1950481133095465, "grad_norm": 0.22002442181110382, "learning_rate": 3.3337212507219077e-06, "loss": 0.3418, "step": 116400 }, { "epoch": 4.195228312970771, "grad_norm": 0.3052828907966614, "learning_rate": 3.332265498823331e-06, "loss": 0.3924, "step": 116405 }, { "epoch": 4.195408512631996, "grad_norm": 0.2319956123828888, "learning_rate": 3.3308100421434397e-06, "loss": 0.4031, "step": 116410 }, { "epoch": 4.195588712293221, "grad_norm": 0.2713097631931305, "learning_rate": 3.3293548807020654e-06, "loss": 
0.3555, "step": 116415 }, { "epoch": 4.1957689119544455, "grad_norm": 0.296252578496933, "learning_rate": 3.3279000145190366e-06, "loss": 0.3809, "step": 116420 }, { "epoch": 4.19594911161567, "grad_norm": 0.29615938663482666, "learning_rate": 3.3264454436141677e-06, "loss": 0.382, "step": 116425 }, { "epoch": 4.196129311276895, "grad_norm": 0.2384418398141861, "learning_rate": 3.324991168007291e-06, "loss": 0.3745, "step": 116430 }, { "epoch": 4.19630951093812, "grad_norm": 0.26839661598205566, "learning_rate": 3.323537187718212e-06, "loss": 0.3893, "step": 116435 }, { "epoch": 4.196489710599344, "grad_norm": 0.3010355830192566, "learning_rate": 3.322083502766743e-06, "loss": 0.3965, "step": 116440 }, { "epoch": 4.196669910260569, "grad_norm": 0.25463324785232544, "learning_rate": 3.3206301131726875e-06, "loss": 0.3634, "step": 116445 }, { "epoch": 4.196850109921793, "grad_norm": 0.2410764843225479, "learning_rate": 3.3191770189558434e-06, "loss": 0.3554, "step": 116450 }, { "epoch": 4.197030309583018, "grad_norm": 0.315395325422287, "learning_rate": 3.317724220136023e-06, "loss": 0.3935, "step": 116455 }, { "epoch": 4.197210509244242, "grad_norm": 0.27405762672424316, "learning_rate": 3.3162717167330103e-06, "loss": 0.379, "step": 116460 }, { "epoch": 4.197390708905467, "grad_norm": 0.21512751281261444, "learning_rate": 3.314819508766598e-06, "loss": 0.358, "step": 116465 }, { "epoch": 4.197570908566692, "grad_norm": 0.2765870988368988, "learning_rate": 3.3133675962565723e-06, "loss": 0.3922, "step": 116470 }, { "epoch": 4.197751108227917, "grad_norm": 0.20413513481616974, "learning_rate": 3.311915979222713e-06, "loss": 0.3726, "step": 116475 }, { "epoch": 4.197931307889141, "grad_norm": 0.28501495718955994, "learning_rate": 3.3104646576847947e-06, "loss": 0.3839, "step": 116480 }, { "epoch": 4.198111507550366, "grad_norm": 0.2470470815896988, "learning_rate": 3.309013631662608e-06, "loss": 0.3825, "step": 116485 }, { "epoch": 4.198291707211591, "grad_norm": 
0.2344074696302414, "learning_rate": 3.3075629011759007e-06, "loss": 0.37, "step": 116490 }, { "epoch": 4.198471906872815, "grad_norm": 0.2440483570098877, "learning_rate": 3.3061124662444513e-06, "loss": 0.3667, "step": 116495 }, { "epoch": 4.198652106534039, "grad_norm": 0.22694821655750275, "learning_rate": 3.3046623268880224e-06, "loss": 0.3171, "step": 116500 }, { "epoch": 4.198652106534039, "eval_loss": 0.42932018637657166, "eval_runtime": 3.531, "eval_samples_per_second": 28.32, "eval_steps_per_second": 7.08, "step": 116500 }, { "epoch": 4.198832306195264, "grad_norm": 0.25988444685935974, "learning_rate": 3.3032124831263668e-06, "loss": 0.3446, "step": 116505 }, { "epoch": 4.199012505856489, "grad_norm": 0.2985076308250427, "learning_rate": 3.301762934979244e-06, "loss": 0.3893, "step": 116510 }, { "epoch": 4.199192705517714, "grad_norm": 0.24939928948879242, "learning_rate": 3.3003136824663965e-06, "loss": 0.3893, "step": 116515 }, { "epoch": 4.199372905178938, "grad_norm": 0.24855536222457886, "learning_rate": 3.29886472560757e-06, "loss": 0.3757, "step": 116520 }, { "epoch": 4.199553104840163, "grad_norm": 0.23109517991542816, "learning_rate": 3.297416064422515e-06, "loss": 0.3638, "step": 116525 }, { "epoch": 4.199733304501388, "grad_norm": 0.3385915160179138, "learning_rate": 3.2959676989309657e-06, "loss": 0.3723, "step": 116530 }, { "epoch": 4.1999135041626126, "grad_norm": 0.24903911352157593, "learning_rate": 3.2945196291526535e-06, "loss": 0.3575, "step": 116535 }, { "epoch": 4.200093703823836, "grad_norm": 0.28424081206321716, "learning_rate": 3.29307185510731e-06, "loss": 0.3789, "step": 116540 }, { "epoch": 4.200273903485061, "grad_norm": 0.2683737277984619, "learning_rate": 3.2916243768146533e-06, "loss": 0.3787, "step": 116545 }, { "epoch": 4.200454103146286, "grad_norm": 0.26493752002716064, "learning_rate": 3.2901771942944167e-06, "loss": 0.3706, "step": 116550 }, { "epoch": 4.200634302807511, "grad_norm": 0.33468732237815857, 
"learning_rate": 3.288730307566312e-06, "loss": 0.3563, "step": 116555 }, { "epoch": 4.200814502468735, "grad_norm": 0.26545238494873047, "learning_rate": 3.2872837166500555e-06, "loss": 0.3556, "step": 116560 }, { "epoch": 4.20099470212996, "grad_norm": 0.2382139265537262, "learning_rate": 3.2858374215653547e-06, "loss": 0.3753, "step": 116565 }, { "epoch": 4.201174901791185, "grad_norm": 0.18432195484638214, "learning_rate": 3.284391422331909e-06, "loss": 0.3495, "step": 116570 }, { "epoch": 4.2013551014524095, "grad_norm": 0.2506224513053894, "learning_rate": 3.282945718969435e-06, "loss": 0.3927, "step": 116575 }, { "epoch": 4.201535301113634, "grad_norm": 0.270535945892334, "learning_rate": 3.2815003114976177e-06, "loss": 0.383, "step": 116580 }, { "epoch": 4.201715500774858, "grad_norm": 0.26242825388908386, "learning_rate": 3.2800551999361497e-06, "loss": 0.3206, "step": 116585 }, { "epoch": 4.201895700436083, "grad_norm": 0.29855820536613464, "learning_rate": 3.2786103843047254e-06, "loss": 0.3921, "step": 116590 }, { "epoch": 4.202075900097308, "grad_norm": 0.25370094180107117, "learning_rate": 3.2771658646230274e-06, "loss": 0.3744, "step": 116595 }, { "epoch": 4.202256099758532, "grad_norm": 0.22978107631206512, "learning_rate": 3.2757216409107506e-06, "loss": 0.3969, "step": 116600 }, { "epoch": 4.202436299419757, "grad_norm": 0.2947256565093994, "learning_rate": 3.2742777131875514e-06, "loss": 0.3395, "step": 116605 }, { "epoch": 4.202616499080982, "grad_norm": 0.22788827121257782, "learning_rate": 3.2728340814731082e-06, "loss": 0.354, "step": 116610 }, { "epoch": 4.2027966987422065, "grad_norm": 0.3038329482078552, "learning_rate": 3.2713907457871006e-06, "loss": 0.3497, "step": 116615 }, { "epoch": 4.202976898403431, "grad_norm": 0.3307643234729767, "learning_rate": 3.2699477061491873e-06, "loss": 0.3688, "step": 116620 }, { "epoch": 4.203157098064656, "grad_norm": 0.2802697718143463, "learning_rate": 3.2685049625790276e-06, "loss": 0.4305, "step": 
116625 }, { "epoch": 4.203337297725881, "grad_norm": 0.24683642387390137, "learning_rate": 3.2670625150962806e-06, "loss": 0.3701, "step": 116630 }, { "epoch": 4.203517497387105, "grad_norm": 0.22279055416584015, "learning_rate": 3.265620363720595e-06, "loss": 0.3596, "step": 116635 }, { "epoch": 4.203697697048329, "grad_norm": 0.21414312720298767, "learning_rate": 3.2641785084716296e-06, "loss": 0.4076, "step": 116640 }, { "epoch": 4.203877896709554, "grad_norm": 0.2691706717014313, "learning_rate": 3.2627369493690245e-06, "loss": 0.3498, "step": 116645 }, { "epoch": 4.204058096370779, "grad_norm": 0.24428191781044006, "learning_rate": 3.261295686432417e-06, "loss": 0.4001, "step": 116650 }, { "epoch": 4.2042382960320035, "grad_norm": 0.23344086110591888, "learning_rate": 3.2598547196814487e-06, "loss": 0.3345, "step": 116655 }, { "epoch": 4.204418495693228, "grad_norm": 0.2314455211162567, "learning_rate": 3.2584140491357523e-06, "loss": 0.3732, "step": 116660 }, { "epoch": 4.204598695354453, "grad_norm": 0.2983637750148773, "learning_rate": 3.2569736748149477e-06, "loss": 0.4034, "step": 116665 }, { "epoch": 4.204778895015678, "grad_norm": 0.2737831473350525, "learning_rate": 3.2555335967386804e-06, "loss": 0.334, "step": 116670 }, { "epoch": 4.204959094676902, "grad_norm": 0.24176688492298126, "learning_rate": 3.2540938149265453e-06, "loss": 0.3541, "step": 116675 }, { "epoch": 4.205139294338126, "grad_norm": 0.21122761070728302, "learning_rate": 3.2526543293981775e-06, "loss": 0.3499, "step": 116680 }, { "epoch": 4.205319493999351, "grad_norm": 0.23312951624393463, "learning_rate": 3.2512151401731833e-06, "loss": 0.4094, "step": 116685 }, { "epoch": 4.205499693660576, "grad_norm": 0.24352633953094482, "learning_rate": 3.249776247271172e-06, "loss": 0.3615, "step": 116690 }, { "epoch": 4.2056798933218005, "grad_norm": 0.23205436766147614, "learning_rate": 3.24833765071175e-06, "loss": 0.3529, "step": 116695 }, { "epoch": 4.205860092983025, "grad_norm": 
0.23461531102657318, "learning_rate": 3.246899350514515e-06, "loss": 0.3595, "step": 116700 }, { "epoch": 4.20604029264425, "grad_norm": 0.26459741592407227, "learning_rate": 3.245461346699061e-06, "loss": 0.408, "step": 116705 }, { "epoch": 4.206220492305475, "grad_norm": 0.2987162172794342, "learning_rate": 3.2440236392849877e-06, "loss": 0.3556, "step": 116710 }, { "epoch": 4.206400691966699, "grad_norm": 0.22682152688503265, "learning_rate": 3.24258622829188e-06, "loss": 0.361, "step": 116715 }, { "epoch": 4.206580891627924, "grad_norm": 0.2210111767053604, "learning_rate": 3.241149113739325e-06, "loss": 0.3509, "step": 116720 }, { "epoch": 4.206761091289148, "grad_norm": 0.2724922001361847, "learning_rate": 3.239712295646899e-06, "loss": 0.3745, "step": 116725 }, { "epoch": 4.206941290950373, "grad_norm": 0.25349199771881104, "learning_rate": 3.238275774034175e-06, "loss": 0.3653, "step": 116730 }, { "epoch": 4.2071214906115975, "grad_norm": 0.29536816477775574, "learning_rate": 3.2368395489207366e-06, "loss": 0.383, "step": 116735 }, { "epoch": 4.207301690272822, "grad_norm": 0.2729000151157379, "learning_rate": 3.235403620326144e-06, "loss": 0.354, "step": 116740 }, { "epoch": 4.207481889934047, "grad_norm": 0.27360138297080994, "learning_rate": 3.233967988269965e-06, "loss": 0.3776, "step": 116745 }, { "epoch": 4.207662089595272, "grad_norm": 0.20422430336475372, "learning_rate": 3.2325326527717582e-06, "loss": 0.3573, "step": 116750 }, { "epoch": 4.207842289256496, "grad_norm": 0.31160596013069153, "learning_rate": 3.2310976138510724e-06, "loss": 0.3787, "step": 116755 }, { "epoch": 4.208022488917721, "grad_norm": 0.2688230574131012, "learning_rate": 3.2296628715274723e-06, "loss": 0.3671, "step": 116760 }, { "epoch": 4.208202688578946, "grad_norm": 0.2496272772550583, "learning_rate": 3.2282284258205093e-06, "loss": 0.3467, "step": 116765 }, { "epoch": 4.20838288824017, "grad_norm": 0.2692166268825531, "learning_rate": 3.226794276749706e-06, "loss": 
0.3607, "step": 116770 }, { "epoch": 4.208563087901394, "grad_norm": 0.27493536472320557, "learning_rate": 3.2253604243346196e-06, "loss": 0.3849, "step": 116775 }, { "epoch": 4.208743287562619, "grad_norm": 0.25217947363853455, "learning_rate": 3.2239268685947787e-06, "loss": 0.3608, "step": 116780 }, { "epoch": 4.208923487223844, "grad_norm": 0.24530518054962158, "learning_rate": 3.222493609549726e-06, "loss": 0.4019, "step": 116785 }, { "epoch": 4.209103686885069, "grad_norm": 0.2516281008720398, "learning_rate": 3.2210606472189792e-06, "loss": 0.3362, "step": 116790 }, { "epoch": 4.209283886546293, "grad_norm": 0.237876757979393, "learning_rate": 3.2196279816220587e-06, "loss": 0.3502, "step": 116795 }, { "epoch": 4.209464086207518, "grad_norm": 0.242570698261261, "learning_rate": 3.2181956127784933e-06, "loss": 0.3677, "step": 116800 }, { "epoch": 4.209644285868743, "grad_norm": 0.2549905478954315, "learning_rate": 3.2167635407077982e-06, "loss": 0.3541, "step": 116805 }, { "epoch": 4.209824485529968, "grad_norm": 0.33291956782341003, "learning_rate": 3.2153317654294803e-06, "loss": 0.4255, "step": 116810 }, { "epoch": 4.210004685191191, "grad_norm": 0.2316872775554657, "learning_rate": 3.213900286963051e-06, "loss": 0.3734, "step": 116815 }, { "epoch": 4.210184884852416, "grad_norm": 0.24276618659496307, "learning_rate": 3.212469105328006e-06, "loss": 0.402, "step": 116820 }, { "epoch": 4.210365084513641, "grad_norm": 0.20943395793437958, "learning_rate": 3.211038220543858e-06, "loss": 0.3512, "step": 116825 }, { "epoch": 4.210545284174866, "grad_norm": 0.263275682926178, "learning_rate": 3.2096076326300933e-06, "loss": 0.3615, "step": 116830 }, { "epoch": 4.21072548383609, "grad_norm": 0.234664648771286, "learning_rate": 3.2081773416062083e-06, "loss": 0.3926, "step": 116835 }, { "epoch": 4.210905683497315, "grad_norm": 0.2711605429649353, "learning_rate": 3.206747347491687e-06, "loss": 0.3848, "step": 116840 }, { "epoch": 4.21108588315854, "grad_norm": 
0.22846084833145142, "learning_rate": 3.205317650306014e-06, "loss": 0.3744, "step": 116845 }, { "epoch": 4.2112660828197646, "grad_norm": 0.2208365499973297, "learning_rate": 3.2038882500686625e-06, "loss": 0.3703, "step": 116850 }, { "epoch": 4.211446282480989, "grad_norm": 0.3054751753807068, "learning_rate": 3.202459146799125e-06, "loss": 0.3998, "step": 116855 }, { "epoch": 4.211626482142213, "grad_norm": 0.23200340569019318, "learning_rate": 3.201030340516853e-06, "loss": 0.3711, "step": 116860 }, { "epoch": 4.211806681803438, "grad_norm": 0.29509085416793823, "learning_rate": 3.199601831241325e-06, "loss": 0.385, "step": 116865 }, { "epoch": 4.211986881464663, "grad_norm": 0.22504891455173492, "learning_rate": 3.198173618992001e-06, "loss": 0.3803, "step": 116870 }, { "epoch": 4.212167081125887, "grad_norm": 0.2349441796541214, "learning_rate": 3.196745703788334e-06, "loss": 0.3666, "step": 116875 }, { "epoch": 4.212347280787112, "grad_norm": 0.3012699484825134, "learning_rate": 3.1953180856497984e-06, "loss": 0.3774, "step": 116880 }, { "epoch": 4.212527480448337, "grad_norm": 0.23085784912109375, "learning_rate": 3.1938907645958275e-06, "loss": 0.3879, "step": 116885 }, { "epoch": 4.2127076801095615, "grad_norm": 0.20077353715896606, "learning_rate": 3.192463740645868e-06, "loss": 0.3781, "step": 116890 }, { "epoch": 4.212887879770786, "grad_norm": 0.26119643449783325, "learning_rate": 3.19103701381937e-06, "loss": 0.3959, "step": 116895 }, { "epoch": 4.213068079432011, "grad_norm": 0.28771156072616577, "learning_rate": 3.1896105841357745e-06, "loss": 0.3555, "step": 116900 }, { "epoch": 4.213248279093236, "grad_norm": 0.23709803819656372, "learning_rate": 3.188184451614509e-06, "loss": 0.3638, "step": 116905 }, { "epoch": 4.21342847875446, "grad_norm": 0.25012511014938354, "learning_rate": 3.1867586162750095e-06, "loss": 0.3855, "step": 116910 }, { "epoch": 4.213608678415684, "grad_norm": 0.24966490268707275, "learning_rate": 3.185333078136693e-06, 
"loss": 0.3769, "step": 116915 }, { "epoch": 4.213788878076909, "grad_norm": 0.2194073498249054, "learning_rate": 3.1839078372189966e-06, "loss": 0.3601, "step": 116920 }, { "epoch": 4.213969077738134, "grad_norm": 0.2655618488788605, "learning_rate": 3.182482893541333e-06, "loss": 0.4166, "step": 116925 }, { "epoch": 4.2141492773993585, "grad_norm": 0.23362277448177338, "learning_rate": 3.1810582471231143e-06, "loss": 0.3466, "step": 116930 }, { "epoch": 4.214329477060583, "grad_norm": 0.2612302303314209, "learning_rate": 3.179633897983755e-06, "loss": 0.3442, "step": 116935 }, { "epoch": 4.214509676721808, "grad_norm": 0.258493572473526, "learning_rate": 3.1782098461426516e-06, "loss": 0.3749, "step": 116940 }, { "epoch": 4.214689876383033, "grad_norm": 0.297879159450531, "learning_rate": 3.1767860916192216e-06, "loss": 0.395, "step": 116945 }, { "epoch": 4.2148700760442575, "grad_norm": 0.23272161185741425, "learning_rate": 3.17536263443286e-06, "loss": 0.3777, "step": 116950 }, { "epoch": 4.215050275705481, "grad_norm": 0.21118395030498505, "learning_rate": 3.1739394746029465e-06, "loss": 0.3803, "step": 116955 }, { "epoch": 4.215230475366706, "grad_norm": 0.1826326549053192, "learning_rate": 3.1725166121488875e-06, "loss": 0.3826, "step": 116960 }, { "epoch": 4.215410675027931, "grad_norm": 0.23955535888671875, "learning_rate": 3.171094047090059e-06, "loss": 0.3674, "step": 116965 }, { "epoch": 4.2155908746891555, "grad_norm": 0.27745673060417175, "learning_rate": 3.169671779445857e-06, "loss": 0.3951, "step": 116970 }, { "epoch": 4.21577107435038, "grad_norm": 0.1933916211128235, "learning_rate": 3.168249809235646e-06, "loss": 0.3331, "step": 116975 }, { "epoch": 4.215951274011605, "grad_norm": 0.30480435490608215, "learning_rate": 3.1668281364788e-06, "loss": 0.3533, "step": 116980 }, { "epoch": 4.21613147367283, "grad_norm": 0.2264707088470459, "learning_rate": 3.1654067611946974e-06, "loss": 0.3712, "step": 116985 }, { "epoch": 4.216311673334054, 
"grad_norm": 0.25249165296554565, "learning_rate": 3.1639856834027036e-06, "loss": 0.4058, "step": 116990 }, { "epoch": 4.216491872995279, "grad_norm": 0.22679844498634338, "learning_rate": 3.1625649031221725e-06, "loss": 0.3795, "step": 116995 }, { "epoch": 4.216672072656503, "grad_norm": 0.29318711161613464, "learning_rate": 3.1611444203724695e-06, "loss": 0.3723, "step": 117000 }, { "epoch": 4.216672072656503, "eval_loss": 0.4291633367538452, "eval_runtime": 3.5298, "eval_samples_per_second": 28.33, "eval_steps_per_second": 7.083, "step": 117000 }, { "epoch": 4.216852272317728, "grad_norm": 0.26223427057266235, "learning_rate": 3.1597242351729425e-06, "loss": 0.3777, "step": 117005 }, { "epoch": 4.2170324719789525, "grad_norm": 0.24939653277397156, "learning_rate": 3.1583043475429453e-06, "loss": 0.3564, "step": 117010 }, { "epoch": 4.217212671640177, "grad_norm": 0.27071085572242737, "learning_rate": 3.156884757501824e-06, "loss": 0.3712, "step": 117015 }, { "epoch": 4.217392871301402, "grad_norm": 0.2698659598827362, "learning_rate": 3.1554654650689215e-06, "loss": 0.394, "step": 117020 }, { "epoch": 4.217573070962627, "grad_norm": 0.28294146060943604, "learning_rate": 3.154046470263569e-06, "loss": 0.3858, "step": 117025 }, { "epoch": 4.217753270623851, "grad_norm": 0.28658536076545715, "learning_rate": 3.1526277731051074e-06, "loss": 0.3811, "step": 117030 }, { "epoch": 4.217933470285076, "grad_norm": 0.22426393628120422, "learning_rate": 3.1512093736128538e-06, "loss": 0.3315, "step": 117035 }, { "epoch": 4.218113669946301, "grad_norm": 0.22765573859214783, "learning_rate": 3.149791271806152e-06, "loss": 0.3562, "step": 117040 }, { "epoch": 4.218293869607525, "grad_norm": 0.2529604434967041, "learning_rate": 3.1483734677043102e-06, "loss": 0.3777, "step": 117045 }, { "epoch": 4.2184740692687495, "grad_norm": 0.21222354471683502, "learning_rate": 3.14695596132665e-06, "loss": 0.3615, "step": 117050 }, { "epoch": 4.218654268929974, "grad_norm": 
0.28866201639175415, "learning_rate": 3.145538752692484e-06, "loss": 0.3926, "step": 117055 }, { "epoch": 4.218834468591199, "grad_norm": 0.27018246054649353, "learning_rate": 3.1441218418211155e-06, "loss": 0.3999, "step": 117060 }, { "epoch": 4.219014668252424, "grad_norm": 0.2012593001127243, "learning_rate": 3.142705228731868e-06, "loss": 0.3466, "step": 117065 }, { "epoch": 4.219194867913648, "grad_norm": 0.33578363060951233, "learning_rate": 3.141288913444024e-06, "loss": 0.3932, "step": 117070 }, { "epoch": 4.219375067574873, "grad_norm": 0.24772273004055023, "learning_rate": 3.139872895976881e-06, "loss": 0.3928, "step": 117075 }, { "epoch": 4.219555267236098, "grad_norm": 0.2516150176525116, "learning_rate": 3.1384571763497435e-06, "loss": 0.3273, "step": 117080 }, { "epoch": 4.219735466897323, "grad_norm": 0.3026273250579834, "learning_rate": 3.1370417545818933e-06, "loss": 0.388, "step": 117085 }, { "epoch": 4.219915666558546, "grad_norm": 0.2576843202114105, "learning_rate": 3.1356266306926175e-06, "loss": 0.3755, "step": 117090 }, { "epoch": 4.220095866219771, "grad_norm": 0.24807590246200562, "learning_rate": 3.1342118047011955e-06, "loss": 0.3744, "step": 117095 }, { "epoch": 4.220276065880996, "grad_norm": 0.25404757261276245, "learning_rate": 3.1327972766268977e-06, "loss": 0.3697, "step": 117100 }, { "epoch": 4.220456265542221, "grad_norm": 0.22809305787086487, "learning_rate": 3.1313830464890085e-06, "loss": 0.3502, "step": 117105 }, { "epoch": 4.220636465203445, "grad_norm": 0.2612314224243164, "learning_rate": 3.1299691143067937e-06, "loss": 0.3539, "step": 117110 }, { "epoch": 4.22081666486467, "grad_norm": 0.3267393708229065, "learning_rate": 3.1285554800995127e-06, "loss": 0.389, "step": 117115 }, { "epoch": 4.220996864525895, "grad_norm": 0.23879055678844452, "learning_rate": 3.127142143886433e-06, "loss": 0.4019, "step": 117120 }, { "epoch": 4.22117706418712, "grad_norm": 0.27509593963623047, "learning_rate": 3.1257291056867976e-06, 
"loss": 0.377, "step": 117125 }, { "epoch": 4.221357263848344, "grad_norm": 0.3244267702102661, "learning_rate": 3.124316365519875e-06, "loss": 0.3989, "step": 117130 }, { "epoch": 4.221537463509568, "grad_norm": 0.2525143623352051, "learning_rate": 3.122903923404913e-06, "loss": 0.3723, "step": 117135 }, { "epoch": 4.221717663170793, "grad_norm": 0.2540621757507324, "learning_rate": 3.1214917793611355e-06, "loss": 0.3725, "step": 117140 }, { "epoch": 4.221897862832018, "grad_norm": 0.2635304033756256, "learning_rate": 3.1200799334078044e-06, "loss": 0.346, "step": 117145 }, { "epoch": 4.222078062493242, "grad_norm": 0.24621783196926117, "learning_rate": 3.118668385564144e-06, "loss": 0.3364, "step": 117150 }, { "epoch": 4.222258262154467, "grad_norm": 0.2203269898891449, "learning_rate": 3.117257135849394e-06, "loss": 0.3385, "step": 117155 }, { "epoch": 4.222438461815692, "grad_norm": 0.2397884875535965, "learning_rate": 3.1158461842827833e-06, "loss": 0.3664, "step": 117160 }, { "epoch": 4.222618661476917, "grad_norm": 0.2857881188392639, "learning_rate": 3.114435530883522e-06, "loss": 0.3521, "step": 117165 }, { "epoch": 4.222798861138141, "grad_norm": 0.25649380683898926, "learning_rate": 3.113025175670847e-06, "loss": 0.3339, "step": 117170 }, { "epoch": 4.222979060799366, "grad_norm": 0.30346161127090454, "learning_rate": 3.111615118663963e-06, "loss": 0.3665, "step": 117175 }, { "epoch": 4.223159260460591, "grad_norm": 0.29739436507225037, "learning_rate": 3.1102053598820874e-06, "loss": 0.3754, "step": 117180 }, { "epoch": 4.223339460121815, "grad_norm": 0.23481184244155884, "learning_rate": 3.108795899344424e-06, "loss": 0.3601, "step": 117185 }, { "epoch": 4.223519659783039, "grad_norm": 0.2552500069141388, "learning_rate": 3.107386737070175e-06, "loss": 0.4017, "step": 117190 }, { "epoch": 4.223699859444264, "grad_norm": 0.29149770736694336, "learning_rate": 3.105977873078547e-06, "loss": 0.3966, "step": 117195 }, { "epoch": 4.223880059105489, 
"grad_norm": 0.28181055188179016, "learning_rate": 3.1045693073887327e-06, "loss": 0.4144, "step": 117200 }, { "epoch": 4.2240602587667135, "grad_norm": 0.25641298294067383, "learning_rate": 3.1031610400199225e-06, "loss": 0.3707, "step": 117205 }, { "epoch": 4.224240458427938, "grad_norm": 0.20379073917865753, "learning_rate": 3.1017530709913033e-06, "loss": 0.3584, "step": 117210 }, { "epoch": 4.224420658089163, "grad_norm": 0.2511664927005768, "learning_rate": 3.1003454003220577e-06, "loss": 0.374, "step": 117215 }, { "epoch": 4.224600857750388, "grad_norm": 0.21009521186351776, "learning_rate": 3.098938028031362e-06, "loss": 0.3763, "step": 117220 }, { "epoch": 4.2247810574116125, "grad_norm": 0.21885792911052704, "learning_rate": 3.0975309541384e-06, "loss": 0.3636, "step": 117225 }, { "epoch": 4.224961257072836, "grad_norm": 0.2734248638153076, "learning_rate": 3.096124178662338e-06, "loss": 0.3794, "step": 117230 }, { "epoch": 4.225141456734061, "grad_norm": 0.24762408435344696, "learning_rate": 3.094717701622346e-06, "loss": 0.3325, "step": 117235 }, { "epoch": 4.225321656395286, "grad_norm": 0.3509252071380615, "learning_rate": 3.0933115230375812e-06, "loss": 0.3739, "step": 117240 }, { "epoch": 4.2255018560565105, "grad_norm": 0.23616372048854828, "learning_rate": 3.0919056429272037e-06, "loss": 0.3129, "step": 117245 }, { "epoch": 4.225682055717735, "grad_norm": 0.33164361119270325, "learning_rate": 3.090500061310378e-06, "loss": 0.3645, "step": 117250 }, { "epoch": 4.22586225537896, "grad_norm": 0.2735852003097534, "learning_rate": 3.0890947782062416e-06, "loss": 0.3664, "step": 117255 }, { "epoch": 4.226042455040185, "grad_norm": 0.22918978333473206, "learning_rate": 3.087689793633944e-06, "loss": 0.3726, "step": 117260 }, { "epoch": 4.2262226547014095, "grad_norm": 0.2797573506832123, "learning_rate": 3.0862851076126325e-06, "loss": 0.3745, "step": 117265 }, { "epoch": 4.226402854362634, "grad_norm": 0.26490533351898193, "learning_rate": 
3.084880720161448e-06, "loss": 0.3739, "step": 117270 }, { "epoch": 4.226583054023858, "grad_norm": 0.2413850724697113, "learning_rate": 3.0834766312995174e-06, "loss": 0.3652, "step": 117275 }, { "epoch": 4.226763253685083, "grad_norm": 0.2586573660373688, "learning_rate": 3.082072841045977e-06, "loss": 0.3765, "step": 117280 }, { "epoch": 4.2269434533463075, "grad_norm": 0.2556487023830414, "learning_rate": 3.080669349419943e-06, "loss": 0.4075, "step": 117285 }, { "epoch": 4.227123653007532, "grad_norm": 0.2673538029193878, "learning_rate": 3.0792661564405524e-06, "loss": 0.4263, "step": 117290 }, { "epoch": 4.227303852668757, "grad_norm": 0.2932944893836975, "learning_rate": 3.077863262126915e-06, "loss": 0.3598, "step": 117295 }, { "epoch": 4.227484052329982, "grad_norm": 0.22899527847766876, "learning_rate": 3.076460666498146e-06, "loss": 0.3761, "step": 117300 }, { "epoch": 4.227664251991206, "grad_norm": 0.26272207498550415, "learning_rate": 3.075058369573358e-06, "loss": 0.3494, "step": 117305 }, { "epoch": 4.227844451652431, "grad_norm": 0.28200453519821167, "learning_rate": 3.07365637137165e-06, "loss": 0.3753, "step": 117310 }, { "epoch": 4.228024651313656, "grad_norm": 0.2724056839942932, "learning_rate": 3.072254671912131e-06, "loss": 0.3659, "step": 117315 }, { "epoch": 4.22820485097488, "grad_norm": 0.19723065197467804, "learning_rate": 3.0708532712139e-06, "loss": 0.3401, "step": 117320 }, { "epoch": 4.2283850506361045, "grad_norm": 0.23599091172218323, "learning_rate": 3.0694521692960444e-06, "loss": 0.4004, "step": 117325 }, { "epoch": 4.228565250297329, "grad_norm": 0.2825251817703247, "learning_rate": 3.06805136617766e-06, "loss": 0.3491, "step": 117330 }, { "epoch": 4.228745449958554, "grad_norm": 0.2366117537021637, "learning_rate": 3.066650861877823e-06, "loss": 0.3718, "step": 117335 }, { "epoch": 4.228925649619779, "grad_norm": 0.252418577671051, "learning_rate": 3.0652506564156275e-06, "loss": 0.4073, "step": 117340 }, { "epoch": 
4.229105849281003, "grad_norm": 0.25921630859375, "learning_rate": 3.063850749810149e-06, "loss": 0.4102, "step": 117345 }, { "epoch": 4.229286048942228, "grad_norm": 0.3047523498535156, "learning_rate": 3.062451142080447e-06, "loss": 0.3875, "step": 117350 }, { "epoch": 4.229466248603453, "grad_norm": 0.2350996434688568, "learning_rate": 3.061051833245607e-06, "loss": 0.3618, "step": 117355 }, { "epoch": 4.229646448264678, "grad_norm": 0.19591708481311798, "learning_rate": 3.059652823324688e-06, "loss": 0.3718, "step": 117360 }, { "epoch": 4.229826647925902, "grad_norm": 0.26747336983680725, "learning_rate": 3.058254112336753e-06, "loss": 0.378, "step": 117365 }, { "epoch": 4.230006847587126, "grad_norm": 0.25700604915618896, "learning_rate": 3.0568557003008535e-06, "loss": 0.3677, "step": 117370 }, { "epoch": 4.230187047248351, "grad_norm": 0.21863490343093872, "learning_rate": 3.0554575872360463e-06, "loss": 0.4036, "step": 117375 }, { "epoch": 4.230367246909576, "grad_norm": 0.2407938539981842, "learning_rate": 3.054059773161383e-06, "loss": 0.3465, "step": 117380 }, { "epoch": 4.2305474465708, "grad_norm": 0.27205365896224976, "learning_rate": 3.0526622580959063e-06, "loss": 0.3897, "step": 117385 }, { "epoch": 4.230727646232025, "grad_norm": 0.21772317588329315, "learning_rate": 3.0512650420586595e-06, "loss": 0.3788, "step": 117390 }, { "epoch": 4.23090784589325, "grad_norm": 0.24121488630771637, "learning_rate": 3.049868125068675e-06, "loss": 0.3711, "step": 117395 }, { "epoch": 4.231088045554475, "grad_norm": 0.22327007353305817, "learning_rate": 3.04847150714499e-06, "loss": 0.3705, "step": 117400 }, { "epoch": 4.231268245215699, "grad_norm": 0.21851101517677307, "learning_rate": 3.0470751883066223e-06, "loss": 0.3382, "step": 117405 }, { "epoch": 4.231448444876923, "grad_norm": 0.3136238753795624, "learning_rate": 3.0456791685726126e-06, "loss": 0.3754, "step": 117410 }, { "epoch": 4.231628644538148, "grad_norm": 0.24977144598960876, "learning_rate": 
3.0442834479619707e-06, "loss": 0.3548, "step": 117415 }, { "epoch": 4.231808844199373, "grad_norm": 0.24291959404945374, "learning_rate": 3.0428880264937204e-06, "loss": 0.3734, "step": 117420 }, { "epoch": 4.231989043860597, "grad_norm": 0.2255113422870636, "learning_rate": 3.0414929041868657e-06, "loss": 0.3476, "step": 117425 }, { "epoch": 4.232169243521822, "grad_norm": 0.5030208230018616, "learning_rate": 3.0400980810604164e-06, "loss": 0.3713, "step": 117430 }, { "epoch": 4.232349443183047, "grad_norm": 0.22084102034568787, "learning_rate": 3.0387035571333906e-06, "loss": 0.365, "step": 117435 }, { "epoch": 4.232529642844272, "grad_norm": 0.20808632671833038, "learning_rate": 3.0373093324247676e-06, "loss": 0.3707, "step": 117440 }, { "epoch": 4.232709842505496, "grad_norm": 0.2130783647298813, "learning_rate": 3.035915406953552e-06, "loss": 0.3532, "step": 117445 }, { "epoch": 4.232890042166721, "grad_norm": 0.205308198928833, "learning_rate": 3.0345217807387387e-06, "loss": 0.35, "step": 117450 }, { "epoch": 4.233070241827946, "grad_norm": 0.2538002133369446, "learning_rate": 3.033128453799308e-06, "loss": 0.3864, "step": 117455 }, { "epoch": 4.23325044148917, "grad_norm": 0.27659669518470764, "learning_rate": 3.0317354261542613e-06, "loss": 0.3974, "step": 117460 }, { "epoch": 4.233430641150394, "grad_norm": 0.2688102722167969, "learning_rate": 3.0303426978225607e-06, "loss": 0.3609, "step": 117465 }, { "epoch": 4.233610840811619, "grad_norm": 0.25696924328804016, "learning_rate": 3.028950268823183e-06, "loss": 0.3414, "step": 117470 }, { "epoch": 4.233791040472844, "grad_norm": 0.19637776911258698, "learning_rate": 3.0275581391751075e-06, "loss": 0.3418, "step": 117475 }, { "epoch": 4.233971240134069, "grad_norm": 0.25194182991981506, "learning_rate": 3.026166308897299e-06, "loss": 0.3505, "step": 117480 }, { "epoch": 4.234151439795293, "grad_norm": 0.3067736029624939, "learning_rate": 3.024774778008721e-06, "loss": 0.3806, "step": 117485 }, { "epoch": 
4.234331639456518, "grad_norm": 0.2704439163208008, "learning_rate": 3.023383546528333e-06, "loss": 0.4056, "step": 117490 }, { "epoch": 4.234511839117743, "grad_norm": 0.23782427608966827, "learning_rate": 3.021992614475083e-06, "loss": 0.3591, "step": 117495 }, { "epoch": 4.2346920387789675, "grad_norm": 0.21595478057861328, "learning_rate": 3.020601981867932e-06, "loss": 0.3604, "step": 117500 }, { "epoch": 4.2346920387789675, "eval_loss": 0.42911702394485474, "eval_runtime": 3.5344, "eval_samples_per_second": 28.293, "eval_steps_per_second": 7.073, "step": 117500 }, { "epoch": 4.234872238440191, "grad_norm": 0.2689270079135895, "learning_rate": 3.0192116487258252e-06, "loss": 0.378, "step": 117505 }, { "epoch": 4.235052438101416, "grad_norm": 0.24561098217964172, "learning_rate": 3.017821615067706e-06, "loss": 0.349, "step": 117510 }, { "epoch": 4.235232637762641, "grad_norm": 0.2601664364337921, "learning_rate": 3.01643188091251e-06, "loss": 0.3717, "step": 117515 }, { "epoch": 4.2354128374238655, "grad_norm": 0.21555201709270477, "learning_rate": 3.0150424462791706e-06, "loss": 0.404, "step": 117520 }, { "epoch": 4.23559303708509, "grad_norm": 0.30406418442726135, "learning_rate": 3.0136533111866206e-06, "loss": 0.3917, "step": 117525 }, { "epoch": 4.235773236746315, "grad_norm": 0.21674159169197083, "learning_rate": 3.012264475653795e-06, "loss": 0.365, "step": 117530 }, { "epoch": 4.23595343640754, "grad_norm": 0.27085331082344055, "learning_rate": 3.0108759396996004e-06, "loss": 0.3447, "step": 117535 }, { "epoch": 4.2361336360687645, "grad_norm": 0.20267651975154877, "learning_rate": 3.0094877033429667e-06, "loss": 0.3808, "step": 117540 }, { "epoch": 4.236313835729989, "grad_norm": 0.26923617720603943, "learning_rate": 3.0080997666028095e-06, "loss": 0.3694, "step": 117545 }, { "epoch": 4.236494035391213, "grad_norm": 0.21540358662605286, "learning_rate": 3.0067121294980323e-06, "loss": 0.3543, "step": 117550 }, { "epoch": 4.236674235052438, "grad_norm": 
0.23205356299877167, "learning_rate": 3.0053247920475452e-06, "loss": 0.4063, "step": 117555 }, { "epoch": 4.2368544347136625, "grad_norm": 0.20041492581367493, "learning_rate": 3.0039377542702417e-06, "loss": 0.3609, "step": 117560 }, { "epoch": 4.237034634374887, "grad_norm": 0.3079770803451538, "learning_rate": 3.0025510161850373e-06, "loss": 0.3538, "step": 117565 }, { "epoch": 4.237214834036112, "grad_norm": 0.28639087080955505, "learning_rate": 3.0011645778108137e-06, "loss": 0.3659, "step": 117570 }, { "epoch": 4.237395033697337, "grad_norm": 0.2304767519235611, "learning_rate": 2.9997784391664647e-06, "loss": 0.4097, "step": 117575 }, { "epoch": 4.2375752333585615, "grad_norm": 0.2690269947052002, "learning_rate": 2.9983926002708746e-06, "loss": 0.3809, "step": 117580 }, { "epoch": 4.237755433019786, "grad_norm": 0.26503047347068787, "learning_rate": 2.9970070611429257e-06, "loss": 0.3656, "step": 117585 }, { "epoch": 4.237935632681011, "grad_norm": 0.1937418282032013, "learning_rate": 2.9956218218014893e-06, "loss": 0.3704, "step": 117590 }, { "epoch": 4.238115832342235, "grad_norm": 0.24524284899234772, "learning_rate": 2.9942368822654527e-06, "loss": 0.4087, "step": 117595 }, { "epoch": 4.2382960320034595, "grad_norm": 0.27891620993614197, "learning_rate": 2.9928522425536757e-06, "loss": 0.3533, "step": 117600 }, { "epoch": 4.238476231664684, "grad_norm": 0.2644476890563965, "learning_rate": 2.991467902685027e-06, "loss": 0.3914, "step": 117605 }, { "epoch": 4.238656431325909, "grad_norm": 0.2886243164539337, "learning_rate": 2.9900838626783693e-06, "loss": 0.3735, "step": 117610 }, { "epoch": 4.238836630987134, "grad_norm": 0.2373829036951065, "learning_rate": 2.9887001225525514e-06, "loss": 0.3564, "step": 117615 }, { "epoch": 4.239016830648358, "grad_norm": 0.22608888149261475, "learning_rate": 2.987316682326438e-06, "loss": 0.3711, "step": 117620 }, { "epoch": 4.239197030309583, "grad_norm": 0.24686965346336365, "learning_rate": 
2.985933542018876e-06, "loss": 0.3849, "step": 117625 }, { "epoch": 4.239377229970808, "grad_norm": 0.22954131662845612, "learning_rate": 2.9845507016487e-06, "loss": 0.3794, "step": 117630 }, { "epoch": 4.239557429632033, "grad_norm": 0.2016417235136032, "learning_rate": 2.9831681612347616e-06, "loss": 0.3694, "step": 117635 }, { "epoch": 4.239737629293257, "grad_norm": 0.21057425439357758, "learning_rate": 2.9817859207958905e-06, "loss": 0.3543, "step": 117640 }, { "epoch": 4.239917828954481, "grad_norm": 0.23975323140621185, "learning_rate": 2.980403980350932e-06, "loss": 0.3292, "step": 117645 }, { "epoch": 4.240098028615706, "grad_norm": 0.23370075225830078, "learning_rate": 2.979022339918702e-06, "loss": 0.3572, "step": 117650 }, { "epoch": 4.240278228276931, "grad_norm": 0.22406595945358276, "learning_rate": 2.977640999518025e-06, "loss": 0.3527, "step": 117655 }, { "epoch": 4.240458427938155, "grad_norm": 0.27608826756477356, "learning_rate": 2.976259959167732e-06, "loss": 0.3598, "step": 117660 }, { "epoch": 4.24063862759938, "grad_norm": 0.286156564950943, "learning_rate": 2.9748792188866308e-06, "loss": 0.3439, "step": 117665 }, { "epoch": 4.240818827260605, "grad_norm": 0.3072472810745239, "learning_rate": 2.973498778693537e-06, "loss": 0.3777, "step": 117670 }, { "epoch": 4.24099902692183, "grad_norm": 0.2714073956012726, "learning_rate": 2.9721186386072573e-06, "loss": 0.3941, "step": 117675 }, { "epoch": 4.241179226583054, "grad_norm": 0.22121752798557281, "learning_rate": 2.970738798646591e-06, "loss": 0.354, "step": 117680 }, { "epoch": 4.241359426244279, "grad_norm": 0.2743302285671234, "learning_rate": 2.9693592588303482e-06, "loss": 0.3862, "step": 117685 }, { "epoch": 4.241539625905503, "grad_norm": 0.2942706048488617, "learning_rate": 2.9679800191773215e-06, "loss": 0.388, "step": 117690 }, { "epoch": 4.241719825566728, "grad_norm": 0.26952454447746277, "learning_rate": 2.966601079706302e-06, "loss": 0.381, "step": 117695 }, { "epoch": 
4.241900025227952, "grad_norm": 0.17778491973876953, "learning_rate": 2.9652224404360747e-06, "loss": 0.3192, "step": 117700 }, { "epoch": 4.242080224889177, "grad_norm": 0.21804936230182648, "learning_rate": 2.9638441013854268e-06, "loss": 0.3632, "step": 117705 }, { "epoch": 4.242260424550402, "grad_norm": 0.25280699133872986, "learning_rate": 2.9624660625731325e-06, "loss": 0.3887, "step": 117710 }, { "epoch": 4.242440624211627, "grad_norm": 0.2739708721637726, "learning_rate": 2.96108832401798e-06, "loss": 0.3653, "step": 117715 }, { "epoch": 4.242620823872851, "grad_norm": 0.19621554017066956, "learning_rate": 2.959710885738723e-06, "loss": 0.3652, "step": 117720 }, { "epoch": 4.242801023534076, "grad_norm": 0.25483089685440063, "learning_rate": 2.9583337477541437e-06, "loss": 0.3788, "step": 117725 }, { "epoch": 4.242981223195301, "grad_norm": 0.25373703241348267, "learning_rate": 2.9569569100830003e-06, "loss": 0.3526, "step": 117730 }, { "epoch": 4.243161422856525, "grad_norm": 0.29680660367012024, "learning_rate": 2.9555803727440436e-06, "loss": 0.4243, "step": 117735 }, { "epoch": 4.243341622517749, "grad_norm": 0.24780967831611633, "learning_rate": 2.95420413575605e-06, "loss": 0.3667, "step": 117740 }, { "epoch": 4.243521822178974, "grad_norm": 0.22331710159778595, "learning_rate": 2.9528281991377443e-06, "loss": 0.3457, "step": 117745 }, { "epoch": 4.243702021840199, "grad_norm": 0.20010043680667877, "learning_rate": 2.9514525629078915e-06, "loss": 0.3773, "step": 117750 }, { "epoch": 4.243882221501424, "grad_norm": 0.27777811884880066, "learning_rate": 2.9500772270852268e-06, "loss": 0.3659, "step": 117755 }, { "epoch": 4.244062421162648, "grad_norm": 0.25709882378578186, "learning_rate": 2.948702191688493e-06, "loss": 0.385, "step": 117760 }, { "epoch": 4.244242620823873, "grad_norm": 0.3200331926345825, "learning_rate": 2.9473274567364234e-06, "loss": 0.3751, "step": 117765 }, { "epoch": 4.244422820485098, "grad_norm": 0.1971009075641632, 
"learning_rate": 2.9459530222477464e-06, "loss": 0.3772, "step": 117770 }, { "epoch": 4.2446030201463225, "grad_norm": 0.23797380924224854, "learning_rate": 2.944578888241184e-06, "loss": 0.3566, "step": 117775 }, { "epoch": 4.244783219807546, "grad_norm": 0.2632618844509125, "learning_rate": 2.9432050547354706e-06, "loss": 0.3802, "step": 117780 }, { "epoch": 4.244963419468771, "grad_norm": 0.30606886744499207, "learning_rate": 2.9418315217493165e-06, "loss": 0.3788, "step": 117785 }, { "epoch": 4.245143619129996, "grad_norm": 0.25161272287368774, "learning_rate": 2.94045828930144e-06, "loss": 0.3892, "step": 117790 }, { "epoch": 4.245323818791221, "grad_norm": 0.24037761986255646, "learning_rate": 2.9390853574105455e-06, "loss": 0.3661, "step": 117795 }, { "epoch": 4.245504018452445, "grad_norm": 0.2599053680896759, "learning_rate": 2.9377127260953374e-06, "loss": 0.3867, "step": 117800 }, { "epoch": 4.24568421811367, "grad_norm": 0.2276243269443512, "learning_rate": 2.936340395374529e-06, "loss": 0.4001, "step": 117805 }, { "epoch": 4.245864417774895, "grad_norm": 0.3195003271102905, "learning_rate": 2.934968365266816e-06, "loss": 0.351, "step": 117810 }, { "epoch": 4.2460446174361195, "grad_norm": 0.24188397824764252, "learning_rate": 2.933596635790875e-06, "loss": 0.3847, "step": 117815 }, { "epoch": 4.246224817097344, "grad_norm": 0.23399367928504944, "learning_rate": 2.9322252069654137e-06, "loss": 0.3921, "step": 117820 }, { "epoch": 4.246405016758568, "grad_norm": 0.25146761536598206, "learning_rate": 2.930854078809106e-06, "loss": 0.3672, "step": 117825 }, { "epoch": 4.246585216419793, "grad_norm": 0.27147242426872253, "learning_rate": 2.929483251340648e-06, "loss": 0.4172, "step": 117830 }, { "epoch": 4.2467654160810175, "grad_norm": 0.21517179906368256, "learning_rate": 2.9281127245787047e-06, "loss": 0.3483, "step": 117835 }, { "epoch": 4.246945615742242, "grad_norm": 0.1976982206106186, "learning_rate": 2.9267424985419457e-06, "loss": 0.3928, "step": 
117840 }, { "epoch": 4.247125815403467, "grad_norm": 0.2845209836959839, "learning_rate": 2.925372573249052e-06, "loss": 0.3635, "step": 117845 }, { "epoch": 4.247306015064692, "grad_norm": 0.31492358446121216, "learning_rate": 2.924002948718685e-06, "loss": 0.3828, "step": 117850 }, { "epoch": 4.2474862147259165, "grad_norm": 0.3145192265510559, "learning_rate": 2.922633624969501e-06, "loss": 0.3752, "step": 117855 }, { "epoch": 4.247666414387141, "grad_norm": 0.26329880952835083, "learning_rate": 2.9212646020201602e-06, "loss": 0.3715, "step": 117860 }, { "epoch": 4.247846614048366, "grad_norm": 0.22250141203403473, "learning_rate": 2.9198958798893115e-06, "loss": 0.403, "step": 117865 }, { "epoch": 4.24802681370959, "grad_norm": 0.25044476985931396, "learning_rate": 2.9185274585956102e-06, "loss": 0.368, "step": 117870 }, { "epoch": 4.2482070133708145, "grad_norm": 0.22099538147449493, "learning_rate": 2.9171593381576962e-06, "loss": 0.3449, "step": 117875 }, { "epoch": 4.248387213032039, "grad_norm": 0.26484134793281555, "learning_rate": 2.915791518594213e-06, "loss": 0.3742, "step": 117880 }, { "epoch": 4.248567412693264, "grad_norm": 0.2437736839056015, "learning_rate": 2.914423999923793e-06, "loss": 0.3779, "step": 117885 }, { "epoch": 4.248747612354489, "grad_norm": 0.2136683613061905, "learning_rate": 2.9130567821650717e-06, "loss": 0.3602, "step": 117890 }, { "epoch": 4.2489278120157135, "grad_norm": 0.2902207374572754, "learning_rate": 2.9116898653366697e-06, "loss": 0.3767, "step": 117895 }, { "epoch": 4.249108011676938, "grad_norm": 0.263248473405838, "learning_rate": 2.9103232494572198e-06, "loss": 0.3361, "step": 117900 }, { "epoch": 4.249288211338163, "grad_norm": 0.31815701723098755, "learning_rate": 2.9089569345453425e-06, "loss": 0.3547, "step": 117905 }, { "epoch": 4.249468410999388, "grad_norm": 0.27344661951065063, "learning_rate": 2.907590920619649e-06, "loss": 0.3585, "step": 117910 }, { "epoch": 4.249648610660612, "grad_norm": 
0.25131598114967346, "learning_rate": 2.9062252076987512e-06, "loss": 0.3798, "step": 117915 }, { "epoch": 4.249828810321836, "grad_norm": 0.24895480275154114, "learning_rate": 2.9048597958012515e-06, "loss": 0.3788, "step": 117920 }, { "epoch": 4.250009009983061, "grad_norm": 0.2820626497268677, "learning_rate": 2.9034946849457707e-06, "loss": 0.3905, "step": 117925 }, { "epoch": 4.250189209644286, "grad_norm": 0.24140560626983643, "learning_rate": 2.9021298751508856e-06, "loss": 0.3902, "step": 117930 }, { "epoch": 4.25036940930551, "grad_norm": 0.19780555367469788, "learning_rate": 2.9007653664352095e-06, "loss": 0.3747, "step": 117935 }, { "epoch": 4.250549608966735, "grad_norm": 0.23243926465511322, "learning_rate": 2.899401158817325e-06, "loss": 0.3512, "step": 117940 }, { "epoch": 4.25072980862796, "grad_norm": 0.19363261759281158, "learning_rate": 2.898037252315822e-06, "loss": 0.3274, "step": 117945 }, { "epoch": 4.250910008289185, "grad_norm": 0.22952446341514587, "learning_rate": 2.8966736469492826e-06, "loss": 0.3882, "step": 117950 }, { "epoch": 4.251090207950409, "grad_norm": 0.2463994026184082, "learning_rate": 2.8953103427362847e-06, "loss": 0.3941, "step": 117955 }, { "epoch": 4.251270407611634, "grad_norm": 0.2101328819990158, "learning_rate": 2.893947339695399e-06, "loss": 0.3343, "step": 117960 }, { "epoch": 4.251450607272858, "grad_norm": 0.22500717639923096, "learning_rate": 2.8925846378452056e-06, "loss": 0.3845, "step": 117965 }, { "epoch": 4.251630806934083, "grad_norm": 0.2579551935195923, "learning_rate": 2.8912222372042663e-06, "loss": 0.3704, "step": 117970 }, { "epoch": 4.251811006595307, "grad_norm": 0.28678232431411743, "learning_rate": 2.889860137791145e-06, "loss": 0.3729, "step": 117975 }, { "epoch": 4.251991206256532, "grad_norm": 0.24297846853733063, "learning_rate": 2.8884983396243988e-06, "loss": 0.3957, "step": 117980 }, { "epoch": 4.252171405917757, "grad_norm": 0.2564467191696167, "learning_rate": 2.8871368427225736e-06, 
"loss": 0.3645, "step": 117985 }, { "epoch": 4.252351605578982, "grad_norm": 0.21953606605529785, "learning_rate": 2.8857756471042357e-06, "loss": 0.3874, "step": 117990 }, { "epoch": 4.252531805240206, "grad_norm": 0.249432772397995, "learning_rate": 2.8844147527879283e-06, "loss": 0.4071, "step": 117995 }, { "epoch": 4.252712004901431, "grad_norm": 0.2628020942211151, "learning_rate": 2.8830541597921755e-06, "loss": 0.3869, "step": 118000 }, { "epoch": 4.252712004901431, "eval_loss": 0.42919114232063293, "eval_runtime": 3.5338, "eval_samples_per_second": 28.298, "eval_steps_per_second": 7.075, "step": 118000 }, { "epoch": 4.252892204562656, "grad_norm": 0.2826399505138397, "learning_rate": 2.881693868135535e-06, "loss": 0.3887, "step": 118005 }, { "epoch": 4.25307240422388, "grad_norm": 0.24618491530418396, "learning_rate": 2.8803338778365273e-06, "loss": 0.3729, "step": 118010 }, { "epoch": 4.253252603885104, "grad_norm": 0.2526216208934784, "learning_rate": 2.8789741889136994e-06, "loss": 0.4333, "step": 118015 }, { "epoch": 4.253432803546329, "grad_norm": 0.3000968098640442, "learning_rate": 2.877614801385559e-06, "loss": 0.3544, "step": 118020 }, { "epoch": 4.253613003207554, "grad_norm": 0.23581233620643616, "learning_rate": 2.8762557152706286e-06, "loss": 0.3761, "step": 118025 }, { "epoch": 4.253793202868779, "grad_norm": 0.2421904057264328, "learning_rate": 2.8748969305874367e-06, "loss": 0.3642, "step": 118030 }, { "epoch": 4.253973402530003, "grad_norm": 0.20584893226623535, "learning_rate": 2.87353844735449e-06, "loss": 0.3414, "step": 118035 }, { "epoch": 4.254153602191228, "grad_norm": 0.23724088072776794, "learning_rate": 2.872180265590299e-06, "loss": 0.3598, "step": 118040 }, { "epoch": 4.254333801852453, "grad_norm": 0.23216982185840607, "learning_rate": 2.870822385313368e-06, "loss": 0.3497, "step": 118045 }, { "epoch": 4.2545140015136775, "grad_norm": 0.27174580097198486, "learning_rate": 2.8694648065421907e-06, "loss": 0.3824, "step": 118050 
}, { "epoch": 4.254694201174901, "grad_norm": 0.31109705567359924, "learning_rate": 2.8681075292952776e-06, "loss": 0.3833, "step": 118055 }, { "epoch": 4.254874400836126, "grad_norm": 0.2299165278673172, "learning_rate": 2.866750553591116e-06, "loss": 0.3445, "step": 118060 }, { "epoch": 4.255054600497351, "grad_norm": 0.30145031213760376, "learning_rate": 2.865393879448189e-06, "loss": 0.3773, "step": 118065 }, { "epoch": 4.255234800158576, "grad_norm": 0.22781316936016083, "learning_rate": 2.8640375068849867e-06, "loss": 0.3582, "step": 118070 }, { "epoch": 4.2554149998198, "grad_norm": 0.27182450890541077, "learning_rate": 2.8626814359199894e-06, "loss": 0.3855, "step": 118075 }, { "epoch": 4.255595199481025, "grad_norm": 0.2907634377479553, "learning_rate": 2.8613256665716653e-06, "loss": 0.37, "step": 118080 }, { "epoch": 4.25577539914225, "grad_norm": 0.2272307574748993, "learning_rate": 2.859970198858497e-06, "loss": 0.3706, "step": 118085 }, { "epoch": 4.2559555988034745, "grad_norm": 0.20825766026973724, "learning_rate": 2.85861503279895e-06, "loss": 0.3394, "step": 118090 }, { "epoch": 4.256135798464699, "grad_norm": 0.21996937692165375, "learning_rate": 2.8572601684114853e-06, "loss": 0.3639, "step": 118095 }, { "epoch": 4.256315998125924, "grad_norm": 0.23290881514549255, "learning_rate": 2.855905605714565e-06, "loss": 0.3777, "step": 118100 }, { "epoch": 4.256496197787148, "grad_norm": 0.21647751331329346, "learning_rate": 2.854551344726636e-06, "loss": 0.3746, "step": 118105 }, { "epoch": 4.256676397448373, "grad_norm": 0.2545035183429718, "learning_rate": 2.853197385466169e-06, "loss": 0.3716, "step": 118110 }, { "epoch": 4.256856597109597, "grad_norm": 0.20453687012195587, "learning_rate": 2.851843727951595e-06, "loss": 0.3852, "step": 118115 }, { "epoch": 4.257036796770822, "grad_norm": 0.24812854826450348, "learning_rate": 2.8504903722013587e-06, "loss": 0.3759, "step": 118120 }, { "epoch": 4.257216996432047, "grad_norm": 0.29127225279808044, 
"learning_rate": 2.8491373182339048e-06, "loss": 0.3276, "step": 118125 }, { "epoch": 4.2573971960932715, "grad_norm": 0.2810105085372925, "learning_rate": 2.8477845660676684e-06, "loss": 0.4026, "step": 118130 }, { "epoch": 4.257577395754496, "grad_norm": 0.29022449254989624, "learning_rate": 2.846432115721079e-06, "loss": 0.3805, "step": 118135 }, { "epoch": 4.257757595415721, "grad_norm": 0.25384798645973206, "learning_rate": 2.8450799672125607e-06, "loss": 0.361, "step": 118140 }, { "epoch": 4.257937795076945, "grad_norm": 0.23675154149532318, "learning_rate": 2.843728120560535e-06, "loss": 0.3651, "step": 118145 }, { "epoch": 4.2581179947381695, "grad_norm": 0.20214100182056427, "learning_rate": 2.8423765757834287e-06, "loss": 0.3901, "step": 118150 }, { "epoch": 4.258298194399394, "grad_norm": 0.2030409872531891, "learning_rate": 2.8410253328996523e-06, "loss": 0.3818, "step": 118155 }, { "epoch": 4.258478394060619, "grad_norm": 0.2594359815120697, "learning_rate": 2.839674391927613e-06, "loss": 0.3604, "step": 118160 }, { "epoch": 4.258658593721844, "grad_norm": 0.2291530966758728, "learning_rate": 2.8383237528857243e-06, "loss": 0.3353, "step": 118165 }, { "epoch": 4.2588387933830685, "grad_norm": 0.23758092522621155, "learning_rate": 2.836973415792374e-06, "loss": 0.3764, "step": 118170 }, { "epoch": 4.259018993044293, "grad_norm": 0.27188360691070557, "learning_rate": 2.8356233806659777e-06, "loss": 0.3948, "step": 118175 }, { "epoch": 4.259199192705518, "grad_norm": 0.21040531992912292, "learning_rate": 2.834273647524921e-06, "loss": 0.3403, "step": 118180 }, { "epoch": 4.259379392366743, "grad_norm": 0.28796496987342834, "learning_rate": 2.832924216387595e-06, "loss": 0.4118, "step": 118185 }, { "epoch": 4.259559592027967, "grad_norm": 0.23446249961853027, "learning_rate": 2.831575087272387e-06, "loss": 0.3733, "step": 118190 }, { "epoch": 4.259739791689191, "grad_norm": 0.28546759486198425, "learning_rate": 2.8302262601976687e-06, "loss": 0.3581, 
"step": 118195 }, { "epoch": 4.259919991350416, "grad_norm": 0.1949034482240677, "learning_rate": 2.8288777351818287e-06, "loss": 0.3575, "step": 118200 }, { "epoch": 4.260100191011641, "grad_norm": 0.28568407893180847, "learning_rate": 2.827529512243246e-06, "loss": 0.3726, "step": 118205 }, { "epoch": 4.2602803906728655, "grad_norm": 0.24735015630722046, "learning_rate": 2.8261815914002675e-06, "loss": 0.376, "step": 118210 }, { "epoch": 4.26046059033409, "grad_norm": 0.18040810525417328, "learning_rate": 2.8248339726712787e-06, "loss": 0.3756, "step": 118215 }, { "epoch": 4.260640789995315, "grad_norm": 0.20753014087677002, "learning_rate": 2.823486656074634e-06, "loss": 0.3616, "step": 118220 }, { "epoch": 4.26082098965654, "grad_norm": 0.21727077662944794, "learning_rate": 2.8221396416286905e-06, "loss": 0.3591, "step": 118225 }, { "epoch": 4.261001189317764, "grad_norm": 0.24121779203414917, "learning_rate": 2.820792929351798e-06, "loss": 0.3535, "step": 118230 }, { "epoch": 4.261181388978989, "grad_norm": 0.22880592942237854, "learning_rate": 2.819446519262306e-06, "loss": 0.3734, "step": 118235 }, { "epoch": 4.261361588640213, "grad_norm": 0.26039087772369385, "learning_rate": 2.8181004113785632e-06, "loss": 0.3805, "step": 118240 }, { "epoch": 4.261541788301438, "grad_norm": 0.21298900246620178, "learning_rate": 2.81675460571891e-06, "loss": 0.3944, "step": 118245 }, { "epoch": 4.261721987962662, "grad_norm": 0.24916015565395355, "learning_rate": 2.8154091023016805e-06, "loss": 0.3752, "step": 118250 }, { "epoch": 4.261902187623887, "grad_norm": 0.31941652297973633, "learning_rate": 2.8140639011452057e-06, "loss": 0.3874, "step": 118255 }, { "epoch": 4.262082387285112, "grad_norm": 0.2693467140197754, "learning_rate": 2.8127190022678135e-06, "loss": 0.3512, "step": 118260 }, { "epoch": 4.262262586946337, "grad_norm": 0.28551846742630005, "learning_rate": 2.8113744056878277e-06, "loss": 0.3449, "step": 118265 }, { "epoch": 4.262442786607561, "grad_norm": 
0.24838872253894806, "learning_rate": 2.8100301114235726e-06, "loss": 0.354, "step": 118270 }, { "epoch": 4.262622986268786, "grad_norm": 0.20312124490737915, "learning_rate": 2.8086861194933616e-06, "loss": 0.3844, "step": 118275 }, { "epoch": 4.262803185930011, "grad_norm": 0.29347795248031616, "learning_rate": 2.807342429915505e-06, "loss": 0.3986, "step": 118280 }, { "epoch": 4.262983385591235, "grad_norm": 0.23310750722885132, "learning_rate": 2.8059990427083127e-06, "loss": 0.3745, "step": 118285 }, { "epoch": 4.263163585252459, "grad_norm": 0.2755783200263977, "learning_rate": 2.8046559578900815e-06, "loss": 0.3808, "step": 118290 }, { "epoch": 4.263343784913684, "grad_norm": 0.21833378076553345, "learning_rate": 2.8033131754791246e-06, "loss": 0.3937, "step": 118295 }, { "epoch": 4.263523984574909, "grad_norm": 0.2588220536708832, "learning_rate": 2.801970695493722e-06, "loss": 0.372, "step": 118300 }, { "epoch": 4.263704184236134, "grad_norm": 0.23471491038799286, "learning_rate": 2.8006285179521696e-06, "loss": 0.3778, "step": 118305 }, { "epoch": 4.263884383897358, "grad_norm": 0.31038230657577515, "learning_rate": 2.7992866428727587e-06, "loss": 0.3946, "step": 118310 }, { "epoch": 4.264064583558583, "grad_norm": 0.2159845381975174, "learning_rate": 2.7979450702737637e-06, "loss": 0.3621, "step": 118315 }, { "epoch": 4.264244783219808, "grad_norm": 0.1845831722021103, "learning_rate": 2.7966038001734777e-06, "loss": 0.3498, "step": 118320 }, { "epoch": 4.2644249828810326, "grad_norm": 0.21577754616737366, "learning_rate": 2.7952628325901647e-06, "loss": 0.3733, "step": 118325 }, { "epoch": 4.264605182542256, "grad_norm": 0.234993115067482, "learning_rate": 2.793922167542087e-06, "loss": 0.3823, "step": 118330 }, { "epoch": 4.264785382203481, "grad_norm": 0.22045071423053741, "learning_rate": 2.7925818050475284e-06, "loss": 0.3653, "step": 118335 }, { "epoch": 4.264965581864706, "grad_norm": 0.2358977198600769, "learning_rate": 2.7912417451247453e-06, 
"loss": 0.3503, "step": 118340 }, { "epoch": 4.265145781525931, "grad_norm": 0.2254144698381424, "learning_rate": 2.78990198779199e-06, "loss": 0.3497, "step": 118345 }, { "epoch": 4.265325981187155, "grad_norm": 0.2657366693019867, "learning_rate": 2.7885625330675206e-06, "loss": 0.3555, "step": 118350 }, { "epoch": 4.26550618084838, "grad_norm": 0.21268507838249207, "learning_rate": 2.7872233809695837e-06, "loss": 0.366, "step": 118355 }, { "epoch": 4.265686380509605, "grad_norm": 0.2723730504512787, "learning_rate": 2.7858845315164307e-06, "loss": 0.3865, "step": 118360 }, { "epoch": 4.2658665801708295, "grad_norm": 0.1851958930492401, "learning_rate": 2.7845459847263e-06, "loss": 0.3302, "step": 118365 }, { "epoch": 4.266046779832054, "grad_norm": 0.30947184562683105, "learning_rate": 2.78320774061743e-06, "loss": 0.3843, "step": 118370 }, { "epoch": 4.266226979493279, "grad_norm": 0.2673863172531128, "learning_rate": 2.7818697992080536e-06, "loss": 0.3744, "step": 118375 }, { "epoch": 4.266407179154503, "grad_norm": 0.22295500338077545, "learning_rate": 2.780532160516394e-06, "loss": 0.4008, "step": 118380 }, { "epoch": 4.266587378815728, "grad_norm": 0.2774406969547272, "learning_rate": 2.779194824560688e-06, "loss": 0.3602, "step": 118385 }, { "epoch": 4.266767578476952, "grad_norm": 0.3125916123390198, "learning_rate": 2.7778577913591534e-06, "loss": 0.3976, "step": 118390 }, { "epoch": 4.266947778138177, "grad_norm": 0.1951741874217987, "learning_rate": 2.7765210609299958e-06, "loss": 0.3666, "step": 118395 }, { "epoch": 4.267127977799402, "grad_norm": 0.2636171877384186, "learning_rate": 2.775184633291439e-06, "loss": 0.3816, "step": 118400 }, { "epoch": 4.2673081774606265, "grad_norm": 0.25289151072502136, "learning_rate": 2.7738485084616904e-06, "loss": 0.383, "step": 118405 }, { "epoch": 4.267488377121851, "grad_norm": 0.23693720996379852, "learning_rate": 2.7725126864589527e-06, "loss": 0.3504, "step": 118410 }, { "epoch": 4.267668576783076, 
"grad_norm": 0.31816625595092773, "learning_rate": 2.7711771673014253e-06, "loss": 0.3926, "step": 118415 }, { "epoch": 4.2678487764443, "grad_norm": 0.20201784372329712, "learning_rate": 2.7698419510073014e-06, "loss": 0.3648, "step": 118420 }, { "epoch": 4.268028976105525, "grad_norm": 0.28270724415779114, "learning_rate": 2.768507037594781e-06, "loss": 0.3847, "step": 118425 }, { "epoch": 4.268209175766749, "grad_norm": 0.2311401665210724, "learning_rate": 2.7671724270820487e-06, "loss": 0.3913, "step": 118430 }, { "epoch": 4.268389375427974, "grad_norm": 0.2863185405731201, "learning_rate": 2.765838119487288e-06, "loss": 0.4017, "step": 118435 }, { "epoch": 4.268569575089199, "grad_norm": 0.3267385959625244, "learning_rate": 2.764504114828678e-06, "loss": 0.3994, "step": 118440 }, { "epoch": 4.2687497747504235, "grad_norm": 0.350425660610199, "learning_rate": 2.7631704131243967e-06, "loss": 0.3746, "step": 118445 }, { "epoch": 4.268929974411648, "grad_norm": 0.2516806125640869, "learning_rate": 2.761837014392604e-06, "loss": 0.3326, "step": 118450 }, { "epoch": 4.269110174072873, "grad_norm": 0.27395808696746826, "learning_rate": 2.7605039186514858e-06, "loss": 0.3911, "step": 118455 }, { "epoch": 4.269290373734098, "grad_norm": 0.24805009365081787, "learning_rate": 2.7591711259191938e-06, "loss": 0.3596, "step": 118460 }, { "epoch": 4.269470573395322, "grad_norm": 0.23260460793972015, "learning_rate": 2.7578386362138886e-06, "loss": 0.3581, "step": 118465 }, { "epoch": 4.269650773056546, "grad_norm": 0.24661628901958466, "learning_rate": 2.756506449553725e-06, "loss": 0.3633, "step": 118470 }, { "epoch": 4.269830972717771, "grad_norm": 0.25693389773368835, "learning_rate": 2.7551745659568527e-06, "loss": 0.3823, "step": 118475 }, { "epoch": 4.270011172378996, "grad_norm": 0.2382129281759262, "learning_rate": 2.7538429854414238e-06, "loss": 0.3639, "step": 118480 }, { "epoch": 4.2701913720402205, "grad_norm": 0.2255300134420395, "learning_rate": 
2.7525117080255817e-06, "loss": 0.3625, "step": 118485 }, { "epoch": 4.270371571701445, "grad_norm": 0.2332393378019333, "learning_rate": 2.7511807337274513e-06, "loss": 0.3748, "step": 118490 }, { "epoch": 4.27055177136267, "grad_norm": 0.18017563223838806, "learning_rate": 2.749850062565179e-06, "loss": 0.4011, "step": 118495 }, { "epoch": 4.270731971023895, "grad_norm": 0.2834778428077698, "learning_rate": 2.748519694556889e-06, "loss": 0.3585, "step": 118500 }, { "epoch": 4.270731971023895, "eval_loss": 0.42914897203445435, "eval_runtime": 3.5317, "eval_samples_per_second": 28.315, "eval_steps_per_second": 7.079, "step": 118500 }, { "epoch": 4.270912170685119, "grad_norm": 0.2678777575492859, "learning_rate": 2.747189629720717e-06, "loss": 0.4119, "step": 118505 }, { "epoch": 4.271092370346344, "grad_norm": 0.23227082192897797, "learning_rate": 2.7458598680747764e-06, "loss": 0.3705, "step": 118510 }, { "epoch": 4.271272570007568, "grad_norm": 0.26510533690452576, "learning_rate": 2.7445304096371803e-06, "loss": 0.3262, "step": 118515 }, { "epoch": 4.271452769668793, "grad_norm": 0.22343812882900238, "learning_rate": 2.7432012544260533e-06, "loss": 0.3612, "step": 118520 }, { "epoch": 4.2716329693300175, "grad_norm": 0.2528297007083893, "learning_rate": 2.7418724024595e-06, "loss": 0.3779, "step": 118525 }, { "epoch": 4.271813168991242, "grad_norm": 0.28484654426574707, "learning_rate": 2.7405438537556284e-06, "loss": 0.3164, "step": 118530 }, { "epoch": 4.271993368652467, "grad_norm": 0.2651745676994324, "learning_rate": 2.739215608332535e-06, "loss": 0.4037, "step": 118535 }, { "epoch": 4.272173568313692, "grad_norm": 0.2041269987821579, "learning_rate": 2.737887666208314e-06, "loss": 0.3386, "step": 118540 }, { "epoch": 4.272353767974916, "grad_norm": 0.22839802503585815, "learning_rate": 2.7365600274010694e-06, "loss": 0.3724, "step": 118545 }, { "epoch": 4.272533967636141, "grad_norm": 0.2205089032649994, "learning_rate": 2.7352326919288824e-06, "loss": 
0.3454, "step": 118550 }, { "epoch": 4.272714167297366, "grad_norm": 0.25789710879325867, "learning_rate": 2.7339056598098407e-06, "loss": 0.3494, "step": 118555 }, { "epoch": 4.27289436695859, "grad_norm": 0.3210128843784332, "learning_rate": 2.7325789310620212e-06, "loss": 0.3931, "step": 118560 }, { "epoch": 4.273074566619814, "grad_norm": 0.24505449831485748, "learning_rate": 2.731252505703502e-06, "loss": 0.3628, "step": 118565 }, { "epoch": 4.273254766281039, "grad_norm": 0.2696022689342499, "learning_rate": 2.729926383752357e-06, "loss": 0.3555, "step": 118570 }, { "epoch": 4.273434965942264, "grad_norm": 0.23730942606925964, "learning_rate": 2.7286005652266604e-06, "loss": 0.4053, "step": 118575 }, { "epoch": 4.273615165603489, "grad_norm": 0.2668841779232025, "learning_rate": 2.7272750501444594e-06, "loss": 0.38, "step": 118580 }, { "epoch": 4.273795365264713, "grad_norm": 0.23594842851161957, "learning_rate": 2.7259498385238282e-06, "loss": 0.3971, "step": 118585 }, { "epoch": 4.273975564925938, "grad_norm": 0.2008124589920044, "learning_rate": 2.724624930382819e-06, "loss": 0.3583, "step": 118590 }, { "epoch": 4.274155764587163, "grad_norm": 0.21719537675380707, "learning_rate": 2.723300325739475e-06, "loss": 0.3669, "step": 118595 }, { "epoch": 4.274335964248388, "grad_norm": 0.25217390060424805, "learning_rate": 2.7219760246118638e-06, "loss": 0.3523, "step": 118600 }, { "epoch": 4.274516163909611, "grad_norm": 0.19717147946357727, "learning_rate": 2.720652027018006e-06, "loss": 0.3957, "step": 118605 }, { "epoch": 4.274696363570836, "grad_norm": 0.2760673761367798, "learning_rate": 2.7193283329759537e-06, "loss": 0.3723, "step": 118610 }, { "epoch": 4.274876563232061, "grad_norm": 0.2848314344882965, "learning_rate": 2.7180049425037407e-06, "loss": 0.3912, "step": 118615 }, { "epoch": 4.275056762893286, "grad_norm": 0.30649277567863464, "learning_rate": 2.7166818556193963e-06, "loss": 0.396, "step": 118620 }, { "epoch": 4.27523696255451, "grad_norm": 
0.31178387999534607, "learning_rate": 2.7153590723409478e-06, "loss": 0.4219, "step": 118625 }, { "epoch": 4.275417162215735, "grad_norm": 0.2467115819454193, "learning_rate": 2.714036592686417e-06, "loss": 0.3704, "step": 118630 }, { "epoch": 4.27559736187696, "grad_norm": 0.2777189314365387, "learning_rate": 2.7127144166738173e-06, "loss": 0.3898, "step": 118635 }, { "epoch": 4.275777561538185, "grad_norm": 0.23906902968883514, "learning_rate": 2.711392544321176e-06, "loss": 0.3649, "step": 118640 }, { "epoch": 4.275957761199409, "grad_norm": 0.19675643742084503, "learning_rate": 2.710070975646492e-06, "loss": 0.3649, "step": 118645 }, { "epoch": 4.276137960860634, "grad_norm": 0.2644674777984619, "learning_rate": 2.7087497106677793e-06, "loss": 0.3944, "step": 118650 }, { "epoch": 4.276318160521858, "grad_norm": 0.2813570201396942, "learning_rate": 2.707428749403035e-06, "loss": 0.3593, "step": 118655 }, { "epoch": 4.276498360183083, "grad_norm": 0.24779802560806274, "learning_rate": 2.706108091870252e-06, "loss": 0.3836, "step": 118660 }, { "epoch": 4.276678559844307, "grad_norm": 0.2426891177892685, "learning_rate": 2.704787738087436e-06, "loss": 0.3918, "step": 118665 }, { "epoch": 4.276858759505532, "grad_norm": 0.3096999228000641, "learning_rate": 2.703467688072575e-06, "loss": 0.3599, "step": 118670 }, { "epoch": 4.277038959166757, "grad_norm": 0.20364230871200562, "learning_rate": 2.702147941843641e-06, "loss": 0.3373, "step": 118675 }, { "epoch": 4.2772191588279815, "grad_norm": 0.24588048458099365, "learning_rate": 2.7008284994186284e-06, "loss": 0.3355, "step": 118680 }, { "epoch": 4.277399358489206, "grad_norm": 0.24682247638702393, "learning_rate": 2.6995093608155053e-06, "loss": 0.3409, "step": 118685 }, { "epoch": 4.277579558150431, "grad_norm": 0.28250157833099365, "learning_rate": 2.6981905260522607e-06, "loss": 0.3628, "step": 118690 }, { "epoch": 4.277759757811655, "grad_norm": 0.25315603613853455, "learning_rate": 2.6968719951468464e-06, 
"loss": 0.3906, "step": 118695 }, { "epoch": 4.27793995747288, "grad_norm": 0.2779964506626129, "learning_rate": 2.695553768117226e-06, "loss": 0.3717, "step": 118700 }, { "epoch": 4.278120157134104, "grad_norm": 0.24498099088668823, "learning_rate": 2.6942358449813745e-06, "loss": 0.3657, "step": 118705 }, { "epoch": 4.278300356795329, "grad_norm": 0.2811969816684723, "learning_rate": 2.6929182257572432e-06, "loss": 0.3594, "step": 118710 }, { "epoch": 4.278480556456554, "grad_norm": 0.2293609082698822, "learning_rate": 2.6916009104627797e-06, "loss": 0.3518, "step": 118715 }, { "epoch": 4.2786607561177785, "grad_norm": 0.3065091669559479, "learning_rate": 2.6902838991159334e-06, "loss": 0.3964, "step": 118720 }, { "epoch": 4.278840955779003, "grad_norm": 0.21707165241241455, "learning_rate": 2.6889671917346483e-06, "loss": 0.3497, "step": 118725 }, { "epoch": 4.279021155440228, "grad_norm": 0.3095008432865143, "learning_rate": 2.687650788336868e-06, "loss": 0.3616, "step": 118730 }, { "epoch": 4.279201355101453, "grad_norm": 0.2820426821708679, "learning_rate": 2.6863346889405255e-06, "loss": 0.388, "step": 118735 }, { "epoch": 4.2793815547626775, "grad_norm": 0.24508492648601532, "learning_rate": 2.685018893563554e-06, "loss": 0.394, "step": 118740 }, { "epoch": 4.279561754423901, "grad_norm": 0.2219163477420807, "learning_rate": 2.6837034022238806e-06, "loss": 0.3613, "step": 118745 }, { "epoch": 4.279741954085126, "grad_norm": 0.23274736106395721, "learning_rate": 2.6823882149394215e-06, "loss": 0.3737, "step": 118750 }, { "epoch": 4.279922153746351, "grad_norm": 0.27821585536003113, "learning_rate": 2.6810733317281095e-06, "loss": 0.3587, "step": 118755 }, { "epoch": 4.2801023534075755, "grad_norm": 0.2513722777366638, "learning_rate": 2.67975875260785e-06, "loss": 0.3816, "step": 118760 }, { "epoch": 4.2802825530688, "grad_norm": 0.26026418805122375, "learning_rate": 2.6784444775965563e-06, "loss": 0.3802, "step": 118765 }, { "epoch": 4.280462752730025, 
"grad_norm": 0.2638653814792633, "learning_rate": 2.6771305067121363e-06, "loss": 0.3756, "step": 118770 }, { "epoch": 4.28064295239125, "grad_norm": 0.2352520376443863, "learning_rate": 2.675816839972489e-06, "loss": 0.4026, "step": 118775 }, { "epoch": 4.280823152052474, "grad_norm": 0.3046009838581085, "learning_rate": 2.6745034773955095e-06, "loss": 0.3791, "step": 118780 }, { "epoch": 4.281003351713699, "grad_norm": 0.21931016445159912, "learning_rate": 2.6731904189991108e-06, "loss": 0.3673, "step": 118785 }, { "epoch": 4.281183551374923, "grad_norm": 0.21778719127178192, "learning_rate": 2.671877664801159e-06, "loss": 0.3712, "step": 118790 }, { "epoch": 4.281363751036148, "grad_norm": 0.22375808656215668, "learning_rate": 2.6705652148195566e-06, "loss": 0.3566, "step": 118795 }, { "epoch": 4.2815439506973725, "grad_norm": 0.30791062116622925, "learning_rate": 2.6692530690721777e-06, "loss": 0.3972, "step": 118800 }, { "epoch": 4.281724150358597, "grad_norm": 0.259032279253006, "learning_rate": 2.667941227576906e-06, "loss": 0.3812, "step": 118805 }, { "epoch": 4.281904350019822, "grad_norm": 0.2583853006362915, "learning_rate": 2.66662969035161e-06, "loss": 0.3902, "step": 118810 }, { "epoch": 4.282084549681047, "grad_norm": 0.3107074499130249, "learning_rate": 2.6653184574141594e-06, "loss": 0.4077, "step": 118815 }, { "epoch": 4.282264749342271, "grad_norm": 0.2655295729637146, "learning_rate": 2.6640075287824166e-06, "loss": 0.3709, "step": 118820 }, { "epoch": 4.282444949003496, "grad_norm": 0.2224242240190506, "learning_rate": 2.6626969044742518e-06, "loss": 0.3534, "step": 118825 }, { "epoch": 4.282625148664721, "grad_norm": 0.25910016894340515, "learning_rate": 2.6613865845075164e-06, "loss": 0.3621, "step": 118830 }, { "epoch": 4.282805348325946, "grad_norm": 0.24160423874855042, "learning_rate": 2.6600765689000634e-06, "loss": 0.3737, "step": 118835 }, { "epoch": 4.2829855479871695, "grad_norm": 0.2217995524406433, "learning_rate": 
2.658766857669745e-06, "loss": 0.3841, "step": 118840 }, { "epoch": 4.283165747648394, "grad_norm": 0.25542962551116943, "learning_rate": 2.657457450834394e-06, "loss": 0.4073, "step": 118845 }, { "epoch": 4.283345947309619, "grad_norm": 0.26905080676078796, "learning_rate": 2.656148348411866e-06, "loss": 0.3513, "step": 118850 }, { "epoch": 4.283526146970844, "grad_norm": 0.2789619266986847, "learning_rate": 2.6548395504199963e-06, "loss": 0.3766, "step": 118855 }, { "epoch": 4.283706346632068, "grad_norm": 0.21455584466457367, "learning_rate": 2.6535310568766013e-06, "loss": 0.3954, "step": 118860 }, { "epoch": 4.283886546293293, "grad_norm": 0.3085833787918091, "learning_rate": 2.6522228677995252e-06, "loss": 0.4161, "step": 118865 }, { "epoch": 4.284066745954518, "grad_norm": 0.22385522723197937, "learning_rate": 2.6509149832065817e-06, "loss": 0.3982, "step": 118870 }, { "epoch": 4.284246945615743, "grad_norm": 0.2983032464981079, "learning_rate": 2.6496074031156004e-06, "loss": 0.3327, "step": 118875 }, { "epoch": 4.284427145276966, "grad_norm": 0.27663928270339966, "learning_rate": 2.6483001275443896e-06, "loss": 0.3619, "step": 118880 }, { "epoch": 4.284607344938191, "grad_norm": 0.28087663650512695, "learning_rate": 2.6469931565107575e-06, "loss": 0.3947, "step": 118885 }, { "epoch": 4.284787544599416, "grad_norm": 0.27490758895874023, "learning_rate": 2.6456864900325174e-06, "loss": 0.3937, "step": 118890 }, { "epoch": 4.284967744260641, "grad_norm": 0.28742140531539917, "learning_rate": 2.644380128127474e-06, "loss": 0.3938, "step": 118895 }, { "epoch": 4.285147943921865, "grad_norm": 0.2435084581375122, "learning_rate": 2.6430740708134226e-06, "loss": 0.3626, "step": 118900 }, { "epoch": 4.28532814358309, "grad_norm": 0.27407246828079224, "learning_rate": 2.6417683181081592e-06, "loss": 0.3699, "step": 118905 }, { "epoch": 4.285508343244315, "grad_norm": 0.2970322370529175, "learning_rate": 2.640462870029467e-06, "loss": 0.3878, "step": 118910 }, { 
"epoch": 4.28568854290554, "grad_norm": 0.27050116658210754, "learning_rate": 2.6391577265951455e-06, "loss": 0.4109, "step": 118915 }, { "epoch": 4.285868742566764, "grad_norm": 0.18248991668224335, "learning_rate": 2.6378528878229697e-06, "loss": 0.3786, "step": 118920 }, { "epoch": 4.286048942227989, "grad_norm": 0.24371817708015442, "learning_rate": 2.6365483537307197e-06, "loss": 0.3315, "step": 118925 }, { "epoch": 4.286229141889213, "grad_norm": 0.2470201700925827, "learning_rate": 2.6352441243361698e-06, "loss": 0.3701, "step": 118930 }, { "epoch": 4.286409341550438, "grad_norm": 0.19732847809791565, "learning_rate": 2.6339401996570813e-06, "loss": 0.3634, "step": 118935 }, { "epoch": 4.286589541211662, "grad_norm": 0.21893630921840668, "learning_rate": 2.632636579711234e-06, "loss": 0.3664, "step": 118940 }, { "epoch": 4.286769740872887, "grad_norm": 0.26393207907676697, "learning_rate": 2.6313332645163836e-06, "loss": 0.3683, "step": 118945 }, { "epoch": 4.286949940534112, "grad_norm": 0.2583698630332947, "learning_rate": 2.630030254090285e-06, "loss": 0.3745, "step": 118950 }, { "epoch": 4.287130140195337, "grad_norm": 0.22545069456100464, "learning_rate": 2.6287275484506934e-06, "loss": 0.4017, "step": 118955 }, { "epoch": 4.287310339856561, "grad_norm": 0.3334513008594513, "learning_rate": 2.6274251476153587e-06, "loss": 0.382, "step": 118960 }, { "epoch": 4.287490539517786, "grad_norm": 0.32256385684013367, "learning_rate": 2.62612305160202e-06, "loss": 0.3868, "step": 118965 }, { "epoch": 4.287670739179011, "grad_norm": 0.28778645396232605, "learning_rate": 2.6248212604284313e-06, "loss": 0.3516, "step": 118970 }, { "epoch": 4.287850938840235, "grad_norm": 0.21005696058273315, "learning_rate": 2.6235197741123122e-06, "loss": 0.3425, "step": 118975 }, { "epoch": 4.288031138501459, "grad_norm": 0.20069657266139984, "learning_rate": 2.6222185926714076e-06, "loss": 0.3665, "step": 118980 }, { "epoch": 4.288211338162684, "grad_norm": 0.30215156078338623, 
"learning_rate": 2.6209177161234445e-06, "loss": 0.3804, "step": 118985 }, { "epoch": 4.288391537823909, "grad_norm": 0.22637207806110382, "learning_rate": 2.6196171444861417e-06, "loss": 0.3878, "step": 118990 }, { "epoch": 4.2885717374851335, "grad_norm": 0.28523603081703186, "learning_rate": 2.6183168777772244e-06, "loss": 0.3649, "step": 118995 }, { "epoch": 4.288751937146358, "grad_norm": 0.22392310202121735, "learning_rate": 2.617016916014406e-06, "loss": 0.3961, "step": 119000 }, { "epoch": 4.288751937146358, "eval_loss": 0.4290127456188202, "eval_runtime": 3.5355, "eval_samples_per_second": 28.284, "eval_steps_per_second": 7.071, "step": 119000 }, { "epoch": 4.288932136807583, "grad_norm": 0.2049478143453598, "learning_rate": 2.6157172592153944e-06, "loss": 0.3615, "step": 119005 }, { "epoch": 4.289112336468808, "grad_norm": 0.23813143372535706, "learning_rate": 2.614417907397906e-06, "loss": 0.3763, "step": 119010 }, { "epoch": 4.2892925361300325, "grad_norm": 0.29590684175491333, "learning_rate": 2.6131188605796384e-06, "loss": 0.3898, "step": 119015 }, { "epoch": 4.289472735791256, "grad_norm": 0.24687814712524414, "learning_rate": 2.6118201187782937e-06, "loss": 0.3756, "step": 119020 }, { "epoch": 4.289652935452481, "grad_norm": 0.2702934443950653, "learning_rate": 2.6105216820115657e-06, "loss": 0.3699, "step": 119025 }, { "epoch": 4.289833135113706, "grad_norm": 0.24939516186714172, "learning_rate": 2.609223550297141e-06, "loss": 0.3705, "step": 119030 }, { "epoch": 4.2900133347749305, "grad_norm": 0.2283768504858017, "learning_rate": 2.607925723652713e-06, "loss": 0.3792, "step": 119035 }, { "epoch": 4.290193534436155, "grad_norm": 0.24190238118171692, "learning_rate": 2.6066282020959678e-06, "loss": 0.3742, "step": 119040 }, { "epoch": 4.29037373409738, "grad_norm": 0.1982034593820572, "learning_rate": 2.605330985644566e-06, "loss": 0.3406, "step": 119045 }, { "epoch": 4.290553933758605, "grad_norm": 0.2854166030883789, "learning_rate": 
2.6040340743162e-06, "loss": 0.3875, "step": 119050 }, { "epoch": 4.2907341334198295, "grad_norm": 0.2228785753250122, "learning_rate": 2.60273746812853e-06, "loss": 0.3492, "step": 119055 }, { "epoch": 4.290914333081054, "grad_norm": 0.3077186346054077, "learning_rate": 2.6014411670992305e-06, "loss": 0.4009, "step": 119060 }, { "epoch": 4.291094532742278, "grad_norm": 0.26383304595947266, "learning_rate": 2.60014517124596e-06, "loss": 0.3999, "step": 119065 }, { "epoch": 4.291274732403503, "grad_norm": 0.2802242934703827, "learning_rate": 2.5988494805863682e-06, "loss": 0.3834, "step": 119070 }, { "epoch": 4.2914549320647275, "grad_norm": 0.21582341194152832, "learning_rate": 2.597554095138122e-06, "loss": 0.367, "step": 119075 }, { "epoch": 4.291635131725952, "grad_norm": 0.2001202404499054, "learning_rate": 2.5962590149188616e-06, "loss": 0.3211, "step": 119080 }, { "epoch": 4.291815331387177, "grad_norm": 0.27572277188301086, "learning_rate": 2.594964239946232e-06, "loss": 0.3591, "step": 119085 }, { "epoch": 4.291995531048402, "grad_norm": 0.2560023069381714, "learning_rate": 2.5936697702378804e-06, "loss": 0.3773, "step": 119090 }, { "epoch": 4.292175730709626, "grad_norm": 0.23489350080490112, "learning_rate": 2.592375605811434e-06, "loss": 0.3515, "step": 119095 }, { "epoch": 4.292355930370851, "grad_norm": 0.25158292055130005, "learning_rate": 2.591081746684537e-06, "loss": 0.3998, "step": 119100 }, { "epoch": 4.292536130032076, "grad_norm": 0.23154456913471222, "learning_rate": 2.5897881928748145e-06, "loss": 0.3961, "step": 119105 }, { "epoch": 4.292716329693301, "grad_norm": 0.2787790298461914, "learning_rate": 2.5884949443998857e-06, "loss": 0.375, "step": 119110 }, { "epoch": 4.2928965293545245, "grad_norm": 0.23555888235569, "learning_rate": 2.587202001277378e-06, "loss": 0.3809, "step": 119115 }, { "epoch": 4.293076729015749, "grad_norm": 0.25257548689842224, "learning_rate": 2.5859093635248964e-06, "loss": 0.3726, "step": 119120 }, { "epoch": 
4.293256928676974, "grad_norm": 0.23325851559638977, "learning_rate": 2.5846170311600637e-06, "loss": 0.3535, "step": 119125 }, { "epoch": 4.293437128338199, "grad_norm": 0.2508438527584076, "learning_rate": 2.5833250042004876e-06, "loss": 0.3674, "step": 119130 }, { "epoch": 4.293617327999423, "grad_norm": 0.3049331605434418, "learning_rate": 2.582033282663765e-06, "loss": 0.409, "step": 119135 }, { "epoch": 4.293797527660648, "grad_norm": 0.22652621567249298, "learning_rate": 2.5807418665675017e-06, "loss": 0.3541, "step": 119140 }, { "epoch": 4.293977727321873, "grad_norm": 0.2690187096595764, "learning_rate": 2.5794507559292885e-06, "loss": 0.3622, "step": 119145 }, { "epoch": 4.294157926983098, "grad_norm": 0.24629640579223633, "learning_rate": 2.578159950766712e-06, "loss": 0.3736, "step": 119150 }, { "epoch": 4.2943381266443215, "grad_norm": 0.2614523470401764, "learning_rate": 2.576869451097377e-06, "loss": 0.384, "step": 119155 }, { "epoch": 4.294518326305546, "grad_norm": 0.2685030996799469, "learning_rate": 2.5755792569388455e-06, "loss": 0.3787, "step": 119160 }, { "epoch": 4.294698525966771, "grad_norm": 0.23941963911056519, "learning_rate": 2.5742893683087076e-06, "loss": 0.4015, "step": 119165 }, { "epoch": 4.294878725627996, "grad_norm": 0.2448320984840393, "learning_rate": 2.5729997852245363e-06, "loss": 0.4058, "step": 119170 }, { "epoch": 4.29505892528922, "grad_norm": 0.2499290406703949, "learning_rate": 2.571710507703895e-06, "loss": 0.3464, "step": 119175 }, { "epoch": 4.295239124950445, "grad_norm": 0.263079971075058, "learning_rate": 2.5704215357643666e-06, "loss": 0.3939, "step": 119180 }, { "epoch": 4.29541932461167, "grad_norm": 0.22637499868869781, "learning_rate": 2.5691328694234964e-06, "loss": 0.393, "step": 119185 }, { "epoch": 4.295599524272895, "grad_norm": 0.29773595929145813, "learning_rate": 2.567844508698844e-06, "loss": 0.4092, "step": 119190 }, { "epoch": 4.295779723934119, "grad_norm": 0.25655168294906616, "learning_rate": 
2.566556453607974e-06, "loss": 0.3502, "step": 119195 }, { "epoch": 4.295959923595344, "grad_norm": 0.28525644540786743, "learning_rate": 2.5652687041684247e-06, "loss": 0.3887, "step": 119200 }, { "epoch": 4.296140123256568, "grad_norm": 0.2969033718109131, "learning_rate": 2.563981260397749e-06, "loss": 0.3525, "step": 119205 }, { "epoch": 4.296320322917793, "grad_norm": 0.20924274623394012, "learning_rate": 2.562694122313483e-06, "loss": 0.3388, "step": 119210 }, { "epoch": 4.296500522579017, "grad_norm": 0.24764671921730042, "learning_rate": 2.5614072899331625e-06, "loss": 0.3648, "step": 119215 }, { "epoch": 4.296680722240242, "grad_norm": 0.2710559368133545, "learning_rate": 2.560120763274329e-06, "loss": 0.342, "step": 119220 }, { "epoch": 4.296860921901467, "grad_norm": 0.3236762285232544, "learning_rate": 2.5588345423545046e-06, "loss": 0.3871, "step": 119225 }, { "epoch": 4.297041121562692, "grad_norm": 0.2507827877998352, "learning_rate": 2.5575486271912143e-06, "loss": 0.3975, "step": 119230 }, { "epoch": 4.297221321223916, "grad_norm": 0.3020266890525818, "learning_rate": 2.556263017801977e-06, "loss": 0.3805, "step": 119235 }, { "epoch": 4.297401520885141, "grad_norm": 0.2670553922653198, "learning_rate": 2.55497771420431e-06, "loss": 0.3863, "step": 119240 }, { "epoch": 4.297581720546366, "grad_norm": 0.29104623198509216, "learning_rate": 2.553692716415729e-06, "loss": 0.3768, "step": 119245 }, { "epoch": 4.29776192020759, "grad_norm": 0.2508760392665863, "learning_rate": 2.552408024453742e-06, "loss": 0.3921, "step": 119250 }, { "epoch": 4.297942119868814, "grad_norm": 0.27478477358818054, "learning_rate": 2.5511236383358422e-06, "loss": 0.3742, "step": 119255 }, { "epoch": 4.298122319530039, "grad_norm": 0.28668543696403503, "learning_rate": 2.549839558079542e-06, "loss": 0.374, "step": 119260 }, { "epoch": 4.298302519191264, "grad_norm": 0.22271794080734253, "learning_rate": 2.5485557837023303e-06, "loss": 0.3471, "step": 119265 }, { "epoch": 
4.298482718852489, "grad_norm": 0.22481966018676758, "learning_rate": 2.5472723152216992e-06, "loss": 0.3503, "step": 119270 }, { "epoch": 4.298662918513713, "grad_norm": 0.22791920602321625, "learning_rate": 2.545989152655137e-06, "loss": 0.3945, "step": 119275 }, { "epoch": 4.298843118174938, "grad_norm": 0.3118433952331543, "learning_rate": 2.544706296020119e-06, "loss": 0.368, "step": 119280 }, { "epoch": 4.299023317836163, "grad_norm": 0.2143528163433075, "learning_rate": 2.5434237453341364e-06, "loss": 0.3645, "step": 119285 }, { "epoch": 4.2992035174973875, "grad_norm": 0.28790873289108276, "learning_rate": 2.5421415006146558e-06, "loss": 0.391, "step": 119290 }, { "epoch": 4.299383717158611, "grad_norm": 0.25662392377853394, "learning_rate": 2.5408595618791497e-06, "loss": 0.3487, "step": 119295 }, { "epoch": 4.299563916819836, "grad_norm": 0.26446598768234253, "learning_rate": 2.5395779291450846e-06, "loss": 0.3973, "step": 119300 }, { "epoch": 4.299744116481061, "grad_norm": 0.24997925758361816, "learning_rate": 2.538296602429921e-06, "loss": 0.3521, "step": 119305 }, { "epoch": 4.2999243161422855, "grad_norm": 0.22187812626361847, "learning_rate": 2.537015581751115e-06, "loss": 0.3569, "step": 119310 }, { "epoch": 4.30010451580351, "grad_norm": 0.23413535952568054, "learning_rate": 2.5357348671261246e-06, "loss": 0.3708, "step": 119315 }, { "epoch": 4.300284715464735, "grad_norm": 0.2146236151456833, "learning_rate": 2.5344544585723997e-06, "loss": 0.3627, "step": 119320 }, { "epoch": 4.30046491512596, "grad_norm": 0.24262899160385132, "learning_rate": 2.5331743561073816e-06, "loss": 0.3694, "step": 119325 }, { "epoch": 4.3006451147871845, "grad_norm": 0.21525375545024872, "learning_rate": 2.5318945597485125e-06, "loss": 0.359, "step": 119330 }, { "epoch": 4.300825314448409, "grad_norm": 0.25374144315719604, "learning_rate": 2.5306150695132286e-06, "loss": 0.3611, "step": 119335 }, { "epoch": 4.301005514109633, "grad_norm": 0.2709214389324188, 
"learning_rate": 2.529335885418965e-06, "loss": 0.391, "step": 119340 }, { "epoch": 4.301185713770858, "grad_norm": 0.30983975529670715, "learning_rate": 2.5280570074831505e-06, "loss": 0.3709, "step": 119345 }, { "epoch": 4.3013659134320825, "grad_norm": 0.23917073011398315, "learning_rate": 2.5267784357232123e-06, "loss": 0.3768, "step": 119350 }, { "epoch": 4.301546113093307, "grad_norm": 0.22569625079631805, "learning_rate": 2.525500170156564e-06, "loss": 0.3792, "step": 119355 }, { "epoch": 4.301726312754532, "grad_norm": 0.19801470637321472, "learning_rate": 2.5242222108006196e-06, "loss": 0.3409, "step": 119360 }, { "epoch": 4.301906512415757, "grad_norm": 0.3329550623893738, "learning_rate": 2.5229445576728044e-06, "loss": 0.339, "step": 119365 }, { "epoch": 4.3020867120769815, "grad_norm": 0.2501632273197174, "learning_rate": 2.5216672107905153e-06, "loss": 0.3723, "step": 119370 }, { "epoch": 4.302266911738206, "grad_norm": 0.255783349275589, "learning_rate": 2.5203901701711525e-06, "loss": 0.3433, "step": 119375 }, { "epoch": 4.302447111399431, "grad_norm": 0.30260223150253296, "learning_rate": 2.5191134358321267e-06, "loss": 0.3702, "step": 119380 }, { "epoch": 4.302627311060656, "grad_norm": 0.33803072571754456, "learning_rate": 2.5178370077908297e-06, "loss": 0.374, "step": 119385 }, { "epoch": 4.3028075107218795, "grad_norm": 0.20354852080345154, "learning_rate": 2.5165608860646476e-06, "loss": 0.3368, "step": 119390 }, { "epoch": 4.302987710383104, "grad_norm": 0.2563473880290985, "learning_rate": 2.515285070670972e-06, "loss": 0.4105, "step": 119395 }, { "epoch": 4.303167910044329, "grad_norm": 0.2289261668920517, "learning_rate": 2.5140095616271816e-06, "loss": 0.3794, "step": 119400 }, { "epoch": 4.303348109705554, "grad_norm": 0.1935594230890274, "learning_rate": 2.5127343589506607e-06, "loss": 0.3595, "step": 119405 }, { "epoch": 4.303528309366778, "grad_norm": 0.2854008078575134, "learning_rate": 2.51145946265878e-06, "loss": 0.399, "step": 
119410 }, { "epoch": 4.303708509028003, "grad_norm": 0.22041873633861542, "learning_rate": 2.510184872768909e-06, "loss": 0.3513, "step": 119415 }, { "epoch": 4.303888708689228, "grad_norm": 0.2398824393749237, "learning_rate": 2.508910589298416e-06, "loss": 0.3586, "step": 119420 }, { "epoch": 4.304068908350453, "grad_norm": 0.2502867877483368, "learning_rate": 2.5076366122646577e-06, "loss": 0.3853, "step": 119425 }, { "epoch": 4.3042491080116765, "grad_norm": 0.29783013463020325, "learning_rate": 2.5063629416850006e-06, "loss": 0.3828, "step": 119430 }, { "epoch": 4.304429307672901, "grad_norm": 0.22847159206867218, "learning_rate": 2.505089577576797e-06, "loss": 0.3618, "step": 119435 }, { "epoch": 4.304609507334126, "grad_norm": 0.2972005009651184, "learning_rate": 2.503816519957383e-06, "loss": 0.3382, "step": 119440 }, { "epoch": 4.304789706995351, "grad_norm": 0.26005420088768005, "learning_rate": 2.5025437688441206e-06, "loss": 0.3663, "step": 119445 }, { "epoch": 4.304969906656575, "grad_norm": 0.23355317115783691, "learning_rate": 2.5012713242543422e-06, "loss": 0.3487, "step": 119450 }, { "epoch": 4.3051501063178, "grad_norm": 0.23067983984947205, "learning_rate": 2.4999991862053818e-06, "loss": 0.3627, "step": 119455 }, { "epoch": 4.305330305979025, "grad_norm": 0.28559330105781555, "learning_rate": 2.498727354714586e-06, "loss": 0.3992, "step": 119460 }, { "epoch": 4.30551050564025, "grad_norm": 0.20493915677070618, "learning_rate": 2.497455829799261e-06, "loss": 0.3717, "step": 119465 }, { "epoch": 4.305690705301474, "grad_norm": 0.2629398703575134, "learning_rate": 2.496184611476754e-06, "loss": 0.3634, "step": 119470 }, { "epoch": 4.305870904962699, "grad_norm": 0.2835977077484131, "learning_rate": 2.49491369976437e-06, "loss": 0.3968, "step": 119475 }, { "epoch": 4.306051104623923, "grad_norm": 0.29381829500198364, "learning_rate": 2.4936430946794322e-06, "loss": 0.39, "step": 119480 }, { "epoch": 4.306231304285148, "grad_norm": 
0.21818934381008148, "learning_rate": 2.492372796239251e-06, "loss": 0.3773, "step": 119485 }, { "epoch": 4.306411503946372, "grad_norm": 0.3064677119255066, "learning_rate": 2.491102804461129e-06, "loss": 0.3716, "step": 119490 }, { "epoch": 4.306591703607597, "grad_norm": 0.20513369143009186, "learning_rate": 2.4898331193623724e-06, "loss": 0.3567, "step": 119495 }, { "epoch": 4.306771903268822, "grad_norm": 0.28164172172546387, "learning_rate": 2.488563740960284e-06, "loss": 0.3665, "step": 119500 }, { "epoch": 4.306771903268822, "eval_loss": 0.4290541708469391, "eval_runtime": 3.5337, "eval_samples_per_second": 28.299, "eval_steps_per_second": 7.075, "step": 119500 }, { "epoch": 4.306952102930047, "grad_norm": 0.24295520782470703, "learning_rate": 2.4872946692721554e-06, "loss": 0.346, "step": 119505 }, { "epoch": 4.307132302591271, "grad_norm": 0.29755252599716187, "learning_rate": 2.4860259043152813e-06, "loss": 0.3668, "step": 119510 }, { "epoch": 4.307312502252496, "grad_norm": 0.24635331332683563, "learning_rate": 2.4847574461069446e-06, "loss": 0.3949, "step": 119515 }, { "epoch": 4.307492701913721, "grad_norm": 0.2744773328304291, "learning_rate": 2.483489294664423e-06, "loss": 0.4012, "step": 119520 }, { "epoch": 4.307672901574945, "grad_norm": 0.26855769753456116, "learning_rate": 2.482221450005004e-06, "loss": 0.37, "step": 119525 }, { "epoch": 4.307853101236169, "grad_norm": 0.2621409595012665, "learning_rate": 2.4809539121459613e-06, "loss": 0.3522, "step": 119530 }, { "epoch": 4.308033300897394, "grad_norm": 0.2507140040397644, "learning_rate": 2.4796866811045595e-06, "loss": 0.3455, "step": 119535 }, { "epoch": 4.308213500558619, "grad_norm": 0.26044005155563354, "learning_rate": 2.478419756898065e-06, "loss": 0.3529, "step": 119540 }, { "epoch": 4.308393700219844, "grad_norm": 0.2704107165336609, "learning_rate": 2.4771531395437394e-06, "loss": 0.3842, "step": 119545 }, { "epoch": 4.308573899881068, "grad_norm": 0.23864923417568207, 
"learning_rate": 2.4758868290588487e-06, "loss": 0.371, "step": 119550 }, { "epoch": 4.308754099542293, "grad_norm": 0.3333123028278351, "learning_rate": 2.4746208254606386e-06, "loss": 0.3874, "step": 119555 }, { "epoch": 4.308934299203518, "grad_norm": 0.236053004860878, "learning_rate": 2.47335512876635e-06, "loss": 0.3794, "step": 119560 }, { "epoch": 4.3091144988647425, "grad_norm": 0.24085481464862823, "learning_rate": 2.4720897389932413e-06, "loss": 0.3613, "step": 119565 }, { "epoch": 4.309294698525966, "grad_norm": 0.2732737064361572, "learning_rate": 2.470824656158549e-06, "loss": 0.383, "step": 119570 }, { "epoch": 4.309474898187191, "grad_norm": 0.23329199850559235, "learning_rate": 2.4695598802795096e-06, "loss": 0.3829, "step": 119575 }, { "epoch": 4.309655097848416, "grad_norm": 0.22309060394763947, "learning_rate": 2.4682954113733557e-06, "loss": 0.369, "step": 119580 }, { "epoch": 4.309835297509641, "grad_norm": 0.24578943848609924, "learning_rate": 2.4670312494573076e-06, "loss": 0.4047, "step": 119585 }, { "epoch": 4.310015497170865, "grad_norm": 0.304373562335968, "learning_rate": 2.4657673945486045e-06, "loss": 0.3447, "step": 119590 }, { "epoch": 4.31019569683209, "grad_norm": 0.32049259543418884, "learning_rate": 2.4645038466644538e-06, "loss": 0.4057, "step": 119595 }, { "epoch": 4.310375896493315, "grad_norm": 0.22794209420681, "learning_rate": 2.4632406058220758e-06, "loss": 0.3567, "step": 119600 }, { "epoch": 4.3105560961545395, "grad_norm": 0.21800605952739716, "learning_rate": 2.4619776720386846e-06, "loss": 0.3863, "step": 119605 }, { "epoch": 4.310736295815764, "grad_norm": 0.23417772352695465, "learning_rate": 2.460715045331477e-06, "loss": 0.3833, "step": 119610 }, { "epoch": 4.310916495476988, "grad_norm": 0.26621463894844055, "learning_rate": 2.4594527257176676e-06, "loss": 0.3761, "step": 119615 }, { "epoch": 4.311096695138213, "grad_norm": 0.23449282348155975, "learning_rate": 2.4581907132144506e-06, "loss": 0.35, "step": 
119620 }, { "epoch": 4.3112768947994375, "grad_norm": 0.2845596671104431, "learning_rate": 2.4569290078390205e-06, "loss": 0.3927, "step": 119625 }, { "epoch": 4.311457094460662, "grad_norm": 0.24761004745960236, "learning_rate": 2.4556676096085696e-06, "loss": 0.3448, "step": 119630 }, { "epoch": 4.311637294121887, "grad_norm": 0.30386558175086975, "learning_rate": 2.4544065185402837e-06, "loss": 0.3739, "step": 119635 }, { "epoch": 4.311817493783112, "grad_norm": 0.32800036668777466, "learning_rate": 2.4531457346513382e-06, "loss": 0.3819, "step": 119640 }, { "epoch": 4.3119976934443365, "grad_norm": 0.24963368475437164, "learning_rate": 2.4518852579589247e-06, "loss": 0.3603, "step": 119645 }, { "epoch": 4.312177893105561, "grad_norm": 0.24996010959148407, "learning_rate": 2.450625088480202e-06, "loss": 0.3518, "step": 119650 }, { "epoch": 4.312358092766786, "grad_norm": 0.22332359850406647, "learning_rate": 2.44936522623235e-06, "loss": 0.4102, "step": 119655 }, { "epoch": 4.312538292428011, "grad_norm": 0.20659637451171875, "learning_rate": 2.448105671232531e-06, "loss": 0.3485, "step": 119660 }, { "epoch": 4.3127184920892345, "grad_norm": 0.24972330033779144, "learning_rate": 2.4468464234979063e-06, "loss": 0.3618, "step": 119665 }, { "epoch": 4.312898691750459, "grad_norm": 0.2645004987716675, "learning_rate": 2.4455874830456334e-06, "loss": 0.3474, "step": 119670 }, { "epoch": 4.313078891411684, "grad_norm": 0.20380602777004242, "learning_rate": 2.444328849892863e-06, "loss": 0.3755, "step": 119675 }, { "epoch": 4.313259091072909, "grad_norm": 0.23130224645137787, "learning_rate": 2.4430705240567425e-06, "loss": 0.3706, "step": 119680 }, { "epoch": 4.3134392907341335, "grad_norm": 0.23018963634967804, "learning_rate": 2.441812505554422e-06, "loss": 0.3971, "step": 119685 }, { "epoch": 4.313619490395358, "grad_norm": 0.26062634587287903, "learning_rate": 2.4405547944030383e-06, "loss": 0.3614, "step": 119690 }, { "epoch": 4.313799690056583, "grad_norm": 
0.2006555050611496, "learning_rate": 2.4392973906197297e-06, "loss": 0.3731, "step": 119695 }, { "epoch": 4.313979889717808, "grad_norm": 0.2953641712665558, "learning_rate": 2.4380402942216245e-06, "loss": 0.3982, "step": 119700 }, { "epoch": 4.3141600893790315, "grad_norm": 0.23595255613327026, "learning_rate": 2.436783505225848e-06, "loss": 0.3646, "step": 119705 }, { "epoch": 4.314340289040256, "grad_norm": 0.265407919883728, "learning_rate": 2.4355270236495338e-06, "loss": 0.3473, "step": 119710 }, { "epoch": 4.314520488701481, "grad_norm": 0.19755345582962036, "learning_rate": 2.434270849509793e-06, "loss": 0.4052, "step": 119715 }, { "epoch": 4.314700688362706, "grad_norm": 0.24743589758872986, "learning_rate": 2.433014982823745e-06, "loss": 0.3699, "step": 119720 }, { "epoch": 4.31488088802393, "grad_norm": 0.38127291202545166, "learning_rate": 2.431759423608496e-06, "loss": 0.3905, "step": 119725 }, { "epoch": 4.315061087685155, "grad_norm": 0.24170427024364471, "learning_rate": 2.430504171881154e-06, "loss": 0.382, "step": 119730 }, { "epoch": 4.31524128734638, "grad_norm": 0.2505660951137543, "learning_rate": 2.4292492276588306e-06, "loss": 0.3779, "step": 119735 }, { "epoch": 4.315421487007605, "grad_norm": 0.2789454162120819, "learning_rate": 2.4279945909586125e-06, "loss": 0.3524, "step": 119740 }, { "epoch": 4.315601686668829, "grad_norm": 0.2678326964378357, "learning_rate": 2.4267402617975944e-06, "loss": 0.3817, "step": 119745 }, { "epoch": 4.315781886330054, "grad_norm": 0.2486988753080368, "learning_rate": 2.4254862401928728e-06, "loss": 0.3714, "step": 119750 }, { "epoch": 4.315962085991278, "grad_norm": 0.27803459763526917, "learning_rate": 2.4242325261615267e-06, "loss": 0.37, "step": 119755 }, { "epoch": 4.316142285652503, "grad_norm": 0.22846895456314087, "learning_rate": 2.4229791197206502e-06, "loss": 0.3555, "step": 119760 }, { "epoch": 4.316322485313727, "grad_norm": 0.2598210573196411, "learning_rate": 2.4217260208873077e-06, "loss": 
0.3441, "step": 119765 }, { "epoch": 4.316502684974952, "grad_norm": 0.25162217020988464, "learning_rate": 2.4204732296785743e-06, "loss": 0.3937, "step": 119770 }, { "epoch": 4.316682884636177, "grad_norm": 0.25310778617858887, "learning_rate": 2.419220746111525e-06, "loss": 0.3502, "step": 119775 }, { "epoch": 4.316863084297402, "grad_norm": 0.2426055669784546, "learning_rate": 2.417968570203219e-06, "loss": 0.365, "step": 119780 }, { "epoch": 4.317043283958626, "grad_norm": null, "learning_rate": 2.416967051002339e-06, "loss": 0.3506, "step": 119785 }, { "epoch": 4.317223483619851, "grad_norm": 0.2902944087982178, "learning_rate": 2.415715428922766e-06, "loss": 0.3559, "step": 119790 }, { "epoch": 4.317403683281076, "grad_norm": 0.2499387264251709, "learning_rate": 2.4144641145497e-06, "loss": 0.3648, "step": 119795 }, { "epoch": 4.3175838829423, "grad_norm": 0.25404685735702515, "learning_rate": 2.4132131079001807e-06, "loss": 0.3391, "step": 119800 }, { "epoch": 4.317764082603524, "grad_norm": 0.26094871759414673, "learning_rate": 2.411962408991264e-06, "loss": 0.3919, "step": 119805 }, { "epoch": 4.317944282264749, "grad_norm": 0.2552245855331421, "learning_rate": 2.4107120178399945e-06, "loss": 0.3532, "step": 119810 }, { "epoch": 4.318124481925974, "grad_norm": 0.24921070039272308, "learning_rate": 2.409461934463389e-06, "loss": 0.3427, "step": 119815 }, { "epoch": 4.318304681587199, "grad_norm": 0.23261171579360962, "learning_rate": 2.408212158878495e-06, "loss": 0.3322, "step": 119820 }, { "epoch": 4.318484881248423, "grad_norm": 0.23947398364543915, "learning_rate": 2.4069626911023325e-06, "loss": 0.3588, "step": 119825 }, { "epoch": 4.318665080909648, "grad_norm": 0.2614345848560333, "learning_rate": 2.40571353115194e-06, "loss": 0.3382, "step": 119830 }, { "epoch": 4.318845280570873, "grad_norm": 0.30958092212677, "learning_rate": 2.4044646790443188e-06, "loss": 0.3832, "step": 119835 }, { "epoch": 4.3190254802320975, "grad_norm": 
0.26829853653907776, "learning_rate": 2.4032161347964875e-06, "loss": 0.3675, "step": 119840 }, { "epoch": 4.319205679893322, "grad_norm": 0.24423977732658386, "learning_rate": 2.4019678984254696e-06, "loss": 0.3462, "step": 119845 }, { "epoch": 4.319385879554546, "grad_norm": 0.3319460153579712, "learning_rate": 2.400719969948259e-06, "loss": 0.389, "step": 119850 }, { "epoch": 4.319566079215771, "grad_norm": 0.26566433906555176, "learning_rate": 2.399472349381868e-06, "loss": 0.3806, "step": 119855 }, { "epoch": 4.319746278876996, "grad_norm": 0.32363948225975037, "learning_rate": 2.3982250367432874e-06, "loss": 0.3628, "step": 119860 }, { "epoch": 4.31992647853822, "grad_norm": 0.2810172736644745, "learning_rate": 2.396978032049513e-06, "loss": 0.3626, "step": 119865 }, { "epoch": 4.320106678199445, "grad_norm": 0.262736439704895, "learning_rate": 2.3957313353175387e-06, "loss": 0.3693, "step": 119870 }, { "epoch": 4.32028687786067, "grad_norm": 0.2636507749557495, "learning_rate": 2.3944849465643488e-06, "loss": 0.3753, "step": 119875 }, { "epoch": 4.3204670775218945, "grad_norm": 0.26590320467948914, "learning_rate": 2.3932388658069264e-06, "loss": 0.3929, "step": 119880 }, { "epoch": 4.320647277183119, "grad_norm": 0.232346773147583, "learning_rate": 2.3919930930622476e-06, "loss": 0.3505, "step": 119885 }, { "epoch": 4.320827476844343, "grad_norm": 0.23552198708057404, "learning_rate": 2.390747628347284e-06, "loss": 0.3903, "step": 119890 }, { "epoch": 4.321007676505568, "grad_norm": 0.2921289801597595, "learning_rate": 2.389502471679003e-06, "loss": 0.3556, "step": 119895 }, { "epoch": 4.321187876166793, "grad_norm": 0.2172670215368271, "learning_rate": 2.3882576230743832e-06, "loss": 0.3701, "step": 119900 }, { "epoch": 4.321368075828017, "grad_norm": 0.23092396557331085, "learning_rate": 2.3870130825503656e-06, "loss": 0.3806, "step": 119905 }, { "epoch": 4.321548275489242, "grad_norm": 0.2696508467197418, "learning_rate": 2.3857688501239234e-06, "loss": 
0.375, "step": 119910 }, { "epoch": 4.321728475150467, "grad_norm": 0.30084672570228577, "learning_rate": 2.3845249258120012e-06, "loss": 0.3592, "step": 119915 }, { "epoch": 4.3219086748116915, "grad_norm": 0.23035989701747894, "learning_rate": 2.383281309631544e-06, "loss": 0.4045, "step": 119920 }, { "epoch": 4.322088874472916, "grad_norm": 0.2685616910457611, "learning_rate": 2.382038001599507e-06, "loss": 0.3809, "step": 119925 }, { "epoch": 4.322269074134141, "grad_norm": 0.2256428748369217, "learning_rate": 2.3807950017328217e-06, "loss": 0.3882, "step": 119930 }, { "epoch": 4.322449273795366, "grad_norm": 0.2647817134857178, "learning_rate": 2.379552310048419e-06, "loss": 0.3421, "step": 119935 }, { "epoch": 4.3226294734565895, "grad_norm": 0.25552016496658325, "learning_rate": 2.378309926563241e-06, "loss": 0.3568, "step": 119940 }, { "epoch": 4.322809673117814, "grad_norm": 0.23187574744224548, "learning_rate": 2.3770678512942097e-06, "loss": 0.379, "step": 119945 }, { "epoch": 4.322989872779039, "grad_norm": 0.2112554907798767, "learning_rate": 2.375826084258251e-06, "loss": 0.3505, "step": 119950 }, { "epoch": 4.323170072440264, "grad_norm": 0.30000877380371094, "learning_rate": 2.3745846254722816e-06, "loss": 0.3723, "step": 119955 }, { "epoch": 4.3233502721014885, "grad_norm": 0.20632152259349823, "learning_rate": 2.3733434749532104e-06, "loss": 0.3575, "step": 119960 }, { "epoch": 4.323530471762713, "grad_norm": 0.2675947844982147, "learning_rate": 2.3721026327179595e-06, "loss": 0.4144, "step": 119965 }, { "epoch": 4.323710671423938, "grad_norm": 0.2205601930618286, "learning_rate": 2.37086209878343e-06, "loss": 0.3685, "step": 119970 }, { "epoch": 4.323890871085163, "grad_norm": 0.24030627310276031, "learning_rate": 2.369621873166522e-06, "loss": 0.3793, "step": 119975 }, { "epoch": 4.324071070746387, "grad_norm": 0.2809261679649353, "learning_rate": 2.368381955884136e-06, "loss": 0.3722, "step": 119980 }, { "epoch": 4.324251270407611, "grad_norm": 
0.2488822638988495, "learning_rate": 2.367142346953158e-06, "loss": 0.3601, "step": 119985 }, { "epoch": 4.324431470068836, "grad_norm": 0.23414598405361176, "learning_rate": 2.3659030463904884e-06, "loss": 0.3784, "step": 119990 }, { "epoch": 4.324611669730061, "grad_norm": 0.27667903900146484, "learning_rate": 2.3646640542130117e-06, "loss": 0.3638, "step": 119995 }, { "epoch": 4.3247918693912855, "grad_norm": 0.2562429904937744, "learning_rate": 2.3634253704375967e-06, "loss": 0.3664, "step": 120000 }, { "epoch": 4.3247918693912855, "eval_loss": 0.4289790987968445, "eval_runtime": 3.5378, "eval_samples_per_second": 28.266, "eval_steps_per_second": 7.067, "step": 120000 }, { "epoch": 4.32497206905251, "grad_norm": 0.27249255776405334, "learning_rate": 2.362186995081134e-06, "loss": 0.3678, "step": 120005 }, { "epoch": 4.325152268713735, "grad_norm": 0.29627206921577454, "learning_rate": 2.3609489281604837e-06, "loss": 0.3616, "step": 120010 }, { "epoch": 4.32533246837496, "grad_norm": 0.27116361260414124, "learning_rate": 2.3597111696925327e-06, "loss": 0.3541, "step": 120015 }, { "epoch": 4.325512668036184, "grad_norm": 0.26802965998649597, "learning_rate": 2.3584737196941266e-06, "loss": 0.3786, "step": 120020 }, { "epoch": 4.325692867697409, "grad_norm": 0.22563250362873077, "learning_rate": 2.3572365781821286e-06, "loss": 0.4056, "step": 120025 }, { "epoch": 4.325873067358633, "grad_norm": 0.205821692943573, "learning_rate": 2.355999745173404e-06, "loss": 0.3754, "step": 120030 }, { "epoch": 4.326053267019858, "grad_norm": 0.24381496012210846, "learning_rate": 2.3547632206847998e-06, "loss": 0.3969, "step": 120035 }, { "epoch": 4.326233466681082, "grad_norm": 0.2163795679807663, "learning_rate": 2.353527004733161e-06, "loss": 0.3793, "step": 120040 }, { "epoch": 4.326413666342307, "grad_norm": 0.257968544960022, "learning_rate": 2.3522910973353324e-06, "loss": 0.3811, "step": 120045 }, { "epoch": 4.326593866003532, "grad_norm": 0.229737788438797, 
"learning_rate": 2.3510554985081477e-06, "loss": 0.3886, "step": 120050 }, { "epoch": 4.326774065664757, "grad_norm": 0.3103449046611786, "learning_rate": 2.349820208268455e-06, "loss": 0.3495, "step": 120055 }, { "epoch": 4.326954265325981, "grad_norm": 0.2852785289287567, "learning_rate": 2.348585226633074e-06, "loss": 0.3366, "step": 120060 }, { "epoch": 4.327134464987206, "grad_norm": 0.26147526502609253, "learning_rate": 2.3473505536188324e-06, "loss": 0.327, "step": 120065 }, { "epoch": 4.327314664648431, "grad_norm": 0.24618807435035706, "learning_rate": 2.346116189242556e-06, "loss": 0.3706, "step": 120070 }, { "epoch": 4.327494864309655, "grad_norm": 0.3258048892021179, "learning_rate": 2.344882133521059e-06, "loss": 0.3921, "step": 120075 }, { "epoch": 4.327675063970879, "grad_norm": 0.2521205544471741, "learning_rate": 2.3436483864711535e-06, "loss": 0.3588, "step": 120080 }, { "epoch": 4.327855263632104, "grad_norm": 0.23046831786632538, "learning_rate": 2.342414948109656e-06, "loss": 0.3982, "step": 120085 }, { "epoch": 4.328035463293329, "grad_norm": 0.260557621717453, "learning_rate": 2.3411818184533698e-06, "loss": 0.3612, "step": 120090 }, { "epoch": 4.328215662954554, "grad_norm": 0.20361223816871643, "learning_rate": 2.3399489975190923e-06, "loss": 0.3768, "step": 120095 }, { "epoch": 4.328395862615778, "grad_norm": 0.24504904448986053, "learning_rate": 2.3387164853236247e-06, "loss": 0.3511, "step": 120100 }, { "epoch": 4.328576062277003, "grad_norm": 0.22165676951408386, "learning_rate": 2.3374842818837507e-06, "loss": 0.3731, "step": 120105 }, { "epoch": 4.328756261938228, "grad_norm": 0.29873546957969666, "learning_rate": 2.3362523872162756e-06, "loss": 0.3618, "step": 120110 }, { "epoch": 4.328936461599453, "grad_norm": 0.22169798612594604, "learning_rate": 2.33502080133797e-06, "loss": 0.37, "step": 120115 }, { "epoch": 4.329116661260677, "grad_norm": 0.19561375677585602, "learning_rate": 2.3337895242656144e-06, "loss": 0.3759, "step": 
120120 }, { "epoch": 4.329296860921901, "grad_norm": 0.3174193799495697, "learning_rate": 2.33255855601599e-06, "loss": 0.3906, "step": 120125 }, { "epoch": 4.329477060583126, "grad_norm": 0.2505806088447571, "learning_rate": 2.33132789660587e-06, "loss": 0.4405, "step": 120130 }, { "epoch": 4.329657260244351, "grad_norm": 0.2282123565673828, "learning_rate": 2.330097546052015e-06, "loss": 0.3469, "step": 120135 }, { "epoch": 4.329837459905575, "grad_norm": 0.2437511831521988, "learning_rate": 2.3288675043711933e-06, "loss": 0.3587, "step": 120140 }, { "epoch": 4.3300176595668, "grad_norm": 0.20224423706531525, "learning_rate": 2.3276377715801603e-06, "loss": 0.3455, "step": 120145 }, { "epoch": 4.330197859228025, "grad_norm": 0.23955215513706207, "learning_rate": 2.3264083476956743e-06, "loss": 0.3537, "step": 120150 }, { "epoch": 4.3303780588892495, "grad_norm": 0.23328891396522522, "learning_rate": 2.3251792327344867e-06, "loss": 0.3624, "step": 120155 }, { "epoch": 4.330558258550474, "grad_norm": 0.20525789260864258, "learning_rate": 2.323950426713342e-06, "loss": 0.3493, "step": 120160 }, { "epoch": 4.330738458211698, "grad_norm": 0.252485066652298, "learning_rate": 2.3227219296489827e-06, "loss": 0.3537, "step": 120165 }, { "epoch": 4.330918657872923, "grad_norm": 0.22659099102020264, "learning_rate": 2.3214937415581424e-06, "loss": 0.3759, "step": 120170 }, { "epoch": 4.331098857534148, "grad_norm": 0.24077731370925903, "learning_rate": 2.320265862457563e-06, "loss": 0.3789, "step": 120175 }, { "epoch": 4.331279057195372, "grad_norm": 0.2883579432964325, "learning_rate": 2.319038292363973e-06, "loss": 0.3513, "step": 120180 }, { "epoch": 4.331459256856597, "grad_norm": 0.25911572575569153, "learning_rate": 2.31781103129409e-06, "loss": 0.3691, "step": 120185 }, { "epoch": 4.331639456517822, "grad_norm": 0.2714954912662506, "learning_rate": 2.3165840792646414e-06, "loss": 0.3817, "step": 120190 }, { "epoch": 4.3318196561790465, "grad_norm": 
0.2899724841117859, "learning_rate": 2.3153574362923395e-06, "loss": 0.3776, "step": 120195 }, { "epoch": 4.331999855840271, "grad_norm": 0.2563213109970093, "learning_rate": 2.31413110239391e-06, "loss": 0.4069, "step": 120200 }, { "epoch": 4.332180055501496, "grad_norm": 0.261004239320755, "learning_rate": 2.3129050775860473e-06, "loss": 0.3644, "step": 120205 }, { "epoch": 4.332360255162721, "grad_norm": 0.2524876296520233, "learning_rate": 2.3116793618854576e-06, "loss": 0.3632, "step": 120210 }, { "epoch": 4.332540454823945, "grad_norm": 0.24342192709445953, "learning_rate": 2.3104539553088474e-06, "loss": 0.3615, "step": 120215 }, { "epoch": 4.332720654485169, "grad_norm": 0.27565255761146545, "learning_rate": 2.3092288578729083e-06, "loss": 0.3985, "step": 120220 }, { "epoch": 4.332900854146394, "grad_norm": 0.22608241438865662, "learning_rate": 2.308004069594333e-06, "loss": 0.3917, "step": 120225 }, { "epoch": 4.333081053807619, "grad_norm": 0.2556675374507904, "learning_rate": 2.3067795904898103e-06, "loss": 0.3498, "step": 120230 }, { "epoch": 4.3332612534688435, "grad_norm": 0.2671878933906555, "learning_rate": 2.3055554205760168e-06, "loss": 0.3587, "step": 120235 }, { "epoch": 4.333441453130068, "grad_norm": 0.232125386595726, "learning_rate": 2.3043315598696406e-06, "loss": 0.3345, "step": 120240 }, { "epoch": 4.333621652791293, "grad_norm": 0.268929123878479, "learning_rate": 2.3031080083873556e-06, "loss": 0.4026, "step": 120245 }, { "epoch": 4.333801852452518, "grad_norm": 0.2586337625980377, "learning_rate": 2.3018847661458285e-06, "loss": 0.3407, "step": 120250 }, { "epoch": 4.333982052113742, "grad_norm": 0.20876720547676086, "learning_rate": 2.300661833161724e-06, "loss": 0.3956, "step": 120255 }, { "epoch": 4.334162251774966, "grad_norm": 0.26193374395370483, "learning_rate": 2.299439209451712e-06, "loss": 0.3588, "step": 120260 }, { "epoch": 4.334342451436191, "grad_norm": 0.29305386543273926, "learning_rate": 2.2982168950324374e-06, "loss": 
0.3434, "step": 120265 }, { "epoch": 4.334522651097416, "grad_norm": 0.1981567144393921, "learning_rate": 2.29699488992057e-06, "loss": 0.3667, "step": 120270 }, { "epoch": 4.3347028507586405, "grad_norm": 0.2516523599624634, "learning_rate": 2.29577319413275e-06, "loss": 0.3551, "step": 120275 }, { "epoch": 4.334883050419865, "grad_norm": 0.2072906494140625, "learning_rate": 2.2945518076856243e-06, "loss": 0.3863, "step": 120280 }, { "epoch": 4.33506325008109, "grad_norm": 0.2831047475337982, "learning_rate": 2.2933307305958356e-06, "loss": 0.344, "step": 120285 }, { "epoch": 4.335243449742315, "grad_norm": 0.2764025330543518, "learning_rate": 2.2921099628800126e-06, "loss": 0.4017, "step": 120290 }, { "epoch": 4.335423649403539, "grad_norm": 0.2790067195892334, "learning_rate": 2.290889504554808e-06, "loss": 0.3866, "step": 120295 }, { "epoch": 4.335603849064764, "grad_norm": 0.20841774344444275, "learning_rate": 2.2896693556368284e-06, "loss": 0.3428, "step": 120300 }, { "epoch": 4.335784048725988, "grad_norm": 0.2399793118238449, "learning_rate": 2.2884495161427043e-06, "loss": 0.3393, "step": 120305 }, { "epoch": 4.335964248387213, "grad_norm": 0.2118580937385559, "learning_rate": 2.287229986089065e-06, "loss": 0.3936, "step": 120310 }, { "epoch": 4.3361444480484375, "grad_norm": 0.23589551448822021, "learning_rate": 2.286010765492516e-06, "loss": 0.376, "step": 120315 }, { "epoch": 4.336324647709662, "grad_norm": 0.240196093916893, "learning_rate": 2.284791854369675e-06, "loss": 0.3936, "step": 120320 }, { "epoch": 4.336504847370887, "grad_norm": 0.3579064607620239, "learning_rate": 2.2835732527371478e-06, "loss": 0.3894, "step": 120325 }, { "epoch": 4.336685047032112, "grad_norm": 0.21416036784648895, "learning_rate": 2.2823549606115324e-06, "loss": 0.3483, "step": 120330 }, { "epoch": 4.336865246693336, "grad_norm": 0.2611549496650696, "learning_rate": 2.281136978009435e-06, "loss": 0.4018, "step": 120335 }, { "epoch": 4.337045446354561, "grad_norm": 
0.2718494236469269, "learning_rate": 2.2799193049474505e-06, "loss": 0.373, "step": 120340 }, { "epoch": 4.337225646015786, "grad_norm": 0.2340647578239441, "learning_rate": 2.2787019414421627e-06, "loss": 0.364, "step": 120345 }, { "epoch": 4.33740584567701, "grad_norm": 0.1981993019580841, "learning_rate": 2.2774848875101647e-06, "loss": 0.3663, "step": 120350 }, { "epoch": 4.337586045338234, "grad_norm": 0.32858866453170776, "learning_rate": 2.276268143168031e-06, "loss": 0.3422, "step": 120355 }, { "epoch": 4.337766244999459, "grad_norm": 0.24654696881771088, "learning_rate": 2.275051708432349e-06, "loss": 0.3722, "step": 120360 }, { "epoch": 4.337946444660684, "grad_norm": 0.20134994387626648, "learning_rate": 2.2738355833196916e-06, "loss": 0.3416, "step": 120365 }, { "epoch": 4.338126644321909, "grad_norm": 0.23259273171424866, "learning_rate": 2.2726197678466178e-06, "loss": 0.3483, "step": 120370 }, { "epoch": 4.338306843983133, "grad_norm": 0.25836434960365295, "learning_rate": 2.271404262029703e-06, "loss": 0.3657, "step": 120375 }, { "epoch": 4.338487043644358, "grad_norm": 0.2423553764820099, "learning_rate": 2.2701890658854975e-06, "loss": 0.3692, "step": 120380 }, { "epoch": 4.338667243305583, "grad_norm": 0.29900985956192017, "learning_rate": 2.268974179430572e-06, "loss": 0.3787, "step": 120385 }, { "epoch": 4.338847442966808, "grad_norm": 0.2342345416545868, "learning_rate": 2.267759602681477e-06, "loss": 0.4216, "step": 120390 }, { "epoch": 4.339027642628032, "grad_norm": null, "learning_rate": 2.266788164281511e-06, "loss": 0.399, "step": 120395 }, { "epoch": 4.339207842289256, "grad_norm": 0.22955337166786194, "learning_rate": 2.265574145044594e-06, "loss": 0.389, "step": 120400 }, { "epoch": 4.339388041950481, "grad_norm": 0.25099387764930725, "learning_rate": 2.264360435559831e-06, "loss": 0.3777, "step": 120405 }, { "epoch": 4.339568241611706, "grad_norm": 0.23291358351707458, "learning_rate": 2.2631470358437545e-06, "loss": 0.3862, 
"step": 120410 }, { "epoch": 4.33974844127293, "grad_norm": 0.2527669370174408, "learning_rate": 2.2619339459128975e-06, "loss": 0.3817, "step": 120415 }, { "epoch": 4.339928640934155, "grad_norm": 0.28029200434684753, "learning_rate": 2.2607211657837886e-06, "loss": 0.34, "step": 120420 }, { "epoch": 4.34010884059538, "grad_norm": 0.2483205646276474, "learning_rate": 2.2595086954729476e-06, "loss": 0.3732, "step": 120425 }, { "epoch": 4.340289040256605, "grad_norm": 0.26079532504081726, "learning_rate": 2.2582965349969035e-06, "loss": 0.4245, "step": 120430 }, { "epoch": 4.340469239917829, "grad_norm": 0.3403790593147278, "learning_rate": 2.257084684372163e-06, "loss": 0.347, "step": 120435 }, { "epoch": 4.340649439579053, "grad_norm": 0.2807793915271759, "learning_rate": 2.255873143615242e-06, "loss": 0.3802, "step": 120440 }, { "epoch": 4.340829639240278, "grad_norm": 0.2575584948062897, "learning_rate": 2.2546619127426477e-06, "loss": 0.3579, "step": 120445 }, { "epoch": 4.341009838901503, "grad_norm": 0.2251080423593521, "learning_rate": 2.2534509917708753e-06, "loss": 0.3715, "step": 120450 }, { "epoch": 4.341190038562727, "grad_norm": 0.2660796046257019, "learning_rate": 2.2522403807164333e-06, "loss": 0.3834, "step": 120455 }, { "epoch": 4.341370238223952, "grad_norm": 0.277935653924942, "learning_rate": 2.251030079595817e-06, "loss": 0.3487, "step": 120460 }, { "epoch": 4.341550437885177, "grad_norm": 0.24934552609920502, "learning_rate": 2.249820088425503e-06, "loss": 0.3702, "step": 120465 }, { "epoch": 4.3417306375464015, "grad_norm": 0.22956299781799316, "learning_rate": 2.2486104072219905e-06, "loss": 0.3344, "step": 120470 }, { "epoch": 4.341910837207626, "grad_norm": 0.2705623507499695, "learning_rate": 2.247401036001753e-06, "loss": 0.3638, "step": 120475 }, { "epoch": 4.342091036868851, "grad_norm": 0.29147830605506897, "learning_rate": 2.2461919747812753e-06, "loss": 0.3837, "step": 120480 }, { "epoch": 4.342271236530076, "grad_norm": 
0.23217852413654327, "learning_rate": 2.244983223577024e-06, "loss": 0.3905, "step": 120485 }, { "epoch": 4.3424514361913, "grad_norm": 0.22803989052772522, "learning_rate": 2.2437747824054694e-06, "loss": 0.3589, "step": 120490 }, { "epoch": 4.342631635852524, "grad_norm": 0.22938886284828186, "learning_rate": 2.242566651283079e-06, "loss": 0.3986, "step": 120495 }, { "epoch": 4.342811835513749, "grad_norm": 0.2516026198863983, "learning_rate": 2.2413588302263143e-06, "loss": 0.3489, "step": 120500 }, { "epoch": 4.342811835513749, "eval_loss": 0.4290280044078827, "eval_runtime": 3.5287, "eval_samples_per_second": 28.339, "eval_steps_per_second": 7.085, "step": 120500 }, { "epoch": 4.342992035174974, "grad_norm": 0.2596825361251831, "learning_rate": 2.2401513192516267e-06, "loss": 0.3906, "step": 120505 }, { "epoch": 4.3431722348361985, "grad_norm": 0.26794061064720154, "learning_rate": 2.2389441183754745e-06, "loss": 0.4, "step": 120510 }, { "epoch": 4.343352434497423, "grad_norm": 0.2583240568637848, "learning_rate": 2.2377372276143003e-06, "loss": 0.3953, "step": 120515 }, { "epoch": 4.343532634158648, "grad_norm": 0.27646011114120483, "learning_rate": 2.236530646984544e-06, "loss": 0.3915, "step": 120520 }, { "epoch": 4.343712833819873, "grad_norm": 0.2590472400188446, "learning_rate": 2.235324376502654e-06, "loss": 0.334, "step": 120525 }, { "epoch": 4.3438930334810975, "grad_norm": 0.2593589723110199, "learning_rate": 2.234118416185066e-06, "loss": 0.408, "step": 120530 }, { "epoch": 4.344073233142321, "grad_norm": 0.2586502134799957, "learning_rate": 2.2329127660482036e-06, "loss": 0.3995, "step": 120535 }, { "epoch": 4.344253432803546, "grad_norm": null, "learning_rate": 2.2319484692798763e-06, "loss": 0.3567, "step": 120540 }, { "epoch": 4.344433632464771, "grad_norm": 0.2285531461238861, "learning_rate": 2.2307433775097196e-06, "loss": 0.3395, "step": 120545 }, { "epoch": 4.3446138321259955, "grad_norm": 0.28931424021720886, "learning_rate": 
2.229538595966277e-06, "loss": 0.3916, "step": 120550 }, { "epoch": 4.34479403178722, "grad_norm": 0.19079504907131195, "learning_rate": 2.2283341246659623e-06, "loss": 0.3671, "step": 120555 }, { "epoch": 4.344974231448445, "grad_norm": 0.33871787786483765, "learning_rate": 2.2271299636251795e-06, "loss": 0.3929, "step": 120560 }, { "epoch": 4.34515443110967, "grad_norm": 0.2659500241279602, "learning_rate": 2.225926112860349e-06, "loss": 0.3407, "step": 120565 }, { "epoch": 4.345334630770894, "grad_norm": 0.2704176902770996, "learning_rate": 2.2247225723878663e-06, "loss": 0.3852, "step": 120570 }, { "epoch": 4.345514830432119, "grad_norm": 0.27689260244369507, "learning_rate": 2.2235193422241283e-06, "loss": 0.3471, "step": 120575 }, { "epoch": 4.345695030093343, "grad_norm": 0.21617692708969116, "learning_rate": 2.2223164223855287e-06, "loss": 0.36, "step": 120580 }, { "epoch": 4.345875229754568, "grad_norm": 0.32375189661979675, "learning_rate": 2.221113812888459e-06, "loss": 0.3506, "step": 120585 }, { "epoch": 4.3460554294157925, "grad_norm": 0.26374006271362305, "learning_rate": 2.2199115137492983e-06, "loss": 0.3691, "step": 120590 }, { "epoch": 4.346235629077017, "grad_norm": 0.2784136235713959, "learning_rate": 2.2187095249844385e-06, "loss": 0.3435, "step": 120595 }, { "epoch": 4.346415828738242, "grad_norm": 0.23929151892662048, "learning_rate": 2.2175078466102477e-06, "loss": 0.3381, "step": 120600 }, { "epoch": 4.346596028399467, "grad_norm": 0.29352447390556335, "learning_rate": 2.216306478643104e-06, "loss": 0.3898, "step": 120605 }, { "epoch": 4.346776228060691, "grad_norm": 0.2777535021305084, "learning_rate": 2.2151054210993754e-06, "loss": 0.3625, "step": 120610 }, { "epoch": 4.346956427721916, "grad_norm": 0.2904403507709503, "learning_rate": 2.213904673995418e-06, "loss": 0.3525, "step": 120615 }, { "epoch": 4.347136627383141, "grad_norm": 0.31510671973228455, "learning_rate": 2.2127042373476016e-06, "loss": 0.3433, "step": 120620 }, { 
"epoch": 4.347316827044365, "grad_norm": 0.26004233956336975, "learning_rate": 2.211504111172283e-06, "loss": 0.3828, "step": 120625 }, { "epoch": 4.3474970267055895, "grad_norm": 0.2599365711212158, "learning_rate": 2.210304295485799e-06, "loss": 0.4023, "step": 120630 }, { "epoch": 4.347677226366814, "grad_norm": 0.2279340624809265, "learning_rate": 2.2091047903045142e-06, "loss": 0.3625, "step": 120635 }, { "epoch": 4.347857426028039, "grad_norm": 0.2609095871448517, "learning_rate": 2.2079055956447574e-06, "loss": 0.3503, "step": 120640 }, { "epoch": 4.348037625689264, "grad_norm": 0.2672995328903198, "learning_rate": 2.206706711522882e-06, "loss": 0.3367, "step": 120645 }, { "epoch": 4.348217825350488, "grad_norm": 0.2778962552547455, "learning_rate": 2.205508137955212e-06, "loss": 0.3601, "step": 120650 }, { "epoch": 4.348398025011713, "grad_norm": 0.2293660044670105, "learning_rate": 2.204309874958077e-06, "loss": 0.4096, "step": 120655 }, { "epoch": 4.348578224672938, "grad_norm": 0.22069476544857025, "learning_rate": 2.2031119225478096e-06, "loss": 0.3899, "step": 120660 }, { "epoch": 4.348758424334163, "grad_norm": 0.22815968096256256, "learning_rate": 2.201914280740727e-06, "loss": 0.3738, "step": 120665 }, { "epoch": 4.348938623995387, "grad_norm": 0.28191712498664856, "learning_rate": 2.2007169495531527e-06, "loss": 0.3879, "step": 120670 }, { "epoch": 4.349118823656611, "grad_norm": 0.2501116991043091, "learning_rate": 2.1995199290013926e-06, "loss": 0.3743, "step": 120675 }, { "epoch": 4.349299023317836, "grad_norm": 0.3013772666454315, "learning_rate": 2.1983232191017556e-06, "loss": 0.3554, "step": 120680 }, { "epoch": 4.349479222979061, "grad_norm": 0.2710776627063751, "learning_rate": 2.1971268198705543e-06, "loss": 0.3778, "step": 120685 }, { "epoch": 4.349659422640285, "grad_norm": 0.28486597537994385, "learning_rate": 2.195930731324086e-06, "loss": 0.3988, "step": 120690 }, { "epoch": 4.34983962230151, "grad_norm": 0.27696672081947327, 
"learning_rate": 2.1947349534786443e-06, "loss": 0.3684, "step": 120695 }, { "epoch": 4.350019821962735, "grad_norm": 0.2695504426956177, "learning_rate": 2.1935394863505265e-06, "loss": 0.3804, "step": 120700 }, { "epoch": 4.35020002162396, "grad_norm": 0.2949710786342621, "learning_rate": 2.1923443299560113e-06, "loss": 0.3809, "step": 120705 }, { "epoch": 4.350380221285184, "grad_norm": 0.29531538486480713, "learning_rate": 2.1911494843113943e-06, "loss": 0.38, "step": 120710 }, { "epoch": 4.350560420946408, "grad_norm": 0.2712008059024811, "learning_rate": 2.1899549494329513e-06, "loss": 0.358, "step": 120715 }, { "epoch": 4.350740620607633, "grad_norm": 0.3000184893608093, "learning_rate": 2.1887607253369525e-06, "loss": 0.3792, "step": 120720 }, { "epoch": 4.350920820268858, "grad_norm": 0.28387993574142456, "learning_rate": 2.187566812039676e-06, "loss": 0.3642, "step": 120725 }, { "epoch": 4.351101019930082, "grad_norm": 0.21037955582141876, "learning_rate": 2.1863732095573825e-06, "loss": 0.3565, "step": 120730 }, { "epoch": 4.351281219591307, "grad_norm": 0.2745780646800995, "learning_rate": 2.1851799179063326e-06, "loss": 0.3818, "step": 120735 }, { "epoch": 4.351461419252532, "grad_norm": 0.25862976908683777, "learning_rate": 2.1839869371028e-06, "loss": 0.3547, "step": 120740 }, { "epoch": 4.351641618913757, "grad_norm": 0.2394440770149231, "learning_rate": 2.1827942671630184e-06, "loss": 0.3666, "step": 120745 }, { "epoch": 4.351821818574981, "grad_norm": 0.26800236105918884, "learning_rate": 2.181601908103251e-06, "loss": 0.3695, "step": 120750 }, { "epoch": 4.352002018236206, "grad_norm": 0.24717827141284943, "learning_rate": 2.1804098599397418e-06, "loss": 0.3604, "step": 120755 }, { "epoch": 4.352182217897431, "grad_norm": 0.24239477515220642, "learning_rate": 2.179218122688728e-06, "loss": 0.3599, "step": 120760 }, { "epoch": 4.352362417558655, "grad_norm": 0.28817063570022583, "learning_rate": 2.17802669636645e-06, "loss": 0.4087, "step": 120765 
}, { "epoch": 4.352542617219879, "grad_norm": 0.25521016120910645, "learning_rate": 2.1768355809891396e-06, "loss": 0.3246, "step": 120770 }, { "epoch": 4.352722816881104, "grad_norm": 0.23344235122203827, "learning_rate": 2.1756447765730214e-06, "loss": 0.3409, "step": 120775 }, { "epoch": 4.352903016542329, "grad_norm": 0.27485767006874084, "learning_rate": 2.174454283134328e-06, "loss": 0.3866, "step": 120780 }, { "epoch": 4.3530832162035535, "grad_norm": 0.24092566967010498, "learning_rate": 2.1732641006892763e-06, "loss": 0.3861, "step": 120785 }, { "epoch": 4.353263415864778, "grad_norm": 0.23562981188297272, "learning_rate": 2.1720742292540814e-06, "loss": 0.3637, "step": 120790 }, { "epoch": 4.353443615526003, "grad_norm": 0.30748188495635986, "learning_rate": 2.1708846688449553e-06, "loss": 0.3486, "step": 120795 }, { "epoch": 4.353623815187228, "grad_norm": 0.263886421918869, "learning_rate": 2.1696954194781016e-06, "loss": 0.3717, "step": 120800 }, { "epoch": 4.3538040148484525, "grad_norm": 0.22052530944347382, "learning_rate": 2.1685064811697325e-06, "loss": 0.3637, "step": 120805 }, { "epoch": 4.353984214509676, "grad_norm": 0.2120153307914734, "learning_rate": 2.1673178539360488e-06, "loss": 0.3608, "step": 120810 }, { "epoch": 4.354164414170901, "grad_norm": 0.26720255613327026, "learning_rate": 2.1661295377932267e-06, "loss": 0.3854, "step": 120815 }, { "epoch": 4.354344613832126, "grad_norm": 0.22795835137367249, "learning_rate": 2.164941532757475e-06, "loss": 0.3628, "step": 120820 }, { "epoch": 4.3545248134933505, "grad_norm": 0.33458447456359863, "learning_rate": 2.1637538388449674e-06, "loss": 0.3986, "step": 120825 }, { "epoch": 4.354705013154575, "grad_norm": 0.2997223436832428, "learning_rate": 2.1625664560719043e-06, "loss": 0.3833, "step": 120830 }, { "epoch": 4.3548852128158, "grad_norm": 0.2829420864582062, "learning_rate": 2.1613793844544454e-06, "loss": 0.3839, "step": 120835 }, { "epoch": 4.355065412477025, "grad_norm": 
0.2701405882835388, "learning_rate": 2.160192624008767e-06, "loss": 0.3709, "step": 120840 }, { "epoch": 4.3552456121382495, "grad_norm": 0.3344108462333679, "learning_rate": 2.1590061747510465e-06, "loss": 0.3555, "step": 120845 }, { "epoch": 4.355425811799474, "grad_norm": 0.20211008191108704, "learning_rate": 2.1578200366974386e-06, "loss": 0.4124, "step": 120850 }, { "epoch": 4.355606011460699, "grad_norm": 0.25280535221099854, "learning_rate": 2.1566342098641224e-06, "loss": 0.3437, "step": 120855 }, { "epoch": 4.355786211121923, "grad_norm": 0.29722049832344055, "learning_rate": 2.1554486942672367e-06, "loss": 0.37, "step": 120860 }, { "epoch": 4.3559664107831475, "grad_norm": 0.22865262627601624, "learning_rate": 2.1542634899229337e-06, "loss": 0.3594, "step": 120865 }, { "epoch": 4.356146610444372, "grad_norm": 0.2604643702507019, "learning_rate": 2.1530785968473744e-06, "loss": 0.3679, "step": 120870 }, { "epoch": 4.356326810105597, "grad_norm": 0.23937147855758667, "learning_rate": 2.1518940150566937e-06, "loss": 0.3803, "step": 120875 }, { "epoch": 4.356507009766822, "grad_norm": 0.2754795253276825, "learning_rate": 2.1507097445670337e-06, "loss": 0.3809, "step": 120880 }, { "epoch": 4.356687209428046, "grad_norm": 0.2818678617477417, "learning_rate": 2.149525785394532e-06, "loss": 0.3701, "step": 120885 }, { "epoch": 4.356867409089271, "grad_norm": 0.28676289319992065, "learning_rate": 2.148342137555312e-06, "loss": 0.3622, "step": 120890 }, { "epoch": 4.357047608750496, "grad_norm": 0.23003120720386505, "learning_rate": 2.1471588010655103e-06, "loss": 0.3463, "step": 120895 }, { "epoch": 4.35722780841172, "grad_norm": 0.2534288465976715, "learning_rate": 2.145975775941245e-06, "loss": 0.359, "step": 120900 }, { "epoch": 4.3574080080729445, "grad_norm": 0.22617748379707336, "learning_rate": 2.144793062198636e-06, "loss": 0.3883, "step": 120905 }, { "epoch": 4.357588207734169, "grad_norm": 0.23090511560440063, "learning_rate": 2.143610659853795e-06, 
"loss": 0.3662, "step": 120910 }, { "epoch": 4.357768407395394, "grad_norm": 0.26390355825424194, "learning_rate": 2.142428568922836e-06, "loss": 0.4108, "step": 120915 }, { "epoch": 4.357948607056619, "grad_norm": 0.2305891066789627, "learning_rate": 2.141246789421855e-06, "loss": 0.3628, "step": 120920 }, { "epoch": 4.358128806717843, "grad_norm": 0.29624882340431213, "learning_rate": 2.1400653213669715e-06, "loss": 0.4234, "step": 120925 }, { "epoch": 4.358309006379068, "grad_norm": 0.24561457335948944, "learning_rate": 2.138884164774263e-06, "loss": 0.3441, "step": 120930 }, { "epoch": 4.358489206040293, "grad_norm": 0.30524396896362305, "learning_rate": 2.1377033196598367e-06, "loss": 0.3792, "step": 120935 }, { "epoch": 4.358669405701518, "grad_norm": 0.22376736998558044, "learning_rate": 2.1365227860397767e-06, "loss": 0.3578, "step": 120940 }, { "epoch": 4.358849605362742, "grad_norm": 0.2799444794654846, "learning_rate": 2.135342563930165e-06, "loss": 0.396, "step": 120945 }, { "epoch": 4.359029805023966, "grad_norm": 0.26537925004959106, "learning_rate": 2.1341626533470855e-06, "loss": 0.397, "step": 120950 }, { "epoch": 4.359210004685191, "grad_norm": 0.30358922481536865, "learning_rate": 2.1329830543066144e-06, "loss": 0.3729, "step": 120955 }, { "epoch": 4.359390204346416, "grad_norm": 0.25468435883522034, "learning_rate": 2.1318037668248165e-06, "loss": 0.39, "step": 120960 }, { "epoch": 4.35957040400764, "grad_norm": 0.32225528359413147, "learning_rate": 2.1306247909177677e-06, "loss": 0.3589, "step": 120965 }, { "epoch": 4.359750603668865, "grad_norm": 0.28411632776260376, "learning_rate": 2.1294461266015307e-06, "loss": 0.3462, "step": 120970 }, { "epoch": 4.35993080333009, "grad_norm": 0.23393802344799042, "learning_rate": 2.128267773892162e-06, "loss": 0.3878, "step": 120975 }, { "epoch": 4.360111002991315, "grad_norm": 0.27324995398521423, "learning_rate": 2.127089732805715e-06, "loss": 0.4043, "step": 120980 }, { "epoch": 4.360291202652539, 
"grad_norm": 0.24664054811000824, "learning_rate": 2.125912003358238e-06, "loss": 0.3737, "step": 120985 }, { "epoch": 4.360471402313763, "grad_norm": 0.22538726031780243, "learning_rate": 2.124734585565788e-06, "loss": 0.3723, "step": 120990 }, { "epoch": 4.360651601974988, "grad_norm": 0.27115336060523987, "learning_rate": 2.1235574794444046e-06, "loss": 0.3447, "step": 120995 }, { "epoch": 4.360831801636213, "grad_norm": 0.19281277060508728, "learning_rate": 2.1223806850101114e-06, "loss": 0.3592, "step": 121000 }, { "epoch": 4.360831801636213, "eval_loss": 0.42894721031188965, "eval_runtime": 3.5365, "eval_samples_per_second": 28.276, "eval_steps_per_second": 7.069, "step": 121000 }, { "epoch": 4.361012001297437, "grad_norm": 0.24042321741580963, "learning_rate": 2.1212042022789562e-06, "loss": 0.3635, "step": 121005 }, { "epoch": 4.361192200958662, "grad_norm": 0.2706429660320282, "learning_rate": 2.12002803126696e-06, "loss": 0.3986, "step": 121010 }, { "epoch": 4.361372400619887, "grad_norm": 0.21886184811592102, "learning_rate": 2.1188521719901574e-06, "loss": 0.328, "step": 121015 }, { "epoch": 4.361552600281112, "grad_norm": 0.23916944861412048, "learning_rate": 2.117676624464568e-06, "loss": 0.3908, "step": 121020 }, { "epoch": 4.361732799942336, "grad_norm": 0.24652530252933502, "learning_rate": 2.1165013887061963e-06, "loss": 0.3795, "step": 121025 }, { "epoch": 4.361912999603561, "grad_norm": 0.20936556160449982, "learning_rate": 2.1153264647310654e-06, "loss": 0.3649, "step": 121030 }, { "epoch": 4.362093199264786, "grad_norm": 0.2558532953262329, "learning_rate": 2.1141518525551766e-06, "loss": 0.3569, "step": 121035 }, { "epoch": 4.36227339892601, "grad_norm": 0.2516135573387146, "learning_rate": 2.1129775521945476e-06, "loss": 0.3817, "step": 121040 }, { "epoch": 4.362453598587234, "grad_norm": 0.22439998388290405, "learning_rate": 2.111803563665166e-06, "loss": 0.3627, "step": 121045 }, { "epoch": 4.362633798248459, "grad_norm": 
0.2665523886680603, "learning_rate": 2.110629886983023e-06, "loss": 0.3555, "step": 121050 }, { "epoch": 4.362813997909684, "grad_norm": 0.19356732070446014, "learning_rate": 2.109456522164119e-06, "loss": 0.3745, "step": 121055 }, { "epoch": 4.362994197570909, "grad_norm": 0.3014737069606781, "learning_rate": 2.1082834692244424e-06, "loss": 0.3489, "step": 121060 }, { "epoch": 4.363174397232133, "grad_norm": 0.21742402017116547, "learning_rate": 2.1071107281799675e-06, "loss": 0.3669, "step": 121065 }, { "epoch": 4.363354596893358, "grad_norm": 0.23088960349559784, "learning_rate": 2.105938299046678e-06, "loss": 0.3678, "step": 121070 }, { "epoch": 4.363534796554583, "grad_norm": 0.2994691729545593, "learning_rate": 2.1047661818405423e-06, "loss": 0.4163, "step": 121075 }, { "epoch": 4.3637149962158075, "grad_norm": 0.22919879853725433, "learning_rate": 2.103594376577542e-06, "loss": 0.3619, "step": 121080 }, { "epoch": 4.363895195877031, "grad_norm": 0.21188896894454956, "learning_rate": 2.1024228832736332e-06, "loss": 0.3756, "step": 121085 }, { "epoch": 4.364075395538256, "grad_norm": 0.18373993039131165, "learning_rate": 2.1012517019447786e-06, "loss": 0.3687, "step": 121090 }, { "epoch": 4.364255595199481, "grad_norm": 0.27971792221069336, "learning_rate": 2.100080832606935e-06, "loss": 0.4032, "step": 121095 }, { "epoch": 4.3644357948607055, "grad_norm": 0.24079079926013947, "learning_rate": 2.0989102752760587e-06, "loss": 0.3679, "step": 121100 }, { "epoch": 4.36461599452193, "grad_norm": 0.21578475832939148, "learning_rate": 2.0977400299680927e-06, "loss": 0.377, "step": 121105 }, { "epoch": 4.364796194183155, "grad_norm": 0.24278196692466736, "learning_rate": 2.096570096698991e-06, "loss": 0.3817, "step": 121110 }, { "epoch": 4.36497639384438, "grad_norm": 0.2502589523792267, "learning_rate": 2.0954004754846795e-06, "loss": 0.3749, "step": 121115 }, { "epoch": 4.3651565935056045, "grad_norm": 0.24439960718154907, "learning_rate": 2.0942311663411095e-06, 
"loss": 0.3807, "step": 121120 }, { "epoch": 4.365336793166829, "grad_norm": 0.2953266501426697, "learning_rate": 2.093062169284202e-06, "loss": 0.3785, "step": 121125 }, { "epoch": 4.365516992828054, "grad_norm": 0.21381710469722748, "learning_rate": 2.091893484329882e-06, "loss": 0.3554, "step": 121130 }, { "epoch": 4.365697192489278, "grad_norm": 0.2584659457206726, "learning_rate": 2.0907251114940907e-06, "loss": 0.3765, "step": 121135 }, { "epoch": 4.3658773921505025, "grad_norm": 0.205842986702919, "learning_rate": 2.089557050792726e-06, "loss": 0.364, "step": 121140 }, { "epoch": 4.366057591811727, "grad_norm": 0.22627507150173187, "learning_rate": 2.088389302241708e-06, "loss": 0.3628, "step": 121145 }, { "epoch": 4.366237791472952, "grad_norm": 0.22469063103199005, "learning_rate": 2.087221865856956e-06, "loss": 0.328, "step": 121150 }, { "epoch": 4.366417991134177, "grad_norm": 0.24385787546634674, "learning_rate": 2.0860547416543697e-06, "loss": 0.3852, "step": 121155 }, { "epoch": 4.3665981907954015, "grad_norm": 0.2274542599916458, "learning_rate": 2.084887929649848e-06, "loss": 0.3781, "step": 121160 }, { "epoch": 4.366778390456626, "grad_norm": 0.31590333580970764, "learning_rate": 2.0837214298592948e-06, "loss": 0.3794, "step": 121165 }, { "epoch": 4.366958590117851, "grad_norm": 0.24526174366474152, "learning_rate": 2.082555242298595e-06, "loss": 0.3573, "step": 121170 }, { "epoch": 4.367138789779075, "grad_norm": 0.3067219853401184, "learning_rate": 2.0813893669836495e-06, "loss": 0.388, "step": 121175 }, { "epoch": 4.3673189894402995, "grad_norm": 0.21108821034431458, "learning_rate": 2.080223803930334e-06, "loss": 0.3336, "step": 121180 }, { "epoch": 4.367499189101524, "grad_norm": 0.312544584274292, "learning_rate": 2.0790585531545336e-06, "loss": 0.3726, "step": 121185 }, { "epoch": 4.367679388762749, "grad_norm": 0.2698698937892914, "learning_rate": 2.077893614672122e-06, "loss": 0.3531, "step": 121190 }, { "epoch": 4.367859588423974, 
"grad_norm": 0.2712307274341583, "learning_rate": 2.076728988498966e-06, "loss": 0.352, "step": 121195 }, { "epoch": 4.368039788085198, "grad_norm": 0.2807067036628723, "learning_rate": 2.0755646746509453e-06, "loss": 0.3675, "step": 121200 }, { "epoch": 4.368219987746423, "grad_norm": 0.23370806872844696, "learning_rate": 2.0744006731439225e-06, "loss": 0.3685, "step": 121205 }, { "epoch": 4.368400187407648, "grad_norm": 0.30535852909088135, "learning_rate": 2.0732369839937428e-06, "loss": 0.3524, "step": 121210 }, { "epoch": 4.368580387068873, "grad_norm": 0.21354763209819794, "learning_rate": 2.072073607216274e-06, "loss": 0.3562, "step": 121215 }, { "epoch": 4.368760586730097, "grad_norm": 0.23636168241500854, "learning_rate": 2.0709105428273647e-06, "loss": 0.3307, "step": 121220 }, { "epoch": 4.368940786391321, "grad_norm": 0.22024032473564148, "learning_rate": 2.0697477908428602e-06, "loss": 0.3996, "step": 121225 }, { "epoch": 4.369120986052546, "grad_norm": 0.19595114886760712, "learning_rate": 2.0685853512786013e-06, "loss": 0.3299, "step": 121230 }, { "epoch": 4.369301185713771, "grad_norm": 0.28025031089782715, "learning_rate": 2.0674232241504223e-06, "loss": 0.4299, "step": 121235 }, { "epoch": 4.369481385374995, "grad_norm": 0.2547728419303894, "learning_rate": 2.0662614094741683e-06, "loss": 0.3679, "step": 121240 }, { "epoch": 4.36966158503622, "grad_norm": 0.22356577217578888, "learning_rate": 2.0650999072656606e-06, "loss": 0.3665, "step": 121245 }, { "epoch": 4.369841784697445, "grad_norm": 0.2514247000217438, "learning_rate": 2.063938717540728e-06, "loss": 0.3732, "step": 121250 }, { "epoch": 4.37002198435867, "grad_norm": 0.2653522193431854, "learning_rate": 2.0627778403151886e-06, "loss": 0.364, "step": 121255 }, { "epoch": 4.370202184019894, "grad_norm": 0.28731030225753784, "learning_rate": 2.0616172756048573e-06, "loss": 0.3676, "step": 121260 }, { "epoch": 4.370382383681119, "grad_norm": 0.2413557469844818, "learning_rate": 
2.060457023425552e-06, "loss": 0.3525, "step": 121265 }, { "epoch": 4.370562583342343, "grad_norm": 0.27309590578079224, "learning_rate": 2.0592970837930825e-06, "loss": 0.3655, "step": 121270 }, { "epoch": 4.370742783003568, "grad_norm": 0.2248447686433792, "learning_rate": 2.0581374567232443e-06, "loss": 0.3994, "step": 121275 }, { "epoch": 4.370922982664792, "grad_norm": 0.1932300329208374, "learning_rate": 2.056978142231844e-06, "loss": 0.3831, "step": 121280 }, { "epoch": 4.371103182326017, "grad_norm": 0.22570841014385223, "learning_rate": 2.055819140334675e-06, "loss": 0.3929, "step": 121285 }, { "epoch": 4.371283381987242, "grad_norm": 0.2672460973262787, "learning_rate": 2.0546604510475213e-06, "loss": 0.3756, "step": 121290 }, { "epoch": 4.371463581648467, "grad_norm": 0.2674674689769745, "learning_rate": 2.0535020743861822e-06, "loss": 0.3614, "step": 121295 }, { "epoch": 4.371643781309691, "grad_norm": 0.2942127585411072, "learning_rate": 2.0523440103664358e-06, "loss": 0.3904, "step": 121300 }, { "epoch": 4.371823980970916, "grad_norm": 0.210496187210083, "learning_rate": 2.0511862590040566e-06, "loss": 0.3949, "step": 121305 }, { "epoch": 4.372004180632141, "grad_norm": 0.2888723909854889, "learning_rate": 2.0500288203148254e-06, "loss": 0.3525, "step": 121310 }, { "epoch": 4.372184380293365, "grad_norm": 0.2629977762699127, "learning_rate": 2.0488716943145025e-06, "loss": 0.3809, "step": 121315 }, { "epoch": 4.372364579954589, "grad_norm": 0.2915158271789551, "learning_rate": 2.047714881018867e-06, "loss": 0.3755, "step": 121320 }, { "epoch": 4.372544779615814, "grad_norm": 0.2206728309392929, "learning_rate": 2.0465583804436668e-06, "loss": 0.3814, "step": 121325 }, { "epoch": 4.372724979277039, "grad_norm": 0.27409932017326355, "learning_rate": 2.0454021926046645e-06, "loss": 0.3721, "step": 121330 }, { "epoch": 4.372905178938264, "grad_norm": 0.2713205814361572, "learning_rate": 2.0442463175176147e-06, "loss": 0.3677, "step": 121335 }, { "epoch": 
4.373085378599488, "grad_norm": 0.2275678962469101, "learning_rate": 2.0430907551982654e-06, "loss": 0.365, "step": 121340 }, { "epoch": 4.373265578260713, "grad_norm": 0.2382095456123352, "learning_rate": 2.0419355056623597e-06, "loss": 0.3718, "step": 121345 }, { "epoch": 4.373445777921938, "grad_norm": 0.207700714468956, "learning_rate": 2.0407805689256374e-06, "loss": 0.3636, "step": 121350 }, { "epoch": 4.3736259775831625, "grad_norm": 0.2512191832065582, "learning_rate": 2.0396259450038315e-06, "loss": 0.3407, "step": 121355 }, { "epoch": 4.373806177244386, "grad_norm": 0.24360640347003937, "learning_rate": 2.038471633912678e-06, "loss": 0.3854, "step": 121360 }, { "epoch": 4.373986376905611, "grad_norm": 0.2918420433998108, "learning_rate": 2.0373176356679067e-06, "loss": 0.3846, "step": 121365 }, { "epoch": 4.374166576566836, "grad_norm": 0.21557246148586273, "learning_rate": 2.0361639502852358e-06, "loss": 0.3759, "step": 121370 }, { "epoch": 4.374346776228061, "grad_norm": 0.25849926471710205, "learning_rate": 2.0350105777803856e-06, "loss": 0.3576, "step": 121375 }, { "epoch": 4.374526975889285, "grad_norm": 0.19965314865112305, "learning_rate": 2.033857518169066e-06, "loss": 0.3659, "step": 121380 }, { "epoch": 4.37470717555051, "grad_norm": 0.2662714421749115, "learning_rate": 2.032704771466995e-06, "loss": 0.3814, "step": 121385 }, { "epoch": 4.374887375211735, "grad_norm": 0.2764718532562256, "learning_rate": 2.0315523376898765e-06, "loss": 0.3941, "step": 121390 }, { "epoch": 4.3750675748729595, "grad_norm": 0.2914084494113922, "learning_rate": 2.0304002168534065e-06, "loss": 0.3853, "step": 121395 }, { "epoch": 4.375247774534184, "grad_norm": 0.29716944694519043, "learning_rate": 2.029248408973289e-06, "loss": 0.3849, "step": 121400 }, { "epoch": 4.375427974195409, "grad_norm": 0.21052998304367065, "learning_rate": 2.028096914065214e-06, "loss": 0.3677, "step": 121405 }, { "epoch": 4.375608173856633, "grad_norm": 0.20469604432582855, 
"learning_rate": 2.026945732144872e-06, "loss": 0.3207, "step": 121410 }, { "epoch": 4.3757883735178575, "grad_norm": 0.24874727427959442, "learning_rate": 2.025794863227945e-06, "loss": 0.3688, "step": 121415 }, { "epoch": 4.375968573179082, "grad_norm": 0.2315155416727066, "learning_rate": 2.024644307330112e-06, "loss": 0.3849, "step": 121420 }, { "epoch": 4.376148772840307, "grad_norm": 0.27341365814208984, "learning_rate": 2.0234940644670576e-06, "loss": 0.3887, "step": 121425 }, { "epoch": 4.376328972501532, "grad_norm": 0.2433520257472992, "learning_rate": 2.0223441346544468e-06, "loss": 0.3943, "step": 121430 }, { "epoch": 4.3765091721627565, "grad_norm": 0.24576754868030548, "learning_rate": 2.0211945179079483e-06, "loss": 0.371, "step": 121435 }, { "epoch": 4.376689371823981, "grad_norm": 0.19979894161224365, "learning_rate": 2.020045214243227e-06, "loss": 0.3891, "step": 121440 }, { "epoch": 4.376869571485206, "grad_norm": 0.27705785632133484, "learning_rate": 2.0188962236759342e-06, "loss": 0.3707, "step": 121445 }, { "epoch": 4.37704977114643, "grad_norm": 0.26884958148002625, "learning_rate": 2.0177475462217376e-06, "loss": 0.3791, "step": 121450 }, { "epoch": 4.3772299708076545, "grad_norm": 0.26522353291511536, "learning_rate": 2.0165991818962784e-06, "loss": 0.3957, "step": 121455 }, { "epoch": 4.377410170468879, "grad_norm": 0.2895340323448181, "learning_rate": 2.0154511307152066e-06, "loss": 0.3571, "step": 121460 }, { "epoch": 4.377590370130104, "grad_norm": 0.23953476548194885, "learning_rate": 2.014303392694164e-06, "loss": 0.361, "step": 121465 }, { "epoch": 4.377770569791329, "grad_norm": 0.26138606667518616, "learning_rate": 2.01315596784879e-06, "loss": 0.3653, "step": 121470 }, { "epoch": 4.3779507694525535, "grad_norm": 0.2306900918483734, "learning_rate": 2.0120088561947082e-06, "loss": 0.3363, "step": 121475 }, { "epoch": 4.378130969113778, "grad_norm": 0.24305300414562225, "learning_rate": 2.010862057747559e-06, "loss": 0.394, "step": 
121480 }, { "epoch": 4.378311168775003, "grad_norm": 0.2668958604335785, "learning_rate": 2.0097155725229634e-06, "loss": 0.3817, "step": 121485 }, { "epoch": 4.378491368436228, "grad_norm": 0.23452064394950867, "learning_rate": 2.0085694005365446e-06, "loss": 0.361, "step": 121490 }, { "epoch": 4.378671568097452, "grad_norm": 0.2706201672554016, "learning_rate": 2.007423541803913e-06, "loss": 0.3506, "step": 121495 }, { "epoch": 4.378851767758676, "grad_norm": 0.2533268332481384, "learning_rate": 2.006277996340683e-06, "loss": 0.3454, "step": 121500 }, { "epoch": 4.378851767758676, "eval_loss": 0.42894676327705383, "eval_runtime": 3.5453, "eval_samples_per_second": 28.206, "eval_steps_per_second": 7.052, "step": 121500 }, { "epoch": 4.379031967419901, "grad_norm": 0.23799777030944824, "learning_rate": 2.0051327641624706e-06, "loss": 0.3548, "step": 121505 }, { "epoch": 4.379212167081126, "grad_norm": 0.302617609500885, "learning_rate": 2.0039878452848687e-06, "loss": 0.3609, "step": 121510 }, { "epoch": 4.37939236674235, "grad_norm": 0.21825525164604187, "learning_rate": 2.0028432397234758e-06, "loss": 0.3464, "step": 121515 }, { "epoch": 4.379572566403575, "grad_norm": 0.26662659645080566, "learning_rate": 2.0016989474938934e-06, "loss": 0.3305, "step": 121520 }, { "epoch": 4.3797527660648, "grad_norm": 0.31008052825927734, "learning_rate": 2.000554968611712e-06, "loss": 0.3879, "step": 121525 }, { "epoch": 4.379932965726025, "grad_norm": 0.2644076943397522, "learning_rate": 1.9994113030925188e-06, "loss": 0.3359, "step": 121530 }, { "epoch": 4.380113165387249, "grad_norm": 0.25222399830818176, "learning_rate": 1.9982679509518905e-06, "loss": 0.3419, "step": 121535 }, { "epoch": 4.380293365048474, "grad_norm": 0.2816098928451538, "learning_rate": 1.997124912205403e-06, "loss": 0.3983, "step": 121540 }, { "epoch": 4.380473564709698, "grad_norm": 0.5892481803894043, "learning_rate": 1.9959821868686416e-06, "loss": 0.3839, "step": 121545 }, { "epoch": 
4.380653764370923, "grad_norm": 0.2820974886417389, "learning_rate": 1.994839774957169e-06, "loss": 0.3707, "step": 121550 }, { "epoch": 4.380833964032147, "grad_norm": 0.24034863710403442, "learning_rate": 1.9936976764865502e-06, "loss": 0.3653, "step": 121555 }, { "epoch": 4.381014163693372, "grad_norm": 0.2928817570209503, "learning_rate": 1.9925558914723475e-06, "loss": 0.3839, "step": 121560 }, { "epoch": 4.381194363354597, "grad_norm": 0.2788287401199341, "learning_rate": 1.99141441993011e-06, "loss": 0.3397, "step": 121565 }, { "epoch": 4.381374563015822, "grad_norm": 0.2363235503435135, "learning_rate": 1.9902732618753998e-06, "loss": 0.3395, "step": 121570 }, { "epoch": 4.381554762677046, "grad_norm": 0.2198476344347, "learning_rate": 1.989132417323769e-06, "loss": 0.3649, "step": 121575 }, { "epoch": 4.381734962338271, "grad_norm": 0.2580127716064453, "learning_rate": 1.987991886290744e-06, "loss": 0.3786, "step": 121580 }, { "epoch": 4.381915161999496, "grad_norm": 0.225581094622612, "learning_rate": 1.986851668791878e-06, "loss": 0.3706, "step": 121585 }, { "epoch": 4.38209536166072, "grad_norm": 0.22968202829360962, "learning_rate": 1.9857117648427044e-06, "loss": 0.3682, "step": 121590 }, { "epoch": 4.382275561321944, "grad_norm": 0.26144370436668396, "learning_rate": 1.984572174458746e-06, "loss": 0.3679, "step": 121595 }, { "epoch": 4.382455760983169, "grad_norm": 0.2481224089860916, "learning_rate": 1.983432897655546e-06, "loss": 0.3604, "step": 121600 }, { "epoch": 4.382635960644394, "grad_norm": 0.24234378337860107, "learning_rate": 1.9822939344486063e-06, "loss": 0.389, "step": 121605 }, { "epoch": 4.382816160305619, "grad_norm": 0.27252396941185, "learning_rate": 1.9811552848534587e-06, "loss": 0.3839, "step": 121610 }, { "epoch": 4.382996359966843, "grad_norm": 0.30419376492500305, "learning_rate": 1.980016948885616e-06, "loss": 0.3901, "step": 121615 }, { "epoch": 4.383176559628068, "grad_norm": 0.3380153477191925, "learning_rate": 
1.9788789265605844e-06, "loss": 0.3617, "step": 121620 }, { "epoch": 4.383356759289293, "grad_norm": 0.23734219372272491, "learning_rate": 1.9777412178938722e-06, "loss": 0.3463, "step": 121625 }, { "epoch": 4.3835369589505175, "grad_norm": 0.2698742747306824, "learning_rate": 1.976603822900974e-06, "loss": 0.3741, "step": 121630 }, { "epoch": 4.383717158611741, "grad_norm": 0.2732264995574951, "learning_rate": 1.9754667415973953e-06, "loss": 0.3122, "step": 121635 }, { "epoch": 4.383897358272966, "grad_norm": 0.2620375156402588, "learning_rate": 1.974329973998623e-06, "loss": 0.3475, "step": 121640 }, { "epoch": 4.384077557934191, "grad_norm": 0.26369425654411316, "learning_rate": 1.973193520120148e-06, "loss": 0.3816, "step": 121645 }, { "epoch": 4.384257757595416, "grad_norm": 0.26098334789276123, "learning_rate": 1.9720573799774542e-06, "loss": 0.3592, "step": 121650 }, { "epoch": 4.38443795725664, "grad_norm": 0.20638631284236908, "learning_rate": 1.970921553586019e-06, "loss": 0.3724, "step": 121655 }, { "epoch": 4.384618156917865, "grad_norm": 0.24286890029907227, "learning_rate": 1.9697860409613133e-06, "loss": 0.3677, "step": 121660 }, { "epoch": 4.38479835657909, "grad_norm": 0.2833465039730072, "learning_rate": 1.9686508421188214e-06, "loss": 0.4158, "step": 121665 }, { "epoch": 4.3849785562403145, "grad_norm": 0.2680220901966095, "learning_rate": 1.967515957074001e-06, "loss": 0.3728, "step": 121670 }, { "epoch": 4.385158755901539, "grad_norm": 0.26703187823295593, "learning_rate": 1.9663813858423167e-06, "loss": 0.3861, "step": 121675 }, { "epoch": 4.385338955562764, "grad_norm": 0.2260323315858841, "learning_rate": 1.9652471284392265e-06, "loss": 0.3477, "step": 121680 }, { "epoch": 4.385519155223988, "grad_norm": 0.23601694405078888, "learning_rate": 1.9641131848801785e-06, "loss": 0.4099, "step": 121685 }, { "epoch": 4.385699354885213, "grad_norm": 0.26613062620162964, "learning_rate": 1.9629795551806414e-06, "loss": 0.3624, "step": 121690 }, { 
"epoch": 4.385879554546437, "grad_norm": 0.23017358779907227, "learning_rate": 1.961846239356038e-06, "loss": 0.3831, "step": 121695 }, { "epoch": 4.386059754207662, "grad_norm": 0.3070394992828369, "learning_rate": 1.9607132374218184e-06, "loss": 0.349, "step": 121700 }, { "epoch": 4.386239953868887, "grad_norm": 0.22875505685806274, "learning_rate": 1.959580549393425e-06, "loss": 0.3717, "step": 121705 }, { "epoch": 4.3864201535301115, "grad_norm": 0.3101567327976227, "learning_rate": 1.9584481752862816e-06, "loss": 0.3494, "step": 121710 }, { "epoch": 4.386600353191336, "grad_norm": 0.2821701467037201, "learning_rate": 1.9573161151158288e-06, "loss": 0.3666, "step": 121715 }, { "epoch": 4.386780552852561, "grad_norm": 0.2166757434606552, "learning_rate": 1.9561843688974764e-06, "loss": 0.4189, "step": 121720 }, { "epoch": 4.386960752513785, "grad_norm": 0.24579738080501556, "learning_rate": 1.955052936646648e-06, "loss": 0.3535, "step": 121725 }, { "epoch": 4.3871409521750095, "grad_norm": 0.2603980600833893, "learning_rate": 1.953921818378768e-06, "loss": 0.3511, "step": 121730 }, { "epoch": 4.387321151836234, "grad_norm": 0.23183012008666992, "learning_rate": 1.952791014109237e-06, "loss": 0.3724, "step": 121735 }, { "epoch": 4.387501351497459, "grad_norm": 0.1970612108707428, "learning_rate": 1.9516605238534686e-06, "loss": 0.3725, "step": 121740 }, { "epoch": 4.387681551158684, "grad_norm": 0.283931702375412, "learning_rate": 1.950530347626864e-06, "loss": 0.371, "step": 121745 }, { "epoch": 4.3878617508199085, "grad_norm": 0.32697275280952454, "learning_rate": 1.9494004854448163e-06, "loss": 0.3709, "step": 121750 }, { "epoch": 4.388041950481133, "grad_norm": 0.2545589506626129, "learning_rate": 1.948270937322727e-06, "loss": 0.3777, "step": 121755 }, { "epoch": 4.388222150142358, "grad_norm": 0.2229977399110794, "learning_rate": 1.947141703275987e-06, "loss": 0.3423, "step": 121760 }, { "epoch": 4.388402349803583, "grad_norm": 0.2144192010164261, 
"learning_rate": 1.9460127833199756e-06, "loss": 0.3872, "step": 121765 }, { "epoch": 4.388582549464807, "grad_norm": 0.25160834193229675, "learning_rate": 1.9448841774700744e-06, "loss": 0.4037, "step": 121770 }, { "epoch": 4.388762749126031, "grad_norm": 0.2952007055282593, "learning_rate": 1.943755885741666e-06, "loss": 0.3825, "step": 121775 }, { "epoch": 4.388942948787256, "grad_norm": 0.25669974088668823, "learning_rate": 1.942627908150113e-06, "loss": 0.3635, "step": 121780 }, { "epoch": 4.389123148448481, "grad_norm": 0.28366559743881226, "learning_rate": 1.9415002447108004e-06, "loss": 0.3789, "step": 121785 }, { "epoch": 4.3893033481097055, "grad_norm": 0.23855380713939667, "learning_rate": 1.9403728954390745e-06, "loss": 0.4061, "step": 121790 }, { "epoch": 4.38948354777093, "grad_norm": 0.2798866033554077, "learning_rate": 1.939245860350308e-06, "loss": 0.3814, "step": 121795 }, { "epoch": 4.389663747432155, "grad_norm": 0.2672715485095978, "learning_rate": 1.9381191394598514e-06, "loss": 0.3589, "step": 121800 }, { "epoch": 4.38984394709338, "grad_norm": 0.22907915711402893, "learning_rate": 1.936992732783058e-06, "loss": 0.3436, "step": 121805 }, { "epoch": 4.390024146754604, "grad_norm": 0.3034953474998474, "learning_rate": 1.935866640335271e-06, "loss": 0.3956, "step": 121810 }, { "epoch": 4.390204346415829, "grad_norm": 0.21499215066432953, "learning_rate": 1.934740862131837e-06, "loss": 0.3986, "step": 121815 }, { "epoch": 4.390384546077053, "grad_norm": 0.2739037275314331, "learning_rate": 1.9336153981880906e-06, "loss": 0.3885, "step": 121820 }, { "epoch": 4.390564745738278, "grad_norm": 0.21741308271884918, "learning_rate": 1.9324902485193696e-06, "loss": 0.3895, "step": 121825 }, { "epoch": 4.390744945399502, "grad_norm": 0.24937617778778076, "learning_rate": 1.9313654131410035e-06, "loss": 0.3741, "step": 121830 }, { "epoch": 4.390925145060727, "grad_norm": 0.2421903759241104, "learning_rate": 1.930240892068319e-06, "loss": 0.3458, "step": 
121835 }, { "epoch": 4.391105344721952, "grad_norm": 0.3107787072658539, "learning_rate": 1.929116685316634e-06, "loss": 0.3806, "step": 121840 }, { "epoch": 4.391285544383177, "grad_norm": 0.23685042560100555, "learning_rate": 1.9279927929012648e-06, "loss": 0.3384, "step": 121845 }, { "epoch": 4.391465744044401, "grad_norm": 0.27006104588508606, "learning_rate": 1.926869214837532e-06, "loss": 0.3696, "step": 121850 }, { "epoch": 4.391645943705626, "grad_norm": 0.23979541659355164, "learning_rate": 1.9257459511407366e-06, "loss": 0.3836, "step": 121855 }, { "epoch": 4.391826143366851, "grad_norm": 0.2638680040836334, "learning_rate": 1.9246230018261867e-06, "loss": 0.3689, "step": 121860 }, { "epoch": 4.392006343028076, "grad_norm": 0.23276658356189728, "learning_rate": 1.923500366909181e-06, "loss": 0.3862, "step": 121865 }, { "epoch": 4.392186542689299, "grad_norm": 0.3164710998535156, "learning_rate": 1.9223780464050103e-06, "loss": 0.3862, "step": 121870 }, { "epoch": 4.392366742350524, "grad_norm": 0.2991268038749695, "learning_rate": 1.9212560403289752e-06, "loss": 0.4063, "step": 121875 }, { "epoch": 4.392546942011749, "grad_norm": 0.2558140456676483, "learning_rate": 1.9201343486963646e-06, "loss": 0.3915, "step": 121880 }, { "epoch": 4.392727141672974, "grad_norm": 0.2522118389606476, "learning_rate": 1.9190129715224435e-06, "loss": 0.3721, "step": 121885 }, { "epoch": 4.392907341334198, "grad_norm": 0.22641310095787048, "learning_rate": 1.9178919088225086e-06, "loss": 0.3486, "step": 121890 }, { "epoch": 4.393087540995423, "grad_norm": 0.19269070029258728, "learning_rate": 1.9167711606118216e-06, "loss": 0.3699, "step": 121895 }, { "epoch": 4.393267740656648, "grad_norm": 0.24431343376636505, "learning_rate": 1.9156507269056683e-06, "loss": 0.3885, "step": 121900 }, { "epoch": 4.393447940317873, "grad_norm": 0.25327545404434204, "learning_rate": 1.9145306077192976e-06, "loss": 0.382, "step": 121905 }, { "epoch": 4.393628139979096, "grad_norm": 
0.24400508403778076, "learning_rate": 1.9134108030679774e-06, "loss": 0.3645, "step": 121910 }, { "epoch": 4.393808339640321, "grad_norm": 0.23288530111312866, "learning_rate": 1.9122913129669657e-06, "loss": 0.3578, "step": 121915 }, { "epoch": 4.393988539301546, "grad_norm": 0.2210901528596878, "learning_rate": 1.911172137431519e-06, "loss": 0.3833, "step": 121920 }, { "epoch": 4.394168738962771, "grad_norm": 0.2453107088804245, "learning_rate": 1.910053276476878e-06, "loss": 0.3617, "step": 121925 }, { "epoch": 4.394348938623995, "grad_norm": 0.2022264003753662, "learning_rate": 1.9089347301182945e-06, "loss": 0.3762, "step": 121930 }, { "epoch": 4.39452913828522, "grad_norm": 0.23200011253356934, "learning_rate": 1.9078164983709984e-06, "loss": 0.3415, "step": 121935 }, { "epoch": 4.394709337946445, "grad_norm": 0.24369412660598755, "learning_rate": 1.9066985812502353e-06, "loss": 0.3742, "step": 121940 }, { "epoch": 4.3948895376076695, "grad_norm": 0.27759307622909546, "learning_rate": 1.9055809787712348e-06, "loss": 0.3759, "step": 121945 }, { "epoch": 4.395069737268894, "grad_norm": 0.28923094272613525, "learning_rate": 1.9044636909492208e-06, "loss": 0.3658, "step": 121950 }, { "epoch": 4.395249936930119, "grad_norm": 0.3017236888408661, "learning_rate": 1.9033467177994174e-06, "loss": 0.3987, "step": 121955 }, { "epoch": 4.395430136591343, "grad_norm": 0.31043195724487305, "learning_rate": 1.902230059337043e-06, "loss": 0.3689, "step": 121960 }, { "epoch": 4.395610336252568, "grad_norm": 0.25115767121315, "learning_rate": 1.9011137155773101e-06, "loss": 0.3732, "step": 121965 }, { "epoch": 4.395790535913792, "grad_norm": 0.23707129061222076, "learning_rate": 1.8999976865354374e-06, "loss": 0.37, "step": 121970 }, { "epoch": 4.395970735575017, "grad_norm": 0.2299506962299347, "learning_rate": 1.8988819722266154e-06, "loss": 0.3943, "step": 121975 }, { "epoch": 4.396150935236242, "grad_norm": 0.2858457565307617, "learning_rate": 1.8977665726660598e-06, 
"loss": 0.3703, "step": 121980 }, { "epoch": 4.3963311348974665, "grad_norm": 0.2686900198459625, "learning_rate": 1.8966514878689612e-06, "loss": 0.3801, "step": 121985 }, { "epoch": 4.396511334558691, "grad_norm": 0.25534695386886597, "learning_rate": 1.8955367178505073e-06, "loss": 0.3953, "step": 121990 }, { "epoch": 4.396691534219916, "grad_norm": 0.23531198501586914, "learning_rate": 1.8944222626259028e-06, "loss": 0.3857, "step": 121995 }, { "epoch": 4.39687173388114, "grad_norm": 0.23012003302574158, "learning_rate": 1.8933081222103189e-06, "loss": 0.4072, "step": 122000 }, { "epoch": 4.39687173388114, "eval_loss": 0.42878466844558716, "eval_runtime": 3.5304, "eval_samples_per_second": 28.325, "eval_steps_per_second": 7.081, "step": 122000 }, { "epoch": 4.397051933542365, "grad_norm": 0.2736528515815735, "learning_rate": 1.8921942966189322e-06, "loss": 0.4175, "step": 122005 }, { "epoch": 4.397232133203589, "grad_norm": 0.31495052576065063, "learning_rate": 1.8910807858669277e-06, "loss": 0.326, "step": 122010 }, { "epoch": 4.397412332864814, "grad_norm": 0.27840882539749146, "learning_rate": 1.889967589969474e-06, "loss": 0.3971, "step": 122015 }, { "epoch": 4.397592532526039, "grad_norm": 0.21493427455425262, "learning_rate": 1.8888547089417368e-06, "loss": 0.3331, "step": 122020 }, { "epoch": 4.3977727321872635, "grad_norm": 0.259475976228714, "learning_rate": 1.8877421427988818e-06, "loss": 0.3773, "step": 122025 }, { "epoch": 4.397952931848488, "grad_norm": 0.25516223907470703, "learning_rate": 1.8866298915560604e-06, "loss": 0.3608, "step": 122030 }, { "epoch": 4.398133131509713, "grad_norm": 0.25606194138526917, "learning_rate": 1.8855179552284357e-06, "loss": 0.3888, "step": 122035 }, { "epoch": 4.398313331170938, "grad_norm": 0.260857492685318, "learning_rate": 1.8844063338311542e-06, "loss": 0.3505, "step": 122040 }, { "epoch": 4.398493530832162, "grad_norm": 0.22685441374778748, "learning_rate": 1.8832950273793586e-06, "loss": 0.3711, "step": 
122045 }, { "epoch": 4.398673730493386, "grad_norm": 0.24304035305976868, "learning_rate": 1.882184035888196e-06, "loss": 0.3749, "step": 122050 }, { "epoch": 4.398853930154611, "grad_norm": 0.31736481189727783, "learning_rate": 1.8810733593727952e-06, "loss": 0.352, "step": 122055 }, { "epoch": 4.399034129815836, "grad_norm": 0.190985307097435, "learning_rate": 1.8799629978482975e-06, "loss": 0.3598, "step": 122060 }, { "epoch": 4.3992143294770605, "grad_norm": 0.26111340522766113, "learning_rate": 1.878852951329832e-06, "loss": 0.3519, "step": 122065 }, { "epoch": 4.399394529138285, "grad_norm": 0.20273645222187042, "learning_rate": 1.877743219832509e-06, "loss": 0.3546, "step": 122070 }, { "epoch": 4.39957472879951, "grad_norm": 0.21357549726963043, "learning_rate": 1.8766338033714636e-06, "loss": 0.3868, "step": 122075 }, { "epoch": 4.399754928460735, "grad_norm": 0.2500903010368347, "learning_rate": 1.8755247019618034e-06, "loss": 0.3884, "step": 122080 }, { "epoch": 4.399935128121959, "grad_norm": 0.23067040741443634, "learning_rate": 1.8744159156186492e-06, "loss": 0.3731, "step": 122085 }, { "epoch": 4.400115327783184, "grad_norm": 0.23591923713684082, "learning_rate": 1.8733074443570947e-06, "loss": 0.3813, "step": 122090 }, { "epoch": 4.400295527444408, "grad_norm": 0.251615434885025, "learning_rate": 1.8721992881922473e-06, "loss": 0.3614, "step": 122095 }, { "epoch": 4.400475727105633, "grad_norm": 0.2689308524131775, "learning_rate": 1.8710914471392088e-06, "loss": 0.4076, "step": 122100 }, { "epoch": 4.4006559267668575, "grad_norm": 0.2680433392524719, "learning_rate": 1.8699839212130727e-06, "loss": 0.3672, "step": 122105 }, { "epoch": 4.400836126428082, "grad_norm": 0.2669503092765808, "learning_rate": 1.8688767104289267e-06, "loss": 0.3589, "step": 122110 }, { "epoch": 4.401016326089307, "grad_norm": 0.23472002148628235, "learning_rate": 1.8677698148018591e-06, "loss": 0.3815, "step": 122115 }, { "epoch": 4.401196525750532, "grad_norm": 
0.2634426951408386, "learning_rate": 1.8666632343469437e-06, "loss": 0.3885, "step": 122120 }, { "epoch": 4.401376725411756, "grad_norm": 0.22323672473430634, "learning_rate": 1.8655569690792686e-06, "loss": 0.3725, "step": 122125 }, { "epoch": 4.401556925072981, "grad_norm": 0.1853046715259552, "learning_rate": 1.8644510190138992e-06, "loss": 0.3773, "step": 122130 }, { "epoch": 4.401737124734206, "grad_norm": 0.260140061378479, "learning_rate": 1.8633453841659042e-06, "loss": 0.351, "step": 122135 }, { "epoch": 4.401917324395431, "grad_norm": 0.26455867290496826, "learning_rate": 1.8622400645503525e-06, "loss": 0.3379, "step": 122140 }, { "epoch": 4.402097524056654, "grad_norm": 0.33009883761405945, "learning_rate": 1.861135060182298e-06, "loss": 0.3934, "step": 122145 }, { "epoch": 4.402277723717879, "grad_norm": 0.2440415918827057, "learning_rate": 1.860030371076793e-06, "loss": 0.3411, "step": 122150 }, { "epoch": 4.402457923379104, "grad_norm": 0.2990182340145111, "learning_rate": 1.8589259972489003e-06, "loss": 0.4079, "step": 122155 }, { "epoch": 4.402638123040329, "grad_norm": 0.2691243588924408, "learning_rate": 1.8578219387136608e-06, "loss": 0.379, "step": 122160 }, { "epoch": 4.402818322701553, "grad_norm": 0.23940818011760712, "learning_rate": 1.8567181954861152e-06, "loss": 0.3714, "step": 122165 }, { "epoch": 4.402998522362778, "grad_norm": 0.23924623429775238, "learning_rate": 1.8556147675813068e-06, "loss": 0.354, "step": 122170 }, { "epoch": 4.403178722024003, "grad_norm": 0.2857334017753601, "learning_rate": 1.85451165501426e-06, "loss": 0.398, "step": 122175 }, { "epoch": 4.403358921685228, "grad_norm": 0.26370975375175476, "learning_rate": 1.8534088578000181e-06, "loss": 0.3762, "step": 122180 }, { "epoch": 4.403539121346451, "grad_norm": 0.24754032492637634, "learning_rate": 1.8523063759535969e-06, "loss": 0.3867, "step": 122185 }, { "epoch": 4.403719321007676, "grad_norm": 0.24139243364334106, "learning_rate": 1.851204209490015e-06, "loss": 
0.3443, "step": 122190 }, { "epoch": 4.403899520668901, "grad_norm": 0.2864011526107788, "learning_rate": 1.8501023584242965e-06, "loss": 0.3501, "step": 122195 }, { "epoch": 4.404079720330126, "grad_norm": 0.2613162398338318, "learning_rate": 1.8490008227714545e-06, "loss": 0.3541, "step": 122200 }, { "epoch": 4.40425991999135, "grad_norm": 0.2530914843082428, "learning_rate": 1.8478996025464933e-06, "loss": 0.3385, "step": 122205 }, { "epoch": 4.404440119652575, "grad_norm": 0.2663853168487549, "learning_rate": 1.8467986977644153e-06, "loss": 0.3808, "step": 122210 }, { "epoch": 4.4046203193138, "grad_norm": 0.26814600825309753, "learning_rate": 1.8456981084402192e-06, "loss": 0.3838, "step": 122215 }, { "epoch": 4.404800518975025, "grad_norm": 0.2669377326965332, "learning_rate": 1.8445978345889097e-06, "loss": 0.381, "step": 122220 }, { "epoch": 4.404980718636249, "grad_norm": 0.2761158049106598, "learning_rate": 1.8434978762254695e-06, "loss": 0.3659, "step": 122225 }, { "epoch": 4.405160918297474, "grad_norm": 0.24499890208244324, "learning_rate": 1.8423982333648893e-06, "loss": 0.3993, "step": 122230 }, { "epoch": 4.405341117958698, "grad_norm": 0.24173754453659058, "learning_rate": 1.8412989060221485e-06, "loss": 0.3923, "step": 122235 }, { "epoch": 4.405521317619923, "grad_norm": 0.2643168568611145, "learning_rate": 1.8401998942122218e-06, "loss": 0.3681, "step": 122240 }, { "epoch": 4.405701517281147, "grad_norm": 0.24129536747932434, "learning_rate": 1.8391011979500939e-06, "loss": 0.3466, "step": 122245 }, { "epoch": 4.405881716942372, "grad_norm": 0.2655704915523529, "learning_rate": 1.8380028172507308e-06, "loss": 0.3822, "step": 122250 }, { "epoch": 4.406061916603597, "grad_norm": 0.2480999380350113, "learning_rate": 1.8369047521290872e-06, "loss": 0.3705, "step": 122255 }, { "epoch": 4.4062421162648215, "grad_norm": 0.3369860053062439, "learning_rate": 1.8358070026001345e-06, "loss": 0.3736, "step": 122260 }, { "epoch": 4.406422315926046, 
"grad_norm": 0.3019421100616455, "learning_rate": 1.8347095686788247e-06, "loss": 0.3587, "step": 122265 }, { "epoch": 4.406602515587271, "grad_norm": 0.26029887795448303, "learning_rate": 1.8336124503801179e-06, "loss": 0.3561, "step": 122270 }, { "epoch": 4.406782715248496, "grad_norm": 0.31131497025489807, "learning_rate": 1.8325156477189547e-06, "loss": 0.3614, "step": 122275 }, { "epoch": 4.40696291490972, "grad_norm": 0.24049854278564453, "learning_rate": 1.8314191607102738e-06, "loss": 0.3702, "step": 122280 }, { "epoch": 4.407143114570944, "grad_norm": 0.23440882563591003, "learning_rate": 1.8303229893690266e-06, "loss": 0.3774, "step": 122285 }, { "epoch": 4.407323314232169, "grad_norm": 0.22003592550754547, "learning_rate": 1.829227133710143e-06, "loss": 0.3919, "step": 122290 }, { "epoch": 4.407503513893394, "grad_norm": 0.1988513171672821, "learning_rate": 1.8281315937485527e-06, "loss": 0.3903, "step": 122295 }, { "epoch": 4.4076837135546185, "grad_norm": 0.19561129808425903, "learning_rate": 1.8270363694991855e-06, "loss": 0.3364, "step": 122300 }, { "epoch": 4.407863913215843, "grad_norm": 0.2583327889442444, "learning_rate": 1.825941460976957e-06, "loss": 0.3895, "step": 122305 }, { "epoch": 4.408044112877068, "grad_norm": 0.2395971715450287, "learning_rate": 1.8248468681967916e-06, "loss": 0.3592, "step": 122310 }, { "epoch": 4.408224312538293, "grad_norm": 0.25403663516044617, "learning_rate": 1.8237525911736025e-06, "loss": 0.3655, "step": 122315 }, { "epoch": 4.4084045121995175, "grad_norm": 0.27949368953704834, "learning_rate": 1.8226586299222943e-06, "loss": 0.3404, "step": 122320 }, { "epoch": 4.408584711860741, "grad_norm": 0.2646028399467468, "learning_rate": 1.821564984457777e-06, "loss": 0.3935, "step": 122325 }, { "epoch": 4.408764911521966, "grad_norm": 0.32724955677986145, "learning_rate": 1.8204716547949502e-06, "loss": 0.3568, "step": 122330 }, { "epoch": 4.408945111183191, "grad_norm": 0.26123833656311035, "learning_rate": 
1.8193786409487046e-06, "loss": 0.3589, "step": 122335 }, { "epoch": 4.4091253108444155, "grad_norm": 0.2541702389717102, "learning_rate": 1.8182859429339422e-06, "loss": 0.3606, "step": 122340 }, { "epoch": 4.40930551050564, "grad_norm": 0.25994282960891724, "learning_rate": 1.8171935607655427e-06, "loss": 0.3641, "step": 122345 }, { "epoch": 4.409485710166865, "grad_norm": 0.31410351395606995, "learning_rate": 1.816101494458397e-06, "loss": 0.3891, "step": 122350 }, { "epoch": 4.40966590982809, "grad_norm": 0.23764289915561676, "learning_rate": 1.8150097440273767e-06, "loss": 0.407, "step": 122355 }, { "epoch": 4.409846109489314, "grad_norm": 0.24512486159801483, "learning_rate": 1.8139183094873558e-06, "loss": 0.3627, "step": 122360 }, { "epoch": 4.410026309150539, "grad_norm": 0.2727164328098297, "learning_rate": 1.812827190853217e-06, "loss": 0.3738, "step": 122365 }, { "epoch": 4.410206508811763, "grad_norm": 0.2821141183376312, "learning_rate": 1.8117363881398174e-06, "loss": 0.3829, "step": 122370 }, { "epoch": 4.410386708472988, "grad_norm": 0.22460602223873138, "learning_rate": 1.8106459013620126e-06, "loss": 0.3681, "step": 122375 }, { "epoch": 4.4105669081342125, "grad_norm": 0.24663448333740234, "learning_rate": 1.8095557305346734e-06, "loss": 0.3807, "step": 122380 }, { "epoch": 4.410747107795437, "grad_norm": 0.32760992646217346, "learning_rate": 1.8084658756726463e-06, "loss": 0.3769, "step": 122385 }, { "epoch": 4.410927307456662, "grad_norm": 0.26017165184020996, "learning_rate": 1.8073763367907836e-06, "loss": 0.4191, "step": 122390 }, { "epoch": 4.411107507117887, "grad_norm": 0.2591916620731354, "learning_rate": 1.806287113903929e-06, "loss": 0.3955, "step": 122395 }, { "epoch": 4.411287706779111, "grad_norm": 0.2518169581890106, "learning_rate": 1.8051982070269147e-06, "loss": 0.3571, "step": 122400 }, { "epoch": 4.411467906440336, "grad_norm": 0.3138507604598999, "learning_rate": 1.8041096161745902e-06, "loss": 0.3739, "step": 122405 }, { 
"epoch": 4.411648106101561, "grad_norm": 0.22780956327915192, "learning_rate": 1.8030213413617798e-06, "loss": 0.3652, "step": 122410 }, { "epoch": 4.411828305762786, "grad_norm": 0.2590252161026001, "learning_rate": 1.801933382603313e-06, "loss": 0.3928, "step": 122415 }, { "epoch": 4.4120085054240095, "grad_norm": 0.2547787129878998, "learning_rate": 1.8008457399140144e-06, "loss": 0.3554, "step": 122420 }, { "epoch": 4.412188705085234, "grad_norm": 0.29435890913009644, "learning_rate": 1.7997584133086943e-06, "loss": 0.4092, "step": 122425 }, { "epoch": 4.412368904746459, "grad_norm": 0.23160386085510254, "learning_rate": 1.7986714028021794e-06, "loss": 0.3793, "step": 122430 }, { "epoch": 4.412549104407684, "grad_norm": 0.1971481442451477, "learning_rate": 1.7975847084092806e-06, "loss": 0.3858, "step": 122435 }, { "epoch": 4.412729304068908, "grad_norm": 0.29055434465408325, "learning_rate": 1.7964983301447886e-06, "loss": 0.4172, "step": 122440 }, { "epoch": 4.412909503730133, "grad_norm": 0.2608969509601593, "learning_rate": 1.7954122680235164e-06, "loss": 0.3736, "step": 122445 }, { "epoch": 4.413089703391358, "grad_norm": 0.2528577148914337, "learning_rate": 1.794326522060258e-06, "loss": 0.3502, "step": 122450 }, { "epoch": 4.413269903052583, "grad_norm": 0.21361927688121796, "learning_rate": 1.7932410922698122e-06, "loss": 0.3278, "step": 122455 }, { "epoch": 4.4134501027138064, "grad_norm": 0.33277052640914917, "learning_rate": 1.7921559786669679e-06, "loss": 0.3626, "step": 122460 }, { "epoch": 4.413630302375031, "grad_norm": 0.21200527250766754, "learning_rate": 1.7910711812664931e-06, "loss": 0.3842, "step": 122465 }, { "epoch": 4.413810502036256, "grad_norm": 0.2637445628643036, "learning_rate": 1.7899867000831878e-06, "loss": 0.3866, "step": 122470 }, { "epoch": 4.413990701697481, "grad_norm": 0.3103804588317871, "learning_rate": 1.78890253513182e-06, "loss": 0.3754, "step": 122475 }, { "epoch": 4.414170901358705, "grad_norm": 0.25578418374061584, 
"learning_rate": 1.7878186864271618e-06, "loss": 0.3487, "step": 122480 }, { "epoch": 4.41435110101993, "grad_norm": 0.23292893171310425, "learning_rate": 1.786735153983976e-06, "loss": 0.3688, "step": 122485 }, { "epoch": 4.414531300681155, "grad_norm": 0.23511111736297607, "learning_rate": 1.7856519378170289e-06, "loss": 0.3837, "step": 122490 }, { "epoch": 4.41471150034238, "grad_norm": 0.3069644272327423, "learning_rate": 1.7845690379410835e-06, "loss": 0.3729, "step": 122495 }, { "epoch": 4.414891700003604, "grad_norm": 0.26386865973472595, "learning_rate": 1.783486454370889e-06, "loss": 0.379, "step": 122500 }, { "epoch": 4.414891700003604, "eval_loss": 0.42894604802131653, "eval_runtime": 3.5255, "eval_samples_per_second": 28.365, "eval_steps_per_second": 7.091, "step": 122500 }, { "epoch": 4.415071899664829, "grad_norm": 0.2563088834285736, "learning_rate": 1.7824041871211976e-06, "loss": 0.3939, "step": 122505 }, { "epoch": 4.415252099326053, "grad_norm": 0.2134285569190979, "learning_rate": 1.781322236206756e-06, "loss": 0.3831, "step": 122510 }, { "epoch": 4.415432298987278, "grad_norm": 0.21774479746818542, "learning_rate": 1.7802406016423022e-06, "loss": 0.3738, "step": 122515 }, { "epoch": 4.415612498648502, "grad_norm": 0.19313018023967743, "learning_rate": 1.779159283442569e-06, "loss": 0.34, "step": 122520 }, { "epoch": 4.415792698309727, "grad_norm": 0.24598465859889984, "learning_rate": 1.7780782816223001e-06, "loss": 0.3575, "step": 122525 }, { "epoch": 4.415972897970952, "grad_norm": 0.2550036311149597, "learning_rate": 1.77699759619622e-06, "loss": 0.381, "step": 122530 }, { "epoch": 4.416153097632177, "grad_norm": 0.3060535192489624, "learning_rate": 1.7759172271790498e-06, "loss": 0.4224, "step": 122535 }, { "epoch": 4.416333297293401, "grad_norm": 0.3150867223739624, "learning_rate": 1.7748371745855114e-06, "loss": 0.3928, "step": 122540 }, { "epoch": 4.416513496954626, "grad_norm": 0.24567945301532745, "learning_rate": 
1.7737574384303152e-06, "loss": 0.3674, "step": 122545 }, { "epoch": 4.416693696615851, "grad_norm": 0.19101324677467346, "learning_rate": 1.7726780187281854e-06, "loss": 0.3462, "step": 122550 }, { "epoch": 4.416873896277075, "grad_norm": 0.3299705982208252, "learning_rate": 1.771598915493819e-06, "loss": 0.3893, "step": 122555 }, { "epoch": 4.417054095938299, "grad_norm": 0.29562848806381226, "learning_rate": 1.7705201287419121e-06, "loss": 0.3921, "step": 122560 }, { "epoch": 4.417234295599524, "grad_norm": 0.2895338535308838, "learning_rate": 1.769441658487178e-06, "loss": 0.4102, "step": 122565 }, { "epoch": 4.417414495260749, "grad_norm": 0.2296198308467865, "learning_rate": 1.7683635047442998e-06, "loss": 0.3691, "step": 122570 }, { "epoch": 4.4175946949219735, "grad_norm": 0.24252089858055115, "learning_rate": 1.7672856675279791e-06, "loss": 0.4019, "step": 122575 }, { "epoch": 4.417774894583198, "grad_norm": 0.24175925552845, "learning_rate": 1.7662081468528879e-06, "loss": 0.3703, "step": 122580 }, { "epoch": 4.417955094244423, "grad_norm": 0.22977383434772491, "learning_rate": 1.7651309427337087e-06, "loss": 0.4019, "step": 122585 }, { "epoch": 4.418135293905648, "grad_norm": 0.28015124797821045, "learning_rate": 1.764054055185127e-06, "loss": 0.3954, "step": 122590 }, { "epoch": 4.4183154935668725, "grad_norm": 0.231642484664917, "learning_rate": 1.7629774842218089e-06, "loss": 0.3531, "step": 122595 }, { "epoch": 4.418495693228096, "grad_norm": 0.24805359542369843, "learning_rate": 1.7619012298584259e-06, "loss": 0.4105, "step": 122600 }, { "epoch": 4.418675892889321, "grad_norm": 0.29648274183273315, "learning_rate": 1.7608252921096357e-06, "loss": 0.3828, "step": 122605 }, { "epoch": 4.418856092550546, "grad_norm": 0.28801605105400085, "learning_rate": 1.7597496709901017e-06, "loss": 0.3365, "step": 122610 }, { "epoch": 4.4190362922117705, "grad_norm": 0.25377213954925537, "learning_rate": 1.7586743665144816e-06, "loss": 0.3936, "step": 122615 }, { 
"epoch": 4.419216491872995, "grad_norm": 0.2505911588668823, "learning_rate": 1.7575993786974221e-06, "loss": 0.3729, "step": 122620 }, { "epoch": 4.41939669153422, "grad_norm": 0.2358839064836502, "learning_rate": 1.7565247075535724e-06, "loss": 0.3607, "step": 122625 }, { "epoch": 4.419576891195445, "grad_norm": 0.2155429720878601, "learning_rate": 1.7554503530975735e-06, "loss": 0.3934, "step": 122630 }, { "epoch": 4.4197570908566695, "grad_norm": 0.21443898975849152, "learning_rate": 1.7543763153440585e-06, "loss": 0.3833, "step": 122635 }, { "epoch": 4.419937290517894, "grad_norm": 0.2692784368991852, "learning_rate": 1.7533025943076708e-06, "loss": 0.3753, "step": 122640 }, { "epoch": 4.420117490179118, "grad_norm": 0.1909954696893692, "learning_rate": 1.752229190003038e-06, "loss": 0.3936, "step": 122645 }, { "epoch": 4.420297689840343, "grad_norm": 0.26652899384498596, "learning_rate": 1.751156102444776e-06, "loss": 0.3609, "step": 122650 }, { "epoch": 4.4204778895015675, "grad_norm": 0.220448300242424, "learning_rate": 1.7500833316475118e-06, "loss": 0.3454, "step": 122655 }, { "epoch": 4.420658089162792, "grad_norm": 0.2539389431476593, "learning_rate": 1.7490108776258617e-06, "loss": 0.3466, "step": 122660 }, { "epoch": 4.420838288824017, "grad_norm": 0.2393910139799118, "learning_rate": 1.7479387403944363e-06, "loss": 0.345, "step": 122665 }, { "epoch": 4.421018488485242, "grad_norm": 0.27179139852523804, "learning_rate": 1.7468669199678462e-06, "loss": 0.3895, "step": 122670 }, { "epoch": 4.421198688146466, "grad_norm": 0.2731141149997711, "learning_rate": 1.7457954163606848e-06, "loss": 0.3681, "step": 122675 }, { "epoch": 4.421378887807691, "grad_norm": 0.23041914403438568, "learning_rate": 1.744724229587566e-06, "loss": 0.3366, "step": 122680 }, { "epoch": 4.421559087468916, "grad_norm": 0.2509845793247223, "learning_rate": 1.743653359663075e-06, "loss": 0.3587, "step": 122685 }, { "epoch": 4.421739287130141, "grad_norm": 0.250013530254364, 
"learning_rate": 1.7425828066018056e-06, "loss": 0.3569, "step": 122690 }, { "epoch": 4.4219194867913645, "grad_norm": 0.27683958411216736, "learning_rate": 1.7415125704183411e-06, "loss": 0.3784, "step": 122695 }, { "epoch": 4.422099686452589, "grad_norm": 0.2434561401605606, "learning_rate": 1.7404426511272664e-06, "loss": 0.3768, "step": 122700 }, { "epoch": 4.422279886113814, "grad_norm": 0.30197739601135254, "learning_rate": 1.7393730487431536e-06, "loss": 0.3838, "step": 122705 }, { "epoch": 4.422460085775039, "grad_norm": 0.28687089681625366, "learning_rate": 1.7383037632805826e-06, "loss": 0.3882, "step": 122710 }, { "epoch": 4.422640285436263, "grad_norm": 0.21703818440437317, "learning_rate": 1.7372347947541194e-06, "loss": 0.3486, "step": 122715 }, { "epoch": 4.422820485097488, "grad_norm": 0.27013471722602844, "learning_rate": 1.7361661431783304e-06, "loss": 0.3774, "step": 122720 }, { "epoch": 4.423000684758713, "grad_norm": 0.25899776816368103, "learning_rate": 1.7350978085677704e-06, "loss": 0.3916, "step": 122725 }, { "epoch": 4.423180884419938, "grad_norm": 0.24092505872249603, "learning_rate": 1.7340297909369973e-06, "loss": 0.3897, "step": 122730 }, { "epoch": 4.4233610840811615, "grad_norm": 0.25030139088630676, "learning_rate": 1.7329620903005661e-06, "loss": 0.3741, "step": 122735 }, { "epoch": 4.423541283742386, "grad_norm": 0.218690425157547, "learning_rate": 1.7318947066730263e-06, "loss": 0.3597, "step": 122740 }, { "epoch": 4.423721483403611, "grad_norm": 0.2734704613685608, "learning_rate": 1.7308276400689077e-06, "loss": 0.3735, "step": 122745 }, { "epoch": 4.423901683064836, "grad_norm": 0.37903356552124023, "learning_rate": 1.7297608905027602e-06, "loss": 0.3653, "step": 122750 }, { "epoch": 4.42408188272606, "grad_norm": 0.29031357169151306, "learning_rate": 1.7286944579891134e-06, "loss": 0.411, "step": 122755 }, { "epoch": 4.424262082387285, "grad_norm": 0.24027031660079956, "learning_rate": 1.7276283425425088e-06, "loss": 0.3777, 
"step": 122760 }, { "epoch": 4.42444228204851, "grad_norm": 0.26298728585243225, "learning_rate": 1.726562544177454e-06, "loss": 0.3635, "step": 122765 }, { "epoch": 4.424622481709735, "grad_norm": 0.24697257578372955, "learning_rate": 1.7254970629084765e-06, "loss": 0.3843, "step": 122770 }, { "epoch": 4.424802681370959, "grad_norm": 0.26722240447998047, "learning_rate": 1.7244318987501007e-06, "loss": 0.3959, "step": 122775 }, { "epoch": 4.424982881032184, "grad_norm": 0.23795725405216217, "learning_rate": 1.7233670517168344e-06, "loss": 0.3746, "step": 122780 }, { "epoch": 4.425163080693408, "grad_norm": 0.31174471974372864, "learning_rate": 1.7223025218231826e-06, "loss": 0.3546, "step": 122785 }, { "epoch": 4.425343280354633, "grad_norm": 0.2947745621204376, "learning_rate": 1.7212383090836558e-06, "loss": 0.3637, "step": 122790 }, { "epoch": 4.425523480015857, "grad_norm": 0.24382677674293518, "learning_rate": 1.7201744135127428e-06, "loss": 0.3861, "step": 122795 }, { "epoch": 4.425703679677082, "grad_norm": 0.2894967794418335, "learning_rate": 1.7191108351249513e-06, "loss": 0.3632, "step": 122800 }, { "epoch": 4.425883879338307, "grad_norm": 0.23039253056049347, "learning_rate": 1.7180475739347695e-06, "loss": 0.3493, "step": 122805 }, { "epoch": 4.426064078999532, "grad_norm": 0.295876681804657, "learning_rate": 1.7169846299566806e-06, "loss": 0.3588, "step": 122810 }, { "epoch": 4.426244278660756, "grad_norm": 0.23762252926826477, "learning_rate": 1.715922003205167e-06, "loss": 0.3371, "step": 122815 }, { "epoch": 4.426424478321981, "grad_norm": 0.22657039761543274, "learning_rate": 1.7148596936947064e-06, "loss": 0.3865, "step": 122820 }, { "epoch": 4.426604677983206, "grad_norm": 0.22004631161689758, "learning_rate": 1.713797701439776e-06, "loss": 0.3896, "step": 122825 }, { "epoch": 4.42678487764443, "grad_norm": 0.2557975947856903, "learning_rate": 1.7127360264548475e-06, "loss": 0.3903, "step": 122830 }, { "epoch": 4.426965077305654, "grad_norm": 
0.2727845311164856, "learning_rate": 1.711674668754376e-06, "loss": 0.3312, "step": 122835 }, { "epoch": 4.427145276966879, "grad_norm": 0.24959102272987366, "learning_rate": 1.7106136283528336e-06, "loss": 0.3425, "step": 122840 }, { "epoch": 4.427325476628104, "grad_norm": 0.25791552662849426, "learning_rate": 1.709552905264669e-06, "loss": 0.3923, "step": 122845 }, { "epoch": 4.427505676289329, "grad_norm": 0.22956585884094238, "learning_rate": 1.7084924995043355e-06, "loss": 0.3631, "step": 122850 }, { "epoch": 4.427685875950553, "grad_norm": 0.25182798504829407, "learning_rate": 1.707432411086285e-06, "loss": 0.3912, "step": 122855 }, { "epoch": 4.427866075611778, "grad_norm": 0.2180362045764923, "learning_rate": 1.7063726400249531e-06, "loss": 0.3683, "step": 122860 }, { "epoch": 4.428046275273003, "grad_norm": 0.24969041347503662, "learning_rate": 1.7053131863347893e-06, "loss": 0.3792, "step": 122865 }, { "epoch": 4.4282264749342275, "grad_norm": 0.3251616060733795, "learning_rate": 1.704254050030224e-06, "loss": 0.3662, "step": 122870 }, { "epoch": 4.428406674595452, "grad_norm": 0.2780360281467438, "learning_rate": 1.7031952311256845e-06, "loss": 0.4135, "step": 122875 }, { "epoch": 4.428586874256676, "grad_norm": 0.23237118124961853, "learning_rate": 1.7021367296356005e-06, "loss": 0.3865, "step": 122880 }, { "epoch": 4.428767073917901, "grad_norm": 0.21650679409503937, "learning_rate": 1.7010785455743943e-06, "loss": 0.3883, "step": 122885 }, { "epoch": 4.4289472735791255, "grad_norm": 0.2289179116487503, "learning_rate": 1.7000206789564765e-06, "loss": 0.3903, "step": 122890 }, { "epoch": 4.42912747324035, "grad_norm": 0.27513277530670166, "learning_rate": 1.6989631297962717e-06, "loss": 0.3564, "step": 122895 }, { "epoch": 4.429307672901575, "grad_norm": 0.2146390825510025, "learning_rate": 1.6979058981081819e-06, "loss": 0.3701, "step": 122900 }, { "epoch": 4.4294878725628, "grad_norm": 0.36598315834999084, "learning_rate": 1.6968489839066125e-06, 
"loss": 0.3704, "step": 122905 }, { "epoch": 4.4296680722240245, "grad_norm": 0.2601289749145508, "learning_rate": 1.6957923872059634e-06, "loss": 0.3677, "step": 122910 }, { "epoch": 4.429848271885249, "grad_norm": 0.25093069672584534, "learning_rate": 1.6947361080206282e-06, "loss": 0.3744, "step": 122915 }, { "epoch": 4.430028471546473, "grad_norm": 0.3232559859752655, "learning_rate": 1.6936801463650066e-06, "loss": 0.3804, "step": 122920 }, { "epoch": 4.430208671207698, "grad_norm": 0.26259225606918335, "learning_rate": 1.6926245022534842e-06, "loss": 0.3743, "step": 122925 }, { "epoch": 4.4303888708689225, "grad_norm": 0.25600606203079224, "learning_rate": 1.6915691757004303e-06, "loss": 0.3987, "step": 122930 }, { "epoch": 4.430569070530147, "grad_norm": 0.2362159788608551, "learning_rate": 1.690514166720239e-06, "loss": 0.3972, "step": 122935 }, { "epoch": 4.430749270191372, "grad_norm": 0.21137534081935883, "learning_rate": 1.6894594753272763e-06, "loss": 0.3447, "step": 122940 }, { "epoch": 4.430929469852597, "grad_norm": 0.23811127245426178, "learning_rate": 1.6884051015359226e-06, "loss": 0.3834, "step": 122945 }, { "epoch": 4.4311096695138215, "grad_norm": 0.22823196649551392, "learning_rate": 1.687351045360533e-06, "loss": 0.3802, "step": 122950 }, { "epoch": 4.431289869175046, "grad_norm": 0.23941099643707275, "learning_rate": 1.6862973068154653e-06, "loss": 0.3415, "step": 122955 }, { "epoch": 4.431470068836271, "grad_norm": 0.26783496141433716, "learning_rate": 1.6852438859150887e-06, "loss": 0.3696, "step": 122960 }, { "epoch": 4.431650268497496, "grad_norm": 0.31337860226631165, "learning_rate": 1.6841907826737503e-06, "loss": 0.3926, "step": 122965 }, { "epoch": 4.4318304681587195, "grad_norm": 0.2646377384662628, "learning_rate": 1.6831379971057992e-06, "loss": 0.3866, "step": 122970 }, { "epoch": 4.432010667819944, "grad_norm": 0.27343621850013733, "learning_rate": 1.6820855292255772e-06, "loss": 0.3935, "step": 122975 }, { "epoch": 
4.432190867481169, "grad_norm": 0.3007572591304779, "learning_rate": 1.6810333790474226e-06, "loss": 0.3962, "step": 122980 }, { "epoch": 4.432371067142394, "grad_norm": 0.24796468019485474, "learning_rate": 1.6799815465856767e-06, "loss": 0.365, "step": 122985 }, { "epoch": 4.432551266803618, "grad_norm": 0.2737863063812256, "learning_rate": 1.6789300318546697e-06, "loss": 0.4177, "step": 122990 }, { "epoch": 4.432731466464843, "grad_norm": 0.298483282327652, "learning_rate": 1.6778788348687235e-06, "loss": 0.3822, "step": 122995 }, { "epoch": 4.432911666126068, "grad_norm": 0.23559249937534332, "learning_rate": 1.6768279556421628e-06, "loss": 0.3506, "step": 123000 }, { "epoch": 4.432911666126068, "eval_loss": 0.42897456884384155, "eval_runtime": 3.5635, "eval_samples_per_second": 28.062, "eval_steps_per_second": 7.016, "step": 123000 }, { "epoch": 4.433091865787293, "grad_norm": 0.2772116959095001, "learning_rate": 1.6757773941893068e-06, "loss": 0.4011, "step": 123005 }, { "epoch": 4.4332720654485165, "grad_norm": 0.2690448760986328, "learning_rate": 1.6747271505244634e-06, "loss": 0.3637, "step": 123010 }, { "epoch": 4.433452265109741, "grad_norm": 0.23633429408073425, "learning_rate": 1.6736772246619543e-06, "loss": 0.3459, "step": 123015 }, { "epoch": 4.433632464770966, "grad_norm": 0.2238181084394455, "learning_rate": 1.6726276166160681e-06, "loss": 0.3827, "step": 123020 }, { "epoch": 4.433812664432191, "grad_norm": 0.24402472376823425, "learning_rate": 1.6715783264011188e-06, "loss": 0.388, "step": 123025 }, { "epoch": 4.433992864093415, "grad_norm": 0.2225305289030075, "learning_rate": 1.6705293540313998e-06, "loss": 0.3813, "step": 123030 }, { "epoch": 4.43417306375464, "grad_norm": 0.2935526967048645, "learning_rate": 1.6694806995211976e-06, "loss": 0.3678, "step": 123035 }, { "epoch": 4.434353263415865, "grad_norm": 0.24194492399692535, "learning_rate": 1.6684323628848113e-06, "loss": 0.3804, "step": 123040 }, { "epoch": 4.43453346307709, "grad_norm": 
0.22049802541732788, "learning_rate": 1.6673843441365077e-06, "loss": 0.3625, "step": 123045 }, { "epoch": 4.434713662738314, "grad_norm": 0.28984788060188293, "learning_rate": 1.6663366432905809e-06, "loss": 0.3471, "step": 123050 }, { "epoch": 4.434893862399539, "grad_norm": 0.2572941780090332, "learning_rate": 1.6652892603612968e-06, "loss": 0.378, "step": 123055 }, { "epoch": 4.435074062060763, "grad_norm": 0.2342897206544876, "learning_rate": 1.6642421953629305e-06, "loss": 0.3621, "step": 123060 }, { "epoch": 4.435254261721988, "grad_norm": 0.2670467793941498, "learning_rate": 1.6631954483097483e-06, "loss": 0.3689, "step": 123065 }, { "epoch": 4.435434461383212, "grad_norm": 0.29698285460472107, "learning_rate": 1.6621490192160082e-06, "loss": 0.3722, "step": 123070 }, { "epoch": 4.435614661044437, "grad_norm": 0.21422305703163147, "learning_rate": 1.6611029080959655e-06, "loss": 0.356, "step": 123075 }, { "epoch": 4.435794860705662, "grad_norm": 0.2958966791629791, "learning_rate": 1.660057114963881e-06, "loss": 0.3852, "step": 123080 }, { "epoch": 4.435975060366887, "grad_norm": 0.20923343300819397, "learning_rate": 1.659011639833999e-06, "loss": 0.3784, "step": 123085 }, { "epoch": 4.436155260028111, "grad_norm": 0.2286258041858673, "learning_rate": 1.657966482720566e-06, "loss": 0.3705, "step": 123090 }, { "epoch": 4.436335459689336, "grad_norm": 0.25596699118614197, "learning_rate": 1.656921643637821e-06, "loss": 0.3921, "step": 123095 }, { "epoch": 4.436515659350561, "grad_norm": 0.2769662141799927, "learning_rate": 1.6558771225999942e-06, "loss": 0.3694, "step": 123100 }, { "epoch": 4.436695859011785, "grad_norm": 0.2575506865978241, "learning_rate": 1.6548329196213242e-06, "loss": 0.3762, "step": 123105 }, { "epoch": 4.436876058673009, "grad_norm": 0.2907906174659729, "learning_rate": 1.6537890347160412e-06, "loss": 0.3744, "step": 123110 }, { "epoch": 4.437056258334234, "grad_norm": 0.2527605891227722, "learning_rate": 1.652745467898356e-06, "loss": 
0.3641, "step": 123115 }, { "epoch": 4.437236457995459, "grad_norm": 0.31895288825035095, "learning_rate": 1.6517022191824965e-06, "loss": 0.3943, "step": 123120 }, { "epoch": 4.437416657656684, "grad_norm": 0.23418550193309784, "learning_rate": 1.6506592885826704e-06, "loss": 0.3661, "step": 123125 }, { "epoch": 4.437596857317908, "grad_norm": 0.2372836172580719, "learning_rate": 1.649616676113097e-06, "loss": 0.3527, "step": 123130 }, { "epoch": 4.437777056979133, "grad_norm": 0.33892858028411865, "learning_rate": 1.6485743817879735e-06, "loss": 0.3835, "step": 123135 }, { "epoch": 4.437957256640358, "grad_norm": 0.32817450165748596, "learning_rate": 1.6475324056214992e-06, "loss": 0.3921, "step": 123140 }, { "epoch": 4.4381374563015825, "grad_norm": 0.30701467394828796, "learning_rate": 1.6466990537720912e-06, "loss": 0.3915, "step": 123145 }, { "epoch": 4.438317655962807, "grad_norm": 0.21967414021492004, "learning_rate": 1.6456576503269705e-06, "loss": 0.361, "step": 123150 }, { "epoch": 4.438497855624031, "grad_norm": 0.23973840475082397, "learning_rate": 1.6446165650802398e-06, "loss": 0.3769, "step": 123155 }, { "epoch": 4.438678055285256, "grad_norm": 0.26962411403656006, "learning_rate": 1.6435757980460876e-06, "loss": 0.35, "step": 123160 }, { "epoch": 4.438858254946481, "grad_norm": 0.21383236348628998, "learning_rate": 1.642535349238694e-06, "loss": 0.3269, "step": 123165 }, { "epoch": 4.439038454607705, "grad_norm": 0.23623572289943695, "learning_rate": 1.6414952186722287e-06, "loss": 0.3909, "step": 123170 }, { "epoch": 4.43921865426893, "grad_norm": 0.24700896441936493, "learning_rate": 1.6404554063608746e-06, "loss": 0.3601, "step": 123175 }, { "epoch": 4.439398853930155, "grad_norm": 0.24526774883270264, "learning_rate": 1.6394159123187953e-06, "loss": 0.3701, "step": 123180 }, { "epoch": 4.4395790535913795, "grad_norm": 0.20414763689041138, "learning_rate": 1.6383767365601433e-06, "loss": 0.3538, "step": 123185 }, { "epoch": 4.439759253252604, 
"grad_norm": 0.19671322405338287, "learning_rate": 1.6373378790990906e-06, "loss": 0.3585, "step": 123190 }, { "epoch": 4.439939452913828, "grad_norm": 0.2665334939956665, "learning_rate": 1.6362993399497816e-06, "loss": 0.3915, "step": 123195 }, { "epoch": 4.440119652575053, "grad_norm": 0.21728205680847168, "learning_rate": 1.6352611191263745e-06, "loss": 0.3661, "step": 123200 }, { "epoch": 4.4402998522362775, "grad_norm": 0.23190873861312866, "learning_rate": 1.634223216643016e-06, "loss": 0.3862, "step": 123205 }, { "epoch": 4.440480051897502, "grad_norm": 0.28229013085365295, "learning_rate": 1.633185632513834e-06, "loss": 0.364, "step": 123210 }, { "epoch": 4.440660251558727, "grad_norm": 0.2951110303401947, "learning_rate": 1.632148366752978e-06, "loss": 0.4201, "step": 123215 }, { "epoch": 4.440840451219952, "grad_norm": 0.2822306156158447, "learning_rate": 1.6311114193745757e-06, "loss": 0.3883, "step": 123220 }, { "epoch": 4.4410206508811765, "grad_norm": 0.280408650636673, "learning_rate": 1.6300747903927576e-06, "loss": 0.4214, "step": 123225 }, { "epoch": 4.441200850542401, "grad_norm": 0.23964573442935944, "learning_rate": 1.6290384798216428e-06, "loss": 0.3636, "step": 123230 }, { "epoch": 4.441381050203626, "grad_norm": 0.2101919949054718, "learning_rate": 1.6280024876753535e-06, "loss": 0.3383, "step": 123235 }, { "epoch": 4.441561249864851, "grad_norm": 0.24973724782466888, "learning_rate": 1.626966813968006e-06, "loss": 0.3767, "step": 123240 }, { "epoch": 4.4417414495260745, "grad_norm": 0.23754936456680298, "learning_rate": 1.6259314587137114e-06, "loss": 0.3969, "step": 123245 }, { "epoch": 4.441921649187299, "grad_norm": 0.28546053171157837, "learning_rate": 1.6248964219265777e-06, "loss": 0.3638, "step": 123250 }, { "epoch": 4.442101848848524, "grad_norm": 0.2416713684797287, "learning_rate": 1.623861703620702e-06, "loss": 0.3669, "step": 123255 }, { "epoch": 4.442282048509749, "grad_norm": 0.26729458570480347, "learning_rate": 
1.6228273038101815e-06, "loss": 0.3851, "step": 123260 }, { "epoch": 4.4424622481709735, "grad_norm": 0.24362029135227203, "learning_rate": 1.6217932225091187e-06, "loss": 0.405, "step": 123265 }, { "epoch": 4.442642447832198, "grad_norm": 0.24386849999427795, "learning_rate": 1.620759459731594e-06, "loss": 0.4102, "step": 123270 }, { "epoch": 4.442822647493423, "grad_norm": 0.27166756987571716, "learning_rate": 1.619726015491696e-06, "loss": 0.3791, "step": 123275 }, { "epoch": 4.443002847154648, "grad_norm": 0.2460469901561737, "learning_rate": 1.6186928898035082e-06, "loss": 0.3231, "step": 123280 }, { "epoch": 4.443183046815872, "grad_norm": 0.21668177843093872, "learning_rate": 1.6176600826810994e-06, "loss": 0.3385, "step": 123285 }, { "epoch": 4.443363246477096, "grad_norm": 0.27097898721694946, "learning_rate": 1.6166275941385422e-06, "loss": 0.3826, "step": 123290 }, { "epoch": 4.443543446138321, "grad_norm": 0.2606200575828552, "learning_rate": 1.615595424189914e-06, "loss": 0.3223, "step": 123295 }, { "epoch": 4.443723645799546, "grad_norm": 0.26379239559173584, "learning_rate": 1.6145635728492619e-06, "loss": 0.3753, "step": 123300 }, { "epoch": 4.44390384546077, "grad_norm": 0.2528885006904602, "learning_rate": 1.6135320401306608e-06, "loss": 0.3818, "step": 123305 }, { "epoch": 4.444084045121995, "grad_norm": 0.22059816122055054, "learning_rate": 1.612500826048155e-06, "loss": 0.3668, "step": 123310 }, { "epoch": 4.44426424478322, "grad_norm": 0.285345196723938, "learning_rate": 1.6114699306157944e-06, "loss": 0.356, "step": 123315 }, { "epoch": 4.444444444444445, "grad_norm": 0.29682835936546326, "learning_rate": 1.610439353847637e-06, "loss": 0.3959, "step": 123320 }, { "epoch": 4.444624644105669, "grad_norm": 0.2404378205537796, "learning_rate": 1.6094090957577079e-06, "loss": 0.372, "step": 123325 }, { "epoch": 4.444804843766894, "grad_norm": 0.31720393896102905, "learning_rate": 1.6083791563600514e-06, "loss": 0.3816, "step": 123330 }, { "epoch": 
4.444985043428118, "grad_norm": 0.19714432954788208, "learning_rate": 1.6073495356687008e-06, "loss": 0.3454, "step": 123335 }, { "epoch": 4.445165243089343, "grad_norm": 0.2751675546169281, "learning_rate": 1.6063202336976862e-06, "loss": 0.4002, "step": 123340 }, { "epoch": 4.445345442750567, "grad_norm": 0.2857431471347809, "learning_rate": 1.6052912504610273e-06, "loss": 0.4002, "step": 123345 }, { "epoch": 4.445525642411792, "grad_norm": 0.2135014832019806, "learning_rate": 1.6042625859727461e-06, "loss": 0.3727, "step": 123350 }, { "epoch": 4.445705842073017, "grad_norm": 0.2780386507511139, "learning_rate": 1.6032342402468536e-06, "loss": 0.3768, "step": 123355 }, { "epoch": 4.445886041734242, "grad_norm": 0.24021285772323608, "learning_rate": 1.6022062132973692e-06, "loss": 0.4057, "step": 123360 }, { "epoch": 4.446066241395466, "grad_norm": 0.2460976094007492, "learning_rate": 1.6011785051382956e-06, "loss": 0.3774, "step": 123365 }, { "epoch": 4.446246441056691, "grad_norm": 0.25038421154022217, "learning_rate": 1.6001511157836352e-06, "loss": 0.3843, "step": 123370 }, { "epoch": 4.446426640717916, "grad_norm": 0.3057589828968048, "learning_rate": 1.5991240452473854e-06, "loss": 0.3501, "step": 123375 }, { "epoch": 4.44660684037914, "grad_norm": 0.2682446241378784, "learning_rate": 1.5980972935435351e-06, "loss": 0.3708, "step": 123380 }, { "epoch": 4.446787040040364, "grad_norm": 0.2691839337348938, "learning_rate": 1.5970708606860813e-06, "loss": 0.3683, "step": 123385 }, { "epoch": 4.446967239701589, "grad_norm": 0.26984599232673645, "learning_rate": 1.5960447466890128e-06, "loss": 0.4096, "step": 123390 }, { "epoch": 4.447147439362814, "grad_norm": 0.2700972557067871, "learning_rate": 1.5950189515662934e-06, "loss": 0.3566, "step": 123395 }, { "epoch": 4.447327639024039, "grad_norm": 0.2507593333721161, "learning_rate": 1.5939934753319146e-06, "loss": 0.4151, "step": 123400 }, { "epoch": 4.447507838685263, "grad_norm": 0.2393806129693985, 
"learning_rate": 1.5929683179998434e-06, "loss": 0.367, "step": 123405 }, { "epoch": 4.447688038346488, "grad_norm": 0.2460200935602188, "learning_rate": 1.591943479584046e-06, "loss": 0.4182, "step": 123410 }, { "epoch": 4.447868238007713, "grad_norm": 0.24296921491622925, "learning_rate": 1.5909189600984865e-06, "loss": 0.3385, "step": 123415 }, { "epoch": 4.4480484376689375, "grad_norm": 0.24117806553840637, "learning_rate": 1.58989475955712e-06, "loss": 0.3475, "step": 123420 }, { "epoch": 4.448228637330162, "grad_norm": 0.21672047674655914, "learning_rate": 1.588870877973911e-06, "loss": 0.3675, "step": 123425 }, { "epoch": 4.448408836991386, "grad_norm": 0.29379647970199585, "learning_rate": 1.5878473153628004e-06, "loss": 0.3689, "step": 123430 }, { "epoch": 4.448589036652611, "grad_norm": 0.2538691461086273, "learning_rate": 1.5868240717377413e-06, "loss": 0.387, "step": 123435 }, { "epoch": 4.448769236313836, "grad_norm": 0.22580280900001526, "learning_rate": 1.58580114711267e-06, "loss": 0.3792, "step": 123440 }, { "epoch": 4.44894943597506, "grad_norm": 0.24915678799152374, "learning_rate": 1.584778541501522e-06, "loss": 0.3549, "step": 123445 }, { "epoch": 4.449129635636285, "grad_norm": 0.28229835629463196, "learning_rate": 1.583756254918231e-06, "loss": 0.3472, "step": 123450 }, { "epoch": 4.44930983529751, "grad_norm": 0.2428547888994217, "learning_rate": 1.5827342873767332e-06, "loss": 0.3503, "step": 123455 }, { "epoch": 4.4494900349587345, "grad_norm": 0.3034095764160156, "learning_rate": 1.5817126388909448e-06, "loss": 0.3411, "step": 123460 }, { "epoch": 4.449670234619959, "grad_norm": 0.3237626254558563, "learning_rate": 1.5806913094747883e-06, "loss": 0.3616, "step": 123465 }, { "epoch": 4.449850434281183, "grad_norm": 0.26890459656715393, "learning_rate": 1.5796702991421802e-06, "loss": 0.407, "step": 123470 }, { "epoch": 4.450030633942408, "grad_norm": 0.2624853849411011, "learning_rate": 1.5786496079070262e-06, "loss": 0.3613, "step": 
123475 }, { "epoch": 4.450210833603633, "grad_norm": 0.20167163014411926, "learning_rate": 1.5776292357832374e-06, "loss": 0.3715, "step": 123480 }, { "epoch": 4.450391033264857, "grad_norm": 0.33518025279045105, "learning_rate": 1.576609182784719e-06, "loss": 0.3528, "step": 123485 }, { "epoch": 4.450571232926082, "grad_norm": 0.2814764082431793, "learning_rate": 1.575589448925363e-06, "loss": 0.3977, "step": 123490 }, { "epoch": 4.450751432587307, "grad_norm": 0.2996358871459961, "learning_rate": 1.5745700342190667e-06, "loss": 0.3737, "step": 123495 }, { "epoch": 4.4509316322485315, "grad_norm": 0.2564932107925415, "learning_rate": 1.5735509386797132e-06, "loss": 0.3465, "step": 123500 }, { "epoch": 4.4509316322485315, "eval_loss": 0.4289168119430542, "eval_runtime": 3.5469, "eval_samples_per_second": 28.194, "eval_steps_per_second": 7.048, "step": 123500 }, { "epoch": 4.451111831909756, "grad_norm": 0.24683259427547455, "learning_rate": 1.5725321623212025e-06, "loss": 0.3773, "step": 123505 }, { "epoch": 4.451292031570981, "grad_norm": 0.22746077179908752, "learning_rate": 1.5715137051574013e-06, "loss": 0.3871, "step": 123510 }, { "epoch": 4.451472231232206, "grad_norm": 0.304383784532547, "learning_rate": 1.5704955672021848e-06, "loss": 0.3756, "step": 123515 }, { "epoch": 4.4516524308934295, "grad_norm": 0.2046549767255783, "learning_rate": 1.5694777484694334e-06, "loss": 0.3864, "step": 123520 }, { "epoch": 4.451832630554654, "grad_norm": 0.2234576940536499, "learning_rate": 1.5684602489730137e-06, "loss": 0.3441, "step": 123525 }, { "epoch": 4.452012830215879, "grad_norm": 0.2881526052951813, "learning_rate": 1.5674430687267843e-06, "loss": 0.3734, "step": 123530 }, { "epoch": 4.452193029877104, "grad_norm": 0.22432564198970795, "learning_rate": 1.5664262077446091e-06, "loss": 0.3581, "step": 123535 }, { "epoch": 4.4523732295383285, "grad_norm": 0.2805180549621582, "learning_rate": 1.565409666040335e-06, "loss": 0.3709, "step": 123540 }, { "epoch": 
4.452553429199553, "grad_norm": 0.22590774297714233, "learning_rate": 1.5643934436278207e-06, "loss": 0.3828, "step": 123545 }, { "epoch": 4.452733628860778, "grad_norm": 0.3044675588607788, "learning_rate": 1.5633775405209079e-06, "loss": 0.3755, "step": 123550 }, { "epoch": 4.452913828522003, "grad_norm": 0.22538445889949799, "learning_rate": 1.562361956733438e-06, "loss": 0.3691, "step": 123555 }, { "epoch": 4.453094028183227, "grad_norm": 0.21017488837242126, "learning_rate": 1.5613466922792502e-06, "loss": 0.3595, "step": 123560 }, { "epoch": 4.453274227844451, "grad_norm": 0.24644504487514496, "learning_rate": 1.5603317471721723e-06, "loss": 0.3807, "step": 123565 }, { "epoch": 4.453454427505676, "grad_norm": 0.289897084236145, "learning_rate": 1.5593171214260377e-06, "loss": 0.3825, "step": 123570 }, { "epoch": 4.453634627166901, "grad_norm": 0.26970839500427246, "learning_rate": 1.558302815054677e-06, "loss": 0.371, "step": 123575 }, { "epoch": 4.4538148268281255, "grad_norm": 0.2419775426387787, "learning_rate": 1.55728882807189e-06, "loss": 0.3658, "step": 123580 }, { "epoch": 4.45399502648935, "grad_norm": 0.25539085268974304, "learning_rate": 1.5562751604915105e-06, "loss": 0.3781, "step": 123585 }, { "epoch": 4.454175226150575, "grad_norm": 0.2259790003299713, "learning_rate": 1.5552618123273438e-06, "loss": 0.3886, "step": 123590 }, { "epoch": 4.4543554258118, "grad_norm": 0.2829951047897339, "learning_rate": 1.554248783593193e-06, "loss": 0.3758, "step": 123595 }, { "epoch": 4.454535625473024, "grad_norm": 0.2783590257167816, "learning_rate": 1.5532360743028662e-06, "loss": 0.386, "step": 123600 }, { "epoch": 4.454715825134249, "grad_norm": 0.27192917466163635, "learning_rate": 1.5522236844701527e-06, "loss": 0.3568, "step": 123605 }, { "epoch": 4.454896024795473, "grad_norm": 0.21945591270923615, "learning_rate": 1.551211614108855e-06, "loss": 0.3553, "step": 123610 }, { "epoch": 4.455076224456698, "grad_norm": 0.2255585640668869, "learning_rate": 
1.5501998632327624e-06, "loss": 0.3524, "step": 123615 }, { "epoch": 4.455256424117922, "grad_norm": 0.22020921111106873, "learning_rate": 1.5491884318556526e-06, "loss": 0.3737, "step": 123620 }, { "epoch": 4.455436623779147, "grad_norm": 0.3251590132713318, "learning_rate": 1.548177319991312e-06, "loss": 0.3473, "step": 123625 }, { "epoch": 4.455616823440372, "grad_norm": 0.3003102242946625, "learning_rate": 1.5471665276535151e-06, "loss": 0.3673, "step": 123630 }, { "epoch": 4.455797023101597, "grad_norm": 0.22849665582180023, "learning_rate": 1.5461560548560322e-06, "loss": 0.3556, "step": 123635 }, { "epoch": 4.455977222762821, "grad_norm": 0.23016008734703064, "learning_rate": 1.5451459016126325e-06, "loss": 0.4272, "step": 123640 }, { "epoch": 4.456157422424046, "grad_norm": 0.25868502259254456, "learning_rate": 1.5441360679370798e-06, "loss": 0.3825, "step": 123645 }, { "epoch": 4.456337622085271, "grad_norm": 0.2868441641330719, "learning_rate": 1.5431265538431327e-06, "loss": 0.3849, "step": 123650 }, { "epoch": 4.456517821746495, "grad_norm": 0.2647748589515686, "learning_rate": 1.5421173593445442e-06, "loss": 0.3783, "step": 123655 }, { "epoch": 4.456698021407719, "grad_norm": 0.26466822624206543, "learning_rate": 1.5411084844550617e-06, "loss": 0.3664, "step": 123660 }, { "epoch": 4.456878221068944, "grad_norm": 0.2780085504055023, "learning_rate": 1.5400999291884377e-06, "loss": 0.3951, "step": 123665 }, { "epoch": 4.457058420730169, "grad_norm": 0.26032882928848267, "learning_rate": 1.5390916935584116e-06, "loss": 0.4208, "step": 123670 }, { "epoch": 4.457238620391394, "grad_norm": 0.23003022372722626, "learning_rate": 1.5380837775787194e-06, "loss": 0.3772, "step": 123675 }, { "epoch": 4.457418820052618, "grad_norm": 0.2510555684566498, "learning_rate": 1.5370761812630918e-06, "loss": 0.3706, "step": 123680 }, { "epoch": 4.457599019713843, "grad_norm": 0.2845185697078705, "learning_rate": 1.5360689046252542e-06, "loss": 0.3997, "step": 123685 }, { 
"epoch": 4.457779219375068, "grad_norm": 0.2645367980003357, "learning_rate": 1.5350619476789425e-06, "loss": 0.3494, "step": 123690 }, { "epoch": 4.457959419036293, "grad_norm": 0.32513341307640076, "learning_rate": 1.5340553104378652e-06, "loss": 0.3991, "step": 123695 }, { "epoch": 4.458139618697517, "grad_norm": 0.2185758650302887, "learning_rate": 1.5330489929157394e-06, "loss": 0.3575, "step": 123700 }, { "epoch": 4.458319818358741, "grad_norm": 0.24072350561618805, "learning_rate": 1.5320429951262788e-06, "loss": 0.3673, "step": 123705 }, { "epoch": 4.458500018019966, "grad_norm": 0.2706233263015747, "learning_rate": 1.5310373170831922e-06, "loss": 0.4193, "step": 123710 }, { "epoch": 4.458680217681191, "grad_norm": 0.21717756986618042, "learning_rate": 1.5300319588001766e-06, "loss": 0.3553, "step": 123715 }, { "epoch": 4.458860417342415, "grad_norm": 0.22631758451461792, "learning_rate": 1.5290269202909297e-06, "loss": 0.3442, "step": 123720 }, { "epoch": 4.45904061700364, "grad_norm": 0.2610069215297699, "learning_rate": 1.528022201569146e-06, "loss": 0.3812, "step": 123725 }, { "epoch": 4.459220816664865, "grad_norm": 0.3348616361618042, "learning_rate": 1.5270178026485172e-06, "loss": 0.4102, "step": 123730 }, { "epoch": 4.4594010163260895, "grad_norm": 0.27878302335739136, "learning_rate": 1.5260137235427268e-06, "loss": 0.3641, "step": 123735 }, { "epoch": 4.459581215987314, "grad_norm": 0.2503393292427063, "learning_rate": 1.5250099642654558e-06, "loss": 0.3192, "step": 123740 }, { "epoch": 4.459761415648538, "grad_norm": 0.2925935983657837, "learning_rate": 1.524006524830379e-06, "loss": 0.3556, "step": 123745 }, { "epoch": 4.459941615309763, "grad_norm": 0.25310271978378296, "learning_rate": 1.5230034052511637e-06, "loss": 0.3719, "step": 123750 }, { "epoch": 4.460121814970988, "grad_norm": 0.28005677461624146, "learning_rate": 1.5220006055414848e-06, "loss": 0.3673, "step": 123755 }, { "epoch": 4.460302014632212, "grad_norm": 0.26591956615448, 
"learning_rate": 1.520998125715009e-06, "loss": 0.3491, "step": 123760 }, { "epoch": 4.460482214293437, "grad_norm": 0.24851107597351074, "learning_rate": 1.5199959657853784e-06, "loss": 0.3389, "step": 123765 }, { "epoch": 4.460662413954662, "grad_norm": 0.24951453506946564, "learning_rate": 1.5189941257662598e-06, "loss": 0.3878, "step": 123770 }, { "epoch": 4.4608426136158865, "grad_norm": 0.26914355158805847, "learning_rate": 1.5179926056713006e-06, "loss": 0.3913, "step": 123775 }, { "epoch": 4.461022813277111, "grad_norm": 0.2922750413417816, "learning_rate": 1.5169914055141427e-06, "loss": 0.3501, "step": 123780 }, { "epoch": 4.461203012938336, "grad_norm": 0.22999052703380585, "learning_rate": 1.5159905253084388e-06, "loss": 0.3749, "step": 123785 }, { "epoch": 4.461383212599561, "grad_norm": 0.3160970211029053, "learning_rate": 1.514989965067809e-06, "loss": 0.3529, "step": 123790 }, { "epoch": 4.461563412260785, "grad_norm": 0.21242515742778778, "learning_rate": 1.5139897248058977e-06, "loss": 0.3838, "step": 123795 }, { "epoch": 4.461743611922009, "grad_norm": 0.24923621118068695, "learning_rate": 1.5129898045363273e-06, "loss": 0.3617, "step": 123800 }, { "epoch": 4.461923811583234, "grad_norm": 0.25680771470069885, "learning_rate": 1.5119902042727257e-06, "loss": 0.3805, "step": 123805 }, { "epoch": 4.462104011244459, "grad_norm": 0.29142534732818604, "learning_rate": 1.51099092402871e-06, "loss": 0.3772, "step": 123810 }, { "epoch": 4.4622842109056835, "grad_norm": 0.212846577167511, "learning_rate": 1.5099919638178972e-06, "loss": 0.3341, "step": 123815 }, { "epoch": 4.462464410566908, "grad_norm": 0.22237548232078552, "learning_rate": 1.50899332365389e-06, "loss": 0.354, "step": 123820 }, { "epoch": 4.462644610228133, "grad_norm": 0.2658112645149231, "learning_rate": 1.5079950035503054e-06, "loss": 0.3755, "step": 123825 }, { "epoch": 4.462824809889358, "grad_norm": 0.2840268611907959, "learning_rate": 1.506997003520741e-06, "loss": 0.3935, "step": 
123830 }, { "epoch": 4.463005009550582, "grad_norm": 0.2549954354763031, "learning_rate": 1.5059993235787912e-06, "loss": 0.3839, "step": 123835 }, { "epoch": 4.463185209211806, "grad_norm": 0.26900264620780945, "learning_rate": 1.5050019637380565e-06, "loss": 0.3575, "step": 123840 }, { "epoch": 4.463365408873031, "grad_norm": 0.28676745295524597, "learning_rate": 1.5040049240121145e-06, "loss": 0.3925, "step": 123845 }, { "epoch": 4.463545608534256, "grad_norm": 0.24285760521888733, "learning_rate": 1.5030082044145605e-06, "loss": 0.3623, "step": 123850 }, { "epoch": 4.4637258081954805, "grad_norm": 0.2883954346179962, "learning_rate": 1.5020118049589722e-06, "loss": 0.3763, "step": 123855 }, { "epoch": 4.463906007856705, "grad_norm": 0.31590425968170166, "learning_rate": 1.5010157256589218e-06, "loss": 0.3716, "step": 123860 }, { "epoch": 4.46408620751793, "grad_norm": 0.21175405383110046, "learning_rate": 1.5000199665279824e-06, "loss": 0.3788, "step": 123865 }, { "epoch": 4.464266407179155, "grad_norm": 0.23191265761852264, "learning_rate": 1.4990245275797177e-06, "loss": 0.3739, "step": 123870 }, { "epoch": 4.464446606840379, "grad_norm": 0.23299415409564972, "learning_rate": 1.498029408827703e-06, "loss": 0.3612, "step": 123875 }, { "epoch": 4.464626806501604, "grad_norm": 0.28248894214630127, "learning_rate": 1.4970346102854831e-06, "loss": 0.3685, "step": 123880 }, { "epoch": 4.464807006162829, "grad_norm": 0.27142465114593506, "learning_rate": 1.4960401319666113e-06, "loss": 0.3475, "step": 123885 }, { "epoch": 4.464987205824053, "grad_norm": 0.2543891966342926, "learning_rate": 1.4950459738846455e-06, "loss": 0.3551, "step": 123890 }, { "epoch": 4.4651674054852775, "grad_norm": 0.22357742488384247, "learning_rate": 1.4940521360531257e-06, "loss": 0.396, "step": 123895 }, { "epoch": 4.465347605146502, "grad_norm": 0.3259972333908081, "learning_rate": 1.4930586184856016e-06, "loss": 0.3484, "step": 123900 }, { "epoch": 4.465527804807727, "grad_norm": 
0.23107150197029114, "learning_rate": 1.4920654211955987e-06, "loss": 0.3455, "step": 123905 }, { "epoch": 4.465708004468952, "grad_norm": 0.2234567403793335, "learning_rate": 1.4910725441966505e-06, "loss": 0.3741, "step": 123910 }, { "epoch": 4.465888204130176, "grad_norm": 0.23561283946037292, "learning_rate": 1.4900799875022908e-06, "loss": 0.3616, "step": 123915 }, { "epoch": 4.466068403791401, "grad_norm": 0.2155967354774475, "learning_rate": 1.4890877511260393e-06, "loss": 0.3914, "step": 123920 }, { "epoch": 4.466248603452626, "grad_norm": 0.22571438550949097, "learning_rate": 1.4880958350814156e-06, "loss": 0.3525, "step": 123925 }, { "epoch": 4.46642880311385, "grad_norm": 0.2629721462726593, "learning_rate": 1.487104239381934e-06, "loss": 0.4074, "step": 123930 }, { "epoch": 4.466609002775074, "grad_norm": 0.2295321524143219, "learning_rate": 1.4861129640411004e-06, "loss": 0.3878, "step": 123935 }, { "epoch": 4.466789202436299, "grad_norm": 0.2172783464193344, "learning_rate": 1.4851220090724288e-06, "loss": 0.3812, "step": 123940 }, { "epoch": 4.466969402097524, "grad_norm": 0.2786194980144501, "learning_rate": 1.484131374489417e-06, "loss": 0.3771, "step": 123945 }, { "epoch": 4.467149601758749, "grad_norm": 0.25498166680336, "learning_rate": 1.4831410603055651e-06, "loss": 0.3795, "step": 123950 }, { "epoch": 4.467329801419973, "grad_norm": 0.24183687567710876, "learning_rate": 1.4821510665343595e-06, "loss": 0.4133, "step": 123955 }, { "epoch": 4.467510001081198, "grad_norm": 0.2495322972536087, "learning_rate": 1.4811613931892953e-06, "loss": 0.3759, "step": 123960 }, { "epoch": 4.467690200742423, "grad_norm": 0.2298755794763565, "learning_rate": 1.4801720402838503e-06, "loss": 0.3509, "step": 123965 }, { "epoch": 4.467870400403648, "grad_norm": 0.2513538897037506, "learning_rate": 1.4791830078315139e-06, "loss": 0.3569, "step": 123970 }, { "epoch": 4.468050600064872, "grad_norm": 0.22999824583530426, "learning_rate": 1.4781942958457474e-06, 
"loss": 0.3452, "step": 123975 }, { "epoch": 4.468230799726096, "grad_norm": 0.29147133231163025, "learning_rate": 1.4772059043400317e-06, "loss": 0.3672, "step": 123980 }, { "epoch": 4.468410999387321, "grad_norm": 0.2764071226119995, "learning_rate": 1.4762178333278337e-06, "loss": 0.365, "step": 123985 }, { "epoch": 4.468591199048546, "grad_norm": 0.22821201384067535, "learning_rate": 1.475230082822615e-06, "loss": 0.3937, "step": 123990 }, { "epoch": 4.46877139870977, "grad_norm": 0.27766650915145874, "learning_rate": 1.474242652837829e-06, "loss": 0.3654, "step": 123995 }, { "epoch": 4.468951598370995, "grad_norm": 0.21973931789398193, "learning_rate": 1.4732555433869339e-06, "loss": 0.3532, "step": 124000 }, { "epoch": 4.468951598370995, "eval_loss": 0.42887455224990845, "eval_runtime": 3.5305, "eval_samples_per_second": 28.324, "eval_steps_per_second": 7.081, "step": 124000 }, { "epoch": 4.46913179803222, "grad_norm": 0.18677954375743866, "learning_rate": 1.4722687544833719e-06, "loss": 0.3637, "step": 124005 }, { "epoch": 4.469311997693445, "grad_norm": 0.22494982182979584, "learning_rate": 1.4712822861405962e-06, "loss": 0.3444, "step": 124010 }, { "epoch": 4.469492197354669, "grad_norm": 0.2011212706565857, "learning_rate": 1.470296138372046e-06, "loss": 0.3575, "step": 124015 }, { "epoch": 4.469672397015893, "grad_norm": 0.2682434320449829, "learning_rate": 1.4693103111911549e-06, "loss": 0.3655, "step": 124020 }, { "epoch": 4.469852596677118, "grad_norm": 0.2674734890460968, "learning_rate": 1.468324804611354e-06, "loss": 0.3717, "step": 124025 }, { "epoch": 4.470032796338343, "grad_norm": 0.24927879869937897, "learning_rate": 1.4673396186460687e-06, "loss": 0.3839, "step": 124030 }, { "epoch": 4.470212995999567, "grad_norm": 0.2811104953289032, "learning_rate": 1.4663547533087297e-06, "loss": 0.347, "step": 124035 }, { "epoch": 4.470393195660792, "grad_norm": 0.28087154030799866, "learning_rate": 1.4653702086127541e-06, "loss": 0.4091, "step": 124040 
}, { "epoch": 4.470573395322017, "grad_norm": 0.2959977686405182, "learning_rate": 1.464385984571545e-06, "loss": 0.3983, "step": 124045 }, { "epoch": 4.4707535949832415, "grad_norm": 0.3171277344226837, "learning_rate": 1.4634020811985255e-06, "loss": 0.36, "step": 124050 }, { "epoch": 4.470933794644466, "grad_norm": 0.2086503803730011, "learning_rate": 1.46241849850709e-06, "loss": 0.3771, "step": 124055 }, { "epoch": 4.471113994305691, "grad_norm": 0.23770023882389069, "learning_rate": 1.46143523651065e-06, "loss": 0.3546, "step": 124060 }, { "epoch": 4.471294193966916, "grad_norm": 0.319051057100296, "learning_rate": 1.4604522952226001e-06, "loss": 0.3848, "step": 124065 }, { "epoch": 4.47147439362814, "grad_norm": 0.1936783492565155, "learning_rate": 1.4594696746563246e-06, "loss": 0.3537, "step": 124070 }, { "epoch": 4.471654593289364, "grad_norm": 0.25065305829048157, "learning_rate": 1.4584873748252208e-06, "loss": 0.3907, "step": 124075 }, { "epoch": 4.471834792950589, "grad_norm": 0.23407846689224243, "learning_rate": 1.4575053957426643e-06, "loss": 0.3819, "step": 124080 }, { "epoch": 4.472014992611814, "grad_norm": 0.2705335319042206, "learning_rate": 1.456523737422047e-06, "loss": 0.3762, "step": 124085 }, { "epoch": 4.4721951922730385, "grad_norm": 0.2572081685066223, "learning_rate": 1.4555423998767304e-06, "loss": 0.3961, "step": 124090 }, { "epoch": 4.472375391934263, "grad_norm": 0.2333773970603943, "learning_rate": 1.45456138312009e-06, "loss": 0.3601, "step": 124095 }, { "epoch": 4.472555591595488, "grad_norm": 0.2367962896823883, "learning_rate": 1.4535806871654927e-06, "loss": 0.3246, "step": 124100 }, { "epoch": 4.472735791256713, "grad_norm": 0.23657187819480896, "learning_rate": 1.4526003120263005e-06, "loss": 0.387, "step": 124105 }, { "epoch": 4.4729159909179375, "grad_norm": 0.2925667464733124, "learning_rate": 1.451620257715869e-06, "loss": 0.3759, "step": 124110 }, { "epoch": 4.473096190579161, "grad_norm": 0.30918559432029724, 
"learning_rate": 1.450640524247554e-06, "loss": 0.3738, "step": 124115 }, { "epoch": 4.473276390240386, "grad_norm": 0.32785508036613464, "learning_rate": 1.449661111634698e-06, "loss": 0.3538, "step": 124120 }, { "epoch": 4.473456589901611, "grad_norm": 0.25581833720207214, "learning_rate": 1.4486820198906542e-06, "loss": 0.3863, "step": 124125 }, { "epoch": 4.4736367895628355, "grad_norm": 0.2755228877067566, "learning_rate": 1.447703249028759e-06, "loss": 0.3925, "step": 124130 }, { "epoch": 4.47381698922406, "grad_norm": 0.29217204451560974, "learning_rate": 1.4467247990623462e-06, "loss": 0.3763, "step": 124135 }, { "epoch": 4.473997188885285, "grad_norm": 0.22582022845745087, "learning_rate": 1.4457466700047496e-06, "loss": 0.3484, "step": 124140 }, { "epoch": 4.47417738854651, "grad_norm": 0.2606388330459595, "learning_rate": 1.4447688618692916e-06, "loss": 0.3651, "step": 124145 }, { "epoch": 4.474357588207734, "grad_norm": 0.24026912450790405, "learning_rate": 1.4437913746692955e-06, "loss": 0.3714, "step": 124150 }, { "epoch": 4.474537787868959, "grad_norm": 0.28807055950164795, "learning_rate": 1.4428142084180862e-06, "loss": 0.359, "step": 124155 }, { "epoch": 4.474717987530184, "grad_norm": 0.23027734458446503, "learning_rate": 1.4418373631289671e-06, "loss": 0.3887, "step": 124160 }, { "epoch": 4.474898187191408, "grad_norm": 0.2151884138584137, "learning_rate": 1.4408608388152583e-06, "loss": 0.3844, "step": 124165 }, { "epoch": 4.4750783868526325, "grad_norm": 0.30616864562034607, "learning_rate": 1.4398846354902574e-06, "loss": 0.376, "step": 124170 }, { "epoch": 4.475258586513857, "grad_norm": 0.28647398948669434, "learning_rate": 1.4389087531672674e-06, "loss": 0.3926, "step": 124175 }, { "epoch": 4.475438786175082, "grad_norm": 0.24233390390872955, "learning_rate": 1.4379331918595835e-06, "loss": 0.3548, "step": 124180 }, { "epoch": 4.475618985836307, "grad_norm": 0.2877272963523865, "learning_rate": 1.4369579515804975e-06, "loss": 0.3599, 
"step": 124185 }, { "epoch": 4.475799185497531, "grad_norm": 0.2793196737766266, "learning_rate": 1.4359830323432937e-06, "loss": 0.3681, "step": 124190 }, { "epoch": 4.475979385158756, "grad_norm": 0.21865501999855042, "learning_rate": 1.435008434161264e-06, "loss": 0.3772, "step": 124195 }, { "epoch": 4.476159584819981, "grad_norm": 0.24669122695922852, "learning_rate": 1.434034157047684e-06, "loss": 0.3853, "step": 124200 }, { "epoch": 4.476339784481205, "grad_norm": 0.23632431030273438, "learning_rate": 1.4330602010158235e-06, "loss": 0.3896, "step": 124205 }, { "epoch": 4.4765199841424295, "grad_norm": 0.2615738809108734, "learning_rate": 1.4320865660789551e-06, "loss": 0.3571, "step": 124210 }, { "epoch": 4.476700183803654, "grad_norm": 0.23187601566314697, "learning_rate": 1.4311132522503407e-06, "loss": 0.3799, "step": 124215 }, { "epoch": 4.476880383464879, "grad_norm": 0.23954427242279053, "learning_rate": 1.4301402595432472e-06, "loss": 0.3563, "step": 124220 }, { "epoch": 4.477060583126104, "grad_norm": 0.2520751953125, "learning_rate": 1.429167587970931e-06, "loss": 0.3862, "step": 124225 }, { "epoch": 4.477240782787328, "grad_norm": 0.2765583097934723, "learning_rate": 1.4281952375466451e-06, "loss": 0.3983, "step": 124230 }, { "epoch": 4.477420982448553, "grad_norm": 0.2432951033115387, "learning_rate": 1.4272232082836317e-06, "loss": 0.3751, "step": 124235 }, { "epoch": 4.477601182109778, "grad_norm": 0.2171791046857834, "learning_rate": 1.4262515001951389e-06, "loss": 0.3707, "step": 124240 }, { "epoch": 4.477781381771003, "grad_norm": 0.2439403533935547, "learning_rate": 1.4252801132944055e-06, "loss": 0.3821, "step": 124245 }, { "epoch": 4.477961581432227, "grad_norm": 0.2420462816953659, "learning_rate": 1.4243090475946713e-06, "loss": 0.3772, "step": 124250 }, { "epoch": 4.478141781093451, "grad_norm": 0.28287315368652344, "learning_rate": 1.4233383031091535e-06, "loss": 0.3906, "step": 124255 }, { "epoch": 4.478321980754676, "grad_norm": 
0.22504858672618866, "learning_rate": 1.4223678798510915e-06, "loss": 0.3988, "step": 124260 }, { "epoch": 4.478502180415901, "grad_norm": 0.2430412620306015, "learning_rate": 1.4213977778336995e-06, "loss": 0.3859, "step": 124265 }, { "epoch": 4.478682380077125, "grad_norm": 0.31164422631263733, "learning_rate": 1.4204279970702034e-06, "loss": 0.4001, "step": 124270 }, { "epoch": 4.47886257973835, "grad_norm": 0.3133993446826935, "learning_rate": 1.4194585375738061e-06, "loss": 0.3915, "step": 124275 }, { "epoch": 4.479042779399575, "grad_norm": 0.27546271681785583, "learning_rate": 1.4184893993577198e-06, "loss": 0.3829, "step": 124280 }, { "epoch": 4.4792229790608, "grad_norm": 0.32316166162490845, "learning_rate": 1.417520582435153e-06, "loss": 0.3139, "step": 124285 }, { "epoch": 4.479403178722024, "grad_norm": 0.23698268830776215, "learning_rate": 1.4165520868193034e-06, "loss": 0.3568, "step": 124290 }, { "epoch": 4.479583378383249, "grad_norm": 0.26185452938079834, "learning_rate": 1.4155839125233634e-06, "loss": 0.3757, "step": 124295 }, { "epoch": 4.479763578044473, "grad_norm": 0.24788565933704376, "learning_rate": 1.414616059560528e-06, "loss": 0.3441, "step": 124300 }, { "epoch": 4.479943777705698, "grad_norm": 0.3030286729335785, "learning_rate": 1.4136485279439783e-06, "loss": 0.3465, "step": 124305 }, { "epoch": 4.480123977366922, "grad_norm": 0.2569609582424164, "learning_rate": 1.4126813176869036e-06, "loss": 0.3192, "step": 124310 }, { "epoch": 4.480304177028147, "grad_norm": 0.22701065242290497, "learning_rate": 1.4117144288024797e-06, "loss": 0.3387, "step": 124315 }, { "epoch": 4.480484376689372, "grad_norm": 0.23131400346755981, "learning_rate": 1.4107478613038821e-06, "loss": 0.3596, "step": 124320 }, { "epoch": 4.480664576350597, "grad_norm": 0.260778546333313, "learning_rate": 1.4097816152042753e-06, "loss": 0.3689, "step": 124325 }, { "epoch": 4.480844776011821, "grad_norm": 0.23251162469387054, "learning_rate": 1.4088156905168293e-06, 
"loss": 0.3959, "step": 124330 }, { "epoch": 4.481024975673046, "grad_norm": 0.25117242336273193, "learning_rate": 1.4078500872546973e-06, "loss": 0.3879, "step": 124335 }, { "epoch": 4.481205175334271, "grad_norm": 0.2599792778491974, "learning_rate": 1.4068848054310469e-06, "loss": 0.3692, "step": 124340 }, { "epoch": 4.481385374995495, "grad_norm": 0.28005966544151306, "learning_rate": 1.4059198450590143e-06, "loss": 0.3937, "step": 124345 }, { "epoch": 4.481565574656719, "grad_norm": 0.24943847954273224, "learning_rate": 1.4049552061517619e-06, "loss": 0.344, "step": 124350 }, { "epoch": 4.481745774317944, "grad_norm": 0.2192724198102951, "learning_rate": 1.4039908887224285e-06, "loss": 0.3625, "step": 124355 }, { "epoch": 4.481925973979169, "grad_norm": 0.2770889401435852, "learning_rate": 1.4030268927841427e-06, "loss": 0.3775, "step": 124360 }, { "epoch": 4.4821061736403935, "grad_norm": 0.22299063205718994, "learning_rate": 1.4020632183500583e-06, "loss": 0.3846, "step": 124365 }, { "epoch": 4.482286373301618, "grad_norm": 0.2386179119348526, "learning_rate": 1.4010998654332892e-06, "loss": 0.3591, "step": 124370 }, { "epoch": 4.482466572962843, "grad_norm": 0.24074918031692505, "learning_rate": 1.4001368340469611e-06, "loss": 0.3799, "step": 124375 }, { "epoch": 4.482646772624068, "grad_norm": 0.2530936896800995, "learning_rate": 1.3991741242042027e-06, "loss": 0.3429, "step": 124380 }, { "epoch": 4.4828269722852925, "grad_norm": 0.28871747851371765, "learning_rate": 1.3982117359181286e-06, "loss": 0.3948, "step": 124385 }, { "epoch": 4.483007171946516, "grad_norm": 0.2461950182914734, "learning_rate": 1.3972496692018499e-06, "loss": 0.357, "step": 124390 }, { "epoch": 4.483187371607741, "grad_norm": 0.21096403896808624, "learning_rate": 1.3962879240684734e-06, "loss": 0.3569, "step": 124395 }, { "epoch": 4.483367571268966, "grad_norm": 0.3317560851573944, "learning_rate": 1.3953265005311021e-06, "loss": 0.3861, "step": 124400 }, { "epoch": 
4.4835477709301905, "grad_norm": 0.2790171504020691, "learning_rate": 1.39436539860284e-06, "loss": 0.3131, "step": 124405 }, { "epoch": 4.483727970591415, "grad_norm": 0.21916787326335907, "learning_rate": 1.3934046182967814e-06, "loss": 0.3983, "step": 124410 }, { "epoch": 4.48390817025264, "grad_norm": 0.25883591175079346, "learning_rate": 1.3924441596260107e-06, "loss": 0.3775, "step": 124415 }, { "epoch": 4.484088369913865, "grad_norm": 0.18844369053840637, "learning_rate": 1.3914840226036202e-06, "loss": 0.4003, "step": 124420 }, { "epoch": 4.4842685695750895, "grad_norm": 0.23868656158447266, "learning_rate": 1.3905242072426854e-06, "loss": 0.3791, "step": 124425 }, { "epoch": 4.484448769236314, "grad_norm": 0.27947574853897095, "learning_rate": 1.3895647135562906e-06, "loss": 0.3521, "step": 124430 }, { "epoch": 4.484628968897539, "grad_norm": 0.23540154099464417, "learning_rate": 1.3886055415575084e-06, "loss": 0.355, "step": 124435 }, { "epoch": 4.484809168558763, "grad_norm": 0.23615595698356628, "learning_rate": 1.3876466912593976e-06, "loss": 0.3757, "step": 124440 }, { "epoch": 4.4849893682199875, "grad_norm": 0.22659413516521454, "learning_rate": 1.3866881626750345e-06, "loss": 0.3436, "step": 124445 }, { "epoch": 4.485169567881212, "grad_norm": 0.24826356768608093, "learning_rate": 1.385729955817469e-06, "loss": 0.3645, "step": 124450 }, { "epoch": 4.485349767542437, "grad_norm": 0.24432316422462463, "learning_rate": 1.3847720706997663e-06, "loss": 0.3564, "step": 124455 }, { "epoch": 4.485529967203662, "grad_norm": 0.21962685883045197, "learning_rate": 1.3838145073349712e-06, "loss": 0.3265, "step": 124460 }, { "epoch": 4.485710166864886, "grad_norm": 0.28142571449279785, "learning_rate": 1.382857265736126e-06, "loss": 0.3877, "step": 124465 }, { "epoch": 4.485890366526111, "grad_norm": 0.20428895950317383, "learning_rate": 1.3819003459162816e-06, "loss": 0.3767, "step": 124470 }, { "epoch": 4.486070566187336, "grad_norm": 0.27324962615966797, 
"learning_rate": 1.3809437478884719e-06, "loss": 0.3281, "step": 124475 }, { "epoch": 4.48625076584856, "grad_norm": 0.2558141350746155, "learning_rate": 1.3799874716657307e-06, "loss": 0.384, "step": 124480 }, { "epoch": 4.4864309655097845, "grad_norm": 0.24941542744636536, "learning_rate": 1.3790315172610896e-06, "loss": 0.3501, "step": 124485 }, { "epoch": 4.486611165171009, "grad_norm": 0.2563149333000183, "learning_rate": 1.3780758846875658e-06, "loss": 0.3841, "step": 124490 }, { "epoch": 4.486791364832234, "grad_norm": 0.2376670092344284, "learning_rate": 1.3771205739581877e-06, "loss": 0.3725, "step": 124495 }, { "epoch": 4.486971564493459, "grad_norm": 0.30792880058288574, "learning_rate": 1.37616558508597e-06, "loss": 0.3779, "step": 124500 }, { "epoch": 4.486971564493459, "eval_loss": 0.42892351746559143, "eval_runtime": 3.5425, "eval_samples_per_second": 28.229, "eval_steps_per_second": 7.057, "step": 124500 }, { "epoch": 4.487151764154683, "grad_norm": 0.28269314765930176, "learning_rate": 1.3752109180839217e-06, "loss": 0.4113, "step": 124505 }, { "epoch": 4.487331963815908, "grad_norm": 0.27605292201042175, "learning_rate": 1.3742565729650491e-06, "loss": 0.3593, "step": 124510 }, { "epoch": 4.487512163477133, "grad_norm": 0.2970362901687622, "learning_rate": 1.3733025497423586e-06, "loss": 0.3946, "step": 124515 }, { "epoch": 4.487692363138358, "grad_norm": 0.285276859998703, "learning_rate": 1.3723488484288421e-06, "loss": 0.3863, "step": 124520 }, { "epoch": 4.487872562799582, "grad_norm": 0.2632303535938263, "learning_rate": 1.3713954690375035e-06, "loss": 0.4355, "step": 124525 }, { "epoch": 4.488052762460806, "grad_norm": 0.3081340193748474, "learning_rate": 1.370442411581324e-06, "loss": 0.3526, "step": 124530 }, { "epoch": 4.488232962122031, "grad_norm": 0.2594117820262909, "learning_rate": 1.3694896760732933e-06, "loss": 0.3852, "step": 124535 }, { "epoch": 4.488413161783256, "grad_norm": 0.30716049671173096, "learning_rate": 
1.3685372625263899e-06, "loss": 0.3624, "step": 124540 }, { "epoch": 4.48859336144448, "grad_norm": 0.23581121861934662, "learning_rate": 1.367585170953589e-06, "loss": 0.3477, "step": 124545 }, { "epoch": 4.488773561105705, "grad_norm": 0.24266508221626282, "learning_rate": 1.3666334013678729e-06, "loss": 0.4177, "step": 124550 }, { "epoch": 4.48895376076693, "grad_norm": 0.22927452623844147, "learning_rate": 1.365681953782194e-06, "loss": 0.3946, "step": 124555 }, { "epoch": 4.489133960428155, "grad_norm": 0.24199864268302917, "learning_rate": 1.3647308282095206e-06, "loss": 0.3845, "step": 124560 }, { "epoch": 4.489314160089379, "grad_norm": 0.2787548303604126, "learning_rate": 1.3637800246628197e-06, "loss": 0.385, "step": 124565 }, { "epoch": 4.489494359750604, "grad_norm": 0.2417118102312088, "learning_rate": 1.3628295431550365e-06, "loss": 0.3755, "step": 124570 }, { "epoch": 4.489674559411828, "grad_norm": 0.2551795542240143, "learning_rate": 1.3618793836991273e-06, "loss": 0.3499, "step": 124575 }, { "epoch": 4.489854759073053, "grad_norm": 0.3174431324005127, "learning_rate": 1.3609295463080318e-06, "loss": 0.3807, "step": 124580 }, { "epoch": 4.490034958734277, "grad_norm": 0.23123787343502045, "learning_rate": 1.3599800309946925e-06, "loss": 0.3648, "step": 124585 }, { "epoch": 4.490215158395502, "grad_norm": 0.27223360538482666, "learning_rate": 1.3590308377720546e-06, "loss": 0.3421, "step": 124590 }, { "epoch": 4.490395358056727, "grad_norm": 0.25846990942955017, "learning_rate": 1.3580819666530408e-06, "loss": 0.3496, "step": 124595 }, { "epoch": 4.490575557717952, "grad_norm": 0.25922471284866333, "learning_rate": 1.3571334176505856e-06, "loss": 0.3783, "step": 124600 }, { "epoch": 4.490755757379176, "grad_norm": 0.2560346722602844, "learning_rate": 1.3561851907776064e-06, "loss": 0.3274, "step": 124605 }, { "epoch": 4.490935957040401, "grad_norm": 0.25044140219688416, "learning_rate": 1.355237286047026e-06, "loss": 0.377, "step": 124610 }, { 
"epoch": 4.491116156701626, "grad_norm": 0.25878745317459106, "learning_rate": 1.3542897034717644e-06, "loss": 0.3426, "step": 124615 }, { "epoch": 4.49129635636285, "grad_norm": 0.23946064710617065, "learning_rate": 1.3533424430647285e-06, "loss": 0.3811, "step": 124620 }, { "epoch": 4.491476556024074, "grad_norm": 0.282932311296463, "learning_rate": 1.3523955048388188e-06, "loss": 0.3618, "step": 124625 }, { "epoch": 4.491656755685299, "grad_norm": 0.2500767707824707, "learning_rate": 1.3514488888069443e-06, "loss": 0.3679, "step": 124630 }, { "epoch": 4.491836955346524, "grad_norm": 0.2235366553068161, "learning_rate": 1.3505025949819978e-06, "loss": 0.3695, "step": 124635 }, { "epoch": 4.492017155007749, "grad_norm": 0.31541529297828674, "learning_rate": 1.3495566233768742e-06, "loss": 0.4089, "step": 124640 }, { "epoch": 4.492197354668973, "grad_norm": 0.3026379942893982, "learning_rate": 1.3486109740044688e-06, "loss": 0.3709, "step": 124645 }, { "epoch": 4.492377554330198, "grad_norm": 0.26099175214767456, "learning_rate": 1.347665646877655e-06, "loss": 0.3478, "step": 124650 }, { "epoch": 4.492557753991423, "grad_norm": 0.2050563395023346, "learning_rate": 1.3467206420093191e-06, "loss": 0.367, "step": 124655 }, { "epoch": 4.4927379536526475, "grad_norm": 0.26844698190689087, "learning_rate": 1.3457759594123348e-06, "loss": 0.3862, "step": 124660 }, { "epoch": 4.492918153313871, "grad_norm": 0.2691669762134552, "learning_rate": 1.3448315990995746e-06, "loss": 0.3652, "step": 124665 }, { "epoch": 4.493098352975096, "grad_norm": 0.26715192198753357, "learning_rate": 1.3438875610839035e-06, "loss": 0.3557, "step": 124670 }, { "epoch": 4.493278552636321, "grad_norm": 0.27473947405815125, "learning_rate": 1.3429438453781807e-06, "loss": 0.3646, "step": 124675 }, { "epoch": 4.4934587522975455, "grad_norm": 0.272559255361557, "learning_rate": 1.3420004519952706e-06, "loss": 0.3924, "step": 124680 }, { "epoch": 4.49363895195877, "grad_norm": 0.2625700831413269, 
"learning_rate": 1.3410573809480242e-06, "loss": 0.4042, "step": 124685 }, { "epoch": 4.493819151619995, "grad_norm": 0.2726500332355499, "learning_rate": 1.3401146322492924e-06, "loss": 0.3691, "step": 124690 }, { "epoch": 4.49399935128122, "grad_norm": 0.23851507902145386, "learning_rate": 1.339172205911915e-06, "loss": 0.3906, "step": 124695 }, { "epoch": 4.4941795509424445, "grad_norm": 0.212006077170372, "learning_rate": 1.338230101948737e-06, "loss": 0.3953, "step": 124700 }, { "epoch": 4.494359750603669, "grad_norm": 0.24577181041240692, "learning_rate": 1.33728832037259e-06, "loss": 0.354, "step": 124705 }, { "epoch": 4.494539950264894, "grad_norm": 0.24666406214237213, "learning_rate": 1.336346861196311e-06, "loss": 0.3328, "step": 124710 }, { "epoch": 4.494720149926118, "grad_norm": 0.22719623148441315, "learning_rate": 1.3354057244327256e-06, "loss": 0.3629, "step": 124715 }, { "epoch": 4.4949003495873425, "grad_norm": 0.24336789548397064, "learning_rate": 1.3344649100946543e-06, "loss": 0.3702, "step": 124720 }, { "epoch": 4.495080549248567, "grad_norm": 0.2589198350906372, "learning_rate": 1.3335244181949174e-06, "loss": 0.3501, "step": 124725 }, { "epoch": 4.495260748909792, "grad_norm": 0.2021062821149826, "learning_rate": 1.3325842487463241e-06, "loss": 0.3659, "step": 124730 }, { "epoch": 4.495440948571017, "grad_norm": 0.22475126385688782, "learning_rate": 1.3316444017616975e-06, "loss": 0.3618, "step": 124735 }, { "epoch": 4.4956211482322415, "grad_norm": 0.29188576340675354, "learning_rate": 1.3307048772538272e-06, "loss": 0.3786, "step": 124740 }, { "epoch": 4.495801347893466, "grad_norm": 0.2823890149593353, "learning_rate": 1.3297656752355197e-06, "loss": 0.3519, "step": 124745 }, { "epoch": 4.495981547554691, "grad_norm": 0.2329128384590149, "learning_rate": 1.3288267957195733e-06, "loss": 0.3333, "step": 124750 }, { "epoch": 4.496161747215915, "grad_norm": 0.3408520817756653, "learning_rate": 1.3278882387187775e-06, "loss": 0.3682, "step": 
124755 }, { "epoch": 4.4963419468771395, "grad_norm": 0.2713524103164673, "learning_rate": 1.3269500042459276e-06, "loss": 0.3608, "step": 124760 }, { "epoch": 4.496522146538364, "grad_norm": 0.2413674145936966, "learning_rate": 1.3260120923137943e-06, "loss": 0.3557, "step": 124765 }, { "epoch": 4.496702346199589, "grad_norm": 0.29502227902412415, "learning_rate": 1.325074502935164e-06, "loss": 0.3778, "step": 124770 }, { "epoch": 4.496882545860814, "grad_norm": 0.21458546817302704, "learning_rate": 1.3241372361228104e-06, "loss": 0.3857, "step": 124775 }, { "epoch": 4.497062745522038, "grad_norm": 0.241383358836174, "learning_rate": 1.3232002918895036e-06, "loss": 0.4047, "step": 124780 }, { "epoch": 4.497242945183263, "grad_norm": 0.2607963979244232, "learning_rate": 1.3222636702480084e-06, "loss": 0.3352, "step": 124785 }, { "epoch": 4.497423144844488, "grad_norm": 0.24018988013267517, "learning_rate": 1.3213273712110895e-06, "loss": 0.3648, "step": 124790 }, { "epoch": 4.497603344505713, "grad_norm": 0.22829695045948029, "learning_rate": 1.3203913947914953e-06, "loss": 0.373, "step": 124795 }, { "epoch": 4.497783544166937, "grad_norm": 0.22519542276859283, "learning_rate": 1.3194557410019875e-06, "loss": 0.3512, "step": 124800 }, { "epoch": 4.497963743828161, "grad_norm": 0.2739558219909668, "learning_rate": 1.3185204098553089e-06, "loss": 0.3571, "step": 124805 }, { "epoch": 4.498143943489386, "grad_norm": 0.29086029529571533, "learning_rate": 1.3175854013642074e-06, "loss": 0.3761, "step": 124810 }, { "epoch": 4.498324143150611, "grad_norm": 0.2614690959453583, "learning_rate": 1.3166507155414176e-06, "loss": 0.3772, "step": 124815 }, { "epoch": 4.498504342811835, "grad_norm": 0.2470005601644516, "learning_rate": 1.315716352399679e-06, "loss": 0.3845, "step": 124820 }, { "epoch": 4.49868454247306, "grad_norm": 0.22588050365447998, "learning_rate": 1.3147823119517122e-06, "loss": 0.3627, "step": 124825 }, { "epoch": 4.498864742134285, "grad_norm": 
0.187801331281662, "learning_rate": 1.3138485942102625e-06, "loss": 0.3807, "step": 124830 }, { "epoch": 4.49904494179551, "grad_norm": 0.23127955198287964, "learning_rate": 1.3129151991880307e-06, "loss": 0.3522, "step": 124835 }, { "epoch": 4.499225141456734, "grad_norm": 0.20928794145584106, "learning_rate": 1.3119821268977455e-06, "loss": 0.3553, "step": 124840 }, { "epoch": 4.499405341117959, "grad_norm": 0.2646942138671875, "learning_rate": 1.3110493773521193e-06, "loss": 0.3659, "step": 124845 }, { "epoch": 4.499585540779183, "grad_norm": 0.2235756665468216, "learning_rate": 1.3101169505638582e-06, "loss": 0.3924, "step": 124850 }, { "epoch": 4.499765740440408, "grad_norm": 0.2302202582359314, "learning_rate": 1.3091848465456692e-06, "loss": 0.3704, "step": 124855 }, { "epoch": 4.499945940101632, "grad_norm": 0.2802170515060425, "learning_rate": 1.3082530653102444e-06, "loss": 0.3699, "step": 124860 }, { "epoch": 4.500126139762857, "grad_norm": 0.2077890932559967, "learning_rate": 1.3073216068702876e-06, "loss": 0.3576, "step": 124865 }, { "epoch": 4.500306339424082, "grad_norm": 0.2789739966392517, "learning_rate": 1.3063904712384889e-06, "loss": 0.3412, "step": 124870 }, { "epoch": 4.500486539085307, "grad_norm": 0.2757323086261749, "learning_rate": 1.3054596584275325e-06, "loss": 0.3565, "step": 124875 }, { "epoch": 4.500666738746531, "grad_norm": 0.18960040807724, "learning_rate": 1.3045291684500999e-06, "loss": 0.3404, "step": 124880 }, { "epoch": 4.500846938407756, "grad_norm": 0.23652799427509308, "learning_rate": 1.3035990013188698e-06, "loss": 0.3883, "step": 124885 }, { "epoch": 4.501027138068981, "grad_norm": 0.2553980350494385, "learning_rate": 1.3026691570465128e-06, "loss": 0.3724, "step": 124890 }, { "epoch": 4.5012073377302055, "grad_norm": 0.28166186809539795, "learning_rate": 1.3017396356457045e-06, "loss": 0.3743, "step": 124895 }, { "epoch": 4.501387537391429, "grad_norm": 0.25217875838279724, "learning_rate": 1.3008104371291047e-06, 
"loss": 0.3456, "step": 124900 }, { "epoch": 4.501567737052654, "grad_norm": 0.22290779650211334, "learning_rate": 1.299881561509375e-06, "loss": 0.3797, "step": 124905 }, { "epoch": 4.501747936713879, "grad_norm": 0.25568974018096924, "learning_rate": 1.2989530087991696e-06, "loss": 0.3846, "step": 124910 }, { "epoch": 4.501928136375104, "grad_norm": 0.23674674332141876, "learning_rate": 1.2980247790111367e-06, "loss": 0.4019, "step": 124915 }, { "epoch": 4.502108336036328, "grad_norm": 0.22385911643505096, "learning_rate": 1.2970968721579324e-06, "loss": 0.3634, "step": 124920 }, { "epoch": 4.502288535697553, "grad_norm": 0.3423471748828888, "learning_rate": 1.296169288252197e-06, "loss": 0.3883, "step": 124925 }, { "epoch": 4.502468735358778, "grad_norm": 0.29338136315345764, "learning_rate": 1.295242027306559e-06, "loss": 0.3775, "step": 124930 }, { "epoch": 4.5026489350200025, "grad_norm": 0.17532098293304443, "learning_rate": 1.2943150893336614e-06, "loss": 0.343, "step": 124935 }, { "epoch": 4.502829134681226, "grad_norm": 0.27964937686920166, "learning_rate": 1.29338847434613e-06, "loss": 0.3547, "step": 124940 }, { "epoch": 4.503009334342451, "grad_norm": 0.2344919592142105, "learning_rate": 1.2924621823565964e-06, "loss": 0.3394, "step": 124945 }, { "epoch": 4.503189534003676, "grad_norm": 0.2620304226875305, "learning_rate": 1.2915362133776725e-06, "loss": 0.3778, "step": 124950 }, { "epoch": 4.503369733664901, "grad_norm": 0.26436105370521545, "learning_rate": 1.2906105674219737e-06, "loss": 0.3555, "step": 124955 }, { "epoch": 4.503549933326125, "grad_norm": 0.25804218649864197, "learning_rate": 1.2896852445021173e-06, "loss": 0.361, "step": 124960 }, { "epoch": 4.50373013298735, "grad_norm": 0.2341192215681076, "learning_rate": 1.2887602446307128e-06, "loss": 0.3672, "step": 124965 }, { "epoch": 4.503910332648575, "grad_norm": 0.26045674085617065, "learning_rate": 1.2878355678203558e-06, "loss": 0.3609, "step": 124970 }, { "epoch": 4.5040905323097995, 
"grad_norm": 0.2202635258436203, "learning_rate": 1.2869112140836525e-06, "loss": 0.38, "step": 124975 }, { "epoch": 4.504270731971024, "grad_norm": 0.2518618404865265, "learning_rate": 1.285987183433185e-06, "loss": 0.3294, "step": 124980 }, { "epoch": 4.504450931632249, "grad_norm": 0.22585217654705048, "learning_rate": 1.2850634758815567e-06, "loss": 0.3649, "step": 124985 }, { "epoch": 4.504631131293473, "grad_norm": 0.24542665481567383, "learning_rate": 1.2841400914413465e-06, "loss": 0.3599, "step": 124990 }, { "epoch": 4.5048113309546975, "grad_norm": 0.23829962313175201, "learning_rate": 1.2832170301251362e-06, "loss": 0.379, "step": 124995 }, { "epoch": 4.504991530615922, "grad_norm": 0.30551934242248535, "learning_rate": 1.2822942919455045e-06, "loss": 0.4066, "step": 125000 }, { "epoch": 4.504991530615922, "eval_loss": 0.42903777956962585, "eval_runtime": 3.5368, "eval_samples_per_second": 28.274, "eval_steps_per_second": 7.068, "step": 125000 }, { "epoch": 4.505171730277147, "grad_norm": 0.2084054797887802, "learning_rate": 1.281371876915019e-06, "loss": 0.351, "step": 125005 }, { "epoch": 4.505351929938372, "grad_norm": 0.20273062586784363, "learning_rate": 1.280449785046245e-06, "loss": 0.366, "step": 125010 }, { "epoch": 4.5055321295995965, "grad_norm": 0.2866862714290619, "learning_rate": 1.2795280163517581e-06, "loss": 0.413, "step": 125015 }, { "epoch": 4.505712329260821, "grad_norm": 0.2754046618938446, "learning_rate": 1.2786065708441042e-06, "loss": 0.3871, "step": 125020 }, { "epoch": 4.505892528922046, "grad_norm": 0.2317970097064972, "learning_rate": 1.2776854485358453e-06, "loss": 0.3907, "step": 125025 }, { "epoch": 4.50607272858327, "grad_norm": 0.19607658684253693, "learning_rate": 1.2767646494395296e-06, "loss": 0.3727, "step": 125030 }, { "epoch": 4.5062529282444945, "grad_norm": 0.18511025607585907, "learning_rate": 1.2758441735677028e-06, "loss": 0.3539, "step": 125035 }, { "epoch": 4.506433127905719, "grad_norm": 
0.27356868982315063, "learning_rate": 1.2749240209329049e-06, "loss": 0.3884, "step": 125040 }, { "epoch": 4.506613327566944, "grad_norm": 0.22257938981056213, "learning_rate": 1.274004191547673e-06, "loss": 0.3783, "step": 125045 }, { "epoch": 4.506793527228169, "grad_norm": 0.28302866220474243, "learning_rate": 1.2730846854245416e-06, "loss": 0.3706, "step": 125050 }, { "epoch": 4.5069737268893935, "grad_norm": 0.33327221870422363, "learning_rate": 1.2721655025760365e-06, "loss": 0.387, "step": 125055 }, { "epoch": 4.507153926550618, "grad_norm": 0.24522875249385834, "learning_rate": 1.2712466430146868e-06, "loss": 0.3623, "step": 125060 }, { "epoch": 4.507334126211843, "grad_norm": 0.2788931131362915, "learning_rate": 1.270328106753005e-06, "loss": 0.3606, "step": 125065 }, { "epoch": 4.507514325873068, "grad_norm": 0.23487606644630432, "learning_rate": 1.2694098938035082e-06, "loss": 0.3898, "step": 125070 }, { "epoch": 4.507694525534292, "grad_norm": 0.24076257646083832, "learning_rate": 1.2684920041787034e-06, "loss": 0.3831, "step": 125075 }, { "epoch": 4.507874725195517, "grad_norm": 0.22647157311439514, "learning_rate": 1.2675744378911058e-06, "loss": 0.3621, "step": 125080 }, { "epoch": 4.508054924856741, "grad_norm": 0.28119003772735596, "learning_rate": 1.2666571949532109e-06, "loss": 0.3778, "step": 125085 }, { "epoch": 4.508235124517966, "grad_norm": 0.20872727036476135, "learning_rate": 1.2657402753775165e-06, "loss": 0.3657, "step": 125090 }, { "epoch": 4.50841532417919, "grad_norm": 0.2806181311607361, "learning_rate": 1.2648236791765161e-06, "loss": 0.3691, "step": 125095 }, { "epoch": 4.508595523840415, "grad_norm": 0.2301296591758728, "learning_rate": 1.2639074063626939e-06, "loss": 0.376, "step": 125100 }, { "epoch": 4.50877572350164, "grad_norm": 0.31790241599082947, "learning_rate": 1.2629914569485396e-06, "loss": 0.3857, "step": 125105 }, { "epoch": 4.508955923162865, "grad_norm": 0.24608851969242096, "learning_rate": 1.262075830946538e-06, 
"loss": 0.3611, "step": 125110 }, { "epoch": 4.509136122824089, "grad_norm": 0.2477652132511139, "learning_rate": 1.2611605283691485e-06, "loss": 0.3455, "step": 125115 }, { "epoch": 4.509316322485314, "grad_norm": 0.2697392702102661, "learning_rate": 1.2602455492288556e-06, "loss": 0.3734, "step": 125120 }, { "epoch": 4.509496522146538, "grad_norm": 0.29126474261283875, "learning_rate": 1.259330893538116e-06, "loss": 0.3937, "step": 125125 }, { "epoch": 4.509676721807763, "grad_norm": 0.281927227973938, "learning_rate": 1.2584165613094029e-06, "loss": 0.3374, "step": 125130 }, { "epoch": 4.509856921468987, "grad_norm": 0.30632874369621277, "learning_rate": 1.2575025525551648e-06, "loss": 0.3651, "step": 125135 }, { "epoch": 4.510037121130212, "grad_norm": 0.2316470891237259, "learning_rate": 1.2565888672878556e-06, "loss": 0.3384, "step": 125140 }, { "epoch": 4.510217320791437, "grad_norm": 0.2622465193271637, "learning_rate": 1.2556755055199292e-06, "loss": 0.3726, "step": 125145 }, { "epoch": 4.510397520452662, "grad_norm": 0.25914567708969116, "learning_rate": 1.254762467263826e-06, "loss": 0.3963, "step": 125150 }, { "epoch": 4.510577720113886, "grad_norm": 0.26706215739250183, "learning_rate": 1.2538497525319882e-06, "loss": 0.3942, "step": 125155 }, { "epoch": 4.510757919775111, "grad_norm": 0.23364363610744476, "learning_rate": 1.2529373613368506e-06, "loss": 0.3678, "step": 125160 }, { "epoch": 4.510938119436336, "grad_norm": 0.28146228194236755, "learning_rate": 1.2520252936908394e-06, "loss": 0.3441, "step": 125165 }, { "epoch": 4.511118319097561, "grad_norm": 0.25890275835990906, "learning_rate": 1.251113549606389e-06, "loss": 0.3795, "step": 125170 }, { "epoch": 4.511298518758784, "grad_norm": 0.24945032596588135, "learning_rate": 1.2502021290959177e-06, "loss": 0.3772, "step": 125175 }, { "epoch": 4.511478718420009, "grad_norm": 0.2131379246711731, "learning_rate": 1.2492910321718453e-06, "loss": 0.3562, "step": 125180 }, { "epoch": 4.511658918081234, 
"grad_norm": 0.32095032930374146, "learning_rate": 1.2483802588465848e-06, "loss": 0.3529, "step": 125185 }, { "epoch": 4.511839117742459, "grad_norm": 0.2094382345676422, "learning_rate": 1.2474698091325454e-06, "loss": 0.3687, "step": 125190 }, { "epoch": 4.512019317403683, "grad_norm": 0.279956191778183, "learning_rate": 1.2465596830421255e-06, "loss": 0.426, "step": 125195 }, { "epoch": 4.512199517064908, "grad_norm": 0.2812948226928711, "learning_rate": 1.2456498805877404e-06, "loss": 0.339, "step": 125200 }, { "epoch": 4.512379716726133, "grad_norm": 0.29968753457069397, "learning_rate": 1.2447404017817687e-06, "loss": 0.3693, "step": 125205 }, { "epoch": 4.5125599163873575, "grad_norm": 0.2712326943874359, "learning_rate": 1.243831246636612e-06, "loss": 0.3906, "step": 125210 }, { "epoch": 4.512740116048581, "grad_norm": 0.24692299962043762, "learning_rate": 1.2429224151646574e-06, "loss": 0.3267, "step": 125215 }, { "epoch": 4.512920315709806, "grad_norm": 0.25159400701522827, "learning_rate": 1.242013907378281e-06, "loss": 0.3634, "step": 125220 }, { "epoch": 4.513100515371031, "grad_norm": 0.2895723581314087, "learning_rate": 1.2411057232898733e-06, "loss": 0.3826, "step": 125225 }, { "epoch": 4.513280715032256, "grad_norm": 0.23947159945964813, "learning_rate": 1.2401978629117934e-06, "loss": 0.3526, "step": 125230 }, { "epoch": 4.51346091469348, "grad_norm": 0.2706504166126251, "learning_rate": 1.2392903262564176e-06, "loss": 0.3864, "step": 125235 }, { "epoch": 4.513641114354705, "grad_norm": 0.2251925766468048, "learning_rate": 1.238383113336114e-06, "loss": 0.3673, "step": 125240 }, { "epoch": 4.51382131401593, "grad_norm": 0.2925741374492645, "learning_rate": 1.2374762241632393e-06, "loss": 0.3739, "step": 125245 }, { "epoch": 4.5140015136771545, "grad_norm": 0.24656470119953156, "learning_rate": 1.2365696587501502e-06, "loss": 0.3843, "step": 125250 }, { "epoch": 4.514181713338379, "grad_norm": 0.2367469221353531, "learning_rate": 
1.235663417109198e-06, "loss": 0.3662, "step": 125255 }, { "epoch": 4.514361912999604, "grad_norm": 0.23144228756427765, "learning_rate": 1.2347574992527283e-06, "loss": 0.3484, "step": 125260 }, { "epoch": 4.514542112660828, "grad_norm": 0.3216107189655304, "learning_rate": 1.2338519051930925e-06, "loss": 0.3521, "step": 125265 }, { "epoch": 4.514722312322053, "grad_norm": 0.3389444649219513, "learning_rate": 1.2329466349426194e-06, "loss": 0.4033, "step": 125270 }, { "epoch": 4.514902511983277, "grad_norm": 0.21677029132843018, "learning_rate": 1.2320416885136493e-06, "loss": 0.3743, "step": 125275 }, { "epoch": 4.515082711644502, "grad_norm": 0.2557178735733032, "learning_rate": 1.2311370659185084e-06, "loss": 0.3911, "step": 125280 }, { "epoch": 4.515262911305727, "grad_norm": 0.2284223735332489, "learning_rate": 1.2302327671695201e-06, "loss": 0.3583, "step": 125285 }, { "epoch": 4.5154431109669515, "grad_norm": 0.23775717616081238, "learning_rate": 1.2293287922790108e-06, "loss": 0.3642, "step": 125290 }, { "epoch": 4.515623310628176, "grad_norm": 0.24203971028327942, "learning_rate": 1.2284251412593011e-06, "loss": 0.3632, "step": 125295 }, { "epoch": 4.515803510289401, "grad_norm": 0.30471011996269226, "learning_rate": 1.2275218141226868e-06, "loss": 0.3797, "step": 125300 }, { "epoch": 4.515983709950625, "grad_norm": 0.2442302256822586, "learning_rate": 1.2266188108814886e-06, "loss": 0.3587, "step": 125305 }, { "epoch": 4.5161639096118495, "grad_norm": 0.2527487874031067, "learning_rate": 1.2257161315480048e-06, "loss": 0.3862, "step": 125310 }, { "epoch": 4.516344109273074, "grad_norm": 0.2215377241373062, "learning_rate": 1.2248137761345424e-06, "loss": 0.3901, "step": 125315 }, { "epoch": 4.516524308934299, "grad_norm": 0.23120597004890442, "learning_rate": 1.2239117446533833e-06, "loss": 0.4032, "step": 125320 }, { "epoch": 4.516704508595524, "grad_norm": 0.2772935926914215, "learning_rate": 1.2230100371168229e-06, "loss": 0.3507, "step": 125325 }, { 
"epoch": 4.5168847082567485, "grad_norm": 0.31849685311317444, "learning_rate": 1.222108653537149e-06, "loss": 0.3966, "step": 125330 }, { "epoch": 4.517064907917973, "grad_norm": 0.23283176124095917, "learning_rate": 1.2212075939266432e-06, "loss": 0.3609, "step": 125335 }, { "epoch": 4.517245107579198, "grad_norm": 0.268390953540802, "learning_rate": 1.220306858297579e-06, "loss": 0.3574, "step": 125340 }, { "epoch": 4.517425307240423, "grad_norm": 0.315220445394516, "learning_rate": 1.21940644666223e-06, "loss": 0.3818, "step": 125345 }, { "epoch": 4.517605506901647, "grad_norm": 0.243333101272583, "learning_rate": 1.2185063590328616e-06, "loss": 0.381, "step": 125350 }, { "epoch": 4.517785706562872, "grad_norm": 0.22509504854679108, "learning_rate": 1.2176065954217414e-06, "loss": 0.3696, "step": 125355 }, { "epoch": 4.517965906224096, "grad_norm": 0.27468258142471313, "learning_rate": 1.2167071558411263e-06, "loss": 0.3988, "step": 125360 }, { "epoch": 4.518146105885321, "grad_norm": 0.3038482367992401, "learning_rate": 1.2158080403032734e-06, "loss": 0.3707, "step": 125365 }, { "epoch": 4.5183263055465455, "grad_norm": 0.23414720594882965, "learning_rate": 1.2149092488204311e-06, "loss": 0.339, "step": 125370 }, { "epoch": 4.51850650520777, "grad_norm": 0.2685023546218872, "learning_rate": 1.214010781404845e-06, "loss": 0.3801, "step": 125375 }, { "epoch": 4.518686704868995, "grad_norm": 0.25739866495132446, "learning_rate": 1.2131126380687558e-06, "loss": 0.3869, "step": 125380 }, { "epoch": 4.51886690453022, "grad_norm": 0.2300902158021927, "learning_rate": 1.2122148188244004e-06, "loss": 0.3804, "step": 125385 }, { "epoch": 4.519047104191444, "grad_norm": 0.2580128610134125, "learning_rate": 1.2113173236840164e-06, "loss": 0.3291, "step": 125390 }, { "epoch": 4.519227303852669, "grad_norm": 0.27533698081970215, "learning_rate": 1.2104201526598274e-06, "loss": 0.3588, "step": 125395 }, { "epoch": 4.519407503513893, "grad_norm": 0.2444005161523819, 
"learning_rate": 1.209523305764057e-06, "loss": 0.364, "step": 125400 }, { "epoch": 4.519587703175118, "grad_norm": 0.25520777702331543, "learning_rate": 1.208626783008923e-06, "loss": 0.3582, "step": 125405 }, { "epoch": 4.519767902836342, "grad_norm": 0.2750135064125061, "learning_rate": 1.2077305844066494e-06, "loss": 0.3841, "step": 125410 }, { "epoch": 4.519948102497567, "grad_norm": 0.24095915257930756, "learning_rate": 1.206834709969437e-06, "loss": 0.3664, "step": 125415 }, { "epoch": 4.520128302158792, "grad_norm": 0.2322002798318863, "learning_rate": 1.2059391597094905e-06, "loss": 0.3576, "step": 125420 }, { "epoch": 4.520308501820017, "grad_norm": 0.2865905463695526, "learning_rate": 1.2050439336390217e-06, "loss": 0.3656, "step": 125425 }, { "epoch": 4.520488701481241, "grad_norm": 0.26659440994262695, "learning_rate": 1.2041490317702214e-06, "loss": 0.4096, "step": 125430 }, { "epoch": 4.520668901142466, "grad_norm": 0.29634758830070496, "learning_rate": 1.2032544541152851e-06, "loss": 0.3838, "step": 125435 }, { "epoch": 4.520849100803691, "grad_norm": 0.2613474428653717, "learning_rate": 1.2023602006863976e-06, "loss": 0.3772, "step": 125440 }, { "epoch": 4.521029300464916, "grad_norm": 0.2532717287540436, "learning_rate": 1.2014662714957409e-06, "loss": 0.3852, "step": 125445 }, { "epoch": 4.521209500126139, "grad_norm": 0.22666624188423157, "learning_rate": 1.2005726665555051e-06, "loss": 0.3694, "step": 125450 }, { "epoch": 4.521389699787364, "grad_norm": 0.22881825268268585, "learning_rate": 1.1996793858778554e-06, "loss": 0.2936, "step": 125455 }, { "epoch": 4.521569899448589, "grad_norm": 0.24387423694133759, "learning_rate": 1.1987864294749685e-06, "loss": 0.3588, "step": 125460 }, { "epoch": 4.521750099109814, "grad_norm": 0.2651515305042267, "learning_rate": 1.1978937973590092e-06, "loss": 0.3621, "step": 125465 }, { "epoch": 4.521930298771038, "grad_norm": 0.21903403103351593, "learning_rate": 1.197001489542135e-06, "loss": 0.3943, "step": 
125470 }, { "epoch": 4.522110498432263, "grad_norm": 0.2606528103351593, "learning_rate": 1.196109506036508e-06, "loss": 0.353, "step": 125475 }, { "epoch": 4.522290698093488, "grad_norm": 0.22265635430812836, "learning_rate": 1.1952178468542852e-06, "loss": 0.3417, "step": 125480 }, { "epoch": 4.522470897754713, "grad_norm": 0.24026912450790405, "learning_rate": 1.1943265120076042e-06, "loss": 0.3699, "step": 125485 }, { "epoch": 4.522651097415936, "grad_norm": 0.23182328045368195, "learning_rate": 1.193435501508619e-06, "loss": 0.3662, "step": 125490 }, { "epoch": 4.522831297077161, "grad_norm": 0.26761478185653687, "learning_rate": 1.1925448153694619e-06, "loss": 0.3798, "step": 125495 }, { "epoch": 4.523011496738386, "grad_norm": 0.23140046000480652, "learning_rate": 1.1916544536022783e-06, "loss": 0.3844, "step": 125500 }, { "epoch": 4.523011496738386, "eval_loss": 0.428926557302475, "eval_runtime": 3.5228, "eval_samples_per_second": 28.386, "eval_steps_per_second": 7.097, "step": 125500 }, { "epoch": 4.523191696399611, "grad_norm": 0.2863670885562897, "learning_rate": 1.1907644162191922e-06, "loss": 0.3947, "step": 125505 }, { "epoch": 4.523371896060835, "grad_norm": 0.1922617256641388, "learning_rate": 1.189874703232327e-06, "loss": 0.3523, "step": 125510 }, { "epoch": 4.52355209572206, "grad_norm": 0.26853513717651367, "learning_rate": 1.1889853146538148e-06, "loss": 0.3827, "step": 125515 }, { "epoch": 4.523732295383285, "grad_norm": 0.31399276852607727, "learning_rate": 1.1880962504957655e-06, "loss": 0.39, "step": 125520 }, { "epoch": 4.5239124950445095, "grad_norm": 0.2765243649482727, "learning_rate": 1.187207510770294e-06, "loss": 0.3471, "step": 125525 }, { "epoch": 4.524092694705734, "grad_norm": 0.2347140908241272, "learning_rate": 1.1863190954895104e-06, "loss": 0.3601, "step": 125530 }, { "epoch": 4.524272894366959, "grad_norm": 0.25988760590553284, "learning_rate": 1.1854310046655158e-06, "loss": 0.3578, "step": 125535 }, { "epoch": 
4.524453094028183, "grad_norm": 0.26233744621276855, "learning_rate": 1.184543238310415e-06, "loss": 0.3576, "step": 125540 }, { "epoch": 4.524633293689408, "grad_norm": 0.24501554667949677, "learning_rate": 1.1836557964363032e-06, "loss": 0.3731, "step": 125545 }, { "epoch": 4.524813493350632, "grad_norm": 0.20214052498340607, "learning_rate": 1.1827686790552711e-06, "loss": 0.3533, "step": 125550 }, { "epoch": 4.524993693011857, "grad_norm": 0.27101585268974304, "learning_rate": 1.1818818861794007e-06, "loss": 0.3721, "step": 125555 }, { "epoch": 4.525173892673082, "grad_norm": 0.22587630152702332, "learning_rate": 1.1809954178207821e-06, "loss": 0.3643, "step": 125560 }, { "epoch": 4.5253540923343065, "grad_norm": 0.293062686920166, "learning_rate": 1.1801092739914838e-06, "loss": 0.3687, "step": 125565 }, { "epoch": 4.525534291995531, "grad_norm": 0.3036471903324127, "learning_rate": 1.1792234547035903e-06, "loss": 0.3325, "step": 125570 }, { "epoch": 4.525714491656756, "grad_norm": 0.2614469826221466, "learning_rate": 1.178337959969164e-06, "loss": 0.3833, "step": 125575 }, { "epoch": 4.52589469131798, "grad_norm": 0.26248598098754883, "learning_rate": 1.1774527898002707e-06, "loss": 0.3678, "step": 125580 }, { "epoch": 4.526074890979205, "grad_norm": 0.22794273495674133, "learning_rate": 1.1765679442089728e-06, "loss": 0.3575, "step": 125585 }, { "epoch": 4.526255090640429, "grad_norm": 0.2476157546043396, "learning_rate": 1.1756834232073189e-06, "loss": 0.3442, "step": 125590 }, { "epoch": 4.526435290301654, "grad_norm": 0.236716166138649, "learning_rate": 1.1747992268073743e-06, "loss": 0.3679, "step": 125595 }, { "epoch": 4.526615489962879, "grad_norm": 0.295484334230423, "learning_rate": 1.173915355021174e-06, "loss": 0.3566, "step": 125600 }, { "epoch": 4.5267956896241035, "grad_norm": 0.237171933054924, "learning_rate": 1.1730318078607584e-06, "loss": 0.3829, "step": 125605 }, { "epoch": 4.526975889285328, "grad_norm": 0.2715757191181183, 
"learning_rate": 1.1721485853381787e-06, "loss": 0.3776, "step": 125610 }, { "epoch": 4.527156088946553, "grad_norm": 0.21538300812244415, "learning_rate": 1.1712656874654586e-06, "loss": 0.3446, "step": 125615 }, { "epoch": 4.527336288607778, "grad_norm": 0.21776896715164185, "learning_rate": 1.1703831142546306e-06, "loss": 0.3613, "step": 125620 }, { "epoch": 4.527516488269002, "grad_norm": 0.22091469168663025, "learning_rate": 1.1695008657177182e-06, "loss": 0.3677, "step": 125625 }, { "epoch": 4.527696687930227, "grad_norm": 0.24481423199176788, "learning_rate": 1.1686189418667393e-06, "loss": 0.3669, "step": 125630 }, { "epoch": 4.527876887591451, "grad_norm": 0.24965307116508484, "learning_rate": 1.1677373427137178e-06, "loss": 0.366, "step": 125635 }, { "epoch": 4.528057087252676, "grad_norm": 0.2324383556842804, "learning_rate": 1.1668560682706608e-06, "loss": 0.3827, "step": 125640 }, { "epoch": 4.5282372869139005, "grad_norm": 0.2629086971282959, "learning_rate": 1.1659751185495754e-06, "loss": 0.3757, "step": 125645 }, { "epoch": 4.528417486575125, "grad_norm": 0.2776727080345154, "learning_rate": 1.1650944935624658e-06, "loss": 0.3613, "step": 125650 }, { "epoch": 4.52859768623635, "grad_norm": 0.28128179907798767, "learning_rate": 1.1642141933213253e-06, "loss": 0.3698, "step": 125655 }, { "epoch": 4.528777885897575, "grad_norm": 0.2046898454427719, "learning_rate": 1.163334217838155e-06, "loss": 0.3678, "step": 125660 }, { "epoch": 4.528958085558799, "grad_norm": 0.21193328499794006, "learning_rate": 1.1624545671249405e-06, "loss": 0.3739, "step": 125665 }, { "epoch": 4.529138285220024, "grad_norm": 0.30449825525283813, "learning_rate": 1.161575241193666e-06, "loss": 0.3799, "step": 125670 }, { "epoch": 4.529318484881248, "grad_norm": 0.28137627243995667, "learning_rate": 1.1606962400563166e-06, "loss": 0.3485, "step": 125675 }, { "epoch": 4.529498684542473, "grad_norm": 0.26191192865371704, "learning_rate": 1.1598175637248605e-06, "loss": 0.3948, 
"step": 125680 }, { "epoch": 4.5296788842036975, "grad_norm": 0.21634413301944733, "learning_rate": 1.1589392122112769e-06, "loss": 0.3728, "step": 125685 }, { "epoch": 4.529859083864922, "grad_norm": 0.2029135525226593, "learning_rate": 1.1580611855275341e-06, "loss": 0.3492, "step": 125690 }, { "epoch": 4.530039283526147, "grad_norm": 0.2269791066646576, "learning_rate": 1.1571834836855866e-06, "loss": 0.3489, "step": 125695 }, { "epoch": 4.530219483187372, "grad_norm": 0.2346084862947464, "learning_rate": 1.1563061066974024e-06, "loss": 0.3775, "step": 125700 }, { "epoch": 4.530399682848596, "grad_norm": 0.21212127804756165, "learning_rate": 1.1554290545749273e-06, "loss": 0.3526, "step": 125705 }, { "epoch": 4.530579882509821, "grad_norm": 0.22559018433094025, "learning_rate": 1.154552327330119e-06, "loss": 0.3672, "step": 125710 }, { "epoch": 4.530760082171046, "grad_norm": 0.24675384163856506, "learning_rate": 1.1536759249749173e-06, "loss": 0.3749, "step": 125715 }, { "epoch": 4.530940281832271, "grad_norm": 0.20101101696491241, "learning_rate": 1.1527998475212604e-06, "loss": 0.3611, "step": 125720 }, { "epoch": 4.5311204814934944, "grad_norm": 0.24855537712574005, "learning_rate": 1.151924094981091e-06, "loss": 0.4015, "step": 125725 }, { "epoch": 4.531300681154719, "grad_norm": 0.2826426029205322, "learning_rate": 1.151048667366339e-06, "loss": 0.3815, "step": 125730 }, { "epoch": 4.531480880815944, "grad_norm": 0.2761218547821045, "learning_rate": 1.1501735646889332e-06, "loss": 0.3988, "step": 125735 }, { "epoch": 4.531661080477169, "grad_norm": 0.2532573938369751, "learning_rate": 1.1492987869607952e-06, "loss": 0.3876, "step": 125740 }, { "epoch": 4.531841280138393, "grad_norm": 0.2559954524040222, "learning_rate": 1.1484243341938427e-06, "loss": 0.367, "step": 125745 }, { "epoch": 4.532021479799618, "grad_norm": 0.22612114250659943, "learning_rate": 1.147550206399986e-06, "loss": 0.3544, "step": 125750 }, { "epoch": 4.532201679460843, "grad_norm": 
0.24155735969543457, "learning_rate": 1.146676403591146e-06, "loss": 0.4076, "step": 125755 }, { "epoch": 4.532381879122068, "grad_norm": 0.3148764967918396, "learning_rate": 1.1458029257792185e-06, "loss": 0.3787, "step": 125760 }, { "epoch": 4.532562078783291, "grad_norm": 0.2415306717157364, "learning_rate": 1.144929772976111e-06, "loss": 0.3981, "step": 125765 }, { "epoch": 4.532742278444516, "grad_norm": 0.2210559993982315, "learning_rate": 1.1440569451937166e-06, "loss": 0.3602, "step": 125770 }, { "epoch": 4.532922478105741, "grad_norm": 0.22382661700248718, "learning_rate": 1.143184442443923e-06, "loss": 0.337, "step": 125775 }, { "epoch": 4.533102677766966, "grad_norm": 0.24328698217868805, "learning_rate": 1.1423122647386291e-06, "loss": 0.3835, "step": 125780 }, { "epoch": 4.53328287742819, "grad_norm": 0.2541979253292084, "learning_rate": 1.1414404120897087e-06, "loss": 0.38, "step": 125785 }, { "epoch": 4.533463077089415, "grad_norm": 0.24431033432483673, "learning_rate": 1.1405688845090383e-06, "loss": 0.3388, "step": 125790 }, { "epoch": 4.53364327675064, "grad_norm": 0.24632686376571655, "learning_rate": 1.139697682008503e-06, "loss": 0.351, "step": 125795 }, { "epoch": 4.533823476411865, "grad_norm": 0.24334456026554108, "learning_rate": 1.1388268045999655e-06, "loss": 0.342, "step": 125800 }, { "epoch": 4.534003676073089, "grad_norm": 0.22440917789936066, "learning_rate": 1.137956252295297e-06, "loss": 0.3438, "step": 125805 }, { "epoch": 4.534183875734314, "grad_norm": 0.25877583026885986, "learning_rate": 1.1370860251063515e-06, "loss": 0.3857, "step": 125810 }, { "epoch": 4.534364075395538, "grad_norm": 0.2521132826805115, "learning_rate": 1.1362161230449863e-06, "loss": 0.3869, "step": 125815 }, { "epoch": 4.534544275056763, "grad_norm": 0.23100094497203827, "learning_rate": 1.1353465461230616e-06, "loss": 0.3642, "step": 125820 }, { "epoch": 4.534724474717987, "grad_norm": 0.2277524769306183, "learning_rate": 1.134477294352418e-06, "loss": 
0.3368, "step": 125825 }, { "epoch": 4.534904674379212, "grad_norm": 0.2838972210884094, "learning_rate": 1.1336083677448984e-06, "loss": 0.4055, "step": 125830 }, { "epoch": 4.535084874040437, "grad_norm": 0.26135292649269104, "learning_rate": 1.132739766312349e-06, "loss": 0.3795, "step": 125835 }, { "epoch": 4.5352650737016615, "grad_norm": 0.24618162214756012, "learning_rate": 1.131871490066591e-06, "loss": 0.3496, "step": 125840 }, { "epoch": 4.535445273362886, "grad_norm": 0.28488776087760925, "learning_rate": 1.1310035390194706e-06, "loss": 0.3829, "step": 125845 }, { "epoch": 4.535625473024111, "grad_norm": 0.28462615609169006, "learning_rate": 1.1301359131828032e-06, "loss": 0.3896, "step": 125850 }, { "epoch": 4.535805672685335, "grad_norm": 0.295928955078125, "learning_rate": 1.1292686125684127e-06, "loss": 0.4013, "step": 125855 }, { "epoch": 4.53598587234656, "grad_norm": 0.27669599652290344, "learning_rate": 1.1284016371881173e-06, "loss": 0.3411, "step": 125860 }, { "epoch": 4.536166072007784, "grad_norm": 0.31112879514694214, "learning_rate": 1.1275349870537243e-06, "loss": 0.3581, "step": 125865 }, { "epoch": 4.536346271669009, "grad_norm": 0.30249273777008057, "learning_rate": 1.1266686621770466e-06, "loss": 0.3908, "step": 125870 }, { "epoch": 4.536526471330234, "grad_norm": 0.260932058095932, "learning_rate": 1.1258026625698914e-06, "loss": 0.3768, "step": 125875 }, { "epoch": 4.5367066709914585, "grad_norm": 0.2824000418186188, "learning_rate": 1.1249369882440464e-06, "loss": 0.3298, "step": 125880 }, { "epoch": 4.536886870652683, "grad_norm": 0.2413741648197174, "learning_rate": 1.124071639211316e-06, "loss": 0.3432, "step": 125885 }, { "epoch": 4.537067070313908, "grad_norm": 0.2007882446050644, "learning_rate": 1.1232066154834852e-06, "loss": 0.3411, "step": 125890 }, { "epoch": 4.537247269975133, "grad_norm": 0.2404080480337143, "learning_rate": 1.122341917072342e-06, "loss": 0.3577, "step": 125895 }, { "epoch": 4.5374274696363575, 
"grad_norm": 0.2568254768848419, "learning_rate": 1.1214775439896686e-06, "loss": 0.3915, "step": 125900 }, { "epoch": 4.537607669297582, "grad_norm": 0.26964160799980164, "learning_rate": 1.1206134962472387e-06, "loss": 0.3875, "step": 125905 }, { "epoch": 4.537787868958806, "grad_norm": 0.26435330510139465, "learning_rate": 1.1197497738568264e-06, "loss": 0.3631, "step": 125910 }, { "epoch": 4.537968068620031, "grad_norm": 0.27454784512519836, "learning_rate": 1.1188863768302027e-06, "loss": 0.3484, "step": 125915 }, { "epoch": 4.5381482682812555, "grad_norm": 0.2890207767486572, "learning_rate": 1.1180233051791279e-06, "loss": 0.3799, "step": 125920 }, { "epoch": 4.53832846794248, "grad_norm": 0.27648934721946716, "learning_rate": 1.1171605589153616e-06, "loss": 0.3902, "step": 125925 }, { "epoch": 4.538508667603705, "grad_norm": 0.2612933814525604, "learning_rate": 1.1162981380506587e-06, "loss": 0.3866, "step": 125930 }, { "epoch": 4.53868886726493, "grad_norm": 0.23902583122253418, "learning_rate": 1.1154360425967652e-06, "loss": 0.3764, "step": 125935 }, { "epoch": 4.538869066926154, "grad_norm": 0.2665193974971771, "learning_rate": 1.1145742725654357e-06, "loss": 0.3214, "step": 125940 }, { "epoch": 4.539049266587379, "grad_norm": 0.3075253665447235, "learning_rate": 1.1137128279684078e-06, "loss": 0.3819, "step": 125945 }, { "epoch": 4.539229466248603, "grad_norm": 0.25425055623054504, "learning_rate": 1.1128517088174195e-06, "loss": 0.3686, "step": 125950 }, { "epoch": 4.539409665909828, "grad_norm": 0.2914336025714874, "learning_rate": 1.1119909151242003e-06, "loss": 0.4104, "step": 125955 }, { "epoch": 4.5395898655710525, "grad_norm": 0.23320336639881134, "learning_rate": 1.1111304469004769e-06, "loss": 0.3615, "step": 125960 }, { "epoch": 4.539770065232277, "grad_norm": 0.2035553902387619, "learning_rate": 1.1102703041579787e-06, "loss": 0.3544, "step": 125965 }, { "epoch": 4.539950264893502, "grad_norm": 0.27693793177604675, "learning_rate": 
1.1094104869084242e-06, "loss": 0.3813, "step": 125970 }, { "epoch": 4.540130464554727, "grad_norm": 0.2863492965698242, "learning_rate": 1.1085509951635236e-06, "loss": 0.3601, "step": 125975 }, { "epoch": 4.540310664215951, "grad_norm": 0.22666563093662262, "learning_rate": 1.1076918289349924e-06, "loss": 0.3503, "step": 125980 }, { "epoch": 4.540490863877176, "grad_norm": 0.25621458888053894, "learning_rate": 1.1068329882345296e-06, "loss": 0.3974, "step": 125985 }, { "epoch": 4.540671063538401, "grad_norm": 0.23761415481567383, "learning_rate": 1.105974473073848e-06, "loss": 0.3689, "step": 125990 }, { "epoch": 4.540851263199626, "grad_norm": 0.27705660462379456, "learning_rate": 1.1051162834646356e-06, "loss": 0.4032, "step": 125995 }, { "epoch": 4.5410314628608495, "grad_norm": 0.25998204946517944, "learning_rate": 1.1042584194185857e-06, "loss": 0.3646, "step": 126000 }, { "epoch": 4.5410314628608495, "eval_loss": 0.42872315645217896, "eval_runtime": 3.5299, "eval_samples_per_second": 28.33, "eval_steps_per_second": 7.082, "step": 126000 }, { "epoch": 4.541211662522074, "grad_norm": 0.2178216576576233, "learning_rate": 1.1034008809473916e-06, "loss": 0.365, "step": 126005 }, { "epoch": 4.541391862183299, "grad_norm": 0.2195737063884735, "learning_rate": 1.1025436680627332e-06, "loss": 0.3845, "step": 126010 }, { "epoch": 4.541572061844524, "grad_norm": 0.24314382672309875, "learning_rate": 1.10168678077629e-06, "loss": 0.3472, "step": 126015 }, { "epoch": 4.541752261505748, "grad_norm": 0.2001175880432129, "learning_rate": 1.1008302190997383e-06, "loss": 0.3606, "step": 126020 }, { "epoch": 4.541932461166973, "grad_norm": 0.24921679496765137, "learning_rate": 1.0999739830447441e-06, "loss": 0.3683, "step": 126025 }, { "epoch": 4.542112660828198, "grad_norm": 0.2515079379081726, "learning_rate": 1.0991180726229789e-06, "loss": 0.3579, "step": 126030 }, { "epoch": 4.542292860489423, "grad_norm": 0.2457331418991089, "learning_rate": 1.0982624878461051e-06, 
"loss": 0.3869, "step": 126035 }, { "epoch": 4.5424730601506464, "grad_norm": 0.22210632264614105, "learning_rate": 1.0974072287257775e-06, "loss": 0.3974, "step": 126040 }, { "epoch": 4.542653259811871, "grad_norm": 0.256584107875824, "learning_rate": 1.0965522952736478e-06, "loss": 0.3527, "step": 126045 }, { "epoch": 4.542833459473096, "grad_norm": 0.25087007880210876, "learning_rate": 1.0956976875013598e-06, "loss": 0.3566, "step": 126050 }, { "epoch": 4.543013659134321, "grad_norm": 0.2532568871974945, "learning_rate": 1.0948434054205704e-06, "loss": 0.3707, "step": 126055 }, { "epoch": 4.543193858795545, "grad_norm": 0.22819988429546356, "learning_rate": 1.093989449042912e-06, "loss": 0.3395, "step": 126060 }, { "epoch": 4.54337405845677, "grad_norm": 0.26745355129241943, "learning_rate": 1.0931358183800117e-06, "loss": 0.3549, "step": 126065 }, { "epoch": 4.543554258117995, "grad_norm": 0.23124882578849792, "learning_rate": 1.0922825134435127e-06, "loss": 0.3598, "step": 126070 }, { "epoch": 4.54373445777922, "grad_norm": 0.32220420241355896, "learning_rate": 1.0914295342450337e-06, "loss": 0.361, "step": 126075 }, { "epoch": 4.543914657440444, "grad_norm": 0.24584202468395233, "learning_rate": 1.0905768807961987e-06, "loss": 0.3809, "step": 126080 }, { "epoch": 4.544094857101669, "grad_norm": 0.24640391767024994, "learning_rate": 1.0897245531086287e-06, "loss": 0.369, "step": 126085 }, { "epoch": 4.544275056762894, "grad_norm": 0.2793191373348236, "learning_rate": 1.0888725511939258e-06, "loss": 0.3747, "step": 126090 }, { "epoch": 4.544455256424118, "grad_norm": 0.2635978162288666, "learning_rate": 1.0880208750637083e-06, "loss": 0.3755, "step": 126095 }, { "epoch": 4.544635456085342, "grad_norm": 0.2227722406387329, "learning_rate": 1.0871695247295782e-06, "loss": 0.3653, "step": 126100 }, { "epoch": 4.544815655746567, "grad_norm": 0.23662735521793365, "learning_rate": 1.0863185002031317e-06, "loss": 0.3871, "step": 126105 }, { "epoch": 4.544995855407792, 
"grad_norm": 0.25815528631210327, "learning_rate": 1.0854678014959679e-06, "loss": 0.3697, "step": 126110 }, { "epoch": 4.545176055069017, "grad_norm": 0.2185879647731781, "learning_rate": 1.084617428619672e-06, "loss": 0.3678, "step": 126115 }, { "epoch": 4.545356254730241, "grad_norm": 0.2667752504348755, "learning_rate": 1.0837673815858345e-06, "loss": 0.3512, "step": 126120 }, { "epoch": 4.545536454391466, "grad_norm": 0.20545603334903717, "learning_rate": 1.0829176604060353e-06, "loss": 0.373, "step": 126125 }, { "epoch": 4.545716654052691, "grad_norm": 0.2735787332057953, "learning_rate": 1.0820682650918569e-06, "loss": 0.4048, "step": 126130 }, { "epoch": 4.545896853713915, "grad_norm": 0.27663978934288025, "learning_rate": 1.0812191956548644e-06, "loss": 0.4058, "step": 126135 }, { "epoch": 4.546077053375139, "grad_norm": 0.33949437737464905, "learning_rate": 1.0803704521066298e-06, "loss": 0.367, "step": 126140 }, { "epoch": 4.546257253036364, "grad_norm": 0.28756633400917053, "learning_rate": 1.0795220344587131e-06, "loss": 0.3924, "step": 126145 }, { "epoch": 4.546437452697589, "grad_norm": 0.2777945101261139, "learning_rate": 1.0786739427226827e-06, "loss": 0.3827, "step": 126150 }, { "epoch": 4.5466176523588135, "grad_norm": 0.2093537449836731, "learning_rate": 1.0778261769100905e-06, "loss": 0.3825, "step": 126155 }, { "epoch": 4.546797852020038, "grad_norm": 0.2346353679895401, "learning_rate": 1.0769787370324802e-06, "loss": 0.3685, "step": 126160 }, { "epoch": 4.546978051681263, "grad_norm": 0.23039868474006653, "learning_rate": 1.0761316231014061e-06, "loss": 0.3617, "step": 126165 }, { "epoch": 4.547158251342488, "grad_norm": 0.23052442073822021, "learning_rate": 1.0752848351284011e-06, "loss": 0.3888, "step": 126170 }, { "epoch": 4.5473384510037125, "grad_norm": 0.26234328746795654, "learning_rate": 1.0744383731250168e-06, "loss": 0.3682, "step": 126175 }, { "epoch": 4.547518650664937, "grad_norm": 0.25200456380844116, "learning_rate": 
1.0735922371027746e-06, "loss": 0.3748, "step": 126180 }, { "epoch": 4.547698850326161, "grad_norm": 0.31448420882225037, "learning_rate": 1.0727464270732013e-06, "loss": 0.3791, "step": 126185 }, { "epoch": 4.547879049987386, "grad_norm": 0.22912736237049103, "learning_rate": 1.0719009430478294e-06, "loss": 0.3655, "step": 126190 }, { "epoch": 4.5480592496486105, "grad_norm": 0.23380719125270844, "learning_rate": 1.0710557850381747e-06, "loss": 0.3199, "step": 126195 }, { "epoch": 4.548239449309835, "grad_norm": 0.23022237420082092, "learning_rate": 1.0702109530557502e-06, "loss": 0.3819, "step": 126200 }, { "epoch": 4.54841964897106, "grad_norm": 0.2493658810853958, "learning_rate": 1.069366447112069e-06, "loss": 0.4007, "step": 126205 }, { "epoch": 4.548599848632285, "grad_norm": 0.21301326155662537, "learning_rate": 1.0685222672186357e-06, "loss": 0.3805, "step": 126210 }, { "epoch": 4.5487800482935095, "grad_norm": 0.299532949924469, "learning_rate": 1.067678413386955e-06, "loss": 0.3399, "step": 126215 }, { "epoch": 4.548960247954734, "grad_norm": 0.21883751451969147, "learning_rate": 1.0668348856285231e-06, "loss": 0.3422, "step": 126220 }, { "epoch": 4.549140447615958, "grad_norm": 0.2648196518421173, "learning_rate": 1.0659916839548313e-06, "loss": 0.3592, "step": 126225 }, { "epoch": 4.549320647277183, "grad_norm": 0.20908887684345245, "learning_rate": 1.0651488083773697e-06, "loss": 0.3723, "step": 126230 }, { "epoch": 4.5495008469384075, "grad_norm": 0.25661706924438477, "learning_rate": 1.0643062589076186e-06, "loss": 0.3885, "step": 126235 }, { "epoch": 4.549681046599632, "grad_norm": 0.2967999279499054, "learning_rate": 1.0634640355570658e-06, "loss": 0.3833, "step": 126240 }, { "epoch": 4.549861246260857, "grad_norm": 0.3139897286891937, "learning_rate": 1.062622138337177e-06, "loss": 0.3911, "step": 126245 }, { "epoch": 4.550041445922082, "grad_norm": 0.2221304029226303, "learning_rate": 1.0617805672594295e-06, "loss": 0.369, "step": 126250 }, { 
"epoch": 4.550221645583306, "grad_norm": 0.2807883322238922, "learning_rate": 1.0609393223352887e-06, "loss": 0.372, "step": 126255 }, { "epoch": 4.550401845244531, "grad_norm": 0.2690640389919281, "learning_rate": 1.0600984035762124e-06, "loss": 0.3985, "step": 126260 }, { "epoch": 4.550582044905756, "grad_norm": 0.2200150489807129, "learning_rate": 1.059257810993658e-06, "loss": 0.3896, "step": 126265 }, { "epoch": 4.550762244566981, "grad_norm": 0.21496446430683136, "learning_rate": 1.0584175445990884e-06, "loss": 0.3742, "step": 126270 }, { "epoch": 4.5509424442282045, "grad_norm": 0.27852797508239746, "learning_rate": 1.0575776044039366e-06, "loss": 0.3906, "step": 126275 }, { "epoch": 4.551122643889429, "grad_norm": 0.27898815274238586, "learning_rate": 1.0567379904196567e-06, "loss": 0.3217, "step": 126280 }, { "epoch": 4.551302843550654, "grad_norm": 0.25052666664123535, "learning_rate": 1.0558987026576873e-06, "loss": 0.3821, "step": 126285 }, { "epoch": 4.551483043211879, "grad_norm": 0.306427538394928, "learning_rate": 1.0550597411294633e-06, "loss": 0.3712, "step": 126290 }, { "epoch": 4.551663242873103, "grad_norm": 0.30519047379493713, "learning_rate": 1.0542211058464146e-06, "loss": 0.416, "step": 126295 }, { "epoch": 4.551843442534328, "grad_norm": 0.2536962032318115, "learning_rate": 1.0533827968199656e-06, "loss": 0.3969, "step": 126300 }, { "epoch": 4.552023642195553, "grad_norm": 0.27985987067222595, "learning_rate": 1.0525448140615374e-06, "loss": 0.3781, "step": 126305 }, { "epoch": 4.552203841856778, "grad_norm": 0.2177259922027588, "learning_rate": 1.0517071575825544e-06, "loss": 0.3797, "step": 126310 }, { "epoch": 4.5523840415180015, "grad_norm": 0.25428178906440735, "learning_rate": 1.0508698273944212e-06, "loss": 0.3856, "step": 126315 }, { "epoch": 4.552564241179226, "grad_norm": 0.235152468085289, "learning_rate": 1.0500328235085538e-06, "loss": 0.36, "step": 126320 }, { "epoch": 4.552744440840451, "grad_norm": 0.22013433277606964, 
"learning_rate": 1.0491961459363515e-06, "loss": 0.3802, "step": 126325 }, { "epoch": 4.552924640501676, "grad_norm": 0.277230829000473, "learning_rate": 1.0483597946892104e-06, "loss": 0.3904, "step": 126330 }, { "epoch": 4.5531048401629, "grad_norm": 0.25709518790245056, "learning_rate": 1.0475237697785328e-06, "loss": 0.3708, "step": 126335 }, { "epoch": 4.553285039824125, "grad_norm": 0.2280193567276001, "learning_rate": 1.0466880712157095e-06, "loss": 0.3783, "step": 126340 }, { "epoch": 4.55346523948535, "grad_norm": 0.2616412937641144, "learning_rate": 1.0458526990121175e-06, "loss": 0.4052, "step": 126345 }, { "epoch": 4.553645439146575, "grad_norm": 0.33539026975631714, "learning_rate": 1.0450176531791478e-06, "loss": 0.3937, "step": 126350 }, { "epoch": 4.553825638807799, "grad_norm": 0.21199902892112732, "learning_rate": 1.0441829337281744e-06, "loss": 0.3597, "step": 126355 }, { "epoch": 4.554005838469024, "grad_norm": 0.24767298996448517, "learning_rate": 1.0433485406705718e-06, "loss": 0.3845, "step": 126360 }, { "epoch": 4.554186038130249, "grad_norm": 0.2962040305137634, "learning_rate": 1.0425144740177085e-06, "loss": 0.3648, "step": 126365 }, { "epoch": 4.554366237791473, "grad_norm": 0.2016834169626236, "learning_rate": 1.041680733780942e-06, "loss": 0.3578, "step": 126370 }, { "epoch": 4.554546437452697, "grad_norm": 0.21861585974693298, "learning_rate": 1.040847319971641e-06, "loss": 0.3705, "step": 126375 }, { "epoch": 4.554726637113922, "grad_norm": 0.3241893947124481, "learning_rate": 1.040014232601158e-06, "loss": 0.3971, "step": 126380 }, { "epoch": 4.554906836775147, "grad_norm": 0.24871119856834412, "learning_rate": 1.0391814716808391e-06, "loss": 0.3911, "step": 126385 }, { "epoch": 4.555087036436372, "grad_norm": 0.20087426900863647, "learning_rate": 1.0383490372220361e-06, "loss": 0.3938, "step": 126390 }, { "epoch": 4.555267236097596, "grad_norm": 0.2455914169549942, "learning_rate": 1.0375169292360847e-06, "loss": 0.3256, "step": 
126395 }, { "epoch": 4.555447435758821, "grad_norm": 0.2423105537891388, "learning_rate": 1.0366851477343286e-06, "loss": 0.3616, "step": 126400 }, { "epoch": 4.555627635420046, "grad_norm": 0.2773745059967041, "learning_rate": 1.0358536927280977e-06, "loss": 0.3839, "step": 126405 }, { "epoch": 4.55580783508127, "grad_norm": 0.2075750231742859, "learning_rate": 1.0350225642287215e-06, "loss": 0.3333, "step": 126410 }, { "epoch": 4.555988034742494, "grad_norm": 0.28897416591644287, "learning_rate": 1.0341917622475216e-06, "loss": 0.3451, "step": 126415 }, { "epoch": 4.556168234403719, "grad_norm": 0.2481168508529663, "learning_rate": 1.0333612867958197e-06, "loss": 0.3703, "step": 126420 }, { "epoch": 4.556348434064944, "grad_norm": 0.23715002834796906, "learning_rate": 1.032531137884926e-06, "loss": 0.3816, "step": 126425 }, { "epoch": 4.556528633726169, "grad_norm": 0.24655555188655853, "learning_rate": 1.0317013155261595e-06, "loss": 0.3749, "step": 126430 }, { "epoch": 4.556708833387393, "grad_norm": 0.20687216520309448, "learning_rate": 1.030871819730822e-06, "loss": 0.3603, "step": 126435 }, { "epoch": 4.556889033048618, "grad_norm": 0.2607882022857666, "learning_rate": 1.0300426505102156e-06, "loss": 0.3703, "step": 126440 }, { "epoch": 4.557069232709843, "grad_norm": 0.2784349322319031, "learning_rate": 1.0292138078756396e-06, "loss": 0.3719, "step": 126445 }, { "epoch": 4.5572494323710675, "grad_norm": 0.261934757232666, "learning_rate": 1.0283852918383768e-06, "loss": 0.3879, "step": 126450 }, { "epoch": 4.557429632032292, "grad_norm": 0.27113890647888184, "learning_rate": 1.0275571024097348e-06, "loss": 0.3513, "step": 126455 }, { "epoch": 4.557609831693516, "grad_norm": 0.2984657287597656, "learning_rate": 1.0267292396009764e-06, "loss": 0.3412, "step": 126460 }, { "epoch": 4.557790031354741, "grad_norm": 0.2706381380558014, "learning_rate": 1.0259017034233932e-06, "loss": 0.3714, "step": 126465 }, { "epoch": 4.5579702310159655, "grad_norm": 
0.27217623591423035, "learning_rate": 1.025074493888259e-06, "loss": 0.3975, "step": 126470 }, { "epoch": 4.55815043067719, "grad_norm": 0.22170761227607727, "learning_rate": 1.0242476110068426e-06, "loss": 0.384, "step": 126475 }, { "epoch": 4.558330630338415, "grad_norm": 0.2368282973766327, "learning_rate": 1.0234210547904132e-06, "loss": 0.3971, "step": 126480 }, { "epoch": 4.55851082999964, "grad_norm": 0.22124750912189484, "learning_rate": 1.0225948252502283e-06, "loss": 0.3928, "step": 126485 }, { "epoch": 4.5586910296608645, "grad_norm": 0.27596476674079895, "learning_rate": 1.0217689223975425e-06, "loss": 0.3498, "step": 126490 }, { "epoch": 4.558871229322089, "grad_norm": 0.2869780659675598, "learning_rate": 1.0209433462436164e-06, "loss": 0.3905, "step": 126495 }, { "epoch": 4.559051428983313, "grad_norm": 0.2928743362426758, "learning_rate": 1.0201180967996937e-06, "loss": 0.3722, "step": 126500 }, { "epoch": 4.559051428983313, "eval_loss": 0.42879581451416016, "eval_runtime": 3.5284, "eval_samples_per_second": 28.342, "eval_steps_per_second": 7.085, "step": 126500 }, { "epoch": 4.559231628644538, "grad_norm": 0.2935590147972107, "learning_rate": 1.0192931740770213e-06, "loss": 0.388, "step": 126505 }, { "epoch": 4.5594118283057625, "grad_norm": 0.22944243252277374, "learning_rate": 1.018468578086837e-06, "loss": 0.3727, "step": 126510 }, { "epoch": 4.559592027966987, "grad_norm": 0.25734102725982666, "learning_rate": 1.0176443088403708e-06, "loss": 0.3495, "step": 126515 }, { "epoch": 4.559772227628212, "grad_norm": 0.2919704020023346, "learning_rate": 1.0168203663488612e-06, "loss": 0.3703, "step": 126520 }, { "epoch": 4.559952427289437, "grad_norm": 0.29848772287368774, "learning_rate": 1.0159967506235297e-06, "loss": 0.3914, "step": 126525 }, { "epoch": 4.5601326269506615, "grad_norm": 0.28151485323905945, "learning_rate": 1.0151734616756003e-06, "loss": 0.356, "step": 126530 }, { "epoch": 4.560312826611886, "grad_norm": 0.387460857629776, 
"learning_rate": 1.0143504995162895e-06, "loss": 0.3724, "step": 126535 }, { "epoch": 4.560493026273111, "grad_norm": 0.221677765250206, "learning_rate": 1.0135278641568046e-06, "loss": 0.3535, "step": 126540 }, { "epoch": 4.560673225934336, "grad_norm": 0.3218268156051636, "learning_rate": 1.012705555608362e-06, "loss": 0.3796, "step": 126545 }, { "epoch": 4.5608534255955595, "grad_norm": 0.2881567180156708, "learning_rate": 1.0118835738821664e-06, "loss": 0.3839, "step": 126550 }, { "epoch": 4.561033625256784, "grad_norm": 0.25800442695617676, "learning_rate": 1.0110619189894061e-06, "loss": 0.3872, "step": 126555 }, { "epoch": 4.561213824918009, "grad_norm": 0.2594134509563446, "learning_rate": 1.010240590941286e-06, "loss": 0.3691, "step": 126560 }, { "epoch": 4.561394024579234, "grad_norm": 0.22755463421344757, "learning_rate": 1.0094195897489944e-06, "loss": 0.3766, "step": 126565 }, { "epoch": 4.561574224240458, "grad_norm": 0.21717192232608795, "learning_rate": 1.008598915423714e-06, "loss": 0.3823, "step": 126570 }, { "epoch": 4.561754423901683, "grad_norm": 0.21237117052078247, "learning_rate": 1.0077785679766304e-06, "loss": 0.3484, "step": 126575 }, { "epoch": 4.561934623562908, "grad_norm": 0.24171051383018494, "learning_rate": 1.0069585474189126e-06, "loss": 0.4059, "step": 126580 }, { "epoch": 4.562114823224133, "grad_norm": 0.24933667480945587, "learning_rate": 1.0061388537617456e-06, "loss": 0.3449, "step": 126585 }, { "epoch": 4.5622950228853565, "grad_norm": 0.27278992533683777, "learning_rate": 1.0053194870162901e-06, "loss": 0.3666, "step": 126590 }, { "epoch": 4.562475222546581, "grad_norm": 0.21958884596824646, "learning_rate": 1.0045004471937125e-06, "loss": 0.3687, "step": 126595 }, { "epoch": 4.562655422207806, "grad_norm": 0.2822725474834442, "learning_rate": 1.0036817343051674e-06, "loss": 0.3441, "step": 126600 }, { "epoch": 4.562835621869031, "grad_norm": 0.2642861604690552, "learning_rate": 1.0028633483618154e-06, "loss": 0.3919, 
"step": 126605 }, { "epoch": 4.563015821530255, "grad_norm": 0.2586185336112976, "learning_rate": 1.0020452893748006e-06, "loss": 0.3887, "step": 126610 }, { "epoch": 4.56319602119148, "grad_norm": 0.2781605124473572, "learning_rate": 1.0012275573552748e-06, "loss": 0.3715, "step": 126615 }, { "epoch": 4.563376220852705, "grad_norm": 0.27998635172843933, "learning_rate": 1.0004101523143794e-06, "loss": 0.383, "step": 126620 }, { "epoch": 4.56355642051393, "grad_norm": 0.29203617572784424, "learning_rate": 9.99593074263247e-07, "loss": 0.3644, "step": 126625 }, { "epoch": 4.563736620175154, "grad_norm": 0.2884252965450287, "learning_rate": 9.987763232130132e-07, "loss": 0.4022, "step": 126630 }, { "epoch": 4.563916819836379, "grad_norm": 0.26518526673316956, "learning_rate": 9.979598991748023e-07, "loss": 0.3545, "step": 126635 }, { "epoch": 4.564097019497604, "grad_norm": 0.28502172231674194, "learning_rate": 9.971438021597473e-07, "loss": 0.3894, "step": 126640 }, { "epoch": 4.564277219158828, "grad_norm": 0.2587156593799591, "learning_rate": 9.96328032178953e-07, "loss": 0.3615, "step": 126645 }, { "epoch": 4.564457418820052, "grad_norm": 0.2818557024002075, "learning_rate": 9.955125892435468e-07, "loss": 0.3558, "step": 126650 }, { "epoch": 4.564637618481277, "grad_norm": 0.2464769035577774, "learning_rate": 9.946974733646335e-07, "loss": 0.3825, "step": 126655 }, { "epoch": 4.564817818142502, "grad_norm": 0.21170103549957275, "learning_rate": 9.938826845533183e-07, "loss": 0.3545, "step": 126660 }, { "epoch": 4.564998017803727, "grad_norm": 0.22021377086639404, "learning_rate": 9.930682228207089e-07, "loss": 0.3835, "step": 126665 }, { "epoch": 4.565178217464951, "grad_norm": 0.24116267263889313, "learning_rate": 9.922540881778936e-07, "loss": 0.3767, "step": 126670 }, { "epoch": 4.565358417126176, "grad_norm": 0.20193111896514893, "learning_rate": 9.914402806359635e-07, "loss": 0.3498, "step": 126675 }, { "epoch": 4.565538616787401, "grad_norm": 
0.3153676986694336, "learning_rate": 9.906268002060155e-07, "loss": 0.3707, "step": 126680 }, { "epoch": 4.565718816448625, "grad_norm": 0.21089020371437073, "learning_rate": 9.898136468991293e-07, "loss": 0.362, "step": 126685 }, { "epoch": 4.565899016109849, "grad_norm": 0.22067415714263916, "learning_rate": 9.890008207263852e-07, "loss": 0.3148, "step": 126690 }, { "epoch": 4.566079215771074, "grad_norm": 0.27838194370269775, "learning_rate": 9.881883216988547e-07, "loss": 0.3914, "step": 126695 }, { "epoch": 4.566259415432299, "grad_norm": 0.2392108291387558, "learning_rate": 9.87376149827607e-07, "loss": 0.3841, "step": 126700 }, { "epoch": 4.566439615093524, "grad_norm": 0.2691670358181, "learning_rate": 9.865643051237134e-07, "loss": 0.3909, "step": 126705 }, { "epoch": 4.566619814754748, "grad_norm": 0.29195481538772583, "learning_rate": 9.857527875982292e-07, "loss": 0.4007, "step": 126710 }, { "epoch": 4.566800014415973, "grad_norm": 0.2075537145137787, "learning_rate": 9.849415972622178e-07, "loss": 0.3993, "step": 126715 }, { "epoch": 4.566980214077198, "grad_norm": 0.24173662066459656, "learning_rate": 9.841307341267258e-07, "loss": 0.3896, "step": 126720 }, { "epoch": 4.5671604137384225, "grad_norm": 0.21936601400375366, "learning_rate": 9.833201982028e-07, "loss": 0.369, "step": 126725 }, { "epoch": 4.567340613399647, "grad_norm": 0.2615318298339844, "learning_rate": 9.8250998950149e-07, "loss": 0.3511, "step": 126730 }, { "epoch": 4.567520813060871, "grad_norm": 0.23326557874679565, "learning_rate": 9.81700108033834e-07, "loss": 0.347, "step": 126735 }, { "epoch": 4.567701012722096, "grad_norm": 0.26399147510528564, "learning_rate": 9.808905538108593e-07, "loss": 0.3568, "step": 126740 }, { "epoch": 4.567881212383321, "grad_norm": 0.23725526034832, "learning_rate": 9.800813268436016e-07, "loss": 0.4013, "step": 126745 }, { "epoch": 4.568061412044545, "grad_norm": 0.23820684850215912, "learning_rate": 9.792724271430881e-07, "loss": 0.3677, "step": 
126750 }, { "epoch": 4.56824161170577, "grad_norm": 0.23823130130767822, "learning_rate": 9.78463854720335e-07, "loss": 0.3627, "step": 126755 }, { "epoch": 4.568421811366995, "grad_norm": 0.2619285583496094, "learning_rate": 9.776556095863615e-07, "loss": 0.3665, "step": 126760 }, { "epoch": 4.5686020110282195, "grad_norm": 0.2296546995639801, "learning_rate": 9.768476917521752e-07, "loss": 0.3725, "step": 126765 }, { "epoch": 4.568782210689444, "grad_norm": 0.24734491109848022, "learning_rate": 9.76040101228795e-07, "loss": 0.3575, "step": 126770 }, { "epoch": 4.568962410350668, "grad_norm": 0.24005775153636932, "learning_rate": 9.752328380272125e-07, "loss": 0.3752, "step": 126775 }, { "epoch": 4.569142610011893, "grad_norm": 0.27801743149757385, "learning_rate": 9.74425902158435e-07, "loss": 0.4004, "step": 126780 }, { "epoch": 4.5693228096731175, "grad_norm": 0.23164783418178558, "learning_rate": 9.736192936334516e-07, "loss": 0.3666, "step": 126785 }, { "epoch": 4.569503009334342, "grad_norm": 0.2679736912250519, "learning_rate": 9.72813012463253e-07, "loss": 0.4026, "step": 126790 }, { "epoch": 4.569683208995567, "grad_norm": 0.21138164401054382, "learning_rate": 9.72007058658822e-07, "loss": 0.4035, "step": 126795 }, { "epoch": 4.569863408656792, "grad_norm": 0.27566009759902954, "learning_rate": 9.712014322311475e-07, "loss": 0.3793, "step": 126800 }, { "epoch": 4.5700436083180165, "grad_norm": 0.20621134340763092, "learning_rate": 9.703961331912009e-07, "loss": 0.367, "step": 126805 }, { "epoch": 4.570223807979241, "grad_norm": 0.2810385227203369, "learning_rate": 9.695911615499542e-07, "loss": 0.3406, "step": 126810 }, { "epoch": 4.570404007640466, "grad_norm": 0.24304364621639252, "learning_rate": 9.687865173183762e-07, "loss": 0.3867, "step": 126815 }, { "epoch": 4.570584207301691, "grad_norm": 0.2781292200088501, "learning_rate": 9.679822005074251e-07, "loss": 0.4125, "step": 126820 }, { "epoch": 4.5707644069629145, "grad_norm": 0.2525552809238434, 
"learning_rate": 9.671782111280698e-07, "loss": 0.3521, "step": 126825 }, { "epoch": 4.570944606624139, "grad_norm": 0.19801495969295502, "learning_rate": 9.66374549191254e-07, "loss": 0.3962, "step": 126830 }, { "epoch": 4.571124806285364, "grad_norm": 0.26284393668174744, "learning_rate": 9.65571214707936e-07, "loss": 0.3921, "step": 126835 }, { "epoch": 4.571305005946589, "grad_norm": 0.24263347685337067, "learning_rate": 9.647682076890541e-07, "loss": 0.3704, "step": 126840 }, { "epoch": 4.5714852056078135, "grad_norm": 0.2582213580608368, "learning_rate": 9.639655281455496e-07, "loss": 0.3755, "step": 126845 }, { "epoch": 4.571665405269038, "grad_norm": 0.2347324639558792, "learning_rate": 9.631631760883692e-07, "loss": 0.3776, "step": 126850 }, { "epoch": 4.571845604930263, "grad_norm": 0.26129037141799927, "learning_rate": 9.62361151528432e-07, "loss": 0.386, "step": 126855 }, { "epoch": 4.572025804591488, "grad_norm": 0.3019455373287201, "learning_rate": 9.615594544766681e-07, "loss": 0.3658, "step": 126860 }, { "epoch": 4.5722060042527115, "grad_norm": 0.24731415510177612, "learning_rate": 9.60758084944005e-07, "loss": 0.3629, "step": 126865 }, { "epoch": 4.572386203913936, "grad_norm": 0.2329123318195343, "learning_rate": 9.599570429413591e-07, "loss": 0.3584, "step": 126870 }, { "epoch": 4.572566403575161, "grad_norm": 0.270871102809906, "learning_rate": 9.591563284796435e-07, "loss": 0.3416, "step": 126875 }, { "epoch": 4.572746603236386, "grad_norm": 0.2525206208229065, "learning_rate": 9.58355941569769e-07, "loss": 0.3663, "step": 126880 }, { "epoch": 4.57292680289761, "grad_norm": 0.25082695484161377, "learning_rate": 9.575558822226354e-07, "loss": 0.3574, "step": 126885 }, { "epoch": 4.573107002558835, "grad_norm": 0.25247716903686523, "learning_rate": 9.567561504491535e-07, "loss": 0.3674, "step": 126890 }, { "epoch": 4.57328720222006, "grad_norm": 0.3109354078769684, "learning_rate": 9.559567462602143e-07, "loss": 0.3811, "step": 126895 }, { 
"epoch": 4.573467401881285, "grad_norm": 0.26343855261802673, "learning_rate": 9.551576696667092e-07, "loss": 0.3876, "step": 126900 }, { "epoch": 4.573647601542509, "grad_norm": 0.28081122040748596, "learning_rate": 9.545186442679322e-07, "loss": 0.3957, "step": 126905 }, { "epoch": 4.573827801203734, "grad_norm": 0.2132662832736969, "learning_rate": 9.537201573736415e-07, "loss": 0.373, "step": 126910 }, { "epoch": 4.574008000864959, "grad_norm": 0.3238392770290375, "learning_rate": 9.52921998105255e-07, "loss": 0.3838, "step": 126915 }, { "epoch": 4.574188200526183, "grad_norm": 0.30945807695388794, "learning_rate": 9.521241664736558e-07, "loss": 0.3599, "step": 126920 }, { "epoch": 4.574368400187407, "grad_norm": 0.28242048621177673, "learning_rate": 9.513266624897072e-07, "loss": 0.3748, "step": 126925 }, { "epoch": 4.574548599848632, "grad_norm": 0.24541223049163818, "learning_rate": 9.505294861642727e-07, "loss": 0.386, "step": 126930 }, { "epoch": 4.574728799509857, "grad_norm": 0.23810042440891266, "learning_rate": 9.497326375082216e-07, "loss": 0.3933, "step": 126935 }, { "epoch": 4.574908999171082, "grad_norm": 0.2471175640821457, "learning_rate": 9.489361165324062e-07, "loss": 0.3648, "step": 126940 }, { "epoch": 4.575089198832306, "grad_norm": 0.2977747321128845, "learning_rate": 9.481399232476817e-07, "loss": 0.4237, "step": 126945 }, { "epoch": 4.575269398493531, "grad_norm": 0.2620290517807007, "learning_rate": 9.473440576648923e-07, "loss": 0.3927, "step": 126950 }, { "epoch": 4.575449598154756, "grad_norm": 0.2539403736591339, "learning_rate": 9.46548519794882e-07, "loss": 0.3549, "step": 126955 }, { "epoch": 4.57562979781598, "grad_norm": 0.2643360197544098, "learning_rate": 9.457533096484922e-07, "loss": 0.3877, "step": 126960 }, { "epoch": 4.575809997477204, "grad_norm": 0.22208766639232635, "learning_rate": 9.449584272365585e-07, "loss": 0.3733, "step": 126965 }, { "epoch": 4.575990197138429, "grad_norm": 0.22070464491844177, "learning_rate": 
9.441638725699059e-07, "loss": 0.3158, "step": 126970 }, { "epoch": 4.576170396799654, "grad_norm": 0.2639147937297821, "learning_rate": 9.433696456593671e-07, "loss": 0.3675, "step": 126975 }, { "epoch": 4.576350596460879, "grad_norm": 0.2987234592437744, "learning_rate": 9.425757465157531e-07, "loss": 0.4199, "step": 126980 }, { "epoch": 4.576530796122103, "grad_norm": 0.24704287946224213, "learning_rate": 9.417821751498912e-07, "loss": 0.3624, "step": 126985 }, { "epoch": 4.576710995783328, "grad_norm": 0.23351232707500458, "learning_rate": 9.409889315725895e-07, "loss": 0.3265, "step": 126990 }, { "epoch": 4.576891195444553, "grad_norm": 0.23723867535591125, "learning_rate": 9.40196015794656e-07, "loss": 0.4, "step": 126995 }, { "epoch": 4.5770713951057775, "grad_norm": 0.24983817338943481, "learning_rate": 9.394034278268931e-07, "loss": 0.3816, "step": 127000 }, { "epoch": 4.5770713951057775, "eval_loss": 0.42867863178253174, "eval_runtime": 3.5177, "eval_samples_per_second": 28.428, "eval_steps_per_second": 7.107, "step": 127000 }, { "epoch": 4.577251594767002, "grad_norm": 0.2775065302848816, "learning_rate": 9.386111676801007e-07, "loss": 0.414, "step": 127005 }, { "epoch": 4.577431794428226, "grad_norm": 0.31633272767066956, "learning_rate": 9.3781923536507e-07, "loss": 0.3846, "step": 127010 }, { "epoch": 4.577611994089451, "grad_norm": 0.2062579095363617, "learning_rate": 9.37027630892598e-07, "loss": 0.3812, "step": 127015 }, { "epoch": 4.577792193750676, "grad_norm": 0.3232049345970154, "learning_rate": 9.36236354273462e-07, "loss": 0.3796, "step": 127020 }, { "epoch": 4.5779723934119, "grad_norm": 0.24797019362449646, "learning_rate": 9.35445405518448e-07, "loss": 0.384, "step": 127025 }, { "epoch": 4.578152593073125, "grad_norm": 0.23074999451637268, "learning_rate": 9.346547846383308e-07, "loss": 0.3794, "step": 127030 }, { "epoch": 4.57833279273435, "grad_norm": 0.2560942769050598, "learning_rate": 9.338644916438849e-07, "loss": 0.3497, "step": 
127035 }, { "epoch": 4.5785129923955745, "grad_norm": 0.20177873969078064, "learning_rate": 9.33074526545874e-07, "loss": 0.3644, "step": 127040 }, { "epoch": 4.578693192056799, "grad_norm": 0.2853398025035858, "learning_rate": 9.322848893550645e-07, "loss": 0.3395, "step": 127045 }, { "epoch": 4.578873391718023, "grad_norm": 0.26799461245536804, "learning_rate": 9.314955800822117e-07, "loss": 0.3263, "step": 127050 }, { "epoch": 4.579053591379248, "grad_norm": 0.2538415491580963, "learning_rate": 9.307065987380736e-07, "loss": 0.356, "step": 127055 }, { "epoch": 4.579233791040473, "grad_norm": 0.2968897819519043, "learning_rate": 9.299179453333973e-07, "loss": 0.3384, "step": 127060 }, { "epoch": 4.579413990701697, "grad_norm": 0.2750302255153656, "learning_rate": 9.291296198789296e-07, "loss": 0.3623, "step": 127065 }, { "epoch": 4.579594190362922, "grad_norm": 0.2515812814235687, "learning_rate": 9.283416223854119e-07, "loss": 0.3995, "step": 127070 }, { "epoch": 4.579774390024147, "grad_norm": 0.2370171695947647, "learning_rate": 9.275539528635746e-07, "loss": 0.3846, "step": 127075 }, { "epoch": 4.5799545896853715, "grad_norm": 0.2444002330303192, "learning_rate": 9.267666113241563e-07, "loss": 0.3358, "step": 127080 }, { "epoch": 4.580134789346596, "grad_norm": 0.25110915303230286, "learning_rate": 9.259795977778846e-07, "loss": 0.3625, "step": 127085 }, { "epoch": 4.580314989007821, "grad_norm": 0.2334173619747162, "learning_rate": 9.251929122354785e-07, "loss": 0.3718, "step": 127090 }, { "epoch": 4.580495188669046, "grad_norm": 0.3335031270980835, "learning_rate": 9.244065547076574e-07, "loss": 0.3461, "step": 127095 }, { "epoch": 4.58067538833027, "grad_norm": 0.286377489566803, "learning_rate": 9.23620525205135e-07, "loss": 0.3706, "step": 127100 }, { "epoch": 4.580855587991494, "grad_norm": 0.28314265608787537, "learning_rate": 9.228348237386248e-07, "loss": 0.3618, "step": 127105 }, { "epoch": 4.581035787652719, "grad_norm": 0.29264768958091736, 
"learning_rate": 9.220494503188265e-07, "loss": 0.3632, "step": 127110 }, { "epoch": 4.581215987313944, "grad_norm": 0.22267203032970428, "learning_rate": 9.212644049564401e-07, "loss": 0.3707, "step": 127115 }, { "epoch": 4.5813961869751685, "grad_norm": 0.26017507910728455, "learning_rate": 9.204796876621679e-07, "loss": 0.3522, "step": 127120 }, { "epoch": 4.581576386636393, "grad_norm": 0.297837495803833, "learning_rate": 9.196952984466961e-07, "loss": 0.3786, "step": 127125 }, { "epoch": 4.581756586297618, "grad_norm": 0.23576004803180695, "learning_rate": 9.189112373207188e-07, "loss": 0.3523, "step": 127130 }, { "epoch": 4.581936785958843, "grad_norm": 0.22233489155769348, "learning_rate": 9.181275042949078e-07, "loss": 0.3784, "step": 127135 }, { "epoch": 4.582116985620067, "grad_norm": 0.19266854226589203, "learning_rate": 9.173440993799492e-07, "loss": 0.3736, "step": 127140 }, { "epoch": 4.582297185281291, "grad_norm": 0.2523898482322693, "learning_rate": 9.165610225865152e-07, "loss": 0.3829, "step": 127145 }, { "epoch": 4.582477384942516, "grad_norm": 0.26051437854766846, "learning_rate": 9.157782739252718e-07, "loss": 0.3785, "step": 127150 }, { "epoch": 4.582657584603741, "grad_norm": 0.3132672607898712, "learning_rate": 9.149958534068915e-07, "loss": 0.349, "step": 127155 }, { "epoch": 4.5828377842649655, "grad_norm": 0.24046078324317932, "learning_rate": 9.142137610420265e-07, "loss": 0.3656, "step": 127160 }, { "epoch": 4.58301798392619, "grad_norm": 0.2993525564670563, "learning_rate": 9.134319968413325e-07, "loss": 0.3774, "step": 127165 }, { "epoch": 4.583198183587415, "grad_norm": 0.24848264455795288, "learning_rate": 9.1265056081547e-07, "loss": 0.4103, "step": 127170 }, { "epoch": 4.58337838324864, "grad_norm": 0.24963483214378357, "learning_rate": 9.118694529750782e-07, "loss": 0.3906, "step": 127175 }, { "epoch": 4.583558582909864, "grad_norm": 0.3283641040325165, "learning_rate": 9.11088673330801e-07, "loss": 0.4036, "step": 127180 }, { 
"epoch": 4.583738782571089, "grad_norm": 0.24759909510612488, "learning_rate": 9.103082218932774e-07, "loss": 0.3573, "step": 127185 }, { "epoch": 4.583918982232314, "grad_norm": 0.2345927208662033, "learning_rate": 9.09528098673143e-07, "loss": 0.3574, "step": 127190 }, { "epoch": 4.584099181893538, "grad_norm": 0.2747328579425812, "learning_rate": 9.087483036810174e-07, "loss": 0.3511, "step": 127195 }, { "epoch": 4.5842793815547624, "grad_norm": 0.2693836987018585, "learning_rate": 9.07968836927539e-07, "loss": 0.3638, "step": 127200 }, { "epoch": 4.584459581215987, "grad_norm": 0.29041051864624023, "learning_rate": 9.071896984233163e-07, "loss": 0.4105, "step": 127205 }, { "epoch": 4.584639780877212, "grad_norm": 0.23431363701820374, "learning_rate": 9.064108881789712e-07, "loss": 0.3591, "step": 127210 }, { "epoch": 4.584819980538437, "grad_norm": 0.21707414090633392, "learning_rate": 9.056324062051147e-07, "loss": 0.3548, "step": 127215 }, { "epoch": 4.585000180199661, "grad_norm": 0.2566896378993988, "learning_rate": 9.048542525123493e-07, "loss": 0.3801, "step": 127220 }, { "epoch": 4.585180379860886, "grad_norm": 0.22780990600585938, "learning_rate": 9.040764271112806e-07, "loss": 0.3584, "step": 127225 }, { "epoch": 4.585360579522111, "grad_norm": 0.28133270144462585, "learning_rate": 9.032989300125055e-07, "loss": 0.3961, "step": 127230 }, { "epoch": 4.585540779183335, "grad_norm": 0.24993495643138885, "learning_rate": 9.025217612266157e-07, "loss": 0.3699, "step": 127235 }, { "epoch": 4.585720978844559, "grad_norm": 0.20704279839992523, "learning_rate": 9.017449207642026e-07, "loss": 0.3575, "step": 127240 }, { "epoch": 4.585901178505784, "grad_norm": 0.22936898469924927, "learning_rate": 9.009684086358494e-07, "loss": 0.3541, "step": 127245 }, { "epoch": 4.586081378167009, "grad_norm": 0.26734253764152527, "learning_rate": 9.001922248521366e-07, "loss": 0.399, "step": 127250 }, { "epoch": 4.586261577828234, "grad_norm": 0.26072803139686584, 
"learning_rate": 8.994163694236391e-07, "loss": 0.3603, "step": 127255 }, { "epoch": 4.586441777489458, "grad_norm": 0.21624024212360382, "learning_rate": 8.98640842360926e-07, "loss": 0.3603, "step": 127260 }, { "epoch": 4.586621977150683, "grad_norm": 0.2746710479259491, "learning_rate": 8.97865643674567e-07, "loss": 0.3696, "step": 127265 }, { "epoch": 4.586802176811908, "grad_norm": 0.2964709997177124, "learning_rate": 8.970907733751199e-07, "loss": 0.3768, "step": 127270 }, { "epoch": 4.586982376473133, "grad_norm": 0.2429097592830658, "learning_rate": 8.96316231473146e-07, "loss": 0.3656, "step": 127275 }, { "epoch": 4.587162576134357, "grad_norm": 0.22786717116832733, "learning_rate": 8.955420179791979e-07, "loss": 0.3619, "step": 127280 }, { "epoch": 4.587342775795581, "grad_norm": 0.30810287594795227, "learning_rate": 8.947681329038198e-07, "loss": 0.3812, "step": 127285 }, { "epoch": 4.587522975456806, "grad_norm": 0.2238711714744568, "learning_rate": 8.939945762575619e-07, "loss": 0.3629, "step": 127290 }, { "epoch": 4.587703175118031, "grad_norm": 0.22540000081062317, "learning_rate": 8.932213480509627e-07, "loss": 0.3756, "step": 127295 }, { "epoch": 4.587883374779255, "grad_norm": 0.2444496601819992, "learning_rate": 8.9244844829455e-07, "loss": 0.3786, "step": 127300 }, { "epoch": 4.58806357444048, "grad_norm": 0.2592555582523346, "learning_rate": 8.916758769988626e-07, "loss": 0.3718, "step": 127305 }, { "epoch": 4.588243774101705, "grad_norm": 0.26862582564353943, "learning_rate": 8.909036341744226e-07, "loss": 0.373, "step": 127310 }, { "epoch": 4.5884239737629295, "grad_norm": 0.24260741472244263, "learning_rate": 8.901317198317577e-07, "loss": 0.3717, "step": 127315 }, { "epoch": 4.588604173424154, "grad_norm": 0.24818958342075348, "learning_rate": 8.893601339813762e-07, "loss": 0.3762, "step": 127320 }, { "epoch": 4.588784373085378, "grad_norm": 0.2344391644001007, "learning_rate": 8.885888766337947e-07, "loss": 0.39, "step": 127325 }, { 
"epoch": 4.588964572746603, "grad_norm": 0.28583937883377075, "learning_rate": 8.878179477995214e-07, "loss": 0.3784, "step": 127330 }, { "epoch": 4.589144772407828, "grad_norm": 0.24101144075393677, "learning_rate": 8.870473474890617e-07, "loss": 0.3615, "step": 127335 }, { "epoch": 4.589324972069052, "grad_norm": 0.2540043890476227, "learning_rate": 8.86277075712913e-07, "loss": 0.3619, "step": 127340 }, { "epoch": 4.589505171730277, "grad_norm": 0.2533954381942749, "learning_rate": 8.855071324815723e-07, "loss": 0.3504, "step": 127345 }, { "epoch": 4.589685371391502, "grad_norm": 0.20439958572387695, "learning_rate": 8.847375178055228e-07, "loss": 0.3585, "step": 127350 }, { "epoch": 4.5898655710527265, "grad_norm": 0.22959500551223755, "learning_rate": 8.839682316952619e-07, "loss": 0.3976, "step": 127355 }, { "epoch": 4.590045770713951, "grad_norm": 0.22936657071113586, "learning_rate": 8.831992741612616e-07, "loss": 0.3839, "step": 127360 }, { "epoch": 4.590225970375176, "grad_norm": 0.2100255936384201, "learning_rate": 8.824306452140025e-07, "loss": 0.3651, "step": 127365 }, { "epoch": 4.590406170036401, "grad_norm": 0.26398009061813354, "learning_rate": 8.816623448639594e-07, "loss": 0.3644, "step": 127370 }, { "epoch": 4.5905863696976255, "grad_norm": 0.2545202374458313, "learning_rate": 8.808943731215935e-07, "loss": 0.3634, "step": 127375 }, { "epoch": 4.590766569358849, "grad_norm": 0.2735772430896759, "learning_rate": 8.801267299973715e-07, "loss": 0.3916, "step": 127380 }, { "epoch": 4.590946769020074, "grad_norm": 0.2763000428676605, "learning_rate": 8.793594155017598e-07, "loss": 0.3839, "step": 127385 }, { "epoch": 4.591126968681299, "grad_norm": 0.26428860425949097, "learning_rate": 8.785924296451975e-07, "loss": 0.381, "step": 127390 }, { "epoch": 4.5913071683425235, "grad_norm": 0.2190413475036621, "learning_rate": 8.778257724381483e-07, "loss": 0.357, "step": 127395 }, { "epoch": 4.591487368003748, "grad_norm": 0.2818199694156647, 
"learning_rate": 8.770594438910512e-07, "loss": 0.4062, "step": 127400 }, { "epoch": 4.591667567664973, "grad_norm": 0.25005903840065, "learning_rate": 8.762934440143478e-07, "loss": 0.3673, "step": 127405 }, { "epoch": 4.591847767326198, "grad_norm": 0.2070559412240982, "learning_rate": 8.755277728184796e-07, "loss": 0.3492, "step": 127410 }, { "epoch": 4.592027966987422, "grad_norm": 0.23622746765613556, "learning_rate": 8.747624303138746e-07, "loss": 0.3909, "step": 127415 }, { "epoch": 4.592208166648646, "grad_norm": 0.32990074157714844, "learning_rate": 8.739974165109549e-07, "loss": 0.3831, "step": 127420 }, { "epoch": 4.592388366309871, "grad_norm": 0.21549347043037415, "learning_rate": 8.732327314201539e-07, "loss": 0.3512, "step": 127425 }, { "epoch": 4.592568565971096, "grad_norm": 0.23620551824569702, "learning_rate": 8.724683750518853e-07, "loss": 0.3524, "step": 127430 }, { "epoch": 4.5927487656323205, "grad_norm": 0.2451809197664261, "learning_rate": 8.717043474165659e-07, "loss": 0.3549, "step": 127435 }, { "epoch": 4.592928965293545, "grad_norm": 0.2065710574388504, "learning_rate": 8.709406485246013e-07, "loss": 0.3637, "step": 127440 }, { "epoch": 4.59310916495477, "grad_norm": 0.2208905667066574, "learning_rate": 8.701772783863943e-07, "loss": 0.3542, "step": 127445 }, { "epoch": 4.593289364615995, "grad_norm": 0.2570354640483856, "learning_rate": 8.69414237012356e-07, "loss": 0.3631, "step": 127450 }, { "epoch": 4.593469564277219, "grad_norm": 0.2893807888031006, "learning_rate": 8.686515244128752e-07, "loss": 0.3994, "step": 127455 }, { "epoch": 4.593649763938444, "grad_norm": 0.2452847957611084, "learning_rate": 8.678891405983464e-07, "loss": 0.367, "step": 127460 }, { "epoch": 4.593829963599669, "grad_norm": 0.28028956055641174, "learning_rate": 8.671270855791558e-07, "loss": 0.388, "step": 127465 }, { "epoch": 4.594010163260893, "grad_norm": 0.2985093295574188, "learning_rate": 8.66365359365684e-07, "loss": 0.4046, "step": 127470 }, { 
"epoch": 4.5941903629221175, "grad_norm": 0.2665720283985138, "learning_rate": 8.656039619683143e-07, "loss": 0.3886, "step": 127475 }, { "epoch": 4.594370562583342, "grad_norm": 0.21940386295318604, "learning_rate": 8.64842893397419e-07, "loss": 0.3497, "step": 127480 }, { "epoch": 4.594550762244567, "grad_norm": 0.21801802515983582, "learning_rate": 8.640821536633647e-07, "loss": 0.3911, "step": 127485 }, { "epoch": 4.594730961905792, "grad_norm": 0.22866007685661316, "learning_rate": 8.633217427765184e-07, "loss": 0.3706, "step": 127490 }, { "epoch": 4.594911161567016, "grad_norm": 0.21963122487068176, "learning_rate": 8.625616607472381e-07, "loss": 0.3706, "step": 127495 }, { "epoch": 4.595091361228241, "grad_norm": 0.22869311273097992, "learning_rate": 8.618019075858852e-07, "loss": 0.3886, "step": 127500 }, { "epoch": 4.595091361228241, "eval_loss": 0.42893698811531067, "eval_runtime": 3.5318, "eval_samples_per_second": 28.314, "eval_steps_per_second": 7.078, "step": 127500 }, { "epoch": 4.595271560889466, "grad_norm": 0.26364636421203613, "learning_rate": 8.610424833028097e-07, "loss": 0.3719, "step": 127505 }, { "epoch": 4.59545176055069, "grad_norm": 0.2287474125623703, "learning_rate": 8.602833879083505e-07, "loss": 0.382, "step": 127510 }, { "epoch": 4.5956319602119144, "grad_norm": 0.2660192549228668, "learning_rate": 8.595246214128605e-07, "loss": 0.3737, "step": 127515 }, { "epoch": 4.595812159873139, "grad_norm": 0.2630321681499481, "learning_rate": 8.587661838266731e-07, "loss": 0.3626, "step": 127520 }, { "epoch": 4.595992359534364, "grad_norm": 0.21891890466213226, "learning_rate": 8.580080751601244e-07, "loss": 0.3754, "step": 127525 }, { "epoch": 4.596172559195589, "grad_norm": 0.2554580271244049, "learning_rate": 8.572502954235422e-07, "loss": 0.4112, "step": 127530 }, { "epoch": 4.596352758856813, "grad_norm": 0.2666684091091156, "learning_rate": 8.564928446272463e-07, "loss": 0.3948, "step": 127535 }, { "epoch": 4.596532958518038, 
"grad_norm": 0.22191660106182098, "learning_rate": 8.557357227815615e-07, "loss": 0.3486, "step": 127540 }, { "epoch": 4.596713158179263, "grad_norm": 0.2158440500497818, "learning_rate": 8.549789298968075e-07, "loss": 0.3562, "step": 127545 }, { "epoch": 4.596893357840488, "grad_norm": 0.2409515678882599, "learning_rate": 8.542224659832871e-07, "loss": 0.3832, "step": 127550 }, { "epoch": 4.597073557501712, "grad_norm": 0.3206073045730591, "learning_rate": 8.534663310513141e-07, "loss": 0.3694, "step": 127555 }, { "epoch": 4.597253757162936, "grad_norm": 0.2606680393218994, "learning_rate": 8.527105251111833e-07, "loss": 0.3742, "step": 127560 }, { "epoch": 4.597433956824161, "grad_norm": 0.3032326102256775, "learning_rate": 8.519550481731975e-07, "loss": 0.3228, "step": 127565 }, { "epoch": 4.597614156485386, "grad_norm": 0.303232342004776, "learning_rate": 8.511999002476512e-07, "loss": 0.396, "step": 127570 }, { "epoch": 4.59779435614661, "grad_norm": 0.2779444754123688, "learning_rate": 8.504450813448306e-07, "loss": 0.3345, "step": 127575 }, { "epoch": 4.597974555807835, "grad_norm": 0.2656551003456116, "learning_rate": 8.496905914750192e-07, "loss": 0.3711, "step": 127580 }, { "epoch": 4.59815475546906, "grad_norm": 0.2177533507347107, "learning_rate": 8.489364306484976e-07, "loss": 0.3436, "step": 127585 }, { "epoch": 4.598334955130285, "grad_norm": 0.25983306765556335, "learning_rate": 8.481825988755382e-07, "loss": 0.3416, "step": 127590 }, { "epoch": 4.598515154791509, "grad_norm": 0.23284339904785156, "learning_rate": 8.474290961664244e-07, "loss": 0.333, "step": 127595 }, { "epoch": 4.598695354452733, "grad_norm": 0.2582654356956482, "learning_rate": 8.466759225314063e-07, "loss": 0.33, "step": 127600 }, { "epoch": 4.598875554113958, "grad_norm": 0.31043708324432373, "learning_rate": 8.459230779807508e-07, "loss": 0.3663, "step": 127605 }, { "epoch": 4.599055753775183, "grad_norm": 0.25590458512306213, "learning_rate": 8.451705625247191e-07, "loss": 
0.3797, "step": 127610 }, { "epoch": 4.599235953436407, "grad_norm": 0.2824951410293579, "learning_rate": 8.444183761735641e-07, "loss": 0.385, "step": 127615 }, { "epoch": 4.599416153097632, "grad_norm": 0.3160955309867859, "learning_rate": 8.436665189375303e-07, "loss": 0.351, "step": 127620 }, { "epoch": 4.599596352758857, "grad_norm": 0.29997727274894714, "learning_rate": 8.429149908268652e-07, "loss": 0.3823, "step": 127625 }, { "epoch": 4.5997765524200815, "grad_norm": 0.2773001194000244, "learning_rate": 8.421637918518022e-07, "loss": 0.3737, "step": 127630 }, { "epoch": 4.599956752081306, "grad_norm": 0.2861831486225128, "learning_rate": 8.41412922022583e-07, "loss": 0.3375, "step": 127635 }, { "epoch": 4.600136951742531, "grad_norm": 0.27983224391937256, "learning_rate": 8.406623813494358e-07, "loss": 0.365, "step": 127640 }, { "epoch": 4.600317151403756, "grad_norm": 0.26674070954322815, "learning_rate": 8.399121698425855e-07, "loss": 0.3933, "step": 127645 }, { "epoch": 4.6004973510649805, "grad_norm": 0.23168079555034637, "learning_rate": 8.391622875122545e-07, "loss": 0.3665, "step": 127650 }, { "epoch": 4.600677550726204, "grad_norm": 0.2457718551158905, "learning_rate": 8.384127343686599e-07, "loss": 0.3455, "step": 127655 }, { "epoch": 4.600857750387429, "grad_norm": 0.25445955991744995, "learning_rate": 8.376635104220126e-07, "loss": 0.3738, "step": 127660 }, { "epoch": 4.601037950048654, "grad_norm": 0.23274029791355133, "learning_rate": 8.369146156825269e-07, "loss": 0.3727, "step": 127665 }, { "epoch": 4.6012181497098785, "grad_norm": 0.2078734189271927, "learning_rate": 8.361660501603946e-07, "loss": 0.3685, "step": 127670 }, { "epoch": 4.601398349371103, "grad_norm": 0.24284294247627258, "learning_rate": 8.354178138658269e-07, "loss": 0.3768, "step": 127675 }, { "epoch": 4.601578549032328, "grad_norm": 0.3138650059700012, "learning_rate": 8.346699068090074e-07, "loss": 0.3789, "step": 127680 }, { "epoch": 4.601758748693553, "grad_norm": 
0.23819488286972046, "learning_rate": 8.339223290001363e-07, "loss": 0.3685, "step": 127685 }, { "epoch": 4.6019389483547775, "grad_norm": 0.2773071825504303, "learning_rate": 8.331750804493971e-07, "loss": 0.373, "step": 127690 }, { "epoch": 4.602119148016001, "grad_norm": 0.2507835924625397, "learning_rate": 8.324281611669621e-07, "loss": 0.3704, "step": 127695 }, { "epoch": 4.602299347677226, "grad_norm": 0.1873769313097, "learning_rate": 8.316815711630205e-07, "loss": 0.3327, "step": 127700 }, { "epoch": 4.602479547338451, "grad_norm": 0.23078793287277222, "learning_rate": 8.309353104477335e-07, "loss": 0.3727, "step": 127705 }, { "epoch": 4.6026597469996755, "grad_norm": 0.23477856814861298, "learning_rate": 8.301893790312765e-07, "loss": 0.3795, "step": 127710 }, { "epoch": 4.6028399466609, "grad_norm": 0.27823832631111145, "learning_rate": 8.294437769238106e-07, "loss": 0.3449, "step": 127715 }, { "epoch": 4.603020146322125, "grad_norm": 0.27423784136772156, "learning_rate": 8.286985041354889e-07, "loss": 0.3604, "step": 127720 }, { "epoch": 4.60320034598335, "grad_norm": 0.2629181146621704, "learning_rate": 8.279535606764755e-07, "loss": 0.3842, "step": 127725 }, { "epoch": 4.603380545644574, "grad_norm": 0.25341519713401794, "learning_rate": 8.272089465569121e-07, "loss": 0.3454, "step": 127730 }, { "epoch": 4.603560745305799, "grad_norm": 0.2602289617061615, "learning_rate": 8.264646617869493e-07, "loss": 0.3771, "step": 127735 }, { "epoch": 4.603740944967024, "grad_norm": 0.2383667379617691, "learning_rate": 8.25720706376723e-07, "loss": 0.3977, "step": 127740 }, { "epoch": 4.603921144628248, "grad_norm": 0.2735196650028229, "learning_rate": 8.249770803363727e-07, "loss": 0.3894, "step": 127745 }, { "epoch": 4.6041013442894725, "grad_norm": 0.2749096751213074, "learning_rate": 8.242337836760261e-07, "loss": 0.3741, "step": 127750 }, { "epoch": 4.604281543950697, "grad_norm": 0.22528064250946045, "learning_rate": 8.234908164058169e-07, "loss": 0.3538, 
"step": 127755 }, { "epoch": 4.604461743611922, "grad_norm": 0.22581934928894043, "learning_rate": 8.227481785358648e-07, "loss": 0.3696, "step": 127760 }, { "epoch": 4.604641943273147, "grad_norm": 0.31339600682258606, "learning_rate": 8.220058700762894e-07, "loss": 0.4024, "step": 127765 }, { "epoch": 4.604822142934371, "grad_norm": 0.2499316930770874, "learning_rate": 8.212638910372023e-07, "loss": 0.386, "step": 127770 }, { "epoch": 4.605002342595596, "grad_norm": 0.27650460600852966, "learning_rate": 8.205222414287089e-07, "loss": 0.378, "step": 127775 }, { "epoch": 4.605182542256821, "grad_norm": 0.2796940505504608, "learning_rate": 8.197809212609236e-07, "loss": 0.3827, "step": 127780 }, { "epoch": 4.605362741918045, "grad_norm": 0.24295328557491302, "learning_rate": 8.190399305439412e-07, "loss": 0.3327, "step": 127785 }, { "epoch": 4.6055429415792695, "grad_norm": 0.2779107689857483, "learning_rate": 8.182992692878561e-07, "loss": 0.3532, "step": 127790 }, { "epoch": 4.605723141240494, "grad_norm": 0.22981327772140503, "learning_rate": 8.175589375027631e-07, "loss": 0.3642, "step": 127795 }, { "epoch": 4.605903340901719, "grad_norm": 0.2713334262371063, "learning_rate": 8.168189351987488e-07, "loss": 0.3618, "step": 127800 }, { "epoch": 4.606083540562944, "grad_norm": 0.2509160339832306, "learning_rate": 8.160792623858909e-07, "loss": 0.3942, "step": 127805 }, { "epoch": 4.606263740224168, "grad_norm": 0.3234541416168213, "learning_rate": 8.153399190742761e-07, "loss": 0.3853, "step": 127810 }, { "epoch": 4.606443939885393, "grad_norm": 0.35334426164627075, "learning_rate": 8.146009052739656e-07, "loss": 0.4016, "step": 127815 }, { "epoch": 4.606624139546618, "grad_norm": 0.3184077739715576, "learning_rate": 8.138622209950403e-07, "loss": 0.3873, "step": 127820 }, { "epoch": 4.606804339207843, "grad_norm": 0.20487551391124725, "learning_rate": 8.131238662475588e-07, "loss": 0.329, "step": 127825 }, { "epoch": 4.606984538869067, "grad_norm": 
0.24536176025867462, "learning_rate": 8.123858410415825e-07, "loss": 0.3856, "step": 127830 }, { "epoch": 4.607164738530291, "grad_norm": 0.27180904150009155, "learning_rate": 8.116481453871672e-07, "loss": 0.3782, "step": 127835 }, { "epoch": 4.607344938191516, "grad_norm": 0.27243977785110474, "learning_rate": 8.109107792943577e-07, "loss": 0.3721, "step": 127840 }, { "epoch": 4.607525137852741, "grad_norm": 0.2593880295753479, "learning_rate": 8.101737427732098e-07, "loss": 0.3698, "step": 127845 }, { "epoch": 4.607705337513965, "grad_norm": 0.2483491450548172, "learning_rate": 8.0943703583376e-07, "loss": 0.3611, "step": 127850 }, { "epoch": 4.60788553717519, "grad_norm": 0.28499743342399597, "learning_rate": 8.087006584860501e-07, "loss": 0.3617, "step": 127855 }, { "epoch": 4.608065736836415, "grad_norm": 0.29936861991882324, "learning_rate": 8.079646107401085e-07, "loss": 0.3898, "step": 127860 }, { "epoch": 4.60824593649764, "grad_norm": 0.25267350673675537, "learning_rate": 8.072288926059601e-07, "loss": 0.3722, "step": 127865 }, { "epoch": 4.608426136158864, "grad_norm": 0.23155082762241364, "learning_rate": 8.064935040936417e-07, "loss": 0.3845, "step": 127870 }, { "epoch": 4.608606335820088, "grad_norm": 0.21807222068309784, "learning_rate": 8.057584452131644e-07, "loss": 0.3625, "step": 127875 }, { "epoch": 4.608786535481313, "grad_norm": 0.21520547568798065, "learning_rate": 8.0502371597454e-07, "loss": 0.388, "step": 127880 }, { "epoch": 4.608966735142538, "grad_norm": 0.3115110993385315, "learning_rate": 8.042893163877852e-07, "loss": 0.3558, "step": 127885 }, { "epoch": 4.609146934803762, "grad_norm": 0.2501071095466614, "learning_rate": 8.035552464629059e-07, "loss": 0.3805, "step": 127890 }, { "epoch": 4.609327134464987, "grad_norm": 0.27980107069015503, "learning_rate": 8.028215062098998e-07, "loss": 0.35, "step": 127895 }, { "epoch": 4.609507334126212, "grad_norm": 0.35373973846435547, "learning_rate": 8.020880956387672e-07, "loss": 0.3654, 
"step": 127900 }, { "epoch": 4.609687533787437, "grad_norm": 0.3000072240829468, "learning_rate": 8.013550147594945e-07, "loss": 0.3785, "step": 127905 }, { "epoch": 4.609867733448661, "grad_norm": 0.258821576833725, "learning_rate": 8.006222635820792e-07, "loss": 0.3589, "step": 127910 }, { "epoch": 4.610047933109886, "grad_norm": 0.2523718774318695, "learning_rate": 7.998898421165025e-07, "loss": 0.3906, "step": 127915 }, { "epoch": 4.610228132771111, "grad_norm": 0.2900547385215759, "learning_rate": 7.991577503727366e-07, "loss": 0.3752, "step": 127920 }, { "epoch": 4.6104083324323355, "grad_norm": 0.23753364384174347, "learning_rate": 7.984259883607653e-07, "loss": 0.343, "step": 127925 }, { "epoch": 4.610588532093559, "grad_norm": 0.24349209666252136, "learning_rate": 7.97694556090553e-07, "loss": 0.3692, "step": 127930 }, { "epoch": 4.610768731754784, "grad_norm": 0.2928297519683838, "learning_rate": 7.969634535720639e-07, "loss": 0.3934, "step": 127935 }, { "epoch": 4.610948931416009, "grad_norm": 0.29633548855781555, "learning_rate": 7.962326808152621e-07, "loss": 0.3514, "step": 127940 }, { "epoch": 4.6111291310772335, "grad_norm": 0.26136893033981323, "learning_rate": 7.955022378301064e-07, "loss": 0.3772, "step": 127945 }, { "epoch": 4.611309330738458, "grad_norm": 0.24420562386512756, "learning_rate": 7.947721246265472e-07, "loss": 0.352, "step": 127950 }, { "epoch": 4.611489530399683, "grad_norm": 0.24144157767295837, "learning_rate": 7.940423412145292e-07, "loss": 0.4035, "step": 127955 }, { "epoch": 4.611669730060908, "grad_norm": 0.19324125349521637, "learning_rate": 7.933128876039974e-07, "loss": 0.3594, "step": 127960 }, { "epoch": 4.6118499297221325, "grad_norm": 0.2407853752374649, "learning_rate": 7.925837638048967e-07, "loss": 0.394, "step": 127965 }, { "epoch": 4.612030129383356, "grad_norm": 0.2682248055934906, "learning_rate": 7.918549698271494e-07, "loss": 0.36, "step": 127970 }, { "epoch": 4.612210329044581, "grad_norm": 
0.21902526915073395, "learning_rate": 7.911265056806921e-07, "loss": 0.3495, "step": 127975 }, { "epoch": 4.612390528705806, "grad_norm": 0.25772321224212646, "learning_rate": 7.903983713754503e-07, "loss": 0.3791, "step": 127980 }, { "epoch": 4.6125707283670305, "grad_norm": 0.23041696846485138, "learning_rate": 7.896705669213411e-07, "loss": 0.4039, "step": 127985 }, { "epoch": 4.612750928028255, "grad_norm": 0.2209273725748062, "learning_rate": 7.889430923282898e-07, "loss": 0.3744, "step": 127990 }, { "epoch": 4.61293112768948, "grad_norm": 0.23536401987075806, "learning_rate": 7.882159476061968e-07, "loss": 0.3478, "step": 127995 }, { "epoch": 4.613111327350705, "grad_norm": 0.29315903782844543, "learning_rate": 7.874891327649709e-07, "loss": 0.4023, "step": 128000 }, { "epoch": 4.613111327350705, "eval_loss": 0.42875730991363525, "eval_runtime": 3.5339, "eval_samples_per_second": 28.298, "eval_steps_per_second": 7.074, "step": 128000 }, { "epoch": 4.6132915270119295, "grad_norm": 0.25371137261390686, "learning_rate": 7.867626478145235e-07, "loss": 0.3724, "step": 128005 }, { "epoch": 4.613471726673154, "grad_norm": 0.2467198222875595, "learning_rate": 7.86036492764744e-07, "loss": 0.386, "step": 128010 }, { "epoch": 4.613651926334379, "grad_norm": 0.36411789059638977, "learning_rate": 7.853106676255328e-07, "loss": 0.3526, "step": 128015 }, { "epoch": 4.613832125995603, "grad_norm": 0.31409749388694763, "learning_rate": 7.84585172406771e-07, "loss": 0.4179, "step": 128020 }, { "epoch": 4.6140123256568275, "grad_norm": 0.23237422108650208, "learning_rate": 7.838600071183478e-07, "loss": 0.3654, "step": 128025 }, { "epoch": 4.614192525318052, "grad_norm": 0.2930370271205902, "learning_rate": 7.831351717701469e-07, "loss": 0.3874, "step": 128030 }, { "epoch": 4.614372724979277, "grad_norm": 0.24373428523540497, "learning_rate": 7.82410666372041e-07, "loss": 0.3839, "step": 128035 }, { "epoch": 4.614552924640502, "grad_norm": 0.31576600670814514, "learning_rate": 
7.816864909339e-07, "loss": 0.384, "step": 128040 }, { "epoch": 4.614733124301726, "grad_norm": 0.2532004714012146, "learning_rate": 7.809626454655911e-07, "loss": 0.3523, "step": 128045 }, { "epoch": 4.614913323962951, "grad_norm": 0.2986234724521637, "learning_rate": 7.802391299769784e-07, "loss": 0.3987, "step": 128050 }, { "epoch": 4.615093523624176, "grad_norm": 0.24925264716148376, "learning_rate": 7.795159444779154e-07, "loss": 0.3647, "step": 128055 }, { "epoch": 4.6152737232854, "grad_norm": 0.19276399910449982, "learning_rate": 7.787930889782663e-07, "loss": 0.3356, "step": 128060 }, { "epoch": 4.6154539229466245, "grad_norm": 0.23550952970981598, "learning_rate": 7.780705634878621e-07, "loss": 0.4043, "step": 128065 }, { "epoch": 4.615634122607849, "grad_norm": 0.29155659675598145, "learning_rate": 7.773483680165617e-07, "loss": 0.3733, "step": 128070 }, { "epoch": 4.615814322269074, "grad_norm": 0.2625998854637146, "learning_rate": 7.766265025742014e-07, "loss": 0.3544, "step": 128075 }, { "epoch": 4.615994521930299, "grad_norm": 0.26157450675964355, "learning_rate": 7.759049671706125e-07, "loss": 0.3456, "step": 128080 }, { "epoch": 4.616174721591523, "grad_norm": 0.2234465479850769, "learning_rate": 7.751837618156316e-07, "loss": 0.4037, "step": 128085 }, { "epoch": 4.616354921252748, "grad_norm": 0.2910038232803345, "learning_rate": 7.744628865190784e-07, "loss": 0.384, "step": 128090 }, { "epoch": 4.616535120913973, "grad_norm": 0.28692278265953064, "learning_rate": 7.737423412907785e-07, "loss": 0.3577, "step": 128095 }, { "epoch": 4.616715320575198, "grad_norm": 0.23942017555236816, "learning_rate": 7.730221261405518e-07, "loss": 0.3879, "step": 128100 }, { "epoch": 4.616895520236422, "grad_norm": 0.23349037766456604, "learning_rate": 7.723022410782044e-07, "loss": 0.3775, "step": 128105 }, { "epoch": 4.617075719897647, "grad_norm": 0.2587268352508545, "learning_rate": 7.715826861135505e-07, "loss": 0.4072, "step": 128110 }, { "epoch": 
4.617255919558871, "grad_norm": 0.24212481081485748, "learning_rate": 7.708634612563936e-07, "loss": 0.3423, "step": 128115 }, { "epoch": 4.617436119220096, "grad_norm": 0.2429906278848648, "learning_rate": 7.701445665165258e-07, "loss": 0.3702, "step": 128120 }, { "epoch": 4.61761631888132, "grad_norm": 0.2274543046951294, "learning_rate": 7.694260019037502e-07, "loss": 0.3496, "step": 128125 }, { "epoch": 4.617796518542545, "grad_norm": 0.27825266122817993, "learning_rate": 7.687077674278564e-07, "loss": 0.4096, "step": 128130 }, { "epoch": 4.61797671820377, "grad_norm": 0.2594509720802307, "learning_rate": 7.679898630986254e-07, "loss": 0.3866, "step": 128135 }, { "epoch": 4.618156917864995, "grad_norm": 0.2519370913505554, "learning_rate": 7.67272288925841e-07, "loss": 0.3612, "step": 128140 }, { "epoch": 4.618337117526219, "grad_norm": 0.26362189650535583, "learning_rate": 7.665550449192788e-07, "loss": 0.3569, "step": 128145 }, { "epoch": 4.618517317187444, "grad_norm": 0.20409050583839417, "learning_rate": 7.658381310887142e-07, "loss": 0.4079, "step": 128150 }, { "epoch": 4.618697516848668, "grad_norm": 0.24394989013671875, "learning_rate": 7.651215474439172e-07, "loss": 0.3567, "step": 128155 }, { "epoch": 4.618877716509893, "grad_norm": 0.2227751463651657, "learning_rate": 7.644052939946411e-07, "loss": 0.4044, "step": 128160 }, { "epoch": 4.619057916171117, "grad_norm": 0.2579880952835083, "learning_rate": 7.636893707506532e-07, "loss": 0.3804, "step": 128165 }, { "epoch": 4.619238115832342, "grad_norm": 0.2594741880893707, "learning_rate": 7.629737777217011e-07, "loss": 0.3805, "step": 128170 }, { "epoch": 4.619418315493567, "grad_norm": 0.2516389787197113, "learning_rate": 7.622585149175465e-07, "loss": 0.3612, "step": 128175 }, { "epoch": 4.619598515154792, "grad_norm": 0.24397249519824982, "learning_rate": 7.615435823479233e-07, "loss": 0.3967, "step": 128180 }, { "epoch": 4.619778714816016, "grad_norm": 0.2816333472728729, "learning_rate": 
7.608289800225737e-07, "loss": 0.4114, "step": 128185 }, { "epoch": 4.619958914477241, "grad_norm": 0.19909071922302246, "learning_rate": 7.601147079512399e-07, "loss": 0.3857, "step": 128190 }, { "epoch": 4.620139114138466, "grad_norm": 0.2591770589351654, "learning_rate": 7.594007661436475e-07, "loss": 0.3494, "step": 128195 }, { "epoch": 4.6203193137996905, "grad_norm": 0.2472904771566391, "learning_rate": 7.586871546095303e-07, "loss": 0.3945, "step": 128200 }, { "epoch": 4.620499513460914, "grad_norm": 0.32332003116607666, "learning_rate": 7.579738733586056e-07, "loss": 0.3713, "step": 128205 }, { "epoch": 4.620679713122139, "grad_norm": 0.25053974986076355, "learning_rate": 7.572609224005905e-07, "loss": 0.3696, "step": 128210 }, { "epoch": 4.620859912783364, "grad_norm": 0.2518098056316376, "learning_rate": 7.565483017452024e-07, "loss": 0.3672, "step": 128215 }, { "epoch": 4.621040112444589, "grad_norm": 0.25849613547325134, "learning_rate": 7.558360114021528e-07, "loss": 0.3614, "step": 128220 }, { "epoch": 4.621220312105813, "grad_norm": 0.19711540639400482, "learning_rate": 7.551240513811425e-07, "loss": 0.3363, "step": 128225 }, { "epoch": 4.621400511767038, "grad_norm": 0.28743451833724976, "learning_rate": 7.544124216918746e-07, "loss": 0.3525, "step": 128230 }, { "epoch": 4.621580711428263, "grad_norm": 0.2442832887172699, "learning_rate": 7.537011223440415e-07, "loss": 0.3821, "step": 128235 }, { "epoch": 4.6217609110894875, "grad_norm": 0.23622441291809082, "learning_rate": 7.529901533473355e-07, "loss": 0.3843, "step": 128240 }, { "epoch": 4.621941110750711, "grad_norm": 0.274114191532135, "learning_rate": 7.522795147114486e-07, "loss": 0.364, "step": 128245 }, { "epoch": 4.622121310411936, "grad_norm": 0.28449776768684387, "learning_rate": 7.515692064460539e-07, "loss": 0.3848, "step": 128250 }, { "epoch": 4.622301510073161, "grad_norm": 0.2581150531768799, "learning_rate": 7.508592285608351e-07, "loss": 0.3404, "step": 128255 }, { "epoch": 
4.6224817097343855, "grad_norm": 0.25341930985450745, "learning_rate": 7.501495810654679e-07, "loss": 0.3227, "step": 128260 }, { "epoch": 4.62266190939561, "grad_norm": 0.24932630360126495, "learning_rate": 7.494402639696113e-07, "loss": 0.3482, "step": 128265 }, { "epoch": 4.622842109056835, "grad_norm": 0.2252834588289261, "learning_rate": 7.487312772829436e-07, "loss": 0.356, "step": 128270 }, { "epoch": 4.62302230871806, "grad_norm": 0.23649568855762482, "learning_rate": 7.480226210151098e-07, "loss": 0.3956, "step": 128275 }, { "epoch": 4.6232025083792845, "grad_norm": 0.20822374522686005, "learning_rate": 7.473142951757772e-07, "loss": 0.3501, "step": 128280 }, { "epoch": 4.623382708040509, "grad_norm": 0.2828695476055145, "learning_rate": 7.466062997745909e-07, "loss": 0.356, "step": 128285 }, { "epoch": 4.623562907701734, "grad_norm": 0.2521428167819977, "learning_rate": 7.458986348211988e-07, "loss": 0.4002, "step": 128290 }, { "epoch": 4.623743107362958, "grad_norm": 0.25071465969085693, "learning_rate": 7.45191300325243e-07, "loss": 0.4166, "step": 128295 }, { "epoch": 4.6239233070241825, "grad_norm": 0.2567957937717438, "learning_rate": 7.444842962963577e-07, "loss": 0.3731, "step": 128300 }, { "epoch": 4.624103506685407, "grad_norm": 0.2071949541568756, "learning_rate": 7.437776227441767e-07, "loss": 0.3451, "step": 128305 }, { "epoch": 4.624283706346632, "grad_norm": 0.2758256793022156, "learning_rate": 7.430712796783312e-07, "loss": 0.3545, "step": 128310 }, { "epoch": 4.624463906007857, "grad_norm": 0.26326316595077515, "learning_rate": 7.423652671084441e-07, "loss": 0.3581, "step": 128315 }, { "epoch": 4.6246441056690815, "grad_norm": 0.28544881939888, "learning_rate": 7.416595850441327e-07, "loss": 0.3611, "step": 128320 }, { "epoch": 4.624824305330306, "grad_norm": 0.2547943592071533, "learning_rate": 7.409542334950143e-07, "loss": 0.3622, "step": 128325 }, { "epoch": 4.625004504991531, "grad_norm": 0.24495618045330048, "learning_rate": 
7.402492124706922e-07, "loss": 0.3712, "step": 128330 }, { "epoch": 4.625184704652755, "grad_norm": 0.33649563789367676, "learning_rate": 7.395445219807839e-07, "loss": 0.3987, "step": 128335 }, { "epoch": 4.6253649043139795, "grad_norm": 0.2949081361293793, "learning_rate": 7.388401620348844e-07, "loss": 0.3823, "step": 128340 }, { "epoch": 4.625545103975204, "grad_norm": 0.23140224814414978, "learning_rate": 7.381361326425862e-07, "loss": 0.3957, "step": 128345 }, { "epoch": 4.625725303636429, "grad_norm": 0.2955685257911682, "learning_rate": 7.374324338134897e-07, "loss": 0.3733, "step": 128350 }, { "epoch": 4.625905503297654, "grad_norm": 0.2494993358850479, "learning_rate": 7.367290655571762e-07, "loss": 0.3525, "step": 128355 }, { "epoch": 4.626085702958878, "grad_norm": 0.21933454275131226, "learning_rate": 7.360260278832353e-07, "loss": 0.3365, "step": 128360 }, { "epoch": 4.626265902620103, "grad_norm": 0.26655271649360657, "learning_rate": 7.3532332080124e-07, "loss": 0.3693, "step": 128365 }, { "epoch": 4.626446102281328, "grad_norm": 0.25093698501586914, "learning_rate": 7.346209443207657e-07, "loss": 0.364, "step": 128370 }, { "epoch": 4.626626301942553, "grad_norm": 0.2572937309741974, "learning_rate": 7.339188984513828e-07, "loss": 0.369, "step": 128375 }, { "epoch": 4.626806501603777, "grad_norm": 0.2407890409231186, "learning_rate": 7.332171832026586e-07, "loss": 0.3885, "step": 128380 }, { "epoch": 4.626986701265002, "grad_norm": 0.2307957410812378, "learning_rate": 7.325157985841518e-07, "loss": 0.3407, "step": 128385 }, { "epoch": 4.627166900926226, "grad_norm": 0.3046651780605316, "learning_rate": 7.318147446054191e-07, "loss": 0.398, "step": 128390 }, { "epoch": 4.627347100587451, "grad_norm": 0.2512384355068207, "learning_rate": 7.311140212760109e-07, "loss": 0.3883, "step": 128395 }, { "epoch": 4.627527300248675, "grad_norm": 0.33214032649993896, "learning_rate": 7.304136286054753e-07, "loss": 0.3687, "step": 128400 }, { "epoch": 
4.6277074999099, "grad_norm": 0.22773124277591705, "learning_rate": 7.297135666033572e-07, "loss": 0.382, "step": 128405 }, { "epoch": 4.627887699571125, "grad_norm": 0.28903457522392273, "learning_rate": 7.290138352791936e-07, "loss": 0.365, "step": 128410 }, { "epoch": 4.62806789923235, "grad_norm": 0.26188942790031433, "learning_rate": 7.283144346425158e-07, "loss": 0.3868, "step": 128415 }, { "epoch": 4.628248098893574, "grad_norm": 0.2626602053642273, "learning_rate": 7.27615364702855e-07, "loss": 0.3717, "step": 128420 }, { "epoch": 4.628428298554799, "grad_norm": 0.21673083305358887, "learning_rate": 7.269166254697313e-07, "loss": 0.405, "step": 128425 }, { "epoch": 4.628608498216023, "grad_norm": 0.2583613991737366, "learning_rate": 7.26218216952676e-07, "loss": 0.372, "step": 128430 }, { "epoch": 4.628788697877248, "grad_norm": 0.2558477818965912, "learning_rate": 7.255201391611954e-07, "loss": 0.3432, "step": 128435 }, { "epoch": 4.628968897538472, "grad_norm": 0.23397044837474823, "learning_rate": 7.248223921048014e-07, "loss": 0.3572, "step": 128440 }, { "epoch": 4.629149097199697, "grad_norm": 0.23054839670658112, "learning_rate": 7.241249757930057e-07, "loss": 0.352, "step": 128445 }, { "epoch": 4.629329296860922, "grad_norm": 0.20861117541790009, "learning_rate": 7.234278902353036e-07, "loss": 0.3908, "step": 128450 }, { "epoch": 4.629509496522147, "grad_norm": 0.25009605288505554, "learning_rate": 7.227311354412014e-07, "loss": 0.3717, "step": 128455 }, { "epoch": 4.629689696183371, "grad_norm": 0.20757490396499634, "learning_rate": 7.220347114201803e-07, "loss": 0.3404, "step": 128460 }, { "epoch": 4.629869895844596, "grad_norm": 0.21684591472148895, "learning_rate": 7.213386181817411e-07, "loss": 0.3976, "step": 128465 }, { "epoch": 4.630050095505821, "grad_norm": 0.2429099678993225, "learning_rate": 7.206428557353623e-07, "loss": 0.3773, "step": 128470 }, { "epoch": 4.6302302951670455, "grad_norm": 0.22892560064792633, "learning_rate": 
7.199474240905224e-07, "loss": 0.3536, "step": 128475 }, { "epoch": 4.630410494828269, "grad_norm": 0.2792302072048187, "learning_rate": 7.192523232567e-07, "loss": 0.3563, "step": 128480 }, { "epoch": 4.630590694489494, "grad_norm": 0.3034243881702423, "learning_rate": 7.185575532433625e-07, "loss": 0.3868, "step": 128485 }, { "epoch": 4.630770894150719, "grad_norm": 0.2696426212787628, "learning_rate": 7.178631140599746e-07, "loss": 0.3712, "step": 128490 }, { "epoch": 4.630951093811944, "grad_norm": 0.2592863440513611, "learning_rate": 7.171690057160035e-07, "loss": 0.3662, "step": 128495 }, { "epoch": 4.631131293473168, "grad_norm": 0.27280524373054504, "learning_rate": 7.164752282209031e-07, "loss": 0.381, "step": 128500 }, { "epoch": 4.631131293473168, "eval_loss": 0.42883244156837463, "eval_runtime": 3.5159, "eval_samples_per_second": 28.443, "eval_steps_per_second": 7.111, "step": 128500 }, { "epoch": 4.631311493134393, "grad_norm": 0.27146467566490173, "learning_rate": 7.157817815841267e-07, "loss": 0.3605, "step": 128505 }, { "epoch": 4.631491692795618, "grad_norm": 0.2853062152862549, "learning_rate": 7.150886658151224e-07, "loss": 0.3779, "step": 128510 }, { "epoch": 4.6316718924568425, "grad_norm": 0.28604602813720703, "learning_rate": 7.143958809233297e-07, "loss": 0.3426, "step": 128515 }, { "epoch": 4.631852092118066, "grad_norm": 0.230341836810112, "learning_rate": 7.137034269181969e-07, "loss": 0.3912, "step": 128520 }, { "epoch": 4.632032291779291, "grad_norm": 0.24526306986808777, "learning_rate": 7.130113038091524e-07, "loss": 0.386, "step": 128525 }, { "epoch": 4.632212491440516, "grad_norm": 0.2500110864639282, "learning_rate": 7.12319511605622e-07, "loss": 0.3485, "step": 128530 }, { "epoch": 4.632392691101741, "grad_norm": 0.23886556923389435, "learning_rate": 7.116280503170398e-07, "loss": 0.381, "step": 128535 }, { "epoch": 4.632572890762965, "grad_norm": 0.19476771354675293, "learning_rate": 7.109369199528177e-07, "loss": 0.3623, "step": 
128540 }, { "epoch": 4.63275309042419, "grad_norm": 0.23774556815624237, "learning_rate": 7.102461205223843e-07, "loss": 0.3641, "step": 128545 }, { "epoch": 4.632933290085415, "grad_norm": 0.26775142550468445, "learning_rate": 7.095556520351432e-07, "loss": 0.3751, "step": 128550 }, { "epoch": 4.6331134897466395, "grad_norm": 0.24131718277931213, "learning_rate": 7.088655145005008e-07, "loss": 0.3709, "step": 128555 }, { "epoch": 4.633293689407864, "grad_norm": 0.2844558358192444, "learning_rate": 7.081757079278662e-07, "loss": 0.3807, "step": 128560 }, { "epoch": 4.633473889069089, "grad_norm": 0.303622305393219, "learning_rate": 7.074862323266318e-07, "loss": 0.3549, "step": 128565 }, { "epoch": 4.633654088730313, "grad_norm": 0.21193251013755798, "learning_rate": 7.067970877061958e-07, "loss": 0.3675, "step": 128570 }, { "epoch": 4.6338342883915375, "grad_norm": 0.2827516496181488, "learning_rate": 7.061082740759478e-07, "loss": 0.3776, "step": 128575 }, { "epoch": 4.634014488052762, "grad_norm": 0.2597757577896118, "learning_rate": 7.054197914452664e-07, "loss": 0.3736, "step": 128580 }, { "epoch": 4.634194687713987, "grad_norm": 0.23663856089115143, "learning_rate": 7.047316398235387e-07, "loss": 0.3633, "step": 128585 }, { "epoch": 4.634374887375212, "grad_norm": 0.22754615545272827, "learning_rate": 7.040438192201403e-07, "loss": 0.3843, "step": 128590 }, { "epoch": 4.6345550870364365, "grad_norm": 0.28135913610458374, "learning_rate": 7.033563296444417e-07, "loss": 0.354, "step": 128595 }, { "epoch": 4.634735286697661, "grad_norm": 0.31850412487983704, "learning_rate": 7.026691711058076e-07, "loss": 0.3579, "step": 128600 }, { "epoch": 4.634915486358886, "grad_norm": 0.24080584943294525, "learning_rate": 7.019823436136024e-07, "loss": 0.3839, "step": 128605 }, { "epoch": 4.63509568602011, "grad_norm": 0.2874895930290222, "learning_rate": 7.012958471771803e-07, "loss": 0.3642, "step": 128610 }, { "epoch": 4.6352758856813345, "grad_norm": 
0.24868187308311462, "learning_rate": 7.006096818059e-07, "loss": 0.3436, "step": 128615 }, { "epoch": 4.635456085342559, "grad_norm": 0.24118977785110474, "learning_rate": 6.999238475091096e-07, "loss": 0.3891, "step": 128620 }, { "epoch": 4.635636285003784, "grad_norm": 0.202513188123703, "learning_rate": 6.99238344296152e-07, "loss": 0.3655, "step": 128625 }, { "epoch": 4.635816484665009, "grad_norm": 0.24123841524124146, "learning_rate": 6.985531721763666e-07, "loss": 0.4058, "step": 128630 }, { "epoch": 4.6359966843262335, "grad_norm": 0.2426537424325943, "learning_rate": 6.978683311590877e-07, "loss": 0.3598, "step": 128635 }, { "epoch": 4.636176883987458, "grad_norm": 0.2862418591976166, "learning_rate": 6.971838212536524e-07, "loss": 0.3541, "step": 128640 }, { "epoch": 4.636357083648683, "grad_norm": 0.24865272641181946, "learning_rate": 6.96499642469381e-07, "loss": 0.3766, "step": 128645 }, { "epoch": 4.636537283309908, "grad_norm": 0.31950682401657104, "learning_rate": 6.958157948155936e-07, "loss": 0.347, "step": 128650 }, { "epoch": 4.636717482971132, "grad_norm": 0.2226303666830063, "learning_rate": 6.951322783016107e-07, "loss": 0.3515, "step": 128655 }, { "epoch": 4.636897682632357, "grad_norm": 0.22242146730422974, "learning_rate": 6.944490929367469e-07, "loss": 0.3747, "step": 128660 }, { "epoch": 4.637077882293581, "grad_norm": 0.3434227705001831, "learning_rate": 6.93766238730309e-07, "loss": 0.399, "step": 128665 }, { "epoch": 4.637258081954806, "grad_norm": 0.27048400044441223, "learning_rate": 6.930837156916004e-07, "loss": 0.3702, "step": 128670 }, { "epoch": 4.6374382816160304, "grad_norm": 0.2766757011413574, "learning_rate": 6.924015238299164e-07, "loss": 0.373, "step": 128675 }, { "epoch": 4.637618481277255, "grad_norm": 0.22949692606925964, "learning_rate": 6.91719663154558e-07, "loss": 0.3402, "step": 128680 }, { "epoch": 4.63779868093848, "grad_norm": 0.2594715654850006, "learning_rate": 6.910381336748123e-07, "loss": 0.3437, "step": 
128685 }, { "epoch": 4.637978880599705, "grad_norm": 0.24005542695522308, "learning_rate": 6.903569353999661e-07, "loss": 0.3574, "step": 128690 }, { "epoch": 4.638159080260929, "grad_norm": 0.2983018755912781, "learning_rate": 6.896760683393011e-07, "loss": 0.366, "step": 128695 }, { "epoch": 4.638339279922154, "grad_norm": 0.2457517385482788, "learning_rate": 6.889955325020903e-07, "loss": 0.3647, "step": 128700 }, { "epoch": 4.638519479583378, "grad_norm": 0.27278947830200195, "learning_rate": 6.883153278976123e-07, "loss": 0.3514, "step": 128705 }, { "epoch": 4.638699679244603, "grad_norm": 0.3145797848701477, "learning_rate": 6.876354545351293e-07, "loss": 0.3964, "step": 128710 }, { "epoch": 4.638879878905827, "grad_norm": 0.26128581166267395, "learning_rate": 6.869559124239061e-07, "loss": 0.3822, "step": 128715 }, { "epoch": 4.639060078567052, "grad_norm": 0.2640693187713623, "learning_rate": 6.862767015732019e-07, "loss": 0.3944, "step": 128720 }, { "epoch": 4.639240278228277, "grad_norm": 0.24374765157699585, "learning_rate": 6.855978219922676e-07, "loss": 0.3894, "step": 128725 }, { "epoch": 4.639420477889502, "grad_norm": 0.24732322990894318, "learning_rate": 6.849192736903598e-07, "loss": 0.3685, "step": 128730 }, { "epoch": 4.639600677550726, "grad_norm": 0.307091623544693, "learning_rate": 6.84241056676721e-07, "loss": 0.4219, "step": 128735 }, { "epoch": 4.639780877211951, "grad_norm": 0.2447475790977478, "learning_rate": 6.835631709605856e-07, "loss": 0.3844, "step": 128740 }, { "epoch": 4.639961076873176, "grad_norm": 0.21926872432231903, "learning_rate": 6.828856165511932e-07, "loss": 0.3567, "step": 128745 }, { "epoch": 4.640141276534401, "grad_norm": 0.27030664682388306, "learning_rate": 6.822083934577811e-07, "loss": 0.4035, "step": 128750 }, { "epoch": 4.640321476195624, "grad_norm": 0.28935354948043823, "learning_rate": 6.815315016895696e-07, "loss": 0.3833, "step": 128755 }, { "epoch": 4.640501675856849, "grad_norm": 0.25917571783065796, 
"learning_rate": 6.808549412557819e-07, "loss": 0.3641, "step": 128760 }, { "epoch": 4.640681875518074, "grad_norm": 0.2617967426776886, "learning_rate": 6.801787121656355e-07, "loss": 0.3447, "step": 128765 }, { "epoch": 4.640862075179299, "grad_norm": 0.23066262900829315, "learning_rate": 6.795028144283483e-07, "loss": 0.3868, "step": 128770 }, { "epoch": 4.641042274840523, "grad_norm": 0.20748838782310486, "learning_rate": 6.788272480531266e-07, "loss": 0.3589, "step": 128775 }, { "epoch": 4.641222474501748, "grad_norm": 0.2675074636936188, "learning_rate": 6.781520130491742e-07, "loss": 0.3952, "step": 128780 }, { "epoch": 4.641402674162973, "grad_norm": 0.25951436161994934, "learning_rate": 6.774771094256949e-07, "loss": 0.3544, "step": 128785 }, { "epoch": 4.6415828738241975, "grad_norm": 0.23791882395744324, "learning_rate": 6.768025371918785e-07, "loss": 0.379, "step": 128790 }, { "epoch": 4.641763073485421, "grad_norm": 0.2497406154870987, "learning_rate": 6.761282963569149e-07, "loss": 0.3833, "step": 128795 }, { "epoch": 4.641943273146646, "grad_norm": 0.2554693818092346, "learning_rate": 6.754543869299996e-07, "loss": 0.3814, "step": 128800 }, { "epoch": 4.642123472807871, "grad_norm": 0.22991681098937988, "learning_rate": 6.747808089203056e-07, "loss": 0.3698, "step": 128805 }, { "epoch": 4.642303672469096, "grad_norm": 0.2903478741645813, "learning_rate": 6.741075623370147e-07, "loss": 0.3504, "step": 128810 }, { "epoch": 4.64248387213032, "grad_norm": 0.21784360706806183, "learning_rate": 6.734346471893e-07, "loss": 0.3714, "step": 128815 }, { "epoch": 4.642664071791545, "grad_norm": 0.2957072854042053, "learning_rate": 6.727620634863235e-07, "loss": 0.3464, "step": 128820 }, { "epoch": 4.64284427145277, "grad_norm": 0.2421649545431137, "learning_rate": 6.720898112372614e-07, "loss": 0.354, "step": 128825 }, { "epoch": 4.6430244711139945, "grad_norm": 0.21480095386505127, "learning_rate": 6.714178904512619e-07, "loss": 0.3823, "step": 128830 }, { 
"epoch": 4.643204670775219, "grad_norm": 0.22311148047447205, "learning_rate": 6.707463011374815e-07, "loss": 0.3979, "step": 128835 }, { "epoch": 4.643384870436444, "grad_norm": 0.2693723440170288, "learning_rate": 6.70075043305074e-07, "loss": 0.3854, "step": 128840 }, { "epoch": 4.643565070097668, "grad_norm": 0.29857292771339417, "learning_rate": 6.69404116963182e-07, "loss": 0.3685, "step": 128845 }, { "epoch": 4.643745269758893, "grad_norm": 0.2625799775123596, "learning_rate": 6.68733522120954e-07, "loss": 0.3603, "step": 128850 }, { "epoch": 4.643925469420117, "grad_norm": 0.2390749603509903, "learning_rate": 6.680632587875185e-07, "loss": 0.3602, "step": 128855 }, { "epoch": 4.644105669081342, "grad_norm": 0.28638026118278503, "learning_rate": 6.673933269720072e-07, "loss": 0.3673, "step": 128860 }, { "epoch": 4.644285868742567, "grad_norm": 0.2660246789455414, "learning_rate": 6.667237266835519e-07, "loss": 0.3327, "step": 128865 }, { "epoch": 4.6444660684037915, "grad_norm": 0.218661829829216, "learning_rate": 6.660544579312755e-07, "loss": 0.3471, "step": 128870 }, { "epoch": 4.644646268065016, "grad_norm": 0.2507161498069763, "learning_rate": 6.65385520724296e-07, "loss": 0.3942, "step": 128875 }, { "epoch": 4.644826467726241, "grad_norm": 0.2658718526363373, "learning_rate": 6.64716915071728e-07, "loss": 0.3769, "step": 128880 }, { "epoch": 4.645006667387465, "grad_norm": 0.24641652405261993, "learning_rate": 6.640486409826785e-07, "loss": 0.3482, "step": 128885 }, { "epoch": 4.6451868670486895, "grad_norm": 0.27895164489746094, "learning_rate": 6.633806984662566e-07, "loss": 0.3843, "step": 128890 }, { "epoch": 4.645367066709914, "grad_norm": 0.2246672362089157, "learning_rate": 6.627130875315607e-07, "loss": 0.3708, "step": 128895 }, { "epoch": 4.645547266371139, "grad_norm": 0.2589907944202423, "learning_rate": 6.620458081876891e-07, "loss": 0.364, "step": 128900 }, { "epoch": 4.645727466032364, "grad_norm": 0.31070613861083984, "learning_rate": 
6.613788604437288e-07, "loss": 0.3812, "step": 128905 }, { "epoch": 4.6459076656935885, "grad_norm": 0.24168938398361206, "learning_rate": 6.607122443087672e-07, "loss": 0.3644, "step": 128910 }, { "epoch": 4.646087865354813, "grad_norm": 0.24737270176410675, "learning_rate": 6.60045959791894e-07, "loss": 0.361, "step": 128915 }, { "epoch": 4.646268065016038, "grad_norm": 0.24914908409118652, "learning_rate": 6.593800069021827e-07, "loss": 0.3888, "step": 128920 }, { "epoch": 4.646448264677263, "grad_norm": 0.24401962757110596, "learning_rate": 6.587143856487011e-07, "loss": 0.3582, "step": 128925 }, { "epoch": 4.646628464338487, "grad_norm": 0.24380113184452057, "learning_rate": 6.58049096040525e-07, "loss": 0.3765, "step": 128930 }, { "epoch": 4.646808663999712, "grad_norm": 0.241029292345047, "learning_rate": 6.573841380867197e-07, "loss": 0.3522, "step": 128935 }, { "epoch": 4.646988863660936, "grad_norm": 0.229360893368721, "learning_rate": 6.567195117963415e-07, "loss": 0.36, "step": 128940 }, { "epoch": 4.647169063322161, "grad_norm": 0.2620939016342163, "learning_rate": 6.560552171784473e-07, "loss": 0.4135, "step": 128945 }, { "epoch": 4.6473492629833855, "grad_norm": 0.2812032997608185, "learning_rate": 6.553912542420826e-07, "loss": 0.3603, "step": 128950 }, { "epoch": 4.64752946264461, "grad_norm": 0.27785706520080566, "learning_rate": 6.54727622996304e-07, "loss": 0.3793, "step": 128955 }, { "epoch": 4.647709662305835, "grad_norm": 0.2647446095943451, "learning_rate": 6.540643234501487e-07, "loss": 0.3661, "step": 128960 }, { "epoch": 4.64788986196706, "grad_norm": 0.28079158067703247, "learning_rate": 6.534013556126511e-07, "loss": 0.3658, "step": 128965 }, { "epoch": 4.648070061628284, "grad_norm": 0.291545569896698, "learning_rate": 6.527387194928486e-07, "loss": 0.3871, "step": 128970 }, { "epoch": 4.648250261289509, "grad_norm": 0.24639180302619934, "learning_rate": 6.520764150997644e-07, "loss": 0.3754, "step": 128975 }, { "epoch": 
4.648430460950733, "grad_norm": 0.1946740746498108, "learning_rate": 6.514144424424246e-07, "loss": 0.3622, "step": 128980 }, { "epoch": 4.648610660611958, "grad_norm": 0.24029649794101715, "learning_rate": 6.507528015298525e-07, "loss": 0.3713, "step": 128985 }, { "epoch": 4.6487908602731824, "grad_norm": 0.22794067859649658, "learning_rate": 6.500914923710577e-07, "loss": 0.3872, "step": 128990 }, { "epoch": 4.648971059934407, "grad_norm": 0.2424376904964447, "learning_rate": 6.494305149750524e-07, "loss": 0.3783, "step": 128995 }, { "epoch": 4.649151259595632, "grad_norm": 0.27544233202934265, "learning_rate": 6.487698693508432e-07, "loss": 0.3789, "step": 129000 }, { "epoch": 4.649151259595632, "eval_loss": 0.4288814663887024, "eval_runtime": 3.5274, "eval_samples_per_second": 28.349, "eval_steps_per_second": 7.087, "step": 129000 }, { "epoch": 4.649331459256857, "grad_norm": 0.23391346633434296, "learning_rate": 6.481095555074257e-07, "loss": 0.3823, "step": 129005 }, { "epoch": 4.649511658918081, "grad_norm": 0.3378024697303772, "learning_rate": 6.474495734538038e-07, "loss": 0.381, "step": 129010 }, { "epoch": 4.649691858579306, "grad_norm": 0.23834291100502014, "learning_rate": 6.467899231989704e-07, "loss": 0.3852, "step": 129015 }, { "epoch": 4.649872058240531, "grad_norm": 0.2810116410255432, "learning_rate": 6.461306047519017e-07, "loss": 0.3648, "step": 129020 }, { "epoch": 4.650052257901756, "grad_norm": 0.2411430925130844, "learning_rate": 6.45471618121593e-07, "loss": 0.3573, "step": 129025 }, { "epoch": 4.650232457562979, "grad_norm": 0.22861650586128235, "learning_rate": 6.448129633170153e-07, "loss": 0.3441, "step": 129030 }, { "epoch": 4.650412657224204, "grad_norm": 0.29254698753356934, "learning_rate": 6.441546403471499e-07, "loss": 0.3794, "step": 129035 }, { "epoch": 4.650592856885429, "grad_norm": 0.2996603548526764, "learning_rate": 6.434966492209621e-07, "loss": 0.375, "step": 129040 }, { "epoch": 4.650773056546654, "grad_norm": 
0.29424965381622314, "learning_rate": 6.428389899474113e-07, "loss": 0.3789, "step": 129045 }, { "epoch": 4.650953256207878, "grad_norm": 0.22816717624664307, "learning_rate": 6.421816625354682e-07, "loss": 0.3889, "step": 129050 }, { "epoch": 4.651133455869103, "grad_norm": 0.29083213210105896, "learning_rate": 6.415246669940838e-07, "loss": 0.3914, "step": 129055 }, { "epoch": 4.651313655530328, "grad_norm": 0.24897891283035278, "learning_rate": 6.408680033322096e-07, "loss": 0.382, "step": 129060 }, { "epoch": 4.651493855191553, "grad_norm": 0.2584581971168518, "learning_rate": 6.402116715587908e-07, "loss": 0.41, "step": 129065 }, { "epoch": 4.651674054852776, "grad_norm": 0.26626285910606384, "learning_rate": 6.395556716827705e-07, "loss": 0.3482, "step": 129070 }, { "epoch": 4.651854254514001, "grad_norm": 0.29574957489967346, "learning_rate": 6.389000037130916e-07, "loss": 0.3603, "step": 129075 }, { "epoch": 4.652034454175226, "grad_norm": 0.21474981307983398, "learning_rate": 6.382446676586828e-07, "loss": 0.3585, "step": 129080 }, { "epoch": 4.652214653836451, "grad_norm": 0.25126680731773376, "learning_rate": 6.375896635284734e-07, "loss": 0.385, "step": 129085 }, { "epoch": 4.652394853497675, "grad_norm": 0.20911748707294464, "learning_rate": 6.369349913313893e-07, "loss": 0.3573, "step": 129090 }, { "epoch": 4.6525750531589, "grad_norm": 0.24984316527843475, "learning_rate": 6.362806510763458e-07, "loss": 0.3599, "step": 129095 }, { "epoch": 4.652755252820125, "grad_norm": 0.20926958322525024, "learning_rate": 6.356266427722663e-07, "loss": 0.37, "step": 129100 }, { "epoch": 4.6529354524813495, "grad_norm": 0.23557382822036743, "learning_rate": 6.349729664280546e-07, "loss": 0.3507, "step": 129105 }, { "epoch": 4.653115652142574, "grad_norm": 0.27930939197540283, "learning_rate": 6.343196220526176e-07, "loss": 0.3653, "step": 129110 }, { "epoch": 4.653295851803799, "grad_norm": 0.20511141419410706, "learning_rate": 6.336666096548593e-07, "loss": 
0.3598, "step": 129115 }, { "epoch": 4.653476051465024, "grad_norm": 0.2338797003030777, "learning_rate": 6.330139292436782e-07, "loss": 0.3693, "step": 129120 }, { "epoch": 4.653656251126248, "grad_norm": 0.29057690501213074, "learning_rate": 6.323615808279587e-07, "loss": 0.3527, "step": 129125 }, { "epoch": 4.653836450787472, "grad_norm": 0.26193034648895264, "learning_rate": 6.317095644166021e-07, "loss": 0.379, "step": 129130 }, { "epoch": 4.654016650448697, "grad_norm": 0.2552226781845093, "learning_rate": 6.310578800184791e-07, "loss": 0.3968, "step": 129135 }, { "epoch": 4.654196850109922, "grad_norm": 0.22633424401283264, "learning_rate": 6.30406527642477e-07, "loss": 0.3954, "step": 129140 }, { "epoch": 4.6543770497711465, "grad_norm": 0.2168351113796234, "learning_rate": 6.297555072974692e-07, "loss": 0.4123, "step": 129145 }, { "epoch": 4.654557249432371, "grad_norm": 0.2150443196296692, "learning_rate": 6.291048189923238e-07, "loss": 0.3593, "step": 129150 }, { "epoch": 4.654737449093596, "grad_norm": 0.2785971462726593, "learning_rate": 6.284544627359057e-07, "loss": 0.365, "step": 129155 }, { "epoch": 4.654917648754821, "grad_norm": 0.28096702694892883, "learning_rate": 6.278044385370774e-07, "loss": 0.3482, "step": 129160 }, { "epoch": 4.655097848416045, "grad_norm": 0.2804014980792999, "learning_rate": 6.271547464046928e-07, "loss": 0.3895, "step": 129165 }, { "epoch": 4.655278048077269, "grad_norm": 0.26343923807144165, "learning_rate": 6.265053863476089e-07, "loss": 0.4001, "step": 129170 }, { "epoch": 4.655458247738494, "grad_norm": 0.23427803814411163, "learning_rate": 6.258563583746713e-07, "loss": 0.3745, "step": 129175 }, { "epoch": 4.655638447399719, "grad_norm": 0.24222131073474884, "learning_rate": 6.252076624947201e-07, "loss": 0.3622, "step": 129180 }, { "epoch": 4.6558186470609435, "grad_norm": 0.25545430183410645, "learning_rate": 6.245592987165955e-07, "loss": 0.388, "step": 129185 }, { "epoch": 4.655998846722168, "grad_norm": 
0.30208820104599, "learning_rate": 6.239112670491293e-07, "loss": 0.4266, "step": 129190 }, { "epoch": 4.656179046383393, "grad_norm": 0.26093167066574097, "learning_rate": 6.232635675011562e-07, "loss": 0.3532, "step": 129195 }, { "epoch": 4.656359246044618, "grad_norm": 0.3261514902114868, "learning_rate": 6.226162000814967e-07, "loss": 0.3773, "step": 129200 }, { "epoch": 4.656539445705842, "grad_norm": 0.22967924177646637, "learning_rate": 6.219691647989689e-07, "loss": 0.3761, "step": 129205 }, { "epoch": 4.656719645367067, "grad_norm": 0.23772135376930237, "learning_rate": 6.213224616623964e-07, "loss": 0.3674, "step": 129210 }, { "epoch": 4.656899845028291, "grad_norm": 0.24498851597309113, "learning_rate": 6.206760906805803e-07, "loss": 0.3609, "step": 129215 }, { "epoch": 4.657080044689516, "grad_norm": 0.22295355796813965, "learning_rate": 6.200300518623387e-07, "loss": 0.3527, "step": 129220 }, { "epoch": 4.6572602443507405, "grad_norm": 0.2808474600315094, "learning_rate": 6.193843452164616e-07, "loss": 0.3905, "step": 129225 }, { "epoch": 4.657440444011965, "grad_norm": 0.2916402816772461, "learning_rate": 6.187389707517532e-07, "loss": 0.385, "step": 129230 }, { "epoch": 4.65762064367319, "grad_norm": 0.26787781715393066, "learning_rate": 6.180939284770093e-07, "loss": 0.3638, "step": 129235 }, { "epoch": 4.657800843334415, "grad_norm": 0.29607486724853516, "learning_rate": 6.174492184010117e-07, "loss": 0.3698, "step": 129240 }, { "epoch": 4.657981042995639, "grad_norm": 0.2748851478099823, "learning_rate": 6.168048405325505e-07, "loss": 0.3977, "step": 129245 }, { "epoch": 4.658161242656864, "grad_norm": 0.2305481731891632, "learning_rate": 6.161607948804021e-07, "loss": 0.3268, "step": 129250 }, { "epoch": 4.658341442318088, "grad_norm": 0.20638981461524963, "learning_rate": 6.155170814533401e-07, "loss": 0.3884, "step": 129255 }, { "epoch": 4.658521641979313, "grad_norm": 0.25829213857650757, "learning_rate": 6.148737002601379e-07, "loss": 0.4005, 
"step": 129260 }, { "epoch": 4.6587018416405375, "grad_norm": 0.28095561265945435, "learning_rate": 6.142306513095608e-07, "loss": 0.373, "step": 129265 }, { "epoch": 4.658882041301762, "grad_norm": 0.22143559157848358, "learning_rate": 6.135879346103712e-07, "loss": 0.359, "step": 129270 }, { "epoch": 4.659062240962987, "grad_norm": 0.24161243438720703, "learning_rate": 6.129455501713233e-07, "loss": 0.3702, "step": 129275 }, { "epoch": 4.659242440624212, "grad_norm": 0.20303134620189667, "learning_rate": 6.123034980011683e-07, "loss": 0.3933, "step": 129280 }, { "epoch": 4.659422640285436, "grad_norm": 0.29990285634994507, "learning_rate": 6.116617781086603e-07, "loss": 0.387, "step": 129285 }, { "epoch": 4.659602839946661, "grad_norm": 0.24720498919487, "learning_rate": 6.110203905025397e-07, "loss": 0.3566, "step": 129290 }, { "epoch": 4.659783039607886, "grad_norm": 0.2561245560646057, "learning_rate": 6.10379335191541e-07, "loss": 0.3665, "step": 129295 }, { "epoch": 4.659963239269111, "grad_norm": 0.24636390805244446, "learning_rate": 6.097386121844045e-07, "loss": 0.3752, "step": 129300 }, { "epoch": 4.6601434389303344, "grad_norm": 0.22616252303123474, "learning_rate": 6.090982214898567e-07, "loss": 0.3541, "step": 129305 }, { "epoch": 4.660323638591559, "grad_norm": 0.2326437532901764, "learning_rate": 6.084581631166208e-07, "loss": 0.4103, "step": 129310 }, { "epoch": 4.660503838252784, "grad_norm": 0.26222485303878784, "learning_rate": 6.078184370734236e-07, "loss": 0.3368, "step": 129315 }, { "epoch": 4.660684037914009, "grad_norm": 0.20036746561527252, "learning_rate": 6.071790433689744e-07, "loss": 0.3811, "step": 129320 }, { "epoch": 4.660864237575233, "grad_norm": 0.3011195659637451, "learning_rate": 6.065399820119888e-07, "loss": 0.3701, "step": 129325 }, { "epoch": 4.661044437236458, "grad_norm": 0.3017103672027588, "learning_rate": 6.059012530111763e-07, "loss": 0.3723, "step": 129330 }, { "epoch": 4.661224636897683, "grad_norm": 
0.2928502857685089, "learning_rate": 6.052628563752355e-07, "loss": 0.4019, "step": 129335 }, { "epoch": 4.661404836558908, "grad_norm": 0.26223188638687134, "learning_rate": 6.046247921128623e-07, "loss": 0.376, "step": 129340 }, { "epoch": 4.661585036220131, "grad_norm": 0.25763529539108276, "learning_rate": 6.039870602327552e-07, "loss": 0.383, "step": 129345 }, { "epoch": 4.661765235881356, "grad_norm": 0.3097740709781647, "learning_rate": 6.03349660743599e-07, "loss": 0.3831, "step": 129350 }, { "epoch": 4.661945435542581, "grad_norm": 0.21880759298801422, "learning_rate": 6.027125936540839e-07, "loss": 0.325, "step": 129355 }, { "epoch": 4.662125635203806, "grad_norm": 0.27281638979911804, "learning_rate": 6.020758589728864e-07, "loss": 0.3572, "step": 129360 }, { "epoch": 4.66230583486503, "grad_norm": 0.26208505034446716, "learning_rate": 6.014394567086801e-07, "loss": 0.3569, "step": 129365 }, { "epoch": 4.662486034526255, "grad_norm": 0.2751930058002472, "learning_rate": 6.008033868701357e-07, "loss": 0.3527, "step": 129370 }, { "epoch": 4.66266623418748, "grad_norm": 0.2358606606721878, "learning_rate": 6.001676494659214e-07, "loss": 0.3303, "step": 129375 }, { "epoch": 4.662846433848705, "grad_norm": 0.22096312046051025, "learning_rate": 5.995322445047025e-07, "loss": 0.3917, "step": 129380 }, { "epoch": 4.663026633509929, "grad_norm": 0.2579168677330017, "learning_rate": 5.988971719951331e-07, "loss": 0.41, "step": 129385 }, { "epoch": 4.663206833171154, "grad_norm": 0.29768040776252747, "learning_rate": 5.982624319458618e-07, "loss": 0.3481, "step": 129390 }, { "epoch": 4.663387032832379, "grad_norm": 0.24240122735500336, "learning_rate": 5.976280243655403e-07, "loss": 0.3567, "step": 129395 }, { "epoch": 4.663567232493603, "grad_norm": 0.3015746772289276, "learning_rate": 5.96993949262814e-07, "loss": 0.3612, "step": 129400 }, { "epoch": 4.663747432154827, "grad_norm": 0.2500225901603699, "learning_rate": 5.963602066463236e-07, "loss": 0.3912, 
"step": 129405 }, { "epoch": 4.663927631816052, "grad_norm": 0.25465816259384155, "learning_rate": 5.957267965246982e-07, "loss": 0.3754, "step": 129410 }, { "epoch": 4.664107831477277, "grad_norm": 0.24899233877658844, "learning_rate": 5.95093718906567e-07, "loss": 0.3582, "step": 129415 }, { "epoch": 4.6642880311385015, "grad_norm": 0.22887267172336578, "learning_rate": 5.944609738005618e-07, "loss": 0.3409, "step": 129420 }, { "epoch": 4.664468230799726, "grad_norm": 0.26909008622169495, "learning_rate": 5.93828561215301e-07, "loss": 0.3867, "step": 129425 }, { "epoch": 4.664648430460951, "grad_norm": 0.2556828558444977, "learning_rate": 5.931964811594026e-07, "loss": 0.356, "step": 129430 }, { "epoch": 4.664828630122176, "grad_norm": 0.25458240509033203, "learning_rate": 5.925647336414735e-07, "loss": 0.4164, "step": 129435 }, { "epoch": 4.6650088297834, "grad_norm": 0.26027604937553406, "learning_rate": 5.919333186701265e-07, "loss": 0.3915, "step": 129440 }, { "epoch": 4.665189029444624, "grad_norm": 0.23065000772476196, "learning_rate": 5.9130223625396e-07, "loss": 0.3516, "step": 129445 }, { "epoch": 4.665369229105849, "grad_norm": 0.2661188542842865, "learning_rate": 5.906714864015783e-07, "loss": 0.3747, "step": 129450 }, { "epoch": 4.665549428767074, "grad_norm": 0.25803518295288086, "learning_rate": 5.900410691215719e-07, "loss": 0.3264, "step": 129455 }, { "epoch": 4.6657296284282985, "grad_norm": 0.24731706082820892, "learning_rate": 5.894109844225281e-07, "loss": 0.3546, "step": 129460 }, { "epoch": 4.665909828089523, "grad_norm": 0.21868941187858582, "learning_rate": 5.88781232313032e-07, "loss": 0.3743, "step": 129465 }, { "epoch": 4.666090027750748, "grad_norm": 0.2019900679588318, "learning_rate": 5.881518128016655e-07, "loss": 0.3616, "step": 129470 }, { "epoch": 4.666270227411973, "grad_norm": 0.37569132447242737, "learning_rate": 5.875227258970078e-07, "loss": 0.3833, "step": 129475 }, { "epoch": 4.6664504270731975, "grad_norm": 
0.2718876600265503, "learning_rate": 5.868939716076244e-07, "loss": 0.3947, "step": 129480 }, { "epoch": 4.666630626734422, "grad_norm": 0.23850391805171967, "learning_rate": 5.86265549942086e-07, "loss": 0.3544, "step": 129485 }, { "epoch": 4.666810826395646, "grad_norm": 0.28313085436820984, "learning_rate": 5.856374609089526e-07, "loss": 0.3506, "step": 129490 }, { "epoch": 4.666991026056871, "grad_norm": 0.21799631416797638, "learning_rate": 5.850097045167785e-07, "loss": 0.3657, "step": 129495 }, { "epoch": 4.6671712257180955, "grad_norm": 0.25782206654548645, "learning_rate": 5.843822807741261e-07, "loss": 0.3886, "step": 129500 }, { "epoch": 4.6671712257180955, "eval_loss": 0.4288308322429657, "eval_runtime": 3.5293, "eval_samples_per_second": 28.335, "eval_steps_per_second": 7.084, "step": 129500 }, { "epoch": 4.66735142537932, "grad_norm": 0.2763291001319885, "learning_rate": 5.837551896895305e-07, "loss": 0.373, "step": 129505 }, { "epoch": 4.667531625040545, "grad_norm": 0.3070751130580902, "learning_rate": 5.831284312715485e-07, "loss": 0.3634, "step": 129510 }, { "epoch": 4.66771182470177, "grad_norm": 0.3483225703239441, "learning_rate": 5.82502005528715e-07, "loss": 0.3704, "step": 129515 }, { "epoch": 4.667892024362994, "grad_norm": 0.20731894671916962, "learning_rate": 5.818759124695622e-07, "loss": 0.3615, "step": 129520 }, { "epoch": 4.668072224024219, "grad_norm": 0.25448137521743774, "learning_rate": 5.81250152102622e-07, "loss": 0.3756, "step": 129525 }, { "epoch": 4.668252423685443, "grad_norm": 0.3217816948890686, "learning_rate": 5.806247244364238e-07, "loss": 0.4025, "step": 129530 }, { "epoch": 4.668432623346668, "grad_norm": 0.21159349381923676, "learning_rate": 5.799996294794801e-07, "loss": 0.382, "step": 129535 }, { "epoch": 4.6686128230078925, "grad_norm": 0.2236149162054062, "learning_rate": 5.793748672403204e-07, "loss": 0.3662, "step": 129540 }, { "epoch": 4.668793022669117, "grad_norm": 0.262952595949173, "learning_rate": 
5.78750437727446e-07, "loss": 0.3876, "step": 129545 }, { "epoch": 4.668973222330342, "grad_norm": 0.27944156527519226, "learning_rate": 5.781263409493698e-07, "loss": 0.3889, "step": 129550 }, { "epoch": 4.669153421991567, "grad_norm": 0.24670124053955078, "learning_rate": 5.775025769145959e-07, "loss": 0.3498, "step": 129555 }, { "epoch": 4.669333621652791, "grad_norm": 0.2797219455242157, "learning_rate": 5.768791456316175e-07, "loss": 0.3675, "step": 129560 }, { "epoch": 4.669513821314016, "grad_norm": 0.19079472124576569, "learning_rate": 5.762560471089334e-07, "loss": 0.3674, "step": 129565 }, { "epoch": 4.669694020975241, "grad_norm": 0.20952509343624115, "learning_rate": 5.756332813550369e-07, "loss": 0.3384, "step": 129570 }, { "epoch": 4.669874220636466, "grad_norm": 0.2480088174343109, "learning_rate": 5.750108483784017e-07, "loss": 0.3942, "step": 129575 }, { "epoch": 4.6700544202976895, "grad_norm": 0.19479909539222717, "learning_rate": 5.743887481875182e-07, "loss": 0.3673, "step": 129580 }, { "epoch": 4.670234619958914, "grad_norm": 0.27210843563079834, "learning_rate": 5.737669807908546e-07, "loss": 0.374, "step": 129585 }, { "epoch": 4.670414819620139, "grad_norm": 0.25238466262817383, "learning_rate": 5.731455461968932e-07, "loss": 0.3692, "step": 129590 }, { "epoch": 4.670595019281364, "grad_norm": 0.3293459117412567, "learning_rate": 5.725244444140937e-07, "loss": 0.365, "step": 129595 }, { "epoch": 4.670775218942588, "grad_norm": 0.25170597434043884, "learning_rate": 5.719036754509161e-07, "loss": 0.4202, "step": 129600 }, { "epoch": 4.670955418603813, "grad_norm": 0.22425323724746704, "learning_rate": 5.712832393158229e-07, "loss": 0.3636, "step": 129605 }, { "epoch": 4.671135618265038, "grad_norm": 0.3065611720085144, "learning_rate": 5.706631360172659e-07, "loss": 0.3638, "step": 129610 }, { "epoch": 4.671315817926263, "grad_norm": 0.2282622754573822, "learning_rate": 5.700433655636939e-07, "loss": 0.3536, "step": 129615 }, { "epoch": 
4.6714960175874864, "grad_norm": 0.22002369165420532, "learning_rate": 5.694239279635527e-07, "loss": 0.3777, "step": 129620 }, { "epoch": 4.671676217248711, "grad_norm": 0.263820081949234, "learning_rate": 5.688048232252746e-07, "loss": 0.3732, "step": 129625 }, { "epoch": 4.671856416909936, "grad_norm": 0.24897272884845734, "learning_rate": 5.681860513573084e-07, "loss": 0.3497, "step": 129630 }, { "epoch": 4.672036616571161, "grad_norm": 0.2410743087530136, "learning_rate": 5.675676123680724e-07, "loss": 0.3739, "step": 129635 }, { "epoch": 4.672216816232385, "grad_norm": 0.28402799367904663, "learning_rate": 5.669495062660013e-07, "loss": 0.3734, "step": 129640 }, { "epoch": 4.67239701589361, "grad_norm": 0.22168543934822083, "learning_rate": 5.663317330595108e-07, "loss": 0.3817, "step": 129645 }, { "epoch": 4.672577215554835, "grad_norm": 0.25576576590538025, "learning_rate": 5.657142927570163e-07, "loss": 0.4007, "step": 129650 }, { "epoch": 4.67275741521606, "grad_norm": 0.21974904835224152, "learning_rate": 5.65097185366939e-07, "loss": 0.399, "step": 129655 }, { "epoch": 4.672937614877284, "grad_norm": 0.3177945911884308, "learning_rate": 5.644804108976804e-07, "loss": 0.3589, "step": 129660 }, { "epoch": 4.673117814538509, "grad_norm": 0.3027752637863159, "learning_rate": 5.638639693576447e-07, "loss": 0.3754, "step": 129665 }, { "epoch": 4.673298014199734, "grad_norm": 0.3068021237850189, "learning_rate": 5.63247860755231e-07, "loss": 0.3751, "step": 129670 }, { "epoch": 4.673478213860958, "grad_norm": 0.2814277410507202, "learning_rate": 5.626320850988353e-07, "loss": 0.3641, "step": 129675 }, { "epoch": 4.673658413522182, "grad_norm": 0.2657163441181183, "learning_rate": 5.620166423968454e-07, "loss": 0.3845, "step": 129680 }, { "epoch": 4.673838613183407, "grad_norm": 0.21708977222442627, "learning_rate": 5.614015326576489e-07, "loss": 0.3781, "step": 129685 }, { "epoch": 4.674018812844632, "grad_norm": 0.2891923189163208, "learning_rate": 
5.607867558896224e-07, "loss": 0.3721, "step": 129690 }, { "epoch": 4.674199012505857, "grad_norm": 0.25727081298828125, "learning_rate": 5.601723121011482e-07, "loss": 0.32, "step": 129695 }, { "epoch": 4.674379212167081, "grad_norm": 0.18455742299556732, "learning_rate": 5.595582013005918e-07, "loss": 0.3469, "step": 129700 }, { "epoch": 4.674559411828306, "grad_norm": 0.27723222970962524, "learning_rate": 5.589444234963214e-07, "loss": 0.3841, "step": 129705 }, { "epoch": 4.674739611489531, "grad_norm": 0.2724769115447998, "learning_rate": 5.583309786967084e-07, "loss": 0.3844, "step": 129710 }, { "epoch": 4.674919811150755, "grad_norm": 0.25393152236938477, "learning_rate": 5.577178669100986e-07, "loss": 0.4236, "step": 129715 }, { "epoch": 4.675100010811979, "grad_norm": 0.2612922489643097, "learning_rate": 5.571050881448492e-07, "loss": 0.3766, "step": 129720 }, { "epoch": 4.675280210473204, "grad_norm": 0.2447577714920044, "learning_rate": 5.56492642409312e-07, "loss": 0.3888, "step": 129725 }, { "epoch": 4.675460410134429, "grad_norm": 0.30114662647247314, "learning_rate": 5.55880529711833e-07, "loss": 0.3967, "step": 129730 }, { "epoch": 4.6756406097956535, "grad_norm": 0.24598009884357452, "learning_rate": 5.552687500607473e-07, "loss": 0.376, "step": 129735 }, { "epoch": 4.675820809456878, "grad_norm": 0.23333467543125153, "learning_rate": 5.546573034643926e-07, "loss": 0.373, "step": 129740 }, { "epoch": 4.676001009118103, "grad_norm": 0.3097272515296936, "learning_rate": 5.540461899310956e-07, "loss": 0.4054, "step": 129745 }, { "epoch": 4.676181208779328, "grad_norm": 0.2813061773777008, "learning_rate": 5.534354094691912e-07, "loss": 0.3875, "step": 129750 }, { "epoch": 4.6763614084405525, "grad_norm": 0.27265501022338867, "learning_rate": 5.528249620869952e-07, "loss": 0.3326, "step": 129755 }, { "epoch": 4.676541608101777, "grad_norm": 0.23373085260391235, "learning_rate": 5.522148477928257e-07, "loss": 0.355, "step": 129760 }, { "epoch": 
4.676721807763001, "grad_norm": 0.2653125822544098, "learning_rate": 5.516050665949956e-07, "loss": 0.374, "step": 129765 }, { "epoch": 4.676902007424226, "grad_norm": 0.2680438160896301, "learning_rate": 5.509956185018123e-07, "loss": 0.3668, "step": 129770 }, { "epoch": 4.6770822070854505, "grad_norm": 0.24834401905536652, "learning_rate": 5.503865035215799e-07, "loss": 0.3672, "step": 129775 }, { "epoch": 4.677262406746675, "grad_norm": 0.2544165551662445, "learning_rate": 5.497777216626004e-07, "loss": 0.3523, "step": 129780 }, { "epoch": 4.6774426064079, "grad_norm": 0.2632615864276886, "learning_rate": 5.491692729331643e-07, "loss": 0.3629, "step": 129785 }, { "epoch": 4.677622806069125, "grad_norm": 0.27221032977104187, "learning_rate": 5.485611573415622e-07, "loss": 0.3558, "step": 129790 }, { "epoch": 4.6778030057303495, "grad_norm": 0.23004166781902313, "learning_rate": 5.479533748960819e-07, "loss": 0.3677, "step": 129795 }, { "epoch": 4.677983205391574, "grad_norm": 0.23140105605125427, "learning_rate": 5.473459256050029e-07, "loss": 0.3682, "step": 129800 }, { "epoch": 4.678163405052798, "grad_norm": 0.25781282782554626, "learning_rate": 5.46738809476599e-07, "loss": 0.3737, "step": 129805 }, { "epoch": 4.678343604714023, "grad_norm": 0.28814247250556946, "learning_rate": 5.461320265191445e-07, "loss": 0.4094, "step": 129810 }, { "epoch": 4.6785238043752475, "grad_norm": 0.2671186327934265, "learning_rate": 5.455255767409101e-07, "loss": 0.3495, "step": 129815 }, { "epoch": 4.678704004036472, "grad_norm": 0.25495436787605286, "learning_rate": 5.449194601501534e-07, "loss": 0.4176, "step": 129820 }, { "epoch": 4.678884203697697, "grad_norm": 0.24299226701259613, "learning_rate": 5.443136767551343e-07, "loss": 0.3457, "step": 129825 }, { "epoch": 4.679064403358922, "grad_norm": 0.2552716135978699, "learning_rate": 5.437082265641075e-07, "loss": 0.4135, "step": 129830 }, { "epoch": 4.679244603020146, "grad_norm": 0.2803386151790619, "learning_rate": 
5.431031095853189e-07, "loss": 0.3884, "step": 129835 }, { "epoch": 4.679424802681371, "grad_norm": 0.20010852813720703, "learning_rate": 5.42498325827015e-07, "loss": 0.3422, "step": 129840 }, { "epoch": 4.679605002342596, "grad_norm": 0.28240904211997986, "learning_rate": 5.41893875297439e-07, "loss": 0.3968, "step": 129845 }, { "epoch": 4.679785202003821, "grad_norm": 0.28269240260124207, "learning_rate": 5.412897580048231e-07, "loss": 0.3503, "step": 129850 }, { "epoch": 4.6799654016650445, "grad_norm": 0.295452743768692, "learning_rate": 5.406859739574e-07, "loss": 0.3895, "step": 129855 }, { "epoch": 4.680145601326269, "grad_norm": 0.2563684284687042, "learning_rate": 5.400825231633932e-07, "loss": 0.38, "step": 129860 }, { "epoch": 4.680325800987494, "grad_norm": 0.22655951976776123, "learning_rate": 5.394794056310243e-07, "loss": 0.3844, "step": 129865 }, { "epoch": 4.680506000648719, "grad_norm": 0.2583646774291992, "learning_rate": 5.38876621368517e-07, "loss": 0.3703, "step": 129870 }, { "epoch": 4.680686200309943, "grad_norm": 0.24790343642234802, "learning_rate": 5.382741703840788e-07, "loss": 0.3648, "step": 129875 }, { "epoch": 4.680866399971168, "grad_norm": 0.2448999434709549, "learning_rate": 5.376720526859197e-07, "loss": 0.3749, "step": 129880 }, { "epoch": 4.681046599632393, "grad_norm": 0.2374943196773529, "learning_rate": 5.370702682822415e-07, "loss": 0.3917, "step": 129885 }, { "epoch": 4.681226799293618, "grad_norm": 0.23962445557117462, "learning_rate": 5.364688171812432e-07, "loss": 0.3672, "step": 129890 }, { "epoch": 4.6814069989548415, "grad_norm": 0.2775137424468994, "learning_rate": 5.358676993911238e-07, "loss": 0.391, "step": 129895 }, { "epoch": 4.681587198616066, "grad_norm": 0.23689554631710052, "learning_rate": 5.352669149200711e-07, "loss": 0.3584, "step": 129900 }, { "epoch": 4.681767398277291, "grad_norm": 0.22408610582351685, "learning_rate": 5.34666463776265e-07, "loss": 0.363, "step": 129905 }, { "epoch": 
4.681947597938516, "grad_norm": 0.2599274218082428, "learning_rate": 5.340663459678958e-07, "loss": 0.3446, "step": 129910 }, { "epoch": 4.68212779759974, "grad_norm": 0.2526589632034302, "learning_rate": 5.334665615031376e-07, "loss": 0.3397, "step": 129915 }, { "epoch": 4.682307997260965, "grad_norm": 0.23241139948368073, "learning_rate": 5.328671103901589e-07, "loss": 0.3638, "step": 129920 }, { "epoch": 4.68248819692219, "grad_norm": 0.2700398862361908, "learning_rate": 5.322679926371282e-07, "loss": 0.3878, "step": 129925 }, { "epoch": 4.682668396583415, "grad_norm": 0.24214357137680054, "learning_rate": 5.316692082522057e-07, "loss": 0.3821, "step": 129930 }, { "epoch": 4.682848596244639, "grad_norm": 0.1930561661720276, "learning_rate": 5.310707572435569e-07, "loss": 0.3813, "step": 129935 }, { "epoch": 4.683028795905864, "grad_norm": 0.28772681951522827, "learning_rate": 5.30472639619331e-07, "loss": 0.3628, "step": 129940 }, { "epoch": 4.683208995567089, "grad_norm": 0.2775450050830841, "learning_rate": 5.298748553876797e-07, "loss": 0.4035, "step": 129945 }, { "epoch": 4.683389195228313, "grad_norm": 0.3269951045513153, "learning_rate": 5.292774045567439e-07, "loss": 0.3668, "step": 129950 }, { "epoch": 4.683569394889537, "grad_norm": 0.27343136072158813, "learning_rate": 5.286802871346641e-07, "loss": 0.3986, "step": 129955 }, { "epoch": 4.683749594550762, "grad_norm": 0.2822220027446747, "learning_rate": 5.280835031295811e-07, "loss": 0.3583, "step": 129960 }, { "epoch": 4.683929794211987, "grad_norm": 0.26671725511550903, "learning_rate": 5.274870525496245e-07, "loss": 0.3761, "step": 129965 }, { "epoch": 4.684109993873212, "grad_norm": 0.2931014597415924, "learning_rate": 5.268909354029127e-07, "loss": 0.3933, "step": 129970 }, { "epoch": 4.684290193534436, "grad_norm": 0.218561589717865, "learning_rate": 5.262951516975756e-07, "loss": 0.4029, "step": 129975 }, { "epoch": 4.684470393195661, "grad_norm": 0.25005042552948, "learning_rate": 
5.256997014417315e-07, "loss": 0.3756, "step": 129980 }, { "epoch": 4.684650592856886, "grad_norm": 0.2712259590625763, "learning_rate": 5.251045846434876e-07, "loss": 0.3682, "step": 129985 }, { "epoch": 4.68483079251811, "grad_norm": 0.2651374936103821, "learning_rate": 5.245098013109573e-07, "loss": 0.3631, "step": 129990 }, { "epoch": 4.685010992179334, "grad_norm": 0.27646490931510925, "learning_rate": 5.239153514522393e-07, "loss": 0.3945, "step": 129995 }, { "epoch": 4.685191191840559, "grad_norm": 0.24400193989276886, "learning_rate": 5.233212350754385e-07, "loss": 0.3355, "step": 130000 }, { "epoch": 4.685191191840559, "eval_loss": 0.4288380742073059, "eval_runtime": 3.5366, "eval_samples_per_second": 28.276, "eval_steps_per_second": 7.069, "step": 130000 }, { "epoch": 4.685371391501784, "grad_norm": 0.22407753765583038, "learning_rate": 5.227274521886483e-07, "loss": 0.3887, "step": 130005 }, { "epoch": 4.685551591163009, "grad_norm": 0.2578125298023224, "learning_rate": 5.221340027999566e-07, "loss": 0.3554, "step": 130010 }, { "epoch": 4.685731790824233, "grad_norm": 0.26388269662857056, "learning_rate": 5.215408869174487e-07, "loss": 0.3703, "step": 130015 }, { "epoch": 4.685911990485458, "grad_norm": 0.3199911117553711, "learning_rate": 5.20948104549207e-07, "loss": 0.3835, "step": 130020 }, { "epoch": 4.686092190146683, "grad_norm": 0.2785996198654175, "learning_rate": 5.203556557033085e-07, "loss": 0.3619, "step": 130025 }, { "epoch": 4.6862723898079075, "grad_norm": 0.2726043462753296, "learning_rate": 5.197635403878243e-07, "loss": 0.3626, "step": 130030 }, { "epoch": 4.686452589469132, "grad_norm": 0.2747058868408203, "learning_rate": 5.19171758610823e-07, "loss": 0.3786, "step": 130035 }, { "epoch": 4.686632789130356, "grad_norm": 0.2753622233867645, "learning_rate": 5.185803103803677e-07, "loss": 0.4035, "step": 130040 }, { "epoch": 4.686812988791581, "grad_norm": 0.3083060383796692, "learning_rate": 5.179891957045158e-07, "loss": 0.3657, 
"step": 130045 }, { "epoch": 4.6869931884528055, "grad_norm": 0.23582348227500916, "learning_rate": 5.173984145913191e-07, "loss": 0.3944, "step": 130050 }, { "epoch": 4.68717338811403, "grad_norm": 0.2400096207857132, "learning_rate": 5.168079670488296e-07, "loss": 0.3639, "step": 130055 }, { "epoch": 4.687353587775255, "grad_norm": 0.2639099657535553, "learning_rate": 5.162178530850937e-07, "loss": 0.3944, "step": 130060 }, { "epoch": 4.68753378743648, "grad_norm": 0.21031659841537476, "learning_rate": 5.156280727081492e-07, "loss": 0.3775, "step": 130065 }, { "epoch": 4.6877139870977045, "grad_norm": 0.2745669484138489, "learning_rate": 5.150386259260314e-07, "loss": 0.3419, "step": 130070 }, { "epoch": 4.687894186758929, "grad_norm": 0.25483670830726624, "learning_rate": 5.144495127467675e-07, "loss": 0.3432, "step": 130075 }, { "epoch": 4.688074386420153, "grad_norm": 0.27415239810943604, "learning_rate": 5.138607331783951e-07, "loss": 0.4003, "step": 130080 }, { "epoch": 4.688254586081378, "grad_norm": 0.33515846729278564, "learning_rate": 5.132722872289275e-07, "loss": 0.3553, "step": 130085 }, { "epoch": 4.6884347857426025, "grad_norm": 0.2727743089199066, "learning_rate": 5.126841749063805e-07, "loss": 0.3815, "step": 130090 }, { "epoch": 4.688614985403827, "grad_norm": 0.239381343126297, "learning_rate": 5.120963962187753e-07, "loss": 0.373, "step": 130095 }, { "epoch": 4.688795185065052, "grad_norm": 0.23639050126075745, "learning_rate": 5.115089511741139e-07, "loss": 0.3469, "step": 130100 }, { "epoch": 4.688975384726277, "grad_norm": 0.2537215054035187, "learning_rate": 5.109218397804011e-07, "loss": 0.3599, "step": 130105 }, { "epoch": 4.6891555843875015, "grad_norm": 0.268621027469635, "learning_rate": 5.103350620456387e-07, "loss": 0.4099, "step": 130110 }, { "epoch": 4.689335784048726, "grad_norm": 0.22903098165988922, "learning_rate": 5.097486179778177e-07, "loss": 0.3794, "step": 130115 }, { "epoch": 4.689515983709951, "grad_norm": 
0.2540788948535919, "learning_rate": 5.091625075849316e-07, "loss": 0.3508, "step": 130120 }, { "epoch": 4.689696183371176, "grad_norm": 0.2338607758283615, "learning_rate": 5.085767308749629e-07, "loss": 0.366, "step": 130125 }, { "epoch": 4.6898763830324, "grad_norm": 0.2733408808708191, "learning_rate": 5.079912878558968e-07, "loss": 0.3504, "step": 130130 }, { "epoch": 4.690056582693624, "grad_norm": 0.26694294810295105, "learning_rate": 5.074061785357076e-07, "loss": 0.3895, "step": 130135 }, { "epoch": 4.690236782354849, "grad_norm": 0.2584874629974365, "learning_rate": 5.068214029223639e-07, "loss": 0.376, "step": 130140 }, { "epoch": 4.690416982016074, "grad_norm": 0.2830393314361572, "learning_rate": 5.062369610238399e-07, "loss": 0.3739, "step": 130145 }, { "epoch": 4.690597181677298, "grad_norm": 0.2988766133785248, "learning_rate": 5.056528528480958e-07, "loss": 0.3467, "step": 130150 }, { "epoch": 4.690777381338523, "grad_norm": 0.2446690797805786, "learning_rate": 5.050690784030892e-07, "loss": 0.393, "step": 130155 }, { "epoch": 4.690957580999748, "grad_norm": 0.23521770536899567, "learning_rate": 5.044856376967721e-07, "loss": 0.387, "step": 130160 }, { "epoch": 4.691137780660973, "grad_norm": 0.26140204071998596, "learning_rate": 5.039025307370965e-07, "loss": 0.3775, "step": 130165 }, { "epoch": 4.691317980322197, "grad_norm": 0.3232240676879883, "learning_rate": 5.033197575320059e-07, "loss": 0.3765, "step": 130170 }, { "epoch": 4.691498179983421, "grad_norm": 0.2332460880279541, "learning_rate": 5.027373180894441e-07, "loss": 0.375, "step": 130175 }, { "epoch": 4.691678379644646, "grad_norm": 0.25555670261383057, "learning_rate": 5.021552124173379e-07, "loss": 0.3493, "step": 130180 }, { "epoch": 4.691858579305871, "grad_norm": 0.2844722270965576, "learning_rate": 5.015734405236284e-07, "loss": 0.3675, "step": 130185 }, { "epoch": 4.692038778967095, "grad_norm": 0.282840758562088, "learning_rate": 5.009920024162368e-07, "loss": 0.4054, "step": 
130190 }, { "epoch": 4.69221897862832, "grad_norm": 0.22690938413143158, "learning_rate": 5.004108981030847e-07, "loss": 0.3435, "step": 130195 }, { "epoch": 4.692399178289545, "grad_norm": 0.24626252055168152, "learning_rate": 4.998301275920936e-07, "loss": 0.3498, "step": 130200 }, { "epoch": 4.69257937795077, "grad_norm": 0.23748280107975006, "learning_rate": 4.99249690891171e-07, "loss": 0.401, "step": 130205 }, { "epoch": 4.692759577611994, "grad_norm": 0.2302643209695816, "learning_rate": 4.98669588008227e-07, "loss": 0.3868, "step": 130210 }, { "epoch": 4.692939777273219, "grad_norm": 0.2521742582321167, "learning_rate": 4.980898189511668e-07, "loss": 0.3973, "step": 130215 }, { "epoch": 4.693119976934444, "grad_norm": 0.24064558744430542, "learning_rate": 4.975103837278921e-07, "loss": 0.3831, "step": 130220 }, { "epoch": 4.693300176595668, "grad_norm": 0.233879953622818, "learning_rate": 4.969312823462912e-07, "loss": 0.3357, "step": 130225 }, { "epoch": 4.693480376256892, "grad_norm": 0.27745091915130615, "learning_rate": 4.963525148142606e-07, "loss": 0.3951, "step": 130230 }, { "epoch": 4.693660575918117, "grad_norm": 0.2301134169101715, "learning_rate": 4.9577408113968e-07, "loss": 0.381, "step": 130235 }, { "epoch": 4.693840775579342, "grad_norm": 0.2147936373949051, "learning_rate": 4.951959813304346e-07, "loss": 0.3665, "step": 130240 }, { "epoch": 4.694020975240567, "grad_norm": 0.24145711958408356, "learning_rate": 4.94618215394399e-07, "loss": 0.4068, "step": 130245 }, { "epoch": 4.694201174901791, "grad_norm": 0.23923490941524506, "learning_rate": 4.940407833394473e-07, "loss": 0.4088, "step": 130250 }, { "epoch": 4.694381374563016, "grad_norm": 0.24783268570899963, "learning_rate": 4.934636851734453e-07, "loss": 0.3628, "step": 130255 }, { "epoch": 4.694561574224241, "grad_norm": 0.2156895250082016, "learning_rate": 4.928869209042536e-07, "loss": 0.3927, "step": 130260 }, { "epoch": 4.694741773885465, "grad_norm": 0.28466811776161194, 
"learning_rate": 4.923104905397379e-07, "loss": 0.4006, "step": 130265 }, { "epoch": 4.694921973546689, "grad_norm": 0.25431156158447266, "learning_rate": 4.917343940877423e-07, "loss": 0.3977, "step": 130270 }, { "epoch": 4.695102173207914, "grad_norm": 0.3024144470691681, "learning_rate": 4.911586315561212e-07, "loss": 0.3901, "step": 130275 }, { "epoch": 4.695282372869139, "grad_norm": 0.29389986395835876, "learning_rate": 4.905832029527186e-07, "loss": 0.3691, "step": 130280 }, { "epoch": 4.695462572530364, "grad_norm": 0.26747405529022217, "learning_rate": 4.900081082853753e-07, "loss": 0.4077, "step": 130285 }, { "epoch": 4.695642772191588, "grad_norm": 0.259396493434906, "learning_rate": 4.894333475619295e-07, "loss": 0.3603, "step": 130290 }, { "epoch": 4.695822971852813, "grad_norm": 0.25687748193740845, "learning_rate": 4.888589207902056e-07, "loss": 0.374, "step": 130295 }, { "epoch": 4.696003171514038, "grad_norm": 0.2638847231864929, "learning_rate": 4.882848279780306e-07, "loss": 0.4057, "step": 130300 }, { "epoch": 4.6961833711752625, "grad_norm": 0.24750365316867828, "learning_rate": 4.877110691332343e-07, "loss": 0.3794, "step": 130305 }, { "epoch": 4.696363570836487, "grad_norm": 0.22299759089946747, "learning_rate": 4.871376442636272e-07, "loss": 0.379, "step": 130310 }, { "epoch": 4.696543770497711, "grad_norm": 0.24523359537124634, "learning_rate": 4.865645533770224e-07, "loss": 0.3825, "step": 130315 }, { "epoch": 4.696723970158936, "grad_norm": 0.24802939593791962, "learning_rate": 4.859917964812305e-07, "loss": 0.4062, "step": 130320 }, { "epoch": 4.696904169820161, "grad_norm": 0.25451767444610596, "learning_rate": 4.854193735840534e-07, "loss": 0.3949, "step": 130325 }, { "epoch": 4.697084369481385, "grad_norm": 0.2552180588245392, "learning_rate": 4.848472846932933e-07, "loss": 0.3683, "step": 130330 }, { "epoch": 4.69726456914261, "grad_norm": 0.2642126679420471, "learning_rate": 4.842755298167412e-07, "loss": 0.3984, "step": 130335 }, { 
"epoch": 4.697444768803835, "grad_norm": 0.2669326961040497, "learning_rate": 4.837041089621908e-07, "loss": 0.3565, "step": 130340 }, { "epoch": 4.6976249684650595, "grad_norm": 0.27341264486312866, "learning_rate": 4.831330221374248e-07, "loss": 0.4032, "step": 130345 }, { "epoch": 4.697805168126284, "grad_norm": 0.2548832297325134, "learning_rate": 4.82562269350223e-07, "loss": 0.3906, "step": 130350 }, { "epoch": 4.697985367787508, "grad_norm": 0.29564833641052246, "learning_rate": 4.819918506083627e-07, "loss": 0.3535, "step": 130355 }, { "epoch": 4.698165567448733, "grad_norm": 0.29607510566711426, "learning_rate": 4.814217659196207e-07, "loss": 0.339, "step": 130360 }, { "epoch": 4.6983457671099575, "grad_norm": 0.2381865680217743, "learning_rate": 4.80852015291755e-07, "loss": 0.3721, "step": 130365 }, { "epoch": 4.698525966771182, "grad_norm": 0.2766072154045105, "learning_rate": 4.802825987325371e-07, "loss": 0.3651, "step": 130370 }, { "epoch": 4.698706166432407, "grad_norm": 0.26471713185310364, "learning_rate": 4.797135162497219e-07, "loss": 0.3566, "step": 130375 }, { "epoch": 4.698886366093632, "grad_norm": 0.2708904445171356, "learning_rate": 4.791447678510613e-07, "loss": 0.39, "step": 130380 }, { "epoch": 4.6990665657548565, "grad_norm": 0.23361581563949585, "learning_rate": 4.785763535443078e-07, "loss": 0.3869, "step": 130385 }, { "epoch": 4.699246765416081, "grad_norm": 0.19293129444122314, "learning_rate": 4.780082733372021e-07, "loss": 0.3327, "step": 130390 }, { "epoch": 4.699426965077306, "grad_norm": 0.2304544597864151, "learning_rate": 4.774405272374827e-07, "loss": 0.3904, "step": 130395 }, { "epoch": 4.699607164738531, "grad_norm": 0.2673514187335968, "learning_rate": 4.768731152528933e-07, "loss": 0.4132, "step": 130400 }, { "epoch": 4.699787364399755, "grad_norm": 0.27514368295669556, "learning_rate": 4.763060373911582e-07, "loss": 0.3417, "step": 130405 }, { "epoch": 4.699967564060979, "grad_norm": 0.24293185770511627, 
"learning_rate": 4.7573929366000746e-07, "loss": 0.3843, "step": 130410 }, { "epoch": 4.700147763722204, "grad_norm": 0.27645057439804077, "learning_rate": 4.751728840671599e-07, "loss": 0.3684, "step": 130415 }, { "epoch": 4.700327963383429, "grad_norm": 0.23572184145450592, "learning_rate": 4.7460680862033146e-07, "loss": 0.3592, "step": 130420 }, { "epoch": 4.7005081630446535, "grad_norm": 0.26540112495422363, "learning_rate": 4.740410673272383e-07, "loss": 0.3695, "step": 130425 }, { "epoch": 4.700688362705878, "grad_norm": 0.29115819931030273, "learning_rate": 4.7347566019558807e-07, "loss": 0.3774, "step": 130430 }, { "epoch": 4.700868562367103, "grad_norm": 0.239894837141037, "learning_rate": 4.72910587233083e-07, "loss": 0.3866, "step": 130435 }, { "epoch": 4.701048762028328, "grad_norm": 0.2407197803258896, "learning_rate": 4.723458484474225e-07, "loss": 0.3989, "step": 130440 }, { "epoch": 4.701228961689552, "grad_norm": 0.20911681652069092, "learning_rate": 4.717814438462975e-07, "loss": 0.3579, "step": 130445 }, { "epoch": 4.701409161350776, "grad_norm": 0.3005968928337097, "learning_rate": 4.712173734374048e-07, "loss": 0.3647, "step": 130450 }, { "epoch": 4.701589361012001, "grad_norm": 0.24574686586856842, "learning_rate": 4.706536372284298e-07, "loss": 0.3752, "step": 130455 }, { "epoch": 4.701769560673226, "grad_norm": 0.33672651648521423, "learning_rate": 4.700902352270442e-07, "loss": 0.4029, "step": 130460 }, { "epoch": 4.7019497603344504, "grad_norm": 0.22763417661190033, "learning_rate": 4.695271674409335e-07, "loss": 0.3716, "step": 130465 }, { "epoch": 4.702129959995675, "grad_norm": 0.24694277346134186, "learning_rate": 4.689644338777638e-07, "loss": 0.3485, "step": 130470 }, { "epoch": 4.7023101596569, "grad_norm": 0.21548400819301605, "learning_rate": 4.684020345452067e-07, "loss": 0.3963, "step": 130475 }, { "epoch": 4.702490359318125, "grad_norm": 0.260619580745697, "learning_rate": 4.678399694509228e-07, "loss": 0.4, "step": 130480 }, 
{ "epoch": 4.702670558979349, "grad_norm": 0.31047627329826355, "learning_rate": 4.67278238602567e-07, "loss": 0.3997, "step": 130485 }, { "epoch": 4.702850758640574, "grad_norm": 0.2056533545255661, "learning_rate": 4.6671684200779717e-07, "loss": 0.3476, "step": 130490 }, { "epoch": 4.703030958301799, "grad_norm": 0.24057982861995697, "learning_rate": 4.6615577967425996e-07, "loss": 0.3588, "step": 130495 }, { "epoch": 4.703211157963023, "grad_norm": 0.25371670722961426, "learning_rate": 4.6559505160959916e-07, "loss": 0.4112, "step": 130500 }, { "epoch": 4.703211157963023, "eval_loss": 0.42873358726501465, "eval_runtime": 3.5322, "eval_samples_per_second": 28.311, "eval_steps_per_second": 7.078, "step": 130500 }, { "epoch": 4.703391357624247, "grad_norm": 0.3009524345397949, "learning_rate": 4.6503465782145873e-07, "loss": 0.3473, "step": 130505 }, { "epoch": 4.703571557285472, "grad_norm": 0.27046623826026917, "learning_rate": 4.6447459831746586e-07, "loss": 0.395, "step": 130510 }, { "epoch": 4.703751756946697, "grad_norm": 0.2447580248117447, "learning_rate": 4.639148731052617e-07, "loss": 0.3678, "step": 130515 }, { "epoch": 4.703931956607922, "grad_norm": 0.28649699687957764, "learning_rate": 4.6335548219246517e-07, "loss": 0.3448, "step": 130520 }, { "epoch": 4.704112156269146, "grad_norm": 0.28706684708595276, "learning_rate": 4.6279642558670064e-07, "loss": 0.3887, "step": 130525 }, { "epoch": 4.704292355930371, "grad_norm": 0.20302236080169678, "learning_rate": 4.622377032955816e-07, "loss": 0.3479, "step": 130530 }, { "epoch": 4.704472555591596, "grad_norm": 0.2422175258398056, "learning_rate": 4.616793153267268e-07, "loss": 0.3707, "step": 130535 }, { "epoch": 4.70465275525282, "grad_norm": 0.25601354241371155, "learning_rate": 4.6112126168773587e-07, "loss": 0.3576, "step": 130540 }, { "epoch": 4.704832954914044, "grad_norm": 0.2792492210865021, "learning_rate": 4.60563542386222e-07, "loss": 0.3739, "step": 130545 }, { "epoch": 4.705013154575269, 
"grad_norm": 0.22106266021728516, "learning_rate": 4.600061574297765e-07, "loss": 0.3579, "step": 130550 }, { "epoch": 4.705193354236494, "grad_norm": 0.2654285430908203, "learning_rate": 4.5944910682599597e-07, "loss": 0.3411, "step": 130555 }, { "epoch": 4.705373553897719, "grad_norm": 0.32557204365730286, "learning_rate": 4.588923905824688e-07, "loss": 0.3711, "step": 130560 }, { "epoch": 4.705553753558943, "grad_norm": 0.308724969625473, "learning_rate": 4.583360087067834e-07, "loss": 0.3891, "step": 130565 }, { "epoch": 4.705733953220168, "grad_norm": 0.2696959376335144, "learning_rate": 4.577799612065198e-07, "loss": 0.3632, "step": 130570 }, { "epoch": 4.705914152881393, "grad_norm": 0.280472069978714, "learning_rate": 4.572242480892497e-07, "loss": 0.3685, "step": 130575 }, { "epoch": 4.7060943525426175, "grad_norm": 0.26516270637512207, "learning_rate": 4.566688693625476e-07, "loss": 0.3442, "step": 130580 }, { "epoch": 4.706274552203842, "grad_norm": 0.2824823558330536, "learning_rate": 4.5611382503398246e-07, "loss": 0.372, "step": 130585 }, { "epoch": 4.706454751865066, "grad_norm": 0.2615325450897217, "learning_rate": 4.5555911511111495e-07, "loss": 0.3817, "step": 130590 }, { "epoch": 4.706634951526291, "grad_norm": 0.2668514549732208, "learning_rate": 4.5500473960150004e-07, "loss": 0.3485, "step": 130595 }, { "epoch": 4.706815151187516, "grad_norm": 0.286246657371521, "learning_rate": 4.5445069851269564e-07, "loss": 0.4019, "step": 130600 }, { "epoch": 4.70699535084874, "grad_norm": 0.26146531105041504, "learning_rate": 4.5389699185224564e-07, "loss": 0.3399, "step": 130605 }, { "epoch": 4.707175550509965, "grad_norm": 0.27129244804382324, "learning_rate": 4.5334361962769965e-07, "loss": 0.384, "step": 130610 }, { "epoch": 4.70735575017119, "grad_norm": 0.21053126454353333, "learning_rate": 4.527905818465933e-07, "loss": 0.3281, "step": 130615 }, { "epoch": 4.7075359498324145, "grad_norm": 0.287487655878067, "learning_rate": 4.5223787851646215e-07, 
"loss": 0.3927, "step": 130620 }, { "epoch": 4.707716149493639, "grad_norm": 0.25808537006378174, "learning_rate": 4.5168550964483915e-07, "loss": 0.3529, "step": 130625 }, { "epoch": 4.707896349154863, "grad_norm": 0.21509693562984467, "learning_rate": 4.51133475239246e-07, "loss": 0.3739, "step": 130630 }, { "epoch": 4.708076548816088, "grad_norm": 0.3704529404640198, "learning_rate": 4.5058177530720724e-07, "loss": 0.3907, "step": 130635 }, { "epoch": 4.708256748477313, "grad_norm": 0.25836381316185, "learning_rate": 4.500304098562419e-07, "loss": 0.3895, "step": 130640 }, { "epoch": 4.708436948138537, "grad_norm": 0.21811331808567047, "learning_rate": 4.4947937889385507e-07, "loss": 0.382, "step": 130645 }, { "epoch": 4.708617147799762, "grad_norm": 0.24641865491867065, "learning_rate": 4.489286824275601e-07, "loss": 0.3452, "step": 130650 }, { "epoch": 4.708797347460987, "grad_norm": 0.23483498394489288, "learning_rate": 4.483783204648595e-07, "loss": 0.3834, "step": 130655 }, { "epoch": 4.7089775471222115, "grad_norm": 0.28817734122276306, "learning_rate": 4.4782829301324993e-07, "loss": 0.3866, "step": 130660 }, { "epoch": 4.709157746783436, "grad_norm": 0.2239607572555542, "learning_rate": 4.4738851190498645e-07, "loss": 0.3863, "step": 130665 }, { "epoch": 4.709337946444661, "grad_norm": 0.22553732991218567, "learning_rate": 4.4683908659222396e-07, "loss": 0.3521, "step": 130670 }, { "epoch": 4.709518146105886, "grad_norm": 0.21854856610298157, "learning_rate": 4.4628999581152463e-07, "loss": 0.3594, "step": 130675 }, { "epoch": 4.70969834576711, "grad_norm": 0.22778260707855225, "learning_rate": 4.457412395703714e-07, "loss": 0.3684, "step": 130680 }, { "epoch": 4.709878545428334, "grad_norm": 0.20570991933345795, "learning_rate": 4.451928178762416e-07, "loss": 0.3913, "step": 130685 }, { "epoch": 4.710058745089559, "grad_norm": 0.26178503036499023, "learning_rate": 4.446447307366042e-07, "loss": 0.3578, "step": 130690 }, { "epoch": 4.710238944750784, 
"grad_norm": 0.2906634211540222, "learning_rate": 4.4409697815892827e-07, "loss": 0.3814, "step": 130695 }, { "epoch": 4.7104191444120085, "grad_norm": 0.24634572863578796, "learning_rate": 4.435495601506745e-07, "loss": 0.3709, "step": 130700 }, { "epoch": 4.710599344073233, "grad_norm": 0.205975741147995, "learning_rate": 4.430024767193064e-07, "loss": 0.3627, "step": 130705 }, { "epoch": 4.710779543734458, "grad_norm": 0.24753928184509277, "learning_rate": 4.4245572787227905e-07, "loss": 0.3751, "step": 130710 }, { "epoch": 4.710959743395683, "grad_norm": 0.24739667773246765, "learning_rate": 4.41909313617031e-07, "loss": 0.3678, "step": 130715 }, { "epoch": 4.711139943056907, "grad_norm": 0.19426771998405457, "learning_rate": 4.4136323396101743e-07, "loss": 0.3344, "step": 130720 }, { "epoch": 4.711320142718131, "grad_norm": 0.2439907044172287, "learning_rate": 4.408174889116712e-07, "loss": 0.3633, "step": 130725 }, { "epoch": 4.711500342379356, "grad_norm": 0.2416488081216812, "learning_rate": 4.402720784764391e-07, "loss": 0.3851, "step": 130730 }, { "epoch": 4.711680542040581, "grad_norm": 0.23560959100723267, "learning_rate": 4.3972700266274035e-07, "loss": 0.3564, "step": 130735 }, { "epoch": 4.7118607417018055, "grad_norm": 0.24532419443130493, "learning_rate": 4.3918226147800503e-07, "loss": 0.3914, "step": 130740 }, { "epoch": 4.71204094136303, "grad_norm": 0.25437724590301514, "learning_rate": 4.386378549296577e-07, "loss": 0.3805, "step": 130745 }, { "epoch": 4.712221141024255, "grad_norm": 0.2842262089252472, "learning_rate": 4.380937830251175e-07, "loss": 0.3494, "step": 130750 }, { "epoch": 4.71240134068548, "grad_norm": 0.21695636212825775, "learning_rate": 4.375500457717896e-07, "loss": 0.3494, "step": 130755 }, { "epoch": 4.712581540346704, "grad_norm": 0.26148954033851624, "learning_rate": 4.3700664317709026e-07, "loss": 0.3834, "step": 130760 }, { "epoch": 4.712761740007929, "grad_norm": 0.31055474281311035, "learning_rate": 
4.364635752484164e-07, "loss": 0.3661, "step": 130765 }, { "epoch": 4.712941939669154, "grad_norm": 0.32295408844947815, "learning_rate": 4.3592084199317316e-07, "loss": 0.3913, "step": 130770 }, { "epoch": 4.713122139330378, "grad_norm": 0.239763081073761, "learning_rate": 4.353784434187519e-07, "loss": 0.3283, "step": 130775 }, { "epoch": 4.7133023389916024, "grad_norm": 0.2873198986053467, "learning_rate": 4.3483637953254384e-07, "loss": 0.3751, "step": 130780 }, { "epoch": 4.713482538652827, "grad_norm": 0.25636720657348633, "learning_rate": 4.3429465034193485e-07, "loss": 0.3496, "step": 130785 }, { "epoch": 4.713662738314052, "grad_norm": 0.29088860750198364, "learning_rate": 4.33753255854305e-07, "loss": 0.3963, "step": 130790 }, { "epoch": 4.713842937975277, "grad_norm": 0.2415892779827118, "learning_rate": 4.3321219607702634e-07, "loss": 0.3655, "step": 130795 }, { "epoch": 4.714023137636501, "grad_norm": 0.2409953773021698, "learning_rate": 4.3267147101747896e-07, "loss": 0.3836, "step": 130800 }, { "epoch": 4.714203337297726, "grad_norm": 0.2819431722164154, "learning_rate": 4.321310806830292e-07, "loss": 0.3651, "step": 130805 }, { "epoch": 4.714383536958951, "grad_norm": 0.24938331544399261, "learning_rate": 4.3159102508103233e-07, "loss": 0.3752, "step": 130810 }, { "epoch": 4.714563736620175, "grad_norm": 0.2521766424179077, "learning_rate": 4.3105130421885463e-07, "loss": 0.3826, "step": 130815 }, { "epoch": 4.714743936281399, "grad_norm": 0.25512444972991943, "learning_rate": 4.3051191810384304e-07, "loss": 0.3638, "step": 130820 }, { "epoch": 4.714924135942624, "grad_norm": 0.2716846466064453, "learning_rate": 4.299728667433528e-07, "loss": 0.3844, "step": 130825 }, { "epoch": 4.715104335603849, "grad_norm": 0.2582548260688782, "learning_rate": 4.2943415014472523e-07, "loss": 0.3979, "step": 130830 }, { "epoch": 4.715284535265074, "grad_norm": 0.2618710398674011, "learning_rate": 4.288957683152961e-07, "loss": 0.3923, "step": 130835 }, { "epoch": 
4.715464734926298, "grad_norm": 0.24619565904140472, "learning_rate": 4.283577212624096e-07, "loss": 0.389, "step": 130840 }, { "epoch": 4.715644934587523, "grad_norm": 0.28034818172454834, "learning_rate": 4.2782000899339035e-07, "loss": 0.3825, "step": 130845 }, { "epoch": 4.715825134248748, "grad_norm": 0.27038708329200745, "learning_rate": 4.272826315155659e-07, "loss": 0.398, "step": 130850 }, { "epoch": 4.716005333909973, "grad_norm": 0.2420760691165924, "learning_rate": 4.267455888362581e-07, "loss": 0.387, "step": 130855 }, { "epoch": 4.716185533571197, "grad_norm": 0.26630204916000366, "learning_rate": 4.2620888096278346e-07, "loss": 0.4072, "step": 130860 }, { "epoch": 4.716365733232421, "grad_norm": 0.2816385328769684, "learning_rate": 4.256725079024554e-07, "loss": 0.372, "step": 130865 }, { "epoch": 4.716545932893646, "grad_norm": 0.23565801978111267, "learning_rate": 4.25136469662582e-07, "loss": 0.3533, "step": 130870 }, { "epoch": 4.716726132554871, "grad_norm": 0.2744382321834564, "learning_rate": 4.2460076625046865e-07, "loss": 0.377, "step": 130875 }, { "epoch": 4.716906332216095, "grad_norm": 0.28961044549942017, "learning_rate": 4.2406539767340934e-07, "loss": 0.3608, "step": 130880 }, { "epoch": 4.71708653187732, "grad_norm": 0.24323128163814545, "learning_rate": 4.2353036393870116e-07, "loss": 0.3578, "step": 130885 }, { "epoch": 4.717266731538545, "grad_norm": 0.2941163182258606, "learning_rate": 4.2299566505363266e-07, "loss": 0.3813, "step": 130890 }, { "epoch": 4.7174469311997695, "grad_norm": 0.28495699167251587, "learning_rate": 4.224613010254952e-07, "loss": 0.4055, "step": 130895 }, { "epoch": 4.717627130860994, "grad_norm": 0.36021918058395386, "learning_rate": 4.21927271861558e-07, "loss": 0.3711, "step": 130900 }, { "epoch": 4.717807330522218, "grad_norm": 0.23534350097179413, "learning_rate": 4.213935775691041e-07, "loss": 0.3555, "step": 130905 }, { "epoch": 4.717987530183443, "grad_norm": 0.2570185959339142, "learning_rate": 
4.2086021815540553e-07, "loss": 0.3603, "step": 130910 }, { "epoch": 4.718167729844668, "grad_norm": 0.207231804728508, "learning_rate": 4.2032719362772864e-07, "loss": 0.3445, "step": 130915 }, { "epoch": 4.718347929505892, "grad_norm": 0.24607731401920319, "learning_rate": 4.1979450399333434e-07, "loss": 0.3545, "step": 130920 }, { "epoch": 4.718528129167117, "grad_norm": 0.19284264743328094, "learning_rate": 4.1926214925948057e-07, "loss": 0.3441, "step": 130925 }, { "epoch": 4.718708328828342, "grad_norm": 0.2840045988559723, "learning_rate": 4.1873012943342007e-07, "loss": 0.3717, "step": 130930 }, { "epoch": 4.7188885284895665, "grad_norm": 0.26149991154670715, "learning_rate": 4.181984445224052e-07, "loss": 0.352, "step": 130935 }, { "epoch": 4.719068728150791, "grad_norm": 0.2628280818462372, "learning_rate": 4.176670945336747e-07, "loss": 0.3827, "step": 130940 }, { "epoch": 4.719248927812016, "grad_norm": 0.29585734009742737, "learning_rate": 4.1713607947447276e-07, "loss": 0.3666, "step": 130945 }, { "epoch": 4.719429127473241, "grad_norm": 0.26492440700531006, "learning_rate": 4.166053993520297e-07, "loss": 0.3757, "step": 130950 }, { "epoch": 4.7196093271344655, "grad_norm": 0.24920345842838287, "learning_rate": 4.160750541735814e-07, "loss": 0.3477, "step": 130955 }, { "epoch": 4.719789526795689, "grad_norm": 0.2741335928440094, "learning_rate": 4.1554504394634984e-07, "loss": 0.3818, "step": 130960 }, { "epoch": 4.719969726456914, "grad_norm": 0.2235456109046936, "learning_rate": 4.1501536867755706e-07, "loss": 0.3711, "step": 130965 }, { "epoch": 4.720149926118139, "grad_norm": 0.27255016565322876, "learning_rate": 4.144860283744223e-07, "loss": 0.381, "step": 130970 }, { "epoch": 4.7203301257793635, "grad_norm": 0.3171219229698181, "learning_rate": 4.1395702304415364e-07, "loss": 0.3889, "step": 130975 }, { "epoch": 4.720510325440588, "grad_norm": 0.2652469873428345, "learning_rate": 4.134283526939592e-07, "loss": 0.3715, "step": 130980 }, { 
"epoch": 4.720690525101813, "grad_norm": 0.2219124734401703, "learning_rate": 4.129000173310443e-07, "loss": 0.3463, "step": 130985 }, { "epoch": 4.720870724763038, "grad_norm": 0.24788452684879303, "learning_rate": 4.123720169626061e-07, "loss": 0.3595, "step": 130990 }, { "epoch": 4.721050924424262, "grad_norm": 0.21029269695281982, "learning_rate": 4.1184435159584143e-07, "loss": 0.3676, "step": 130995 }, { "epoch": 4.721231124085486, "grad_norm": 0.28664112091064453, "learning_rate": 4.1131702123793357e-07, "loss": 0.3637, "step": 131000 }, { "epoch": 4.721231124085486, "eval_loss": 0.42884311079978943, "eval_runtime": 3.5394, "eval_samples_per_second": 28.253, "eval_steps_per_second": 7.063, "step": 131000 }, { "epoch": 4.721411323746711, "grad_norm": 0.3436447083950043, "learning_rate": 4.107900258960712e-07, "loss": 0.39, "step": 131005 }, { "epoch": 4.721591523407936, "grad_norm": 0.20727182924747467, "learning_rate": 4.102633655774374e-07, "loss": 0.3699, "step": 131010 }, { "epoch": 4.7217717230691605, "grad_norm": 0.24989864230155945, "learning_rate": 4.0973704028920423e-07, "loss": 0.3706, "step": 131015 }, { "epoch": 4.721951922730385, "grad_norm": 0.24569204449653625, "learning_rate": 4.092110500385382e-07, "loss": 0.3611, "step": 131020 }, { "epoch": 4.72213212239161, "grad_norm": 0.2474668025970459, "learning_rate": 4.0868539483261413e-07, "loss": 0.3665, "step": 131025 }, { "epoch": 4.722312322052835, "grad_norm": 0.2459608018398285, "learning_rate": 4.081600746785874e-07, "loss": 0.3716, "step": 131030 }, { "epoch": 4.722492521714059, "grad_norm": 0.2828958034515381, "learning_rate": 4.076350895836245e-07, "loss": 0.3439, "step": 131035 }, { "epoch": 4.722672721375284, "grad_norm": 0.4490903317928314, "learning_rate": 4.0711043955486974e-07, "loss": 0.3709, "step": 131040 }, { "epoch": 4.722852921036509, "grad_norm": 0.30186668038368225, "learning_rate": 4.065861245994701e-07, "loss": 0.3289, "step": 131045 }, { "epoch": 4.723033120697733, 
"grad_norm": 0.2187676876783371, "learning_rate": 4.0606214472457826e-07, "loss": 0.3377, "step": 131050 }, { "epoch": 4.7232133203589575, "grad_norm": 0.21422752737998962, "learning_rate": 4.055384999373246e-07, "loss": 0.3654, "step": 131055 }, { "epoch": 4.723393520020182, "grad_norm": 0.20950256288051605, "learning_rate": 4.050151902448507e-07, "loss": 0.375, "step": 131060 }, { "epoch": 4.723573719681407, "grad_norm": 0.23349595069885254, "learning_rate": 4.044922156542813e-07, "loss": 0.371, "step": 131065 }, { "epoch": 4.723753919342632, "grad_norm": 0.2319730669260025, "learning_rate": 4.0396957617274134e-07, "loss": 0.3757, "step": 131070 }, { "epoch": 4.723934119003856, "grad_norm": 0.22478604316711426, "learning_rate": 4.034472718073556e-07, "loss": 0.3753, "step": 131075 }, { "epoch": 4.724114318665081, "grad_norm": 0.19446243345737457, "learning_rate": 4.02925302565238e-07, "loss": 0.3447, "step": 131080 }, { "epoch": 4.724294518326306, "grad_norm": 0.2668735682964325, "learning_rate": 4.0240366845350485e-07, "loss": 0.3687, "step": 131085 }, { "epoch": 4.72447471798753, "grad_norm": 0.22886505722999573, "learning_rate": 4.018823694792562e-07, "loss": 0.3632, "step": 131090 }, { "epoch": 4.7246549176487544, "grad_norm": 0.25273075699806213, "learning_rate": 4.0136140564959735e-07, "loss": 0.3876, "step": 131095 }, { "epoch": 4.724835117309979, "grad_norm": 0.2730422914028168, "learning_rate": 4.0084077697162826e-07, "loss": 0.4057, "step": 131100 }, { "epoch": 4.725015316971204, "grad_norm": 0.21895499527454376, "learning_rate": 4.003204834524432e-07, "loss": 0.3848, "step": 131105 }, { "epoch": 4.725195516632429, "grad_norm": 0.23920688033103943, "learning_rate": 3.9980052509912267e-07, "loss": 0.3527, "step": 131110 }, { "epoch": 4.725375716293653, "grad_norm": 0.2694016695022583, "learning_rate": 3.99280901918761e-07, "loss": 0.3571, "step": 131115 }, { "epoch": 4.725555915954878, "grad_norm": 0.2753653824329376, "learning_rate": 
3.987616139184358e-07, "loss": 0.3968, "step": 131120 }, { "epoch": 4.725736115616103, "grad_norm": 0.24241603910923004, "learning_rate": 3.9824266110521924e-07, "loss": 0.3743, "step": 131125 }, { "epoch": 4.725916315277328, "grad_norm": 0.28032630681991577, "learning_rate": 3.977240434861834e-07, "loss": 0.382, "step": 131130 }, { "epoch": 4.726096514938552, "grad_norm": 0.2554890215396881, "learning_rate": 3.972057610683921e-07, "loss": 0.4077, "step": 131135 }, { "epoch": 4.726276714599777, "grad_norm": 0.27855169773101807, "learning_rate": 3.9668781385891194e-07, "loss": 0.3812, "step": 131140 }, { "epoch": 4.726456914261001, "grad_norm": 0.23848845064640045, "learning_rate": 3.9617020186479557e-07, "loss": 0.3784, "step": 131145 }, { "epoch": 4.726637113922226, "grad_norm": 0.2614748477935791, "learning_rate": 3.956529250930985e-07, "loss": 0.3939, "step": 131150 }, { "epoch": 4.72681731358345, "grad_norm": 0.22815276682376862, "learning_rate": 3.95135983550865e-07, "loss": 0.3932, "step": 131155 }, { "epoch": 4.726997513244675, "grad_norm": 0.28664857149124146, "learning_rate": 3.9461937724514233e-07, "loss": 0.369, "step": 131160 }, { "epoch": 4.7271777129059, "grad_norm": 0.25510892271995544, "learning_rate": 3.941031061829636e-07, "loss": 0.3868, "step": 131165 }, { "epoch": 4.727357912567125, "grad_norm": 0.227322056889534, "learning_rate": 3.9358717037136786e-07, "loss": 0.357, "step": 131170 }, { "epoch": 4.727538112228349, "grad_norm": 0.2149222195148468, "learning_rate": 3.930715698173826e-07, "loss": 0.3443, "step": 131175 }, { "epoch": 4.727718311889574, "grad_norm": 0.2911987900733948, "learning_rate": 3.925563045280328e-07, "loss": 0.385, "step": 131180 }, { "epoch": 4.727898511550798, "grad_norm": 0.22979766130447388, "learning_rate": 3.9204137451034076e-07, "loss": 0.3557, "step": 131185 }, { "epoch": 4.728078711212023, "grad_norm": 0.24125829339027405, "learning_rate": 3.9152677977131734e-07, "loss": 0.3766, "step": 131190 }, { "epoch": 
4.728258910873247, "grad_norm": 0.270080029964447, "learning_rate": 3.910125203179821e-07, "loss": 0.3529, "step": 131195 }, { "epoch": 4.728439110534472, "grad_norm": 0.20879442989826202, "learning_rate": 3.904985961573349e-07, "loss": 0.3869, "step": 131200 }, { "epoch": 4.728619310195697, "grad_norm": 0.23659254610538483, "learning_rate": 3.899850072963784e-07, "loss": 0.3665, "step": 131205 }, { "epoch": 4.7287995098569215, "grad_norm": 0.23854908347129822, "learning_rate": 3.8947175374211274e-07, "loss": 0.3918, "step": 131210 }, { "epoch": 4.728979709518146, "grad_norm": 0.2165476530790329, "learning_rate": 3.889588355015267e-07, "loss": 0.3698, "step": 131215 }, { "epoch": 4.729159909179371, "grad_norm": 0.2346785068511963, "learning_rate": 3.8844625258161737e-07, "loss": 0.3508, "step": 131220 }, { "epoch": 4.729340108840596, "grad_norm": 0.25256285071372986, "learning_rate": 3.879340049893598e-07, "loss": 0.373, "step": 131225 }, { "epoch": 4.7295203085018205, "grad_norm": 0.22178764641284943, "learning_rate": 3.874220927317346e-07, "loss": 0.3534, "step": 131230 }, { "epoch": 4.729700508163044, "grad_norm": 0.27392688393592834, "learning_rate": 3.8691051581571933e-07, "loss": 0.3442, "step": 131235 }, { "epoch": 4.729880707824269, "grad_norm": 0.26804807782173157, "learning_rate": 3.8639927424828635e-07, "loss": 0.3818, "step": 131240 }, { "epoch": 4.730060907485494, "grad_norm": 0.2786375880241394, "learning_rate": 3.8588836803639397e-07, "loss": 0.4149, "step": 131245 }, { "epoch": 4.7302411071467185, "grad_norm": 0.2680922746658325, "learning_rate": 3.853777971870087e-07, "loss": 0.3972, "step": 131250 }, { "epoch": 4.730421306807943, "grad_norm": 0.2476295381784439, "learning_rate": 3.848675617070835e-07, "loss": 0.4079, "step": 131255 }, { "epoch": 4.730601506469168, "grad_norm": 0.2629091143608093, "learning_rate": 3.843576616035738e-07, "loss": 0.3755, "step": 131260 }, { "epoch": 4.730781706130393, "grad_norm": 0.26332563161849976, 
"learning_rate": 3.8384809688342684e-07, "loss": 0.3467, "step": 131265 }, { "epoch": 4.7309619057916175, "grad_norm": 0.21108414232730865, "learning_rate": 3.833388675535815e-07, "loss": 0.3619, "step": 131270 }, { "epoch": 4.731142105452841, "grad_norm": 0.24823954701423645, "learning_rate": 3.828299736209795e-07, "loss": 0.3529, "step": 131275 }, { "epoch": 4.731322305114066, "grad_norm": 0.261547714471817, "learning_rate": 3.8232141509255137e-07, "loss": 0.3579, "step": 131280 }, { "epoch": 4.731502504775291, "grad_norm": 0.22427648305892944, "learning_rate": 3.818131919752277e-07, "loss": 0.3685, "step": 131285 }, { "epoch": 4.7316827044365155, "grad_norm": 0.27016377449035645, "learning_rate": 3.8130530427593626e-07, "loss": 0.3812, "step": 131290 }, { "epoch": 4.73186290409774, "grad_norm": 0.2747372090816498, "learning_rate": 3.8079775200159096e-07, "loss": 0.3558, "step": 131295 }, { "epoch": 4.732043103758965, "grad_norm": 0.2099730223417282, "learning_rate": 3.802905351591113e-07, "loss": 0.3624, "step": 131300 }, { "epoch": 4.73222330342019, "grad_norm": 0.2557778060436249, "learning_rate": 3.797836537554056e-07, "loss": 0.3553, "step": 131305 }, { "epoch": 4.732403503081414, "grad_norm": 0.26313820481300354, "learning_rate": 3.792771077973795e-07, "loss": 0.3633, "step": 131310 }, { "epoch": 4.732583702742639, "grad_norm": 0.25919729471206665, "learning_rate": 3.787708972919385e-07, "loss": 0.4063, "step": 131315 }, { "epoch": 4.732763902403864, "grad_norm": 0.26296132802963257, "learning_rate": 3.7826502224597716e-07, "loss": 0.3512, "step": 131320 }, { "epoch": 4.732944102065088, "grad_norm": 0.23905587196350098, "learning_rate": 3.7775948266638717e-07, "loss": 0.389, "step": 131325 }, { "epoch": 4.7331243017263125, "grad_norm": 0.21689580380916595, "learning_rate": 3.7725427856005747e-07, "loss": 0.3284, "step": 131330 }, { "epoch": 4.733304501387537, "grad_norm": 0.28716617822647095, "learning_rate": 3.767494099338714e-07, "loss": 0.3821, "step": 
131335 }, { "epoch": 4.733484701048762, "grad_norm": 0.21327391266822815, "learning_rate": 3.762448767947069e-07, "loss": 0.3663, "step": 131340 }, { "epoch": 4.733664900709987, "grad_norm": 0.2822578251361847, "learning_rate": 3.757406791494389e-07, "loss": 0.3799, "step": 131345 }, { "epoch": 4.733845100371211, "grad_norm": 0.23147007822990417, "learning_rate": 3.752368170049342e-07, "loss": 0.3403, "step": 131350 }, { "epoch": 4.734025300032436, "grad_norm": 0.2217051088809967, "learning_rate": 3.747332903680623e-07, "loss": 0.3961, "step": 131355 }, { "epoch": 4.734205499693661, "grad_norm": 0.23157884180545807, "learning_rate": 3.742300992456843e-07, "loss": 0.3447, "step": 131360 }, { "epoch": 4.734385699354885, "grad_norm": 0.3138419985771179, "learning_rate": 3.737272436446504e-07, "loss": 0.4007, "step": 131365 }, { "epoch": 4.7345658990161095, "grad_norm": 0.2702825963497162, "learning_rate": 3.7322472357181615e-07, "loss": 0.3663, "step": 131370 }, { "epoch": 4.734746098677334, "grad_norm": 0.22164510190486908, "learning_rate": 3.7272253903402607e-07, "loss": 0.328, "step": 131375 }, { "epoch": 4.734926298338559, "grad_norm": 0.267595499753952, "learning_rate": 3.722206900381248e-07, "loss": 0.368, "step": 131380 }, { "epoch": 4.735106497999784, "grad_norm": 0.2832372486591339, "learning_rate": 3.7171917659094834e-07, "loss": 0.3878, "step": 131385 }, { "epoch": 4.735286697661008, "grad_norm": 0.22526337206363678, "learning_rate": 3.7121799869933026e-07, "loss": 0.3577, "step": 131390 }, { "epoch": 4.735466897322233, "grad_norm": 0.3088375926017761, "learning_rate": 3.707171563700956e-07, "loss": 0.4084, "step": 131395 }, { "epoch": 4.735647096983458, "grad_norm": 0.3013952374458313, "learning_rate": 3.702166496100723e-07, "loss": 0.3899, "step": 131400 }, { "epoch": 4.735827296644683, "grad_norm": 0.24282796680927277, "learning_rate": 3.697164784260826e-07, "loss": 0.3846, "step": 131405 }, { "epoch": 4.736007496305907, "grad_norm": 0.21715568006038666, 
"learning_rate": 3.6921664282493493e-07, "loss": 0.345, "step": 131410 }, { "epoch": 4.736187695967132, "grad_norm": 0.23071317374706268, "learning_rate": 3.687171428134406e-07, "loss": 0.3733, "step": 131415 }, { "epoch": 4.736367895628356, "grad_norm": 0.22580982744693756, "learning_rate": 3.68217978398408e-07, "loss": 0.3632, "step": 131420 }, { "epoch": 4.736548095289581, "grad_norm": 0.2745913863182068, "learning_rate": 3.6771914958663723e-07, "loss": 0.3948, "step": 131425 }, { "epoch": 4.736728294950805, "grad_norm": 0.22211065888404846, "learning_rate": 3.6722065638492287e-07, "loss": 0.3529, "step": 131430 }, { "epoch": 4.73690849461203, "grad_norm": 0.2897282838821411, "learning_rate": 3.667224988000567e-07, "loss": 0.3725, "step": 131435 }, { "epoch": 4.737088694273255, "grad_norm": 0.2873408794403076, "learning_rate": 3.6622467683882777e-07, "loss": 0.3692, "step": 131440 }, { "epoch": 4.73726889393448, "grad_norm": 0.24957486987113953, "learning_rate": 3.657271905080195e-07, "loss": 0.3915, "step": 131445 }, { "epoch": 4.737449093595704, "grad_norm": 0.24464282393455505, "learning_rate": 3.652300398144098e-07, "loss": 0.3876, "step": 131450 }, { "epoch": 4.737629293256929, "grad_norm": 0.3088054358959198, "learning_rate": 3.647332247647711e-07, "loss": 0.3674, "step": 131455 }, { "epoch": 4.737809492918153, "grad_norm": 0.26032140851020813, "learning_rate": 3.6423674536587015e-07, "loss": 0.387, "step": 131460 }, { "epoch": 4.737989692579378, "grad_norm": 0.2570837736129761, "learning_rate": 3.6374060162447653e-07, "loss": 0.3837, "step": 131465 }, { "epoch": 4.738169892240602, "grad_norm": 0.19201694428920746, "learning_rate": 3.6324479354734595e-07, "loss": 0.3706, "step": 131470 }, { "epoch": 4.738350091901827, "grad_norm": 0.28280356526374817, "learning_rate": 3.627493211412342e-07, "loss": 0.4057, "step": 131475 }, { "epoch": 4.738530291563052, "grad_norm": 0.26518356800079346, "learning_rate": 3.6225418441289684e-07, "loss": 0.4072, "step": 
131480 }, { "epoch": 4.738710491224277, "grad_norm": 0.261747270822525, "learning_rate": 3.61759383369073e-07, "loss": 0.3822, "step": 131485 }, { "epoch": 4.738890690885501, "grad_norm": 0.2915024757385254, "learning_rate": 3.612649180165101e-07, "loss": 0.3657, "step": 131490 }, { "epoch": 4.739070890546726, "grad_norm": 0.2875518202781677, "learning_rate": 3.607707883619388e-07, "loss": 0.3783, "step": 131495 }, { "epoch": 4.739251090207951, "grad_norm": 0.2599601745605469, "learning_rate": 3.6027699441209815e-07, "loss": 0.3707, "step": 131500 }, { "epoch": 4.739251090207951, "eval_loss": 0.42884549498558044, "eval_runtime": 3.5372, "eval_samples_per_second": 28.271, "eval_steps_per_second": 7.068, "step": 131500 }, { "epoch": 4.7394312898691755, "grad_norm": 0.23225200176239014, "learning_rate": 3.5978353617371065e-07, "loss": 0.3767, "step": 131505 }, { "epoch": 4.739611489530399, "grad_norm": 0.2974475920200348, "learning_rate": 3.5929041365350413e-07, "loss": 0.3841, "step": 131510 }, { "epoch": 4.739791689191624, "grad_norm": 0.28682082891464233, "learning_rate": 3.587976268581955e-07, "loss": 0.3533, "step": 131515 }, { "epoch": 4.739971888852849, "grad_norm": 0.2352001816034317, "learning_rate": 3.5830517579449605e-07, "loss": 0.3648, "step": 131520 }, { "epoch": 4.7401520885140735, "grad_norm": 0.25316321849823, "learning_rate": 3.5781306046911986e-07, "loss": 0.3353, "step": 131525 }, { "epoch": 4.740332288175298, "grad_norm": 0.26786184310913086, "learning_rate": 3.573212808887699e-07, "loss": 0.3874, "step": 131530 }, { "epoch": 4.740512487836523, "grad_norm": 0.27805742621421814, "learning_rate": 3.568298370601436e-07, "loss": 0.4236, "step": 131535 }, { "epoch": 4.740692687497748, "grad_norm": 0.3236870765686035, "learning_rate": 3.5633872898994113e-07, "loss": 0.4053, "step": 131540 }, { "epoch": 4.7408728871589725, "grad_norm": 0.2795065641403198, "learning_rate": 3.5584795668485435e-07, "loss": 0.3755, "step": 131545 }, { "epoch": 
4.741053086820196, "grad_norm": 0.23501679301261902, "learning_rate": 3.553575201515669e-07, "loss": 0.3576, "step": 131550 }, { "epoch": 4.741233286481421, "grad_norm": 0.2597077190876007, "learning_rate": 3.5486741939676215e-07, "loss": 0.3755, "step": 131555 }, { "epoch": 4.741413486142646, "grad_norm": 0.2638082504272461, "learning_rate": 3.5437765442711545e-07, "loss": 0.3753, "step": 131560 }, { "epoch": 4.7415936858038705, "grad_norm": 0.32323339581489563, "learning_rate": 3.53888225249302e-07, "loss": 0.3974, "step": 131565 }, { "epoch": 4.741773885465095, "grad_norm": 0.26547950506210327, "learning_rate": 3.5339913186999416e-07, "loss": 0.3688, "step": 131570 }, { "epoch": 4.74195408512632, "grad_norm": 0.3076135516166687, "learning_rate": 3.5291037429584784e-07, "loss": 0.3674, "step": 131575 }, { "epoch": 4.742134284787545, "grad_norm": 0.2687076926231384, "learning_rate": 3.5242195253352426e-07, "loss": 0.335, "step": 131580 }, { "epoch": 4.7423144844487695, "grad_norm": 0.23814557492733002, "learning_rate": 3.519338665896821e-07, "loss": 0.3682, "step": 131585 }, { "epoch": 4.742494684109994, "grad_norm": 0.21820604801177979, "learning_rate": 3.514461164709687e-07, "loss": 0.3579, "step": 131590 }, { "epoch": 4.742674883771219, "grad_norm": 0.23697353899478912, "learning_rate": 3.509587021840316e-07, "loss": 0.337, "step": 131595 }, { "epoch": 4.742855083432443, "grad_norm": 0.36326366662979126, "learning_rate": 3.5047162373550433e-07, "loss": 0.4014, "step": 131600 }, { "epoch": 4.7430352830936675, "grad_norm": 0.2886950373649597, "learning_rate": 3.499848811320344e-07, "loss": 0.3733, "step": 131605 }, { "epoch": 4.743215482754892, "grad_norm": 0.2526874840259552, "learning_rate": 3.494984743802443e-07, "loss": 0.3709, "step": 131610 }, { "epoch": 4.743395682416117, "grad_norm": 0.3060406744480133, "learning_rate": 3.4901240348676756e-07, "loss": 0.368, "step": 131615 }, { "epoch": 4.743575882077342, "grad_norm": 0.23225359618663788, "learning_rate": 
3.485266684582239e-07, "loss": 0.3697, "step": 131620 }, { "epoch": 4.743756081738566, "grad_norm": 0.2793751060962677, "learning_rate": 3.4804126930123026e-07, "loss": 0.3779, "step": 131625 }, { "epoch": 4.743936281399791, "grad_norm": 0.25954195857048035, "learning_rate": 3.4755620602240357e-07, "loss": 0.3558, "step": 131630 }, { "epoch": 4.744116481061016, "grad_norm": 0.224226713180542, "learning_rate": 3.4707147862834965e-07, "loss": 0.3742, "step": 131635 }, { "epoch": 4.74429668072224, "grad_norm": 0.22823956608772278, "learning_rate": 3.4658708712567433e-07, "loss": 0.3546, "step": 131640 }, { "epoch": 4.7444768803834645, "grad_norm": 0.27476179599761963, "learning_rate": 3.461030315209779e-07, "loss": 0.3922, "step": 131645 }, { "epoch": 4.744657080044689, "grad_norm": 0.26549628376960754, "learning_rate": 3.4561931182085225e-07, "loss": 0.3772, "step": 131650 }, { "epoch": 4.744837279705914, "grad_norm": 0.29317188262939453, "learning_rate": 3.4513592803188944e-07, "loss": 0.3897, "step": 131655 }, { "epoch": 4.745017479367139, "grad_norm": 0.2582772970199585, "learning_rate": 3.446528801606785e-07, "loss": 0.3766, "step": 131660 }, { "epoch": 4.745197679028363, "grad_norm": 0.2573517858982086, "learning_rate": 3.4417016821379765e-07, "loss": 0.3822, "step": 131665 }, { "epoch": 4.745377878689588, "grad_norm": 0.2170310914516449, "learning_rate": 3.436877921978249e-07, "loss": 0.3824, "step": 131670 }, { "epoch": 4.745558078350813, "grad_norm": 0.2496137022972107, "learning_rate": 3.432057521193327e-07, "loss": 0.3849, "step": 131675 }, { "epoch": 4.745738278012038, "grad_norm": 0.285085529088974, "learning_rate": 3.427240479848881e-07, "loss": 0.3786, "step": 131680 }, { "epoch": 4.745918477673262, "grad_norm": 0.2879694104194641, "learning_rate": 3.4224267980105807e-07, "loss": 0.3835, "step": 131685 }, { "epoch": 4.746098677334487, "grad_norm": 0.26048213243484497, "learning_rate": 3.417616475743929e-07, "loss": 0.3415, "step": 131690 }, { "epoch": 
4.746278876995711, "grad_norm": 0.29208385944366455, "learning_rate": 3.412809513114512e-07, "loss": 0.362, "step": 131695 }, { "epoch": 4.746459076656936, "grad_norm": 0.23016516864299774, "learning_rate": 3.4080059101878614e-07, "loss": 0.3496, "step": 131700 }, { "epoch": 4.74663927631816, "grad_norm": 0.22892868518829346, "learning_rate": 3.403205667029369e-07, "loss": 0.37, "step": 131705 }, { "epoch": 4.746819475979385, "grad_norm": 0.2265908122062683, "learning_rate": 3.398408783704454e-07, "loss": 0.3758, "step": 131710 }, { "epoch": 4.74699967564061, "grad_norm": 0.22055870294570923, "learning_rate": 3.393615260278482e-07, "loss": 0.4055, "step": 131715 }, { "epoch": 4.747179875301835, "grad_norm": 0.22395636141300201, "learning_rate": 3.3888250968167335e-07, "loss": 0.4186, "step": 131720 }, { "epoch": 4.747360074963059, "grad_norm": 0.24537676572799683, "learning_rate": 3.3840382933845174e-07, "loss": 0.3678, "step": 131725 }, { "epoch": 4.747540274624284, "grad_norm": 0.28544852137565613, "learning_rate": 3.379254850047031e-07, "loss": 0.3729, "step": 131730 }, { "epoch": 4.747720474285508, "grad_norm": 0.2334669977426529, "learning_rate": 3.3744747668694456e-07, "loss": 0.366, "step": 131735 }, { "epoch": 4.747900673946733, "grad_norm": 0.26452332735061646, "learning_rate": 3.369698043916902e-07, "loss": 0.3512, "step": 131740 }, { "epoch": 4.748080873607957, "grad_norm": 0.2582310140132904, "learning_rate": 3.364924681254461e-07, "loss": 0.3748, "step": 131745 }, { "epoch": 4.748261073269182, "grad_norm": 0.23445448279380798, "learning_rate": 3.36015467894718e-07, "loss": 0.3843, "step": 131750 }, { "epoch": 4.748441272930407, "grad_norm": 0.25989097356796265, "learning_rate": 3.355388037060064e-07, "loss": 0.3922, "step": 131755 }, { "epoch": 4.748621472591632, "grad_norm": 0.24597260355949402, "learning_rate": 3.350624755658005e-07, "loss": 0.3349, "step": 131760 }, { "epoch": 4.748801672252856, "grad_norm": 0.2952145040035248, "learning_rate": 
3.345864834805951e-07, "loss": 0.3894, "step": 131765 }, { "epoch": 4.748981871914081, "grad_norm": 0.25986775755882263, "learning_rate": 3.341108274568683e-07, "loss": 0.3596, "step": 131770 }, { "epoch": 4.749162071575306, "grad_norm": 0.2798271179199219, "learning_rate": 3.336355075011122e-07, "loss": 0.3774, "step": 131775 }, { "epoch": 4.7493422712365305, "grad_norm": 0.25875961780548096, "learning_rate": 3.3316052361979655e-07, "loss": 0.3871, "step": 131780 }, { "epoch": 4.749522470897754, "grad_norm": 0.32598257064819336, "learning_rate": 3.326858758193885e-07, "loss": 0.394, "step": 131785 }, { "epoch": 4.749702670558979, "grad_norm": 0.2517693042755127, "learning_rate": 3.3221156410636335e-07, "loss": 0.3926, "step": 131790 }, { "epoch": 4.749882870220204, "grad_norm": 0.21797238290309906, "learning_rate": 3.3173758848717705e-07, "loss": 0.3232, "step": 131795 }, { "epoch": 4.750063069881429, "grad_norm": 0.1989823281764984, "learning_rate": 3.312639489682939e-07, "loss": 0.3844, "step": 131800 }, { "epoch": 4.750243269542653, "grad_norm": 0.22494913637638092, "learning_rate": 3.307906455561616e-07, "loss": 0.3855, "step": 131805 }, { "epoch": 4.750423469203878, "grad_norm": 0.22416415810585022, "learning_rate": 3.303176782572276e-07, "loss": 0.3918, "step": 131810 }, { "epoch": 4.750603668865103, "grad_norm": 0.2782622277736664, "learning_rate": 3.2984504707794237e-07, "loss": 0.3599, "step": 131815 }, { "epoch": 4.7507838685263275, "grad_norm": 0.23325783014297485, "learning_rate": 3.2937275202473974e-07, "loss": 0.3743, "step": 131820 }, { "epoch": 4.750964068187551, "grad_norm": 0.2581305503845215, "learning_rate": 3.2890079310405885e-07, "loss": 0.3498, "step": 131825 }, { "epoch": 4.751144267848776, "grad_norm": 0.30278897285461426, "learning_rate": 3.2842917032233076e-07, "loss": 0.3916, "step": 131830 }, { "epoch": 4.751324467510001, "grad_norm": 0.26210495829582214, "learning_rate": 3.279578836859754e-07, "loss": 0.3663, "step": 131835 }, { 
"epoch": 4.7515046671712255, "grad_norm": 0.3384738862514496, "learning_rate": 3.274869332014152e-07, "loss": 0.379, "step": 131840 }, { "epoch": 4.75168486683245, "grad_norm": 0.3079512119293213, "learning_rate": 3.27016318875073e-07, "loss": 0.4015, "step": 131845 }, { "epoch": 4.751865066493675, "grad_norm": 0.2897801995277405, "learning_rate": 3.265460407133547e-07, "loss": 0.382, "step": 131850 }, { "epoch": 4.7520452661549, "grad_norm": 0.2497911900281906, "learning_rate": 3.260760987226691e-07, "loss": 0.3617, "step": 131855 }, { "epoch": 4.7522254658161245, "grad_norm": 0.23933720588684082, "learning_rate": 3.2560649290942216e-07, "loss": 0.3815, "step": 131860 }, { "epoch": 4.752405665477349, "grad_norm": 0.24069957435131073, "learning_rate": 3.2513722328000595e-07, "loss": 0.363, "step": 131865 }, { "epoch": 4.752585865138574, "grad_norm": 0.3069649636745453, "learning_rate": 3.2466828984082096e-07, "loss": 0.3922, "step": 131870 }, { "epoch": 4.752766064799798, "grad_norm": 0.21737022697925568, "learning_rate": 3.24199692598251e-07, "loss": 0.3574, "step": 131875 }, { "epoch": 4.7529462644610225, "grad_norm": 0.22841662168502808, "learning_rate": 3.2373143155868255e-07, "loss": 0.344, "step": 131880 }, { "epoch": 4.753126464122247, "grad_norm": 0.25134187936782837, "learning_rate": 3.232635067284995e-07, "loss": 0.3917, "step": 131885 }, { "epoch": 4.753306663783472, "grad_norm": 0.21516527235507965, "learning_rate": 3.2279591811406884e-07, "loss": 0.3816, "step": 131890 }, { "epoch": 4.753486863444697, "grad_norm": 0.20590807497501373, "learning_rate": 3.223286657217717e-07, "loss": 0.3616, "step": 131895 }, { "epoch": 4.7536670631059215, "grad_norm": 0.2603895962238312, "learning_rate": 3.21861749557964e-07, "loss": 0.3731, "step": 131900 }, { "epoch": 4.753847262767146, "grad_norm": 0.2670796513557434, "learning_rate": 3.2139516962901297e-07, "loss": 0.3868, "step": 131905 }, { "epoch": 4.754027462428371, "grad_norm": 0.2581007480621338, 
"learning_rate": 3.2092892594127454e-07, "loss": 0.382, "step": 131910 }, { "epoch": 4.754207662089595, "grad_norm": 0.29957446455955505, "learning_rate": 3.2046301850110484e-07, "loss": 0.3801, "step": 131915 }, { "epoch": 4.7543878617508195, "grad_norm": 0.21832120418548584, "learning_rate": 3.199974473148459e-07, "loss": 0.3558, "step": 131920 }, { "epoch": 4.754568061412044, "grad_norm": 0.3172387182712555, "learning_rate": 3.195322123888428e-07, "loss": 0.3774, "step": 131925 }, { "epoch": 4.754748261073269, "grad_norm": 0.2534702718257904, "learning_rate": 3.1906731372943476e-07, "loss": 0.386, "step": 131930 }, { "epoch": 4.754928460734494, "grad_norm": 0.21878264844417572, "learning_rate": 3.186027513429585e-07, "loss": 0.3593, "step": 131935 }, { "epoch": 4.7551086603957184, "grad_norm": 0.2851618528366089, "learning_rate": 3.181385252357394e-07, "loss": 0.4047, "step": 131940 }, { "epoch": 4.755288860056943, "grad_norm": 0.24602822959423065, "learning_rate": 3.176746354141058e-07, "loss": 0.3892, "step": 131945 }, { "epoch": 4.755469059718168, "grad_norm": 0.25003623962402344, "learning_rate": 3.172110818843749e-07, "loss": 0.3674, "step": 131950 }, { "epoch": 4.755649259379393, "grad_norm": 0.282927930355072, "learning_rate": 3.167478646528638e-07, "loss": 0.3198, "step": 131955 }, { "epoch": 4.755829459040617, "grad_norm": 0.23673711717128754, "learning_rate": 3.162849837258869e-07, "loss": 0.3578, "step": 131960 }, { "epoch": 4.756009658701842, "grad_norm": 0.2531015872955322, "learning_rate": 3.158224391097503e-07, "loss": 0.3597, "step": 131965 }, { "epoch": 4.756189858363066, "grad_norm": 0.25352203845977783, "learning_rate": 3.1536023081074894e-07, "loss": 0.4158, "step": 131970 }, { "epoch": 4.756370058024291, "grad_norm": 0.278499037027359, "learning_rate": 3.1489835883518615e-07, "loss": 0.3489, "step": 131975 }, { "epoch": 4.756550257685515, "grad_norm": 0.27848923206329346, "learning_rate": 3.144368231893541e-07, "loss": 0.4042, "step": 131980 
}, { "epoch": 4.75673045734674, "grad_norm": 0.25936684012413025, "learning_rate": 3.1397562387954214e-07, "loss": 0.3477, "step": 131985 }, { "epoch": 4.756910657007965, "grad_norm": 0.21595986187458038, "learning_rate": 3.1351476091203427e-07, "loss": 0.3695, "step": 131990 }, { "epoch": 4.75709085666919, "grad_norm": 0.2597789466381073, "learning_rate": 3.1305423429310587e-07, "loss": 0.3714, "step": 131995 }, { "epoch": 4.757271056330414, "grad_norm": 0.22275707125663757, "learning_rate": 3.1259404402903536e-07, "loss": 0.3491, "step": 132000 }, { "epoch": 4.757271056330414, "eval_loss": 0.4288484454154968, "eval_runtime": 3.5292, "eval_samples_per_second": 28.335, "eval_steps_per_second": 7.084, "step": 132000 }, { "epoch": 4.757451255991639, "grad_norm": 0.25196489691734314, "learning_rate": 3.121341901260899e-07, "loss": 0.3829, "step": 132005 }, { "epoch": 4.757631455652863, "grad_norm": 0.21652761101722717, "learning_rate": 3.1167467259053675e-07, "loss": 0.3318, "step": 132010 }, { "epoch": 4.757811655314088, "grad_norm": 0.2428998500108719, "learning_rate": 3.112154914286375e-07, "loss": 0.357, "step": 132015 }, { "epoch": 4.757991854975312, "grad_norm": 0.2448771446943283, "learning_rate": 3.1075664664664827e-07, "loss": 0.3707, "step": 132020 }, { "epoch": 4.758172054636537, "grad_norm": 0.2827889621257782, "learning_rate": 3.1029813825081687e-07, "loss": 0.393, "step": 132025 }, { "epoch": 4.758352254297762, "grad_norm": 0.21509301662445068, "learning_rate": 3.098399662473939e-07, "loss": 0.3583, "step": 132030 }, { "epoch": 4.758532453958987, "grad_norm": 0.23091164231300354, "learning_rate": 3.0938213064262143e-07, "loss": 0.3593, "step": 132035 }, { "epoch": 4.758712653620211, "grad_norm": 0.3308013677597046, "learning_rate": 3.089246314427391e-07, "loss": 0.383, "step": 132040 }, { "epoch": 4.758892853281436, "grad_norm": 0.2075890302658081, "learning_rate": 3.084674686539779e-07, "loss": 0.3666, "step": 132045 }, { "epoch": 4.759073052942661, 
"grad_norm": 0.2468065321445465, "learning_rate": 3.080106422825635e-07, "loss": 0.3804, "step": 132050 }, { "epoch": 4.7592532526038855, "grad_norm": 0.24998079240322113, "learning_rate": 3.0755415233472693e-07, "loss": 0.4156, "step": 132055 }, { "epoch": 4.759433452265109, "grad_norm": 0.23068219423294067, "learning_rate": 3.0709799881668554e-07, "loss": 0.3924, "step": 132060 }, { "epoch": 4.759613651926334, "grad_norm": 0.2239476889371872, "learning_rate": 3.066421817346482e-07, "loss": 0.3956, "step": 132065 }, { "epoch": 4.759793851587559, "grad_norm": 0.2508222162723541, "learning_rate": 3.0618670109483494e-07, "loss": 0.3624, "step": 132070 }, { "epoch": 4.759974051248784, "grad_norm": 0.21215388178825378, "learning_rate": 3.0573155690344355e-07, "loss": 0.3786, "step": 132075 }, { "epoch": 4.760154250910008, "grad_norm": 0.23385365307331085, "learning_rate": 3.05276749166683e-07, "loss": 0.3711, "step": 132080 }, { "epoch": 4.760334450571233, "grad_norm": 0.26072898507118225, "learning_rate": 3.048222778907428e-07, "loss": 0.3787, "step": 132085 }, { "epoch": 4.760514650232458, "grad_norm": 0.27673956751823425, "learning_rate": 3.0436814308181794e-07, "loss": 0.3841, "step": 132090 }, { "epoch": 4.7606948498936825, "grad_norm": 0.2762385308742523, "learning_rate": 3.0391434474609516e-07, "loss": 0.3801, "step": 132095 }, { "epoch": 4.760875049554906, "grad_norm": 0.27955085039138794, "learning_rate": 3.0346088288976117e-07, "loss": 0.3601, "step": 132100 }, { "epoch": 4.761055249216131, "grad_norm": 0.2552419900894165, "learning_rate": 3.030077575189888e-07, "loss": 0.363, "step": 132105 }, { "epoch": 4.761235448877356, "grad_norm": 0.30773216485977173, "learning_rate": 3.0255496863995646e-07, "loss": 0.3942, "step": 132110 }, { "epoch": 4.761415648538581, "grad_norm": 0.22618763148784637, "learning_rate": 3.021025162588259e-07, "loss": 0.391, "step": 132115 }, { "epoch": 4.761595848199805, "grad_norm": 0.2810955047607422, "learning_rate": 
3.016504003817727e-07, "loss": 0.3841, "step": 132120 }, { "epoch": 4.76177604786103, "grad_norm": 0.25989750027656555, "learning_rate": 3.0119862101494754e-07, "loss": 0.3548, "step": 132125 }, { "epoch": 4.761956247522255, "grad_norm": 0.2658855617046356, "learning_rate": 3.0074717816451214e-07, "loss": 0.3629, "step": 132130 }, { "epoch": 4.7621364471834795, "grad_norm": 0.27331972122192383, "learning_rate": 3.002960718366116e-07, "loss": 0.3689, "step": 132135 }, { "epoch": 4.762316646844704, "grad_norm": 0.22226150333881378, "learning_rate": 2.998453020373965e-07, "loss": 0.4006, "step": 132140 }, { "epoch": 4.762496846505929, "grad_norm": 0.24564702808856964, "learning_rate": 2.9939486877300636e-07, "loss": 0.3452, "step": 132145 }, { "epoch": 4.762677046167154, "grad_norm": 0.3113086521625519, "learning_rate": 2.989447720495808e-07, "loss": 0.3813, "step": 132150 }, { "epoch": 4.7628572458283776, "grad_norm": 0.24232225120067596, "learning_rate": 2.9849501187324823e-07, "loss": 0.3843, "step": 132155 }, { "epoch": 4.763037445489602, "grad_norm": 0.21868427097797394, "learning_rate": 2.980455882501398e-07, "loss": 0.3898, "step": 132160 }, { "epoch": 4.763217645150827, "grad_norm": 0.2546680271625519, "learning_rate": 2.975965011863785e-07, "loss": 0.318, "step": 132165 }, { "epoch": 4.763397844812052, "grad_norm": 0.2764042615890503, "learning_rate": 2.971477506880815e-07, "loss": 0.3723, "step": 132170 }, { "epoch": 4.7635780444732765, "grad_norm": 0.23360565304756165, "learning_rate": 2.9669933676136354e-07, "loss": 0.3803, "step": 132175 }, { "epoch": 4.763758244134501, "grad_norm": 0.258620023727417, "learning_rate": 2.9625125941233345e-07, "loss": 0.4101, "step": 132180 }, { "epoch": 4.763938443795726, "grad_norm": 0.2147550880908966, "learning_rate": 2.9580351864710034e-07, "loss": 0.362, "step": 132185 }, { "epoch": 4.764118643456951, "grad_norm": 0.2418157309293747, "learning_rate": 2.9535611447175924e-07, "loss": 0.358, "step": 132190 }, { "epoch": 
4.7642988431181745, "grad_norm": 0.26910528540611267, "learning_rate": 2.9490904689240815e-07, "loss": 0.4377, "step": 132195 }, { "epoch": 4.764479042779399, "grad_norm": 0.26641741394996643, "learning_rate": 2.944623159151394e-07, "loss": 0.3293, "step": 132200 }, { "epoch": 4.764659242440624, "grad_norm": 0.2724882960319519, "learning_rate": 2.9401592154603694e-07, "loss": 0.3949, "step": 132205 }, { "epoch": 4.764839442101849, "grad_norm": 0.245937317609787, "learning_rate": 2.935698637911849e-07, "loss": 0.3721, "step": 132210 }, { "epoch": 4.7650196417630735, "grad_norm": 0.25321248173713684, "learning_rate": 2.9312414265665886e-07, "loss": 0.3822, "step": 132215 }, { "epoch": 4.765199841424298, "grad_norm": 0.27613797783851624, "learning_rate": 2.9267875814853463e-07, "loss": 0.3634, "step": 132220 }, { "epoch": 4.765380041085523, "grad_norm": 0.2703777849674225, "learning_rate": 2.922337102728795e-07, "loss": 0.3705, "step": 132225 }, { "epoch": 4.765560240746748, "grad_norm": 0.23466283082962036, "learning_rate": 2.917889990357553e-07, "loss": 0.3601, "step": 132230 }, { "epoch": 4.765740440407972, "grad_norm": 0.26712194085121155, "learning_rate": 2.9134462444322106e-07, "loss": 0.3573, "step": 132235 }, { "epoch": 4.765920640069197, "grad_norm": 0.23665735125541687, "learning_rate": 2.909005865013331e-07, "loss": 0.3711, "step": 132240 }, { "epoch": 4.766100839730421, "grad_norm": 0.26551133394241333, "learning_rate": 2.904568852161449e-07, "loss": 0.3894, "step": 132245 }, { "epoch": 4.766281039391646, "grad_norm": 0.21451817452907562, "learning_rate": 2.900135205936932e-07, "loss": 0.3891, "step": 132250 }, { "epoch": 4.7664612390528704, "grad_norm": 0.256231427192688, "learning_rate": 2.8957049264002334e-07, "loss": 0.3673, "step": 132255 }, { "epoch": 4.766641438714095, "grad_norm": 0.2589702904224396, "learning_rate": 2.89127801361172e-07, "loss": 0.4113, "step": 132260 }, { "epoch": 4.76682163837532, "grad_norm": 0.25339174270629883, 
"learning_rate": 2.886854467631733e-07, "loss": 0.3935, "step": 132265 }, { "epoch": 4.767001838036545, "grad_norm": 0.23474030196666718, "learning_rate": 2.882434288520475e-07, "loss": 0.3563, "step": 132270 }, { "epoch": 4.767182037697769, "grad_norm": 0.2822888195514679, "learning_rate": 2.8780174763382296e-07, "loss": 0.3655, "step": 132275 }, { "epoch": 4.767362237358994, "grad_norm": 0.32862353324890137, "learning_rate": 2.8736040311451163e-07, "loss": 0.3644, "step": 132280 }, { "epoch": 4.767542437020218, "grad_norm": 0.2945389151573181, "learning_rate": 2.869193953001337e-07, "loss": 0.3493, "step": 132285 }, { "epoch": 4.767722636681443, "grad_norm": 0.20633207261562347, "learning_rate": 2.8647872419669544e-07, "loss": 0.3379, "step": 132290 }, { "epoch": 4.767902836342667, "grad_norm": 0.25659510493278503, "learning_rate": 2.8603838981019757e-07, "loss": 0.3413, "step": 132295 }, { "epoch": 4.768083036003892, "grad_norm": 0.24695339798927307, "learning_rate": 2.8559839214664094e-07, "loss": 0.3909, "step": 132300 }, { "epoch": 4.768263235665117, "grad_norm": 0.2526165544986725, "learning_rate": 2.8515873121202076e-07, "loss": 0.3805, "step": 132305 }, { "epoch": 4.768443435326342, "grad_norm": 0.31589943170547485, "learning_rate": 2.8471940701233216e-07, "loss": 0.4108, "step": 132310 }, { "epoch": 4.768623634987566, "grad_norm": 0.24592305719852448, "learning_rate": 2.8428041955355365e-07, "loss": 0.3881, "step": 132315 }, { "epoch": 4.768803834648791, "grad_norm": 0.30194249749183655, "learning_rate": 2.8384176884166947e-07, "loss": 0.3638, "step": 132320 }, { "epoch": 4.768984034310016, "grad_norm": 0.26434436440467834, "learning_rate": 2.834034548826553e-07, "loss": 0.3949, "step": 132325 }, { "epoch": 4.769164233971241, "grad_norm": 0.24037402868270874, "learning_rate": 2.8296547768248415e-07, "loss": 0.3672, "step": 132330 }, { "epoch": 4.769344433632464, "grad_norm": 0.2738417088985443, "learning_rate": 2.825278372471263e-07, "loss": 0.3967, 
"step": 132335 }, { "epoch": 4.769524633293689, "grad_norm": 0.2520720958709717, "learning_rate": 2.82090533582538e-07, "loss": 0.3668, "step": 132340 }, { "epoch": 4.769704832954914, "grad_norm": 0.2859429717063904, "learning_rate": 2.8165356669468126e-07, "loss": 0.3258, "step": 132345 }, { "epoch": 4.769885032616139, "grad_norm": 0.2699842154979706, "learning_rate": 2.8121693658950955e-07, "loss": 0.352, "step": 132350 }, { "epoch": 4.770065232277363, "grad_norm": 0.29695814847946167, "learning_rate": 2.807806432729682e-07, "loss": 0.3885, "step": 132355 }, { "epoch": 4.770245431938588, "grad_norm": 0.2401593029499054, "learning_rate": 2.803446867510079e-07, "loss": 0.3437, "step": 132360 }, { "epoch": 4.770425631599813, "grad_norm": 0.26505574584007263, "learning_rate": 2.7990906702956566e-07, "loss": 0.3474, "step": 132365 }, { "epoch": 4.7706058312610375, "grad_norm": 0.7576366662979126, "learning_rate": 2.7947378411457557e-07, "loss": 0.3716, "step": 132370 }, { "epoch": 4.770786030922261, "grad_norm": 0.3026728332042694, "learning_rate": 2.7903883801196906e-07, "loss": 0.342, "step": 132375 }, { "epoch": 4.770966230583486, "grad_norm": 0.3232538104057312, "learning_rate": 2.786042287276719e-07, "loss": 0.3823, "step": 132380 }, { "epoch": 4.771146430244711, "grad_norm": 0.24834728240966797, "learning_rate": 2.781699562676071e-07, "loss": 0.3594, "step": 132385 }, { "epoch": 4.771326629905936, "grad_norm": 0.23225435614585876, "learning_rate": 2.7773602063768946e-07, "loss": 0.3594, "step": 132390 }, { "epoch": 4.77150682956716, "grad_norm": 0.27175524830818176, "learning_rate": 2.773024218438308e-07, "loss": 0.3937, "step": 132395 }, { "epoch": 4.771687029228385, "grad_norm": 0.250227153301239, "learning_rate": 2.768691598919432e-07, "loss": 0.3693, "step": 132400 }, { "epoch": 4.77186722888961, "grad_norm": 0.23589470982551575, "learning_rate": 2.7643623478792456e-07, "loss": 0.369, "step": 132405 }, { "epoch": 4.7720474285508345, "grad_norm": 
0.32625946402549744, "learning_rate": 2.7600364653767584e-07, "loss": 0.3857, "step": 132410 }, { "epoch": 4.772227628212059, "grad_norm": 0.29416128993034363, "learning_rate": 2.755713951470923e-07, "loss": 0.3979, "step": 132415 }, { "epoch": 4.772407827873284, "grad_norm": 0.22298985719680786, "learning_rate": 2.7513948062205807e-07, "loss": 0.383, "step": 132420 }, { "epoch": 4.772588027534509, "grad_norm": 0.23173697292804718, "learning_rate": 2.747079029684657e-07, "loss": 0.335, "step": 132425 }, { "epoch": 4.772768227195733, "grad_norm": 0.3118121922016144, "learning_rate": 2.7427666219219104e-07, "loss": 0.411, "step": 132430 }, { "epoch": 4.772948426856957, "grad_norm": 0.23155774176120758, "learning_rate": 2.738457582991072e-07, "loss": 0.3765, "step": 132435 }, { "epoch": 4.773128626518182, "grad_norm": 0.25789666175842285, "learning_rate": 2.734151912950872e-07, "loss": 0.3977, "step": 132440 }, { "epoch": 4.773308826179407, "grad_norm": 0.24202270805835724, "learning_rate": 2.729849611859986e-07, "loss": 0.3611, "step": 132445 }, { "epoch": 4.7734890258406315, "grad_norm": 0.26847487688064575, "learning_rate": 2.7255506797770615e-07, "loss": 0.3389, "step": 132450 }, { "epoch": 4.773669225501856, "grad_norm": 0.23348930478096008, "learning_rate": 2.721255116760607e-07, "loss": 0.3824, "step": 132455 }, { "epoch": 4.773849425163081, "grad_norm": 0.23757703602313995, "learning_rate": 2.7169629228691597e-07, "loss": 0.3796, "step": 132460 }, { "epoch": 4.774029624824306, "grad_norm": 0.2990122139453888, "learning_rate": 2.7126740981612555e-07, "loss": 0.4045, "step": 132465 }, { "epoch": 4.7742098244855296, "grad_norm": 0.2594546675682068, "learning_rate": 2.708388642695237e-07, "loss": 0.3844, "step": 132470 }, { "epoch": 4.774390024146754, "grad_norm": 0.2652234435081482, "learning_rate": 2.7041065565296117e-07, "loss": 0.3808, "step": 132475 }, { "epoch": 4.774570223807979, "grad_norm": 0.22843249142169952, "learning_rate": 2.6998278397226404e-07, 
"loss": 0.3518, "step": 132480 }, { "epoch": 4.774750423469204, "grad_norm": 0.24594981968402863, "learning_rate": 2.695552492332609e-07, "loss": 0.3952, "step": 132485 }, { "epoch": 4.7749306231304285, "grad_norm": 0.20578601956367493, "learning_rate": 2.6912805144178046e-07, "loss": 0.3949, "step": 132490 }, { "epoch": 4.775110822791653, "grad_norm": 0.22884593904018402, "learning_rate": 2.6870119060364295e-07, "loss": 0.366, "step": 132495 }, { "epoch": 4.775291022452878, "grad_norm": 0.2841814458370209, "learning_rate": 2.682746667246633e-07, "loss": 0.339, "step": 132500 }, { "epoch": 4.775291022452878, "eval_loss": 0.4288337230682373, "eval_runtime": 3.5448, "eval_samples_per_second": 28.21, "eval_steps_per_second": 7.053, "step": 132500 }, { "epoch": 4.775471222114103, "grad_norm": 0.216835618019104, "learning_rate": 2.678484798106534e-07, "loss": 0.381, "step": 132505 }, { "epoch": 4.775651421775327, "grad_norm": 0.28074148297309875, "learning_rate": 2.6742262986741706e-07, "loss": 0.3725, "step": 132510 }, { "epoch": 4.775831621436552, "grad_norm": 0.3115323781967163, "learning_rate": 2.669971169007607e-07, "loss": 0.3841, "step": 132515 }, { "epoch": 4.776011821097776, "grad_norm": 0.31163284182548523, "learning_rate": 2.6657194091648243e-07, "loss": 0.3959, "step": 132520 }, { "epoch": 4.776192020759001, "grad_norm": 0.25598570704460144, "learning_rate": 2.661471019203693e-07, "loss": 0.3576, "step": 132525 }, { "epoch": 4.7763722204202255, "grad_norm": 0.25508904457092285, "learning_rate": 2.657225999182167e-07, "loss": 0.3747, "step": 132530 }, { "epoch": 4.77655242008145, "grad_norm": 0.24659277498722076, "learning_rate": 2.652984349158033e-07, "loss": 0.3652, "step": 132535 }, { "epoch": 4.776732619742675, "grad_norm": 0.2608989179134369, "learning_rate": 2.6487460691890787e-07, "loss": 0.381, "step": 132540 }, { "epoch": 4.7769128194039, "grad_norm": 0.22622261941432953, "learning_rate": 2.6445111593331187e-07, "loss": 0.3443, "step": 132545 }, { 
"epoch": 4.777093019065124, "grad_norm": 0.23941785097122192, "learning_rate": 2.640279619647773e-07, "loss": 0.3482, "step": 132550 }, { "epoch": 4.777273218726349, "grad_norm": 0.25073572993278503, "learning_rate": 2.6360514501907183e-07, "loss": 0.3629, "step": 132555 }, { "epoch": 4.777453418387573, "grad_norm": 0.2828969359397888, "learning_rate": 2.631826651019603e-07, "loss": 0.3381, "step": 132560 }, { "epoch": 4.777633618048798, "grad_norm": 0.29804477095603943, "learning_rate": 2.627605222191937e-07, "loss": 0.373, "step": 132565 }, { "epoch": 4.7778138177100224, "grad_norm": 0.21889689564704895, "learning_rate": 2.6233871637652563e-07, "loss": 0.3954, "step": 132570 }, { "epoch": 4.777994017371247, "grad_norm": 0.32417771220207214, "learning_rate": 2.619172475797044e-07, "loss": 0.4157, "step": 132575 }, { "epoch": 4.778174217032472, "grad_norm": 0.24297624826431274, "learning_rate": 2.6149611583446975e-07, "loss": 0.374, "step": 132580 }, { "epoch": 4.778354416693697, "grad_norm": 0.2629621922969818, "learning_rate": 2.6107532114656166e-07, "loss": 0.3859, "step": 132585 }, { "epoch": 4.778534616354921, "grad_norm": 0.2679719924926758, "learning_rate": 2.6065486352171153e-07, "loss": 0.366, "step": 132590 }, { "epoch": 4.778714816016146, "grad_norm": 0.25725606083869934, "learning_rate": 2.6023474296565096e-07, "loss": 0.4017, "step": 132595 }, { "epoch": 4.778895015677371, "grad_norm": 0.26632368564605713, "learning_rate": 2.598149594841004e-07, "loss": 0.412, "step": 132600 }, { "epoch": 4.779075215338596, "grad_norm": 0.20511886477470398, "learning_rate": 2.59395513082783e-07, "loss": 0.3194, "step": 132605 }, { "epoch": 4.779255414999819, "grad_norm": 0.26528292894363403, "learning_rate": 2.589764037674108e-07, "loss": 0.3529, "step": 132610 }, { "epoch": 4.779435614661044, "grad_norm": 0.2384311705827713, "learning_rate": 2.5855763154369604e-07, "loss": 0.3273, "step": 132615 }, { "epoch": 4.779615814322269, "grad_norm": 0.28175267577171326, 
"learning_rate": 2.581391964173424e-07, "loss": 0.3562, "step": 132620 }, { "epoch": 4.779796013983494, "grad_norm": 0.2850888967514038, "learning_rate": 2.5772109839405367e-07, "loss": 0.39, "step": 132625 }, { "epoch": 4.779976213644718, "grad_norm": 0.2696197032928467, "learning_rate": 2.573033374795225e-07, "loss": 0.3607, "step": 132630 }, { "epoch": 4.780156413305943, "grad_norm": 0.2529081702232361, "learning_rate": 2.5688591367944437e-07, "loss": 0.3348, "step": 132635 }, { "epoch": 4.780336612967168, "grad_norm": 0.2832430601119995, "learning_rate": 2.5646882699950634e-07, "loss": 0.3933, "step": 132640 }, { "epoch": 4.780516812628393, "grad_norm": 0.22554835677146912, "learning_rate": 2.5605207744538727e-07, "loss": 0.3706, "step": 132645 }, { "epoch": 4.780697012289616, "grad_norm": 0.24301104247570038, "learning_rate": 2.556356650227687e-07, "loss": 0.3657, "step": 132650 }, { "epoch": 4.780877211950841, "grad_norm": 0.2747708559036255, "learning_rate": 2.5521958973732383e-07, "loss": 0.3806, "step": 132655 }, { "epoch": 4.781057411612066, "grad_norm": 0.2549690008163452, "learning_rate": 2.548038515947232e-07, "loss": 0.3701, "step": 132660 }, { "epoch": 4.781237611273291, "grad_norm": 0.24783384799957275, "learning_rate": 2.5438845060062887e-07, "loss": 0.3806, "step": 132665 }, { "epoch": 4.781417810934515, "grad_norm": 0.2004653662443161, "learning_rate": 2.5397338676069746e-07, "loss": 0.3775, "step": 132670 }, { "epoch": 4.78159801059574, "grad_norm": 0.2571395933628082, "learning_rate": 2.5355866008059117e-07, "loss": 0.3793, "step": 132675 }, { "epoch": 4.781778210256965, "grad_norm": 0.2348882555961609, "learning_rate": 2.5314427056595537e-07, "loss": 0.391, "step": 132680 }, { "epoch": 4.7819584099181895, "grad_norm": 0.26653140783309937, "learning_rate": 2.527302182224384e-07, "loss": 0.3878, "step": 132685 }, { "epoch": 4.782138609579414, "grad_norm": 0.2474595308303833, "learning_rate": 2.523165030556829e-07, "loss": 0.3423, "step": 132690 
}, { "epoch": 4.782318809240639, "grad_norm": 0.2751745879650116, "learning_rate": 2.519031250713205e-07, "loss": 0.3803, "step": 132695 }, { "epoch": 4.782499008901864, "grad_norm": 0.27088475227355957, "learning_rate": 2.5149008427498567e-07, "loss": 0.3678, "step": 132700 }, { "epoch": 4.782679208563088, "grad_norm": 0.2260597050189972, "learning_rate": 2.510773806723099e-07, "loss": 0.3565, "step": 132705 }, { "epoch": 4.782859408224312, "grad_norm": 0.23380564153194427, "learning_rate": 2.5066501426891377e-07, "loss": 0.3805, "step": 132710 }, { "epoch": 4.783039607885537, "grad_norm": 0.22910314798355103, "learning_rate": 2.5025298507041494e-07, "loss": 0.325, "step": 132715 }, { "epoch": 4.783219807546762, "grad_norm": 0.2516254782676697, "learning_rate": 2.498412930824257e-07, "loss": 0.372, "step": 132720 }, { "epoch": 4.7834000072079865, "grad_norm": 0.24351219832897186, "learning_rate": 2.4942993831055803e-07, "loss": 0.3611, "step": 132725 }, { "epoch": 4.783580206869211, "grad_norm": 0.3001359701156616, "learning_rate": 2.49018920760416e-07, "loss": 0.4113, "step": 132730 }, { "epoch": 4.783760406530436, "grad_norm": 0.256946861743927, "learning_rate": 2.486082404375978e-07, "loss": 0.3766, "step": 132735 }, { "epoch": 4.783940606191661, "grad_norm": 0.25258302688598633, "learning_rate": 2.4819789734770174e-07, "loss": 0.3731, "step": 132740 }, { "epoch": 4.784120805852885, "grad_norm": 0.24291817843914032, "learning_rate": 2.4778789149631773e-07, "loss": 0.3833, "step": 132745 }, { "epoch": 4.784301005514109, "grad_norm": 0.25857844948768616, "learning_rate": 2.4737822288902756e-07, "loss": 0.3858, "step": 132750 }, { "epoch": 4.784481205175334, "grad_norm": 0.276473730802536, "learning_rate": 2.4696889153142386e-07, "loss": 0.3602, "step": 132755 }, { "epoch": 4.784661404836559, "grad_norm": 0.2200738787651062, "learning_rate": 2.465598974290717e-07, "loss": 0.3473, "step": 132760 }, { "epoch": 4.7848416044977835, "grad_norm": 0.22857971489429474, 
"learning_rate": 2.461512405875471e-07, "loss": 0.3452, "step": 132765 }, { "epoch": 4.785021804159008, "grad_norm": 0.2619878351688385, "learning_rate": 2.4574292101242347e-07, "loss": 0.3811, "step": 132770 }, { "epoch": 4.785202003820233, "grad_norm": 0.2593579888343811, "learning_rate": 2.453349387092574e-07, "loss": 0.3729, "step": 132775 }, { "epoch": 4.785382203481458, "grad_norm": 0.22074739634990692, "learning_rate": 2.449272936836111e-07, "loss": 0.3484, "step": 132780 }, { "epoch": 4.785562403142682, "grad_norm": 0.25108104944229126, "learning_rate": 2.4451998594103854e-07, "loss": 0.3631, "step": 132785 }, { "epoch": 4.785742602803907, "grad_norm": 0.2209988385438919, "learning_rate": 2.441130154870852e-07, "loss": 0.3648, "step": 132790 }, { "epoch": 4.785922802465131, "grad_norm": 0.2671109735965729, "learning_rate": 2.437063823273023e-07, "loss": 0.3766, "step": 132795 }, { "epoch": 4.786103002126356, "grad_norm": 0.25873270630836487, "learning_rate": 2.4330008646722693e-07, "loss": 0.4221, "step": 132800 }, { "epoch": 4.7862832017875805, "grad_norm": 0.24569159746170044, "learning_rate": 2.428941279123936e-07, "loss": 0.349, "step": 132805 }, { "epoch": 4.786463401448805, "grad_norm": 0.22799314558506012, "learning_rate": 2.424885066683341e-07, "loss": 0.3881, "step": 132810 }, { "epoch": 4.78664360111003, "grad_norm": 0.258485347032547, "learning_rate": 2.420832227405745e-07, "loss": 0.3969, "step": 132815 }, { "epoch": 4.786823800771255, "grad_norm": 0.208878755569458, "learning_rate": 2.416782761346409e-07, "loss": 0.361, "step": 132820 }, { "epoch": 4.787004000432479, "grad_norm": 0.2455977350473404, "learning_rate": 2.4127366685604834e-07, "loss": 0.3483, "step": 132825 }, { "epoch": 4.787184200093704, "grad_norm": 0.25862938165664673, "learning_rate": 2.408693949103036e-07, "loss": 0.3815, "step": 132830 }, { "epoch": 4.787364399754928, "grad_norm": 0.2472279667854309, "learning_rate": 2.4046546030292437e-07, "loss": 0.3725, "step": 132835 }, 
{ "epoch": 4.787544599416153, "grad_norm": 0.2628474533557892, "learning_rate": 2.400618630394064e-07, "loss": 0.3722, "step": 132840 }, { "epoch": 4.7877247990773775, "grad_norm": 0.2300945371389389, "learning_rate": 2.3965860312525344e-07, "loss": 0.3503, "step": 132845 }, { "epoch": 4.787904998738602, "grad_norm": 0.27947181463241577, "learning_rate": 2.392556805659585e-07, "loss": 0.3705, "step": 132850 }, { "epoch": 4.788085198399827, "grad_norm": 0.22649262845516205, "learning_rate": 2.3885309536700874e-07, "loss": 0.3519, "step": 132855 }, { "epoch": 4.788265398061052, "grad_norm": 0.2503747045993805, "learning_rate": 2.3845084753389426e-07, "loss": 0.3673, "step": 132860 }, { "epoch": 4.788445597722276, "grad_norm": 0.2534830570220947, "learning_rate": 2.3804893707209396e-07, "loss": 0.3949, "step": 132865 }, { "epoch": 4.788625797383501, "grad_norm": 0.2691611349582672, "learning_rate": 2.3764736398708133e-07, "loss": 0.3847, "step": 132870 }, { "epoch": 4.788805997044726, "grad_norm": 0.23171448707580566, "learning_rate": 2.3724612828432968e-07, "loss": 0.3372, "step": 132875 }, { "epoch": 4.788986196705951, "grad_norm": 0.20922648906707764, "learning_rate": 2.3684522996930137e-07, "loss": 0.3914, "step": 132880 }, { "epoch": 4.7891663963671744, "grad_norm": 0.24962696433067322, "learning_rate": 2.3644466904746697e-07, "loss": 0.3608, "step": 132885 }, { "epoch": 4.789346596028399, "grad_norm": 0.3583545982837677, "learning_rate": 2.3604444552427773e-07, "loss": 0.3603, "step": 132890 }, { "epoch": 4.789526795689624, "grad_norm": 0.26689061522483826, "learning_rate": 2.3564455940519037e-07, "loss": 0.3372, "step": 132895 }, { "epoch": 4.789706995350849, "grad_norm": 0.24085666239261627, "learning_rate": 2.3524501069565053e-07, "loss": 0.3906, "step": 132900 }, { "epoch": 4.789887195012073, "grad_norm": 0.2830328345298767, "learning_rate": 2.3484579940110385e-07, "loss": 0.3628, "step": 132905 }, { "epoch": 4.790067394673298, "grad_norm": 
0.30713436007499695, "learning_rate": 2.3444692552698488e-07, "loss": 0.3912, "step": 132910 }, { "epoch": 4.790247594334523, "grad_norm": 0.21235887706279755, "learning_rate": 2.340483890787337e-07, "loss": 0.3317, "step": 132915 }, { "epoch": 4.790427793995748, "grad_norm": 0.2915695011615753, "learning_rate": 2.3365019006177936e-07, "loss": 0.3714, "step": 132920 }, { "epoch": 4.790607993656971, "grad_norm": 0.29393500089645386, "learning_rate": 2.3325232848154522e-07, "loss": 0.3477, "step": 132925 }, { "epoch": 4.790788193318196, "grad_norm": 0.2399698942899704, "learning_rate": 2.3285480434345474e-07, "loss": 0.3784, "step": 132930 }, { "epoch": 4.790968392979421, "grad_norm": 0.25534698367118835, "learning_rate": 2.324576176529203e-07, "loss": 0.3805, "step": 132935 }, { "epoch": 4.791148592640646, "grad_norm": 0.23360496759414673, "learning_rate": 2.3206076841535696e-07, "loss": 0.3425, "step": 132940 }, { "epoch": 4.79132879230187, "grad_norm": 0.2366493046283722, "learning_rate": 2.3166425663617154e-07, "loss": 0.3638, "step": 132945 }, { "epoch": 4.791508991963095, "grad_norm": 0.23220838606357574, "learning_rate": 2.3126808232076247e-07, "loss": 0.367, "step": 132950 }, { "epoch": 4.79168919162432, "grad_norm": 0.22431600093841553, "learning_rate": 2.30872245474531e-07, "loss": 0.3759, "step": 132955 }, { "epoch": 4.791869391285545, "grad_norm": 0.24509382247924805, "learning_rate": 2.3047674610287007e-07, "loss": 0.3951, "step": 132960 }, { "epoch": 4.792049590946769, "grad_norm": 0.3701198101043701, "learning_rate": 2.3008158421116977e-07, "loss": 0.3721, "step": 132965 }, { "epoch": 4.792229790607994, "grad_norm": 0.2326425015926361, "learning_rate": 2.2968675980480913e-07, "loss": 0.3701, "step": 132970 }, { "epoch": 4.792409990269219, "grad_norm": 0.25888702273368835, "learning_rate": 2.2929227288917278e-07, "loss": 0.3776, "step": 132975 }, { "epoch": 4.792590189930443, "grad_norm": 0.2808634638786316, "learning_rate": 2.2889812346963134e-07, 
"loss": 0.3671, "step": 132980 }, { "epoch": 4.792770389591667, "grad_norm": 0.24987556040287018, "learning_rate": 2.2850431155156116e-07, "loss": 0.3775, "step": 132985 }, { "epoch": 4.792950589252892, "grad_norm": 0.3451884388923645, "learning_rate": 2.2811083714032177e-07, "loss": 0.3949, "step": 132990 }, { "epoch": 4.793130788914117, "grad_norm": 0.23816993832588196, "learning_rate": 2.2771770024127559e-07, "loss": 0.3669, "step": 132995 }, { "epoch": 4.7933109885753415, "grad_norm": 0.28144383430480957, "learning_rate": 2.2732490085977943e-07, "loss": 0.3463, "step": 133000 }, { "epoch": 4.7933109885753415, "eval_loss": 0.428801029920578, "eval_runtime": 3.5412, "eval_samples_per_second": 28.239, "eval_steps_per_second": 7.06, "step": 133000 }, { "epoch": 4.793491188236566, "grad_norm": 0.29074543714523315, "learning_rate": 2.2693243900118454e-07, "loss": 0.3782, "step": 133005 }, { "epoch": 4.793671387897791, "grad_norm": 0.25907793641090393, "learning_rate": 2.2654031467084225e-07, "loss": 0.3529, "step": 133010 }, { "epoch": 4.793851587559016, "grad_norm": 0.2522429823875427, "learning_rate": 2.261485278740899e-07, "loss": 0.3729, "step": 133015 }, { "epoch": 4.79403178722024, "grad_norm": 0.2468443363904953, "learning_rate": 2.257570786162677e-07, "loss": 0.3641, "step": 133020 }, { "epoch": 4.794211986881464, "grad_norm": 0.2643374502658844, "learning_rate": 2.2536596690270751e-07, "loss": 0.3705, "step": 133025 }, { "epoch": 4.794392186542689, "grad_norm": 0.34247177839279175, "learning_rate": 2.2497519273874113e-07, "loss": 0.3723, "step": 133030 }, { "epoch": 4.794572386203914, "grad_norm": 0.28834789991378784, "learning_rate": 2.245847561296921e-07, "loss": 0.367, "step": 133035 }, { "epoch": 4.7947525858651385, "grad_norm": 0.2463623285293579, "learning_rate": 2.241946570808756e-07, "loss": 0.3654, "step": 133040 }, { "epoch": 4.794932785526363, "grad_norm": 0.24550725519657135, "learning_rate": 2.2380489559761242e-07, "loss": 0.3917, "step": 133045 
}, { "epoch": 4.795112985187588, "grad_norm": 0.23969918489456177, "learning_rate": 2.2341547168521215e-07, "loss": 0.3357, "step": 133050 }, { "epoch": 4.795293184848813, "grad_norm": 0.1967889666557312, "learning_rate": 2.230263853489789e-07, "loss": 0.3329, "step": 133055 }, { "epoch": 4.7954733845100375, "grad_norm": 0.2610912322998047, "learning_rate": 2.2263763659421123e-07, "loss": 0.3567, "step": 133060 }, { "epoch": 4.795653584171262, "grad_norm": 0.26562637090682983, "learning_rate": 2.222492254262104e-07, "loss": 0.348, "step": 133065 }, { "epoch": 4.795833783832486, "grad_norm": 0.2973119020462036, "learning_rate": 2.2186115185026668e-07, "loss": 0.4043, "step": 133070 }, { "epoch": 4.796013983493711, "grad_norm": 0.28032979369163513, "learning_rate": 2.2147341587166748e-07, "loss": 0.3818, "step": 133075 }, { "epoch": 4.7961941831549355, "grad_norm": 0.2844066917896271, "learning_rate": 2.2108601749569467e-07, "loss": 0.3509, "step": 133080 }, { "epoch": 4.79637438281616, "grad_norm": 0.25133219361305237, "learning_rate": 2.206989567276302e-07, "loss": 0.3783, "step": 133085 }, { "epoch": 4.796554582477385, "grad_norm": 0.2511965036392212, "learning_rate": 2.2031223357274477e-07, "loss": 0.3864, "step": 133090 }, { "epoch": 4.79673478213861, "grad_norm": 0.27423515915870667, "learning_rate": 2.1992584803630368e-07, "loss": 0.3573, "step": 133095 }, { "epoch": 4.796914981799834, "grad_norm": 0.24834780395030975, "learning_rate": 2.1953980012357767e-07, "loss": 0.3601, "step": 133100 }, { "epoch": 4.797095181461059, "grad_norm": 0.20566605031490326, "learning_rate": 2.1915408983982643e-07, "loss": 0.3793, "step": 133105 }, { "epoch": 4.797275381122283, "grad_norm": 0.30788904428482056, "learning_rate": 2.1876871719030135e-07, "loss": 0.3647, "step": 133110 }, { "epoch": 4.797455580783508, "grad_norm": 0.2940426766872406, "learning_rate": 2.1838368218025374e-07, "loss": 0.3761, "step": 133115 }, { "epoch": 4.7976357804447325, "grad_norm": 
0.24170757830142975, "learning_rate": 2.1799898481493219e-07, "loss": 0.3499, "step": 133120 }, { "epoch": 4.797815980105957, "grad_norm": 0.19300325214862823, "learning_rate": 2.1761462509957698e-07, "loss": 0.3965, "step": 133125 }, { "epoch": 4.797996179767182, "grad_norm": 0.2832929491996765, "learning_rate": 2.1723060303942278e-07, "loss": 0.3731, "step": 133130 }, { "epoch": 4.798176379428407, "grad_norm": 0.2704760730266571, "learning_rate": 2.1684691863970152e-07, "loss": 0.3728, "step": 133135 }, { "epoch": 4.798356579089631, "grad_norm": 0.2666027247905731, "learning_rate": 2.1646357190564514e-07, "loss": 0.3435, "step": 133140 }, { "epoch": 4.798536778750856, "grad_norm": 0.2326856553554535, "learning_rate": 2.1608056284247725e-07, "loss": 0.3331, "step": 133145 }, { "epoch": 4.798716978412081, "grad_norm": 0.23978599905967712, "learning_rate": 2.1569789145541031e-07, "loss": 0.3723, "step": 133150 }, { "epoch": 4.798897178073306, "grad_norm": 0.2229306995868683, "learning_rate": 2.1531555774966238e-07, "loss": 0.3922, "step": 133155 }, { "epoch": 4.79907737773453, "grad_norm": 0.26587215065956116, "learning_rate": 2.1493356173043765e-07, "loss": 0.3805, "step": 133160 }, { "epoch": 4.799257577395754, "grad_norm": 0.21258600056171417, "learning_rate": 2.1455190340294863e-07, "loss": 0.3379, "step": 133165 }, { "epoch": 4.799437777056979, "grad_norm": 0.2616501450538635, "learning_rate": 2.141705827723911e-07, "loss": 0.3754, "step": 133170 }, { "epoch": 4.799617976718204, "grad_norm": 0.21897155046463013, "learning_rate": 2.1378959984395818e-07, "loss": 0.3835, "step": 133175 }, { "epoch": 4.799798176379428, "grad_norm": 0.26772749423980713, "learning_rate": 2.1340895462284571e-07, "loss": 0.3344, "step": 133180 }, { "epoch": 4.799978376040653, "grad_norm": 0.3039627969264984, "learning_rate": 2.1302864711423564e-07, "loss": 0.4117, "step": 133185 }, { "epoch": 4.800158575701878, "grad_norm": 0.27932974696159363, "learning_rate": 2.1264867732331272e-07, 
"loss": 0.3639, "step": 133190 }, { "epoch": 4.800338775363103, "grad_norm": 0.23750893771648407, "learning_rate": 2.1226904525525336e-07, "loss": 0.3618, "step": 133195 }, { "epoch": 4.800518975024327, "grad_norm": 0.26062673330307007, "learning_rate": 2.1188975091522568e-07, "loss": 0.372, "step": 133200 }, { "epoch": 4.800699174685551, "grad_norm": 0.250861257314682, "learning_rate": 2.1151079430840325e-07, "loss": 0.393, "step": 133205 }, { "epoch": 4.800879374346776, "grad_norm": 0.2910214364528656, "learning_rate": 2.111321754399487e-07, "loss": 0.3412, "step": 133210 }, { "epoch": 4.801059574008001, "grad_norm": 0.23345068097114563, "learning_rate": 2.1075389431501613e-07, "loss": 0.3914, "step": 133215 }, { "epoch": 4.801239773669225, "grad_norm": 0.22395312786102295, "learning_rate": 2.1037595093876538e-07, "loss": 0.3657, "step": 133220 }, { "epoch": 4.80141997333045, "grad_norm": 0.21458613872528076, "learning_rate": 2.0999834531633955e-07, "loss": 0.3573, "step": 133225 }, { "epoch": 4.801600172991675, "grad_norm": 0.2688688635826111, "learning_rate": 2.0962107745288727e-07, "loss": 0.3631, "step": 133230 }, { "epoch": 4.8017803726529, "grad_norm": 0.2928563952445984, "learning_rate": 2.0924414735355168e-07, "loss": 0.3654, "step": 133235 }, { "epoch": 4.801960572314124, "grad_norm": 0.360637903213501, "learning_rate": 2.08867555023462e-07, "loss": 0.3995, "step": 133240 }, { "epoch": 4.802140771975349, "grad_norm": 0.3004918694496155, "learning_rate": 2.08491300467753e-07, "loss": 0.3326, "step": 133245 }, { "epoch": 4.802320971636574, "grad_norm": 0.25441572070121765, "learning_rate": 2.0811538369155115e-07, "loss": 0.4041, "step": 133250 }, { "epoch": 4.802501171297798, "grad_norm": 0.22652685642242432, "learning_rate": 2.0773980469997456e-07, "loss": 0.3531, "step": 133255 }, { "epoch": 4.802681370959022, "grad_norm": 0.2670671343803406, "learning_rate": 2.0736456349814416e-07, "loss": 0.3764, "step": 133260 }, { "epoch": 4.802861570620247, 
"grad_norm": 0.2373109608888626, "learning_rate": 2.069896600911725e-07, "loss": 0.3733, "step": 133265 }, { "epoch": 4.803041770281472, "grad_norm": 0.2623651921749115, "learning_rate": 2.066150944841666e-07, "loss": 0.3785, "step": 133270 }, { "epoch": 4.803221969942697, "grad_norm": 0.26846858859062195, "learning_rate": 2.0624086668223074e-07, "loss": 0.366, "step": 133275 }, { "epoch": 4.803402169603921, "grad_norm": 0.2998198866844177, "learning_rate": 2.0586697669046083e-07, "loss": 0.399, "step": 133280 }, { "epoch": 4.803582369265146, "grad_norm": 0.29226166009902954, "learning_rate": 2.054934245139528e-07, "loss": 0.3444, "step": 133285 }, { "epoch": 4.803762568926371, "grad_norm": 0.24952854216098785, "learning_rate": 2.0512021015779704e-07, "loss": 0.3419, "step": 133290 }, { "epoch": 4.803942768587595, "grad_norm": 0.28424960374832153, "learning_rate": 2.0474733362708109e-07, "loss": 0.3488, "step": 133295 }, { "epoch": 4.804122968248819, "grad_norm": 0.3396574854850769, "learning_rate": 2.0437479492687872e-07, "loss": 0.3507, "step": 133300 }, { "epoch": 4.804303167910044, "grad_norm": 0.23657868802547455, "learning_rate": 2.040025940622692e-07, "loss": 0.3767, "step": 133305 }, { "epoch": 4.804483367571269, "grad_norm": 0.24988289177417755, "learning_rate": 2.0363073103832619e-07, "loss": 0.3637, "step": 133310 }, { "epoch": 4.8046635672324935, "grad_norm": 0.26382821798324585, "learning_rate": 2.032592058601124e-07, "loss": 0.3908, "step": 133315 }, { "epoch": 4.804843766893718, "grad_norm": 0.24887283146381378, "learning_rate": 2.0288801853269036e-07, "loss": 0.4073, "step": 133320 }, { "epoch": 4.805023966554943, "grad_norm": 0.27872660756111145, "learning_rate": 2.0251716906111718e-07, "loss": 0.3677, "step": 133325 }, { "epoch": 4.805204166216168, "grad_norm": 0.22657455503940582, "learning_rate": 2.0214665745044714e-07, "loss": 0.3901, "step": 133330 }, { "epoch": 4.8053843658773925, "grad_norm": 0.22239211201667786, "learning_rate": 
2.0177648370573176e-07, "loss": 0.3766, "step": 133335 }, { "epoch": 4.805564565538617, "grad_norm": 0.28701508045196533, "learning_rate": 2.014066478320087e-07, "loss": 0.4004, "step": 133340 }, { "epoch": 4.805744765199841, "grad_norm": 0.31527572870254517, "learning_rate": 2.0103714983431553e-07, "loss": 0.3964, "step": 133345 }, { "epoch": 4.805924964861066, "grad_norm": 0.27578893303871155, "learning_rate": 2.0066798971769273e-07, "loss": 0.3874, "step": 133350 }, { "epoch": 4.8061051645222905, "grad_norm": 0.25750282406806946, "learning_rate": 2.0029916748716683e-07, "loss": 0.3622, "step": 133355 }, { "epoch": 4.806285364183515, "grad_norm": 0.2739262580871582, "learning_rate": 1.999306831477643e-07, "loss": 0.3426, "step": 133360 }, { "epoch": 4.80646556384474, "grad_norm": 0.25903549790382385, "learning_rate": 1.995625367045062e-07, "loss": 0.3852, "step": 133365 }, { "epoch": 4.806645763505965, "grad_norm": 0.19403640925884247, "learning_rate": 1.9919472816240237e-07, "loss": 0.3446, "step": 133370 }, { "epoch": 4.8068259631671895, "grad_norm": 0.22920994460582733, "learning_rate": 1.9882725752647102e-07, "loss": 0.3628, "step": 133375 }, { "epoch": 4.807006162828414, "grad_norm": 0.24877864122390747, "learning_rate": 1.984601248017165e-07, "loss": 0.3912, "step": 133380 }, { "epoch": 4.807186362489638, "grad_norm": 0.22861328721046448, "learning_rate": 1.9809332999314311e-07, "loss": 0.3505, "step": 133385 }, { "epoch": 4.807366562150863, "grad_norm": 0.2941596806049347, "learning_rate": 1.9772687310574412e-07, "loss": 0.3911, "step": 133390 }, { "epoch": 4.8075467618120875, "grad_norm": 0.24592210352420807, "learning_rate": 1.973607541445155e-07, "loss": 0.4009, "step": 133395 }, { "epoch": 4.807726961473312, "grad_norm": 0.26915445923805237, "learning_rate": 1.9699497311444493e-07, "loss": 0.381, "step": 133400 }, { "epoch": 4.807907161134537, "grad_norm": 0.2586546838283539, "learning_rate": 1.9662953002051455e-07, "loss": 0.3685, "step": 133405 }, { 
"epoch": 4.808087360795762, "grad_norm": 0.21903952956199646, "learning_rate": 1.9626442486770647e-07, "loss": 0.4036, "step": 133410 }, { "epoch": 4.8082675604569864, "grad_norm": 0.2535596191883087, "learning_rate": 1.9589965766099172e-07, "loss": 0.3819, "step": 133415 }, { "epoch": 4.808447760118211, "grad_norm": 0.28337281942367554, "learning_rate": 1.9553522840534133e-07, "loss": 0.3731, "step": 133420 }, { "epoch": 4.808627959779436, "grad_norm": 0.2324288785457611, "learning_rate": 1.9517113710572354e-07, "loss": 0.3678, "step": 133425 }, { "epoch": 4.808808159440661, "grad_norm": 0.31472426652908325, "learning_rate": 1.948073837670955e-07, "loss": 0.3801, "step": 133430 }, { "epoch": 4.808988359101885, "grad_norm": 0.2125542312860489, "learning_rate": 1.9444396839441436e-07, "loss": 0.3243, "step": 133435 }, { "epoch": 4.809168558763109, "grad_norm": 0.27946358919143677, "learning_rate": 1.9408089099263172e-07, "loss": 0.3759, "step": 133440 }, { "epoch": 4.809348758424334, "grad_norm": 0.2657468914985657, "learning_rate": 1.9371815156669358e-07, "loss": 0.385, "step": 133445 }, { "epoch": 4.809528958085559, "grad_norm": 0.31190598011016846, "learning_rate": 1.9335575012154327e-07, "loss": 0.4146, "step": 133450 }, { "epoch": 4.809709157746783, "grad_norm": 0.2902090549468994, "learning_rate": 1.9299368666211847e-07, "loss": 0.3892, "step": 133455 }, { "epoch": 4.809889357408008, "grad_norm": 0.27345964312553406, "learning_rate": 1.9263196119335413e-07, "loss": 0.3975, "step": 133460 }, { "epoch": 4.810069557069233, "grad_norm": 0.2101958841085434, "learning_rate": 1.9227057372017132e-07, "loss": 0.3399, "step": 133465 }, { "epoch": 4.810249756730458, "grad_norm": 0.24289266765117645, "learning_rate": 1.9190952424750496e-07, "loss": 0.3661, "step": 133470 }, { "epoch": 4.810429956391682, "grad_norm": 0.27102965116500854, "learning_rate": 1.9154881278026504e-07, "loss": 0.3835, "step": 133475 }, { "epoch": 4.810610156052906, "grad_norm": 0.2354983687400818, 
"learning_rate": 1.9118843932336982e-07, "loss": 0.3508, "step": 133480 }, { "epoch": 4.810790355714131, "grad_norm": 0.2682526707649231, "learning_rate": 1.908284038817293e-07, "loss": 0.3447, "step": 133485 }, { "epoch": 4.810970555375356, "grad_norm": 0.2339244782924652, "learning_rate": 1.9046870646024785e-07, "loss": 0.373, "step": 133490 }, { "epoch": 4.81115075503658, "grad_norm": 0.2050226777791977, "learning_rate": 1.901093470638271e-07, "loss": 0.368, "step": 133495 }, { "epoch": 4.811330954697805, "grad_norm": 0.21445462107658386, "learning_rate": 1.8975032569736318e-07, "loss": 0.3391, "step": 133500 }, { "epoch": 4.811330954697805, "eval_loss": 0.4288221001625061, "eval_runtime": 3.5381, "eval_samples_per_second": 28.264, "eval_steps_per_second": 7.066, "step": 133500 }, { "epoch": 4.81151115435903, "grad_norm": 0.23659485578536987, "learning_rate": 1.8939164236574658e-07, "loss": 0.3578, "step": 133505 }, { "epoch": 4.811691354020255, "grad_norm": 0.2790174186229706, "learning_rate": 1.8903329707386785e-07, "loss": 0.3562, "step": 133510 }, { "epoch": 4.811871553681479, "grad_norm": 0.2516117990016937, "learning_rate": 1.8867528982660643e-07, "loss": 0.3731, "step": 133515 }, { "epoch": 4.812051753342704, "grad_norm": 0.2900271713733673, "learning_rate": 1.8831762062883896e-07, "loss": 0.3736, "step": 133520 }, { "epoch": 4.812231953003929, "grad_norm": 0.22807054221630096, "learning_rate": 1.8796028948544209e-07, "loss": 0.3544, "step": 133525 }, { "epoch": 4.812412152665153, "grad_norm": 0.26726317405700684, "learning_rate": 1.8760329640128139e-07, "loss": 0.3583, "step": 133530 }, { "epoch": 4.812592352326377, "grad_norm": 0.23728644847869873, "learning_rate": 1.8724664138122238e-07, "loss": 0.3509, "step": 133535 }, { "epoch": 4.812772551987602, "grad_norm": 0.27613380551338196, "learning_rate": 1.8689032443012234e-07, "loss": 0.3841, "step": 133540 }, { "epoch": 4.812952751648827, "grad_norm": 0.23071031272411346, "learning_rate": 
1.8653434555284123e-07, "loss": 0.3659, "step": 133545 }, { "epoch": 4.813132951310052, "grad_norm": 0.2540687024593353, "learning_rate": 1.8617870475422238e-07, "loss": 0.374, "step": 133550 }, { "epoch": 4.813313150971276, "grad_norm": 0.2460535317659378, "learning_rate": 1.8582340203911475e-07, "loss": 0.3813, "step": 133555 }, { "epoch": 4.813493350632501, "grad_norm": 0.18833312392234802, "learning_rate": 1.8546843741236163e-07, "loss": 0.3373, "step": 133560 }, { "epoch": 4.813673550293726, "grad_norm": 0.2607569694519043, "learning_rate": 1.851138108787953e-07, "loss": 0.3953, "step": 133565 }, { "epoch": 4.81385374995495, "grad_norm": 0.2937115430831909, "learning_rate": 1.84759522443248e-07, "loss": 0.3231, "step": 133570 }, { "epoch": 4.814033949616174, "grad_norm": 0.27710872888565063, "learning_rate": 1.844055721105492e-07, "loss": 0.389, "step": 133575 }, { "epoch": 4.814214149277399, "grad_norm": 0.19749537110328674, "learning_rate": 1.8405195988552003e-07, "loss": 0.3749, "step": 133580 }, { "epoch": 4.814394348938624, "grad_norm": 0.20698542892932892, "learning_rate": 1.8369868577297612e-07, "loss": 0.3486, "step": 133585 }, { "epoch": 4.814574548599849, "grad_norm": 0.22517277300357819, "learning_rate": 1.8334574977773577e-07, "loss": 0.3636, "step": 133590 }, { "epoch": 4.814754748261073, "grad_norm": 0.2940255105495453, "learning_rate": 1.8299315190460352e-07, "loss": 0.388, "step": 133595 }, { "epoch": 4.814934947922298, "grad_norm": 0.21540936827659607, "learning_rate": 1.8264089215838386e-07, "loss": 0.414, "step": 133600 }, { "epoch": 4.815115147583523, "grad_norm": 0.2051449418067932, "learning_rate": 1.8228897054388128e-07, "loss": 0.3345, "step": 133605 }, { "epoch": 4.8152953472447475, "grad_norm": 0.2661552429199219, "learning_rate": 1.8193738706588082e-07, "loss": 0.3696, "step": 133610 }, { "epoch": 4.815475546905972, "grad_norm": 0.26067331433296204, "learning_rate": 1.815861417291842e-07, "loss": 0.3499, "step": 133615 }, { "epoch": 
4.815655746567196, "grad_norm": 0.23874883353710175, "learning_rate": 1.8123523453856816e-07, "loss": 0.3581, "step": 133620 }, { "epoch": 4.815835946228421, "grad_norm": 0.264144629240036, "learning_rate": 1.8088466549881778e-07, "loss": 0.3731, "step": 133625 }, { "epoch": 4.8160161458896455, "grad_norm": 0.2632257342338562, "learning_rate": 1.8053443461470698e-07, "loss": 0.4175, "step": 133630 }, { "epoch": 4.81619634555087, "grad_norm": 0.2465040385723114, "learning_rate": 1.8018454189101254e-07, "loss": 0.3609, "step": 133635 }, { "epoch": 4.816376545212095, "grad_norm": 0.2356046885251999, "learning_rate": 1.7983498733249725e-07, "loss": 0.3647, "step": 133640 }, { "epoch": 4.81655674487332, "grad_norm": 0.2914571166038513, "learning_rate": 1.7948577094392405e-07, "loss": 0.3861, "step": 133645 }, { "epoch": 4.8167369445345445, "grad_norm": 0.2516320049762726, "learning_rate": 1.7913689273005018e-07, "loss": 0.389, "step": 133650 }, { "epoch": 4.816917144195769, "grad_norm": 0.2756977379322052, "learning_rate": 1.78788352695633e-07, "loss": 0.397, "step": 133655 }, { "epoch": 4.817097343856993, "grad_norm": 0.31001341342926025, "learning_rate": 1.7844015084542143e-07, "loss": 0.3965, "step": 133660 }, { "epoch": 4.817277543518218, "grad_norm": 0.31293249130249023, "learning_rate": 1.780922871841534e-07, "loss": 0.3876, "step": 133665 }, { "epoch": 4.8174577431794425, "grad_norm": 0.23081745207309723, "learning_rate": 1.7774476171657229e-07, "loss": 0.3338, "step": 133670 }, { "epoch": 4.817637942840667, "grad_norm": 0.229642853140831, "learning_rate": 1.7739757444741323e-07, "loss": 0.4005, "step": 133675 }, { "epoch": 4.817818142501892, "grad_norm": 0.29115715622901917, "learning_rate": 1.7705072538140856e-07, "loss": 0.3731, "step": 133680 }, { "epoch": 4.817998342163117, "grad_norm": 0.27003014087677, "learning_rate": 1.7670421452328224e-07, "loss": 0.3786, "step": 133685 }, { "epoch": 4.8181785418243415, "grad_norm": 0.2552355229854584, "learning_rate": 
1.7635804187775273e-07, "loss": 0.3423, "step": 133690 }, { "epoch": 4.818358741485566, "grad_norm": 0.29862159490585327, "learning_rate": 1.760122074495385e-07, "loss": 0.373, "step": 133695 }, { "epoch": 4.818538941146791, "grad_norm": 0.2521141469478607, "learning_rate": 1.7566671124335242e-07, "loss": 0.3318, "step": 133700 }, { "epoch": 4.818719140808016, "grad_norm": 0.23437343537807465, "learning_rate": 1.7532155326390464e-07, "loss": 0.373, "step": 133705 }, { "epoch": 4.81889934046924, "grad_norm": 0.24322153627872467, "learning_rate": 1.749767335158914e-07, "loss": 0.3533, "step": 133710 }, { "epoch": 4.819079540130464, "grad_norm": 0.29220882058143616, "learning_rate": 1.7463225200401167e-07, "loss": 0.3859, "step": 133715 }, { "epoch": 4.819259739791689, "grad_norm": 0.19493746757507324, "learning_rate": 1.7428810873296453e-07, "loss": 0.3794, "step": 133720 }, { "epoch": 4.819439939452914, "grad_norm": 0.23761507868766785, "learning_rate": 1.739443037074351e-07, "loss": 0.3677, "step": 133725 }, { "epoch": 4.8196201391141384, "grad_norm": 0.31488823890686035, "learning_rate": 1.736008369321085e-07, "loss": 0.3997, "step": 133730 }, { "epoch": 4.819800338775363, "grad_norm": 0.23459258675575256, "learning_rate": 1.7325770841166156e-07, "loss": 0.3629, "step": 133735 }, { "epoch": 4.819980538436588, "grad_norm": 0.29168611764907837, "learning_rate": 1.7291491815077388e-07, "loss": 0.3874, "step": 133740 }, { "epoch": 4.820160738097813, "grad_norm": 0.28949815034866333, "learning_rate": 1.7257246615411393e-07, "loss": 0.3727, "step": 133745 }, { "epoch": 4.820340937759037, "grad_norm": 0.23711396753787994, "learning_rate": 1.7223035242634467e-07, "loss": 0.3764, "step": 133750 }, { "epoch": 4.820521137420261, "grad_norm": 0.26176103949546814, "learning_rate": 1.7188857697213178e-07, "loss": 0.3257, "step": 133755 }, { "epoch": 4.820701337081486, "grad_norm": 0.3358304798603058, "learning_rate": 1.715471397961327e-07, "loss": 0.3996, "step": 133760 }, { 
"epoch": 4.820881536742711, "grad_norm": 0.32944080233573914, "learning_rate": 1.7120604090299363e-07, "loss": 0.3569, "step": 133765 }, { "epoch": 4.821061736403935, "grad_norm": 0.25890570878982544, "learning_rate": 1.708652802973637e-07, "loss": 0.3859, "step": 133770 }, { "epoch": 4.82124193606516, "grad_norm": 0.23768959939479828, "learning_rate": 1.7052485798389196e-07, "loss": 0.3734, "step": 133775 }, { "epoch": 4.821422135726385, "grad_norm": 0.2727597951889038, "learning_rate": 1.70184773967208e-07, "loss": 0.3652, "step": 133780 }, { "epoch": 4.82160233538761, "grad_norm": 0.31934478878974915, "learning_rate": 1.698450282519526e-07, "loss": 0.3821, "step": 133785 }, { "epoch": 4.821782535048834, "grad_norm": 0.2313583940267563, "learning_rate": 1.695056208427498e-07, "loss": 0.3477, "step": 133790 }, { "epoch": 4.821962734710059, "grad_norm": 0.28459036350250244, "learning_rate": 1.691665517442237e-07, "loss": 0.3909, "step": 133795 }, { "epoch": 4.822142934371284, "grad_norm": 0.2991600036621094, "learning_rate": 1.6882782096099836e-07, "loss": 0.3633, "step": 133800 }, { "epoch": 4.822323134032508, "grad_norm": 0.2798830270767212, "learning_rate": 1.684894284976868e-07, "loss": 0.3536, "step": 133805 }, { "epoch": 4.822503333693732, "grad_norm": 0.2202059030532837, "learning_rate": 1.681513743588964e-07, "loss": 0.3627, "step": 133810 }, { "epoch": 4.822683533354957, "grad_norm": 0.2335967868566513, "learning_rate": 1.6781365854924014e-07, "loss": 0.3384, "step": 133815 }, { "epoch": 4.822863733016182, "grad_norm": 0.21627481281757355, "learning_rate": 1.674762810733116e-07, "loss": 0.3793, "step": 133820 }, { "epoch": 4.823043932677407, "grad_norm": 0.2823790907859802, "learning_rate": 1.671392419357126e-07, "loss": 0.3695, "step": 133825 }, { "epoch": 4.823224132338631, "grad_norm": 0.2549593448638916, "learning_rate": 1.66802541141034e-07, "loss": 0.342, "step": 133830 }, { "epoch": 4.823404331999856, "grad_norm": 0.21052326261997223, 
"learning_rate": 1.6646617869386095e-07, "loss": 0.3486, "step": 133835 }, { "epoch": 4.823584531661081, "grad_norm": 0.24714645743370056, "learning_rate": 1.6613015459877868e-07, "loss": 0.377, "step": 133840 }, { "epoch": 4.823764731322305, "grad_norm": 0.24139024317264557, "learning_rate": 1.6579446886036687e-07, "loss": 0.3834, "step": 133845 }, { "epoch": 4.823944930983529, "grad_norm": 0.29688748717308044, "learning_rate": 1.6545912148319687e-07, "loss": 0.4207, "step": 133850 }, { "epoch": 4.824125130644754, "grad_norm": 0.26805707812309265, "learning_rate": 1.6512411247183724e-07, "loss": 0.3626, "step": 133855 }, { "epoch": 4.824305330305979, "grad_norm": 0.2496339976787567, "learning_rate": 1.6478944183085376e-07, "loss": 0.3988, "step": 133860 }, { "epoch": 4.824485529967204, "grad_norm": 0.20922985672950745, "learning_rate": 1.644551095648067e-07, "loss": 0.3854, "step": 133865 }, { "epoch": 4.824665729628428, "grad_norm": 0.2915142774581909, "learning_rate": 1.6412111567825074e-07, "loss": 0.3817, "step": 133870 }, { "epoch": 4.824845929289653, "grad_norm": 0.25543341040611267, "learning_rate": 1.6378746017573222e-07, "loss": 0.3862, "step": 133875 }, { "epoch": 4.825026128950878, "grad_norm": 0.2814326882362366, "learning_rate": 1.6345414306180584e-07, "loss": 0.3492, "step": 133880 }, { "epoch": 4.8252063286121025, "grad_norm": 0.2885473370552063, "learning_rate": 1.6312116434100412e-07, "loss": 0.3942, "step": 133885 }, { "epoch": 4.825386528273327, "grad_norm": 0.239909365773201, "learning_rate": 1.6278852401787336e-07, "loss": 0.4162, "step": 133890 }, { "epoch": 4.825566727934551, "grad_norm": 0.22957023978233337, "learning_rate": 1.6245622209693778e-07, "loss": 0.3542, "step": 133895 }, { "epoch": 4.825746927595776, "grad_norm": 0.22540193796157837, "learning_rate": 1.621242585827243e-07, "loss": 0.3753, "step": 133900 }, { "epoch": 4.825927127257001, "grad_norm": 0.22572508454322815, "learning_rate": 1.617926334797626e-07, "loss": 0.3706, 
"step": 133905 }, { "epoch": 4.826107326918225, "grad_norm": 0.2843494117259979, "learning_rate": 1.6146134679256574e-07, "loss": 0.399, "step": 133910 }, { "epoch": 4.82628752657945, "grad_norm": 0.2644733190536499, "learning_rate": 1.6113039852565237e-07, "loss": 0.3451, "step": 133915 }, { "epoch": 4.826467726240675, "grad_norm": 0.25811052322387695, "learning_rate": 1.6079978868352442e-07, "loss": 0.388, "step": 133920 }, { "epoch": 4.8266479259018995, "grad_norm": 0.2605394721031189, "learning_rate": 1.6046951727069214e-07, "loss": 0.3679, "step": 133925 }, { "epoch": 4.826828125563124, "grad_norm": 0.29219627380371094, "learning_rate": 1.60139584291652e-07, "loss": 0.3613, "step": 133930 }, { "epoch": 4.827008325224348, "grad_norm": 0.24204973876476288, "learning_rate": 1.598099897509031e-07, "loss": 0.3677, "step": 133935 }, { "epoch": 4.827188524885573, "grad_norm": 0.2921901047229767, "learning_rate": 1.5948073365293358e-07, "loss": 0.3456, "step": 133940 }, { "epoch": 4.8273687245467976, "grad_norm": 0.2787991166114807, "learning_rate": 1.5915181600222872e-07, "loss": 0.3672, "step": 133945 }, { "epoch": 4.827548924208022, "grad_norm": 0.2651299238204956, "learning_rate": 1.5882323680327104e-07, "loss": 0.3701, "step": 133950 }, { "epoch": 4.827729123869247, "grad_norm": 0.27234312891960144, "learning_rate": 1.5849499606053753e-07, "loss": 0.3357, "step": 133955 }, { "epoch": 4.827909323530472, "grad_norm": 0.2102580964565277, "learning_rate": 1.5816709377849957e-07, "loss": 0.3899, "step": 133960 }, { "epoch": 4.8280895231916965, "grad_norm": 0.29363569617271423, "learning_rate": 1.5783952996162864e-07, "loss": 0.3687, "step": 133965 }, { "epoch": 4.828269722852921, "grad_norm": 0.22374111413955688, "learning_rate": 1.5751230461438228e-07, "loss": 0.3412, "step": 133970 }, { "epoch": 4.828449922514146, "grad_norm": 0.2635408341884613, "learning_rate": 1.5718541774122076e-07, "loss": 0.4143, "step": 133975 }, { "epoch": 4.828630122175371, "grad_norm": 
0.25199902057647705, "learning_rate": 1.568588693465961e-07, "loss": 0.375, "step": 133980 }, { "epoch": 4.828810321836595, "grad_norm": 0.21928834915161133, "learning_rate": 1.5653265943496587e-07, "loss": 0.3635, "step": 133985 }, { "epoch": 4.828990521497819, "grad_norm": 0.25685426592826843, "learning_rate": 1.562067880107626e-07, "loss": 0.3996, "step": 133990 }, { "epoch": 4.829170721159044, "grad_norm": 0.24886153638362885, "learning_rate": 1.5588125507843275e-07, "loss": 0.3769, "step": 133995 }, { "epoch": 4.829350920820269, "grad_norm": 0.2590484321117401, "learning_rate": 1.5555606064241168e-07, "loss": 0.3867, "step": 134000 }, { "epoch": 4.829350920820269, "eval_loss": 0.42881202697753906, "eval_runtime": 3.5296, "eval_samples_per_second": 28.332, "eval_steps_per_second": 7.083, "step": 134000 }, { "epoch": 4.8295311204814935, "grad_norm": 0.21452315151691437, "learning_rate": 1.5523120470712915e-07, "loss": 0.3729, "step": 134005 }, { "epoch": 4.829711320142718, "grad_norm": 0.2168400138616562, "learning_rate": 1.5490668727701217e-07, "loss": 0.3683, "step": 134010 }, { "epoch": 4.829891519803943, "grad_norm": 0.3310934603214264, "learning_rate": 1.5458250835648225e-07, "loss": 0.3743, "step": 134015 }, { "epoch": 4.830071719465168, "grad_norm": 0.2809125781059265, "learning_rate": 1.5425866794995247e-07, "loss": 0.377, "step": 134020 }, { "epoch": 4.830251919126392, "grad_norm": 0.2371680587530136, "learning_rate": 1.5393516606183878e-07, "loss": 0.3484, "step": 134025 }, { "epoch": 4.830432118787616, "grad_norm": 0.22989128530025482, "learning_rate": 1.5361200269655153e-07, "loss": 0.3637, "step": 134030 }, { "epoch": 4.830612318448841, "grad_norm": 0.250777930021286, "learning_rate": 1.5328917785848719e-07, "loss": 0.3924, "step": 134035 }, { "epoch": 4.830792518110066, "grad_norm": 0.26133477687835693, "learning_rate": 1.5296669155204778e-07, "loss": 0.3637, "step": 134040 }, { "epoch": 4.8309727177712904, "grad_norm": 0.28973469138145447, 
"learning_rate": 1.526445437816243e-07, "loss": 0.3738, "step": 134045 }, { "epoch": 4.831152917432515, "grad_norm": 0.2642209529876709, "learning_rate": 1.5232273455161316e-07, "loss": 0.3481, "step": 134050 }, { "epoch": 4.83133311709374, "grad_norm": 0.259695827960968, "learning_rate": 1.5200126386639148e-07, "loss": 0.3832, "step": 134055 }, { "epoch": 4.831513316754965, "grad_norm": 0.2560080885887146, "learning_rate": 1.516801317303418e-07, "loss": 0.3417, "step": 134060 }, { "epoch": 4.831693516416189, "grad_norm": 0.24287177622318268, "learning_rate": 1.5135933814783844e-07, "loss": 0.3666, "step": 134065 }, { "epoch": 4.831873716077414, "grad_norm": 0.2676023840904236, "learning_rate": 1.5103888312325566e-07, "loss": 0.3918, "step": 134070 }, { "epoch": 4.832053915738639, "grad_norm": 0.35573047399520874, "learning_rate": 1.5071876666095385e-07, "loss": 0.3883, "step": 134075 }, { "epoch": 4.832234115399863, "grad_norm": 0.26907774806022644, "learning_rate": 1.503989887653018e-07, "loss": 0.3785, "step": 134080 }, { "epoch": 4.832414315061087, "grad_norm": 0.269654244184494, "learning_rate": 1.500795494406515e-07, "loss": 0.3896, "step": 134085 }, { "epoch": 4.832594514722312, "grad_norm": 0.3965383470058441, "learning_rate": 1.497604486913551e-07, "loss": 0.377, "step": 134090 }, { "epoch": 4.832774714383537, "grad_norm": 0.22493110597133636, "learning_rate": 1.4944168652176183e-07, "loss": 0.3609, "step": 134095 }, { "epoch": 4.832954914044762, "grad_norm": 0.2781234681606293, "learning_rate": 1.4912326293621549e-07, "loss": 0.3761, "step": 134100 }, { "epoch": 4.833135113705986, "grad_norm": 0.3429318070411682, "learning_rate": 1.4880517793905148e-07, "loss": 0.3934, "step": 134105 }, { "epoch": 4.833315313367211, "grad_norm": 0.2164117842912674, "learning_rate": 1.4848743153460522e-07, "loss": 0.3973, "step": 134110 }, { "epoch": 4.833495513028436, "grad_norm": 0.32656198740005493, "learning_rate": 1.4817002372720935e-07, "loss": 0.3912, "step": 134115 
}, { "epoch": 4.83367571268966, "grad_norm": 0.25910449028015137, "learning_rate": 1.4785295452118264e-07, "loss": 0.3827, "step": 134120 }, { "epoch": 4.833855912350884, "grad_norm": 0.20214806497097015, "learning_rate": 1.4753622392084943e-07, "loss": 0.3517, "step": 134125 }, { "epoch": 4.834036112012109, "grad_norm": 0.27396053075790405, "learning_rate": 1.472198319305229e-07, "loss": 0.3526, "step": 134130 }, { "epoch": 4.834216311673334, "grad_norm": 0.20608721673488617, "learning_rate": 1.4690377855451353e-07, "loss": 0.3631, "step": 134135 }, { "epoch": 4.834396511334559, "grad_norm": 0.26418453454971313, "learning_rate": 1.4658806379712897e-07, "loss": 0.3475, "step": 134140 }, { "epoch": 4.834576710995783, "grad_norm": 0.2818256616592407, "learning_rate": 1.4627268766267133e-07, "loss": 0.36, "step": 134145 }, { "epoch": 4.834756910657008, "grad_norm": 0.30141425132751465, "learning_rate": 1.4595765015543715e-07, "loss": 0.373, "step": 134150 }, { "epoch": 4.834937110318233, "grad_norm": 0.309048056602478, "learning_rate": 1.456429512797175e-07, "loss": 0.4023, "step": 134155 }, { "epoch": 4.8351173099794575, "grad_norm": 0.28096258640289307, "learning_rate": 1.4532859103980056e-07, "loss": 0.3722, "step": 134160 }, { "epoch": 4.835297509640682, "grad_norm": 0.30201297998428345, "learning_rate": 1.4501456943996628e-07, "loss": 0.3901, "step": 134165 }, { "epoch": 4.835477709301906, "grad_norm": 0.2812051773071289, "learning_rate": 1.447008864845001e-07, "loss": 0.4076, "step": 134170 }, { "epoch": 4.835657908963131, "grad_norm": 0.3078659176826477, "learning_rate": 1.443875421776736e-07, "loss": 0.3531, "step": 134175 }, { "epoch": 4.835838108624356, "grad_norm": 0.2857990264892578, "learning_rate": 1.4407453652375002e-07, "loss": 0.389, "step": 134180 }, { "epoch": 4.83601830828558, "grad_norm": 0.23130671679973602, "learning_rate": 1.43761869527001e-07, "loss": 0.4073, "step": 134185 }, { "epoch": 4.836198507946805, "grad_norm": 0.2094799429178238, 
"learning_rate": 1.434495411916842e-07, "loss": 0.3676, "step": 134190 }, { "epoch": 4.83637870760803, "grad_norm": 0.26732537150382996, "learning_rate": 1.4313755152205455e-07, "loss": 0.3971, "step": 134195 }, { "epoch": 4.8365589072692545, "grad_norm": 0.2994996905326843, "learning_rate": 1.4282590052236423e-07, "loss": 0.3768, "step": 134200 }, { "epoch": 4.836739106930479, "grad_norm": 0.22868464887142181, "learning_rate": 1.4251458819685704e-07, "loss": 0.3547, "step": 134205 }, { "epoch": 4.836919306591704, "grad_norm": 0.26898080110549927, "learning_rate": 1.4220361454977682e-07, "loss": 0.3795, "step": 134210 }, { "epoch": 4.837099506252928, "grad_norm": 0.2844333350658417, "learning_rate": 1.4189297958536185e-07, "loss": 0.4158, "step": 134215 }, { "epoch": 4.837279705914153, "grad_norm": 0.22372843325138092, "learning_rate": 1.415826833078393e-07, "loss": 0.3845, "step": 134220 }, { "epoch": 4.837459905575377, "grad_norm": 0.3177548050880432, "learning_rate": 1.412727257214419e-07, "loss": 0.4199, "step": 134225 }, { "epoch": 4.837640105236602, "grad_norm": 0.22313030064105988, "learning_rate": 1.409631068303885e-07, "loss": 0.3804, "step": 134230 }, { "epoch": 4.837820304897827, "grad_norm": 0.21491451561450958, "learning_rate": 1.4065382663890347e-07, "loss": 0.3386, "step": 134235 }, { "epoch": 4.8380005045590515, "grad_norm": 0.24090984463691711, "learning_rate": 1.403448851511946e-07, "loss": 0.3939, "step": 134240 }, { "epoch": 4.838180704220276, "grad_norm": 0.3148958384990692, "learning_rate": 1.4003628237147238e-07, "loss": 0.3783, "step": 134245 }, { "epoch": 4.838360903881501, "grad_norm": 0.32658737897872925, "learning_rate": 1.3972801830394732e-07, "loss": 0.403, "step": 134250 }, { "epoch": 4.838541103542726, "grad_norm": 0.26316890120506287, "learning_rate": 1.3942009295281056e-07, "loss": 0.3716, "step": 134255 }, { "epoch": 4.83872130320395, "grad_norm": 0.3213282823562622, "learning_rate": 1.3911250632226146e-07, "loss": 0.3641, "step": 
134260 }, { "epoch": 4.838901502865174, "grad_norm": 0.2604977786540985, "learning_rate": 1.3880525841649673e-07, "loss": 0.361, "step": 134265 }, { "epoch": 4.839081702526399, "grad_norm": 0.22943316400051117, "learning_rate": 1.3849834923969073e-07, "loss": 0.3516, "step": 134270 }, { "epoch": 4.839261902187624, "grad_norm": 0.29876038432121277, "learning_rate": 1.3819177879603462e-07, "loss": 0.3684, "step": 134275 }, { "epoch": 4.8394421018488485, "grad_norm": 0.2451476752758026, "learning_rate": 1.378855470897028e-07, "loss": 0.3882, "step": 134280 }, { "epoch": 4.839622301510073, "grad_norm": 0.2849496603012085, "learning_rate": 1.3757965412486696e-07, "loss": 0.3678, "step": 134285 }, { "epoch": 4.839802501171298, "grad_norm": 0.19778481125831604, "learning_rate": 1.3727409990569319e-07, "loss": 0.3739, "step": 134290 }, { "epoch": 4.839982700832523, "grad_norm": 0.22249376773834229, "learning_rate": 1.369688844363448e-07, "loss": 0.357, "step": 134295 }, { "epoch": 4.840162900493747, "grad_norm": 0.2733737528324127, "learning_rate": 1.366640077209852e-07, "loss": 0.3621, "step": 134300 }, { "epoch": 4.840343100154971, "grad_norm": 0.21390418708324432, "learning_rate": 1.363594697637638e-07, "loss": 0.3495, "step": 134305 }, { "epoch": 4.840523299816196, "grad_norm": 0.3072846233844757, "learning_rate": 1.3605527056883004e-07, "loss": 0.4137, "step": 134310 }, { "epoch": 4.840703499477421, "grad_norm": 0.25589028000831604, "learning_rate": 1.3575141014032788e-07, "loss": 0.4014, "step": 134315 }, { "epoch": 4.8408836991386455, "grad_norm": 0.2160835713148117, "learning_rate": 1.354478884824012e-07, "loss": 0.3557, "step": 134320 }, { "epoch": 4.84106389879987, "grad_norm": 0.2661241888999939, "learning_rate": 1.351447055991828e-07, "loss": 0.3846, "step": 134325 }, { "epoch": 4.841244098461095, "grad_norm": 0.23646846413612366, "learning_rate": 1.3484186149480272e-07, "loss": 0.3725, "step": 134330 }, { "epoch": 4.84142429812232, "grad_norm": 
0.24086101353168488, "learning_rate": 1.3453935617339099e-07, "loss": 0.3693, "step": 134335 }, { "epoch": 4.841604497783544, "grad_norm": 0.19936615228652954, "learning_rate": 1.3423718963906374e-07, "loss": 0.3585, "step": 134340 }, { "epoch": 4.841784697444769, "grad_norm": 0.2506243884563446, "learning_rate": 1.339353618959427e-07, "loss": 0.3775, "step": 134345 }, { "epoch": 4.841964897105994, "grad_norm": 0.22023634612560272, "learning_rate": 1.3363387294813568e-07, "loss": 0.3357, "step": 134350 }, { "epoch": 4.842145096767218, "grad_norm": 0.27518585324287415, "learning_rate": 1.3333272279975328e-07, "loss": 0.3789, "step": 134355 }, { "epoch": 4.8423252964284424, "grad_norm": 0.20245154201984406, "learning_rate": 1.3303191145490057e-07, "loss": 0.359, "step": 134360 }, { "epoch": 4.842505496089667, "grad_norm": 0.29023030400276184, "learning_rate": 1.3273143891767147e-07, "loss": 0.3642, "step": 134365 }, { "epoch": 4.842685695750892, "grad_norm": 0.23865652084350586, "learning_rate": 1.3243130519216274e-07, "loss": 0.3794, "step": 134370 }, { "epoch": 4.842865895412117, "grad_norm": 0.222527414560318, "learning_rate": 1.3213151028246273e-07, "loss": 0.3462, "step": 134375 }, { "epoch": 4.843046095073341, "grad_norm": 0.23358391225337982, "learning_rate": 1.3183205419265708e-07, "loss": 0.3391, "step": 134380 }, { "epoch": 4.843226294734566, "grad_norm": 0.25445687770843506, "learning_rate": 1.315329369268231e-07, "loss": 0.3423, "step": 134385 }, { "epoch": 4.843406494395791, "grad_norm": 0.312282532453537, "learning_rate": 1.3123415848903809e-07, "loss": 0.3769, "step": 134390 }, { "epoch": 4.843586694057015, "grad_norm": 0.25823071599006653, "learning_rate": 1.3093571888337652e-07, "loss": 0.3843, "step": 134395 }, { "epoch": 4.843766893718239, "grad_norm": 0.24276918172836304, "learning_rate": 1.3063761811389906e-07, "loss": 0.3791, "step": 134400 }, { "epoch": 4.843947093379464, "grad_norm": 0.25537583231925964, "learning_rate": 
1.3033985618466914e-07, "loss": 0.4017, "step": 134405 }, { "epoch": 4.844127293040689, "grad_norm": 0.2588360011577606, "learning_rate": 1.3004243309974184e-07, "loss": 0.3756, "step": 134410 }, { "epoch": 4.844307492701914, "grad_norm": 0.3001175820827484, "learning_rate": 1.2974534886317225e-07, "loss": 0.3726, "step": 134415 }, { "epoch": 4.844487692363138, "grad_norm": 0.3025680184364319, "learning_rate": 1.294486034790099e-07, "loss": 0.3936, "step": 134420 }, { "epoch": 4.844667892024363, "grad_norm": 0.27103391289711, "learning_rate": 1.2915219695129321e-07, "loss": 0.3672, "step": 134425 }, { "epoch": 4.844848091685588, "grad_norm": 0.27774909138679504, "learning_rate": 1.2885612928406342e-07, "loss": 0.3652, "step": 134430 }, { "epoch": 4.845028291346813, "grad_norm": 0.2541874945163727, "learning_rate": 1.2856040048135342e-07, "loss": 0.3296, "step": 134435 }, { "epoch": 4.845208491008037, "grad_norm": 0.2172260284423828, "learning_rate": 1.2826501054719054e-07, "loss": 0.3267, "step": 134440 }, { "epoch": 4.845388690669262, "grad_norm": 0.177218958735466, "learning_rate": 1.2796995948560487e-07, "loss": 0.349, "step": 134445 }, { "epoch": 4.845568890330486, "grad_norm": 0.26676860451698303, "learning_rate": 1.2767524730061263e-07, "loss": 0.4027, "step": 134450 }, { "epoch": 4.845749089991711, "grad_norm": 0.27540668845176697, "learning_rate": 1.2738087399622733e-07, "loss": 0.3745, "step": 134455 }, { "epoch": 4.845929289652935, "grad_norm": 0.25069594383239746, "learning_rate": 1.2708683957646238e-07, "loss": 0.3432, "step": 134460 }, { "epoch": 4.84610948931416, "grad_norm": 0.29599523544311523, "learning_rate": 1.2679314404532572e-07, "loss": 0.3566, "step": 134465 }, { "epoch": 4.846289688975385, "grad_norm": 0.2683459520339966, "learning_rate": 1.264997874068169e-07, "loss": 0.3686, "step": 134470 }, { "epoch": 4.8464698886366095, "grad_norm": 0.250180184841156, "learning_rate": 1.2620676966493272e-07, "loss": 0.3944, "step": 134475 }, { "epoch": 
4.846650088297834, "grad_norm": 0.27705875039100647, "learning_rate": 1.2591409082366445e-07, "loss": 0.3683, "step": 134480 }, { "epoch": 4.846830287959059, "grad_norm": 0.267696350812912, "learning_rate": 1.2562175088700057e-07, "loss": 0.3831, "step": 134485 }, { "epoch": 4.847010487620283, "grad_norm": 0.26516005396842957, "learning_rate": 1.2532974985892398e-07, "loss": 0.3802, "step": 134490 }, { "epoch": 4.847190687281508, "grad_norm": 0.22032172977924347, "learning_rate": 1.2503808774341486e-07, "loss": 0.346, "step": 134495 }, { "epoch": 4.847370886942732, "grad_norm": 0.25521814823150635, "learning_rate": 1.2474676454444778e-07, "loss": 0.3357, "step": 134500 }, { "epoch": 4.847370886942732, "eval_loss": 0.4288318157196045, "eval_runtime": 3.5358, "eval_samples_per_second": 28.282, "eval_steps_per_second": 7.07, "step": 134500 }, { "epoch": 4.847551086603957, "grad_norm": 0.25918158888816833, "learning_rate": 1.2445578026598903e-07, "loss": 0.3733, "step": 134505 }, { "epoch": 4.847731286265182, "grad_norm": 0.2234797328710556, "learning_rate": 1.241651349120021e-07, "loss": 0.3643, "step": 134510 }, { "epoch": 4.8479114859264065, "grad_norm": 0.30298492312431335, "learning_rate": 1.238748284864505e-07, "loss": 0.361, "step": 134515 }, { "epoch": 4.848091685587631, "grad_norm": 0.2233441025018692, "learning_rate": 1.2358486099328658e-07, "loss": 0.3642, "step": 134520 }, { "epoch": 4.848271885248856, "grad_norm": 0.26897063851356506, "learning_rate": 1.2329523243646556e-07, "loss": 0.3709, "step": 134525 }, { "epoch": 4.848452084910081, "grad_norm": 0.2564341723918915, "learning_rate": 1.2300594281992872e-07, "loss": 0.3741, "step": 134530 }, { "epoch": 4.8486322845713055, "grad_norm": 0.1938348412513733, "learning_rate": 1.227169921476201e-07, "loss": 0.3406, "step": 134535 }, { "epoch": 4.848812484232529, "grad_norm": 0.25928547978401184, "learning_rate": 1.224283804234755e-07, "loss": 0.3673, "step": 134540 }, { "epoch": 4.848992683893754, "grad_norm": 
0.3056635558605194, "learning_rate": 1.221401076514306e-07, "loss": 0.3775, "step": 134545 }, { "epoch": 4.849172883554979, "grad_norm": 0.2699172794818878, "learning_rate": 1.2185217383540725e-07, "loss": 0.3643, "step": 134550 }, { "epoch": 4.8493530832162035, "grad_norm": 0.2580404579639435, "learning_rate": 1.2156457897933014e-07, "loss": 0.3634, "step": 134555 }, { "epoch": 4.849533282877428, "grad_norm": 0.2560892701148987, "learning_rate": 1.2127732308712114e-07, "loss": 0.3698, "step": 134560 }, { "epoch": 4.849713482538653, "grad_norm": 0.2861708998680115, "learning_rate": 1.2099040616269374e-07, "loss": 0.3928, "step": 134565 }, { "epoch": 4.849893682199878, "grad_norm": 0.21710778772830963, "learning_rate": 1.207038282099532e-07, "loss": 0.384, "step": 134570 }, { "epoch": 4.850073881861102, "grad_norm": 0.2481117993593216, "learning_rate": 1.2041758923280465e-07, "loss": 0.3562, "step": 134575 }, { "epoch": 4.850254081522326, "grad_norm": 0.24015110731124878, "learning_rate": 1.2013168923515338e-07, "loss": 0.3729, "step": 134580 }, { "epoch": 4.850434281183551, "grad_norm": 0.24284091591835022, "learning_rate": 1.1990321330488386e-07, "loss": 0.3809, "step": 134585 }, { "epoch": 4.850614480844776, "grad_norm": 0.23821890354156494, "learning_rate": 1.196179234801309e-07, "loss": 0.3453, "step": 134590 }, { "epoch": 4.8507946805060005, "grad_norm": 0.2183995395898819, "learning_rate": 1.1933297264576927e-07, "loss": 0.3285, "step": 134595 }, { "epoch": 4.850974880167225, "grad_norm": 0.22939856350421906, "learning_rate": 1.1904836080567638e-07, "loss": 0.3785, "step": 134600 }, { "epoch": 4.85115507982845, "grad_norm": 0.27934911847114563, "learning_rate": 1.187640879637325e-07, "loss": 0.3638, "step": 134605 }, { "epoch": 4.851335279489675, "grad_norm": 0.263106107711792, "learning_rate": 1.184801541238123e-07, "loss": 0.3625, "step": 134610 }, { "epoch": 4.851515479150899, "grad_norm": 0.31745436787605286, "learning_rate": 1.181965592897849e-07, 
"loss": 0.3788, "step": 134615 }, { "epoch": 4.851695678812124, "grad_norm": 0.297881543636322, "learning_rate": 1.1791330346550832e-07, "loss": 0.3824, "step": 134620 }, { "epoch": 4.851875878473349, "grad_norm": 0.2612767219543457, "learning_rate": 1.1763038665484616e-07, "loss": 0.3582, "step": 134625 }, { "epoch": 4.852056078134573, "grad_norm": 0.27403807640075684, "learning_rate": 1.1734780886165364e-07, "loss": 0.4069, "step": 134630 }, { "epoch": 4.8522362777957975, "grad_norm": 0.250141978263855, "learning_rate": 1.1706557008978048e-07, "loss": 0.3807, "step": 134635 }, { "epoch": 4.852416477457022, "grad_norm": 0.24126245081424713, "learning_rate": 1.167836703430708e-07, "loss": 0.3707, "step": 134640 }, { "epoch": 4.852596677118247, "grad_norm": 0.2205542027950287, "learning_rate": 1.1650210962536323e-07, "loss": 0.3328, "step": 134645 }, { "epoch": 4.852776876779472, "grad_norm": 0.23354937136173248, "learning_rate": 1.1622088794049912e-07, "loss": 0.3718, "step": 134650 }, { "epoch": 4.852957076440696, "grad_norm": 0.25610846281051636, "learning_rate": 1.1594000529230875e-07, "loss": 0.3912, "step": 134655 }, { "epoch": 4.853137276101921, "grad_norm": 0.3403148055076599, "learning_rate": 1.1565946168461684e-07, "loss": 0.3726, "step": 134660 }, { "epoch": 4.853317475763146, "grad_norm": 0.2914038598537445, "learning_rate": 1.1537925712124809e-07, "loss": 0.3737, "step": 134665 }, { "epoch": 4.85349767542437, "grad_norm": 0.3024619519710541, "learning_rate": 1.150993916060189e-07, "loss": 0.3859, "step": 134670 }, { "epoch": 4.8536778750855945, "grad_norm": 0.2520431876182556, "learning_rate": 1.1481986514274012e-07, "loss": 0.3754, "step": 134675 }, { "epoch": 4.853858074746819, "grad_norm": 0.23025912046432495, "learning_rate": 1.1454067773522537e-07, "loss": 0.3815, "step": 134680 }, { "epoch": 4.854038274408044, "grad_norm": 0.2676393687725067, "learning_rate": 1.1426182938727714e-07, "loss": 0.369, "step": 134685 }, { "epoch": 4.854218474069269, 
"grad_norm": 0.2332090437412262, "learning_rate": 1.1398332010269241e-07, "loss": 0.3455, "step": 134690 }, { "epoch": 4.854398673730493, "grad_norm": 0.23634545505046844, "learning_rate": 1.1370514988526538e-07, "loss": 0.4167, "step": 134695 }, { "epoch": 4.854578873391718, "grad_norm": 0.25552475452423096, "learning_rate": 1.1342731873878742e-07, "loss": 0.4016, "step": 134700 }, { "epoch": 4.854759073052943, "grad_norm": 0.21695470809936523, "learning_rate": 1.1314982666704444e-07, "loss": 0.3786, "step": 134705 }, { "epoch": 4.854939272714168, "grad_norm": 0.27159446477890015, "learning_rate": 1.1287267367381671e-07, "loss": 0.396, "step": 134710 }, { "epoch": 4.855119472375392, "grad_norm": 0.2568185329437256, "learning_rate": 1.1259585976288179e-07, "loss": 0.3537, "step": 134715 }, { "epoch": 4.855299672036617, "grad_norm": 0.29152026772499084, "learning_rate": 1.1231938493800887e-07, "loss": 0.341, "step": 134720 }, { "epoch": 4.855479871697841, "grad_norm": 0.2555713951587677, "learning_rate": 1.1204324920296716e-07, "loss": 0.3395, "step": 134725 }, { "epoch": 4.855660071359066, "grad_norm": 0.2556825280189514, "learning_rate": 1.1176745256151755e-07, "loss": 0.3501, "step": 134730 }, { "epoch": 4.85584027102029, "grad_norm": 0.30311036109924316, "learning_rate": 1.1149199501741537e-07, "loss": 0.3778, "step": 134735 }, { "epoch": 4.856020470681515, "grad_norm": 0.23065444827079773, "learning_rate": 1.1121687657441871e-07, "loss": 0.373, "step": 134740 }, { "epoch": 4.85620067034274, "grad_norm": 0.26827138662338257, "learning_rate": 1.1094209723627458e-07, "loss": 0.3632, "step": 134745 }, { "epoch": 4.856380870003965, "grad_norm": 0.29760560393333435, "learning_rate": 1.1066765700672166e-07, "loss": 0.3913, "step": 134750 }, { "epoch": 4.856561069665189, "grad_norm": 0.27333009243011475, "learning_rate": 1.1039355588950695e-07, "loss": 0.3704, "step": 134755 }, { "epoch": 4.856741269326414, "grad_norm": 0.244674414396286, "learning_rate": 
1.101197938883608e-07, "loss": 0.3931, "step": 134760 }, { "epoch": 4.856921468987638, "grad_norm": 0.2720971405506134, "learning_rate": 1.0984637100701078e-07, "loss": 0.3967, "step": 134765 }, { "epoch": 4.857101668648863, "grad_norm": 0.17154434323310852, "learning_rate": 1.0957328724918725e-07, "loss": 0.3374, "step": 134770 }, { "epoch": 4.857281868310087, "grad_norm": 0.27293217182159424, "learning_rate": 1.0930054261860668e-07, "loss": 0.3667, "step": 134775 }, { "epoch": 4.857462067971312, "grad_norm": 0.24722565710544586, "learning_rate": 1.0902813711899107e-07, "loss": 0.4101, "step": 134780 }, { "epoch": 4.857642267632537, "grad_norm": 0.3114294707775116, "learning_rate": 1.0875607075404582e-07, "loss": 0.3313, "step": 134785 }, { "epoch": 4.8578224672937615, "grad_norm": 0.3135533332824707, "learning_rate": 1.0848434352747905e-07, "loss": 0.3961, "step": 134790 }, { "epoch": 4.858002666954986, "grad_norm": 0.30858752131462097, "learning_rate": 1.0821295544299337e-07, "loss": 0.4051, "step": 134795 }, { "epoch": 4.858182866616211, "grad_norm": 0.25825393199920654, "learning_rate": 1.0794190650429137e-07, "loss": 0.3879, "step": 134800 }, { "epoch": 4.858363066277436, "grad_norm": 0.31138578057289124, "learning_rate": 1.0767119671505899e-07, "loss": 0.3673, "step": 134805 }, { "epoch": 4.8585432659386605, "grad_norm": 0.21300721168518066, "learning_rate": 1.0740082607898494e-07, "loss": 0.3528, "step": 134810 }, { "epoch": 4.858723465599884, "grad_norm": 0.23539184033870697, "learning_rate": 1.0713079459975795e-07, "loss": 0.3364, "step": 134815 }, { "epoch": 4.858903665261109, "grad_norm": 0.2637368440628052, "learning_rate": 1.0686110228105284e-07, "loss": 0.3895, "step": 134820 }, { "epoch": 4.859083864922334, "grad_norm": 0.24945858120918274, "learning_rate": 1.0659174912655001e-07, "loss": 0.3608, "step": 134825 }, { "epoch": 4.8592640645835585, "grad_norm": 0.2507854402065277, "learning_rate": 1.0632273513991042e-07, "loss": 0.3788, "step": 134830 
}, { "epoch": 4.859444264244783, "grad_norm": 0.2690359950065613, "learning_rate": 1.0605406032480614e-07, "loss": 0.3606, "step": 134835 }, { "epoch": 4.859624463906008, "grad_norm": 0.2506677210330963, "learning_rate": 1.0578572468489534e-07, "loss": 0.4075, "step": 134840 }, { "epoch": 4.859804663567233, "grad_norm": 0.2608148455619812, "learning_rate": 1.0551772822383343e-07, "loss": 0.336, "step": 134845 }, { "epoch": 4.8599848632284575, "grad_norm": 0.26057958602905273, "learning_rate": 1.0525007094527305e-07, "loss": 0.3707, "step": 134850 }, { "epoch": 4.860165062889681, "grad_norm": 0.3034597337245941, "learning_rate": 1.049827528528613e-07, "loss": 0.3569, "step": 134855 }, { "epoch": 4.860345262550906, "grad_norm": 0.25797319412231445, "learning_rate": 1.047157739502369e-07, "loss": 0.3628, "step": 134860 }, { "epoch": 4.860525462212131, "grad_norm": 0.2869213819503784, "learning_rate": 1.0444913424104418e-07, "loss": 0.4191, "step": 134865 }, { "epoch": 4.8607056618733555, "grad_norm": 0.2047075480222702, "learning_rate": 1.041828337289108e-07, "loss": 0.3554, "step": 134870 }, { "epoch": 4.86088586153458, "grad_norm": 0.2842639088630676, "learning_rate": 1.0391687241746717e-07, "loss": 0.3925, "step": 134875 }, { "epoch": 4.861066061195805, "grad_norm": 0.2997124493122101, "learning_rate": 1.036512503103354e-07, "loss": 0.3753, "step": 134880 }, { "epoch": 4.86124626085703, "grad_norm": 0.23771364986896515, "learning_rate": 1.0338596741113204e-07, "loss": 0.3682, "step": 134885 }, { "epoch": 4.8614264605182544, "grad_norm": 0.2713213264942169, "learning_rate": 1.0312102372347921e-07, "loss": 0.3753, "step": 134890 }, { "epoch": 4.861606660179479, "grad_norm": 0.28782111406326294, "learning_rate": 1.0285641925097956e-07, "loss": 0.3416, "step": 134895 }, { "epoch": 4.861786859840704, "grad_norm": 0.21895797550678253, "learning_rate": 1.0259215399724132e-07, "loss": 0.3613, "step": 134900 }, { "epoch": 4.861967059501928, "grad_norm": 0.21650052070617676, 
"learning_rate": 1.0232822796586716e-07, "loss": 0.3367, "step": 134905 }, { "epoch": 4.8621472591631525, "grad_norm": 0.22131061553955078, "learning_rate": 1.0206464116044868e-07, "loss": 0.4108, "step": 134910 }, { "epoch": 4.862327458824377, "grad_norm": 0.2649054527282715, "learning_rate": 1.0180139358458018e-07, "loss": 0.3535, "step": 134915 }, { "epoch": 4.862507658485602, "grad_norm": 0.1987181305885315, "learning_rate": 1.0153848524184494e-07, "loss": 0.4098, "step": 134920 }, { "epoch": 4.862687858146827, "grad_norm": 0.22203120589256287, "learning_rate": 1.0127591613582899e-07, "loss": 0.369, "step": 134925 }, { "epoch": 4.862868057808051, "grad_norm": 0.28719407320022583, "learning_rate": 1.010136862701072e-07, "loss": 0.3782, "step": 134930 }, { "epoch": 4.863048257469276, "grad_norm": 0.2688189148902893, "learning_rate": 1.0075179564825455e-07, "loss": 0.3805, "step": 134935 }, { "epoch": 4.863228457130501, "grad_norm": 0.22982627153396606, "learning_rate": 1.0049024427383758e-07, "loss": 0.3613, "step": 134940 }, { "epoch": 4.863408656791725, "grad_norm": 0.18782635033130646, "learning_rate": 1.0022903215042012e-07, "loss": 0.3668, "step": 134945 }, { "epoch": 4.8635888564529495, "grad_norm": 0.26904863119125366, "learning_rate": 9.996815928156044e-08, "loss": 0.371, "step": 134950 }, { "epoch": 4.863769056114174, "grad_norm": 0.29253089427948, "learning_rate": 9.970762567081404e-08, "loss": 0.3492, "step": 134955 }, { "epoch": 4.863949255775399, "grad_norm": 0.25840044021606445, "learning_rate": 9.944743132173085e-08, "loss": 0.3703, "step": 134960 }, { "epoch": 4.864129455436624, "grad_norm": 0.27990660071372986, "learning_rate": 9.918757623785802e-08, "loss": 0.3802, "step": 134965 }, { "epoch": 4.864309655097848, "grad_norm": 0.24384404718875885, "learning_rate": 9.892806042273162e-08, "loss": 0.3459, "step": 134970 }, { "epoch": 4.864489854759073, "grad_norm": 0.189422607421875, "learning_rate": 9.866888387988771e-08, "loss": 0.3627, "step": 
134975 }, { "epoch": 4.864670054420298, "grad_norm": 0.2541491985321045, "learning_rate": 9.841004661285957e-08, "loss": 0.3354, "step": 134980 }, { "epoch": 4.864850254081523, "grad_norm": 0.24088044464588165, "learning_rate": 9.815154862517495e-08, "loss": 0.3643, "step": 134985 }, { "epoch": 4.865030453742747, "grad_norm": 0.19148464500904083, "learning_rate": 9.789338992035324e-08, "loss": 0.3547, "step": 134990 }, { "epoch": 4.865210653403972, "grad_norm": 0.2473505735397339, "learning_rate": 9.763557050191385e-08, "loss": 0.3399, "step": 134995 }, { "epoch": 4.865390853065196, "grad_norm": 0.2551048696041107, "learning_rate": 9.737809037336787e-08, "loss": 0.3505, "step": 135000 }, { "epoch": 4.865390853065196, "eval_loss": 0.42877399921417236, "eval_runtime": 3.5351, "eval_samples_per_second": 28.288, "eval_steps_per_second": 7.072, "step": 135000 }, { "epoch": 4.865571052726421, "grad_norm": 0.24867664277553558, "learning_rate": 9.71209495382236e-08, "loss": 0.3733, "step": 135005 }, { "epoch": 4.865751252387645, "grad_norm": 0.33623263239860535, "learning_rate": 9.686414799998656e-08, "loss": 0.4158, "step": 135010 }, { "epoch": 4.86593145204887, "grad_norm": 0.301376074552536, "learning_rate": 9.660768576215396e-08, "loss": 0.3439, "step": 135015 }, { "epoch": 4.866111651710095, "grad_norm": 0.3229968845844269, "learning_rate": 9.635156282822023e-08, "loss": 0.3624, "step": 135020 }, { "epoch": 4.86629185137132, "grad_norm": 0.2948048710823059, "learning_rate": 9.609577920167422e-08, "loss": 0.3791, "step": 135025 }, { "epoch": 4.866472051032544, "grad_norm": 0.26417121291160583, "learning_rate": 9.584033488600208e-08, "loss": 0.357, "step": 135030 }, { "epoch": 4.866652250693769, "grad_norm": 0.234276682138443, "learning_rate": 9.558522988468432e-08, "loss": 0.3794, "step": 135035 }, { "epoch": 4.866832450354993, "grad_norm": 0.22871927917003632, "learning_rate": 9.533046420119318e-08, "loss": 0.3673, "step": 135040 }, { "epoch": 4.867012650016218, 
"grad_norm": 0.2720992863178253, "learning_rate": 9.507603783900366e-08, "loss": 0.3691, "step": 135045 }, { "epoch": 4.867192849677442, "grad_norm": 0.187312513589859, "learning_rate": 9.482195080158517e-08, "loss": 0.3576, "step": 135050 }, { "epoch": 4.867373049338667, "grad_norm": 0.2163442075252533, "learning_rate": 9.456820309239333e-08, "loss": 0.329, "step": 135055 }, { "epoch": 4.867553248999892, "grad_norm": 0.2720244526863098, "learning_rate": 9.431479471488647e-08, "loss": 0.364, "step": 135060 }, { "epoch": 4.867733448661117, "grad_norm": 0.23891782760620117, "learning_rate": 9.406172567252015e-08, "loss": 0.3779, "step": 135065 }, { "epoch": 4.867913648322341, "grad_norm": 0.21805892884731293, "learning_rate": 9.380899596873882e-08, "loss": 0.3617, "step": 135070 }, { "epoch": 4.868093847983566, "grad_norm": 0.23532462120056152, "learning_rate": 9.355660560699254e-08, "loss": 0.3854, "step": 135075 }, { "epoch": 4.868274047644791, "grad_norm": 0.3134779632091522, "learning_rate": 9.330455459071185e-08, "loss": 0.3714, "step": 135080 }, { "epoch": 4.8684542473060155, "grad_norm": 0.24097296595573425, "learning_rate": 9.305284292333572e-08, "loss": 0.3793, "step": 135085 }, { "epoch": 4.868634446967239, "grad_norm": 0.24849741160869598, "learning_rate": 9.280147060829192e-08, "loss": 0.3469, "step": 135090 }, { "epoch": 4.868814646628464, "grad_norm": 0.23304787278175354, "learning_rate": 9.25504376490055e-08, "loss": 0.3691, "step": 135095 }, { "epoch": 4.868994846289689, "grad_norm": 0.2561473548412323, "learning_rate": 9.229974404889875e-08, "loss": 0.3874, "step": 135100 }, { "epoch": 4.8691750459509135, "grad_norm": 0.23009338974952698, "learning_rate": 9.204938981138277e-08, "loss": 0.3618, "step": 135105 }, { "epoch": 4.869355245612138, "grad_norm": 0.22427034378051758, "learning_rate": 9.179937493987434e-08, "loss": 0.4131, "step": 135110 }, { "epoch": 4.869535445273363, "grad_norm": 0.30147793889045715, "learning_rate": 9.154969943777625e-08, 
"loss": 0.4054, "step": 135115 }, { "epoch": 4.869715644934588, "grad_norm": 0.2412918359041214, "learning_rate": 9.130036330849134e-08, "loss": 0.3642, "step": 135120 }, { "epoch": 4.8698958445958125, "grad_norm": 0.2309122383594513, "learning_rate": 9.105136655541691e-08, "loss": 0.3581, "step": 135125 }, { "epoch": 4.870076044257036, "grad_norm": 0.29338333010673523, "learning_rate": 9.080270918194466e-08, "loss": 0.3669, "step": 135130 }, { "epoch": 4.870256243918261, "grad_norm": 0.2538418471813202, "learning_rate": 9.05543911914608e-08, "loss": 0.3738, "step": 135135 }, { "epoch": 4.870436443579486, "grad_norm": 0.2361038327217102, "learning_rate": 9.03064125873515e-08, "loss": 0.3925, "step": 135140 }, { "epoch": 4.8706166432407105, "grad_norm": 0.27490556240081787, "learning_rate": 9.00587733729974e-08, "loss": 0.3542, "step": 135145 }, { "epoch": 4.870796842901935, "grad_norm": 0.21929150819778442, "learning_rate": 8.9811473551768e-08, "loss": 0.35, "step": 135150 }, { "epoch": 4.87097704256316, "grad_norm": 0.234662726521492, "learning_rate": 8.956451312703562e-08, "loss": 0.3558, "step": 135155 }, { "epoch": 4.871157242224385, "grad_norm": 0.2809460759162903, "learning_rate": 8.931789210216146e-08, "loss": 0.4014, "step": 135160 }, { "epoch": 4.8713374418856095, "grad_norm": 0.2753678560256958, "learning_rate": 8.907161048050949e-08, "loss": 0.3782, "step": 135165 }, { "epoch": 4.871517641546834, "grad_norm": 0.3030547499656677, "learning_rate": 8.882566826543259e-08, "loss": 0.4204, "step": 135170 }, { "epoch": 4.871697841208059, "grad_norm": 0.2812124788761139, "learning_rate": 8.858006546028641e-08, "loss": 0.3582, "step": 135175 }, { "epoch": 4.871878040869283, "grad_norm": 0.24554339051246643, "learning_rate": 8.833480206840994e-08, "loss": 0.3686, "step": 135180 }, { "epoch": 4.8720582405305075, "grad_norm": 0.24936611950397491, "learning_rate": 8.80898780931505e-08, "loss": 0.3608, "step": 135185 }, { "epoch": 4.872238440191732, "grad_norm": 
0.26903268694877625, "learning_rate": 8.784529353784432e-08, "loss": 0.3633, "step": 135190 }, { "epoch": 4.872418639852957, "grad_norm": 0.2582688629627228, "learning_rate": 8.760104840582207e-08, "loss": 0.3629, "step": 135195 }, { "epoch": 4.872598839514182, "grad_norm": 0.24837349355220795, "learning_rate": 8.735714270041162e-08, "loss": 0.3739, "step": 135200 }, { "epoch": 4.8727790391754064, "grad_norm": 0.2944589853286743, "learning_rate": 8.711357642493812e-08, "loss": 0.3869, "step": 135205 }, { "epoch": 4.872959238836631, "grad_norm": 0.2535194456577301, "learning_rate": 8.687034958271833e-08, "loss": 0.3885, "step": 135210 }, { "epoch": 4.873139438497856, "grad_norm": 0.2899385392665863, "learning_rate": 8.66274621770663e-08, "loss": 0.3827, "step": 135215 }, { "epoch": 4.873319638159081, "grad_norm": 0.23525474965572357, "learning_rate": 8.638491421129325e-08, "loss": 0.4044, "step": 135220 }, { "epoch": 4.8734998378203045, "grad_norm": 0.2679741084575653, "learning_rate": 8.614270568869931e-08, "loss": 0.3682, "step": 135225 }, { "epoch": 4.873680037481529, "grad_norm": 0.2726483643054962, "learning_rate": 8.590083661259019e-08, "loss": 0.4052, "step": 135230 }, { "epoch": 4.873860237142754, "grad_norm": 0.2760205864906311, "learning_rate": 8.565930698625769e-08, "loss": 0.3704, "step": 135235 }, { "epoch": 4.874040436803979, "grad_norm": 0.21959510445594788, "learning_rate": 8.54181168129936e-08, "loss": 0.3618, "step": 135240 }, { "epoch": 4.874220636465203, "grad_norm": 0.2475598305463791, "learning_rate": 8.517726609608424e-08, "loss": 0.3569, "step": 135245 }, { "epoch": 4.874400836126428, "grad_norm": 0.3407549560070038, "learning_rate": 8.493675483881303e-08, "loss": 0.3741, "step": 135250 }, { "epoch": 4.874581035787653, "grad_norm": 0.2598182260990143, "learning_rate": 8.46965830444496e-08, "loss": 0.373, "step": 135255 }, { "epoch": 4.874761235448878, "grad_norm": 0.20785191655158997, "learning_rate": 8.445675071627468e-08, "loss": 0.3365, 
"step": 135260 }, { "epoch": 4.874941435110102, "grad_norm": 0.33232206106185913, "learning_rate": 8.421725785755508e-08, "loss": 0.3491, "step": 135265 }, { "epoch": 4.875121634771327, "grad_norm": 0.26161181926727295, "learning_rate": 8.397810447154653e-08, "loss": 0.3811, "step": 135270 }, { "epoch": 4.875301834432551, "grad_norm": 0.2408609241247177, "learning_rate": 8.373929056151586e-08, "loss": 0.3769, "step": 135275 }, { "epoch": 4.875482034093776, "grad_norm": 0.23764941096305847, "learning_rate": 8.350081613071326e-08, "loss": 0.3892, "step": 135280 }, { "epoch": 4.875662233755, "grad_norm": 0.2595727741718292, "learning_rate": 8.326268118238612e-08, "loss": 0.3808, "step": 135285 }, { "epoch": 4.875842433416225, "grad_norm": 0.25060775876045227, "learning_rate": 8.302488571978185e-08, "loss": 0.3857, "step": 135290 }, { "epoch": 4.87602263307745, "grad_norm": 0.26845234632492065, "learning_rate": 8.278742974613951e-08, "loss": 0.4005, "step": 135295 }, { "epoch": 4.876202832738675, "grad_norm": 0.2484569400548935, "learning_rate": 8.255031326469542e-08, "loss": 0.3708, "step": 135300 }, { "epoch": 4.876383032399899, "grad_norm": 0.22691169381141663, "learning_rate": 8.231353627867755e-08, "loss": 0.379, "step": 135305 }, { "epoch": 4.876563232061124, "grad_norm": 0.28611308336257935, "learning_rate": 8.207709879131386e-08, "loss": 0.3717, "step": 135310 }, { "epoch": 4.876743431722348, "grad_norm": 0.21973417699337006, "learning_rate": 8.184100080582679e-08, "loss": 0.3596, "step": 135315 }, { "epoch": 4.876923631383573, "grad_norm": 0.3248337209224701, "learning_rate": 8.160524232543043e-08, "loss": 0.3808, "step": 135320 }, { "epoch": 4.877103831044797, "grad_norm": 0.29214221239089966, "learning_rate": 8.136982335333887e-08, "loss": 0.3813, "step": 135325 }, { "epoch": 4.877284030706022, "grad_norm": 0.26933974027633667, "learning_rate": 8.113474389275788e-08, "loss": 0.3874, "step": 135330 }, { "epoch": 4.877464230367247, "grad_norm": 
0.2623523473739624, "learning_rate": 8.090000394689324e-08, "loss": 0.4208, "step": 135335 }, { "epoch": 4.877644430028472, "grad_norm": 0.2307843416929245, "learning_rate": 8.06656035189396e-08, "loss": 0.3574, "step": 135340 }, { "epoch": 4.877824629689696, "grad_norm": 0.26475924253463745, "learning_rate": 8.04315426120944e-08, "loss": 0.3531, "step": 135345 }, { "epoch": 4.878004829350921, "grad_norm": 0.2883531153202057, "learning_rate": 8.0197821229544e-08, "loss": 0.3882, "step": 135350 }, { "epoch": 4.878185029012146, "grad_norm": 0.2579416036605835, "learning_rate": 7.996443937447196e-08, "loss": 0.3979, "step": 135355 }, { "epoch": 4.8783652286733705, "grad_norm": 0.21452683210372925, "learning_rate": 7.973139705006183e-08, "loss": 0.3527, "step": 135360 }, { "epoch": 4.878545428334594, "grad_norm": 0.2238248735666275, "learning_rate": 7.949869425948609e-08, "loss": 0.3585, "step": 135365 }, { "epoch": 4.878725627995819, "grad_norm": 0.2821867763996124, "learning_rate": 7.926633100591719e-08, "loss": 0.3877, "step": 135370 }, { "epoch": 4.878905827657044, "grad_norm": 0.2567353844642639, "learning_rate": 7.903430729251926e-08, "loss": 0.3633, "step": 135375 }, { "epoch": 4.879086027318269, "grad_norm": 0.26188910007476807, "learning_rate": 7.880262312245368e-08, "loss": 0.3812, "step": 135380 }, { "epoch": 4.879266226979493, "grad_norm": 0.2313767969608307, "learning_rate": 7.857127849887624e-08, "loss": 0.3708, "step": 135385 }, { "epoch": 4.879446426640718, "grad_norm": 0.318751722574234, "learning_rate": 7.834027342494277e-08, "loss": 0.3856, "step": 135390 }, { "epoch": 4.879626626301943, "grad_norm": 0.2602209150791168, "learning_rate": 7.810960790379517e-08, "loss": 0.3871, "step": 135395 }, { "epoch": 4.8798068259631675, "grad_norm": 0.2891538441181183, "learning_rate": 7.787928193858096e-08, "loss": 0.3592, "step": 135400 }, { "epoch": 4.879987025624391, "grad_norm": 0.30516698956489563, "learning_rate": 7.76492955324365e-08, "loss": 0.3672, 
"step": 135405 }, { "epoch": 4.880167225285616, "grad_norm": 0.2555546462535858, "learning_rate": 7.741964868849539e-08, "loss": 0.3664, "step": 135410 }, { "epoch": 4.880347424946841, "grad_norm": 0.25104543566703796, "learning_rate": 7.719034140988569e-08, "loss": 0.3555, "step": 135415 }, { "epoch": 4.8805276246080656, "grad_norm": 0.2774941027164459, "learning_rate": 7.696137369973266e-08, "loss": 0.3659, "step": 135420 }, { "epoch": 4.88070782426929, "grad_norm": 0.31660085916519165, "learning_rate": 7.673274556115328e-08, "loss": 0.3489, "step": 135425 }, { "epoch": 4.880888023930515, "grad_norm": 0.32428672909736633, "learning_rate": 7.650445699727005e-08, "loss": 0.4014, "step": 135430 }, { "epoch": 4.88106822359174, "grad_norm": 0.2850460112094879, "learning_rate": 7.627650801118325e-08, "loss": 0.3708, "step": 135435 }, { "epoch": 4.8812484232529645, "grad_norm": 0.24217717349529266, "learning_rate": 7.604889860600706e-08, "loss": 0.3699, "step": 135440 }, { "epoch": 4.881428622914189, "grad_norm": 0.2553984224796295, "learning_rate": 7.582162878483623e-08, "loss": 0.4042, "step": 135445 }, { "epoch": 4.881608822575414, "grad_norm": 0.23735560476779938, "learning_rate": 7.559469855077383e-08, "loss": 0.3997, "step": 135450 }, { "epoch": 4.881789022236639, "grad_norm": 0.2347259223461151, "learning_rate": 7.53681079069063e-08, "loss": 0.3606, "step": 135455 }, { "epoch": 4.8819692218978625, "grad_norm": 0.24018234014511108, "learning_rate": 7.51418568563228e-08, "loss": 0.3564, "step": 135460 }, { "epoch": 4.882149421559087, "grad_norm": 0.26346853375434875, "learning_rate": 7.491594540210423e-08, "loss": 0.3791, "step": 135465 }, { "epoch": 4.882329621220312, "grad_norm": 0.29124143719673157, "learning_rate": 7.469037354733144e-08, "loss": 0.3546, "step": 135470 }, { "epoch": 4.882509820881537, "grad_norm": 0.24946916103363037, "learning_rate": 7.446514129507698e-08, "loss": 0.4008, "step": 135475 }, { "epoch": 4.8826900205427615, "grad_norm": 
0.23744343221187592, "learning_rate": 7.424024864841061e-08, "loss": 0.3892, "step": 135480 }, { "epoch": 4.882870220203986, "grad_norm": 0.25719282031059265, "learning_rate": 7.401569561039379e-08, "loss": 0.3679, "step": 135485 }, { "epoch": 4.883050419865211, "grad_norm": 0.27071502804756165, "learning_rate": 7.379148218408516e-08, "loss": 0.3677, "step": 135490 }, { "epoch": 4.883230619526436, "grad_norm": 0.28900146484375, "learning_rate": 7.356760837254617e-08, "loss": 0.3915, "step": 135495 }, { "epoch": 4.8834108191876595, "grad_norm": 0.2625620663166046, "learning_rate": 7.334407417881883e-08, "loss": 0.389, "step": 135500 }, { "epoch": 4.8834108191876595, "eval_loss": 0.4288046956062317, "eval_runtime": 3.533, "eval_samples_per_second": 28.305, "eval_steps_per_second": 7.076, "step": 135500 }, { "epoch": 4.883591018848884, "grad_norm": 0.21090403199195862, "learning_rate": 7.312087960595348e-08, "loss": 0.3617, "step": 135505 }, { "epoch": 4.883771218510109, "grad_norm": 0.300222784280777, "learning_rate": 7.289802465698936e-08, "loss": 0.3612, "step": 135510 }, { "epoch": 4.883951418171334, "grad_norm": 0.2370145618915558, "learning_rate": 7.267550933496569e-08, "loss": 0.3661, "step": 135515 }, { "epoch": 4.8841316178325584, "grad_norm": 0.3078131377696991, "learning_rate": 7.245333364291063e-08, "loss": 0.3926, "step": 135520 }, { "epoch": 4.884311817493783, "grad_norm": 0.30718788504600525, "learning_rate": 7.223149758385506e-08, "loss": 0.3866, "step": 135525 }, { "epoch": 4.884492017155008, "grad_norm": 0.251179575920105, "learning_rate": 7.201000116081602e-08, "loss": 0.3607, "step": 135530 }, { "epoch": 4.884672216816233, "grad_norm": 0.21268831193447113, "learning_rate": 7.17888443768161e-08, "loss": 0.3675, "step": 135535 }, { "epoch": 4.884852416477457, "grad_norm": 0.22792141139507294, "learning_rate": 7.156802723486678e-08, "loss": 0.3899, "step": 135540 }, { "epoch": 4.885032616138682, "grad_norm": 0.2687564790248871, "learning_rate": 
7.134754973797674e-08, "loss": 0.3699, "step": 135545 }, { "epoch": 4.885212815799906, "grad_norm": 0.2454073429107666, "learning_rate": 7.112741188915195e-08, "loss": 0.3885, "step": 135550 }, { "epoch": 4.885393015461131, "grad_norm": 0.2492527812719345, "learning_rate": 7.09076136913872e-08, "loss": 0.3554, "step": 135555 }, { "epoch": 4.885573215122355, "grad_norm": 0.24872152507305145, "learning_rate": 7.068815514768013e-08, "loss": 0.3674, "step": 135560 }, { "epoch": 4.88575341478358, "grad_norm": 0.3966946005821228, "learning_rate": 7.046903626101997e-08, "loss": 0.3429, "step": 135565 }, { "epoch": 4.885933614444805, "grad_norm": 0.2777992784976959, "learning_rate": 7.025025703439325e-08, "loss": 0.3934, "step": 135570 }, { "epoch": 4.88611381410603, "grad_norm": 0.21319259703159332, "learning_rate": 7.003181747078091e-08, "loss": 0.373, "step": 135575 }, { "epoch": 4.886294013767254, "grad_norm": 0.22784800827503204, "learning_rate": 6.981371757315835e-08, "loss": 0.3694, "step": 135580 }, { "epoch": 4.886474213428479, "grad_norm": 0.3045668601989746, "learning_rate": 6.959595734449542e-08, "loss": 0.4188, "step": 135585 }, { "epoch": 4.886654413089703, "grad_norm": 0.22848592698574066, "learning_rate": 6.937853678776474e-08, "loss": 0.3826, "step": 135590 }, { "epoch": 4.886834612750928, "grad_norm": 0.17041075229644775, "learning_rate": 6.916145590592227e-08, "loss": 0.3806, "step": 135595 }, { "epoch": 4.887014812412152, "grad_norm": 0.3228053152561188, "learning_rate": 6.894471470192676e-08, "loss": 0.3686, "step": 135600 }, { "epoch": 4.887195012073377, "grad_norm": 0.26596736907958984, "learning_rate": 6.872831317873418e-08, "loss": 0.3763, "step": 135605 }, { "epoch": 4.887375211734602, "grad_norm": 0.2538110017776489, "learning_rate": 6.851225133929217e-08, "loss": 0.3434, "step": 135610 }, { "epoch": 4.887555411395827, "grad_norm": 0.23110675811767578, "learning_rate": 6.829652918654284e-08, "loss": 0.3542, "step": 135615 }, { "epoch": 
4.887735611057051, "grad_norm": 0.24243219196796417, "learning_rate": 6.808114672342825e-08, "loss": 0.3663, "step": 135620 }, { "epoch": 4.887915810718276, "grad_norm": 0.27579575777053833, "learning_rate": 6.78661039528794e-08, "loss": 0.3798, "step": 135625 }, { "epoch": 4.888096010379501, "grad_norm": 0.2889721393585205, "learning_rate": 6.765140087782729e-08, "loss": 0.3903, "step": 135630 }, { "epoch": 4.8882762100407255, "grad_norm": 0.22426536679267883, "learning_rate": 6.743703750120011e-08, "loss": 0.3345, "step": 135635 }, { "epoch": 4.888456409701949, "grad_norm": 0.3247759938240051, "learning_rate": 6.722301382591223e-08, "loss": 0.4081, "step": 135640 }, { "epoch": 4.888636609363174, "grad_norm": 0.24855536222457886, "learning_rate": 6.700932985488628e-08, "loss": 0.4023, "step": 135645 }, { "epoch": 4.888816809024399, "grad_norm": 0.26974859833717346, "learning_rate": 6.679598559103107e-08, "loss": 0.3685, "step": 135650 }, { "epoch": 4.888997008685624, "grad_norm": 0.2807910442352295, "learning_rate": 6.658298103725258e-08, "loss": 0.3518, "step": 135655 }, { "epoch": 4.889177208346848, "grad_norm": 0.271327406167984, "learning_rate": 6.637031619645684e-08, "loss": 0.3636, "step": 135660 }, { "epoch": 4.889357408008073, "grad_norm": 0.2790520489215851, "learning_rate": 6.61579910715332e-08, "loss": 0.3721, "step": 135665 }, { "epoch": 4.889537607669298, "grad_norm": 0.2329198122024536, "learning_rate": 6.594600566538212e-08, "loss": 0.356, "step": 135670 }, { "epoch": 4.8897178073305225, "grad_norm": 0.23520605266094208, "learning_rate": 6.573435998089018e-08, "loss": 0.3887, "step": 135675 }, { "epoch": 4.889898006991746, "grad_norm": 0.27827075123786926, "learning_rate": 6.552305402093838e-08, "loss": 0.3742, "step": 135680 }, { "epoch": 4.890078206652971, "grad_norm": 0.2520049214363098, "learning_rate": 6.531208778841059e-08, "loss": 0.379, "step": 135685 }, { "epoch": 4.890258406314196, "grad_norm": 0.27177202701568604, "learning_rate": 
6.510146128617389e-08, "loss": 0.3425, "step": 135690 }, { "epoch": 4.890438605975421, "grad_norm": 0.2742045819759369, "learning_rate": 6.48911745171038e-08, "loss": 0.4138, "step": 135695 }, { "epoch": 4.890618805636645, "grad_norm": 0.2716894745826721, "learning_rate": 6.468122748406469e-08, "loss": 0.3458, "step": 135700 }, { "epoch": 4.89079900529787, "grad_norm": 0.2410261183977127, "learning_rate": 6.447162018991537e-08, "loss": 0.3588, "step": 135705 }, { "epoch": 4.890979204959095, "grad_norm": 0.2710227072238922, "learning_rate": 6.426235263751468e-08, "loss": 0.3844, "step": 135710 }, { "epoch": 4.8911594046203195, "grad_norm": 0.27327826619148254, "learning_rate": 6.405342482970755e-08, "loss": 0.3764, "step": 135715 }, { "epoch": 4.891339604281544, "grad_norm": 0.2611567974090576, "learning_rate": 6.384483676934727e-08, "loss": 0.3772, "step": 135720 }, { "epoch": 4.891519803942769, "grad_norm": 0.23226478695869446, "learning_rate": 6.363658845927323e-08, "loss": 0.3606, "step": 135725 }, { "epoch": 4.891700003603994, "grad_norm": 0.28792256116867065, "learning_rate": 6.342867990232481e-08, "loss": 0.3684, "step": 135730 }, { "epoch": 4.8918802032652176, "grad_norm": 0.24476850032806396, "learning_rate": 6.322111110133033e-08, "loss": 0.3505, "step": 135735 }, { "epoch": 4.892060402926442, "grad_norm": 0.25403422117233276, "learning_rate": 6.301388205912084e-08, "loss": 0.3466, "step": 135740 }, { "epoch": 4.892240602587667, "grad_norm": 0.2334025502204895, "learning_rate": 6.28069927785191e-08, "loss": 0.3667, "step": 135745 }, { "epoch": 4.892420802248892, "grad_norm": 0.30650824308395386, "learning_rate": 6.260044326234227e-08, "loss": 0.3441, "step": 135750 }, { "epoch": 4.8926010019101165, "grad_norm": 0.2807537317276001, "learning_rate": 6.239423351341034e-08, "loss": 0.3514, "step": 135755 }, { "epoch": 4.892781201571341, "grad_norm": 0.28259849548339844, "learning_rate": 6.218836353452662e-08, "loss": 0.3651, "step": 135760 }, { "epoch": 
4.892961401232566, "grad_norm": 0.2717583179473877, "learning_rate": 6.198283332849719e-08, "loss": 0.3725, "step": 135765 }, { "epoch": 4.893141600893791, "grad_norm": 0.24983961880207062, "learning_rate": 6.177764289812538e-08, "loss": 0.3589, "step": 135770 }, { "epoch": 4.8933218005550145, "grad_norm": 0.2629464566707611, "learning_rate": 6.157279224620616e-08, "loss": 0.361, "step": 135775 }, { "epoch": 4.893502000216239, "grad_norm": 0.22154061496257782, "learning_rate": 6.136828137552619e-08, "loss": 0.3872, "step": 135780 }, { "epoch": 4.893682199877464, "grad_norm": 0.294659823179245, "learning_rate": 6.11641102888777e-08, "loss": 0.3613, "step": 135785 }, { "epoch": 4.893862399538689, "grad_norm": 0.20993182063102722, "learning_rate": 6.0960278989039e-08, "loss": 0.3675, "step": 135790 }, { "epoch": 4.8940425991999135, "grad_norm": 0.21128609776496887, "learning_rate": 6.075678747878567e-08, "loss": 0.3701, "step": 135795 }, { "epoch": 4.894222798861138, "grad_norm": 0.22223599255084991, "learning_rate": 6.055363576089601e-08, "loss": 0.3529, "step": 135800 }, { "epoch": 4.894402998522363, "grad_norm": 0.24055111408233643, "learning_rate": 6.035082383813451e-08, "loss": 0.3486, "step": 135805 }, { "epoch": 4.894583198183588, "grad_norm": 0.30938276648521423, "learning_rate": 6.014835171326283e-08, "loss": 0.3945, "step": 135810 }, { "epoch": 4.894763397844812, "grad_norm": 0.2910751700401306, "learning_rate": 5.994621938904266e-08, "loss": 0.337, "step": 135815 }, { "epoch": 4.894943597506037, "grad_norm": 0.2789898216724396, "learning_rate": 5.974442686822457e-08, "loss": 0.3547, "step": 135820 }, { "epoch": 4.895123797167261, "grad_norm": 0.2609977424144745, "learning_rate": 5.9542974153561916e-08, "loss": 0.3549, "step": 135825 }, { "epoch": 4.895303996828486, "grad_norm": 0.2318691611289978, "learning_rate": 5.934186124779972e-08, "loss": 0.352, "step": 135830 }, { "epoch": 4.8954841964897104, "grad_norm": 0.24600878357887268, "learning_rate": 
5.914108815367192e-08, "loss": 0.3723, "step": 135835 }, { "epoch": 4.895664396150935, "grad_norm": 0.2980553209781647, "learning_rate": 5.894065487392075e-08, "loss": 0.3656, "step": 135840 }, { "epoch": 4.89584459581216, "grad_norm": 0.2521979808807373, "learning_rate": 5.8740561411271824e-08, "loss": 0.3641, "step": 135845 }, { "epoch": 4.896024795473385, "grad_norm": 0.31992876529693604, "learning_rate": 5.854080776845627e-08, "loss": 0.3926, "step": 135850 }, { "epoch": 4.896204995134609, "grad_norm": 0.28693369030952454, "learning_rate": 5.8341393948194154e-08, "loss": 0.3604, "step": 135855 }, { "epoch": 4.896385194795834, "grad_norm": 0.2383836805820465, "learning_rate": 5.814231995319996e-08, "loss": 0.3979, "step": 135860 }, { "epoch": 4.896565394457058, "grad_norm": 0.32336336374282837, "learning_rate": 5.794358578618819e-08, "loss": 0.3494, "step": 135865 }, { "epoch": 4.896745594118283, "grad_norm": 0.2836247682571411, "learning_rate": 5.774519144986501e-08, "loss": 0.3809, "step": 135870 }, { "epoch": 4.896925793779507, "grad_norm": 0.2685668170452118, "learning_rate": 5.754713694693659e-08, "loss": 0.3776, "step": 135875 }, { "epoch": 4.897105993440732, "grad_norm": 0.28975343704223633, "learning_rate": 5.734942228009799e-08, "loss": 0.3755, "step": 135880 }, { "epoch": 4.897286193101957, "grad_norm": 0.3143594264984131, "learning_rate": 5.715204745204428e-08, "loss": 0.3687, "step": 135885 }, { "epoch": 4.897466392763182, "grad_norm": 0.21849270164966583, "learning_rate": 5.695501246546775e-08, "loss": 0.3852, "step": 135890 }, { "epoch": 4.897646592424406, "grad_norm": 0.26976725459098816, "learning_rate": 5.6758317323046815e-08, "loss": 0.3919, "step": 135895 }, { "epoch": 4.897826792085631, "grad_norm": 0.32330963015556335, "learning_rate": 5.656196202746544e-08, "loss": 0.3806, "step": 135900 }, { "epoch": 4.898006991746856, "grad_norm": 0.26278913021087646, "learning_rate": 5.6365946581399265e-08, "loss": 0.3952, "step": 135905 }, { "epoch": 
4.898187191408081, "grad_norm": 0.28790876269340515, "learning_rate": 5.617027098751559e-08, "loss": 0.3506, "step": 135910 }, { "epoch": 4.898367391069304, "grad_norm": 0.26349297165870667, "learning_rate": 5.597493524848452e-08, "loss": 0.3843, "step": 135915 }, { "epoch": 4.898547590730529, "grad_norm": 0.2596375644207001, "learning_rate": 5.577993936696502e-08, "loss": 0.3871, "step": 135920 }, { "epoch": 4.898727790391754, "grad_norm": 0.22803239524364471, "learning_rate": 5.5585283345613304e-08, "loss": 0.3691, "step": 135925 }, { "epoch": 4.898907990052979, "grad_norm": 0.22493457794189453, "learning_rate": 5.5390967187085585e-08, "loss": 0.37, "step": 135930 }, { "epoch": 4.899088189714203, "grad_norm": 0.2446439415216446, "learning_rate": 5.519699089402419e-08, "loss": 0.3802, "step": 135935 }, { "epoch": 4.899268389375428, "grad_norm": 0.2472543567419052, "learning_rate": 5.5003354469077006e-08, "loss": 0.3364, "step": 135940 }, { "epoch": 4.899448589036653, "grad_norm": 0.23050671815872192, "learning_rate": 5.481005791487526e-08, "loss": 0.3832, "step": 135945 }, { "epoch": 4.8996287886978775, "grad_norm": 0.3573814034461975, "learning_rate": 5.461710123406128e-08, "loss": 0.368, "step": 135950 }, { "epoch": 4.899808988359101, "grad_norm": 0.24702803790569305, "learning_rate": 5.442448442925796e-08, "loss": 0.3933, "step": 135955 }, { "epoch": 4.899989188020326, "grad_norm": 0.20937135815620422, "learning_rate": 5.423220750309099e-08, "loss": 0.3516, "step": 135960 }, { "epoch": 4.900169387681551, "grad_norm": 0.30278798937797546, "learning_rate": 5.4040270458180496e-08, "loss": 0.3488, "step": 135965 }, { "epoch": 4.900349587342776, "grad_norm": 0.2523956298828125, "learning_rate": 5.384867329714105e-08, "loss": 0.3533, "step": 135970 }, { "epoch": 4.900529787004, "grad_norm": 0.30250051617622375, "learning_rate": 5.365741602258445e-08, "loss": 0.3883, "step": 135975 }, { "epoch": 4.900709986665225, "grad_norm": 0.21715682744979858, "learning_rate": 
5.3466498637116945e-08, "loss": 0.3592, "step": 135980 }, { "epoch": 4.90089018632645, "grad_norm": 0.2727758586406708, "learning_rate": 5.327592114333646e-08, "loss": 0.3527, "step": 135985 }, { "epoch": 4.9010703859876745, "grad_norm": 0.33506885170936584, "learning_rate": 5.308568354384369e-08, "loss": 0.4078, "step": 135990 }, { "epoch": 4.901250585648899, "grad_norm": 0.2240988165140152, "learning_rate": 5.289578584122823e-08, "loss": 0.3615, "step": 135995 }, { "epoch": 4.901430785310124, "grad_norm": 0.2511599361896515, "learning_rate": 5.27062280380769e-08, "loss": 0.3921, "step": 136000 }, { "epoch": 4.901430785310124, "eval_loss": 0.4287901222705841, "eval_runtime": 3.5335, "eval_samples_per_second": 28.3, "eval_steps_per_second": 7.075, "step": 136000 }, { "epoch": 4.901610984971349, "grad_norm": 0.27596327662467957, "learning_rate": 5.251701013697374e-08, "loss": 0.3939, "step": 136005 }, { "epoch": 4.901791184632573, "grad_norm": 0.3502033054828644, "learning_rate": 5.2328132140497255e-08, "loss": 0.3705, "step": 136010 }, { "epoch": 4.901971384293797, "grad_norm": 0.269580602645874, "learning_rate": 5.21395940512176e-08, "loss": 0.3879, "step": 136015 }, { "epoch": 4.902151583955022, "grad_norm": 0.24801914393901825, "learning_rate": 5.195139587170772e-08, "loss": 0.3859, "step": 136020 }, { "epoch": 4.902331783616247, "grad_norm": 0.2341059297323227, "learning_rate": 5.1763537604529453e-08, "loss": 0.371, "step": 136025 }, { "epoch": 4.9025119832774715, "grad_norm": 0.21445782482624054, "learning_rate": 5.1576019252241866e-08, "loss": 0.3202, "step": 136030 }, { "epoch": 4.902692182938696, "grad_norm": 0.1884135752916336, "learning_rate": 5.138884081740125e-08, "loss": 0.3182, "step": 136035 }, { "epoch": 4.902872382599921, "grad_norm": 0.24446237087249756, "learning_rate": 5.120200230255834e-08, "loss": 0.3877, "step": 136040 }, { "epoch": 4.903052582261146, "grad_norm": 0.26809754967689514, "learning_rate": 5.101550371025832e-08, "loss": 0.3798, 
"step": 136045 }, { "epoch": 4.9032327819223696, "grad_norm": 0.2706208825111389, "learning_rate": 5.082934504304082e-08, "loss": 0.3462, "step": 136050 }, { "epoch": 4.903412981583594, "grad_norm": 0.2580229341983795, "learning_rate": 5.0643526303445485e-08, "loss": 0.3824, "step": 136055 }, { "epoch": 4.903593181244819, "grad_norm": 0.24854421615600586, "learning_rate": 5.045804749399807e-08, "loss": 0.3535, "step": 136060 }, { "epoch": 4.903773380906044, "grad_norm": 0.22989878058433533, "learning_rate": 5.0272908617229885e-08, "loss": 0.3991, "step": 136065 }, { "epoch": 4.9039535805672685, "grad_norm": 0.24265380203723907, "learning_rate": 5.0088109675666684e-08, "loss": 0.3802, "step": 136070 }, { "epoch": 4.904133780228493, "grad_norm": 0.2347709685564041, "learning_rate": 4.990365067181757e-08, "loss": 0.3465, "step": 136075 }, { "epoch": 4.904313979889718, "grad_norm": 0.18331743776798248, "learning_rate": 4.9719531608205527e-08, "loss": 0.3867, "step": 136080 }, { "epoch": 4.904494179550943, "grad_norm": 0.3420829772949219, "learning_rate": 4.9535752487331335e-08, "loss": 0.3615, "step": 136085 }, { "epoch": 4.904674379212167, "grad_norm": 0.31512126326560974, "learning_rate": 4.9352313311701317e-08, "loss": 0.3836, "step": 136090 }, { "epoch": 4.904854578873392, "grad_norm": 0.3358932137489319, "learning_rate": 4.916921408381625e-08, "loss": 0.3708, "step": 136095 }, { "epoch": 4.905034778534616, "grad_norm": 0.21517984569072723, "learning_rate": 4.898645480617137e-08, "loss": 0.3639, "step": 136100 }, { "epoch": 4.905214978195841, "grad_norm": 0.28815269470214844, "learning_rate": 4.880403548125356e-08, "loss": 0.3626, "step": 136105 }, { "epoch": 4.9053951778570655, "grad_norm": 0.2533737123012543, "learning_rate": 4.8621956111549735e-08, "loss": 0.3567, "step": 136110 }, { "epoch": 4.90557537751829, "grad_norm": 0.2487945407629013, "learning_rate": 4.8440216699544015e-08, "loss": 0.3604, "step": 136115 }, { "epoch": 4.905755577179515, "grad_norm": 
0.22873222827911377, "learning_rate": 4.8258817247706646e-08, "loss": 0.346, "step": 136120 }, { "epoch": 4.90593577684074, "grad_norm": 0.2279924899339676, "learning_rate": 4.807775775851342e-08, "loss": 0.3948, "step": 136125 }, { "epoch": 4.906115976501964, "grad_norm": 0.24945561587810516, "learning_rate": 4.7897038234429035e-08, "loss": 0.3805, "step": 136130 }, { "epoch": 4.906296176163189, "grad_norm": 0.19843456149101257, "learning_rate": 4.7716658677918194e-08, "loss": 0.3494, "step": 136135 }, { "epoch": 4.906476375824413, "grad_norm": 0.2849428057670593, "learning_rate": 4.753661909143448e-08, "loss": 0.4041, "step": 136140 }, { "epoch": 4.906656575485638, "grad_norm": 0.2670860290527344, "learning_rate": 4.735691947743703e-08, "loss": 0.3781, "step": 136145 }, { "epoch": 4.9068367751468625, "grad_norm": 0.2657918930053711, "learning_rate": 4.717755983836836e-08, "loss": 0.3338, "step": 136150 }, { "epoch": 4.907016974808087, "grad_norm": 0.2555624544620514, "learning_rate": 4.699854017667371e-08, "loss": 0.3745, "step": 136155 }, { "epoch": 4.907197174469312, "grad_norm": 0.2596377432346344, "learning_rate": 4.6819860494792814e-08, "loss": 0.3466, "step": 136160 }, { "epoch": 4.907377374130537, "grad_norm": 0.21930861473083496, "learning_rate": 4.6641520795162596e-08, "loss": 0.3511, "step": 136165 }, { "epoch": 4.907557573791761, "grad_norm": 0.2237590104341507, "learning_rate": 4.6463521080208906e-08, "loss": 0.3618, "step": 136170 }, { "epoch": 4.907737773452986, "grad_norm": 0.2339451014995575, "learning_rate": 4.6285861352357574e-08, "loss": 0.3686, "step": 136175 }, { "epoch": 4.907917973114211, "grad_norm": 0.27476266026496887, "learning_rate": 4.610854161403166e-08, "loss": 0.3819, "step": 136180 }, { "epoch": 4.908098172775436, "grad_norm": 0.2562359869480133, "learning_rate": 4.593156186764591e-08, "loss": 0.3852, "step": 136185 }, { "epoch": 4.908278372436659, "grad_norm": 0.2553425133228302, "learning_rate": 4.57549221156095e-08, "loss": 
0.3646, "step": 136190 }, { "epoch": 4.908458572097884, "grad_norm": 0.25126445293426514, "learning_rate": 4.5578622360334386e-08, "loss": 0.3564, "step": 136195 }, { "epoch": 4.908638771759109, "grad_norm": 0.2578040659427643, "learning_rate": 4.540266260421588e-08, "loss": 0.346, "step": 136200 }, { "epoch": 4.908818971420334, "grad_norm": 0.2958472967147827, "learning_rate": 4.5227042849654844e-08, "loss": 0.4071, "step": 136205 }, { "epoch": 4.908999171081558, "grad_norm": 0.2669653594493866, "learning_rate": 4.505176309904657e-08, "loss": 0.3787, "step": 136210 }, { "epoch": 4.909179370742783, "grad_norm": 0.294811874628067, "learning_rate": 4.4876823354775275e-08, "loss": 0.3732, "step": 136215 }, { "epoch": 4.909359570404008, "grad_norm": 0.2965461015701294, "learning_rate": 4.470222361922516e-08, "loss": 0.3712, "step": 136220 }, { "epoch": 4.909539770065233, "grad_norm": 0.24471497535705566, "learning_rate": 4.452796389477487e-08, "loss": 0.355, "step": 136225 }, { "epoch": 4.909719969726457, "grad_norm": 0.2504851520061493, "learning_rate": 4.435404418380029e-08, "loss": 0.3638, "step": 136230 }, { "epoch": 4.909900169387681, "grad_norm": 0.26970013976097107, "learning_rate": 4.418046448867175e-08, "loss": 0.3881, "step": 136235 }, { "epoch": 4.910080369048906, "grad_norm": 0.31252387166023254, "learning_rate": 4.400722481175401e-08, "loss": 0.4242, "step": 136240 }, { "epoch": 4.910260568710131, "grad_norm": 0.27864494919776917, "learning_rate": 4.3834325155403535e-08, "loss": 0.3557, "step": 136245 }, { "epoch": 4.910440768371355, "grad_norm": 0.2759137451648712, "learning_rate": 4.366176552197676e-08, "loss": 0.3641, "step": 136250 }, { "epoch": 4.91062096803258, "grad_norm": 0.23578329384326935, "learning_rate": 4.348954591383014e-08, "loss": 0.3712, "step": 136255 }, { "epoch": 4.910801167693805, "grad_norm": 0.2522645592689514, "learning_rate": 4.331766633330625e-08, "loss": 0.3695, "step": 136260 }, { "epoch": 4.9109813673550295, "grad_norm": 
0.300750195980072, "learning_rate": 4.3146126782747655e-08, "loss": 0.3721, "step": 136265 }, { "epoch": 4.911161567016254, "grad_norm": 0.26035916805267334, "learning_rate": 4.29749272644886e-08, "loss": 0.3666, "step": 136270 }, { "epoch": 4.911341766677479, "grad_norm": 0.2753373086452484, "learning_rate": 4.28040677808661e-08, "loss": 0.372, "step": 136275 }, { "epoch": 4.911521966338704, "grad_norm": 0.24623210728168488, "learning_rate": 4.263354833420607e-08, "loss": 0.3655, "step": 136280 }, { "epoch": 4.911702165999928, "grad_norm": 0.25700467824935913, "learning_rate": 4.246336892683445e-08, "loss": 0.3749, "step": 136285 }, { "epoch": 4.911882365661152, "grad_norm": 0.21152786910533905, "learning_rate": 4.229352956106603e-08, "loss": 0.3809, "step": 136290 }, { "epoch": 4.912062565322377, "grad_norm": 0.3113376200199127, "learning_rate": 4.2124030239212855e-08, "loss": 0.3877, "step": 136295 }, { "epoch": 4.912242764983602, "grad_norm": 0.22789722681045532, "learning_rate": 4.195487096359252e-08, "loss": 0.3668, "step": 136300 }, { "epoch": 4.9124229646448265, "grad_norm": 0.22643516957759857, "learning_rate": 4.178605173650318e-08, "loss": 0.3519, "step": 136305 }, { "epoch": 4.912603164306051, "grad_norm": 0.1881403774023056, "learning_rate": 4.161757256024579e-08, "loss": 0.3422, "step": 136310 }, { "epoch": 4.912783363967276, "grad_norm": 0.2479417622089386, "learning_rate": 4.144943343711849e-08, "loss": 0.4007, "step": 136315 }, { "epoch": 4.912963563628501, "grad_norm": 0.23649077117443085, "learning_rate": 4.128163436941113e-08, "loss": 0.3453, "step": 136320 }, { "epoch": 4.913143763289725, "grad_norm": 0.25313544273376465, "learning_rate": 4.111417535940798e-08, "loss": 0.4058, "step": 136325 }, { "epoch": 4.913323962950949, "grad_norm": 0.2785737216472626, "learning_rate": 4.094705640939056e-08, "loss": 0.3633, "step": 136330 }, { "epoch": 4.913504162612174, "grad_norm": 0.292338490486145, "learning_rate": 4.0780277521640374e-08, "loss": 
0.3945, "step": 136335 }, { "epoch": 4.913684362273399, "grad_norm": 0.2278224229812622, "learning_rate": 4.061383869842506e-08, "loss": 0.3593, "step": 136340 }, { "epoch": 4.9138645619346235, "grad_norm": 0.2170296460390091, "learning_rate": 4.044773994201223e-08, "loss": 0.3742, "step": 136345 }, { "epoch": 4.914044761595848, "grad_norm": 0.2683648467063904, "learning_rate": 4.0281981254669535e-08, "loss": 0.4156, "step": 136350 }, { "epoch": 4.914224961257073, "grad_norm": 0.2352461963891983, "learning_rate": 4.011656263865071e-08, "loss": 0.3855, "step": 136355 }, { "epoch": 4.914405160918298, "grad_norm": 0.28996291756629944, "learning_rate": 3.99514840962123e-08, "loss": 0.4141, "step": 136360 }, { "epoch": 4.914585360579522, "grad_norm": 0.22475266456604004, "learning_rate": 3.9786745629602494e-08, "loss": 0.3708, "step": 136365 }, { "epoch": 4.914765560240747, "grad_norm": 0.19460433721542358, "learning_rate": 3.962234724106395e-08, "loss": 0.3589, "step": 136370 }, { "epoch": 4.914945759901971, "grad_norm": 0.2615772485733032, "learning_rate": 3.945828893284209e-08, "loss": 0.3617, "step": 136375 }, { "epoch": 4.915125959563196, "grad_norm": 0.26762276887893677, "learning_rate": 3.929457070716569e-08, "loss": 0.3672, "step": 136380 }, { "epoch": 4.9153061592244205, "grad_norm": 0.40078943967819214, "learning_rate": 3.913119256626907e-08, "loss": 0.3814, "step": 136385 }, { "epoch": 4.915486358885645, "grad_norm": 0.2580387592315674, "learning_rate": 3.896815451237823e-08, "loss": 0.3731, "step": 136390 }, { "epoch": 4.91566655854687, "grad_norm": 0.2312445044517517, "learning_rate": 3.880545654771084e-08, "loss": 0.3327, "step": 136395 }, { "epoch": 4.915846758208095, "grad_norm": 0.22005638480186462, "learning_rate": 3.864309867449012e-08, "loss": 0.3817, "step": 136400 }, { "epoch": 4.916026957869319, "grad_norm": 0.2570466995239258, "learning_rate": 3.848108089492264e-08, "loss": 0.3434, "step": 136405 }, { "epoch": 4.916207157530544, "grad_norm": 
0.2323221117258072, "learning_rate": 3.8319403211217744e-08, "loss": 0.3523, "step": 136410 }, { "epoch": 4.916387357191768, "grad_norm": 0.26294851303100586, "learning_rate": 3.815806562557644e-08, "loss": 0.3646, "step": 136415 }, { "epoch": 4.916567556852993, "grad_norm": 0.29720500111579895, "learning_rate": 3.799706814020254e-08, "loss": 0.3537, "step": 136420 }, { "epoch": 4.9167477565142175, "grad_norm": 0.27285248041152954, "learning_rate": 3.783641075728317e-08, "loss": 0.3456, "step": 136425 }, { "epoch": 4.916927956175442, "grad_norm": 0.2373269498348236, "learning_rate": 3.767609347901102e-08, "loss": 0.3396, "step": 136430 }, { "epoch": 4.917108155836667, "grad_norm": 0.232501819729805, "learning_rate": 3.751611630756768e-08, "loss": 0.3594, "step": 136435 }, { "epoch": 4.917288355497892, "grad_norm": 0.24298062920570374, "learning_rate": 3.735647924513475e-08, "loss": 0.3462, "step": 136440 }, { "epoch": 4.917468555159116, "grad_norm": 0.2818912863731384, "learning_rate": 3.719718229388824e-08, "loss": 0.3911, "step": 136445 }, { "epoch": 4.917648754820341, "grad_norm": 0.2380588948726654, "learning_rate": 3.703822545599589e-08, "loss": 0.424, "step": 136450 }, { "epoch": 4.917828954481566, "grad_norm": 0.23889237642288208, "learning_rate": 3.687960873362539e-08, "loss": 0.3688, "step": 136455 }, { "epoch": 4.918009154142791, "grad_norm": 0.2520101070404053, "learning_rate": 3.672133212893891e-08, "loss": 0.3466, "step": 136460 }, { "epoch": 4.918189353804015, "grad_norm": 0.19939465820789337, "learning_rate": 3.6563395644087504e-08, "loss": 0.3633, "step": 136465 }, { "epoch": 4.918369553465239, "grad_norm": 0.21489644050598145, "learning_rate": 3.6405799281230556e-08, "loss": 0.373, "step": 136470 }, { "epoch": 4.918549753126464, "grad_norm": 0.22894160449504852, "learning_rate": 3.6248543042508023e-08, "loss": 0.3401, "step": 136475 }, { "epoch": 4.918729952787689, "grad_norm": 0.2765687108039856, "learning_rate": 3.609162693006818e-08, "loss": 
0.3808, "step": 136480 }, { "epoch": 4.918910152448913, "grad_norm": 0.2676806151866913, "learning_rate": 3.5935050946045434e-08, "loss": 0.3748, "step": 136485 }, { "epoch": 4.919090352110138, "grad_norm": 0.1796978861093521, "learning_rate": 3.5778815092576965e-08, "loss": 0.364, "step": 136490 }, { "epoch": 4.919270551771363, "grad_norm": 0.2699376344680786, "learning_rate": 3.562291937178608e-08, "loss": 0.3574, "step": 136495 }, { "epoch": 4.919450751432588, "grad_norm": 0.2322850078344345, "learning_rate": 3.546736378580162e-08, "loss": 0.375, "step": 136500 }, { "epoch": 4.919450751432588, "eval_loss": 0.42881008982658386, "eval_runtime": 3.537, "eval_samples_per_second": 28.272, "eval_steps_per_second": 7.068, "step": 136500 }, { "epoch": 4.919630951093812, "grad_norm": 0.266796350479126, "learning_rate": 3.531214833673857e-08, "loss": 0.4061, "step": 136505 }, { "epoch": 4.919811150755036, "grad_norm": 0.23469606041908264, "learning_rate": 3.515727302671468e-08, "loss": 0.3704, "step": 136510 }, { "epoch": 4.919991350416261, "grad_norm": 0.27841997146606445, "learning_rate": 3.500273785784214e-08, "loss": 0.3688, "step": 136515 }, { "epoch": 4.920171550077486, "grad_norm": 0.23662640154361725, "learning_rate": 3.4848542832222056e-08, "loss": 0.3902, "step": 136520 }, { "epoch": 4.92035174973871, "grad_norm": 0.3038434386253357, "learning_rate": 3.46946879519583e-08, "loss": 0.3871, "step": 136525 }, { "epoch": 4.920531949399935, "grad_norm": 0.2633965015411377, "learning_rate": 3.454117321914363e-08, "loss": 0.3661, "step": 136530 }, { "epoch": 4.92071214906116, "grad_norm": 0.3141336441040039, "learning_rate": 3.438799863587361e-08, "loss": 0.3908, "step": 136535 }, { "epoch": 4.920892348722385, "grad_norm": 0.21846546232700348, "learning_rate": 3.423516420423545e-08, "loss": 0.3788, "step": 136540 }, { "epoch": 4.921072548383609, "grad_norm": 0.27012789249420166, "learning_rate": 3.408266992630804e-08, "loss": 0.3757, "step": 136545 }, { "epoch": 
4.921252748044834, "grad_norm": 0.24852919578552246, "learning_rate": 3.393051580417028e-08, "loss": 0.3594, "step": 136550 }, { "epoch": 4.921432947706059, "grad_norm": 0.2667607069015503, "learning_rate": 3.377870183989551e-08, "loss": 0.3766, "step": 136555 }, { "epoch": 4.921613147367283, "grad_norm": 0.26542437076568604, "learning_rate": 3.3627228035551515e-08, "loss": 0.368, "step": 136560 }, { "epoch": 4.921793347028507, "grad_norm": 0.3026573657989502, "learning_rate": 3.3476094393203313e-08, "loss": 0.3884, "step": 136565 }, { "epoch": 4.921973546689732, "grad_norm": 0.26088666915893555, "learning_rate": 3.3325300914910374e-08, "loss": 0.3654, "step": 136570 }, { "epoch": 4.922153746350957, "grad_norm": 0.22487777471542358, "learning_rate": 3.31748476027266e-08, "loss": 0.3848, "step": 136575 }, { "epoch": 4.9223339460121815, "grad_norm": 0.2889478802680969, "learning_rate": 3.302473445870313e-08, "loss": 0.4139, "step": 136580 }, { "epoch": 4.922514145673406, "grad_norm": 0.21475286781787872, "learning_rate": 3.2874961484882785e-08, "loss": 0.3401, "step": 136585 }, { "epoch": 4.922694345334631, "grad_norm": 0.27408671379089355, "learning_rate": 3.272552868330558e-08, "loss": 0.4108, "step": 136590 }, { "epoch": 4.922874544995856, "grad_norm": 0.2363937646150589, "learning_rate": 3.257643605601157e-08, "loss": 0.3962, "step": 136595 }, { "epoch": 4.92305474465708, "grad_norm": 0.22098080813884735, "learning_rate": 3.2427683605026905e-08, "loss": 0.3596, "step": 136600 }, { "epoch": 4.923234944318304, "grad_norm": 0.2587352693080902, "learning_rate": 3.227927133238329e-08, "loss": 0.4166, "step": 136605 }, { "epoch": 4.923415143979529, "grad_norm": 0.3030674159526825, "learning_rate": 3.213119924010133e-08, "loss": 0.3743, "step": 136610 }, { "epoch": 4.923595343640754, "grad_norm": 0.24243710935115814, "learning_rate": 3.1983467330196084e-08, "loss": 0.372, "step": 136615 }, { "epoch": 4.9237755433019785, "grad_norm": 0.2680684030056, "learning_rate": 
3.1836075604685375e-08, "loss": 0.4072, "step": 136620 }, { "epoch": 4.923955742963203, "grad_norm": 0.2981574237346649, "learning_rate": 3.1689024065570397e-08, "loss": 0.3586, "step": 136625 }, { "epoch": 4.924135942624428, "grad_norm": 0.24394196271896362, "learning_rate": 3.1542312714860635e-08, "loss": 0.3844, "step": 136630 }, { "epoch": 4.924316142285653, "grad_norm": 0.26692622900009155, "learning_rate": 3.139594155455172e-08, "loss": 0.3926, "step": 136635 }, { "epoch": 4.9244963419468775, "grad_norm": 0.3153393268585205, "learning_rate": 3.1249910586639286e-08, "loss": 0.3689, "step": 136640 }, { "epoch": 4.924676541608102, "grad_norm": 0.2018764615058899, "learning_rate": 3.11042198131134e-08, "loss": 0.3361, "step": 136645 }, { "epoch": 4.924856741269326, "grad_norm": 0.24366877973079681, "learning_rate": 3.095886923595581e-08, "loss": 0.3578, "step": 136650 }, { "epoch": 4.925036940930551, "grad_norm": 0.21055671572685242, "learning_rate": 3.081385885715105e-08, "loss": 0.3606, "step": 136655 }, { "epoch": 4.9252171405917755, "grad_norm": 0.25222551822662354, "learning_rate": 3.06691886786753e-08, "loss": 0.3939, "step": 136660 }, { "epoch": 4.925397340253, "grad_norm": 0.18292470276355743, "learning_rate": 3.052485870249644e-08, "loss": 0.3271, "step": 136665 }, { "epoch": 4.925577539914225, "grad_norm": 0.2845352590084076, "learning_rate": 3.038086893057956e-08, "loss": 0.366, "step": 136670 }, { "epoch": 4.92575773957545, "grad_norm": 0.26779624819755554, "learning_rate": 3.0237219364892544e-08, "loss": 0.3505, "step": 136675 }, { "epoch": 4.9259379392366744, "grad_norm": 0.24947002530097961, "learning_rate": 3.009391000738659e-08, "loss": 0.3692, "step": 136680 }, { "epoch": 4.926118138897899, "grad_norm": 0.2781367301940918, "learning_rate": 2.995094086001848e-08, "loss": 0.3752, "step": 136685 }, { "epoch": 4.926298338559123, "grad_norm": 0.24780642986297607, "learning_rate": 2.980831192473388e-08, "loss": 0.3734, "step": 136690 }, { "epoch": 
4.926478538220348, "grad_norm": 0.2541256248950958, "learning_rate": 2.9666023203475667e-08, "loss": 0.3417, "step": 136695 }, { "epoch": 4.9266587378815725, "grad_norm": 0.2498217672109604, "learning_rate": 2.9524074698186743e-08, "loss": 0.3829, "step": 136700 }, { "epoch": 4.926838937542797, "grad_norm": 0.2726345360279083, "learning_rate": 2.938246641079334e-08, "loss": 0.3742, "step": 136705 }, { "epoch": 4.927019137204022, "grad_norm": 0.29105934500694275, "learning_rate": 2.9241198343232802e-08, "loss": 0.3852, "step": 136710 }, { "epoch": 4.927199336865247, "grad_norm": 0.24921295046806335, "learning_rate": 2.910027049742581e-08, "loss": 0.3763, "step": 136715 }, { "epoch": 4.927379536526471, "grad_norm": 0.3070908784866333, "learning_rate": 2.8959682875293047e-08, "loss": 0.3878, "step": 136720 }, { "epoch": 4.927559736187696, "grad_norm": 0.319961279630661, "learning_rate": 2.8819435478749657e-08, "loss": 0.4065, "step": 136725 }, { "epoch": 4.927739935848921, "grad_norm": 0.3225443959236145, "learning_rate": 2.8679528309705217e-08, "loss": 0.4024, "step": 136730 }, { "epoch": 4.927920135510146, "grad_norm": 0.2788389325141907, "learning_rate": 2.8539961370069314e-08, "loss": 0.3578, "step": 136735 }, { "epoch": 4.92810033517137, "grad_norm": 0.2546929717063904, "learning_rate": 2.84007346617432e-08, "loss": 0.3585, "step": 136740 }, { "epoch": 4.928280534832594, "grad_norm": 0.23796042799949646, "learning_rate": 2.826184818661981e-08, "loss": 0.3595, "step": 136745 }, { "epoch": 4.928460734493819, "grad_norm": 0.22895626723766327, "learning_rate": 2.8123301946594847e-08, "loss": 0.3904, "step": 136750 }, { "epoch": 4.928640934155044, "grad_norm": 0.2509956657886505, "learning_rate": 2.7985095943555696e-08, "loss": 0.4023, "step": 136755 }, { "epoch": 4.928821133816268, "grad_norm": 0.34779104590415955, "learning_rate": 2.7847230179384176e-08, "loss": 0.3417, "step": 136760 }, { "epoch": 4.929001333477493, "grad_norm": 0.19889460504055023, 
"learning_rate": 2.7709704655959344e-08, "loss": 0.342, "step": 136765 }, { "epoch": 4.929181533138718, "grad_norm": 0.2757225036621094, "learning_rate": 2.757251937515748e-08, "loss": 0.3928, "step": 136770 }, { "epoch": 4.929361732799943, "grad_norm": 0.32304972410202026, "learning_rate": 2.7435674338843752e-08, "loss": 0.379, "step": 136775 }, { "epoch": 4.929541932461167, "grad_norm": 0.21971245110034943, "learning_rate": 2.729916954888334e-08, "loss": 0.3514, "step": 136780 }, { "epoch": 4.929722132122391, "grad_norm": 0.2933555841445923, "learning_rate": 2.7163005007135867e-08, "loss": 0.3583, "step": 136785 }, { "epoch": 4.929902331783616, "grad_norm": 0.25747615098953247, "learning_rate": 2.7027180715458177e-08, "loss": 0.3979, "step": 136790 }, { "epoch": 4.930082531444841, "grad_norm": 0.27800336480140686, "learning_rate": 2.689169667570157e-08, "loss": 0.3521, "step": 136795 }, { "epoch": 4.930262731106065, "grad_norm": 0.24476049840450287, "learning_rate": 2.675655288970902e-08, "loss": 0.3589, "step": 136800 }, { "epoch": 4.93044293076729, "grad_norm": 0.21203723549842834, "learning_rate": 2.6621749359326263e-08, "loss": 0.3473, "step": 136805 }, { "epoch": 4.930623130428515, "grad_norm": 0.25764790177345276, "learning_rate": 2.6487286086385177e-08, "loss": 0.3886, "step": 136810 }, { "epoch": 4.93080333008974, "grad_norm": 0.2776869535446167, "learning_rate": 2.63531630727204e-08, "loss": 0.4101, "step": 136815 }, { "epoch": 4.930983529750964, "grad_norm": 0.23723161220550537, "learning_rate": 2.6219380320158248e-08, "loss": 0.3742, "step": 136820 }, { "epoch": 4.931163729412189, "grad_norm": 0.28152310848236084, "learning_rate": 2.6085937830522266e-08, "loss": 0.3533, "step": 136825 }, { "epoch": 4.931343929073414, "grad_norm": 0.3157213032245636, "learning_rate": 2.595283560562767e-08, "loss": 0.3813, "step": 136830 }, { "epoch": 4.931524128734638, "grad_norm": 0.22716785967350006, "learning_rate": 2.582007364729522e-08, "loss": 0.3488, "step": 
136835 }, { "epoch": 4.931704328395862, "grad_norm": 0.27288514375686646, "learning_rate": 2.568765195732625e-08, "loss": 0.3763, "step": 136840 }, { "epoch": 4.931884528057087, "grad_norm": 0.22473356127738953, "learning_rate": 2.5555570537527663e-08, "loss": 0.3811, "step": 136845 }, { "epoch": 4.932064727718312, "grad_norm": 0.25203046202659607, "learning_rate": 2.5423829389700783e-08, "loss": 0.3522, "step": 136850 }, { "epoch": 4.932244927379537, "grad_norm": 0.24768032133579254, "learning_rate": 2.5292428515638622e-08, "loss": 0.3684, "step": 136855 }, { "epoch": 4.932425127040761, "grad_norm": 0.23223379254341125, "learning_rate": 2.5161367917131416e-08, "loss": 0.3809, "step": 136860 }, { "epoch": 4.932605326701986, "grad_norm": 0.24357058107852936, "learning_rate": 2.5030647595963853e-08, "loss": 0.3799, "step": 136865 }, { "epoch": 4.932785526363211, "grad_norm": 0.3189919590950012, "learning_rate": 2.4900267553920608e-08, "loss": 0.3666, "step": 136870 }, { "epoch": 4.932965726024435, "grad_norm": 0.2528684437274933, "learning_rate": 2.4770227792775268e-08, "loss": 0.3559, "step": 136875 }, { "epoch": 4.933145925685659, "grad_norm": 0.2626766562461853, "learning_rate": 2.464052831429864e-08, "loss": 0.4019, "step": 136880 }, { "epoch": 4.933326125346884, "grad_norm": 0.2691434323787689, "learning_rate": 2.4511169120261524e-08, "loss": 0.3848, "step": 136885 }, { "epoch": 4.933506325008109, "grad_norm": 0.26981598138809204, "learning_rate": 2.4382150212423625e-08, "loss": 0.3692, "step": 136890 }, { "epoch": 4.9336865246693336, "grad_norm": 0.2676762342453003, "learning_rate": 2.425347159254465e-08, "loss": 0.3717, "step": 136895 }, { "epoch": 4.933866724330558, "grad_norm": 0.2530960738658905, "learning_rate": 2.4125133262373202e-08, "loss": 0.3849, "step": 136900 }, { "epoch": 4.934046923991783, "grad_norm": 0.2379455268383026, "learning_rate": 2.3997135223663424e-08, "loss": 0.3896, "step": 136905 }, { "epoch": 4.934227123653008, "grad_norm": 
0.29824960231781006, "learning_rate": 2.3869477478158374e-08, "loss": 0.3574, "step": 136910 }, { "epoch": 4.9344073233142325, "grad_norm": 0.21034584939479828, "learning_rate": 2.374216002759555e-08, "loss": 0.3759, "step": 136915 }, { "epoch": 4.934587522975457, "grad_norm": 0.28804388642311096, "learning_rate": 2.3615182873709673e-08, "loss": 0.386, "step": 136920 }, { "epoch": 4.934767722636681, "grad_norm": 0.25442805886268616, "learning_rate": 2.3488546018232692e-08, "loss": 0.39, "step": 136925 }, { "epoch": 4.934947922297906, "grad_norm": 0.2353426218032837, "learning_rate": 2.3362249462885454e-08, "loss": 0.3843, "step": 136930 }, { "epoch": 4.9351281219591305, "grad_norm": 0.22822360694408417, "learning_rate": 2.3236293209394354e-08, "loss": 0.3774, "step": 136935 }, { "epoch": 4.935308321620355, "grad_norm": 0.21662981808185577, "learning_rate": 2.311067725947469e-08, "loss": 0.3724, "step": 136940 }, { "epoch": 4.93548852128158, "grad_norm": 0.2427043616771698, "learning_rate": 2.2985401614833422e-08, "loss": 0.3599, "step": 136945 }, { "epoch": 4.935668720942805, "grad_norm": 0.2436215728521347, "learning_rate": 2.2860466277180303e-08, "loss": 0.3463, "step": 136950 }, { "epoch": 4.9358489206040295, "grad_norm": 0.18670439720153809, "learning_rate": 2.273587124821952e-08, "loss": 0.3419, "step": 136955 }, { "epoch": 4.936029120265254, "grad_norm": 0.23611606657505035, "learning_rate": 2.2611616529644164e-08, "loss": 0.374, "step": 136960 }, { "epoch": 4.936209319926478, "grad_norm": 0.27556440234184265, "learning_rate": 2.2487702123152877e-08, "loss": 0.3414, "step": 136965 }, { "epoch": 4.936389519587703, "grad_norm": 0.2503463625907898, "learning_rate": 2.2364128030430422e-08, "loss": 0.3384, "step": 136970 }, { "epoch": 4.9365697192489275, "grad_norm": 0.2716796398162842, "learning_rate": 2.2240894253158785e-08, "loss": 0.3669, "step": 136975 }, { "epoch": 4.936749918910152, "grad_norm": 0.2600260376930237, "learning_rate": 2.2118000793019956e-08, 
"loss": 0.4075, "step": 136980 }, { "epoch": 4.936930118571377, "grad_norm": 0.20133672654628754, "learning_rate": 2.1995447651687595e-08, "loss": 0.3881, "step": 136985 }, { "epoch": 4.937110318232602, "grad_norm": 0.268351674079895, "learning_rate": 2.187323483083259e-08, "loss": 0.355, "step": 136990 }, { "epoch": 4.9372905178938264, "grad_norm": 0.24614043533802032, "learning_rate": 2.1751362332117496e-08, "loss": 0.3716, "step": 136995 }, { "epoch": 4.937470717555051, "grad_norm": 0.24108512699604034, "learning_rate": 2.162983015720488e-08, "loss": 0.3567, "step": 137000 }, { "epoch": 4.937470717555051, "eval_loss": 0.4288177788257599, "eval_runtime": 3.5288, "eval_samples_per_second": 28.339, "eval_steps_per_second": 7.085, "step": 137000 }, { "epoch": 4.937650917216276, "grad_norm": 0.2266797125339508, "learning_rate": 2.150863830774896e-08, "loss": 0.3821, "step": 137005 }, { "epoch": 4.937831116877501, "grad_norm": 0.2394786775112152, "learning_rate": 2.138778678540121e-08, "loss": 0.3445, "step": 137010 }, { "epoch": 4.938011316538725, "grad_norm": 0.2852950394153595, "learning_rate": 2.12672755918103e-08, "loss": 0.3695, "step": 137015 }, { "epoch": 4.938191516199949, "grad_norm": 0.270437628030777, "learning_rate": 2.1147104728616584e-08, "loss": 0.3827, "step": 137020 }, { "epoch": 4.938371715861174, "grad_norm": 0.26983991265296936, "learning_rate": 2.1027274197457646e-08, "loss": 0.3773, "step": 137025 }, { "epoch": 4.938551915522399, "grad_norm": 0.2832012474536896, "learning_rate": 2.0907783999965515e-08, "loss": 0.3651, "step": 137030 }, { "epoch": 4.938732115183623, "grad_norm": 0.28883448243141174, "learning_rate": 2.0788634137769437e-08, "loss": 0.3767, "step": 137035 }, { "epoch": 4.938912314844848, "grad_norm": 0.29116883873939514, "learning_rate": 2.066982461249034e-08, "loss": 0.3505, "step": 137040 }, { "epoch": 4.939092514506073, "grad_norm": 0.2609765827655792, "learning_rate": 2.0551355425749154e-08, "loss": 0.3741, "step": 137045 }, { 
"epoch": 4.939272714167298, "grad_norm": 0.23762352764606476, "learning_rate": 2.0433226579161247e-08, "loss": 0.402, "step": 137050 }, { "epoch": 4.939452913828522, "grad_norm": 0.23419687151908875, "learning_rate": 2.031543807433367e-08, "loss": 0.3582, "step": 137055 }, { "epoch": 4.939633113489746, "grad_norm": 0.23861204087734222, "learning_rate": 2.0197989912873473e-08, "loss": 0.3914, "step": 137060 }, { "epoch": 4.939813313150971, "grad_norm": 0.2407989501953125, "learning_rate": 2.0080882096376595e-08, "loss": 0.3529, "step": 137065 }, { "epoch": 4.939993512812196, "grad_norm": 0.2740454375743866, "learning_rate": 1.9964114626441764e-08, "loss": 0.3948, "step": 137070 }, { "epoch": 4.94017371247342, "grad_norm": 0.28621092438697815, "learning_rate": 1.984768750466215e-08, "loss": 0.3807, "step": 137075 }, { "epoch": 4.940353912134645, "grad_norm": 0.26705020666122437, "learning_rate": 1.9731600732619816e-08, "loss": 0.4056, "step": 137080 }, { "epoch": 4.94053411179587, "grad_norm": 0.27954116463661194, "learning_rate": 1.961585431189683e-08, "loss": 0.3494, "step": 137085 }, { "epoch": 4.940714311457095, "grad_norm": 0.2583678662776947, "learning_rate": 1.9500448244072487e-08, "loss": 0.4074, "step": 137090 }, { "epoch": 4.940894511118319, "grad_norm": 0.27501171827316284, "learning_rate": 1.9385382530717755e-08, "loss": 0.3644, "step": 137095 }, { "epoch": 4.941074710779544, "grad_norm": 0.2305932641029358, "learning_rate": 1.9270657173403593e-08, "loss": 0.3497, "step": 137100 }, { "epoch": 4.941254910440769, "grad_norm": 0.23129644989967346, "learning_rate": 1.9156272173687094e-08, "loss": 0.3861, "step": 137105 }, { "epoch": 4.941435110101993, "grad_norm": 0.295730322599411, "learning_rate": 1.9042227533130897e-08, "loss": 0.3406, "step": 137110 }, { "epoch": 4.941615309763217, "grad_norm": 0.2581028938293457, "learning_rate": 1.8928523253286536e-08, "loss": 0.3465, "step": 137115 }, { "epoch": 4.941795509424442, "grad_norm": 0.21017776429653168, 
"learning_rate": 1.881515933570832e-08, "loss": 0.3547, "step": 137120 }, { "epoch": 4.941975709085667, "grad_norm": 0.24874837696552277, "learning_rate": 1.8702135781933917e-08, "loss": 0.3988, "step": 137125 }, { "epoch": 4.942155908746892, "grad_norm": 0.2943408191204071, "learning_rate": 1.858945259350653e-08, "loss": 0.3855, "step": 137130 }, { "epoch": 4.942336108408116, "grad_norm": 0.27087777853012085, "learning_rate": 1.8477109771963817e-08, "loss": 0.381, "step": 137135 }, { "epoch": 4.942516308069341, "grad_norm": 0.2375209629535675, "learning_rate": 1.8365107318829568e-08, "loss": 0.3586, "step": 137140 }, { "epoch": 4.942696507730566, "grad_norm": 0.2501397430896759, "learning_rate": 1.8253445235638654e-08, "loss": 0.3695, "step": 137145 }, { "epoch": 4.94287670739179, "grad_norm": 0.2996070981025696, "learning_rate": 1.8142123523903765e-08, "loss": 0.4018, "step": 137150 }, { "epoch": 4.943056907053014, "grad_norm": 0.3034278154373169, "learning_rate": 1.8031142185148674e-08, "loss": 0.3636, "step": 137155 }, { "epoch": 4.943237106714239, "grad_norm": 0.2226463109254837, "learning_rate": 1.792050122088329e-08, "loss": 0.3754, "step": 137160 }, { "epoch": 4.943417306375464, "grad_norm": 0.24234063923358917, "learning_rate": 1.7810200632611963e-08, "loss": 0.3599, "step": 137165 }, { "epoch": 4.943597506036689, "grad_norm": 0.25728529691696167, "learning_rate": 1.7700240421841817e-08, "loss": 0.3817, "step": 137170 }, { "epoch": 4.943777705697913, "grad_norm": 0.22455373406410217, "learning_rate": 1.759062059006611e-08, "loss": 0.3744, "step": 137175 }, { "epoch": 4.943957905359138, "grad_norm": 0.2268945425748825, "learning_rate": 1.7481341138783634e-08, "loss": 0.3862, "step": 137180 }, { "epoch": 4.944138105020363, "grad_norm": 0.27768200635910034, "learning_rate": 1.7372402069482097e-08, "loss": 0.3813, "step": 137185 }, { "epoch": 4.9443183046815875, "grad_norm": 0.256106436252594, "learning_rate": 1.726380338364364e-08, "loss": 0.3873, "step": 
137190 }, { "epoch": 4.944498504342812, "grad_norm": 0.21937619149684906, "learning_rate": 1.7155545082750413e-08, "loss": 0.3762, "step": 137195 }, { "epoch": 4.944678704004036, "grad_norm": 0.2467588484287262, "learning_rate": 1.704762716827346e-08, "loss": 0.3593, "step": 137200 }, { "epoch": 4.944858903665261, "grad_norm": 0.282312273979187, "learning_rate": 1.6940049641686605e-08, "loss": 0.3672, "step": 137205 }, { "epoch": 4.9450391033264856, "grad_norm": 0.2458559274673462, "learning_rate": 1.683281250445534e-08, "loss": 0.3546, "step": 137210 }, { "epoch": 4.94521930298771, "grad_norm": 0.2797468602657318, "learning_rate": 1.6725915758039612e-08, "loss": 0.3572, "step": 137215 }, { "epoch": 4.945399502648935, "grad_norm": 0.2865085005760193, "learning_rate": 1.661935940389936e-08, "loss": 0.3809, "step": 137220 }, { "epoch": 4.94557970231016, "grad_norm": 0.2759266495704651, "learning_rate": 1.6513143443477873e-08, "loss": 0.3687, "step": 137225 }, { "epoch": 4.9457599019713845, "grad_norm": 0.2847881615161896, "learning_rate": 1.6407267878232326e-08, "loss": 0.3586, "step": 137230 }, { "epoch": 4.945940101632609, "grad_norm": 0.2958735227584839, "learning_rate": 1.630173270960045e-08, "loss": 0.3507, "step": 137235 }, { "epoch": 4.946120301293834, "grad_norm": 0.29240357875823975, "learning_rate": 1.619653793901721e-08, "loss": 0.3602, "step": 137240 }, { "epoch": 4.946300500955058, "grad_norm": 0.25079208612442017, "learning_rate": 1.609168356792312e-08, "loss": 0.3342, "step": 137245 }, { "epoch": 4.9464807006162825, "grad_norm": 0.2954399287700653, "learning_rate": 1.598716959773927e-08, "loss": 0.3812, "step": 137250 }, { "epoch": 4.946660900277507, "grad_norm": 0.28976091742515564, "learning_rate": 1.588299602989507e-08, "loss": 0.3823, "step": 137255 }, { "epoch": 4.946841099938732, "grad_norm": 0.328617125749588, "learning_rate": 1.5779162865806053e-08, "loss": 0.3833, "step": 137260 }, { "epoch": 4.947021299599957, "grad_norm": 0.2867159843444824, 
"learning_rate": 1.5675670106890527e-08, "loss": 0.3928, "step": 137265 }, { "epoch": 4.9472014992611815, "grad_norm": 0.221615731716156, "learning_rate": 1.557251775455848e-08, "loss": 0.3601, "step": 137270 }, { "epoch": 4.947381698922406, "grad_norm": 0.17245341837406158, "learning_rate": 1.5469705810211566e-08, "loss": 0.3856, "step": 137275 }, { "epoch": 4.947561898583631, "grad_norm": 0.23257580399513245, "learning_rate": 1.5367234275254215e-08, "loss": 0.391, "step": 137280 }, { "epoch": 4.947742098244856, "grad_norm": 0.2740226984024048, "learning_rate": 1.5265103151079762e-08, "loss": 0.3524, "step": 137285 }, { "epoch": 4.94792229790608, "grad_norm": 0.23505844175815582, "learning_rate": 1.5163312439081533e-08, "loss": 0.3628, "step": 137290 }, { "epoch": 4.948102497567304, "grad_norm": 0.26007720828056335, "learning_rate": 1.5061862140644535e-08, "loss": 0.3935, "step": 137295 }, { "epoch": 4.948282697228529, "grad_norm": 0.24692949652671814, "learning_rate": 1.4960752257153764e-08, "loss": 0.3707, "step": 137300 }, { "epoch": 4.948462896889754, "grad_norm": 0.25878119468688965, "learning_rate": 1.4859982789985905e-08, "loss": 0.3508, "step": 137305 }, { "epoch": 4.9486430965509784, "grad_norm": 0.3022497892379761, "learning_rate": 1.4759553740512078e-08, "loss": 0.3793, "step": 137310 }, { "epoch": 4.948823296212203, "grad_norm": 0.21404872834682465, "learning_rate": 1.4659465110103409e-08, "loss": 0.3658, "step": 137315 }, { "epoch": 4.949003495873428, "grad_norm": 0.23077760636806488, "learning_rate": 1.4559716900122699e-08, "loss": 0.3532, "step": 137320 }, { "epoch": 4.949183695534653, "grad_norm": 0.29384756088256836, "learning_rate": 1.4460309111927194e-08, "loss": 0.3861, "step": 137325 }, { "epoch": 4.949363895195877, "grad_norm": 0.26955410838127136, "learning_rate": 1.4361241746874143e-08, "loss": 0.3721, "step": 137330 }, { "epoch": 4.949544094857101, "grad_norm": 0.3485639989376068, "learning_rate": 1.426251480630969e-08, "loss": 0.4169, 
"step": 137335 }, { "epoch": 4.949724294518326, "grad_norm": 0.2870672643184662, "learning_rate": 1.4164128291582757e-08, "loss": 0.344, "step": 137340 }, { "epoch": 4.949904494179551, "grad_norm": 0.2651885449886322, "learning_rate": 1.4066082204031162e-08, "loss": 0.386, "step": 137345 }, { "epoch": 4.950084693840775, "grad_norm": 0.21898417174816132, "learning_rate": 1.3968376544992723e-08, "loss": 0.3918, "step": 137350 }, { "epoch": 4.950264893502, "grad_norm": 0.23120352625846863, "learning_rate": 1.3871011315796934e-08, "loss": 0.3546, "step": 137355 }, { "epoch": 4.950445093163225, "grad_norm": 0.2565666437149048, "learning_rate": 1.377398651777051e-08, "loss": 0.3648, "step": 137360 }, { "epoch": 4.95062529282445, "grad_norm": 0.26287904381752014, "learning_rate": 1.3677302152237393e-08, "loss": 0.3705, "step": 137365 }, { "epoch": 4.950805492485674, "grad_norm": 0.2553930878639221, "learning_rate": 1.3580958220513195e-08, "loss": 0.3652, "step": 137370 }, { "epoch": 4.950985692146899, "grad_norm": 0.2285178005695343, "learning_rate": 1.3484954723910759e-08, "loss": 0.3703, "step": 137375 }, { "epoch": 4.951165891808124, "grad_norm": 0.2720915377140045, "learning_rate": 1.3389291663737368e-08, "loss": 0.3932, "step": 137380 }, { "epoch": 4.951346091469348, "grad_norm": 0.26943936944007874, "learning_rate": 1.3293969041297538e-08, "loss": 0.3849, "step": 137385 }, { "epoch": 4.951526291130572, "grad_norm": 0.2463103085756302, "learning_rate": 1.3198986857890227e-08, "loss": 0.3596, "step": 137390 }, { "epoch": 4.951706490791797, "grad_norm": 0.3053506016731262, "learning_rate": 1.3104345114808848e-08, "loss": 0.3752, "step": 137395 }, { "epoch": 4.951886690453022, "grad_norm": 0.24420666694641113, "learning_rate": 1.301004381334403e-08, "loss": 0.3704, "step": 137400 }, { "epoch": 4.952066890114247, "grad_norm": 0.26522696018218994, "learning_rate": 1.2916082954780861e-08, "loss": 0.372, "step": 137405 }, { "epoch": 4.952247089775471, "grad_norm": 
0.2860383987426758, "learning_rate": 1.2822462540396096e-08, "loss": 0.3768, "step": 137410 }, { "epoch": 4.952427289436696, "grad_norm": 0.21746990084648132, "learning_rate": 1.2729182571466492e-08, "loss": 0.3822, "step": 137415 }, { "epoch": 4.952607489097921, "grad_norm": 0.24215134978294373, "learning_rate": 1.2636243049266028e-08, "loss": 0.3612, "step": 137420 }, { "epoch": 4.952787688759145, "grad_norm": 0.23102861642837524, "learning_rate": 1.254364397506036e-08, "loss": 0.3624, "step": 137425 }, { "epoch": 4.952967888420369, "grad_norm": 0.2532614767551422, "learning_rate": 1.245138535010959e-08, "loss": 0.3882, "step": 137430 }, { "epoch": 4.953148088081594, "grad_norm": 0.25287774205207825, "learning_rate": 1.235946717566827e-08, "loss": 0.4318, "step": 137435 }, { "epoch": 4.953328287742819, "grad_norm": 0.26038363575935364, "learning_rate": 1.226788945299373e-08, "loss": 0.3708, "step": 137440 }, { "epoch": 4.953508487404044, "grad_norm": 0.2988220453262329, "learning_rate": 1.2176652183329418e-08, "loss": 0.3655, "step": 137445 }, { "epoch": 4.953688687065268, "grad_norm": 0.3282366096973419, "learning_rate": 1.2085755367921558e-08, "loss": 0.3618, "step": 137450 }, { "epoch": 4.953868886726493, "grad_norm": 0.31220465898513794, "learning_rate": 1.1995199008008051e-08, "loss": 0.3505, "step": 137455 }, { "epoch": 4.954049086387718, "grad_norm": 0.32093510031700134, "learning_rate": 1.1904983104821243e-08, "loss": 0.3727, "step": 137460 }, { "epoch": 4.9542292860489425, "grad_norm": 0.2403687983751297, "learning_rate": 1.1815107659590707e-08, "loss": 0.3588, "step": 137465 }, { "epoch": 4.954409485710167, "grad_norm": 0.21455620229244232, "learning_rate": 1.172557267354324e-08, "loss": 0.3521, "step": 137470 }, { "epoch": 4.954589685371392, "grad_norm": 0.26999422907829285, "learning_rate": 1.1636378147897308e-08, "loss": 0.3487, "step": 137475 }, { "epoch": 4.954769885032616, "grad_norm": 0.2714780569076538, "learning_rate": 1.1547524083865835e-08, 
"loss": 0.3968, "step": 137480 }, { "epoch": 4.954950084693841, "grad_norm": 0.3368239104747772, "learning_rate": 1.1459010482661735e-08, "loss": 0.3995, "step": 137485 }, { "epoch": 4.955130284355065, "grad_norm": 0.24230121076107025, "learning_rate": 1.137083734549238e-08, "loss": 0.3657, "step": 137490 }, { "epoch": 4.95531048401629, "grad_norm": 0.20803499221801758, "learning_rate": 1.1283004673554032e-08, "loss": 0.3669, "step": 137495 }, { "epoch": 4.955490683677515, "grad_norm": 0.19922496378421783, "learning_rate": 1.1195512468048508e-08, "loss": 0.3611, "step": 137500 }, { "epoch": 4.955490683677515, "eval_loss": 0.42881152033805847, "eval_runtime": 3.5375, "eval_samples_per_second": 28.268, "eval_steps_per_second": 7.067, "step": 137500 }, { "epoch": 4.9556708833387395, "grad_norm": 0.2823123335838318, "learning_rate": 1.1108360730169299e-08, "loss": 0.3637, "step": 137505 }, { "epoch": 4.955851082999964, "grad_norm": 0.2755952775478363, "learning_rate": 1.1021549461096015e-08, "loss": 0.3775, "step": 137510 }, { "epoch": 4.956031282661189, "grad_norm": 0.24063868820667267, "learning_rate": 1.093507866201937e-08, "loss": 0.3269, "step": 137515 }, { "epoch": 4.956211482322413, "grad_norm": 0.20716242492198944, "learning_rate": 1.0848948334113428e-08, "loss": 0.3577, "step": 137520 }, { "epoch": 4.9563916819836376, "grad_norm": 0.30676987767219543, "learning_rate": 1.0763158478549474e-08, "loss": 0.3848, "step": 137525 }, { "epoch": 4.956571881644862, "grad_norm": 0.2790820896625519, "learning_rate": 1.0677709096501565e-08, "loss": 0.354, "step": 137530 }, { "epoch": 4.956752081306087, "grad_norm": 0.22698865830898285, "learning_rate": 1.0609594732577389e-08, "loss": 0.3403, "step": 137535 }, { "epoch": 4.956932280967312, "grad_norm": 0.19917847216129303, "learning_rate": 1.0524758205784913e-08, "loss": 0.3678, "step": 137540 }, { "epoch": 4.9571124806285365, "grad_norm": 0.2684897482395172, "learning_rate": 1.0440262155755886e-08, "loss": 0.3663, "step": 
137545 }, { "epoch": 4.957292680289761, "grad_norm": 0.24667024612426758, "learning_rate": 1.0356106583639391e-08, "loss": 0.3596, "step": 137550 }, { "epoch": 4.957472879950986, "grad_norm": 0.1977359503507614, "learning_rate": 1.0272291490581731e-08, "loss": 0.3723, "step": 137555 }, { "epoch": 4.957653079612211, "grad_norm": 0.26738354563713074, "learning_rate": 1.0188816877726437e-08, "loss": 0.3911, "step": 137560 }, { "epoch": 4.957833279273435, "grad_norm": 0.2211945503950119, "learning_rate": 1.0105682746211487e-08, "loss": 0.3196, "step": 137565 }, { "epoch": 4.958013478934659, "grad_norm": 0.27656158804893494, "learning_rate": 1.0022889097169308e-08, "loss": 0.348, "step": 137570 }, { "epoch": 4.958193678595884, "grad_norm": 0.2519603967666626, "learning_rate": 9.940435931724001e-09, "loss": 0.3757, "step": 137575 }, { "epoch": 4.958373878257109, "grad_norm": 0.25226572155952454, "learning_rate": 9.858323251005219e-09, "loss": 0.3778, "step": 137580 }, { "epoch": 4.9585540779183335, "grad_norm": 0.20732538402080536, "learning_rate": 9.77655105612596e-09, "loss": 0.3805, "step": 137585 }, { "epoch": 4.958734277579558, "grad_norm": 0.2726946771144867, "learning_rate": 9.695119348204773e-09, "loss": 0.3624, "step": 137590 }, { "epoch": 4.958914477240783, "grad_norm": 0.20067138969898224, "learning_rate": 9.614028128346331e-09, "loss": 0.333, "step": 137595 }, { "epoch": 4.959094676902008, "grad_norm": 0.2827952206134796, "learning_rate": 9.533277397660856e-09, "loss": 0.3483, "step": 137600 }, { "epoch": 4.959274876563232, "grad_norm": 0.23344914615154266, "learning_rate": 9.452867157247469e-09, "loss": 0.3538, "step": 137605 }, { "epoch": 4.959455076224456, "grad_norm": 0.2827022969722748, "learning_rate": 9.372797408196965e-09, "loss": 0.3309, "step": 137610 }, { "epoch": 4.959635275885681, "grad_norm": 0.2828150987625122, "learning_rate": 9.293068151605689e-09, "loss": 0.3894, "step": 137615 }, { "epoch": 4.959815475546906, "grad_norm": 
0.26365163922309875, "learning_rate": 9.213679388558882e-09, "loss": 0.3679, "step": 137620 }, { "epoch": 4.9599956752081305, "grad_norm": 0.291981965303421, "learning_rate": 9.134631120136238e-09, "loss": 0.3676, "step": 137625 }, { "epoch": 4.960175874869355, "grad_norm": 0.2089691460132599, "learning_rate": 9.055923347414675e-09, "loss": 0.3302, "step": 137630 }, { "epoch": 4.96035607453058, "grad_norm": 0.33166277408599854, "learning_rate": 8.977556071471105e-09, "loss": 0.352, "step": 137635 }, { "epoch": 4.960536274191805, "grad_norm": 0.2414046823978424, "learning_rate": 8.899529293365794e-09, "loss": 0.3702, "step": 137640 }, { "epoch": 4.960716473853029, "grad_norm": 0.2852274477481842, "learning_rate": 8.821843014170106e-09, "loss": 0.3833, "step": 137645 }, { "epoch": 4.960896673514254, "grad_norm": 0.23705297708511353, "learning_rate": 8.744497234935978e-09, "loss": 0.3855, "step": 137650 }, { "epoch": 4.961076873175479, "grad_norm": 0.28535374999046326, "learning_rate": 8.66749195671812e-09, "loss": 0.3653, "step": 137655 }, { "epoch": 4.961257072836703, "grad_norm": 0.23056595027446747, "learning_rate": 8.590827180571248e-09, "loss": 0.3719, "step": 137660 }, { "epoch": 4.961437272497927, "grad_norm": 0.24574077129364014, "learning_rate": 8.514502907533418e-09, "loss": 0.397, "step": 137665 }, { "epoch": 4.961617472159152, "grad_norm": 0.24938498437404633, "learning_rate": 8.438519138645462e-09, "loss": 0.3871, "step": 137670 }, { "epoch": 4.961797671820377, "grad_norm": 0.35726431012153625, "learning_rate": 8.362875874945442e-09, "loss": 0.3636, "step": 137675 }, { "epoch": 4.961977871481602, "grad_norm": 0.2681273818016052, "learning_rate": 8.287573117465863e-09, "loss": 0.3763, "step": 137680 }, { "epoch": 4.962158071142826, "grad_norm": 0.25855353474617004, "learning_rate": 8.212610867225356e-09, "loss": 0.3967, "step": 137685 }, { "epoch": 4.962338270804051, "grad_norm": 0.22436444461345673, "learning_rate": 8.137989125250877e-09, "loss": 0.3584, 
"step": 137690 }, { "epoch": 4.962518470465276, "grad_norm": 0.2069372534751892, "learning_rate": 8.063707892558281e-09, "loss": 0.3502, "step": 137695 }, { "epoch": 4.9626986701265, "grad_norm": 0.23629046976566315, "learning_rate": 7.989767170157869e-09, "loss": 0.3882, "step": 137700 }, { "epoch": 4.962878869787724, "grad_norm": 0.24616286158561707, "learning_rate": 7.916166959059944e-09, "loss": 0.3603, "step": 137705 }, { "epoch": 4.963059069448949, "grad_norm": 0.22220478951931, "learning_rate": 7.84290726026371e-09, "loss": 0.3726, "step": 137710 }, { "epoch": 4.963239269110174, "grad_norm": 0.23332755267620087, "learning_rate": 7.769988074771139e-09, "loss": 0.3892, "step": 137715 }, { "epoch": 4.963419468771399, "grad_norm": 0.2551371455192566, "learning_rate": 7.697409403573108e-09, "loss": 0.3594, "step": 137720 }, { "epoch": 4.963599668432623, "grad_norm": 0.26924464106559753, "learning_rate": 7.625171247657714e-09, "loss": 0.3503, "step": 137725 }, { "epoch": 4.963779868093848, "grad_norm": 0.25808215141296387, "learning_rate": 7.553273608013057e-09, "loss": 0.3743, "step": 137730 }, { "epoch": 4.963960067755073, "grad_norm": 0.30064940452575684, "learning_rate": 7.481716485616131e-09, "loss": 0.3813, "step": 137735 }, { "epoch": 4.9641402674162975, "grad_norm": 0.23391714692115784, "learning_rate": 7.410499881438382e-09, "loss": 0.3778, "step": 137740 }, { "epoch": 4.964320467077522, "grad_norm": 0.25494813919067383, "learning_rate": 7.339623796456807e-09, "loss": 0.3887, "step": 137745 }, { "epoch": 4.964500666738747, "grad_norm": 0.30371445417404175, "learning_rate": 7.2690882316345245e-09, "loss": 0.3799, "step": 137750 }, { "epoch": 4.964680866399971, "grad_norm": 0.3035849630832672, "learning_rate": 7.198893187931877e-09, "loss": 0.372, "step": 137755 }, { "epoch": 4.964861066061196, "grad_norm": 0.7903338670730591, "learning_rate": 7.129038666306431e-09, "loss": 0.3787, "step": 137760 }, { "epoch": 4.96504126572242, "grad_norm": 
0.24585622549057007, "learning_rate": 7.059524667707429e-09, "loss": 0.3528, "step": 137765 }, { "epoch": 4.965221465383645, "grad_norm": 0.6391294002532959, "learning_rate": 6.99035119308411e-09, "loss": 0.3562, "step": 137770 }, { "epoch": 4.96540166504487, "grad_norm": 0.2620919942855835, "learning_rate": 6.9215182433773895e-09, "loss": 0.349, "step": 137775 }, { "epoch": 4.9655818647060945, "grad_norm": 0.25720539689064026, "learning_rate": 6.85302581952818e-09, "loss": 0.334, "step": 137780 }, { "epoch": 4.965762064367319, "grad_norm": 0.23249337077140808, "learning_rate": 6.784873922466295e-09, "loss": 0.3948, "step": 137785 }, { "epoch": 4.965942264028544, "grad_norm": 0.29254230856895447, "learning_rate": 6.717062553121545e-09, "loss": 0.3938, "step": 137790 }, { "epoch": 4.966122463689768, "grad_norm": 0.27492207288742065, "learning_rate": 6.649591712420966e-09, "loss": 0.4013, "step": 137795 }, { "epoch": 4.966302663350993, "grad_norm": 0.24353964626789093, "learning_rate": 6.5824614012777174e-09, "loss": 0.3498, "step": 137800 }, { "epoch": 4.966482863012217, "grad_norm": 0.2534390389919281, "learning_rate": 6.515671620610508e-09, "loss": 0.3779, "step": 137805 }, { "epoch": 4.966663062673442, "grad_norm": 0.28155553340911865, "learning_rate": 6.449222371326946e-09, "loss": 0.3622, "step": 137810 }, { "epoch": 4.966843262334667, "grad_norm": 0.28064605593681335, "learning_rate": 6.383113654334638e-09, "loss": 0.3944, "step": 137815 }, { "epoch": 4.9670234619958915, "grad_norm": 0.32647833228111267, "learning_rate": 6.31734547053564e-09, "loss": 0.3521, "step": 137820 }, { "epoch": 4.967203661657116, "grad_norm": 0.2887493073940277, "learning_rate": 6.2519178208209075e-09, "loss": 0.3774, "step": 137825 }, { "epoch": 4.967383861318341, "grad_norm": 0.2260408252477646, "learning_rate": 6.186830706086943e-09, "loss": 0.4083, "step": 137830 }, { "epoch": 4.967564060979566, "grad_norm": 0.2433057725429535, "learning_rate": 6.122084127216377e-09, "loss": 
0.4013, "step": 137835 }, { "epoch": 4.96774426064079, "grad_norm": 0.2385009527206421, "learning_rate": 6.057678085094609e-09, "loss": 0.3676, "step": 137840 }, { "epoch": 4.967924460302014, "grad_norm": 0.20035654306411743, "learning_rate": 5.993612580598717e-09, "loss": 0.3494, "step": 137845 }, { "epoch": 4.968104659963239, "grad_norm": 0.28439047932624817, "learning_rate": 5.929887614600227e-09, "loss": 0.3352, "step": 137850 }, { "epoch": 4.968284859624464, "grad_norm": 0.2770126163959503, "learning_rate": 5.866503187967886e-09, "loss": 0.3925, "step": 137855 }, { "epoch": 4.9684650592856885, "grad_norm": 0.33329346776008606, "learning_rate": 5.803459301564895e-09, "loss": 0.3521, "step": 137860 }, { "epoch": 4.968645258946913, "grad_norm": 0.27743661403656006, "learning_rate": 5.7407559562516754e-09, "loss": 0.3377, "step": 137865 }, { "epoch": 4.968825458608138, "grad_norm": 0.2770087718963623, "learning_rate": 5.678393152880323e-09, "loss": 0.3618, "step": 137870 }, { "epoch": 4.969005658269363, "grad_norm": 0.26707491278648376, "learning_rate": 5.616370892302936e-09, "loss": 0.3735, "step": 137875 }, { "epoch": 4.969185857930587, "grad_norm": 0.24807052314281464, "learning_rate": 5.554689175363281e-09, "loss": 0.3664, "step": 137880 }, { "epoch": 4.969366057591811, "grad_norm": 0.2263546884059906, "learning_rate": 5.493348002902354e-09, "loss": 0.3802, "step": 137885 }, { "epoch": 4.969546257253036, "grad_norm": 0.2367497682571411, "learning_rate": 5.432347375755598e-09, "loss": 0.3439, "step": 137890 }, { "epoch": 4.969726456914261, "grad_norm": 0.25867414474487305, "learning_rate": 5.371687294752903e-09, "loss": 0.3795, "step": 137895 }, { "epoch": 4.9699066565754855, "grad_norm": 0.2083641141653061, "learning_rate": 5.311367760721387e-09, "loss": 0.3653, "step": 137900 }, { "epoch": 4.97008685623671, "grad_norm": 0.24108898639678955, "learning_rate": 5.251388774485388e-09, "loss": 0.3576, "step": 137905 }, { "epoch": 4.970267055897935, "grad_norm": 
0.2666069567203522, "learning_rate": 5.1917503368581474e-09, "loss": 0.3818, "step": 137910 }, { "epoch": 4.97044725555916, "grad_norm": 0.24382546544075012, "learning_rate": 5.132452448655678e-09, "loss": 0.3721, "step": 137915 }, { "epoch": 4.970627455220384, "grad_norm": 0.307109534740448, "learning_rate": 5.073495110682891e-09, "loss": 0.4167, "step": 137920 }, { "epoch": 4.970807654881609, "grad_norm": 0.2820093631744385, "learning_rate": 5.014878323744698e-09, "loss": 0.3609, "step": 137925 }, { "epoch": 4.970987854542834, "grad_norm": 0.2773934304714203, "learning_rate": 4.956602088640461e-09, "loss": 0.3742, "step": 137930 }, { "epoch": 4.971168054204058, "grad_norm": 0.23025000095367432, "learning_rate": 4.898666406163988e-09, "loss": 0.3682, "step": 137935 }, { "epoch": 4.9713482538652825, "grad_norm": 0.24413655698299408, "learning_rate": 4.841071277103537e-09, "loss": 0.4005, "step": 137940 }, { "epoch": 4.971528453526507, "grad_norm": 0.23541654646396637, "learning_rate": 4.783816702241817e-09, "loss": 0.3692, "step": 137945 }, { "epoch": 4.971708653187732, "grad_norm": 0.2659054398536682, "learning_rate": 4.7269026823643095e-09, "loss": 0.4064, "step": 137950 }, { "epoch": 4.971888852848957, "grad_norm": 0.23656433820724487, "learning_rate": 4.670329218242619e-09, "loss": 0.3935, "step": 137955 }, { "epoch": 4.972069052510181, "grad_norm": 0.27703598141670227, "learning_rate": 4.614096310648352e-09, "loss": 0.4068, "step": 137960 }, { "epoch": 4.972249252171406, "grad_norm": 0.20020613074302673, "learning_rate": 4.558203960347562e-09, "loss": 0.3666, "step": 137965 }, { "epoch": 4.972429451832631, "grad_norm": 0.27739599347114563, "learning_rate": 4.5026521681035275e-09, "loss": 0.3525, "step": 137970 }, { "epoch": 4.972609651493855, "grad_norm": 0.31181418895721436, "learning_rate": 4.4474409346711984e-09, "loss": 0.412, "step": 137975 }, { "epoch": 4.972789851155079, "grad_norm": 0.2787044048309326, "learning_rate": 4.392570260799977e-09, "loss": 
0.3389, "step": 137980 }, { "epoch": 4.972970050816304, "grad_norm": 0.25257426500320435, "learning_rate": 4.338040147244815e-09, "loss": 0.3701, "step": 137985 }, { "epoch": 4.973150250477529, "grad_norm": 0.34394869208335876, "learning_rate": 4.283850594741234e-09, "loss": 0.3779, "step": 137990 }, { "epoch": 4.973330450138754, "grad_norm": 0.2812308073043823, "learning_rate": 4.230001604033085e-09, "loss": 0.3751, "step": 137995 }, { "epoch": 4.973510649799978, "grad_norm": 0.30743271112442017, "learning_rate": 4.176493175850338e-09, "loss": 0.392, "step": 138000 }, { "epoch": 4.973510649799978, "eval_loss": 0.4287694990634918, "eval_runtime": 3.5429, "eval_samples_per_second": 28.226, "eval_steps_per_second": 7.056, "step": 138000 }, { "epoch": 4.973690849461203, "grad_norm": 0.20221595466136932, "learning_rate": 4.123325310925741e-09, "loss": 0.374, "step": 138005 }, { "epoch": 4.973871049122428, "grad_norm": 0.19391195476055145, "learning_rate": 4.070498009978163e-09, "loss": 0.3898, "step": 138010 }, { "epoch": 4.974051248783653, "grad_norm": 0.2685374617576599, "learning_rate": 4.018011273734801e-09, "loss": 0.3656, "step": 138015 }, { "epoch": 4.974231448444877, "grad_norm": 0.2981413006782532, "learning_rate": 3.965865102903421e-09, "loss": 0.339, "step": 138020 }, { "epoch": 4.974411648106102, "grad_norm": 0.24895471334457397, "learning_rate": 3.914059498197342e-09, "loss": 0.3855, "step": 138025 }, { "epoch": 4.974591847767326, "grad_norm": 0.289763867855072, "learning_rate": 3.862594460324332e-09, "loss": 0.3667, "step": 138030 }, { "epoch": 4.974772047428551, "grad_norm": 0.2891743779182434, "learning_rate": 3.811469989983829e-09, "loss": 0.3728, "step": 138035 }, { "epoch": 4.974952247089775, "grad_norm": 0.30129414796829224, "learning_rate": 3.7606860878725005e-09, "loss": 0.3678, "step": 138040 }, { "epoch": 4.975132446751, "grad_norm": 0.3116011619567871, "learning_rate": 3.7102427546842343e-09, "loss": 0.3739, "step": 138045 }, { "epoch": 
4.975312646412225, "grad_norm": 0.24255453050136566, "learning_rate": 3.6601399911018185e-09, "loss": 0.3675, "step": 138050 }, { "epoch": 4.9754928460734495, "grad_norm": 0.33993199467658997, "learning_rate": 3.610377797813591e-09, "loss": 0.3956, "step": 138055 }, { "epoch": 4.975673045734674, "grad_norm": 0.23433293402194977, "learning_rate": 3.560956175491237e-09, "loss": 0.3584, "step": 138060 }, { "epoch": 4.975853245395899, "grad_norm": 0.3143203556537628, "learning_rate": 3.511875124811992e-09, "loss": 0.3765, "step": 138065 }, { "epoch": 4.976033445057123, "grad_norm": 0.25443294644355774, "learning_rate": 3.4631346464447658e-09, "loss": 0.3904, "step": 138070 }, { "epoch": 4.976213644718348, "grad_norm": 0.22525587677955627, "learning_rate": 3.4147347410529164e-09, "loss": 0.3743, "step": 138075 }, { "epoch": 4.976393844379572, "grad_norm": 0.27883243560791016, "learning_rate": 3.3666754092970266e-09, "loss": 0.3883, "step": 138080 }, { "epoch": 4.976574044040797, "grad_norm": 0.2141328603029251, "learning_rate": 3.3189566518293526e-09, "loss": 0.3603, "step": 138085 }, { "epoch": 4.976754243702022, "grad_norm": 0.24589018523693085, "learning_rate": 3.2715784692993747e-09, "loss": 0.3851, "step": 138090 }, { "epoch": 4.9769344433632465, "grad_norm": 0.27227839827537537, "learning_rate": 3.2245408623565733e-09, "loss": 0.3899, "step": 138095 }, { "epoch": 4.977114643024471, "grad_norm": 0.2859554588794708, "learning_rate": 3.1778438316393266e-09, "loss": 0.3719, "step": 138100 }, { "epoch": 4.977294842685696, "grad_norm": 0.2409016191959381, "learning_rate": 3.1314873777860133e-09, "loss": 0.3319, "step": 138105 }, { "epoch": 4.977475042346921, "grad_norm": 0.27938273549079895, "learning_rate": 3.0854715014266845e-09, "loss": 0.3947, "step": 138110 }, { "epoch": 4.9776552420081455, "grad_norm": 0.26294222474098206, "learning_rate": 3.039796203185841e-09, "loss": 0.3686, "step": 138115 }, { "epoch": 4.977835441669369, "grad_norm": 0.267437607049942, 
"learning_rate": 2.994461483690758e-09, "loss": 0.367, "step": 138120 }, { "epoch": 4.978015641330594, "grad_norm": 0.23144756257534027, "learning_rate": 2.9494673435548346e-09, "loss": 0.3605, "step": 138125 }, { "epoch": 4.978195840991819, "grad_norm": 0.24928845465183258, "learning_rate": 2.9048137833942446e-09, "loss": 0.3923, "step": 138130 }, { "epoch": 4.9783760406530435, "grad_norm": 0.2760869860649109, "learning_rate": 2.860500803814059e-09, "loss": 0.3978, "step": 138135 }, { "epoch": 4.978556240314268, "grad_norm": 0.25941580533981323, "learning_rate": 2.8165284054193498e-09, "loss": 0.3736, "step": 138140 }, { "epoch": 4.978736439975493, "grad_norm": 0.26913321018218994, "learning_rate": 2.7728965888124124e-09, "loss": 0.3906, "step": 138145 }, { "epoch": 4.978916639636718, "grad_norm": 0.24921806156635284, "learning_rate": 2.729605354584441e-09, "loss": 0.3461, "step": 138150 }, { "epoch": 4.9790968392979424, "grad_norm": 0.2868900001049042, "learning_rate": 2.68665470332663e-09, "loss": 0.3779, "step": 138155 }, { "epoch": 4.979277038959166, "grad_norm": 0.2325631082057953, "learning_rate": 2.644044635621845e-09, "loss": 0.384, "step": 138160 }, { "epoch": 4.979457238620391, "grad_norm": 0.24822711944580078, "learning_rate": 2.601775152052954e-09, "loss": 0.3653, "step": 138165 }, { "epoch": 4.979637438281616, "grad_norm": 0.2686733603477478, "learning_rate": 2.559846253194498e-09, "loss": 0.3809, "step": 138170 }, { "epoch": 4.9798176379428405, "grad_norm": 0.27422237396240234, "learning_rate": 2.518257939621016e-09, "loss": 0.3853, "step": 138175 }, { "epoch": 4.979997837604065, "grad_norm": 0.24748918414115906, "learning_rate": 2.4770102118931714e-09, "loss": 0.4023, "step": 138180 }, { "epoch": 4.98017803726529, "grad_norm": 0.2908730208873749, "learning_rate": 2.4361030705771782e-09, "loss": 0.3696, "step": 138185 }, { "epoch": 4.980358236926515, "grad_norm": 0.31870460510253906, "learning_rate": 2.395536516230923e-09, "loss": 0.3693, "step": 
138190 }, { "epoch": 4.980538436587739, "grad_norm": 0.25807613134384155, "learning_rate": 2.355310549406742e-09, "loss": 0.4022, "step": 138195 }, { "epoch": 4.980718636248964, "grad_norm": 0.269734263420105, "learning_rate": 2.315425170648644e-09, "loss": 0.3582, "step": 138200 }, { "epoch": 4.980898835910189, "grad_norm": 0.22261251509189606, "learning_rate": 2.27588038050619e-09, "loss": 0.4311, "step": 138205 }, { "epoch": 4.981079035571413, "grad_norm": 0.25113579630851746, "learning_rate": 2.2366761795150626e-09, "loss": 0.3689, "step": 138210 }, { "epoch": 4.9812592352326375, "grad_norm": 0.21979762613773346, "learning_rate": 2.1978125682081683e-09, "loss": 0.3975, "step": 138215 }, { "epoch": 4.981439434893862, "grad_norm": 0.2908676564693451, "learning_rate": 2.159289547115639e-09, "loss": 0.3526, "step": 138220 }, { "epoch": 4.981619634555087, "grad_norm": 0.2306397557258606, "learning_rate": 2.121107116764831e-09, "loss": 0.3521, "step": 138225 }, { "epoch": 4.981799834216312, "grad_norm": 0.2513945698738098, "learning_rate": 2.0832652776719974e-09, "loss": 0.3938, "step": 138230 }, { "epoch": 4.981980033877536, "grad_norm": 0.2760458290576935, "learning_rate": 2.0457640303561676e-09, "loss": 0.3951, "step": 138235 }, { "epoch": 4.982160233538761, "grad_norm": 0.2553420066833496, "learning_rate": 2.0086033753280442e-09, "loss": 0.3646, "step": 138240 }, { "epoch": 4.982340433199986, "grad_norm": 0.25386497378349304, "learning_rate": 1.971783313090003e-09, "loss": 0.3633, "step": 138245 }, { "epoch": 4.982520632861211, "grad_norm": 0.20062953233718872, "learning_rate": 1.9353038441499716e-09, "loss": 0.3194, "step": 138250 }, { "epoch": 4.9827008325224345, "grad_norm": 0.26775026321411133, "learning_rate": 1.899164968999223e-09, "loss": 0.4075, "step": 138255 }, { "epoch": 4.982881032183659, "grad_norm": 0.273628830909729, "learning_rate": 1.8633666881345823e-09, "loss": 0.3616, "step": 138260 }, { "epoch": 4.983061231844884, "grad_norm": 
0.2722027003765106, "learning_rate": 1.8279090020417721e-09, "loss": 0.3612, "step": 138265 }, { "epoch": 4.983241431506109, "grad_norm": 0.2538944184780121, "learning_rate": 1.7927919112009639e-09, "loss": 0.3556, "step": 138270 }, { "epoch": 4.983421631167333, "grad_norm": 0.264728844165802, "learning_rate": 1.7580154160951047e-09, "loss": 0.386, "step": 138275 }, { "epoch": 4.983601830828558, "grad_norm": 0.2643708288669586, "learning_rate": 1.7235795171988146e-09, "loss": 0.3676, "step": 138280 }, { "epoch": 4.983782030489783, "grad_norm": 0.2401455044746399, "learning_rate": 1.689484214975612e-09, "loss": 0.3727, "step": 138285 }, { "epoch": 4.983962230151008, "grad_norm": 0.2023637741804123, "learning_rate": 1.6557295098945658e-09, "loss": 0.4015, "step": 138290 }, { "epoch": 4.984142429812232, "grad_norm": 0.26757675409317017, "learning_rate": 1.622315402416419e-09, "loss": 0.3644, "step": 138295 }, { "epoch": 4.984322629473457, "grad_norm": 0.22083128988742828, "learning_rate": 1.5892418929908114e-09, "loss": 0.3287, "step": 138300 }, { "epoch": 4.984502829134681, "grad_norm": 0.3092709183692932, "learning_rate": 1.5565089820729351e-09, "loss": 0.3711, "step": 138305 }, { "epoch": 4.984683028795906, "grad_norm": 0.22019417583942413, "learning_rate": 1.5241166701096543e-09, "loss": 0.3608, "step": 138310 }, { "epoch": 4.98486322845713, "grad_norm": 0.24744448065757751, "learning_rate": 1.4920649575395073e-09, "loss": 0.3614, "step": 138315 }, { "epoch": 4.985043428118355, "grad_norm": 0.17398010194301605, "learning_rate": 1.4603538448010323e-09, "loss": 0.3591, "step": 138320 }, { "epoch": 4.98522362777958, "grad_norm": 0.22615504264831543, "learning_rate": 1.428983332321665e-09, "loss": 0.363, "step": 138325 }, { "epoch": 4.985403827440805, "grad_norm": 0.25961410999298096, "learning_rate": 1.397953420537168e-09, "loss": 0.3894, "step": 138330 }, { "epoch": 4.985584027102029, "grad_norm": 0.27555739879608154, "learning_rate": 1.367264109863875e-09, "loss": 
0.3741, "step": 138335 }, { "epoch": 4.985764226763254, "grad_norm": 0.3149866759777069, "learning_rate": 1.336915400720895e-09, "loss": 0.4, "step": 138340 }, { "epoch": 4.985944426424478, "grad_norm": 0.2356063425540924, "learning_rate": 1.3069072935217863e-09, "loss": 0.3838, "step": 138345 }, { "epoch": 4.986124626085703, "grad_norm": 0.28220483660697937, "learning_rate": 1.2772397886773313e-09, "loss": 0.3788, "step": 138350 }, { "epoch": 4.986304825746927, "grad_norm": 0.21516813337802887, "learning_rate": 1.247912886589986e-09, "loss": 0.3715, "step": 138355 }, { "epoch": 4.986485025408152, "grad_norm": 0.2194688469171524, "learning_rate": 1.2189265876622058e-09, "loss": 0.3802, "step": 138360 }, { "epoch": 4.986665225069377, "grad_norm": 0.26937180757522583, "learning_rate": 1.1902808922853447e-09, "loss": 0.3905, "step": 138365 }, { "epoch": 4.9868454247306016, "grad_norm": 0.2718323767185211, "learning_rate": 1.1619758008507564e-09, "loss": 0.3827, "step": 138370 }, { "epoch": 4.987025624391826, "grad_norm": 0.31076350808143616, "learning_rate": 1.1340113137442431e-09, "loss": 0.3789, "step": 138375 }, { "epoch": 4.987205824053051, "grad_norm": 0.26941627264022827, "learning_rate": 1.1063874313460564e-09, "loss": 0.3854, "step": 138380 }, { "epoch": 4.987386023714276, "grad_norm": 0.2722313702106476, "learning_rate": 1.0791041540336722e-09, "loss": 0.3723, "step": 138385 }, { "epoch": 4.9875662233755005, "grad_norm": 0.2729116976261139, "learning_rate": 1.0521614821790148e-09, "loss": 0.3595, "step": 138390 }, { "epoch": 4.987746423036724, "grad_norm": 0.25742781162261963, "learning_rate": 1.0255594161484583e-09, "loss": 0.3608, "step": 138395 }, { "epoch": 4.987926622697949, "grad_norm": 0.27901482582092285, "learning_rate": 9.992979563056003e-10, "loss": 0.3677, "step": 138400 }, { "epoch": 4.988106822359174, "grad_norm": 0.20445889234542847, "learning_rate": 9.785340251255993e-10, "loss": 0.3751, "step": 138405 }, { "epoch": 4.9882870220203985, 
"grad_norm": 0.2854103446006775, "learning_rate": 9.528856573182144e-10, "loss": 0.3522, "step": 138410 }, { "epoch": 4.988467221681623, "grad_norm": 0.25394073128700256, "learning_rate": 9.275778966866267e-10, "loss": 0.3537, "step": 138415 }, { "epoch": 4.988647421342848, "grad_norm": 0.24267399311065674, "learning_rate": 9.026107435750053e-10, "loss": 0.3617, "step": 138420 }, { "epoch": 4.988827621004073, "grad_norm": 0.21222999691963196, "learning_rate": 8.779841983275195e-10, "loss": 0.3673, "step": 138425 }, { "epoch": 4.9890078206652975, "grad_norm": 0.24381837248802185, "learning_rate": 8.53698261277236e-10, "loss": 0.376, "step": 138430 }, { "epoch": 4.989188020326521, "grad_norm": 0.3303523659706116, "learning_rate": 8.297529327544462e-10, "loss": 0.3671, "step": 138435 }, { "epoch": 4.989368219987746, "grad_norm": 0.21352322399616241, "learning_rate": 8.061482130894416e-10, "loss": 0.3689, "step": 138440 }, { "epoch": 4.989548419648971, "grad_norm": 0.21397538483142853, "learning_rate": 7.8288410259586e-10, "loss": 0.3615, "step": 138445 }, { "epoch": 4.9897286193101955, "grad_norm": 0.25124630331993103, "learning_rate": 7.599606015984418e-10, "loss": 0.3633, "step": 138450 }, { "epoch": 4.98990881897142, "grad_norm": 0.31034645438194275, "learning_rate": 7.373777104052737e-10, "loss": 0.3662, "step": 138455 }, { "epoch": 4.990089018632645, "grad_norm": 0.23552536964416504, "learning_rate": 7.151354293272183e-10, "loss": 0.3589, "step": 138460 }, { "epoch": 4.99026921829387, "grad_norm": 0.23122437298297882, "learning_rate": 6.932337586640359e-10, "loss": 0.3852, "step": 138465 }, { "epoch": 4.9904494179550944, "grad_norm": 0.25336378812789917, "learning_rate": 6.716726987154865e-10, "loss": 0.3705, "step": 138470 }, { "epoch": 4.990629617616319, "grad_norm": 0.2418462485074997, "learning_rate": 6.504522497757792e-10, "loss": 0.395, "step": 138475 }, { "epoch": 4.990809817277544, "grad_norm": 0.23543506860733032, "learning_rate": 6.295724121335722e-10, 
"loss": 0.352, "step": 138480 }, { "epoch": 4.990990016938769, "grad_norm": 0.28832608461380005, "learning_rate": 6.090331860719722e-10, "loss": 0.3692, "step": 138485 }, { "epoch": 4.9911702165999925, "grad_norm": 0.21373923122882843, "learning_rate": 5.888345718740862e-10, "loss": 0.373, "step": 138490 }, { "epoch": 4.991350416261217, "grad_norm": 0.28929153084754944, "learning_rate": 5.689765698119187e-10, "loss": 0.4071, "step": 138495 }, { "epoch": 4.991530615922442, "grad_norm": 0.226441890001297, "learning_rate": 5.4945918016025e-10, "loss": 0.398, "step": 138500 }, { "epoch": 4.991530615922442, "eval_loss": 0.428780198097229, "eval_runtime": 3.5301, "eval_samples_per_second": 28.328, "eval_steps_per_second": 7.082, "step": 138500 } ], "logging_steps": 5, "max_steps": 138735, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 4.156594211931134e+19, "train_batch_size": 2, "trial_name": null, "trial_params": null }