{ "best_metric": null, "best_model_checkpoint": null, "epoch": 9.857035364936042, "eval_steps": 500, "global_step": 39300, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.01, "grad_norm": 98.3414306640625, "learning_rate": 1.4082317531978931e-05, "loss": 1.674, "step": 50 }, { "epoch": 0.03, "grad_norm": 21.889772415161133, "learning_rate": 1.4064635063957864e-05, "loss": 1.7321, "step": 100 }, { "epoch": 0.04, "grad_norm": 78.81497955322266, "learning_rate": 1.4046952595936794e-05, "loss": 1.3246, "step": 150 }, { "epoch": 0.05, "grad_norm": 117.79057312011719, "learning_rate": 1.4029270127915727e-05, "loss": 1.8399, "step": 200 }, { "epoch": 0.06, "grad_norm": 89.93197631835938, "learning_rate": 1.4011587659894659e-05, "loss": 1.7021, "step": 250 }, { "epoch": 0.08, "grad_norm": 5.327052116394043, "learning_rate": 1.399390519187359e-05, "loss": 1.3229, "step": 300 }, { "epoch": 0.09, "grad_norm": 104.67691802978516, "learning_rate": 1.397622272385252e-05, "loss": 1.0449, "step": 350 }, { "epoch": 0.1, "grad_norm": 62.50383377075195, "learning_rate": 1.3958540255831453e-05, "loss": 1.2135, "step": 400 }, { "epoch": 0.11, "grad_norm": 30.1390380859375, "learning_rate": 1.3940857787810384e-05, "loss": 1.1312, "step": 450 }, { "epoch": 0.13, "grad_norm": 172.32058715820312, "learning_rate": 1.3923175319789316e-05, "loss": 1.1339, "step": 500 }, { "epoch": 0.14, "grad_norm": 149.6029052734375, "learning_rate": 1.3905492851768248e-05, "loss": 0.9226, "step": 550 }, { "epoch": 0.15, "grad_norm": 104.08654022216797, "learning_rate": 1.3887810383747179e-05, "loss": 0.9141, "step": 600 }, { "epoch": 0.16, "grad_norm": 28.90251350402832, "learning_rate": 1.387012791572611e-05, "loss": 0.7194, "step": 650 }, { "epoch": 0.18, "grad_norm": 78.85499572753906, "learning_rate": 1.3852445447705042e-05, "loss": 1.051, "step": 700 }, { "epoch": 0.19, "grad_norm": 59.84476089477539, "learning_rate": 1.3834762979683973e-05, "loss": 0.8815, "step": 750 }, { "epoch": 0.2, "grad_norm": 47.683658599853516, "learning_rate": 1.3817080511662905e-05, "loss": 1.1052, "step": 800 }, { "epoch": 0.21, "grad_norm": 73.24783325195312, "learning_rate": 1.3799398043641836e-05, "loss": 0.6957, "step": 850 }, { "epoch": 0.23, "grad_norm": 121.98059844970703, "learning_rate": 1.3781715575620768e-05, "loss": 1.1512, "step": 900 }, { "epoch": 0.24, "grad_norm": 115.57231140136719, "learning_rate": 1.3764033107599699e-05, "loss": 0.8512, "step": 950 }, { "epoch": 0.25, "grad_norm": 40.25959014892578, "learning_rate": 1.374635063957863e-05, "loss": 0.873, "step": 1000 }, { "epoch": 0.26, "grad_norm": 10.869709014892578, "learning_rate": 1.3728668171557562e-05, "loss": 0.7834, "step": 1050 }, { "epoch": 0.28, "grad_norm": 128.24893188476562, "learning_rate": 1.3710985703536495e-05, "loss": 0.8042, "step": 1100 }, { "epoch": 0.29, "grad_norm": 60.73322677612305, "learning_rate": 1.3693303235515425e-05, "loss": 1.0092, "step": 1150 }, { "epoch": 0.3, "grad_norm": 19.39624786376953, "learning_rate": 1.3675620767494358e-05, "loss": 0.662, "step": 1200 }, { "epoch": 0.31, "grad_norm": 0.13774849474430084, "learning_rate": 1.3657938299473288e-05, "loss": 0.98, "step": 1250 }, { "epoch": 0.33, "grad_norm": 79.46333312988281, "learning_rate": 1.3640255831452219e-05, "loss": 0.7967, "step": 1300 }, { "epoch": 0.34, "grad_norm": 13.158239364624023, "learning_rate": 1.3622573363431151e-05, "loss": 1.0218, "step": 1350 }, { "epoch": 0.35, "grad_norm": 17.267330169677734, "learning_rate": 1.3604890895410084e-05, "loss": 0.8711, "step": 1400 }, { "epoch": 0.36, "grad_norm": 174.72537231445312, "learning_rate": 1.3587208427389015e-05, "loss": 0.8711, "step": 1450 }, { "epoch": 0.38, "grad_norm": 77.13172149658203, "learning_rate": 1.3569525959367947e-05, "loss": 1.0233, "step": 1500 }, { "epoch": 0.39, "grad_norm": 48.417015075683594, "learning_rate": 1.3551843491346878e-05, "loss": 0.7682, "step": 1550 }, { "epoch": 0.4, "grad_norm": 6.1959638595581055, "learning_rate": 1.3534161023325808e-05, "loss": 0.8792, "step": 1600 }, { "epoch": 0.41, "grad_norm": 49.98043441772461, "learning_rate": 1.351647855530474e-05, "loss": 0.9868, "step": 1650 }, { "epoch": 0.43, "grad_norm": 45.13309860229492, "learning_rate": 1.3498796087283673e-05, "loss": 0.5272, "step": 1700 }, { "epoch": 0.44, "grad_norm": 8.423553466796875, "learning_rate": 1.3481113619262604e-05, "loss": 1.1983, "step": 1750 }, { "epoch": 0.45, "grad_norm": 17.5786190032959, "learning_rate": 1.3463431151241536e-05, "loss": 0.7065, "step": 1800 }, { "epoch": 0.46, "grad_norm": 5.939927577972412, "learning_rate": 1.3445748683220467e-05, "loss": 0.6674, "step": 1850 }, { "epoch": 0.48, "grad_norm": 23.781694412231445, "learning_rate": 1.3428066215199398e-05, "loss": 0.7267, "step": 1900 }, { "epoch": 0.49, "grad_norm": 0.4960607886314392, "learning_rate": 1.341038374717833e-05, "loss": 1.0549, "step": 1950 }, { "epoch": 0.5, "grad_norm": 83.99737548828125, "learning_rate": 1.3392701279157262e-05, "loss": 0.786, "step": 2000 }, { "epoch": 0.51, "grad_norm": 20.65607261657715, "learning_rate": 1.3375018811136193e-05, "loss": 0.9709, "step": 2050 }, { "epoch": 0.53, "grad_norm": 1.0673532485961914, "learning_rate": 1.3357336343115126e-05, "loss": 0.8208, "step": 2100 }, { "epoch": 0.54, "grad_norm": 10.350920677185059, "learning_rate": 1.3339653875094056e-05, "loss": 1.1503, "step": 2150 }, { "epoch": 0.55, "grad_norm": 0.7176612019538879, "learning_rate": 1.3321971407072987e-05, "loss": 0.5841, "step": 2200 }, { "epoch": 0.56, "grad_norm": 16.532655715942383, "learning_rate": 1.330428893905192e-05, "loss": 1.1618, "step": 2250 }, { "epoch": 0.58, "grad_norm": 0.24398092925548553, "learning_rate": 1.3286606471030852e-05, "loss": 0.6052, "step": 2300 }, { "epoch": 0.59, "grad_norm": 45.761695861816406, "learning_rate": 1.3268924003009782e-05, "loss": 1.0618, "step": 2350 }, { "epoch": 0.6, "grad_norm": 0.3656911849975586, "learning_rate": 1.3251241534988713e-05, "loss": 0.8395, "step": 2400 }, { "epoch": 0.61, "grad_norm": 56.36614227294922, "learning_rate": 1.3233559066967646e-05, "loss": 0.6547, "step": 2450 }, { "epoch": 0.63, "grad_norm": 50.591705322265625, "learning_rate": 1.3215876598946576e-05, "loss": 0.9528, "step": 2500 }, { "epoch": 0.64, "grad_norm": 11.290885925292969, "learning_rate": 1.3198194130925507e-05, "loss": 0.6811, "step": 2550 }, { "epoch": 0.65, "grad_norm": 0.10668418556451797, "learning_rate": 1.3180511662904441e-05, "loss": 0.7421, "step": 2600 }, { "epoch": 0.66, "grad_norm": 1.0529690980911255, "learning_rate": 1.3162829194883372e-05, "loss": 0.7665, "step": 2650 }, { "epoch": 0.68, "grad_norm": 35.5570068359375, "learning_rate": 1.3145146726862302e-05, "loss": 0.6587, "step": 2700 }, { "epoch": 0.69, "grad_norm": 47.973697662353516, "learning_rate": 1.3127464258841235e-05, "loss": 0.8273, "step": 2750 }, { "epoch": 0.7, "grad_norm": 42.45454788208008, "learning_rate": 1.3109781790820166e-05, "loss": 0.8512, "step": 2800 }, { "epoch": 0.71, "grad_norm": 52.255821228027344, "learning_rate": 1.3092099322799096e-05, "loss": 0.5748, "step": 2850 }, { "epoch": 0.73, "grad_norm": 55.622413635253906, "learning_rate": 1.307441685477803e-05, "loss": 0.6585, "step": 2900 }, { "epoch": 0.74, "grad_norm": 6.804417610168457, "learning_rate": 1.3056734386756961e-05, "loss": 0.9276, "step": 2950 }, { "epoch": 0.75, "grad_norm": 8.9085054397583, "learning_rate": 1.3039051918735892e-05, "loss": 0.9573, "step": 3000 }, { "epoch": 0.76, "grad_norm": 3.399890422821045, "learning_rate": 1.3021369450714824e-05, "loss": 0.815, "step": 3050 }, { "epoch": 0.78, "grad_norm": 9.621098518371582, "learning_rate": 1.3003686982693755e-05, "loss": 0.6272, "step": 3100 }, { "epoch": 0.79, "grad_norm": 34.52663803100586, "learning_rate": 1.2986004514672686e-05, "loss": 0.7548, "step": 3150 }, { "epoch": 0.8, "grad_norm": 38.8935661315918, "learning_rate": 1.296832204665162e-05, "loss": 1.0272, "step": 3200 }, { "epoch": 0.82, "grad_norm": 53.31705093383789, "learning_rate": 1.295063957863055e-05, "loss": 0.8594, "step": 3250 }, { "epoch": 0.83, "grad_norm": 24.726455688476562, "learning_rate": 1.2932957110609481e-05, "loss": 0.7025, "step": 3300 }, { "epoch": 0.84, "grad_norm": 35.29804992675781, "learning_rate": 1.2915274642588413e-05, "loss": 0.8359, "step": 3350 }, { "epoch": 0.85, "grad_norm": 15.382336616516113, "learning_rate": 1.2897592174567344e-05, "loss": 0.7358, "step": 3400 }, { "epoch": 0.87, "grad_norm": 2.9050614833831787, "learning_rate": 1.2879909706546275e-05, "loss": 0.8021, "step": 3450 }, { "epoch": 0.88, "grad_norm": 44.734962463378906, "learning_rate": 1.2862227238525209e-05, "loss": 0.844, "step": 3500 }, { "epoch": 0.89, "grad_norm": 14.811912536621094, "learning_rate": 1.284454477050414e-05, "loss": 0.7822, "step": 3550 }, { "epoch": 0.9, "grad_norm": 44.70045471191406, "learning_rate": 1.282686230248307e-05, "loss": 1.0654, "step": 3600 }, { "epoch": 0.92, "grad_norm": 48.43465805053711, "learning_rate": 1.2809179834462003e-05, "loss": 0.6354, "step": 3650 }, { "epoch": 0.93, "grad_norm": 47.798423767089844, "learning_rate": 1.2791497366440933e-05, "loss": 0.8125, "step": 3700 }, { "epoch": 0.94, "grad_norm": 42.33122634887695, "learning_rate": 1.2773814898419864e-05, "loss": 1.1325, "step": 3750 }, { "epoch": 0.95, "grad_norm": 0.14906466007232666, "learning_rate": 1.2756132430398797e-05, "loss": 0.5325, "step": 3800 }, { "epoch": 0.97, "grad_norm": 10.49329662322998, "learning_rate": 1.2738449962377729e-05, "loss": 0.7013, "step": 3850 }, { "epoch": 0.98, "grad_norm": 21.828550338745117, "learning_rate": 1.272076749435666e-05, "loss": 0.5134, "step": 3900 }, { "epoch": 0.99, "grad_norm": 1.0481252670288086, "learning_rate": 1.270308502633559e-05, "loss": 1.4255, "step": 3950 }, { "epoch": 1.0, "grad_norm": 1.075194001197815, "learning_rate": 1.2685402558314523e-05, "loss": 0.7727, "step": 4000 }, { "epoch": 1.02, "grad_norm": 17.64851188659668, "learning_rate": 1.2667720090293453e-05, "loss": 0.4984, "step": 4050 }, { "epoch": 1.03, "grad_norm": 49.92161178588867, "learning_rate": 1.2650037622272386e-05, "loss": 0.9065, "step": 4100 }, { "epoch": 1.04, "grad_norm": 11.019123077392578, "learning_rate": 1.2632355154251318e-05, "loss": 0.8184, "step": 4150 }, { "epoch": 1.05, "grad_norm": 12.537881851196289, "learning_rate": 1.2614672686230249e-05, "loss": 0.6989, "step": 4200 }, { "epoch": 1.07, "grad_norm": 0.5771467089653015, "learning_rate": 1.259699021820918e-05, "loss": 0.7282, "step": 4250 }, { "epoch": 1.08, "grad_norm": 60.68583297729492, "learning_rate": 1.2579307750188112e-05, "loss": 0.695, "step": 4300 }, { "epoch": 1.09, "grad_norm": 0.7341581583023071, "learning_rate": 1.2561625282167043e-05, "loss": 0.7021, "step": 4350 }, { "epoch": 1.1, "grad_norm": 0.020291157066822052, "learning_rate": 1.2543942814145975e-05, "loss": 0.8563, "step": 4400 }, { "epoch": 1.12, "grad_norm": 1.3924442529678345, "learning_rate": 1.2526260346124907e-05, "loss": 0.7378, "step": 4450 }, { "epoch": 1.13, "grad_norm": 31.691173553466797, "learning_rate": 1.2508577878103838e-05, "loss": 0.5887, "step": 4500 }, { "epoch": 1.14, "grad_norm": 1.1823307275772095, "learning_rate": 1.2490895410082769e-05, "loss": 0.8132, "step": 4550 }, { "epoch": 1.15, "grad_norm": 0.08653511106967926, "learning_rate": 1.2473212942061701e-05, "loss": 0.8374, "step": 4600 }, { "epoch": 1.17, "grad_norm": 2.169903039932251, "learning_rate": 1.2455530474040632e-05, "loss": 0.588, "step": 4650 }, { "epoch": 1.18, "grad_norm": 56.76768112182617, "learning_rate": 1.2437848006019564e-05, "loss": 0.7869, "step": 4700 }, { "epoch": 1.19, "grad_norm": 0.05390803515911102, "learning_rate": 1.2420165537998497e-05, "loss": 0.6243, "step": 4750 }, { "epoch": 1.2, "grad_norm": 5.537655830383301, "learning_rate": 1.2402483069977427e-05, "loss": 0.737, "step": 4800 }, { "epoch": 1.22, "grad_norm": 69.44229125976562, "learning_rate": 1.2384800601956358e-05, "loss": 1.0479, "step": 4850 }, { "epoch": 1.23, "grad_norm": 45.22208023071289, "learning_rate": 1.236711813393529e-05, "loss": 0.8327, "step": 4900 }, { "epoch": 1.24, "grad_norm": 22.553054809570312, "learning_rate": 1.2349435665914221e-05, "loss": 0.6587, "step": 4950 }, { "epoch": 1.25, "grad_norm": 2.1869142055511475, "learning_rate": 1.2331753197893154e-05, "loss": 0.5913, "step": 5000 }, { "epoch": 1.27, "grad_norm": 2.483933210372925, "learning_rate": 1.2314070729872086e-05, "loss": 0.8163, "step": 5050 }, { "epoch": 1.28, "grad_norm": 18.768310546875, "learning_rate": 1.2296388261851017e-05, "loss": 0.6273, "step": 5100 }, { "epoch": 1.29, "grad_norm": 56.0864372253418, "learning_rate": 1.2278705793829947e-05, "loss": 0.8787, "step": 5150 }, { "epoch": 1.3, "grad_norm": 51.98051834106445, "learning_rate": 1.226102332580888e-05, "loss": 0.4302, "step": 5200 }, { "epoch": 1.32, "grad_norm": 17.60165023803711, "learning_rate": 1.224334085778781e-05, "loss": 0.7238, "step": 5250 }, { "epoch": 1.33, "grad_norm": 48.4942626953125, "learning_rate": 1.2225658389766743e-05, "loss": 0.8018, "step": 5300 }, { "epoch": 1.34, "grad_norm": 14.206453323364258, "learning_rate": 1.2207975921745674e-05, "loss": 0.5428, "step": 5350 }, { "epoch": 1.35, "grad_norm": 65.64610290527344, "learning_rate": 1.2190293453724606e-05, "loss": 0.7923, "step": 5400 }, { "epoch": 1.37, "grad_norm": 9.786343574523926, "learning_rate": 1.2172610985703537e-05, "loss": 0.7779, "step": 5450 }, { "epoch": 1.38, "grad_norm": 3.1632120609283447, "learning_rate": 1.2154928517682467e-05, "loss": 0.6474, "step": 5500 }, { "epoch": 1.39, "grad_norm": 15.631272315979004, "learning_rate": 1.21372460496614e-05, "loss": 0.6736, "step": 5550 }, { "epoch": 1.4, "grad_norm": 0.025490593165159225, "learning_rate": 1.2119563581640332e-05, "loss": 0.6371, "step": 5600 }, { "epoch": 1.42, "grad_norm": 84.42486572265625, "learning_rate": 1.2101881113619263e-05, "loss": 0.9348, "step": 5650 }, { "epoch": 1.43, "grad_norm": 0.32389989495277405, "learning_rate": 1.2084198645598195e-05, "loss": 0.8304, "step": 5700 }, { "epoch": 1.44, "grad_norm": 49.16242599487305, "learning_rate": 1.2066516177577126e-05, "loss": 0.6624, "step": 5750 }, { "epoch": 1.45, "grad_norm": 119.3700942993164, "learning_rate": 1.2048833709556057e-05, "loss": 1.1135, "step": 5800 }, { "epoch": 1.47, "grad_norm": 0.15834768116474152, "learning_rate": 1.2031151241534989e-05, "loss": 0.6358, "step": 5850 }, { "epoch": 1.48, "grad_norm": 54.722652435302734, "learning_rate": 1.2013468773513922e-05, "loss": 0.5639, "step": 5900 }, { "epoch": 1.49, "grad_norm": 0.0872531533241272, "learning_rate": 1.1995786305492852e-05, "loss": 0.7912, "step": 5950 }, { "epoch": 1.5, "grad_norm": 2.5009591579437256, "learning_rate": 1.1978103837471785e-05, "loss": 0.6478, "step": 6000 }, { "epoch": 1.52, "grad_norm": 1.5101827383041382, "learning_rate": 1.1960421369450715e-05, "loss": 0.8577, "step": 6050 }, { "epoch": 1.53, "grad_norm": 3.4737539291381836, "learning_rate": 1.1942738901429646e-05, "loss": 0.9474, "step": 6100 }, { "epoch": 1.54, "grad_norm": 92.57341003417969, "learning_rate": 1.1925056433408578e-05, "loss": 0.8665, "step": 6150 }, { "epoch": 1.56, "grad_norm": 38.56670379638672, "learning_rate": 1.1907373965387509e-05, "loss": 0.7833, "step": 6200 }, { "epoch": 1.57, "grad_norm": 29.12518310546875, "learning_rate": 1.1889691497366442e-05, "loss": 0.7454, "step": 6250 }, { "epoch": 1.58, "grad_norm": 69.91959381103516, "learning_rate": 1.1872009029345374e-05, "loss": 0.7843, "step": 6300 }, { "epoch": 1.59, "grad_norm": 56.20566177368164, "learning_rate": 1.1854326561324305e-05, "loss": 0.841, "step": 6350 }, { "epoch": 1.61, "grad_norm": 66.2998275756836, "learning_rate": 1.1836644093303235e-05, "loss": 0.723, "step": 6400 }, { "epoch": 1.62, "grad_norm": 1.9407018423080444, "learning_rate": 1.1818961625282168e-05, "loss": 0.7235, "step": 6450 }, { "epoch": 1.63, "grad_norm": 61.69858932495117, "learning_rate": 1.1801279157261098e-05, "loss": 0.8241, "step": 6500 }, { "epoch": 1.64, "grad_norm": 8.412137985229492, "learning_rate": 1.178359668924003e-05, "loss": 0.564, "step": 6550 }, { "epoch": 1.66, "grad_norm": 9.307317733764648, "learning_rate": 1.1765914221218962e-05, "loss": 0.8438, "step": 6600 }, { "epoch": 1.67, "grad_norm": 41.45466995239258, "learning_rate": 1.1748231753197894e-05, "loss": 0.7763, "step": 6650 }, { "epoch": 1.68, "grad_norm": 2.8245513439178467, "learning_rate": 1.1730549285176825e-05, "loss": 0.7476, "step": 6700 }, { "epoch": 1.69, "grad_norm": 76.77831268310547, "learning_rate": 1.1712866817155757e-05, "loss": 0.9578, "step": 6750 }, { "epoch": 1.71, "grad_norm": 0.004409218207001686, "learning_rate": 1.1695184349134688e-05, "loss": 0.8765, "step": 6800 }, { "epoch": 1.72, "grad_norm": 46.58176803588867, "learning_rate": 1.167750188111362e-05, "loss": 0.5402, "step": 6850 }, { "epoch": 1.73, "grad_norm": 5.006879806518555, "learning_rate": 1.165981941309255e-05, "loss": 0.4722, "step": 6900 }, { "epoch": 1.74, "grad_norm": 2.194460153579712, "learning_rate": 1.1642136945071483e-05, "loss": 0.858, "step": 6950 }, { "epoch": 1.76, "grad_norm": 0.012106262147426605, "learning_rate": 1.1624454477050414e-05, "loss": 0.6607, "step": 7000 }, { "epoch": 1.77, "grad_norm": 6.08723258972168, "learning_rate": 1.1606772009029345e-05, "loss": 0.866, "step": 7050 }, { "epoch": 1.78, "grad_norm": 51.338478088378906, "learning_rate": 1.1589089541008277e-05, "loss": 0.7508, "step": 7100 }, { "epoch": 1.79, "grad_norm": 18.472858428955078, "learning_rate": 1.157140707298721e-05, "loss": 0.8686, "step": 7150 }, { "epoch": 1.81, "grad_norm": 4.837900638580322, "learning_rate": 1.155372460496614e-05, "loss": 0.5302, "step": 7200 }, { "epoch": 1.82, "grad_norm": 41.74524688720703, "learning_rate": 1.1536042136945072e-05, "loss": 0.7681, "step": 7250 }, { "epoch": 1.83, "grad_norm": 30.557188034057617, "learning_rate": 1.1518359668924003e-05, "loss": 0.9107, "step": 7300 }, { "epoch": 1.84, "grad_norm": 14.001880645751953, "learning_rate": 1.1500677200902934e-05, "loss": 0.5387, "step": 7350 }, { "epoch": 1.86, "grad_norm": 0.1815216839313507, "learning_rate": 1.1482994732881866e-05, "loss": 0.8152, "step": 7400 }, { "epoch": 1.87, "grad_norm": 36.915061950683594, "learning_rate": 1.1465312264860799e-05, "loss": 0.6313, "step": 7450 }, { "epoch": 1.88, "grad_norm": 0.20334406197071075, "learning_rate": 1.144762979683973e-05, "loss": 0.8265, "step": 7500 }, { "epoch": 1.89, "grad_norm": 0.0018741831881925464, "learning_rate": 1.1429947328818662e-05, "loss": 0.7202, "step": 7550 }, { "epoch": 1.91, "grad_norm": 0.000707630708348006, "learning_rate": 1.1412264860797592e-05, "loss": 0.6488, "step": 7600 }, { "epoch": 1.92, "grad_norm": 0.4616662561893463, "learning_rate": 1.1394582392776523e-05, "loss": 0.9402, "step": 7650 }, { "epoch": 1.93, "grad_norm": 43.170814514160156, "learning_rate": 1.1376899924755456e-05, "loss": 0.763, "step": 7700 }, { "epoch": 1.94, "grad_norm": 3.035790205001831, "learning_rate": 1.1359217456734388e-05, "loss": 0.5681, "step": 7750 }, { "epoch": 1.96, "grad_norm": 45.11912536621094, "learning_rate": 1.1341534988713319e-05, "loss": 0.9795, "step": 7800 }, { "epoch": 1.97, "grad_norm": 2.006427049636841, "learning_rate": 1.1323852520692251e-05, "loss": 0.4772, "step": 7850 }, { "epoch": 1.98, "grad_norm": 69.13399505615234, "learning_rate": 1.1306170052671182e-05, "loss": 0.8649, "step": 7900 }, { "epoch": 1.99, "grad_norm": 43.80717468261719, "learning_rate": 1.1288487584650112e-05, "loss": 0.6051, "step": 7950 }, { "epoch": 2.01, "grad_norm": 1.3676908016204834, "learning_rate": 1.1270805116629045e-05, "loss": 0.4737, "step": 8000 }, { "epoch": 2.02, "grad_norm": 18.533445358276367, "learning_rate": 1.1253122648607977e-05, "loss": 0.4353, "step": 8050 }, { "epoch": 2.03, "grad_norm": 0.649580717086792, "learning_rate": 1.1235440180586908e-05, "loss": 0.9283, "step": 8100 }, { "epoch": 2.04, "grad_norm": 37.0181999206543, "learning_rate": 1.1217757712565839e-05, "loss": 0.8631, "step": 8150 }, { "epoch": 2.06, "grad_norm": 1.1191781759262085, "learning_rate": 1.1200075244544771e-05, "loss": 0.7166, "step": 8200 }, { "epoch": 2.07, "grad_norm": 46.35097885131836, "learning_rate": 1.1182392776523702e-05, "loss": 0.6263, "step": 8250 }, { "epoch": 2.08, "grad_norm": 9.393693923950195, "learning_rate": 1.1164710308502632e-05, "loss": 0.7146, "step": 8300 }, { "epoch": 2.09, "grad_norm": 49.04343032836914, "learning_rate": 1.1147027840481567e-05, "loss": 0.5924, "step": 8350 }, { "epoch": 2.11, "grad_norm": 2.917092800140381, "learning_rate": 1.1129345372460497e-05, "loss": 0.815, "step": 8400 }, { "epoch": 2.12, "grad_norm": 6.2741618156433105, "learning_rate": 1.1111662904439428e-05, "loss": 0.8852, "step": 8450 }, { "epoch": 2.13, "grad_norm": 0.026425007730722427, "learning_rate": 1.109398043641836e-05, "loss": 0.609, "step": 8500 }, { "epoch": 2.14, "grad_norm": 8.229249954223633, "learning_rate": 1.1076297968397291e-05, "loss": 0.5546, "step": 8550 }, { "epoch": 2.16, "grad_norm": 3.257112979888916, "learning_rate": 1.1058615500376222e-05, "loss": 0.6084, "step": 8600 }, { "epoch": 2.17, "grad_norm": 44.147640228271484, "learning_rate": 1.1040933032355156e-05, "loss": 0.4687, "step": 8650 }, { "epoch": 2.18, "grad_norm": 93.26548767089844, "learning_rate": 1.1023250564334087e-05, "loss": 0.6323, "step": 8700 }, { "epoch": 2.19, "grad_norm": 83.17293548583984, "learning_rate": 1.1005568096313017e-05, "loss": 0.8759, "step": 8750 }, { "epoch": 2.21, "grad_norm": 51.27419662475586, "learning_rate": 1.098788562829195e-05, "loss": 0.69, "step": 8800 }, { "epoch": 2.22, "grad_norm": 0.0010558576323091984, "learning_rate": 1.097020316027088e-05, "loss": 0.5279, "step": 8850 }, { "epoch": 2.23, "grad_norm": 73.43231201171875, "learning_rate": 1.0952520692249811e-05, "loss": 0.9285, "step": 8900 }, { "epoch": 2.24, "grad_norm": 6.488553047180176, "learning_rate": 1.0934838224228745e-05, "loss": 0.6137, "step": 8950 }, { "epoch": 2.26, "grad_norm": 53.465972900390625, "learning_rate": 1.0917155756207676e-05, "loss": 0.4718, "step": 9000 }, { "epoch": 2.27, "grad_norm": 1.405421495437622, "learning_rate": 1.0899473288186607e-05, "loss": 0.7248, "step": 9050 }, { "epoch": 2.28, "grad_norm": 58.552490234375, "learning_rate": 1.0881790820165539e-05, "loss": 0.6312, "step": 9100 }, { "epoch": 2.29, "grad_norm": 85.75029754638672, "learning_rate": 1.086410835214447e-05, "loss": 1.1383, "step": 9150 }, { "epoch": 2.31, "grad_norm": 1.4940392971038818, "learning_rate": 1.08464258841234e-05, "loss": 0.6104, "step": 9200 }, { "epoch": 2.32, "grad_norm": 1.2434502840042114, "learning_rate": 1.0828743416102334e-05, "loss": 0.5124, "step": 9250 }, { "epoch": 2.33, "grad_norm": 0.002772190608084202, "learning_rate": 1.0811060948081265e-05, "loss": 0.8389, "step": 9300 }, { "epoch": 2.35, "grad_norm": 27.42812156677246, "learning_rate": 1.0793378480060196e-05, "loss": 0.6571, "step": 9350 }, { "epoch": 2.36, "grad_norm": 70.63783264160156, "learning_rate": 1.0775696012039128e-05, "loss": 0.5234, "step": 9400 }, { "epoch": 2.37, "grad_norm": 0.873970627784729, "learning_rate": 1.0758013544018059e-05, "loss": 0.7862, "step": 9450 }, { "epoch": 2.38, "grad_norm": 0.0001105390620068647, "learning_rate": 1.074033107599699e-05, "loss": 0.9885, "step": 9500 }, { "epoch": 2.4, "grad_norm": 2.0316097736358643, "learning_rate": 1.0722648607975922e-05, "loss": 0.6648, "step": 9550 }, { "epoch": 2.41, "grad_norm": 33.791568756103516, "learning_rate": 1.0704966139954854e-05, "loss": 0.6746, "step": 9600 }, { "epoch": 2.42, "grad_norm": 112.26337432861328, "learning_rate": 1.0687283671933785e-05, "loss": 0.787, "step": 9650 }, { "epoch": 2.43, "grad_norm": 53.35863494873047, "learning_rate": 1.0669601203912716e-05, "loss": 0.5922, "step": 9700 }, { "epoch": 2.45, "grad_norm": 0.0027942871674895287, "learning_rate": 1.0651918735891648e-05, "loss": 0.6236, "step": 9750 }, { "epoch": 2.46, "grad_norm": 0.00036070370697416365, "learning_rate": 1.0634236267870579e-05, "loss": 0.6559, "step": 9800 }, { "epoch": 2.47, "grad_norm": 2.5188686847686768, "learning_rate": 1.0616553799849511e-05, "loss": 1.002, "step": 9850 }, { "epoch": 2.48, "grad_norm": 42.79086685180664, "learning_rate": 1.0598871331828444e-05, "loss": 1.001, "step": 9900 }, { "epoch": 2.5, "grad_norm": 0.06492776423692703, "learning_rate": 1.0581188863807374e-05, "loss": 0.9975, "step": 9950 }, { "epoch": 2.51, "grad_norm": 12.079846382141113, "learning_rate": 1.0563506395786305e-05, "loss": 0.6417, "step": 10000 }, { "epoch": 2.52, "grad_norm": 98.72542572021484, "learning_rate": 1.0545823927765237e-05, "loss": 0.9242, "step": 10050 }, { "epoch": 2.53, "grad_norm": 0.15632659196853638, "learning_rate": 1.0528141459744168e-05, "loss": 0.4118, "step": 10100 }, { "epoch": 2.55, "grad_norm": 3.5314505100250244, "learning_rate": 1.05104589917231e-05, "loss": 0.6486, "step": 10150 }, { "epoch": 2.56, "grad_norm": 0.06171553581953049, "learning_rate": 1.0492776523702033e-05, "loss": 0.7782, "step": 10200 }, { "epoch": 2.57, "grad_norm": 69.53456115722656, "learning_rate": 1.0475094055680964e-05, "loss": 0.5421, "step": 10250 }, { "epoch": 2.58, "grad_norm": 27.149484634399414, "learning_rate": 1.0457411587659894e-05, "loss": 0.7476, "step": 10300 }, { "epoch": 2.6, "grad_norm": 3.7423877716064453, "learning_rate": 1.0439729119638827e-05, "loss": 0.7429, "step": 10350 }, { "epoch": 2.61, "grad_norm": 0.6006436944007874, "learning_rate": 1.0422046651617757e-05, "loss": 0.4376, "step": 10400 }, { "epoch": 2.62, "grad_norm": 0.2609996497631073, "learning_rate": 1.040436418359669e-05, "loss": 0.8938, "step": 10450 }, { "epoch": 2.63, "grad_norm": 73.91007232666016, "learning_rate": 1.0386681715575622e-05, "loss": 0.7273, "step": 10500 }, { "epoch": 2.65, "grad_norm": 0.010080622509121895, "learning_rate": 1.0368999247554553e-05, "loss": 0.7709, "step": 10550 }, { "epoch": 2.66, "grad_norm": 5.206912994384766, "learning_rate": 1.0351316779533484e-05, "loss": 0.696, "step": 10600 }, { "epoch": 2.67, "grad_norm": 94.36717987060547, "learning_rate": 1.0333634311512416e-05, "loss": 0.6964, "step": 10650 }, { "epoch": 2.68, "grad_norm": 0.6438612341880798, "learning_rate": 1.0315951843491347e-05, "loss": 0.6461, "step": 10700 }, { "epoch": 2.7, "grad_norm": 0.02532346546649933, "learning_rate": 1.029826937547028e-05, "loss": 0.8581, "step": 10750 }, { "epoch": 2.71, "grad_norm": 1.5096291303634644, "learning_rate": 1.0280586907449212e-05, "loss": 0.4629, "step": 10800 }, { "epoch": 2.72, "grad_norm": 81.77324676513672, "learning_rate": 1.0262904439428142e-05, "loss": 0.8681, "step": 10850 }, { "epoch": 2.73, "grad_norm": 1.1398659944534302, "learning_rate": 1.0245221971407073e-05, "loss": 0.5162, "step": 10900 }, { "epoch": 2.75, "grad_norm": 0.4226570725440979, "learning_rate": 1.0227539503386005e-05, "loss": 0.4572, "step": 10950 }, { "epoch": 2.76, "grad_norm": 0.02047480270266533, "learning_rate": 1.0209857035364936e-05, "loss": 0.8946, "step": 11000 }, { "epoch": 2.77, "grad_norm": 124.79954528808594, "learning_rate": 1.0192174567343868e-05, "loss": 0.8325, "step": 11050 }, { "epoch": 2.78, "grad_norm": 7.112376624718308e-05, "learning_rate": 1.01744920993228e-05, "loss": 0.5664, "step": 11100 }, { "epoch": 2.8, "grad_norm": 78.66365051269531, "learning_rate": 1.0156809631301732e-05, "loss": 0.9426, "step": 11150 }, { "epoch": 2.81, "grad_norm": 9.567934466758743e-05, "learning_rate": 1.0139127163280662e-05, "loss": 0.4818, "step": 11200 }, { "epoch": 2.82, "grad_norm": 0.003907013684511185, "learning_rate": 1.0121444695259593e-05, "loss": 0.743, "step": 11250 }, { "epoch": 2.83, "grad_norm": 84.53366088867188, "learning_rate": 1.0103762227238525e-05, "loss": 0.8544, "step": 11300 }, { "epoch": 2.85, "grad_norm": 3.4674291610717773, "learning_rate": 1.0086079759217458e-05, "loss": 0.5553, "step": 11350 }, { "epoch": 2.86, "grad_norm": 125.62838745117188, "learning_rate": 1.0068397291196388e-05, "loss": 0.6168, "step": 11400 }, { "epoch": 2.87, "grad_norm": 99.19140625, "learning_rate": 1.005071482317532e-05, "loss": 1.1238, "step": 11450 }, { "epoch": 2.88, "grad_norm": 38.717559814453125, "learning_rate": 1.0033032355154252e-05, "loss": 1.0667, "step": 11500 }, { "epoch": 2.9, "grad_norm": 28.915889739990234, "learning_rate": 1.0015349887133182e-05, "loss": 0.5045, "step": 11550 }, { "epoch": 2.91, "grad_norm": 48.31145477294922, "learning_rate": 9.997667419112115e-06, "loss": 0.754, "step": 11600 }, { "epoch": 2.92, "grad_norm": 0.06709738075733185, "learning_rate": 9.979984951091047e-06, "loss": 0.6229, "step": 11650 }, { "epoch": 2.93, "grad_norm": 1.2689626216888428, "learning_rate": 9.962302483069978e-06, "loss": 0.7818, "step": 11700 }, { "epoch": 2.95, "grad_norm": 35.311134338378906, "learning_rate": 9.94462001504891e-06, "loss": 1.0477, "step": 11750 }, { "epoch": 2.96, "grad_norm": 88.91561889648438, "learning_rate": 9.92693754702784e-06, "loss": 0.6488, "step": 11800 }, { "epoch": 2.97, "grad_norm": 70.55093383789062, "learning_rate": 9.909255079006772e-06, "loss": 0.5951, "step": 11850 }, { "epoch": 2.98, "grad_norm": 89.51988983154297, "learning_rate": 9.891572610985704e-06, "loss": 0.6867, "step": 11900 }, { "epoch": 3.0, "grad_norm": 0.40069764852523804, "learning_rate": 9.873890142964636e-06, "loss": 0.7094, "step": 11950 }, { "epoch": 3.01, "grad_norm": 2.006258964538574, "learning_rate": 9.856207674943567e-06, "loss": 0.5428, "step": 12000 }, { "epoch": 3.02, "grad_norm": 51.34798049926758, "learning_rate": 9.8385252069225e-06, "loss": 0.573, "step": 12050 }, { "epoch": 3.03, "grad_norm": 95.47881317138672, "learning_rate": 9.82084273890143e-06, "loss": 0.4226, "step": 12100 }, { "epoch": 3.05, "grad_norm": 0.07185523957014084, "learning_rate": 9.80316027088036e-06, "loss": 0.6424, "step": 12150 }, { "epoch": 3.06, "grad_norm": 109.8128662109375, "learning_rate": 9.785477802859293e-06, "loss": 0.5279, "step": 12200 }, { "epoch": 3.07, "grad_norm": 44.56191635131836, "learning_rate": 9.767795334838224e-06, "loss": 0.3463, "step": 12250 }, { "epoch": 3.09, "grad_norm": 0.45552492141723633, "learning_rate": 9.750112866817156e-06, "loss": 0.6696, "step": 12300 }, { "epoch": 3.1, "grad_norm": 0.0008902169647626579, "learning_rate": 9.732430398796089e-06, "loss": 0.3845, "step": 12350 }, { "epoch": 3.11, "grad_norm": 134.49839782714844, "learning_rate": 9.71474793077502e-06, "loss": 0.8803, "step": 12400 }, { "epoch": 3.12, "grad_norm": 0.21923835575580597, "learning_rate": 9.69706546275395e-06, "loss": 0.741, "step": 12450 }, { "epoch": 3.14, "grad_norm": 0.2331884801387787, "learning_rate": 9.679382994732883e-06, "loss": 0.7015, "step": 12500 }, { "epoch": 3.15, "grad_norm": 0.4663000702857971, "learning_rate": 9.661700526711813e-06, "loss": 0.7605, "step": 12550 }, { "epoch": 3.16, "grad_norm": 59.55733871459961, "learning_rate": 9.644018058690746e-06, "loss": 0.5855, "step": 12600 }, { "epoch": 3.17, "grad_norm": 0.8377301096916199, "learning_rate": 9.626335590669676e-06, "loss": 0.4117, "step": 12650 }, { "epoch": 3.19, "grad_norm": 64.69242095947266, "learning_rate": 9.608653122648609e-06, "loss": 0.5216, "step": 12700 }, { "epoch": 3.2, "grad_norm": 0.8485704660415649, "learning_rate": 9.59097065462754e-06, "loss": 0.6882, "step": 12750 }, { "epoch": 3.21, "grad_norm": 143.98147583007812, "learning_rate": 9.57328818660647e-06, "loss": 0.6463, "step": 12800 }, { "epoch": 3.22, "grad_norm": 132.84567260742188, "learning_rate": 9.555605718585403e-06, "loss": 0.7474, "step": 12850 }, { "epoch": 3.24, "grad_norm": 8.179304122924805, "learning_rate": 9.537923250564335e-06, "loss": 0.375, "step": 12900 }, { "epoch": 3.25, "grad_norm": 10.138591766357422, "learning_rate": 9.520240782543266e-06, "loss": 0.7204, "step": 12950 }, { "epoch": 3.26, "grad_norm": 0.00011070028267567977, "learning_rate": 9.502558314522198e-06, "loss": 0.3631, "step": 13000 }, { "epoch": 3.27, "grad_norm": 1.0425533056259155, "learning_rate": 9.484875846501129e-06, "loss": 0.6752, "step": 13050 }, { "epoch": 3.29, "grad_norm": 19.544971466064453, "learning_rate": 9.46719337848006e-06, "loss": 0.4082, "step": 13100 }, { "epoch": 3.3, "grad_norm": 3.29071121996094e-06, "learning_rate": 9.449510910458992e-06, "loss": 0.752, "step": 13150 }, { "epoch": 3.31, "grad_norm": 1.4096872806549072, "learning_rate": 9.431828442437924e-06, "loss": 0.739, "step": 13200 }, { "epoch": 3.32, "grad_norm": 0.1742667555809021, "learning_rate": 9.414145974416855e-06, "loss": 0.5783, "step": 13250 }, { "epoch": 3.34, "grad_norm": 0.8604665398597717, "learning_rate": 9.396463506395787e-06, "loss": 0.8603, "step": 13300 }, { "epoch": 3.35, "grad_norm": 6.3410016082343645e-06, "learning_rate": 9.378781038374718e-06, "loss": 0.4481, "step": 13350 }, { "epoch": 3.36, "grad_norm": 157.0394744873047, "learning_rate": 9.361098570353649e-06, "loss": 0.6242, "step": 13400 }, { "epoch": 3.37, "grad_norm": 0.00026235656696371734, "learning_rate": 9.343416102332581e-06, "loss": 0.7734, "step": 13450 }, { "epoch": 3.39, "grad_norm": 0.48436620831489563, "learning_rate": 9.325733634311513e-06, "loss": 0.4109, "step": 13500 }, { "epoch": 3.4, "grad_norm": 136.50823974609375, "learning_rate": 9.308051166290444e-06, "loss": 0.6214, "step": 13550 }, { "epoch": 3.41, "grad_norm": 0.14412285387516022, "learning_rate": 9.290368698269377e-06, "loss": 0.2606, "step": 13600 }, { "epoch": 3.42, "grad_norm": 11.025894165039062, "learning_rate": 9.272686230248307e-06, "loss": 0.7337, "step": 13650 }, { "epoch": 3.44, "grad_norm": 121.1470718383789, "learning_rate": 9.255003762227238e-06, "loss": 0.7108, "step": 13700 }, { "epoch": 3.45, "grad_norm": 0.08408990502357483, "learning_rate": 9.23732129420617e-06, "loss": 0.4979, "step": 13750 }, { "epoch": 3.46, "grad_norm": 0.05547923222184181, "learning_rate": 9.219638826185103e-06, "loss": 0.299, "step": 13800 }, { "epoch": 3.47, "grad_norm": 131.8295135498047, "learning_rate": 9.201956358164033e-06, "loss": 1.0513, "step": 13850 }, { "epoch": 3.49, "grad_norm": 40.073734283447266, "learning_rate": 9.184273890142966e-06, "loss": 0.4599, "step": 13900 }, { "epoch": 3.5, "grad_norm": 18.33232879638672, "learning_rate": 9.166591422121897e-06, "loss": 0.6182, "step": 13950 }, { "epoch": 3.51, "grad_norm": 0.02969328872859478, "learning_rate": 9.148908954100827e-06, "loss": 0.4793, "step": 14000 }, { "epoch": 3.52, "grad_norm": 0.36942940950393677, "learning_rate": 9.13122648607976e-06, "loss": 0.3778, "step": 14050 }, { "epoch": 3.54, "grad_norm": 0.076649971306324, "learning_rate": 9.113544018058692e-06, "loss": 0.6148, "step": 14100 }, { "epoch": 3.55, "grad_norm": 282.6568298339844, "learning_rate": 9.095861550037623e-06, "loss": 0.6784, "step": 14150 }, { "epoch": 3.56, "grad_norm": 0.14636385440826416, "learning_rate": 9.078179082016553e-06, "loss": 0.9237, "step": 14200 }, { "epoch": 3.57, "grad_norm": 0.014414280652999878, "learning_rate": 9.060496613995486e-06, "loss": 0.7111, "step": 14250 }, { "epoch": 3.59, "grad_norm": 0.10564962774515152, "learning_rate": 9.042814145974417e-06, "loss": 0.4485, "step": 14300 }, { "epoch": 3.6, "grad_norm": 0.10087831318378448, "learning_rate": 9.025131677953347e-06, "loss": 0.7537, "step": 14350 }, { "epoch": 3.61, "grad_norm": 75.64422607421875, "learning_rate": 9.007449209932281e-06, "loss": 0.4629, "step": 14400 }, { "epoch": 3.62, "grad_norm": 87.81208801269531, "learning_rate": 8.989766741911212e-06, "loss": 0.5313, "step": 14450 }, { "epoch": 3.64, "grad_norm": 0.0018619262846186757, "learning_rate": 8.972084273890143e-06, "loss": 0.7642, "step": 14500 }, { "epoch": 3.65, "grad_norm": 110.11195373535156, "learning_rate": 8.954401805869075e-06, "loss": 0.6499, "step": 14550 }, { "epoch": 3.66, "grad_norm": 0.008621015585958958, "learning_rate": 8.936719337848006e-06, "loss": 0.3583, "step": 14600 }, { "epoch": 3.67, "grad_norm": 0.022055380046367645, "learning_rate": 8.919036869826937e-06, "loss": 0.5497, "step": 14650 }, { "epoch": 3.69, "grad_norm": 67.4389419555664, "learning_rate": 8.90135440180587e-06, "loss": 0.5981, "step": 14700 }, { "epoch": 3.7, "grad_norm": 0.000478647300042212, "learning_rate": 8.883671933784801e-06, "loss": 0.259, "step": 14750 }, { "epoch": 3.71, "grad_norm": 1.5297553539276123, "learning_rate": 8.865989465763732e-06, "loss": 0.6259, "step": 14800 }, { "epoch": 3.72, "grad_norm": 36.321128845214844, "learning_rate": 8.848306997742664e-06, "loss": 0.6844, "step": 14850 }, { "epoch": 3.74, "grad_norm": 175.9180450439453, "learning_rate": 8.830624529721595e-06, "loss": 0.5772, "step": 14900 }, { "epoch": 3.75, "grad_norm": 178.33462524414062, "learning_rate": 8.812942061700526e-06, "loss": 0.5891, "step": 14950 }, { "epoch": 3.76, "grad_norm": 0.00013845643843524158, "learning_rate": 8.79525959367946e-06, "loss": 0.464, "step": 15000 }, { "epoch": 3.77, "grad_norm": 127.49348449707031, "learning_rate": 8.77757712565839e-06, "loss": 0.5844, "step": 15050 }, { "epoch": 3.79, "grad_norm": 1.6402578353881836, "learning_rate": 8.759894657637321e-06, "loss": 0.7526, "step": 15100 }, { "epoch": 3.8, "grad_norm": 0.008880015462636948, "learning_rate": 8.742212189616254e-06, "loss": 0.9234, "step": 15150 }, { "epoch": 3.81, "grad_norm": 0.4811843931674957, "learning_rate": 8.724529721595184e-06, "loss": 0.848, "step": 15200 }, { "epoch": 3.82, "grad_norm": 0.0008742750505916774, "learning_rate": 8.706847253574115e-06, "loss": 0.4136, "step": 15250 }, { "epoch": 3.84, "grad_norm": 45.28816604614258, "learning_rate": 8.68916478555305e-06, "loss": 0.6978, "step": 15300 }, { "epoch": 3.85, "grad_norm": 0.014465034939348698, "learning_rate": 8.67148231753198e-06, "loss": 0.6124, "step": 15350 }, { "epoch": 3.86, "grad_norm": 0.01468442752957344, "learning_rate": 8.65379984951091e-06, "loss": 0.925, "step": 15400 }, { "epoch": 3.88, "grad_norm": 1.076714283954061e-06, "learning_rate": 8.636117381489843e-06, "loss": 0.5271, "step": 15450 }, { "epoch": 3.89, "grad_norm": 4.781663847097661e-07, "learning_rate": 8.618434913468774e-06, "loss": 0.4686, "step": 15500 }, { "epoch": 3.9, "grad_norm": 1.0695022344589233, "learning_rate": 8.600752445447704e-06, "loss": 1.076, "step": 15550 }, { "epoch": 3.91, "grad_norm": 0.3064178228378296, "learning_rate": 8.583069977426637e-06, "loss": 0.4409, "step": 15600 }, { "epoch": 3.93, "grad_norm": 95.81256103515625, "learning_rate": 8.56538750940557e-06, "loss": 0.628, "step": 15650 }, { "epoch": 3.94, "grad_norm": 0.011423008516430855, "learning_rate": 8.5477050413845e-06, "loss": 0.4738, "step": 15700 }, { "epoch": 3.95, "grad_norm": 68.7823257446289, "learning_rate": 8.53002257336343e-06, "loss": 0.5614, "step": 15750 }, { "epoch": 3.96, "grad_norm": 0.0003278045041952282, "learning_rate": 8.512340105342363e-06, "loss": 0.451, "step": 15800 }, { "epoch": 3.98, "grad_norm": 5.685105293196102e-07, "learning_rate": 8.494657637321294e-06, "loss": 0.6919, "step": 15850 }, { "epoch": 3.99, "grad_norm": 0.006908051203936338, "learning_rate": 8.476975169300226e-06, "loss": 0.7209, "step": 15900 }, { "epoch": 4.0, "grad_norm": 0.14153322577476501, "learning_rate": 8.459292701279158e-06, "loss": 0.8544, "step": 15950 }, { "epoch": 4.01, "grad_norm": 0.01233228575438261, "learning_rate": 8.44161023325809e-06, "loss": 0.1127, "step": 16000 }, { "epoch": 4.03, "grad_norm": 0.02049972675740719, "learning_rate": 8.42392776523702e-06, "loss": 0.2392, "step": 16050 }, { "epoch": 4.04, "grad_norm": 0.6001113653182983, "learning_rate": 8.406245297215952e-06, "loss": 0.2408, "step": 16100 }, { "epoch": 4.05, "grad_norm": 0.7253586649894714, "learning_rate": 8.388562829194883e-06, "loss": 0.679, "step": 16150 }, { "epoch": 4.06, "grad_norm": 0.20070885121822357, "learning_rate": 8.370880361173815e-06, "loss": 0.5534, "step": 16200 }, { "epoch": 4.08, "grad_norm": 0.004428381100296974, "learning_rate": 8.353197893152748e-06, "loss": 0.3753, "step": 16250 }, { "epoch": 4.09, "grad_norm": 0.1646382063627243, "learning_rate": 8.335515425131678e-06, "loss": 0.7136, "step": 16300 }, { "epoch": 4.1, "grad_norm": 4.304123401641846, "learning_rate": 8.31783295711061e-06, "loss": 0.6533, "step": 16350 }, { "epoch": 4.11, "grad_norm": 0.0014060864923521876, "learning_rate": 8.300150489089542e-06, "loss": 0.2931, "step": 16400 }, { "epoch": 4.13, "grad_norm": 10.760331153869629, "learning_rate": 8.282468021068472e-06, "loss": 0.2996, "step": 16450 }, { "epoch": 4.14, "grad_norm": 151.8526611328125, "learning_rate": 8.264785553047405e-06, "loss": 0.2342, "step": 16500 }, { "epoch": 4.15, "grad_norm": 0.2262250781059265, "learning_rate": 8.247103085026337e-06, "loss": 0.2152, "step": 16550 }, { "epoch": 4.16, "grad_norm": 0.028175359591841698, "learning_rate": 8.229420617005268e-06, "loss": 0.2108, "step": 16600 }, { "epoch": 4.18, "grad_norm": 1.2244036197662354, "learning_rate": 8.211738148984198e-06, "loss": 0.4471, "step": 16650 }, { "epoch": 4.19, "grad_norm": 0.12875045835971832, "learning_rate": 8.194055680963131e-06, "loss": 0.5662, "step": 16700 }, { "epoch": 4.2, "grad_norm": 3.702627420425415, "learning_rate": 8.176373212942062e-06, "loss": 0.3945, "step": 16750 }, { "epoch": 4.21, "grad_norm": 50.61404800415039, "learning_rate": 8.158690744920994e-06, "loss": 0.2347, "step": 16800 }, { "epoch": 4.23, "grad_norm": 0.736967146396637, "learning_rate": 8.141008276899926e-06, "loss": 0.2615, "step": 16850 }, { "epoch": 4.24, "grad_norm": 0.00011446132702985778, "learning_rate": 8.123325808878857e-06, "loss": 0.5149, "step": 16900 }, { "epoch": 4.25, "grad_norm": 0.0010398293379694223, "learning_rate": 8.105643340857788e-06, "loss": 0.2957, "step": 16950 }, { "epoch": 4.26, "grad_norm": 0.26418277621269226, "learning_rate": 8.08796087283672e-06, "loss": 0.2704, "step": 17000 }, { "epoch": 4.28, "grad_norm": 0.8061837553977966, "learning_rate": 8.070278404815651e-06, "loss": 0.367, "step": 17050 }, { "epoch": 4.29, "grad_norm": 0.010115943849086761, "learning_rate": 8.052595936794583e-06, "loss": 0.3768, "step": 17100 }, { "epoch": 4.3, "grad_norm": 27.51811981201172, "learning_rate": 8.034913468773514e-06, "loss": 0.3892, "step": 17150 }, { "epoch": 4.31, "grad_norm": 0.000684226572047919, "learning_rate": 8.017231000752446e-06, "loss": 0.1805, "step": 17200 }, { "epoch": 4.33, "grad_norm": 0.08357678353786469, "learning_rate": 7.999548532731377e-06, "loss": 0.2773, "step": 17250 }, { "epoch": 4.34, "grad_norm": 292.503662109375, "learning_rate": 7.981866064710308e-06, "loss": 0.6283, "step": 17300 }, { "epoch": 4.35, "grad_norm": 0.1264430582523346, "learning_rate": 7.96418359668924e-06, "loss": 0.4072, "step": 17350 }, { "epoch": 4.36, "grad_norm": 1.3433716958388686e-05, "learning_rate": 7.946501128668173e-06, "loss": 0.8405, "step": 17400 }, { "epoch": 4.38, "grad_norm": 27.759994506835938, "learning_rate": 7.928818660647103e-06, "loss": 0.4456, "step": 17450 }, { "epoch": 4.39, "grad_norm": 369.9099426269531, "learning_rate": 7.911136192626036e-06, "loss": 0.3382, "step": 17500 }, { "epoch": 4.4, "grad_norm": 6.0055251121521, "learning_rate": 7.893453724604966e-06, "loss": 0.329, "step": 17550 }, { "epoch": 4.41, "grad_norm": 0.17973710596561432, "learning_rate": 7.875771256583897e-06, "loss": 0.6193, "step": 17600 }, { "epoch": 4.43, "grad_norm": 0.03942597284913063, "learning_rate": 7.85808878856283e-06, "loss": 0.3823, "step": 17650 }, { "epoch": 4.44, "grad_norm": 0.0010533991735428572, "learning_rate": 7.840406320541762e-06, "loss": 0.6641, "step": 17700 }, { "epoch": 4.45, "grad_norm": 3.6850650531050633e-07, "learning_rate": 7.822723852520693e-06, "loss": 0.4148, "step": 17750 }, { "epoch": 4.46, "grad_norm": 5.283959399093874e-05, "learning_rate": 7.805041384499625e-06, "loss": 0.799, "step": 17800 }, { "epoch": 4.48, "grad_norm": 0.01196613721549511, "learning_rate": 7.787358916478556e-06, "loss": 0.5424, "step": 17850 }, { "epoch": 4.49, "grad_norm": 211.05799865722656, "learning_rate": 7.769676448457486e-06, "loss": 0.2341, "step": 17900 }, { "epoch": 4.5, "grad_norm": 8.655371743770957e-07, "learning_rate": 7.751993980436419e-06, "loss": 0.5349, "step": 17950 }, { "epoch": 4.51, "grad_norm": 1.5644945408621602e-11, "learning_rate": 7.734311512415351e-06, "loss": 0.0804, "step": 18000 }, { "epoch": 4.53, "grad_norm": 0.00036508633638732135, "learning_rate": 7.716629044394282e-06, "loss": 0.3295, "step": 18050 }, { "epoch": 4.54, "grad_norm": 1.3209816270357e-13, "learning_rate": 7.698946576373214e-06, "loss": 0.3606, "step": 18100 }, { "epoch": 4.55, "grad_norm": 314.8194885253906, "learning_rate": 7.681264108352145e-06, "loss": 0.3064, "step": 18150 }, { "epoch": 4.56, "grad_norm": 1.250010797093637e-07, "learning_rate": 7.663581640331076e-06, "loss": 0.2967, "step": 18200 }, { "epoch": 4.58, "grad_norm": 1.0573174953460693, "learning_rate": 7.645899172310008e-06, "loss": 0.4857, "step": 18250 }, { "epoch": 4.59, "grad_norm": 204.4314727783203, "learning_rate": 7.628216704288939e-06, "loss": 0.358, "step": 18300 }, { "epoch": 4.6, "grad_norm": 0.02004345878958702, "learning_rate": 7.610534236267871e-06, "loss": 0.753, "step": 18350 }, { "epoch": 4.61, "grad_norm": 302.43280029296875, "learning_rate": 7.592851768246803e-06, "loss": 0.3723, "step": 18400 }, { "epoch": 4.63, "grad_norm": 0.0004978284705430269, "learning_rate": 7.575169300225734e-06, "loss": 0.4034, "step": 18450 }, { "epoch": 4.64, "grad_norm": 93.66849517822266, "learning_rate": 7.557486832204665e-06, "loss": 0.4249, "step": 18500 }, { "epoch": 4.65, "grad_norm": 0.001678618835285306, "learning_rate": 7.5398043641835965e-06, "loss": 0.7051, "step": 18550 }, { "epoch": 4.67, "grad_norm": 0.37766626477241516, "learning_rate": 7.522121896162528e-06, "loss": 0.2375, "step": 18600 }, { "epoch": 4.68, "grad_norm": 268.7151184082031, "learning_rate": 7.50443942814146e-06, "loss": 0.9723, "step": 18650 }, { "epoch": 4.69, "grad_norm": 10.93520450592041, "learning_rate": 7.486756960120392e-06, "loss": 0.4446, "step": 18700 }, { "epoch": 4.7, "grad_norm": 0.0002736333408392966, "learning_rate": 7.4690744920993235e-06, "loss": 0.5695, "step": 18750 }, { "epoch": 4.72, "grad_norm": 0.006334410980343819, "learning_rate": 7.451392024078254e-06, "loss": 0.4816, "step": 18800 }, { "epoch": 4.73, "grad_norm": 2.021748046754368e-10, "learning_rate": 7.433709556057186e-06, "loss": 0.6205, "step": 18850 }, { "epoch": 4.74, "grad_norm": 77.61640930175781, "learning_rate": 7.416027088036117e-06, "loss": 0.1743, "step": 18900 }, { "epoch": 4.75, "grad_norm": 0.24281173944473267, "learning_rate": 7.39834462001505e-06, "loss": 0.3958, "step": 18950 }, { "epoch": 4.77, "grad_norm": 0.0005730040138587356, "learning_rate": 7.380662151993981e-06, "loss": 0.2709, "step": 19000 }, { "epoch": 4.78, "grad_norm": 25.074310302734375, "learning_rate": 7.362979683972912e-06, "loss": 0.3811, "step": 19050 }, { "epoch": 4.79, "grad_norm": 0.0002688245731405914, "learning_rate": 7.3452972159518435e-06, "loss": 0.2937, "step": 19100 }, { "epoch": 4.8, "grad_norm": 6.246182601898909e-05, "learning_rate": 7.327614747930775e-06, "loss": 0.2862, "step": 19150 }, { "epoch": 4.82, "grad_norm": 0.000318751554004848, "learning_rate": 7.309932279909706e-06, "loss": 0.132, "step": 19200 }, { "epoch": 4.83, "grad_norm": 285.48297119140625, "learning_rate": 7.292249811888639e-06, "loss": 0.3126, "step": 19250 }, { "epoch": 4.84, "grad_norm": 214.23065185546875, "learning_rate": 7.2745673438675705e-06, "loss": 0.6408, "step": 19300 }, { "epoch": 4.85, "grad_norm": 305.9626159667969, "learning_rate": 7.256884875846501e-06, "loss": 0.4605, "step": 19350 }, { "epoch": 4.87, "grad_norm": 4.2915186782011006e-07, "learning_rate": 7.239202407825433e-06, "loss": 0.2248, "step": 19400 }, { "epoch": 4.88, "grad_norm": 265.24072265625, "learning_rate": 7.221519939804364e-06, "loss": 0.6776, "step": 19450 }, { "epoch": 4.89, "grad_norm": 250.4654083251953, "learning_rate": 7.203837471783295e-06, "loss": 0.3709, "step": 19500 }, { "epoch": 4.9, "grad_norm": 0.0005780484061688185, "learning_rate": 7.186155003762228e-06, "loss": 0.3521, "step": 19550 }, { "epoch": 4.92, "grad_norm": 8.780172348022461, "learning_rate": 7.168472535741159e-06, "loss": 0.3998, "step": 19600 }, { "epoch": 4.93, "grad_norm": 7.643636703491211, "learning_rate": 7.1507900677200905e-06, "loss": 0.5537, "step": 19650 }, { "epoch": 4.94, "grad_norm": 0.0002484459837432951, "learning_rate": 7.133107599699022e-06, "loss": 0.4808, "step": 19700 }, { "epoch": 4.95, "grad_norm": 0.2631732225418091, "learning_rate": 7.115425131677953e-06, "loss": 0.6781, "step": 19750 }, { "epoch": 4.97, "grad_norm": 0.0346391536295414, "learning_rate": 7.097742663656884e-06, "loss": 0.1673, "step": 19800 }, { "epoch": 4.98, "grad_norm": 0.006426098290830851, "learning_rate": 7.0800601956358176e-06, "loss": 0.2001, "step": 19850 }, { "epoch": 4.99, "grad_norm": 0.070701465010643, "learning_rate": 7.062377727614748e-06, "loss": 0.6119, "step": 19900 }, { "epoch": 5.0, "grad_norm": 9.641678479965776e-05, "learning_rate": 7.04469525959368e-06, "loss": 0.1432, "step": 19950 }, { "epoch": 5.02, "grad_norm": 80.3648681640625, "learning_rate": 7.027012791572611e-06, "loss": 0.2837, "step": 20000 }, { "epoch": 5.03, "grad_norm": 7.515856123063713e-05, "learning_rate": 7.009330323551543e-06, "loss": 0.0325, "step": 20050 }, { "epoch": 5.04, "grad_norm": 9.76786541286856e-05, "learning_rate": 6.9916478555304745e-06, "loss": 0.28, "step": 20100 }, { "epoch": 5.05, "grad_norm": 0.058834467083215714, "learning_rate": 6.973965387509406e-06, "loss": 0.119, "step": 20150 }, { "epoch": 5.07, "grad_norm": 3.0734496116638184, "learning_rate": 6.9562829194883376e-06, "loss": 0.1121, "step": 20200 }, { "epoch": 5.08, "grad_norm": 173.53060913085938, "learning_rate": 6.938600451467269e-06, "loss": 0.4994, "step": 20250 }, { "epoch": 5.09, "grad_norm": 1.482841071265284e-06, "learning_rate": 6.920917983446201e-06, "loss": 0.4273, "step": 20300 }, { "epoch": 5.1, "grad_norm": 0.06339254975318909, "learning_rate": 6.903235515425132e-06, "loss": 0.0653, "step": 20350 }, { "epoch": 5.12, "grad_norm": 29.73435401916504, "learning_rate": 6.885553047404064e-06, "loss": 0.0064, "step": 20400 }, { "epoch": 5.13, "grad_norm": 0.0535583458840847, "learning_rate": 6.8678705793829944e-06, "loss": 0.1328, "step": 20450 }, { "epoch": 5.14, "grad_norm": 0.016700129956007004, "learning_rate": 6.850188111361927e-06, "loss": 0.3879, "step": 20500 }, { "epoch": 5.15, "grad_norm": 3.702952017192729e-05, "learning_rate": 6.832505643340858e-06, "loss": 0.1604, "step": 20550 }, { "epoch": 5.17, "grad_norm": 0.03472837060689926, "learning_rate": 6.814823175319789e-06, "loss": 0.2436, "step": 20600 }, { "epoch": 5.18, "grad_norm": 3.1909748940961435e-05, "learning_rate": 6.7971407072987215e-06, "loss": 0.1352, "step": 20650 }, { "epoch": 5.19, "grad_norm": 0.3979862630367279, "learning_rate": 6.779458239277653e-06, "loss": 0.1159, "step": 20700 }, { "epoch": 5.2, "grad_norm": 0.0028309274930506945, "learning_rate": 6.761775771256584e-06, "loss": 0.2612, "step": 20750 }, { "epoch": 5.22, "grad_norm": 0.7586016654968262, "learning_rate": 6.744093303235516e-06, "loss": 0.4589, "step": 20800 }, { "epoch": 5.23, "grad_norm": 0.0062132058665156364, "learning_rate": 6.726410835214448e-06, "loss": 0.0843, "step": 20850 }, { "epoch": 5.24, "grad_norm": 0.01292335707694292, "learning_rate": 6.708728367193378e-06, "loss": 0.092, "step": 20900 }, { "epoch": 5.25, "grad_norm": 0.0012096440186724067, "learning_rate": 6.691045899172311e-06, "loss": 0.1515, "step": 20950 }, { "epoch": 5.27, "grad_norm": 0.003023844677954912, "learning_rate": 6.673363431151242e-06, "loss": 0.3177, "step": 21000 }, { "epoch": 5.28, "grad_norm": 106.2956771850586, "learning_rate": 6.655680963130173e-06, "loss": 0.0315, "step": 21050 }, { "epoch": 5.29, "grad_norm": 0.0011365425307303667, "learning_rate": 6.637998495109105e-06, "loss": 0.0159, "step": 21100 }, { "epoch": 5.3, "grad_norm": 39.502681732177734, "learning_rate": 6.620316027088036e-06, "loss": 0.3804, "step": 21150 }, { "epoch": 5.32, "grad_norm": 0.017230931669473648, "learning_rate": 6.602633559066968e-06, "loss": 0.0453, "step": 21200 }, { "epoch": 5.33, "grad_norm": 6.043082976248115e-06, "learning_rate": 6.584951091045899e-06, "loss": 0.3094, "step": 21250 }, { "epoch": 5.34, "grad_norm": 6.83969769710302e-10, "learning_rate": 6.567268623024831e-06, "loss": 0.3382, "step": 21300 }, { "epoch": 5.35, "grad_norm": 1.2151496714234156e-13, "learning_rate": 6.549586155003762e-06, "loss": 0.0469, "step": 21350 }, { "epoch": 5.37, "grad_norm": 129.63966369628906, "learning_rate": 6.531903686982694e-06, "loss": 0.1542, "step": 21400 }, { "epoch": 5.38, "grad_norm": 8.008062764019996e-07, "learning_rate": 6.514221218961625e-06, "loss": 0.1121, "step": 21450 }, { "epoch": 5.39, "grad_norm": 195.3101043701172, "learning_rate": 6.496538750940557e-06, "loss": 0.134, "step": 21500 }, { "epoch": 5.41, "grad_norm": 0.44227921962738037, "learning_rate": 6.4788562829194885e-06, "loss": 0.1614, "step": 21550 }, { "epoch": 5.42, "grad_norm": 389.9450988769531, "learning_rate": 6.46117381489842e-06, "loss": 0.2223, "step": 21600 }, { "epoch": 5.43, "grad_norm": 2.417748987681989e-07, "learning_rate": 6.443491346877352e-06, "loss": 0.2297, "step": 21650 }, { "epoch": 5.44, "grad_norm": 0.0011466313153505325, "learning_rate": 6.425808878856283e-06, "loss": 0.0367, "step": 21700 }, { "epoch": 5.46, "grad_norm": 0.4562750458717346, "learning_rate": 6.408126410835215e-06, "loss": 0.3361, "step": 21750 }, { "epoch": 5.47, "grad_norm": 3.822188591584563e-05, "learning_rate": 6.390443942814146e-06, "loss": 0.0979, "step": 21800 }, { "epoch": 5.48, "grad_norm": 100.44294738769531, "learning_rate": 6.372761474793078e-06, "loss": 0.0573, "step": 21850 }, { "epoch": 5.49, "grad_norm": 1.8141976397600956e-05, "learning_rate": 6.355079006772009e-06, "loss": 0.6044, "step": 21900 }, { "epoch": 5.51, "grad_norm": 2.5538651055034833e-12, "learning_rate": 6.337396538750941e-06, "loss": 0.2549, "step": 21950 }, { "epoch": 5.52, "grad_norm": 7.968230164578927e-08, "learning_rate": 6.319714070729872e-06, "loss": 0.36, "step": 22000 }, { "epoch": 5.53, "grad_norm": 0.001464845146983862, "learning_rate": 6.302031602708804e-06, "loss": 0.0043, "step": 22050 }, { "epoch": 5.54, "grad_norm": 0.5217474102973938, "learning_rate": 6.2843491346877355e-06, "loss": 0.3358, "step": 22100 }, { "epoch": 5.56, "grad_norm": 2.1627647583954968e-05, "learning_rate": 6.266666666666666e-06, "loss": 0.4962, "step": 22150 }, { "epoch": 5.57, "grad_norm": 0.0039770700968801975, "learning_rate": 6.248984198645599e-06, "loss": 0.0886, "step": 22200 }, { "epoch": 5.58, "grad_norm": 0.028452860191464424, "learning_rate": 6.23130173062453e-06, "loss": 0.2919, "step": 22250 }, { "epoch": 5.59, "grad_norm": 1.0354268550872803, "learning_rate": 6.213619262603461e-06, "loss": 0.1583, "step": 22300 }, { "epoch": 5.61, "grad_norm": 0.0001276719121960923, "learning_rate": 6.195936794582393e-06, "loss": 0.1062, "step": 22350 }, { "epoch": 5.62, "grad_norm": 213.48941040039062, "learning_rate": 6.178254326561325e-06, "loss": 0.377, "step": 22400 }, { "epoch": 5.63, "grad_norm": 8.587969205109403e-06, "learning_rate": 6.1605718585402555e-06, "loss": 0.2043, "step": 22450 }, { "epoch": 5.64, "grad_norm": 0.011805477552115917, "learning_rate": 6.142889390519188e-06, "loss": 0.2744, "step": 22500 }, { "epoch": 5.66, "grad_norm": 1.4445524776363072e-08, "learning_rate": 6.125206922498119e-06, "loss": 0.0145, "step": 22550 }, { "epoch": 5.67, "grad_norm": 136.72720336914062, "learning_rate": 6.10752445447705e-06, "loss": 0.1608, "step": 22600 }, { "epoch": 5.68, "grad_norm": 8.377895937883295e-06, "learning_rate": 6.0898419864559826e-06, "loss": 0.1146, "step": 22650 }, { "epoch": 5.69, "grad_norm": 0.0005771568394266069, "learning_rate": 6.072159518434913e-06, "loss": 0.3716, "step": 22700 }, { "epoch": 5.71, "grad_norm": 0.0033020416740328074, "learning_rate": 6.054477050413845e-06, "loss": 0.1609, "step": 22750 }, { "epoch": 5.72, "grad_norm": 0.014289168640971184, "learning_rate": 6.036794582392777e-06, "loss": 0.2873, "step": 22800 }, { "epoch": 5.73, "grad_norm": 433.4857482910156, "learning_rate": 6.019112114371708e-06, "loss": 0.2766, "step": 22850 }, { "epoch": 5.74, "grad_norm": 51.506011962890625, "learning_rate": 6.0014296463506395e-06, "loss": 0.2557, "step": 22900 }, { "epoch": 5.76, "grad_norm": 2.9865319106647803e-07, "learning_rate": 5.983747178329572e-06, "loss": 0.052, "step": 22950 }, { "epoch": 5.77, "grad_norm": 0.0004749756189994514, "learning_rate": 5.9660647103085026e-06, "loss": 0.048, "step": 23000 }, { "epoch": 5.78, "grad_norm": 296.063720703125, "learning_rate": 5.948382242287434e-06, "loss": 0.1432, "step": 23050 }, { "epoch": 5.79, "grad_norm": 0.002446663100272417, "learning_rate": 5.9306997742663665e-06, "loss": 0.3151, "step": 23100 }, { "epoch": 5.81, "grad_norm": 0.012231925502419472, "learning_rate": 5.913017306245297e-06, "loss": 0.0295, "step": 23150 }, { "epoch": 5.82, "grad_norm": 0.006459045223891735, "learning_rate": 5.895334838224229e-06, "loss": 0.0319, "step": 23200 }, { "epoch": 5.83, "grad_norm": 5.6175377238787405e-08, "learning_rate": 5.87765237020316e-06, "loss": 0.1096, "step": 23250 }, { "epoch": 5.84, "grad_norm": 9.727654060043278e-07, "learning_rate": 5.859969902182092e-06, "loss": 0.365, "step": 23300 }, { "epoch": 5.86, "grad_norm": 167.01791381835938, "learning_rate": 5.842287434161023e-06, "loss": 0.0494, "step": 23350 }, { "epoch": 5.87, "grad_norm": 0.05854243040084839, "learning_rate": 5.824604966139955e-06, "loss": 0.0218, "step": 23400 }, { "epoch": 5.88, "grad_norm": 2.8002886676148364e-09, "learning_rate": 5.8069224981188865e-06, "loss": 0.0119, "step": 23450 }, { "epoch": 5.89, "grad_norm": 455.8995361328125, "learning_rate": 5.789240030097818e-06, "loss": 0.3402, "step": 23500 }, { "epoch": 5.91, "grad_norm": 0.0034980960190296173, "learning_rate": 5.77155756207675e-06, "loss": 0.1623, "step": 23550 }, { "epoch": 5.92, "grad_norm": 0.048077382147312164, "learning_rate": 5.753875094055681e-06, "loss": 0.5028, "step": 23600 }, { "epoch": 5.93, "grad_norm": 1.1395950317382812, "learning_rate": 5.736192626034613e-06, "loss": 0.1841, "step": 23650 }, { "epoch": 5.94, "grad_norm": 3.0090935979387723e-05, "learning_rate": 5.718510158013544e-06, "loss": 0.5312, "step": 23700 }, { "epoch": 5.96, "grad_norm": 4.985315626981901e-08, "learning_rate": 5.700827689992476e-06, "loss": 0.0867, "step": 23750 }, { "epoch": 5.97, "grad_norm": 0.7515669465065002, "learning_rate": 5.683145221971407e-06, "loss": 0.3645, "step": 23800 }, { "epoch": 5.98, "grad_norm": 14.448786735534668, "learning_rate": 5.665462753950339e-06, "loss": 0.0975, "step": 23850 }, { "epoch": 5.99, "grad_norm": 0.58511883020401, "learning_rate": 5.6477802859292704e-06, "loss": 0.0981, "step": 23900 }, { "epoch": 6.01, "grad_norm": 5.8292873291065916e-05, "learning_rate": 5.630097817908202e-06, "loss": 0.2598, "step": 23950 }, { "epoch": 6.02, "grad_norm": 0.03704287111759186, "learning_rate": 5.6124153498871335e-06, "loss": 0.1594, "step": 24000 }, { "epoch": 6.03, "grad_norm": 0.0010854690335690975, "learning_rate": 5.594732881866065e-06, "loss": 0.2415, "step": 24050 }, { "epoch": 6.04, "grad_norm": 381.2314147949219, "learning_rate": 5.577050413844996e-06, "loss": 0.0477, "step": 24100 }, { "epoch": 6.06, "grad_norm": 8.66334667080082e-05, "learning_rate": 5.559367945823928e-06, "loss": 0.0424, "step": 24150 }, { "epoch": 6.07, "grad_norm": 0.019515322521328926, "learning_rate": 5.54168547780286e-06, "loss": 0.3617, "step": 24200 }, { "epoch": 6.08, "grad_norm": 0.00011614364484557882, "learning_rate": 5.52400300978179e-06, "loss": 0.1944, "step": 24250 }, { "epoch": 6.09, "grad_norm": 0.00019373864051885903, "learning_rate": 5.506320541760723e-06, "loss": 0.0011, "step": 24300 }, { "epoch": 6.11, "grad_norm": 1.0937032612901021e-08, "learning_rate": 5.488638073739654e-06, "loss": 0.0014, "step": 24350 }, { "epoch": 6.12, "grad_norm": 2.1784097691945198e-13, "learning_rate": 5.470955605718585e-06, "loss": 0.0055, "step": 24400 }, { "epoch": 6.13, "grad_norm": 0.01839843951165676, "learning_rate": 5.4532731376975175e-06, "loss": 0.0042, "step": 24450 }, { "epoch": 6.14, "grad_norm": 4.981990930907898e-10, "learning_rate": 5.435590669676449e-06, "loss": 0.103, "step": 24500 }, { "epoch": 6.16, "grad_norm": 0.0047708419151604176, "learning_rate": 5.41790820165538e-06, "loss": 0.0022, "step": 24550 }, { "epoch": 6.17, "grad_norm": 0.003085497999563813, "learning_rate": 5.400225733634312e-06, "loss": 0.0021, "step": 24600 }, { "epoch": 6.18, "grad_norm": 6.570710642250788e-11, "learning_rate": 5.382543265613244e-06, "loss": 0.2051, "step": 24650 }, { "epoch": 6.2, "grad_norm": 0.0029285515192896128, "learning_rate": 5.364860797592174e-06, "loss": 0.0012, "step": 24700 }, { "epoch": 6.21, "grad_norm": 3.4288578376617806e-07, "learning_rate": 5.347178329571107e-06, "loss": 0.0001, "step": 24750 }, { "epoch": 6.22, "grad_norm": 0.00539399404078722, "learning_rate": 5.3294958615500375e-06, "loss": 0.2899, "step": 24800 }, { "epoch": 6.23, "grad_norm": 2.6356909188507416e-07, "learning_rate": 5.311813393528969e-06, "loss": 0.0019, "step": 24850 }, { "epoch": 6.25, "grad_norm": 0.019658172503113747, "learning_rate": 5.294130925507901e-06, "loss": 0.1133, "step": 24900 }, { "epoch": 6.26, "grad_norm": 4.7282670834203344e-11, "learning_rate": 5.276448457486832e-06, "loss": 0.0001, "step": 24950 }, { "epoch": 6.27, "grad_norm": 1.2473710739868693e-06, "learning_rate": 5.258765989465764e-06, "loss": 0.1143, "step": 25000 }, { "epoch": 6.28, "grad_norm": 0.38085153698921204, "learning_rate": 5.241083521444696e-06, "loss": 0.059, "step": 25050 }, { "epoch": 6.3, "grad_norm": 5.584224224090576, "learning_rate": 5.223401053423627e-06, "loss": 0.0833, "step": 25100 }, { "epoch": 6.31, "grad_norm": 9.337106348539237e-06, "learning_rate": 5.205718585402558e-06, "loss": 0.0876, "step": 25150 }, { "epoch": 6.32, "grad_norm": 4.118080099146937e-08, "learning_rate": 5.188036117381491e-06, "loss": 0.0703, "step": 25200 }, { "epoch": 6.33, "grad_norm": 1.8987177554663504e-06, "learning_rate": 5.170353649360421e-06, "loss": 0.0625, "step": 25250 }, { "epoch": 6.35, "grad_norm": 4.3221673462490173e-10, "learning_rate": 5.152671181339353e-06, "loss": 0.0284, "step": 25300 }, { "epoch": 6.36, "grad_norm": 0.000691065622959286, "learning_rate": 5.134988713318285e-06, "loss": 0.0422, "step": 25350 }, { "epoch": 6.37, "grad_norm": 0.00046700576785951853, "learning_rate": 5.117306245297216e-06, "loss": 0.0001, "step": 25400 }, { "epoch": 6.38, "grad_norm": 0.008938438259065151, "learning_rate": 5.099623777276148e-06, "loss": 0.0141, "step": 25450 }, { "epoch": 6.4, "grad_norm": 0.16503383219242096, "learning_rate": 5.081941309255079e-06, "loss": 0.0646, "step": 25500 }, { "epoch": 6.41, "grad_norm": 8.952581993071362e-06, "learning_rate": 5.064258841234011e-06, "loss": 0.036, "step": 25550 }, { "epoch": 6.42, "grad_norm": 0.014195716008543968, "learning_rate": 5.046576373212942e-06, "loss": 0.0005, "step": 25600 }, { "epoch": 6.43, "grad_norm": 0.00028850819217041135, "learning_rate": 5.028893905191874e-06, "loss": 0.0754, "step": 25650 }, { "epoch": 6.45, "grad_norm": 0.00020963407587260008, "learning_rate": 5.011211437170805e-06, "loss": 0.0003, "step": 25700 }, { "epoch": 6.46, "grad_norm": 0.0010497659677639604, "learning_rate": 4.993528969149737e-06, "loss": 0.6013, "step": 25750 }, { "epoch": 6.47, "grad_norm": 1.387237716699019e-06, "learning_rate": 4.975846501128668e-06, "loss": 0.005, "step": 25800 }, { "epoch": 6.48, "grad_norm": 1.8294354958925396e-05, "learning_rate": 4.9581640331076e-06, "loss": 0.0, "step": 25850 }, { "epoch": 6.5, "grad_norm": 4.903622539131902e-06, "learning_rate": 4.9404815650865315e-06, "loss": 0.0003, "step": 25900 }, { "epoch": 6.51, "grad_norm": 0.000930552021600306, "learning_rate": 4.922799097065463e-06, "loss": 0.0464, "step": 25950 }, { "epoch": 6.52, "grad_norm": 2.9821951102348976e-05, "learning_rate": 4.905116629044395e-06, "loss": 0.0854, "step": 26000 }, { "epoch": 6.53, "grad_norm": 0.19266781210899353, "learning_rate": 4.887434161023326e-06, "loss": 0.1578, "step": 26050 }, { "epoch": 6.55, "grad_norm": 6.610630862269318e-06, "learning_rate": 4.869751693002258e-06, "loss": 0.0004, "step": 26100 }, { "epoch": 6.56, "grad_norm": 6.910874503773812e-07, "learning_rate": 4.852069224981189e-06, "loss": 0.0013, "step": 26150 }, { "epoch": 6.57, "grad_norm": 0.00030907560721971095, "learning_rate": 4.834386756960121e-06, "loss": 0.0005, "step": 26200 }, { "epoch": 6.58, "grad_norm": 1.2135699112292286e-09, "learning_rate": 4.816704288939052e-06, "loss": 0.1581, "step": 26250 }, { "epoch": 6.6, "grad_norm": 8.979808626463637e-06, "learning_rate": 4.799021820917984e-06, "loss": 0.1339, "step": 26300 }, { "epoch": 6.61, "grad_norm": 8.109305053949356e-05, "learning_rate": 4.781339352896915e-06, "loss": 0.1607, "step": 26350 }, { "epoch": 6.62, "grad_norm": 0.11362000554800034, "learning_rate": 4.763656884875847e-06, "loss": 0.0152, "step": 26400 }, { "epoch": 6.63, "grad_norm": 3.168620969518088e-05, "learning_rate": 4.7459744168547785e-06, "loss": 0.062, "step": 26450 }, { "epoch": 6.65, "grad_norm": 2.37572979927063, "learning_rate": 4.728291948833709e-06, "loss": 0.0001, "step": 26500 }, { "epoch": 6.66, "grad_norm": 1.1477128509795875e-06, "learning_rate": 4.710609480812642e-06, "loss": 0.2304, "step": 26550 }, { "epoch": 6.67, "grad_norm": 3.561492079029449e-08, "learning_rate": 4.692927012791573e-06, "loss": 0.1046, "step": 26600 }, { "epoch": 6.68, "grad_norm": 1.6958483457565308, "learning_rate": 4.675244544770504e-06, "loss": 0.0273, "step": 26650 }, { "epoch": 6.7, "grad_norm": 6.609186675632372e-05, "learning_rate": 4.657562076749436e-06, "loss": 0.0504, "step": 26700 }, { "epoch": 6.71, "grad_norm": 0.02066265046596527, "learning_rate": 4.639879608728368e-06, "loss": 0.0845, "step": 26750 }, { "epoch": 6.72, "grad_norm": 0.6868598461151123, "learning_rate": 4.6221971407072985e-06, "loss": 0.064, "step": 26800 }, { "epoch": 6.73, "grad_norm": 4.525861463378078e-09, "learning_rate": 4.604514672686231e-06, "loss": 0.0372, "step": 26850 }, { "epoch": 6.75, "grad_norm": 0.0018904170719906688, "learning_rate": 4.5868322046651625e-06, "loss": 0.171, "step": 26900 }, { "epoch": 6.76, "grad_norm": 0.06831281632184982, "learning_rate": 4.569149736644093e-06, "loss": 0.0005, "step": 26950 }, { "epoch": 6.77, "grad_norm": 2.7328371288604103e-05, "learning_rate": 4.5514672686230256e-06, "loss": 0.0834, "step": 27000 }, { "epoch": 6.78, "grad_norm": 1.312251782792373e-07, "learning_rate": 4.533784800601956e-06, "loss": 0.0009, "step": 27050 }, { "epoch": 6.8, "grad_norm": 0.006464004050940275, "learning_rate": 4.516102332580888e-06, "loss": 0.1302, "step": 27100 }, { "epoch": 6.81, "grad_norm": 4.0537888601477334e-09, "learning_rate": 4.49841986455982e-06, "loss": 0.1255, "step": 27150 }, { "epoch": 6.82, "grad_norm": 0.0004817073349840939, "learning_rate": 4.480737396538751e-06, "loss": 0.001, "step": 27200 }, { "epoch": 6.83, "grad_norm": 0.014918695203959942, "learning_rate": 4.4630549285176825e-06, "loss": 0.0019, "step": 27250 }, { "epoch": 6.85, "grad_norm": 6.75780752420712e-17, "learning_rate": 4.445372460496614e-06, "loss": 0.0179, "step": 27300 }, { "epoch": 6.86, "grad_norm": 382.1897888183594, "learning_rate": 4.4276899924755456e-06, "loss": 0.0396, "step": 27350 }, { "epoch": 6.87, "grad_norm": 0.30687054991722107, "learning_rate": 4.410007524454477e-06, "loss": 0.0576, "step": 27400 }, { "epoch": 6.88, "grad_norm": 1.2169127785455203e-06, "learning_rate": 4.392325056433409e-06, "loss": 0.0002, "step": 27450 }, { "epoch": 6.9, "grad_norm": 6.928129077377054e-12, "learning_rate": 4.37464258841234e-06, "loss": 0.0989, "step": 27500 }, { "epoch": 6.91, "grad_norm": 7.992535522305388e-10, "learning_rate": 4.356960120391272e-06, "loss": 0.0014, "step": 27550 }, { "epoch": 6.92, "grad_norm": 0.001016330672428012, "learning_rate": 4.339277652370203e-06, "loss": 0.0796, "step": 27600 }, { "epoch": 6.94, "grad_norm": 9.33817503323553e-08, "learning_rate": 4.321595184349135e-06, "loss": 0.0031, "step": 27650 }, { "epoch": 6.95, "grad_norm": 3.0769423120524664e-10, "learning_rate": 4.303912716328066e-06, "loss": 0.0482, "step": 27700 }, { "epoch": 6.96, "grad_norm": 5.2930868577050205e-09, "learning_rate": 4.286230248306998e-06, "loss": 0.0241, "step": 27750 }, { "epoch": 6.97, "grad_norm": 2.738467628660146e-05, "learning_rate": 4.2685477802859295e-06, "loss": 0.0094, "step": 27800 }, { "epoch": 6.99, "grad_norm": 1.259439272871532e-06, "learning_rate": 4.250865312264861e-06, "loss": 0.0011, "step": 27850 }, { "epoch": 7.0, "grad_norm": 433.7135925292969, "learning_rate": 4.233182844243792e-06, "loss": 0.4265, "step": 27900 }, { "epoch": 7.01, "grad_norm": 0.000105952778540086, "learning_rate": 4.215500376222724e-06, "loss": 0.0048, "step": 27950 }, { "epoch": 7.02, "grad_norm": 0.2630611062049866, "learning_rate": 4.197817908201656e-06, "loss": 0.083, "step": 28000 }, { "epoch": 7.04, "grad_norm": 1.2784289252221193e-11, "learning_rate": 4.180135440180586e-06, "loss": 0.0003, "step": 28050 }, { "epoch": 7.05, "grad_norm": 4.8076164577137703e-11, "learning_rate": 4.162452972159519e-06, "loss": 0.0, "step": 28100 }, { "epoch": 7.06, "grad_norm": 2.940306558230077e-07, "learning_rate": 4.14477050413845e-06, "loss": 0.0001, "step": 28150 }, { "epoch": 7.07, "grad_norm": 4.1964653064496815e-05, "learning_rate": 4.127088036117381e-06, "loss": 0.0005, "step": 28200 }, { "epoch": 7.09, "grad_norm": 0.005852025002241135, "learning_rate": 4.1094055680963134e-06, "loss": 0.0049, "step": 28250 }, { "epoch": 7.1, "grad_norm": 0.05330043286085129, "learning_rate": 4.091723100075245e-06, "loss": 0.0, "step": 28300 }, { "epoch": 7.11, "grad_norm": 2.5323606323013337e-08, "learning_rate": 4.074040632054176e-06, "loss": 0.0001, "step": 28350 }, { "epoch": 7.12, "grad_norm": 0.004866173956543207, "learning_rate": 4.056358164033108e-06, "loss": 0.0002, "step": 28400 }, { "epoch": 7.14, "grad_norm": 1.1348839645819453e-09, "learning_rate": 4.038675696012039e-06, "loss": 0.0361, "step": 28450 }, { "epoch": 7.15, "grad_norm": 4.0626005102240015e-06, "learning_rate": 4.02099322799097e-06, "loss": 0.0006, "step": 28500 }, { "epoch": 7.16, "grad_norm": 1.4158376870909706e-07, "learning_rate": 4.003310759969903e-06, "loss": 0.0, "step": 28550 }, { "epoch": 7.17, "grad_norm": 3.5035823202633765e-06, "learning_rate": 3.9856282919488334e-06, "loss": 0.0, "step": 28600 }, { "epoch": 7.19, "grad_norm": 7.668052421649918e-05, "learning_rate": 3.967945823927765e-06, "loss": 0.1484, "step": 28650 }, { "epoch": 7.2, "grad_norm": 0.0006498922011815012, "learning_rate": 3.950263355906697e-06, "loss": 0.0, "step": 28700 }, { "epoch": 7.21, "grad_norm": 1.2344708920863923e-05, "learning_rate": 3.932580887885628e-06, "loss": 0.0001, "step": 28750 }, { "epoch": 7.22, "grad_norm": 4.231491038808599e-05, "learning_rate": 3.91489841986456e-06, "loss": 0.0001, "step": 28800 }, { "epoch": 7.24, "grad_norm": 0.008648673072457314, "learning_rate": 3.897215951843492e-06, "loss": 0.0, "step": 28850 }, { "epoch": 7.25, "grad_norm": 0.0010539034847170115, "learning_rate": 3.879533483822423e-06, "loss": 0.0, "step": 28900 }, { "epoch": 7.26, "grad_norm": 5.991931902826764e-05, "learning_rate": 3.861851015801354e-06, "loss": 0.0001, "step": 28950 }, { "epoch": 7.27, "grad_norm": 0.017336919903755188, "learning_rate": 3.844168547780287e-06, "loss": 0.0206, "step": 29000 }, { "epoch": 7.29, "grad_norm": 0.0004083296225871891, "learning_rate": 3.826486079759217e-06, "loss": 0.0002, "step": 29050 }, { "epoch": 7.3, "grad_norm": 9.027652740478516, "learning_rate": 3.808803611738149e-06, "loss": 0.0067, "step": 29100 }, { "epoch": 7.31, "grad_norm": 0.0003242001694161445, "learning_rate": 3.791121143717081e-06, "loss": 0.0, "step": 29150 }, { "epoch": 7.32, "grad_norm": 2.259884604427498e-05, "learning_rate": 3.773438675696012e-06, "loss": 0.0002, "step": 29200 }, { "epoch": 7.34, "grad_norm": 9.495877265930176, "learning_rate": 3.7557562076749436e-06, "loss": 0.0002, "step": 29250 }, { "epoch": 7.35, "grad_norm": 0.0059493957087397575, "learning_rate": 3.7380737396538755e-06, "loss": 0.0, "step": 29300 }, { "epoch": 7.36, "grad_norm": 0.004485088866204023, "learning_rate": 3.7203912716328067e-06, "loss": 0.0, "step": 29350 }, { "epoch": 7.37, "grad_norm": 8.322012309412881e-15, "learning_rate": 3.702708803611738e-06, "loss": 0.0018, "step": 29400 }, { "epoch": 7.39, "grad_norm": 0.0009153097053058445, "learning_rate": 3.68502633559067e-06, "loss": 0.0001, "step": 29450 }, { "epoch": 7.4, "grad_norm": 2.3616248654434457e-05, "learning_rate": 3.6673438675696013e-06, "loss": 0.1638, "step": 29500 }, { "epoch": 7.41, "grad_norm": 0.0017722542397677898, "learning_rate": 3.6496613995485324e-06, "loss": 0.0123, "step": 29550 }, { "epoch": 7.42, "grad_norm": 0.06969759613275528, "learning_rate": 3.631978931527465e-06, "loss": 0.0155, "step": 29600 }, { "epoch": 7.44, "grad_norm": 1.5746809367556125e-06, "learning_rate": 3.614296463506396e-06, "loss": 0.0001, "step": 29650 }, { "epoch": 7.45, "grad_norm": 3.0426802744010217e-10, "learning_rate": 3.596613995485327e-06, "loss": 0.0, "step": 29700 }, { "epoch": 7.46, "grad_norm": 0.5712952017784119, "learning_rate": 3.578931527464259e-06, "loss": 0.1067, "step": 29750 }, { "epoch": 7.47, "grad_norm": 0.766385555267334, "learning_rate": 3.5612490594431906e-06, "loss": 0.0107, "step": 29800 }, { "epoch": 7.49, "grad_norm": 0.05696748197078705, "learning_rate": 3.5435665914221217e-06, "loss": 0.0013, "step": 29850 }, { "epoch": 7.5, "grad_norm": 7.25884137864341e-06, "learning_rate": 3.5258841234010537e-06, "loss": 0.0, "step": 29900 }, { "epoch": 7.51, "grad_norm": 1.7060403479263186e-05, "learning_rate": 3.5082016553799852e-06, "loss": 0.0002, "step": 29950 }, { "epoch": 7.52, "grad_norm": 0.012671858072280884, "learning_rate": 3.4905191873589168e-06, "loss": 0.0, "step": 30000 }, { "epoch": 7.54, "grad_norm": 2.8193007928223324e-09, "learning_rate": 3.472836719337848e-06, "loss": 0.0001, "step": 30050 }, { "epoch": 7.55, "grad_norm": 0.019155049696564674, "learning_rate": 3.4551542513167795e-06, "loss": 0.0, "step": 30100 }, { "epoch": 7.56, "grad_norm": 0.0020516354124993086, "learning_rate": 3.4374717832957114e-06, "loss": 0.0002, "step": 30150 }, { "epoch": 7.57, "grad_norm": 2.4088294594548643e-05, "learning_rate": 3.4197893152746425e-06, "loss": 0.0492, "step": 30200 }, { "epoch": 7.59, "grad_norm": 1.9164204786648043e-05, "learning_rate": 3.402106847253574e-06, "loss": 0.0312, "step": 30250 }, { "epoch": 7.6, "grad_norm": 0.0160346832126379, "learning_rate": 3.384424379232506e-06, "loss": 0.0007, "step": 30300 }, { "epoch": 7.61, "grad_norm": 7.57160614739405e-06, "learning_rate": 3.366741911211437e-06, "loss": 0.0005, "step": 30350 }, { "epoch": 7.62, "grad_norm": 1.1699286504851525e-11, "learning_rate": 3.3490594431903687e-06, "loss": 0.0027, "step": 30400 }, { "epoch": 7.64, "grad_norm": 1.0412069286758197e-06, "learning_rate": 3.3313769751693003e-06, "loss": 0.0011, "step": 30450 }, { "epoch": 7.65, "grad_norm": 1.0678839998945477e-06, "learning_rate": 3.313694507148232e-06, "loss": 0.0, "step": 30500 }, { "epoch": 7.66, "grad_norm": 2.537229315535683e-09, "learning_rate": 3.2960120391271634e-06, "loss": 0.0, "step": 30550 }, { "epoch": 7.67, "grad_norm": 8.23991967990878e-07, "learning_rate": 3.278329571106095e-06, "loss": 0.0001, "step": 30600 }, { "epoch": 7.69, "grad_norm": 0.0006322423578239977, "learning_rate": 3.2606471030850265e-06, "loss": 0.0001, "step": 30650 }, { "epoch": 7.7, "grad_norm": 1.3688865863059618e-07, "learning_rate": 3.242964635063958e-06, "loss": 0.062, "step": 30700 }, { "epoch": 7.71, "grad_norm": 2.41971292780363e-06, "learning_rate": 3.2252821670428896e-06, "loss": 0.1235, "step": 30750 }, { "epoch": 7.73, "grad_norm": 2.4634087480990274e-07, "learning_rate": 3.207599699021821e-06, "loss": 0.0035, "step": 30800 }, { "epoch": 7.74, "grad_norm": 3.1068152566149365e-06, "learning_rate": 3.1899172310007527e-06, "loss": 0.0205, "step": 30850 }, { "epoch": 7.75, "grad_norm": 5.763430177552209e-09, "learning_rate": 3.1722347629796842e-06, "loss": 0.0, "step": 30900 }, { "epoch": 7.76, "grad_norm": 0.008364195004105568, "learning_rate": 3.1545522949586153e-06, "loss": 0.0009, "step": 30950 }, { "epoch": 7.78, "grad_norm": 0.00012845598394051194, "learning_rate": 3.1368698269375473e-06, "loss": 0.0008, "step": 31000 }, { "epoch": 7.79, "grad_norm": 0.001842482597567141, "learning_rate": 3.119187358916479e-06, "loss": 0.0007, "step": 31050 }, { "epoch": 7.8, "grad_norm": 1.2641396263113336e-10, "learning_rate": 3.10150489089541e-06, "loss": 0.0019, "step": 31100 }, { "epoch": 7.81, "grad_norm": 0.00033131783129647374, "learning_rate": 3.083822422874342e-06, "loss": 0.0002, "step": 31150 }, { "epoch": 7.83, "grad_norm": 1.851675369834993e-05, "learning_rate": 3.0661399548532735e-06, "loss": 0.0009, "step": 31200 }, { "epoch": 7.84, "grad_norm": 0.00795644149184227, "learning_rate": 3.0484574868322046e-06, "loss": 0.077, "step": 31250 }, { "epoch": 7.85, "grad_norm": 0.07745194435119629, "learning_rate": 3.030775018811136e-06, "loss": 0.0001, "step": 31300 }, { "epoch": 7.86, "grad_norm": 0.10175588726997375, "learning_rate": 3.013092550790068e-06, "loss": 0.0307, "step": 31350 }, { "epoch": 7.88, "grad_norm": 8.556443935958669e-05, "learning_rate": 2.9954100827689993e-06, "loss": 0.0, "step": 31400 }, { "epoch": 7.89, "grad_norm": 0.9275371432304382, "learning_rate": 2.977727614747931e-06, "loss": 0.1478, "step": 31450 }, { "epoch": 7.9, "grad_norm": 5.1567803360796916e-09, "learning_rate": 2.960045146726863e-06, "loss": 0.0598, "step": 31500 }, { "epoch": 7.91, "grad_norm": 6.67710139623523e-07, "learning_rate": 2.942362678705794e-06, "loss": 0.0094, "step": 31550 }, { "epoch": 7.93, "grad_norm": 1.458290155298414e-10, "learning_rate": 2.9246802106847255e-06, "loss": 0.0009, "step": 31600 }, { "epoch": 7.94, "grad_norm": 5.1869348681066185e-05, "learning_rate": 2.906997742663657e-06, "loss": 0.0007, "step": 31650 }, { "epoch": 7.95, "grad_norm": 0.00036754223401658237, "learning_rate": 2.8893152746425886e-06, "loss": 0.1282, "step": 31700 }, { "epoch": 7.96, "grad_norm": 0.0028616636991500854, "learning_rate": 2.87163280662152e-06, "loss": 0.1516, "step": 31750 }, { "epoch": 7.98, "grad_norm": 0.0008008142467588186, "learning_rate": 2.8539503386004512e-06, "loss": 0.0004, "step": 31800 }, { "epoch": 7.99, "grad_norm": 1.0718519405372717e-07, "learning_rate": 2.8362678705793832e-06, "loss": 0.0, "step": 31850 }, { "epoch": 8.0, "grad_norm": 0.0009103859774768353, "learning_rate": 2.8185854025583148e-06, "loss": 0.0001, "step": 31900 }, { "epoch": 8.01, "grad_norm": 0.0001856798044173047, "learning_rate": 2.800902934537246e-06, "loss": 0.0, "step": 31950 }, { "epoch": 8.03, "grad_norm": 0.00011591133807087317, "learning_rate": 2.7832204665161774e-06, "loss": 0.0001, "step": 32000 }, { "epoch": 8.04, "grad_norm": 0.00040982267819345, "learning_rate": 2.7655379984951094e-06, "loss": 0.0, "step": 32050 }, { "epoch": 8.05, "grad_norm": 2.265534648770995e-09, "learning_rate": 2.7478555304740405e-06, "loss": 0.0001, "step": 32100 }, { "epoch": 8.06, "grad_norm": 5.858885425424898e-13, "learning_rate": 2.730173062452972e-06, "loss": 0.0001, "step": 32150 }, { "epoch": 8.08, "grad_norm": 2.8236866932730136e-18, "learning_rate": 2.712490594431904e-06, "loss": 0.0, "step": 32200 }, { "epoch": 8.09, "grad_norm": 0.0001981940004043281, "learning_rate": 2.694808126410835e-06, "loss": 0.0, "step": 32250 }, { "epoch": 8.1, "grad_norm": 3.2661256511856696e-12, "learning_rate": 2.6771256583897667e-06, "loss": 0.0, "step": 32300 }, { "epoch": 8.11, "grad_norm": 1.1293546776869334e-05, "learning_rate": 2.6594431903686983e-06, "loss": 0.0, "step": 32350 }, { "epoch": 8.13, "grad_norm": 0.0003391726640984416, "learning_rate": 2.64176072234763e-06, "loss": 0.0, "step": 32400 }, { "epoch": 8.14, "grad_norm": 6.486132042482495e-05, "learning_rate": 2.6240782543265614e-06, "loss": 0.0, "step": 32450 }, { "epoch": 8.15, "grad_norm": 2.1309777366695926e-05, "learning_rate": 2.606395786305493e-06, "loss": 0.0, "step": 32500 }, { "epoch": 8.16, "grad_norm": 4.4795211806558655e-07, "learning_rate": 2.5887133182844245e-06, "loss": 0.0007, "step": 32550 }, { "epoch": 8.18, "grad_norm": 2.0528705402256264e-09, "learning_rate": 2.571030850263356e-06, "loss": 0.0, "step": 32600 }, { "epoch": 8.19, "grad_norm": 4.783522308571264e-05, "learning_rate": 2.5533483822422876e-06, "loss": 0.0, "step": 32650 }, { "epoch": 8.2, "grad_norm": 1.7800081408836377e-08, "learning_rate": 2.535665914221219e-06, "loss": 0.0, "step": 32700 }, { "epoch": 8.21, "grad_norm": 0.0003143524518236518, "learning_rate": 2.5179834462001507e-06, "loss": 0.0003, "step": 32750 }, { "epoch": 8.23, "grad_norm": 9.409014455741271e-05, "learning_rate": 2.500300978179082e-06, "loss": 0.0, "step": 32800 }, { "epoch": 8.24, "grad_norm": 3.097814449404268e-09, "learning_rate": 2.4826185101580133e-06, "loss": 0.0, "step": 32850 }, { "epoch": 8.25, "grad_norm": 6.660656595158798e-07, "learning_rate": 2.4649360421369453e-06, "loss": 0.0, "step": 32900 }, { "epoch": 8.26, "grad_norm": 0.04804990068078041, "learning_rate": 2.447253574115877e-06, "loss": 0.0, "step": 32950 }, { "epoch": 8.28, "grad_norm": 3.926641234386352e-09, "learning_rate": 2.429571106094808e-06, "loss": 0.0, "step": 33000 }, { "epoch": 8.29, "grad_norm": 1.5834859022183257e-20, "learning_rate": 2.4118886380737395e-06, "loss": 0.0001, "step": 33050 }, { "epoch": 8.3, "grad_norm": 0.20250500738620758, "learning_rate": 2.3942061700526715e-06, "loss": 0.0004, "step": 33100 }, { "epoch": 8.31, "grad_norm": 5.932114959250612e-07, "learning_rate": 2.3765237020316026e-06, "loss": 0.0001, "step": 33150 }, { "epoch": 8.33, "grad_norm": 1.5223192498248217e-11, "learning_rate": 2.358841234010534e-06, "loss": 0.0, "step": 33200 }, { "epoch": 8.34, "grad_norm": 1.2739813826101454e-07, "learning_rate": 2.341158765989466e-06, "loss": 0.0, "step": 33250 }, { "epoch": 8.35, "grad_norm": 1.2789546310898459e-08, "learning_rate": 2.3234762979683973e-06, "loss": 0.0001, "step": 33300 }, { "epoch": 8.36, "grad_norm": 1.4692803233629093e-05, "learning_rate": 2.305793829947329e-06, "loss": 0.0, "step": 33350 }, { "epoch": 8.38, "grad_norm": 0.00019242956477683038, "learning_rate": 2.2881113619262604e-06, "loss": 0.0403, "step": 33400 }, { "epoch": 8.39, "grad_norm": 0.0, "learning_rate": 2.270428893905192e-06, "loss": 0.0, "step": 33450 }, { "epoch": 8.4, "grad_norm": 0.002393543953076005, "learning_rate": 2.2527464258841235e-06, "loss": 0.0399, "step": 33500 }, { "epoch": 8.41, "grad_norm": 1.7551202802223997e-07, "learning_rate": 2.235063957863055e-06, "loss": 0.0, "step": 33550 }, { "epoch": 8.43, "grad_norm": 2.735872639547665e-11, "learning_rate": 2.2173814898419866e-06, "loss": 0.0004, "step": 33600 }, { "epoch": 8.44, "grad_norm": 0.0003994428552687168, "learning_rate": 2.199699021820918e-06, "loss": 0.0, "step": 33650 }, { "epoch": 8.45, "grad_norm": 2.7801218032836914, "learning_rate": 2.1820165537998497e-06, "loss": 0.0, "step": 33700 }, { "epoch": 8.47, "grad_norm": 1.100529516406823e-06, "learning_rate": 2.164334085778781e-06, "loss": 0.0, "step": 33750 }, { "epoch": 8.48, "grad_norm": 0.02319416031241417, "learning_rate": 2.1466516177577128e-06, "loss": 0.0, "step": 33800 }, { "epoch": 8.49, "grad_norm": 0.0017326247179880738, "learning_rate": 2.1289691497366443e-06, "loss": 0.0, "step": 33850 }, { "epoch": 8.5, "grad_norm": 1.8130524859216735e-10, "learning_rate": 2.1112866817155754e-06, "loss": 0.0001, "step": 33900 }, { "epoch": 8.52, "grad_norm": 0.0017156396061182022, "learning_rate": 2.0936042136945074e-06, "loss": 0.0004, "step": 33950 }, { "epoch": 8.53, "grad_norm": 0.0018524077022448182, "learning_rate": 2.075921745673439e-06, "loss": 0.0, "step": 34000 }, { "epoch": 8.54, "grad_norm": 1.1194772923772689e-05, "learning_rate": 2.05823927765237e-06, "loss": 0.0, "step": 34050 }, { "epoch": 8.55, "grad_norm": 7.453370471921517e-06, "learning_rate": 2.040556809631302e-06, "loss": 0.0164, "step": 34100 }, { "epoch": 8.57, "grad_norm": 6.412294029090049e-10, "learning_rate": 2.0228743416102336e-06, "loss": 0.0, "step": 34150 }, { "epoch": 8.58, "grad_norm": 1.4134855689861236e-17, "learning_rate": 2.0051918735891647e-06, "loss": 0.0, "step": 34200 }, { "epoch": 8.59, "grad_norm": 0.00038817909080535173, "learning_rate": 1.9875094055680963e-06, "loss": 0.0365, "step": 34250 }, { "epoch": 8.6, "grad_norm": 2.2248328605201095e-05, "learning_rate": 1.9698269375470282e-06, "loss": 0.0, "step": 34300 }, { "epoch": 8.62, "grad_norm": 0.010381842032074928, "learning_rate": 1.9521444695259594e-06, "loss": 0.0, "step": 34350 }, { "epoch": 8.63, "grad_norm": 0.001246288768015802, "learning_rate": 1.934462001504891e-06, "loss": 0.0, "step": 34400 }, { "epoch": 8.64, "grad_norm": 1.8006402254104614, "learning_rate": 1.916779533483823e-06, "loss": 0.0007, "step": 34450 }, { "epoch": 8.65, "grad_norm": 1.644072100681626e-10, "learning_rate": 1.899097065462754e-06, "loss": 0.0, "step": 34500 }, { "epoch": 8.67, "grad_norm": 5.652666779099036e-09, "learning_rate": 1.8814145974416856e-06, "loss": 0.0, "step": 34550 }, { "epoch": 8.68, "grad_norm": 0.003141549648717046, "learning_rate": 1.8637321294206173e-06, "loss": 0.0, "step": 34600 }, { "epoch": 8.69, "grad_norm": 1.1486420135042863e-06, "learning_rate": 1.8460496613995484e-06, "loss": 0.0003, "step": 34650 }, { "epoch": 8.7, "grad_norm": 1.1713603271346074e-05, "learning_rate": 1.8283671933784802e-06, "loss": 0.0, "step": 34700 }, { "epoch": 8.72, "grad_norm": 1.2204428685436142e-06, "learning_rate": 1.8106847253574115e-06, "loss": 0.0, "step": 34750 }, { "epoch": 8.73, "grad_norm": 0.0014657212886959314, "learning_rate": 1.793002257336343e-06, "loss": 0.0, "step": 34800 }, { "epoch": 8.74, "grad_norm": 0.017868679016828537, "learning_rate": 1.7753197893152748e-06, "loss": 0.0, "step": 34850 }, { "epoch": 8.75, "grad_norm": 3.3499613891763147e-06, "learning_rate": 1.7576373212942062e-06, "loss": 0.0001, "step": 34900 }, { "epoch": 8.77, "grad_norm": 6.1278524476904295e-09, "learning_rate": 1.7399548532731377e-06, "loss": 0.0001, "step": 34950 }, { "epoch": 8.78, "grad_norm": 1.445396605959104e-06, "learning_rate": 1.7222723852520693e-06, "loss": 0.0001, "step": 35000 }, { "epoch": 8.79, "grad_norm": 0.0017798148328438401, "learning_rate": 1.7045899172310008e-06, "loss": 0.1651, "step": 35050 }, { "epoch": 8.8, "grad_norm": 3.6833380789857983e-08, "learning_rate": 1.6869074492099324e-06, "loss": 0.0, "step": 35100 }, { "epoch": 8.82, "grad_norm": 9.361156988463293e-11, "learning_rate": 1.669224981188864e-06, "loss": 0.0, "step": 35150 }, { "epoch": 8.83, "grad_norm": 7.828115933250501e-09, "learning_rate": 1.6515425131677955e-06, "loss": 0.0, "step": 35200 }, { "epoch": 8.84, "grad_norm": 1.020300643972405e-08, "learning_rate": 1.6338600451467268e-06, "loss": 0.0, "step": 35250 }, { "epoch": 8.85, "grad_norm": 0.000530413759406656, "learning_rate": 1.6161775771256586e-06, "loss": 0.0, "step": 35300 }, { "epoch": 8.87, "grad_norm": 8.391751182834639e-10, "learning_rate": 1.59849510910459e-06, "loss": 0.0, "step": 35350 }, { "epoch": 8.88, "grad_norm": 0.003899802453815937, "learning_rate": 1.5808126410835214e-06, "loss": 0.0, "step": 35400 }, { "epoch": 8.89, "grad_norm": 2.3727285224595107e-05, "learning_rate": 1.5631301730624532e-06, "loss": 0.0, "step": 35450 }, { "epoch": 8.9, "grad_norm": 4.068557245773263e-06, "learning_rate": 1.5454477050413845e-06, "loss": 0.0, "step": 35500 }, { "epoch": 8.92, "grad_norm": 0.007758264895528555, "learning_rate": 1.527765237020316e-06, "loss": 0.0, "step": 35550 }, { "epoch": 8.93, "grad_norm": 0.00016868404054548591, "learning_rate": 1.5100827689992474e-06, "loss": 0.0, "step": 35600 }, { "epoch": 8.94, "grad_norm": 0.00011449763405835256, "learning_rate": 1.4924003009781792e-06, "loss": 0.0, "step": 35650 }, { "epoch": 8.95, "grad_norm": 4.054548298881855e-06, "learning_rate": 1.4747178329571107e-06, "loss": 0.0, "step": 35700 }, { "epoch": 8.97, "grad_norm": 0.0010476693278178573, "learning_rate": 1.457035364936042e-06, "loss": 0.0001, "step": 35750 }, { "epoch": 8.98, "grad_norm": 0.06502784043550491, "learning_rate": 1.4393528969149738e-06, "loss": 0.0, "step": 35800 }, { "epoch": 8.99, "grad_norm": 2.2866407789479126e-07, "learning_rate": 1.4216704288939052e-06, "loss": 0.0, "step": 35850 }, { "epoch": 9.0, "grad_norm": 0.00021389636094681919, "learning_rate": 1.4039879608728367e-06, "loss": 0.0002, "step": 35900 }, { "epoch": 9.02, "grad_norm": 1.3870979032049036e-08, "learning_rate": 1.3863054928517683e-06, "loss": 0.0, "step": 35950 }, { "epoch": 9.03, "grad_norm": 6.817894586674811e-07, "learning_rate": 1.3686230248306998e-06, "loss": 0.0, "step": 36000 }, { "epoch": 9.04, "grad_norm": 6.899564031215277e-09, "learning_rate": 1.3509405568096314e-06, "loss": 0.0, "step": 36050 }, { "epoch": 9.05, "grad_norm": 5.222953859629342e-06, "learning_rate": 1.333258088788563e-06, "loss": 0.0, "step": 36100 }, { "epoch": 9.07, "grad_norm": 1.425233087104516e-08, "learning_rate": 1.3155756207674945e-06, "loss": 0.0, "step": 36150 }, { "epoch": 9.08, "grad_norm": 0.001089599565602839, "learning_rate": 1.2978931527464258e-06, "loss": 0.0, "step": 36200 }, { "epoch": 9.09, "grad_norm": 5.556981932386407e-07, "learning_rate": 1.2802106847253576e-06, "loss": 0.0, "step": 36250 }, { "epoch": 9.1, "grad_norm": 3.3812255423981696e-05, "learning_rate": 1.2625282167042889e-06, "loss": 0.0, "step": 36300 }, { "epoch": 9.12, "grad_norm": 0.008249111473560333, "learning_rate": 1.2448457486832204e-06, "loss": 0.0, "step": 36350 }, { "epoch": 9.13, "grad_norm": 1.3336196388991084e-05, "learning_rate": 1.2271632806621522e-06, "loss": 0.0, "step": 36400 }, { "epoch": 9.14, "grad_norm": 7.238022403655009e-10, "learning_rate": 1.2094808126410835e-06, "loss": 0.0, "step": 36450 }, { "epoch": 9.15, "grad_norm": 1.9307069831775436e-11, "learning_rate": 1.191798344620015e-06, "loss": 0.0, "step": 36500 }, { "epoch": 9.17, "grad_norm": 2.09102075932055e-11, "learning_rate": 1.1741158765989466e-06, "loss": 0.0, "step": 36550 }, { "epoch": 9.18, "grad_norm": 0.005663714837282896, "learning_rate": 1.1564334085778782e-06, "loss": 0.0, "step": 36600 }, { "epoch": 9.19, "grad_norm": 0.0010162381222471595, "learning_rate": 1.1387509405568097e-06, "loss": 0.0, "step": 36650 }, { "epoch": 9.2, "grad_norm": 6.629519339185208e-05, "learning_rate": 1.1210684725357413e-06, "loss": 0.0, "step": 36700 }, { "epoch": 9.22, "grad_norm": 3.708991016537766e-06, "learning_rate": 1.1033860045146728e-06, "loss": 0.0, "step": 36750 }, { "epoch": 9.23, "grad_norm": 1.2199451703054365e-05, "learning_rate": 1.0857035364936042e-06, "loss": 0.0, "step": 36800 }, { "epoch": 9.24, "grad_norm": 3.44480326930352e-07, "learning_rate": 1.068021068472536e-06, "loss": 0.0, "step": 36850 }, { "epoch": 9.26, "grad_norm": 4.88109819229976e-09, "learning_rate": 1.0503386004514673e-06, "loss": 0.0, "step": 36900 }, { "epoch": 9.27, "grad_norm": 1.233081690088511e-07, "learning_rate": 1.0326561324303988e-06, "loss": 0.0, "step": 36950 }, { "epoch": 9.28, "grad_norm": 4.49614967479306e-09, "learning_rate": 1.0149736644093304e-06, "loss": 0.0, "step": 37000 }, { "epoch": 9.29, "grad_norm": 1.2748416793328943e-06, "learning_rate": 9.97291196388262e-07, "loss": 0.0, "step": 37050 }, { "epoch": 9.31, "grad_norm": 2.0055947869265442e-14, "learning_rate": 9.796087283671935e-07, "loss": 0.0, "step": 37100 }, { "epoch": 9.32, "grad_norm": 1.1864563075153988e-15, "learning_rate": 9.619262603461248e-07, "loss": 0.0, "step": 37150 }, { "epoch": 9.33, "grad_norm": 3.7789734051330015e-05, "learning_rate": 9.442437923250566e-07, "loss": 0.0, "step": 37200 }, { "epoch": 9.34, "grad_norm": 0.2207726538181305, "learning_rate": 9.26561324303988e-07, "loss": 0.0, "step": 37250 }, { "epoch": 9.36, "grad_norm": 0.0002287498500663787, "learning_rate": 9.088788562829194e-07, "loss": 0.0, "step": 37300 }, { "epoch": 9.37, "grad_norm": 1.4228673350658028e-08, "learning_rate": 8.911963882618511e-07, "loss": 0.0, "step": 37350 }, { "epoch": 9.38, "grad_norm": 2.1739325584408233e-16, "learning_rate": 8.735139202407825e-07, "loss": 0.0, "step": 37400 }, { "epoch": 9.39, "grad_norm": 1.1177444037002715e-07, "learning_rate": 8.558314522197141e-07, "loss": 0.0, "step": 37450 }, { "epoch": 9.41, "grad_norm": 2.286371909576701e-06, "learning_rate": 8.381489841986456e-07, "loss": 0.0, "step": 37500 }, { "epoch": 9.42, "grad_norm": 0.0007677926332689822, "learning_rate": 8.204665161775772e-07, "loss": 0.0, "step": 37550 }, { "epoch": 9.43, "grad_norm": 7.146362435150877e-08, "learning_rate": 8.027840481565087e-07, "loss": 0.0, "step": 37600 }, { "epoch": 9.44, "grad_norm": 7.620369160576956e-06, "learning_rate": 7.851015801354402e-07, "loss": 0.0, "step": 37650 }, { "epoch": 9.46, "grad_norm": 1.6175413009023032e-07, "learning_rate": 7.674191121143717e-07, "loss": 0.0, "step": 37700 }, { "epoch": 9.47, "grad_norm": 1.4307224773801863e-06, "learning_rate": 7.497366440933033e-07, "loss": 0.0, "step": 37750 }, { "epoch": 9.48, "grad_norm": 1.4143168414193497e-07, "learning_rate": 7.320541760722348e-07, "loss": 0.0, "step": 37800 }, { "epoch": 9.49, "grad_norm": 5.554405676719276e-13, "learning_rate": 7.143717080511664e-07, "loss": 0.0, "step": 37850 }, { "epoch": 9.51, "grad_norm": 3.3866279225414075e-10, "learning_rate": 6.966892400300979e-07, "loss": 0.0941, "step": 37900 }, { "epoch": 9.52, "grad_norm": 6.048647804846041e-08, "learning_rate": 6.790067720090294e-07, "loss": 0.0, "step": 37950 }, { "epoch": 9.53, "grad_norm": 2.0648124632316467e-07, "learning_rate": 6.613243039879609e-07, "loss": 0.0, "step": 38000 }, { "epoch": 9.54, "grad_norm": 0.0016063437797129154, "learning_rate": 6.436418359668924e-07, "loss": 0.0, "step": 38050 }, { "epoch": 9.56, "grad_norm": 1.247152141559127e-07, "learning_rate": 6.259593679458239e-07, "loss": 0.0, "step": 38100 }, { "epoch": 9.57, "grad_norm": 8.07224120880079e-10, "learning_rate": 6.082768999247555e-07, "loss": 0.0, "step": 38150 }, { "epoch": 9.58, "grad_norm": 7.635571320184498e-13, "learning_rate": 5.90594431903687e-07, "loss": 0.0, "step": 38200 }, { "epoch": 9.59, "grad_norm": 4.792551688836966e-09, "learning_rate": 5.729119638826185e-07, "loss": 0.0, "step": 38250 }, { "epoch": 9.61, "grad_norm": 3.3811686535045737e-06, "learning_rate": 5.552294958615501e-07, "loss": 0.0, "step": 38300 }, { "epoch": 9.62, "grad_norm": 1.0496427338413383e-10, "learning_rate": 5.375470278404815e-07, "loss": 0.0, "step": 38350 }, { "epoch": 9.63, "grad_norm": 0.000780309725087136, "learning_rate": 5.198645598194131e-07, "loss": 0.0, "step": 38400 }, { "epoch": 9.64, "grad_norm": 4.170356078248005e-06, "learning_rate": 5.021820917983446e-07, "loss": 0.0, "step": 38450 }, { "epoch": 9.66, "grad_norm": 0.004391836933791637, "learning_rate": 4.844996237772762e-07, "loss": 0.0, "step": 38500 }, { "epoch": 9.67, "grad_norm": 6.341772859741468e-06, "learning_rate": 4.668171557562077e-07, "loss": 0.0, "step": 38550 }, { "epoch": 9.68, "grad_norm": 0.006547071970999241, "learning_rate": 4.4913468773513927e-07, "loss": 0.0, "step": 38600 }, { "epoch": 9.69, "grad_norm": 4.025184352940414e-06, "learning_rate": 4.3145221971407076e-07, "loss": 0.0, "step": 38650 }, { "epoch": 9.71, "grad_norm": 0.007137050852179527, "learning_rate": 4.1376975169300226e-07, "loss": 0.0, "step": 38700 }, { "epoch": 9.72, "grad_norm": 2.464033421745171e-10, "learning_rate": 3.960872836719338e-07, "loss": 0.0, "step": 38750 }, { "epoch": 9.73, "grad_norm": 1.651005368330516e-05, "learning_rate": 3.784048156508653e-07, "loss": 0.0, "step": 38800 }, { "epoch": 9.74, "grad_norm": 8.712972184021783e-12, "learning_rate": 3.6072234762979685e-07, "loss": 0.0, "step": 38850 }, { "epoch": 9.76, "grad_norm": 0.00013669347390532494, "learning_rate": 3.430398796087284e-07, "loss": 0.0, "step": 38900 }, { "epoch": 9.77, "grad_norm": 2.920106635428965e-05, "learning_rate": 3.253574115876599e-07, "loss": 0.0, "step": 38950 }, { "epoch": 9.78, "grad_norm": 3.793598768453421e-09, "learning_rate": 3.0767494356659144e-07, "loss": 0.0, "step": 39000 }, { "epoch": 9.79, "grad_norm": 1.0695604402144454e-09, "learning_rate": 2.89992475545523e-07, "loss": 0.0, "step": 39050 }, { "epoch": 9.81, "grad_norm": 1.2106751764691062e-16, "learning_rate": 2.723100075244545e-07, "loss": 0.0, "step": 39100 }, { "epoch": 9.82, "grad_norm": 3.6123870472692943e-07, "learning_rate": 2.54627539503386e-07, "loss": 0.0, "step": 39150 }, { "epoch": 9.83, "grad_norm": 3.495557336918864e-07, "learning_rate": 2.3694507148231756e-07, "loss": 0.0, "step": 39200 }, { "epoch": 9.84, "grad_norm": 4.76532950415276e-05, "learning_rate": 2.1926260346124908e-07, "loss": 0.0, "step": 39250 }, { "epoch": 9.86, "grad_norm": 8.871047612046823e-05, "learning_rate": 2.015801354401806e-07, "loss": 0.0, "step": 39300 } ], "logging_steps": 50, "max_steps": 39870, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 50, "total_flos": 0.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }