{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9999745398070118, "eval_steps": 500, "global_step": 9819, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "grad_norm": 2.1875, "learning_rate": 3.389830508474576e-07, "loss": 2.0274, "step": 1 }, { "epoch": 0.0, "grad_norm": 2.875, "learning_rate": 1.6949152542372882e-06, "loss": 1.6129, "step": 5 }, { "epoch": 0.0, "grad_norm": 9.5625, "learning_rate": 3.3898305084745763e-06, "loss": 1.5432, "step": 10 }, { "epoch": 0.0, "grad_norm": 3.265625, "learning_rate": 5.084745762711865e-06, "loss": 1.6908, "step": 15 }, { "epoch": 0.0, "grad_norm": 2.0625, "learning_rate": 6.779661016949153e-06, "loss": 1.6712, "step": 20 }, { "epoch": 0.0, "grad_norm": 3.0625, "learning_rate": 8.47457627118644e-06, "loss": 1.7115, "step": 25 }, { "epoch": 0.0, "grad_norm": 3.328125, "learning_rate": 1.016949152542373e-05, "loss": 1.6835, "step": 30 }, { "epoch": 0.0, "grad_norm": 2.390625, "learning_rate": 1.1864406779661018e-05, "loss": 1.7974, "step": 35 }, { "epoch": 0.0, "grad_norm": 4.4375, "learning_rate": 1.3559322033898305e-05, "loss": 1.631, "step": 40 }, { "epoch": 0.0, "grad_norm": 11.3125, "learning_rate": 1.5254237288135596e-05, "loss": 1.3636, "step": 45 }, { "epoch": 0.01, "grad_norm": 2.359375, "learning_rate": 1.694915254237288e-05, "loss": 1.674, "step": 50 }, { "epoch": 0.01, "grad_norm": 7.84375, "learning_rate": 1.864406779661017e-05, "loss": 1.7198, "step": 55 }, { "epoch": 0.01, "grad_norm": 10.8125, "learning_rate": 2.033898305084746e-05, "loss": 1.5772, "step": 60 }, { "epoch": 0.01, "grad_norm": 3.203125, "learning_rate": 2.2033898305084748e-05, "loss": 1.5545, "step": 65 }, { "epoch": 0.01, "grad_norm": 4.125, "learning_rate": 2.3728813559322036e-05, "loss": 1.4847, "step": 70 }, { "epoch": 0.01, "grad_norm": 2.34375, "learning_rate": 2.5423728813559322e-05, "loss": 1.5674, "step": 75 }, { "epoch": 0.01, "grad_norm": 1.8671875, "learning_rate": 2.711864406779661e-05, "loss": 1.6044, "step": 80 }, { "epoch": 0.01, "grad_norm": 2.953125, "learning_rate": 2.88135593220339e-05, "loss": 1.7277, "step": 85 }, { "epoch": 0.01, "grad_norm": 10.0625, "learning_rate": 3.050847457627119e-05, "loss": 1.4924, "step": 90 }, { "epoch": 0.01, "grad_norm": 1.75, "learning_rate": 3.2203389830508473e-05, "loss": 1.5469, "step": 95 }, { "epoch": 0.01, "grad_norm": 2.609375, "learning_rate": 3.389830508474576e-05, "loss": 1.7402, "step": 100 }, { "epoch": 0.01, "grad_norm": 1.3203125, "learning_rate": 3.559322033898305e-05, "loss": 1.3229, "step": 105 }, { "epoch": 0.01, "grad_norm": 1.4140625, "learning_rate": 3.728813559322034e-05, "loss": 1.3506, "step": 110 }, { "epoch": 0.01, "grad_norm": 1.7578125, "learning_rate": 3.898305084745763e-05, "loss": 1.3828, "step": 115 }, { "epoch": 0.01, "grad_norm": 2.328125, "learning_rate": 4.067796610169492e-05, "loss": 1.5638, "step": 120 }, { "epoch": 0.01, "grad_norm": 11.4375, "learning_rate": 4.2372881355932206e-05, "loss": 1.337, "step": 125 }, { "epoch": 0.01, "grad_norm": 1.21875, "learning_rate": 4.4067796610169495e-05, "loss": 1.2208, "step": 130 }, { "epoch": 0.01, "grad_norm": 2.0, "learning_rate": 4.5762711864406784e-05, "loss": 1.3564, "step": 135 }, { "epoch": 0.01, "grad_norm": 4.5625, "learning_rate": 4.745762711864407e-05, "loss": 1.1207, "step": 140 }, { "epoch": 0.01, "grad_norm": 1.4375, "learning_rate": 4.915254237288136e-05, "loss": 1.6007, "step": 145 }, { "epoch": 0.02, "grad_norm": 1.8671875, "learning_rate": 5.0847457627118643e-05, "loss": 1.4556, "step": 150 }, { "epoch": 0.02, "grad_norm": 1.75, "learning_rate": 5.254237288135594e-05, "loss": 1.3288, "step": 155 }, { "epoch": 0.02, "grad_norm": 1.1796875, "learning_rate": 5.423728813559322e-05, "loss": 1.3321, "step": 160 }, { "epoch": 0.02, "grad_norm": 2.6875, "learning_rate": 5.593220338983051e-05, "loss": 1.316, "step": 165 }, { "epoch": 0.02, "grad_norm": 1.5625, "learning_rate": 5.76271186440678e-05, "loss": 1.0569, "step": 170 }, { "epoch": 0.02, "grad_norm": 1.375, "learning_rate": 5.932203389830509e-05, "loss": 1.2836, "step": 175 }, { "epoch": 0.02, "grad_norm": 3.578125, "learning_rate": 6.101694915254238e-05, "loss": 1.2244, "step": 180 }, { "epoch": 0.02, "grad_norm": 4.0625, "learning_rate": 6.271186440677966e-05, "loss": 1.3131, "step": 185 }, { "epoch": 0.02, "grad_norm": 0.9296875, "learning_rate": 6.440677966101695e-05, "loss": 1.2183, "step": 190 }, { "epoch": 0.02, "grad_norm": 3.609375, "learning_rate": 6.610169491525424e-05, "loss": 1.1306, "step": 195 }, { "epoch": 0.02, "grad_norm": 1.3203125, "learning_rate": 6.779661016949152e-05, "loss": 1.101, "step": 200 }, { "epoch": 0.02, "grad_norm": 1.125, "learning_rate": 6.949152542372882e-05, "loss": 1.1969, "step": 205 }, { "epoch": 0.02, "grad_norm": 1.71875, "learning_rate": 7.11864406779661e-05, "loss": 1.2251, "step": 210 }, { "epoch": 0.02, "grad_norm": 1.1484375, "learning_rate": 7.288135593220338e-05, "loss": 1.2799, "step": 215 }, { "epoch": 0.02, "grad_norm": 1.3984375, "learning_rate": 7.457627118644068e-05, "loss": 1.1388, "step": 220 }, { "epoch": 0.02, "grad_norm": 1.1796875, "learning_rate": 7.627118644067796e-05, "loss": 1.3403, "step": 225 }, { "epoch": 0.02, "grad_norm": 1.03125, "learning_rate": 7.796610169491526e-05, "loss": 1.064, "step": 230 }, { "epoch": 0.02, "grad_norm": 1.171875, "learning_rate": 7.966101694915254e-05, "loss": 1.1494, "step": 235 }, { "epoch": 0.02, "grad_norm": 0.90234375, "learning_rate": 8.135593220338983e-05, "loss": 1.1034, "step": 240 }, { "epoch": 0.02, "grad_norm": 0.953125, "learning_rate": 8.305084745762712e-05, "loss": 0.981, "step": 245 }, { "epoch": 0.03, "grad_norm": 1.6015625, "learning_rate": 8.474576271186441e-05, "loss": 1.13, "step": 250 }, { "epoch": 0.03, "grad_norm": 1.8828125, "learning_rate": 8.644067796610171e-05, "loss": 1.153, "step": 255 }, { "epoch": 0.03, "grad_norm": 3.953125, "learning_rate": 8.813559322033899e-05, "loss": 1.047, "step": 260 }, { "epoch": 0.03, "grad_norm": 1.1640625, "learning_rate": 8.983050847457629e-05, "loss": 1.0913, "step": 265 }, { "epoch": 0.03, "grad_norm": 0.92578125, "learning_rate": 9.152542372881357e-05, "loss": 1.2967, "step": 270 }, { "epoch": 0.03, "grad_norm": 1.046875, "learning_rate": 9.322033898305085e-05, "loss": 1.0959, "step": 275 }, { "epoch": 0.03, "grad_norm": 1.546875, "learning_rate": 9.491525423728815e-05, "loss": 1.2349, "step": 280 }, { "epoch": 0.03, "grad_norm": 0.953125, "learning_rate": 9.661016949152543e-05, "loss": 1.1364, "step": 285 }, { "epoch": 0.03, "grad_norm": 1.4765625, "learning_rate": 9.830508474576272e-05, "loss": 1.0485, "step": 290 }, { "epoch": 0.03, "grad_norm": 1.0546875, "learning_rate": 0.0001, "loss": 1.1364, "step": 295 }, { "epoch": 0.03, "grad_norm": 2.453125, "learning_rate": 9.999993199499282e-05, "loss": 1.1016, "step": 300 }, { "epoch": 0.03, "grad_norm": 3.1875, "learning_rate": 9.999972798015626e-05, "loss": 1.31, "step": 305 }, { "epoch": 0.03, "grad_norm": 1.0625, "learning_rate": 9.999938795604529e-05, "loss": 1.3549, "step": 310 }, { "epoch": 0.03, "grad_norm": 0.828125, "learning_rate": 9.999891192358482e-05, "loss": 1.004, "step": 315 }, { "epoch": 0.03, "grad_norm": 0.7734375, "learning_rate": 9.999829988406979e-05, "loss": 1.0005, "step": 320 }, { "epoch": 0.03, "grad_norm": 1.3515625, "learning_rate": 9.999755183916503e-05, "loss": 1.1688, "step": 325 }, { "epoch": 0.03, "grad_norm": 0.99609375, "learning_rate": 9.999666779090541e-05, "loss": 1.0039, "step": 330 }, { "epoch": 0.03, "grad_norm": 0.8359375, "learning_rate": 9.99956477416957e-05, "loss": 1.2665, "step": 335 }, { "epoch": 0.03, "grad_norm": 0.91796875, "learning_rate": 9.999449169431064e-05, "loss": 1.1508, "step": 340 }, { "epoch": 0.04, "grad_norm": 0.734375, "learning_rate": 9.99931996518949e-05, "loss": 1.04, "step": 345 }, { "epoch": 0.04, "grad_norm": 1.0390625, "learning_rate": 9.99917716179631e-05, "loss": 1.1615, "step": 350 }, { "epoch": 0.04, "grad_norm": 0.77734375, "learning_rate": 9.999020759639978e-05, "loss": 1.098, "step": 355 }, { "epoch": 0.04, "grad_norm": 1.421875, "learning_rate": 9.998850759145942e-05, "loss": 1.1783, "step": 360 }, { "epoch": 0.04, "grad_norm": 1.453125, "learning_rate": 9.998667160776634e-05, "loss": 1.1064, "step": 365 }, { "epoch": 0.04, "grad_norm": 1.1015625, "learning_rate": 9.998469965031477e-05, "loss": 1.1819, "step": 370 }, { "epoch": 0.04, "grad_norm": 1.5859375, "learning_rate": 9.998259172446887e-05, "loss": 1.0785, "step": 375 }, { "epoch": 0.04, "grad_norm": 1.5859375, "learning_rate": 9.998034783596259e-05, "loss": 1.2747, "step": 380 }, { "epoch": 0.04, "grad_norm": 0.8828125, "learning_rate": 9.997796799089978e-05, "loss": 1.1299, "step": 385 }, { "epoch": 0.04, "grad_norm": 0.83984375, "learning_rate": 9.997545219575406e-05, "loss": 1.2208, "step": 390 }, { "epoch": 0.04, "grad_norm": 0.7890625, "learning_rate": 9.997280045736894e-05, "loss": 1.1041, "step": 395 }, { "epoch": 0.04, "grad_norm": 0.8671875, "learning_rate": 9.997001278295766e-05, "loss": 1.188, "step": 400 }, { "epoch": 0.04, "grad_norm": 2.90625, "learning_rate": 9.996708918010323e-05, "loss": 1.1673, "step": 405 }, { "epoch": 0.04, "grad_norm": 0.87890625, "learning_rate": 9.996402965675849e-05, "loss": 1.1214, "step": 410 }, { "epoch": 0.04, "grad_norm": 1.0625, "learning_rate": 9.996083422124589e-05, "loss": 1.038, "step": 415 }, { "epoch": 0.04, "grad_norm": 0.87109375, "learning_rate": 9.995750288225769e-05, "loss": 1.09, "step": 420 }, { "epoch": 0.04, "grad_norm": 0.94921875, "learning_rate": 9.995403564885581e-05, "loss": 0.9791, "step": 425 }, { "epoch": 0.04, "grad_norm": 0.78125, "learning_rate": 9.995043253047181e-05, "loss": 1.1455, "step": 430 }, { "epoch": 0.04, "grad_norm": 0.77734375, "learning_rate": 9.99466935369069e-05, "loss": 1.2065, "step": 435 }, { "epoch": 0.04, "grad_norm": 0.984375, "learning_rate": 9.994281867833185e-05, "loss": 1.0559, "step": 440 }, { "epoch": 0.05, "grad_norm": 0.8984375, "learning_rate": 9.99388079652871e-05, "loss": 0.9766, "step": 445 }, { "epoch": 0.05, "grad_norm": 1.40625, "learning_rate": 9.993466140868258e-05, "loss": 1.1912, "step": 450 }, { "epoch": 0.05, "grad_norm": 0.93359375, "learning_rate": 9.993037901979773e-05, "loss": 1.1181, "step": 455 }, { "epoch": 0.05, "grad_norm": 0.671875, "learning_rate": 9.992596081028153e-05, "loss": 1.1538, "step": 460 }, { "epoch": 0.05, "grad_norm": 1.0234375, "learning_rate": 9.992140679215241e-05, "loss": 1.1806, "step": 465 }, { "epoch": 0.05, "grad_norm": 1.03125, "learning_rate": 9.991671697779817e-05, "loss": 1.1783, "step": 470 }, { "epoch": 0.05, "grad_norm": 1.1953125, "learning_rate": 9.991189137997607e-05, "loss": 1.1097, "step": 475 }, { "epoch": 0.05, "grad_norm": 1.8984375, "learning_rate": 9.990693001181271e-05, "loss": 1.1734, "step": 480 }, { "epoch": 0.05, "grad_norm": 0.859375, "learning_rate": 9.990183288680399e-05, "loss": 0.9929, "step": 485 }, { "epoch": 0.05, "grad_norm": 0.82421875, "learning_rate": 9.989660001881512e-05, "loss": 1.1162, "step": 490 }, { "epoch": 0.05, "grad_norm": 0.79296875, "learning_rate": 9.989123142208052e-05, "loss": 0.9353, "step": 495 }, { "epoch": 0.05, "grad_norm": 0.8984375, "learning_rate": 9.988572711120388e-05, "loss": 1.1636, "step": 500 }, { "epoch": 0.05, "grad_norm": 2.03125, "learning_rate": 9.988008710115803e-05, "loss": 1.1438, "step": 505 }, { "epoch": 0.05, "grad_norm": 1.03125, "learning_rate": 9.987431140728491e-05, "loss": 1.1095, "step": 510 }, { "epoch": 0.05, "grad_norm": 0.84375, "learning_rate": 9.986840004529558e-05, "loss": 1.1126, "step": 515 }, { "epoch": 0.05, "grad_norm": 1.015625, "learning_rate": 9.986235303127011e-05, "loss": 1.041, "step": 520 }, { "epoch": 0.05, "grad_norm": 0.82421875, "learning_rate": 9.98561703816576e-05, "loss": 1.2251, "step": 525 }, { "epoch": 0.05, "grad_norm": 0.91796875, "learning_rate": 9.984985211327611e-05, "loss": 1.5059, "step": 530 }, { "epoch": 0.05, "grad_norm": 0.94921875, "learning_rate": 9.984339824331255e-05, "loss": 1.0779, "step": 535 }, { "epoch": 0.05, "grad_norm": 1.015625, "learning_rate": 9.98368087893228e-05, "loss": 0.9696, "step": 540 }, { "epoch": 0.06, "grad_norm": 0.81640625, "learning_rate": 9.983008376923144e-05, "loss": 1.1885, "step": 545 }, { "epoch": 0.06, "grad_norm": 0.74609375, "learning_rate": 9.982322320133191e-05, "loss": 0.9718, "step": 550 }, { "epoch": 0.06, "grad_norm": 0.734375, "learning_rate": 9.98162271042863e-05, "loss": 1.0219, "step": 555 }, { "epoch": 0.06, "grad_norm": 0.75, "learning_rate": 9.98090954971254e-05, "loss": 0.9908, "step": 560 }, { "epoch": 0.06, "grad_norm": 0.96875, "learning_rate": 9.980182839924864e-05, "loss": 1.2804, "step": 565 }, { "epoch": 0.06, "grad_norm": 0.7109375, "learning_rate": 9.979442583042396e-05, "loss": 1.0956, "step": 570 }, { "epoch": 0.06, "grad_norm": 0.796875, "learning_rate": 9.978688781078783e-05, "loss": 1.1121, "step": 575 }, { "epoch": 0.06, "grad_norm": 0.9140625, "learning_rate": 9.977921436084516e-05, "loss": 1.1813, "step": 580 }, { "epoch": 0.06, "grad_norm": 2.0625, "learning_rate": 9.977140550146928e-05, "loss": 1.1323, "step": 585 }, { "epoch": 0.06, "grad_norm": 0.859375, "learning_rate": 9.976346125390187e-05, "loss": 1.0552, "step": 590 }, { "epoch": 0.06, "grad_norm": 0.78515625, "learning_rate": 9.975538163975285e-05, "loss": 1.0293, "step": 595 }, { "epoch": 0.06, "grad_norm": 0.81640625, "learning_rate": 9.974716668100041e-05, "loss": 1.0693, "step": 600 }, { "epoch": 0.06, "grad_norm": 0.78515625, "learning_rate": 9.973881639999086e-05, "loss": 1.055, "step": 605 }, { "epoch": 0.06, "grad_norm": 1.796875, "learning_rate": 9.973033081943865e-05, "loss": 1.1065, "step": 610 }, { "epoch": 0.06, "grad_norm": 1.1171875, "learning_rate": 9.972170996242627e-05, "loss": 1.1045, "step": 615 }, { "epoch": 0.06, "grad_norm": 0.921875, "learning_rate": 9.971295385240415e-05, "loss": 1.1502, "step": 620 }, { "epoch": 0.06, "grad_norm": 1.0, "learning_rate": 9.97040625131907e-05, "loss": 1.1024, "step": 625 }, { "epoch": 0.06, "grad_norm": 0.6796875, "learning_rate": 9.96950359689721e-05, "loss": 1.0256, "step": 630 }, { "epoch": 0.06, "grad_norm": 0.83203125, "learning_rate": 9.968587424430239e-05, "loss": 1.1097, "step": 635 }, { "epoch": 0.07, "grad_norm": 0.74609375, "learning_rate": 9.967657736410329e-05, "loss": 1.1827, "step": 640 }, { "epoch": 0.07, "grad_norm": 0.76953125, "learning_rate": 9.966714535366416e-05, "loss": 0.9866, "step": 645 }, { "epoch": 0.07, "grad_norm": 0.640625, "learning_rate": 9.965757823864197e-05, "loss": 1.047, "step": 650 }, { "epoch": 0.07, "grad_norm": 0.75390625, "learning_rate": 9.964787604506119e-05, "loss": 1.0826, "step": 655 }, { "epoch": 0.07, "grad_norm": 0.7890625, "learning_rate": 9.963803879931372e-05, "loss": 1.0919, "step": 660 }, { "epoch": 0.07, "grad_norm": 1.0078125, "learning_rate": 9.962806652815887e-05, "loss": 1.138, "step": 665 }, { "epoch": 0.07, "grad_norm": 0.8203125, "learning_rate": 9.961795925872319e-05, "loss": 1.3509, "step": 670 }, { "epoch": 0.07, "grad_norm": 0.7265625, "learning_rate": 9.960771701850046e-05, "loss": 1.2172, "step": 675 }, { "epoch": 0.07, "grad_norm": 0.8046875, "learning_rate": 9.959733983535166e-05, "loss": 1.0813, "step": 680 }, { "epoch": 0.07, "grad_norm": 0.80078125, "learning_rate": 9.958682773750478e-05, "loss": 0.9785, "step": 685 }, { "epoch": 0.07, "grad_norm": 0.8515625, "learning_rate": 9.957618075355483e-05, "loss": 1.1987, "step": 690 }, { "epoch": 0.07, "grad_norm": 0.85546875, "learning_rate": 9.956539891246378e-05, "loss": 1.1459, "step": 695 }, { "epoch": 0.07, "grad_norm": 0.87890625, "learning_rate": 9.955448224356035e-05, "loss": 1.1929, "step": 700 }, { "epoch": 0.07, "grad_norm": 1.609375, "learning_rate": 9.95434307765401e-05, "loss": 1.1577, "step": 705 }, { "epoch": 0.07, "grad_norm": 1.6640625, "learning_rate": 9.95322445414652e-05, "loss": 0.9736, "step": 710 }, { "epoch": 0.07, "grad_norm": 0.9375, "learning_rate": 9.952092356876447e-05, "loss": 1.0244, "step": 715 }, { "epoch": 0.07, "grad_norm": 0.828125, "learning_rate": 9.950946788923323e-05, "loss": 1.0721, "step": 720 }, { "epoch": 0.07, "grad_norm": 0.83984375, "learning_rate": 9.94978775340332e-05, "loss": 1.0923, "step": 725 }, { "epoch": 0.07, "grad_norm": 0.84375, "learning_rate": 9.94861525346925e-05, "loss": 1.1278, "step": 730 }, { "epoch": 0.07, "grad_norm": 0.765625, "learning_rate": 9.947429292310544e-05, "loss": 1.1433, "step": 735 }, { "epoch": 0.08, "grad_norm": 0.8671875, "learning_rate": 9.946229873153257e-05, "loss": 1.0413, "step": 740 }, { "epoch": 0.08, "grad_norm": 0.80859375, "learning_rate": 9.945016999260046e-05, "loss": 0.9796, "step": 745 }, { "epoch": 0.08, "grad_norm": 0.75, "learning_rate": 9.943790673930174e-05, "loss": 1.111, "step": 750 }, { "epoch": 0.08, "grad_norm": 0.71875, "learning_rate": 9.942550900499492e-05, "loss": 0.9821, "step": 755 }, { "epoch": 0.08, "grad_norm": 0.76171875, "learning_rate": 9.941297682340428e-05, "loss": 1.1518, "step": 760 }, { "epoch": 0.08, "grad_norm": 1.125, "learning_rate": 9.940031022861991e-05, "loss": 1.2209, "step": 765 }, { "epoch": 0.08, "grad_norm": 0.77734375, "learning_rate": 9.938750925509744e-05, "loss": 1.141, "step": 770 }, { "epoch": 0.08, "grad_norm": 0.71484375, "learning_rate": 9.937457393765812e-05, "loss": 1.1641, "step": 775 }, { "epoch": 0.08, "grad_norm": 0.8515625, "learning_rate": 9.936150431148858e-05, "loss": 1.1908, "step": 780 }, { "epoch": 0.08, "grad_norm": 0.76171875, "learning_rate": 9.934830041214085e-05, "loss": 1.1927, "step": 785 }, { "epoch": 0.08, "grad_norm": 0.76953125, "learning_rate": 9.933496227553213e-05, "loss": 1.1013, "step": 790 }, { "epoch": 0.08, "grad_norm": 0.7734375, "learning_rate": 9.932148993794489e-05, "loss": 1.1508, "step": 795 }, { "epoch": 0.08, "grad_norm": 0.80078125, "learning_rate": 9.930788343602653e-05, "loss": 1.0958, "step": 800 }, { "epoch": 0.08, "grad_norm": 0.73828125, "learning_rate": 9.929414280678948e-05, "loss": 1.0411, "step": 805 }, { "epoch": 0.08, "grad_norm": 0.81640625, "learning_rate": 9.928026808761101e-05, "loss": 1.135, "step": 810 }, { "epoch": 0.08, "grad_norm": 0.83984375, "learning_rate": 9.926625931623314e-05, "loss": 1.149, "step": 815 }, { "epoch": 0.08, "grad_norm": 0.703125, "learning_rate": 9.92521165307625e-05, "loss": 1.1497, "step": 820 }, { "epoch": 0.08, "grad_norm": 0.703125, "learning_rate": 9.923783976967033e-05, "loss": 1.0114, "step": 825 }, { "epoch": 0.08, "grad_norm": 0.83984375, "learning_rate": 9.922342907179229e-05, "loss": 1.2451, "step": 830 }, { "epoch": 0.09, "grad_norm": 0.84375, "learning_rate": 9.920888447632834e-05, "loss": 1.1813, "step": 835 }, { "epoch": 0.09, "grad_norm": 0.734375, "learning_rate": 9.919420602284268e-05, "loss": 1.0433, "step": 840 }, { "epoch": 0.09, "grad_norm": 0.71484375, "learning_rate": 9.917939375126368e-05, "loss": 1.1203, "step": 845 }, { "epoch": 0.09, "grad_norm": 0.90234375, "learning_rate": 9.916444770188366e-05, "loss": 1.0648, "step": 850 }, { "epoch": 0.09, "grad_norm": 0.83203125, "learning_rate": 9.914936791535889e-05, "loss": 1.1393, "step": 855 }, { "epoch": 0.09, "grad_norm": 0.78125, "learning_rate": 9.91341544327094e-05, "loss": 1.0386, "step": 860 }, { "epoch": 0.09, "grad_norm": 0.859375, "learning_rate": 9.911880729531889e-05, "loss": 1.1063, "step": 865 }, { "epoch": 0.09, "grad_norm": 0.89453125, "learning_rate": 9.910332654493466e-05, "loss": 1.0915, "step": 870 }, { "epoch": 0.09, "grad_norm": 0.73828125, "learning_rate": 9.908771222366746e-05, "loss": 1.056, "step": 875 }, { "epoch": 0.09, "grad_norm": 0.7421875, "learning_rate": 9.907196437399138e-05, "loss": 1.0099, "step": 880 }, { "epoch": 0.09, "grad_norm": 0.86328125, "learning_rate": 9.90560830387437e-05, "loss": 0.9894, "step": 885 }, { "epoch": 0.09, "grad_norm": 0.82421875, "learning_rate": 9.904006826112483e-05, "loss": 1.0191, "step": 890 }, { "epoch": 0.09, "grad_norm": 0.671875, "learning_rate": 9.902392008469821e-05, "loss": 1.0701, "step": 895 }, { "epoch": 0.09, "grad_norm": 0.703125, "learning_rate": 9.900763855339008e-05, "loss": 1.25, "step": 900 }, { "epoch": 0.09, "grad_norm": 5.71875, "learning_rate": 9.899122371148948e-05, "loss": 1.0782, "step": 905 }, { "epoch": 0.09, "grad_norm": 0.80859375, "learning_rate": 9.897467560364806e-05, "loss": 1.1018, "step": 910 }, { "epoch": 0.09, "grad_norm": 0.77734375, "learning_rate": 9.895799427487999e-05, "loss": 1.0424, "step": 915 }, { "epoch": 0.09, "grad_norm": 1.421875, "learning_rate": 9.894117977056185e-05, "loss": 1.062, "step": 920 }, { "epoch": 0.09, "grad_norm": 0.82421875, "learning_rate": 9.892423213643242e-05, "loss": 1.156, "step": 925 }, { "epoch": 0.09, "grad_norm": 0.9921875, "learning_rate": 9.890715141859268e-05, "loss": 1.0613, "step": 930 }, { "epoch": 0.1, "grad_norm": 0.8125, "learning_rate": 9.888993766350559e-05, "loss": 1.079, "step": 935 }, { "epoch": 0.1, "grad_norm": 0.73046875, "learning_rate": 9.887259091799604e-05, "loss": 1.129, "step": 940 }, { "epoch": 0.1, "grad_norm": 0.95703125, "learning_rate": 9.885511122925062e-05, "loss": 1.232, "step": 945 }, { "epoch": 0.1, "grad_norm": 0.71484375, "learning_rate": 9.883749864481759e-05, "loss": 1.0411, "step": 950 }, { "epoch": 0.1, "grad_norm": 0.7109375, "learning_rate": 9.881975321260672e-05, "loss": 0.9664, "step": 955 }, { "epoch": 0.1, "grad_norm": 0.76953125, "learning_rate": 9.880187498088915e-05, "loss": 1.2187, "step": 960 }, { "epoch": 0.1, "grad_norm": 0.7890625, "learning_rate": 9.878386399829723e-05, "loss": 1.1002, "step": 965 }, { "epoch": 0.1, "grad_norm": 0.765625, "learning_rate": 9.876572031382445e-05, "loss": 0.9465, "step": 970 }, { "epoch": 0.1, "grad_norm": 0.8828125, "learning_rate": 9.874744397682524e-05, "loss": 1.1099, "step": 975 }, { "epoch": 0.1, "grad_norm": 0.78125, "learning_rate": 9.872903503701494e-05, "loss": 1.0486, "step": 980 }, { "epoch": 0.1, "grad_norm": 0.71875, "learning_rate": 9.871049354446953e-05, "loss": 1.0597, "step": 985 }, { "epoch": 0.1, "grad_norm": 0.6328125, "learning_rate": 9.869181954962559e-05, "loss": 1.0353, "step": 990 }, { "epoch": 0.1, "grad_norm": 0.98046875, "learning_rate": 9.867301310328011e-05, "loss": 1.0108, "step": 995 }, { "epoch": 0.1, "grad_norm": 0.80859375, "learning_rate": 9.865407425659041e-05, "loss": 1.1462, "step": 1000 }, { "epoch": 0.1, "grad_norm": 0.77734375, "learning_rate": 9.863500306107394e-05, "loss": 1.0011, "step": 1005 }, { "epoch": 0.1, "grad_norm": 0.921875, "learning_rate": 9.861579956860816e-05, "loss": 1.1095, "step": 1010 }, { "epoch": 0.1, "grad_norm": 1.0, "learning_rate": 9.859646383143043e-05, "loss": 1.2169, "step": 1015 }, { "epoch": 0.1, "grad_norm": 0.74609375, "learning_rate": 9.857699590213783e-05, "loss": 1.0282, "step": 1020 }, { "epoch": 0.1, "grad_norm": 0.953125, "learning_rate": 9.855739583368702e-05, "loss": 1.3305, "step": 1025 }, { "epoch": 0.1, "grad_norm": 0.7109375, "learning_rate": 9.85376636793941e-05, "loss": 1.1528, "step": 1030 }, { "epoch": 0.11, "grad_norm": 0.60546875, "learning_rate": 9.851779949293452e-05, "loss": 0.9676, "step": 1035 }, { "epoch": 0.11, "grad_norm": 0.88671875, "learning_rate": 9.84978033283428e-05, "loss": 1.1296, "step": 1040 }, { "epoch": 0.11, "grad_norm": 0.80859375, "learning_rate": 9.847767524001252e-05, "loss": 1.09, "step": 1045 }, { "epoch": 0.11, "grad_norm": 0.7890625, "learning_rate": 9.845741528269614e-05, "loss": 1.11, "step": 1050 }, { "epoch": 0.11, "grad_norm": 0.75, "learning_rate": 9.843702351150479e-05, "loss": 1.1153, "step": 1055 }, { "epoch": 0.11, "grad_norm": 0.72265625, "learning_rate": 9.841649998190817e-05, "loss": 1.2189, "step": 1060 }, { "epoch": 0.11, "grad_norm": 0.75390625, "learning_rate": 9.839584474973438e-05, "loss": 1.1517, "step": 1065 }, { "epoch": 0.11, "grad_norm": 0.7265625, "learning_rate": 9.83750578711698e-05, "loss": 0.9702, "step": 1070 }, { "epoch": 0.11, "grad_norm": 0.75, "learning_rate": 9.835413940275891e-05, "loss": 0.9542, "step": 1075 }, { "epoch": 0.11, "grad_norm": 0.8359375, "learning_rate": 9.833308940140411e-05, "loss": 1.1009, "step": 1080 }, { "epoch": 0.11, "grad_norm": 1.265625, "learning_rate": 9.831190792436564e-05, "loss": 1.1224, "step": 1085 }, { "epoch": 0.11, "grad_norm": 0.80859375, "learning_rate": 9.829059502926137e-05, "loss": 0.9884, "step": 1090 }, { "epoch": 0.11, "grad_norm": 0.73046875, "learning_rate": 9.82691507740666e-05, "loss": 1.0683, "step": 1095 }, { "epoch": 0.11, "grad_norm": 2.15625, "learning_rate": 9.824757521711405e-05, "loss": 1.1777, "step": 1100 }, { "epoch": 0.11, "grad_norm": 0.83203125, "learning_rate": 9.822586841709351e-05, "loss": 1.1444, "step": 1105 }, { "epoch": 0.11, "grad_norm": 0.74609375, "learning_rate": 9.820403043305187e-05, "loss": 1.0494, "step": 1110 }, { "epoch": 0.11, "grad_norm": 0.75, "learning_rate": 9.818206132439278e-05, "loss": 0.9759, "step": 1115 }, { "epoch": 0.11, "grad_norm": 0.78515625, "learning_rate": 9.815996115087664e-05, "loss": 1.0381, "step": 1120 }, { "epoch": 0.11, "grad_norm": 0.88671875, "learning_rate": 9.813772997262034e-05, "loss": 1.1601, "step": 1125 }, { "epoch": 0.12, "grad_norm": 1.0625, "learning_rate": 9.811536785009714e-05, "loss": 1.1123, "step": 1130 }, { "epoch": 0.12, "grad_norm": 0.8515625, "learning_rate": 9.809287484413649e-05, "loss": 0.9873, "step": 1135 }, { "epoch": 0.12, "grad_norm": 0.69140625, "learning_rate": 9.807025101592387e-05, "loss": 0.9711, "step": 1140 }, { "epoch": 0.12, "grad_norm": 0.7734375, "learning_rate": 9.804749642700061e-05, "loss": 1.142, "step": 1145 }, { "epoch": 0.12, "grad_norm": 0.7890625, "learning_rate": 9.802461113926378e-05, "loss": 1.1325, "step": 1150 }, { "epoch": 0.12, "grad_norm": 1.5703125, "learning_rate": 9.800159521496595e-05, "loss": 1.0743, "step": 1155 }, { "epoch": 0.12, "grad_norm": 0.83984375, "learning_rate": 9.7978448716715e-05, "loss": 1.0507, "step": 1160 }, { "epoch": 0.12, "grad_norm": 2.375, "learning_rate": 9.795517170747407e-05, "loss": 1.0658, "step": 1165 }, { "epoch": 0.12, "grad_norm": 0.71875, "learning_rate": 9.793176425056128e-05, "loss": 1.0853, "step": 1170 }, { "epoch": 0.12, "grad_norm": 0.81640625, "learning_rate": 9.790822640964961e-05, "loss": 1.1156, "step": 1175 }, { "epoch": 0.12, "grad_norm": 0.95703125, "learning_rate": 9.788455824876671e-05, "loss": 1.2031, "step": 1180 }, { "epoch": 0.12, "grad_norm": 0.6953125, "learning_rate": 9.786075983229469e-05, "loss": 0.9827, "step": 1185 }, { "epoch": 0.12, "grad_norm": 0.7578125, "learning_rate": 9.783683122497003e-05, "loss": 1.1374, "step": 1190 }, { "epoch": 0.12, "grad_norm": 0.765625, "learning_rate": 9.781277249188332e-05, "loss": 1.0057, "step": 1195 }, { "epoch": 0.12, "grad_norm": 0.8203125, "learning_rate": 9.778858369847914e-05, "loss": 1.1041, "step": 1200 }, { "epoch": 0.12, "grad_norm": 0.6640625, "learning_rate": 9.776426491055587e-05, "loss": 1.1098, "step": 1205 }, { "epoch": 0.12, "grad_norm": 0.734375, "learning_rate": 9.773981619426546e-05, "loss": 0.9285, "step": 1210 }, { "epoch": 0.12, "grad_norm": 0.78125, "learning_rate": 9.771523761611332e-05, "loss": 1.228, "step": 1215 }, { "epoch": 0.12, "grad_norm": 0.77734375, "learning_rate": 9.76905292429581e-05, "loss": 1.005, "step": 1220 }, { "epoch": 0.12, "grad_norm": 0.60546875, "learning_rate": 9.766569114201154e-05, "loss": 0.9875, "step": 1225 }, { "epoch": 0.13, "grad_norm": 0.80859375, "learning_rate": 9.764072338083826e-05, "loss": 1.0508, "step": 1230 }, { "epoch": 0.13, "grad_norm": 0.90625, "learning_rate": 9.761562602735552e-05, "loss": 1.2457, "step": 1235 }, { "epoch": 0.13, "grad_norm": 0.78515625, "learning_rate": 9.759039914983319e-05, "loss": 0.8978, "step": 1240 }, { "epoch": 0.13, "grad_norm": 0.73046875, "learning_rate": 9.756504281689343e-05, "loss": 1.0165, "step": 1245 }, { "epoch": 0.13, "grad_norm": 0.8125, "learning_rate": 9.753955709751053e-05, "loss": 1.3444, "step": 1250 }, { "epoch": 0.13, "grad_norm": 0.78125, "learning_rate": 9.751394206101074e-05, "loss": 1.1455, "step": 1255 }, { "epoch": 0.13, "grad_norm": 0.75390625, "learning_rate": 9.748819777707212e-05, "loss": 1.0634, "step": 1260 }, { "epoch": 0.13, "grad_norm": 0.91796875, "learning_rate": 9.746232431572426e-05, "loss": 1.2258, "step": 1265 }, { "epoch": 0.13, "grad_norm": 0.80859375, "learning_rate": 9.743632174734816e-05, "loss": 1.0882, "step": 1270 }, { "epoch": 0.13, "grad_norm": 1.546875, "learning_rate": 9.741019014267601e-05, "loss": 1.1057, "step": 1275 }, { "epoch": 0.13, "grad_norm": 0.6796875, "learning_rate": 9.738392957279103e-05, "loss": 1.1838, "step": 1280 }, { "epoch": 0.13, "grad_norm": 0.73828125, "learning_rate": 9.735754010912719e-05, "loss": 1.0147, "step": 1285 }, { "epoch": 0.13, "grad_norm": 0.66796875, "learning_rate": 9.733102182346915e-05, "loss": 0.979, "step": 1290 }, { "epoch": 0.13, "grad_norm": 0.83203125, "learning_rate": 9.730437478795195e-05, "loss": 1.1407, "step": 1295 }, { "epoch": 0.13, "grad_norm": 0.75390625, "learning_rate": 9.727759907506085e-05, "loss": 1.0828, "step": 1300 }, { "epoch": 0.13, "grad_norm": 0.74609375, "learning_rate": 9.725069475763118e-05, "loss": 0.9176, "step": 1305 }, { "epoch": 0.13, "grad_norm": 0.82421875, "learning_rate": 9.722366190884805e-05, "loss": 0.9957, "step": 1310 }, { "epoch": 0.13, "grad_norm": 0.75, "learning_rate": 9.719650060224622e-05, "loss": 1.0143, "step": 1315 }, { "epoch": 0.13, "grad_norm": 0.734375, "learning_rate": 9.71692109117099e-05, "loss": 1.0765, "step": 1320 }, { "epoch": 0.13, "grad_norm": 0.76953125, "learning_rate": 9.71417929114725e-05, "loss": 1.1173, "step": 1325 }, { "epoch": 0.14, "grad_norm": 0.8984375, "learning_rate": 9.711424667611648e-05, "loss": 1.078, "step": 1330 }, { "epoch": 0.14, "grad_norm": 0.7890625, "learning_rate": 9.708657228057312e-05, "loss": 1.0597, "step": 1335 }, { "epoch": 0.14, "grad_norm": 0.765625, "learning_rate": 9.70587698001223e-05, "loss": 1.0769, "step": 1340 }, { "epoch": 0.14, "grad_norm": 0.65234375, "learning_rate": 9.703083931039236e-05, "loss": 1.0842, "step": 1345 }, { "epoch": 0.14, "grad_norm": 0.77734375, "learning_rate": 9.700278088735982e-05, "loss": 1.1953, "step": 1350 }, { "epoch": 0.14, "grad_norm": 0.68359375, "learning_rate": 9.697459460734921e-05, "loss": 0.9097, "step": 1355 }, { "epoch": 0.14, "grad_norm": 0.77734375, "learning_rate": 9.694628054703285e-05, "loss": 1.0923, "step": 1360 }, { "epoch": 0.14, "grad_norm": 0.75390625, "learning_rate": 9.691783878343063e-05, "loss": 0.996, "step": 1365 }, { "epoch": 0.14, "grad_norm": 1.3359375, "learning_rate": 9.68892693939099e-05, "loss": 0.946, "step": 1370 }, { "epoch": 0.14, "grad_norm": 0.70703125, "learning_rate": 9.68605724561851e-05, "loss": 0.892, "step": 1375 }, { "epoch": 0.14, "grad_norm": 0.65625, "learning_rate": 9.683174804831763e-05, "loss": 1.0465, "step": 1380 }, { "epoch": 0.14, "grad_norm": 1.0859375, "learning_rate": 9.680279624871567e-05, "loss": 1.1461, "step": 1385 }, { "epoch": 0.14, "grad_norm": 0.76171875, "learning_rate": 9.67737171361339e-05, "loss": 1.0644, "step": 1390 }, { "epoch": 0.14, "grad_norm": 0.83203125, "learning_rate": 9.674451078967334e-05, "loss": 1.1142, "step": 1395 }, { "epoch": 0.14, "grad_norm": 0.7890625, "learning_rate": 9.67151772887811e-05, "loss": 0.9974, "step": 1400 }, { "epoch": 0.14, "grad_norm": 0.859375, "learning_rate": 9.668571671325018e-05, "loss": 1.3387, "step": 1405 }, { "epoch": 0.14, "grad_norm": 0.7734375, "learning_rate": 9.665612914321925e-05, "loss": 0.9411, "step": 1410 }, { "epoch": 0.14, "grad_norm": 0.7421875, "learning_rate": 9.66264146591724e-05, "loss": 1.0943, "step": 1415 }, { "epoch": 0.14, "grad_norm": 0.8125, "learning_rate": 9.659657334193901e-05, "loss": 1.1811, "step": 1420 }, { "epoch": 0.15, "grad_norm": 0.77734375, "learning_rate": 9.656660527269344e-05, "loss": 1.3589, "step": 1425 }, { "epoch": 0.15, "grad_norm": 0.8046875, "learning_rate": 9.65365105329548e-05, "loss": 1.1464, "step": 1430 }, { "epoch": 0.15, "grad_norm": 0.83203125, "learning_rate": 9.650628920458686e-05, "loss": 1.0347, "step": 1435 }, { "epoch": 0.15, "grad_norm": 0.73828125, "learning_rate": 9.647594136979768e-05, "loss": 1.0573, "step": 1440 }, { "epoch": 0.15, "grad_norm": 0.8046875, "learning_rate": 9.64454671111394e-05, "loss": 1.162, "step": 1445 }, { "epoch": 0.15, "grad_norm": 0.75390625, "learning_rate": 9.641486651150815e-05, "loss": 1.03, "step": 1450 }, { "epoch": 0.15, "grad_norm": 0.89453125, "learning_rate": 9.63841396541437e-05, "loss": 1.0686, "step": 1455 }, { "epoch": 0.15, "grad_norm": 0.9765625, "learning_rate": 9.635328662262921e-05, "loss": 1.2671, "step": 1460 }, { "epoch": 0.15, "grad_norm": 0.65625, "learning_rate": 9.632230750089114e-05, "loss": 1.0885, "step": 1465 }, { "epoch": 0.15, "grad_norm": 0.73828125, "learning_rate": 9.62912023731989e-05, "loss": 0.9341, "step": 1470 }, { "epoch": 0.15, "grad_norm": 0.73046875, "learning_rate": 9.625997132416467e-05, "loss": 1.1212, "step": 1475 }, { "epoch": 0.15, "grad_norm": 0.72265625, "learning_rate": 9.622861443874314e-05, "loss": 1.0372, "step": 1480 }, { "epoch": 0.15, "grad_norm": 0.86328125, "learning_rate": 9.619713180223135e-05, "loss": 1.1321, "step": 1485 }, { "epoch": 0.15, "grad_norm": 0.9453125, "learning_rate": 9.616552350026835e-05, "loss": 1.179, "step": 1490 }, { "epoch": 0.15, "grad_norm": 0.71875, "learning_rate": 9.613378961883506e-05, "loss": 1.1647, "step": 1495 }, { "epoch": 0.15, "grad_norm": 0.69921875, "learning_rate": 9.610193024425399e-05, "loss": 1.0691, "step": 1500 }, { "epoch": 0.15, "grad_norm": 0.796875, "learning_rate": 9.606994546318904e-05, "loss": 1.169, "step": 1505 }, { "epoch": 0.15, "grad_norm": 0.921875, "learning_rate": 9.603783536264519e-05, "loss": 1.156, "step": 1510 }, { "epoch": 0.15, "grad_norm": 0.66796875, "learning_rate": 9.600560002996837e-05, "loss": 1.1447, "step": 1515 }, { "epoch": 0.15, "grad_norm": 0.76171875, "learning_rate": 9.597323955284511e-05, "loss": 1.0639, "step": 1520 }, { "epoch": 0.16, "grad_norm": 0.71484375, "learning_rate": 9.594075401930242e-05, "loss": 1.0137, "step": 1525 }, { "epoch": 0.16, "grad_norm": 0.72265625, "learning_rate": 9.590814351770747e-05, "loss": 1.1508, "step": 1530 }, { "epoch": 0.16, "grad_norm": 0.7109375, "learning_rate": 9.587540813676732e-05, "loss": 1.2712, "step": 1535 }, { "epoch": 0.16, "grad_norm": 1.2578125, "learning_rate": 9.584254796552877e-05, "loss": 1.2424, "step": 1540 }, { "epoch": 0.16, "grad_norm": 0.69921875, "learning_rate": 9.580956309337807e-05, "loss": 1.0838, "step": 1545 }, { "epoch": 0.16, "grad_norm": 0.70703125, "learning_rate": 9.577645361004068e-05, "loss": 1.1275, "step": 1550 }, { "epoch": 0.16, "grad_norm": 0.82421875, "learning_rate": 9.574321960558104e-05, "loss": 1.1691, "step": 1555 }, { "epoch": 0.16, "grad_norm": 0.734375, "learning_rate": 9.57098611704023e-05, "loss": 0.967, "step": 1560 }, { "epoch": 0.16, "grad_norm": 0.78515625, "learning_rate": 9.567637839524604e-05, "loss": 1.0619, "step": 1565 }, { "epoch": 0.16, "grad_norm": 0.75390625, "learning_rate": 9.564277137119217e-05, "loss": 1.0978, "step": 1570 }, { "epoch": 0.16, "grad_norm": 0.83984375, "learning_rate": 9.56090401896585e-05, "loss": 1.1419, "step": 1575 }, { "epoch": 0.16, "grad_norm": 0.69921875, "learning_rate": 9.557518494240059e-05, "loss": 1.0186, "step": 1580 }, { "epoch": 0.16, "grad_norm": 0.7734375, "learning_rate": 9.554120572151152e-05, "loss": 1.0404, "step": 1585 }, { "epoch": 0.16, "grad_norm": 0.796875, "learning_rate": 9.550710261942155e-05, "loss": 1.0433, "step": 1590 }, { "epoch": 0.16, "grad_norm": 0.80078125, "learning_rate": 9.547287572889796e-05, "loss": 0.9927, "step": 1595 }, { "epoch": 0.16, "grad_norm": 0.828125, "learning_rate": 9.543852514304476e-05, "loss": 1.1879, "step": 1600 }, { "epoch": 0.16, "grad_norm": 1.0078125, "learning_rate": 9.54040509553024e-05, "loss": 1.1554, "step": 1605 }, { "epoch": 0.16, "grad_norm": 0.7734375, "learning_rate": 9.53694532594476e-05, "loss": 1.1748, "step": 1610 }, { "epoch": 0.16, "grad_norm": 0.6640625, "learning_rate": 9.5334732149593e-05, "loss": 1.0011, "step": 1615 }, { "epoch": 0.16, "grad_norm": 0.78515625, "learning_rate": 9.529988772018699e-05, "loss": 1.1982, "step": 1620 }, { "epoch": 0.17, "grad_norm": 0.859375, "learning_rate": 9.526492006601337e-05, "loss": 0.9936, "step": 1625 }, { "epoch": 0.17, "grad_norm": 1.2890625, "learning_rate": 9.52298292821912e-05, "loss": 1.1356, "step": 1630 }, { "epoch": 0.17, "grad_norm": 0.75390625, "learning_rate": 9.519461546417441e-05, "loss": 0.9811, "step": 1635 }, { "epoch": 0.17, "grad_norm": 0.6875, "learning_rate": 9.515927870775165e-05, "loss": 1.1542, "step": 1640 }, { "epoch": 0.17, "grad_norm": 0.7265625, "learning_rate": 9.5123819109046e-05, "loss": 1.0644, "step": 1645 }, { "epoch": 0.17, "grad_norm": 0.7109375, "learning_rate": 9.508823676451462e-05, "loss": 1.1402, "step": 1650 }, { "epoch": 0.17, "grad_norm": 1.7109375, "learning_rate": 9.505253177094864e-05, "loss": 1.076, "step": 1655 }, { "epoch": 0.17, "grad_norm": 1.1640625, "learning_rate": 9.50167042254728e-05, "loss": 1.1421, "step": 1660 }, { "epoch": 0.17, "grad_norm": 1.1953125, "learning_rate": 9.49807542255452e-05, "loss": 1.1005, "step": 1665 }, { "epoch": 0.17, "grad_norm": 0.609375, "learning_rate": 9.494468186895701e-05, "loss": 1.1378, "step": 1670 }, { "epoch": 0.17, "grad_norm": 0.875, "learning_rate": 9.49084872538323e-05, "loss": 1.0166, "step": 1675 }, { "epoch": 0.17, "grad_norm": 0.71875, "learning_rate": 9.487217047862767e-05, "loss": 1.068, "step": 1680 }, { "epoch": 0.17, "grad_norm": 0.734375, "learning_rate": 9.4835731642132e-05, "loss": 1.0219, "step": 1685 }, { "epoch": 0.17, "grad_norm": 0.81640625, "learning_rate": 9.479917084346623e-05, "loss": 1.2396, "step": 1690 }, { "epoch": 0.17, "grad_norm": 1.0859375, "learning_rate": 9.476248818208308e-05, "loss": 1.0547, "step": 1695 }, { "epoch": 0.17, "grad_norm": 0.93359375, "learning_rate": 9.472568375776669e-05, "loss": 1.2844, "step": 1700 }, { "epoch": 0.17, "grad_norm": 0.765625, "learning_rate": 9.468875767063252e-05, "loss": 1.0235, "step": 1705 }, { "epoch": 0.17, "grad_norm": 0.84765625, "learning_rate": 9.465171002112688e-05, "loss": 1.0473, "step": 1710 }, { "epoch": 0.17, "grad_norm": 0.78125, "learning_rate": 9.461454091002681e-05, "loss": 1.091, "step": 1715 }, { "epoch": 0.18, "grad_norm": 0.81640625, "learning_rate": 9.457725043843975e-05, "loss": 0.9108, "step": 1720 }, { "epoch": 0.18, "grad_norm": 0.828125, "learning_rate": 9.453983870780322e-05, "loss": 0.9136, "step": 1725 }, { "epoch": 0.18, "grad_norm": 0.8046875, "learning_rate": 9.450230581988466e-05, "loss": 1.0787, "step": 1730 }, { "epoch": 0.18, "grad_norm": 0.75390625, "learning_rate": 9.446465187678103e-05, "loss": 1.2171, "step": 1735 }, { "epoch": 0.18, "grad_norm": 0.6875, "learning_rate": 9.442687698091857e-05, "loss": 1.1358, "step": 1740 }, { "epoch": 0.18, "grad_norm": 0.73046875, "learning_rate": 9.43889812350526e-05, "loss": 1.0922, "step": 1745 }, { "epoch": 0.18, "grad_norm": 0.8359375, "learning_rate": 9.43509647422671e-05, "loss": 1.1919, "step": 1750 }, { "epoch": 0.18, "grad_norm": 0.79296875, "learning_rate": 9.431282760597458e-05, "loss": 1.1724, "step": 1755 }, { "epoch": 0.18, "grad_norm": 0.921875, "learning_rate": 9.427456992991568e-05, "loss": 1.0825, "step": 1760 }, { "epoch": 0.18, "grad_norm": 0.578125, "learning_rate": 9.423619181815893e-05, "loss": 0.9497, "step": 1765 }, { "epoch": 0.18, "grad_norm": 0.69140625, "learning_rate": 9.419769337510048e-05, "loss": 1.0628, "step": 1770 }, { "epoch": 0.18, "grad_norm": 0.765625, "learning_rate": 9.415907470546382e-05, "loss": 0.9418, "step": 1775 }, { "epoch": 0.18, "grad_norm": 0.71484375, "learning_rate": 9.412033591429947e-05, "loss": 0.9248, "step": 1780 }, { "epoch": 0.18, "grad_norm": 0.85546875, "learning_rate": 9.408147710698467e-05, "loss": 1.201, "step": 1785 }, { "epoch": 0.18, "grad_norm": 0.73046875, "learning_rate": 9.40424983892232e-05, "loss": 1.0609, "step": 1790 }, { "epoch": 0.18, "grad_norm": 1.4609375, "learning_rate": 9.400339986704494e-05, "loss": 1.1425, "step": 1795 }, { "epoch": 0.18, "grad_norm": 0.71484375, "learning_rate": 9.396418164680572e-05, "loss": 1.1577, "step": 1800 }, { "epoch": 0.18, "grad_norm": 0.76171875, "learning_rate": 9.392484383518696e-05, "loss": 0.8667, "step": 1805 }, { "epoch": 0.18, "grad_norm": 0.8671875, "learning_rate": 9.388538653919539e-05, "loss": 1.2117, "step": 1810 }, { "epoch": 0.18, "grad_norm": 0.88671875, "learning_rate": 9.384580986616273e-05, "loss": 1.155, "step": 1815 }, { "epoch": 0.19, "grad_norm": 0.84375, "learning_rate": 9.380611392374548e-05, "loss": 1.2536, "step": 1820 }, { "epoch": 0.19, "grad_norm": 0.7109375, "learning_rate": 9.376629881992454e-05, "loss": 1.075, "step": 1825 }, { "epoch": 0.19, "grad_norm": 0.7578125, "learning_rate": 9.3726364663005e-05, "loss": 1.1615, "step": 1830 }, { "epoch": 0.19, "grad_norm": 0.734375, "learning_rate": 9.368631156161571e-05, "loss": 0.97, "step": 1835 }, { "epoch": 0.19, "grad_norm": 0.76171875, "learning_rate": 9.364613962470918e-05, "loss": 0.9049, "step": 1840 }, { "epoch": 0.19, "grad_norm": 0.70703125, "learning_rate": 9.360584896156111e-05, "loss": 1.0722, "step": 1845 }, { "epoch": 0.19, "grad_norm": 0.69140625, "learning_rate": 9.356543968177015e-05, "loss": 1.0683, "step": 1850 }, { "epoch": 0.19, "grad_norm": 1.734375, "learning_rate": 9.352491189525766e-05, "loss": 1.0876, "step": 1855 }, { "epoch": 0.19, "grad_norm": 0.83984375, "learning_rate": 9.348426571226732e-05, "loss": 1.1982, "step": 1860 }, { "epoch": 0.19, "grad_norm": 0.6171875, "learning_rate": 9.344350124336491e-05, "loss": 1.0043, "step": 1865 }, { "epoch": 0.19, "grad_norm": 0.75390625, "learning_rate": 9.340261859943793e-05, "loss": 1.1613, "step": 1870 }, { "epoch": 0.19, "grad_norm": 0.875, "learning_rate": 9.336161789169536e-05, "loss": 1.155, "step": 1875 }, { "epoch": 0.19, "grad_norm": 0.8984375, "learning_rate": 9.332049923166735e-05, "loss": 1.3299, "step": 1880 }, { "epoch": 0.19, "grad_norm": 0.62109375, "learning_rate": 9.327926273120487e-05, "loss": 0.9898, "step": 1885 }, { "epoch": 0.19, "grad_norm": 1.0859375, "learning_rate": 9.323790850247949e-05, "loss": 1.0147, "step": 1890 }, { "epoch": 0.19, "grad_norm": 0.89453125, "learning_rate": 9.319643665798297e-05, "loss": 1.123, "step": 1895 }, { "epoch": 0.19, "grad_norm": 0.734375, "learning_rate": 9.315484731052706e-05, "loss": 1.0347, "step": 1900 }, { "epoch": 0.19, "grad_norm": 0.7109375, "learning_rate": 9.311314057324307e-05, "loss": 1.0325, "step": 1905 }, { "epoch": 0.19, "grad_norm": 0.87890625, "learning_rate": 9.307131655958173e-05, "loss": 1.0965, "step": 1910 }, { "epoch": 0.2, "grad_norm": 0.7265625, "learning_rate": 9.302937538331271e-05, "loss": 0.9937, "step": 1915 }, { "epoch": 0.2, "grad_norm": 0.80078125, "learning_rate": 9.29873171585244e-05, "loss": 1.0982, "step": 1920 }, { "epoch": 0.2, "grad_norm": 0.67578125, "learning_rate": 9.294514199962359e-05, "loss": 0.9308, "step": 1925 }, { "epoch": 0.2, "grad_norm": 0.71875, "learning_rate": 9.29028500213352e-05, "loss": 1.0217, "step": 1930 }, { "epoch": 0.2, "grad_norm": 0.6484375, "learning_rate": 9.286044133870185e-05, "loss": 1.0372, "step": 1935 }, { "epoch": 0.2, "grad_norm": 0.76171875, "learning_rate": 9.281791606708366e-05, "loss": 1.0679, "step": 1940 }, { "epoch": 0.2, "grad_norm": 0.83984375, "learning_rate": 9.277527432215786e-05, "loss": 1.054, "step": 1945 }, { "epoch": 0.2, "grad_norm": 0.98828125, "learning_rate": 9.273251621991858e-05, "loss": 1.1235, "step": 1950 }, { "epoch": 0.2, "grad_norm": 0.75, "learning_rate": 9.26896418766764e-05, "loss": 1.1428, "step": 1955 }, { "epoch": 0.2, "grad_norm": 0.73828125, "learning_rate": 9.264665140905813e-05, "loss": 1.0621, "step": 1960 }, { "epoch": 0.2, "grad_norm": 0.84375, "learning_rate": 9.260354493400643e-05, "loss": 0.9373, "step": 1965 }, { "epoch": 0.2, "grad_norm": 0.9765625, "learning_rate": 9.256032256877956e-05, "loss": 1.0627, "step": 1970 }, { "epoch": 0.2, "grad_norm": 0.734375, "learning_rate": 9.251698443095102e-05, "loss": 0.9658, "step": 1975 }, { "epoch": 0.2, "grad_norm": 0.81640625, "learning_rate": 9.24735306384092e-05, "loss": 1.0197, "step": 1980 }, { "epoch": 0.2, "grad_norm": 0.6875, "learning_rate": 9.242996130935716e-05, "loss": 1.0304, "step": 1985 }, { "epoch": 0.2, "grad_norm": 0.7109375, "learning_rate": 9.238627656231215e-05, "loss": 1.0051, "step": 1990 }, { "epoch": 0.2, "grad_norm": 0.9296875, "learning_rate": 9.234247651610547e-05, "loss": 1.1458, "step": 1995 }, { "epoch": 0.2, "grad_norm": 0.73828125, "learning_rate": 9.229856128988201e-05, "loss": 1.111, "step": 2000 }, { "epoch": 0.2, "grad_norm": 0.73828125, "learning_rate": 9.225453100309996e-05, "loss": 1.1457, "step": 2005 }, { "epoch": 0.2, "grad_norm": 0.83984375, "learning_rate": 9.221038577553055e-05, "loss": 0.9936, "step": 2010 }, { "epoch": 0.21, "grad_norm": 11.125, "learning_rate": 9.216612572725762e-05, "loss": 1.0155, "step": 2015 }, { "epoch": 0.21, "grad_norm": 0.74609375, "learning_rate": 9.212175097867738e-05, "loss": 1.0828, "step": 2020 }, { "epoch": 0.21, "grad_norm": 0.65625, "learning_rate": 9.207726165049801e-05, "loss": 1.0193, "step": 2025 }, { "epoch": 0.21, "grad_norm": 0.78515625, "learning_rate": 9.203265786373942e-05, "loss": 1.0699, "step": 2030 }, { "epoch": 0.21, "grad_norm": 0.875, "learning_rate": 9.198793973973285e-05, "loss": 1.0606, "step": 2035 }, { "epoch": 0.21, "grad_norm": 0.62109375, "learning_rate": 9.19431074001205e-05, "loss": 0.9172, "step": 2040 }, { "epoch": 0.21, "grad_norm": 0.7578125, "learning_rate": 9.18981609668554e-05, "loss": 1.0414, "step": 2045 }, { "epoch": 0.21, "grad_norm": 0.73046875, "learning_rate": 9.185310056220075e-05, "loss": 1.044, "step": 2050 }, { "epoch": 0.21, "grad_norm": 0.796875, "learning_rate": 9.180792630872997e-05, "loss": 1.0334, "step": 2055 }, { "epoch": 0.21, "grad_norm": 0.78125, "learning_rate": 9.1762638329326e-05, "loss": 1.1207, "step": 2060 }, { "epoch": 0.21, "grad_norm": 0.734375, "learning_rate": 9.171723674718127e-05, "loss": 1.0033, "step": 2065 }, { "epoch": 0.21, "grad_norm": 0.7578125, "learning_rate": 9.167172168579714e-05, "loss": 1.0764, "step": 2070 }, { "epoch": 0.21, "grad_norm": 0.65625, "learning_rate": 9.16260932689837e-05, "loss": 1.177, "step": 2075 }, { "epoch": 0.21, "grad_norm": 0.78125, "learning_rate": 9.158035162085941e-05, "loss": 1.0129, "step": 2080 }, { "epoch": 0.21, "grad_norm": 0.77734375, "learning_rate": 9.153449686585068e-05, "loss": 1.0506, "step": 2085 }, { "epoch": 0.21, "grad_norm": 0.80859375, "learning_rate": 9.148852912869166e-05, "loss": 1.0257, "step": 2090 }, { "epoch": 0.21, "grad_norm": 0.80078125, "learning_rate": 9.144244853442376e-05, "loss": 1.1325, "step": 2095 }, { "epoch": 0.21, "grad_norm": 1.8359375, "learning_rate": 9.139625520839548e-05, "loss": 1.1309, "step": 2100 }, { "epoch": 0.21, "grad_norm": 0.89453125, "learning_rate": 9.134994927626187e-05, "loss": 1.1229, "step": 2105 }, { "epoch": 0.21, "grad_norm": 2.5, "learning_rate": 9.130353086398435e-05, "loss": 0.9799, "step": 2110 }, { "epoch": 0.22, "grad_norm": 0.76953125, "learning_rate": 9.125700009783034e-05, "loss": 1.0075, "step": 2115 }, { "epoch": 0.22, "grad_norm": 0.703125, "learning_rate": 9.121035710437278e-05, "loss": 1.0522, "step": 2120 }, { "epoch": 0.22, "grad_norm": 0.80859375, "learning_rate": 9.116360201049e-05, "loss": 1.1396, "step": 2125 }, { "epoch": 0.22, "grad_norm": 0.76171875, "learning_rate": 9.111673494336519e-05, "loss": 1.0829, "step": 2130 }, { "epoch": 0.22, "grad_norm": 0.95703125, "learning_rate": 9.106975603048619e-05, "loss": 1.238, "step": 2135 }, { "epoch": 0.22, "grad_norm": 1.4609375, "learning_rate": 9.102266539964503e-05, "loss": 0.8204, "step": 2140 }, { "epoch": 0.22, "grad_norm": 0.6640625, "learning_rate": 9.097546317893766e-05, "loss": 0.7842, "step": 2145 }, { "epoch": 0.22, "grad_norm": 0.71875, "learning_rate": 9.092814949676358e-05, "loss": 1.0007, "step": 2150 }, { "epoch": 0.22, "grad_norm": 0.81640625, "learning_rate": 9.08807244818255e-05, "loss": 1.1469, "step": 2155 }, { "epoch": 0.22, "grad_norm": 0.7578125, "learning_rate": 9.083318826312893e-05, "loss": 1.0594, "step": 2160 }, { "epoch": 0.22, "grad_norm": 0.62890625, "learning_rate": 9.078554096998188e-05, "loss": 0.9549, "step": 2165 }, { "epoch": 0.22, "grad_norm": 0.765625, "learning_rate": 9.07377827319946e-05, "loss": 1.0158, "step": 2170 }, { "epoch": 0.22, "grad_norm": 0.71484375, "learning_rate": 9.068991367907902e-05, "loss": 1.0672, "step": 2175 }, { "epoch": 0.22, "grad_norm": 1.078125, "learning_rate": 9.064193394144857e-05, "loss": 0.9645, "step": 2180 }, { "epoch": 0.22, "grad_norm": 0.69921875, "learning_rate": 9.059384364961771e-05, "loss": 0.989, "step": 2185 }, { "epoch": 0.22, "grad_norm": 0.69921875, "learning_rate": 9.054564293440172e-05, "loss": 1.003, "step": 2190 }, { "epoch": 0.22, "grad_norm": 0.6875, "learning_rate": 9.049733192691617e-05, "loss": 0.8656, "step": 2195 }, { "epoch": 0.22, "grad_norm": 0.7734375, "learning_rate": 9.044891075857667e-05, "loss": 1.1505, "step": 2200 }, { "epoch": 0.22, "grad_norm": 0.7734375, "learning_rate": 9.04003795610985e-05, "loss": 1.0423, "step": 2205 }, { "epoch": 0.23, "grad_norm": 0.70703125, "learning_rate": 9.035173846649622e-05, "loss": 1.2939, "step": 2210 }, { "epoch": 0.23, "grad_norm": 0.875, "learning_rate": 9.03029876070834e-05, "loss": 0.975, "step": 2215 }, { "epoch": 0.23, "grad_norm": 0.84375, "learning_rate": 9.025412711547211e-05, "loss": 1.0369, "step": 2220 }, { "epoch": 0.23, "grad_norm": 0.74609375, "learning_rate": 9.020515712457267e-05, "loss": 0.9975, "step": 2225 }, { "epoch": 0.23, "grad_norm": 0.80078125, "learning_rate": 9.015607776759328e-05, "loss": 0.9609, "step": 2230 }, { "epoch": 0.23, "grad_norm": 0.76953125, "learning_rate": 9.010688917803958e-05, "loss": 0.979, "step": 2235 }, { "epoch": 0.23, "grad_norm": 0.6640625, "learning_rate": 9.005759148971445e-05, "loss": 0.9703, "step": 2240 }, { "epoch": 0.23, "grad_norm": 0.703125, "learning_rate": 9.000818483671741e-05, "loss": 0.9845, "step": 2245 }, { "epoch": 0.23, "grad_norm": 0.6484375, "learning_rate": 8.99586693534445e-05, "loss": 1.0463, "step": 2250 }, { "epoch": 0.23, "grad_norm": 0.8671875, "learning_rate": 8.990904517458774e-05, "loss": 1.0171, "step": 2255 }, { "epoch": 0.23, "grad_norm": 0.76953125, "learning_rate": 8.98593124351348e-05, "loss": 1.0348, "step": 2260 }, { "epoch": 0.23, "grad_norm": 0.7109375, "learning_rate": 8.980947127036876e-05, "loss": 0.9866, "step": 2265 }, { "epoch": 0.23, "grad_norm": 0.7421875, "learning_rate": 8.975952181586751e-05, "loss": 1.0997, "step": 2270 }, { "epoch": 0.23, "grad_norm": 0.74609375, "learning_rate": 8.970946420750362e-05, "loss": 1.0921, "step": 2275 }, { "epoch": 0.23, "grad_norm": 0.828125, "learning_rate": 8.965929858144375e-05, "loss": 1.1113, "step": 2280 }, { "epoch": 0.23, "grad_norm": 0.671875, "learning_rate": 8.96090250741485e-05, "loss": 0.9056, "step": 2285 }, { "epoch": 0.23, "grad_norm": 0.70703125, "learning_rate": 8.955864382237187e-05, "loss": 1.0231, "step": 2290 }, { "epoch": 0.23, "grad_norm": 0.71484375, "learning_rate": 8.950815496316094e-05, "loss": 1.0442, "step": 2295 }, { "epoch": 0.23, "grad_norm": 0.765625, "learning_rate": 8.945755863385552e-05, "loss": 1.0367, "step": 2300 }, { "epoch": 0.23, "grad_norm": 0.7109375, "learning_rate": 8.940685497208779e-05, "loss": 0.9599, "step": 2305 }, { "epoch": 0.24, "grad_norm": 0.89453125, "learning_rate": 8.935604411578182e-05, "loss": 1.0799, "step": 2310 }, { "epoch": 0.24, "grad_norm": 0.7109375, "learning_rate": 8.930512620315337e-05, "loss": 1.2683, "step": 2315 }, { "epoch": 0.24, "grad_norm": 0.66796875, "learning_rate": 8.92541013727093e-05, "loss": 0.9654, "step": 2320 }, { "epoch": 0.24, "grad_norm": 0.75390625, "learning_rate": 8.92029697632474e-05, "loss": 1.0312, "step": 2325 }, { "epoch": 0.24, "grad_norm": 0.59375, "learning_rate": 8.915173151385589e-05, "loss": 1.0048, "step": 2330 }, { "epoch": 0.24, "grad_norm": 2.65625, "learning_rate": 8.910038676391308e-05, "loss": 1.0796, "step": 2335 }, { "epoch": 0.24, "grad_norm": 0.7109375, "learning_rate": 8.904893565308696e-05, "loss": 0.9861, "step": 2340 }, { "epoch": 0.24, "grad_norm": 0.71875, "learning_rate": 8.899737832133486e-05, "loss": 0.9987, "step": 2345 }, { "epoch": 0.24, "grad_norm": 0.6953125, "learning_rate": 8.894571490890303e-05, "loss": 0.9757, "step": 2350 }, { "epoch": 0.24, "grad_norm": 1.0703125, "learning_rate": 8.889394555632633e-05, "loss": 1.1203, "step": 2355 }, { "epoch": 0.24, "grad_norm": 0.984375, "learning_rate": 8.884207040442773e-05, "loss": 1.0323, "step": 2360 }, { "epoch": 0.24, "grad_norm": 0.70703125, "learning_rate": 8.87900895943181e-05, "loss": 1.03, "step": 2365 }, { "epoch": 0.24, "grad_norm": 0.6484375, "learning_rate": 8.873800326739558e-05, "loss": 0.9085, "step": 2370 }, { "epoch": 0.24, "grad_norm": 0.77734375, "learning_rate": 8.868581156534544e-05, "loss": 1.204, "step": 2375 }, { "epoch": 0.24, "grad_norm": 0.94921875, "learning_rate": 8.863351463013959e-05, "loss": 1.2113, "step": 2380 }, { "epoch": 0.24, "grad_norm": 0.6328125, "learning_rate": 8.85811126040361e-05, "loss": 1.1689, "step": 2385 }, { "epoch": 0.24, "grad_norm": 0.6328125, "learning_rate": 8.852860562957906e-05, "loss": 0.9359, "step": 2390 }, { "epoch": 0.24, "grad_norm": 0.8828125, "learning_rate": 8.847599384959791e-05, "loss": 1.2104, "step": 2395 }, { "epoch": 0.24, "grad_norm": 0.76953125, "learning_rate": 8.842327740720722e-05, "loss": 1.2473, "step": 2400 }, { "epoch": 0.24, "grad_norm": 0.62109375, "learning_rate": 8.83704564458063e-05, "loss": 1.1854, "step": 2405 }, { "epoch": 0.25, "grad_norm": 0.69140625, "learning_rate": 8.831753110907873e-05, "loss": 1.0163, "step": 2410 }, { "epoch": 0.25, "grad_norm": 0.78125, "learning_rate": 8.826450154099203e-05, "loss": 1.0576, "step": 2415 }, { "epoch": 0.25, "grad_norm": 0.87109375, "learning_rate": 8.821136788579725e-05, "loss": 1.2287, "step": 2420 }, { "epoch": 0.25, "grad_norm": 0.6953125, "learning_rate": 8.815813028802855e-05, "loss": 1.0083, "step": 2425 }, { "epoch": 0.25, "grad_norm": 0.75, "learning_rate": 8.810478889250289e-05, "loss": 0.9742, "step": 2430 }, { "epoch": 0.25, "grad_norm": 0.72265625, "learning_rate": 8.805134384431954e-05, "loss": 1.1751, "step": 2435 }, { "epoch": 0.25, "grad_norm": 0.82421875, "learning_rate": 8.799779528885973e-05, "loss": 1.0532, "step": 2440 }, { "epoch": 0.25, "grad_norm": 0.68359375, "learning_rate": 8.794414337178626e-05, "loss": 0.9992, "step": 2445 }, { "epoch": 0.25, "grad_norm": 0.71875, "learning_rate": 8.789038823904307e-05, "loss": 1.0536, "step": 2450 }, { "epoch": 0.25, "grad_norm": 0.85546875, "learning_rate": 8.783653003685493e-05, "loss": 1.0741, "step": 2455 }, { "epoch": 0.25, "grad_norm": 0.72265625, "learning_rate": 8.778256891172691e-05, "loss": 1.1178, "step": 2460 }, { "epoch": 0.25, "grad_norm": 0.76953125, "learning_rate": 8.772850501044408e-05, "loss": 1.2603, "step": 2465 }, { "epoch": 0.25, "grad_norm": 0.7109375, "learning_rate": 8.767433848007107e-05, "loss": 1.0352, "step": 2470 }, { "epoch": 0.25, "grad_norm": 0.74609375, "learning_rate": 8.762006946795171e-05, "loss": 1.0574, "step": 2475 }, { "epoch": 0.25, "grad_norm": 0.8671875, "learning_rate": 8.756569812170859e-05, "loss": 1.1272, "step": 2480 }, { "epoch": 0.25, "grad_norm": 0.7265625, "learning_rate": 8.751122458924263e-05, "loss": 1.2053, "step": 2485 }, { "epoch": 0.25, "grad_norm": 0.69921875, "learning_rate": 8.745664901873276e-05, "loss": 0.9419, "step": 2490 }, { "epoch": 0.25, "grad_norm": 0.70703125, "learning_rate": 8.740197155863548e-05, "loss": 1.0645, "step": 2495 }, { "epoch": 0.25, "grad_norm": 0.72265625, "learning_rate": 8.734719235768442e-05, "loss": 1.1206, "step": 2500 }, { "epoch": 0.26, "grad_norm": 0.92578125, "learning_rate": 8.729231156488997e-05, "loss": 1.0041, "step": 2505 }, { "epoch": 0.26, "grad_norm": 0.78515625, "learning_rate": 8.723732932953889e-05, "loss": 1.0053, "step": 2510 }, { "epoch": 0.26, "grad_norm": 2.953125, "learning_rate": 8.718224580119387e-05, "loss": 1.0583, "step": 2515 }, { "epoch": 0.26, "grad_norm": 0.734375, "learning_rate": 8.712706112969314e-05, "loss": 1.008, "step": 2520 }, { "epoch": 0.26, "grad_norm": 0.69140625, "learning_rate": 8.707177546515007e-05, "loss": 1.1302, "step": 2525 }, { "epoch": 0.26, "grad_norm": 0.7734375, "learning_rate": 8.701638895795271e-05, "loss": 1.006, "step": 2530 }, { "epoch": 0.26, "grad_norm": 0.78125, "learning_rate": 8.696090175876348e-05, "loss": 1.1817, "step": 2535 }, { "epoch": 0.26, "grad_norm": 0.6796875, "learning_rate": 8.690531401851866e-05, "loss": 1.1655, "step": 2540 }, { "epoch": 0.26, "grad_norm": 0.75, "learning_rate": 8.684962588842805e-05, "loss": 1.0119, "step": 2545 }, { "epoch": 0.26, "grad_norm": 0.7890625, "learning_rate": 8.679383751997452e-05, "loss": 1.0204, "step": 2550 }, { "epoch": 0.26, "grad_norm": 0.76171875, "learning_rate": 8.673794906491358e-05, "loss": 1.1571, "step": 2555 }, { "epoch": 0.26, "grad_norm": 0.890625, "learning_rate": 8.668196067527306e-05, "loss": 1.3189, "step": 2560 }, { "epoch": 0.26, "grad_norm": 0.84375, "learning_rate": 8.662587250335256e-05, "loss": 1.0633, "step": 2565 }, { "epoch": 0.26, "grad_norm": 0.7265625, "learning_rate": 8.656968470172315e-05, "loss": 1.0841, "step": 2570 }, { "epoch": 0.26, "grad_norm": 0.6953125, "learning_rate": 8.65133974232269e-05, "loss": 1.1659, "step": 2575 }, { "epoch": 0.26, "grad_norm": 0.92578125, "learning_rate": 8.645701082097651e-05, "loss": 1.0921, "step": 2580 }, { "epoch": 0.26, "grad_norm": 0.6484375, "learning_rate": 8.64005250483548e-05, "loss": 0.9961, "step": 2585 }, { "epoch": 0.26, "grad_norm": 0.63671875, "learning_rate": 8.63439402590144e-05, "loss": 1.0388, "step": 2590 }, { "epoch": 0.26, "grad_norm": 0.71875, "learning_rate": 8.628725660687726e-05, "loss": 1.0681, "step": 2595 }, { "epoch": 0.26, "grad_norm": 0.8203125, "learning_rate": 8.623047424613427e-05, "loss": 1.2429, "step": 2600 }, { "epoch": 0.27, "grad_norm": 0.88671875, "learning_rate": 8.617359333124484e-05, "loss": 1.1379, "step": 2605 }, { "epoch": 0.27, "grad_norm": 0.6640625, "learning_rate": 8.611661401693643e-05, "loss": 1.0119, "step": 2610 }, { "epoch": 0.27, "grad_norm": 0.94921875, "learning_rate": 8.605953645820419e-05, "loss": 1.1663, "step": 2615 }, { "epoch": 0.27, "grad_norm": 0.82421875, "learning_rate": 8.600236081031052e-05, "loss": 1.3278, "step": 2620 }, { "epoch": 0.27, "grad_norm": 0.73046875, "learning_rate": 8.594508722878462e-05, "loss": 1.0438, "step": 2625 }, { "epoch": 0.27, "grad_norm": 0.8203125, "learning_rate": 8.588771586942213e-05, "loss": 1.1375, "step": 2630 }, { "epoch": 0.27, "grad_norm": 0.74609375, "learning_rate": 8.583024688828462e-05, "loss": 1.0629, "step": 2635 }, { "epoch": 0.27, "grad_norm": 0.70703125, "learning_rate": 8.577268044169922e-05, "loss": 1.0521, "step": 2640 }, { "epoch": 0.27, "grad_norm": 0.7734375, "learning_rate": 8.571501668625821e-05, "loss": 1.116, "step": 2645 }, { "epoch": 0.27, "grad_norm": 0.6953125, "learning_rate": 8.565725577881856e-05, "loss": 0.9471, "step": 2650 }, { "epoch": 0.27, "grad_norm": 9.0, "learning_rate": 8.559939787650149e-05, "loss": 1.0076, "step": 2655 }, { "epoch": 0.27, "grad_norm": 0.72265625, "learning_rate": 8.554144313669207e-05, "loss": 1.2238, "step": 2660 }, { "epoch": 0.27, "grad_norm": 0.71484375, "learning_rate": 8.548339171703883e-05, "loss": 1.1868, "step": 2665 }, { "epoch": 0.27, "grad_norm": 0.7421875, "learning_rate": 8.542524377545325e-05, "loss": 1.1218, "step": 2670 }, { "epoch": 0.27, "grad_norm": 1.0078125, "learning_rate": 8.536699947010937e-05, "loss": 1.0396, "step": 2675 }, { "epoch": 0.27, "grad_norm": 0.63671875, "learning_rate": 8.530865895944337e-05, "loss": 0.9681, "step": 2680 }, { "epoch": 0.27, "grad_norm": 0.703125, "learning_rate": 8.525022240215314e-05, "loss": 1.0103, "step": 2685 }, { "epoch": 0.27, "grad_norm": 0.74609375, "learning_rate": 8.51916899571978e-05, "loss": 1.109, "step": 2690 }, { "epoch": 0.27, "grad_norm": 1.0625, "learning_rate": 8.513306178379732e-05, "loss": 1.0134, "step": 2695 }, { "epoch": 0.27, "grad_norm": 0.80859375, "learning_rate": 8.507433804143208e-05, "loss": 1.0635, "step": 2700 }, { "epoch": 0.28, "grad_norm": 0.8359375, "learning_rate": 8.501551888984241e-05, "loss": 1.0984, "step": 2705 }, { "epoch": 0.28, "grad_norm": 0.71484375, "learning_rate": 8.495660448902823e-05, "loss": 1.0672, "step": 2710 }, { "epoch": 0.28, "grad_norm": 0.78515625, "learning_rate": 8.489759499924845e-05, "loss": 1.0909, "step": 2715 }, { "epoch": 0.28, "grad_norm": 0.69921875, "learning_rate": 8.483849058102073e-05, "loss": 1.0751, "step": 2720 }, { "epoch": 0.28, "grad_norm": 0.7265625, "learning_rate": 8.477929139512093e-05, "loss": 0.9671, "step": 2725 }, { "epoch": 0.28, "grad_norm": 0.71484375, "learning_rate": 8.471999760258268e-05, "loss": 1.1997, "step": 2730 }, { "epoch": 0.28, "grad_norm": 0.67578125, "learning_rate": 8.466060936469697e-05, "loss": 1.013, "step": 2735 }, { "epoch": 0.28, "grad_norm": 0.7421875, "learning_rate": 8.460112684301172e-05, "loss": 1.0004, "step": 2740 }, { "epoch": 0.28, "grad_norm": 0.8046875, "learning_rate": 8.454155019933127e-05, "loss": 1.3402, "step": 2745 }, { "epoch": 0.28, "grad_norm": 2.203125, "learning_rate": 8.448187959571607e-05, "loss": 1.198, "step": 2750 }, { "epoch": 0.28, "grad_norm": 0.76171875, "learning_rate": 8.442211519448209e-05, "loss": 1.1905, "step": 2755 }, { "epoch": 0.28, "grad_norm": 0.80859375, "learning_rate": 8.436225715820046e-05, "loss": 1.0614, "step": 2760 }, { "epoch": 0.28, "grad_norm": 0.8203125, "learning_rate": 8.430230564969702e-05, "loss": 1.0983, "step": 2765 }, { "epoch": 0.28, "grad_norm": 0.6171875, "learning_rate": 8.424226083205192e-05, "loss": 1.0469, "step": 2770 }, { "epoch": 0.28, "grad_norm": 0.8203125, "learning_rate": 8.418212286859904e-05, "loss": 1.1048, "step": 2775 }, { "epoch": 0.28, "grad_norm": 0.76953125, "learning_rate": 8.412189192292572e-05, "loss": 0.9553, "step": 2780 }, { "epoch": 0.28, "grad_norm": 0.8515625, "learning_rate": 8.406156815887221e-05, "loss": 1.0917, "step": 2785 }, { "epoch": 0.28, "grad_norm": 0.8125, "learning_rate": 8.400115174053119e-05, "loss": 0.9901, "step": 2790 }, { "epoch": 0.28, "grad_norm": 0.6953125, "learning_rate": 8.394064283224743e-05, "loss": 1.1655, "step": 2795 }, { "epoch": 0.29, "grad_norm": 0.7890625, "learning_rate": 8.388004159861731e-05, "loss": 0.9846, "step": 2800 }, { "epoch": 0.29, "grad_norm": 0.8515625, "learning_rate": 8.381934820448829e-05, "loss": 1.1505, "step": 2805 }, { "epoch": 0.29, "grad_norm": 0.609375, "learning_rate": 8.375856281495856e-05, "loss": 0.8496, "step": 2810 }, { "epoch": 0.29, "grad_norm": 1.34375, "learning_rate": 8.369768559537657e-05, "loss": 1.0266, "step": 2815 }, { "epoch": 0.29, "grad_norm": 0.74609375, "learning_rate": 8.363671671134054e-05, "loss": 1.0651, "step": 2820 }, { "epoch": 0.29, "grad_norm": 0.6015625, "learning_rate": 8.357565632869804e-05, "loss": 1.0375, "step": 2825 }, { "epoch": 0.29, "grad_norm": 1.234375, "learning_rate": 8.351450461354556e-05, "loss": 1.0987, "step": 2830 }, { "epoch": 0.29, "grad_norm": 3.21875, "learning_rate": 8.345326173222799e-05, "loss": 1.0204, "step": 2835 }, { "epoch": 0.29, "grad_norm": 0.76171875, "learning_rate": 8.339192785133824e-05, "loss": 0.8794, "step": 2840 }, { "epoch": 0.29, "grad_norm": 0.75390625, "learning_rate": 8.333050313771677e-05, "loss": 1.0895, "step": 2845 }, { "epoch": 0.29, "grad_norm": 0.64453125, "learning_rate": 8.326898775845108e-05, "loss": 1.0476, "step": 2850 }, { "epoch": 0.29, "grad_norm": 0.8359375, "learning_rate": 8.320738188087533e-05, "loss": 1.1581, "step": 2855 }, { "epoch": 0.29, "grad_norm": 0.734375, "learning_rate": 8.314568567256986e-05, "loss": 1.0111, "step": 2860 }, { "epoch": 0.29, "grad_norm": 0.7890625, "learning_rate": 8.30838993013607e-05, "loss": 0.9779, "step": 2865 }, { "epoch": 0.29, "grad_norm": 0.7421875, "learning_rate": 8.302202293531915e-05, "loss": 0.9491, "step": 2870 }, { "epoch": 0.29, "grad_norm": 0.76171875, "learning_rate": 8.296005674276133e-05, "loss": 1.1357, "step": 2875 }, { "epoch": 0.29, "grad_norm": 0.6796875, "learning_rate": 8.289800089224768e-05, "loss": 0.9299, "step": 2880 }, { "epoch": 0.29, "grad_norm": 0.71875, "learning_rate": 8.283585555258256e-05, "loss": 1.0777, "step": 2885 }, { "epoch": 0.29, "grad_norm": 0.7109375, "learning_rate": 8.277362089281371e-05, "loss": 1.0231, "step": 2890 }, { "epoch": 0.29, "grad_norm": 0.71875, "learning_rate": 8.27112970822319e-05, "loss": 1.1078, "step": 2895 }, { "epoch": 0.3, "grad_norm": 0.78125, "learning_rate": 8.264888429037039e-05, "loss": 1.106, "step": 2900 }, { "epoch": 0.3, "grad_norm": 0.87890625, "learning_rate": 8.258638268700443e-05, "loss": 1.1014, "step": 2905 }, { "epoch": 0.3, "grad_norm": 0.73046875, "learning_rate": 8.252379244215094e-05, "loss": 1.0533, "step": 2910 }, { "epoch": 0.3, "grad_norm": 0.6953125, "learning_rate": 8.246111372606789e-05, "loss": 0.9655, "step": 2915 }, { "epoch": 0.3, "grad_norm": 0.71484375, "learning_rate": 8.239834670925396e-05, "loss": 1.0225, "step": 2920 }, { "epoch": 0.3, "grad_norm": 0.72265625, "learning_rate": 8.2335491562448e-05, "loss": 1.0847, "step": 2925 }, { "epoch": 0.3, "grad_norm": 0.78515625, "learning_rate": 8.227254845662861e-05, "loss": 1.1701, "step": 2930 }, { "epoch": 0.3, "grad_norm": 0.74609375, "learning_rate": 8.220951756301364e-05, "loss": 1.147, "step": 2935 }, { "epoch": 0.3, "grad_norm": 0.71484375, "learning_rate": 8.214639905305974e-05, "loss": 1.1362, "step": 2940 }, { "epoch": 0.3, "grad_norm": 0.76171875, "learning_rate": 8.208319309846188e-05, "loss": 1.1103, "step": 2945 }, { "epoch": 0.3, "grad_norm": 0.87109375, "learning_rate": 8.201989987115296e-05, "loss": 0.927, "step": 2950 }, { "epoch": 0.3, "grad_norm": 0.62890625, "learning_rate": 8.19565195433032e-05, "loss": 0.9902, "step": 2955 }, { "epoch": 0.3, "grad_norm": 0.6875, "learning_rate": 8.18930522873198e-05, "loss": 0.9363, "step": 2960 }, { "epoch": 0.3, "grad_norm": 0.78515625, "learning_rate": 8.182949827584641e-05, "loss": 1.1759, "step": 2965 }, { "epoch": 0.3, "grad_norm": 0.76171875, "learning_rate": 8.176585768176266e-05, "loss": 1.0668, "step": 2970 }, { "epoch": 0.3, "grad_norm": 0.75390625, "learning_rate": 8.170213067818371e-05, "loss": 1.035, "step": 2975 }, { "epoch": 0.3, "grad_norm": 0.7578125, "learning_rate": 8.16383174384598e-05, "loss": 0.9191, "step": 2980 }, { "epoch": 0.3, "grad_norm": 0.796875, "learning_rate": 8.157441813617568e-05, "loss": 1.146, "step": 2985 }, { "epoch": 0.3, "grad_norm": 0.7265625, "learning_rate": 8.15104329451503e-05, "loss": 0.9065, "step": 2990 }, { "epoch": 0.31, "grad_norm": 0.76953125, "learning_rate": 8.144636203943616e-05, "loss": 1.0684, "step": 2995 }, { "epoch": 0.31, "grad_norm": 0.953125, "learning_rate": 8.138220559331895e-05, "loss": 1.1734, "step": 3000 }, { "epoch": 0.31, "grad_norm": 0.81640625, "learning_rate": 8.131796378131709e-05, "loss": 1.0927, "step": 3005 }, { "epoch": 0.31, "grad_norm": 0.6328125, "learning_rate": 8.125363677818114e-05, "loss": 0.9621, "step": 3010 }, { "epoch": 0.31, "grad_norm": 0.6953125, "learning_rate": 8.118922475889346e-05, "loss": 1.1715, "step": 3015 }, { "epoch": 0.31, "grad_norm": 0.78125, "learning_rate": 8.112472789866763e-05, "loss": 1.1307, "step": 3020 }, { "epoch": 0.31, "grad_norm": 0.78515625, "learning_rate": 8.106014637294801e-05, "loss": 1.1674, "step": 3025 }, { "epoch": 0.31, "grad_norm": 0.80078125, "learning_rate": 8.099548035740932e-05, "loss": 0.9488, "step": 3030 }, { "epoch": 0.31, "grad_norm": 0.76953125, "learning_rate": 8.093073002795605e-05, "loss": 1.1273, "step": 3035 }, { "epoch": 0.31, "grad_norm": 0.66015625, "learning_rate": 8.086589556072208e-05, "loss": 1.0848, "step": 3040 }, { "epoch": 0.31, "grad_norm": 3.421875, "learning_rate": 8.080097713207011e-05, "loss": 1.0033, "step": 3045 }, { "epoch": 0.31, "grad_norm": 0.828125, "learning_rate": 8.073597491859132e-05, "loss": 1.0488, "step": 3050 }, { "epoch": 0.31, "grad_norm": 0.73046875, "learning_rate": 8.067088909710473e-05, "loss": 0.97, "step": 3055 }, { "epoch": 0.31, "grad_norm": 0.73046875, "learning_rate": 8.060571984465679e-05, "loss": 1.0538, "step": 3060 }, { "epoch": 0.31, "grad_norm": 0.67578125, "learning_rate": 8.054046733852095e-05, "loss": 1.1165, "step": 3065 }, { "epoch": 0.31, "grad_norm": 0.74609375, "learning_rate": 8.047513175619708e-05, "loss": 1.0492, "step": 3070 }, { "epoch": 0.31, "grad_norm": 0.87890625, "learning_rate": 8.040971327541105e-05, "loss": 1.0117, "step": 3075 }, { "epoch": 0.31, "grad_norm": 0.796875, "learning_rate": 8.034421207411423e-05, "loss": 1.119, "step": 3080 }, { "epoch": 0.31, "grad_norm": 0.80859375, "learning_rate": 8.0278628330483e-05, "loss": 1.1995, "step": 3085 }, { "epoch": 0.31, "grad_norm": 0.77734375, "learning_rate": 8.021296222291827e-05, "loss": 1.023, "step": 3090 }, { "epoch": 0.32, "grad_norm": 0.796875, "learning_rate": 8.014721393004506e-05, "loss": 0.9752, "step": 3095 }, { "epoch": 0.32, "grad_norm": 0.78515625, "learning_rate": 8.008138363071184e-05, "loss": 1.1086, "step": 3100 }, { "epoch": 0.32, "grad_norm": 0.7734375, "learning_rate": 8.001547150399023e-05, "loss": 1.1296, "step": 3105 }, { "epoch": 0.32, "grad_norm": 0.9453125, "learning_rate": 7.99494777291744e-05, "loss": 1.0221, "step": 3110 }, { "epoch": 0.32, "grad_norm": 0.83984375, "learning_rate": 7.988340248578066e-05, "loss": 0.8845, "step": 3115 }, { "epoch": 0.32, "grad_norm": 0.97265625, "learning_rate": 7.981724595354687e-05, "loss": 1.1321, "step": 3120 }, { "epoch": 0.32, "grad_norm": 0.84375, "learning_rate": 7.97510083124321e-05, "loss": 1.0893, "step": 3125 }, { "epoch": 0.32, "grad_norm": 0.75390625, "learning_rate": 7.968468974261596e-05, "loss": 1.1259, "step": 3130 }, { "epoch": 0.32, "grad_norm": 0.80859375, "learning_rate": 7.961829042449825e-05, "loss": 1.0753, "step": 3135 }, { "epoch": 0.32, "grad_norm": 0.8203125, "learning_rate": 7.955181053869841e-05, "loss": 1.0884, "step": 3140 }, { "epoch": 0.32, "grad_norm": 0.66796875, "learning_rate": 7.948525026605506e-05, "loss": 1.0389, "step": 3145 }, { "epoch": 0.32, "grad_norm": 0.640625, "learning_rate": 7.941860978762548e-05, "loss": 0.9641, "step": 3150 }, { "epoch": 0.32, "grad_norm": 0.6875, "learning_rate": 7.935188928468508e-05, "loss": 1.0516, "step": 3155 }, { "epoch": 0.32, "grad_norm": 0.734375, "learning_rate": 7.928508893872701e-05, "loss": 1.0447, "step": 3160 }, { "epoch": 0.32, "grad_norm": 0.82421875, "learning_rate": 7.921820893146162e-05, "loss": 0.9953, "step": 3165 }, { "epoch": 0.32, "grad_norm": 0.67578125, "learning_rate": 7.915124944481589e-05, "loss": 1.0326, "step": 3170 }, { "epoch": 0.32, "grad_norm": 0.73046875, "learning_rate": 7.908421066093305e-05, "loss": 1.026, "step": 3175 }, { "epoch": 0.32, "grad_norm": 0.93359375, "learning_rate": 7.9017092762172e-05, "loss": 1.1395, "step": 3180 }, { "epoch": 0.32, "grad_norm": 1.0546875, "learning_rate": 7.894989593110688e-05, "loss": 1.0432, "step": 3185 }, { "epoch": 0.32, "grad_norm": 1.59375, "learning_rate": 7.888262035052656e-05, "loss": 1.1498, "step": 3190 }, { "epoch": 0.33, "grad_norm": 0.80078125, "learning_rate": 7.881526620343405e-05, "loss": 1.1971, "step": 3195 }, { "epoch": 0.33, "grad_norm": 0.73828125, "learning_rate": 7.874783367304612e-05, "loss": 1.0781, "step": 3200 }, { "epoch": 0.33, "grad_norm": 0.8515625, "learning_rate": 7.868032294279279e-05, "loss": 1.1643, "step": 3205 }, { "epoch": 0.33, "grad_norm": 0.8125, "learning_rate": 7.861273419631674e-05, "loss": 1.0102, "step": 3210 }, { "epoch": 0.33, "grad_norm": 0.78125, "learning_rate": 7.854506761747291e-05, "loss": 0.9955, "step": 3215 }, { "epoch": 0.33, "grad_norm": 0.75390625, "learning_rate": 7.847732339032796e-05, "loss": 1.0998, "step": 3220 }, { "epoch": 0.33, "grad_norm": 0.80078125, "learning_rate": 7.840950169915973e-05, "loss": 0.9991, "step": 3225 }, { "epoch": 0.33, "grad_norm": 0.80078125, "learning_rate": 7.834160272845681e-05, "loss": 1.0961, "step": 3230 }, { "epoch": 0.33, "grad_norm": 0.68359375, "learning_rate": 7.827362666291802e-05, "loss": 1.0051, "step": 3235 }, { "epoch": 0.33, "grad_norm": 0.85546875, "learning_rate": 7.820557368745185e-05, "loss": 1.1509, "step": 3240 }, { "epoch": 0.33, "grad_norm": 0.69921875, "learning_rate": 7.813744398717603e-05, "loss": 1.0741, "step": 3245 }, { "epoch": 0.33, "grad_norm": 0.7421875, "learning_rate": 7.8069237747417e-05, "loss": 0.913, "step": 3250 }, { "epoch": 0.33, "grad_norm": 0.7734375, "learning_rate": 7.800095515370938e-05, "loss": 1.0218, "step": 3255 }, { "epoch": 0.33, "grad_norm": 0.83203125, "learning_rate": 7.793259639179551e-05, "loss": 1.1609, "step": 3260 }, { "epoch": 0.33, "grad_norm": 0.7265625, "learning_rate": 7.786416164762492e-05, "loss": 1.0585, "step": 3265 }, { "epoch": 0.33, "grad_norm": 0.87890625, "learning_rate": 7.779565110735378e-05, "loss": 1.2063, "step": 3270 }, { "epoch": 0.33, "grad_norm": 0.7421875, "learning_rate": 7.772706495734454e-05, "loss": 1.0923, "step": 3275 }, { "epoch": 0.33, "grad_norm": 0.85546875, "learning_rate": 7.765840338416523e-05, "loss": 1.1602, "step": 3280 }, { "epoch": 0.33, "grad_norm": 0.75, "learning_rate": 7.758966657458908e-05, "loss": 1.1422, "step": 3285 }, { "epoch": 0.34, "grad_norm": 0.6796875, "learning_rate": 7.752085471559399e-05, "loss": 1.0115, "step": 3290 }, { "epoch": 0.34, "grad_norm": 0.86328125, "learning_rate": 7.7451967994362e-05, "loss": 1.027, "step": 3295 }, { "epoch": 0.34, "grad_norm": 0.83203125, "learning_rate": 7.738300659827877e-05, "loss": 1.0176, "step": 3300 }, { "epoch": 0.34, "grad_norm": 1.2265625, "learning_rate": 7.731397071493315e-05, "loss": 1.1252, "step": 3305 }, { "epoch": 0.34, "grad_norm": 0.72265625, "learning_rate": 7.724486053211652e-05, "loss": 0.9394, "step": 3310 }, { "epoch": 0.34, "grad_norm": 0.72265625, "learning_rate": 7.717567623782246e-05, "loss": 1.0385, "step": 3315 }, { "epoch": 0.34, "grad_norm": 0.8515625, "learning_rate": 7.710641802024608e-05, "loss": 0.9274, "step": 3320 }, { "epoch": 0.34, "grad_norm": 0.69921875, "learning_rate": 7.703708606778361e-05, "loss": 1.0951, "step": 3325 }, { "epoch": 0.34, "grad_norm": 0.875, "learning_rate": 7.696768056903186e-05, "loss": 1.1682, "step": 3330 }, { "epoch": 0.34, "grad_norm": 0.7890625, "learning_rate": 7.689820171278769e-05, "loss": 1.0186, "step": 3335 }, { "epoch": 0.34, "grad_norm": 0.8125, "learning_rate": 7.682864968804748e-05, "loss": 1.1767, "step": 3340 }, { "epoch": 0.34, "grad_norm": 0.859375, "learning_rate": 7.675902468400668e-05, "loss": 1.2458, "step": 3345 }, { "epoch": 0.34, "grad_norm": 0.8359375, "learning_rate": 7.668932689005925e-05, "loss": 1.1877, "step": 3350 }, { "epoch": 0.34, "grad_norm": 0.84375, "learning_rate": 7.661955649579712e-05, "loss": 1.0722, "step": 3355 }, { "epoch": 0.34, "grad_norm": 0.7109375, "learning_rate": 7.65497136910098e-05, "loss": 1.0606, "step": 3360 }, { "epoch": 0.34, "grad_norm": 0.765625, "learning_rate": 7.647979866568365e-05, "loss": 0.9516, "step": 3365 }, { "epoch": 0.34, "grad_norm": 0.96875, "learning_rate": 7.640981161000157e-05, "loss": 1.2059, "step": 3370 }, { "epoch": 0.34, "grad_norm": 0.62109375, "learning_rate": 7.633975271434236e-05, "loss": 1.0378, "step": 3375 }, { "epoch": 0.34, "grad_norm": 0.7890625, "learning_rate": 7.626962216928025e-05, "loss": 0.9668, "step": 3380 }, { "epoch": 0.34, "grad_norm": 0.7265625, "learning_rate": 7.619942016558434e-05, "loss": 1.0575, "step": 3385 }, { "epoch": 0.35, "grad_norm": 1.3671875, "learning_rate": 7.612914689421821e-05, "loss": 1.1209, "step": 3390 }, { "epoch": 0.35, "grad_norm": 0.6484375, "learning_rate": 7.605880254633917e-05, "loss": 1.1557, "step": 3395 }, { "epoch": 0.35, "grad_norm": 0.8515625, "learning_rate": 7.598838731329796e-05, "loss": 1.0978, "step": 3400 }, { "epoch": 0.35, "grad_norm": 0.78125, "learning_rate": 7.591790138663813e-05, "loss": 1.0088, "step": 3405 }, { "epoch": 0.35, "grad_norm": 0.7421875, "learning_rate": 7.584734495809549e-05, "loss": 1.2496, "step": 3410 }, { "epoch": 0.35, "grad_norm": 0.8125, "learning_rate": 7.577671821959766e-05, "loss": 1.1008, "step": 3415 }, { "epoch": 0.35, "grad_norm": 0.7109375, "learning_rate": 7.570602136326352e-05, "loss": 1.0393, "step": 3420 }, { "epoch": 0.35, "grad_norm": 0.78515625, "learning_rate": 7.563525458140269e-05, "loss": 1.0792, "step": 3425 }, { "epoch": 0.35, "grad_norm": 0.66796875, "learning_rate": 7.5564418066515e-05, "loss": 1.0352, "step": 3430 }, { "epoch": 0.35, "grad_norm": 0.6875, "learning_rate": 7.549351201128991e-05, "loss": 0.9773, "step": 3435 }, { "epoch": 0.35, "grad_norm": 0.875, "learning_rate": 7.542253660860614e-05, "loss": 1.0221, "step": 3440 }, { "epoch": 0.35, "grad_norm": 0.81640625, "learning_rate": 7.535149205153098e-05, "loss": 1.1252, "step": 3445 }, { "epoch": 0.35, "grad_norm": 0.7734375, "learning_rate": 7.528037853331987e-05, "loss": 1.0447, "step": 3450 }, { "epoch": 0.35, "grad_norm": 0.890625, "learning_rate": 7.520919624741578e-05, "loss": 1.0, "step": 3455 }, { "epoch": 0.35, "grad_norm": 0.890625, "learning_rate": 7.513794538744885e-05, "loss": 1.1898, "step": 3460 }, { "epoch": 0.35, "grad_norm": 0.83203125, "learning_rate": 7.506662614723563e-05, "loss": 1.0415, "step": 3465 }, { "epoch": 0.35, "grad_norm": 0.7265625, "learning_rate": 7.499523872077878e-05, "loss": 0.9453, "step": 3470 }, { "epoch": 0.35, "grad_norm": 0.76171875, "learning_rate": 7.492378330226637e-05, "loss": 1.0141, "step": 3475 }, { "epoch": 0.35, "grad_norm": 0.7421875, "learning_rate": 7.485226008607146e-05, "loss": 1.072, "step": 3480 }, { "epoch": 0.35, "grad_norm": 0.76953125, "learning_rate": 7.478066926675153e-05, "loss": 0.995, "step": 3485 }, { "epoch": 0.36, "grad_norm": 0.69921875, "learning_rate": 7.470901103904794e-05, "loss": 1.0155, "step": 3490 }, { "epoch": 0.36, "grad_norm": 0.7109375, "learning_rate": 7.463728559788541e-05, "loss": 1.1286, "step": 3495 }, { "epoch": 0.36, "grad_norm": 0.77734375, "learning_rate": 7.456549313837153e-05, "loss": 0.864, "step": 3500 }, { "epoch": 0.36, "grad_norm": 0.8515625, "learning_rate": 7.449363385579616e-05, "loss": 1.173, "step": 3505 }, { "epoch": 0.36, "grad_norm": 0.64453125, "learning_rate": 7.442170794563094e-05, "loss": 1.1935, "step": 3510 }, { "epoch": 0.36, "grad_norm": 0.89453125, "learning_rate": 7.434971560352873e-05, "loss": 1.2068, "step": 3515 }, { "epoch": 0.36, "grad_norm": 6.46875, "learning_rate": 7.427765702532315e-05, "loss": 1.1444, "step": 3520 }, { "epoch": 0.36, "grad_norm": 0.78515625, "learning_rate": 7.420553240702798e-05, "loss": 1.1498, "step": 3525 }, { "epoch": 0.36, "grad_norm": 0.671875, "learning_rate": 7.413334194483657e-05, "loss": 0.9728, "step": 3530 }, { "epoch": 0.36, "grad_norm": 0.796875, "learning_rate": 7.406108583512148e-05, "loss": 0.9861, "step": 3535 }, { "epoch": 0.36, "grad_norm": 0.80859375, "learning_rate": 7.398876427443379e-05, "loss": 0.9792, "step": 3540 }, { "epoch": 0.36, "grad_norm": 0.8125, "learning_rate": 7.391637745950262e-05, "loss": 0.9392, "step": 3545 }, { "epoch": 0.36, "grad_norm": 1.0546875, "learning_rate": 7.384392558723461e-05, "loss": 1.2004, "step": 3550 }, { "epoch": 0.36, "grad_norm": 0.71875, "learning_rate": 7.377140885471339e-05, "loss": 1.0013, "step": 3555 }, { "epoch": 0.36, "grad_norm": 0.8046875, "learning_rate": 7.369882745919896e-05, "loss": 1.02, "step": 3560 }, { "epoch": 0.36, "grad_norm": 0.80078125, "learning_rate": 7.362618159812726e-05, "loss": 1.0134, "step": 3565 }, { "epoch": 0.36, "grad_norm": 0.671875, "learning_rate": 7.355347146910961e-05, "loss": 0.9055, "step": 3570 }, { "epoch": 0.36, "grad_norm": 0.6796875, "learning_rate": 7.348069726993208e-05, "loss": 0.9618, "step": 3575 }, { "epoch": 0.36, "grad_norm": 0.75390625, "learning_rate": 7.34078591985551e-05, "loss": 0.9983, "step": 3580 }, { "epoch": 0.37, "grad_norm": 0.80859375, "learning_rate": 7.333495745311279e-05, "loss": 1.1512, "step": 3585 }, { "epoch": 0.37, "grad_norm": 0.84375, "learning_rate": 7.326199223191253e-05, "loss": 1.076, "step": 3590 }, { "epoch": 0.37, "grad_norm": 0.73046875, "learning_rate": 7.318896373343432e-05, "loss": 1.0545, "step": 3595 }, { "epoch": 0.37, "grad_norm": 0.79296875, "learning_rate": 7.311587215633029e-05, "loss": 0.974, "step": 3600 }, { "epoch": 0.37, "grad_norm": 0.80078125, "learning_rate": 7.304271769942417e-05, "loss": 0.9138, "step": 3605 }, { "epoch": 0.37, "grad_norm": 0.94140625, "learning_rate": 7.296950056171076e-05, "loss": 1.0022, "step": 3610 }, { "epoch": 0.37, "grad_norm": 0.77734375, "learning_rate": 7.289622094235531e-05, "loss": 1.1084, "step": 3615 }, { "epoch": 0.37, "grad_norm": 0.609375, "learning_rate": 7.282287904069308e-05, "loss": 1.2029, "step": 3620 }, { "epoch": 0.37, "grad_norm": 0.7421875, "learning_rate": 7.274947505622875e-05, "loss": 1.0783, "step": 3625 }, { "epoch": 0.37, "grad_norm": 0.84765625, "learning_rate": 7.26760091886358e-05, "loss": 1.0601, "step": 3630 }, { "epoch": 0.37, "grad_norm": 0.7734375, "learning_rate": 7.260248163775616e-05, "loss": 1.2118, "step": 3635 }, { "epoch": 0.37, "grad_norm": 0.74609375, "learning_rate": 7.252889260359947e-05, "loss": 1.0463, "step": 3640 }, { "epoch": 0.37, "grad_norm": 0.765625, "learning_rate": 7.245524228634264e-05, "loss": 1.072, "step": 3645 }, { "epoch": 0.37, "grad_norm": 0.828125, "learning_rate": 7.23815308863293e-05, "loss": 1.1773, "step": 3650 }, { "epoch": 0.37, "grad_norm": 0.6796875, "learning_rate": 7.230775860406922e-05, "loss": 1.0873, "step": 3655 }, { "epoch": 0.37, "grad_norm": 0.9609375, "learning_rate": 7.223392564023776e-05, "loss": 1.1419, "step": 3660 }, { "epoch": 0.37, "grad_norm": 0.80078125, "learning_rate": 7.216003219567539e-05, "loss": 1.1429, "step": 3665 }, { "epoch": 0.37, "grad_norm": 0.6875, "learning_rate": 7.208607847138709e-05, "loss": 1.0969, "step": 3670 }, { "epoch": 0.37, "grad_norm": 0.75, "learning_rate": 7.201206466854176e-05, "loss": 1.0345, "step": 3675 }, { "epoch": 0.37, "grad_norm": 0.7578125, "learning_rate": 7.193799098847181e-05, "loss": 1.0727, "step": 3680 }, { "epoch": 0.38, "grad_norm": 0.8359375, "learning_rate": 7.186385763267247e-05, "loss": 1.0523, "step": 3685 }, { "epoch": 0.38, "grad_norm": 0.7578125, "learning_rate": 7.178966480280131e-05, "loss": 1.0141, "step": 3690 }, { "epoch": 0.38, "grad_norm": 0.72265625, "learning_rate": 7.171541270067771e-05, "loss": 0.9112, "step": 3695 }, { "epoch": 0.38, "grad_norm": 0.875, "learning_rate": 7.164110152828223e-05, "loss": 1.0412, "step": 3700 }, { "epoch": 0.38, "grad_norm": 0.8359375, "learning_rate": 7.156673148775615e-05, "loss": 1.066, "step": 3705 }, { "epoch": 0.38, "grad_norm": 1.9765625, "learning_rate": 7.149230278140089e-05, "loss": 1.1692, "step": 3710 }, { "epoch": 0.38, "grad_norm": 0.80078125, "learning_rate": 7.141781561167742e-05, "loss": 1.0214, "step": 3715 }, { "epoch": 0.38, "grad_norm": 0.83984375, "learning_rate": 7.134327018120578e-05, "loss": 1.0949, "step": 3720 }, { "epoch": 0.38, "grad_norm": 0.66796875, "learning_rate": 7.126866669276447e-05, "loss": 1.0195, "step": 3725 }, { "epoch": 0.38, "grad_norm": 0.78515625, "learning_rate": 7.119400534928988e-05, "loss": 1.0325, "step": 3730 }, { "epoch": 0.38, "grad_norm": 0.92578125, "learning_rate": 7.111928635387588e-05, "loss": 1.0255, "step": 3735 }, { "epoch": 0.38, "grad_norm": 0.8984375, "learning_rate": 7.104450990977306e-05, "loss": 1.0661, "step": 3740 }, { "epoch": 0.38, "grad_norm": 0.796875, "learning_rate": 7.096967622038834e-05, "loss": 1.0796, "step": 3745 }, { "epoch": 0.38, "grad_norm": 1.4296875, "learning_rate": 7.089478548928434e-05, "loss": 1.0412, "step": 3750 }, { "epoch": 0.38, "grad_norm": 0.796875, "learning_rate": 7.081983792017885e-05, "loss": 1.1041, "step": 3755 }, { "epoch": 0.38, "grad_norm": 0.8515625, "learning_rate": 7.074483371694426e-05, "loss": 0.9852, "step": 3760 }, { "epoch": 0.38, "grad_norm": 0.76171875, "learning_rate": 7.066977308360704e-05, "loss": 1.0722, "step": 3765 }, { "epoch": 0.38, "grad_norm": 1.2265625, "learning_rate": 7.059465622434713e-05, "loss": 0.9913, "step": 3770 }, { "epoch": 0.38, "grad_norm": 0.8125, "learning_rate": 7.051948334349746e-05, "loss": 1.0868, "step": 3775 }, { "epoch": 0.38, "grad_norm": 0.77734375, "learning_rate": 7.044425464554329e-05, "loss": 0.9402, "step": 3780 }, { "epoch": 0.39, "grad_norm": 0.79296875, "learning_rate": 7.036897033512177e-05, "loss": 1.0482, "step": 3785 }, { "epoch": 0.39, "grad_norm": 0.75, "learning_rate": 7.029363061702129e-05, "loss": 0.9428, "step": 3790 }, { "epoch": 0.39, "grad_norm": 0.734375, "learning_rate": 7.021823569618097e-05, "loss": 1.0352, "step": 3795 }, { "epoch": 0.39, "grad_norm": 0.734375, "learning_rate": 7.01427857776901e-05, "loss": 1.2236, "step": 3800 }, { "epoch": 0.39, "grad_norm": 0.8359375, "learning_rate": 7.006728106678757e-05, "loss": 1.2762, "step": 3805 }, { "epoch": 0.39, "grad_norm": 0.78125, "learning_rate": 6.999172176886133e-05, "loss": 0.993, "step": 3810 }, { "epoch": 0.39, "grad_norm": 0.8515625, "learning_rate": 6.991610808944778e-05, "loss": 0.941, "step": 3815 }, { "epoch": 0.39, "grad_norm": 0.703125, "learning_rate": 6.984044023423128e-05, "loss": 1.104, "step": 3820 }, { "epoch": 0.39, "grad_norm": 1.2734375, "learning_rate": 6.976471840904355e-05, "loss": 1.0966, "step": 3825 }, { "epoch": 0.39, "grad_norm": 0.7578125, "learning_rate": 6.968894281986313e-05, "loss": 1.0537, "step": 3830 }, { "epoch": 0.39, "grad_norm": 0.76171875, "learning_rate": 6.96131136728148e-05, "loss": 0.8949, "step": 3835 }, { "epoch": 0.39, "grad_norm": 0.765625, "learning_rate": 6.953723117416901e-05, "loss": 1.0956, "step": 3840 }, { "epoch": 0.39, "grad_norm": 3.515625, "learning_rate": 6.94612955303414e-05, "loss": 1.1337, "step": 3845 }, { "epoch": 0.39, "grad_norm": 0.7578125, "learning_rate": 6.938530694789206e-05, "loss": 0.9737, "step": 3850 }, { "epoch": 0.39, "grad_norm": 0.74609375, "learning_rate": 6.930926563352521e-05, "loss": 1.0535, "step": 3855 }, { "epoch": 0.39, "grad_norm": 0.78515625, "learning_rate": 6.923317179408844e-05, "loss": 1.0223, "step": 3860 }, { "epoch": 0.39, "grad_norm": 0.6796875, "learning_rate": 6.91570256365722e-05, "loss": 1.0118, "step": 3865 }, { "epoch": 0.39, "grad_norm": 0.80078125, "learning_rate": 6.908082736810935e-05, "loss": 1.0233, "step": 3870 }, { "epoch": 0.39, "grad_norm": 0.765625, "learning_rate": 6.90045771959744e-05, "loss": 0.9254, "step": 3875 }, { "epoch": 0.4, "grad_norm": 0.6875, "learning_rate": 6.892827532758311e-05, "loss": 1.0422, "step": 3880 }, { "epoch": 0.4, "grad_norm": 0.890625, "learning_rate": 6.885192197049182e-05, "loss": 1.0743, "step": 3885 }, { "epoch": 0.4, "grad_norm": 0.86328125, "learning_rate": 6.877551733239699e-05, "loss": 1.0699, "step": 3890 }, { "epoch": 0.4, "grad_norm": 0.84765625, "learning_rate": 6.869906162113449e-05, "loss": 1.1193, "step": 3895 }, { "epoch": 0.4, "grad_norm": 0.671875, "learning_rate": 6.862255504467924e-05, "loss": 0.9723, "step": 3900 }, { "epoch": 0.4, "grad_norm": 0.640625, "learning_rate": 6.854599781114437e-05, "loss": 1.0641, "step": 3905 }, { "epoch": 0.4, "grad_norm": 0.71484375, "learning_rate": 6.846939012878094e-05, "loss": 1.1135, "step": 3910 }, { "epoch": 0.4, "grad_norm": 0.703125, "learning_rate": 6.839273220597717e-05, "loss": 1.1871, "step": 3915 }, { "epoch": 0.4, "grad_norm": 0.703125, "learning_rate": 6.831602425125796e-05, "loss": 1.0599, "step": 3920 }, { "epoch": 0.4, "grad_norm": 0.70703125, "learning_rate": 6.823926647328434e-05, "loss": 1.0594, "step": 3925 }, { "epoch": 0.4, "grad_norm": 0.82421875, "learning_rate": 6.81624590808528e-05, "loss": 1.1122, "step": 3930 }, { "epoch": 0.4, "grad_norm": 0.75, "learning_rate": 6.808560228289487e-05, "loss": 0.9967, "step": 3935 }, { "epoch": 0.4, "grad_norm": 0.71875, "learning_rate": 6.80086962884764e-05, "loss": 1.0192, "step": 3940 }, { "epoch": 0.4, "grad_norm": 0.68359375, "learning_rate": 6.793174130679711e-05, "loss": 1.0385, "step": 3945 }, { "epoch": 0.4, "grad_norm": 0.75, "learning_rate": 6.785473754718997e-05, "loss": 0.9171, "step": 3950 }, { "epoch": 0.4, "grad_norm": 0.8046875, "learning_rate": 6.777768521912062e-05, "loss": 0.9, "step": 3955 }, { "epoch": 0.4, "grad_norm": 0.7890625, "learning_rate": 6.770058453218683e-05, "loss": 1.0755, "step": 3960 }, { "epoch": 0.4, "grad_norm": 0.75390625, "learning_rate": 6.762343569611794e-05, "loss": 1.0492, "step": 3965 }, { "epoch": 0.4, "grad_norm": 0.671875, "learning_rate": 6.754623892077418e-05, "loss": 1.0071, "step": 3970 }, { "epoch": 0.4, "grad_norm": 0.74609375, "learning_rate": 6.746899441614624e-05, "loss": 0.9688, "step": 3975 }, { "epoch": 0.41, "grad_norm": 0.6875, "learning_rate": 6.739170239235471e-05, "loss": 0.8293, "step": 3980 }, { "epoch": 0.41, "grad_norm": 0.859375, "learning_rate": 6.731436305964933e-05, "loss": 1.0995, "step": 3985 }, { "epoch": 0.41, "grad_norm": 0.57421875, "learning_rate": 6.723697662840857e-05, "loss": 0.9918, "step": 3990 }, { "epoch": 0.41, "grad_norm": 0.69140625, "learning_rate": 6.715954330913902e-05, "loss": 0.9673, "step": 3995 }, { "epoch": 0.41, "grad_norm": 0.80078125, "learning_rate": 6.708206331247487e-05, "loss": 1.1082, "step": 4000 }, { "epoch": 0.41, "grad_norm": 0.73828125, "learning_rate": 6.700453684917716e-05, "loss": 1.0181, "step": 4005 }, { "epoch": 0.41, "grad_norm": 0.74609375, "learning_rate": 6.692696413013344e-05, "loss": 0.9705, "step": 4010 }, { "epoch": 0.41, "grad_norm": 0.66796875, "learning_rate": 6.684934536635702e-05, "loss": 1.0555, "step": 4015 }, { "epoch": 0.41, "grad_norm": 0.83984375, "learning_rate": 6.67716807689865e-05, "loss": 1.0467, "step": 4020 }, { "epoch": 0.41, "grad_norm": 0.7890625, "learning_rate": 6.669397054928514e-05, "loss": 1.1325, "step": 4025 }, { "epoch": 0.41, "grad_norm": 1.046875, "learning_rate": 6.661621491864029e-05, "loss": 1.0789, "step": 4030 }, { "epoch": 0.41, "grad_norm": 0.71875, "learning_rate": 6.653841408856284e-05, "loss": 1.1021, "step": 4035 }, { "epoch": 0.41, "grad_norm": 0.765625, "learning_rate": 6.646056827068664e-05, "loss": 1.1895, "step": 4040 }, { "epoch": 0.41, "grad_norm": 0.62109375, "learning_rate": 6.638267767676791e-05, "loss": 1.1359, "step": 4045 }, { "epoch": 0.41, "grad_norm": 0.7890625, "learning_rate": 6.630474251868462e-05, "loss": 1.0581, "step": 4050 }, { "epoch": 0.41, "grad_norm": 0.7578125, "learning_rate": 6.622676300843608e-05, "loss": 1.065, "step": 4055 }, { "epoch": 0.41, "grad_norm": 0.6875, "learning_rate": 6.614873935814214e-05, "loss": 1.1056, "step": 4060 }, { "epoch": 0.41, "grad_norm": 0.72265625, "learning_rate": 6.607067178004275e-05, "loss": 1.069, "step": 4065 }, { "epoch": 0.41, "grad_norm": 0.73828125, "learning_rate": 6.599256048649737e-05, "loss": 1.007, "step": 4070 }, { "epoch": 0.42, "grad_norm": 0.765625, "learning_rate": 6.591440568998435e-05, "loss": 1.0672, "step": 4075 }, { "epoch": 0.42, "grad_norm": 0.76953125, "learning_rate": 6.583620760310041e-05, "loss": 0.9907, "step": 4080 }, { "epoch": 0.42, "grad_norm": 0.82421875, "learning_rate": 6.575796643856e-05, "loss": 1.067, "step": 4085 }, { "epoch": 0.42, "grad_norm": 0.78125, "learning_rate": 6.567968240919475e-05, "loss": 1.2485, "step": 4090 }, { "epoch": 0.42, "grad_norm": 0.70703125, "learning_rate": 6.560135572795292e-05, "loss": 1.0458, "step": 4095 }, { "epoch": 0.42, "grad_norm": 2.421875, "learning_rate": 6.552298660789875e-05, "loss": 1.0591, "step": 4100 }, { "epoch": 0.42, "grad_norm": 1.0234375, "learning_rate": 6.544457526221192e-05, "loss": 1.1354, "step": 4105 }, { "epoch": 0.42, "grad_norm": 0.7578125, "learning_rate": 6.536612190418707e-05, "loss": 0.9666, "step": 4110 }, { "epoch": 0.42, "grad_norm": 0.71875, "learning_rate": 6.528762674723298e-05, "loss": 1.1244, "step": 4115 }, { "epoch": 0.42, "grad_norm": 0.67578125, "learning_rate": 6.520909000487223e-05, "loss": 1.0685, "step": 4120 }, { "epoch": 0.42, "grad_norm": 1.0859375, "learning_rate": 6.513051189074047e-05, "loss": 1.171, "step": 4125 }, { "epoch": 0.42, "grad_norm": 0.7109375, "learning_rate": 6.505189261858591e-05, "loss": 1.042, "step": 4130 }, { "epoch": 0.42, "grad_norm": 0.73046875, "learning_rate": 6.497323240226874e-05, "loss": 1.0154, "step": 4135 }, { "epoch": 0.42, "grad_norm": 0.734375, "learning_rate": 6.489453145576048e-05, "loss": 1.0528, "step": 4140 }, { "epoch": 0.42, "grad_norm": 0.84765625, "learning_rate": 6.481578999314347e-05, "loss": 1.2128, "step": 4145 }, { "epoch": 0.42, "grad_norm": 0.6484375, "learning_rate": 6.473700822861027e-05, "loss": 0.8893, "step": 4150 }, { "epoch": 0.42, "grad_norm": 0.7265625, "learning_rate": 6.465818637646305e-05, "loss": 1.0033, "step": 4155 }, { "epoch": 0.42, "grad_norm": 0.69921875, "learning_rate": 6.457932465111303e-05, "loss": 1.0655, "step": 4160 }, { "epoch": 0.42, "grad_norm": 0.8046875, "learning_rate": 6.450042326707992e-05, "loss": 1.0127, "step": 4165 }, { "epoch": 0.42, "grad_norm": 0.73046875, "learning_rate": 6.442148243899126e-05, "loss": 1.1003, "step": 4170 }, { "epoch": 0.43, "grad_norm": 0.875, "learning_rate": 6.434250238158193e-05, "loss": 1.0191, "step": 4175 }, { "epoch": 0.43, "grad_norm": 0.80859375, "learning_rate": 6.426348330969353e-05, "loss": 1.2239, "step": 4180 }, { "epoch": 0.43, "grad_norm": 0.83203125, "learning_rate": 6.41844254382737e-05, "loss": 1.0521, "step": 4185 }, { "epoch": 0.43, "grad_norm": 0.94140625, "learning_rate": 6.410532898237572e-05, "loss": 1.2184, "step": 4190 }, { "epoch": 0.43, "grad_norm": 1.4140625, "learning_rate": 6.40261941571578e-05, "loss": 1.0664, "step": 4195 }, { "epoch": 0.43, "grad_norm": 0.67578125, "learning_rate": 6.394702117788249e-05, "loss": 0.8452, "step": 4200 }, { "epoch": 0.43, "grad_norm": 0.80859375, "learning_rate": 6.386781025991617e-05, "loss": 1.0528, "step": 4205 }, { "epoch": 0.43, "grad_norm": 0.80078125, "learning_rate": 6.37885616187284e-05, "loss": 1.0768, "step": 4210 }, { "epoch": 0.43, "grad_norm": 0.76171875, "learning_rate": 6.370927546989136e-05, "loss": 1.105, "step": 4215 }, { "epoch": 0.43, "grad_norm": 0.84375, "learning_rate": 6.362995202907924e-05, "loss": 1.136, "step": 4220 }, { "epoch": 0.43, "grad_norm": 0.703125, "learning_rate": 6.35505915120677e-05, "loss": 1.1266, "step": 4225 }, { "epoch": 0.43, "grad_norm": 4.53125, "learning_rate": 6.347119413473323e-05, "loss": 0.9888, "step": 4230 }, { "epoch": 0.43, "grad_norm": 0.69921875, "learning_rate": 6.339176011305262e-05, "loss": 1.0379, "step": 4235 }, { "epoch": 0.43, "grad_norm": 1.171875, "learning_rate": 6.331228966310228e-05, "loss": 1.226, "step": 4240 }, { "epoch": 0.43, "grad_norm": 0.671875, "learning_rate": 6.323278300105778e-05, "loss": 0.9981, "step": 4245 }, { "epoch": 0.43, "grad_norm": 0.78515625, "learning_rate": 6.315324034319317e-05, "loss": 1.2093, "step": 4250 }, { "epoch": 0.43, "grad_norm": 0.65234375, "learning_rate": 6.30736619058804e-05, "loss": 0.9749, "step": 4255 }, { "epoch": 0.43, "grad_norm": 0.6796875, "learning_rate": 6.299404790558874e-05, "loss": 1.0927, "step": 4260 }, { "epoch": 0.43, "grad_norm": 0.6953125, "learning_rate": 6.291439855888423e-05, "loss": 1.0008, "step": 4265 }, { "epoch": 0.43, "grad_norm": 0.76171875, "learning_rate": 6.283471408242907e-05, "loss": 1.2255, "step": 4270 }, { "epoch": 0.44, "grad_norm": 0.7734375, "learning_rate": 6.275499469298097e-05, "loss": 1.0456, "step": 4275 }, { "epoch": 0.44, "grad_norm": 0.73828125, "learning_rate": 6.267524060739264e-05, "loss": 1.04, "step": 4280 }, { "epoch": 0.44, "grad_norm": 0.7734375, "learning_rate": 6.259545204261117e-05, "loss": 1.0479, "step": 4285 }, { "epoch": 0.44, "grad_norm": 0.56640625, "learning_rate": 6.251562921567744e-05, "loss": 0.708, "step": 4290 }, { "epoch": 0.44, "grad_norm": 0.69921875, "learning_rate": 6.243577234372551e-05, "loss": 1.2084, "step": 4295 }, { "epoch": 0.44, "grad_norm": 0.78515625, "learning_rate": 6.235588164398209e-05, "loss": 1.1408, "step": 4300 }, { "epoch": 0.44, "grad_norm": 0.69921875, "learning_rate": 6.227595733376587e-05, "loss": 0.9636, "step": 4305 }, { "epoch": 0.44, "grad_norm": 0.921875, "learning_rate": 6.219599963048697e-05, "loss": 1.2366, "step": 4310 }, { "epoch": 0.44, "grad_norm": 0.9296875, "learning_rate": 6.211600875164637e-05, "loss": 1.0225, "step": 4315 }, { "epoch": 0.44, "grad_norm": 0.76953125, "learning_rate": 6.20359849148353e-05, "loss": 1.2267, "step": 4320 }, { "epoch": 0.44, "grad_norm": 0.73828125, "learning_rate": 6.19559283377346e-05, "loss": 1.0573, "step": 4325 }, { "epoch": 0.44, "grad_norm": 0.75, "learning_rate": 6.18758392381142e-05, "loss": 1.1054, "step": 4330 }, { "epoch": 0.44, "grad_norm": 0.7265625, "learning_rate": 6.17957178338325e-05, "loss": 1.1948, "step": 4335 }, { "epoch": 0.44, "grad_norm": 0.71875, "learning_rate": 6.171556434283574e-05, "loss": 1.191, "step": 4340 }, { "epoch": 0.44, "grad_norm": 0.79296875, "learning_rate": 6.163537898315752e-05, "loss": 1.0744, "step": 4345 }, { "epoch": 0.44, "grad_norm": 0.75390625, "learning_rate": 6.155516197291802e-05, "loss": 1.0738, "step": 4350 }, { "epoch": 0.44, "grad_norm": 0.73828125, "learning_rate": 6.147491353032361e-05, "loss": 1.012, "step": 4355 }, { "epoch": 0.44, "grad_norm": 0.76953125, "learning_rate": 6.139463387366612e-05, "loss": 0.9968, "step": 4360 }, { "epoch": 0.44, "grad_norm": 0.7265625, "learning_rate": 6.131432322132228e-05, "loss": 1.0121, "step": 4365 }, { "epoch": 0.45, "grad_norm": 0.73046875, "learning_rate": 6.123398179175318e-05, "loss": 1.0607, "step": 4370 }, { "epoch": 0.45, "grad_norm": 0.734375, "learning_rate": 6.115360980350358e-05, "loss": 0.8946, "step": 4375 }, { "epoch": 0.45, "grad_norm": 0.7734375, "learning_rate": 6.107320747520138e-05, "loss": 1.0477, "step": 4380 }, { "epoch": 0.45, "grad_norm": 0.80859375, "learning_rate": 6.0992775025557025e-05, "loss": 0.9904, "step": 4385 }, { "epoch": 0.45, "grad_norm": 0.7265625, "learning_rate": 6.0912312673362905e-05, "loss": 0.9623, "step": 4390 }, { "epoch": 0.45, "grad_norm": 0.6796875, "learning_rate": 6.083182063749269e-05, "loss": 1.1777, "step": 4395 }, { "epoch": 0.45, "grad_norm": 0.83203125, "learning_rate": 6.0751299136900886e-05, "loss": 0.9931, "step": 4400 }, { "epoch": 0.45, "grad_norm": 1.53125, "learning_rate": 6.067074839062207e-05, "loss": 1.1213, "step": 4405 }, { "epoch": 0.45, "grad_norm": 0.7578125, "learning_rate": 6.0590168617770416e-05, "loss": 1.1679, "step": 4410 }, { "epoch": 0.45, "grad_norm": 0.70703125, "learning_rate": 6.050956003753905e-05, "loss": 0.8842, "step": 4415 }, { "epoch": 0.45, "grad_norm": 0.82421875, "learning_rate": 6.0428922869199434e-05, "loss": 1.1319, "step": 4420 }, { "epoch": 0.45, "grad_norm": 1.171875, "learning_rate": 6.034825733210086e-05, "loss": 1.06, "step": 4425 }, { "epoch": 0.45, "grad_norm": 0.74609375, "learning_rate": 6.02675636456697e-05, "loss": 1.1352, "step": 4430 }, { "epoch": 0.45, "grad_norm": 0.77734375, "learning_rate": 6.018684202940896e-05, "loss": 1.0763, "step": 4435 }, { "epoch": 0.45, "grad_norm": 0.73046875, "learning_rate": 6.010609270289761e-05, "loss": 1.0631, "step": 4440 }, { "epoch": 0.45, "grad_norm": 0.72265625, "learning_rate": 6.002531588579e-05, "loss": 0.9551, "step": 4445 }, { "epoch": 0.45, "grad_norm": 0.71484375, "learning_rate": 5.9944511797815215e-05, "loss": 1.0763, "step": 4450 }, { "epoch": 0.45, "grad_norm": 0.74609375, "learning_rate": 5.986368065877659e-05, "loss": 1.145, "step": 4455 }, { "epoch": 0.45, "grad_norm": 0.796875, "learning_rate": 5.9782822688551e-05, "loss": 1.0967, "step": 4460 }, { "epoch": 0.45, "grad_norm": 0.83984375, "learning_rate": 5.970193810708833e-05, "loss": 1.0671, "step": 4465 }, { "epoch": 0.46, "grad_norm": 0.9296875, "learning_rate": 5.962102713441083e-05, "loss": 1.1535, "step": 4470 }, { "epoch": 0.46, "grad_norm": 0.7734375, "learning_rate": 5.9540089990612546e-05, "loss": 0.9838, "step": 4475 }, { "epoch": 0.46, "grad_norm": 0.83984375, "learning_rate": 5.9459126895858744e-05, "loss": 0.8705, "step": 4480 }, { "epoch": 0.46, "grad_norm": 0.75, "learning_rate": 5.937813807038524e-05, "loss": 1.0007, "step": 4485 }, { "epoch": 0.46, "grad_norm": 0.7265625, "learning_rate": 5.929712373449785e-05, "loss": 1.2598, "step": 4490 }, { "epoch": 0.46, "grad_norm": 0.8125, "learning_rate": 5.92160841085718e-05, "loss": 1.1775, "step": 4495 }, { "epoch": 0.46, "grad_norm": 0.80078125, "learning_rate": 5.913501941305114e-05, "loss": 1.1022, "step": 4500 }, { "epoch": 0.46, "grad_norm": 0.7734375, "learning_rate": 5.905392986844802e-05, "loss": 1.2405, "step": 4505 }, { "epoch": 0.46, "grad_norm": 0.671875, "learning_rate": 5.897281569534229e-05, "loss": 0.9214, "step": 4510 }, { "epoch": 0.46, "grad_norm": 0.78125, "learning_rate": 5.8891677114380715e-05, "loss": 1.0663, "step": 4515 }, { "epoch": 0.46, "grad_norm": 0.79296875, "learning_rate": 5.88105143462765e-05, "loss": 1.0043, "step": 4520 }, { "epoch": 0.46, "grad_norm": 0.65625, "learning_rate": 5.8729327611808646e-05, "loss": 1.12, "step": 4525 }, { "epoch": 0.46, "grad_norm": 0.86328125, "learning_rate": 5.864811713182129e-05, "loss": 1.0425, "step": 4530 }, { "epoch": 0.46, "grad_norm": 0.70703125, "learning_rate": 5.856688312722324e-05, "loss": 1.0047, "step": 4535 }, { "epoch": 0.46, "grad_norm": 0.6875, "learning_rate": 5.848562581898724e-05, "loss": 0.9652, "step": 4540 }, { "epoch": 0.46, "grad_norm": 0.6953125, "learning_rate": 5.840434542814947e-05, "loss": 1.1617, "step": 4545 }, { "epoch": 0.46, "grad_norm": 0.796875, "learning_rate": 5.8323042175808815e-05, "loss": 1.1901, "step": 4550 }, { "epoch": 0.46, "grad_norm": 0.7578125, "learning_rate": 5.824171628312648e-05, "loss": 0.9598, "step": 4555 }, { "epoch": 0.46, "grad_norm": 0.671875, "learning_rate": 5.8160367971325115e-05, "loss": 0.9963, "step": 4560 }, { "epoch": 0.46, "grad_norm": 0.671875, "learning_rate": 5.807899746168847e-05, "loss": 0.9893, "step": 4565 }, { "epoch": 0.47, "grad_norm": 0.87109375, "learning_rate": 5.7997604975560594e-05, "loss": 1.0825, "step": 4570 }, { "epoch": 0.47, "grad_norm": 0.76171875, "learning_rate": 5.791619073434536e-05, "loss": 0.9896, "step": 4575 }, { "epoch": 0.47, "grad_norm": 1.0, "learning_rate": 5.7834754959505836e-05, "loss": 1.074, "step": 4580 }, { "epoch": 0.47, "grad_norm": 0.8125, "learning_rate": 5.775329787256362e-05, "loss": 1.1366, "step": 4585 }, { "epoch": 0.47, "grad_norm": 0.8671875, "learning_rate": 5.7671819695098294e-05, "loss": 1.0909, "step": 4590 }, { "epoch": 0.47, "grad_norm": 0.73046875, "learning_rate": 5.759032064874683e-05, "loss": 1.1507, "step": 4595 }, { "epoch": 0.47, "grad_norm": 0.7421875, "learning_rate": 5.750880095520296e-05, "loss": 1.006, "step": 4600 }, { "epoch": 0.47, "grad_norm": 0.73828125, "learning_rate": 5.7427260836216556e-05, "loss": 1.0907, "step": 4605 }, { "epoch": 0.47, "grad_norm": 0.76953125, "learning_rate": 5.734570051359312e-05, "loss": 1.0693, "step": 4610 }, { "epoch": 0.47, "grad_norm": 0.76953125, "learning_rate": 5.7264120209193015e-05, "loss": 1.1219, "step": 4615 }, { "epoch": 0.47, "grad_norm": 0.953125, "learning_rate": 5.718252014493104e-05, "loss": 0.9902, "step": 4620 }, { "epoch": 0.47, "grad_norm": 0.765625, "learning_rate": 5.7100900542775705e-05, "loss": 0.9542, "step": 4625 }, { "epoch": 0.47, "grad_norm": 0.6484375, "learning_rate": 5.7019261624748664e-05, "loss": 0.993, "step": 4630 }, { "epoch": 0.47, "grad_norm": 0.8515625, "learning_rate": 5.693760361292414e-05, "loss": 1.0169, "step": 4635 }, { "epoch": 0.47, "grad_norm": 0.76171875, "learning_rate": 5.6855926729428274e-05, "loss": 1.0525, "step": 4640 }, { "epoch": 0.47, "grad_norm": 0.6328125, "learning_rate": 5.677423119643856e-05, "loss": 0.8732, "step": 4645 }, { "epoch": 0.47, "grad_norm": 1.1953125, "learning_rate": 5.66925172361832e-05, "loss": 1.0501, "step": 4650 }, { "epoch": 0.47, "grad_norm": 0.74609375, "learning_rate": 5.661078507094052e-05, "loss": 0.9633, "step": 4655 }, { "epoch": 0.47, "grad_norm": 0.6796875, "learning_rate": 5.652903492303838e-05, "loss": 1.0973, "step": 4660 }, { "epoch": 0.48, "grad_norm": 0.69921875, "learning_rate": 5.644726701485358e-05, "loss": 1.1023, "step": 4665 }, { "epoch": 0.48, "grad_norm": 0.86328125, "learning_rate": 5.6365481568811195e-05, "loss": 1.0983, "step": 4670 }, { "epoch": 0.48, "grad_norm": 0.71875, "learning_rate": 5.628367880738401e-05, "loss": 1.1221, "step": 4675 }, { "epoch": 0.48, "grad_norm": 0.7578125, "learning_rate": 5.620185895309195e-05, "loss": 1.0069, "step": 4680 }, { "epoch": 0.48, "grad_norm": 0.86328125, "learning_rate": 5.6120022228501346e-05, "loss": 1.0823, "step": 4685 }, { "epoch": 0.48, "grad_norm": 0.73828125, "learning_rate": 5.603816885622455e-05, "loss": 1.0401, "step": 4690 }, { "epoch": 0.48, "grad_norm": 0.90625, "learning_rate": 5.595629905891908e-05, "loss": 1.1202, "step": 4695 }, { "epoch": 0.48, "grad_norm": 0.71484375, "learning_rate": 5.5874413059287214e-05, "loss": 1.1821, "step": 4700 }, { "epoch": 0.48, "grad_norm": 0.7265625, "learning_rate": 5.579251108007523e-05, "loss": 0.902, "step": 4705 }, { "epoch": 0.48, "grad_norm": 0.828125, "learning_rate": 5.571059334407297e-05, "loss": 1.2474, "step": 4710 }, { "epoch": 0.48, "grad_norm": 0.80859375, "learning_rate": 5.5628660074113034e-05, "loss": 1.0546, "step": 4715 }, { "epoch": 0.48, "grad_norm": 0.74609375, "learning_rate": 5.554671149307036e-05, "loss": 1.1297, "step": 4720 }, { "epoch": 0.48, "grad_norm": 0.83203125, "learning_rate": 5.5464747823861486e-05, "loss": 1.2551, "step": 4725 }, { "epoch": 0.48, "grad_norm": 0.7265625, "learning_rate": 5.5382769289444e-05, "loss": 1.0594, "step": 4730 }, { "epoch": 0.48, "grad_norm": 0.875, "learning_rate": 5.530077611281598e-05, "loss": 1.2795, "step": 4735 }, { "epoch": 0.48, "grad_norm": 0.71875, "learning_rate": 5.5218768517015216e-05, "loss": 1.0416, "step": 4740 }, { "epoch": 0.48, "grad_norm": 0.8203125, "learning_rate": 5.513674672511885e-05, "loss": 1.0523, "step": 4745 }, { "epoch": 0.48, "grad_norm": 0.6796875, "learning_rate": 5.5054710960242574e-05, "loss": 1.1531, "step": 4750 }, { "epoch": 0.48, "grad_norm": 5.40625, "learning_rate": 5.497266144554007e-05, "loss": 1.2132, "step": 4755 }, { "epoch": 0.48, "grad_norm": 0.63671875, "learning_rate": 5.489059840420249e-05, "loss": 1.0111, "step": 4760 }, { "epoch": 0.49, "grad_norm": 0.8125, "learning_rate": 5.4808522059457724e-05, "loss": 1.0953, "step": 4765 }, { "epoch": 0.49, "grad_norm": 0.6953125, "learning_rate": 5.472643263456987e-05, "loss": 1.1535, "step": 4770 }, { "epoch": 0.49, "grad_norm": 0.7109375, "learning_rate": 5.4644330352838605e-05, "loss": 0.8999, "step": 4775 }, { "epoch": 0.49, "grad_norm": 0.7421875, "learning_rate": 5.456221543759857e-05, "loss": 1.0552, "step": 4780 }, { "epoch": 0.49, "grad_norm": 0.7265625, "learning_rate": 5.4480088112218795e-05, "loss": 1.0709, "step": 4785 }, { "epoch": 0.49, "grad_norm": 0.64453125, "learning_rate": 5.439794860010207e-05, "loss": 1.0218, "step": 4790 }, { "epoch": 0.49, "grad_norm": 0.78125, "learning_rate": 5.431579712468428e-05, "loss": 1.0769, "step": 4795 }, { "epoch": 0.49, "grad_norm": 0.671875, "learning_rate": 5.423363390943391e-05, "loss": 0.945, "step": 4800 }, { "epoch": 0.49, "grad_norm": 0.70703125, "learning_rate": 5.415145917785137e-05, "loss": 1.0062, "step": 4805 }, { "epoch": 0.49, "grad_norm": 0.890625, "learning_rate": 5.406927315346839e-05, "loss": 1.2441, "step": 4810 }, { "epoch": 0.49, "grad_norm": 0.7734375, "learning_rate": 5.398707605984739e-05, "loss": 0.9563, "step": 4815 }, { "epoch": 0.49, "grad_norm": 0.734375, "learning_rate": 5.390486812058095e-05, "loss": 1.0443, "step": 4820 }, { "epoch": 0.49, "grad_norm": 0.71875, "learning_rate": 5.382264955929114e-05, "loss": 0.9431, "step": 4825 }, { "epoch": 0.49, "grad_norm": 0.703125, "learning_rate": 5.374042059962888e-05, "loss": 0.9862, "step": 4830 }, { "epoch": 0.49, "grad_norm": 0.74609375, "learning_rate": 5.365818146527346e-05, "loss": 1.0034, "step": 4835 }, { "epoch": 0.49, "grad_norm": 0.734375, "learning_rate": 5.357593237993174e-05, "loss": 0.9622, "step": 4840 }, { "epoch": 0.49, "grad_norm": 0.6640625, "learning_rate": 5.3493673567337754e-05, "loss": 1.0207, "step": 4845 }, { "epoch": 0.49, "grad_norm": 0.6484375, "learning_rate": 5.341140525125191e-05, "loss": 1.137, "step": 4850 }, { "epoch": 0.49, "grad_norm": 0.78515625, "learning_rate": 5.3329127655460533e-05, "loss": 1.0355, "step": 4855 }, { "epoch": 0.49, "grad_norm": 0.6875, "learning_rate": 5.3246841003775136e-05, "loss": 1.2037, "step": 4860 }, { "epoch": 0.5, "grad_norm": 0.71484375, "learning_rate": 5.3164545520031926e-05, "loss": 0.9678, "step": 4865 }, { "epoch": 0.5, "grad_norm": 0.66015625, "learning_rate": 5.3082241428091065e-05, "loss": 1.051, "step": 4870 }, { "epoch": 0.5, "grad_norm": 0.7890625, "learning_rate": 5.2999928951836195e-05, "loss": 1.101, "step": 4875 }, { "epoch": 0.5, "grad_norm": 0.88671875, "learning_rate": 5.291760831517373e-05, "loss": 1.1423, "step": 4880 }, { "epoch": 0.5, "grad_norm": 0.81640625, "learning_rate": 5.283527974203227e-05, "loss": 1.2217, "step": 4885 }, { "epoch": 0.5, "grad_norm": 0.72265625, "learning_rate": 5.275294345636208e-05, "loss": 1.0533, "step": 4890 }, { "epoch": 0.5, "grad_norm": 0.71484375, "learning_rate": 5.267059968213428e-05, "loss": 0.9988, "step": 4895 }, { "epoch": 0.5, "grad_norm": 0.75, "learning_rate": 5.258824864334047e-05, "loss": 0.9561, "step": 4900 }, { "epoch": 0.5, "grad_norm": 0.7578125, "learning_rate": 5.250589056399194e-05, "loss": 1.1255, "step": 4905 }, { "epoch": 0.5, "grad_norm": 0.96484375, "learning_rate": 5.242352566811919e-05, "loss": 1.1792, "step": 4910 }, { "epoch": 0.5, "grad_norm": 0.8046875, "learning_rate": 5.2341154179771224e-05, "loss": 1.1269, "step": 4915 }, { "epoch": 0.5, "grad_norm": 0.734375, "learning_rate": 5.225877632301497e-05, "loss": 1.238, "step": 4920 }, { "epoch": 0.5, "grad_norm": 0.75, "learning_rate": 5.217639232193473e-05, "loss": 1.1802, "step": 4925 }, { "epoch": 0.5, "grad_norm": 0.69921875, "learning_rate": 5.209400240063147e-05, "loss": 1.0109, "step": 4930 }, { "epoch": 0.5, "grad_norm": 0.8203125, "learning_rate": 5.201160678322227e-05, "loss": 1.0919, "step": 4935 }, { "epoch": 0.5, "grad_norm": 0.8125, "learning_rate": 5.192920569383971e-05, "loss": 1.0738, "step": 4940 }, { "epoch": 0.5, "grad_norm": 0.69921875, "learning_rate": 5.184679935663131e-05, "loss": 1.0073, "step": 4945 }, { "epoch": 0.5, "grad_norm": 0.66796875, "learning_rate": 5.1764387995758726e-05, "loss": 1.0793, "step": 4950 }, { "epoch": 0.5, "grad_norm": 0.7734375, "learning_rate": 5.168197183539742e-05, "loss": 1.077, "step": 4955 }, { "epoch": 0.51, "grad_norm": 0.71484375, "learning_rate": 5.159955109973585e-05, "loss": 1.1918, "step": 4960 }, { "epoch": 0.51, "grad_norm": 0.77734375, "learning_rate": 5.151712601297491e-05, "loss": 1.1241, "step": 4965 }, { "epoch": 0.51, "grad_norm": 0.74609375, "learning_rate": 5.1434696799327374e-05, "loss": 0.996, "step": 4970 }, { "epoch": 0.51, "grad_norm": 0.7421875, "learning_rate": 5.1352263683017165e-05, "loss": 1.1889, "step": 4975 }, { "epoch": 0.51, "grad_norm": 0.73828125, "learning_rate": 5.126982688827892e-05, "loss": 0.9517, "step": 4980 }, { "epoch": 0.51, "grad_norm": 0.80859375, "learning_rate": 5.118738663935719e-05, "loss": 1.0765, "step": 4985 }, { "epoch": 0.51, "grad_norm": 0.734375, "learning_rate": 5.1104943160506005e-05, "loss": 1.2389, "step": 4990 }, { "epoch": 0.51, "grad_norm": 0.6484375, "learning_rate": 5.1022496675988085e-05, "loss": 0.9763, "step": 4995 }, { "epoch": 0.51, "grad_norm": 0.74609375, "learning_rate": 5.0940047410074446e-05, "loss": 1.0224, "step": 5000 }, { "epoch": 0.51, "grad_norm": 0.7890625, "learning_rate": 5.085759558704355e-05, "loss": 1.0476, "step": 5005 }, { "epoch": 0.51, "grad_norm": 0.8359375, "learning_rate": 5.077514143118091e-05, "loss": 1.0783, "step": 5010 }, { "epoch": 0.51, "grad_norm": 0.78515625, "learning_rate": 5.069268516677831e-05, "loss": 0.9991, "step": 5015 }, { "epoch": 0.51, "grad_norm": 0.70703125, "learning_rate": 5.0610227018133314e-05, "loss": 0.9524, "step": 5020 }, { "epoch": 0.51, "grad_norm": 0.68359375, "learning_rate": 5.052776720954862e-05, "loss": 1.0971, "step": 5025 }, { "epoch": 0.51, "grad_norm": 0.75, "learning_rate": 5.0445305965331415e-05, "loss": 1.1505, "step": 5030 }, { "epoch": 0.51, "grad_norm": 0.78125, "learning_rate": 5.03628435097928e-05, "loss": 0.9684, "step": 5035 }, { "epoch": 0.51, "grad_norm": 0.75390625, "learning_rate": 5.028038006724716e-05, "loss": 1.0793, "step": 5040 }, { "epoch": 0.51, "grad_norm": 0.70703125, "learning_rate": 5.019791586201157e-05, "loss": 0.927, "step": 5045 }, { "epoch": 0.51, "grad_norm": 0.734375, "learning_rate": 5.01154511184052e-05, "loss": 1.0578, "step": 5050 }, { "epoch": 0.51, "grad_norm": 0.6875, "learning_rate": 5.0032986060748676e-05, "loss": 1.016, "step": 5055 }, { "epoch": 0.52, "grad_norm": 0.65234375, "learning_rate": 4.9950520913363444e-05, "loss": 0.8672, "step": 5060 }, { "epoch": 0.52, "grad_norm": 0.671875, "learning_rate": 4.9868055900571245e-05, "loss": 0.9281, "step": 5065 }, { "epoch": 0.52, "grad_norm": 0.73828125, "learning_rate": 4.9785591246693415e-05, "loss": 1.1488, "step": 5070 }, { "epoch": 0.52, "grad_norm": 0.66796875, "learning_rate": 4.9703127176050346e-05, "loss": 1.0526, "step": 5075 }, { "epoch": 0.52, "grad_norm": 0.78125, "learning_rate": 4.962066391296083e-05, "loss": 1.0239, "step": 5080 }, { "epoch": 0.52, "grad_norm": 0.98828125, "learning_rate": 4.9538201681741416e-05, "loss": 1.098, "step": 5085 }, { "epoch": 0.52, "grad_norm": 0.7109375, "learning_rate": 4.945574070670595e-05, "loss": 1.0512, "step": 5090 }, { "epoch": 0.52, "grad_norm": 0.75, "learning_rate": 4.9373281212164764e-05, "loss": 0.9719, "step": 5095 }, { "epoch": 0.52, "grad_norm": 0.68359375, "learning_rate": 4.929082342242419e-05, "loss": 0.8981, "step": 5100 }, { "epoch": 0.52, "grad_norm": 0.81640625, "learning_rate": 4.9208367561785953e-05, "loss": 1.0382, "step": 5105 }, { "epoch": 0.52, "grad_norm": 0.734375, "learning_rate": 4.912591385454651e-05, "loss": 1.052, "step": 5110 }, { "epoch": 0.52, "grad_norm": 0.59765625, "learning_rate": 4.904346252499644e-05, "loss": 0.9151, "step": 5115 }, { "epoch": 0.52, "grad_norm": 0.74609375, "learning_rate": 4.89610137974199e-05, "loss": 1.0986, "step": 5120 }, { "epoch": 0.52, "grad_norm": 0.7265625, "learning_rate": 4.8878567896093915e-05, "loss": 1.1447, "step": 5125 }, { "epoch": 0.52, "grad_norm": 0.7890625, "learning_rate": 4.8796125045287864e-05, "loss": 1.1617, "step": 5130 }, { "epoch": 0.52, "grad_norm": 0.75, "learning_rate": 4.87136854692628e-05, "loss": 1.045, "step": 5135 }, { "epoch": 0.52, "grad_norm": 0.75, "learning_rate": 4.86312493922709e-05, "loss": 1.0253, "step": 5140 }, { "epoch": 0.52, "grad_norm": 0.73046875, "learning_rate": 4.854881703855481e-05, "loss": 0.9743, "step": 5145 }, { "epoch": 0.52, "grad_norm": 0.76171875, "learning_rate": 4.8466388632347e-05, "loss": 1.0227, "step": 5150 }, { "epoch": 0.52, "grad_norm": 0.77734375, "learning_rate": 4.83839643978693e-05, "loss": 1.1735, "step": 5155 }, { "epoch": 0.53, "grad_norm": 0.78515625, "learning_rate": 4.83015445593321e-05, "loss": 1.1489, "step": 5160 }, { "epoch": 0.53, "grad_norm": 0.84375, "learning_rate": 4.8219129340933865e-05, "loss": 1.0653, "step": 5165 }, { "epoch": 0.53, "grad_norm": 0.66015625, "learning_rate": 4.813671896686052e-05, "loss": 1.016, "step": 5170 }, { "epoch": 0.53, "grad_norm": 0.71875, "learning_rate": 4.805431366128478e-05, "loss": 0.8193, "step": 5175 }, { "epoch": 0.53, "grad_norm": 0.8359375, "learning_rate": 4.797191364836557e-05, "loss": 0.9678, "step": 5180 }, { "epoch": 0.53, "grad_norm": 0.73046875, "learning_rate": 4.7889519152247445e-05, "loss": 1.0295, "step": 5185 }, { "epoch": 0.53, "grad_norm": 0.734375, "learning_rate": 4.780713039705993e-05, "loss": 1.0114, "step": 5190 }, { "epoch": 0.53, "grad_norm": 0.78125, "learning_rate": 4.772474760691692e-05, "loss": 1.1057, "step": 5195 }, { "epoch": 0.53, "grad_norm": 0.8046875, "learning_rate": 4.764237100591614e-05, "loss": 1.1135, "step": 5200 }, { "epoch": 0.53, "grad_norm": 0.8359375, "learning_rate": 4.756000081813843e-05, "loss": 1.0649, "step": 5205 }, { "epoch": 0.53, "grad_norm": 0.73046875, "learning_rate": 4.7477637267647174e-05, "loss": 1.078, "step": 5210 }, { "epoch": 0.53, "grad_norm": 0.6796875, "learning_rate": 4.739528057848777e-05, "loss": 0.9255, "step": 5215 }, { "epoch": 0.53, "grad_norm": 0.73828125, "learning_rate": 4.731293097468688e-05, "loss": 1.1124, "step": 5220 }, { "epoch": 0.53, "grad_norm": 0.6640625, "learning_rate": 4.7230588680251904e-05, "loss": 1.1173, "step": 5225 }, { "epoch": 0.53, "grad_norm": 0.66015625, "learning_rate": 4.7148253919170415e-05, "loss": 1.1483, "step": 5230 }, { "epoch": 0.53, "grad_norm": 0.58203125, "learning_rate": 4.706592691540944e-05, "loss": 1.0426, "step": 5235 }, { "epoch": 0.53, "grad_norm": 1.6953125, "learning_rate": 4.698360789291489e-05, "loss": 1.2384, "step": 5240 }, { "epoch": 0.53, "grad_norm": 0.8125, "learning_rate": 4.690129707561104e-05, "loss": 1.0778, "step": 5245 }, { "epoch": 0.53, "grad_norm": 0.703125, "learning_rate": 4.6818994687399767e-05, "loss": 0.9883, "step": 5250 }, { "epoch": 0.54, "grad_norm": 0.796875, "learning_rate": 4.673670095216006e-05, "loss": 0.9607, "step": 5255 }, { "epoch": 0.54, "grad_norm": 0.80859375, "learning_rate": 4.665441609374735e-05, "loss": 1.0143, "step": 5260 }, { "epoch": 0.54, "grad_norm": 0.82421875, "learning_rate": 4.6572140335992956e-05, "loss": 1.1046, "step": 5265 }, { "epoch": 0.54, "grad_norm": 0.734375, "learning_rate": 4.648987390270341e-05, "loss": 0.9611, "step": 5270 }, { "epoch": 0.54, "grad_norm": 0.6953125, "learning_rate": 4.6407617017659864e-05, "loss": 1.042, "step": 5275 }, { "epoch": 0.54, "grad_norm": 0.6875, "learning_rate": 4.632536990461756e-05, "loss": 0.9466, "step": 5280 }, { "epoch": 0.54, "grad_norm": 0.78515625, "learning_rate": 4.624313278730508e-05, "loss": 1.1763, "step": 5285 }, { "epoch": 0.54, "grad_norm": 0.75, "learning_rate": 4.6160905889423864e-05, "loss": 1.0396, "step": 5290 }, { "epoch": 0.54, "grad_norm": 0.79296875, "learning_rate": 4.607868943464757e-05, "loss": 1.0803, "step": 5295 }, { "epoch": 0.54, "grad_norm": 1.2890625, "learning_rate": 4.599648364662141e-05, "loss": 1.2114, "step": 5300 }, { "epoch": 0.54, "grad_norm": 0.703125, "learning_rate": 4.591428874896155e-05, "loss": 0.8715, "step": 5305 }, { "epoch": 0.54, "grad_norm": 0.71875, "learning_rate": 4.583210496525464e-05, "loss": 1.002, "step": 5310 }, { "epoch": 0.54, "grad_norm": 0.7421875, "learning_rate": 4.5749932519056994e-05, "loss": 1.0924, "step": 5315 }, { "epoch": 0.54, "grad_norm": 0.67578125, "learning_rate": 4.566777163389411e-05, "loss": 0.9119, "step": 5320 }, { "epoch": 0.54, "grad_norm": 0.90234375, "learning_rate": 4.5585622533260095e-05, "loss": 1.1491, "step": 5325 }, { "epoch": 0.54, "grad_norm": 0.703125, "learning_rate": 4.5503485440616925e-05, "loss": 0.9818, "step": 5330 }, { "epoch": 0.54, "grad_norm": 0.72265625, "learning_rate": 4.5421360579393944e-05, "loss": 0.8872, "step": 5335 }, { "epoch": 0.54, "grad_norm": 0.68359375, "learning_rate": 4.533924817298724e-05, "loss": 0.956, "step": 5340 }, { "epoch": 0.54, "grad_norm": 2.578125, "learning_rate": 4.525714844475901e-05, "loss": 0.952, "step": 5345 }, { "epoch": 0.54, "grad_norm": 0.6875, "learning_rate": 4.5175061618036906e-05, "loss": 0.9298, "step": 5350 }, { "epoch": 0.55, "grad_norm": 0.7109375, "learning_rate": 4.509298791611361e-05, "loss": 1.0396, "step": 5355 }, { "epoch": 0.55, "grad_norm": 0.76171875, "learning_rate": 4.501092756224598e-05, "loss": 1.0364, "step": 5360 }, { "epoch": 0.55, "grad_norm": 0.7578125, "learning_rate": 4.492888077965462e-05, "loss": 1.1525, "step": 5365 }, { "epoch": 0.55, "grad_norm": 0.69921875, "learning_rate": 4.484684779152324e-05, "loss": 1.0566, "step": 5370 }, { "epoch": 0.55, "grad_norm": 0.57421875, "learning_rate": 4.4764828820997965e-05, "loss": 0.9149, "step": 5375 }, { "epoch": 0.55, "grad_norm": 0.828125, "learning_rate": 4.4682824091186855e-05, "loss": 1.0611, "step": 5380 }, { "epoch": 0.55, "grad_norm": 0.80859375, "learning_rate": 4.460083382515914e-05, "loss": 1.2073, "step": 5385 }, { "epoch": 0.55, "grad_norm": 0.71875, "learning_rate": 4.4518858245944836e-05, "loss": 0.949, "step": 5390 }, { "epoch": 0.55, "grad_norm": 0.71875, "learning_rate": 4.4436897576533904e-05, "loss": 0.9548, "step": 5395 }, { "epoch": 0.55, "grad_norm": 0.921875, "learning_rate": 4.435495203987576e-05, "loss": 0.9157, "step": 5400 }, { "epoch": 0.55, "grad_norm": 0.75390625, "learning_rate": 4.427302185887872e-05, "loss": 1.0325, "step": 5405 }, { "epoch": 0.55, "grad_norm": 0.83203125, "learning_rate": 4.4191107256409264e-05, "loss": 0.92, "step": 5410 }, { "epoch": 0.55, "grad_norm": 0.78515625, "learning_rate": 4.410920845529151e-05, "loss": 1.0232, "step": 5415 }, { "epoch": 0.55, "grad_norm": 0.95703125, "learning_rate": 4.4027325678306606e-05, "loss": 1.1333, "step": 5420 }, { "epoch": 0.55, "grad_norm": 0.64453125, "learning_rate": 4.394545914819213e-05, "loss": 1.0389, "step": 5425 }, { "epoch": 0.55, "grad_norm": 0.73828125, "learning_rate": 4.38636090876414e-05, "loss": 1.0456, "step": 5430 }, { "epoch": 0.55, "grad_norm": 0.72265625, "learning_rate": 4.3781775719302994e-05, "loss": 1.1344, "step": 5435 }, { "epoch": 0.55, "grad_norm": 0.7421875, "learning_rate": 4.3699959265780066e-05, "loss": 1.0087, "step": 5440 }, { "epoch": 0.55, "grad_norm": 0.6953125, "learning_rate": 4.361815994962974e-05, "loss": 1.0076, "step": 5445 }, { "epoch": 0.56, "grad_norm": 0.69140625, "learning_rate": 4.353637799336257e-05, "loss": 0.954, "step": 5450 }, { "epoch": 0.56, "grad_norm": 0.6796875, "learning_rate": 4.345461361944184e-05, "loss": 1.1507, "step": 5455 }, { "epoch": 0.56, "grad_norm": 1.046875, "learning_rate": 4.3372867050283e-05, "loss": 1.0085, "step": 5460 }, { "epoch": 0.56, "grad_norm": 0.765625, "learning_rate": 4.329113850825314e-05, "loss": 0.8788, "step": 5465 }, { "epoch": 0.56, "grad_norm": 0.83984375, "learning_rate": 4.320942821567023e-05, "loss": 1.0546, "step": 5470 }, { "epoch": 0.56, "grad_norm": 0.69140625, "learning_rate": 4.312773639480263e-05, "loss": 1.0464, "step": 5475 }, { "epoch": 0.56, "grad_norm": 1.7578125, "learning_rate": 4.304606326786847e-05, "loss": 0.9899, "step": 5480 }, { "epoch": 0.56, "grad_norm": 1.0390625, "learning_rate": 4.296440905703501e-05, "loss": 0.9215, "step": 5485 }, { "epoch": 0.56, "grad_norm": 0.71875, "learning_rate": 4.288277398441804e-05, "loss": 0.9959, "step": 5490 }, { "epoch": 0.56, "grad_norm": 1.21875, "learning_rate": 4.280115827208134e-05, "loss": 1.0586, "step": 5495 }, { "epoch": 0.56, "grad_norm": 0.77734375, "learning_rate": 4.271956214203598e-05, "loss": 1.0914, "step": 5500 }, { "epoch": 0.56, "grad_norm": 0.671875, "learning_rate": 4.263798581623976e-05, "loss": 0.9507, "step": 5505 }, { "epoch": 0.56, "grad_norm": 0.796875, "learning_rate": 4.255642951659664e-05, "loss": 1.158, "step": 5510 }, { "epoch": 0.56, "grad_norm": 0.78125, "learning_rate": 4.2474893464956086e-05, "loss": 0.9905, "step": 5515 }, { "epoch": 0.56, "grad_norm": 0.80859375, "learning_rate": 4.2393377883112495e-05, "loss": 1.1065, "step": 5520 }, { "epoch": 0.56, "grad_norm": 0.734375, "learning_rate": 4.231188299280456e-05, "loss": 1.0051, "step": 5525 }, { "epoch": 0.56, "grad_norm": 0.77734375, "learning_rate": 4.2230409015714724e-05, "loss": 0.9964, "step": 5530 }, { "epoch": 0.56, "grad_norm": 0.671875, "learning_rate": 4.214895617346853e-05, "loss": 1.0241, "step": 5535 }, { "epoch": 0.56, "grad_norm": 0.65625, "learning_rate": 4.206752468763397e-05, "loss": 1.0364, "step": 5540 }, { "epoch": 0.56, "grad_norm": 0.70703125, "learning_rate": 4.198611477972108e-05, "loss": 1.0132, "step": 5545 }, { "epoch": 0.57, "grad_norm": 0.875, "learning_rate": 4.190472667118105e-05, "loss": 1.0339, "step": 5550 }, { "epoch": 0.57, "grad_norm": 2.921875, "learning_rate": 4.182336058340585e-05, "loss": 0.9761, "step": 5555 }, { "epoch": 0.57, "grad_norm": 0.78515625, "learning_rate": 4.174201673772754e-05, "loss": 1.1701, "step": 5560 }, { "epoch": 0.57, "grad_norm": 0.69921875, "learning_rate": 4.166069535541768e-05, "loss": 0.917, "step": 5565 }, { "epoch": 0.57, "grad_norm": 0.69140625, "learning_rate": 4.1579396657686693e-05, "loss": 1.0072, "step": 5570 }, { "epoch": 0.57, "grad_norm": 0.79296875, "learning_rate": 4.1498120865683355e-05, "loss": 1.1712, "step": 5575 }, { "epoch": 0.57, "grad_norm": 0.87109375, "learning_rate": 4.141686820049409e-05, "loss": 1.11, "step": 5580 }, { "epoch": 0.57, "grad_norm": 0.75390625, "learning_rate": 4.1335638883142384e-05, "loss": 1.1543, "step": 5585 }, { "epoch": 0.57, "grad_norm": 0.75390625, "learning_rate": 4.12544331345883e-05, "loss": 1.0131, "step": 5590 }, { "epoch": 0.57, "grad_norm": 0.76953125, "learning_rate": 4.11732511757277e-05, "loss": 1.1151, "step": 5595 }, { "epoch": 0.57, "grad_norm": 0.6875, "learning_rate": 4.1092093227391795e-05, "loss": 1.0872, "step": 5600 }, { "epoch": 0.57, "grad_norm": 0.6015625, "learning_rate": 4.101095951034645e-05, "loss": 0.8416, "step": 5605 }, { "epoch": 0.57, "grad_norm": 0.8203125, "learning_rate": 4.092985024529164e-05, "loss": 0.9728, "step": 5610 }, { "epoch": 0.57, "grad_norm": 0.7734375, "learning_rate": 4.0848765652860766e-05, "loss": 0.9884, "step": 5615 }, { "epoch": 0.57, "grad_norm": 0.71875, "learning_rate": 4.0767705953620226e-05, "loss": 0.9671, "step": 5620 }, { "epoch": 0.57, "grad_norm": 0.828125, "learning_rate": 4.06866713680686e-05, "loss": 1.1356, "step": 5625 }, { "epoch": 0.57, "grad_norm": 0.75390625, "learning_rate": 4.0605662116636185e-05, "loss": 1.0336, "step": 5630 }, { "epoch": 0.57, "grad_norm": 0.671875, "learning_rate": 4.052467841968437e-05, "loss": 1.014, "step": 5635 }, { "epoch": 0.57, "grad_norm": 0.7265625, "learning_rate": 4.0443720497505054e-05, "loss": 1.1874, "step": 5640 }, { "epoch": 0.57, "grad_norm": 0.71484375, "learning_rate": 4.0362788570319995e-05, "loss": 1.1396, "step": 5645 }, { "epoch": 0.58, "grad_norm": 0.75, "learning_rate": 4.0281882858280205e-05, "loss": 1.0163, "step": 5650 }, { "epoch": 0.58, "grad_norm": 0.7890625, "learning_rate": 4.0201003581465484e-05, "loss": 1.0544, "step": 5655 }, { "epoch": 0.58, "grad_norm": 0.7890625, "learning_rate": 4.012015095988363e-05, "loss": 1.0106, "step": 5660 }, { "epoch": 0.58, "grad_norm": 0.78515625, "learning_rate": 4.003932521346996e-05, "loss": 1.0598, "step": 5665 }, { "epoch": 0.58, "grad_norm": 0.69140625, "learning_rate": 3.995852656208672e-05, "loss": 1.0287, "step": 5670 }, { "epoch": 0.58, "grad_norm": 0.81640625, "learning_rate": 3.9877755225522403e-05, "loss": 1.2901, "step": 5675 }, { "epoch": 0.58, "grad_norm": 0.69921875, "learning_rate": 3.979701142349123e-05, "loss": 0.9457, "step": 5680 }, { "epoch": 0.58, "grad_norm": 0.71484375, "learning_rate": 3.971629537563252e-05, "loss": 1.174, "step": 5685 }, { "epoch": 0.58, "grad_norm": 0.8359375, "learning_rate": 3.9635607301510095e-05, "loss": 0.9278, "step": 5690 }, { "epoch": 0.58, "grad_norm": 0.74609375, "learning_rate": 3.955494742061163e-05, "loss": 0.8117, "step": 5695 }, { "epoch": 0.58, "grad_norm": 0.91015625, "learning_rate": 3.947431595234823e-05, "loss": 1.1099, "step": 5700 }, { "epoch": 0.58, "grad_norm": 0.75390625, "learning_rate": 3.939371311605358e-05, "loss": 0.9595, "step": 5705 }, { "epoch": 0.58, "grad_norm": 0.62109375, "learning_rate": 3.931313913098356e-05, "loss": 1.0319, "step": 5710 }, { "epoch": 0.58, "grad_norm": 0.75390625, "learning_rate": 3.923259421631555e-05, "loss": 1.0311, "step": 5715 }, { "epoch": 0.58, "grad_norm": 0.71875, "learning_rate": 3.915207859114785e-05, "loss": 1.0689, "step": 5720 }, { "epoch": 0.58, "grad_norm": 0.8359375, "learning_rate": 3.907159247449907e-05, "loss": 1.1307, "step": 5725 }, { "epoch": 0.58, "grad_norm": 0.7890625, "learning_rate": 3.899113608530759e-05, "loss": 1.1061, "step": 5730 }, { "epoch": 0.58, "grad_norm": 0.875, "learning_rate": 3.891070964243091e-05, "loss": 1.1362, "step": 5735 }, { "epoch": 0.58, "grad_norm": 0.86328125, "learning_rate": 3.883031336464502e-05, "loss": 1.1102, "step": 5740 }, { "epoch": 0.59, "grad_norm": 0.7734375, "learning_rate": 3.874994747064394e-05, "loss": 1.1218, "step": 5745 }, { "epoch": 0.59, "grad_norm": 0.75, "learning_rate": 3.866961217903897e-05, "loss": 1.0652, "step": 5750 }, { "epoch": 0.59, "grad_norm": 0.81640625, "learning_rate": 3.8589307708358217e-05, "loss": 0.9907, "step": 5755 }, { "epoch": 0.59, "grad_norm": 0.76953125, "learning_rate": 3.850903427704591e-05, "loss": 1.025, "step": 5760 }, { "epoch": 0.59, "grad_norm": 0.90625, "learning_rate": 3.8428792103461875e-05, "loss": 0.9414, "step": 5765 }, { "epoch": 0.59, "grad_norm": 0.8359375, "learning_rate": 3.834858140588087e-05, "loss": 1.0071, "step": 5770 }, { "epoch": 0.59, "grad_norm": 0.79296875, "learning_rate": 3.826840240249207e-05, "loss": 0.9806, "step": 5775 }, { "epoch": 0.59, "grad_norm": 0.73828125, "learning_rate": 3.8188255311398434e-05, "loss": 0.9352, "step": 5780 }, { "epoch": 0.59, "grad_norm": 0.73828125, "learning_rate": 3.8108140350616086e-05, "loss": 0.9285, "step": 5785 }, { "epoch": 0.59, "grad_norm": 0.66796875, "learning_rate": 3.802805773807377e-05, "loss": 0.9716, "step": 5790 }, { "epoch": 0.59, "grad_norm": 0.77734375, "learning_rate": 3.794800769161225e-05, "loss": 1.197, "step": 5795 }, { "epoch": 0.59, "grad_norm": 0.81640625, "learning_rate": 3.7867990428983675e-05, "loss": 1.2058, "step": 5800 }, { "epoch": 0.59, "grad_norm": 0.7421875, "learning_rate": 3.7788006167850984e-05, "loss": 0.9454, "step": 5805 }, { "epoch": 0.59, "grad_norm": 0.6875, "learning_rate": 3.770805512578746e-05, "loss": 0.97, "step": 5810 }, { "epoch": 0.59, "grad_norm": 0.67578125, "learning_rate": 3.76281375202759e-05, "loss": 1.0012, "step": 5815 }, { "epoch": 0.59, "grad_norm": 0.8359375, "learning_rate": 3.7548253568708206e-05, "loss": 1.0222, "step": 5820 }, { "epoch": 0.59, "grad_norm": 0.72265625, "learning_rate": 3.746840348838474e-05, "loss": 1.135, "step": 5825 }, { "epoch": 0.59, "grad_norm": 0.66796875, "learning_rate": 3.738858749651371e-05, "loss": 0.9011, "step": 5830 }, { "epoch": 0.59, "grad_norm": 0.72265625, "learning_rate": 3.730880581021058e-05, "loss": 1.1707, "step": 5835 }, { "epoch": 0.59, "grad_norm": 0.76953125, "learning_rate": 3.722905864649754e-05, "loss": 1.1671, "step": 5840 }, { "epoch": 0.6, "grad_norm": 0.75, "learning_rate": 3.7149346222302854e-05, "loss": 1.0802, "step": 5845 }, { "epoch": 0.6, "grad_norm": 0.6953125, "learning_rate": 3.7069668754460236e-05, "loss": 0.9061, "step": 5850 }, { "epoch": 0.6, "grad_norm": 0.671875, "learning_rate": 3.699002645970842e-05, "loss": 1.0918, "step": 5855 }, { "epoch": 0.6, "grad_norm": 4.8125, "learning_rate": 3.6910419554690344e-05, "loss": 1.0834, "step": 5860 }, { "epoch": 0.6, "grad_norm": 0.78125, "learning_rate": 3.6830848255952755e-05, "loss": 1.0767, "step": 5865 }, { "epoch": 0.6, "grad_norm": 0.75, "learning_rate": 3.675131277994553e-05, "loss": 1.0405, "step": 5870 }, { "epoch": 0.6, "grad_norm": 0.78515625, "learning_rate": 3.667181334302109e-05, "loss": 1.1731, "step": 5875 }, { "epoch": 0.6, "grad_norm": 0.69921875, "learning_rate": 3.659235016143383e-05, "loss": 1.1153, "step": 5880 }, { "epoch": 0.6, "grad_norm": 0.7421875, "learning_rate": 3.6512923451339483e-05, "loss": 0.972, "step": 5885 }, { "epoch": 0.6, "grad_norm": 0.79296875, "learning_rate": 3.6433533428794674e-05, "loss": 0.9887, "step": 5890 }, { "epoch": 0.6, "grad_norm": 0.75, "learning_rate": 3.635418030975612e-05, "loss": 1.0424, "step": 5895 }, { "epoch": 0.6, "grad_norm": 0.734375, "learning_rate": 3.627486431008019e-05, "loss": 1.0604, "step": 5900 }, { "epoch": 0.6, "grad_norm": 0.71484375, "learning_rate": 3.619558564552232e-05, "loss": 1.2235, "step": 5905 }, { "epoch": 0.6, "grad_norm": 1.140625, "learning_rate": 3.611634453173634e-05, "loss": 1.0018, "step": 5910 }, { "epoch": 0.6, "grad_norm": 0.703125, "learning_rate": 3.6037141184273955e-05, "loss": 1.0535, "step": 5915 }, { "epoch": 0.6, "grad_norm": 0.80859375, "learning_rate": 3.5957975818584135e-05, "loss": 1.0967, "step": 5920 }, { "epoch": 0.6, "grad_norm": 0.7734375, "learning_rate": 3.587884865001254e-05, "loss": 1.1435, "step": 5925 }, { "epoch": 0.6, "grad_norm": 0.76953125, "learning_rate": 3.579975989380088e-05, "loss": 0.9315, "step": 5930 }, { "epoch": 0.6, "grad_norm": 0.7421875, "learning_rate": 3.572070976508645e-05, "loss": 1.1153, "step": 5935 }, { "epoch": 0.6, "grad_norm": 0.8203125, "learning_rate": 3.564169847890141e-05, "loss": 1.0063, "step": 5940 }, { "epoch": 0.61, "grad_norm": 0.70703125, "learning_rate": 3.5562726250172294e-05, "loss": 0.9381, "step": 5945 }, { "epoch": 0.61, "grad_norm": 0.81640625, "learning_rate": 3.548379329371939e-05, "loss": 0.9847, "step": 5950 }, { "epoch": 0.61, "grad_norm": 0.98828125, "learning_rate": 3.5404899824256144e-05, "loss": 1.0556, "step": 5955 }, { "epoch": 0.61, "grad_norm": 0.76953125, "learning_rate": 3.532604605638856e-05, "loss": 1.0517, "step": 5960 }, { "epoch": 0.61, "grad_norm": 0.71875, "learning_rate": 3.5247232204614745e-05, "loss": 0.8894, "step": 5965 }, { "epoch": 0.61, "grad_norm": 0.73046875, "learning_rate": 3.516845848332411e-05, "loss": 0.9472, "step": 5970 }, { "epoch": 0.61, "grad_norm": 0.7421875, "learning_rate": 3.5089725106796955e-05, "loss": 1.1139, "step": 5975 }, { "epoch": 0.61, "grad_norm": 0.65625, "learning_rate": 3.5011032289203863e-05, "loss": 0.9535, "step": 5980 }, { "epoch": 0.61, "grad_norm": 0.7734375, "learning_rate": 3.4932380244605046e-05, "loss": 1.0412, "step": 5985 }, { "epoch": 0.61, "grad_norm": 0.7734375, "learning_rate": 3.485376918694981e-05, "loss": 1.0978, "step": 5990 }, { "epoch": 0.61, "grad_norm": 0.671875, "learning_rate": 3.477519933007599e-05, "loss": 1.022, "step": 5995 }, { "epoch": 0.61, "grad_norm": 0.8046875, "learning_rate": 3.469667088770934e-05, "loss": 1.1041, "step": 6000 }, { "epoch": 0.61, "grad_norm": 0.8203125, "learning_rate": 3.461818407346292e-05, "loss": 0.9351, "step": 6005 }, { "epoch": 0.61, "grad_norm": 0.66796875, "learning_rate": 3.453973910083659e-05, "loss": 0.9167, "step": 6010 }, { "epoch": 0.61, "grad_norm": 0.828125, "learning_rate": 3.446133618321642e-05, "loss": 1.0682, "step": 6015 }, { "epoch": 0.61, "grad_norm": 0.671875, "learning_rate": 3.438297553387402e-05, "loss": 1.0559, "step": 6020 }, { "epoch": 0.61, "grad_norm": 4.8125, "learning_rate": 3.430465736596605e-05, "loss": 1.1181, "step": 6025 }, { "epoch": 0.61, "grad_norm": 0.8046875, "learning_rate": 3.422638189253364e-05, "loss": 1.0976, "step": 6030 }, { "epoch": 0.61, "grad_norm": 0.75390625, "learning_rate": 3.4148149326501745e-05, "loss": 0.9938, "step": 6035 }, { "epoch": 0.62, "grad_norm": 2.515625, "learning_rate": 3.406995988067858e-05, "loss": 1.0343, "step": 6040 }, { "epoch": 0.62, "grad_norm": 0.6640625, "learning_rate": 3.399181376775514e-05, "loss": 1.1531, "step": 6045 }, { "epoch": 0.62, "grad_norm": 0.74609375, "learning_rate": 3.391371120030449e-05, "loss": 0.9443, "step": 6050 }, { "epoch": 0.62, "grad_norm": 0.828125, "learning_rate": 3.3835652390781246e-05, "loss": 1.0006, "step": 6055 }, { "epoch": 0.62, "grad_norm": 0.78515625, "learning_rate": 3.375763755152101e-05, "loss": 1.0689, "step": 6060 }, { "epoch": 0.62, "grad_norm": 0.8515625, "learning_rate": 3.367966689473979e-05, "loss": 1.0221, "step": 6065 }, { "epoch": 0.62, "grad_norm": 0.6953125, "learning_rate": 3.360174063253335e-05, "loss": 0.9852, "step": 6070 }, { "epoch": 0.62, "grad_norm": 0.68359375, "learning_rate": 3.3523858976876774e-05, "loss": 1.1333, "step": 6075 }, { "epoch": 0.62, "grad_norm": 0.7578125, "learning_rate": 3.3446022139623744e-05, "loss": 1.0534, "step": 6080 }, { "epoch": 0.62, "grad_norm": 0.63671875, "learning_rate": 3.336823033250602e-05, "loss": 0.961, "step": 6085 }, { "epoch": 0.62, "grad_norm": 0.65625, "learning_rate": 3.329048376713294e-05, "loss": 0.9828, "step": 6090 }, { "epoch": 0.62, "grad_norm": 0.84765625, "learning_rate": 3.321278265499072e-05, "loss": 0.9971, "step": 6095 }, { "epoch": 0.62, "grad_norm": 0.70703125, "learning_rate": 3.3135127207441934e-05, "loss": 1.0486, "step": 6100 }, { "epoch": 0.62, "grad_norm": 0.7421875, "learning_rate": 3.305751763572497e-05, "loss": 0.9744, "step": 6105 }, { "epoch": 0.62, "grad_norm": 0.8046875, "learning_rate": 3.297995415095342e-05, "loss": 1.1336, "step": 6110 }, { "epoch": 0.62, "grad_norm": 0.7890625, "learning_rate": 3.2902436964115435e-05, "loss": 0.9069, "step": 6115 }, { "epoch": 0.62, "grad_norm": 0.73828125, "learning_rate": 3.282496628607337e-05, "loss": 0.8783, "step": 6120 }, { "epoch": 0.62, "grad_norm": 0.75, "learning_rate": 3.274754232756294e-05, "loss": 0.9949, "step": 6125 }, { "epoch": 0.62, "grad_norm": 0.765625, "learning_rate": 3.267016529919282e-05, "loss": 1.1357, "step": 6130 }, { "epoch": 0.62, "grad_norm": 0.84375, "learning_rate": 3.259283541144403e-05, "loss": 1.1777, "step": 6135 }, { "epoch": 0.63, "grad_norm": 0.74609375, "learning_rate": 3.251555287466936e-05, "loss": 1.0011, "step": 6140 }, { "epoch": 0.63, "grad_norm": 0.9296875, "learning_rate": 3.2438317899092796e-05, "loss": 1.0628, "step": 6145 }, { "epoch": 0.63, "grad_norm": 0.671875, "learning_rate": 3.2361130694808895e-05, "loss": 1.042, "step": 6150 }, { "epoch": 0.63, "grad_norm": 0.828125, "learning_rate": 3.228399147178238e-05, "loss": 1.3185, "step": 6155 }, { "epoch": 0.63, "grad_norm": 0.66015625, "learning_rate": 3.220690043984734e-05, "loss": 1.18, "step": 6160 }, { "epoch": 0.63, "grad_norm": 0.83984375, "learning_rate": 3.212985780870683e-05, "loss": 1.1598, "step": 6165 }, { "epoch": 0.63, "grad_norm": 0.72265625, "learning_rate": 3.2052863787932245e-05, "loss": 1.1073, "step": 6170 }, { "epoch": 0.63, "grad_norm": 0.80859375, "learning_rate": 3.197591858696275e-05, "loss": 0.9834, "step": 6175 }, { "epoch": 0.63, "grad_norm": 0.83203125, "learning_rate": 3.189902241510467e-05, "loss": 0.9688, "step": 6180 }, { "epoch": 0.63, "grad_norm": 0.796875, "learning_rate": 3.1822175481531034e-05, "loss": 1.2171, "step": 6185 }, { "epoch": 0.63, "grad_norm": 0.88671875, "learning_rate": 3.174537799528089e-05, "loss": 1.1525, "step": 6190 }, { "epoch": 0.63, "grad_norm": 0.66796875, "learning_rate": 3.1668630165258736e-05, "loss": 0.9984, "step": 6195 }, { "epoch": 0.63, "grad_norm": 0.81640625, "learning_rate": 3.1591932200234095e-05, "loss": 1.2287, "step": 6200 }, { "epoch": 0.63, "grad_norm": 0.765625, "learning_rate": 3.151528430884078e-05, "loss": 1.0306, "step": 6205 }, { "epoch": 0.63, "grad_norm": 0.71875, "learning_rate": 3.143868669957637e-05, "loss": 0.9547, "step": 6210 }, { "epoch": 0.63, "grad_norm": 0.81640625, "learning_rate": 3.136213958080175e-05, "loss": 1.0548, "step": 6215 }, { "epoch": 0.63, "grad_norm": 0.921875, "learning_rate": 3.1285643160740394e-05, "loss": 1.1733, "step": 6220 }, { "epoch": 0.63, "grad_norm": 0.9375, "learning_rate": 3.1209197647477885e-05, "loss": 1.092, "step": 6225 }, { "epoch": 0.63, "grad_norm": 0.8984375, "learning_rate": 3.113280324896134e-05, "loss": 1.2049, "step": 6230 }, { "epoch": 0.63, "grad_norm": 0.79296875, "learning_rate": 3.105646017299882e-05, "loss": 1.0478, "step": 6235 }, { "epoch": 0.64, "grad_norm": 0.8515625, "learning_rate": 3.0980168627258775e-05, "loss": 1.0877, "step": 6240 }, { "epoch": 0.64, "grad_norm": 0.95703125, "learning_rate": 3.09039288192695e-05, "loss": 1.0203, "step": 6245 }, { "epoch": 0.64, "grad_norm": 0.703125, "learning_rate": 3.082774095641853e-05, "loss": 0.9786, "step": 6250 }, { "epoch": 0.64, "grad_norm": 0.7890625, "learning_rate": 3.075160524595212e-05, "loss": 1.0277, "step": 6255 }, { "epoch": 0.64, "grad_norm": 0.765625, "learning_rate": 3.0675521894974645e-05, "loss": 1.0803, "step": 6260 }, { "epoch": 0.64, "grad_norm": 0.67578125, "learning_rate": 3.059949111044809e-05, "loss": 0.9742, "step": 6265 }, { "epoch": 0.64, "grad_norm": 1.046875, "learning_rate": 3.052351309919136e-05, "loss": 1.092, "step": 6270 }, { "epoch": 0.64, "grad_norm": 0.7109375, "learning_rate": 3.04475880678799e-05, "loss": 0.9507, "step": 6275 }, { "epoch": 0.64, "grad_norm": 0.71875, "learning_rate": 3.0371716223044998e-05, "loss": 1.0087, "step": 6280 }, { "epoch": 0.64, "grad_norm": 0.796875, "learning_rate": 3.0295897771073266e-05, "loss": 1.2292, "step": 6285 }, { "epoch": 0.64, "grad_norm": 1.3359375, "learning_rate": 3.0220132918206073e-05, "loss": 1.06, "step": 6290 }, { "epoch": 0.64, "grad_norm": 0.76171875, "learning_rate": 3.014442187053901e-05, "loss": 1.0489, "step": 6295 }, { "epoch": 0.64, "grad_norm": 0.7734375, "learning_rate": 3.006876483402128e-05, "loss": 1.0905, "step": 6300 }, { "epoch": 0.64, "grad_norm": 0.74609375, "learning_rate": 2.9993162014455145e-05, "loss": 0.9829, "step": 6305 }, { "epoch": 0.64, "grad_norm": 0.7578125, "learning_rate": 2.9917613617495484e-05, "loss": 1.1404, "step": 6310 }, { "epoch": 0.64, "grad_norm": 0.75, "learning_rate": 2.984211984864902e-05, "loss": 0.8786, "step": 6315 }, { "epoch": 0.64, "grad_norm": 0.76953125, "learning_rate": 2.9766680913273915e-05, "loss": 1.0549, "step": 6320 }, { "epoch": 0.64, "grad_norm": 0.7265625, "learning_rate": 2.9691297016579223e-05, "loss": 1.004, "step": 6325 }, { "epoch": 0.64, "grad_norm": 0.8125, "learning_rate": 2.9615968363624212e-05, "loss": 1.1635, "step": 6330 }, { "epoch": 0.65, "grad_norm": 0.71484375, "learning_rate": 2.9540695159317915e-05, "loss": 0.9876, "step": 6335 }, { "epoch": 0.65, "grad_norm": 0.76953125, "learning_rate": 2.946547760841853e-05, "loss": 0.9598, "step": 6340 }, { "epoch": 0.65, "grad_norm": 1.5859375, "learning_rate": 2.939031591553286e-05, "loss": 1.0414, "step": 6345 }, { "epoch": 0.65, "grad_norm": 0.73828125, "learning_rate": 2.9315210285115736e-05, "loss": 1.0624, "step": 6350 }, { "epoch": 0.65, "grad_norm": 0.62109375, "learning_rate": 2.924016092146956e-05, "loss": 1.074, "step": 6355 }, { "epoch": 0.65, "grad_norm": 0.76953125, "learning_rate": 2.9165168028743605e-05, "loss": 1.1979, "step": 6360 }, { "epoch": 0.65, "grad_norm": 0.765625, "learning_rate": 2.9090231810933538e-05, "loss": 1.1097, "step": 6365 }, { "epoch": 0.65, "grad_norm": 1.46875, "learning_rate": 2.9015352471880953e-05, "loss": 1.2338, "step": 6370 }, { "epoch": 0.65, "grad_norm": 0.734375, "learning_rate": 2.8940530215272577e-05, "loss": 1.0206, "step": 6375 }, { "epoch": 0.65, "grad_norm": 0.77734375, "learning_rate": 2.8865765244639963e-05, "loss": 1.0175, "step": 6380 }, { "epoch": 0.65, "grad_norm": 0.69140625, "learning_rate": 2.879105776335878e-05, "loss": 1.2262, "step": 6385 }, { "epoch": 0.65, "grad_norm": 0.77734375, "learning_rate": 2.8716407974648385e-05, "loss": 1.0278, "step": 6390 }, { "epoch": 0.65, "grad_norm": 1.109375, "learning_rate": 2.8641816081571148e-05, "loss": 1.1596, "step": 6395 }, { "epoch": 0.65, "grad_norm": 0.8828125, "learning_rate": 2.8567282287031904e-05, "loss": 1.007, "step": 6400 }, { "epoch": 0.65, "grad_norm": 0.68359375, "learning_rate": 2.8492806793777566e-05, "loss": 0.9987, "step": 6405 }, { "epoch": 0.65, "grad_norm": 0.671875, "learning_rate": 2.841838980439636e-05, "loss": 0.9187, "step": 6410 }, { "epoch": 0.65, "grad_norm": 0.66015625, "learning_rate": 2.8344031521317415e-05, "loss": 0.9507, "step": 6415 }, { "epoch": 0.65, "grad_norm": 0.7265625, "learning_rate": 2.8269732146810145e-05, "loss": 1.0025, "step": 6420 }, { "epoch": 0.65, "grad_norm": 0.79296875, "learning_rate": 2.8195491882983726e-05, "loss": 1.0142, "step": 6425 }, { "epoch": 0.65, "grad_norm": 0.75390625, "learning_rate": 2.8121310931786537e-05, "loss": 1.0992, "step": 6430 }, { "epoch": 0.66, "grad_norm": 0.75, "learning_rate": 2.8047189495005656e-05, "loss": 0.9699, "step": 6435 }, { "epoch": 0.66, "grad_norm": 0.70703125, "learning_rate": 2.797312777426624e-05, "loss": 1.0096, "step": 6440 }, { "epoch": 0.66, "grad_norm": 0.69921875, "learning_rate": 2.7899125971030938e-05, "loss": 0.9341, "step": 6445 }, { "epoch": 0.66, "grad_norm": 0.765625, "learning_rate": 2.7825184286599543e-05, "loss": 0.9735, "step": 6450 }, { "epoch": 0.66, "grad_norm": 0.65625, "learning_rate": 2.775130292210822e-05, "loss": 0.9685, "step": 6455 }, { "epoch": 0.66, "grad_norm": 0.78125, "learning_rate": 2.7677482078529083e-05, "loss": 1.1064, "step": 6460 }, { "epoch": 0.66, "grad_norm": 0.75390625, "learning_rate": 2.7603721956669604e-05, "loss": 1.1283, "step": 6465 }, { "epoch": 0.66, "grad_norm": 0.7734375, "learning_rate": 2.7530022757172092e-05, "loss": 0.987, "step": 6470 }, { "epoch": 0.66, "grad_norm": 1.109375, "learning_rate": 2.7456384680513114e-05, "loss": 1.0236, "step": 6475 }, { "epoch": 0.66, "grad_norm": 1.0859375, "learning_rate": 2.738280792700303e-05, "loss": 0.9482, "step": 6480 }, { "epoch": 0.66, "grad_norm": 0.79296875, "learning_rate": 2.730929269678534e-05, "loss": 1.1663, "step": 6485 }, { "epoch": 0.66, "grad_norm": 0.78125, "learning_rate": 2.7235839189836127e-05, "loss": 0.9718, "step": 6490 }, { "epoch": 0.66, "grad_norm": 0.7109375, "learning_rate": 2.7162447605963716e-05, "loss": 1.1799, "step": 6495 }, { "epoch": 0.66, "grad_norm": 0.703125, "learning_rate": 2.7089118144807886e-05, "loss": 0.9379, "step": 6500 }, { "epoch": 0.66, "grad_norm": 0.80078125, "learning_rate": 2.7015851005839465e-05, "loss": 1.0892, "step": 6505 }, { "epoch": 0.66, "grad_norm": 0.70703125, "learning_rate": 2.694264638835974e-05, "loss": 0.979, "step": 6510 }, { "epoch": 0.66, "grad_norm": 0.90625, "learning_rate": 2.686950449149993e-05, "loss": 1.1582, "step": 6515 }, { "epoch": 0.66, "grad_norm": 0.72265625, "learning_rate": 2.6796425514220646e-05, "loss": 0.9576, "step": 6520 }, { "epoch": 0.66, "grad_norm": 0.828125, "learning_rate": 2.672340965531135e-05, "loss": 1.1961, "step": 6525 }, { "epoch": 0.67, "grad_norm": 0.71875, "learning_rate": 2.6650457113389794e-05, "loss": 1.0329, "step": 6530 }, { "epoch": 0.67, "grad_norm": 0.71875, "learning_rate": 2.657756808690151e-05, "loss": 0.9499, "step": 6535 }, { "epoch": 0.67, "grad_norm": 0.80859375, "learning_rate": 2.6504742774119225e-05, "loss": 1.1772, "step": 6540 }, { "epoch": 0.67, "grad_norm": 0.71875, "learning_rate": 2.6431981373142413e-05, "loss": 1.0316, "step": 6545 }, { "epoch": 0.67, "grad_norm": 0.828125, "learning_rate": 2.635928408189666e-05, "loss": 0.9518, "step": 6550 }, { "epoch": 0.67, "grad_norm": 0.79296875, "learning_rate": 2.6286651098133107e-05, "loss": 1.063, "step": 6555 }, { "epoch": 0.67, "grad_norm": 0.74609375, "learning_rate": 2.621408261942806e-05, "loss": 1.0582, "step": 6560 }, { "epoch": 0.67, "grad_norm": 0.7734375, "learning_rate": 2.6141578843182312e-05, "loss": 0.9849, "step": 6565 }, { "epoch": 0.67, "grad_norm": 0.73828125, "learning_rate": 2.6069139966620648e-05, "loss": 0.96, "step": 6570 }, { "epoch": 0.67, "grad_norm": 0.7734375, "learning_rate": 2.5996766186791323e-05, "loss": 1.1181, "step": 6575 }, { "epoch": 0.67, "grad_norm": 0.74609375, "learning_rate": 2.592445770056551e-05, "loss": 0.9954, "step": 6580 }, { "epoch": 0.67, "grad_norm": 0.7421875, "learning_rate": 2.5852214704636756e-05, "loss": 0.9518, "step": 6585 }, { "epoch": 0.67, "grad_norm": 0.7421875, "learning_rate": 2.578003739552053e-05, "loss": 0.9734, "step": 6590 }, { "epoch": 0.67, "grad_norm": 0.7265625, "learning_rate": 2.570792596955355e-05, "loss": 0.9734, "step": 6595 }, { "epoch": 0.67, "grad_norm": 0.74609375, "learning_rate": 2.5635880622893282e-05, "loss": 0.9918, "step": 6600 }, { "epoch": 0.67, "grad_norm": 0.67578125, "learning_rate": 2.5563901551517576e-05, "loss": 1.0667, "step": 6605 }, { "epoch": 0.67, "grad_norm": 0.6796875, "learning_rate": 2.5491988951223878e-05, "loss": 0.9904, "step": 6610 }, { "epoch": 0.67, "grad_norm": 0.76171875, "learning_rate": 2.5420143017628885e-05, "loss": 1.134, "step": 6615 }, { "epoch": 0.67, "grad_norm": 0.78515625, "learning_rate": 2.5348363946167913e-05, "loss": 0.9767, "step": 6620 }, { "epoch": 0.67, "grad_norm": 0.80078125, "learning_rate": 2.527665193209442e-05, "loss": 1.0921, "step": 6625 }, { "epoch": 0.68, "grad_norm": 1.1171875, "learning_rate": 2.5205007170479444e-05, "loss": 0.9865, "step": 6630 }, { "epoch": 0.68, "grad_norm": 0.7578125, "learning_rate": 2.5133429856211066e-05, "loss": 1.035, "step": 6635 }, { "epoch": 0.68, "grad_norm": 0.6953125, "learning_rate": 2.5061920183993977e-05, "loss": 1.0116, "step": 6640 }, { "epoch": 0.68, "grad_norm": 1.9453125, "learning_rate": 2.4990478348348744e-05, "loss": 0.8975, "step": 6645 }, { "epoch": 0.68, "grad_norm": 0.796875, "learning_rate": 2.4919104543611476e-05, "loss": 1.1114, "step": 6650 }, { "epoch": 0.68, "grad_norm": 0.796875, "learning_rate": 2.4847798963933256e-05, "loss": 1.0895, "step": 6655 }, { "epoch": 0.68, "grad_norm": 1.171875, "learning_rate": 2.4776561803279524e-05, "loss": 1.2028, "step": 6660 }, { "epoch": 0.68, "grad_norm": 0.796875, "learning_rate": 2.4705393255429616e-05, "loss": 1.1115, "step": 6665 }, { "epoch": 0.68, "grad_norm": 0.79296875, "learning_rate": 2.463429351397624e-05, "loss": 1.1289, "step": 6670 }, { "epoch": 0.68, "grad_norm": 0.80078125, "learning_rate": 2.456326277232494e-05, "loss": 1.049, "step": 6675 }, { "epoch": 0.68, "grad_norm": 0.7421875, "learning_rate": 2.4492301223693553e-05, "loss": 1.039, "step": 6680 }, { "epoch": 0.68, "grad_norm": 0.65234375, "learning_rate": 2.4421409061111704e-05, "loss": 1.0935, "step": 6685 }, { "epoch": 0.68, "grad_norm": 0.72265625, "learning_rate": 2.435058647742028e-05, "loss": 1.0483, "step": 6690 }, { "epoch": 0.68, "grad_norm": 0.8046875, "learning_rate": 2.4279833665270862e-05, "loss": 1.0762, "step": 6695 }, { "epoch": 0.68, "grad_norm": 1.9453125, "learning_rate": 2.4209150817125327e-05, "loss": 1.1581, "step": 6700 }, { "epoch": 0.68, "grad_norm": 2.234375, "learning_rate": 2.413853812525516e-05, "loss": 1.0193, "step": 6705 }, { "epoch": 0.68, "grad_norm": 0.55859375, "learning_rate": 2.4067995781740975e-05, "loss": 1.0372, "step": 6710 }, { "epoch": 0.68, "grad_norm": 0.703125, "learning_rate": 2.399752397847214e-05, "loss": 1.0451, "step": 6715 }, { "epoch": 0.68, "grad_norm": 0.76953125, "learning_rate": 2.392712290714605e-05, "loss": 1.0981, "step": 6720 }, { "epoch": 0.68, "grad_norm": 0.75390625, "learning_rate": 2.3856792759267716e-05, "loss": 1.2807, "step": 6725 }, { "epoch": 0.69, "grad_norm": 0.76171875, "learning_rate": 2.378653372614923e-05, "loss": 1.1395, "step": 6730 }, { "epoch": 0.69, "grad_norm": 0.80859375, "learning_rate": 2.371634599890923e-05, "loss": 0.9596, "step": 6735 }, { "epoch": 0.69, "grad_norm": 0.7734375, "learning_rate": 2.3646229768472376e-05, "loss": 1.1523, "step": 6740 }, { "epoch": 0.69, "grad_norm": 0.75, "learning_rate": 2.3576185225568904e-05, "loss": 1.1069, "step": 6745 }, { "epoch": 0.69, "grad_norm": 0.6640625, "learning_rate": 2.3506212560733986e-05, "loss": 1.2153, "step": 6750 }, { "epoch": 0.69, "grad_norm": 0.6328125, "learning_rate": 2.3436311964307252e-05, "loss": 0.8835, "step": 6755 }, { "epoch": 0.69, "grad_norm": 0.640625, "learning_rate": 2.3366483626432328e-05, "loss": 0.9273, "step": 6760 }, { "epoch": 0.69, "grad_norm": 0.6953125, "learning_rate": 2.329672773705631e-05, "loss": 0.9277, "step": 6765 }, { "epoch": 0.69, "grad_norm": 0.7265625, "learning_rate": 2.3227044485929185e-05, "loss": 0.8623, "step": 6770 }, { "epoch": 0.69, "grad_norm": 0.78125, "learning_rate": 2.3157434062603327e-05, "loss": 1.0534, "step": 6775 }, { "epoch": 0.69, "grad_norm": 0.83984375, "learning_rate": 2.3087896656433057e-05, "loss": 0.9354, "step": 6780 }, { "epoch": 0.69, "grad_norm": 0.83203125, "learning_rate": 2.3018432456574025e-05, "loss": 0.9725, "step": 6785 }, { "epoch": 0.69, "grad_norm": 0.69921875, "learning_rate": 2.2949041651982763e-05, "loss": 1.0576, "step": 6790 }, { "epoch": 0.69, "grad_norm": 0.71875, "learning_rate": 2.28797244314162e-05, "loss": 1.006, "step": 6795 }, { "epoch": 0.69, "grad_norm": 0.69140625, "learning_rate": 2.2810480983431022e-05, "loss": 0.9865, "step": 6800 }, { "epoch": 0.69, "grad_norm": 0.83984375, "learning_rate": 2.2741311496383254e-05, "loss": 1.3458, "step": 6805 }, { "epoch": 0.69, "grad_norm": 0.796875, "learning_rate": 2.2672216158427816e-05, "loss": 0.9439, "step": 6810 }, { "epoch": 0.69, "grad_norm": 0.81640625, "learning_rate": 2.2603195157517825e-05, "loss": 1.0935, "step": 6815 }, { "epoch": 0.69, "grad_norm": 0.65234375, "learning_rate": 2.2534248681404247e-05, "loss": 1.0236, "step": 6820 }, { "epoch": 0.7, "grad_norm": 0.71484375, "learning_rate": 2.246537691763529e-05, "loss": 1.0428, "step": 6825 }, { "epoch": 0.7, "grad_norm": 0.71484375, "learning_rate": 2.2396580053555956e-05, "loss": 1.1085, "step": 6830 }, { "epoch": 0.7, "grad_norm": 0.64453125, "learning_rate": 2.232785827630749e-05, "loss": 1.0015, "step": 6835 }, { "epoch": 0.7, "grad_norm": 0.796875, "learning_rate": 2.2259211772826886e-05, "loss": 1.0249, "step": 6840 }, { "epoch": 0.7, "grad_norm": 0.76171875, "learning_rate": 2.2190640729846396e-05, "loss": 1.1303, "step": 6845 }, { "epoch": 0.7, "grad_norm": 0.74609375, "learning_rate": 2.2122145333892964e-05, "loss": 1.0429, "step": 6850 }, { "epoch": 0.7, "grad_norm": 0.7265625, "learning_rate": 2.2053725771287816e-05, "loss": 1.0856, "step": 6855 }, { "epoch": 0.7, "grad_norm": 0.80078125, "learning_rate": 2.1985382228145873e-05, "loss": 1.0638, "step": 6860 }, { "epoch": 0.7, "grad_norm": 4.1875, "learning_rate": 2.1917114890375205e-05, "loss": 1.0854, "step": 6865 }, { "epoch": 0.7, "grad_norm": 0.7265625, "learning_rate": 2.1848923943676703e-05, "loss": 1.0279, "step": 6870 }, { "epoch": 0.7, "grad_norm": 0.8515625, "learning_rate": 2.1780809573543377e-05, "loss": 0.9827, "step": 6875 }, { "epoch": 0.7, "grad_norm": 0.7265625, "learning_rate": 2.1712771965259953e-05, "loss": 0.8368, "step": 6880 }, { "epoch": 0.7, "grad_norm": 2.71875, "learning_rate": 2.1644811303902357e-05, "loss": 1.0574, "step": 6885 }, { "epoch": 0.7, "grad_norm": 0.8828125, "learning_rate": 2.1576927774337196e-05, "loss": 1.0771, "step": 6890 }, { "epoch": 0.7, "grad_norm": 0.7578125, "learning_rate": 2.1509121561221274e-05, "loss": 0.9592, "step": 6895 }, { "epoch": 0.7, "grad_norm": 0.796875, "learning_rate": 2.1441392849001048e-05, "loss": 1.1832, "step": 6900 }, { "epoch": 0.7, "grad_norm": 0.7734375, "learning_rate": 2.1373741821912234e-05, "loss": 1.0718, "step": 6905 }, { "epoch": 0.7, "grad_norm": 0.73046875, "learning_rate": 2.130616866397912e-05, "loss": 0.9542, "step": 6910 }, { "epoch": 0.7, "grad_norm": 0.69921875, "learning_rate": 2.1238673559014237e-05, "loss": 1.0957, "step": 6915 }, { "epoch": 0.7, "grad_norm": 0.765625, "learning_rate": 2.117125669061782e-05, "loss": 1.1128, "step": 6920 }, { "epoch": 0.71, "grad_norm": 0.77734375, "learning_rate": 2.110391824217723e-05, "loss": 0.9538, "step": 6925 }, { "epoch": 0.71, "grad_norm": 0.86328125, "learning_rate": 2.1036658396866543e-05, "loss": 1.2052, "step": 6930 }, { "epoch": 0.71, "grad_norm": 0.796875, "learning_rate": 2.0969477337646005e-05, "loss": 0.9397, "step": 6935 }, { "epoch": 0.71, "grad_norm": 0.76171875, "learning_rate": 2.090237524726156e-05, "loss": 1.038, "step": 6940 }, { "epoch": 0.71, "grad_norm": 0.67578125, "learning_rate": 2.0835352308244298e-05, "loss": 1.0399, "step": 6945 }, { "epoch": 0.71, "grad_norm": 0.83984375, "learning_rate": 2.076840870291011e-05, "loss": 1.2341, "step": 6950 }, { "epoch": 0.71, "grad_norm": 1.2890625, "learning_rate": 2.070154461335894e-05, "loss": 0.9457, "step": 6955 }, { "epoch": 0.71, "grad_norm": 0.70703125, "learning_rate": 2.0634760221474526e-05, "loss": 1.1267, "step": 6960 }, { "epoch": 0.71, "grad_norm": 0.78515625, "learning_rate": 2.05680557089238e-05, "loss": 0.9812, "step": 6965 }, { "epoch": 0.71, "grad_norm": 0.79296875, "learning_rate": 2.0501431257156395e-05, "loss": 1.202, "step": 6970 }, { "epoch": 0.71, "grad_norm": 0.6875, "learning_rate": 2.0434887047404163e-05, "loss": 0.9614, "step": 6975 }, { "epoch": 0.71, "grad_norm": 0.77734375, "learning_rate": 2.0368423260680674e-05, "loss": 0.9797, "step": 6980 }, { "epoch": 0.71, "grad_norm": 0.7578125, "learning_rate": 2.0302040077780744e-05, "loss": 1.0879, "step": 6985 }, { "epoch": 0.71, "grad_norm": 0.78515625, "learning_rate": 2.023573767927993e-05, "loss": 0.984, "step": 6990 }, { "epoch": 0.71, "grad_norm": 0.6328125, "learning_rate": 2.0169516245534032e-05, "loss": 0.8387, "step": 6995 }, { "epoch": 0.71, "grad_norm": 0.796875, "learning_rate": 2.010337595667861e-05, "loss": 0.9795, "step": 7000 }, { "epoch": 0.71, "grad_norm": 0.7109375, "learning_rate": 2.00373169926285e-05, "loss": 1.1734, "step": 7005 }, { "epoch": 0.71, "grad_norm": 0.75, "learning_rate": 1.99713395330773e-05, "loss": 1.006, "step": 7010 }, { "epoch": 0.71, "grad_norm": 0.82421875, "learning_rate": 1.9905443757496966e-05, "loss": 1.1849, "step": 7015 }, { "epoch": 0.71, "grad_norm": 0.734375, "learning_rate": 1.9839629845137147e-05, "loss": 0.9772, "step": 7020 }, { "epoch": 0.72, "grad_norm": 0.79296875, "learning_rate": 1.9773897975024873e-05, "loss": 1.0551, "step": 7025 }, { "epoch": 0.72, "grad_norm": 0.83984375, "learning_rate": 1.9708248325964028e-05, "loss": 1.107, "step": 7030 }, { "epoch": 0.72, "grad_norm": 0.65234375, "learning_rate": 1.9642681076534792e-05, "loss": 0.9541, "step": 7035 }, { "epoch": 0.72, "grad_norm": 0.765625, "learning_rate": 1.957719640509321e-05, "loss": 1.0616, "step": 7040 }, { "epoch": 0.72, "grad_norm": 0.89453125, "learning_rate": 1.9511794489770712e-05, "loss": 1.0631, "step": 7045 }, { "epoch": 0.72, "grad_norm": 0.62109375, "learning_rate": 1.94464755084736e-05, "loss": 1.0395, "step": 7050 }, { "epoch": 0.72, "grad_norm": 0.7265625, "learning_rate": 1.938123963888257e-05, "loss": 0.9842, "step": 7055 }, { "epoch": 0.72, "grad_norm": 0.7734375, "learning_rate": 1.9316087058452304e-05, "loss": 1.0299, "step": 7060 }, { "epoch": 0.72, "grad_norm": 0.8125, "learning_rate": 1.9251017944410822e-05, "loss": 1.0622, "step": 7065 }, { "epoch": 0.72, "grad_norm": 0.8984375, "learning_rate": 1.9186032473759147e-05, "loss": 1.134, "step": 7070 }, { "epoch": 0.72, "grad_norm": 0.796875, "learning_rate": 1.91211308232708e-05, "loss": 0.9231, "step": 7075 }, { "epoch": 0.72, "grad_norm": 0.828125, "learning_rate": 1.9056313169491264e-05, "loss": 0.928, "step": 7080 }, { "epoch": 0.72, "grad_norm": 0.64453125, "learning_rate": 1.8991579688737538e-05, "loss": 0.8661, "step": 7085 }, { "epoch": 0.72, "grad_norm": 0.77734375, "learning_rate": 1.8926930557097644e-05, "loss": 1.0144, "step": 7090 }, { "epoch": 0.72, "grad_norm": 0.796875, "learning_rate": 1.8862365950430176e-05, "loss": 1.1465, "step": 7095 }, { "epoch": 0.72, "grad_norm": 0.6484375, "learning_rate": 1.8797886044363788e-05, "loss": 0.9353, "step": 7100 }, { "epoch": 0.72, "grad_norm": 0.703125, "learning_rate": 1.8733491014296772e-05, "loss": 1.0119, "step": 7105 }, { "epoch": 0.72, "grad_norm": 0.78515625, "learning_rate": 1.8669181035396465e-05, "loss": 0.9608, "step": 7110 }, { "epoch": 0.72, "grad_norm": 0.64453125, "learning_rate": 1.8604956282598894e-05, "loss": 1.2149, "step": 7115 }, { "epoch": 0.73, "grad_norm": 0.921875, "learning_rate": 1.854081693060828e-05, "loss": 0.9177, "step": 7120 }, { "epoch": 0.73, "grad_norm": 0.7109375, "learning_rate": 1.8476763153896486e-05, "loss": 1.0206, "step": 7125 }, { "epoch": 0.73, "grad_norm": 0.80078125, "learning_rate": 1.8412795126702638e-05, "loss": 1.1597, "step": 7130 }, { "epoch": 0.73, "grad_norm": 0.72265625, "learning_rate": 1.8348913023032522e-05, "loss": 1.0858, "step": 7135 }, { "epoch": 0.73, "grad_norm": 0.76171875, "learning_rate": 1.8285117016658314e-05, "loss": 1.1216, "step": 7140 }, { "epoch": 0.73, "grad_norm": 0.93359375, "learning_rate": 1.8221407281117913e-05, "loss": 1.029, "step": 7145 }, { "epoch": 0.73, "grad_norm": 0.796875, "learning_rate": 1.815778398971455e-05, "loss": 1.0972, "step": 7150 }, { "epoch": 0.73, "grad_norm": 0.79296875, "learning_rate": 1.8094247315516333e-05, "loss": 0.899, "step": 7155 }, { "epoch": 0.73, "grad_norm": 0.7734375, "learning_rate": 1.803079743135573e-05, "loss": 0.8719, "step": 7160 }, { "epoch": 0.73, "grad_norm": 0.68359375, "learning_rate": 1.7967434509829124e-05, "loss": 1.0492, "step": 7165 }, { "epoch": 0.73, "grad_norm": 0.79296875, "learning_rate": 1.7904158723296398e-05, "loss": 1.0277, "step": 7170 }, { "epoch": 0.73, "grad_norm": 0.65234375, "learning_rate": 1.7840970243880317e-05, "loss": 1.0728, "step": 7175 }, { "epoch": 0.73, "grad_norm": 0.58203125, "learning_rate": 1.7777869243466207e-05, "loss": 1.0019, "step": 7180 }, { "epoch": 0.73, "grad_norm": 0.828125, "learning_rate": 1.771485589370145e-05, "loss": 1.0241, "step": 7185 }, { "epoch": 0.73, "grad_norm": 0.625, "learning_rate": 1.7651930365994966e-05, "loss": 1.016, "step": 7190 }, { "epoch": 0.73, "grad_norm": 0.765625, "learning_rate": 1.75890928315168e-05, "loss": 0.8753, "step": 7195 }, { "epoch": 0.73, "grad_norm": 0.75390625, "learning_rate": 1.7526343461197618e-05, "loss": 0.936, "step": 7200 }, { "epoch": 0.73, "grad_norm": 0.73046875, "learning_rate": 1.746368242572829e-05, "loss": 1.093, "step": 7205 }, { "epoch": 0.73, "grad_norm": 1.3671875, "learning_rate": 1.7401109895559353e-05, "loss": 1.2597, "step": 7210 }, { "epoch": 0.73, "grad_norm": 0.7421875, "learning_rate": 1.7338626040900678e-05, "loss": 1.0184, "step": 7215 }, { "epoch": 0.74, "grad_norm": 0.75, "learning_rate": 1.727623103172082e-05, "loss": 1.0955, "step": 7220 }, { "epoch": 0.74, "grad_norm": 0.7421875, "learning_rate": 1.7213925037746688e-05, "loss": 1.0579, "step": 7225 }, { "epoch": 0.74, "grad_norm": 0.8671875, "learning_rate": 1.7151708228463104e-05, "loss": 1.0101, "step": 7230 }, { "epoch": 0.74, "grad_norm": 0.8125, "learning_rate": 1.7089580773112235e-05, "loss": 1.0389, "step": 7235 }, { "epoch": 0.74, "grad_norm": 0.79296875, "learning_rate": 1.7027542840693205e-05, "loss": 0.9624, "step": 7240 }, { "epoch": 0.74, "grad_norm": 0.72265625, "learning_rate": 1.6965594599961608e-05, "loss": 1.0722, "step": 7245 }, { "epoch": 0.74, "grad_norm": 0.765625, "learning_rate": 1.690373621942908e-05, "loss": 1.0638, "step": 7250 }, { "epoch": 0.74, "grad_norm": 0.69921875, "learning_rate": 1.6841967867362784e-05, "loss": 0.9502, "step": 7255 }, { "epoch": 0.74, "grad_norm": 0.75, "learning_rate": 1.678028971178503e-05, "loss": 0.9155, "step": 7260 }, { "epoch": 0.74, "grad_norm": 0.7421875, "learning_rate": 1.6718701920472746e-05, "loss": 1.0022, "step": 7265 }, { "epoch": 0.74, "grad_norm": 0.765625, "learning_rate": 1.665720466095706e-05, "loss": 1.0691, "step": 7270 }, { "epoch": 0.74, "grad_norm": 0.77734375, "learning_rate": 1.6595798100522813e-05, "loss": 1.0905, "step": 7275 }, { "epoch": 0.74, "grad_norm": 0.66015625, "learning_rate": 1.6534482406208192e-05, "loss": 1.0457, "step": 7280 }, { "epoch": 0.74, "grad_norm": 0.5859375, "learning_rate": 1.6473257744804154e-05, "loss": 0.9816, "step": 7285 }, { "epoch": 0.74, "grad_norm": 0.69140625, "learning_rate": 1.6412124282854002e-05, "loss": 1.0526, "step": 7290 }, { "epoch": 0.74, "grad_norm": 0.79296875, "learning_rate": 1.6351082186653045e-05, "loss": 0.9644, "step": 7295 }, { "epoch": 0.74, "grad_norm": 2.015625, "learning_rate": 1.629013162224799e-05, "loss": 1.0989, "step": 7300 }, { "epoch": 0.74, "grad_norm": 0.82421875, "learning_rate": 1.6229272755436574e-05, "loss": 0.982, "step": 7305 }, { "epoch": 0.74, "grad_norm": 0.84375, "learning_rate": 1.616850575176712e-05, "loss": 1.0199, "step": 7310 }, { "epoch": 0.74, "grad_norm": 0.74609375, "learning_rate": 1.6107830776538034e-05, "loss": 0.9466, "step": 7315 }, { "epoch": 0.75, "grad_norm": 0.75390625, "learning_rate": 1.6047247994797397e-05, "loss": 0.9556, "step": 7320 }, { "epoch": 0.75, "grad_norm": 0.765625, "learning_rate": 1.5986757571342547e-05, "loss": 1.0774, "step": 7325 }, { "epoch": 0.75, "grad_norm": 0.71875, "learning_rate": 1.592635967071951e-05, "loss": 0.9787, "step": 7330 }, { "epoch": 0.75, "grad_norm": 1.21875, "learning_rate": 1.5866054457222667e-05, "loss": 1.0819, "step": 7335 }, { "epoch": 0.75, "grad_norm": 0.9765625, "learning_rate": 1.5805842094894313e-05, "loss": 1.0096, "step": 7340 }, { "epoch": 0.75, "grad_norm": 0.7421875, "learning_rate": 1.5745722747524117e-05, "loss": 1.0094, "step": 7345 }, { "epoch": 0.75, "grad_norm": 0.8125, "learning_rate": 1.5685696578648746e-05, "loss": 1.2053, "step": 7350 }, { "epoch": 0.75, "grad_norm": 1.0625, "learning_rate": 1.5625763751551396e-05, "loss": 1.1395, "step": 7355 }, { "epoch": 0.75, "grad_norm": 3.984375, "learning_rate": 1.5565924429261368e-05, "loss": 0.8549, "step": 7360 }, { "epoch": 0.75, "grad_norm": 0.6953125, "learning_rate": 1.5506178774553585e-05, "loss": 0.9433, "step": 7365 }, { "epoch": 0.75, "grad_norm": 0.921875, "learning_rate": 1.5446526949948243e-05, "loss": 1.127, "step": 7370 }, { "epoch": 0.75, "grad_norm": 0.73828125, "learning_rate": 1.53869691177102e-05, "loss": 1.0448, "step": 7375 }, { "epoch": 0.75, "grad_norm": 0.64453125, "learning_rate": 1.5327505439848704e-05, "loss": 1.0138, "step": 7380 }, { "epoch": 0.75, "grad_norm": 0.76953125, "learning_rate": 1.5268136078116864e-05, "loss": 1.0345, "step": 7385 }, { "epoch": 0.75, "grad_norm": 1.0234375, "learning_rate": 1.5208861194011254e-05, "loss": 1.0045, "step": 7390 }, { "epoch": 0.75, "grad_norm": 0.7890625, "learning_rate": 1.5149680948771439e-05, "loss": 1.1314, "step": 7395 }, { "epoch": 0.75, "grad_norm": 0.6875, "learning_rate": 1.5090595503379484e-05, "loss": 0.9383, "step": 7400 }, { "epoch": 0.75, "grad_norm": 0.8828125, "learning_rate": 1.5031605018559685e-05, "loss": 1.0632, "step": 7405 }, { "epoch": 0.75, "grad_norm": 0.83203125, "learning_rate": 1.4972709654777967e-05, "loss": 1.1253, "step": 7410 }, { "epoch": 0.76, "grad_norm": 0.703125, "learning_rate": 1.4913909572241503e-05, "loss": 1.2378, "step": 7415 }, { "epoch": 0.76, "grad_norm": 0.65234375, "learning_rate": 1.4855204930898303e-05, "loss": 0.8846, "step": 7420 }, { "epoch": 0.76, "grad_norm": 0.703125, "learning_rate": 1.4796595890436748e-05, "loss": 1.0447, "step": 7425 }, { "epoch": 0.76, "grad_norm": 0.73046875, "learning_rate": 1.4738082610285153e-05, "loss": 1.1328, "step": 7430 }, { "epoch": 0.76, "grad_norm": 0.671875, "learning_rate": 1.4679665249611386e-05, "loss": 1.0008, "step": 7435 }, { "epoch": 0.76, "grad_norm": 0.71484375, "learning_rate": 1.462134396732237e-05, "loss": 1.0664, "step": 7440 }, { "epoch": 0.76, "grad_norm": 0.79296875, "learning_rate": 1.4563118922063623e-05, "loss": 1.1858, "step": 7445 }, { "epoch": 0.76, "grad_norm": 0.7890625, "learning_rate": 1.4504990272218988e-05, "loss": 1.0722, "step": 7450 }, { "epoch": 0.76, "grad_norm": 0.73828125, "learning_rate": 1.4446958175910008e-05, "loss": 1.1279, "step": 7455 }, { "epoch": 0.76, "grad_norm": 0.9453125, "learning_rate": 1.438902279099561e-05, "loss": 1.0442, "step": 7460 }, { "epoch": 0.76, "grad_norm": 0.80859375, "learning_rate": 1.4331184275071651e-05, "loss": 1.051, "step": 7465 }, { "epoch": 0.76, "grad_norm": 0.76171875, "learning_rate": 1.4273442785470475e-05, "loss": 1.0136, "step": 7470 }, { "epoch": 0.76, "grad_norm": 0.7421875, "learning_rate": 1.4215798479260484e-05, "loss": 1.1228, "step": 7475 }, { "epoch": 0.76, "grad_norm": 0.75390625, "learning_rate": 1.4158251513245774e-05, "loss": 0.8455, "step": 7480 }, { "epoch": 0.76, "grad_norm": 0.81640625, "learning_rate": 1.4100802043965588e-05, "loss": 1.1439, "step": 7485 }, { "epoch": 0.76, "grad_norm": 0.8046875, "learning_rate": 1.4043450227693978e-05, "loss": 1.0626, "step": 7490 }, { "epoch": 0.76, "grad_norm": 0.73046875, "learning_rate": 1.3986196220439402e-05, "loss": 0.9875, "step": 7495 }, { "epoch": 0.76, "grad_norm": 0.78125, "learning_rate": 1.3929040177944213e-05, "loss": 1.2451, "step": 7500 }, { "epoch": 0.76, "grad_norm": 0.69140625, "learning_rate": 1.3871982255684313e-05, "loss": 1.0016, "step": 7505 }, { "epoch": 0.76, "grad_norm": 0.8203125, "learning_rate": 1.3815022608868634e-05, "loss": 1.2081, "step": 7510 }, { "epoch": 0.77, "grad_norm": 1.0078125, "learning_rate": 1.3758161392438862e-05, "loss": 0.9329, "step": 7515 }, { "epoch": 0.77, "grad_norm": 0.890625, "learning_rate": 1.3701398761068878e-05, "loss": 1.0586, "step": 7520 }, { "epoch": 0.77, "grad_norm": 0.8984375, "learning_rate": 1.3644734869164422e-05, "loss": 1.2397, "step": 7525 }, { "epoch": 0.77, "grad_norm": 0.76171875, "learning_rate": 1.358816987086261e-05, "loss": 0.9554, "step": 7530 }, { "epoch": 0.77, "grad_norm": 0.6875, "learning_rate": 1.3531703920031586e-05, "loss": 1.0263, "step": 7535 }, { "epoch": 0.77, "grad_norm": 0.74609375, "learning_rate": 1.3475337170270013e-05, "loss": 0.9861, "step": 7540 }, { "epoch": 0.77, "grad_norm": 0.70703125, "learning_rate": 1.3419069774906784e-05, "loss": 0.9712, "step": 7545 }, { "epoch": 0.77, "grad_norm": 0.671875, "learning_rate": 1.3362901887000473e-05, "loss": 1.1714, "step": 7550 }, { "epoch": 0.77, "grad_norm": 0.70703125, "learning_rate": 1.3306833659338952e-05, "loss": 1.0479, "step": 7555 }, { "epoch": 0.77, "grad_norm": 0.7421875, "learning_rate": 1.325086524443906e-05, "loss": 0.9922, "step": 7560 }, { "epoch": 0.77, "grad_norm": 0.7265625, "learning_rate": 1.3194996794546094e-05, "loss": 1.1754, "step": 7565 }, { "epoch": 0.77, "grad_norm": 0.80078125, "learning_rate": 1.3139228461633418e-05, "loss": 0.9961, "step": 7570 }, { "epoch": 0.77, "grad_norm": 0.78125, "learning_rate": 1.3083560397402072e-05, "loss": 1.1122, "step": 7575 }, { "epoch": 0.77, "grad_norm": 0.67578125, "learning_rate": 1.3027992753280338e-05, "loss": 1.0002, "step": 7580 }, { "epoch": 0.77, "grad_norm": 0.75390625, "learning_rate": 1.2972525680423325e-05, "loss": 1.0337, "step": 7585 }, { "epoch": 0.77, "grad_norm": 0.8203125, "learning_rate": 1.2917159329712608e-05, "loss": 1.1011, "step": 7590 }, { "epoch": 0.77, "grad_norm": 0.8046875, "learning_rate": 1.2861893851755753e-05, "loss": 1.0683, "step": 7595 }, { "epoch": 0.77, "grad_norm": 0.7265625, "learning_rate": 1.280672939688588e-05, "loss": 1.0678, "step": 7600 }, { "epoch": 0.77, "grad_norm": 2.046875, "learning_rate": 1.2751666115161409e-05, "loss": 1.0422, "step": 7605 }, { "epoch": 0.78, "grad_norm": 0.7421875, "learning_rate": 1.2696704156365463e-05, "loss": 1.017, "step": 7610 }, { "epoch": 0.78, "grad_norm": 10.875, "learning_rate": 1.2641843670005593e-05, "loss": 0.9963, "step": 7615 }, { "epoch": 0.78, "grad_norm": 0.703125, "learning_rate": 1.258708480531331e-05, "loss": 0.8597, "step": 7620 }, { "epoch": 0.78, "grad_norm": 0.66796875, "learning_rate": 1.2532427711243682e-05, "loss": 1.1199, "step": 7625 }, { "epoch": 0.78, "grad_norm": 0.6796875, "learning_rate": 1.2477872536474961e-05, "loss": 0.9385, "step": 7630 }, { "epoch": 0.78, "grad_norm": 0.64453125, "learning_rate": 1.2423419429408146e-05, "loss": 0.9348, "step": 7635 }, { "epoch": 0.78, "grad_norm": 1.6171875, "learning_rate": 1.236906853816659e-05, "loss": 0.9485, "step": 7640 }, { "epoch": 0.78, "grad_norm": 0.703125, "learning_rate": 1.2314820010595612e-05, "loss": 0.8493, "step": 7645 }, { "epoch": 0.78, "grad_norm": 0.76171875, "learning_rate": 1.2260673994262056e-05, "loss": 1.0026, "step": 7650 }, { "epoch": 0.78, "grad_norm": 0.671875, "learning_rate": 1.2206630636453953e-05, "loss": 1.18, "step": 7655 }, { "epoch": 0.78, "grad_norm": 0.76171875, "learning_rate": 1.2152690084180068e-05, "loss": 1.0291, "step": 7660 }, { "epoch": 0.78, "grad_norm": 0.82421875, "learning_rate": 1.209885248416946e-05, "loss": 1.2367, "step": 7665 }, { "epoch": 0.78, "grad_norm": 1.0859375, "learning_rate": 1.2045117982871229e-05, "loss": 1.0427, "step": 7670 }, { "epoch": 0.78, "grad_norm": 0.7265625, "learning_rate": 1.1991486726453965e-05, "loss": 1.0732, "step": 7675 }, { "epoch": 0.78, "grad_norm": 0.85546875, "learning_rate": 1.1937958860805427e-05, "loss": 1.0942, "step": 7680 }, { "epoch": 0.78, "grad_norm": 0.84375, "learning_rate": 1.1884534531532132e-05, "loss": 1.1168, "step": 7685 }, { "epoch": 0.78, "grad_norm": 0.66796875, "learning_rate": 1.1831213883958957e-05, "loss": 0.9727, "step": 7690 }, { "epoch": 0.78, "grad_norm": 0.76171875, "learning_rate": 1.177799706312872e-05, "loss": 1.0578, "step": 7695 }, { "epoch": 0.78, "grad_norm": 0.75, "learning_rate": 1.1724884213801873e-05, "loss": 1.0348, "step": 7700 }, { "epoch": 0.78, "grad_norm": 0.8046875, "learning_rate": 1.1671875480455996e-05, "loss": 1.1907, "step": 7705 }, { "epoch": 0.79, "grad_norm": 0.75, "learning_rate": 1.1618971007285423e-05, "loss": 1.0709, "step": 7710 }, { "epoch": 0.79, "grad_norm": 0.6875, "learning_rate": 1.1566170938200954e-05, "loss": 1.1659, "step": 7715 }, { "epoch": 0.79, "grad_norm": 0.76953125, "learning_rate": 1.1513475416829344e-05, "loss": 1.1204, "step": 7720 }, { "epoch": 0.79, "grad_norm": 0.86328125, "learning_rate": 1.146088458651296e-05, "loss": 1.0912, "step": 7725 }, { "epoch": 0.79, "grad_norm": 0.6953125, "learning_rate": 1.14083985903094e-05, "loss": 1.1206, "step": 7730 }, { "epoch": 0.79, "grad_norm": 1.1640625, "learning_rate": 1.1356017570991079e-05, "loss": 1.1407, "step": 7735 }, { "epoch": 0.79, "grad_norm": 0.6953125, "learning_rate": 1.1303741671044848e-05, "loss": 1.0298, "step": 7740 }, { "epoch": 0.79, "grad_norm": 1.015625, "learning_rate": 1.125157103267166e-05, "loss": 1.1292, "step": 7745 }, { "epoch": 0.79, "grad_norm": 0.73828125, "learning_rate": 1.1199505797786091e-05, "loss": 1.1263, "step": 7750 }, { "epoch": 0.79, "grad_norm": 0.8515625, "learning_rate": 1.1147546108015983e-05, "loss": 1.0643, "step": 7755 }, { "epoch": 0.79, "grad_norm": 0.796875, "learning_rate": 1.1095692104702104e-05, "loss": 1.1432, "step": 7760 }, { "epoch": 0.79, "grad_norm": 0.62109375, "learning_rate": 1.1043943928897754e-05, "loss": 0.9644, "step": 7765 }, { "epoch": 0.79, "grad_norm": 0.7734375, "learning_rate": 1.0992301721368319e-05, "loss": 1.1377, "step": 7770 }, { "epoch": 0.79, "grad_norm": 0.73046875, "learning_rate": 1.0940765622590949e-05, "loss": 1.1033, "step": 7775 }, { "epoch": 0.79, "grad_norm": 0.77734375, "learning_rate": 1.088933577275415e-05, "loss": 1.1233, "step": 7780 }, { "epoch": 0.79, "grad_norm": 0.74609375, "learning_rate": 1.0838012311757423e-05, "loss": 1.111, "step": 7785 }, { "epoch": 0.79, "grad_norm": 1.0078125, "learning_rate": 1.0786795379210856e-05, "loss": 1.0881, "step": 7790 }, { "epoch": 0.79, "grad_norm": 0.76953125, "learning_rate": 1.0735685114434763e-05, "loss": 1.0869, "step": 7795 }, { "epoch": 0.79, "grad_norm": 0.8203125, "learning_rate": 1.0684681656459305e-05, "loss": 1.1783, "step": 7800 }, { "epoch": 0.79, "grad_norm": 0.63671875, "learning_rate": 1.0633785144024084e-05, "loss": 1.0289, "step": 7805 }, { "epoch": 0.8, "grad_norm": 0.7734375, "learning_rate": 1.0582995715577843e-05, "loss": 1.0741, "step": 7810 }, { "epoch": 0.8, "grad_norm": 0.71875, "learning_rate": 1.0532313509277992e-05, "loss": 1.0633, "step": 7815 }, { "epoch": 0.8, "grad_norm": 0.6953125, "learning_rate": 1.0481738662990242e-05, "loss": 1.0032, "step": 7820 }, { "epoch": 0.8, "grad_norm": 0.74609375, "learning_rate": 1.0431271314288348e-05, "loss": 0.908, "step": 7825 }, { "epoch": 0.8, "grad_norm": 0.82421875, "learning_rate": 1.0380911600453597e-05, "loss": 1.191, "step": 7830 }, { "epoch": 0.8, "grad_norm": 0.7578125, "learning_rate": 1.033065965847449e-05, "loss": 0.8685, "step": 7835 }, { "epoch": 0.8, "grad_norm": 0.765625, "learning_rate": 1.0280515625046378e-05, "loss": 0.92, "step": 7840 }, { "epoch": 0.8, "grad_norm": 0.8046875, "learning_rate": 1.0230479636571078e-05, "loss": 1.0653, "step": 7845 }, { "epoch": 0.8, "grad_norm": 0.76953125, "learning_rate": 1.0180551829156476e-05, "loss": 1.1654, "step": 7850 }, { "epoch": 0.8, "grad_norm": 0.66015625, "learning_rate": 1.0130732338616244e-05, "loss": 1.0986, "step": 7855 }, { "epoch": 0.8, "grad_norm": 0.79296875, "learning_rate": 1.008102130046938e-05, "loss": 1.0657, "step": 7860 }, { "epoch": 0.8, "grad_norm": 0.73046875, "learning_rate": 1.0031418849939806e-05, "loss": 0.9849, "step": 7865 }, { "epoch": 0.8, "grad_norm": 1.1328125, "learning_rate": 9.981925121956177e-06, "loss": 0.9419, "step": 7870 }, { "epoch": 0.8, "grad_norm": 0.9296875, "learning_rate": 9.932540251151328e-06, "loss": 1.2451, "step": 7875 }, { "epoch": 0.8, "grad_norm": 0.79296875, "learning_rate": 9.883264371862e-06, "loss": 1.0926, "step": 7880 }, { "epoch": 0.8, "grad_norm": 0.69140625, "learning_rate": 9.834097618128458e-06, "loss": 0.99, "step": 7885 }, { "epoch": 0.8, "grad_norm": 0.8671875, "learning_rate": 9.785040123694107e-06, "loss": 1.0726, "step": 7890 }, { "epoch": 0.8, "grad_norm": 0.7265625, "learning_rate": 9.736092022005172e-06, "loss": 1.2484, "step": 7895 }, { "epoch": 0.8, "grad_norm": 0.73046875, "learning_rate": 9.687253446210271e-06, "loss": 0.9666, "step": 7900 }, { "epoch": 0.81, "grad_norm": 0.76171875, "learning_rate": 9.638524529160147e-06, "loss": 0.9516, "step": 7905 }, { "epoch": 0.81, "grad_norm": 0.75390625, "learning_rate": 9.58990540340719e-06, "loss": 1.0633, "step": 7910 }, { "epoch": 0.81, "grad_norm": 0.703125, "learning_rate": 9.541396201205149e-06, "loss": 0.9372, "step": 7915 }, { "epoch": 0.81, "grad_norm": 0.66015625, "learning_rate": 9.492997054508795e-06, "loss": 0.9173, "step": 7920 }, { "epoch": 0.81, "grad_norm": 0.703125, "learning_rate": 9.444708094973498e-06, "loss": 1.0616, "step": 7925 }, { "epoch": 0.81, "grad_norm": 0.78125, "learning_rate": 9.396529453954889e-06, "loss": 1.2095, "step": 7930 }, { "epoch": 0.81, "grad_norm": 0.73046875, "learning_rate": 9.348461262508523e-06, "loss": 1.1098, "step": 7935 }, { "epoch": 0.81, "grad_norm": 0.70703125, "learning_rate": 9.300503651389514e-06, "loss": 0.9954, "step": 7940 }, { "epoch": 0.81, "grad_norm": 1.0546875, "learning_rate": 9.252656751052163e-06, "loss": 1.09, "step": 7945 }, { "epoch": 0.81, "grad_norm": 0.6953125, "learning_rate": 9.20492069164962e-06, "loss": 0.9648, "step": 7950 }, { "epoch": 0.81, "grad_norm": 0.79296875, "learning_rate": 9.157295603033539e-06, "loss": 1.1242, "step": 7955 }, { "epoch": 0.81, "grad_norm": 0.71484375, "learning_rate": 9.109781614753682e-06, "loss": 0.971, "step": 7960 }, { "epoch": 0.81, "grad_norm": 0.73828125, "learning_rate": 9.062378856057629e-06, "loss": 0.9582, "step": 7965 }, { "epoch": 0.81, "grad_norm": 0.7109375, "learning_rate": 9.01508745589038e-06, "loss": 1.0426, "step": 7970 }, { "epoch": 0.81, "grad_norm": 0.765625, "learning_rate": 8.967907542894006e-06, "loss": 1.0823, "step": 7975 }, { "epoch": 0.81, "grad_norm": 0.75390625, "learning_rate": 8.92083924540732e-06, "loss": 1.071, "step": 7980 }, { "epoch": 0.81, "grad_norm": 0.76953125, "learning_rate": 8.873882691465523e-06, "loss": 1.0752, "step": 7985 }, { "epoch": 0.81, "grad_norm": 1.0546875, "learning_rate": 8.827038008799843e-06, "loss": 0.9109, "step": 7990 }, { "epoch": 0.81, "grad_norm": 0.73046875, "learning_rate": 8.780305324837201e-06, "loss": 0.9449, "step": 7995 }, { "epoch": 0.81, "grad_norm": 0.7109375, "learning_rate": 8.733684766699862e-06, "loss": 1.0107, "step": 8000 }, { "epoch": 0.82, "grad_norm": 0.74609375, "learning_rate": 8.68717646120507e-06, "loss": 1.0883, "step": 8005 }, { "epoch": 0.82, "grad_norm": 0.73046875, "learning_rate": 8.640780534864729e-06, "loss": 1.058, "step": 8010 }, { "epoch": 0.82, "grad_norm": 0.78125, "learning_rate": 8.594497113885081e-06, "loss": 1.0544, "step": 8015 }, { "epoch": 0.82, "grad_norm": 0.8203125, "learning_rate": 8.548326324166267e-06, "loss": 1.1859, "step": 8020 }, { "epoch": 0.82, "grad_norm": 0.90625, "learning_rate": 8.502268291302085e-06, "loss": 1.0732, "step": 8025 }, { "epoch": 0.82, "grad_norm": 0.6484375, "learning_rate": 8.456323140579631e-06, "loss": 0.9538, "step": 8030 }, { "epoch": 0.82, "grad_norm": 0.9453125, "learning_rate": 8.410490996978908e-06, "loss": 0.9876, "step": 8035 }, { "epoch": 0.82, "grad_norm": 0.71875, "learning_rate": 8.36477198517252e-06, "loss": 0.9445, "step": 8040 }, { "epoch": 0.82, "grad_norm": 0.7265625, "learning_rate": 8.319166229525344e-06, "loss": 0.9873, "step": 8045 }, { "epoch": 0.82, "grad_norm": 0.75, "learning_rate": 8.27367385409416e-06, "loss": 1.0131, "step": 8050 }, { "epoch": 0.82, "grad_norm": 0.73828125, "learning_rate": 8.228294982627337e-06, "loss": 1.0492, "step": 8055 }, { "epoch": 0.82, "grad_norm": 0.66796875, "learning_rate": 8.18302973856453e-06, "loss": 0.9502, "step": 8060 }, { "epoch": 0.82, "grad_norm": 0.93359375, "learning_rate": 8.137878245036229e-06, "loss": 1.1096, "step": 8065 }, { "epoch": 0.82, "grad_norm": 0.58984375, "learning_rate": 8.092840624863535e-06, "loss": 0.993, "step": 8070 }, { "epoch": 0.82, "grad_norm": 0.765625, "learning_rate": 8.047917000557832e-06, "loss": 0.9962, "step": 8075 }, { "epoch": 0.82, "grad_norm": 0.98046875, "learning_rate": 8.003107494320356e-06, "loss": 1.0157, "step": 8080 }, { "epoch": 0.82, "grad_norm": 0.70703125, "learning_rate": 7.95841222804194e-06, "loss": 0.9748, "step": 8085 }, { "epoch": 0.82, "grad_norm": 0.84375, "learning_rate": 7.913831323302656e-06, "loss": 0.9751, "step": 8090 }, { "epoch": 0.82, "grad_norm": 1.4140625, "learning_rate": 7.869364901371501e-06, "loss": 1.1196, "step": 8095 }, { "epoch": 0.82, "grad_norm": 0.85546875, "learning_rate": 7.82501308320603e-06, "loss": 1.0587, "step": 8100 }, { "epoch": 0.83, "grad_norm": 0.7890625, "learning_rate": 7.780775989452117e-06, "loss": 0.9451, "step": 8105 }, { "epoch": 0.83, "grad_norm": 0.7890625, "learning_rate": 7.736653740443466e-06, "loss": 1.1029, "step": 8110 }, { "epoch": 0.83, "grad_norm": 0.92578125, "learning_rate": 7.692646456201453e-06, "loss": 0.8981, "step": 8115 }, { "epoch": 0.83, "grad_norm": 0.72265625, "learning_rate": 7.64875425643471e-06, "loss": 1.119, "step": 8120 }, { "epoch": 0.83, "grad_norm": 0.78515625, "learning_rate": 7.604977260538804e-06, "loss": 1.1263, "step": 8125 }, { "epoch": 0.83, "grad_norm": 0.73046875, "learning_rate": 7.561315587595946e-06, "loss": 1.0309, "step": 8130 }, { "epoch": 0.83, "grad_norm": 0.61328125, "learning_rate": 7.517769356374599e-06, "loss": 0.9766, "step": 8135 }, { "epoch": 0.83, "grad_norm": 0.66796875, "learning_rate": 7.474338685329263e-06, "loss": 0.9325, "step": 8140 }, { "epoch": 0.83, "grad_norm": 0.734375, "learning_rate": 7.431023692600059e-06, "loss": 1.0075, "step": 8145 }, { "epoch": 0.83, "grad_norm": 0.8046875, "learning_rate": 7.38782449601243e-06, "loss": 1.1524, "step": 8150 }, { "epoch": 0.83, "grad_norm": 0.7578125, "learning_rate": 7.34474121307685e-06, "loss": 1.214, "step": 8155 }, { "epoch": 0.83, "grad_norm": 0.61328125, "learning_rate": 7.301773960988479e-06, "loss": 0.9925, "step": 8160 }, { "epoch": 0.83, "grad_norm": 0.71484375, "learning_rate": 7.258922856626837e-06, "loss": 1.1018, "step": 8165 }, { "epoch": 0.83, "grad_norm": 0.79296875, "learning_rate": 7.216188016555542e-06, "loss": 0.9746, "step": 8170 }, { "epoch": 0.83, "grad_norm": 0.66015625, "learning_rate": 7.173569557021881e-06, "loss": 0.8846, "step": 8175 }, { "epoch": 0.83, "grad_norm": 0.73828125, "learning_rate": 7.131067593956608e-06, "loss": 1.0179, "step": 8180 }, { "epoch": 0.83, "grad_norm": 0.8046875, "learning_rate": 7.0886822429735875e-06, "loss": 1.0651, "step": 8185 }, { "epoch": 0.83, "grad_norm": 0.7734375, "learning_rate": 7.046413619369463e-06, "loss": 0.9921, "step": 8190 }, { "epoch": 0.83, "grad_norm": 0.71875, "learning_rate": 7.004261838123355e-06, "loss": 1.1201, "step": 8195 }, { "epoch": 0.84, "grad_norm": 0.734375, "learning_rate": 6.962227013896544e-06, "loss": 1.1401, "step": 8200 }, { "epoch": 0.84, "grad_norm": 0.8828125, "learning_rate": 6.92030926103217e-06, "loss": 1.0509, "step": 8205 }, { "epoch": 0.84, "grad_norm": 0.74609375, "learning_rate": 6.87850869355492e-06, "loss": 1.1444, "step": 8210 }, { "epoch": 0.84, "grad_norm": 0.79296875, "learning_rate": 6.8368254251707234e-06, "loss": 1.1066, "step": 8215 }, { "epoch": 0.84, "grad_norm": 0.7265625, "learning_rate": 6.795259569266399e-06, "loss": 0.9891, "step": 8220 }, { "epoch": 0.84, "grad_norm": 0.7890625, "learning_rate": 6.7538112389093965e-06, "loss": 1.1345, "step": 8225 }, { "epoch": 0.84, "grad_norm": 0.76171875, "learning_rate": 6.7124805468475e-06, "loss": 0.9894, "step": 8230 }, { "epoch": 0.84, "grad_norm": 0.7109375, "learning_rate": 6.671267605508452e-06, "loss": 0.9833, "step": 8235 }, { "epoch": 0.84, "grad_norm": 0.7109375, "learning_rate": 6.630172526999712e-06, "loss": 0.9544, "step": 8240 }, { "epoch": 0.84, "grad_norm": 0.8515625, "learning_rate": 6.589195423108124e-06, "loss": 1.203, "step": 8245 }, { "epoch": 0.84, "grad_norm": 0.6953125, "learning_rate": 6.548336405299621e-06, "loss": 0.9743, "step": 8250 }, { "epoch": 0.84, "grad_norm": 1.8203125, "learning_rate": 6.507595584718912e-06, "loss": 1.1241, "step": 8255 }, { "epoch": 0.84, "grad_norm": 0.78125, "learning_rate": 6.466973072189186e-06, "loss": 1.0756, "step": 8260 }, { "epoch": 0.84, "grad_norm": 0.77734375, "learning_rate": 6.426468978211819e-06, "loss": 1.0216, "step": 8265 }, { "epoch": 0.84, "grad_norm": 0.7890625, "learning_rate": 6.386083412966054e-06, "loss": 1.114, "step": 8270 }, { "epoch": 0.84, "grad_norm": 0.765625, "learning_rate": 6.345816486308709e-06, "loss": 0.9978, "step": 8275 }, { "epoch": 0.84, "grad_norm": 0.85546875, "learning_rate": 6.305668307773915e-06, "loss": 1.1581, "step": 8280 }, { "epoch": 0.84, "grad_norm": 0.7265625, "learning_rate": 6.265638986572747e-06, "loss": 0.9827, "step": 8285 }, { "epoch": 0.84, "grad_norm": 0.65234375, "learning_rate": 6.225728631592958e-06, "loss": 0.9278, "step": 8290 }, { "epoch": 0.84, "grad_norm": 0.67578125, "learning_rate": 6.18593735139873e-06, "loss": 0.9989, "step": 8295 }, { "epoch": 0.85, "grad_norm": 0.73828125, "learning_rate": 6.146265254230304e-06, "loss": 1.0704, "step": 8300 }, { "epoch": 0.85, "grad_norm": 0.63671875, "learning_rate": 6.106712448003738e-06, "loss": 0.9833, "step": 8305 }, { "epoch": 0.85, "grad_norm": 0.7890625, "learning_rate": 6.067279040310575e-06, "loss": 1.0683, "step": 8310 }, { "epoch": 0.85, "grad_norm": 0.67578125, "learning_rate": 6.027965138417591e-06, "loss": 1.0245, "step": 8315 }, { "epoch": 0.85, "grad_norm": 0.7421875, "learning_rate": 5.988770849266462e-06, "loss": 1.0429, "step": 8320 }, { "epoch": 0.85, "grad_norm": 1.0234375, "learning_rate": 5.949696279473533e-06, "loss": 1.0661, "step": 8325 }, { "epoch": 0.85, "grad_norm": 0.734375, "learning_rate": 5.9107415353294285e-06, "loss": 1.1553, "step": 8330 }, { "epoch": 0.85, "grad_norm": 0.7265625, "learning_rate": 5.871906722798859e-06, "loss": 1.0681, "step": 8335 }, { "epoch": 0.85, "grad_norm": 0.9765625, "learning_rate": 5.833191947520311e-06, "loss": 1.0529, "step": 8340 }, { "epoch": 0.85, "grad_norm": 0.69921875, "learning_rate": 5.794597314805711e-06, "loss": 1.0056, "step": 8345 }, { "epoch": 0.85, "grad_norm": 0.7578125, "learning_rate": 5.7561229296402005e-06, "loss": 0.9506, "step": 8350 }, { "epoch": 0.85, "grad_norm": 0.8046875, "learning_rate": 5.717768896681808e-06, "loss": 1.0626, "step": 8355 }, { "epoch": 0.85, "grad_norm": 0.75, "learning_rate": 5.679535320261187e-06, "loss": 0.9996, "step": 8360 }, { "epoch": 0.85, "grad_norm": 0.9765625, "learning_rate": 5.641422304381305e-06, "loss": 0.9982, "step": 8365 }, { "epoch": 0.85, "grad_norm": 0.79296875, "learning_rate": 5.603429952717238e-06, "loss": 1.1246, "step": 8370 }, { "epoch": 0.85, "grad_norm": 0.69140625, "learning_rate": 5.5655583686157585e-06, "loss": 1.1155, "step": 8375 }, { "epoch": 0.85, "grad_norm": 0.7578125, "learning_rate": 5.527807655095174e-06, "loss": 1.0422, "step": 8380 }, { "epoch": 0.85, "grad_norm": 0.6640625, "learning_rate": 5.490177914844968e-06, "loss": 1.0936, "step": 8385 }, { "epoch": 0.85, "grad_norm": 1.7890625, "learning_rate": 5.452669250225601e-06, "loss": 1.1358, "step": 8390 }, { "epoch": 0.85, "grad_norm": 0.66796875, "learning_rate": 5.415281763268143e-06, "loss": 0.98, "step": 8395 }, { "epoch": 0.86, "grad_norm": 1.0078125, "learning_rate": 5.378015555674026e-06, "loss": 1.0793, "step": 8400 }, { "epoch": 0.86, "grad_norm": 0.72265625, "learning_rate": 5.340870728814823e-06, "loss": 1.0334, "step": 8405 }, { "epoch": 0.86, "grad_norm": 0.8203125, "learning_rate": 5.303847383731891e-06, "loss": 1.2618, "step": 8410 }, { "epoch": 0.86, "grad_norm": 0.671875, "learning_rate": 5.266945621136149e-06, "loss": 1.0013, "step": 8415 }, { "epoch": 0.86, "grad_norm": 0.60546875, "learning_rate": 5.2301655414077834e-06, "loss": 0.9661, "step": 8420 }, { "epoch": 0.86, "grad_norm": 2.234375, "learning_rate": 5.193507244595969e-06, "loss": 0.9258, "step": 8425 }, { "epoch": 0.86, "grad_norm": 0.80859375, "learning_rate": 5.1569708304186174e-06, "loss": 1.1952, "step": 8430 }, { "epoch": 0.86, "grad_norm": 0.73828125, "learning_rate": 5.120556398262099e-06, "loss": 1.0236, "step": 8435 }, { "epoch": 0.86, "grad_norm": 0.8515625, "learning_rate": 5.084264047180976e-06, "loss": 0.8937, "step": 8440 }, { "epoch": 0.86, "grad_norm": 0.7890625, "learning_rate": 5.048093875897674e-06, "loss": 1.1052, "step": 8445 }, { "epoch": 0.86, "grad_norm": 0.7734375, "learning_rate": 5.012045982802332e-06, "loss": 1.1172, "step": 8450 }, { "epoch": 0.86, "grad_norm": 0.80859375, "learning_rate": 4.976120465952438e-06, "loss": 0.9773, "step": 8455 }, { "epoch": 0.86, "grad_norm": 0.7421875, "learning_rate": 4.940317423072588e-06, "loss": 1.0267, "step": 8460 }, { "epoch": 0.86, "grad_norm": 0.88671875, "learning_rate": 4.904636951554226e-06, "loss": 1.0397, "step": 8465 }, { "epoch": 0.86, "grad_norm": 0.8203125, "learning_rate": 4.869079148455385e-06, "loss": 1.067, "step": 8470 }, { "epoch": 0.86, "grad_norm": 0.83984375, "learning_rate": 4.8336441105004016e-06, "loss": 1.2063, "step": 8475 }, { "epoch": 0.86, "grad_norm": 0.82421875, "learning_rate": 4.798331934079709e-06, "loss": 1.2085, "step": 8480 }, { "epoch": 0.86, "grad_norm": 0.7734375, "learning_rate": 4.763142715249458e-06, "loss": 0.8973, "step": 8485 }, { "epoch": 0.86, "grad_norm": 0.7421875, "learning_rate": 4.728076549731386e-06, "loss": 1.0066, "step": 8490 }, { "epoch": 0.87, "grad_norm": 0.73828125, "learning_rate": 4.693133532912497e-06, "loss": 1.1551, "step": 8495 }, { "epoch": 0.87, "grad_norm": 0.76171875, "learning_rate": 4.65831375984479e-06, "loss": 0.9847, "step": 8500 }, { "epoch": 0.87, "grad_norm": 0.63671875, "learning_rate": 4.623617325245027e-06, "loss": 0.9894, "step": 8505 }, { "epoch": 0.87, "grad_norm": 0.83203125, "learning_rate": 4.589044323494435e-06, "loss": 1.1926, "step": 8510 }, { "epoch": 0.87, "grad_norm": 0.65234375, "learning_rate": 4.554594848638533e-06, "loss": 1.0448, "step": 8515 }, { "epoch": 0.87, "grad_norm": 0.78515625, "learning_rate": 4.520268994386778e-06, "loss": 0.9993, "step": 8520 }, { "epoch": 0.87, "grad_norm": 0.99609375, "learning_rate": 4.4860668541123695e-06, "loss": 0.9977, "step": 8525 }, { "epoch": 0.87, "grad_norm": 0.70703125, "learning_rate": 4.451988520851985e-06, "loss": 0.9981, "step": 8530 }, { "epoch": 0.87, "grad_norm": 0.79296875, "learning_rate": 4.418034087305506e-06, "loss": 1.0088, "step": 8535 }, { "epoch": 0.87, "grad_norm": 0.84765625, "learning_rate": 4.3842036458357935e-06, "loss": 1.1097, "step": 8540 }, { "epoch": 0.87, "grad_norm": 1.296875, "learning_rate": 4.350497288468436e-06, "loss": 0.9324, "step": 8545 }, { "epoch": 0.87, "grad_norm": 0.80078125, "learning_rate": 4.316915106891484e-06, "loss": 0.9506, "step": 8550 }, { "epoch": 0.87, "grad_norm": 0.82421875, "learning_rate": 4.283457192455159e-06, "loss": 1.1521, "step": 8555 }, { "epoch": 0.87, "grad_norm": 0.7421875, "learning_rate": 4.2501236361717266e-06, "loss": 1.2179, "step": 8560 }, { "epoch": 0.87, "grad_norm": 0.83203125, "learning_rate": 4.2169145287151215e-06, "loss": 1.1016, "step": 8565 }, { "epoch": 0.87, "grad_norm": 0.68359375, "learning_rate": 4.183829960420765e-06, "loss": 0.9785, "step": 8570 }, { "epoch": 0.87, "grad_norm": 0.6640625, "learning_rate": 4.150870021285319e-06, "loss": 0.9116, "step": 8575 }, { "epoch": 0.87, "grad_norm": 0.76953125, "learning_rate": 4.118034800966408e-06, "loss": 1.0457, "step": 8580 }, { "epoch": 0.87, "grad_norm": 0.6640625, "learning_rate": 4.085324388782407e-06, "loss": 0.9744, "step": 8585 }, { "epoch": 0.87, "grad_norm": 0.8359375, "learning_rate": 4.052738873712208e-06, "loss": 1.0935, "step": 8590 }, { "epoch": 0.88, "grad_norm": 0.7890625, "learning_rate": 4.020278344394929e-06, "loss": 1.0172, "step": 8595 }, { "epoch": 0.88, "grad_norm": 0.73828125, "learning_rate": 3.987942889129692e-06, "loss": 0.9883, "step": 8600 }, { "epoch": 0.88, "grad_norm": 0.7265625, "learning_rate": 3.955732595875439e-06, "loss": 0.8696, "step": 8605 }, { "epoch": 0.88, "grad_norm": 0.72265625, "learning_rate": 3.923647552250604e-06, "loss": 1.0511, "step": 8610 }, { "epoch": 0.88, "grad_norm": 0.83203125, "learning_rate": 3.891687845532932e-06, "loss": 1.0887, "step": 8615 }, { "epoch": 0.88, "grad_norm": 0.86328125, "learning_rate": 3.859853562659232e-06, "loss": 1.1004, "step": 8620 }, { "epoch": 0.88, "grad_norm": 0.921875, "learning_rate": 3.82814479022513e-06, "loss": 0.8922, "step": 8625 }, { "epoch": 0.88, "grad_norm": 0.69921875, "learning_rate": 3.7965616144848337e-06, "loss": 1.1908, "step": 8630 }, { "epoch": 0.88, "grad_norm": 0.82421875, "learning_rate": 3.7651041213509043e-06, "loss": 0.9542, "step": 8635 }, { "epoch": 0.88, "grad_norm": 0.8671875, "learning_rate": 3.733772396394031e-06, "loss": 1.2244, "step": 8640 }, { "epoch": 0.88, "grad_norm": 0.69921875, "learning_rate": 3.7025665248427766e-06, "loss": 0.9537, "step": 8645 }, { "epoch": 0.88, "grad_norm": 0.78515625, "learning_rate": 3.6714865915833563e-06, "loss": 1.1107, "step": 8650 }, { "epoch": 0.88, "grad_norm": 0.8046875, "learning_rate": 3.6405326811594256e-06, "loss": 1.0089, "step": 8655 }, { "epoch": 0.88, "grad_norm": 0.75390625, "learning_rate": 3.6097048777718246e-06, "loss": 1.3491, "step": 8660 }, { "epoch": 0.88, "grad_norm": 0.7578125, "learning_rate": 3.5790032652783247e-06, "loss": 1.143, "step": 8665 }, { "epoch": 0.88, "grad_norm": 0.8046875, "learning_rate": 3.5484279271934873e-06, "loss": 1.1867, "step": 8670 }, { "epoch": 0.88, "grad_norm": 1.1015625, "learning_rate": 3.5179789466883495e-06, "loss": 1.1987, "step": 8675 }, { "epoch": 0.88, "grad_norm": 0.7734375, "learning_rate": 3.487656406590234e-06, "loss": 0.9052, "step": 8680 }, { "epoch": 0.88, "grad_norm": 1.015625, "learning_rate": 3.4574603893825274e-06, "loss": 1.0106, "step": 8685 }, { "epoch": 0.88, "grad_norm": 0.7109375, "learning_rate": 3.427390977204442e-06, "loss": 1.1506, "step": 8690 }, { "epoch": 0.89, "grad_norm": 0.71484375, "learning_rate": 3.397448251850788e-06, "loss": 0.9958, "step": 8695 }, { "epoch": 0.89, "grad_norm": 0.796875, "learning_rate": 3.3676322947717997e-06, "loss": 1.2572, "step": 8700 }, { "epoch": 0.89, "grad_norm": 0.94140625, "learning_rate": 3.337943187072845e-06, "loss": 0.8616, "step": 8705 }, { "epoch": 0.89, "grad_norm": 0.77734375, "learning_rate": 3.3083810095142278e-06, "loss": 1.0765, "step": 8710 }, { "epoch": 0.89, "grad_norm": 0.7890625, "learning_rate": 3.2789458425110054e-06, "loss": 1.2654, "step": 8715 }, { "epoch": 0.89, "grad_norm": 0.7578125, "learning_rate": 3.2496377661327294e-06, "loss": 1.0817, "step": 8720 }, { "epoch": 0.89, "grad_norm": 0.65234375, "learning_rate": 3.2204568601032327e-06, "loss": 0.9803, "step": 8725 }, { "epoch": 0.89, "grad_norm": 0.67578125, "learning_rate": 3.191403203800425e-06, "loss": 1.1562, "step": 8730 }, { "epoch": 0.89, "grad_norm": 0.76171875, "learning_rate": 3.162476876256071e-06, "loss": 1.1149, "step": 8735 }, { "epoch": 0.89, "grad_norm": 0.71484375, "learning_rate": 3.133677956155567e-06, "loss": 1.3157, "step": 8740 }, { "epoch": 0.89, "grad_norm": 0.73046875, "learning_rate": 3.1050065218377556e-06, "loss": 0.9424, "step": 8745 }, { "epoch": 0.89, "grad_norm": 0.73046875, "learning_rate": 3.076462651294687e-06, "loss": 0.9379, "step": 8750 }, { "epoch": 0.89, "grad_norm": 0.734375, "learning_rate": 3.048046422171391e-06, "loss": 0.8838, "step": 8755 }, { "epoch": 0.89, "grad_norm": 0.7578125, "learning_rate": 3.0197579117656972e-06, "loss": 1.1311, "step": 8760 }, { "epoch": 0.89, "grad_norm": 0.77734375, "learning_rate": 2.9915971970280344e-06, "loss": 1.0632, "step": 8765 }, { "epoch": 0.89, "grad_norm": 0.80859375, "learning_rate": 2.9635643545611826e-06, "loss": 1.2597, "step": 8770 }, { "epoch": 0.89, "grad_norm": 0.80078125, "learning_rate": 2.9356594606200883e-06, "loss": 0.9993, "step": 8775 }, { "epoch": 0.89, "grad_norm": 0.84765625, "learning_rate": 2.9078825911116493e-06, "loss": 1.0114, "step": 8780 }, { "epoch": 0.89, "grad_norm": 0.7109375, "learning_rate": 2.880233821594519e-06, "loss": 1.1114, "step": 8785 }, { "epoch": 0.9, "grad_norm": 0.80078125, "learning_rate": 2.8527132272788806e-06, "loss": 1.2655, "step": 8790 }, { "epoch": 0.9, "grad_norm": 0.8125, "learning_rate": 2.825320883026267e-06, "loss": 1.0745, "step": 8795 }, { "epoch": 0.9, "grad_norm": 0.74609375, "learning_rate": 2.7980568633493463e-06, "loss": 1.1723, "step": 8800 }, { "epoch": 0.9, "grad_norm": 0.65625, "learning_rate": 2.7709212424116946e-06, "loss": 0.9589, "step": 8805 }, { "epoch": 0.9, "grad_norm": 0.703125, "learning_rate": 2.7439140940276554e-06, "loss": 1.1502, "step": 8810 }, { "epoch": 0.9, "grad_norm": 0.73046875, "learning_rate": 2.7170354916620842e-06, "loss": 1.0675, "step": 8815 }, { "epoch": 0.9, "grad_norm": 0.72265625, "learning_rate": 2.690285508430135e-06, "loss": 0.977, "step": 8820 }, { "epoch": 0.9, "grad_norm": 0.70703125, "learning_rate": 2.6636642170971506e-06, "loss": 1.1135, "step": 8825 }, { "epoch": 0.9, "grad_norm": 0.69921875, "learning_rate": 2.6371716900783606e-06, "loss": 0.966, "step": 8830 }, { "epoch": 0.9, "grad_norm": 0.86328125, "learning_rate": 2.610807999438747e-06, "loss": 1.0174, "step": 8835 }, { "epoch": 0.9, "grad_norm": 0.69140625, "learning_rate": 2.584573216892827e-06, "loss": 1.0872, "step": 8840 }, { "epoch": 0.9, "grad_norm": 0.734375, "learning_rate": 2.5584674138044606e-06, "loss": 1.0395, "step": 8845 }, { "epoch": 0.9, "grad_norm": 0.8203125, "learning_rate": 2.532490661186665e-06, "loss": 1.1921, "step": 8850 }, { "epoch": 0.9, "grad_norm": 0.8203125, "learning_rate": 2.5066430297014165e-06, "loss": 1.1705, "step": 8855 }, { "epoch": 0.9, "grad_norm": 0.7265625, "learning_rate": 2.4809245896594448e-06, "loss": 1.1343, "step": 8860 }, { "epoch": 0.9, "grad_norm": 0.73046875, "learning_rate": 2.4553354110200487e-06, "loss": 0.9917, "step": 8865 }, { "epoch": 0.9, "grad_norm": 0.7890625, "learning_rate": 2.429875563390932e-06, "loss": 1.0901, "step": 8870 }, { "epoch": 0.9, "grad_norm": 0.6640625, "learning_rate": 2.4045451160279785e-06, "loss": 1.1292, "step": 8875 }, { "epoch": 0.9, "grad_norm": 0.6953125, "learning_rate": 2.3793441378350713e-06, "loss": 1.1216, "step": 8880 }, { "epoch": 0.9, "grad_norm": 0.79296875, "learning_rate": 2.3542726973639184e-06, "loss": 1.0914, "step": 8885 }, { "epoch": 0.91, "grad_norm": 0.75, "learning_rate": 2.3293308628138668e-06, "loss": 0.9495, "step": 8890 }, { "epoch": 0.91, "grad_norm": 0.70703125, "learning_rate": 2.3045187020316994e-06, "loss": 0.9418, "step": 8895 }, { "epoch": 0.91, "grad_norm": 0.71484375, "learning_rate": 2.2798362825114494e-06, "loss": 1.035, "step": 8900 }, { "epoch": 0.91, "grad_norm": 0.80859375, "learning_rate": 2.2552836713942705e-06, "loss": 1.0294, "step": 8905 }, { "epoch": 0.91, "grad_norm": 0.734375, "learning_rate": 2.2308609354681597e-06, "loss": 1.1058, "step": 8910 }, { "epoch": 0.91, "grad_norm": 0.76953125, "learning_rate": 2.206568141167853e-06, "loss": 0.9427, "step": 8915 }, { "epoch": 0.91, "grad_norm": 0.68359375, "learning_rate": 2.1824053545746236e-06, "loss": 1.0727, "step": 8920 }, { "epoch": 0.91, "grad_norm": 0.7890625, "learning_rate": 2.1583726414160843e-06, "loss": 0.9588, "step": 8925 }, { "epoch": 0.91, "grad_norm": 0.5859375, "learning_rate": 2.13447006706603e-06, "loss": 0.9136, "step": 8930 }, { "epoch": 0.91, "grad_norm": 0.76171875, "learning_rate": 2.110697696544256e-06, "loss": 1.1282, "step": 8935 }, { "epoch": 0.91, "grad_norm": 0.953125, "learning_rate": 2.0870555945163627e-06, "loss": 1.2367, "step": 8940 }, { "epoch": 0.91, "grad_norm": 0.734375, "learning_rate": 2.0635438252936124e-06, "loss": 0.9674, "step": 8945 }, { "epoch": 0.91, "grad_norm": 0.68359375, "learning_rate": 2.0401624528327114e-06, "loss": 0.9966, "step": 8950 }, { "epoch": 0.91, "grad_norm": 0.81640625, "learning_rate": 2.0169115407356943e-06, "loss": 1.1108, "step": 8955 }, { "epoch": 0.91, "grad_norm": 0.81640625, "learning_rate": 1.99379115224968e-06, "loss": 1.0323, "step": 8960 }, { "epoch": 0.91, "grad_norm": 1.53125, "learning_rate": 1.970801350266771e-06, "loss": 1.1575, "step": 8965 }, { "epoch": 0.91, "grad_norm": 0.94921875, "learning_rate": 1.9479421973238322e-06, "loss": 1.132, "step": 8970 }, { "epoch": 0.91, "grad_norm": 0.6953125, "learning_rate": 1.9252137556023233e-06, "loss": 1.1065, "step": 8975 }, { "epoch": 0.91, "grad_norm": 0.7890625, "learning_rate": 1.9026160869281773e-06, "loss": 1.279, "step": 8980 }, { "epoch": 0.92, "grad_norm": 0.66015625, "learning_rate": 1.880149252771568e-06, "loss": 0.9565, "step": 8985 }, { "epoch": 0.92, "grad_norm": 0.7109375, "learning_rate": 1.8578133142467812e-06, "loss": 1.0308, "step": 8990 }, { "epoch": 0.92, "grad_norm": 0.77734375, "learning_rate": 1.8356083321120542e-06, "loss": 0.8321, "step": 8995 }, { "epoch": 0.92, "grad_norm": 0.7578125, "learning_rate": 1.8135343667693816e-06, "loss": 1.1632, "step": 9000 }, { "epoch": 0.92, "grad_norm": 0.7734375, "learning_rate": 1.791591478264365e-06, "loss": 1.0824, "step": 9005 }, { "epoch": 0.92, "grad_norm": 0.76953125, "learning_rate": 1.7697797262860527e-06, "loss": 1.0118, "step": 9010 }, { "epoch": 0.92, "grad_norm": 0.6484375, "learning_rate": 1.7480991701668003e-06, "loss": 0.8796, "step": 9015 }, { "epoch": 0.92, "grad_norm": 0.75390625, "learning_rate": 1.726549868882038e-06, "loss": 1.0019, "step": 9020 }, { "epoch": 0.92, "grad_norm": 0.83984375, "learning_rate": 1.705131881050187e-06, "loss": 1.0832, "step": 9025 }, { "epoch": 0.92, "grad_norm": 0.6328125, "learning_rate": 1.6838452649324765e-06, "loss": 0.9772, "step": 9030 }, { "epoch": 0.92, "grad_norm": 0.8203125, "learning_rate": 1.6626900784327603e-06, "loss": 1.0851, "step": 9035 }, { "epoch": 0.92, "grad_norm": 0.6796875, "learning_rate": 1.641666379097373e-06, "loss": 0.9743, "step": 9040 }, { "epoch": 0.92, "grad_norm": 0.63671875, "learning_rate": 1.6207742241150016e-06, "loss": 1.0845, "step": 9045 }, { "epoch": 0.92, "grad_norm": 0.75, "learning_rate": 1.6000136703164803e-06, "loss": 1.0218, "step": 9050 }, { "epoch": 0.92, "grad_norm": 0.71484375, "learning_rate": 1.5793847741746748e-06, "loss": 1.1603, "step": 9055 }, { "epoch": 0.92, "grad_norm": 0.6328125, "learning_rate": 1.5588875918043255e-06, "loss": 1.028, "step": 9060 }, { "epoch": 0.92, "grad_norm": 0.8046875, "learning_rate": 1.5385221789618654e-06, "loss": 1.0683, "step": 9065 }, { "epoch": 0.92, "grad_norm": 0.75, "learning_rate": 1.5182885910452914e-06, "loss": 0.958, "step": 9070 }, { "epoch": 0.92, "grad_norm": 0.84375, "learning_rate": 1.4981868830940216e-06, "loss": 1.0407, "step": 9075 }, { "epoch": 0.92, "grad_norm": 0.69921875, "learning_rate": 1.478217109788732e-06, "loss": 1.0951, "step": 9080 }, { "epoch": 0.93, "grad_norm": 0.8203125, "learning_rate": 1.4583793254511979e-06, "loss": 1.2087, "step": 9085 }, { "epoch": 0.93, "grad_norm": 1.2265625, "learning_rate": 1.43867358404417e-06, "loss": 1.0511, "step": 9090 }, { "epoch": 0.93, "grad_norm": 0.8671875, "learning_rate": 1.4190999391712146e-06, "loss": 1.2468, "step": 9095 }, { "epoch": 0.93, "grad_norm": 0.75, "learning_rate": 1.399658444076557e-06, "loss": 1.1579, "step": 9100 }, { "epoch": 0.93, "grad_norm": 0.70703125, "learning_rate": 1.3803491516449608e-06, "loss": 0.9643, "step": 9105 }, { "epoch": 0.93, "grad_norm": 0.796875, "learning_rate": 1.3611721144015765e-06, "loss": 1.0911, "step": 9110 }, { "epoch": 0.93, "grad_norm": 0.71875, "learning_rate": 1.342127384511771e-06, "loss": 0.9783, "step": 9115 }, { "epoch": 0.93, "grad_norm": 0.65625, "learning_rate": 1.3232150137810428e-06, "loss": 0.9269, "step": 9120 }, { "epoch": 0.93, "grad_norm": 0.74609375, "learning_rate": 1.3044350536548178e-06, "loss": 1.1495, "step": 9125 }, { "epoch": 0.93, "grad_norm": 0.84375, "learning_rate": 1.2857875552183485e-06, "loss": 1.0372, "step": 9130 }, { "epoch": 0.93, "grad_norm": 0.6640625, "learning_rate": 1.2672725691965648e-06, "loss": 1.0084, "step": 9135 }, { "epoch": 0.93, "grad_norm": 0.68359375, "learning_rate": 1.2488901459539403e-06, "loss": 1.1416, "step": 9140 }, { "epoch": 0.93, "grad_norm": 0.71484375, "learning_rate": 1.230640335494354e-06, "loss": 1.0988, "step": 9145 }, { "epoch": 0.93, "grad_norm": 0.67578125, "learning_rate": 1.2125231874609288e-06, "loss": 1.0674, "step": 9150 }, { "epoch": 0.93, "grad_norm": 1.1015625, "learning_rate": 1.194538751135954e-06, "loss": 1.042, "step": 9155 }, { "epoch": 0.93, "grad_norm": 0.765625, "learning_rate": 1.1766870754406855e-06, "loss": 1.0034, "step": 9160 }, { "epoch": 0.93, "grad_norm": 0.7109375, "learning_rate": 1.158968208935257e-06, "loss": 1.2456, "step": 9165 }, { "epoch": 0.93, "grad_norm": 0.921875, "learning_rate": 1.1413821998185526e-06, "loss": 1.0782, "step": 9170 }, { "epoch": 0.93, "grad_norm": 0.78515625, "learning_rate": 1.1239290959280168e-06, "loss": 1.1104, "step": 9175 }, { "epoch": 0.93, "grad_norm": 2.59375, "learning_rate": 1.10660894473959e-06, "loss": 0.9997, "step": 9180 }, { "epoch": 0.94, "grad_norm": 0.94140625, "learning_rate": 1.0894217933675677e-06, "loss": 1.1067, "step": 9185 }, { "epoch": 0.94, "grad_norm": 0.8984375, "learning_rate": 1.0723676885644295e-06, "loss": 1.1516, "step": 9190 }, { "epoch": 0.94, "grad_norm": 0.7265625, "learning_rate": 1.0554466767207672e-06, "loss": 1.1205, "step": 9195 }, { "epoch": 0.94, "grad_norm": 0.82421875, "learning_rate": 1.0386588038651167e-06, "loss": 1.0301, "step": 9200 }, { "epoch": 0.94, "grad_norm": 0.72265625, "learning_rate": 1.022004115663855e-06, "loss": 1.1526, "step": 9205 }, { "epoch": 0.94, "grad_norm": 0.765625, "learning_rate": 1.0054826574210584e-06, "loss": 1.1847, "step": 9210 }, { "epoch": 0.94, "grad_norm": 0.828125, "learning_rate": 9.89094474078428e-07, "loss": 1.1413, "step": 9215 }, { "epoch": 0.94, "grad_norm": 0.81640625, "learning_rate": 9.728396102150872e-07, "loss": 1.2559, "step": 9220 }, { "epoch": 0.94, "grad_norm": 0.67578125, "learning_rate": 9.567181100475164e-07, "loss": 0.9214, "step": 9225 }, { "epoch": 0.94, "grad_norm": 0.73046875, "learning_rate": 9.407300174294365e-07, "loss": 1.1769, "step": 9230 }, { "epoch": 0.94, "grad_norm": 0.703125, "learning_rate": 9.248753758516581e-07, "loss": 0.8906, "step": 9235 }, { "epoch": 0.94, "grad_norm": 0.94140625, "learning_rate": 9.09154228441983e-07, "loss": 1.096, "step": 9240 }, { "epoch": 0.94, "grad_norm": 0.7109375, "learning_rate": 8.935666179650804e-07, "loss": 0.9857, "step": 9245 }, { "epoch": 0.94, "grad_norm": 0.79296875, "learning_rate": 8.781125868223716e-07, "loss": 1.0866, "step": 9250 }, { "epoch": 0.94, "grad_norm": 0.82421875, "learning_rate": 8.627921770519187e-07, "loss": 1.1719, "step": 9255 }, { "epoch": 0.94, "grad_norm": 0.6328125, "learning_rate": 8.476054303282966e-07, "loss": 0.99, "step": 9260 }, { "epoch": 0.94, "grad_norm": 0.77734375, "learning_rate": 8.325523879625096e-07, "loss": 1.1741, "step": 9265 }, { "epoch": 0.94, "grad_norm": 0.82421875, "learning_rate": 8.176330909018426e-07, "loss": 1.1881, "step": 9270 }, { "epoch": 0.94, "grad_norm": 0.765625, "learning_rate": 8.028475797297652e-07, "loss": 0.8986, "step": 9275 }, { "epoch": 0.95, "grad_norm": 0.57421875, "learning_rate": 7.881958946658496e-07, "loss": 0.8785, "step": 9280 }, { "epoch": 0.95, "grad_norm": 0.76171875, "learning_rate": 7.736780755655871e-07, "loss": 1.0865, "step": 9285 }, { "epoch": 0.95, "grad_norm": 0.859375, "learning_rate": 7.592941619203708e-07, "loss": 1.0307, "step": 9290 }, { "epoch": 0.95, "grad_norm": 1.125, "learning_rate": 7.450441928573193e-07, "loss": 1.0542, "step": 9295 }, { "epoch": 0.95, "grad_norm": 0.84375, "learning_rate": 7.309282071392087e-07, "loss": 1.0614, "step": 9300 }, { "epoch": 0.95, "grad_norm": 0.875, "learning_rate": 7.169462431643459e-07, "loss": 0.9932, "step": 9305 }, { "epoch": 0.95, "grad_norm": 0.69921875, "learning_rate": 7.030983389664681e-07, "loss": 0.9256, "step": 9310 }, { "epoch": 0.95, "grad_norm": 0.66015625, "learning_rate": 6.893845322146542e-07, "loss": 1.0094, "step": 9315 }, { "epoch": 0.95, "grad_norm": 0.82421875, "learning_rate": 6.758048602131972e-07, "loss": 1.1286, "step": 9320 }, { "epoch": 0.95, "grad_norm": 1.265625, "learning_rate": 6.623593599015432e-07, "loss": 1.1295, "step": 9325 }, { "epoch": 0.95, "grad_norm": 0.71484375, "learning_rate": 6.490480678541244e-07, "loss": 1.1013, "step": 9330 }, { "epoch": 0.95, "grad_norm": 0.6953125, "learning_rate": 6.358710202803264e-07, "loss": 0.9872, "step": 9335 }, { "epoch": 0.95, "grad_norm": 0.71875, "learning_rate": 6.228282530243656e-07, "loss": 1.083, "step": 9340 }, { "epoch": 0.95, "grad_norm": 0.703125, "learning_rate": 6.099198015651786e-07, "loss": 0.9032, "step": 9345 }, { "epoch": 0.95, "grad_norm": 0.6796875, "learning_rate": 5.971457010163329e-07, "loss": 1.0754, "step": 9350 }, { "epoch": 0.95, "grad_norm": 1.6796875, "learning_rate": 5.845059861259494e-07, "loss": 1.1483, "step": 9355 }, { "epoch": 0.95, "grad_norm": 0.64453125, "learning_rate": 5.720006912765752e-07, "loss": 1.0902, "step": 9360 }, { "epoch": 0.95, "grad_norm": 0.88671875, "learning_rate": 5.596298504851216e-07, "loss": 1.118, "step": 9365 }, { "epoch": 0.95, "grad_norm": 0.9453125, "learning_rate": 5.473934974027595e-07, "loss": 1.0928, "step": 9370 }, { "epoch": 0.95, "grad_norm": 0.88671875, "learning_rate": 5.352916653148077e-07, "loss": 1.0034, "step": 9375 }, { "epoch": 0.96, "grad_norm": 0.7734375, "learning_rate": 5.233243871406779e-07, "loss": 1.2228, "step": 9380 }, { "epoch": 0.96, "grad_norm": 0.6484375, "learning_rate": 5.11491695433769e-07, "loss": 0.8809, "step": 9385 }, { "epoch": 0.96, "grad_norm": 0.765625, "learning_rate": 4.997936223813671e-07, "loss": 1.0031, "step": 9390 }, { "epoch": 0.96, "grad_norm": 0.59765625, "learning_rate": 4.88230199804579e-07, "loss": 0.9258, "step": 9395 }, { "epoch": 0.96, "grad_norm": 0.6796875, "learning_rate": 4.768014591582215e-07, "loss": 1.0305, "step": 9400 }, { "epoch": 0.96, "grad_norm": 0.734375, "learning_rate": 4.65507431530765e-07, "loss": 0.9031, "step": 9405 }, { "epoch": 0.96, "grad_norm": 0.82421875, "learning_rate": 4.543481476442235e-07, "loss": 1.2693, "step": 9410 }, { "epoch": 0.96, "grad_norm": 0.76953125, "learning_rate": 4.4332363785408173e-07, "loss": 1.1679, "step": 9415 }, { "epoch": 0.96, "grad_norm": 0.78125, "learning_rate": 4.324339321492232e-07, "loss": 1.0913, "step": 9420 }, { "epoch": 0.96, "grad_norm": 0.73046875, "learning_rate": 4.21679060151825e-07, "loss": 0.9403, "step": 9425 }, { "epoch": 0.96, "grad_norm": 0.80859375, "learning_rate": 4.1105905111728513e-07, "loss": 1.0614, "step": 9430 }, { "epoch": 0.96, "grad_norm": 0.69921875, "learning_rate": 4.0057393393416185e-07, "loss": 1.1454, "step": 9435 }, { "epoch": 0.96, "grad_norm": 0.68359375, "learning_rate": 3.902237371240791e-07, "loss": 1.0216, "step": 9440 }, { "epoch": 0.96, "grad_norm": 0.78125, "learning_rate": 3.800084888416322e-07, "loss": 1.3414, "step": 9445 }, { "epoch": 0.96, "grad_norm": 0.76171875, "learning_rate": 3.6992821687435454e-07, "loss": 1.0328, "step": 9450 }, { "epoch": 0.96, "grad_norm": 0.75, "learning_rate": 3.5998294864260093e-07, "loss": 1.1775, "step": 9455 }, { "epoch": 0.96, "grad_norm": 0.83984375, "learning_rate": 3.5017271119949236e-07, "loss": 1.1039, "step": 9460 }, { "epoch": 0.96, "grad_norm": 0.69140625, "learning_rate": 3.404975312308378e-07, "loss": 0.9896, "step": 9465 }, { "epoch": 0.96, "grad_norm": 0.83984375, "learning_rate": 3.309574350550626e-07, "loss": 0.9686, "step": 9470 }, { "epoch": 0.96, "grad_norm": 0.64453125, "learning_rate": 3.2155244862314695e-07, "loss": 1.0276, "step": 9475 }, { "epoch": 0.97, "grad_norm": 0.75390625, "learning_rate": 3.1228259751853174e-07, "loss": 1.1181, "step": 9480 }, { "epoch": 0.97, "grad_norm": 0.85546875, "learning_rate": 3.0314790695706865e-07, "loss": 1.0357, "step": 9485 }, { "epoch": 0.97, "grad_norm": 0.640625, "learning_rate": 2.941484017869478e-07, "loss": 0.9547, "step": 9490 }, { "epoch": 0.97, "grad_norm": 0.77734375, "learning_rate": 2.852841064886258e-07, "loss": 1.0971, "step": 9495 }, { "epoch": 0.97, "grad_norm": 0.82421875, "learning_rate": 2.765550451747645e-07, "loss": 1.2219, "step": 9500 }, { "epoch": 0.97, "grad_norm": 0.8046875, "learning_rate": 2.679612415901478e-07, "loss": 1.0231, "step": 9505 }, { "epoch": 0.97, "grad_norm": 0.71484375, "learning_rate": 2.595027191116539e-07, "loss": 1.0979, "step": 9510 }, { "epoch": 0.97, "grad_norm": 0.6328125, "learning_rate": 2.5117950074815545e-07, "loss": 1.0184, "step": 9515 }, { "epoch": 0.97, "grad_norm": 0.78125, "learning_rate": 2.4299160914046934e-07, "loss": 0.884, "step": 9520 }, { "epoch": 0.97, "grad_norm": 0.81640625, "learning_rate": 2.3493906656130161e-07, "loss": 1.2306, "step": 9525 }, { "epoch": 0.97, "grad_norm": 0.72265625, "learning_rate": 2.2702189491518587e-07, "loss": 0.9829, "step": 9530 }, { "epoch": 0.97, "grad_norm": 0.73046875, "learning_rate": 2.1924011573841163e-07, "loss": 0.9877, "step": 9535 }, { "epoch": 0.97, "grad_norm": 0.74609375, "learning_rate": 2.1159375019897397e-07, "loss": 1.1163, "step": 9540 }, { "epoch": 0.97, "grad_norm": 0.75390625, "learning_rate": 2.0408281909652382e-07, "loss": 1.0729, "step": 9545 }, { "epoch": 0.97, "grad_norm": 0.6796875, "learning_rate": 1.9670734286229565e-07, "loss": 1.1664, "step": 9550 }, { "epoch": 0.97, "grad_norm": 0.875, "learning_rate": 1.8946734155905755e-07, "loss": 1.1987, "step": 9555 }, { "epoch": 0.97, "grad_norm": 0.77734375, "learning_rate": 1.8236283488107243e-07, "loss": 1.0129, "step": 9560 }, { "epoch": 0.97, "grad_norm": 0.6796875, "learning_rate": 1.7539384215401468e-07, "loss": 0.9329, "step": 9565 }, { "epoch": 0.97, "grad_norm": 0.77734375, "learning_rate": 1.6856038233494243e-07, "loss": 1.3408, "step": 9570 }, { "epoch": 0.98, "grad_norm": 0.734375, "learning_rate": 1.6186247401223653e-07, "loss": 1.0876, "step": 9575 }, { "epoch": 0.98, "grad_norm": 0.65234375, "learning_rate": 1.55300135405545e-07, "loss": 1.138, "step": 9580 }, { "epoch": 0.98, "grad_norm": 1.3515625, "learning_rate": 1.4887338436574417e-07, "loss": 1.0621, "step": 9585 }, { "epoch": 0.98, "grad_norm": 0.76171875, "learning_rate": 1.425822383748887e-07, "loss": 0.9534, "step": 9590 }, { "epoch": 0.98, "grad_norm": 0.7578125, "learning_rate": 1.3642671454615065e-07, "loss": 1.0583, "step": 9595 }, { "epoch": 0.98, "grad_norm": 0.71875, "learning_rate": 1.3040682962379148e-07, "loss": 0.8461, "step": 9600 }, { "epoch": 0.98, "grad_norm": 0.6953125, "learning_rate": 1.2452259998310123e-07, "loss": 0.9082, "step": 9605 }, { "epoch": 0.98, "grad_norm": 1.3125, "learning_rate": 1.1877404163035954e-07, "loss": 1.1428, "step": 9610 }, { "epoch": 0.98, "grad_norm": 0.59765625, "learning_rate": 1.1316117020280792e-07, "loss": 0.8789, "step": 9615 }, { "epoch": 0.98, "grad_norm": 0.89453125, "learning_rate": 1.0768400096856645e-07, "loss": 1.034, "step": 9620 }, { "epoch": 0.98, "grad_norm": 0.8125, "learning_rate": 1.0234254882664496e-07, "loss": 1.0811, "step": 9625 }, { "epoch": 0.98, "grad_norm": 0.796875, "learning_rate": 9.713682830685411e-08, "loss": 1.0992, "step": 9630 }, { "epoch": 0.98, "grad_norm": 0.84375, "learning_rate": 9.206685356980549e-08, "loss": 1.1375, "step": 9635 }, { "epoch": 0.98, "grad_norm": 0.74609375, "learning_rate": 8.713263840683939e-08, "loss": 1.0984, "step": 9640 }, { "epoch": 0.98, "grad_norm": 0.73046875, "learning_rate": 8.233419624000816e-08, "loss": 1.1631, "step": 9645 }, { "epoch": 0.98, "grad_norm": 0.70703125, "learning_rate": 7.767154012203736e-08, "loss": 1.0627, "step": 9650 }, { "epoch": 0.98, "grad_norm": 0.74609375, "learning_rate": 7.314468273628695e-08, "loss": 0.9164, "step": 9655 }, { "epoch": 0.98, "grad_norm": 0.79296875, "learning_rate": 6.875363639671229e-08, "loss": 1.016, "step": 9660 }, { "epoch": 0.98, "grad_norm": 0.74609375, "learning_rate": 6.44984130478421e-08, "loss": 0.9653, "step": 9665 }, { "epoch": 0.98, "grad_norm": 0.63671875, "learning_rate": 6.037902426473952e-08, "loss": 0.9356, "step": 9670 }, { "epoch": 0.99, "grad_norm": 0.7109375, "learning_rate": 5.639548125295768e-08, "loss": 1.1079, "step": 9675 }, { "epoch": 0.99, "grad_norm": 0.72265625, "learning_rate": 5.2547794848539775e-08, "loss": 0.9644, "step": 9680 }, { "epoch": 0.99, "grad_norm": 0.76171875, "learning_rate": 4.883597551795793e-08, "loss": 1.0018, "step": 9685 }, { "epoch": 0.99, "grad_norm": 0.74609375, "learning_rate": 4.5260033358107686e-08, "loss": 1.0907, "step": 9690 }, { "epoch": 0.99, "grad_norm": 0.703125, "learning_rate": 4.181997809626914e-08, "loss": 1.1787, "step": 9695 }, { "epoch": 0.99, "grad_norm": 0.78125, "learning_rate": 3.8515819090073627e-08, "loss": 0.9285, "step": 9700 }, { "epoch": 0.99, "grad_norm": 0.77734375, "learning_rate": 3.534756532750927e-08, "loss": 0.981, "step": 9705 }, { "epoch": 0.99, "grad_norm": 0.625, "learning_rate": 3.2315225426843286e-08, "loss": 1.1156, "step": 9710 }, { "epoch": 0.99, "grad_norm": 0.78515625, "learning_rate": 2.941880763666083e-08, "loss": 1.1838, "step": 9715 }, { "epoch": 0.99, "grad_norm": 0.8125, "learning_rate": 2.665831983579836e-08, "loss": 1.1042, "step": 9720 }, { "epoch": 0.99, "grad_norm": 1.0, "learning_rate": 2.4033769533327034e-08, "loss": 1.0008, "step": 9725 }, { "epoch": 0.99, "grad_norm": 0.78515625, "learning_rate": 2.1545163868552654e-08, "loss": 1.1072, "step": 9730 }, { "epoch": 0.99, "grad_norm": 0.62109375, "learning_rate": 1.919250961098795e-08, "loss": 0.8663, "step": 9735 }, { "epoch": 0.99, "grad_norm": 0.8671875, "learning_rate": 1.6975813160313713e-08, "loss": 0.9007, "step": 9740 }, { "epoch": 0.99, "grad_norm": 0.89453125, "learning_rate": 1.4895080546395435e-08, "loss": 0.823, "step": 9745 }, { "epoch": 0.99, "grad_norm": 0.69921875, "learning_rate": 1.2950317429238911e-08, "loss": 0.9001, "step": 9750 }, { "epoch": 0.99, "grad_norm": 0.80859375, "learning_rate": 1.1141529098990244e-08, "loss": 1.1884, "step": 9755 }, { "epoch": 0.99, "grad_norm": 0.7265625, "learning_rate": 9.468720475913628e-09, "loss": 0.9437, "step": 9760 }, { "epoch": 0.99, "grad_norm": 0.72265625, "learning_rate": 7.931896110391357e-09, "loss": 1.0478, "step": 9765 }, { "epoch": 0.99, "grad_norm": 0.87890625, "learning_rate": 6.531060182884962e-09, "loss": 1.1108, "step": 9770 }, { "epoch": 1.0, "grad_norm": 0.70703125, "learning_rate": 5.2662165039518704e-09, "loss": 0.9931, "step": 9775 }, { "epoch": 1.0, "grad_norm": 0.6953125, "learning_rate": 4.137368514217643e-09, "loss": 1.0868, "step": 9780 }, { "epoch": 1.0, "grad_norm": 0.79296875, "learning_rate": 3.144519284381531e-09, "loss": 1.1434, "step": 9785 }, { "epoch": 1.0, "grad_norm": 0.76953125, "learning_rate": 2.287671515183165e-09, "loss": 1.0663, "step": 9790 }, { "epoch": 1.0, "grad_norm": 0.63671875, "learning_rate": 1.5668275374303153e-09, "loss": 0.9415, "step": 9795 }, { "epoch": 1.0, "grad_norm": 0.91015625, "learning_rate": 9.819893119544788e-10, "loss": 0.9083, "step": 9800 }, { "epoch": 1.0, "grad_norm": 0.796875, "learning_rate": 5.33158429638636e-10, "loss": 1.12, "step": 9805 }, { "epoch": 1.0, "grad_norm": 0.59375, "learning_rate": 2.203361113894964e-10, "loss": 0.9311, "step": 9810 }, { "epoch": 1.0, "grad_norm": 0.671875, "learning_rate": 4.352320814859923e-11, "loss": 0.9898, "step": 9815 }, { "epoch": 1.0, "step": 9819, "total_flos": 1.2142702597546967e+18, "train_loss": 1.0705042276791343, "train_runtime": 37728.5727, "train_samples_per_second": 1.041, "train_steps_per_second": 0.26 } ], "logging_steps": 5, "max_steps": 9819, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "total_flos": 1.2142702597546967e+18, "train_batch_size": 1, "trial_name": null, "trial_params": null }