diff --git "a/checkpoint-349372/trainer_state.json" "b/checkpoint-349372/trainer_state.json" new file mode 100644--- /dev/null +++ "b/checkpoint-349372/trainer_state.json" @@ -0,0 +1,5099 @@ +{ + "best_metric": 0.89, + "best_model_checkpoint": "models/evacun-lemmatization/checkpoint-349372", + "epoch": 19.0, + "eval_steps": 500, + "global_step": 349372, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.027191646726125736, + "grad_norm": 3.1259007453918457, + "learning_rate": 9.994561670654776e-06, + "loss": 4.085, + "step": 500 + }, + { + "epoch": 0.05438329345225147, + "grad_norm": 3.907444953918457, + "learning_rate": 9.98912334130955e-06, + "loss": 1.9495, + "step": 1000 + }, + { + "epoch": 0.0815749401783772, + "grad_norm": 2.732598066329956, + "learning_rate": 9.983685011964326e-06, + "loss": 1.4816, + "step": 1500 + }, + { + "epoch": 0.10876658690450294, + "grad_norm": 5.2942376136779785, + "learning_rate": 9.9782466826191e-06, + "loss": 1.2103, + "step": 2000 + }, + { + "epoch": 0.13595823363062867, + "grad_norm": 4.758411884307861, + "learning_rate": 9.972808353273875e-06, + "loss": 1.0154, + "step": 2500 + }, + { + "epoch": 0.1631498803567544, + "grad_norm": 6.896116733551025, + "learning_rate": 9.96737002392865e-06, + "loss": 0.9023, + "step": 3000 + }, + { + "epoch": 0.19034152708288013, + "grad_norm": 5.587175369262695, + "learning_rate": 9.961931694583424e-06, + "loss": 0.7953, + "step": 3500 + }, + { + "epoch": 0.2175331738090059, + "grad_norm": 5.7429704666137695, + "learning_rate": 9.9564933652382e-06, + "loss": 0.7148, + "step": 4000 + }, + { + "epoch": 0.24472482053513162, + "grad_norm": 4.422295093536377, + "learning_rate": 9.951055035892974e-06, + "loss": 0.6566, + "step": 4500 + }, + { + "epoch": 0.27191646726125734, + "grad_norm": 3.8143515586853027, + "learning_rate": 9.94561670654775e-06, + "loss": 0.6051, + "step": 5000 + }, + { + "epoch": 0.2991081139873831, + "grad_norm": 3.3009655475616455, + "learning_rate": 9.940178377202525e-06, + "loss": 0.5562, + "step": 5500 + }, + { + "epoch": 0.3262997607135088, + "grad_norm": 4.799742221832275, + "learning_rate": 9.934740047857298e-06, + "loss": 0.5165, + "step": 6000 + }, + { + "epoch": 0.35349140743963453, + "grad_norm": 4.289649486541748, + "learning_rate": 9.929301718512075e-06, + "loss": 0.4773, + "step": 6500 + }, + { + "epoch": 0.38068305416576026, + "grad_norm": 5.090242385864258, + "learning_rate": 9.923863389166849e-06, + "loss": 0.4651, + "step": 7000 + }, + { + "epoch": 0.407874700891886, + "grad_norm": 3.0283756256103516, + "learning_rate": 9.918425059821624e-06, + "loss": 0.4524, + "step": 7500 + }, + { + "epoch": 0.4350663476180118, + "grad_norm": 3.258117198944092, + "learning_rate": 9.912986730476399e-06, + "loss": 0.4214, + "step": 8000 + }, + { + "epoch": 0.4622579943441375, + "grad_norm": 2.3887009620666504, + "learning_rate": 9.907548401131173e-06, + "loss": 0.393, + "step": 8500 + }, + { + "epoch": 0.48944964107026323, + "grad_norm": 4.268918037414551, + "learning_rate": 9.90211007178595e-06, + "loss": 0.3776, + "step": 9000 + }, + { + "epoch": 0.516641287796389, + "grad_norm": 3.5440762042999268, + "learning_rate": 9.896671742440723e-06, + "loss": 0.3549, + "step": 9500 + }, + { + "epoch": 0.5438329345225147, + "grad_norm": 3.123277187347412, + "learning_rate": 9.891233413095498e-06, + "loss": 0.3558, + "step": 10000 + }, + { + "epoch": 0.5710245812486404, + "grad_norm": 2.6814281940460205, + "learning_rate": 9.885795083750273e-06, + "loss": 0.3392, + "step": 10500 + }, + { + "epoch": 0.5982162279747661, + "grad_norm": 3.1762001514434814, + "learning_rate": 9.880356754405047e-06, + "loss": 0.3232, + "step": 11000 + }, + { + "epoch": 0.6254078747008919, + "grad_norm": 2.568861722946167, + "learning_rate": 9.874918425059824e-06, + "loss": 0.334, + "step": 11500 + }, + { + "epoch": 0.6525995214270176, + "grad_norm": 3.985795736312866, + "learning_rate": 9.869480095714597e-06, + "loss": 0.3126, + "step": 12000 + }, + { + "epoch": 0.6797911681531433, + "grad_norm": 3.3112289905548096, + "learning_rate": 9.864041766369372e-06, + "loss": 0.3023, + "step": 12500 + }, + { + "epoch": 0.7069828148792691, + "grad_norm": 5.755519390106201, + "learning_rate": 9.858603437024146e-06, + "loss": 0.2828, + "step": 13000 + }, + { + "epoch": 0.7341744616053948, + "grad_norm": 2.3035266399383545, + "learning_rate": 9.853165107678921e-06, + "loss": 0.2752, + "step": 13500 + }, + { + "epoch": 0.7613661083315205, + "grad_norm": 4.862213611602783, + "learning_rate": 9.847726778333696e-06, + "loss": 0.2518, + "step": 14000 + }, + { + "epoch": 0.7885577550576462, + "grad_norm": 5.991926193237305, + "learning_rate": 9.842288448988472e-06, + "loss": 0.2571, + "step": 14500 + }, + { + "epoch": 0.815749401783772, + "grad_norm": 5.180552959442139, + "learning_rate": 9.836850119643247e-06, + "loss": 0.2655, + "step": 15000 + }, + { + "epoch": 0.8429410485098977, + "grad_norm": 1.392749309539795, + "learning_rate": 9.83141179029802e-06, + "loss": 0.2618, + "step": 15500 + }, + { + "epoch": 0.8701326952360235, + "grad_norm": 2.431339740753174, + "learning_rate": 9.825973460952795e-06, + "loss": 0.2577, + "step": 16000 + }, + { + "epoch": 0.8973243419621493, + "grad_norm": 2.325584888458252, + "learning_rate": 9.82053513160757e-06, + "loss": 0.2389, + "step": 16500 + }, + { + "epoch": 0.924515988688275, + "grad_norm": 2.193328619003296, + "learning_rate": 9.815096802262346e-06, + "loss": 0.2404, + "step": 17000 + }, + { + "epoch": 0.9517076354144007, + "grad_norm": 2.3462648391723633, + "learning_rate": 9.809658472917121e-06, + "loss": 0.2299, + "step": 17500 + }, + { + "epoch": 0.9788992821405265, + "grad_norm": 1.676985502243042, + "learning_rate": 9.804220143571895e-06, + "loss": 0.2351, + "step": 18000 + }, + { + "epoch": 1.0, + "eval_exact_match": 0.7948, + "eval_loss": 0.2527632415294647, + "eval_runtime": 1021.1942, + "eval_samples_per_second": 11.141, + "eval_steps_per_second": 0.697, + "step": 18388 + }, + { + "epoch": 1.0060909288666522, + "grad_norm": 2.935654640197754, + "learning_rate": 9.79878181422667e-06, + "loss": 0.2187, + "step": 18500 + }, + { + "epoch": 1.033282575592778, + "grad_norm": 4.042050838470459, + "learning_rate": 9.793343484881445e-06, + "loss": 0.184, + "step": 19000 + }, + { + "epoch": 1.0604742223189036, + "grad_norm": 2.6612393856048584, + "learning_rate": 9.78790515553622e-06, + "loss": 0.1627, + "step": 19500 + }, + { + "epoch": 1.0876658690450294, + "grad_norm": 0.839908242225647, + "learning_rate": 9.782466826190995e-06, + "loss": 0.1761, + "step": 20000 + }, + { + "epoch": 1.114857515771155, + "grad_norm": 3.877523899078369, + "learning_rate": 9.777028496845769e-06, + "loss": 0.1675, + "step": 20500 + }, + { + "epoch": 1.1420491624972808, + "grad_norm": 2.287436008453369, + "learning_rate": 9.771590167500544e-06, + "loss": 0.1713, + "step": 21000 + }, + { + "epoch": 1.1692408092234066, + "grad_norm": 2.2683210372924805, + "learning_rate": 9.76615183815532e-06, + "loss": 0.1671, + "step": 21500 + }, + { + "epoch": 1.1964324559495323, + "grad_norm": 0.6974703669548035, + "learning_rate": 9.760713508810095e-06, + "loss": 0.167, + "step": 22000 + }, + { + "epoch": 1.223624102675658, + "grad_norm": 0.8777428865432739, + "learning_rate": 9.75527517946487e-06, + "loss": 0.1569, + "step": 22500 + }, + { + "epoch": 1.2508157494017837, + "grad_norm": 2.61319637298584, + "learning_rate": 9.749836850119643e-06, + "loss": 0.1648, + "step": 23000 + }, + { + "epoch": 1.2780073961279095, + "grad_norm": 3.361828565597534, + "learning_rate": 9.744398520774418e-06, + "loss": 0.1529, + "step": 23500 + }, + { + "epoch": 1.3051990428540352, + "grad_norm": 2.4574973583221436, + "learning_rate": 9.738960191429194e-06, + "loss": 0.1587, + "step": 24000 + }, + { + "epoch": 1.332390689580161, + "grad_norm": 2.845959424972534, + "learning_rate": 9.733521862083969e-06, + "loss": 0.146, + "step": 24500 + }, + { + "epoch": 1.3595823363062867, + "grad_norm": 2.5402307510375977, + "learning_rate": 9.728083532738744e-06, + "loss": 0.1494, + "step": 25000 + }, + { + "epoch": 1.3867739830324124, + "grad_norm": 2.56087327003479, + "learning_rate": 9.722645203393518e-06, + "loss": 0.1533, + "step": 25500 + }, + { + "epoch": 1.4139656297585381, + "grad_norm": 2.302635431289673, + "learning_rate": 9.717206874048293e-06, + "loss": 0.1557, + "step": 26000 + }, + { + "epoch": 1.4411572764846639, + "grad_norm": 5.063803672790527, + "learning_rate": 9.711768544703068e-06, + "loss": 0.1614, + "step": 26500 + }, + { + "epoch": 1.4683489232107896, + "grad_norm": 4.056577682495117, + "learning_rate": 9.706330215357843e-06, + "loss": 0.1529, + "step": 27000 + }, + { + "epoch": 1.4955405699369153, + "grad_norm": 4.56748104095459, + "learning_rate": 9.700891886012618e-06, + "loss": 0.1458, + "step": 27500 + }, + { + "epoch": 1.5227322166630413, + "grad_norm": 3.91300892829895, + "learning_rate": 9.695453556667392e-06, + "loss": 0.1399, + "step": 28000 + }, + { + "epoch": 1.549923863389167, + "grad_norm": 2.8584766387939453, + "learning_rate": 9.690015227322167e-06, + "loss": 0.1372, + "step": 28500 + }, + { + "epoch": 1.5771155101152927, + "grad_norm": 1.0653077363967896, + "learning_rate": 9.684576897976942e-06, + "loss": 0.1505, + "step": 29000 + }, + { + "epoch": 1.6043071568414184, + "grad_norm": 2.1718199253082275, + "learning_rate": 9.679138568631718e-06, + "loss": 0.132, + "step": 29500 + }, + { + "epoch": 1.6314988035675442, + "grad_norm": 2.4078354835510254, + "learning_rate": 9.673700239286493e-06, + "loss": 0.1354, + "step": 30000 + }, + { + "epoch": 1.65869045029367, + "grad_norm": 2.3602287769317627, + "learning_rate": 9.668261909941266e-06, + "loss": 0.1478, + "step": 30500 + }, + { + "epoch": 1.6858820970197956, + "grad_norm": 5.312971591949463, + "learning_rate": 9.662823580596041e-06, + "loss": 0.1443, + "step": 31000 + }, + { + "epoch": 1.7130737437459214, + "grad_norm": 1.5108168125152588, + "learning_rate": 9.657385251250817e-06, + "loss": 0.1442, + "step": 31500 + }, + { + "epoch": 1.740265390472047, + "grad_norm": 2.7200069427490234, + "learning_rate": 9.651946921905592e-06, + "loss": 0.138, + "step": 32000 + }, + { + "epoch": 1.7674570371981728, + "grad_norm": 1.6983907222747803, + "learning_rate": 9.646508592560367e-06, + "loss": 0.1472, + "step": 32500 + }, + { + "epoch": 1.7946486839242985, + "grad_norm": 4.2195024490356445, + "learning_rate": 9.64107026321514e-06, + "loss": 0.1327, + "step": 33000 + }, + { + "epoch": 1.8218403306504243, + "grad_norm": 2.506478786468506, + "learning_rate": 9.635631933869916e-06, + "loss": 0.1382, + "step": 33500 + }, + { + "epoch": 1.84903197737655, + "grad_norm": 5.781156539916992, + "learning_rate": 9.630193604524691e-06, + "loss": 0.1301, + "step": 34000 + }, + { + "epoch": 1.8762236241026757, + "grad_norm": 1.8185195922851562, + "learning_rate": 9.624755275179466e-06, + "loss": 0.1294, + "step": 34500 + }, + { + "epoch": 1.9034152708288015, + "grad_norm": 4.116232872009277, + "learning_rate": 9.619316945834241e-06, + "loss": 0.1285, + "step": 35000 + }, + { + "epoch": 1.9306069175549272, + "grad_norm": 2.8177270889282227, + "learning_rate": 9.613878616489015e-06, + "loss": 0.1282, + "step": 35500 + }, + { + "epoch": 1.957798564281053, + "grad_norm": 0.8758026361465454, + "learning_rate": 9.60844028714379e-06, + "loss": 0.1327, + "step": 36000 + }, + { + "epoch": 1.9849902110071787, + "grad_norm": 1.9684972763061523, + "learning_rate": 9.603001957798565e-06, + "loss": 0.1237, + "step": 36500 + }, + { + "epoch": 2.0, + "eval_exact_match": 0.8592, + "eval_loss": 0.1779375970363617, + "eval_runtime": 1020.8338, + "eval_samples_per_second": 11.145, + "eval_steps_per_second": 0.697, + "step": 36776 + }, + { + "epoch": 2.0121818577333044, + "grad_norm": 2.499526262283325, + "learning_rate": 9.59756362845334e-06, + "loss": 0.1125, + "step": 37000 + }, + { + "epoch": 2.03937350445943, + "grad_norm": 0.8748095631599426, + "learning_rate": 9.592125299108114e-06, + "loss": 0.0825, + "step": 37500 + }, + { + "epoch": 2.066565151185556, + "grad_norm": 3.8996646404266357, + "learning_rate": 9.58668696976289e-06, + "loss": 0.0833, + "step": 38000 + }, + { + "epoch": 2.0937567979116816, + "grad_norm": 4.08225154876709, + "learning_rate": 9.581248640417664e-06, + "loss": 0.0848, + "step": 38500 + }, + { + "epoch": 2.1209484446378073, + "grad_norm": 2.4132580757141113, + "learning_rate": 9.57581031107244e-06, + "loss": 0.0816, + "step": 39000 + }, + { + "epoch": 2.148140091363933, + "grad_norm": 3.604099988937378, + "learning_rate": 9.570371981727215e-06, + "loss": 0.0871, + "step": 39500 + }, + { + "epoch": 2.1753317380900588, + "grad_norm": 0.1320401430130005, + "learning_rate": 9.564933652381988e-06, + "loss": 0.0885, + "step": 40000 + }, + { + "epoch": 2.2025233848161845, + "grad_norm": 0.4408586621284485, + "learning_rate": 9.559495323036764e-06, + "loss": 0.0873, + "step": 40500 + }, + { + "epoch": 2.22971503154231, + "grad_norm": 1.62918221950531, + "learning_rate": 9.554056993691539e-06, + "loss": 0.0911, + "step": 41000 + }, + { + "epoch": 2.256906678268436, + "grad_norm": 1.7283786535263062, + "learning_rate": 9.548618664346314e-06, + "loss": 0.0851, + "step": 41500 + }, + { + "epoch": 2.2840983249945617, + "grad_norm": 3.522033452987671, + "learning_rate": 9.54318033500109e-06, + "loss": 0.0806, + "step": 42000 + }, + { + "epoch": 2.3112899717206874, + "grad_norm": 1.8525676727294922, + "learning_rate": 9.537742005655863e-06, + "loss": 0.0776, + "step": 42500 + }, + { + "epoch": 2.338481618446813, + "grad_norm": 2.7800660133361816, + "learning_rate": 9.532303676310638e-06, + "loss": 0.0875, + "step": 43000 + }, + { + "epoch": 2.365673265172939, + "grad_norm": 0.9835543632507324, + "learning_rate": 9.526865346965413e-06, + "loss": 0.0797, + "step": 43500 + }, + { + "epoch": 2.3928649118990646, + "grad_norm": 4.02990198135376, + "learning_rate": 9.521427017620188e-06, + "loss": 0.0822, + "step": 44000 + }, + { + "epoch": 2.4200565586251903, + "grad_norm": 3.2835583686828613, + "learning_rate": 9.515988688274963e-06, + "loss": 0.0896, + "step": 44500 + }, + { + "epoch": 2.447248205351316, + "grad_norm": 2.090576171875, + "learning_rate": 9.510550358929737e-06, + "loss": 0.0797, + "step": 45000 + }, + { + "epoch": 2.4744398520774418, + "grad_norm": 1.9766199588775635, + "learning_rate": 9.505112029584512e-06, + "loss": 0.0853, + "step": 45500 + }, + { + "epoch": 2.5016314988035675, + "grad_norm": 4.452338695526123, + "learning_rate": 9.499673700239287e-06, + "loss": 0.0862, + "step": 46000 + }, + { + "epoch": 2.5288231455296932, + "grad_norm": 0.5408188104629517, + "learning_rate": 9.494235370894063e-06, + "loss": 0.0831, + "step": 46500 + }, + { + "epoch": 2.556014792255819, + "grad_norm": 1.3575879335403442, + "learning_rate": 9.488797041548838e-06, + "loss": 0.0872, + "step": 47000 + }, + { + "epoch": 2.5832064389819447, + "grad_norm": 1.4951727390289307, + "learning_rate": 9.483358712203611e-06, + "loss": 0.0851, + "step": 47500 + }, + { + "epoch": 2.6103980857080704, + "grad_norm": 2.675262212753296, + "learning_rate": 9.477920382858387e-06, + "loss": 0.0823, + "step": 48000 + }, + { + "epoch": 2.637589732434196, + "grad_norm": 2.4334521293640137, + "learning_rate": 9.472482053513162e-06, + "loss": 0.0793, + "step": 48500 + }, + { + "epoch": 2.664781379160322, + "grad_norm": 3.254221200942993, + "learning_rate": 9.467043724167937e-06, + "loss": 0.0814, + "step": 49000 + }, + { + "epoch": 2.6919730258864476, + "grad_norm": 2.9325039386749268, + "learning_rate": 9.461605394822712e-06, + "loss": 0.084, + "step": 49500 + }, + { + "epoch": 2.7191646726125733, + "grad_norm": 1.1457610130310059, + "learning_rate": 9.456167065477486e-06, + "loss": 0.0834, + "step": 50000 + }, + { + "epoch": 2.746356319338699, + "grad_norm": 3.7050232887268066, + "learning_rate": 9.450728736132261e-06, + "loss": 0.0866, + "step": 50500 + }, + { + "epoch": 2.773547966064825, + "grad_norm": 0.22456876933574677, + "learning_rate": 9.445290406787036e-06, + "loss": 0.0872, + "step": 51000 + }, + { + "epoch": 2.8007396127909505, + "grad_norm": 0.23606906831264496, + "learning_rate": 9.439852077441811e-06, + "loss": 0.0764, + "step": 51500 + }, + { + "epoch": 2.8279312595170762, + "grad_norm": 3.726656436920166, + "learning_rate": 9.434413748096586e-06, + "loss": 0.0808, + "step": 52000 + }, + { + "epoch": 2.855122906243202, + "grad_norm": 0.7011487483978271, + "learning_rate": 9.42897541875136e-06, + "loss": 0.0843, + "step": 52500 + }, + { + "epoch": 2.8823145529693277, + "grad_norm": 4.318293571472168, + "learning_rate": 9.423537089406135e-06, + "loss": 0.0743, + "step": 53000 + }, + { + "epoch": 2.9095061996954534, + "grad_norm": 1.4616190195083618, + "learning_rate": 9.41809876006091e-06, + "loss": 0.0765, + "step": 53500 + }, + { + "epoch": 2.936697846421579, + "grad_norm": 0.5177611112594604, + "learning_rate": 9.412660430715684e-06, + "loss": 0.0768, + "step": 54000 + }, + { + "epoch": 2.963889493147705, + "grad_norm": 1.2543549537658691, + "learning_rate": 9.40722210137046e-06, + "loss": 0.078, + "step": 54500 + }, + { + "epoch": 2.9910811398738306, + "grad_norm": 3.5229008197784424, + "learning_rate": 9.401783772025234e-06, + "loss": 0.0861, + "step": 55000 + }, + { + "epoch": 3.0, + "eval_exact_match": 0.8693, + "eval_loss": 0.16594479978084564, + "eval_runtime": 1021.2082, + "eval_samples_per_second": 11.141, + "eval_steps_per_second": 0.697, + "step": 55164 + }, + { + "epoch": 3.0182727865999563, + "grad_norm": 0.3636282980442047, + "learning_rate": 9.39634544268001e-06, + "loss": 0.057, + "step": 55500 + }, + { + "epoch": 3.045464433326082, + "grad_norm": 1.1032425165176392, + "learning_rate": 9.390907113334785e-06, + "loss": 0.0468, + "step": 56000 + }, + { + "epoch": 3.072656080052208, + "grad_norm": 1.8635987043380737, + "learning_rate": 9.385468783989558e-06, + "loss": 0.0458, + "step": 56500 + }, + { + "epoch": 3.0998477267783335, + "grad_norm": 0.11481478065252304, + "learning_rate": 9.380030454644335e-06, + "loss": 0.0487, + "step": 57000 + }, + { + "epoch": 3.1270393735044593, + "grad_norm": 1.2612336874008179, + "learning_rate": 9.374592125299109e-06, + "loss": 0.0472, + "step": 57500 + }, + { + "epoch": 3.154231020230585, + "grad_norm": 3.589947462081909, + "learning_rate": 9.369153795953884e-06, + "loss": 0.0512, + "step": 58000 + }, + { + "epoch": 3.1814226669567107, + "grad_norm": 2.791079521179199, + "learning_rate": 9.363715466608659e-06, + "loss": 0.0499, + "step": 58500 + }, + { + "epoch": 3.2086143136828364, + "grad_norm": 2.4952220916748047, + "learning_rate": 9.358277137263433e-06, + "loss": 0.0477, + "step": 59000 + }, + { + "epoch": 3.235805960408962, + "grad_norm": 1.147648572921753, + "learning_rate": 9.35283880791821e-06, + "loss": 0.0484, + "step": 59500 + }, + { + "epoch": 3.262997607135088, + "grad_norm": 0.35628893971443176, + "learning_rate": 9.347400478572983e-06, + "loss": 0.0504, + "step": 60000 + }, + { + "epoch": 3.2901892538612136, + "grad_norm": 1.9759888648986816, + "learning_rate": 9.341962149227758e-06, + "loss": 0.0515, + "step": 60500 + }, + { + "epoch": 3.31738090058734, + "grad_norm": 0.2888725697994232, + "learning_rate": 9.336523819882533e-06, + "loss": 0.0523, + "step": 61000 + }, + { + "epoch": 3.344572547313465, + "grad_norm": 3.626575469970703, + "learning_rate": 9.331085490537307e-06, + "loss": 0.0499, + "step": 61500 + }, + { + "epoch": 3.3717641940395913, + "grad_norm": 1.1794458627700806, + "learning_rate": 9.325647161192082e-06, + "loss": 0.0482, + "step": 62000 + }, + { + "epoch": 3.3989558407657166, + "grad_norm": 1.3318313360214233, + "learning_rate": 9.320208831846857e-06, + "loss": 0.0558, + "step": 62500 + }, + { + "epoch": 3.4261474874918427, + "grad_norm": 0.2946118712425232, + "learning_rate": 9.314770502501632e-06, + "loss": 0.0524, + "step": 63000 + }, + { + "epoch": 3.453339134217968, + "grad_norm": 0.9527666568756104, + "learning_rate": 9.309332173156406e-06, + "loss": 0.0523, + "step": 63500 + }, + { + "epoch": 3.480530780944094, + "grad_norm": 1.9586501121520996, + "learning_rate": 9.303893843811181e-06, + "loss": 0.0496, + "step": 64000 + }, + { + "epoch": 3.5077224276702195, + "grad_norm": 3.5742673873901367, + "learning_rate": 9.298455514465956e-06, + "loss": 0.0562, + "step": 64500 + }, + { + "epoch": 3.5349140743963456, + "grad_norm": 1.3943268060684204, + "learning_rate": 9.293017185120732e-06, + "loss": 0.0513, + "step": 65000 + }, + { + "epoch": 3.562105721122471, + "grad_norm": 3.8523316383361816, + "learning_rate": 9.287578855775507e-06, + "loss": 0.0515, + "step": 65500 + }, + { + "epoch": 3.589297367848597, + "grad_norm": 1.121957540512085, + "learning_rate": 9.28214052643028e-06, + "loss": 0.0547, + "step": 66000 + }, + { + "epoch": 3.6164890145747224, + "grad_norm": 0.9070321321487427, + "learning_rate": 9.276702197085056e-06, + "loss": 0.0482, + "step": 66500 + }, + { + "epoch": 3.6436806613008486, + "grad_norm": 0.5214864611625671, + "learning_rate": 9.27126386773983e-06, + "loss": 0.0505, + "step": 67000 + }, + { + "epoch": 3.670872308026974, + "grad_norm": 3.1231439113616943, + "learning_rate": 9.265825538394606e-06, + "loss": 0.0497, + "step": 67500 + }, + { + "epoch": 3.6980639547531, + "grad_norm": 2.436281204223633, + "learning_rate": 9.260387209049381e-06, + "loss": 0.0444, + "step": 68000 + }, + { + "epoch": 3.7252556014792257, + "grad_norm": 1.7644033432006836, + "learning_rate": 9.254948879704155e-06, + "loss": 0.0493, + "step": 68500 + }, + { + "epoch": 3.7524472482053515, + "grad_norm": 0.840013861656189, + "learning_rate": 9.24951055035893e-06, + "loss": 0.0536, + "step": 69000 + }, + { + "epoch": 3.779638894931477, + "grad_norm": 2.0115058422088623, + "learning_rate": 9.244072221013705e-06, + "loss": 0.0522, + "step": 69500 + }, + { + "epoch": 3.806830541657603, + "grad_norm": 2.1235430240631104, + "learning_rate": 9.23863389166848e-06, + "loss": 0.0554, + "step": 70000 + }, + { + "epoch": 3.8340221883837287, + "grad_norm": 0.39138633012771606, + "learning_rate": 9.233195562323255e-06, + "loss": 0.0477, + "step": 70500 + }, + { + "epoch": 3.8612138351098544, + "grad_norm": 0.49036768078804016, + "learning_rate": 9.227757232978029e-06, + "loss": 0.0543, + "step": 71000 + }, + { + "epoch": 3.88840548183598, + "grad_norm": 2.911491632461548, + "learning_rate": 9.222318903632804e-06, + "loss": 0.0513, + "step": 71500 + }, + { + "epoch": 3.915597128562106, + "grad_norm": 5.786170959472656, + "learning_rate": 9.21688057428758e-06, + "loss": 0.0531, + "step": 72000 + }, + { + "epoch": 3.9427887752882316, + "grad_norm": 0.6657633185386658, + "learning_rate": 9.211442244942355e-06, + "loss": 0.0545, + "step": 72500 + }, + { + "epoch": 3.9699804220143573, + "grad_norm": 2.4814870357513428, + "learning_rate": 9.20600391559713e-06, + "loss": 0.0523, + "step": 73000 + }, + { + "epoch": 3.997172068740483, + "grad_norm": 0.967647135257721, + "learning_rate": 9.200565586251903e-06, + "loss": 0.0497, + "step": 73500 + }, + { + "epoch": 4.0, + "eval_exact_match": 0.8795, + "eval_loss": 0.175943061709404, + "eval_runtime": 1022.0597, + "eval_samples_per_second": 11.131, + "eval_steps_per_second": 0.697, + "step": 73552 + }, + { + "epoch": 4.024363715466609, + "grad_norm": 4.134894847869873, + "learning_rate": 9.195127256906679e-06, + "loss": 0.0281, + "step": 74000 + }, + { + "epoch": 4.051555362192734, + "grad_norm": 1.442826509475708, + "learning_rate": 9.189688927561454e-06, + "loss": 0.0287, + "step": 74500 + }, + { + "epoch": 4.07874700891886, + "grad_norm": 0.3482317626476288, + "learning_rate": 9.184250598216229e-06, + "loss": 0.0309, + "step": 75000 + }, + { + "epoch": 4.1059386556449855, + "grad_norm": 0.5604238510131836, + "learning_rate": 9.178812268871004e-06, + "loss": 0.0281, + "step": 75500 + }, + { + "epoch": 4.133130302371112, + "grad_norm": 2.2239296436309814, + "learning_rate": 9.173373939525778e-06, + "loss": 0.0259, + "step": 76000 + }, + { + "epoch": 4.160321949097237, + "grad_norm": 0.34009259939193726, + "learning_rate": 9.167935610180553e-06, + "loss": 0.0321, + "step": 76500 + }, + { + "epoch": 4.187513595823363, + "grad_norm": 3.0141053199768066, + "learning_rate": 9.162497280835328e-06, + "loss": 0.0317, + "step": 77000 + }, + { + "epoch": 4.214705242549488, + "grad_norm": 0.01643652655184269, + "learning_rate": 9.157058951490103e-06, + "loss": 0.029, + "step": 77500 + }, + { + "epoch": 4.241896889275615, + "grad_norm": 0.03787761181592941, + "learning_rate": 9.151620622144878e-06, + "loss": 0.0321, + "step": 78000 + }, + { + "epoch": 4.26908853600174, + "grad_norm": 0.7121431231498718, + "learning_rate": 9.146182292799652e-06, + "loss": 0.0312, + "step": 78500 + }, + { + "epoch": 4.296280182727866, + "grad_norm": 0.01658172532916069, + "learning_rate": 9.140743963454427e-06, + "loss": 0.0327, + "step": 79000 + }, + { + "epoch": 4.323471829453991, + "grad_norm": 0.21374182403087616, + "learning_rate": 9.135305634109202e-06, + "loss": 0.0318, + "step": 79500 + }, + { + "epoch": 4.3506634761801175, + "grad_norm": 3.6706273555755615, + "learning_rate": 9.129867304763978e-06, + "loss": 0.0311, + "step": 80000 + }, + { + "epoch": 4.377855122906243, + "grad_norm": 2.5517737865448, + "learning_rate": 9.124428975418753e-06, + "loss": 0.0299, + "step": 80500 + }, + { + "epoch": 4.405046769632369, + "grad_norm": 1.2063195705413818, + "learning_rate": 9.118990646073526e-06, + "loss": 0.0323, + "step": 81000 + }, + { + "epoch": 4.432238416358494, + "grad_norm": 0.2457069456577301, + "learning_rate": 9.113552316728301e-06, + "loss": 0.0309, + "step": 81500 + }, + { + "epoch": 4.45943006308462, + "grad_norm": 0.020821336656808853, + "learning_rate": 9.108113987383077e-06, + "loss": 0.0351, + "step": 82000 + }, + { + "epoch": 4.486621709810747, + "grad_norm": 2.251249313354492, + "learning_rate": 9.102675658037852e-06, + "loss": 0.0345, + "step": 82500 + }, + { + "epoch": 4.513813356536872, + "grad_norm": 0.6691648364067078, + "learning_rate": 9.097237328692627e-06, + "loss": 0.0311, + "step": 83000 + }, + { + "epoch": 4.541005003262997, + "grad_norm": 0.05074188485741615, + "learning_rate": 9.0917989993474e-06, + "loss": 0.03, + "step": 83500 + }, + { + "epoch": 4.568196649989123, + "grad_norm": 0.03867918998003006, + "learning_rate": 9.086360670002176e-06, + "loss": 0.0317, + "step": 84000 + }, + { + "epoch": 4.5953882967152495, + "grad_norm": 2.578198194503784, + "learning_rate": 9.080922340656951e-06, + "loss": 0.0332, + "step": 84500 + }, + { + "epoch": 4.622579943441375, + "grad_norm": 0.6527734994888306, + "learning_rate": 9.075484011311726e-06, + "loss": 0.0306, + "step": 85000 + }, + { + "epoch": 4.6497715901675, + "grad_norm": 3.3111846446990967, + "learning_rate": 9.070045681966501e-06, + "loss": 0.0352, + "step": 85500 + }, + { + "epoch": 4.676963236893626, + "grad_norm": 4.766884803771973, + "learning_rate": 9.064607352621275e-06, + "loss": 0.0331, + "step": 86000 + }, + { + "epoch": 4.704154883619752, + "grad_norm": 3.993748903274536, + "learning_rate": 9.05916902327605e-06, + "loss": 0.0343, + "step": 86500 + }, + { + "epoch": 4.731346530345878, + "grad_norm": 6.515500068664551, + "learning_rate": 9.053730693930825e-06, + "loss": 0.0322, + "step": 87000 + }, + { + "epoch": 4.758538177072003, + "grad_norm": 1.858112096786499, + "learning_rate": 9.0482923645856e-06, + "loss": 0.034, + "step": 87500 + }, + { + "epoch": 4.785729823798129, + "grad_norm": 8.057866096496582, + "learning_rate": 9.042854035240374e-06, + "loss": 0.0326, + "step": 88000 + }, + { + "epoch": 4.812921470524255, + "grad_norm": 2.783409833908081, + "learning_rate": 9.03741570589515e-06, + "loss": 0.0349, + "step": 88500 + }, + { + "epoch": 4.840113117250381, + "grad_norm": 0.008890635333955288, + "learning_rate": 9.031977376549924e-06, + "loss": 0.0324, + "step": 89000 + }, + { + "epoch": 4.867304763976507, + "grad_norm": 2.3815135955810547, + "learning_rate": 9.0265390472047e-06, + "loss": 0.0332, + "step": 89500 + }, + { + "epoch": 4.894496410702632, + "grad_norm": 1.191375732421875, + "learning_rate": 9.021100717859475e-06, + "loss": 0.0336, + "step": 90000 + }, + { + "epoch": 4.921688057428758, + "grad_norm": 2.7159788608551025, + "learning_rate": 9.015662388514248e-06, + "loss": 0.0359, + "step": 90500 + }, + { + "epoch": 4.9488797041548835, + "grad_norm": 4.028094291687012, + "learning_rate": 9.010224059169024e-06, + "loss": 0.032, + "step": 91000 + }, + { + "epoch": 4.97607135088101, + "grad_norm": 1.4064428806304932, + "learning_rate": 9.004785729823799e-06, + "loss": 0.0338, + "step": 91500 + }, + { + "epoch": 5.0, + "eval_exact_match": 0.8753, + "eval_loss": 0.20461878180503845, + "eval_runtime": 1007.4294, + "eval_samples_per_second": 11.293, + "eval_steps_per_second": 0.707, + "step": 91940 + }, + { + "epoch": 5.003262997607135, + "grad_norm": 2.7804439067840576, + "learning_rate": 8.999347400478574e-06, + "loss": 0.0301, + "step": 92000 + }, + { + "epoch": 5.030454644333261, + "grad_norm": 0.11794668436050415, + "learning_rate": 8.99390907113335e-06, + "loss": 0.0179, + "step": 92500 + }, + { + "epoch": 5.0576462910593865, + "grad_norm": 0.5474434494972229, + "learning_rate": 8.988470741788123e-06, + "loss": 0.0198, + "step": 93000 + }, + { + "epoch": 5.084837937785513, + "grad_norm": 3.92921781539917, + "learning_rate": 8.983032412442898e-06, + "loss": 0.0193, + "step": 93500 + }, + { + "epoch": 5.112029584511638, + "grad_norm": 2.111978054046631, + "learning_rate": 8.977594083097673e-06, + "loss": 0.0187, + "step": 94000 + }, + { + "epoch": 5.139221231237764, + "grad_norm": 0.17302042245864868, + "learning_rate": 8.972155753752448e-06, + "loss": 0.016, + "step": 94500 + }, + { + "epoch": 5.166412877963889, + "grad_norm": 1.5338056087493896, + "learning_rate": 8.966717424407224e-06, + "loss": 0.0215, + "step": 95000 + }, + { + "epoch": 5.1936045246900155, + "grad_norm": 0.14234082400798798, + "learning_rate": 8.961279095061997e-06, + "loss": 0.0214, + "step": 95500 + }, + { + "epoch": 5.220796171416141, + "grad_norm": 0.10225632041692734, + "learning_rate": 8.955840765716772e-06, + "loss": 0.0194, + "step": 96000 + }, + { + "epoch": 5.247987818142267, + "grad_norm": 3.5910024642944336, + "learning_rate": 8.950402436371547e-06, + "loss": 0.0204, + "step": 96500 + }, + { + "epoch": 5.275179464868392, + "grad_norm": 4.878687381744385, + "learning_rate": 8.944964107026323e-06, + "loss": 0.0234, + "step": 97000 + }, + { + "epoch": 5.3023711115945185, + "grad_norm": 0.8146882057189941, + "learning_rate": 8.939525777681098e-06, + "loss": 0.0207, + "step": 97500 + }, + { + "epoch": 5.329562758320644, + "grad_norm": 0.23543909192085266, + "learning_rate": 8.934087448335871e-06, + "loss": 0.0242, + "step": 98000 + }, + { + "epoch": 5.35675440504677, + "grad_norm": 0.07411856204271317, + "learning_rate": 8.928649118990647e-06, + "loss": 0.0233, + "step": 98500 + }, + { + "epoch": 5.383946051772895, + "grad_norm": 0.7384315729141235, + "learning_rate": 8.923210789645422e-06, + "loss": 0.0205, + "step": 99000 + }, + { + "epoch": 5.411137698499021, + "grad_norm": 0.6251228451728821, + "learning_rate": 8.917772460300197e-06, + "loss": 0.023, + "step": 99500 + }, + { + "epoch": 5.438329345225147, + "grad_norm": 0.5921465754508972, + "learning_rate": 8.912334130954972e-06, + "loss": 0.0218, + "step": 100000 + }, + { + "epoch": 5.465520991951273, + "grad_norm": 0.01474306546151638, + "learning_rate": 8.906895801609746e-06, + "loss": 0.0219, + "step": 100500 + }, + { + "epoch": 5.492712638677398, + "grad_norm": 3.527553081512451, + "learning_rate": 8.901457472264521e-06, + "loss": 0.0213, + "step": 101000 + }, + { + "epoch": 5.519904285403524, + "grad_norm": 3.0029749870300293, + "learning_rate": 8.896019142919296e-06, + "loss": 0.0205, + "step": 101500 + }, + { + "epoch": 5.54709593212965, + "grad_norm": 0.3347836434841156, + "learning_rate": 8.890580813574071e-06, + "loss": 0.0213, + "step": 102000 + }, + { + "epoch": 5.574287578855776, + "grad_norm": 2.0022785663604736, + "learning_rate": 8.885142484228847e-06, + "loss": 0.0236, + "step": 102500 + }, + { + "epoch": 5.601479225581901, + "grad_norm": 0.6466526389122009, + "learning_rate": 8.87970415488362e-06, + "loss": 0.0229, + "step": 103000 + }, + { + "epoch": 5.628670872308027, + "grad_norm": 0.013481836766004562, + "learning_rate": 8.874265825538395e-06, + "loss": 0.022, + "step": 103500 + }, + { + "epoch": 5.6558625190341525, + "grad_norm": 0.4195241630077362, + "learning_rate": 8.86882749619317e-06, + "loss": 0.0241, + "step": 104000 + }, + { + "epoch": 5.683054165760279, + "grad_norm": 1.216953158378601, + "learning_rate": 8.863389166847946e-06, + "loss": 0.0224, + "step": 104500 + }, + { + "epoch": 5.710245812486404, + "grad_norm": 0.682259738445282, + "learning_rate": 8.85795083750272e-06, + "loss": 0.0191, + "step": 105000 + }, + { + "epoch": 5.73743745921253, + "grad_norm": 0.42561373114585876, + "learning_rate": 8.852512508157494e-06, + "loss": 0.0237, + "step": 105500 + }, + { + "epoch": 5.764629105938655, + "grad_norm": 6.53951358795166, + "learning_rate": 8.84707417881227e-06, + "loss": 0.021, + "step": 106000 + }, + { + "epoch": 5.791820752664782, + "grad_norm": 2.535867214202881, + "learning_rate": 8.841635849467045e-06, + "loss": 0.0236, + "step": 106500 + }, + { + "epoch": 5.819012399390907, + "grad_norm": 0.7644603848457336, + "learning_rate": 8.83619752012182e-06, + "loss": 0.0219, + "step": 107000 + }, + { + "epoch": 5.846204046117033, + "grad_norm": 7.7599616050720215, + "learning_rate": 8.830759190776595e-06, + "loss": 0.0206, + "step": 107500 + }, + { + "epoch": 5.873395692843158, + "grad_norm": 1.106614589691162, + "learning_rate": 8.825320861431369e-06, + "loss": 0.0214, + "step": 108000 + }, + { + "epoch": 5.9005873395692845, + "grad_norm": 0.4505751430988312, + "learning_rate": 8.819882532086144e-06, + "loss": 0.0229, + "step": 108500 + }, + { + "epoch": 5.92777898629541, + "grad_norm": 0.23297396302223206, + "learning_rate": 8.814444202740919e-06, + "loss": 0.0234, + "step": 109000 + }, + { + "epoch": 5.954970633021536, + "grad_norm": 1.527669072151184, + "learning_rate": 8.809005873395694e-06, + "loss": 0.0214, + "step": 109500 + }, + { + "epoch": 5.982162279747661, + "grad_norm": 0.012977199628949165, + "learning_rate": 8.80356754405047e-06, + "loss": 0.0228, + "step": 110000 + }, + { + "epoch": 6.0, + "eval_exact_match": 0.8794, + "eval_loss": 0.2122552990913391, + "eval_runtime": 1020.184, + "eval_samples_per_second": 11.152, + "eval_steps_per_second": 0.698, + "step": 110328 + }, + { + "epoch": 6.009353926473787, + "grad_norm": 0.0063011981546878815, + "learning_rate": 8.798129214705243e-06, + "loss": 0.0183, + "step": 110500 + }, + { + "epoch": 6.036545573199913, + "grad_norm": 0.042253538966178894, + "learning_rate": 8.792690885360018e-06, + "loss": 0.0136, + "step": 111000 + }, + { + "epoch": 6.063737219926039, + "grad_norm": 3.0220460891723633, + "learning_rate": 8.787252556014792e-06, + "loss": 0.0142, + "step": 111500 + }, + { + "epoch": 6.090928866652164, + "grad_norm": 1.7237880229949951, + "learning_rate": 8.781814226669569e-06, + "loss": 0.0121, + "step": 112000 + }, + { + "epoch": 6.11812051337829, + "grad_norm": 2.3117618560791016, + "learning_rate": 8.776375897324342e-06, + "loss": 0.0151, + "step": 112500 + }, + { + "epoch": 6.145312160104416, + "grad_norm": 0.2513481080532074, + "learning_rate": 8.770937567979117e-06, + "loss": 0.0122, + "step": 113000 + }, + { + "epoch": 6.172503806830542, + "grad_norm": 0.22196491062641144, + "learning_rate": 8.765499238633893e-06, + "loss": 0.0136, + "step": 113500 + }, + { + "epoch": 6.199695453556667, + "grad_norm": 0.3039638102054596, + "learning_rate": 8.760060909288666e-06, + "loss": 0.0154, + "step": 114000 + }, + { + "epoch": 6.226887100282793, + "grad_norm": 1.2839832305908203, + "learning_rate": 8.754622579943443e-06, + "loss": 0.015, + "step": 114500 + }, + { + "epoch": 6.2540787470089185, + "grad_norm": 1.728105902671814, + "learning_rate": 8.749184250598216e-06, + "loss": 0.0154, + "step": 115000 + }, + { + "epoch": 6.281270393735045, + "grad_norm": 1.7439731359481812, + "learning_rate": 8.743745921252992e-06, + "loss": 0.013, + "step": 115500 + }, + { + "epoch": 6.30846204046117, + "grad_norm": 0.7588323354721069, + "learning_rate": 8.738307591907767e-06, + "loss": 0.013, + "step": 116000 + }, + { + "epoch": 6.335653687187296, + "grad_norm": 0.3581075668334961, + "learning_rate": 8.73286926256254e-06, + "loss": 0.0147, + "step": 116500 + }, + { + "epoch": 6.362845333913421, + "grad_norm": 0.5312409996986389, + "learning_rate": 8.727430933217317e-06, + "loss": 0.0159, + "step": 117000 + }, + { + "epoch": 6.390036980639548, + "grad_norm": 1.5447969436645508, + "learning_rate": 8.72199260387209e-06, + "loss": 0.0163, + "step": 117500 + }, + { + "epoch": 6.417228627365673, + "grad_norm": 0.41506335139274597, + "learning_rate": 8.716554274526866e-06, + "loss": 0.016, + "step": 118000 + }, + { + "epoch": 6.444420274091799, + "grad_norm": 0.17262350022792816, + "learning_rate": 8.711115945181641e-06, + "loss": 0.0143, + "step": 118500 + }, + { + "epoch": 6.471611920817924, + "grad_norm": 1.5750232934951782, + "learning_rate": 8.705677615836415e-06, + "loss": 0.0143, + "step": 119000 + }, + { + "epoch": 6.4988035675440505, + "grad_norm": 2.806853771209717, + "learning_rate": 8.700239286491192e-06, + "loss": 0.0164, + "step": 119500 + }, + { + "epoch": 6.525995214270176, + "grad_norm": 0.2280786782503128, + "learning_rate": 8.694800957145965e-06, + "loss": 0.0138, + "step": 120000 + }, + { + "epoch": 6.553186860996302, + "grad_norm": 0.6690570712089539, + "learning_rate": 8.68936262780074e-06, + "loss": 0.0157, + "step": 120500 + }, + { + "epoch": 6.580378507722427, + "grad_norm": 0.04652916640043259, + "learning_rate": 8.683924298455516e-06, + "loss": 0.0172, + "step": 121000 + }, + { + "epoch": 6.6075701544485534, + "grad_norm": 1.504131555557251, + "learning_rate": 8.678485969110289e-06, + "loss": 0.0175, + "step": 121500 + }, + { + "epoch": 6.63476180117468, + "grad_norm": 1.5482994318008423, + "learning_rate": 8.673047639765066e-06, + "loss": 0.0177, + "step": 122000 + }, + { + "epoch": 6.661953447900805, + "grad_norm": 1.784773349761963, + "learning_rate": 8.66760931041984e-06, + "loss": 0.0153, + "step": 122500 + }, + { + "epoch": 6.68914509462693, + "grad_norm": 0.1438634991645813, + "learning_rate": 8.662170981074615e-06, + "loss": 0.016, + "step": 123000 + }, + { + "epoch": 6.716336741353056, + "grad_norm": 1.1094955205917358, + "learning_rate": 8.65673265172939e-06, + "loss": 0.0158, + "step": 123500 + }, + { + "epoch": 6.7435283880791825, + "grad_norm": 3.1225857734680176, + "learning_rate": 8.651294322384163e-06, + "loss": 0.0145, + "step": 124000 + }, + { + "epoch": 6.770720034805308, + "grad_norm": 0.4727814793586731, + "learning_rate": 8.64585599303894e-06, + "loss": 0.0178, + "step": 124500 + }, + { + "epoch": 6.797911681531433, + "grad_norm": 0.9324865937232971, + "learning_rate": 8.640417663693714e-06, + "loss": 0.0165, + "step": 125000 + }, + { + "epoch": 6.825103328257559, + "grad_norm": 1.3977622985839844, + "learning_rate": 8.634979334348489e-06, + "loss": 0.0148, + "step": 125500 + }, + { + "epoch": 6.8522949749836854, + "grad_norm": 0.694773256778717, + "learning_rate": 8.629541005003264e-06, + "loss": 0.0173, + "step": 126000 + }, + { + "epoch": 6.879486621709811, + "grad_norm": 4.397082328796387, + "learning_rate": 8.624102675658038e-06, + "loss": 0.0164, + "step": 126500 + }, + { + "epoch": 6.906678268435936, + "grad_norm": 0.09322671592235565, + "learning_rate": 8.618664346312815e-06, + "loss": 0.0162, + "step": 127000 + }, + { + "epoch": 6.933869915162062, + "grad_norm": 3.736959218978882, + "learning_rate": 8.613226016967588e-06, + "loss": 0.0161, + "step": 127500 + }, + { + "epoch": 6.961061561888188, + "grad_norm": 0.21108615398406982, + "learning_rate": 8.607787687622363e-06, + "loss": 0.0154, + "step": 128000 + }, + { + "epoch": 6.988253208614314, + "grad_norm": 0.029453817754983902, + "learning_rate": 8.602349358277138e-06, + "loss": 0.017, + "step": 128500 + }, + { + "epoch": 7.0, + "eval_exact_match": 0.8825, + "eval_loss": 0.24493291974067688, + "eval_runtime": 1023.3771, + "eval_samples_per_second": 11.117, + "eval_steps_per_second": 0.696, + "step": 128716 + }, + { + "epoch": 7.01544485534044, + "grad_norm": 6.605143070220947, + "learning_rate": 8.596911028931912e-06, + "loss": 0.0124, + "step": 129000 + }, + { + "epoch": 7.042636502066565, + "grad_norm": 0.22909535467624664, + "learning_rate": 8.591472699586689e-06, + "loss": 0.009, + "step": 129500 + }, + { + "epoch": 7.069828148792691, + "grad_norm": 0.08523764461278915, + "learning_rate": 8.586034370241462e-06, + "loss": 0.0085, + "step": 130000 + }, + { + "epoch": 7.097019795518817, + "grad_norm": 6.189642906188965, + "learning_rate": 8.580596040896238e-06, + "loss": 0.0093, + "step": 130500 + }, + { + "epoch": 7.124211442244943, + "grad_norm": 0.7962560057640076, + "learning_rate": 8.575157711551013e-06, + "loss": 0.0116, + "step": 131000 + }, + { + "epoch": 7.151403088971068, + "grad_norm": 0.14231279492378235, + "learning_rate": 8.569719382205786e-06, + "loss": 0.0122, + "step": 131500 + }, + { + "epoch": 7.178594735697194, + "grad_norm": 2.1064910888671875, + "learning_rate": 8.564281052860563e-06, + "loss": 0.013, + "step": 132000 + }, + { + "epoch": 7.2057863824233195, + "grad_norm": 0.02576456405222416, + "learning_rate": 8.558842723515337e-06, + "loss": 0.0098, + "step": 132500 + }, + { + "epoch": 7.232978029149446, + "grad_norm": 0.05112173408269882, + "learning_rate": 8.553404394170112e-06, + "loss": 0.0101, + "step": 133000 + }, + { + "epoch": 7.260169675875571, + "grad_norm": 0.7245155572891235, + "learning_rate": 8.547966064824887e-06, + "loss": 0.012, + "step": 133500 + }, + { + "epoch": 7.287361322601697, + "grad_norm": 0.014532721601426601, + "learning_rate": 8.54252773547966e-06, + "loss": 0.0088, + "step": 134000 + }, + { + "epoch": 7.314552969327822, + "grad_norm": 0.1423533707857132, + "learning_rate": 8.537089406134438e-06, + "loss": 0.0124, + "step": 134500 + }, + { + "epoch": 7.341744616053949, + "grad_norm": 0.33425888419151306, + "learning_rate": 8.531651076789211e-06, + "loss": 0.0103, + "step": 135000 + }, + { + "epoch": 7.368936262780074, + "grad_norm": 0.27307161688804626, + "learning_rate": 8.526212747443986e-06, + "loss": 0.0103, + "step": 135500 + }, + { + "epoch": 7.3961279095062, + "grad_norm": 0.559861958026886, + "learning_rate": 8.52077441809876e-06, + "loss": 0.0124, + "step": 136000 + }, + { + "epoch": 7.423319556232325, + "grad_norm": 0.1392046958208084, + "learning_rate": 8.515336088753535e-06, + "loss": 0.0105, + "step": 136500 + }, + { + "epoch": 7.4505112029584515, + "grad_norm": 0.013254035264253616, + "learning_rate": 8.50989775940831e-06, + "loss": 0.011, + "step": 137000 + }, + { + "epoch": 7.477702849684577, + "grad_norm": 5.330664157867432, + "learning_rate": 8.504459430063085e-06, + "loss": 0.0108, + "step": 137500 + }, + { + "epoch": 7.504894496410703, + "grad_norm": 3.8794960975646973, + "learning_rate": 8.49902110071786e-06, + "loss": 0.0117, + "step": 138000 + }, + { + "epoch": 7.532086143136828, + "grad_norm": 0.7144114375114441, + "learning_rate": 8.493582771372634e-06, + "loss": 0.0126, + "step": 138500 + }, + { + "epoch": 7.559277789862954, + "grad_norm": 0.022297067567706108, + "learning_rate": 8.48814444202741e-06, + "loss": 0.0125, + "step": 139000 + }, + { + "epoch": 7.58646943658908, + "grad_norm": 0.6900652050971985, + "learning_rate": 8.482706112682185e-06, + "loss": 0.0113, + "step": 139500 + }, + { + "epoch": 7.613661083315206, + "grad_norm": 0.21370786428451538, + "learning_rate": 8.47726778333696e-06, + "loss": 0.0111, + "step": 140000 + }, + { + "epoch": 7.640852730041331, + "grad_norm": 1.9464149475097656, + "learning_rate": 8.471829453991735e-06, + "loss": 0.0118, + "step": 140500 + }, + { + "epoch": 7.668044376767457, + "grad_norm": 0.18144677579402924, + "learning_rate": 8.466391124646508e-06, + "loss": 0.0145, + "step": 141000 + }, + { + "epoch": 7.695236023493583, + "grad_norm": 0.3602216839790344, + "learning_rate": 8.460952795301284e-06, + "loss": 0.0122, + "step": 141500 + }, + { + "epoch": 7.722427670219709, + "grad_norm": 0.3419041335582733, + "learning_rate": 8.455514465956059e-06, + "loss": 0.0133, + "step": 142000 + }, + { + "epoch": 7.749619316945834, + "grad_norm": 0.10839635133743286, + "learning_rate": 8.450076136610834e-06, + "loss": 0.0105, + "step": 142500 + }, + { + "epoch": 7.77681096367196, + "grad_norm": 3.2976644039154053, + "learning_rate": 8.44463780726561e-06, + "loss": 0.0106, + "step": 143000 + }, + { + "epoch": 7.8040026103980855, + "grad_norm": 1.0369518995285034, + "learning_rate": 8.439199477920383e-06, + "loss": 0.0137, + "step": 143500 + }, + { + "epoch": 7.831194257124212, + "grad_norm": 1.0955511331558228, + "learning_rate": 8.433761148575158e-06, + "loss": 0.0111, + "step": 144000 + }, + { + "epoch": 7.858385903850337, + "grad_norm": 1.4048563241958618, + "learning_rate": 8.428322819229933e-06, + "loss": 0.0108, + "step": 144500 + }, + { + "epoch": 7.885577550576463, + "grad_norm": 1.3276982307434082, + "learning_rate": 8.422884489884708e-06, + "loss": 0.0135, + "step": 145000 + }, + { + "epoch": 7.912769197302588, + "grad_norm": 0.062070589512586594, + "learning_rate": 8.417446160539484e-06, + "loss": 0.0128, + "step": 145500 + }, + { + "epoch": 7.939960844028715, + "grad_norm": 0.06153295561671257, + "learning_rate": 8.412007831194257e-06, + "loss": 0.0122, + "step": 146000 + }, + { + "epoch": 7.96715249075484, + "grad_norm": 0.3033904731273651, + "learning_rate": 8.406569501849032e-06, + "loss": 0.0121, + "step": 146500 + }, + { + "epoch": 7.994344137480966, + "grad_norm": 0.010566359385848045, + "learning_rate": 8.401131172503807e-06, + "loss": 0.0109, + "step": 147000 + }, + { + "epoch": 8.0, + "eval_exact_match": 0.8825, + "eval_loss": 0.27669957280158997, + "eval_runtime": 1019.6911, + "eval_samples_per_second": 11.157, + "eval_steps_per_second": 0.698, + "step": 147104 + }, + { + "epoch": 8.021535784207092, + "grad_norm": 0.4247625172138214, + "learning_rate": 8.395692843158583e-06, + "loss": 0.0086, + "step": 147500 + }, + { + "epoch": 8.048727430933218, + "grad_norm": 0.1236380860209465, + "learning_rate": 8.390254513813358e-06, + "loss": 0.0071, + "step": 148000 + }, + { + "epoch": 8.075919077659343, + "grad_norm": 8.52878475189209, + "learning_rate": 8.384816184468131e-06, + "loss": 0.0081, + "step": 148500 + }, + { + "epoch": 8.103110724385468, + "grad_norm": 1.577666997909546, + "learning_rate": 8.379377855122907e-06, + "loss": 0.008, + "step": 149000 + }, + { + "epoch": 8.130302371111595, + "grad_norm": 0.29314127564430237, + "learning_rate": 8.373939525777682e-06, + "loss": 0.0066, + "step": 149500 + }, + { + "epoch": 8.15749401783772, + "grad_norm": 0.7720054984092712, + "learning_rate": 8.368501196432457e-06, + "loss": 0.0064, + "step": 150000 + }, + { + "epoch": 8.184685664563846, + "grad_norm": 0.21347130835056305, + "learning_rate": 8.363062867087232e-06, + "loss": 0.0067, + "step": 150500 + }, + { + "epoch": 8.211877311289971, + "grad_norm": 0.004629666917026043, + "learning_rate": 8.357624537742006e-06, + "loss": 0.0073, + "step": 151000 + }, + { + "epoch": 8.239068958016098, + "grad_norm": 0.07173150777816772, + "learning_rate": 8.352186208396781e-06, + "loss": 0.0088, + "step": 151500 + }, + { + "epoch": 8.266260604742223, + "grad_norm": 0.0343378446996212, + "learning_rate": 8.346747879051556e-06, + "loss": 0.0085, + "step": 152000 + }, + { + "epoch": 8.293452251468349, + "grad_norm": 0.2848726809024811, + "learning_rate": 8.341309549706331e-06, + "loss": 0.0076, + "step": 152500 + }, + { + "epoch": 8.320643898194474, + "grad_norm": 0.08385962247848511, + "learning_rate": 8.335871220361107e-06, + "loss": 0.0089, + "step": 153000 + }, + { + "epoch": 8.347835544920601, + "grad_norm": 0.2588665783405304, + "learning_rate": 8.33043289101588e-06, + "loss": 0.0086, + "step": 153500 + }, + { + "epoch": 8.375027191646726, + "grad_norm": 0.5390540361404419, + "learning_rate": 8.324994561670655e-06, + "loss": 0.0082, + "step": 154000 + }, + { + "epoch": 8.402218838372852, + "grad_norm": 0.09838502109050751, + "learning_rate": 8.31955623232543e-06, + "loss": 0.0095, + "step": 154500 + }, + { + "epoch": 8.429410485098977, + "grad_norm": 5.106756687164307, + "learning_rate": 8.314117902980206e-06, + "loss": 0.01, + "step": 155000 + }, + { + "epoch": 8.456602131825104, + "grad_norm": 0.08692660927772522, + "learning_rate": 8.308679573634981e-06, + "loss": 0.0097, + "step": 155500 + }, + { + "epoch": 8.48379377855123, + "grad_norm": 0.013324704952538013, + "learning_rate": 8.303241244289754e-06, + "loss": 0.0092, + "step": 156000 + }, + { + "epoch": 8.510985425277354, + "grad_norm": 0.008706462569534779, + "learning_rate": 8.29780291494453e-06, + "loss": 0.0097, + "step": 156500 + }, + { + "epoch": 8.53817707200348, + "grad_norm": 5.618961334228516, + "learning_rate": 8.292364585599305e-06, + "loss": 0.0103, + "step": 157000 + }, + { + "epoch": 8.565368718729607, + "grad_norm": 0.17830610275268555, + "learning_rate": 8.28692625625408e-06, + "loss": 0.0091, + "step": 157500 + }, + { + "epoch": 8.592560365455732, + "grad_norm": 0.15179601311683655, + "learning_rate": 8.281487926908855e-06, + "loss": 0.0097, + "step": 158000 + }, + { + "epoch": 8.619752012181857, + "grad_norm": 0.004950134549289942, + "learning_rate": 8.276049597563629e-06, + "loss": 0.0077, + "step": 158500 + }, + { + "epoch": 8.646943658907983, + "grad_norm": 1.6617698669433594, + "learning_rate": 8.270611268218404e-06, + "loss": 0.0112, + "step": 159000 + }, + { + "epoch": 8.67413530563411, + "grad_norm": 1.129233479499817, + "learning_rate": 8.265172938873179e-06, + "loss": 0.0084, + "step": 159500 + }, + { + "epoch": 8.701326952360235, + "grad_norm": 0.3634829521179199, + "learning_rate": 8.259734609527954e-06, + "loss": 0.0088, + "step": 160000 + }, + { + "epoch": 8.72851859908636, + "grad_norm": 0.0007674964144825935, + "learning_rate": 8.254296280182728e-06, + "loss": 0.0091, + "step": 160500 + }, + { + "epoch": 8.755710245812486, + "grad_norm": 0.840733528137207, + "learning_rate": 8.248857950837503e-06, + "loss": 0.0096, + "step": 161000 + }, + { + "epoch": 8.782901892538613, + "grad_norm": 0.002596140606328845, + "learning_rate": 8.243419621492278e-06, + "loss": 0.0089, + "step": 161500 + }, + { + "epoch": 8.810093539264738, + "grad_norm": 0.011948781087994576, + "learning_rate": 8.237981292147052e-06, + "loss": 0.0079, + "step": 162000 + }, + { + "epoch": 8.837285185990863, + "grad_norm": 0.4065161645412445, + "learning_rate": 8.232542962801829e-06, + "loss": 0.0083, + "step": 162500 + }, + { + "epoch": 8.864476832716988, + "grad_norm": 0.001233687624335289, + "learning_rate": 8.227104633456602e-06, + "loss": 0.0101, + "step": 163000 + }, + { + "epoch": 8.891668479443116, + "grad_norm": 1.7294543981552124, + "learning_rate": 8.221666304111377e-06, + "loss": 0.0097, + "step": 163500 + }, + { + "epoch": 8.91886012616924, + "grad_norm": 0.001020897296257317, + "learning_rate": 8.216227974766153e-06, + "loss": 0.0101, + "step": 164000 + }, + { + "epoch": 8.946051772895366, + "grad_norm": 0.01642615906894207, + "learning_rate": 8.210789645420926e-06, + "loss": 0.0103, + "step": 164500 + }, + { + "epoch": 8.973243419621493, + "grad_norm": 0.7197975516319275, + "learning_rate": 8.205351316075703e-06, + "loss": 0.0097, + "step": 165000 + }, + { + "epoch": 9.0, + "eval_exact_match": 0.8803, + "eval_loss": 0.29959383606910706, + "eval_runtime": 1017.9862, + "eval_samples_per_second": 11.176, + "eval_steps_per_second": 0.699, + "step": 165492 + }, + { + "epoch": 9.000435066347618, + "grad_norm": 0.19451579451560974, + "learning_rate": 8.199912986730476e-06, + "loss": 0.007, + "step": 165500 + }, + { + "epoch": 9.027626713073744, + "grad_norm": 0.10793378204107285, + "learning_rate": 8.194474657385252e-06, + "loss": 0.0054, + "step": 166000 + }, + { + "epoch": 9.054818359799869, + "grad_norm": 0.012452369555830956, + "learning_rate": 8.189036328040027e-06, + "loss": 0.004, + "step": 166500 + }, + { + "epoch": 9.082010006525994, + "grad_norm": 0.30145037174224854, + "learning_rate": 8.1835979986948e-06, + "loss": 0.0059, + "step": 167000 + }, + { + "epoch": 9.109201653252121, + "grad_norm": 0.0008534679072909057, + "learning_rate": 8.178159669349577e-06, + "loss": 0.0062, + "step": 167500 + }, + { + "epoch": 9.136393299978247, + "grad_norm": 0.03561757877469063, + "learning_rate": 8.17272134000435e-06, + "loss": 0.0049, + "step": 168000 + }, + { + "epoch": 9.163584946704372, + "grad_norm": 0.9257168173789978, + "learning_rate": 8.167283010659126e-06, + "loss": 0.0055, + "step": 168500 + }, + { + "epoch": 9.190776593430499, + "grad_norm": 0.4846535325050354, + "learning_rate": 8.161844681313901e-06, + "loss": 0.0068, + "step": 169000 + }, + { + "epoch": 9.217968240156624, + "grad_norm": 0.2089615762233734, + "learning_rate": 8.156406351968675e-06, + "loss": 0.0063, + "step": 169500 + }, + { + "epoch": 9.24515988688275, + "grad_norm": 0.14049288630485535, + "learning_rate": 8.150968022623452e-06, + "loss": 0.0072, + "step": 170000 + }, + { + "epoch": 9.272351533608875, + "grad_norm": 0.26103538274765015, + "learning_rate": 8.145529693278225e-06, + "loss": 0.006, + "step": 170500 + }, + { + "epoch": 9.299543180335002, + "grad_norm": 0.1939212679862976, + "learning_rate": 8.140091363933e-06, + "loss": 0.0069, + "step": 171000 + }, + { + "epoch": 9.326734827061127, + "grad_norm": 0.004148316103965044, + "learning_rate": 8.134653034587776e-06, + "loss": 0.0063, + "step": 171500 + }, + { + "epoch": 9.353926473787253, + "grad_norm": 0.36702999472618103, + "learning_rate": 8.129214705242549e-06, + "loss": 0.0064, + "step": 172000 + }, + { + "epoch": 9.381118120513378, + "grad_norm": 0.0005734359147027135, + "learning_rate": 8.123776375897326e-06, + "loss": 0.007, + "step": 172500 + }, + { + "epoch": 9.408309767239505, + "grad_norm": 9.195197105407715, + "learning_rate": 8.1183380465521e-06, + "loss": 0.0087, + "step": 173000 + }, + { + "epoch": 9.43550141396563, + "grad_norm": 0.4702955186367035, + "learning_rate": 8.112899717206875e-06, + "loss": 0.0077, + "step": 173500 + }, + { + "epoch": 9.462693060691755, + "grad_norm": 0.8873838782310486, + "learning_rate": 8.10746138786165e-06, + "loss": 0.0073, + "step": 174000 + }, + { + "epoch": 9.48988470741788, + "grad_norm": 0.028335994109511375, + "learning_rate": 8.102023058516423e-06, + "loss": 0.0068, + "step": 174500 + }, + { + "epoch": 9.517076354144008, + "grad_norm": 0.2030220478773117, + "learning_rate": 8.0965847291712e-06, + "loss": 0.008, + "step": 175000 + }, + { + "epoch": 9.544268000870133, + "grad_norm": 0.015208634547889233, + "learning_rate": 8.091146399825974e-06, + "loss": 0.0086, + "step": 175500 + }, + { + "epoch": 9.571459647596258, + "grad_norm": 0.004770020954310894, + "learning_rate": 8.085708070480749e-06, + "loss": 0.0075, + "step": 176000 + }, + { + "epoch": 9.598651294322384, + "grad_norm": 7.568017959594727, + "learning_rate": 8.080269741135524e-06, + "loss": 0.0066, + "step": 176500 + }, + { + "epoch": 9.62584294104851, + "grad_norm": 0.08423513919115067, + "learning_rate": 8.074831411790298e-06, + "loss": 0.0064, + "step": 177000 + }, + { + "epoch": 9.653034587774636, + "grad_norm": 0.979928195476532, + "learning_rate": 8.069393082445075e-06, + "loss": 0.0072, + "step": 177500 + }, + { + "epoch": 9.680226234500761, + "grad_norm": 0.05285876616835594, + "learning_rate": 8.063954753099848e-06, + "loss": 0.0075, + "step": 178000 + }, + { + "epoch": 9.707417881226887, + "grad_norm": 0.3576393723487854, + "learning_rate": 8.058516423754623e-06, + "loss": 0.0087, + "step": 178500 + }, + { + "epoch": 9.734609527953014, + "grad_norm": 0.027542833238840103, + "learning_rate": 8.053078094409399e-06, + "loss": 0.0081, + "step": 179000 + }, + { + "epoch": 9.761801174679139, + "grad_norm": 0.01537884958088398, + "learning_rate": 8.047639765064172e-06, + "loss": 0.0059, + "step": 179500 + }, + { + "epoch": 9.788992821405264, + "grad_norm": 0.8428720235824585, + "learning_rate": 8.042201435718949e-06, + "loss": 0.0083, + "step": 180000 + }, + { + "epoch": 9.81618446813139, + "grad_norm": 0.0019140657968819141, + "learning_rate": 8.036763106373722e-06, + "loss": 0.0072, + "step": 180500 + }, + { + "epoch": 9.843376114857517, + "grad_norm": 0.03006519190967083, + "learning_rate": 8.031324777028498e-06, + "loss": 0.0091, + "step": 181000 + }, + { + "epoch": 9.870567761583642, + "grad_norm": 1.1168920993804932, + "learning_rate": 8.025886447683273e-06, + "loss": 0.0069, + "step": 181500 + }, + { + "epoch": 9.897759408309767, + "grad_norm": 0.0870746374130249, + "learning_rate": 8.020448118338046e-06, + "loss": 0.0069, + "step": 182000 + }, + { + "epoch": 9.924951055035892, + "grad_norm": 0.2507496774196625, + "learning_rate": 8.015009788992823e-06, + "loss": 0.007, + "step": 182500 + }, + { + "epoch": 9.95214270176202, + "grad_norm": 4.196346759796143, + "learning_rate": 8.009571459647597e-06, + "loss": 0.0083, + "step": 183000 + }, + { + "epoch": 9.979334348488145, + "grad_norm": 0.009614282287657261, + "learning_rate": 8.004133130302372e-06, + "loss": 0.0084, + "step": 183500 + }, + { + "epoch": 10.0, + "eval_exact_match": 0.882, + "eval_loss": 0.3081795871257782, + "eval_runtime": 1024.0537, + "eval_samples_per_second": 11.11, + "eval_steps_per_second": 0.695, + "step": 183880 + }, + { + "epoch": 10.00652599521427, + "grad_norm": 0.01503839809447527, + "learning_rate": 7.998694800957147e-06, + "loss": 0.0067, + "step": 184000 + }, + { + "epoch": 10.033717641940395, + "grad_norm": 1.0102128982543945, + "learning_rate": 7.99325647161192e-06, + "loss": 0.0046, + "step": 184500 + }, + { + "epoch": 10.060909288666522, + "grad_norm": 1.3546432256698608, + "learning_rate": 7.987818142266696e-06, + "loss": 0.0055, + "step": 185000 + }, + { + "epoch": 10.088100935392648, + "grad_norm": 0.001982804387807846, + "learning_rate": 7.982379812921471e-06, + "loss": 0.0054, + "step": 185500 + }, + { + "epoch": 10.115292582118773, + "grad_norm": 0.011872241273522377, + "learning_rate": 7.976941483576246e-06, + "loss": 0.0059, + "step": 186000 + }, + { + "epoch": 10.142484228844898, + "grad_norm": 0.06217168644070625, + "learning_rate": 7.97150315423102e-06, + "loss": 0.0051, + "step": 186500 + }, + { + "epoch": 10.169675875571025, + "grad_norm": 0.5569573044776917, + "learning_rate": 7.966064824885795e-06, + "loss": 0.0052, + "step": 187000 + }, + { + "epoch": 10.19686752229715, + "grad_norm": 0.1213633194565773, + "learning_rate": 7.96062649554057e-06, + "loss": 0.0041, + "step": 187500 + }, + { + "epoch": 10.224059169023276, + "grad_norm": 0.020384617149829865, + "learning_rate": 7.955188166195345e-06, + "loss": 0.0059, + "step": 188000 + }, + { + "epoch": 10.251250815749401, + "grad_norm": 0.012423365376889706, + "learning_rate": 7.94974983685012e-06, + "loss": 0.0064, + "step": 188500 + }, + { + "epoch": 10.278442462475528, + "grad_norm": 0.0038297956343740225, + "learning_rate": 7.944311507504894e-06, + "loss": 0.0039, + "step": 189000 + }, + { + "epoch": 10.305634109201653, + "grad_norm": 0.6402150988578796, + "learning_rate": 7.93887317815967e-06, + "loss": 0.0066, + "step": 189500 + }, + { + "epoch": 10.332825755927779, + "grad_norm": 0.0031200500670820475, + "learning_rate": 7.933434848814445e-06, + "loss": 0.0061, + "step": 190000 + }, + { + "epoch": 10.360017402653904, + "grad_norm": 0.6107866168022156, + "learning_rate": 7.92799651946922e-06, + "loss": 0.0054, + "step": 190500 + }, + { + "epoch": 10.387209049380031, + "grad_norm": 2.908076047897339, + "learning_rate": 7.922558190123995e-06, + "loss": 0.0066, + "step": 191000 + }, + { + "epoch": 10.414400696106156, + "grad_norm": 0.5853959321975708, + "learning_rate": 7.917119860778768e-06, + "loss": 0.0065, + "step": 191500 + }, + { + "epoch": 10.441592342832282, + "grad_norm": 0.005342130083590746, + "learning_rate": 7.911681531433544e-06, + "loss": 0.0079, + "step": 192000 + }, + { + "epoch": 10.468783989558407, + "grad_norm": 10.593901634216309, + "learning_rate": 7.906243202088319e-06, + "loss": 0.0064, + "step": 192500 + }, + { + "epoch": 10.495975636284534, + "grad_norm": 0.007571155205368996, + "learning_rate": 7.900804872743094e-06, + "loss": 0.0061, + "step": 193000 + }, + { + "epoch": 10.52316728301066, + "grad_norm": 0.08925803750753403, + "learning_rate": 7.89536654339787e-06, + "loss": 0.0051, + "step": 193500 + }, + { + "epoch": 10.550358929736785, + "grad_norm": 0.13003523647785187, + "learning_rate": 7.889928214052643e-06, + "loss": 0.0053, + "step": 194000 + }, + { + "epoch": 10.57755057646291, + "grad_norm": 0.051926884800195694, + "learning_rate": 7.884489884707418e-06, + "loss": 0.0081, + "step": 194500 + }, + { + "epoch": 10.604742223189037, + "grad_norm": 0.0002671371621545404, + "learning_rate": 7.879051555362193e-06, + "loss": 0.006, + "step": 195000 + }, + { + "epoch": 10.631933869915162, + "grad_norm": 0.0013647901359945536, + "learning_rate": 7.873613226016968e-06, + "loss": 0.0055, + "step": 195500 + }, + { + "epoch": 10.659125516641287, + "grad_norm": 0.495665043592453, + "learning_rate": 7.868174896671744e-06, + "loss": 0.006, + "step": 196000 + }, + { + "epoch": 10.686317163367413, + "grad_norm": 3.4194962978363037, + "learning_rate": 7.862736567326517e-06, + "loss": 0.006, + "step": 196500 + }, + { + "epoch": 10.71350881009354, + "grad_norm": 0.5971339344978333, + "learning_rate": 7.857298237981292e-06, + "loss": 0.0065, + "step": 197000 + }, + { + "epoch": 10.740700456819665, + "grad_norm": 0.0033764122053980827, + "learning_rate": 7.851859908636068e-06, + "loss": 0.0069, + "step": 197500 + }, + { + "epoch": 10.76789210354579, + "grad_norm": 0.15896004438400269, + "learning_rate": 7.846421579290843e-06, + "loss": 0.0065, + "step": 198000 + }, + { + "epoch": 10.795083750271916, + "grad_norm": 0.012025897391140461, + "learning_rate": 7.840983249945618e-06, + "loss": 0.0054, + "step": 198500 + }, + { + "epoch": 10.822275396998043, + "grad_norm": 0.05475957691669464, + "learning_rate": 7.835544920600391e-06, + "loss": 0.0054, + "step": 199000 + }, + { + "epoch": 10.849467043724168, + "grad_norm": 0.003789114998653531, + "learning_rate": 7.830106591255167e-06, + "loss": 0.0068, + "step": 199500 + }, + { + "epoch": 10.876658690450293, + "grad_norm": 0.0021468698978424072, + "learning_rate": 7.824668261909942e-06, + "loss": 0.0064, + "step": 200000 + }, + { + "epoch": 10.903850337176419, + "grad_norm": 6.396396636962891, + "learning_rate": 7.819229932564717e-06, + "loss": 0.0074, + "step": 200500 + }, + { + "epoch": 10.931041983902546, + "grad_norm": 0.023530324921011925, + "learning_rate": 7.813791603219492e-06, + "loss": 0.0061, + "step": 201000 + }, + { + "epoch": 10.958233630628671, + "grad_norm": 0.019590675830841064, + "learning_rate": 7.808353273874266e-06, + "loss": 0.0054, + "step": 201500 + }, + { + "epoch": 10.985425277354796, + "grad_norm": 0.0019979814533144236, + "learning_rate": 7.802914944529041e-06, + "loss": 0.0062, + "step": 202000 + }, + { + "epoch": 11.0, + "eval_exact_match": 0.8818, + "eval_loss": 0.31789034605026245, + "eval_runtime": 1019.6612, + "eval_samples_per_second": 11.158, + "eval_steps_per_second": 0.698, + "step": 202268 + }, + { + "epoch": 11.012616924080922, + "grad_norm": 0.024939043447375298, + "learning_rate": 7.797476615183816e-06, + "loss": 0.0058, + "step": 202500 + }, + { + "epoch": 11.039808570807049, + "grad_norm": 0.00953602697700262, + "learning_rate": 7.792038285838591e-06, + "loss": 0.0063, + "step": 203000 + }, + { + "epoch": 11.067000217533174, + "grad_norm": 2.4622678756713867, + "learning_rate": 7.786599956493367e-06, + "loss": 0.003, + "step": 203500 + }, + { + "epoch": 11.0941918642593, + "grad_norm": 0.002521548420190811, + "learning_rate": 7.78116162714814e-06, + "loss": 0.0037, + "step": 204000 + }, + { + "epoch": 11.121383510985424, + "grad_norm": 0.017454292625188828, + "learning_rate": 7.775723297802915e-06, + "loss": 0.0038, + "step": 204500 + }, + { + "epoch": 11.148575157711551, + "grad_norm": 0.01883101277053356, + "learning_rate": 7.77028496845769e-06, + "loss": 0.0035, + "step": 205000 + }, + { + "epoch": 11.175766804437677, + "grad_norm": 0.004060177132487297, + "learning_rate": 7.764846639112466e-06, + "loss": 0.0049, + "step": 205500 + }, + { + "epoch": 11.202958451163802, + "grad_norm": 11.019586563110352, + "learning_rate": 7.759408309767241e-06, + "loss": 0.0059, + "step": 206000 + }, + { + "epoch": 11.230150097889927, + "grad_norm": 0.0053796349093317986, + "learning_rate": 7.753969980422014e-06, + "loss": 0.0052, + "step": 206500 + }, + { + "epoch": 11.257341744616054, + "grad_norm": 0.03368353471159935, + "learning_rate": 7.74853165107679e-06, + "loss": 0.0048, + "step": 207000 + }, + { + "epoch": 11.28453339134218, + "grad_norm": 1.5319031476974487, + "learning_rate": 7.743093321731565e-06, + "loss": 0.005, + "step": 207500 + }, + { + "epoch": 11.311725038068305, + "grad_norm": 0.18133412301540375, + "learning_rate": 7.73765499238634e-06, + "loss": 0.0043, + "step": 208000 + }, + { + "epoch": 11.33891668479443, + "grad_norm": 0.0025546839460730553, + "learning_rate": 7.732216663041115e-06, + "loss": 0.0041, + "step": 208500 + }, + { + "epoch": 11.366108331520557, + "grad_norm": 0.0636424720287323, + "learning_rate": 7.726778333695889e-06, + "loss": 0.0054, + "step": 209000 + }, + { + "epoch": 11.393299978246683, + "grad_norm": 0.0026644619647413492, + "learning_rate": 7.721340004350664e-06, + "loss": 0.0036, + "step": 209500 + }, + { + "epoch": 11.420491624972808, + "grad_norm": 0.014393744058907032, + "learning_rate": 7.71590167500544e-06, + "loss": 0.0045, + "step": 210000 + }, + { + "epoch": 11.447683271698935, + "grad_norm": 0.00033763342071324587, + "learning_rate": 7.710463345660214e-06, + "loss": 0.0052, + "step": 210500 + }, + { + "epoch": 11.47487491842506, + "grad_norm": 0.0011537778191268444, + "learning_rate": 7.705025016314988e-06, + "loss": 0.0032, + "step": 211000 + }, + { + "epoch": 11.502066565151186, + "grad_norm": 0.007493032608181238, + "learning_rate": 7.699586686969763e-06, + "loss": 0.0048, + "step": 211500 + }, + { + "epoch": 11.52925821187731, + "grad_norm": 0.05346640944480896, + "learning_rate": 7.694148357624538e-06, + "loss": 0.0055, + "step": 212000 + }, + { + "epoch": 11.556449858603436, + "grad_norm": 0.06508354097604752, + "learning_rate": 7.688710028279313e-06, + "loss": 0.0056, + "step": 212500 + }, + { + "epoch": 11.583641505329563, + "grad_norm": 0.00885526929050684, + "learning_rate": 7.683271698934089e-06, + "loss": 0.0055, + "step": 213000 + }, + { + "epoch": 11.610833152055688, + "grad_norm": 0.3413640558719635, + "learning_rate": 7.677833369588862e-06, + "loss": 0.0049, + "step": 213500 + }, + { + "epoch": 11.638024798781814, + "grad_norm": 0.4742060601711273, + "learning_rate": 7.672395040243637e-06, + "loss": 0.0042, + "step": 214000 + }, + { + "epoch": 11.66521644550794, + "grad_norm": 0.003855757648125291, + "learning_rate": 7.666956710898413e-06, + "loss": 0.0052, + "step": 214500 + }, + { + "epoch": 11.692408092234066, + "grad_norm": 0.0012399395927786827, + "learning_rate": 7.661518381553188e-06, + "loss": 0.005, + "step": 215000 + }, + { + "epoch": 11.719599738960191, + "grad_norm": 0.6118332743644714, + "learning_rate": 7.656080052207963e-06, + "loss": 0.0043, + "step": 215500 + }, + { + "epoch": 11.746791385686317, + "grad_norm": 1.3236043453216553, + "learning_rate": 7.650641722862737e-06, + "loss": 0.0041, + "step": 216000 + }, + { + "epoch": 11.773983032412444, + "grad_norm": 0.2502232491970062, + "learning_rate": 7.645203393517512e-06, + "loss": 0.0053, + "step": 216500 + }, + { + "epoch": 11.801174679138569, + "grad_norm": 0.3646252155303955, + "learning_rate": 7.639765064172287e-06, + "loss": 0.0054, + "step": 217000 + }, + { + "epoch": 11.828366325864694, + "grad_norm": 0.00609625643119216, + "learning_rate": 7.634326734827062e-06, + "loss": 0.0067, + "step": 217500 + }, + { + "epoch": 11.85555797259082, + "grad_norm": 0.000278811261523515, + "learning_rate": 7.6288884054818365e-06, + "loss": 0.0068, + "step": 218000 + }, + { + "epoch": 11.882749619316947, + "grad_norm": 0.03974534198641777, + "learning_rate": 7.623450076136612e-06, + "loss": 0.0059, + "step": 218500 + }, + { + "epoch": 11.909941266043072, + "grad_norm": 0.001021494623273611, + "learning_rate": 7.618011746791386e-06, + "loss": 0.0054, + "step": 219000 + }, + { + "epoch": 11.937132912769197, + "grad_norm": 3.7211320400238037, + "learning_rate": 7.61257341744616e-06, + "loss": 0.0039, + "step": 219500 + }, + { + "epoch": 11.964324559495322, + "grad_norm": 0.25480392575263977, + "learning_rate": 7.6071350881009365e-06, + "loss": 0.0063, + "step": 220000 + }, + { + "epoch": 11.99151620622145, + "grad_norm": 0.09184807538986206, + "learning_rate": 7.601696758755711e-06, + "loss": 0.0056, + "step": 220500 + }, + { + "epoch": 12.0, + "eval_exact_match": 0.8848, + "eval_loss": 0.32719093561172485, + "eval_runtime": 1021.1809, + "eval_samples_per_second": 11.141, + "eval_steps_per_second": 0.697, + "step": 220656 + }, + { + "epoch": 12.018707852947575, + "grad_norm": 0.0011609598295763135, + "learning_rate": 7.596258429410486e-06, + "loss": 0.0038, + "step": 221000 + }, + { + "epoch": 12.0458994996737, + "grad_norm": 0.006982658989727497, + "learning_rate": 7.59082010006526e-06, + "loss": 0.0022, + "step": 221500 + }, + { + "epoch": 12.073091146399825, + "grad_norm": 0.16167816519737244, + "learning_rate": 7.585381770720035e-06, + "loss": 0.0036, + "step": 222000 + }, + { + "epoch": 12.100282793125952, + "grad_norm": 0.00900455191731453, + "learning_rate": 7.579943441374811e-06, + "loss": 0.0032, + "step": 222500 + }, + { + "epoch": 12.127474439852078, + "grad_norm": 0.0007139613153412938, + "learning_rate": 7.574505112029585e-06, + "loss": 0.0042, + "step": 223000 + }, + { + "epoch": 12.154666086578203, + "grad_norm": 0.2929919362068176, + "learning_rate": 7.56906678268436e-06, + "loss": 0.004, + "step": 223500 + }, + { + "epoch": 12.181857733304328, + "grad_norm": 0.1538592129945755, + "learning_rate": 7.563628453339135e-06, + "loss": 0.0029, + "step": 224000 + }, + { + "epoch": 12.209049380030455, + "grad_norm": 0.05683496594429016, + "learning_rate": 7.558190123993909e-06, + "loss": 0.0039, + "step": 224500 + }, + { + "epoch": 12.23624102675658, + "grad_norm": 0.002470650477334857, + "learning_rate": 7.552751794648685e-06, + "loss": 0.0045, + "step": 225000 + }, + { + "epoch": 12.263432673482706, + "grad_norm": 11.708252906799316, + "learning_rate": 7.5473134653034595e-06, + "loss": 0.0053, + "step": 225500 + }, + { + "epoch": 12.290624320208831, + "grad_norm": 0.005214590113610029, + "learning_rate": 7.541875135958235e-06, + "loss": 0.0036, + "step": 226000 + }, + { + "epoch": 12.317815966934958, + "grad_norm": 0.16717997193336487, + "learning_rate": 7.536436806613009e-06, + "loss": 0.0037, + "step": 226500 + }, + { + "epoch": 12.345007613661084, + "grad_norm": 0.4088883101940155, + "learning_rate": 7.530998477267783e-06, + "loss": 0.0067, + "step": 227000 + }, + { + "epoch": 12.372199260387209, + "grad_norm": 7.834692001342773, + "learning_rate": 7.5255601479225594e-06, + "loss": 0.0042, + "step": 227500 + }, + { + "epoch": 12.399390907113334, + "grad_norm": 0.0002414975460851565, + "learning_rate": 7.520121818577334e-06, + "loss": 0.0033, + "step": 228000 + }, + { + "epoch": 12.426582553839461, + "grad_norm": 0.005257593933492899, + "learning_rate": 7.514683489232108e-06, + "loss": 0.0034, + "step": 228500 + }, + { + "epoch": 12.453774200565586, + "grad_norm": 0.0003224584797862917, + "learning_rate": 7.509245159886883e-06, + "loss": 0.0047, + "step": 229000 + }, + { + "epoch": 12.480965847291712, + "grad_norm": 0.013201883994042873, + "learning_rate": 7.503806830541658e-06, + "loss": 0.0057, + "step": 229500 + }, + { + "epoch": 12.508157494017837, + "grad_norm": 0.6929981112480164, + "learning_rate": 7.498368501196434e-06, + "loss": 0.007, + "step": 230000 + }, + { + "epoch": 12.535349140743964, + "grad_norm": 0.01077383290976286, + "learning_rate": 7.492930171851208e-06, + "loss": 0.0048, + "step": 230500 + }, + { + "epoch": 12.56254078747009, + "grad_norm": 0.00016784864419605583, + "learning_rate": 7.4874918425059825e-06, + "loss": 0.0052, + "step": 231000 + }, + { + "epoch": 12.589732434196215, + "grad_norm": 0.02103503979742527, + "learning_rate": 7.482053513160758e-06, + "loss": 0.0042, + "step": 231500 + }, + { + "epoch": 12.61692408092234, + "grad_norm": 0.16059672832489014, + "learning_rate": 7.476615183815532e-06, + "loss": 0.0034, + "step": 232000 + }, + { + "epoch": 12.644115727648467, + "grad_norm": 0.002428087405860424, + "learning_rate": 7.471176854470308e-06, + "loss": 0.0045, + "step": 232500 + }, + { + "epoch": 12.671307374374592, + "grad_norm": 0.002322005107998848, + "learning_rate": 7.4657385251250825e-06, + "loss": 0.0053, + "step": 233000 + }, + { + "epoch": 12.698499021100718, + "grad_norm": 0.001284032710827887, + "learning_rate": 7.460300195779857e-06, + "loss": 0.0036, + "step": 233500 + }, + { + "epoch": 12.725690667826843, + "grad_norm": 0.011062849313020706, + "learning_rate": 7.454861866434632e-06, + "loss": 0.0051, + "step": 234000 + }, + { + "epoch": 12.75288231455297, + "grad_norm": 0.007350505795329809, + "learning_rate": 7.449423537089406e-06, + "loss": 0.0056, + "step": 234500 + }, + { + "epoch": 12.780073961279095, + "grad_norm": 0.033329423516988754, + "learning_rate": 7.443985207744182e-06, + "loss": 0.0042, + "step": 235000 + }, + { + "epoch": 12.80726560800522, + "grad_norm": 0.06402397900819778, + "learning_rate": 7.438546878398957e-06, + "loss": 0.0046, + "step": 235500 + }, + { + "epoch": 12.834457254731346, + "grad_norm": 0.1854117214679718, + "learning_rate": 7.433108549053731e-06, + "loss": 0.0043, + "step": 236000 + }, + { + "epoch": 12.861648901457473, + "grad_norm": 4.086294174194336, + "learning_rate": 7.427670219708506e-06, + "loss": 0.0043, + "step": 236500 + }, + { + "epoch": 12.888840548183598, + "grad_norm": 0.0060177757404744625, + "learning_rate": 7.422231890363281e-06, + "loss": 0.0049, + "step": 237000 + }, + { + "epoch": 12.916032194909723, + "grad_norm": 0.0001580445095896721, + "learning_rate": 7.416793561018057e-06, + "loss": 0.0051, + "step": 237500 + }, + { + "epoch": 12.943223841635849, + "grad_norm": 0.08792165666818619, + "learning_rate": 7.411355231672831e-06, + "loss": 0.0043, + "step": 238000 + }, + { + "epoch": 12.970415488361976, + "grad_norm": 0.10381147265434265, + "learning_rate": 7.4059169023276055e-06, + "loss": 0.0059, + "step": 238500 + }, + { + "epoch": 12.997607135088101, + "grad_norm": 0.06624840945005417, + "learning_rate": 7.400478572982381e-06, + "loss": 0.0053, + "step": 239000 + }, + { + "epoch": 13.0, + "eval_exact_match": 0.8813, + "eval_loss": 0.3372015357017517, + "eval_runtime": 1023.493, + "eval_samples_per_second": 11.116, + "eval_steps_per_second": 0.696, + "step": 239044 + }, + { + "epoch": 13.024798781814226, + "grad_norm": 0.021261321380734444, + "learning_rate": 7.395040243637155e-06, + "loss": 0.0036, + "step": 239500 + }, + { + "epoch": 13.051990428540352, + "grad_norm": 0.0019554144237190485, + "learning_rate": 7.38960191429193e-06, + "loss": 0.002, + "step": 240000 + }, + { + "epoch": 13.079182075266479, + "grad_norm": 0.01153012365102768, + "learning_rate": 7.3841635849467054e-06, + "loss": 0.0027, + "step": 240500 + }, + { + "epoch": 13.106373721992604, + "grad_norm": 0.0053418660536408424, + "learning_rate": 7.37872525560148e-06, + "loss": 0.0027, + "step": 241000 + }, + { + "epoch": 13.13356536871873, + "grad_norm": 7.908132101874799e-05, + "learning_rate": 7.373286926256254e-06, + "loss": 0.0044, + "step": 241500 + }, + { + "epoch": 13.160757015444855, + "grad_norm": 0.0015456726541742682, + "learning_rate": 7.367848596911029e-06, + "loss": 0.0048, + "step": 242000 + }, + { + "epoch": 13.187948662170982, + "grad_norm": 0.009918862022459507, + "learning_rate": 7.3624102675658046e-06, + "loss": 0.0024, + "step": 242500 + }, + { + "epoch": 13.215140308897107, + "grad_norm": 1.1204112768173218, + "learning_rate": 7.35697193822058e-06, + "loss": 0.0034, + "step": 243000 + }, + { + "epoch": 13.242331955623232, + "grad_norm": 0.015402857214212418, + "learning_rate": 7.351533608875354e-06, + "loss": 0.0042, + "step": 243500 + }, + { + "epoch": 13.269523602349357, + "grad_norm": 0.13369937241077423, + "learning_rate": 7.3460952795301285e-06, + "loss": 0.0038, + "step": 244000 + }, + { + "epoch": 13.296715249075485, + "grad_norm": 0.0002845363924279809, + "learning_rate": 7.340656950184904e-06, + "loss": 0.0025, + "step": 244500 + }, + { + "epoch": 13.32390689580161, + "grad_norm": 0.05165963992476463, + "learning_rate": 7.335218620839679e-06, + "loss": 0.0037, + "step": 245000 + }, + { + "epoch": 13.351098542527735, + "grad_norm": 0.0008344284142367542, + "learning_rate": 7.329780291494454e-06, + "loss": 0.0036, + "step": 245500 + }, + { + "epoch": 13.37829018925386, + "grad_norm": 0.46657341718673706, + "learning_rate": 7.3243419621492284e-06, + "loss": 0.0046, + "step": 246000 + }, + { + "epoch": 13.405481835979987, + "grad_norm": 0.6373306512832642, + "learning_rate": 7.318903632804003e-06, + "loss": 0.0039, + "step": 246500 + }, + { + "epoch": 13.432673482706113, + "grad_norm": 0.5203019976615906, + "learning_rate": 7.313465303458778e-06, + "loss": 0.0038, + "step": 247000 + }, + { + "epoch": 13.459865129432238, + "grad_norm": 0.3291895389556885, + "learning_rate": 7.308026974113553e-06, + "loss": 0.0048, + "step": 247500 + }, + { + "epoch": 13.487056776158363, + "grad_norm": 0.10239601135253906, + "learning_rate": 7.302588644768328e-06, + "loss": 0.004, + "step": 248000 + }, + { + "epoch": 13.51424842288449, + "grad_norm": 6.02880859375, + "learning_rate": 7.297150315423103e-06, + "loss": 0.0044, + "step": 248500 + }, + { + "epoch": 13.541440069610616, + "grad_norm": 0.0003724268462974578, + "learning_rate": 7.291711986077877e-06, + "loss": 0.003, + "step": 249000 + }, + { + "epoch": 13.568631716336741, + "grad_norm": 0.008930696174502373, + "learning_rate": 7.286273656732652e-06, + "loss": 0.0063, + "step": 249500 + }, + { + "epoch": 13.595823363062866, + "grad_norm": 0.08336593955755234, + "learning_rate": 7.2808353273874275e-06, + "loss": 0.0034, + "step": 250000 + }, + { + "epoch": 13.623015009788993, + "grad_norm": 0.00440165214240551, + "learning_rate": 7.275396998042203e-06, + "loss": 0.0052, + "step": 250500 + }, + { + "epoch": 13.650206656515119, + "grad_norm": 0.02176918275654316, + "learning_rate": 7.269958668696977e-06, + "loss": 0.0027, + "step": 251000 + }, + { + "epoch": 13.677398303241244, + "grad_norm": 0.8251722455024719, + "learning_rate": 7.2645203393517515e-06, + "loss": 0.0042, + "step": 251500 + }, + { + "epoch": 13.704589949967371, + "grad_norm": 0.0017955650109797716, + "learning_rate": 7.259082010006527e-06, + "loss": 0.0058, + "step": 252000 + }, + { + "epoch": 13.731781596693496, + "grad_norm": 0.00015411581262014806, + "learning_rate": 7.253643680661302e-06, + "loss": 0.0043, + "step": 252500 + }, + { + "epoch": 13.758973243419621, + "grad_norm": 0.0008522845455445349, + "learning_rate": 7.248205351316076e-06, + "loss": 0.0036, + "step": 253000 + }, + { + "epoch": 13.786164890145747, + "grad_norm": 0.16430547833442688, + "learning_rate": 7.242767021970851e-06, + "loss": 0.0037, + "step": 253500 + }, + { + "epoch": 13.813356536871872, + "grad_norm": 0.004896281752735376, + "learning_rate": 7.237328692625626e-06, + "loss": 0.0033, + "step": 254000 + }, + { + "epoch": 13.840548183597999, + "grad_norm": 0.011140445247292519, + "learning_rate": 7.2318903632804e-06, + "loss": 0.0038, + "step": 254500 + }, + { + "epoch": 13.867739830324124, + "grad_norm": 0.4586171805858612, + "learning_rate": 7.226452033935176e-06, + "loss": 0.004, + "step": 255000 + }, + { + "epoch": 13.89493147705025, + "grad_norm": 0.30230623483657837, + "learning_rate": 7.2210137045899505e-06, + "loss": 0.005, + "step": 255500 + }, + { + "epoch": 13.922123123776377, + "grad_norm": 0.0007046368555165827, + "learning_rate": 7.215575375244726e-06, + "loss": 0.0047, + "step": 256000 + }, + { + "epoch": 13.949314770502502, + "grad_norm": 0.014722813852131367, + "learning_rate": 7.2101370458995e-06, + "loss": 0.0048, + "step": 256500 + }, + { + "epoch": 13.976506417228627, + "grad_norm": 0.00015357104712165892, + "learning_rate": 7.2046987165542745e-06, + "loss": 0.0029, + "step": 257000 + }, + { + "epoch": 14.0, + "eval_exact_match": 0.8867, + "eval_loss": 0.34934452176094055, + "eval_runtime": 1020.9159, + "eval_samples_per_second": 11.144, + "eval_steps_per_second": 0.697, + "step": 257432 + }, + { + "epoch": 14.003698063954753, + "grad_norm": 0.042392514646053314, + "learning_rate": 7.1992603872090505e-06, + "loss": 0.0044, + "step": 257500 + }, + { + "epoch": 14.03088971068088, + "grad_norm": 0.00043550218106247485, + "learning_rate": 7.193822057863825e-06, + "loss": 0.0022, + "step": 258000 + }, + { + "epoch": 14.058081357407005, + "grad_norm": 0.010852435603737831, + "learning_rate": 7.1883837285186e-06, + "loss": 0.0038, + "step": 258500 + }, + { + "epoch": 14.08527300413313, + "grad_norm": 1.2875603437423706, + "learning_rate": 7.1829453991733744e-06, + "loss": 0.0022, + "step": 259000 + }, + { + "epoch": 14.112464650859255, + "grad_norm": 0.7273116707801819, + "learning_rate": 7.177507069828149e-06, + "loss": 0.0019, + "step": 259500 + }, + { + "epoch": 14.139656297585383, + "grad_norm": 0.029256224632263184, + "learning_rate": 7.172068740482925e-06, + "loss": 0.0035, + "step": 260000 + }, + { + "epoch": 14.166847944311508, + "grad_norm": 0.6167672276496887, + "learning_rate": 7.166630411137699e-06, + "loss": 0.0024, + "step": 260500 + }, + { + "epoch": 14.194039591037633, + "grad_norm": 0.0019446202786639333, + "learning_rate": 7.161192081792474e-06, + "loss": 0.0025, + "step": 261000 + }, + { + "epoch": 14.221231237763758, + "grad_norm": 0.007389050908386707, + "learning_rate": 7.155753752447249e-06, + "loss": 0.0033, + "step": 261500 + }, + { + "epoch": 14.248422884489885, + "grad_norm": 1.1310490369796753, + "learning_rate": 7.150315423102023e-06, + "loss": 0.0025, + "step": 262000 + }, + { + "epoch": 14.27561453121601, + "grad_norm": 0.011108157224953175, + "learning_rate": 7.144877093756798e-06, + "loss": 0.0019, + "step": 262500 + }, + { + "epoch": 14.302806177942136, + "grad_norm": 0.0007718420820310712, + "learning_rate": 7.1394387644115735e-06, + "loss": 0.0048, + "step": 263000 + }, + { + "epoch": 14.329997824668261, + "grad_norm": 0.00021103527978993952, + "learning_rate": 7.134000435066349e-06, + "loss": 0.0027, + "step": 263500 + }, + { + "epoch": 14.357189471394388, + "grad_norm": 0.2015926092863083, + "learning_rate": 7.128562105721123e-06, + "loss": 0.0045, + "step": 264000 + }, + { + "epoch": 14.384381118120514, + "grad_norm": 0.008299685083329678, + "learning_rate": 7.1231237763758974e-06, + "loss": 0.0033, + "step": 264500 + }, + { + "epoch": 14.411572764846639, + "grad_norm": 2.5773375034332275, + "learning_rate": 7.117685447030672e-06, + "loss": 0.004, + "step": 265000 + }, + { + "epoch": 14.438764411572764, + "grad_norm": 0.0018001631833612919, + "learning_rate": 7.112247117685448e-06, + "loss": 0.0036, + "step": 265500 + }, + { + "epoch": 14.465956058298891, + "grad_norm": 0.0002188493381254375, + "learning_rate": 7.106808788340222e-06, + "loss": 0.0044, + "step": 266000 + }, + { + "epoch": 14.493147705025017, + "grad_norm": 6.366543769836426, + "learning_rate": 7.101370458994997e-06, + "loss": 0.0045, + "step": 266500 + }, + { + "epoch": 14.520339351751142, + "grad_norm": 1.5890947906882502e-05, + "learning_rate": 7.095932129649772e-06, + "loss": 0.0042, + "step": 267000 + }, + { + "epoch": 14.547530998477267, + "grad_norm": 0.0019672750495374203, + "learning_rate": 7.090493800304546e-06, + "loss": 0.0041, + "step": 267500 + }, + { + "epoch": 14.574722645203394, + "grad_norm": 3.3259029388427734, + "learning_rate": 7.085055470959322e-06, + "loss": 0.004, + "step": 268000 + }, + { + "epoch": 14.60191429192952, + "grad_norm": 0.051898933947086334, + "learning_rate": 7.0796171416140965e-06, + "loss": 0.0038, + "step": 268500 + }, + { + "epoch": 14.629105938655645, + "grad_norm": 0.0006499428418464959, + "learning_rate": 7.074178812268872e-06, + "loss": 0.0036, + "step": 269000 + }, + { + "epoch": 14.65629758538177, + "grad_norm": 0.00038398781907744706, + "learning_rate": 7.068740482923646e-06, + "loss": 0.0048, + "step": 269500 + }, + { + "epoch": 14.683489232107897, + "grad_norm": 0.4993869960308075, + "learning_rate": 7.0633021535784204e-06, + "loss": 0.0035, + "step": 270000 + }, + { + "epoch": 14.710680878834022, + "grad_norm": 2.1485577235580422e-05, + "learning_rate": 7.0578638242331965e-06, + "loss": 0.0047, + "step": 270500 + }, + { + "epoch": 14.737872525560148, + "grad_norm": 0.0015798599924892187, + "learning_rate": 7.052425494887971e-06, + "loss": 0.0042, + "step": 271000 + }, + { + "epoch": 14.765064172286273, + "grad_norm": 0.0002877725928556174, + "learning_rate": 7.046987165542746e-06, + "loss": 0.0042, + "step": 271500 + }, + { + "epoch": 14.7922558190124, + "grad_norm": 0.0006782010896131396, + "learning_rate": 7.04154883619752e-06, + "loss": 0.0029, + "step": 272000 + }, + { + "epoch": 14.819447465738525, + "grad_norm": 3.1857094764709473, + "learning_rate": 7.036110506852295e-06, + "loss": 0.0034, + "step": 272500 + }, + { + "epoch": 14.84663911246465, + "grad_norm": 0.07301226258277893, + "learning_rate": 7.030672177507071e-06, + "loss": 0.003, + "step": 273000 + }, + { + "epoch": 14.873830759190776, + "grad_norm": 6.0143280029296875, + "learning_rate": 7.025233848161845e-06, + "loss": 0.003, + "step": 273500 + }, + { + "epoch": 14.901022405916903, + "grad_norm": 0.02105681598186493, + "learning_rate": 7.01979551881662e-06, + "loss": 0.0041, + "step": 274000 + }, + { + "epoch": 14.928214052643028, + "grad_norm": 0.08257223665714264, + "learning_rate": 7.014357189471395e-06, + "loss": 0.0051, + "step": 274500 + }, + { + "epoch": 14.955405699369154, + "grad_norm": 0.013296125456690788, + "learning_rate": 7.008918860126169e-06, + "loss": 0.0044, + "step": 275000 + }, + { + "epoch": 14.982597346095279, + "grad_norm": 0.16596081852912903, + "learning_rate": 7.003480530780945e-06, + "loss": 0.0039, + "step": 275500 + }, + { + "epoch": 15.0, + "eval_exact_match": 0.8834, + "eval_loss": 0.3267911374568939, + "eval_runtime": 1018.9931, + "eval_samples_per_second": 11.165, + "eval_steps_per_second": 0.699, + "step": 275820 + }, + { + "epoch": 15.009788992821406, + "grad_norm": 0.004664579872041941, + "learning_rate": 6.9980422014357195e-06, + "loss": 0.004, + "step": 276000 + }, + { + "epoch": 15.036980639547531, + "grad_norm": 0.004305088426917791, + "learning_rate": 6.992603872090495e-06, + "loss": 0.0022, + "step": 276500 + }, + { + "epoch": 15.064172286273656, + "grad_norm": 0.0011663463665172458, + "learning_rate": 6.987165542745269e-06, + "loss": 0.0026, + "step": 277000 + }, + { + "epoch": 15.091363932999782, + "grad_norm": 0.0547974593937397, + "learning_rate": 6.9817272134000434e-06, + "loss": 0.002, + "step": 277500 + }, + { + "epoch": 15.118555579725909, + "grad_norm": 0.0003876920964103192, + "learning_rate": 6.9762888840548195e-06, + "loss": 0.0033, + "step": 278000 + }, + { + "epoch": 15.145747226452034, + "grad_norm": 0.0008074783254414797, + "learning_rate": 6.970850554709594e-06, + "loss": 0.0034, + "step": 278500 + }, + { + "epoch": 15.17293887317816, + "grad_norm": 2.947611093521118, + "learning_rate": 6.965412225364368e-06, + "loss": 0.0033, + "step": 279000 + }, + { + "epoch": 15.200130519904285, + "grad_norm": 0.034830063581466675, + "learning_rate": 6.959973896019143e-06, + "loss": 0.0039, + "step": 279500 + }, + { + "epoch": 15.227322166630412, + "grad_norm": 0.0010544485412538052, + "learning_rate": 6.954535566673918e-06, + "loss": 0.0017, + "step": 280000 + }, + { + "epoch": 15.254513813356537, + "grad_norm": 0.5181836485862732, + "learning_rate": 6.949097237328694e-06, + "loss": 0.002, + "step": 280500 + }, + { + "epoch": 15.281705460082662, + "grad_norm": 2.1243185997009277, + "learning_rate": 6.943658907983468e-06, + "loss": 0.0033, + "step": 281000 + }, + { + "epoch": 15.308897106808788, + "grad_norm": 0.2341310679912567, + "learning_rate": 6.9382205786382425e-06, + "loss": 0.003, + "step": 281500 + }, + { + "epoch": 15.336088753534915, + "grad_norm": 0.9371480941772461, + "learning_rate": 6.932782249293018e-06, + "loss": 0.003, + "step": 282000 + }, + { + "epoch": 15.36328040026104, + "grad_norm": 0.0012509943917393684, + "learning_rate": 6.927343919947792e-06, + "loss": 0.0031, + "step": 282500 + }, + { + "epoch": 15.390472046987165, + "grad_norm": 0.03419405594468117, + "learning_rate": 6.921905590602568e-06, + "loss": 0.0041, + "step": 283000 + }, + { + "epoch": 15.41766369371329, + "grad_norm": 0.003866742830723524, + "learning_rate": 6.9164672612573425e-06, + "loss": 0.0037, + "step": 283500 + }, + { + "epoch": 15.444855340439418, + "grad_norm": 3.6404711863724515e-05, + "learning_rate": 6.911028931912117e-06, + "loss": 0.0037, + "step": 284000 + }, + { + "epoch": 15.472046987165543, + "grad_norm": 0.006914507132023573, + "learning_rate": 6.905590602566892e-06, + "loss": 0.0031, + "step": 284500 + }, + { + "epoch": 15.499238633891668, + "grad_norm": 0.0347830168902874, + "learning_rate": 6.900152273221666e-06, + "loss": 0.0022, + "step": 285000 + }, + { + "epoch": 15.526430280617793, + "grad_norm": 0.022012189030647278, + "learning_rate": 6.8947139438764425e-06, + "loss": 0.0035, + "step": 285500 + }, + { + "epoch": 15.55362192734392, + "grad_norm": 0.000470901868538931, + "learning_rate": 6.889275614531217e-06, + "loss": 0.0046, + "step": 286000 + }, + { + "epoch": 15.580813574070046, + "grad_norm": 0.16011016070842743, + "learning_rate": 6.883837285185991e-06, + "loss": 0.0041, + "step": 286500 + }, + { + "epoch": 15.608005220796171, + "grad_norm": 0.008064665831625462, + "learning_rate": 6.878398955840766e-06, + "loss": 0.0028, + "step": 287000 + }, + { + "epoch": 15.635196867522296, + "grad_norm": 1.2888329029083252, + "learning_rate": 6.872960626495541e-06, + "loss": 0.0035, + "step": 287500 + }, + { + "epoch": 15.662388514248423, + "grad_norm": 0.01231129840016365, + "learning_rate": 6.867522297150317e-06, + "loss": 0.0037, + "step": 288000 + }, + { + "epoch": 15.689580160974549, + "grad_norm": 0.0002686446823645383, + "learning_rate": 6.862083967805091e-06, + "loss": 0.0033, + "step": 288500 + }, + { + "epoch": 15.716771807700674, + "grad_norm": 1.6821107864379883, + "learning_rate": 6.8566456384598655e-06, + "loss": 0.0036, + "step": 289000 + }, + { + "epoch": 15.743963454426801, + "grad_norm": 0.003827363019809127, + "learning_rate": 6.85120730911464e-06, + "loss": 0.0043, + "step": 289500 + }, + { + "epoch": 15.771155101152926, + "grad_norm": 0.03282500430941582, + "learning_rate": 6.845768979769415e-06, + "loss": 0.0035, + "step": 290000 + }, + { + "epoch": 15.798346747879052, + "grad_norm": 0.020095176994800568, + "learning_rate": 6.84033065042419e-06, + "loss": 0.0036, + "step": 290500 + }, + { + "epoch": 15.825538394605177, + "grad_norm": 0.0032423606608062983, + "learning_rate": 6.8348923210789655e-06, + "loss": 0.0049, + "step": 291000 + }, + { + "epoch": 15.852730041331302, + "grad_norm": 0.02819984033703804, + "learning_rate": 6.82945399173374e-06, + "loss": 0.0028, + "step": 291500 + }, + { + "epoch": 15.87992168805743, + "grad_norm": 0.0026651720982044935, + "learning_rate": 6.824015662388514e-06, + "loss": 0.0032, + "step": 292000 + }, + { + "epoch": 15.907113334783554, + "grad_norm": 0.1840675175189972, + "learning_rate": 6.818577333043289e-06, + "loss": 0.0026, + "step": 292500 + }, + { + "epoch": 15.93430498150968, + "grad_norm": 0.0006855327519588172, + "learning_rate": 6.813139003698065e-06, + "loss": 0.0028, + "step": 293000 + }, + { + "epoch": 15.961496628235807, + "grad_norm": 0.3514760732650757, + "learning_rate": 6.80770067435284e-06, + "loss": 0.0048, + "step": 293500 + }, + { + "epoch": 15.988688274961932, + "grad_norm": 0.10397683829069138, + "learning_rate": 6.802262345007614e-06, + "loss": 0.002, + "step": 294000 + }, + { + "epoch": 16.0, + "eval_exact_match": 0.8854, + "eval_loss": 0.3592107594013214, + "eval_runtime": 1020.086, + "eval_samples_per_second": 11.153, + "eval_steps_per_second": 0.698, + "step": 294208 + }, + { + "epoch": 16.015879921688057, + "grad_norm": 0.0021444354206323624, + "learning_rate": 6.7968240156623885e-06, + "loss": 0.0022, + "step": 294500 + }, + { + "epoch": 16.043071568414184, + "grad_norm": 0.3667925000190735, + "learning_rate": 6.791385686317164e-06, + "loss": 0.0022, + "step": 295000 + }, + { + "epoch": 16.070263215140308, + "grad_norm": 0.05687262490391731, + "learning_rate": 6.785947356971939e-06, + "loss": 0.0018, + "step": 295500 + }, + { + "epoch": 16.097454861866435, + "grad_norm": 0.017906086519360542, + "learning_rate": 6.780509027626714e-06, + "loss": 0.0021, + "step": 296000 + }, + { + "epoch": 16.124646508592562, + "grad_norm": 0.38542675971984863, + "learning_rate": 6.7750706982814885e-06, + "loss": 0.0033, + "step": 296500 + }, + { + "epoch": 16.151838155318686, + "grad_norm": 0.0025707464665174484, + "learning_rate": 6.769632368936263e-06, + "loss": 0.0019, + "step": 297000 + }, + { + "epoch": 16.179029802044813, + "grad_norm": 0.017234420403838158, + "learning_rate": 6.764194039591038e-06, + "loss": 0.0019, + "step": 297500 + }, + { + "epoch": 16.206221448770936, + "grad_norm": 0.02613496407866478, + "learning_rate": 6.758755710245813e-06, + "loss": 0.0029, + "step": 298000 + }, + { + "epoch": 16.233413095497063, + "grad_norm": 0.1518625020980835, + "learning_rate": 6.7533173809005884e-06, + "loss": 0.002, + "step": 298500 + }, + { + "epoch": 16.26060474222319, + "grad_norm": 0.017356229946017265, + "learning_rate": 6.747879051555363e-06, + "loss": 0.0015, + "step": 299000 + }, + { + "epoch": 16.287796388949314, + "grad_norm": 0.02871873788535595, + "learning_rate": 6.742440722210137e-06, + "loss": 0.003, + "step": 299500 + }, + { + "epoch": 16.31498803567544, + "grad_norm": 0.0028448388911783695, + "learning_rate": 6.737002392864912e-06, + "loss": 0.0016, + "step": 300000 + }, + { + "epoch": 16.342179682401568, + "grad_norm": 0.03786061704158783, + "learning_rate": 6.7315640635196876e-06, + "loss": 0.0024, + "step": 300500 + }, + { + "epoch": 16.36937132912769, + "grad_norm": 0.17070689797401428, + "learning_rate": 6.726125734174462e-06, + "loss": 0.0038, + "step": 301000 + }, + { + "epoch": 16.39656297585382, + "grad_norm": 0.04375317320227623, + "learning_rate": 6.720687404829237e-06, + "loss": 0.0027, + "step": 301500 + }, + { + "epoch": 16.423754622579942, + "grad_norm": 0.0004980552475899458, + "learning_rate": 6.7152490754840115e-06, + "loss": 0.0027, + "step": 302000 + }, + { + "epoch": 16.45094626930607, + "grad_norm": 0.00025424736668355763, + "learning_rate": 6.709810746138786e-06, + "loss": 0.0027, + "step": 302500 + }, + { + "epoch": 16.478137916032196, + "grad_norm": 0.0015307448338717222, + "learning_rate": 6.704372416793562e-06, + "loss": 0.0031, + "step": 303000 + }, + { + "epoch": 16.50532956275832, + "grad_norm": 0.03277304396033287, + "learning_rate": 6.698934087448336e-06, + "loss": 0.0039, + "step": 303500 + }, + { + "epoch": 16.532521209484447, + "grad_norm": 0.0010686165187507868, + "learning_rate": 6.6934957581031115e-06, + "loss": 0.0031, + "step": 304000 + }, + { + "epoch": 16.559712856210574, + "grad_norm": 0.014178049750626087, + "learning_rate": 6.688057428757886e-06, + "loss": 0.0027, + "step": 304500 + }, + { + "epoch": 16.586904502936697, + "grad_norm": 0.9002902507781982, + "learning_rate": 6.68261909941266e-06, + "loss": 0.0033, + "step": 305000 + }, + { + "epoch": 16.614096149662824, + "grad_norm": 0.06729450821876526, + "learning_rate": 6.677180770067436e-06, + "loss": 0.0028, + "step": 305500 + }, + { + "epoch": 16.641287796388948, + "grad_norm": 0.3082190752029419, + "learning_rate": 6.671742440722211e-06, + "loss": 0.0024, + "step": 306000 + }, + { + "epoch": 16.668479443115075, + "grad_norm": 0.0023857878986746073, + "learning_rate": 6.666304111376986e-06, + "loss": 0.002, + "step": 306500 + }, + { + "epoch": 16.695671089841202, + "grad_norm": 0.2701680064201355, + "learning_rate": 6.66086578203176e-06, + "loss": 0.0034, + "step": 307000 + }, + { + "epoch": 16.722862736567325, + "grad_norm": 0.0009946366772055626, + "learning_rate": 6.6554274526865345e-06, + "loss": 0.0028, + "step": 307500 + }, + { + "epoch": 16.750054383293453, + "grad_norm": 0.006399666890501976, + "learning_rate": 6.6499891233413106e-06, + "loss": 0.0028, + "step": 308000 + }, + { + "epoch": 16.77724603001958, + "grad_norm": 1.565460443496704, + "learning_rate": 6.644550793996085e-06, + "loss": 0.0026, + "step": 308500 + }, + { + "epoch": 16.804437676745703, + "grad_norm": 0.40049976110458374, + "learning_rate": 6.63911246465086e-06, + "loss": 0.0029, + "step": 309000 + }, + { + "epoch": 16.83162932347183, + "grad_norm": 0.30652865767478943, + "learning_rate": 6.6336741353056345e-06, + "loss": 0.0032, + "step": 309500 + }, + { + "epoch": 16.858820970197954, + "grad_norm": 0.2172711193561554, + "learning_rate": 6.628235805960409e-06, + "loss": 0.0031, + "step": 310000 + }, + { + "epoch": 16.88601261692408, + "grad_norm": 0.010080419480800629, + "learning_rate": 6.622797476615185e-06, + "loss": 0.0028, + "step": 310500 + }, + { + "epoch": 16.913204263650208, + "grad_norm": 0.03477318584918976, + "learning_rate": 6.617359147269959e-06, + "loss": 0.0037, + "step": 311000 + }, + { + "epoch": 16.94039591037633, + "grad_norm": 0.0014530881308019161, + "learning_rate": 6.6119208179247344e-06, + "loss": 0.003, + "step": 311500 + }, + { + "epoch": 16.96758755710246, + "grad_norm": 0.008372816257178783, + "learning_rate": 6.606482488579509e-06, + "loss": 0.0023, + "step": 312000 + }, + { + "epoch": 16.994779203828585, + "grad_norm": 0.00021387383458204567, + "learning_rate": 6.601044159234283e-06, + "loss": 0.0024, + "step": 312500 + }, + { + "epoch": 17.0, + "eval_exact_match": 0.887, + "eval_loss": 0.3400750756263733, + "eval_runtime": 1019.1709, + "eval_samples_per_second": 11.163, + "eval_steps_per_second": 0.699, + "step": 312596 + }, + { + "epoch": 17.02197085055471, + "grad_norm": 0.0042693424038589, + "learning_rate": 6.595605829889059e-06, + "loss": 0.0027, + "step": 313000 + }, + { + "epoch": 17.049162497280836, + "grad_norm": 0.005676996428519487, + "learning_rate": 6.5901675005438336e-06, + "loss": 0.0024, + "step": 313500 + }, + { + "epoch": 17.07635414400696, + "grad_norm": 0.006243901327252388, + "learning_rate": 6.584729171198608e-06, + "loss": 0.0019, + "step": 314000 + }, + { + "epoch": 17.103545790733087, + "grad_norm": 0.0003304806014057249, + "learning_rate": 6.579290841853383e-06, + "loss": 0.0023, + "step": 314500 + }, + { + "epoch": 17.130737437459214, + "grad_norm": 0.001669013872742653, + "learning_rate": 6.5738525125081575e-06, + "loss": 0.0022, + "step": 315000 + }, + { + "epoch": 17.157929084185337, + "grad_norm": 0.07920686900615692, + "learning_rate": 6.5684141831629335e-06, + "loss": 0.0011, + "step": 315500 + }, + { + "epoch": 17.185120730911464, + "grad_norm": 0.32899802923202515, + "learning_rate": 6.562975853817708e-06, + "loss": 0.0027, + "step": 316000 + }, + { + "epoch": 17.21231237763759, + "grad_norm": 0.0661708191037178, + "learning_rate": 6.557537524472482e-06, + "loss": 0.003, + "step": 316500 + }, + { + "epoch": 17.239504024363715, + "grad_norm": 0.002496266271919012, + "learning_rate": 6.5520991951272574e-06, + "loss": 0.0033, + "step": 317000 + }, + { + "epoch": 17.266695671089842, + "grad_norm": 0.028615722432732582, + "learning_rate": 6.546660865782032e-06, + "loss": 0.0022, + "step": 317500 + }, + { + "epoch": 17.293887317815965, + "grad_norm": 0.002526836236938834, + "learning_rate": 6.541222536436808e-06, + "loss": 0.0028, + "step": 318000 + }, + { + "epoch": 17.321078964542092, + "grad_norm": 0.0003039772855117917, + "learning_rate": 6.535784207091582e-06, + "loss": 0.002, + "step": 318500 + }, + { + "epoch": 17.34827061126822, + "grad_norm": 0.004081173334270716, + "learning_rate": 6.5303458777463566e-06, + "loss": 0.0012, + "step": 319000 + }, + { + "epoch": 17.375462257994343, + "grad_norm": 0.09041595458984375, + "learning_rate": 6.524907548401132e-06, + "loss": 0.0023, + "step": 319500 + }, + { + "epoch": 17.40265390472047, + "grad_norm": 0.0002773249871097505, + "learning_rate": 6.519469219055906e-06, + "loss": 0.002, + "step": 320000 + }, + { + "epoch": 17.429845551446597, + "grad_norm": 0.0009189367992803454, + "learning_rate": 6.514030889710682e-06, + "loss": 0.0029, + "step": 320500 + }, + { + "epoch": 17.45703719817272, + "grad_norm": 7.472214929293841e-05, + "learning_rate": 6.5085925603654565e-06, + "loss": 0.0034, + "step": 321000 + }, + { + "epoch": 17.484228844898848, + "grad_norm": 0.011273865588009357, + "learning_rate": 6.503154231020231e-06, + "loss": 0.0021, + "step": 321500 + }, + { + "epoch": 17.51142049162497, + "grad_norm": 0.008078432641923428, + "learning_rate": 6.497715901675006e-06, + "loss": 0.0018, + "step": 322000 + }, + { + "epoch": 17.5386121383511, + "grad_norm": 0.0031718432437628508, + "learning_rate": 6.4922775723297805e-06, + "loss": 0.0029, + "step": 322500 + }, + { + "epoch": 17.565803785077225, + "grad_norm": 0.10418205708265305, + "learning_rate": 6.4868392429845565e-06, + "loss": 0.0025, + "step": 323000 + }, + { + "epoch": 17.59299543180335, + "grad_norm": 4.8601868911646307e-05, + "learning_rate": 6.481400913639331e-06, + "loss": 0.0024, + "step": 323500 + }, + { + "epoch": 17.620187078529476, + "grad_norm": 0.0067114997655153275, + "learning_rate": 6.475962584294105e-06, + "loss": 0.0019, + "step": 324000 + }, + { + "epoch": 17.647378725255603, + "grad_norm": 0.0027572689577937126, + "learning_rate": 6.4705242549488804e-06, + "loss": 0.0018, + "step": 324500 + }, + { + "epoch": 17.674570371981726, + "grad_norm": 0.0052411542274057865, + "learning_rate": 6.465085925603655e-06, + "loss": 0.0017, + "step": 325000 + }, + { + "epoch": 17.701762018707853, + "grad_norm": 0.048968348652124405, + "learning_rate": 6.45964759625843e-06, + "loss": 0.0041, + "step": 325500 + }, + { + "epoch": 17.728953665433977, + "grad_norm": 0.0033183780033141375, + "learning_rate": 6.454209266913205e-06, + "loss": 0.0027, + "step": 326000 + }, + { + "epoch": 17.756145312160104, + "grad_norm": 0.002152485540136695, + "learning_rate": 6.4487709375679795e-06, + "loss": 0.0029, + "step": 326500 + }, + { + "epoch": 17.78333695888623, + "grad_norm": 0.1940182000398636, + "learning_rate": 6.443332608222754e-06, + "loss": 0.005, + "step": 327000 + }, + { + "epoch": 17.810528605612355, + "grad_norm": 0.00849226675927639, + "learning_rate": 6.437894278877529e-06, + "loss": 0.0027, + "step": 327500 + }, + { + "epoch": 17.83772025233848, + "grad_norm": 0.003068720456212759, + "learning_rate": 6.432455949532304e-06, + "loss": 0.0023, + "step": 328000 + }, + { + "epoch": 17.86491189906461, + "grad_norm": 0.8710117936134338, + "learning_rate": 6.4270176201870795e-06, + "loss": 0.0027, + "step": 328500 + }, + { + "epoch": 17.892103545790732, + "grad_norm": 0.0016409781528636813, + "learning_rate": 6.421579290841854e-06, + "loss": 0.0025, + "step": 329000 + }, + { + "epoch": 17.91929519251686, + "grad_norm": 0.01407603919506073, + "learning_rate": 6.416140961496628e-06, + "loss": 0.0026, + "step": 329500 + }, + { + "epoch": 17.946486839242986, + "grad_norm": 0.02472531422972679, + "learning_rate": 6.4107026321514034e-06, + "loss": 0.0043, + "step": 330000 + }, + { + "epoch": 17.97367848596911, + "grad_norm": 0.0002928628819063306, + "learning_rate": 6.405264302806179e-06, + "loss": 0.003, + "step": 330500 + }, + { + "epoch": 18.0, + "eval_exact_match": 0.8866, + "eval_loss": 0.3563012182712555, + "eval_runtime": 1021.8314, + "eval_samples_per_second": 11.134, + "eval_steps_per_second": 0.697, + "step": 330984 + }, + { + "epoch": 18.000870132695237, + "grad_norm": 0.0025135932955890894, + "learning_rate": 6.399825973460954e-06, + "loss": 0.0043, + "step": 331000 + }, + { + "epoch": 18.02806177942136, + "grad_norm": 0.0022032412234693766, + "learning_rate": 6.394387644115728e-06, + "loss": 0.0017, + "step": 331500 + }, + { + "epoch": 18.055253426147488, + "grad_norm": 0.00628103269264102, + "learning_rate": 6.3889493147705026e-06, + "loss": 0.0011, + "step": 332000 + }, + { + "epoch": 18.082445072873615, + "grad_norm": 0.0016064423834905028, + "learning_rate": 6.383510985425278e-06, + "loss": 0.0015, + "step": 332500 + }, + { + "epoch": 18.109636719599738, + "grad_norm": 0.036574505269527435, + "learning_rate": 6.378072656080053e-06, + "loss": 0.0017, + "step": 333000 + }, + { + "epoch": 18.136828366325865, + "grad_norm": 0.017870482057332993, + "learning_rate": 6.372634326734828e-06, + "loss": 0.001, + "step": 333500 + }, + { + "epoch": 18.16402001305199, + "grad_norm": 0.013820737600326538, + "learning_rate": 6.3671959973896025e-06, + "loss": 0.0017, + "step": 334000 + }, + { + "epoch": 18.191211659778116, + "grad_norm": 0.008671647869050503, + "learning_rate": 6.361757668044377e-06, + "loss": 0.0031, + "step": 334500 + }, + { + "epoch": 18.218403306504243, + "grad_norm": 0.0009647606639191508, + "learning_rate": 6.356319338699152e-06, + "loss": 0.0028, + "step": 335000 + }, + { + "epoch": 18.245594953230366, + "grad_norm": 0.316081166267395, + "learning_rate": 6.350881009353927e-06, + "loss": 0.0023, + "step": 335500 + }, + { + "epoch": 18.272786599956493, + "grad_norm": 0.006871197838336229, + "learning_rate": 6.3454426800087025e-06, + "loss": 0.0026, + "step": 336000 + }, + { + "epoch": 18.29997824668262, + "grad_norm": 0.004635128192603588, + "learning_rate": 6.340004350663477e-06, + "loss": 0.0023, + "step": 336500 + }, + { + "epoch": 18.327169893408744, + "grad_norm": 0.77015620470047, + "learning_rate": 6.334566021318251e-06, + "loss": 0.0017, + "step": 337000 + }, + { + "epoch": 18.35436154013487, + "grad_norm": 5.524133666767739e-05, + "learning_rate": 6.329127691973026e-06, + "loss": 0.0016, + "step": 337500 + }, + { + "epoch": 18.381553186860998, + "grad_norm": 0.042669035494327545, + "learning_rate": 6.323689362627802e-06, + "loss": 0.0014, + "step": 338000 + }, + { + "epoch": 18.40874483358712, + "grad_norm": 0.004605181515216827, + "learning_rate": 6.318251033282576e-06, + "loss": 0.0021, + "step": 338500 + }, + { + "epoch": 18.43593648031325, + "grad_norm": 0.0011207705829292536, + "learning_rate": 6.312812703937351e-06, + "loss": 0.0022, + "step": 339000 + }, + { + "epoch": 18.463128127039372, + "grad_norm": 0.0714796632528305, + "learning_rate": 6.3073743745921255e-06, + "loss": 0.0023, + "step": 339500 + }, + { + "epoch": 18.4903197737655, + "grad_norm": 0.2758328318595886, + "learning_rate": 6.3019360452469e-06, + "loss": 0.0029, + "step": 340000 + }, + { + "epoch": 18.517511420491626, + "grad_norm": 0.0030946761835366488, + "learning_rate": 6.296497715901676e-06, + "loss": 0.0025, + "step": 340500 + }, + { + "epoch": 18.54470306721775, + "grad_norm": 0.001809292589314282, + "learning_rate": 6.29105938655645e-06, + "loss": 0.003, + "step": 341000 + }, + { + "epoch": 18.571894713943877, + "grad_norm": 1.4280059337615967, + "learning_rate": 6.2856210572112255e-06, + "loss": 0.0022, + "step": 341500 + }, + { + "epoch": 18.599086360670004, + "grad_norm": 0.003949850331991911, + "learning_rate": 6.280182727866e-06, + "loss": 0.0022, + "step": 342000 + }, + { + "epoch": 18.626278007396127, + "grad_norm": 0.008224201388657093, + "learning_rate": 6.274744398520774e-06, + "loss": 0.0024, + "step": 342500 + }, + { + "epoch": 18.653469654122254, + "grad_norm": 0.0052871499210596085, + "learning_rate": 6.26930606917555e-06, + "loss": 0.0026, + "step": 343000 + }, + { + "epoch": 18.680661300848378, + "grad_norm": 0.0848020538687706, + "learning_rate": 6.263867739830325e-06, + "loss": 0.0023, + "step": 343500 + }, + { + "epoch": 18.707852947574505, + "grad_norm": 0.00029155545053072274, + "learning_rate": 6.2584294104851e-06, + "loss": 0.0024, + "step": 344000 + }, + { + "epoch": 18.735044594300632, + "grad_norm": 0.00019738083938136697, + "learning_rate": 6.252991081139874e-06, + "loss": 0.0021, + "step": 344500 + }, + { + "epoch": 18.762236241026756, + "grad_norm": 0.009960692375898361, + "learning_rate": 6.2475527517946485e-06, + "loss": 0.0032, + "step": 345000 + }, + { + "epoch": 18.789427887752883, + "grad_norm": 0.03927024081349373, + "learning_rate": 6.242114422449425e-06, + "loss": 0.0027, + "step": 345500 + }, + { + "epoch": 18.81661953447901, + "grad_norm": 1.062012791633606, + "learning_rate": 6.236676093104199e-06, + "loss": 0.002, + "step": 346000 + }, + { + "epoch": 18.843811181205133, + "grad_norm": 7.133814506232738e-05, + "learning_rate": 6.231237763758974e-06, + "loss": 0.0022, + "step": 346500 + }, + { + "epoch": 18.87100282793126, + "grad_norm": 0.00028946486418135464, + "learning_rate": 6.2257994344137485e-06, + "loss": 0.0036, + "step": 347000 + }, + { + "epoch": 18.898194474657384, + "grad_norm": 0.0008498073439113796, + "learning_rate": 6.220361105068523e-06, + "loss": 0.002, + "step": 347500 + }, + { + "epoch": 18.92538612138351, + "grad_norm": 0.009209150448441505, + "learning_rate": 6.214922775723299e-06, + "loss": 0.0022, + "step": 348000 + }, + { + "epoch": 18.952577768109638, + "grad_norm": 0.0022132894955575466, + "learning_rate": 6.209484446378073e-06, + "loss": 0.0019, + "step": 348500 + }, + { + "epoch": 18.97976941483576, + "grad_norm": 0.09578939527273178, + "learning_rate": 6.2040461170328485e-06, + "loss": 0.0033, + "step": 349000 + }, + { + "epoch": 19.0, + "eval_exact_match": 0.89, + "eval_loss": 0.36704039573669434, + "eval_runtime": 1019.123, + "eval_samples_per_second": 11.164, + "eval_steps_per_second": 0.699, + "step": 349372 + } + ], + "logging_steps": 500, + "max_steps": 919400, + "num_input_tokens_seen": 0, + "num_train_epochs": 50, + "save_steps": 500, + "stateful_callbacks": { + "EarlyStoppingCallback": { + "args": { + "early_stopping_patience": 5, + "early_stopping_threshold": 0.0 + }, + "attributes": { + "early_stopping_patience_counter": 0 + } + }, + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 2.0577080599658496e+17, + "train_batch_size": 16, + "trial_name": null, + "trial_params": null +}