{ "best_metric": 1.0753824710845947, "best_model_checkpoint": "/root/finetuning_executions/finetuning_02_codet5p_src_fm_fc_ms_ff/checkpoint-17548", "epoch": 5.0, "eval_steps": 500, "global_step": 87740, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.02, "grad_norm": 1.5287591218948364, "learning_rate": 2.4687500000000004e-05, "loss": 1.4862, "step": 400 }, { "epoch": 0.05, "grad_norm": 1.919360876083374, "learning_rate": 4.96875e-05, "loss": 1.1714, "step": 800 }, { "epoch": 0.07, "grad_norm": 1.1510220766067505, "learning_rate": 4.977283183804923e-05, "loss": 1.1326, "step": 1200 }, { "epoch": 0.09, "grad_norm": 1.1849422454833984, "learning_rate": 4.9542788129744654e-05, "loss": 1.1176, "step": 1600 }, { "epoch": 0.11, "grad_norm": 1.052920937538147, "learning_rate": 4.931274442144008e-05, "loss": 1.0981, "step": 2000 }, { "epoch": 0.14, "grad_norm": 1.174275517463684, "learning_rate": 4.90827007131355e-05, "loss": 1.0811, "step": 2400 }, { "epoch": 0.16, "grad_norm": 1.0344840288162231, "learning_rate": 4.885265700483092e-05, "loss": 1.065, "step": 2800 }, { "epoch": 0.18, "grad_norm": 1.2671674489974976, "learning_rate": 4.862261329652634e-05, "loss": 1.0578, "step": 3200 }, { "epoch": 0.21, "grad_norm": 1.1277002096176147, "learning_rate": 4.839256958822176e-05, "loss": 1.0421, "step": 3600 }, { "epoch": 0.23, "grad_norm": 1.1894861459732056, "learning_rate": 4.8162525879917186e-05, "loss": 1.031, "step": 4000 }, { "epoch": 0.25, "grad_norm": 1.2189041376113892, "learning_rate": 4.793248217161261e-05, "loss": 1.0322, "step": 4400 }, { "epoch": 0.27, "grad_norm": 1.2372210025787354, "learning_rate": 4.770243846330803e-05, "loss": 1.0155, "step": 4800 }, { "epoch": 0.3, "grad_norm": 1.2500073909759521, "learning_rate": 4.7472394755003454e-05, "loss": 1.0211, "step": 5200 }, { "epoch": 0.32, "grad_norm": 0.9148824214935303, "learning_rate": 4.724235104669887e-05, "loss": 1.0001, "step": 5600 }, { "epoch": 0.34, "grad_norm": 1.1473156213760376, "learning_rate": 4.7012307338394294e-05, "loss": 0.9869, "step": 6000 }, { "epoch": 0.36, "grad_norm": 1.1870834827423096, "learning_rate": 4.6782263630089717e-05, "loss": 0.9799, "step": 6400 }, { "epoch": 0.39, "grad_norm": 1.1499440670013428, "learning_rate": 4.655221992178514e-05, "loss": 0.9745, "step": 6800 }, { "epoch": 0.41, "grad_norm": 1.0729453563690186, "learning_rate": 4.632217621348056e-05, "loss": 0.9871, "step": 7200 }, { "epoch": 0.43, "grad_norm": 1.3007827997207642, "learning_rate": 4.6092132505175986e-05, "loss": 0.9612, "step": 7600 }, { "epoch": 0.46, "grad_norm": 1.1860408782958984, "learning_rate": 4.586208879687141e-05, "loss": 0.9636, "step": 8000 }, { "epoch": 0.48, "grad_norm": 1.0349955558776855, "learning_rate": 4.5632045088566825e-05, "loss": 0.9645, "step": 8400 }, { "epoch": 0.5, "grad_norm": 1.3005322217941284, "learning_rate": 4.5402001380262254e-05, "loss": 0.9536, "step": 8800 }, { "epoch": 0.52, "grad_norm": 1.2307965755462646, "learning_rate": 4.517195767195768e-05, "loss": 0.9474, "step": 9200 }, { "epoch": 0.55, "grad_norm": 1.0385469198226929, "learning_rate": 4.49419139636531e-05, "loss": 0.9402, "step": 9600 }, { "epoch": 0.57, "grad_norm": 1.1734727621078491, "learning_rate": 4.471187025534852e-05, "loss": 0.9321, "step": 10000 }, { "epoch": 0.59, "grad_norm": 1.3363800048828125, "learning_rate": 4.448182654704394e-05, "loss": 0.9192, "step": 10400 }, { "epoch": 0.62, "grad_norm": 1.073585033416748, "learning_rate": 4.425178283873936e-05, "loss": 0.9378, "step": 10800 }, { "epoch": 0.64, "grad_norm": 1.0610324144363403, "learning_rate": 4.4021739130434786e-05, "loss": 0.9187, "step": 11200 }, { "epoch": 0.66, "grad_norm": 1.039048194885254, "learning_rate": 4.379169542213021e-05, "loss": 0.9191, "step": 11600 }, { "epoch": 0.68, "grad_norm": 1.0391401052474976, "learning_rate": 4.356165171382563e-05, "loss": 0.91, "step": 12000 }, { "epoch": 0.71, "grad_norm": 1.082083821296692, "learning_rate": 4.3331608005521054e-05, "loss": 0.9166, "step": 12400 }, { "epoch": 0.73, "grad_norm": 1.0464677810668945, "learning_rate": 4.310156429721647e-05, "loss": 0.9234, "step": 12800 }, { "epoch": 0.75, "grad_norm": 1.0795680284500122, "learning_rate": 4.2871520588911894e-05, "loss": 0.9004, "step": 13200 }, { "epoch": 0.78, "grad_norm": 1.2177696228027344, "learning_rate": 4.2641476880607317e-05, "loss": 0.8991, "step": 13600 }, { "epoch": 0.8, "grad_norm": 0.9279542565345764, "learning_rate": 4.241143317230274e-05, "loss": 0.901, "step": 14000 }, { "epoch": 0.82, "grad_norm": 1.2393149137496948, "learning_rate": 4.218138946399816e-05, "loss": 0.8898, "step": 14400 }, { "epoch": 0.84, "grad_norm": 1.2811025381088257, "learning_rate": 4.1951920864964345e-05, "loss": 0.8975, "step": 14800 }, { "epoch": 0.87, "grad_norm": 1.0508288145065308, "learning_rate": 4.172187715665977e-05, "loss": 0.897, "step": 15200 }, { "epoch": 0.89, "grad_norm": 0.962242066860199, "learning_rate": 4.149183344835519e-05, "loss": 0.8776, "step": 15600 }, { "epoch": 0.91, "grad_norm": 0.9615252017974854, "learning_rate": 4.126178974005061e-05, "loss": 0.873, "step": 16000 }, { "epoch": 0.93, "grad_norm": 1.040337324142456, "learning_rate": 4.103174603174603e-05, "loss": 0.8831, "step": 16400 }, { "epoch": 0.96, "grad_norm": 1.0600088834762573, "learning_rate": 4.0801702323441453e-05, "loss": 0.8759, "step": 16800 }, { "epoch": 0.98, "grad_norm": 0.9814367890357971, "learning_rate": 4.0572233724407636e-05, "loss": 0.8732, "step": 17200 }, { "epoch": 1.0, "eval_loss": 1.0753824710845947, "eval_runtime": 239.6966, "eval_samples_per_second": 251.464, "eval_steps_per_second": 3.93, "step": 17548 }, { "epoch": 1.0, "grad_norm": 1.055283784866333, "learning_rate": 4.034219001610306e-05, "loss": 0.8697, "step": 17600 }, { "epoch": 1.03, "grad_norm": 1.1038569211959839, "learning_rate": 4.011272141706924e-05, "loss": 0.8246, "step": 18000 }, { "epoch": 1.05, "grad_norm": 0.9692428708076477, "learning_rate": 3.9882677708764665e-05, "loss": 0.8284, "step": 18400 }, { "epoch": 1.07, "grad_norm": 1.093485951423645, "learning_rate": 3.965263400046009e-05, "loss": 0.8271, "step": 18800 }, { "epoch": 1.09, "grad_norm": 1.1435869932174683, "learning_rate": 3.942259029215551e-05, "loss": 0.8198, "step": 19200 }, { "epoch": 1.12, "grad_norm": 1.389695644378662, "learning_rate": 3.9192546583850934e-05, "loss": 0.8223, "step": 19600 }, { "epoch": 1.14, "grad_norm": 1.081563949584961, "learning_rate": 3.896307798481712e-05, "loss": 0.8078, "step": 20000 }, { "epoch": 1.16, "grad_norm": 1.20356023311615, "learning_rate": 3.873303427651253e-05, "loss": 0.8216, "step": 20400 }, { "epoch": 1.19, "grad_norm": 1.2045621871948242, "learning_rate": 3.850299056820796e-05, "loss": 0.8222, "step": 20800 }, { "epoch": 1.21, "grad_norm": 0.969454824924469, "learning_rate": 3.8272946859903386e-05, "loss": 0.803, "step": 21200 }, { "epoch": 1.23, "grad_norm": 1.2209794521331787, "learning_rate": 3.804290315159881e-05, "loss": 0.8115, "step": 21600 }, { "epoch": 1.25, "grad_norm": 1.0688341856002808, "learning_rate": 3.781285944329423e-05, "loss": 0.8051, "step": 22000 }, { "epoch": 1.28, "grad_norm": 1.1031506061553955, "learning_rate": 3.7582815734989655e-05, "loss": 0.8059, "step": 22400 }, { "epoch": 1.3, "grad_norm": 0.9878343939781189, "learning_rate": 3.735277202668507e-05, "loss": 0.8054, "step": 22800 }, { "epoch": 1.32, "grad_norm": 1.327987790107727, "learning_rate": 3.7122728318380494e-05, "loss": 0.8131, "step": 23200 }, { "epoch": 1.34, "grad_norm": 1.0833244323730469, "learning_rate": 3.689268461007592e-05, "loss": 0.7936, "step": 23600 }, { "epoch": 1.37, "grad_norm": 1.1618777513504028, "learning_rate": 3.666264090177134e-05, "loss": 0.7991, "step": 24000 }, { "epoch": 1.39, "grad_norm": 1.022359013557434, "learning_rate": 3.643259719346676e-05, "loss": 0.8002, "step": 24400 }, { "epoch": 1.41, "grad_norm": 1.2475693225860596, "learning_rate": 3.6202553485162186e-05, "loss": 0.8001, "step": 24800 }, { "epoch": 1.44, "grad_norm": 1.1127784252166748, "learning_rate": 3.59725097768576e-05, "loss": 0.7865, "step": 25200 }, { "epoch": 1.46, "grad_norm": 1.2091097831726074, "learning_rate": 3.5742466068553025e-05, "loss": 0.7899, "step": 25600 }, { "epoch": 1.48, "grad_norm": 0.9588549733161926, "learning_rate": 3.551242236024845e-05, "loss": 0.7942, "step": 26000 }, { "epoch": 1.5, "grad_norm": 1.195241093635559, "learning_rate": 3.528237865194387e-05, "loss": 0.7813, "step": 26400 }, { "epoch": 1.53, "grad_norm": 0.9788525700569153, "learning_rate": 3.5052334943639294e-05, "loss": 0.7805, "step": 26800 }, { "epoch": 1.55, "grad_norm": 1.2794181108474731, "learning_rate": 3.482286634460548e-05, "loss": 0.7763, "step": 27200 }, { "epoch": 1.57, "grad_norm": 0.9700046181678772, "learning_rate": 3.45928226363009e-05, "loss": 0.7801, "step": 27600 }, { "epoch": 1.6, "grad_norm": 1.2326452732086182, "learning_rate": 3.436335403726708e-05, "loss": 0.7864, "step": 28000 }, { "epoch": 1.62, "grad_norm": 1.2367639541625977, "learning_rate": 3.4133310328962506e-05, "loss": 0.7845, "step": 28400 }, { "epoch": 1.64, "grad_norm": 1.077854871749878, "learning_rate": 3.390326662065793e-05, "loss": 0.7869, "step": 28800 }, { "epoch": 1.66, "grad_norm": 1.0575716495513916, "learning_rate": 3.3673222912353345e-05, "loss": 0.7838, "step": 29200 }, { "epoch": 1.69, "grad_norm": 1.1674555540084839, "learning_rate": 3.344317920404877e-05, "loss": 0.7827, "step": 29600 }, { "epoch": 1.71, "grad_norm": 1.148335337638855, "learning_rate": 3.321313549574419e-05, "loss": 0.7781, "step": 30000 }, { "epoch": 1.73, "grad_norm": 1.0287448167800903, "learning_rate": 3.2983091787439614e-05, "loss": 0.7652, "step": 30400 }, { "epoch": 1.76, "grad_norm": 1.2461556196212769, "learning_rate": 3.275304807913504e-05, "loss": 0.7773, "step": 30800 }, { "epoch": 1.78, "grad_norm": 1.1946007013320923, "learning_rate": 3.252357948010122e-05, "loss": 0.7694, "step": 31200 }, { "epoch": 1.8, "grad_norm": 1.019499659538269, "learning_rate": 3.229353577179664e-05, "loss": 0.7803, "step": 31600 }, { "epoch": 1.82, "grad_norm": 1.3375366926193237, "learning_rate": 3.2063492063492065e-05, "loss": 0.7684, "step": 32000 }, { "epoch": 1.85, "grad_norm": 1.2477443218231201, "learning_rate": 3.183344835518749e-05, "loss": 0.7657, "step": 32400 }, { "epoch": 1.87, "grad_norm": 1.1749552488327026, "learning_rate": 3.160340464688291e-05, "loss": 0.767, "step": 32800 }, { "epoch": 1.89, "grad_norm": 1.0863006114959717, "learning_rate": 3.1373360938578334e-05, "loss": 0.767, "step": 33200 }, { "epoch": 1.91, "grad_norm": 0.9976168870925903, "learning_rate": 3.114389233954452e-05, "loss": 0.7536, "step": 33600 }, { "epoch": 1.94, "grad_norm": 1.1924540996551514, "learning_rate": 3.09144237405107e-05, "loss": 0.7622, "step": 34000 }, { "epoch": 1.96, "grad_norm": 1.0996850728988647, "learning_rate": 3.068438003220612e-05, "loss": 0.7569, "step": 34400 }, { "epoch": 1.98, "grad_norm": 1.2163282632827759, "learning_rate": 3.0454336323901546e-05, "loss": 0.7667, "step": 34800 }, { "epoch": 2.0, "eval_loss": 1.0829898118972778, "eval_runtime": 239.7954, "eval_samples_per_second": 251.36, "eval_steps_per_second": 3.928, "step": 35096 }, { "epoch": 2.01, "grad_norm": 1.1651737689971924, "learning_rate": 3.0224292615596966e-05, "loss": 0.7442, "step": 35200 }, { "epoch": 2.03, "grad_norm": 1.1764894723892212, "learning_rate": 2.999424890729239e-05, "loss": 0.714, "step": 35600 }, { "epoch": 2.05, "grad_norm": 1.1951353549957275, "learning_rate": 2.976420519898781e-05, "loss": 0.7076, "step": 36000 }, { "epoch": 2.07, "grad_norm": 1.1282097101211548, "learning_rate": 2.953416149068323e-05, "loss": 0.7105, "step": 36400 }, { "epoch": 2.1, "grad_norm": 1.3397319316864014, "learning_rate": 2.9304117782378654e-05, "loss": 0.7023, "step": 36800 }, { "epoch": 2.12, "grad_norm": 1.1150188446044922, "learning_rate": 2.9074074074074077e-05, "loss": 0.7035, "step": 37200 }, { "epoch": 2.14, "grad_norm": 1.2119678258895874, "learning_rate": 2.8844030365769497e-05, "loss": 0.7168, "step": 37600 }, { "epoch": 2.17, "grad_norm": 1.167506456375122, "learning_rate": 2.861398665746492e-05, "loss": 0.7125, "step": 38000 }, { "epoch": 2.19, "grad_norm": 1.0915708541870117, "learning_rate": 2.8384518058431102e-05, "loss": 0.7101, "step": 38400 }, { "epoch": 2.21, "grad_norm": 1.135021686553955, "learning_rate": 2.8154474350126525e-05, "loss": 0.7145, "step": 38800 }, { "epoch": 2.23, "grad_norm": 1.3739718198776245, "learning_rate": 2.792443064182195e-05, "loss": 0.7096, "step": 39200 }, { "epoch": 2.26, "grad_norm": 1.1629129648208618, "learning_rate": 2.7694386933517368e-05, "loss": 0.7053, "step": 39600 }, { "epoch": 2.28, "grad_norm": 0.9963687062263489, "learning_rate": 2.746434322521279e-05, "loss": 0.7012, "step": 40000 }, { "epoch": 2.3, "grad_norm": 1.0318909883499146, "learning_rate": 2.7234874626178974e-05, "loss": 0.713, "step": 40400 }, { "epoch": 2.33, "grad_norm": 1.0613532066345215, "learning_rate": 2.7004830917874397e-05, "loss": 0.704, "step": 40800 }, { "epoch": 2.35, "grad_norm": 1.1298637390136719, "learning_rate": 2.677478720956982e-05, "loss": 0.708, "step": 41200 }, { "epoch": 2.37, "grad_norm": 1.1079801321029663, "learning_rate": 2.654474350126524e-05, "loss": 0.6975, "step": 41600 }, { "epoch": 2.39, "grad_norm": 1.0751113891601562, "learning_rate": 2.6314699792960662e-05, "loss": 0.6999, "step": 42000 }, { "epoch": 2.42, "grad_norm": 1.1240077018737793, "learning_rate": 2.6085231193926845e-05, "loss": 0.7055, "step": 42400 }, { "epoch": 2.44, "grad_norm": 1.0788402557373047, "learning_rate": 2.5855187485622268e-05, "loss": 0.7055, "step": 42800 }, { "epoch": 2.46, "grad_norm": 1.00369131565094, "learning_rate": 2.562514377731769e-05, "loss": 0.6949, "step": 43200 }, { "epoch": 2.48, "grad_norm": 1.1382017135620117, "learning_rate": 2.539510006901311e-05, "loss": 0.7093, "step": 43600 }, { "epoch": 2.51, "grad_norm": 1.0273314714431763, "learning_rate": 2.5165056360708534e-05, "loss": 0.7066, "step": 44000 }, { "epoch": 2.53, "grad_norm": 1.331964373588562, "learning_rate": 2.4935012652403957e-05, "loss": 0.7037, "step": 44400 }, { "epoch": 2.55, "grad_norm": 1.102133870124817, "learning_rate": 2.470496894409938e-05, "loss": 0.7028, "step": 44800 }, { "epoch": 2.58, "grad_norm": 1.131090521812439, "learning_rate": 2.4474925235794803e-05, "loss": 0.6871, "step": 45200 }, { "epoch": 2.6, "grad_norm": 1.1939336061477661, "learning_rate": 2.4244881527490222e-05, "loss": 0.6966, "step": 45600 }, { "epoch": 2.62, "grad_norm": 1.344831109046936, "learning_rate": 2.4014837819185645e-05, "loss": 0.6933, "step": 46000 }, { "epoch": 2.64, "grad_norm": 0.9559622406959534, "learning_rate": 2.3784794110881068e-05, "loss": 0.6916, "step": 46400 }, { "epoch": 2.67, "grad_norm": 1.182010293006897, "learning_rate": 2.355475040257649e-05, "loss": 0.6903, "step": 46800 }, { "epoch": 2.69, "grad_norm": 1.080712080001831, "learning_rate": 2.3325281803542674e-05, "loss": 0.6965, "step": 47200 }, { "epoch": 2.71, "grad_norm": 1.2468616962432861, "learning_rate": 2.3095238095238097e-05, "loss": 0.6906, "step": 47600 }, { "epoch": 2.74, "grad_norm": 1.0585706233978271, "learning_rate": 2.286519438693352e-05, "loss": 0.6966, "step": 48000 }, { "epoch": 2.76, "grad_norm": 1.2725940942764282, "learning_rate": 2.2635150678628943e-05, "loss": 0.6894, "step": 48400 }, { "epoch": 2.78, "grad_norm": 1.1753593683242798, "learning_rate": 2.2405106970324362e-05, "loss": 0.6806, "step": 48800 }, { "epoch": 2.8, "grad_norm": 1.117319941520691, "learning_rate": 2.2175063262019785e-05, "loss": 0.6879, "step": 49200 }, { "epoch": 2.83, "grad_norm": 1.2521744966506958, "learning_rate": 2.194501955371521e-05, "loss": 0.6808, "step": 49600 }, { "epoch": 2.85, "grad_norm": 1.396971344947815, "learning_rate": 2.1714975845410628e-05, "loss": 0.6798, "step": 50000 }, { "epoch": 2.87, "grad_norm": 1.0855846405029297, "learning_rate": 2.148493213710605e-05, "loss": 0.6978, "step": 50400 }, { "epoch": 2.89, "grad_norm": 1.199013113975525, "learning_rate": 2.1254888428801474e-05, "loss": 0.6882, "step": 50800 }, { "epoch": 2.92, "grad_norm": 1.366407871246338, "learning_rate": 2.1024844720496894e-05, "loss": 0.6882, "step": 51200 }, { "epoch": 2.94, "grad_norm": 1.1709498167037964, "learning_rate": 2.0794801012192317e-05, "loss": 0.6907, "step": 51600 }, { "epoch": 2.96, "grad_norm": 1.1881307363510132, "learning_rate": 2.05653324131585e-05, "loss": 0.6883, "step": 52000 }, { "epoch": 2.99, "grad_norm": 1.4105783700942993, "learning_rate": 2.0335288704853922e-05, "loss": 0.6833, "step": 52400 }, { "epoch": 3.0, "eval_loss": 1.0844900608062744, "eval_runtime": 239.8565, "eval_samples_per_second": 251.296, "eval_steps_per_second": 3.927, "step": 52644 }, { "epoch": 3.01, "grad_norm": 1.4675981998443604, "learning_rate": 2.0105244996549345e-05, "loss": 0.6679, "step": 52800 }, { "epoch": 3.03, "grad_norm": 1.151491403579712, "learning_rate": 1.9875201288244768e-05, "loss": 0.6501, "step": 53200 }, { "epoch": 3.05, "grad_norm": 1.0938260555267334, "learning_rate": 1.964515757994019e-05, "loss": 0.6396, "step": 53600 }, { "epoch": 3.08, "grad_norm": 1.055185317993164, "learning_rate": 1.941511387163561e-05, "loss": 0.6442, "step": 54000 }, { "epoch": 3.1, "grad_norm": 1.0307785272598267, "learning_rate": 1.9185645272601797e-05, "loss": 0.6489, "step": 54400 }, { "epoch": 3.12, "grad_norm": 1.184102177619934, "learning_rate": 1.8955601564297217e-05, "loss": 0.6454, "step": 54800 }, { "epoch": 3.15, "grad_norm": 1.1798542737960815, "learning_rate": 1.872555785599264e-05, "loss": 0.6552, "step": 55200 }, { "epoch": 3.17, "grad_norm": 1.1375089883804321, "learning_rate": 1.8496089256958822e-05, "loss": 0.6359, "step": 55600 }, { "epoch": 3.19, "grad_norm": 1.0475974082946777, "learning_rate": 1.8266045548654245e-05, "loss": 0.6374, "step": 56000 }, { "epoch": 3.21, "grad_norm": 1.0948106050491333, "learning_rate": 1.803600184034967e-05, "loss": 0.6431, "step": 56400 }, { "epoch": 3.24, "grad_norm": 1.1488378047943115, "learning_rate": 1.7805958132045088e-05, "loss": 0.646, "step": 56800 }, { "epoch": 3.26, "grad_norm": 1.1257692575454712, "learning_rate": 1.757591442374051e-05, "loss": 0.6408, "step": 57200 }, { "epoch": 3.28, "grad_norm": 1.1101455688476562, "learning_rate": 1.7345870715435934e-05, "loss": 0.6389, "step": 57600 }, { "epoch": 3.31, "grad_norm": 1.329904556274414, "learning_rate": 1.7115827007131354e-05, "loss": 0.6399, "step": 58000 }, { "epoch": 3.33, "grad_norm": 1.2944815158843994, "learning_rate": 1.6885783298826777e-05, "loss": 0.6421, "step": 58400 }, { "epoch": 3.35, "grad_norm": 1.1607027053833008, "learning_rate": 1.6655739590522203e-05, "loss": 0.637, "step": 58800 }, { "epoch": 3.37, "grad_norm": 1.0392543077468872, "learning_rate": 1.6426270991488382e-05, "loss": 0.6411, "step": 59200 }, { "epoch": 3.4, "grad_norm": 1.3244273662567139, "learning_rate": 1.6196227283183805e-05, "loss": 0.6473, "step": 59600 }, { "epoch": 3.42, "grad_norm": 1.1351373195648193, "learning_rate": 1.5966183574879228e-05, "loss": 0.6298, "step": 60000 }, { "epoch": 3.44, "grad_norm": 1.1698590517044067, "learning_rate": 1.573613986657465e-05, "loss": 0.6355, "step": 60400 }, { "epoch": 3.46, "grad_norm": 1.2005553245544434, "learning_rate": 1.5506671267540834e-05, "loss": 0.6395, "step": 60800 }, { "epoch": 3.49, "grad_norm": 0.97503662109375, "learning_rate": 1.5276627559236257e-05, "loss": 0.6437, "step": 61200 }, { "epoch": 3.51, "grad_norm": 1.2518908977508545, "learning_rate": 1.5046583850931678e-05, "loss": 0.6385, "step": 61600 }, { "epoch": 3.53, "grad_norm": 1.2661454677581787, "learning_rate": 1.48165401426271e-05, "loss": 0.6403, "step": 62000 }, { "epoch": 3.56, "grad_norm": 1.2612046003341675, "learning_rate": 1.4586496434322523e-05, "loss": 0.6442, "step": 62400 }, { "epoch": 3.58, "grad_norm": 1.1942335367202759, "learning_rate": 1.4356452726017944e-05, "loss": 0.6383, "step": 62800 }, { "epoch": 3.6, "grad_norm": 1.1030133962631226, "learning_rate": 1.4126409017713365e-05, "loss": 0.6277, "step": 63200 }, { "epoch": 3.62, "grad_norm": 1.2485852241516113, "learning_rate": 1.3896365309408788e-05, "loss": 0.6414, "step": 63600 }, { "epoch": 3.65, "grad_norm": 0.9925839900970459, "learning_rate": 1.366632160110421e-05, "loss": 0.6337, "step": 64000 }, { "epoch": 3.67, "grad_norm": 1.3896905183792114, "learning_rate": 1.343627789279963e-05, "loss": 0.6314, "step": 64400 }, { "epoch": 3.69, "grad_norm": 1.1392475366592407, "learning_rate": 1.3206809293765815e-05, "loss": 0.6312, "step": 64800 }, { "epoch": 3.72, "grad_norm": 1.2051880359649658, "learning_rate": 1.2976765585461237e-05, "loss": 0.6198, "step": 65200 }, { "epoch": 3.74, "grad_norm": 1.3581410646438599, "learning_rate": 1.2746721877156661e-05, "loss": 0.634, "step": 65600 }, { "epoch": 3.76, "grad_norm": 1.4071406126022339, "learning_rate": 1.2516678168852084e-05, "loss": 0.633, "step": 66000 }, { "epoch": 3.78, "grad_norm": 1.1921656131744385, "learning_rate": 1.2286634460547504e-05, "loss": 0.6206, "step": 66400 }, { "epoch": 3.81, "grad_norm": 1.4039461612701416, "learning_rate": 1.2056590752242927e-05, "loss": 0.6341, "step": 66800 }, { "epoch": 3.83, "grad_norm": 1.3369255065917969, "learning_rate": 1.182654704393835e-05, "loss": 0.6427, "step": 67200 }, { "epoch": 3.85, "grad_norm": 1.2129446268081665, "learning_rate": 1.1596503335633771e-05, "loss": 0.6293, "step": 67600 }, { "epoch": 3.88, "grad_norm": 1.264256238937378, "learning_rate": 1.1366459627329192e-05, "loss": 0.6282, "step": 68000 }, { "epoch": 3.9, "grad_norm": 1.1778966188430786, "learning_rate": 1.1136415919024615e-05, "loss": 0.6383, "step": 68400 }, { "epoch": 3.92, "grad_norm": 1.045240044593811, "learning_rate": 1.0906372210720037e-05, "loss": 0.6315, "step": 68800 }, { "epoch": 3.94, "grad_norm": 1.2942785024642944, "learning_rate": 1.0676903611686221e-05, "loss": 0.6276, "step": 69200 }, { "epoch": 3.97, "grad_norm": 1.2519258260726929, "learning_rate": 1.0446859903381644e-05, "loss": 0.6228, "step": 69600 }, { "epoch": 3.99, "grad_norm": 1.2884622812271118, "learning_rate": 1.0216816195077065e-05, "loss": 0.6234, "step": 70000 }, { "epoch": 4.0, "eval_loss": 1.09929621219635, "eval_runtime": 239.9825, "eval_samples_per_second": 251.164, "eval_steps_per_second": 3.925, "step": 70192 }, { "epoch": 4.01, "grad_norm": 1.029523253440857, "learning_rate": 9.986772486772487e-06, "loss": 0.607, "step": 70400 }, { "epoch": 4.03, "grad_norm": 1.1874665021896362, "learning_rate": 9.75672877846791e-06, "loss": 0.5992, "step": 70800 }, { "epoch": 4.06, "grad_norm": 1.3719263076782227, "learning_rate": 9.526685070163331e-06, "loss": 0.5932, "step": 71200 }, { "epoch": 4.08, "grad_norm": 1.1106728315353394, "learning_rate": 9.296641361858754e-06, "loss": 0.6082, "step": 71600 }, { "epoch": 4.1, "grad_norm": 1.1333997249603271, "learning_rate": 9.066597653554177e-06, "loss": 0.5958, "step": 72000 }, { "epoch": 4.13, "grad_norm": 1.2606267929077148, "learning_rate": 8.837129054520358e-06, "loss": 0.6015, "step": 72400 }, { "epoch": 4.15, "grad_norm": 1.123744249343872, "learning_rate": 8.607085346215783e-06, "loss": 0.6011, "step": 72800 }, { "epoch": 4.17, "grad_norm": 1.155521273612976, "learning_rate": 8.377041637911204e-06, "loss": 0.5973, "step": 73200 }, { "epoch": 4.19, "grad_norm": 1.1591954231262207, "learning_rate": 8.146997929606625e-06, "loss": 0.5924, "step": 73600 }, { "epoch": 4.22, "grad_norm": 1.3380868434906006, "learning_rate": 7.916954221302048e-06, "loss": 0.5983, "step": 74000 }, { "epoch": 4.24, "grad_norm": 1.2216105461120605, "learning_rate": 7.68691051299747e-06, "loss": 0.5947, "step": 74400 }, { "epoch": 4.26, "grad_norm": 1.0791873931884766, "learning_rate": 7.4568668046928916e-06, "loss": 0.6054, "step": 74800 }, { "epoch": 4.29, "grad_norm": 1.1365481615066528, "learning_rate": 7.2268230963883145e-06, "loss": 0.6092, "step": 75200 }, { "epoch": 4.31, "grad_norm": 1.1376712322235107, "learning_rate": 6.997354497354498e-06, "loss": 0.5942, "step": 75600 }, { "epoch": 4.33, "grad_norm": 1.1192513704299927, "learning_rate": 6.76731078904992e-06, "loss": 0.5955, "step": 76000 }, { "epoch": 4.35, "grad_norm": 1.1927390098571777, "learning_rate": 6.537267080745342e-06, "loss": 0.5901, "step": 76400 }, { "epoch": 4.38, "grad_norm": 1.236060619354248, "learning_rate": 6.307223372440764e-06, "loss": 0.6, "step": 76800 }, { "epoch": 4.4, "grad_norm": 1.077643871307373, "learning_rate": 6.077179664136186e-06, "loss": 0.6088, "step": 77200 }, { "epoch": 4.42, "grad_norm": 1.3172234296798706, "learning_rate": 5.847135955831608e-06, "loss": 0.5944, "step": 77600 }, { "epoch": 4.44, "grad_norm": 1.2222837209701538, "learning_rate": 5.61709224752703e-06, "loss": 0.5976, "step": 78000 }, { "epoch": 4.47, "grad_norm": 1.2887938022613525, "learning_rate": 5.387048539222452e-06, "loss": 0.6023, "step": 78400 }, { "epoch": 4.49, "grad_norm": 1.1380060911178589, "learning_rate": 5.157579940188636e-06, "loss": 0.5938, "step": 78800 }, { "epoch": 4.51, "grad_norm": 1.2178806066513062, "learning_rate": 4.927536231884058e-06, "loss": 0.5916, "step": 79200 }, { "epoch": 4.54, "grad_norm": 1.2010163068771362, "learning_rate": 4.69749252357948e-06, "loss": 0.5891, "step": 79600 }, { "epoch": 4.56, "grad_norm": 1.2172470092773438, "learning_rate": 4.467448815274902e-06, "loss": 0.6019, "step": 80000 }, { "epoch": 4.58, "grad_norm": 1.2008330821990967, "learning_rate": 4.2374051069703245e-06, "loss": 0.596, "step": 80400 }, { "epoch": 4.6, "grad_norm": 1.3656328916549683, "learning_rate": 4.007936507936508e-06, "loss": 0.6001, "step": 80800 }, { "epoch": 4.63, "grad_norm": 1.336308240890503, "learning_rate": 3.7778927996319303e-06, "loss": 0.5912, "step": 81200 }, { "epoch": 4.65, "grad_norm": 1.1399625539779663, "learning_rate": 3.5478490913273524e-06, "loss": 0.5962, "step": 81600 }, { "epoch": 4.67, "grad_norm": 1.237598180770874, "learning_rate": 3.317805383022775e-06, "loss": 0.5869, "step": 82000 }, { "epoch": 4.7, "grad_norm": 1.1215174198150635, "learning_rate": 3.0877616747181967e-06, "loss": 0.5927, "step": 82400 }, { "epoch": 4.72, "grad_norm": 1.3274859189987183, "learning_rate": 2.857717966413619e-06, "loss": 0.6066, "step": 82800 }, { "epoch": 4.74, "grad_norm": 1.276289463043213, "learning_rate": 2.628249367379802e-06, "loss": 0.5994, "step": 83200 }, { "epoch": 4.76, "grad_norm": 1.154296636581421, "learning_rate": 2.3982056590752246e-06, "loss": 0.5907, "step": 83600 }, { "epoch": 4.79, "grad_norm": 1.1015737056732178, "learning_rate": 2.1681619507706463e-06, "loss": 0.6043, "step": 84000 }, { "epoch": 4.81, "grad_norm": 1.356696367263794, "learning_rate": 1.9381182424660685e-06, "loss": 0.5883, "step": 84400 }, { "epoch": 4.83, "grad_norm": 1.1508524417877197, "learning_rate": 1.7080745341614908e-06, "loss": 0.5899, "step": 84800 }, { "epoch": 4.86, "grad_norm": 1.1132149696350098, "learning_rate": 1.478030825856913e-06, "loss": 0.5994, "step": 85200 }, { "epoch": 4.88, "grad_norm": 1.306624174118042, "learning_rate": 1.2485622268230964e-06, "loss": 0.5891, "step": 85600 }, { "epoch": 4.9, "grad_norm": 1.234307050704956, "learning_rate": 1.0185185185185188e-06, "loss": 0.5916, "step": 86000 }, { "epoch": 4.92, "grad_norm": 1.0994372367858887, "learning_rate": 7.884748102139407e-07, "loss": 0.5908, "step": 86400 }, { "epoch": 4.95, "grad_norm": 1.2712494134902954, "learning_rate": 5.584311019093628e-07, "loss": 0.5963, "step": 86800 }, { "epoch": 4.97, "grad_norm": 1.2190104722976685, "learning_rate": 3.283873936047849e-07, "loss": 0.5902, "step": 87200 }, { "epoch": 4.99, "grad_norm": 1.3301359415054321, "learning_rate": 9.891879457096849e-08, "loss": 0.591, "step": 87600 }, { "epoch": 5.0, "eval_loss": 1.1109092235565186, "eval_runtime": 240.0172, "eval_samples_per_second": 251.128, "eval_steps_per_second": 3.925, "step": 87740 }, { "epoch": 5.0, "step": 87740, "total_flos": 1.7097588901675008e+18, "train_loss": 0.7401387438351292, "train_runtime": 31448.3717, "train_samples_per_second": 89.283, "train_steps_per_second": 2.79 } ], "logging_steps": 400, "max_steps": 87740, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 500, "total_flos": 1.7097588901675008e+18, "train_batch_size": 32, "trial_name": null, "trial_params": null }