{
"best_metric": 1.0753824710845947,
"best_model_checkpoint": "/root/finetuning_executions/finetuning_02_codet5p_src_fm_fc_ms_ff/checkpoint-17548",
"epoch": 5.0,
"eval_steps": 500,
"global_step": 87740,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.02,
"grad_norm": 1.5287591218948364,
"learning_rate": 2.4687500000000004e-05,
"loss": 1.4862,
"step": 400
},
{
"epoch": 0.05,
"grad_norm": 1.919360876083374,
"learning_rate": 4.96875e-05,
"loss": 1.1714,
"step": 800
},
{
"epoch": 0.07,
"grad_norm": 1.1510220766067505,
"learning_rate": 4.977283183804923e-05,
"loss": 1.1326,
"step": 1200
},
{
"epoch": 0.09,
"grad_norm": 1.1849422454833984,
"learning_rate": 4.9542788129744654e-05,
"loss": 1.1176,
"step": 1600
},
{
"epoch": 0.11,
"grad_norm": 1.052920937538147,
"learning_rate": 4.931274442144008e-05,
"loss": 1.0981,
"step": 2000
},
{
"epoch": 0.14,
"grad_norm": 1.174275517463684,
"learning_rate": 4.90827007131355e-05,
"loss": 1.0811,
"step": 2400
},
{
"epoch": 0.16,
"grad_norm": 1.0344840288162231,
"learning_rate": 4.885265700483092e-05,
"loss": 1.065,
"step": 2800
},
{
"epoch": 0.18,
"grad_norm": 1.2671674489974976,
"learning_rate": 4.862261329652634e-05,
"loss": 1.0578,
"step": 3200
},
{
"epoch": 0.21,
"grad_norm": 1.1277002096176147,
"learning_rate": 4.839256958822176e-05,
"loss": 1.0421,
"step": 3600
},
{
"epoch": 0.23,
"grad_norm": 1.1894861459732056,
"learning_rate": 4.8162525879917186e-05,
"loss": 1.031,
"step": 4000
},
{
"epoch": 0.25,
"grad_norm": 1.2189041376113892,
"learning_rate": 4.793248217161261e-05,
"loss": 1.0322,
"step": 4400
},
{
"epoch": 0.27,
"grad_norm": 1.2372210025787354,
"learning_rate": 4.770243846330803e-05,
"loss": 1.0155,
"step": 4800
},
{
"epoch": 0.3,
"grad_norm": 1.2500073909759521,
"learning_rate": 4.7472394755003454e-05,
"loss": 1.0211,
"step": 5200
},
{
"epoch": 0.32,
"grad_norm": 0.9148824214935303,
"learning_rate": 4.724235104669887e-05,
"loss": 1.0001,
"step": 5600
},
{
"epoch": 0.34,
"grad_norm": 1.1473156213760376,
"learning_rate": 4.7012307338394294e-05,
"loss": 0.9869,
"step": 6000
},
{
"epoch": 0.36,
"grad_norm": 1.1870834827423096,
"learning_rate": 4.6782263630089717e-05,
"loss": 0.9799,
"step": 6400
},
{
"epoch": 0.39,
"grad_norm": 1.1499440670013428,
"learning_rate": 4.655221992178514e-05,
"loss": 0.9745,
"step": 6800
},
{
"epoch": 0.41,
"grad_norm": 1.0729453563690186,
"learning_rate": 4.632217621348056e-05,
"loss": 0.9871,
"step": 7200
},
{
"epoch": 0.43,
"grad_norm": 1.3007827997207642,
"learning_rate": 4.6092132505175986e-05,
"loss": 0.9612,
"step": 7600
},
{
"epoch": 0.46,
"grad_norm": 1.1860408782958984,
"learning_rate": 4.586208879687141e-05,
"loss": 0.9636,
"step": 8000
},
{
"epoch": 0.48,
"grad_norm": 1.0349955558776855,
"learning_rate": 4.5632045088566825e-05,
"loss": 0.9645,
"step": 8400
},
{
"epoch": 0.5,
"grad_norm": 1.3005322217941284,
"learning_rate": 4.5402001380262254e-05,
"loss": 0.9536,
"step": 8800
},
{
"epoch": 0.52,
"grad_norm": 1.2307965755462646,
"learning_rate": 4.517195767195768e-05,
"loss": 0.9474,
"step": 9200
},
{
"epoch": 0.55,
"grad_norm": 1.0385469198226929,
"learning_rate": 4.49419139636531e-05,
"loss": 0.9402,
"step": 9600
},
{
"epoch": 0.57,
"grad_norm": 1.1734727621078491,
"learning_rate": 4.471187025534852e-05,
"loss": 0.9321,
"step": 10000
},
{
"epoch": 0.59,
"grad_norm": 1.3363800048828125,
"learning_rate": 4.448182654704394e-05,
"loss": 0.9192,
"step": 10400
},
{
"epoch": 0.62,
"grad_norm": 1.073585033416748,
"learning_rate": 4.425178283873936e-05,
"loss": 0.9378,
"step": 10800
},
{
"epoch": 0.64,
"grad_norm": 1.0610324144363403,
"learning_rate": 4.4021739130434786e-05,
"loss": 0.9187,
"step": 11200
},
{
"epoch": 0.66,
"grad_norm": 1.039048194885254,
"learning_rate": 4.379169542213021e-05,
"loss": 0.9191,
"step": 11600
},
{
"epoch": 0.68,
"grad_norm": 1.0391401052474976,
"learning_rate": 4.356165171382563e-05,
"loss": 0.91,
"step": 12000
},
{
"epoch": 0.71,
"grad_norm": 1.082083821296692,
"learning_rate": 4.3331608005521054e-05,
"loss": 0.9166,
"step": 12400
},
{
"epoch": 0.73,
"grad_norm": 1.0464677810668945,
"learning_rate": 4.310156429721647e-05,
"loss": 0.9234,
"step": 12800
},
{
"epoch": 0.75,
"grad_norm": 1.0795680284500122,
"learning_rate": 4.2871520588911894e-05,
"loss": 0.9004,
"step": 13200
},
{
"epoch": 0.78,
"grad_norm": 1.2177696228027344,
"learning_rate": 4.2641476880607317e-05,
"loss": 0.8991,
"step": 13600
},
{
"epoch": 0.8,
"grad_norm": 0.9279542565345764,
"learning_rate": 4.241143317230274e-05,
"loss": 0.901,
"step": 14000
},
{
"epoch": 0.82,
"grad_norm": 1.2393149137496948,
"learning_rate": 4.218138946399816e-05,
"loss": 0.8898,
"step": 14400
},
{
"epoch": 0.84,
"grad_norm": 1.2811025381088257,
"learning_rate": 4.1951920864964345e-05,
"loss": 0.8975,
"step": 14800
},
{
"epoch": 0.87,
"grad_norm": 1.0508288145065308,
"learning_rate": 4.172187715665977e-05,
"loss": 0.897,
"step": 15200
},
{
"epoch": 0.89,
"grad_norm": 0.962242066860199,
"learning_rate": 4.149183344835519e-05,
"loss": 0.8776,
"step": 15600
},
{
"epoch": 0.91,
"grad_norm": 0.9615252017974854,
"learning_rate": 4.126178974005061e-05,
"loss": 0.873,
"step": 16000
},
{
"epoch": 0.93,
"grad_norm": 1.040337324142456,
"learning_rate": 4.103174603174603e-05,
"loss": 0.8831,
"step": 16400
},
{
"epoch": 0.96,
"grad_norm": 1.0600088834762573,
"learning_rate": 4.0801702323441453e-05,
"loss": 0.8759,
"step": 16800
},
{
"epoch": 0.98,
"grad_norm": 0.9814367890357971,
"learning_rate": 4.0572233724407636e-05,
"loss": 0.8732,
"step": 17200
},
{
"epoch": 1.0,
"eval_loss": 1.0753824710845947,
"eval_runtime": 239.6966,
"eval_samples_per_second": 251.464,
"eval_steps_per_second": 3.93,
"step": 17548
},
{
"epoch": 1.0,
"grad_norm": 1.055283784866333,
"learning_rate": 4.034219001610306e-05,
"loss": 0.8697,
"step": 17600
},
{
"epoch": 1.03,
"grad_norm": 1.1038569211959839,
"learning_rate": 4.011272141706924e-05,
"loss": 0.8246,
"step": 18000
},
{
"epoch": 1.05,
"grad_norm": 0.9692428708076477,
"learning_rate": 3.9882677708764665e-05,
"loss": 0.8284,
"step": 18400
},
{
"epoch": 1.07,
"grad_norm": 1.093485951423645,
"learning_rate": 3.965263400046009e-05,
"loss": 0.8271,
"step": 18800
},
{
"epoch": 1.09,
"grad_norm": 1.1435869932174683,
"learning_rate": 3.942259029215551e-05,
"loss": 0.8198,
"step": 19200
},
{
"epoch": 1.12,
"grad_norm": 1.389695644378662,
"learning_rate": 3.9192546583850934e-05,
"loss": 0.8223,
"step": 19600
},
{
"epoch": 1.14,
"grad_norm": 1.081563949584961,
"learning_rate": 3.896307798481712e-05,
"loss": 0.8078,
"step": 20000
},
{
"epoch": 1.16,
"grad_norm": 1.20356023311615,
"learning_rate": 3.873303427651253e-05,
"loss": 0.8216,
"step": 20400
},
{
"epoch": 1.19,
"grad_norm": 1.2045621871948242,
"learning_rate": 3.850299056820796e-05,
"loss": 0.8222,
"step": 20800
},
{
"epoch": 1.21,
"grad_norm": 0.969454824924469,
"learning_rate": 3.8272946859903386e-05,
"loss": 0.803,
"step": 21200
},
{
"epoch": 1.23,
"grad_norm": 1.2209794521331787,
"learning_rate": 3.804290315159881e-05,
"loss": 0.8115,
"step": 21600
},
{
"epoch": 1.25,
"grad_norm": 1.0688341856002808,
"learning_rate": 3.781285944329423e-05,
"loss": 0.8051,
"step": 22000
},
{
"epoch": 1.28,
"grad_norm": 1.1031506061553955,
"learning_rate": 3.7582815734989655e-05,
"loss": 0.8059,
"step": 22400
},
{
"epoch": 1.3,
"grad_norm": 0.9878343939781189,
"learning_rate": 3.735277202668507e-05,
"loss": 0.8054,
"step": 22800
},
{
"epoch": 1.32,
"grad_norm": 1.327987790107727,
"learning_rate": 3.7122728318380494e-05,
"loss": 0.8131,
"step": 23200
},
{
"epoch": 1.34,
"grad_norm": 1.0833244323730469,
"learning_rate": 3.689268461007592e-05,
"loss": 0.7936,
"step": 23600
},
{
"epoch": 1.37,
"grad_norm": 1.1618777513504028,
"learning_rate": 3.666264090177134e-05,
"loss": 0.7991,
"step": 24000
},
{
"epoch": 1.39,
"grad_norm": 1.022359013557434,
"learning_rate": 3.643259719346676e-05,
"loss": 0.8002,
"step": 24400
},
{
"epoch": 1.41,
"grad_norm": 1.2475693225860596,
"learning_rate": 3.6202553485162186e-05,
"loss": 0.8001,
"step": 24800
},
{
"epoch": 1.44,
"grad_norm": 1.1127784252166748,
"learning_rate": 3.59725097768576e-05,
"loss": 0.7865,
"step": 25200
},
{
"epoch": 1.46,
"grad_norm": 1.2091097831726074,
"learning_rate": 3.5742466068553025e-05,
"loss": 0.7899,
"step": 25600
},
{
"epoch": 1.48,
"grad_norm": 0.9588549733161926,
"learning_rate": 3.551242236024845e-05,
"loss": 0.7942,
"step": 26000
},
{
"epoch": 1.5,
"grad_norm": 1.195241093635559,
"learning_rate": 3.528237865194387e-05,
"loss": 0.7813,
"step": 26400
},
{
"epoch": 1.53,
"grad_norm": 0.9788525700569153,
"learning_rate": 3.5052334943639294e-05,
"loss": 0.7805,
"step": 26800
},
{
"epoch": 1.55,
"grad_norm": 1.2794181108474731,
"learning_rate": 3.482286634460548e-05,
"loss": 0.7763,
"step": 27200
},
{
"epoch": 1.57,
"grad_norm": 0.9700046181678772,
"learning_rate": 3.45928226363009e-05,
"loss": 0.7801,
"step": 27600
},
{
"epoch": 1.6,
"grad_norm": 1.2326452732086182,
"learning_rate": 3.436335403726708e-05,
"loss": 0.7864,
"step": 28000
},
{
"epoch": 1.62,
"grad_norm": 1.2367639541625977,
"learning_rate": 3.4133310328962506e-05,
"loss": 0.7845,
"step": 28400
},
{
"epoch": 1.64,
"grad_norm": 1.077854871749878,
"learning_rate": 3.390326662065793e-05,
"loss": 0.7869,
"step": 28800
},
{
"epoch": 1.66,
"grad_norm": 1.0575716495513916,
"learning_rate": 3.3673222912353345e-05,
"loss": 0.7838,
"step": 29200
},
{
"epoch": 1.69,
"grad_norm": 1.1674555540084839,
"learning_rate": 3.344317920404877e-05,
"loss": 0.7827,
"step": 29600
},
{
"epoch": 1.71,
"grad_norm": 1.148335337638855,
"learning_rate": 3.321313549574419e-05,
"loss": 0.7781,
"step": 30000
},
{
"epoch": 1.73,
"grad_norm": 1.0287448167800903,
"learning_rate": 3.2983091787439614e-05,
"loss": 0.7652,
"step": 30400
},
{
"epoch": 1.76,
"grad_norm": 1.2461556196212769,
"learning_rate": 3.275304807913504e-05,
"loss": 0.7773,
"step": 30800
},
{
"epoch": 1.78,
"grad_norm": 1.1946007013320923,
"learning_rate": 3.252357948010122e-05,
"loss": 0.7694,
"step": 31200
},
{
"epoch": 1.8,
"grad_norm": 1.019499659538269,
"learning_rate": 3.229353577179664e-05,
"loss": 0.7803,
"step": 31600
},
{
"epoch": 1.82,
"grad_norm": 1.3375366926193237,
"learning_rate": 3.2063492063492065e-05,
"loss": 0.7684,
"step": 32000
},
{
"epoch": 1.85,
"grad_norm": 1.2477443218231201,
"learning_rate": 3.183344835518749e-05,
"loss": 0.7657,
"step": 32400
},
{
"epoch": 1.87,
"grad_norm": 1.1749552488327026,
"learning_rate": 3.160340464688291e-05,
"loss": 0.767,
"step": 32800
},
{
"epoch": 1.89,
"grad_norm": 1.0863006114959717,
"learning_rate": 3.1373360938578334e-05,
"loss": 0.767,
"step": 33200
},
{
"epoch": 1.91,
"grad_norm": 0.9976168870925903,
"learning_rate": 3.114389233954452e-05,
"loss": 0.7536,
"step": 33600
},
{
"epoch": 1.94,
"grad_norm": 1.1924540996551514,
"learning_rate": 3.09144237405107e-05,
"loss": 0.7622,
"step": 34000
},
{
"epoch": 1.96,
"grad_norm": 1.0996850728988647,
"learning_rate": 3.068438003220612e-05,
"loss": 0.7569,
"step": 34400
},
{
"epoch": 1.98,
"grad_norm": 1.2163282632827759,
"learning_rate": 3.0454336323901546e-05,
"loss": 0.7667,
"step": 34800
},
{
"epoch": 2.0,
"eval_loss": 1.0829898118972778,
"eval_runtime": 239.7954,
"eval_samples_per_second": 251.36,
"eval_steps_per_second": 3.928,
"step": 35096
},
{
"epoch": 2.01,
"grad_norm": 1.1651737689971924,
"learning_rate": 3.0224292615596966e-05,
"loss": 0.7442,
"step": 35200
},
{
"epoch": 2.03,
"grad_norm": 1.1764894723892212,
"learning_rate": 2.999424890729239e-05,
"loss": 0.714,
"step": 35600
},
{
"epoch": 2.05,
"grad_norm": 1.1951353549957275,
"learning_rate": 2.976420519898781e-05,
"loss": 0.7076,
"step": 36000
},
{
"epoch": 2.07,
"grad_norm": 1.1282097101211548,
"learning_rate": 2.953416149068323e-05,
"loss": 0.7105,
"step": 36400
},
{
"epoch": 2.1,
"grad_norm": 1.3397319316864014,
"learning_rate": 2.9304117782378654e-05,
"loss": 0.7023,
"step": 36800
},
{
"epoch": 2.12,
"grad_norm": 1.1150188446044922,
"learning_rate": 2.9074074074074077e-05,
"loss": 0.7035,
"step": 37200
},
{
"epoch": 2.14,
"grad_norm": 1.2119678258895874,
"learning_rate": 2.8844030365769497e-05,
"loss": 0.7168,
"step": 37600
},
{
"epoch": 2.17,
"grad_norm": 1.167506456375122,
"learning_rate": 2.861398665746492e-05,
"loss": 0.7125,
"step": 38000
},
{
"epoch": 2.19,
"grad_norm": 1.0915708541870117,
"learning_rate": 2.8384518058431102e-05,
"loss": 0.7101,
"step": 38400
},
{
"epoch": 2.21,
"grad_norm": 1.135021686553955,
"learning_rate": 2.8154474350126525e-05,
"loss": 0.7145,
"step": 38800
},
{
"epoch": 2.23,
"grad_norm": 1.3739718198776245,
"learning_rate": 2.792443064182195e-05,
"loss": 0.7096,
"step": 39200
},
{
"epoch": 2.26,
"grad_norm": 1.1629129648208618,
"learning_rate": 2.7694386933517368e-05,
"loss": 0.7053,
"step": 39600
},
{
"epoch": 2.28,
"grad_norm": 0.9963687062263489,
"learning_rate": 2.746434322521279e-05,
"loss": 0.7012,
"step": 40000
},
{
"epoch": 2.3,
"grad_norm": 1.0318909883499146,
"learning_rate": 2.7234874626178974e-05,
"loss": 0.713,
"step": 40400
},
{
"epoch": 2.33,
"grad_norm": 1.0613532066345215,
"learning_rate": 2.7004830917874397e-05,
"loss": 0.704,
"step": 40800
},
{
"epoch": 2.35,
"grad_norm": 1.1298637390136719,
"learning_rate": 2.677478720956982e-05,
"loss": 0.708,
"step": 41200
},
{
"epoch": 2.37,
"grad_norm": 1.1079801321029663,
"learning_rate": 2.654474350126524e-05,
"loss": 0.6975,
"step": 41600
},
{
"epoch": 2.39,
"grad_norm": 1.0751113891601562,
"learning_rate": 2.6314699792960662e-05,
"loss": 0.6999,
"step": 42000
},
{
"epoch": 2.42,
"grad_norm": 1.1240077018737793,
"learning_rate": 2.6085231193926845e-05,
"loss": 0.7055,
"step": 42400
},
{
"epoch": 2.44,
"grad_norm": 1.0788402557373047,
"learning_rate": 2.5855187485622268e-05,
"loss": 0.7055,
"step": 42800
},
{
"epoch": 2.46,
"grad_norm": 1.00369131565094,
"learning_rate": 2.562514377731769e-05,
"loss": 0.6949,
"step": 43200
},
{
"epoch": 2.48,
"grad_norm": 1.1382017135620117,
"learning_rate": 2.539510006901311e-05,
"loss": 0.7093,
"step": 43600
},
{
"epoch": 2.51,
"grad_norm": 1.0273314714431763,
"learning_rate": 2.5165056360708534e-05,
"loss": 0.7066,
"step": 44000
},
{
"epoch": 2.53,
"grad_norm": 1.331964373588562,
"learning_rate": 2.4935012652403957e-05,
"loss": 0.7037,
"step": 44400
},
{
"epoch": 2.55,
"grad_norm": 1.102133870124817,
"learning_rate": 2.470496894409938e-05,
"loss": 0.7028,
"step": 44800
},
{
"epoch": 2.58,
"grad_norm": 1.131090521812439,
"learning_rate": 2.4474925235794803e-05,
"loss": 0.6871,
"step": 45200
},
{
"epoch": 2.6,
"grad_norm": 1.1939336061477661,
"learning_rate": 2.4244881527490222e-05,
"loss": 0.6966,
"step": 45600
},
{
"epoch": 2.62,
"grad_norm": 1.344831109046936,
"learning_rate": 2.4014837819185645e-05,
"loss": 0.6933,
"step": 46000
},
{
"epoch": 2.64,
"grad_norm": 0.9559622406959534,
"learning_rate": 2.3784794110881068e-05,
"loss": 0.6916,
"step": 46400
},
{
"epoch": 2.67,
"grad_norm": 1.182010293006897,
"learning_rate": 2.355475040257649e-05,
"loss": 0.6903,
"step": 46800
},
{
"epoch": 2.69,
"grad_norm": 1.080712080001831,
"learning_rate": 2.3325281803542674e-05,
"loss": 0.6965,
"step": 47200
},
{
"epoch": 2.71,
"grad_norm": 1.2468616962432861,
"learning_rate": 2.3095238095238097e-05,
"loss": 0.6906,
"step": 47600
},
{
"epoch": 2.74,
"grad_norm": 1.0585706233978271,
"learning_rate": 2.286519438693352e-05,
"loss": 0.6966,
"step": 48000
},
{
"epoch": 2.76,
"grad_norm": 1.2725940942764282,
"learning_rate": 2.2635150678628943e-05,
"loss": 0.6894,
"step": 48400
},
{
"epoch": 2.78,
"grad_norm": 1.1753593683242798,
"learning_rate": 2.2405106970324362e-05,
"loss": 0.6806,
"step": 48800
},
{
"epoch": 2.8,
"grad_norm": 1.117319941520691,
"learning_rate": 2.2175063262019785e-05,
"loss": 0.6879,
"step": 49200
},
{
"epoch": 2.83,
"grad_norm": 1.2521744966506958,
"learning_rate": 2.194501955371521e-05,
"loss": 0.6808,
"step": 49600
},
{
"epoch": 2.85,
"grad_norm": 1.396971344947815,
"learning_rate": 2.1714975845410628e-05,
"loss": 0.6798,
"step": 50000
},
{
"epoch": 2.87,
"grad_norm": 1.0855846405029297,
"learning_rate": 2.148493213710605e-05,
"loss": 0.6978,
"step": 50400
},
{
"epoch": 2.89,
"grad_norm": 1.199013113975525,
"learning_rate": 2.1254888428801474e-05,
"loss": 0.6882,
"step": 50800
},
{
"epoch": 2.92,
"grad_norm": 1.366407871246338,
"learning_rate": 2.1024844720496894e-05,
"loss": 0.6882,
"step": 51200
},
{
"epoch": 2.94,
"grad_norm": 1.1709498167037964,
"learning_rate": 2.0794801012192317e-05,
"loss": 0.6907,
"step": 51600
},
{
"epoch": 2.96,
"grad_norm": 1.1881307363510132,
"learning_rate": 2.05653324131585e-05,
"loss": 0.6883,
"step": 52000
},
{
"epoch": 2.99,
"grad_norm": 1.4105783700942993,
"learning_rate": 2.0335288704853922e-05,
"loss": 0.6833,
"step": 52400
},
{
"epoch": 3.0,
"eval_loss": 1.0844900608062744,
"eval_runtime": 239.8565,
"eval_samples_per_second": 251.296,
"eval_steps_per_second": 3.927,
"step": 52644
},
{
"epoch": 3.01,
"grad_norm": 1.4675981998443604,
"learning_rate": 2.0105244996549345e-05,
"loss": 0.6679,
"step": 52800
},
{
"epoch": 3.03,
"grad_norm": 1.151491403579712,
"learning_rate": 1.9875201288244768e-05,
"loss": 0.6501,
"step": 53200
},
{
"epoch": 3.05,
"grad_norm": 1.0938260555267334,
"learning_rate": 1.964515757994019e-05,
"loss": 0.6396,
"step": 53600
},
{
"epoch": 3.08,
"grad_norm": 1.055185317993164,
"learning_rate": 1.941511387163561e-05,
"loss": 0.6442,
"step": 54000
},
{
"epoch": 3.1,
"grad_norm": 1.0307785272598267,
"learning_rate": 1.9185645272601797e-05,
"loss": 0.6489,
"step": 54400
},
{
"epoch": 3.12,
"grad_norm": 1.184102177619934,
"learning_rate": 1.8955601564297217e-05,
"loss": 0.6454,
"step": 54800
},
{
"epoch": 3.15,
"grad_norm": 1.1798542737960815,
"learning_rate": 1.872555785599264e-05,
"loss": 0.6552,
"step": 55200
},
{
"epoch": 3.17,
"grad_norm": 1.1375089883804321,
"learning_rate": 1.8496089256958822e-05,
"loss": 0.6359,
"step": 55600
},
{
"epoch": 3.19,
"grad_norm": 1.0475974082946777,
"learning_rate": 1.8266045548654245e-05,
"loss": 0.6374,
"step": 56000
},
{
"epoch": 3.21,
"grad_norm": 1.0948106050491333,
"learning_rate": 1.803600184034967e-05,
"loss": 0.6431,
"step": 56400
},
{
"epoch": 3.24,
"grad_norm": 1.1488378047943115,
"learning_rate": 1.7805958132045088e-05,
"loss": 0.646,
"step": 56800
},
{
"epoch": 3.26,
"grad_norm": 1.1257692575454712,
"learning_rate": 1.757591442374051e-05,
"loss": 0.6408,
"step": 57200
},
{
"epoch": 3.28,
"grad_norm": 1.1101455688476562,
"learning_rate": 1.7345870715435934e-05,
"loss": 0.6389,
"step": 57600
},
{
"epoch": 3.31,
"grad_norm": 1.329904556274414,
"learning_rate": 1.7115827007131354e-05,
"loss": 0.6399,
"step": 58000
},
{
"epoch": 3.33,
"grad_norm": 1.2944815158843994,
"learning_rate": 1.6885783298826777e-05,
"loss": 0.6421,
"step": 58400
},
{
"epoch": 3.35,
"grad_norm": 1.1607027053833008,
"learning_rate": 1.6655739590522203e-05,
"loss": 0.637,
"step": 58800
},
{
"epoch": 3.37,
"grad_norm": 1.0392543077468872,
"learning_rate": 1.6426270991488382e-05,
"loss": 0.6411,
"step": 59200
},
{
"epoch": 3.4,
"grad_norm": 1.3244273662567139,
"learning_rate": 1.6196227283183805e-05,
"loss": 0.6473,
"step": 59600
},
{
"epoch": 3.42,
"grad_norm": 1.1351373195648193,
"learning_rate": 1.5966183574879228e-05,
"loss": 0.6298,
"step": 60000
},
{
"epoch": 3.44,
"grad_norm": 1.1698590517044067,
"learning_rate": 1.573613986657465e-05,
"loss": 0.6355,
"step": 60400
},
{
"epoch": 3.46,
"grad_norm": 1.2005553245544434,
"learning_rate": 1.5506671267540834e-05,
"loss": 0.6395,
"step": 60800
},
{
"epoch": 3.49,
"grad_norm": 0.97503662109375,
"learning_rate": 1.5276627559236257e-05,
"loss": 0.6437,
"step": 61200
},
{
"epoch": 3.51,
"grad_norm": 1.2518908977508545,
"learning_rate": 1.5046583850931678e-05,
"loss": 0.6385,
"step": 61600
},
{
"epoch": 3.53,
"grad_norm": 1.2661454677581787,
"learning_rate": 1.48165401426271e-05,
"loss": 0.6403,
"step": 62000
},
{
"epoch": 3.56,
"grad_norm": 1.2612046003341675,
"learning_rate": 1.4586496434322523e-05,
"loss": 0.6442,
"step": 62400
},
{
"epoch": 3.58,
"grad_norm": 1.1942335367202759,
"learning_rate": 1.4356452726017944e-05,
"loss": 0.6383,
"step": 62800
},
{
"epoch": 3.6,
"grad_norm": 1.1030133962631226,
"learning_rate": 1.4126409017713365e-05,
"loss": 0.6277,
"step": 63200
},
{
"epoch": 3.62,
"grad_norm": 1.2485852241516113,
"learning_rate": 1.3896365309408788e-05,
"loss": 0.6414,
"step": 63600
},
{
"epoch": 3.65,
"grad_norm": 0.9925839900970459,
"learning_rate": 1.366632160110421e-05,
"loss": 0.6337,
"step": 64000
},
{
"epoch": 3.67,
"grad_norm": 1.3896905183792114,
"learning_rate": 1.343627789279963e-05,
"loss": 0.6314,
"step": 64400
},
{
"epoch": 3.69,
"grad_norm": 1.1392475366592407,
"learning_rate": 1.3206809293765815e-05,
"loss": 0.6312,
"step": 64800
},
{
"epoch": 3.72,
"grad_norm": 1.2051880359649658,
"learning_rate": 1.2976765585461237e-05,
"loss": 0.6198,
"step": 65200
},
{
"epoch": 3.74,
"grad_norm": 1.3581410646438599,
"learning_rate": 1.2746721877156661e-05,
"loss": 0.634,
"step": 65600
},
{
"epoch": 3.76,
"grad_norm": 1.4071406126022339,
"learning_rate": 1.2516678168852084e-05,
"loss": 0.633,
"step": 66000
},
{
"epoch": 3.78,
"grad_norm": 1.1921656131744385,
"learning_rate": 1.2286634460547504e-05,
"loss": 0.6206,
"step": 66400
},
{
"epoch": 3.81,
"grad_norm": 1.4039461612701416,
"learning_rate": 1.2056590752242927e-05,
"loss": 0.6341,
"step": 66800
},
{
"epoch": 3.83,
"grad_norm": 1.3369255065917969,
"learning_rate": 1.182654704393835e-05,
"loss": 0.6427,
"step": 67200
},
{
"epoch": 3.85,
"grad_norm": 1.2129446268081665,
"learning_rate": 1.1596503335633771e-05,
"loss": 0.6293,
"step": 67600
},
{
"epoch": 3.88,
"grad_norm": 1.264256238937378,
"learning_rate": 1.1366459627329192e-05,
"loss": 0.6282,
"step": 68000
},
{
"epoch": 3.9,
"grad_norm": 1.1778966188430786,
"learning_rate": 1.1136415919024615e-05,
"loss": 0.6383,
"step": 68400
},
{
"epoch": 3.92,
"grad_norm": 1.045240044593811,
"learning_rate": 1.0906372210720037e-05,
"loss": 0.6315,
"step": 68800
},
{
"epoch": 3.94,
"grad_norm": 1.2942785024642944,
"learning_rate": 1.0676903611686221e-05,
"loss": 0.6276,
"step": 69200
},
{
"epoch": 3.97,
"grad_norm": 1.2519258260726929,
"learning_rate": 1.0446859903381644e-05,
"loss": 0.6228,
"step": 69600
},
{
"epoch": 3.99,
"grad_norm": 1.2884622812271118,
"learning_rate": 1.0216816195077065e-05,
"loss": 0.6234,
"step": 70000
},
{
"epoch": 4.0,
"eval_loss": 1.09929621219635,
"eval_runtime": 239.9825,
"eval_samples_per_second": 251.164,
"eval_steps_per_second": 3.925,
"step": 70192
},
{
"epoch": 4.01,
"grad_norm": 1.029523253440857,
"learning_rate": 9.986772486772487e-06,
"loss": 0.607,
"step": 70400
},
{
"epoch": 4.03,
"grad_norm": 1.1874665021896362,
"learning_rate": 9.75672877846791e-06,
"loss": 0.5992,
"step": 70800
},
{
"epoch": 4.06,
"grad_norm": 1.3719263076782227,
"learning_rate": 9.526685070163331e-06,
"loss": 0.5932,
"step": 71200
},
{
"epoch": 4.08,
"grad_norm": 1.1106728315353394,
"learning_rate": 9.296641361858754e-06,
"loss": 0.6082,
"step": 71600
},
{
"epoch": 4.1,
"grad_norm": 1.1333997249603271,
"learning_rate": 9.066597653554177e-06,
"loss": 0.5958,
"step": 72000
},
{
"epoch": 4.13,
"grad_norm": 1.2606267929077148,
"learning_rate": 8.837129054520358e-06,
"loss": 0.6015,
"step": 72400
},
{
"epoch": 4.15,
"grad_norm": 1.123744249343872,
"learning_rate": 8.607085346215783e-06,
"loss": 0.6011,
"step": 72800
},
{
"epoch": 4.17,
"grad_norm": 1.155521273612976,
"learning_rate": 8.377041637911204e-06,
"loss": 0.5973,
"step": 73200
},
{
"epoch": 4.19,
"grad_norm": 1.1591954231262207,
"learning_rate": 8.146997929606625e-06,
"loss": 0.5924,
"step": 73600
},
{
"epoch": 4.22,
"grad_norm": 1.3380868434906006,
"learning_rate": 7.916954221302048e-06,
"loss": 0.5983,
"step": 74000
},
{
"epoch": 4.24,
"grad_norm": 1.2216105461120605,
"learning_rate": 7.68691051299747e-06,
"loss": 0.5947,
"step": 74400
},
{
"epoch": 4.26,
"grad_norm": 1.0791873931884766,
"learning_rate": 7.4568668046928916e-06,
"loss": 0.6054,
"step": 74800
},
{
"epoch": 4.29,
"grad_norm": 1.1365481615066528,
"learning_rate": 7.2268230963883145e-06,
"loss": 0.6092,
"step": 75200
},
{
"epoch": 4.31,
"grad_norm": 1.1376712322235107,
"learning_rate": 6.997354497354498e-06,
"loss": 0.5942,
"step": 75600
},
{
"epoch": 4.33,
"grad_norm": 1.1192513704299927,
"learning_rate": 6.76731078904992e-06,
"loss": 0.5955,
"step": 76000
},
{
"epoch": 4.35,
"grad_norm": 1.1927390098571777,
"learning_rate": 6.537267080745342e-06,
"loss": 0.5901,
"step": 76400
},
{
"epoch": 4.38,
"grad_norm": 1.236060619354248,
"learning_rate": 6.307223372440764e-06,
"loss": 0.6,
"step": 76800
},
{
"epoch": 4.4,
"grad_norm": 1.077643871307373,
"learning_rate": 6.077179664136186e-06,
"loss": 0.6088,
"step": 77200
},
{
"epoch": 4.42,
"grad_norm": 1.3172234296798706,
"learning_rate": 5.847135955831608e-06,
"loss": 0.5944,
"step": 77600
},
{
"epoch": 4.44,
"grad_norm": 1.2222837209701538,
"learning_rate": 5.61709224752703e-06,
"loss": 0.5976,
"step": 78000
},
{
"epoch": 4.47,
"grad_norm": 1.2887938022613525,
"learning_rate": 5.387048539222452e-06,
"loss": 0.6023,
"step": 78400
},
{
"epoch": 4.49,
"grad_norm": 1.1380060911178589,
"learning_rate": 5.157579940188636e-06,
"loss": 0.5938,
"step": 78800
},
{
"epoch": 4.51,
"grad_norm": 1.2178806066513062,
"learning_rate": 4.927536231884058e-06,
"loss": 0.5916,
"step": 79200
},
{
"epoch": 4.54,
"grad_norm": 1.2010163068771362,
"learning_rate": 4.69749252357948e-06,
"loss": 0.5891,
"step": 79600
},
{
"epoch": 4.56,
"grad_norm": 1.2172470092773438,
"learning_rate": 4.467448815274902e-06,
"loss": 0.6019,
"step": 80000
},
{
"epoch": 4.58,
"grad_norm": 1.2008330821990967,
"learning_rate": 4.2374051069703245e-06,
"loss": 0.596,
"step": 80400
},
{
"epoch": 4.6,
"grad_norm": 1.3656328916549683,
"learning_rate": 4.007936507936508e-06,
"loss": 0.6001,
"step": 80800
},
{
"epoch": 4.63,
"grad_norm": 1.336308240890503,
"learning_rate": 3.7778927996319303e-06,
"loss": 0.5912,
"step": 81200
},
{
"epoch": 4.65,
"grad_norm": 1.1399625539779663,
"learning_rate": 3.5478490913273524e-06,
"loss": 0.5962,
"step": 81600
},
{
"epoch": 4.67,
"grad_norm": 1.237598180770874,
"learning_rate": 3.317805383022775e-06,
"loss": 0.5869,
"step": 82000
},
{
"epoch": 4.7,
"grad_norm": 1.1215174198150635,
"learning_rate": 3.0877616747181967e-06,
"loss": 0.5927,
"step": 82400
},
{
"epoch": 4.72,
"grad_norm": 1.3274859189987183,
"learning_rate": 2.857717966413619e-06,
"loss": 0.6066,
"step": 82800
},
{
"epoch": 4.74,
"grad_norm": 1.276289463043213,
"learning_rate": 2.628249367379802e-06,
"loss": 0.5994,
"step": 83200
},
{
"epoch": 4.76,
"grad_norm": 1.154296636581421,
"learning_rate": 2.3982056590752246e-06,
"loss": 0.5907,
"step": 83600
},
{
"epoch": 4.79,
"grad_norm": 1.1015737056732178,
"learning_rate": 2.1681619507706463e-06,
"loss": 0.6043,
"step": 84000
},
{
"epoch": 4.81,
"grad_norm": 1.356696367263794,
"learning_rate": 1.9381182424660685e-06,
"loss": 0.5883,
"step": 84400
},
{
"epoch": 4.83,
"grad_norm": 1.1508524417877197,
"learning_rate": 1.7080745341614908e-06,
"loss": 0.5899,
"step": 84800
},
{
"epoch": 4.86,
"grad_norm": 1.1132149696350098,
"learning_rate": 1.478030825856913e-06,
"loss": 0.5994,
"step": 85200
},
{
"epoch": 4.88,
"grad_norm": 1.306624174118042,
"learning_rate": 1.2485622268230964e-06,
"loss": 0.5891,
"step": 85600
},
{
"epoch": 4.9,
"grad_norm": 1.234307050704956,
"learning_rate": 1.0185185185185188e-06,
"loss": 0.5916,
"step": 86000
},
{
"epoch": 4.92,
"grad_norm": 1.0994372367858887,
"learning_rate": 7.884748102139407e-07,
"loss": 0.5908,
"step": 86400
},
{
"epoch": 4.95,
"grad_norm": 1.2712494134902954,
"learning_rate": 5.584311019093628e-07,
"loss": 0.5963,
"step": 86800
},
{
"epoch": 4.97,
"grad_norm": 1.2190104722976685,
"learning_rate": 3.283873936047849e-07,
"loss": 0.5902,
"step": 87200
},
{
"epoch": 4.99,
"grad_norm": 1.3301359415054321,
"learning_rate": 9.891879457096849e-08,
"loss": 0.591,
"step": 87600
},
{
"epoch": 5.0,
"eval_loss": 1.1109092235565186,
"eval_runtime": 240.0172,
"eval_samples_per_second": 251.128,
"eval_steps_per_second": 3.925,
"step": 87740
},
{
"epoch": 5.0,
"step": 87740,
"total_flos": 1.7097588901675008e+18,
"train_loss": 0.7401387438351292,
"train_runtime": 31448.3717,
"train_samples_per_second": 89.283,
"train_steps_per_second": 2.79
}
],
"logging_steps": 400,
"max_steps": 87740,
"num_input_tokens_seen": 0,
"num_train_epochs": 5,
"save_steps": 500,
"total_flos": 1.7097588901675008e+18,
"train_batch_size": 32,
"trial_name": null,
"trial_params": null
}