{ "best_metric": 0.25531044602394104, "best_model_checkpoint": "./convnext-tiny-new-1e-4/checkpoint-10990", "epoch": 10.0, "eval_steps": 500, "global_step": 10990, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.09, "grad_norm": 24.848655700683594, "learning_rate": 9.99795725199423e-05, "loss": 2.511, "step": 100 }, { "epoch": 0.18, "grad_norm": 21.852506637573242, "learning_rate": 9.991830677104683e-05, "loss": 1.4511, "step": 200 }, { "epoch": 0.27, "grad_norm": 14.456836700439453, "learning_rate": 9.981625281350813e-05, "loss": 1.1871, "step": 300 }, { "epoch": 0.36, "grad_norm": 25.593276977539062, "learning_rate": 9.967349403553353e-05, "loss": 1.0097, "step": 400 }, { "epoch": 0.45, "grad_norm": 21.799734115600586, "learning_rate": 9.949014708520663e-05, "loss": 0.9728, "step": 500 }, { "epoch": 0.55, "grad_norm": 22.431013107299805, "learning_rate": 9.926636177517427e-05, "loss": 0.9024, "step": 600 }, { "epoch": 0.64, "grad_norm": 28.28536033630371, "learning_rate": 9.900232096023477e-05, "loss": 0.8912, "step": 700 }, { "epoch": 0.73, "grad_norm": 27.407716751098633, "learning_rate": 9.869824038792741e-05, "loss": 0.865, "step": 800 }, { "epoch": 0.82, "grad_norm": 13.469520568847656, "learning_rate": 9.835436852224525e-05, "loss": 0.7768, "step": 900 }, { "epoch": 0.91, "grad_norm": 17.95415496826172, "learning_rate": 9.797098634061542e-05, "loss": 0.8432, "step": 1000 }, { "epoch": 1.0, "eval_accuracy": 0.862027833001988, "eval_loss": 0.4732176661491394, "eval_runtime": 70.5844, "eval_samples_per_second": 35.631, "eval_steps_per_second": 2.238, "step": 1099 }, { "epoch": 1.0, "grad_norm": 28.058094024658203, "learning_rate": 9.754840710431274e-05, "loss": 0.827, "step": 1100 }, { "epoch": 1.09, "grad_norm": 10.800704002380371, "learning_rate": 9.708697610249406e-05, "loss": 0.733, "step": 1200 }, { "epoch": 1.18, "grad_norm": 17.161815643310547, "learning_rate": 9.658707037006294e-05, "loss": 0.7117, "step": 1300 }, { "epoch": 1.27, "grad_norm": 15.4861478805542, "learning_rate": 9.604909837959455e-05, "loss": 0.6609, "step": 1400 }, { "epoch": 1.36, "grad_norm": 20.579856872558594, "learning_rate": 9.547349970757317e-05, "loss": 0.6396, "step": 1500 }, { "epoch": 1.46, "grad_norm": 24.0612735748291, "learning_rate": 9.486074467521456e-05, "loss": 0.6321, "step": 1600 }, { "epoch": 1.55, "grad_norm": 23.076522827148438, "learning_rate": 9.421133396416686e-05, "loss": 0.7177, "step": 1700 }, { "epoch": 1.64, "grad_norm": 24.146862030029297, "learning_rate": 9.352579820740405e-05, "loss": 0.7033, "step": 1800 }, { "epoch": 1.73, "grad_norm": 13.88412857055664, "learning_rate": 9.280469755564613e-05, "loss": 0.637, "step": 1900 }, { "epoch": 1.82, "grad_norm": 18.419206619262695, "learning_rate": 9.204862121966044e-05, "loss": 0.6056, "step": 2000 }, { "epoch": 1.91, "grad_norm": 22.241830825805664, "learning_rate": 9.125818698881798e-05, "loss": 0.6603, "step": 2100 }, { "epoch": 2.0, "eval_accuracy": 0.9009940357852882, "eval_loss": 0.35555583238601685, "eval_runtime": 70.2432, "eval_samples_per_second": 35.804, "eval_steps_per_second": 2.249, "step": 2198 }, { "epoch": 2.0, "grad_norm": 28.679054260253906, "learning_rate": 9.043404072629829e-05, "loss": 0.6342, "step": 2200 }, { "epoch": 2.09, "grad_norm": 25.620079040527344, "learning_rate": 8.957685584135502e-05, "loss": 0.5285, "step": 2300 }, { "epoch": 2.18, "grad_norm": 17.019142150878906, "learning_rate": 8.86873327390739e-05, "loss": 0.5882, "step": 2400 }, { "epoch": 2.27, "grad_norm": 15.56048583984375, "learning_rate": 8.776619824807224e-05, "loss": 0.545, "step": 2500 }, { "epoch": 2.37, "grad_norm": 10.068634033203125, "learning_rate": 8.681420502660786e-05, "loss": 0.5944, "step": 2600 }, { "epoch": 2.46, "grad_norm": 15.93220043182373, "learning_rate": 8.583213094758261e-05, "loss": 0.5378, "step": 2700 }, { "epoch": 2.55, "grad_norm": 16.840496063232422, "learning_rate": 8.482077846294308e-05, "loss": 0.5597, "step": 2800 }, { "epoch": 2.64, "grad_norm": 23.36423683166504, "learning_rate": 8.378097394799773e-05, "loss": 0.5476, "step": 2900 }, { "epoch": 2.73, "grad_norm": 14.018542289733887, "learning_rate": 8.271356702618626e-05, "loss": 0.5429, "step": 3000 }, { "epoch": 2.82, "grad_norm": 22.620033264160156, "learning_rate": 8.161942987485303e-05, "loss": 0.5361, "step": 3100 }, { "epoch": 2.91, "grad_norm": 29.02153778076172, "learning_rate": 8.049945651259163e-05, "loss": 0.487, "step": 3200 }, { "epoch": 3.0, "eval_accuracy": 0.882703777335984, "eval_loss": 0.4106706380844116, "eval_runtime": 69.7812, "eval_samples_per_second": 36.041, "eval_steps_per_second": 2.264, "step": 3297 }, { "epoch": 3.0, "grad_norm": 9.186837196350098, "learning_rate": 7.935456206874292e-05, "loss": 0.5448, "step": 3300 }, { "epoch": 3.09, "grad_norm": 10.694817543029785, "learning_rate": 7.818568203564374e-05, "loss": 0.4207, "step": 3400 }, { "epoch": 3.18, "grad_norm": 9.862343788146973, "learning_rate": 7.699377150423672e-05, "loss": 0.475, "step": 3500 }, { "epoch": 3.28, "grad_norm": 17.465476989746094, "learning_rate": 7.577980438366628e-05, "loss": 0.48, "step": 3600 }, { "epoch": 3.37, "grad_norm": 19.368242263793945, "learning_rate": 7.454477260549828e-05, "loss": 0.4126, "step": 3700 }, { "epoch": 3.46, "grad_norm": 24.577472686767578, "learning_rate": 7.32896853132135e-05, "loss": 0.4992, "step": 3800 }, { "epoch": 3.55, "grad_norm": 7.475000381469727, "learning_rate": 7.201556803763725e-05, "loss": 0.4931, "step": 3900 }, { "epoch": 3.64, "grad_norm": 13.887510299682617, "learning_rate": 7.07234618589791e-05, "loss": 0.4909, "step": 4000 }, { "epoch": 3.73, "grad_norm": 20.115652084350586, "learning_rate": 6.94144225561669e-05, "loss": 0.4814, "step": 4100 }, { "epoch": 3.82, "grad_norm": 14.20095443725586, "learning_rate": 6.808951974417078e-05, "loss": 0.457, "step": 4200 }, { "epoch": 3.91, "grad_norm": 29.256624221801758, "learning_rate": 6.674983600002155e-05, "loss": 0.4369, "step": 4300 }, { "epoch": 4.0, "eval_accuracy": 0.9172962226640159, "eval_loss": 0.3118807375431061, "eval_runtime": 70.2783, "eval_samples_per_second": 35.786, "eval_steps_per_second": 2.248, "step": 4396 }, { "epoch": 4.0, "grad_norm": 10.440736770629883, "learning_rate": 6.539646597823791e-05, "loss": 0.4036, "step": 4400 }, { "epoch": 4.09, "grad_norm": 14.221583366394043, "learning_rate": 6.403051551638508e-05, "loss": 0.3902, "step": 4500 }, { "epoch": 4.19, "grad_norm": 18.483789443969727, "learning_rate": 6.265310073149584e-05, "loss": 0.4096, "step": 4600 }, { "epoch": 4.28, "grad_norm": 27.6778564453125, "learning_rate": 6.126534710809216e-05, "loss": 0.3778, "step": 4700 }, { "epoch": 4.37, "grad_norm": 18.118946075439453, "learning_rate": 5.9868388578552734e-05, "loss": 0.356, "step": 4800 }, { "epoch": 4.46, "grad_norm": 27.71598243713379, "learning_rate": 5.8463366596577706e-05, "loss": 0.3544, "step": 4900 }, { "epoch": 4.55, "grad_norm": 10.941313743591309, "learning_rate": 5.705142920450777e-05, "loss": 0.4005, "step": 5000 }, { "epoch": 4.64, "grad_norm": 20.2288818359375, "learning_rate": 5.5633730095259695e-05, "loss": 0.4007, "step": 5100 }, { "epoch": 4.73, "grad_norm": 13.51459789276123, "learning_rate": 5.421142766964474e-05, "loss": 0.411, "step": 5200 }, { "epoch": 4.82, "grad_norm": 10.38302993774414, "learning_rate": 5.278568408984037e-05, "loss": 0.4178, "step": 5300 }, { "epoch": 4.91, "grad_norm": 7.628846168518066, "learning_rate": 5.135766432978829e-05, "loss": 0.3791, "step": 5400 }, { "epoch": 5.0, "eval_accuracy": 0.9196819085487078, "eval_loss": 0.3082761764526367, "eval_runtime": 71.5388, "eval_samples_per_second": 35.156, "eval_steps_per_second": 2.209, "step": 5495 }, { "epoch": 5.0, "grad_norm": 15.286186218261719, "learning_rate": 4.9928535223295344e-05, "loss": 0.3914, "step": 5500 }, { "epoch": 5.1, "grad_norm": 19.06636619567871, "learning_rate": 4.849946451061443e-05, "loss": 0.3268, "step": 5600 }, { "epoch": 5.19, "grad_norm": 16.880006790161133, "learning_rate": 4.707161988428495e-05, "loss": 0.3184, "step": 5700 }, { "epoch": 5.28, "grad_norm": 10.349203109741211, "learning_rate": 4.564616803501205e-05, "loss": 0.3417, "step": 5800 }, { "epoch": 5.37, "grad_norm": 17.491863250732422, "learning_rate": 4.4224273698364735e-05, "loss": 0.3218, "step": 5900 }, { "epoch": 5.46, "grad_norm": 9.212013244628906, "learning_rate": 4.2807098703071255e-05, "loss": 0.3143, "step": 6000 }, { "epoch": 5.55, "grad_norm": 19.431245803833008, "learning_rate": 4.1395801021689746e-05, "loss": 0.3262, "step": 6100 }, { "epoch": 5.64, "grad_norm": 16.24408531188965, "learning_rate": 3.999153382442995e-05, "loss": 0.2885, "step": 6200 }, { "epoch": 5.73, "grad_norm": 23.65041732788086, "learning_rate": 3.859544453689853e-05, "loss": 0.3555, "step": 6300 }, { "epoch": 5.82, "grad_norm": 10.8707275390625, "learning_rate": 3.7208673902538706e-05, "loss": 0.3404, "step": 6400 }, { "epoch": 5.91, "grad_norm": 18.250511169433594, "learning_rate": 3.583235505052955e-05, "loss": 0.3342, "step": 6500 }, { "epoch": 6.0, "eval_accuracy": 0.9296222664015904, "eval_loss": 0.28032296895980835, "eval_runtime": 79.2042, "eval_samples_per_second": 31.753, "eval_steps_per_second": 1.995, "step": 6594 }, { "epoch": 6.01, "grad_norm": 6.232498645782471, "learning_rate": 3.446761256990723e-05, "loss": 0.3301, "step": 6600 }, { "epoch": 6.1, "grad_norm": 19.711963653564453, "learning_rate": 3.311556159066397e-05, "loss": 0.2638, "step": 6700 }, { "epoch": 6.19, "grad_norm": 11.230116844177246, "learning_rate": 3.177730687257639e-05, "loss": 0.312, "step": 6800 }, { "epoch": 6.28, "grad_norm": 9.200843811035156, "learning_rate": 3.0453941902507177e-05, "loss": 0.3046, "step": 6900 }, { "epoch": 6.37, "grad_norm": 18.336387634277344, "learning_rate": 2.914654800091768e-05, "loss": 0.2505, "step": 7000 }, { "epoch": 6.46, "grad_norm": 4.870161533355713, "learning_rate": 2.7856193438321986e-05, "loss": 0.3136, "step": 7100 }, { "epoch": 6.55, "grad_norm": 8.88219165802002, "learning_rate": 2.6583932562403957e-05, "loss": 0.2656, "step": 7200 }, { "epoch": 6.64, "grad_norm": 12.868867874145508, "learning_rate": 2.5330804936510373e-05, "loss": 0.2714, "step": 7300 }, { "epoch": 6.73, "grad_norm": 26.35186767578125, "learning_rate": 2.409783449022475e-05, "loss": 0.2675, "step": 7400 }, { "epoch": 6.82, "grad_norm": 13.712077140808105, "learning_rate": 2.2886028682715217e-05, "loss": 0.2664, "step": 7500 }, { "epoch": 6.92, "grad_norm": 0.8319320678710938, "learning_rate": 2.169637767954048e-05, "loss": 0.2803, "step": 7600 }, { "epoch": 7.0, "eval_accuracy": 0.9339960238568589, "eval_loss": 0.26732343435287476, "eval_runtime": 70.0882, "eval_samples_per_second": 35.883, "eval_steps_per_second": 2.254, "step": 7693 }, { "epoch": 7.01, "grad_norm": 15.552393913269043, "learning_rate": 2.052985354358622e-05, "loss": 0.2109, "step": 7700 }, { "epoch": 7.1, "grad_norm": 15.977888107299805, "learning_rate": 1.9387409440793386e-05, "loss": 0.2315, "step": 7800 }, { "epoch": 7.19, "grad_norm": 12.987915992736816, "learning_rate": 1.82699788613271e-05, "loss": 0.2421, "step": 7900 }, { "epoch": 7.28, "grad_norm": 7.147055625915527, "learning_rate": 1.7178474856822456e-05, "loss": 0.2454, "step": 8000 }, { "epoch": 7.37, "grad_norm": 25.91172981262207, "learning_rate": 1.611378929433083e-05, "loss": 0.2504, "step": 8100 }, { "epoch": 7.46, "grad_norm": 12.569241523742676, "learning_rate": 1.5076792127576073e-05, "loss": 0.2338, "step": 8200 }, { "epoch": 7.55, "grad_norm": 19.739892959594727, "learning_rate": 1.4068330686115943e-05, "loss": 0.2161, "step": 8300 }, { "epoch": 7.64, "grad_norm": 9.218408584594727, "learning_rate": 1.308922898298977e-05, "loss": 0.2329, "step": 8400 }, { "epoch": 7.73, "grad_norm": 2.0763704776763916, "learning_rate": 1.2140287041418203e-05, "loss": 0.2085, "step": 8500 }, { "epoch": 7.83, "grad_norm": 11.455533981323242, "learning_rate": 1.1222280241104716e-05, "loss": 0.2561, "step": 8600 }, { "epoch": 7.92, "grad_norm": 3.408818006515503, "learning_rate": 1.0335958684673574e-05, "loss": 0.2118, "step": 8700 }, { "epoch": 8.0, "eval_accuracy": 0.9320079522862823, "eval_loss": 0.2637202739715576, "eval_runtime": 74.3067, "eval_samples_per_second": 33.846, "eval_steps_per_second": 2.126, "step": 8792 }, { "epoch": 8.01, "grad_norm": 10.710466384887695, "learning_rate": 9.482046584761495e-06, "loss": 0.2045, "step": 8800 }, { "epoch": 8.1, "grad_norm": 14.385254859924316, "learning_rate": 8.661241672264192e-06, "loss": 0.2176, "step": 8900 }, { "epoch": 8.19, "grad_norm": 10.243017196655273, "learning_rate": 7.874214626220899e-06, "loss": 0.2099, "step": 9000 }, { "epoch": 8.28, "grad_norm": 6.018984317779541, "learning_rate": 7.1216085258031414e-06, "loss": 0.1686, "step": 9100 }, { "epoch": 8.37, "grad_norm": 1.289069652557373, "learning_rate": 6.404038324855222e-06, "loss": 0.2214, "step": 9200 }, { "epoch": 8.46, "grad_norm": 2.6374850273132324, "learning_rate": 5.7220903494159316e-06, "loss": 0.1675, "step": 9300 }, { "epoch": 8.55, "grad_norm": 2.8265738487243652, "learning_rate": 5.076321818632018e-06, "loss": 0.2129, "step": 9400 }, { "epoch": 8.64, "grad_norm": 11.41331958770752, "learning_rate": 4.467260389454864e-06, "loss": 0.2097, "step": 9500 }, { "epoch": 8.74, "grad_norm": 22.140159606933594, "learning_rate": 3.895403725492402e-06, "loss": 0.2099, "step": 9600 }, { "epoch": 8.83, "grad_norm": 7.952567100524902, "learning_rate": 3.3612190903686005e-06, "loss": 0.2438, "step": 9700 }, { "epoch": 8.92, "grad_norm": 19.06670379638672, "learning_rate": 2.86514296592269e-06, "loss": 0.1706, "step": 9800 }, { "epoch": 9.0, "eval_accuracy": 0.9347912524850894, "eval_loss": 0.25745028257369995, "eval_runtime": 74.1249, "eval_samples_per_second": 33.929, "eval_steps_per_second": 2.132, "step": 9891 }, { "epoch": 9.01, "grad_norm": 16.285367965698242, "learning_rate": 2.407580695560252e-06, "loss": 0.1872, "step": 9900 }, { "epoch": 9.1, "grad_norm": 5.482550621032715, "learning_rate": 1.9889061530473986e-06, "loss": 0.1863, "step": 10000 }, { "epoch": 9.19, "grad_norm": 8.077670097351074, "learning_rate": 1.6094614370188499e-06, "loss": 0.1759, "step": 10100 }, { "epoch": 9.28, "grad_norm": 14.164857864379883, "learning_rate": 1.269556591449389e-06, "loss": 0.1768, "step": 10200 }, { "epoch": 9.37, "grad_norm": 16.639026641845703, "learning_rate": 9.694693523171927e-07, "loss": 0.2211, "step": 10300 }, { "epoch": 9.46, "grad_norm": 13.088668823242188, "learning_rate": 7.094449206659748e-07, "loss": 0.1891, "step": 10400 }, { "epoch": 9.55, "grad_norm": 7.857825756072998, "learning_rate": 4.896957622514298e-07, "loss": 0.2362, "step": 10500 }, { "epoch": 9.65, "grad_norm": 16.95376968383789, "learning_rate": 3.104014339355921e-07, "loss": 0.2201, "step": 10600 }, { "epoch": 9.74, "grad_norm": 11.055394172668457, "learning_rate": 1.7170843697111304e-07, "loss": 0.2139, "step": 10700 }, { "epoch": 9.83, "grad_norm": 8.469569206237793, "learning_rate": 7.37300972951771e-08, "loss": 0.2026, "step": 10800 }, { "epoch": 9.92, "grad_norm": 18.598886489868164, "learning_rate": 1.654647293098388e-08, "loss": 0.1995, "step": 10900 }, { "epoch": 10.0, "eval_accuracy": 0.9375745526838967, "eval_loss": 0.25531044602394104, "eval_runtime": 73.7798, "eval_samples_per_second": 34.088, "eval_steps_per_second": 2.142, "step": 10990 }, { "epoch": 10.0, "step": 10990, "total_flos": 1.301428412334932e+19, "train_loss": 0.4412916103637251, "train_runtime": 10918.1842, "train_samples_per_second": 16.102, "train_steps_per_second": 1.007 } ], "logging_steps": 100, "max_steps": 10990, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 500, "total_flos": 1.301428412334932e+19, "train_batch_size": 16, "trial_name": null, "trial_params": null }