{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.9978431433840766, "eval_steps": 500, "global_step": 2997, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.001000281329123816, "grad_norm": 77.5, "learning_rate": 6.666666666666668e-08, "loss": 2.1336, "step": 1 }, { "epoch": 0.0500140664561908, "grad_norm": 9.0625, "learning_rate": 3.3333333333333333e-06, "loss": 1.2593, "step": 50 }, { "epoch": 0.1000281329123816, "grad_norm": 4.1875, "learning_rate": 6.666666666666667e-06, "loss": 0.6353, "step": 100 }, { "epoch": 0.1500421993685724, "grad_norm": 3.4375, "learning_rate": 1e-05, "loss": 0.5783, "step": 150 }, { "epoch": 0.2000562658247632, "grad_norm": 3.09375, "learning_rate": 1.3333333333333333e-05, "loss": 0.5614, "step": 200 }, { "epoch": 0.25007033228095404, "grad_norm": 3.421875, "learning_rate": 1.6666666666666667e-05, "loss": 0.6185, "step": 250 }, { "epoch": 0.3000843987371448, "grad_norm": 7.5, "learning_rate": 2e-05, "loss": 0.6262, "step": 300 }, { "epoch": 0.35009846519333565, "grad_norm": 2.421875, "learning_rate": 1.9983043934122208e-05, "loss": 0.5878, "step": 350 }, { "epoch": 0.4001125316495264, "grad_norm": 3.171875, "learning_rate": 1.9932233238122834e-05, "loss": 0.575, "step": 400 }, { "epoch": 0.45012659810571726, "grad_norm": 2.0, "learning_rate": 1.984774022190361e-05, "loss": 0.5526, "step": 450 }, { "epoch": 0.5001406645619081, "grad_norm": 1.8203125, "learning_rate": 1.972985141929439e-05, "loss": 0.54, "step": 500 }, { "epoch": 0.5501547310180989, "grad_norm": 2.03125, "learning_rate": 1.9578966616355823e-05, "loss": 0.527, "step": 550 }, { "epoch": 0.6001687974742896, "grad_norm": 2.046875, "learning_rate": 1.9395597495619634e-05, "loss": 0.5229, "step": 600 }, { "epoch": 0.6501828639304804, "grad_norm": 1.734375, "learning_rate": 1.918036590086405e-05, "loss": 0.5062, "step": 650 }, { "epoch": 0.7001969303866713, "grad_norm": 1.578125, "learning_rate": 1.8934001728309003e-05, "loss": 0.5055, "step": 700 }, { "epoch": 0.7502109968428621, "grad_norm": 1.7265625, "learning_rate": 1.865734045138245e-05, "loss": 0.4947, "step": 750 }, { "epoch": 0.8002250632990529, "grad_norm": 1.5390625, "learning_rate": 1.8351320287451865e-05, "loss": 0.491, "step": 800 }, { "epoch": 0.8502391297552436, "grad_norm": 1.4765625, "learning_rate": 1.8016979016129164e-05, "loss": 0.4824, "step": 850 }, { "epoch": 0.9002531962114345, "grad_norm": 1.53125, "learning_rate": 1.7655450459938786e-05, "loss": 0.4738, "step": 900 }, { "epoch": 0.9502672626676253, "grad_norm": 1.65625, "learning_rate": 1.726796063928382e-05, "loss": 0.4676, "step": 950 }, { "epoch": 1.0002813291238162, "grad_norm": 1.40625, "learning_rate": 1.6855823614749474e-05, "loss": 0.4657, "step": 1000 }, { "epoch": 1.050295395580007, "grad_norm": 1.40625, "learning_rate": 1.6420437030843482e-05, "loss": 0.3223, "step": 1050 }, { "epoch": 1.1003094620361977, "grad_norm": 1.3359375, "learning_rate": 1.5963277376285646e-05, "loss": 0.3197, "step": 1100 }, { "epoch": 1.1503235284923885, "grad_norm": 1.2890625, "learning_rate": 1.5485894976919836e-05, "loss": 0.3246, "step": 1150 }, { "epoch": 1.2003375949485793, "grad_norm": 1.421875, "learning_rate": 1.4989908738228567e-05, "loss": 0.3167, "step": 1200 }, { "epoch": 1.25035166140477, "grad_norm": 1.3828125, "learning_rate": 1.4477000655279376e-05, "loss": 0.3186, "step": 1250 }, { "epoch": 1.3003657278609608, "grad_norm": 1.3515625, "learning_rate": 1.394891010872102e-05, "loss": 0.3149, "step": 1300 }, { "epoch": 1.3503797943171518, "grad_norm": 1.3828125, "learning_rate": 1.3407427966172866e-05, "loss": 0.3164, "step": 1350 }, { "epoch": 1.4003938607733426, "grad_norm": 1.34375, "learning_rate": 1.2854390509011061e-05, "loss": 0.313, "step": 1400 }, { "epoch": 1.4504079272295334, "grad_norm": 1.390625, "learning_rate": 1.2291673205146908e-05, "loss": 0.3071, "step": 1450 }, { "epoch": 1.5004219936857242, "grad_norm": 1.328125, "learning_rate": 1.1721184348915384e-05, "loss": 0.3063, "step": 1500 }, { "epoch": 1.550436060141915, "grad_norm": 1.2890625, "learning_rate": 1.1144858589642251e-05, "loss": 0.3022, "step": 1550 }, { "epoch": 1.6004501265981057, "grad_norm": 1.3359375, "learning_rate": 1.0564650370835772e-05, "loss": 0.3006, "step": 1600 }, { "epoch": 1.6504641930542965, "grad_norm": 1.3359375, "learning_rate": 9.982527302252135e-06, "loss": 0.2998, "step": 1650 }, { "epoch": 1.7004782595104873, "grad_norm": 1.359375, "learning_rate": 9.40046348731131e-06, "loss": 0.2947, "step": 1700 }, { "epoch": 1.750492325966678, "grad_norm": 1.34375, "learning_rate": 8.820432828491542e-06, "loss": 0.294, "step": 1750 }, { "epoch": 1.8005063924228688, "grad_norm": 1.375, "learning_rate": 8.244402333405252e-06, "loss": 0.2894, "step": 1800 }, { "epoch": 1.8505204588790596, "grad_norm": 1.375, "learning_rate": 7.674325444256899e-06, "loss": 0.2879, "step": 1850 }, { "epoch": 1.9005345253352506, "grad_norm": 1.3671875, "learning_rate": 7.112135413304042e-06, "loss": 0.2842, "step": 1900 }, { "epoch": 1.9505485917914414, "grad_norm": 1.3125, "learning_rate": 6.55973874678682e-06, "loss": 0.2839, "step": 1950 }, { "epoch": 2.0005626582476324, "grad_norm": 1.1171875, "learning_rate": 6.0190087395588596e-06, "loss": 0.2766, "step": 2000 }, { "epoch": 2.050576724703823, "grad_norm": 1.3046875, "learning_rate": 5.491779122345093e-06, "loss": 0.1517, "step": 2050 }, { "epoch": 2.100590791160014, "grad_norm": 1.1953125, "learning_rate": 4.979837843169959e-06, "loss": 0.1492, "step": 2100 }, { "epoch": 2.1506048576162047, "grad_norm": 1.203125, "learning_rate": 4.484921004044509e-06, "loss": 0.1494, "step": 2150 }, { "epoch": 2.2006189240723955, "grad_norm": 1.1484375, "learning_rate": 4.008706973474391e-06, "loss": 0.1498, "step": 2200 }, { "epoch": 2.2506329905285862, "grad_norm": 1.1328125, "learning_rate": 3.5528106947544626e-06, "loss": 0.1482, "step": 2250 }, { "epoch": 2.300647056984777, "grad_norm": 1.1796875, "learning_rate": 3.118778209351808e-06, "loss": 0.1477, "step": 2300 }, { "epoch": 2.350661123440968, "grad_norm": 1.234375, "learning_rate": 2.7080814139495402e-06, "loss": 0.1473, "step": 2350 }, { "epoch": 2.4006751898971586, "grad_norm": 1.2421875, "learning_rate": 2.322113068931391e-06, "loss": 0.147, "step": 2400 }, { "epoch": 2.4506892563533493, "grad_norm": 1.1875, "learning_rate": 1.9621820752343324e-06, "loss": 0.1466, "step": 2450 }, { "epoch": 2.50070332280954, "grad_norm": 1.21875, "learning_rate": 1.629509035586484e-06, "loss": 0.145, "step": 2500 }, { "epoch": 2.550717389265731, "grad_norm": 1.4140625, "learning_rate": 1.3252221151830513e-06, "loss": 0.146, "step": 2550 }, { "epoch": 2.6007314557219217, "grad_norm": 1.15625, "learning_rate": 1.0503532158376584e-06, "loss": 0.1453, "step": 2600 }, { "epoch": 2.6507455221781124, "grad_norm": 1.203125, "learning_rate": 8.058344765833171e-07, "loss": 0.1466, "step": 2650 }, { "epoch": 2.7007595886343037, "grad_norm": 1.2734375, "learning_rate": 5.924951125902545e-07, "loss": 0.1458, "step": 2700 }, { "epoch": 2.7507736550904944, "grad_norm": 1.1640625, "learning_rate": 4.11058603120511e-07, "loss": 0.1445, "step": 2750 }, { "epoch": 2.800787721546685, "grad_norm": 1.1875, "learning_rate": 2.6214023805552826e-07, "loss": 0.1449, "step": 2800 }, { "epoch": 2.850801788002876, "grad_norm": 1.390625, "learning_rate": 1.462450313169983e-07, "loss": 0.1457, "step": 2850 }, { "epoch": 2.9008158544590668, "grad_norm": 1.2265625, "learning_rate": 6.376600825699463e-08, "loss": 0.1444, "step": 2900 }, { "epoch": 2.9508299209152575, "grad_norm": 1.171875, "learning_rate": 1.49828728252277e-08, "loss": 0.1451, "step": 2950 }, { "epoch": 2.9978431433840766, "step": 2997, "total_flos": 1.9278929080237425e+18, "train_loss": 0.3418351112304626, "train_runtime": 31635.6273, "train_samples_per_second": 6.067, "train_steps_per_second": 0.095 } ], "logging_steps": 50, "max_steps": 2997, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 100, "total_flos": 1.9278929080237425e+18, "train_batch_size": 2, "trial_name": null, "trial_params": null }