|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 2.9978431433840766, |
|
"eval_steps": 500, |
|
"global_step": 2997, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.0500140664561908, |
|
"grad_norm": 0.9210330247879028, |
|
"learning_rate": 3.3333333333333333e-06, |
|
"loss": 1.561, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.1000281329123816, |
|
"grad_norm": 0.6426990032196045, |
|
"learning_rate": 6.666666666666667e-06, |
|
"loss": 0.9595, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.1500421993685724, |
|
"grad_norm": 0.5717326998710632, |
|
"learning_rate": 1e-05, |
|
"loss": 0.6943, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.2000562658247632, |
|
"grad_norm": 0.4604141414165497, |
|
"learning_rate": 1.3333333333333333e-05, |
|
"loss": 0.6171, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.25007033228095404, |
|
"grad_norm": 0.45251232385635376, |
|
"learning_rate": 1.6666666666666667e-05, |
|
"loss": 0.5517, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.3000843987371448, |
|
"grad_norm": 0.5194812417030334, |
|
"learning_rate": 2e-05, |
|
"loss": 0.523, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.35009846519333565, |
|
"grad_norm": 0.4478510618209839, |
|
"learning_rate": 1.9983043934122208e-05, |
|
"loss": 0.4911, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.4001125316495264, |
|
"grad_norm": 0.3697112500667572, |
|
"learning_rate": 1.9932233238122834e-05, |
|
"loss": 0.4631, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.45012659810571726, |
|
"grad_norm": 0.4671821892261505, |
|
"learning_rate": 1.984774022190361e-05, |
|
"loss": 0.4416, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.5001406645619081, |
|
"grad_norm": 0.3720390498638153, |
|
"learning_rate": 1.972985141929439e-05, |
|
"loss": 0.4219, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.5501547310180989, |
|
"grad_norm": 0.3556453287601471, |
|
"learning_rate": 1.9578966616355823e-05, |
|
"loss": 0.4144, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 0.6001687974742896, |
|
"grad_norm": 0.34724870324134827, |
|
"learning_rate": 1.9395597495619634e-05, |
|
"loss": 0.4129, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.6501828639304804, |
|
"grad_norm": 0.35355138778686523, |
|
"learning_rate": 1.918036590086405e-05, |
|
"loss": 0.3979, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 0.7001969303866713, |
|
"grad_norm": 0.44766274094581604, |
|
"learning_rate": 1.8934001728309003e-05, |
|
"loss": 0.3843, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.7502109968428621, |
|
"grad_norm": 0.3572178781032562, |
|
"learning_rate": 1.865734045138245e-05, |
|
"loss": 0.3854, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 0.8002250632990529, |
|
"grad_norm": 0.37280118465423584, |
|
"learning_rate": 1.8351320287451865e-05, |
|
"loss": 0.3699, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.8502391297552436, |
|
"grad_norm": 0.3944127559661865, |
|
"learning_rate": 1.8016979016129164e-05, |
|
"loss": 0.3646, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 0.9002531962114345, |
|
"grad_norm": 0.32988861203193665, |
|
"learning_rate": 1.7655450459938786e-05, |
|
"loss": 0.3537, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 0.9502672626676253, |
|
"grad_norm": 0.31536394357681274, |
|
"learning_rate": 1.726796063928382e-05, |
|
"loss": 0.3514, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 1.0002813291238162, |
|
"grad_norm": 0.3472963869571686, |
|
"learning_rate": 1.6855823614749474e-05, |
|
"loss": 0.3489, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 1.050295395580007, |
|
"grad_norm": 0.357003778219223, |
|
"learning_rate": 1.6420437030843482e-05, |
|
"loss": 0.3187, |
|
"step": 1050 |
|
}, |
|
{ |
|
"epoch": 1.1003094620361977, |
|
"grad_norm": 0.31990480422973633, |
|
"learning_rate": 1.5963277376285646e-05, |
|
"loss": 0.3117, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 1.1503235284923885, |
|
"grad_norm": 0.3657567501068115, |
|
"learning_rate": 1.5485894976919836e-05, |
|
"loss": 0.3089, |
|
"step": 1150 |
|
}, |
|
{ |
|
"epoch": 1.2003375949485793, |
|
"grad_norm": 0.3026184141635895, |
|
"learning_rate": 1.4989908738228567e-05, |
|
"loss": 0.3005, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 1.25035166140477, |
|
"grad_norm": 0.3388034999370575, |
|
"learning_rate": 1.4477000655279376e-05, |
|
"loss": 0.3042, |
|
"step": 1250 |
|
}, |
|
{ |
|
"epoch": 1.3003657278609608, |
|
"grad_norm": 0.3384702205657959, |
|
"learning_rate": 1.394891010872102e-05, |
|
"loss": 0.3057, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 1.3503797943171518, |
|
"grad_norm": 0.3673734664916992, |
|
"learning_rate": 1.3407427966172866e-05, |
|
"loss": 0.2958, |
|
"step": 1350 |
|
}, |
|
{ |
|
"epoch": 1.4003938607733426, |
|
"grad_norm": 0.35079866647720337, |
|
"learning_rate": 1.2854390509011061e-05, |
|
"loss": 0.298, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 1.4504079272295334, |
|
"grad_norm": 0.3334648013114929, |
|
"learning_rate": 1.2291673205146908e-05, |
|
"loss": 0.3008, |
|
"step": 1450 |
|
}, |
|
{ |
|
"epoch": 1.5004219936857242, |
|
"grad_norm": 0.3893981873989105, |
|
"learning_rate": 1.1721184348915384e-05, |
|
"loss": 0.2917, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 1.550436060141915, |
|
"grad_norm": 0.31972965598106384, |
|
"learning_rate": 1.1144858589642251e-05, |
|
"loss": 0.2926, |
|
"step": 1550 |
|
}, |
|
{ |
|
"epoch": 1.6004501265981057, |
|
"grad_norm": 0.31529781222343445, |
|
"learning_rate": 1.0564650370835772e-05, |
|
"loss": 0.2876, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 1.6504641930542965, |
|
"grad_norm": 0.3786483705043793, |
|
"learning_rate": 9.982527302252135e-06, |
|
"loss": 0.2904, |
|
"step": 1650 |
|
}, |
|
{ |
|
"epoch": 1.7004782595104873, |
|
"grad_norm": 0.33780983090400696, |
|
"learning_rate": 9.40046348731131e-06, |
|
"loss": 0.2906, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 1.750492325966678, |
|
"grad_norm": 0.3387848734855652, |
|
"learning_rate": 8.820432828491542e-06, |
|
"loss": 0.2833, |
|
"step": 1750 |
|
}, |
|
{ |
|
"epoch": 1.8005063924228688, |
|
"grad_norm": 0.280521422624588, |
|
"learning_rate": 8.244402333405252e-06, |
|
"loss": 0.2779, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 1.8505204588790596, |
|
"grad_norm": 0.3127559721469879, |
|
"learning_rate": 7.674325444256899e-06, |
|
"loss": 0.2898, |
|
"step": 1850 |
|
}, |
|
{ |
|
"epoch": 1.9005345253352506, |
|
"grad_norm": 0.30676454305648804, |
|
"learning_rate": 7.112135413304042e-06, |
|
"loss": 0.2813, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 1.9505485917914414, |
|
"grad_norm": 0.31918200850486755, |
|
"learning_rate": 6.55973874678682e-06, |
|
"loss": 0.2777, |
|
"step": 1950 |
|
}, |
|
{ |
|
"epoch": 2.0005626582476324, |
|
"grad_norm": 0.32422205805778503, |
|
"learning_rate": 6.0190087395588596e-06, |
|
"loss": 0.2767, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 2.050576724703823, |
|
"grad_norm": 0.31339433789253235, |
|
"learning_rate": 5.491779122345093e-06, |
|
"loss": 0.2571, |
|
"step": 2050 |
|
}, |
|
{ |
|
"epoch": 2.100590791160014, |
|
"grad_norm": 0.342579185962677, |
|
"learning_rate": 4.979837843169959e-06, |
|
"loss": 0.2516, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 2.1506048576162047, |
|
"grad_norm": 0.38084590435028076, |
|
"learning_rate": 4.484921004044509e-06, |
|
"loss": 0.2536, |
|
"step": 2150 |
|
}, |
|
{ |
|
"epoch": 2.2006189240723955, |
|
"grad_norm": 0.3659977316856384, |
|
"learning_rate": 4.008706973474391e-06, |
|
"loss": 0.2508, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 2.2506329905285862, |
|
"grad_norm": 0.3062564730644226, |
|
"learning_rate": 3.5528106947544626e-06, |
|
"loss": 0.2501, |
|
"step": 2250 |
|
}, |
|
{ |
|
"epoch": 2.300647056984777, |
|
"grad_norm": 0.31410741806030273, |
|
"learning_rate": 3.118778209351808e-06, |
|
"loss": 0.2555, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 2.350661123440968, |
|
"grad_norm": 0.3308572769165039, |
|
"learning_rate": 2.7080814139495402e-06, |
|
"loss": 0.2519, |
|
"step": 2350 |
|
}, |
|
{ |
|
"epoch": 2.4006751898971586, |
|
"grad_norm": 0.32767045497894287, |
|
"learning_rate": 2.322113068931391e-06, |
|
"loss": 0.2496, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 2.4506892563533493, |
|
"grad_norm": 0.302573025226593, |
|
"learning_rate": 1.9621820752343324e-06, |
|
"loss": 0.245, |
|
"step": 2450 |
|
}, |
|
{ |
|
"epoch": 2.50070332280954, |
|
"grad_norm": 0.29837408661842346, |
|
"learning_rate": 1.629509035586484e-06, |
|
"loss": 0.2532, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 2.550717389265731, |
|
"grad_norm": 0.3348439931869507, |
|
"learning_rate": 1.3252221151830513e-06, |
|
"loss": 0.2457, |
|
"step": 2550 |
|
}, |
|
{ |
|
"epoch": 2.6007314557219217, |
|
"grad_norm": 0.37546366453170776, |
|
"learning_rate": 1.0503532158376584e-06, |
|
"loss": 0.2475, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 2.6507455221781124, |
|
"grad_norm": 0.2989293038845062, |
|
"learning_rate": 8.058344765833171e-07, |
|
"loss": 0.246, |
|
"step": 2650 |
|
}, |
|
{ |
|
"epoch": 2.7007595886343037, |
|
"grad_norm": 0.32395079731941223, |
|
"learning_rate": 5.924951125902545e-07, |
|
"loss": 0.2495, |
|
"step": 2700 |
|
}, |
|
{ |
|
"epoch": 2.7507736550904944, |
|
"grad_norm": 0.31438368558883667, |
|
"learning_rate": 4.11058603120511e-07, |
|
"loss": 0.2476, |
|
"step": 2750 |
|
}, |
|
{ |
|
"epoch": 2.800787721546685, |
|
"grad_norm": 0.30735209584236145, |
|
"learning_rate": 2.6214023805552826e-07, |
|
"loss": 0.2481, |
|
"step": 2800 |
|
}, |
|
{ |
|
"epoch": 2.850801788002876, |
|
"grad_norm": 0.3182600736618042, |
|
"learning_rate": 1.462450313169983e-07, |
|
"loss": 0.245, |
|
"step": 2850 |
|
}, |
|
{ |
|
"epoch": 2.9008158544590668, |
|
"grad_norm": 0.3001386523246765, |
|
"learning_rate": 6.376600825699463e-08, |
|
"loss": 0.2459, |
|
"step": 2900 |
|
}, |
|
{ |
|
"epoch": 2.9508299209152575, |
|
"grad_norm": 0.3104873299598694, |
|
"learning_rate": 1.49828728252277e-08, |
|
"loss": 0.2419, |
|
"step": 2950 |
|
}, |
|
{ |
|
"epoch": 2.9978431433840766, |
|
"step": 2997, |
|
"total_flos": 2.5671411434264986e+17, |
|
"train_loss": 0.35637976608556393, |
|
"train_runtime": 44055.9239, |
|
"train_samples_per_second": 4.357, |
|
"train_steps_per_second": 0.068 |
|
} |
|
], |
|
"logging_steps": 50, |
|
"max_steps": 2997, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 3, |
|
"save_steps": 100, |
|
"total_flos": 2.5671411434264986e+17, |
|
"train_batch_size": 2, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|