|
{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 5.254170497832654,
  "eval_steps": 500,
  "global_step": 10000,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.052541704978326546,
      "grad_norm": 0.9754809141159058,
      "learning_rate": 5e-06,
      "loss": 1.2583,
      "step": 100
    },
    {
      "epoch": 0.10508340995665309,
      "grad_norm": 0.6179457902908325,
      "learning_rate": 1e-05,
      "loss": 1.0217,
      "step": 200
    },
    {
      "epoch": 0.15762511493497963,
      "grad_norm": 0.6895671486854553,
      "learning_rate": 1.5e-05,
      "loss": 0.8382,
      "step": 300
    },
    {
      "epoch": 0.21016681991330619,
      "grad_norm": 1.0476884841918945,
      "learning_rate": 2e-05,
      "loss": 0.7851,
      "step": 400
    },
    {
      "epoch": 0.2627085248916327,
      "grad_norm": 0.8948929905891418,
      "learning_rate": 2.5e-05,
      "loss": 0.7133,
      "step": 500
    },
    {
      "epoch": 0.2627085248916327,
      "eval_loss": 0.6265314221382141,
      "eval_runtime": 1.2508,
      "eval_samples_per_second": 7.995,
      "eval_steps_per_second": 3.997,
      "step": 500
    },
    {
      "epoch": 0.31525022986995926,
      "grad_norm": 1.1585056781768799,
      "learning_rate": 3e-05,
      "loss": 0.6821,
      "step": 600
    },
    {
      "epoch": 0.3677919348482858,
      "grad_norm": 0.838334321975708,
      "learning_rate": 3.5e-05,
      "loss": 0.6574,
      "step": 700
    },
    {
      "epoch": 0.42033363982661237,
      "grad_norm": 1.056951642036438,
      "learning_rate": 4e-05,
      "loss": 0.6456,
      "step": 800
    },
    {
      "epoch": 0.4728753448049389,
      "grad_norm": 1.043925404548645,
      "learning_rate": 4.5e-05,
      "loss": 0.6396,
      "step": 900
    },
    {
      "epoch": 0.5254170497832654,
      "grad_norm": 1.2812925577163696,
      "learning_rate": 5e-05,
      "loss": 0.6351,
      "step": 1000
    },
    {
      "epoch": 0.5254170497832654,
      "eval_loss": 0.5569401979446411,
      "eval_runtime": 1.2148,
      "eval_samples_per_second": 8.232,
      "eval_steps_per_second": 4.116,
      "step": 1000
    },
    {
      "epoch": 0.577958754761592,
      "grad_norm": 1.3376405239105225,
      "learning_rate": 4.9444444444444446e-05,
      "loss": 0.6051,
      "step": 1100
    },
    {
      "epoch": 0.6305004597399185,
      "grad_norm": 1.3639925718307495,
      "learning_rate": 4.888888888888889e-05,
      "loss": 0.6182,
      "step": 1200
    },
    {
      "epoch": 0.6830421647182451,
      "grad_norm": 1.4436575174331665,
      "learning_rate": 4.8333333333333334e-05,
      "loss": 0.6145,
      "step": 1300
    },
    {
      "epoch": 0.7355838696965716,
      "grad_norm": 2.0778963565826416,
      "learning_rate": 4.7777777777777784e-05,
      "loss": 0.5879,
      "step": 1400
    },
    {
      "epoch": 0.7881255746748982,
      "grad_norm": 1.6049619913101196,
      "learning_rate": 4.722222222222222e-05,
      "loss": 0.5851,
      "step": 1500
    },
    {
      "epoch": 0.7881255746748982,
      "eval_loss": 0.5357321500778198,
      "eval_runtime": 1.2208,
      "eval_samples_per_second": 8.192,
      "eval_steps_per_second": 4.096,
      "step": 1500
    },
    {
      "epoch": 0.8406672796532247,
      "grad_norm": 1.5120118856430054,
      "learning_rate": 4.666666666666667e-05,
      "loss": 0.5693,
      "step": 1600
    },
    {
      "epoch": 0.8932089846315513,
      "grad_norm": 3.5921452045440674,
      "learning_rate": 4.6111111111111115e-05,
      "loss": 0.579,
      "step": 1700
    },
    {
      "epoch": 0.9457506896098778,
      "grad_norm": 0.9748013615608215,
      "learning_rate": 4.555555555555556e-05,
      "loss": 0.5686,
      "step": 1800
    },
    {
      "epoch": 0.9982923945882044,
      "grad_norm": 0.8585628271102905,
      "learning_rate": 4.5e-05,
      "loss": 0.5688,
      "step": 1900
    },
    {
      "epoch": 1.0508340995665308,
      "grad_norm": 3.2674453258514404,
      "learning_rate": 4.4444444444444447e-05,
      "loss": 0.5521,
      "step": 2000
    },
    {
      "epoch": 1.0508340995665308,
      "eval_loss": 0.5279057621955872,
      "eval_runtime": 1.2064,
      "eval_samples_per_second": 8.289,
      "eval_steps_per_second": 4.145,
      "step": 2000
    },
    {
      "epoch": 1.1033758045448574,
      "grad_norm": 1.5048810243606567,
      "learning_rate": 4.388888888888889e-05,
      "loss": 0.5315,
      "step": 2100
    },
    {
      "epoch": 1.155917509523184,
      "grad_norm": 1.1342097520828247,
      "learning_rate": 4.3333333333333334e-05,
      "loss": 0.53,
      "step": 2200
    },
    {
      "epoch": 1.2084592145015105,
      "grad_norm": 1.418641448020935,
      "learning_rate": 4.277777777777778e-05,
      "loss": 0.5345,
      "step": 2300
    },
    {
      "epoch": 1.261000919479837,
      "grad_norm": 1.1525490283966064,
      "learning_rate": 4.222222222222222e-05,
      "loss": 0.5364,
      "step": 2400
    },
    {
      "epoch": 1.3135426244581636,
      "grad_norm": 1.1001209020614624,
      "learning_rate": 4.166666666666667e-05,
      "loss": 0.5365,
      "step": 2500
    },
    {
      "epoch": 1.3135426244581636,
      "eval_loss": 0.5235944986343384,
      "eval_runtime": 1.2217,
      "eval_samples_per_second": 8.186,
      "eval_steps_per_second": 4.093,
      "step": 2500
    },
    {
      "epoch": 1.3660843294364902,
      "grad_norm": 1.8745806217193604,
      "learning_rate": 4.111111111111111e-05,
      "loss": 0.5228,
      "step": 2600
    },
    {
      "epoch": 1.4186260344148167,
      "grad_norm": 1.1493146419525146,
      "learning_rate": 4.055555555555556e-05,
      "loss": 0.5244,
      "step": 2700
    },
    {
      "epoch": 1.4711677393931433,
      "grad_norm": 2.0443015098571777,
      "learning_rate": 4e-05,
      "loss": 0.5321,
      "step": 2800
    },
    {
      "epoch": 1.5237094443714698,
      "grad_norm": 2.053090810775757,
      "learning_rate": 3.944444444444445e-05,
      "loss": 0.5221,
      "step": 2900
    },
    {
      "epoch": 1.5762511493497964,
      "grad_norm": 2.0530471801757812,
      "learning_rate": 3.888888888888889e-05,
      "loss": 0.5192,
      "step": 3000
    },
    {
      "epoch": 1.5762511493497964,
      "eval_loss": 0.5113556981086731,
      "eval_runtime": 1.221,
      "eval_samples_per_second": 8.19,
      "eval_steps_per_second": 4.095,
      "step": 3000
    },
    {
      "epoch": 1.628792854328123,
      "grad_norm": 1.469416856765747,
      "learning_rate": 3.8333333333333334e-05,
      "loss": 0.5312,
      "step": 3100
    },
    {
      "epoch": 1.6813345593064495,
      "grad_norm": 2.854994535446167,
      "learning_rate": 3.777777777777778e-05,
      "loss": 0.5223,
      "step": 3200
    },
    {
      "epoch": 1.733876264284776,
      "grad_norm": 1.2286680936813354,
      "learning_rate": 3.722222222222222e-05,
      "loss": 0.5235,
      "step": 3300
    },
    {
      "epoch": 1.7864179692631026,
      "grad_norm": 1.3007181882858276,
      "learning_rate": 3.6666666666666666e-05,
      "loss": 0.5086,
      "step": 3400
    },
    {
      "epoch": 1.8389596742414291,
      "grad_norm": 2.007951498031616,
      "learning_rate": 3.611111111111111e-05,
      "loss": 0.5241,
      "step": 3500
    },
    {
      "epoch": 1.8389596742414291,
      "eval_loss": 0.5189292430877686,
      "eval_runtime": 1.2074,
      "eval_samples_per_second": 8.282,
      "eval_steps_per_second": 4.141,
      "step": 3500
    },
    {
      "epoch": 1.8915013792197557,
      "grad_norm": 1.0643136501312256,
      "learning_rate": 3.555555555555556e-05,
      "loss": 0.504,
      "step": 3600
    },
    {
      "epoch": 1.9440430841980822,
      "grad_norm": 1.4835437536239624,
      "learning_rate": 3.5e-05,
      "loss": 0.5161,
      "step": 3700
    },
    {
      "epoch": 1.9965847891764088,
      "grad_norm": 1.4298534393310547,
      "learning_rate": 3.444444444444445e-05,
      "loss": 0.5095,
      "step": 3800
    },
    {
      "epoch": 2.049126494154735,
      "grad_norm": 1.0783276557922363,
      "learning_rate": 3.388888888888889e-05,
      "loss": 0.4978,
      "step": 3900
    },
    {
      "epoch": 2.1016681991330617,
      "grad_norm": 2.3203086853027344,
      "learning_rate": 3.3333333333333335e-05,
      "loss": 0.4854,
      "step": 4000
    },
    {
      "epoch": 2.1016681991330617,
      "eval_loss": 0.5047374963760376,
      "eval_runtime": 1.2234,
      "eval_samples_per_second": 8.174,
      "eval_steps_per_second": 4.087,
      "step": 4000
    },
    {
      "epoch": 2.1542099041113882,
      "grad_norm": 1.2583634853363037,
      "learning_rate": 3.277777777777778e-05,
      "loss": 0.4934,
      "step": 4100
    },
    {
      "epoch": 2.206751609089715,
      "grad_norm": 2.2736079692840576,
      "learning_rate": 3.222222222222223e-05,
      "loss": 0.4895,
      "step": 4200
    },
    {
      "epoch": 2.2592933140680413,
      "grad_norm": 2.931607484817505,
      "learning_rate": 3.1666666666666666e-05,
      "loss": 0.489,
      "step": 4300
    },
    {
      "epoch": 2.311835019046368,
      "grad_norm": 1.2253628969192505,
      "learning_rate": 3.111111111111111e-05,
      "loss": 0.4868,
      "step": 4400
    },
    {
      "epoch": 2.3643767240246945,
      "grad_norm": 1.2433133125305176,
      "learning_rate": 3.055555555555556e-05,
      "loss": 0.4926,
      "step": 4500
    },
    {
      "epoch": 2.3643767240246945,
      "eval_loss": 0.5110575556755066,
      "eval_runtime": 1.2952,
      "eval_samples_per_second": 7.721,
      "eval_steps_per_second": 3.86,
      "step": 4500
    },
    {
      "epoch": 2.416918429003021,
      "grad_norm": 1.1074525117874146,
      "learning_rate": 3e-05,
      "loss": 0.4918,
      "step": 4600
    },
    {
      "epoch": 2.4694601339813476,
      "grad_norm": 1.3761866092681885,
      "learning_rate": 2.9444444444444448e-05,
      "loss": 0.4915,
      "step": 4700
    },
    {
      "epoch": 2.522001838959674,
      "grad_norm": 2.135338068008423,
      "learning_rate": 2.8888888888888888e-05,
      "loss": 0.483,
      "step": 4800
    },
    {
      "epoch": 2.5745435439380007,
      "grad_norm": 1.0095371007919312,
      "learning_rate": 2.8333333333333335e-05,
      "loss": 0.4918,
      "step": 4900
    },
    {
      "epoch": 2.627085248916327,
      "grad_norm": 0.9606950879096985,
      "learning_rate": 2.777777777777778e-05,
      "loss": 0.486,
      "step": 5000
    },
    {
      "epoch": 2.627085248916327,
      "eval_loss": 0.5079401731491089,
      "eval_runtime": 1.2665,
      "eval_samples_per_second": 7.896,
      "eval_steps_per_second": 3.948,
      "step": 5000
    },
    {
      "epoch": 2.6796269538946538,
      "grad_norm": 1.0660576820373535,
      "learning_rate": 2.7222222222222223e-05,
      "loss": 0.4922,
      "step": 5100
    },
    {
      "epoch": 2.7321686588729803,
      "grad_norm": 2.1245779991149902,
      "learning_rate": 2.6666666666666667e-05,
      "loss": 0.4933,
      "step": 5200
    },
    {
      "epoch": 2.784710363851307,
      "grad_norm": 1.7832878828048706,
      "learning_rate": 2.6111111111111114e-05,
      "loss": 0.4794,
      "step": 5300
    },
    {
      "epoch": 2.8372520688296334,
      "grad_norm": 2.381094455718994,
      "learning_rate": 2.5555555555555554e-05,
      "loss": 0.4852,
      "step": 5400
    },
    {
      "epoch": 2.88979377380796,
      "grad_norm": 1.01780104637146,
      "learning_rate": 2.5e-05,
      "loss": 0.4775,
      "step": 5500
    },
    {
      "epoch": 2.88979377380796,
      "eval_loss": 0.49784645438194275,
      "eval_runtime": 1.2033,
      "eval_samples_per_second": 8.31,
      "eval_steps_per_second": 4.155,
      "step": 5500
    },
    {
      "epoch": 2.9423354787862865,
      "grad_norm": 2.5046234130859375,
      "learning_rate": 2.4444444444444445e-05,
      "loss": 0.4786,
      "step": 5600
    },
    {
      "epoch": 2.994877183764613,
      "grad_norm": 2.6457679271698,
      "learning_rate": 2.3888888888888892e-05,
      "loss": 0.479,
      "step": 5700
    },
    {
      "epoch": 3.0474188887429396,
      "grad_norm": 1.8499191999435425,
      "learning_rate": 2.3333333333333336e-05,
      "loss": 0.4741,
      "step": 5800
    },
    {
      "epoch": 3.099960593721266,
      "grad_norm": 1.0821927785873413,
      "learning_rate": 2.277777777777778e-05,
      "loss": 0.4647,
      "step": 5900
    },
    {
      "epoch": 3.1525022986995928,
      "grad_norm": 1.5793670415878296,
      "learning_rate": 2.2222222222222223e-05,
      "loss": 0.483,
      "step": 6000
    },
    {
      "epoch": 3.1525022986995928,
      "eval_loss": 0.49618691205978394,
      "eval_runtime": 1.2172,
      "eval_samples_per_second": 8.216,
      "eval_steps_per_second": 4.108,
      "step": 6000
    },
    {
      "epoch": 3.2050440036779193,
      "grad_norm": 1.7674349546432495,
      "learning_rate": 2.1666666666666667e-05,
      "loss": 0.4682,
      "step": 6100
    },
    {
      "epoch": 3.257585708656246,
      "grad_norm": 1.8406447172164917,
      "learning_rate": 2.111111111111111e-05,
      "loss": 0.4658,
      "step": 6200
    },
    {
      "epoch": 3.3101274136345724,
      "grad_norm": 1.0926892757415771,
      "learning_rate": 2.0555555555555555e-05,
      "loss": 0.4676,
      "step": 6300
    },
    {
      "epoch": 3.362669118612899,
      "grad_norm": 2.300992488861084,
      "learning_rate": 2e-05,
      "loss": 0.4595,
      "step": 6400
    },
    {
      "epoch": 3.4152108235912255,
      "grad_norm": 1.148647665977478,
      "learning_rate": 1.9444444444444445e-05,
      "loss": 0.4674,
      "step": 6500
    },
    {
      "epoch": 3.4152108235912255,
      "eval_loss": 0.4925019145011902,
      "eval_runtime": 1.2002,
      "eval_samples_per_second": 8.332,
      "eval_steps_per_second": 4.166,
      "step": 6500
    },
    {
      "epoch": 3.467752528569552,
      "grad_norm": 1.1826932430267334,
      "learning_rate": 1.888888888888889e-05,
      "loss": 0.4563,
      "step": 6600
    },
    {
      "epoch": 3.5202942335478786,
      "grad_norm": 1.0678240060806274,
      "learning_rate": 1.8333333333333333e-05,
      "loss": 0.47,
      "step": 6700
    },
    {
      "epoch": 3.572835938526205,
      "grad_norm": 2.3626327514648438,
      "learning_rate": 1.777777777777778e-05,
      "loss": 0.4563,
      "step": 6800
    },
    {
      "epoch": 3.6253776435045317,
      "grad_norm": 1.1958198547363281,
      "learning_rate": 1.7222222222222224e-05,
      "loss": 0.4621,
      "step": 6900
    },
    {
      "epoch": 3.6779193484828583,
      "grad_norm": 1.61029052734375,
      "learning_rate": 1.6666666666666667e-05,
      "loss": 0.4629,
      "step": 7000
    },
    {
      "epoch": 3.6779193484828583,
      "eval_loss": 0.4930470585823059,
      "eval_runtime": 1.2074,
      "eval_samples_per_second": 8.282,
      "eval_steps_per_second": 4.141,
      "step": 7000
    },
    {
      "epoch": 3.730461053461185,
      "grad_norm": 1.0514099597930908,
      "learning_rate": 1.6111111111111115e-05,
      "loss": 0.467,
      "step": 7100
    },
    {
      "epoch": 3.7830027584395114,
      "grad_norm": 1.5292298793792725,
      "learning_rate": 1.5555555555555555e-05,
      "loss": 0.4657,
      "step": 7200
    },
    {
      "epoch": 3.835544463417838,
      "grad_norm": 1.8738545179367065,
      "learning_rate": 1.5e-05,
      "loss": 0.4741,
      "step": 7300
    },
    {
      "epoch": 3.8880861683961645,
      "grad_norm": 1.5150409936904907,
      "learning_rate": 1.4444444444444444e-05,
      "loss": 0.4598,
      "step": 7400
    },
    {
      "epoch": 3.940627873374491,
      "grad_norm": 1.3109909296035767,
      "learning_rate": 1.388888888888889e-05,
      "loss": 0.4682,
      "step": 7500
    },
    {
      "epoch": 3.940627873374491,
      "eval_loss": 0.49467793107032776,
      "eval_runtime": 1.241,
      "eval_samples_per_second": 8.058,
      "eval_steps_per_second": 4.029,
      "step": 7500
    },
    {
      "epoch": 3.9931695783528176,
      "grad_norm": 1.4036318063735962,
      "learning_rate": 1.3333333333333333e-05,
      "loss": 0.4652,
      "step": 7600
    },
    {
      "epoch": 4.045711283331144,
      "grad_norm": 1.308784008026123,
      "learning_rate": 1.2777777777777777e-05,
      "loss": 0.4535,
      "step": 7700
    },
    {
      "epoch": 4.09825298830947,
      "grad_norm": 1.1499841213226318,
      "learning_rate": 1.2222222222222222e-05,
      "loss": 0.4554,
      "step": 7800
    },
    {
      "epoch": 4.150794693287797,
      "grad_norm": 1.1128175258636475,
      "learning_rate": 1.1666666666666668e-05,
      "loss": 0.4574,
      "step": 7900
    },
    {
      "epoch": 4.203336398266123,
      "grad_norm": 1.2397664785385132,
      "learning_rate": 1.1111111111111112e-05,
      "loss": 0.4456,
      "step": 8000
    },
    {
      "epoch": 4.203336398266123,
      "eval_loss": 0.49332427978515625,
      "eval_runtime": 1.2363,
      "eval_samples_per_second": 8.089,
      "eval_steps_per_second": 4.044,
      "step": 8000
    },
    {
      "epoch": 4.25587810324445,
      "grad_norm": 1.3388928174972534,
      "learning_rate": 1.0555555555555555e-05,
      "loss": 0.4619,
      "step": 8100
    },
    {
      "epoch": 4.3084198082227765,
      "grad_norm": 0.97515869140625,
      "learning_rate": 1e-05,
      "loss": 0.4623,
      "step": 8200
    },
    {
      "epoch": 4.3609615132011035,
      "grad_norm": 1.0904083251953125,
      "learning_rate": 9.444444444444445e-06,
      "loss": 0.4456,
      "step": 8300
    },
    {
      "epoch": 4.41350321817943,
      "grad_norm": 1.3429654836654663,
      "learning_rate": 8.88888888888889e-06,
      "loss": 0.4501,
      "step": 8400
    },
    {
      "epoch": 4.466044923157757,
      "grad_norm": 1.919039249420166,
      "learning_rate": 8.333333333333334e-06,
      "loss": 0.4486,
      "step": 8500
    },
    {
      "epoch": 4.466044923157757,
      "eval_loss": 0.4914402365684509,
      "eval_runtime": 1.2335,
      "eval_samples_per_second": 8.107,
      "eval_steps_per_second": 4.053,
      "step": 8500
    },
    {
      "epoch": 4.518586628136083,
      "grad_norm": 2.0423989295959473,
      "learning_rate": 7.777777777777777e-06,
      "loss": 0.4571,
      "step": 8600
    },
    {
      "epoch": 4.57112833311441,
      "grad_norm": 1.3525835275650024,
      "learning_rate": 7.222222222222222e-06,
      "loss": 0.46,
      "step": 8700
    },
    {
      "epoch": 4.623670038092736,
      "grad_norm": 1.1251217126846313,
      "learning_rate": 6.666666666666667e-06,
      "loss": 0.4522,
      "step": 8800
    },
    {
      "epoch": 4.676211743071063,
      "grad_norm": 1.8645511865615845,
      "learning_rate": 6.111111111111111e-06,
      "loss": 0.4605,
      "step": 8900
    },
    {
      "epoch": 4.728753448049389,
      "grad_norm": 1.1539450883865356,
      "learning_rate": 5.555555555555556e-06,
      "loss": 0.4544,
      "step": 9000
    },
    {
      "epoch": 4.728753448049389,
      "eval_loss": 0.490975946187973,
      "eval_runtime": 1.2066,
      "eval_samples_per_second": 8.288,
      "eval_steps_per_second": 4.144,
      "step": 9000
    },
    {
      "epoch": 4.781295153027716,
      "grad_norm": 1.1662914752960205,
      "learning_rate": 5e-06,
      "loss": 0.4521,
      "step": 9100
    },
    {
      "epoch": 4.833836858006042,
      "grad_norm": 0.9396342635154724,
      "learning_rate": 4.444444444444445e-06,
      "loss": 0.4643,
      "step": 9200
    },
    {
      "epoch": 4.886378562984369,
      "grad_norm": 2.0316038131713867,
      "learning_rate": 3.888888888888889e-06,
      "loss": 0.4632,
      "step": 9300
    },
    {
      "epoch": 4.938920267962695,
      "grad_norm": 1.1537178754806519,
      "learning_rate": 3.3333333333333333e-06,
      "loss": 0.442,
      "step": 9400
    },
    {
      "epoch": 4.991461972941022,
      "grad_norm": 1.3441674709320068,
      "learning_rate": 2.777777777777778e-06,
      "loss": 0.4531,
      "step": 9500
    },
    {
      "epoch": 4.991461972941022,
      "eval_loss": 0.4903333783149719,
      "eval_runtime": 1.2203,
      "eval_samples_per_second": 8.194,
      "eval_steps_per_second": 4.097,
      "step": 9500
    },
    {
      "epoch": 5.044003677919348,
      "grad_norm": 1.1727863550186157,
      "learning_rate": 2.2222222222222225e-06,
      "loss": 0.4564,
      "step": 9600
    },
    {
      "epoch": 5.096545382897675,
      "grad_norm": 1.2154302597045898,
      "learning_rate": 1.6666666666666667e-06,
      "loss": 0.4369,
      "step": 9700
    },
    {
      "epoch": 5.149087087876001,
      "grad_norm": 0.9445755481719971,
      "learning_rate": 1.1111111111111112e-06,
      "loss": 0.4377,
      "step": 9800
    },
    {
      "epoch": 5.201628792854328,
      "grad_norm": 1.6093214750289917,
      "learning_rate": 5.555555555555556e-07,
      "loss": 0.4503,
      "step": 9900
    },
    {
      "epoch": 5.254170497832654,
      "grad_norm": 1.092114806175232,
      "learning_rate": 0.0,
      "loss": 0.4549,
      "step": 10000
    },
    {
      "epoch": 5.254170497832654,
      "eval_loss": 0.4912105202674866,
      "eval_runtime": 1.2188,
      "eval_samples_per_second": 8.205,
      "eval_steps_per_second": 4.102,
      "step": 10000
    }
  ],
  "logging_steps": 100,
  "max_steps": 10000,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 6,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 1.7932422620461716e+18,
  "train_batch_size": 2,
  "trial_name": null,
  "trial_params": null
}
|
|