|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 5.0, |
|
"eval_steps": 500, |
|
"global_step": 1320, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.0, |
|
"grad_norm": 0.2333984375, |
|
"learning_rate": 1.5151515151515152e-06, |
|
"loss": 2.504, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 0.26171875, |
|
"learning_rate": 7.5757575757575764e-06, |
|
"loss": 2.4856, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 0.197265625, |
|
"learning_rate": 1.5151515151515153e-05, |
|
"loss": 2.4231, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 0.1708984375, |
|
"learning_rate": 2.272727272727273e-05, |
|
"loss": 2.4239, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 0.20703125, |
|
"learning_rate": 3.0303030303030306e-05, |
|
"loss": 2.4352, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 0.23046875, |
|
"learning_rate": 3.787878787878788e-05, |
|
"loss": 2.4108, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"grad_norm": 0.2314453125, |
|
"learning_rate": 4.545454545454546e-05, |
|
"loss": 2.3658, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"grad_norm": 0.1875, |
|
"learning_rate": 5.303030303030303e-05, |
|
"loss": 2.3141, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"grad_norm": 0.181640625, |
|
"learning_rate": 6.060606060606061e-05, |
|
"loss": 2.3152, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"grad_norm": 0.1572265625, |
|
"learning_rate": 6.818181818181818e-05, |
|
"loss": 2.294, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"grad_norm": 0.1611328125, |
|
"learning_rate": 7.575757575757576e-05, |
|
"loss": 2.2869, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"grad_norm": 0.158203125, |
|
"learning_rate": 8.333333333333334e-05, |
|
"loss": 2.2961, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 0.23, |
|
"grad_norm": 0.150390625, |
|
"learning_rate": 9.090909090909092e-05, |
|
"loss": 2.2337, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"grad_norm": 0.171875, |
|
"learning_rate": 9.848484848484849e-05, |
|
"loss": 2.2716, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 0.27, |
|
"grad_norm": 0.166015625, |
|
"learning_rate": 0.00010606060606060606, |
|
"loss": 2.2456, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.28, |
|
"grad_norm": 0.162109375, |
|
"learning_rate": 0.00011363636363636365, |
|
"loss": 2.2372, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.3, |
|
"grad_norm": 0.158203125, |
|
"learning_rate": 0.00012121212121212122, |
|
"loss": 2.231, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.32, |
|
"grad_norm": 0.2412109375, |
|
"learning_rate": 0.00012878787878787878, |
|
"loss": 2.2575, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 0.34, |
|
"grad_norm": 0.1640625, |
|
"learning_rate": 0.00013636363636363637, |
|
"loss": 2.2734, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.36, |
|
"grad_norm": 0.17578125, |
|
"learning_rate": 0.00014393939393939396, |
|
"loss": 2.2141, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 0.38, |
|
"grad_norm": 0.1767578125, |
|
"learning_rate": 0.00015151515151515152, |
|
"loss": 2.2526, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"grad_norm": 0.1748046875, |
|
"learning_rate": 0.0001590909090909091, |
|
"loss": 2.2879, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 0.42, |
|
"grad_norm": 0.1884765625, |
|
"learning_rate": 0.0001666666666666667, |
|
"loss": 2.1986, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.44, |
|
"grad_norm": 0.2021484375, |
|
"learning_rate": 0.00017424242424242425, |
|
"loss": 2.2265, |
|
"step": 115 |
|
}, |
|
{ |
|
"epoch": 0.45, |
|
"grad_norm": 0.2001953125, |
|
"learning_rate": 0.00018181818181818183, |
|
"loss": 2.196, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.47, |
|
"grad_norm": 0.234375, |
|
"learning_rate": 0.00018939393939393942, |
|
"loss": 2.1943, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 0.49, |
|
"grad_norm": 0.2314453125, |
|
"learning_rate": 0.00019696969696969698, |
|
"loss": 2.2055, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.51, |
|
"grad_norm": 0.2392578125, |
|
"learning_rate": 0.0001999968531423333, |
|
"loss": 2.173, |
|
"step": 135 |
|
}, |
|
{ |
|
"epoch": 0.53, |
|
"grad_norm": 0.2431640625, |
|
"learning_rate": 0.0001999776230627102, |
|
"loss": 2.1762, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.55, |
|
"grad_norm": 0.255859375, |
|
"learning_rate": 0.0001999409145155235, |
|
"loss": 2.1762, |
|
"step": 145 |
|
}, |
|
{ |
|
"epoch": 0.57, |
|
"grad_norm": 0.265625, |
|
"learning_rate": 0.0001998867339183008, |
|
"loss": 2.157, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.59, |
|
"grad_norm": 0.32421875, |
|
"learning_rate": 0.0001998150907430998, |
|
"loss": 2.1838, |
|
"step": 155 |
|
}, |
|
{ |
|
"epoch": 0.61, |
|
"grad_norm": 0.2734375, |
|
"learning_rate": 0.00019972599751485226, |
|
"loss": 2.1985, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.62, |
|
"grad_norm": 0.31640625, |
|
"learning_rate": 0.00019961946980917456, |
|
"loss": 2.1016, |
|
"step": 165 |
|
}, |
|
{ |
|
"epoch": 0.64, |
|
"grad_norm": 0.27734375, |
|
"learning_rate": 0.0001994955262496446, |
|
"loss": 2.1371, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.66, |
|
"grad_norm": 0.3125, |
|
"learning_rate": 0.00019935418850454588, |
|
"loss": 2.1527, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 0.68, |
|
"grad_norm": 0.37109375, |
|
"learning_rate": 0.00019919548128307954, |
|
"loss": 2.118, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.7, |
|
"grad_norm": 0.2890625, |
|
"learning_rate": 0.00019901943233104443, |
|
"loss": 2.098, |
|
"step": 185 |
|
}, |
|
{ |
|
"epoch": 0.72, |
|
"grad_norm": 0.3515625, |
|
"learning_rate": 0.00019882607242598663, |
|
"loss": 2.0853, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.74, |
|
"grad_norm": 0.30078125, |
|
"learning_rate": 0.00019861543537181867, |
|
"loss": 2.0832, |
|
"step": 195 |
|
}, |
|
{ |
|
"epoch": 0.76, |
|
"grad_norm": 0.31640625, |
|
"learning_rate": 0.00019838755799290994, |
|
"loss": 2.1097, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.78, |
|
"grad_norm": 0.306640625, |
|
"learning_rate": 0.00019814248012764877, |
|
"loss": 2.0618, |
|
"step": 205 |
|
}, |
|
{ |
|
"epoch": 0.8, |
|
"grad_norm": 0.294921875, |
|
"learning_rate": 0.00019788024462147788, |
|
"loss": 2.0607, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.81, |
|
"grad_norm": 0.3125, |
|
"learning_rate": 0.00019760089731940384, |
|
"loss": 2.0829, |
|
"step": 215 |
|
}, |
|
{ |
|
"epoch": 0.83, |
|
"grad_norm": 0.361328125, |
|
"learning_rate": 0.00019730448705798239, |
|
"loss": 1.9736, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.85, |
|
"grad_norm": 0.37109375, |
|
"learning_rate": 0.0001969910656567805, |
|
"loss": 2.033, |
|
"step": 225 |
|
}, |
|
{ |
|
"epoch": 0.87, |
|
"grad_norm": 0.3671875, |
|
"learning_rate": 0.00019666068790931732, |
|
"loss": 1.9864, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.89, |
|
"grad_norm": 0.37890625, |
|
"learning_rate": 0.00019631341157348465, |
|
"loss": 2.008, |
|
"step": 235 |
|
}, |
|
{ |
|
"epoch": 0.91, |
|
"grad_norm": 0.361328125, |
|
"learning_rate": 0.00019594929736144976, |
|
"loss": 2.089, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.93, |
|
"grad_norm": 0.34765625, |
|
"learning_rate": 0.00019556840892904126, |
|
"loss": 1.9851, |
|
"step": 245 |
|
}, |
|
{ |
|
"epoch": 0.95, |
|
"grad_norm": 0.36328125, |
|
"learning_rate": 0.0001951708128646208, |
|
"loss": 1.9994, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.97, |
|
"grad_norm": 0.3515625, |
|
"learning_rate": 0.0001947565786774415, |
|
"loss": 1.9324, |
|
"step": 255 |
|
}, |
|
{ |
|
"epoch": 0.98, |
|
"grad_norm": 0.34765625, |
|
"learning_rate": 0.00019432577878549637, |
|
"loss": 2.063, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"eval_loss": 2.1617629528045654, |
|
"eval_runtime": 67.9579, |
|
"eval_samples_per_second": 3.899, |
|
"eval_steps_per_second": 0.5, |
|
"step": 264 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"grad_norm": 0.4453125, |
|
"learning_rate": 0.00019387848850285772, |
|
"loss": 1.8981, |
|
"step": 265 |
|
}, |
|
{ |
|
"epoch": 1.02, |
|
"grad_norm": 0.439453125, |
|
"learning_rate": 0.00019341478602651069, |
|
"loss": 1.7606, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 1.04, |
|
"grad_norm": 0.47265625, |
|
"learning_rate": 0.00019293475242268223, |
|
"loss": 1.814, |
|
"step": 275 |
|
}, |
|
{ |
|
"epoch": 1.06, |
|
"grad_norm": 0.4140625, |
|
"learning_rate": 0.0001924384716126692, |
|
"loss": 1.7713, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 1.08, |
|
"grad_norm": 0.4765625, |
|
"learning_rate": 0.00019192603035816656, |
|
"loss": 1.8325, |
|
"step": 285 |
|
}, |
|
{ |
|
"epoch": 1.1, |
|
"grad_norm": 0.443359375, |
|
"learning_rate": 0.0001913975182460996, |
|
"loss": 1.7617, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 1.12, |
|
"grad_norm": 0.4140625, |
|
"learning_rate": 0.00019085302767296182, |
|
"loss": 1.8414, |
|
"step": 295 |
|
}, |
|
{ |
|
"epoch": 1.14, |
|
"grad_norm": 0.46484375, |
|
"learning_rate": 0.00019029265382866214, |
|
"loss": 1.7312, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 1.16, |
|
"grad_norm": 0.435546875, |
|
"learning_rate": 0.000189716494679883, |
|
"loss": 1.8303, |
|
"step": 305 |
|
}, |
|
{ |
|
"epoch": 1.17, |
|
"grad_norm": 0.4765625, |
|
"learning_rate": 0.00018912465095295388, |
|
"loss": 1.7397, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 1.19, |
|
"grad_norm": 0.54296875, |
|
"learning_rate": 0.00018851722611624164, |
|
"loss": 1.7772, |
|
"step": 315 |
|
}, |
|
{ |
|
"epoch": 1.21, |
|
"grad_norm": 0.45703125, |
|
"learning_rate": 0.00018789432636206197, |
|
"loss": 1.7317, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 1.23, |
|
"grad_norm": 0.5078125, |
|
"learning_rate": 0.00018725606058811424, |
|
"loss": 1.7184, |
|
"step": 325 |
|
}, |
|
{ |
|
"epoch": 1.25, |
|
"grad_norm": 0.451171875, |
|
"learning_rate": 0.00018660254037844388, |
|
"loss": 1.798, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 1.27, |
|
"grad_norm": 0.61328125, |
|
"learning_rate": 0.00018593387998393457, |
|
"loss": 1.6324, |
|
"step": 335 |
|
}, |
|
{ |
|
"epoch": 1.29, |
|
"grad_norm": 0.48046875, |
|
"learning_rate": 0.00018525019630233463, |
|
"loss": 1.6136, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 1.31, |
|
"grad_norm": 0.51171875, |
|
"learning_rate": 0.00018455160885782045, |
|
"loss": 1.7733, |
|
"step": 345 |
|
}, |
|
{ |
|
"epoch": 1.33, |
|
"grad_norm": 0.6328125, |
|
"learning_rate": 0.00018383823978010075, |
|
"loss": 1.6591, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 1.34, |
|
"grad_norm": 0.62109375, |
|
"learning_rate": 0.00018311021378306563, |
|
"loss": 1.6217, |
|
"step": 355 |
|
}, |
|
{ |
|
"epoch": 1.36, |
|
"grad_norm": 0.6015625, |
|
"learning_rate": 0.0001823676581429833, |
|
"loss": 1.5634, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 1.38, |
|
"grad_norm": 0.5078125, |
|
"learning_rate": 0.00018161070267624937, |
|
"loss": 1.7062, |
|
"step": 365 |
|
}, |
|
{ |
|
"epoch": 1.4, |
|
"grad_norm": 0.55078125, |
|
"learning_rate": 0.0001808394797166919, |
|
"loss": 1.6042, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 1.42, |
|
"grad_norm": 0.51171875, |
|
"learning_rate": 0.00018005412409243606, |
|
"loss": 1.6031, |
|
"step": 375 |
|
}, |
|
{ |
|
"epoch": 1.44, |
|
"grad_norm": 0.56640625, |
|
"learning_rate": 0.00017925477310233316, |
|
"loss": 1.4434, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 1.46, |
|
"grad_norm": 0.63671875, |
|
"learning_rate": 0.00017844156649195759, |
|
"loss": 1.4816, |
|
"step": 385 |
|
}, |
|
{ |
|
"epoch": 1.48, |
|
"grad_norm": 0.5703125, |
|
"learning_rate": 0.0001776146464291757, |
|
"loss": 1.5563, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 1.5, |
|
"grad_norm": 0.640625, |
|
"learning_rate": 0.00017677415747929174, |
|
"loss": 1.5238, |
|
"step": 395 |
|
}, |
|
{ |
|
"epoch": 1.52, |
|
"grad_norm": 0.6328125, |
|
"learning_rate": 0.00017592024657977432, |
|
"loss": 1.5693, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 1.53, |
|
"grad_norm": 0.5625, |
|
"learning_rate": 0.00017505306301456822, |
|
"loss": 1.5205, |
|
"step": 405 |
|
}, |
|
{ |
|
"epoch": 1.55, |
|
"grad_norm": 0.58984375, |
|
"learning_rate": 0.00017417275838799596, |
|
"loss": 1.537, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 1.57, |
|
"grad_norm": 0.6484375, |
|
"learning_rate": 0.0001732794865982539, |
|
"loss": 1.4182, |
|
"step": 415 |
|
}, |
|
{ |
|
"epoch": 1.59, |
|
"grad_norm": 0.609375, |
|
"learning_rate": 0.00017237340381050703, |
|
"loss": 1.4714, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 1.61, |
|
"grad_norm": 0.578125, |
|
"learning_rate": 0.00017145466842958764, |
|
"loss": 1.4167, |
|
"step": 425 |
|
}, |
|
{ |
|
"epoch": 1.63, |
|
"grad_norm": 0.6796875, |
|
"learning_rate": 0.00017052344107230241, |
|
"loss": 1.4556, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 1.65, |
|
"grad_norm": 0.64453125, |
|
"learning_rate": 0.00016957988453935276, |
|
"loss": 1.3048, |
|
"step": 435 |
|
}, |
|
{ |
|
"epoch": 1.67, |
|
"grad_norm": 0.5859375, |
|
"learning_rate": 0.0001686241637868734, |
|
"loss": 1.4648, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 1.69, |
|
"grad_norm": 0.609375, |
|
"learning_rate": 0.00016765644589759396, |
|
"loss": 1.3969, |
|
"step": 445 |
|
}, |
|
{ |
|
"epoch": 1.7, |
|
"grad_norm": 0.65625, |
|
"learning_rate": 0.00016667690005162916, |
|
"loss": 1.5284, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 1.72, |
|
"grad_norm": 0.7265625, |
|
"learning_rate": 0.00016568569749690208, |
|
"loss": 1.3291, |
|
"step": 455 |
|
}, |
|
{ |
|
"epoch": 1.74, |
|
"grad_norm": 0.6484375, |
|
"learning_rate": 0.00016468301151920575, |
|
"loss": 1.3605, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 1.76, |
|
"grad_norm": 0.61328125, |
|
"learning_rate": 0.00016366901741190882, |
|
"loss": 1.4962, |
|
"step": 465 |
|
}, |
|
{ |
|
"epoch": 1.78, |
|
"grad_norm": 0.703125, |
|
"learning_rate": 0.00016264389244531014, |
|
"loss": 1.3618, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 1.8, |
|
"grad_norm": 0.64453125, |
|
"learning_rate": 0.0001616078158356475, |
|
"loss": 1.391, |
|
"step": 475 |
|
}, |
|
{ |
|
"epoch": 1.82, |
|
"grad_norm": 0.75, |
|
"learning_rate": 0.00016056096871376667, |
|
"loss": 1.2776, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 1.84, |
|
"grad_norm": 0.64453125, |
|
"learning_rate": 0.00015950353409345517, |
|
"loss": 1.4267, |
|
"step": 485 |
|
}, |
|
{ |
|
"epoch": 1.86, |
|
"grad_norm": 0.609375, |
|
"learning_rate": 0.0001584356968394471, |
|
"loss": 1.2524, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 1.88, |
|
"grad_norm": 0.63671875, |
|
"learning_rate": 0.0001573576436351046, |
|
"loss": 1.3176, |
|
"step": 495 |
|
}, |
|
{ |
|
"epoch": 1.89, |
|
"grad_norm": 0.7578125, |
|
"learning_rate": 0.00015626956294978103, |
|
"loss": 1.2975, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 1.91, |
|
"grad_norm": 0.70703125, |
|
"learning_rate": 0.0001551716450058719, |
|
"loss": 1.2803, |
|
"step": 505 |
|
}, |
|
{ |
|
"epoch": 1.93, |
|
"grad_norm": 0.59375, |
|
"learning_rate": 0.00015406408174555976, |
|
"loss": 1.3139, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 1.95, |
|
"grad_norm": 0.74609375, |
|
"learning_rate": 0.0001529470667972579, |
|
"loss": 1.2867, |
|
"step": 515 |
|
}, |
|
{ |
|
"epoch": 1.97, |
|
"grad_norm": 0.70703125, |
|
"learning_rate": 0.00015182079544175955, |
|
"loss": 1.3683, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 1.99, |
|
"grad_norm": 0.82421875, |
|
"learning_rate": 0.0001506854645780983, |
|
"loss": 1.2293, |
|
"step": 525 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"eval_loss": 1.9121202230453491, |
|
"eval_runtime": 67.9476, |
|
"eval_samples_per_second": 3.9, |
|
"eval_steps_per_second": 0.5, |
|
"step": 528 |
|
}, |
|
{ |
|
"epoch": 2.01, |
|
"grad_norm": 0.796875, |
|
"learning_rate": 0.00014954127268912526, |
|
"loss": 1.1291, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 2.03, |
|
"grad_norm": 0.765625, |
|
"learning_rate": 0.0001483884198068096, |
|
"loss": 1.045, |
|
"step": 535 |
|
}, |
|
{ |
|
"epoch": 2.05, |
|
"grad_norm": 0.8125, |
|
"learning_rate": 0.0001472271074772683, |
|
"loss": 0.9242, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 2.06, |
|
"grad_norm": 0.89453125, |
|
"learning_rate": 0.00014605753872553093, |
|
"loss": 1.0677, |
|
"step": 545 |
|
}, |
|
{ |
|
"epoch": 2.08, |
|
"grad_norm": 0.81640625, |
|
"learning_rate": 0.00014487991802004623, |
|
"loss": 0.9137, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 2.1, |
|
"grad_norm": 0.78515625, |
|
"learning_rate": 0.00014369445123693596, |
|
"loss": 0.9222, |
|
"step": 555 |
|
}, |
|
{ |
|
"epoch": 2.12, |
|
"grad_norm": 0.7734375, |
|
"learning_rate": 0.000142501345624003, |
|
"loss": 0.89, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 2.14, |
|
"grad_norm": 0.72265625, |
|
"learning_rate": 0.00014130080976449948, |
|
"loss": 1.0561, |
|
"step": 565 |
|
}, |
|
{ |
|
"epoch": 2.16, |
|
"grad_norm": 0.76953125, |
|
"learning_rate": 0.00014009305354066137, |
|
"loss": 1.0657, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 2.18, |
|
"grad_norm": 0.671875, |
|
"learning_rate": 0.0001388782880970162, |
|
"loss": 0.8967, |
|
"step": 575 |
|
}, |
|
{ |
|
"epoch": 2.2, |
|
"grad_norm": 0.8359375, |
|
"learning_rate": 0.00013765672580346987, |
|
"loss": 0.9202, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 2.22, |
|
"grad_norm": 0.92578125, |
|
"learning_rate": 0.00013642858021817943, |
|
"loss": 0.8938, |
|
"step": 585 |
|
}, |
|
{ |
|
"epoch": 2.23, |
|
"grad_norm": 0.875, |
|
"learning_rate": 0.00013519406605021797, |
|
"loss": 0.9264, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 2.25, |
|
"grad_norm": 0.765625, |
|
"learning_rate": 0.00013395339912203829, |
|
"loss": 0.9314, |
|
"step": 595 |
|
}, |
|
{ |
|
"epoch": 2.27, |
|
"grad_norm": 0.84375, |
|
"learning_rate": 0.00013270679633174218, |
|
"loss": 0.7482, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 2.29, |
|
"grad_norm": 0.83203125, |
|
"learning_rate": 0.00013145447561516138, |
|
"loss": 0.8059, |
|
"step": 605 |
|
}, |
|
{ |
|
"epoch": 2.31, |
|
"grad_norm": 0.81640625, |
|
"learning_rate": 0.00013019665590775716, |
|
"loss": 0.8082, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 2.33, |
|
"grad_norm": 0.98046875, |
|
"learning_rate": 0.0001289335571063453, |
|
"loss": 0.7883, |
|
"step": 615 |
|
}, |
|
{ |
|
"epoch": 2.35, |
|
"grad_norm": 0.73046875, |
|
"learning_rate": 0.0001276654000306527, |
|
"loss": 0.7813, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 2.37, |
|
"grad_norm": 0.76953125, |
|
"learning_rate": 0.00012639240638471317, |
|
"loss": 0.9305, |
|
"step": 625 |
|
}, |
|
{ |
|
"epoch": 2.39, |
|
"grad_norm": 0.59765625, |
|
"learning_rate": 0.0001251147987181079, |
|
"loss": 0.7705, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 2.41, |
|
"grad_norm": 0.9375, |
|
"learning_rate": 0.00012383280038705884, |
|
"loss": 0.8381, |
|
"step": 635 |
|
}, |
|
{ |
|
"epoch": 2.42, |
|
"grad_norm": 0.70703125, |
|
"learning_rate": 0.00012254663551538046, |
|
"loss": 0.9174, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 2.44, |
|
"grad_norm": 0.9453125, |
|
"learning_rate": 0.00012125652895529766, |
|
"loss": 0.9508, |
|
"step": 645 |
|
}, |
|
{ |
|
"epoch": 2.46, |
|
"grad_norm": 0.78515625, |
|
"learning_rate": 0.00011996270624813642, |
|
"loss": 0.9384, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 2.48, |
|
"grad_norm": 0.8828125, |
|
"learning_rate": 0.00011866539358489345, |
|
"loss": 0.7756, |
|
"step": 655 |
|
}, |
|
{ |
|
"epoch": 2.5, |
|
"grad_norm": 0.88671875, |
|
"learning_rate": 0.00011736481776669306, |
|
"loss": 0.8655, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 2.52, |
|
"grad_norm": 0.73046875, |
|
"learning_rate": 0.00011606120616513648, |
|
"loss": 0.8007, |
|
"step": 665 |
|
}, |
|
{ |
|
"epoch": 2.54, |
|
"grad_norm": 0.7734375, |
|
"learning_rate": 0.00011475478668255222, |
|
"loss": 0.9225, |
|
"step": 670 |
|
}, |
|
{ |
|
"epoch": 2.56, |
|
"grad_norm": 0.86328125, |
|
"learning_rate": 0.00011344578771215319, |
|
"loss": 0.7905, |
|
"step": 675 |
|
}, |
|
{ |
|
"epoch": 2.58, |
|
"grad_norm": 0.82421875, |
|
"learning_rate": 0.0001121344380981082, |
|
"loss": 0.8412, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 2.59, |
|
"grad_norm": 0.69921875, |
|
"learning_rate": 0.00011082096709553442, |
|
"loss": 0.8604, |
|
"step": 685 |
|
}, |
|
{ |
|
"epoch": 2.61, |
|
"grad_norm": 0.828125, |
|
"learning_rate": 0.00010950560433041826, |
|
"loss": 0.8263, |
|
"step": 690 |
|
}, |
|
{ |
|
"epoch": 2.63, |
|
"grad_norm": 0.78515625, |
|
"learning_rate": 0.00010818857975947128, |
|
"loss": 0.8888, |
|
"step": 695 |
|
}, |
|
{ |
|
"epoch": 2.65, |
|
"grad_norm": 0.7109375, |
|
"learning_rate": 0.0001068701236299281, |
|
"loss": 0.868, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 2.67, |
|
"grad_norm": 0.765625, |
|
"learning_rate": 0.00010555046643929403, |
|
"loss": 0.8699, |
|
"step": 705 |
|
}, |
|
{ |
|
"epoch": 2.69, |
|
"grad_norm": 0.73828125, |
|
"learning_rate": 0.00010422983889504831, |
|
"loss": 0.7818, |
|
"step": 710 |
|
}, |
|
{ |
|
"epoch": 2.71, |
|
"grad_norm": 0.81640625, |
|
"learning_rate": 0.00010290847187431113, |
|
"loss": 0.8963, |
|
"step": 715 |
|
}, |
|
{ |
|
"epoch": 2.73, |
|
"grad_norm": 0.58203125, |
|
"learning_rate": 0.00010158659638348081, |
|
"loss": 0.659, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 2.75, |
|
"grad_norm": 0.74609375, |
|
"learning_rate": 0.00010026444351784822, |
|
"loss": 0.7786, |
|
"step": 725 |
|
}, |
|
{ |
|
"epoch": 2.77, |
|
"grad_norm": 0.87109375, |
|
"learning_rate": 9.894224442119607e-05, |
|
"loss": 0.7708, |
|
"step": 730 |
|
}, |
|
{ |
|
"epoch": 2.78, |
|
"grad_norm": 0.71875, |
|
"learning_rate": 9.762023024538926e-05, |
|
"loss": 0.7358, |
|
"step": 735 |
|
}, |
|
{ |
|
"epoch": 2.8, |
|
"grad_norm": 0.77734375, |
|
"learning_rate": 9.629863210996419e-05, |
|
"loss": 0.859, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 2.82, |
|
"grad_norm": 0.79296875, |
|
"learning_rate": 9.49776810617235e-05, |
|
"loss": 0.6883, |
|
"step": 745 |
|
}, |
|
{ |
|
"epoch": 2.84, |
|
"grad_norm": 0.82421875, |
|
"learning_rate": 9.365760803434355e-05, |
|
"loss": 0.6727, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 2.86, |
|
"grad_norm": 0.7734375, |
|
"learning_rate": 9.233864380800178e-05, |
|
"loss": 0.7685, |
|
"step": 755 |
|
}, |
|
{ |
|
"epoch": 2.88, |
|
"grad_norm": 0.87890625, |
|
"learning_rate": 9.102101896903084e-05, |
|
"loss": 0.8278, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 2.9, |
|
"grad_norm": 0.83203125, |
|
"learning_rate": 8.970496386960656e-05, |
|
"loss": 0.7174, |
|
"step": 765 |
|
}, |
|
{ |
|
"epoch": 2.92, |
|
"grad_norm": 0.82421875, |
|
"learning_rate": 8.839070858747697e-05, |
|
"loss": 0.7189, |
|
"step": 770 |
|
}, |
|
{ |
|
"epoch": 2.94, |
|
"grad_norm": 0.80078125, |
|
"learning_rate": 8.707848288573926e-05, |
|
"loss": 0.6564, |
|
"step": 775 |
|
}, |
|
{ |
|
"epoch": 2.95, |
|
"grad_norm": 0.79296875, |
|
"learning_rate": 8.57685161726715e-05, |
|
"loss": 0.6793, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 2.97, |
|
"grad_norm": 0.70703125, |
|
"learning_rate": 8.446103746162698e-05, |
|
"loss": 0.8086, |
|
"step": 785 |
|
}, |
|
{ |
|
"epoch": 2.99, |
|
"grad_norm": 0.7734375, |
|
"learning_rate": 8.315627533099696e-05, |
|
"loss": 0.6985, |
|
"step": 790 |
|
}, |
|
{ |
|
"epoch": 3.0, |
|
"eval_loss": 1.6915704011917114, |
|
"eval_runtime": 67.8527, |
|
"eval_samples_per_second": 3.906, |
|
"eval_steps_per_second": 0.501, |
|
"step": 792 |
|
}, |
|
{ |
|
"epoch": 3.01, |
|
"grad_norm": 0.73046875, |
|
"learning_rate": 8.185445788424974e-05, |
|
"loss": 0.5262, |
|
"step": 795 |
|
}, |
|
{ |
|
"epoch": 3.03, |
|
"grad_norm": 0.703125, |
|
"learning_rate": 8.055581271005292e-05, |
|
"loss": 0.57, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 3.05, |
|
"grad_norm": 0.84375, |
|
"learning_rate": 7.92605668424853e-05, |
|
"loss": 0.4825, |
|
"step": 805 |
|
}, |
|
{ |
|
"epoch": 3.07, |
|
"grad_norm": 0.73828125, |
|
"learning_rate": 7.796894672134594e-05, |
|
"loss": 0.5857, |
|
"step": 810 |
|
}, |
|
{ |
|
"epoch": 3.09, |
|
"grad_norm": 0.76171875, |
|
"learning_rate": 7.668117815256729e-05, |
|
"loss": 0.4708, |
|
"step": 815 |
|
}, |
|
{ |
|
"epoch": 3.11, |
|
"grad_norm": 0.78515625, |
|
"learning_rate": 7.539748626873866e-05, |
|
"loss": 0.5465, |
|
"step": 820 |
|
}, |
|
{ |
|
"epoch": 3.12, |
|
"grad_norm": 0.80859375, |
|
"learning_rate": 7.411809548974792e-05, |
|
"loss": 0.5395, |
|
"step": 825 |
|
}, |
|
{ |
|
"epoch": 3.14, |
|
"grad_norm": 0.6640625, |
|
"learning_rate": 7.28432294835474e-05, |
|
"loss": 0.5176, |
|
"step": 830 |
|
}, |
|
{ |
|
"epoch": 3.16, |
|
"grad_norm": 0.640625, |
|
"learning_rate": 7.157311112705149e-05, |
|
"loss": 0.5352, |
|
"step": 835 |
|
}, |
|
{ |
|
"epoch": 3.18, |
|
"grad_norm": 0.796875, |
|
"learning_rate": 7.030796246717255e-05, |
|
"loss": 0.5097, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 3.2, |
|
"grad_norm": 0.78515625, |
|
"learning_rate": 6.904800468200143e-05, |
|
"loss": 0.4998, |
|
"step": 845 |
|
}, |
|
{ |
|
"epoch": 3.22, |
|
"grad_norm": 0.73828125, |
|
"learning_rate": 6.779345804214088e-05, |
|
"loss": 0.491, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 3.24, |
|
"grad_norm": 0.66796875, |
|
"learning_rate": 6.654454187219649e-05, |
|
"loss": 0.5098, |
|
"step": 855 |
|
}, |
|
{ |
|
"epoch": 3.26, |
|
"grad_norm": 0.609375, |
|
"learning_rate": 6.530147451243377e-05, |
|
"loss": 0.5362, |
|
"step": 860 |
|
}, |
|
{ |
|
"epoch": 3.28, |
|
"grad_norm": 0.875, |
|
"learning_rate": 6.406447328060709e-05, |
|
"loss": 0.5549, |
|
"step": 865 |
|
}, |
|
{ |
|
"epoch": 3.3, |
|
"grad_norm": 0.67578125, |
|
"learning_rate": 6.283375443396726e-05, |
|
"loss": 0.4999, |
|
"step": 870 |
|
}, |
|
{ |
|
"epoch": 3.31, |
|
"grad_norm": 0.64453125, |
|
"learning_rate": 6.160953313145463e-05, |
|
"loss": 0.471, |
|
"step": 875 |
|
}, |
|
{ |
|
"epoch": 3.33, |
|
"grad_norm": 0.6640625, |
|
"learning_rate": 6.039202339608432e-05, |
|
"loss": 0.4818, |
|
"step": 880 |
|
}, |
|
{ |
|
"epoch": 3.35, |
|
"grad_norm": 0.75, |
|
"learning_rate": 5.918143807752972e-05, |
|
"loss": 0.5522, |
|
"step": 885 |
|
}, |
|
{ |
|
"epoch": 3.37, |
|
"grad_norm": 0.7890625, |
|
"learning_rate": 5.797798881491138e-05, |
|
"loss": 0.4821, |
|
"step": 890 |
|
}, |
|
{ |
|
"epoch": 3.39, |
|
"grad_norm": 0.7578125, |
|
"learning_rate": 5.678188599979753e-05, |
|
"loss": 0.5222, |
|
"step": 895 |
|
}, |
|
{ |
|
"epoch": 3.41, |
|
"grad_norm": 0.7734375, |
|
"learning_rate": 5.559333873942259e-05, |
|
"loss": 0.4699, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 3.43, |
|
"grad_norm": 0.69921875, |
|
"learning_rate": 5.44125548201301e-05, |
|
"loss": 0.4958, |
|
"step": 905 |
|
}, |
|
{ |
|
"epoch": 3.45, |
|
"grad_norm": 0.66015625, |
|
"learning_rate": 5.3239740671046864e-05, |
|
"loss": 0.6005, |
|
"step": 910 |
|
}, |
|
{ |
|
"epoch": 3.47, |
|
"grad_norm": 0.6953125, |
|
"learning_rate": 5.207510132799436e-05, |
|
"loss": 0.5334, |
|
"step": 915 |
|
}, |
|
{ |
|
"epoch": 3.48, |
|
"grad_norm": 0.80859375, |
|
"learning_rate": 5.091884039764321e-05, |
|
"loss": 0.481, |
|
"step": 920 |
|
}, |
|
{ |
|
"epoch": 3.5, |
|
"grad_norm": 0.79296875, |
|
"learning_rate": 4.9771160021918305e-05, |
|
"loss": 0.5972, |
|
"step": 925 |
|
}, |
|
{ |
|
"epoch": 3.52, |
|
"grad_norm": 0.66796875, |
|
"learning_rate": 4.8632260842659393e-05, |
|
"loss": 0.5062, |
|
"step": 930 |
|
}, |
|
{ |
|
"epoch": 3.54, |
|
"grad_norm": 0.765625, |
|
"learning_rate": 4.7502341966544e-05, |
|
"loss": 0.5411, |
|
"step": 935 |
|
}, |
|
{ |
|
"epoch": 3.56, |
|
"grad_norm": 0.765625, |
|
"learning_rate": 4.638160093027908e-05, |
|
"loss": 0.4579, |
|
"step": 940 |
|
}, |
|
{ |
|
"epoch": 3.58, |
|
"grad_norm": 0.703125, |
|
"learning_rate": 4.527023366606679e-05, |
|
"loss": 0.5362, |
|
"step": 945 |
|
}, |
|
{ |
|
"epoch": 3.6, |
|
"grad_norm": 0.859375, |
|
"learning_rate": 4.416843446735077e-05, |
|
"loss": 0.5176, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 3.62, |
|
"grad_norm": 0.83984375, |
|
"learning_rate": 4.3076395954849236e-05, |
|
"loss": 0.5847, |
|
"step": 955 |
|
}, |
|
{ |
|
"epoch": 3.64, |
|
"grad_norm": 0.6015625, |
|
"learning_rate": 4.19943090428802e-05, |
|
"loss": 0.4638, |
|
"step": 960 |
|
}, |
|
{ |
|
"epoch": 3.66, |
|
"grad_norm": 0.859375, |
|
"learning_rate": 4.092236290598499e-05, |
|
"loss": 0.5305, |
|
"step": 965 |
|
}, |
|
{ |
|
"epoch": 3.67, |
|
"grad_norm": 0.69140625, |
|
"learning_rate": 3.986074494585619e-05, |
|
"loss": 0.4742, |
|
"step": 970 |
|
}, |
|
{ |
|
"epoch": 3.69, |
|
"grad_norm": 0.68359375, |
|
"learning_rate": 3.880964075857535e-05, |
|
"loss": 0.5312, |
|
"step": 975 |
|
}, |
|
{ |
|
"epoch": 3.71, |
|
"grad_norm": 0.64453125, |
|
"learning_rate": 3.776923410216636e-05, |
|
"loss": 0.4508, |
|
"step": 980 |
|
}, |
|
{ |
|
"epoch": 3.73, |
|
"grad_norm": 0.69140625, |
|
"learning_rate": 3.673970686447005e-05, |
|
"loss": 0.5022, |
|
"step": 985 |
|
}, |
|
{ |
|
"epoch": 3.75, |
|
"grad_norm": 0.765625, |
|
"learning_rate": 3.5721239031346066e-05, |
|
"loss": 0.4655, |
|
"step": 990 |
|
}, |
|
{ |
|
"epoch": 3.77, |
|
"grad_norm": 0.7734375, |
|
"learning_rate": 3.4714008655207e-05, |
|
"loss": 0.3936, |
|
"step": 995 |
|
}, |
|
{ |
|
"epoch": 3.79, |
|
"grad_norm": 0.82421875, |
|
"learning_rate": 3.37181918238904e-05, |
|
"loss": 0.508, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 3.81, |
|
"grad_norm": 0.703125, |
|
"learning_rate": 3.273396262987475e-05, |
|
"loss": 0.4346, |
|
"step": 1005 |
|
}, |
|
{ |
|
"epoch": 3.83, |
|
"grad_norm": 0.6640625, |
|
"learning_rate": 3.1761493139843735e-05, |
|
"loss": 0.4455, |
|
"step": 1010 |
|
}, |
|
{ |
|
"epoch": 3.84, |
|
"grad_norm": 0.62109375, |
|
"learning_rate": 3.080095336460491e-05, |
|
"loss": 0.431, |
|
"step": 1015 |
|
}, |
|
{ |
|
"epoch": 3.86, |
|
"grad_norm": 0.7265625, |
|
"learning_rate": 2.9852511229367865e-05, |
|
"loss": 0.4371, |
|
"step": 1020 |
|
}, |
|
{ |
|
"epoch": 3.88, |
|
"grad_norm": 0.6875, |
|
"learning_rate": 2.891633254438685e-05, |
|
"loss": 0.4578, |
|
"step": 1025 |
|
}, |
|
{ |
|
"epoch": 3.9, |
|
"grad_norm": 0.59375, |
|
"learning_rate": 2.7992580975973136e-05, |
|
"loss": 0.5402, |
|
"step": 1030 |
|
}, |
|
{ |
|
"epoch": 3.92, |
|
"grad_norm": 0.6953125, |
|
"learning_rate": 2.70814180178823e-05, |
|
"loss": 0.4338, |
|
"step": 1035 |
|
}, |
|
{ |
|
"epoch": 3.94, |
|
"grad_norm": 0.8046875, |
|
"learning_rate": 2.618300296308135e-05, |
|
"loss": 0.4541, |
|
"step": 1040 |
|
}, |
|
{ |
|
"epoch": 3.96, |
|
"grad_norm": 0.640625, |
|
"learning_rate": 2.529749287590042e-05, |
|
"loss": 0.4433, |
|
"step": 1045 |
|
}, |
|
{ |
|
"epoch": 3.98, |
|
"grad_norm": 0.640625, |
|
"learning_rate": 2.4425042564574184e-05, |
|
"loss": 0.4724, |
|
"step": 1050 |
|
}, |
|
{ |
|
"epoch": 4.0, |
|
"grad_norm": 0.8125, |
|
"learning_rate": 2.356580455417776e-05, |
|
"loss": 0.4922, |
|
"step": 1055 |
|
}, |
|
{ |
|
"epoch": 4.0, |
|
"eval_loss": 1.6053533554077148, |
|
"eval_runtime": 67.9802, |
|
"eval_samples_per_second": 3.898, |
|
"eval_steps_per_second": 0.5, |
|
"step": 1056 |
|
}, |
|
{ |
|
"epoch": 4.02, |
|
"grad_norm": 0.73828125, |
|
"learning_rate": 2.2719929059961698e-05, |
|
"loss": 0.4008, |
|
"step": 1060 |
|
}, |
|
{ |
|
"epoch": 4.03, |
|
"grad_norm": 0.5859375, |
|
"learning_rate": 2.1887563961090663e-05, |
|
"loss": 0.3496, |
|
"step": 1065 |
|
}, |
|
{ |
|
"epoch": 4.05, |
|
"grad_norm": 0.62109375, |
|
"learning_rate": 2.106885477479078e-05, |
|
"loss": 0.486, |
|
"step": 1070 |
|
}, |
|
{ |
|
"epoch": 4.07, |
|
"grad_norm": 0.671875, |
|
"learning_rate": 2.0263944630909738e-05, |
|
"loss": 0.3697, |
|
"step": 1075 |
|
}, |
|
{ |
|
"epoch": 4.09, |
|
"grad_norm": 0.65625, |
|
"learning_rate": 1.947297424689414e-05, |
|
"loss": 0.4396, |
|
"step": 1080 |
|
}, |
|
{ |
|
"epoch": 4.11, |
|
"grad_norm": 0.71484375, |
|
"learning_rate": 1.8696081903188955e-05, |
|
"loss": 0.3916, |
|
"step": 1085 |
|
}, |
|
{ |
|
"epoch": 4.13, |
|
"grad_norm": 0.6640625, |
|
"learning_rate": 1.7933403419062688e-05, |
|
"loss": 0.4399, |
|
"step": 1090 |
|
}, |
|
{ |
|
"epoch": 4.15, |
|
"grad_norm": 0.6953125, |
|
"learning_rate": 1.7185072128862933e-05, |
|
"loss": 0.486, |
|
"step": 1095 |
|
}, |
|
{ |
|
"epoch": 4.17, |
|
"grad_norm": 0.6640625, |
|
"learning_rate": 1.6451218858706374e-05, |
|
"loss": 0.4195, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 4.19, |
|
"grad_norm": 0.609375, |
|
"learning_rate": 1.573197190360729e-05, |
|
"loss": 0.3948, |
|
"step": 1105 |
|
}, |
|
{ |
|
"epoch": 4.2, |
|
"grad_norm": 0.70703125, |
|
"learning_rate": 1.5027457005048573e-05, |
|
"loss": 0.4698, |
|
"step": 1110 |
|
}, |
|
{ |
|
"epoch": 4.22, |
|
"grad_norm": 0.6796875, |
|
"learning_rate": 1.433779732899897e-05, |
|
"loss": 0.3929, |
|
"step": 1115 |
|
}, |
|
{ |
|
"epoch": 4.24, |
|
"grad_norm": 0.71484375, |
|
"learning_rate": 1.3663113444380905e-05, |
|
"loss": 0.3774, |
|
"step": 1120 |
|
}, |
|
{ |
|
"epoch": 4.26, |
|
"grad_norm": 0.72265625, |
|
"learning_rate": 1.3003523301992104e-05, |
|
"loss": 0.495, |
|
"step": 1125 |
|
}, |
|
{ |
|
"epoch": 4.28, |
|
"grad_norm": 0.63671875, |
|
"learning_rate": 1.2359142213884933e-05, |
|
"loss": 0.4335, |
|
"step": 1130 |
|
}, |
|
{ |
|
"epoch": 4.3, |
|
"grad_norm": 0.640625, |
|
"learning_rate": 1.1730082833207202e-05, |
|
"loss": 0.358, |
|
"step": 1135 |
|
}, |
|
{ |
|
"epoch": 4.32, |
|
"grad_norm": 0.57421875, |
|
"learning_rate": 1.1116455134507664e-05, |
|
"loss": 0.357, |
|
"step": 1140 |
|
}, |
|
{ |
|
"epoch": 4.34, |
|
"grad_norm": 0.6875, |
|
"learning_rate": 1.0518366394509804e-05, |
|
"loss": 0.4767, |
|
"step": 1145 |
|
}, |
|
{ |
|
"epoch": 4.36, |
|
"grad_norm": 0.63671875, |
|
"learning_rate": 9.935921173357442e-06, |
|
"loss": 0.4063, |
|
"step": 1150 |
|
}, |
|
{ |
|
"epoch": 4.38, |
|
"grad_norm": 0.6484375, |
|
"learning_rate": 9.369221296335006e-06, |
|
"loss": 0.3668, |
|
"step": 1155 |
|
}, |
|
{ |
|
"epoch": 4.39, |
|
"grad_norm": 0.85546875, |
|
"learning_rate": 8.818365836066101e-06, |
|
"loss": 0.3531, |
|
"step": 1160 |
|
}, |
|
{ |
|
"epoch": 4.41, |
|
"grad_norm": 0.66015625, |
|
"learning_rate": 8.283451095193229e-06, |
|
"loss": 0.3754, |
|
"step": 1165 |
|
}, |
|
{ |
|
"epoch": 4.43, |
|
"grad_norm": 0.734375, |
|
"learning_rate": 7.764570589541875e-06, |
|
"loss": 0.4278, |
|
"step": 1170 |
|
}, |
|
{ |
|
"epoch": 4.45, |
|
"grad_norm": 0.73828125, |
|
"learning_rate": 7.261815031771602e-06, |
|
"loss": 0.4919, |
|
"step": 1175 |
|
}, |
|
{ |
|
"epoch": 4.47, |
|
"grad_norm": 0.73046875, |
|
"learning_rate": 6.7752723155174226e-06, |
|
"loss": 0.4837, |
|
"step": 1180 |
|
}, |
|
{ |
|
"epoch": 4.49, |
|
"grad_norm": 0.78125, |
|
"learning_rate": 6.3050275000238414e-06, |
|
"loss": 0.4196, |
|
"step": 1185 |
|
}, |
|
{ |
|
"epoch": 4.51, |
|
"grad_norm": 0.671875, |
|
"learning_rate": 5.851162795274445e-06, |
|
"loss": 0.4928, |
|
"step": 1190 |
|
}, |
|
{ |
|
"epoch": 4.53, |
|
"grad_norm": 0.84765625, |
|
"learning_rate": 5.413757547619747e-06, |
|
"loss": 0.4489, |
|
"step": 1195 |
|
}, |
|
{ |
|
"epoch": 4.55, |
|
"grad_norm": 0.5390625, |
|
"learning_rate": 4.992888225905468e-06, |
|
"loss": 0.4083, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 4.56, |
|
"grad_norm": 0.734375, |
|
"learning_rate": 4.5886284081039675e-06, |
|
"loss": 0.5487, |
|
"step": 1205 |
|
}, |
|
{ |
|
"epoch": 4.58, |
|
"grad_norm": 0.70703125, |
|
"learning_rate": 4.20104876845111e-06, |
|
"loss": 0.3817, |
|
"step": 1210 |
|
}, |
|
{ |
|
"epoch": 4.6, |
|
"grad_norm": 0.68359375, |
|
"learning_rate": 3.830217065090702e-06, |
|
"loss": 0.4394, |
|
"step": 1215 |
|
}, |
|
{ |
|
"epoch": 4.62, |
|
"grad_norm": 0.6640625, |
|
"learning_rate": 3.476198128228736e-06, |
|
"loss": 0.3811, |
|
"step": 1220 |
|
}, |
|
{ |
|
"epoch": 4.64, |
|
"grad_norm": 0.6328125, |
|
"learning_rate": 3.139053848799556e-06, |
|
"loss": 0.3809, |
|
"step": 1225 |
|
}, |
|
{ |
|
"epoch": 4.66, |
|
"grad_norm": 0.828125, |
|
"learning_rate": 2.818843167645835e-06, |
|
"loss": 0.3755, |
|
"step": 1230 |
|
}, |
|
{ |
|
"epoch": 4.68, |
|
"grad_norm": 0.6640625, |
|
"learning_rate": 2.5156220652143404e-06, |
|
"loss": 0.4108, |
|
"step": 1235 |
|
}, |
|
{ |
|
"epoch": 4.7, |
|
"grad_norm": 0.58984375, |
|
"learning_rate": 2.2294435517691503e-06, |
|
"loss": 0.5031, |
|
"step": 1240 |
|
}, |
|
{ |
|
"epoch": 4.72, |
|
"grad_norm": 0.63671875, |
|
"learning_rate": 1.960357658124301e-06, |
|
"loss": 0.4176, |
|
"step": 1245 |
|
}, |
|
{ |
|
"epoch": 4.73, |
|
"grad_norm": 0.64453125, |
|
"learning_rate": 1.7084114268971275e-06, |
|
"loss": 0.3757, |
|
"step": 1250 |
|
}, |
|
{ |
|
"epoch": 4.75, |
|
"grad_norm": 0.60546875, |
|
"learning_rate": 1.4736489042840973e-06, |
|
"loss": 0.4244, |
|
"step": 1255 |
|
}, |
|
{ |
|
"epoch": 4.77, |
|
"grad_norm": 0.640625, |
|
"learning_rate": 1.2561111323605712e-06, |
|
"loss": 0.3933, |
|
"step": 1260 |
|
}, |
|
{ |
|
"epoch": 4.79, |
|
"grad_norm": 0.62109375, |
|
"learning_rate": 1.055836141905553e-06, |
|
"loss": 0.4223, |
|
"step": 1265 |
|
}, |
|
{ |
|
"epoch": 4.81, |
|
"grad_norm": 0.62890625, |
|
"learning_rate": 8.728589457530855e-07, |
|
"loss": 0.3988, |
|
"step": 1270 |
|
}, |
|
{ |
|
"epoch": 4.83, |
|
"grad_norm": 0.5078125, |
|
"learning_rate": 7.072115326711704e-07, |
|
"loss": 0.3916, |
|
"step": 1275 |
|
}, |
|
{ |
|
"epoch": 4.85, |
|
"grad_norm": 0.67578125, |
|
"learning_rate": 5.589228617693288e-07, |
|
"loss": 0.3814, |
|
"step": 1280 |
|
}, |
|
{ |
|
"epoch": 4.87, |
|
"grad_norm": 0.7578125, |
|
"learning_rate": 4.2801885743588567e-07, |
|
"loss": 0.4261, |
|
"step": 1285 |
|
}, |
|
{ |
|
"epoch": 4.89, |
|
"grad_norm": 0.62890625, |
|
"learning_rate": 3.145224048057727e-07, |
|
"loss": 0.4018, |
|
"step": 1290 |
|
}, |
|
{ |
|
"epoch": 4.91, |
|
"grad_norm": 0.60546875, |
|
"learning_rate": 2.1845334575963938e-07, |
|
"loss": 0.4824, |
|
"step": 1295 |
|
}, |
|
{ |
|
"epoch": 4.92, |
|
"grad_norm": 0.734375, |
|
"learning_rate": 1.3982847545507271e-07, |
|
"loss": 0.4293, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 4.94, |
|
"grad_norm": 0.7734375, |
|
"learning_rate": 7.866153939033449e-08, |
|
"loss": 0.3955, |
|
"step": 1305 |
|
}, |
|
{ |
|
"epoch": 4.96, |
|
"grad_norm": 0.80859375, |
|
"learning_rate": 3.496323100138366e-08, |
|
"loss": 0.3909, |
|
"step": 1310 |
|
}, |
|
{ |
|
"epoch": 4.98, |
|
"grad_norm": 0.7734375, |
|
"learning_rate": 8.7411897923384e-09, |
|
"loss": 0.4108, |
|
"step": 1315 |
|
}, |
|
{ |
|
"epoch": 5.0, |
|
"grad_norm": 0.66796875, |
|
"learning_rate": 0.0, |
|
"loss": 0.3396, |
|
"step": 1320 |
|
}, |
|
{ |
|
"epoch": 5.0, |
|
"eval_loss": 1.6054625511169434, |
|
"eval_runtime": 67.9944, |
|
"eval_samples_per_second": 3.897, |
|
"eval_steps_per_second": 0.5, |
|
"step": 1320 |
|
}, |
|
{ |
|
"epoch": 5.0, |
|
"step": 1320, |
|
"total_flos": 9.280995169497252e+17, |
|
"train_loss": 1.0955926315350966, |
|
"train_runtime": 10263.5013, |
|
"train_samples_per_second": 1.028, |
|
"train_steps_per_second": 0.129 |
|
} |
|
], |
|
"logging_steps": 5, |
|
"max_steps": 1320, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 5, |
|
"save_steps": 100, |
|
"total_flos": 9.280995169497252e+17, |
|
"train_batch_size": 4, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|