{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 10.0,
  "eval_steps": 500,
  "global_step": 5460,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.05,
      "grad_norm": 0.0019010665128007531,
      "learning_rate": 0.0005,
      "loss": 0.0009,
      "step": 25
    },
    {
      "epoch": 0.09,
      "grad_norm": 0.33505979180336,
      "learning_rate": 0.001,
      "loss": 0.0059,
      "step": 50
    },
    {
      "epoch": 0.14,
      "grad_norm": 2.521899700164795,
      "learning_rate": 0.0009953789279112755,
      "loss": 0.2569,
      "step": 75
    },
    {
      "epoch": 0.18,
      "grad_norm": 1.3200310468673706,
      "learning_rate": 0.000990757855822551,
      "loss": 2.1666,
      "step": 100
    },
    {
      "epoch": 0.23,
      "grad_norm": 1.1342906951904297,
      "learning_rate": 0.0009861367837338264,
      "loss": 0.3677,
      "step": 125
    },
    {
      "epoch": 0.27,
      "grad_norm": 0.6839067935943604,
      "learning_rate": 0.0009815157116451016,
      "loss": 0.2573,
      "step": 150
    },
    {
      "epoch": 0.32,
      "grad_norm": 1.120778203010559,
      "learning_rate": 0.0009768946395563771,
      "loss": 0.1784,
      "step": 175
    },
    {
      "epoch": 0.37,
      "grad_norm": 0.39630964398384094,
      "learning_rate": 0.0009722735674676525,
      "loss": 0.1007,
      "step": 200
    },
    {
      "epoch": 0.41,
      "grad_norm": 0.8115565776824951,
      "learning_rate": 0.0009676524953789279,
      "loss": 0.1009,
      "step": 225
    },
    {
      "epoch": 0.46,
      "grad_norm": 0.4256753921508789,
      "learning_rate": 0.0009630314232902033,
      "loss": 0.0613,
      "step": 250
    },
    {
      "epoch": 0.5,
      "grad_norm": 1.3296879529953003,
      "learning_rate": 0.0009584103512014787,
      "loss": 0.0656,
      "step": 275
    },
    {
      "epoch": 0.55,
      "grad_norm": 0.2332513928413391,
      "learning_rate": 0.0009537892791127542,
      "loss": 0.1056,
      "step": 300
    },
    {
      "epoch": 0.6,
      "grad_norm": 0.361102819442749,
      "learning_rate": 0.0009491682070240297,
      "loss": 0.0714,
      "step": 325
    },
    {
      "epoch": 0.64,
      "grad_norm": 0.3176443576812744,
      "learning_rate": 0.000944547134935305,
      "loss": 0.051,
      "step": 350
    },
    {
      "epoch": 0.69,
      "grad_norm": 0.5561681389808655,
      "learning_rate": 0.0009399260628465805,
      "loss": 0.0987,
      "step": 375
    },
    {
      "epoch": 0.73,
      "grad_norm": 0.17937970161437988,
      "learning_rate": 0.0009353049907578558,
      "loss": 0.0705,
      "step": 400
    },
    {
      "epoch": 0.78,
      "grad_norm": 0.11134567111730576,
      "learning_rate": 0.0009306839186691313,
      "loss": 0.0449,
      "step": 425
    },
    {
      "epoch": 0.82,
      "grad_norm": 0.1100844293832779,
      "learning_rate": 0.0009260628465804066,
      "loss": 0.0605,
      "step": 450
    },
    {
      "epoch": 0.87,
      "grad_norm": 0.5044831037521362,
      "learning_rate": 0.0009214417744916821,
      "loss": 0.0682,
      "step": 475
    },
    {
      "epoch": 0.92,
      "grad_norm": 0.1506507396697998,
      "learning_rate": 0.0009168207024029575,
      "loss": 0.0382,
      "step": 500
    },
    {
      "epoch": 0.96,
      "grad_norm": 0.358964741230011,
      "learning_rate": 0.0009121996303142329,
      "loss": 0.0416,
      "step": 525
    },
    {
      "epoch": 1.0,
      "eval_loss": 0.22685140371322632,
      "eval_runtime": 467.2357,
      "eval_samples_per_second": 1.74,
      "eval_steps_per_second": 0.291,
      "step": 546
    },
    {
      "epoch": 1.01,
      "grad_norm": 0.21843843162059784,
      "learning_rate": 0.0009075785582255084,
      "loss": 0.0283,
      "step": 550
    },
    {
      "epoch": 1.05,
      "grad_norm": 0.3907661437988281,
      "learning_rate": 0.0009029574861367837,
      "loss": 0.0322,
      "step": 575
    },
    {
      "epoch": 1.1,
      "grad_norm": 0.026330502703785896,
      "learning_rate": 0.0008983364140480592,
      "loss": 0.021,
      "step": 600
    },
    {
      "epoch": 1.14,
      "grad_norm": 0.015783503651618958,
      "learning_rate": 0.0008937153419593346,
      "loss": 0.0237,
      "step": 625
    },
    {
      "epoch": 1.19,
      "grad_norm": 0.07014349848031998,
      "learning_rate": 0.00088909426987061,
      "loss": 0.0307,
      "step": 650
    },
    {
      "epoch": 1.24,
      "grad_norm": 0.05321989953517914,
      "learning_rate": 0.0008844731977818854,
      "loss": 0.0217,
      "step": 675
    },
    {
      "epoch": 1.28,
      "grad_norm": 0.07509706914424896,
      "learning_rate": 0.0008798521256931608,
      "loss": 0.0319,
      "step": 700
    },
    {
      "epoch": 1.33,
      "grad_norm": 0.239598348736763,
      "learning_rate": 0.0008752310536044362,
      "loss": 0.0373,
      "step": 725
    },
    {
      "epoch": 1.37,
      "grad_norm": 0.1276847869157791,
      "learning_rate": 0.0008706099815157116,
      "loss": 0.0308,
      "step": 750
    },
    {
      "epoch": 1.42,
      "grad_norm": 0.17856040596961975,
      "learning_rate": 0.000865988909426987,
      "loss": 0.0326,
      "step": 775
    },
    {
      "epoch": 1.47,
      "grad_norm": 0.5299984216690063,
      "learning_rate": 0.0008613678373382626,
      "loss": 0.034,
      "step": 800
    },
    {
      "epoch": 1.51,
      "grad_norm": 0.18570055067539215,
      "learning_rate": 0.0008567467652495379,
      "loss": 0.0255,
      "step": 825
    },
    {
      "epoch": 1.56,
      "grad_norm": 0.012400169856846333,
      "learning_rate": 0.0008521256931608134,
      "loss": 0.0133,
      "step": 850
    },
    {
      "epoch": 1.6,
      "grad_norm": 0.18435439467430115,
      "learning_rate": 0.0008475046210720887,
      "loss": 0.0392,
      "step": 875
    },
    {
      "epoch": 1.65,
      "grad_norm": 0.20227985084056854,
      "learning_rate": 0.0008428835489833642,
      "loss": 0.0339,
      "step": 900
    },
    {
      "epoch": 1.69,
      "grad_norm": 0.09818145632743835,
      "learning_rate": 0.0008382624768946395,
      "loss": 0.039,
      "step": 925
    },
    {
      "epoch": 1.74,
      "grad_norm": 0.17143449187278748,
      "learning_rate": 0.000833641404805915,
      "loss": 0.0256,
      "step": 950
    },
    {
      "epoch": 1.79,
      "grad_norm": 0.3052353858947754,
      "learning_rate": 0.0008290203327171904,
      "loss": 0.0279,
      "step": 975
    },
    {
      "epoch": 1.83,
      "grad_norm": 0.09069275110960007,
      "learning_rate": 0.0008243992606284658,
      "loss": 0.0253,
      "step": 1000
    },
    {
      "epoch": 1.88,
      "grad_norm": 0.7721070647239685,
      "learning_rate": 0.0008197781885397413,
      "loss": 0.0222,
      "step": 1025
    },
    {
      "epoch": 1.92,
      "grad_norm": 0.2378959357738495,
      "learning_rate": 0.0008151571164510166,
      "loss": 0.0386,
      "step": 1050
    },
    {
      "epoch": 1.97,
      "grad_norm": 0.004904525820165873,
      "learning_rate": 0.0008105360443622921,
      "loss": 0.0243,
      "step": 1075
    },
    {
      "epoch": 2.0,
      "eval_loss": 0.20544852316379547,
      "eval_runtime": 463.5714,
      "eval_samples_per_second": 1.754,
      "eval_steps_per_second": 0.293,
      "step": 1092
    },
    {
      "epoch": 2.01,
      "grad_norm": 0.13851934671401978,
      "learning_rate": 0.0008059149722735675,
      "loss": 0.0267,
      "step": 1100
    },
    {
      "epoch": 2.06,
      "grad_norm": 0.01875193975865841,
      "learning_rate": 0.0008012939001848429,
      "loss": 0.0117,
      "step": 1125
    },
    {
      "epoch": 2.11,
      "grad_norm": 0.0016854548593983054,
      "learning_rate": 0.0007966728280961183,
      "loss": 0.0317,
      "step": 1150
    },
    {
      "epoch": 2.15,
      "grad_norm": 0.10259977728128433,
      "learning_rate": 0.0007920517560073937,
      "loss": 0.0253,
      "step": 1175
    },
    {
      "epoch": 2.2,
      "grad_norm": 0.054936520755290985,
      "learning_rate": 0.0007874306839186691,
      "loss": 0.0228,
      "step": 1200
    },
    {
      "epoch": 2.24,
      "grad_norm": 0.08183781057596207,
      "learning_rate": 0.0007828096118299445,
      "loss": 0.0163,
      "step": 1225
    },
    {
      "epoch": 2.29,
      "grad_norm": 0.03448671102523804,
      "learning_rate": 0.0007781885397412199,
      "loss": 0.0252,
      "step": 1250
    },
    {
      "epoch": 2.34,
      "grad_norm": 0.026254719123244286,
      "learning_rate": 0.0007735674676524955,
      "loss": 0.0263,
      "step": 1275
    },
    {
      "epoch": 2.38,
      "grad_norm": 0.07833431661128998,
      "learning_rate": 0.0007689463955637708,
      "loss": 0.0268,
      "step": 1300
    },
    {
      "epoch": 2.43,
      "grad_norm": 0.3215916156768799,
      "learning_rate": 0.0007643253234750463,
      "loss": 0.0184,
      "step": 1325
    },
    {
      "epoch": 2.47,
      "grad_norm": 0.1949949562549591,
      "learning_rate": 0.0007597042513863216,
      "loss": 0.0234,
      "step": 1350
    },
    {
      "epoch": 2.52,
      "grad_norm": 0.10761301964521408,
      "learning_rate": 0.0007550831792975971,
      "loss": 0.0214,
      "step": 1375
    },
    {
      "epoch": 2.56,
      "grad_norm": 0.23488566279411316,
      "learning_rate": 0.0007504621072088724,
      "loss": 0.0321,
      "step": 1400
    },
    {
      "epoch": 2.61,
      "grad_norm": 0.16079489886760712,
      "learning_rate": 0.0007458410351201479,
      "loss": 0.041,
      "step": 1425
    },
    {
      "epoch": 2.66,
      "grad_norm": 0.3500367999076843,
      "learning_rate": 0.0007412199630314234,
      "loss": 0.0218,
      "step": 1450
    },
    {
      "epoch": 2.7,
      "grad_norm": 0.01966880075633526,
      "learning_rate": 0.0007365988909426987,
      "loss": 0.0185,
      "step": 1475
    },
    {
      "epoch": 2.75,
      "grad_norm": 0.10084854066371918,
      "learning_rate": 0.0007319778188539742,
      "loss": 0.0158,
      "step": 1500
    },
    {
      "epoch": 2.79,
      "grad_norm": 0.045843809843063354,
      "learning_rate": 0.0007273567467652495,
      "loss": 0.0193,
      "step": 1525
    },
    {
      "epoch": 2.84,
      "grad_norm": 0.19230197370052338,
      "learning_rate": 0.000722735674676525,
      "loss": 0.0115,
      "step": 1550
    },
    {
      "epoch": 2.88,
      "grad_norm": 0.10168833285570145,
      "learning_rate": 0.0007181146025878004,
      "loss": 0.0173,
      "step": 1575
    },
    {
      "epoch": 2.93,
      "grad_norm": 0.24770613014698029,
      "learning_rate": 0.0007134935304990758,
      "loss": 0.019,
      "step": 1600
    },
    {
      "epoch": 2.98,
      "grad_norm": 0.04277370125055313,
      "learning_rate": 0.0007088724584103512,
      "loss": 0.0262,
      "step": 1625
    },
    {
      "epoch": 3.0,
      "eval_loss": 0.18655328452587128,
      "eval_runtime": 463.8687,
      "eval_samples_per_second": 1.753,
      "eval_steps_per_second": 0.293,
      "step": 1638
    },
    {
      "epoch": 3.02,
      "grad_norm": 0.03143952414393425,
      "learning_rate": 0.0007042513863216266,
      "loss": 0.0185,
      "step": 1650
    },
    {
      "epoch": 3.07,
      "grad_norm": 0.061480745673179626,
      "learning_rate": 0.000699630314232902,
      "loss": 0.0158,
      "step": 1675
    },
    {
      "epoch": 3.11,
      "grad_norm": 0.05645143985748291,
      "learning_rate": 0.0006950092421441774,
      "loss": 0.0163,
      "step": 1700
    },
    {
      "epoch": 3.16,
      "grad_norm": 0.3927539885044098,
      "learning_rate": 0.0006903881700554528,
      "loss": 0.0257,
      "step": 1725
    },
    {
      "epoch": 3.21,
      "grad_norm": 0.1579461544752121,
      "learning_rate": 0.0006857670979667284,
      "loss": 0.0203,
      "step": 1750
    },
    {
      "epoch": 3.25,
      "grad_norm": 0.003284105099737644,
      "learning_rate": 0.0006811460258780037,
      "loss": 0.012,
      "step": 1775
    },
    {
      "epoch": 3.3,
      "grad_norm": 0.0939943715929985,
      "learning_rate": 0.0006765249537892792,
      "loss": 0.0139,
      "step": 1800
    },
    {
      "epoch": 3.34,
      "grad_norm": 0.08114974200725555,
      "learning_rate": 0.0006719038817005545,
      "loss": 0.0134,
      "step": 1825
    },
    {
      "epoch": 3.39,
      "grad_norm": 0.008277042768895626,
      "learning_rate": 0.00066728280961183,
      "loss": 0.0203,
      "step": 1850
    },
    {
      "epoch": 3.43,
      "grad_norm": 0.014137201942503452,
      "learning_rate": 0.0006626617375231053,
      "loss": 0.018,
      "step": 1875
    },
    {
      "epoch": 3.48,
      "grad_norm": 0.04209378361701965,
      "learning_rate": 0.0006580406654343808,
      "loss": 0.0107,
      "step": 1900
    },
    {
      "epoch": 3.53,
      "grad_norm": 0.015557551756501198,
      "learning_rate": 0.0006534195933456563,
      "loss": 0.0104,
      "step": 1925
    },
    {
      "epoch": 3.57,
      "grad_norm": 0.021405475214123726,
      "learning_rate": 0.0006487985212569316,
      "loss": 0.0117,
      "step": 1950
    },
    {
      "epoch": 3.62,
      "grad_norm": 0.0015239958884194493,
      "learning_rate": 0.0006441774491682071,
      "loss": 0.0176,
      "step": 1975
    },
    {
      "epoch": 3.66,
      "grad_norm": 0.0997876301407814,
      "learning_rate": 0.0006395563770794824,
      "loss": 0.0183,
      "step": 2000
    },
    {
      "epoch": 3.71,
      "grad_norm": 0.004715020768344402,
      "learning_rate": 0.0006349353049907579,
      "loss": 0.0199,
      "step": 2025
    },
    {
      "epoch": 3.75,
      "grad_norm": 0.1075858548283577,
      "learning_rate": 0.0006303142329020333,
      "loss": 0.0201,
      "step": 2050
    },
    {
      "epoch": 3.8,
      "grad_norm": 0.020496558398008347,
      "learning_rate": 0.0006256931608133087,
      "loss": 0.0145,
      "step": 2075
    },
    {
      "epoch": 3.85,
      "grad_norm": 0.11063025891780853,
      "learning_rate": 0.0006210720887245841,
      "loss": 0.0201,
      "step": 2100
    },
    {
      "epoch": 3.89,
      "grad_norm": 0.1012192815542221,
      "learning_rate": 0.0006164510166358595,
      "loss": 0.0223,
      "step": 2125
    },
    {
      "epoch": 3.94,
      "grad_norm": 0.04694315418601036,
      "learning_rate": 0.0006118299445471349,
      "loss": 0.0163,
      "step": 2150
    },
    {
      "epoch": 3.98,
      "grad_norm": 0.05395512282848358,
      "learning_rate": 0.0006072088724584103,
      "loss": 0.009,
      "step": 2175
    },
    {
      "epoch": 4.0,
      "eval_loss": 0.20004291832447052,
      "eval_runtime": 464.3668,
      "eval_samples_per_second": 1.751,
      "eval_steps_per_second": 0.293,
      "step": 2184
    },
    {
      "epoch": 4.03,
      "grad_norm": 0.08517912030220032,
      "learning_rate": 0.0006025878003696857,
      "loss": 0.0152,
      "step": 2200
    },
    {
      "epoch": 4.08,
      "grad_norm": 0.23693686723709106,
      "learning_rate": 0.0005979667282809613,
      "loss": 0.0123,
      "step": 2225
    },
    {
      "epoch": 4.12,
      "grad_norm": 0.04390133172273636,
      "learning_rate": 0.0005933456561922366,
      "loss": 0.0104,
      "step": 2250
    },
    {
      "epoch": 4.17,
      "grad_norm": 0.048480305820703506,
      "learning_rate": 0.0005887245841035121,
      "loss": 0.0191,
      "step": 2275
    },
    {
      "epoch": 4.21,
      "grad_norm": 0.07334431260824203,
      "learning_rate": 0.0005841035120147874,
      "loss": 0.0079,
      "step": 2300
    },
    {
      "epoch": 4.26,
      "grad_norm": 0.26686009764671326,
      "learning_rate": 0.0005794824399260629,
      "loss": 0.0134,
      "step": 2325
    },
    {
      "epoch": 4.3,
      "grad_norm": 0.18834412097930908,
      "learning_rate": 0.0005748613678373382,
      "loss": 0.0108,
      "step": 2350
    },
    {
      "epoch": 4.35,
      "grad_norm": 0.11365604400634766,
      "learning_rate": 0.0005702402957486137,
      "loss": 0.0116,
      "step": 2375
    },
    {
      "epoch": 4.4,
      "grad_norm": 0.21077445149421692,
      "learning_rate": 0.0005656192236598892,
      "loss": 0.017,
      "step": 2400
    },
    {
      "epoch": 4.44,
      "grad_norm": 0.14450936019420624,
      "learning_rate": 0.0005609981515711645,
      "loss": 0.0056,
      "step": 2425
    },
    {
      "epoch": 4.49,
      "grad_norm": 0.07659462839365005,
      "learning_rate": 0.00055637707948244,
      "loss": 0.0128,
      "step": 2450
    },
    {
      "epoch": 4.53,
      "grad_norm": 0.07819797843694687,
      "learning_rate": 0.0005517560073937153,
      "loss": 0.0085,
      "step": 2475
    },
    {
      "epoch": 4.58,
      "grad_norm": 0.10529200732707977,
      "learning_rate": 0.0005471349353049908,
      "loss": 0.0156,
      "step": 2500
    },
    {
      "epoch": 4.62,
      "grad_norm": 0.034541305154561996,
      "learning_rate": 0.0005425138632162662,
      "loss": 0.0114,
      "step": 2525
    },
    {
      "epoch": 4.67,
      "grad_norm": 0.0043388293124735355,
      "learning_rate": 0.0005378927911275416,
      "loss": 0.0114,
      "step": 2550
    },
    {
      "epoch": 4.72,
      "grad_norm": 0.09843795001506805,
      "learning_rate": 0.000533271719038817,
      "loss": 0.0097,
      "step": 2575
    },
    {
      "epoch": 4.76,
      "grad_norm": 0.1924191564321518,
      "learning_rate": 0.0005286506469500924,
      "loss": 0.0138,
      "step": 2600
    },
    {
      "epoch": 4.81,
      "grad_norm": 0.0032940045930445194,
      "learning_rate": 0.0005240295748613678,
      "loss": 0.009,
      "step": 2625
    },
    {
      "epoch": 4.85,
      "grad_norm": 0.17411276698112488,
      "learning_rate": 0.0005194085027726432,
      "loss": 0.005,
      "step": 2650
    },
    {
      "epoch": 4.9,
      "grad_norm": 0.0008068850729614496,
      "learning_rate": 0.0005147874306839186,
      "loss": 0.0091,
      "step": 2675
    },
    {
      "epoch": 4.95,
      "grad_norm": 0.013785873539745808,
      "learning_rate": 0.0005101663585951941,
      "loss": 0.0174,
      "step": 2700
    },
    {
      "epoch": 4.99,
      "grad_norm": 0.06957102566957474,
      "learning_rate": 0.0005055452865064695,
      "loss": 0.0196,
      "step": 2725
    },
    {
      "epoch": 5.0,
      "eval_loss": 0.1927657425403595,
      "eval_runtime": 464.5332,
      "eval_samples_per_second": 1.75,
      "eval_steps_per_second": 0.293,
      "step": 2730
    },
    {
      "epoch": 5.04,
      "grad_norm": 0.1873362511396408,
      "learning_rate": 0.000500924214417745,
      "loss": 0.0114,
      "step": 2750
    },
    {
      "epoch": 5.08,
      "grad_norm": 0.013944294303655624,
      "learning_rate": 0.0004963031423290203,
      "loss": 0.0047,
      "step": 2775
    },
    {
      "epoch": 5.13,
      "grad_norm": 0.14739681780338287,
      "learning_rate": 0.0004916820702402958,
      "loss": 0.0064,
      "step": 2800
    },
    {
      "epoch": 5.17,
      "grad_norm": 0.039295587688684464,
      "learning_rate": 0.00048706099815157115,
      "loss": 0.0061,
      "step": 2825
    },
    {
      "epoch": 5.22,
      "grad_norm": 0.009731476195156574,
      "learning_rate": 0.0004824399260628466,
      "loss": 0.0064,
      "step": 2850
    },
    {
      "epoch": 5.27,
      "grad_norm": 0.009130421094596386,
      "learning_rate": 0.000477818853974122,
      "loss": 0.0056,
      "step": 2875
    },
    {
      "epoch": 5.31,
      "grad_norm": 0.10517439246177673,
      "learning_rate": 0.0004731977818853974,
      "loss": 0.0095,
      "step": 2900
    },
    {
      "epoch": 5.36,
      "grad_norm": 0.03147244080901146,
      "learning_rate": 0.00046857670979667283,
      "loss": 0.0069,
      "step": 2925
    },
    {
      "epoch": 5.4,
      "grad_norm": 0.07550155371427536,
      "learning_rate": 0.00046395563770794824,
      "loss": 0.0084,
      "step": 2950
    },
    {
      "epoch": 5.45,
      "grad_norm": 0.09899873286485672,
      "learning_rate": 0.00045933456561922365,
      "loss": 0.0087,
      "step": 2975
    },
    {
      "epoch": 5.49,
      "grad_norm": 0.062454238533973694,
      "learning_rate": 0.00045471349353049906,
      "loss": 0.0114,
      "step": 3000
    },
    {
      "epoch": 5.54,
      "grad_norm": 0.14996998012065887,
      "learning_rate": 0.00045009242144177446,
      "loss": 0.0091,
      "step": 3025
    },
    {
      "epoch": 5.59,
      "grad_norm": 0.19108814001083374,
      "learning_rate": 0.00044547134935304987,
      "loss": 0.0147,
      "step": 3050
    },
    {
      "epoch": 5.63,
      "grad_norm": 0.14450325071811676,
      "learning_rate": 0.00044085027726432533,
      "loss": 0.0152,
      "step": 3075
    },
    {
      "epoch": 5.68,
      "grad_norm": 0.04423892870545387,
      "learning_rate": 0.0004362292051756008,
      "loss": 0.006,
      "step": 3100
    },
    {
      "epoch": 5.72,
      "grad_norm": 0.13844439387321472,
      "learning_rate": 0.0004316081330868762,
      "loss": 0.009,
      "step": 3125
    },
    {
      "epoch": 5.77,
      "grad_norm": 0.0006735218339599669,
      "learning_rate": 0.0004269870609981516,
      "loss": 0.0058,
      "step": 3150
    },
    {
      "epoch": 5.82,
      "grad_norm": 0.011760660447180271,
      "learning_rate": 0.000422365988909427,
      "loss": 0.0049,
      "step": 3175
    },
    {
      "epoch": 5.86,
      "grad_norm": 0.08969856053590775,
      "learning_rate": 0.0004177449168207024,
      "loss": 0.0065,
      "step": 3200
    },
    {
      "epoch": 5.91,
      "grad_norm": 0.12556907534599304,
      "learning_rate": 0.00041312384473197783,
      "loss": 0.0089,
      "step": 3225
    },
    {
      "epoch": 5.95,
      "grad_norm": 0.017725255340337753,
      "learning_rate": 0.00040850277264325324,
      "loss": 0.0088,
      "step": 3250
    },
    {
      "epoch": 6.0,
      "grad_norm": 0.009897828102111816,
      "learning_rate": 0.00040388170055452864,
      "loss": 0.0071,
      "step": 3275
    },
    {
      "epoch": 6.0,
      "eval_loss": 0.20994354784488678,
      "eval_runtime": 463.0078,
      "eval_samples_per_second": 1.756,
      "eval_steps_per_second": 0.294,
      "step": 3276
    },
    {
      "epoch": 6.04,
      "grad_norm": 0.0028004159685224295,
      "learning_rate": 0.00039926062846580405,
      "loss": 0.0093,
      "step": 3300
    },
    {
      "epoch": 6.09,
      "grad_norm": 0.10490375012159348,
      "learning_rate": 0.0003946395563770795,
      "loss": 0.0053,
      "step": 3325
    },
    {
      "epoch": 6.14,
      "grad_norm": 0.019779745489358902,
      "learning_rate": 0.0003900184842883549,
      "loss": 0.0084,
      "step": 3350
    },
    {
      "epoch": 6.18,
      "grad_norm": 0.00020589173072949052,
      "learning_rate": 0.00038539741219963033,
      "loss": 0.0029,
      "step": 3375
    },
    {
      "epoch": 6.23,
      "grad_norm": 0.003221085062250495,
      "learning_rate": 0.00038077634011090574,
      "loss": 0.0051,
      "step": 3400
    },
    {
      "epoch": 6.27,
      "grad_norm": 0.00455264188349247,
      "learning_rate": 0.00037615526802218114,
      "loss": 0.0063,
      "step": 3425
    },
    {
      "epoch": 6.32,
      "grad_norm": 0.00967650581151247,
      "learning_rate": 0.00037153419593345655,
      "loss": 0.0035,
      "step": 3450
    },
    {
      "epoch": 6.36,
      "grad_norm": 0.009352604858577251,
      "learning_rate": 0.00036691312384473196,
      "loss": 0.0065,
      "step": 3475
    },
    {
      "epoch": 6.41,
      "grad_norm": 0.002876508515328169,
      "learning_rate": 0.00036229205175600736,
      "loss": 0.0049,
      "step": 3500
    },
    {
      "epoch": 6.46,
      "grad_norm": 0.002914861775934696,
      "learning_rate": 0.00035767097966728277,
      "loss": 0.0043,
      "step": 3525
    },
    {
      "epoch": 6.5,
      "grad_norm": 0.021481545642018318,
      "learning_rate": 0.0003530499075785583,
      "loss": 0.0072,
      "step": 3550
    },
    {
      "epoch": 6.55,
      "grad_norm": 0.08110266923904419,
      "learning_rate": 0.0003484288354898337,
      "loss": 0.0044,
      "step": 3575
    },
    {
      "epoch": 6.59,
      "grad_norm": 0.020943278446793556,
      "learning_rate": 0.0003438077634011091,
      "loss": 0.0072,
      "step": 3600
    },
    {
      "epoch": 6.64,
      "grad_norm": 0.005692564882338047,
      "learning_rate": 0.0003391866913123845,
      "loss": 0.0078,
      "step": 3625
    },
    {
      "epoch": 6.68,
      "grad_norm": 0.11609622091054916,
      "learning_rate": 0.0003345656192236599,
      "loss": 0.008,
      "step": 3650
    },
    {
      "epoch": 6.73,
      "grad_norm": 0.05904560536146164,
      "learning_rate": 0.0003299445471349353,
      "loss": 0.0061,
      "step": 3675
    },
    {
      "epoch": 6.78,
      "grad_norm": 0.03346557542681694,
      "learning_rate": 0.00032532347504621073,
      "loss": 0.0069,
      "step": 3700
    },
    {
      "epoch": 6.82,
      "grad_norm": 0.04848520830273628,
      "learning_rate": 0.00032070240295748614,
      "loss": 0.0078,
      "step": 3725
    },
    {
      "epoch": 6.87,
      "grad_norm": 0.11064545810222626,
      "learning_rate": 0.00031608133086876155,
      "loss": 0.0083,
      "step": 3750
    },
    {
      "epoch": 6.91,
      "grad_norm": 0.001821186626330018,
      "learning_rate": 0.00031146025878003695,
      "loss": 0.0084,
      "step": 3775
    },
    {
      "epoch": 6.96,
      "grad_norm": 0.03919747844338417,
      "learning_rate": 0.0003068391866913124,
      "loss": 0.0054,
      "step": 3800
    },
    {
      "epoch": 7.0,
      "eval_loss": 0.20703129470348358,
      "eval_runtime": 463.5868,
      "eval_samples_per_second": 1.754,
      "eval_steps_per_second": 0.293,
      "step": 3822
    },
    {
      "epoch": 7.01,
      "grad_norm": 0.008144889958202839,
      "learning_rate": 0.0003022181146025878,
      "loss": 0.0057,
      "step": 3825
    },
    {
      "epoch": 7.05,
      "grad_norm": 0.005378293804824352,
      "learning_rate": 0.00029759704251386323,
      "loss": 0.004,
      "step": 3850
    },
    {
      "epoch": 7.1,
      "grad_norm": 0.03501349315047264,
      "learning_rate": 0.00029297597042513864,
      "loss": 0.003,
      "step": 3875
    },
    {
      "epoch": 7.14,
      "grad_norm": 0.07073014974594116,
      "learning_rate": 0.00028835489833641404,
      "loss": 0.0029,
      "step": 3900
    },
    {
      "epoch": 7.19,
      "grad_norm": 0.09017951786518097,
      "learning_rate": 0.00028373382624768945,
      "loss": 0.0027,
      "step": 3925
    },
    {
      "epoch": 7.23,
      "grad_norm": 0.009881277568638325,
      "learning_rate": 0.00027911275415896486,
      "loss": 0.0044,
      "step": 3950
    },
    {
      "epoch": 7.28,
      "grad_norm": 0.0018990118987858295,
      "learning_rate": 0.00027449168207024027,
      "loss": 0.0031,
      "step": 3975
    },
    {
      "epoch": 7.33,
      "grad_norm": 0.004116680007427931,
      "learning_rate": 0.00026987060998151567,
      "loss": 0.0026,
      "step": 4000
    },
    {
      "epoch": 7.37,
      "grad_norm": 0.03917807340621948,
      "learning_rate": 0.00026524953789279113,
      "loss": 0.0038,
      "step": 4025
    },
    {
      "epoch": 7.42,
      "grad_norm": 0.0030583201441913843,
      "learning_rate": 0.0002606284658040666,
      "loss": 0.0032,
      "step": 4050
    },
    {
      "epoch": 7.46,
      "grad_norm": 0.0014874553307890892,
      "learning_rate": 0.000256007393715342,
      "loss": 0.0054,
      "step": 4075
    },
    {
      "epoch": 7.51,
      "grad_norm": 0.0008628646028228104,
      "learning_rate": 0.0002513863216266174,
      "loss": 0.0019,
      "step": 4100
    },
    {
      "epoch": 7.55,
      "grad_norm": 0.02715575322508812,
      "learning_rate": 0.00024676524953789276,
      "loss": 0.0037,
      "step": 4125
    },
    {
      "epoch": 7.6,
      "grad_norm": 0.0031906655058264732,
      "learning_rate": 0.00024214417744916822,
      "loss": 0.0058,
      "step": 4150
    },
    {
      "epoch": 7.65,
      "grad_norm": 0.011863148771226406,
      "learning_rate": 0.00023752310536044363,
      "loss": 0.0022,
      "step": 4175
    },
    {
      "epoch": 7.69,
      "grad_norm": 0.0015202141366899014,
      "learning_rate": 0.00023290203327171904,
      "loss": 0.0045,
      "step": 4200
    },
    {
      "epoch": 7.74,
      "grad_norm": 0.02240474335849285,
      "learning_rate": 0.00022828096118299447,
      "loss": 0.0039,
      "step": 4225
    },
    {
      "epoch": 7.78,
      "grad_norm": 0.00918908603489399,
      "learning_rate": 0.00022365988909426988,
      "loss": 0.0014,
      "step": 4250
    },
    {
      "epoch": 7.83,
      "grad_norm": 0.005950120277702808,
      "learning_rate": 0.0002190388170055453,
      "loss": 0.0031,
      "step": 4275
    },
    {
      "epoch": 7.88,
      "grad_norm": 0.07433830946683884,
      "learning_rate": 0.0002144177449168207,
      "loss": 0.002,
      "step": 4300
    },
    {
      "epoch": 7.92,
      "grad_norm": 0.09878811240196228,
      "learning_rate": 0.0002097966728280961,
      "loss": 0.0039,
      "step": 4325
    },
    {
      "epoch": 7.97,
      "grad_norm": 0.004627088084816933,
      "learning_rate": 0.00020517560073937154,
      "loss": 0.0066,
      "step": 4350
    },
    {
      "epoch": 8.0,
      "eval_loss": 0.21887589991092682,
      "eval_runtime": 464.2458,
      "eval_samples_per_second": 1.751,
      "eval_steps_per_second": 0.293,
      "step": 4368
    },
    {
      "epoch": 8.01,
      "grad_norm": 0.016953645274043083,
      "learning_rate": 0.00020055452865064697,
      "loss": 0.002,
      "step": 4375
    },
    {
      "epoch": 8.06,
      "grad_norm": 0.00016232863708864897,
      "learning_rate": 0.00019593345656192238,
      "loss": 0.0022,
      "step": 4400
    },
    {
      "epoch": 8.1,
      "grad_norm": 0.00045125139877200127,
      "learning_rate": 0.00019131238447319779,
      "loss": 0.0016,
      "step": 4425
    },
    {
      "epoch": 8.15,
      "grad_norm": 0.02065761759877205,
      "learning_rate": 0.0001866913123844732,
      "loss": 0.0017,
      "step": 4450
    },
    {
      "epoch": 8.2,
      "grad_norm": 0.042185261845588684,
      "learning_rate": 0.00018207024029574863,
      "loss": 0.0027,
      "step": 4475
    },
    {
      "epoch": 8.24,
      "grad_norm": 0.003087196499109268,
      "learning_rate": 0.00017744916820702404,
      "loss": 0.0018,
      "step": 4500
    },
    {
      "epoch": 8.29,
      "grad_norm": 0.02859407104551792,
      "learning_rate": 0.00017282809611829944,
      "loss": 0.0015,
      "step": 4525
    },
    {
      "epoch": 8.33,
      "grad_norm": 0.00041793755372054875,
      "learning_rate": 0.00016820702402957485,
      "loss": 0.0035,
      "step": 4550
    },
    {
      "epoch": 8.38,
      "grad_norm": 0.0037734461948275566,
      "learning_rate": 0.00016358595194085026,
      "loss": 0.002,
      "step": 4575
    },
    {
      "epoch": 8.42,
      "grad_norm": 0.0030207443051040173,
      "learning_rate": 0.00015896487985212572,
      "loss": 0.0022,
      "step": 4600
    },
    {
      "epoch": 8.47,
      "grad_norm": 0.0026946039870381355,
      "learning_rate": 0.00015434380776340113,
      "loss": 0.0028,
      "step": 4625
    },
    {
      "epoch": 8.52,
      "grad_norm": 0.041892848908901215,
      "learning_rate": 0.00014972273567467653,
      "loss": 0.001,
      "step": 4650
    },
    {
      "epoch": 8.56,
      "grad_norm": 0.06906843930482864,
      "learning_rate": 0.00014510166358595194,
      "loss": 0.0013,
      "step": 4675
    },
    {
      "epoch": 8.61,
      "grad_norm": 0.00067297019995749,
      "learning_rate": 0.00014048059149722737,
      "loss": 0.0029,
      "step": 4700
    },
    {
      "epoch": 8.65,
      "grad_norm": 0.011746911332011223,
      "learning_rate": 0.00013585951940850278,
      "loss": 0.0012,
      "step": 4725
    },
    {
      "epoch": 8.7,
      "grad_norm": 0.0013995037879794836,
      "learning_rate": 0.0001312384473197782,
      "loss": 0.0017,
      "step": 4750
    },
    {
      "epoch": 8.75,
      "grad_norm": 0.009580204263329506,
      "learning_rate": 0.0001266173752310536,
      "loss": 0.001,
      "step": 4775
    },
    {
      "epoch": 8.79,
      "grad_norm": 0.0008843488758429885,
      "learning_rate": 0.00012199630314232903,
      "loss": 0.0015,
      "step": 4800
    },
    {
      "epoch": 8.84,
      "grad_norm": 0.0013571062590926886,
      "learning_rate": 0.00011737523105360444,
      "loss": 0.0024,
      "step": 4825
    },
    {
      "epoch": 8.88,
      "grad_norm": 0.01475840900093317,
      "learning_rate": 0.00011275415896487985,
      "loss": 0.0009,
      "step": 4850
    },
    {
      "epoch": 8.93,
      "grad_norm": 0.008486463688313961,
      "learning_rate": 0.00010813308687615527,
      "loss": 0.0022,
      "step": 4875
    },
    {
      "epoch": 8.97,
      "grad_norm": 0.0005981879075989127,
      "learning_rate": 0.00010351201478743069,
      "loss": 0.0006,
      "step": 4900
    },
    {
      "epoch": 9.0,
      "eval_loss": 0.23254649341106415,
      "eval_runtime": 463.7576,
      "eval_samples_per_second": 1.753,
      "eval_steps_per_second": 0.293,
      "step": 4914
    },
    {
      "epoch": 9.02,
      "grad_norm": 0.011784604750573635,
      "learning_rate": 9.889094269870611e-05,
      "loss": 0.0028,
      "step": 4925
    },
    {
      "epoch": 9.07,
      "grad_norm": 0.0005488657625392079,
      "learning_rate": 9.426987060998152e-05,
      "loss": 0.0016,
      "step": 4950
    },
    {
      "epoch": 9.11,
      "grad_norm": 0.0024228901602327824,
      "learning_rate": 8.964879852125694e-05,
      "loss": 0.001,
      "step": 4975
    },
    {
      "epoch": 9.16,
      "grad_norm": 0.0021140037570148706,
      "learning_rate": 8.502772643253234e-05,
      "loss": 0.0014,
      "step": 5000
    },
    {
      "epoch": 9.2,
      "grad_norm": 0.0011844311375170946,
      "learning_rate": 8.040665434380776e-05,
      "loss": 0.001,
      "step": 5025
    },
    {
      "epoch": 9.25,
      "grad_norm": 0.011841055937111378,
      "learning_rate": 7.578558225508319e-05,
      "loss": 0.0009,
      "step": 5050
    },
    {
      "epoch": 9.29,
      "grad_norm": 0.013395372778177261,
      "learning_rate": 7.116451016635859e-05,
      "loss": 0.0009,
      "step": 5075
    },
    {
      "epoch": 9.34,
      "grad_norm": 0.05545121058821678,
      "learning_rate": 6.654343807763401e-05,
      "loss": 0.0012,
      "step": 5100
    },
    {
      "epoch": 9.39,
      "grad_norm": 0.01891588233411312,
      "learning_rate": 6.192236598890943e-05,
      "loss": 0.0006,
      "step": 5125
    },
    {
      "epoch": 9.43,
      "grad_norm": 0.0025335114914923906,
      "learning_rate": 5.730129390018484e-05,
      "loss": 0.0006,
      "step": 5150
    },
    {
      "epoch": 9.48,
      "grad_norm": 0.0021167423110455275,
      "learning_rate": 5.268022181146026e-05,
      "loss": 0.0007,
      "step": 5175
    },
    {
      "epoch": 9.52,
      "grad_norm": 0.0011415353510528803,
      "learning_rate": 4.8059149722735676e-05,
      "loss": 0.0014,
      "step": 5200
    },
    {
      "epoch": 9.57,
      "grad_norm": 0.00026013093884103,
      "learning_rate": 4.343807763401109e-05,
      "loss": 0.0007,
      "step": 5225
    },
    {
      "epoch": 9.62,
      "grad_norm": 0.03879648819565773,
      "learning_rate": 3.8817005545286504e-05,
      "loss": 0.0007,
      "step": 5250
    },
    {
      "epoch": 9.66,
      "grad_norm": 0.006720875855535269,
      "learning_rate": 3.4195933456561925e-05,
      "loss": 0.0009,
      "step": 5275
    },
    {
      "epoch": 9.71,
      "grad_norm": 0.006371485069394112,
      "learning_rate": 2.957486136783734e-05,
      "loss": 0.0009,
      "step": 5300
    },
    {
      "epoch": 9.75,
      "grad_norm": 0.012291524559259415,
      "learning_rate": 2.4953789279112753e-05,
      "loss": 0.0012,
      "step": 5325
    },
    {
      "epoch": 9.8,
      "grad_norm": 0.012388636358082294,
      "learning_rate": 2.033271719038817e-05,
      "loss": 0.0006,
      "step": 5350
    },
    {
      "epoch": 9.84,
      "grad_norm": 0.0905984491109848,
      "learning_rate": 1.5711645101663588e-05,
      "loss": 0.0011,
      "step": 5375
    },
    {
      "epoch": 9.89,
      "grad_norm": 0.0024207117967307568,
      "learning_rate": 1.1090573012939002e-05,
      "loss": 0.001,
      "step": 5400
    },
    {
      "epoch": 9.94,
      "grad_norm": 0.003070174716413021,
      "learning_rate": 6.469500924214418e-06,
      "loss": 0.0008,
      "step": 5425
    },
    {
      "epoch": 9.98,
      "grad_norm": 0.012533812783658504,
      "learning_rate": 1.8484288354898337e-06,
      "loss": 0.001,
      "step": 5450
    },
    {
      "epoch": 10.0,
      "eval_loss": 0.23037216067314148,
      "eval_runtime": 463.1996,
      "eval_samples_per_second": 1.755,
      "eval_steps_per_second": 0.294,
      "step": 5460
    },
    {
      "epoch": 10.0,
      "step": 5460,
      "total_flos": 1.135723105419264e+20,
      "train_loss": 0.029495533068592733,
      "train_runtime": 29108.855,
      "train_samples_per_second": 1.124,
      "train_steps_per_second": 0.188
    }
  ],
  "logging_steps": 25,
  "max_steps": 5460,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 10,
  "save_steps": 500,
  "total_flos": 1.135723105419264e+20,
  "train_batch_size": 6,
  "trial_name": null,
  "trial_params": null
}