|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 13.2, |
|
"eval_steps": 500, |
|
"global_step": 33000, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 1.1794846057891846, |
|
"learning_rate": 0.0002, |
|
"loss": 2.9634, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 0.9956796169281006, |
|
"learning_rate": 0.0002, |
|
"loss": 2.5897, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"grad_norm": 0.7088788747787476, |
|
"learning_rate": 0.0002, |
|
"loss": 2.3967, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"grad_norm": 0.8429712057113647, |
|
"learning_rate": 0.0002, |
|
"loss": 2.403, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"grad_norm": 0.7503288388252258, |
|
"learning_rate": 0.0002, |
|
"loss": 2.3169, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"grad_norm": 0.6073698401451111, |
|
"learning_rate": 0.0002, |
|
"loss": 2.2718, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.28, |
|
"grad_norm": 0.6131550073623657, |
|
"learning_rate": 0.0002, |
|
"loss": 2.1968, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.32, |
|
"grad_norm": 1.0864523649215698, |
|
"learning_rate": 0.0002, |
|
"loss": 2.2322, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.36, |
|
"grad_norm": 0.7630054950714111, |
|
"learning_rate": 0.0002, |
|
"loss": 2.2196, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"grad_norm": 0.8388053178787231, |
|
"learning_rate": 0.0002, |
|
"loss": 2.2967, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.44, |
|
"grad_norm": 0.7474327087402344, |
|
"learning_rate": 0.0002, |
|
"loss": 2.222, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 0.48, |
|
"grad_norm": 0.7345255613327026, |
|
"learning_rate": 0.0002, |
|
"loss": 2.0606, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 0.52, |
|
"grad_norm": 0.5666030049324036, |
|
"learning_rate": 0.0002, |
|
"loss": 2.1423, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 0.56, |
|
"grad_norm": 0.7595005631446838, |
|
"learning_rate": 0.0002, |
|
"loss": 2.1629, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 0.6, |
|
"grad_norm": 0.5787087678909302, |
|
"learning_rate": 0.0002, |
|
"loss": 2.1169, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 0.64, |
|
"grad_norm": 0.9300134778022766, |
|
"learning_rate": 0.0002, |
|
"loss": 2.0647, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 0.68, |
|
"grad_norm": 0.6892948746681213, |
|
"learning_rate": 0.0002, |
|
"loss": 2.0773, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 0.72, |
|
"grad_norm": 0.5084001421928406, |
|
"learning_rate": 0.0002, |
|
"loss": 2.1002, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 0.76, |
|
"grad_norm": 0.470070481300354, |
|
"learning_rate": 0.0002, |
|
"loss": 2.0381, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 0.8, |
|
"grad_norm": 0.6220455169677734, |
|
"learning_rate": 0.0002, |
|
"loss": 2.0517, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 0.84, |
|
"grad_norm": 0.8656258583068848, |
|
"learning_rate": 0.0002, |
|
"loss": 1.9672, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 0.88, |
|
"grad_norm": 0.7517567873001099, |
|
"learning_rate": 0.0002, |
|
"loss": 2.0848, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 0.92, |
|
"grad_norm": 0.7987684011459351, |
|
"learning_rate": 0.0002, |
|
"loss": 2.0362, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 0.96, |
|
"grad_norm": 0.6208759546279907, |
|
"learning_rate": 0.0002, |
|
"loss": 2.0221, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"grad_norm": 0.5592771768569946, |
|
"learning_rate": 0.0002, |
|
"loss": 1.9436, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 1.04, |
|
"grad_norm": 0.8733409643173218, |
|
"learning_rate": 0.0002, |
|
"loss": 1.9028, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 1.08, |
|
"grad_norm": 0.6601306200027466, |
|
"learning_rate": 0.0002, |
|
"loss": 1.9403, |
|
"step": 2700 |
|
}, |
|
{ |
|
"epoch": 1.12, |
|
"grad_norm": 0.7789013385772705, |
|
"learning_rate": 0.0002, |
|
"loss": 1.8257, |
|
"step": 2800 |
|
}, |
|
{ |
|
"epoch": 1.16, |
|
"grad_norm": 0.5855867862701416, |
|
"learning_rate": 0.0002, |
|
"loss": 1.9351, |
|
"step": 2900 |
|
}, |
|
{ |
|
"epoch": 1.2, |
|
"grad_norm": 0.5030935406684875, |
|
"learning_rate": 0.0002, |
|
"loss": 1.9253, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 1.24, |
|
"grad_norm": 0.6684442758560181, |
|
"learning_rate": 0.0002, |
|
"loss": 1.9153, |
|
"step": 3100 |
|
}, |
|
{ |
|
"epoch": 1.28, |
|
"grad_norm": 0.6724442839622498, |
|
"learning_rate": 0.0002, |
|
"loss": 1.8726, |
|
"step": 3200 |
|
}, |
|
{ |
|
"epoch": 1.32, |
|
"grad_norm": 0.48911118507385254, |
|
"learning_rate": 0.0002, |
|
"loss": 1.9423, |
|
"step": 3300 |
|
}, |
|
{ |
|
"epoch": 1.3599999999999999, |
|
"grad_norm": 0.7769139409065247, |
|
"learning_rate": 0.0002, |
|
"loss": 1.7886, |
|
"step": 3400 |
|
}, |
|
{ |
|
"epoch": 1.4, |
|
"grad_norm": 0.639460027217865, |
|
"learning_rate": 0.0002, |
|
"loss": 1.9118, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 1.44, |
|
"grad_norm": 0.5745570659637451, |
|
"learning_rate": 0.0002, |
|
"loss": 1.8423, |
|
"step": 3600 |
|
}, |
|
{ |
|
"epoch": 1.48, |
|
"grad_norm": 0.6319829225540161, |
|
"learning_rate": 0.0002, |
|
"loss": 1.8893, |
|
"step": 3700 |
|
}, |
|
{ |
|
"epoch": 1.52, |
|
"grad_norm": 0.5839726328849792, |
|
"learning_rate": 0.0002, |
|
"loss": 1.8506, |
|
"step": 3800 |
|
}, |
|
{ |
|
"epoch": 1.56, |
|
"grad_norm": 0.7453562617301941, |
|
"learning_rate": 0.0002, |
|
"loss": 1.8698, |
|
"step": 3900 |
|
}, |
|
{ |
|
"epoch": 1.6, |
|
"grad_norm": 0.7091575264930725, |
|
"learning_rate": 0.0002, |
|
"loss": 1.8367, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 1.6400000000000001, |
|
"grad_norm": 0.722655177116394, |
|
"learning_rate": 0.0002, |
|
"loss": 1.8939, |
|
"step": 4100 |
|
}, |
|
{ |
|
"epoch": 1.6800000000000002, |
|
"grad_norm": 0.566392183303833, |
|
"learning_rate": 0.0002, |
|
"loss": 1.7557, |
|
"step": 4200 |
|
}, |
|
{ |
|
"epoch": 1.72, |
|
"grad_norm": 0.564018189907074, |
|
"learning_rate": 0.0002, |
|
"loss": 1.72, |
|
"step": 4300 |
|
}, |
|
{ |
|
"epoch": 1.76, |
|
"grad_norm": 0.7120116949081421, |
|
"learning_rate": 0.0002, |
|
"loss": 1.7436, |
|
"step": 4400 |
|
}, |
|
{ |
|
"epoch": 1.8, |
|
"grad_norm": 0.7515490651130676, |
|
"learning_rate": 0.0002, |
|
"loss": 1.83, |
|
"step": 4500 |
|
}, |
|
{ |
|
"epoch": 1.8399999999999999, |
|
"grad_norm": 0.6733573079109192, |
|
"learning_rate": 0.0002, |
|
"loss": 1.8012, |
|
"step": 4600 |
|
}, |
|
{ |
|
"epoch": 1.88, |
|
"grad_norm": 1.0119801759719849, |
|
"learning_rate": 0.0002, |
|
"loss": 1.8833, |
|
"step": 4700 |
|
}, |
|
{ |
|
"epoch": 1.92, |
|
"grad_norm": 0.73843914270401, |
|
"learning_rate": 0.0002, |
|
"loss": 1.8012, |
|
"step": 4800 |
|
}, |
|
{ |
|
"epoch": 1.96, |
|
"grad_norm": 0.5831142067909241, |
|
"learning_rate": 0.0002, |
|
"loss": 1.768, |
|
"step": 4900 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"grad_norm": 0.5329481959342957, |
|
"learning_rate": 0.0002, |
|
"loss": 1.8615, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 2.04, |
|
"grad_norm": 0.5874722599983215, |
|
"learning_rate": 0.0002, |
|
"loss": 1.6998, |
|
"step": 5100 |
|
}, |
|
{ |
|
"epoch": 2.08, |
|
"grad_norm": 0.5984659194946289, |
|
"learning_rate": 0.0002, |
|
"loss": 1.7303, |
|
"step": 5200 |
|
}, |
|
{ |
|
"epoch": 2.12, |
|
"grad_norm": 0.7614981532096863, |
|
"learning_rate": 0.0002, |
|
"loss": 1.6637, |
|
"step": 5300 |
|
}, |
|
{ |
|
"epoch": 2.16, |
|
"grad_norm": 0.8689171671867371, |
|
"learning_rate": 0.0002, |
|
"loss": 1.7, |
|
"step": 5400 |
|
}, |
|
{ |
|
"epoch": 2.2, |
|
"grad_norm": 0.8071489334106445, |
|
"learning_rate": 0.0002, |
|
"loss": 1.6588, |
|
"step": 5500 |
|
}, |
|
{ |
|
"epoch": 2.24, |
|
"grad_norm": 0.7306439280509949, |
|
"learning_rate": 0.0002, |
|
"loss": 1.7062, |
|
"step": 5600 |
|
}, |
|
{ |
|
"epoch": 2.2800000000000002, |
|
"grad_norm": 0.483328253030777, |
|
"learning_rate": 0.0002, |
|
"loss": 1.705, |
|
"step": 5700 |
|
}, |
|
{ |
|
"epoch": 2.32, |
|
"grad_norm": 0.9662113189697266, |
|
"learning_rate": 0.0002, |
|
"loss": 1.6375, |
|
"step": 5800 |
|
}, |
|
{ |
|
"epoch": 2.36, |
|
"grad_norm": 0.6570947766304016, |
|
"learning_rate": 0.0002, |
|
"loss": 1.6495, |
|
"step": 5900 |
|
}, |
|
{ |
|
"epoch": 2.4, |
|
"grad_norm": 0.6682040691375732, |
|
"learning_rate": 0.0002, |
|
"loss": 1.6971, |
|
"step": 6000 |
|
}, |
|
{ |
|
"epoch": 2.44, |
|
"grad_norm": 0.6937342286109924, |
|
"learning_rate": 0.0002, |
|
"loss": 1.6179, |
|
"step": 6100 |
|
}, |
|
{ |
|
"epoch": 2.48, |
|
"grad_norm": 0.6113543510437012, |
|
"learning_rate": 0.0002, |
|
"loss": 1.6769, |
|
"step": 6200 |
|
}, |
|
{ |
|
"epoch": 2.52, |
|
"grad_norm": 0.6734089851379395, |
|
"learning_rate": 0.0002, |
|
"loss": 1.6866, |
|
"step": 6300 |
|
}, |
|
{ |
|
"epoch": 2.56, |
|
"grad_norm": 0.6203577518463135, |
|
"learning_rate": 0.0002, |
|
"loss": 1.6999, |
|
"step": 6400 |
|
}, |
|
{ |
|
"epoch": 2.6, |
|
"grad_norm": 0.8067578077316284, |
|
"learning_rate": 0.0002, |
|
"loss": 1.6294, |
|
"step": 6500 |
|
}, |
|
{ |
|
"epoch": 2.64, |
|
"grad_norm": 0.7386764883995056, |
|
"learning_rate": 0.0002, |
|
"loss": 1.6841, |
|
"step": 6600 |
|
}, |
|
{ |
|
"epoch": 2.68, |
|
"grad_norm": 0.9529528617858887, |
|
"learning_rate": 0.0002, |
|
"loss": 1.665, |
|
"step": 6700 |
|
}, |
|
{ |
|
"epoch": 2.7199999999999998, |
|
"grad_norm": 0.5639382600784302, |
|
"learning_rate": 0.0002, |
|
"loss": 1.6756, |
|
"step": 6800 |
|
}, |
|
{ |
|
"epoch": 2.76, |
|
"grad_norm": 0.8711239695549011, |
|
"learning_rate": 0.0002, |
|
"loss": 1.5953, |
|
"step": 6900 |
|
}, |
|
{ |
|
"epoch": 2.8, |
|
"grad_norm": 0.6677307486534119, |
|
"learning_rate": 0.0002, |
|
"loss": 1.6708, |
|
"step": 7000 |
|
}, |
|
{ |
|
"epoch": 2.84, |
|
"grad_norm": 0.6820212006568909, |
|
"learning_rate": 0.0002, |
|
"loss": 1.6722, |
|
"step": 7100 |
|
}, |
|
{ |
|
"epoch": 2.88, |
|
"grad_norm": 0.6516992449760437, |
|
"learning_rate": 0.0002, |
|
"loss": 1.6407, |
|
"step": 7200 |
|
}, |
|
{ |
|
"epoch": 2.92, |
|
"grad_norm": 0.6071237325668335, |
|
"learning_rate": 0.0002, |
|
"loss": 1.6273, |
|
"step": 7300 |
|
}, |
|
{ |
|
"epoch": 2.96, |
|
"grad_norm": 0.6759991645812988, |
|
"learning_rate": 0.0002, |
|
"loss": 1.6218, |
|
"step": 7400 |
|
}, |
|
{ |
|
"epoch": 3.0, |
|
"grad_norm": 0.6260673403739929, |
|
"learning_rate": 0.0002, |
|
"loss": 1.7184, |
|
"step": 7500 |
|
}, |
|
{ |
|
"epoch": 3.04, |
|
"grad_norm": 0.6020120978355408, |
|
"learning_rate": 0.0002, |
|
"loss": 1.4686, |
|
"step": 7600 |
|
}, |
|
{ |
|
"epoch": 3.08, |
|
"grad_norm": 0.9953874945640564, |
|
"learning_rate": 0.0002, |
|
"loss": 1.5124, |
|
"step": 7700 |
|
}, |
|
{ |
|
"epoch": 3.12, |
|
"grad_norm": 0.7787545323371887, |
|
"learning_rate": 0.0002, |
|
"loss": 1.5184, |
|
"step": 7800 |
|
}, |
|
{ |
|
"epoch": 3.16, |
|
"grad_norm": 0.9334218502044678, |
|
"learning_rate": 0.0002, |
|
"loss": 1.5031, |
|
"step": 7900 |
|
}, |
|
{ |
|
"epoch": 3.2, |
|
"grad_norm": 0.9822279214859009, |
|
"learning_rate": 0.0002, |
|
"loss": 1.504, |
|
"step": 8000 |
|
}, |
|
{ |
|
"epoch": 3.24, |
|
"grad_norm": 0.8602248430252075, |
|
"learning_rate": 0.0002, |
|
"loss": 1.5348, |
|
"step": 8100 |
|
}, |
|
{ |
|
"epoch": 3.2800000000000002, |
|
"grad_norm": 0.88422691822052, |
|
"learning_rate": 0.0002, |
|
"loss": 1.4788, |
|
"step": 8200 |
|
}, |
|
{ |
|
"epoch": 3.32, |
|
"grad_norm": 0.8250532746315002, |
|
"learning_rate": 0.0002, |
|
"loss": 1.5258, |
|
"step": 8300 |
|
}, |
|
{ |
|
"epoch": 3.36, |
|
"grad_norm": 1.3689357042312622, |
|
"learning_rate": 0.0002, |
|
"loss": 1.53, |
|
"step": 8400 |
|
}, |
|
{ |
|
"epoch": 3.4, |
|
"grad_norm": 1.0472410917282104, |
|
"learning_rate": 0.0002, |
|
"loss": 1.5347, |
|
"step": 8500 |
|
}, |
|
{ |
|
"epoch": 3.44, |
|
"grad_norm": 0.7986043095588684, |
|
"learning_rate": 0.0002, |
|
"loss": 1.4467, |
|
"step": 8600 |
|
}, |
|
{ |
|
"epoch": 3.48, |
|
"grad_norm": 0.8166589736938477, |
|
"learning_rate": 0.0002, |
|
"loss": 1.6207, |
|
"step": 8700 |
|
}, |
|
{ |
|
"epoch": 3.52, |
|
"grad_norm": 0.7566811442375183, |
|
"learning_rate": 0.0002, |
|
"loss": 1.5238, |
|
"step": 8800 |
|
}, |
|
{ |
|
"epoch": 3.56, |
|
"grad_norm": 0.9338216185569763, |
|
"learning_rate": 0.0002, |
|
"loss": 1.5801, |
|
"step": 8900 |
|
}, |
|
{ |
|
"epoch": 3.6, |
|
"grad_norm": 0.9602301716804504, |
|
"learning_rate": 0.0002, |
|
"loss": 1.4506, |
|
"step": 9000 |
|
}, |
|
{ |
|
"epoch": 3.64, |
|
"grad_norm": 0.9274515509605408, |
|
"learning_rate": 0.0002, |
|
"loss": 1.5574, |
|
"step": 9100 |
|
}, |
|
{ |
|
"epoch": 3.68, |
|
"grad_norm": 1.6966444253921509, |
|
"learning_rate": 0.0002, |
|
"loss": 1.4391, |
|
"step": 9200 |
|
}, |
|
{ |
|
"epoch": 3.7199999999999998, |
|
"grad_norm": 0.9805380702018738, |
|
"learning_rate": 0.0002, |
|
"loss": 1.5071, |
|
"step": 9300 |
|
}, |
|
{ |
|
"epoch": 3.76, |
|
"grad_norm": 1.1102324724197388, |
|
"learning_rate": 0.0002, |
|
"loss": 1.5041, |
|
"step": 9400 |
|
}, |
|
{ |
|
"epoch": 3.8, |
|
"grad_norm": 1.1052135229110718, |
|
"learning_rate": 0.0002, |
|
"loss": 1.504, |
|
"step": 9500 |
|
}, |
|
{ |
|
"epoch": 3.84, |
|
"grad_norm": 0.8187786936759949, |
|
"learning_rate": 0.0002, |
|
"loss": 1.5444, |
|
"step": 9600 |
|
}, |
|
{ |
|
"epoch": 3.88, |
|
"grad_norm": 1.5404032468795776, |
|
"learning_rate": 0.0002, |
|
"loss": 1.5627, |
|
"step": 9700 |
|
}, |
|
{ |
|
"epoch": 3.92, |
|
"grad_norm": 0.7880242466926575, |
|
"learning_rate": 0.0002, |
|
"loss": 1.508, |
|
"step": 9800 |
|
}, |
|
{ |
|
"epoch": 3.96, |
|
"grad_norm": 1.2433679103851318, |
|
"learning_rate": 0.0002, |
|
"loss": 1.5252, |
|
"step": 9900 |
|
}, |
|
{ |
|
"epoch": 4.0, |
|
      "grad_norm": null,
|
"learning_rate": 0.0002, |
|
"loss": 1.5194, |
|
"step": 10000 |
|
}, |
|
{ |
|
"epoch": 4.04, |
|
"grad_norm": 0.9061052799224854, |
|
"learning_rate": 0.0002, |
|
"loss": 1.3939, |
|
"step": 10100 |
|
}, |
|
{ |
|
"epoch": 4.08, |
|
"grad_norm": 0.9826890230178833, |
|
"learning_rate": 0.0002, |
|
"loss": 1.3602, |
|
"step": 10200 |
|
}, |
|
{ |
|
"epoch": 4.12, |
|
"grad_norm": 1.1164418458938599, |
|
"learning_rate": 0.0002, |
|
"loss": 1.4036, |
|
"step": 10300 |
|
}, |
|
{ |
|
"epoch": 4.16, |
|
"grad_norm": 1.2371020317077637, |
|
"learning_rate": 0.0002, |
|
"loss": 1.4325, |
|
"step": 10400 |
|
}, |
|
{ |
|
"epoch": 4.2, |
|
"grad_norm": 0.9281136393547058, |
|
"learning_rate": 0.0002, |
|
"loss": 1.3633, |
|
"step": 10500 |
|
}, |
|
{ |
|
"epoch": 4.24, |
|
"grad_norm": 1.5267653465270996, |
|
"learning_rate": 0.0002, |
|
"loss": 1.3531, |
|
"step": 10600 |
|
}, |
|
{ |
|
"epoch": 4.28, |
|
"grad_norm": 0.9372194409370422, |
|
"learning_rate": 0.0002, |
|
"loss": 1.387, |
|
"step": 10700 |
|
}, |
|
{ |
|
"epoch": 4.32, |
|
"grad_norm": 1.9180704355239868, |
|
"learning_rate": 0.0002, |
|
"loss": 1.4049, |
|
"step": 10800 |
|
}, |
|
{ |
|
"epoch": 4.36, |
|
"grad_norm": 0.9102849960327148, |
|
"learning_rate": 0.0002, |
|
"loss": 1.3481, |
|
"step": 10900 |
|
}, |
|
{ |
|
"epoch": 4.4, |
|
"grad_norm": 1.3661117553710938, |
|
"learning_rate": 0.0002, |
|
"loss": 1.3171, |
|
"step": 11000 |
|
}, |
|
{ |
|
"epoch": 4.44, |
|
"grad_norm": 0.6796606183052063, |
|
"learning_rate": 0.0002, |
|
"loss": 1.3776, |
|
"step": 11100 |
|
}, |
|
{ |
|
"epoch": 4.48, |
|
"grad_norm": 1.2240846157073975, |
|
"learning_rate": 0.0002, |
|
"loss": 1.3756, |
|
"step": 11200 |
|
}, |
|
{ |
|
"epoch": 4.52, |
|
"grad_norm": 0.7911117672920227, |
|
"learning_rate": 0.0002, |
|
"loss": 1.3484, |
|
"step": 11300 |
|
}, |
|
{ |
|
"epoch": 4.5600000000000005, |
|
"grad_norm": 0.6849353313446045, |
|
"learning_rate": 0.0002, |
|
"loss": 1.3985, |
|
"step": 11400 |
|
}, |
|
{ |
|
"epoch": 4.6, |
|
"grad_norm": 1.292270541191101, |
|
"learning_rate": 0.0002, |
|
"loss": 1.3338, |
|
"step": 11500 |
|
}, |
|
{ |
|
"epoch": 4.64, |
|
"grad_norm": 1.0751451253890991, |
|
"learning_rate": 0.0002, |
|
"loss": 1.4044, |
|
"step": 11600 |
|
}, |
|
{ |
|
"epoch": 4.68, |
|
"grad_norm": 1.391108512878418, |
|
"learning_rate": 0.0002, |
|
"loss": 1.4003, |
|
"step": 11700 |
|
}, |
|
{ |
|
"epoch": 4.72, |
|
"grad_norm": 0.8339885473251343, |
|
"learning_rate": 0.0002, |
|
"loss": 1.389, |
|
"step": 11800 |
|
}, |
|
{ |
|
"epoch": 4.76, |
|
"grad_norm": 0.9836968779563904, |
|
"learning_rate": 0.0002, |
|
"loss": 1.3732, |
|
"step": 11900 |
|
}, |
|
{ |
|
"epoch": 4.8, |
|
"grad_norm": 1.3942408561706543, |
|
"learning_rate": 0.0002, |
|
"loss": 1.3763, |
|
"step": 12000 |
|
}, |
|
{ |
|
"epoch": 4.84, |
|
"grad_norm": 0.8473936915397644, |
|
"learning_rate": 0.0002, |
|
"loss": 1.3675, |
|
"step": 12100 |
|
}, |
|
{ |
|
"epoch": 4.88, |
|
"grad_norm": 1.1180263757705688, |
|
"learning_rate": 0.0002, |
|
"loss": 1.3018, |
|
"step": 12200 |
|
}, |
|
{ |
|
"epoch": 4.92, |
|
"grad_norm": 1.0825896263122559, |
|
"learning_rate": 0.0002, |
|
"loss": 1.3935, |
|
"step": 12300 |
|
}, |
|
{ |
|
"epoch": 4.96, |
|
"grad_norm": 1.7957453727722168, |
|
"learning_rate": 0.0002, |
|
"loss": 1.4167, |
|
"step": 12400 |
|
}, |
|
{ |
|
"epoch": 5.0, |
|
"grad_norm": 1.4984807968139648, |
|
"learning_rate": 0.0002, |
|
"loss": 1.3562, |
|
"step": 12500 |
|
}, |
|
{ |
|
"epoch": 5.04, |
|
"grad_norm": 1.2871723175048828, |
|
"learning_rate": 0.0002, |
|
"loss": 1.2327, |
|
"step": 12600 |
|
}, |
|
{ |
|
"epoch": 5.08, |
|
"grad_norm": 0.5957996249198914, |
|
"learning_rate": 0.0002, |
|
"loss": 1.1613, |
|
"step": 12700 |
|
}, |
|
{ |
|
"epoch": 5.12, |
|
"grad_norm": 1.0731712579727173, |
|
"learning_rate": 0.0002, |
|
"loss": 1.2683, |
|
"step": 12800 |
|
}, |
|
{ |
|
"epoch": 5.16, |
|
"grad_norm": 1.2785401344299316, |
|
"learning_rate": 0.0002, |
|
"loss": 1.2215, |
|
"step": 12900 |
|
}, |
|
{ |
|
"epoch": 5.2, |
|
"grad_norm": 0.6945757269859314, |
|
"learning_rate": 0.0002, |
|
"loss": 1.2665, |
|
"step": 13000 |
|
}, |
|
{ |
|
"epoch": 5.24, |
|
"grad_norm": 0.8779969215393066, |
|
"learning_rate": 0.0002, |
|
"loss": 1.2028, |
|
"step": 13100 |
|
}, |
|
{ |
|
"epoch": 5.28, |
|
"grad_norm": 0.9905422329902649, |
|
"learning_rate": 0.0002, |
|
"loss": 1.2691, |
|
"step": 13200 |
|
}, |
|
{ |
|
"epoch": 5.32, |
|
"grad_norm": 0.7267619967460632, |
|
"learning_rate": 0.0002, |
|
"loss": 1.2794, |
|
"step": 13300 |
|
}, |
|
{ |
|
"epoch": 5.36, |
|
"grad_norm": 1.1280577182769775, |
|
"learning_rate": 0.0002, |
|
"loss": 1.2608, |
|
"step": 13400 |
|
}, |
|
{ |
|
"epoch": 5.4, |
|
"grad_norm": 1.3053045272827148, |
|
"learning_rate": 0.0002, |
|
"loss": 1.253, |
|
"step": 13500 |
|
}, |
|
{ |
|
"epoch": 5.44, |
|
"grad_norm": 1.1373580694198608, |
|
"learning_rate": 0.0002, |
|
"loss": 1.2236, |
|
"step": 13600 |
|
}, |
|
{ |
|
"epoch": 5.48, |
|
"grad_norm": 0.9823132753372192, |
|
"learning_rate": 0.0002, |
|
"loss": 1.2841, |
|
"step": 13700 |
|
}, |
|
{ |
|
"epoch": 5.52, |
|
"grad_norm": 1.0225436687469482, |
|
"learning_rate": 0.0002, |
|
"loss": 1.2052, |
|
"step": 13800 |
|
}, |
|
{ |
|
"epoch": 5.5600000000000005, |
|
"grad_norm": 1.6617635488510132, |
|
"learning_rate": 0.0002, |
|
"loss": 1.2314, |
|
"step": 13900 |
|
}, |
|
{ |
|
"epoch": 5.6, |
|
"grad_norm": 1.0195096731185913, |
|
"learning_rate": 0.0002, |
|
"loss": 1.2959, |
|
"step": 14000 |
|
}, |
|
{ |
|
"epoch": 5.64, |
|
"grad_norm": 1.3563017845153809, |
|
"learning_rate": 0.0002, |
|
"loss": 1.2683, |
|
"step": 14100 |
|
}, |
|
{ |
|
"epoch": 5.68, |
|
"grad_norm": 2.1966164112091064, |
|
"learning_rate": 0.0002, |
|
"loss": 1.2247, |
|
"step": 14200 |
|
}, |
|
{ |
|
"epoch": 5.72, |
|
"grad_norm": 1.057099461555481, |
|
"learning_rate": 0.0002, |
|
"loss": 1.2577, |
|
"step": 14300 |
|
}, |
|
{ |
|
"epoch": 5.76, |
|
"grad_norm": 1.2523263692855835, |
|
"learning_rate": 0.0002, |
|
"loss": 1.2865, |
|
"step": 14400 |
|
}, |
|
{ |
|
"epoch": 5.8, |
|
"grad_norm": 1.7607208490371704, |
|
"learning_rate": 0.0002, |
|
"loss": 1.2205, |
|
"step": 14500 |
|
}, |
|
{ |
|
"epoch": 5.84, |
|
"grad_norm": 1.9455257654190063, |
|
"learning_rate": 0.0002, |
|
"loss": 1.2829, |
|
"step": 14600 |
|
}, |
|
{ |
|
"epoch": 5.88, |
|
"grad_norm": 1.0618771314620972, |
|
"learning_rate": 0.0002, |
|
"loss": 1.3037, |
|
"step": 14700 |
|
}, |
|
{ |
|
"epoch": 5.92, |
|
"grad_norm": 1.0242942571640015, |
|
"learning_rate": 0.0002, |
|
"loss": 1.2165, |
|
"step": 14800 |
|
}, |
|
{ |
|
"epoch": 5.96, |
|
"grad_norm": 1.2692897319793701, |
|
"learning_rate": 0.0002, |
|
"loss": 1.2873, |
|
"step": 14900 |
|
}, |
|
{ |
|
"epoch": 6.0, |
|
"grad_norm": 1.0533056259155273, |
|
"learning_rate": 0.0002, |
|
"loss": 1.2528, |
|
"step": 15000 |
|
}, |
|
{ |
|
"epoch": 6.04, |
|
"grad_norm": 0.8311458230018616, |
|
"learning_rate": 0.0002, |
|
"loss": 1.0705, |
|
"step": 15100 |
|
}, |
|
{ |
|
"epoch": 6.08, |
|
"grad_norm": 1.0614266395568848, |
|
"learning_rate": 0.0002, |
|
"loss": 1.0803, |
|
"step": 15200 |
|
}, |
|
{ |
|
"epoch": 6.12, |
|
"grad_norm": 1.4157719612121582, |
|
"learning_rate": 0.0002, |
|
"loss": 1.0864, |
|
"step": 15300 |
|
}, |
|
{ |
|
"epoch": 6.16, |
|
"grad_norm": 1.0785095691680908, |
|
"learning_rate": 0.0002, |
|
"loss": 1.1076, |
|
"step": 15400 |
|
}, |
|
{ |
|
"epoch": 6.2, |
|
"grad_norm": 1.3798463344573975, |
|
"learning_rate": 0.0002, |
|
"loss": 1.1038, |
|
"step": 15500 |
|
}, |
|
{ |
|
"epoch": 6.24, |
|
"grad_norm": 0.9821926355361938, |
|
"learning_rate": 0.0002, |
|
"loss": 1.1482, |
|
"step": 15600 |
|
}, |
|
{ |
|
"epoch": 6.28, |
|
"grad_norm": 2.189770221710205, |
|
"learning_rate": 0.0002, |
|
"loss": 1.1154, |
|
"step": 15700 |
|
}, |
|
{ |
|
"epoch": 6.32, |
|
"grad_norm": 5.5393500328063965, |
|
"learning_rate": 0.0002, |
|
"loss": 1.1186, |
|
"step": 15800 |
|
}, |
|
{ |
|
"epoch": 6.36, |
|
"grad_norm": 1.2127723693847656, |
|
"learning_rate": 0.0002, |
|
"loss": 1.1888, |
|
"step": 15900 |
|
}, |
|
{ |
|
"epoch": 6.4, |
|
"grad_norm": 2.876635789871216, |
|
"learning_rate": 0.0002, |
|
"loss": 1.1029, |
|
"step": 16000 |
|
}, |
|
{ |
|
"epoch": 6.44, |
|
"grad_norm": 1.3155653476715088, |
|
"learning_rate": 0.0002, |
|
"loss": 1.1092, |
|
"step": 16100 |
|
}, |
|
{ |
|
"epoch": 6.48, |
|
"grad_norm": 1.1871975660324097, |
|
"learning_rate": 0.0002, |
|
"loss": 1.1488, |
|
"step": 16200 |
|
}, |
|
{ |
|
"epoch": 6.52, |
|
"grad_norm": 1.6706851720809937, |
|
"learning_rate": 0.0002, |
|
"loss": 1.153, |
|
"step": 16300 |
|
}, |
|
{ |
|
"epoch": 6.5600000000000005, |
|
"grad_norm": 2.3989503383636475, |
|
"learning_rate": 0.0002, |
|
"loss": 1.1049, |
|
"step": 16400 |
|
}, |
|
{ |
|
"epoch": 6.6, |
|
"grad_norm": 1.0962737798690796, |
|
"learning_rate": 0.0002, |
|
"loss": 1.182, |
|
"step": 16500 |
|
}, |
|
{ |
|
"epoch": 6.64, |
|
"grad_norm": 1.2321207523345947, |
|
"learning_rate": 0.0002, |
|
"loss": 1.1196, |
|
"step": 16600 |
|
}, |
|
{ |
|
"epoch": 6.68, |
|
"grad_norm": 1.6745890378952026, |
|
"learning_rate": 0.0002, |
|
"loss": 1.1776, |
|
"step": 16700 |
|
}, |
|
{ |
|
"epoch": 6.72, |
|
"grad_norm": 3.3587148189544678, |
|
"learning_rate": 0.0002, |
|
"loss": 1.1335, |
|
"step": 16800 |
|
}, |
|
{ |
|
"epoch": 6.76, |
|
"grad_norm": 1.2802035808563232, |
|
"learning_rate": 0.0002, |
|
"loss": 1.2144, |
|
"step": 16900 |
|
}, |
|
{ |
|
"epoch": 6.8, |
|
"grad_norm": 1.5999842882156372, |
|
"learning_rate": 0.0002, |
|
"loss": 1.1143, |
|
"step": 17000 |
|
}, |
|
{ |
|
"epoch": 6.84, |
|
"grad_norm": 1.5529465675354004, |
|
"learning_rate": 0.0002, |
|
"loss": 1.1503, |
|
"step": 17100 |
|
}, |
|
{ |
|
"epoch": 6.88, |
|
"grad_norm": 0.9041572213172913, |
|
"learning_rate": 0.0002, |
|
"loss": 1.1681, |
|
"step": 17200 |
|
}, |
|
{ |
|
"epoch": 6.92, |
|
"grad_norm": 1.2176377773284912, |
|
"learning_rate": 0.0002, |
|
"loss": 1.1284, |
|
"step": 17300 |
|
}, |
|
{ |
|
"epoch": 6.96, |
|
"grad_norm": 1.1047685146331787, |
|
"learning_rate": 0.0002, |
|
"loss": 1.1622, |
|
"step": 17400 |
|
}, |
|
{ |
|
"epoch": 7.0, |
|
"grad_norm": 2.3316566944122314, |
|
"learning_rate": 0.0002, |
|
"loss": 1.149, |
|
"step": 17500 |
|
}, |
|
{ |
|
"epoch": 7.04, |
|
"grad_norm": 0.9057099223136902, |
|
"learning_rate": 0.0002, |
|
"loss": 0.9952, |
|
"step": 17600 |
|
}, |
|
{ |
|
"epoch": 7.08, |
|
"grad_norm": 2.2556183338165283, |
|
"learning_rate": 0.0002, |
|
"loss": 0.9667, |
|
"step": 17700 |
|
}, |
|
{ |
|
"epoch": 7.12, |
|
"grad_norm": 1.2181649208068848, |
|
"learning_rate": 0.0002, |
|
"loss": 0.9606, |
|
"step": 17800 |
|
}, |
|
{ |
|
"epoch": 7.16, |
|
"grad_norm": 1.1756635904312134, |
|
"learning_rate": 0.0002, |
|
"loss": 1.0176, |
|
"step": 17900 |
|
}, |
|
{ |
|
"epoch": 7.2, |
|
"grad_norm": 1.3955613374710083, |
|
"learning_rate": 0.0002, |
|
"loss": 1.0423, |
|
"step": 18000 |
|
}, |
|
{ |
|
"epoch": 7.24, |
|
"grad_norm": 2.0451719760894775, |
|
"learning_rate": 0.0002, |
|
"loss": 1.0152, |
|
"step": 18100 |
|
}, |
|
{ |
|
"epoch": 7.28, |
|
"grad_norm": 1.8395252227783203, |
|
"learning_rate": 0.0002, |
|
"loss": 0.9896, |
|
"step": 18200 |
|
}, |
|
{ |
|
"epoch": 7.32, |
|
"grad_norm": 0.8587050437927246, |
|
"learning_rate": 0.0002, |
|
"loss": 0.9976, |
|
"step": 18300 |
|
}, |
|
{ |
|
"epoch": 7.36, |
|
"grad_norm": 1.848895788192749, |
|
"learning_rate": 0.0002, |
|
"loss": 0.9711, |
|
"step": 18400 |
|
}, |
|
{ |
|
"epoch": 7.4, |
|
"grad_norm": 0.8069963455200195, |
|
"learning_rate": 0.0002, |
|
"loss": 1.02, |
|
"step": 18500 |
|
}, |
|
{ |
|
"epoch": 7.44, |
|
"grad_norm": 1.0655219554901123, |
|
"learning_rate": 0.0002, |
|
"loss": 1.0069, |
|
"step": 18600 |
|
}, |
|
{ |
|
"epoch": 7.48, |
|
"grad_norm": 1.474328637123108, |
|
"learning_rate": 0.0002, |
|
"loss": 1.0311, |
|
"step": 18700 |
|
}, |
|
{ |
|
"epoch": 7.52, |
|
"grad_norm": 1.3253296613693237, |
|
"learning_rate": 0.0002, |
|
"loss": 1.0158, |
|
"step": 18800 |
|
}, |
|
{ |
|
"epoch": 7.5600000000000005, |
|
"grad_norm": 1.1345421075820923, |
|
"learning_rate": 0.0002, |
|
"loss": 1.0391, |
|
"step": 18900 |
|
}, |
|
{ |
|
"epoch": 7.6, |
|
"grad_norm": 1.0736902952194214, |
|
"learning_rate": 0.0002, |
|
"loss": 1.0498, |
|
"step": 19000 |
|
}, |
|
{ |
|
"epoch": 7.64, |
|
"grad_norm": 1.6537193059921265, |
|
"learning_rate": 0.0002, |
|
"loss": 1.0235, |
|
"step": 19100 |
|
}, |
|
{ |
|
"epoch": 7.68, |
|
"grad_norm": 1.3010786771774292, |
|
"learning_rate": 0.0002, |
|
"loss": 1.0307, |
|
"step": 19200 |
|
}, |
|
{ |
|
"epoch": 7.72, |
|
"grad_norm": 0.9209179282188416, |
|
"learning_rate": 0.0002, |
|
"loss": 1.0111, |
|
"step": 19300 |
|
}, |
|
{ |
|
"epoch": 7.76, |
|
"grad_norm": 5.0836310386657715, |
|
"learning_rate": 0.0002, |
|
"loss": 1.059, |
|
"step": 19400 |
|
}, |
|
{ |
|
"epoch": 7.8, |
|
"grad_norm": 1.7362360954284668, |
|
"learning_rate": 0.0002, |
|
"loss": 1.0704, |
|
"step": 19500 |
|
}, |
|
{ |
|
"epoch": 7.84, |
|
"grad_norm": 1.692413330078125, |
|
"learning_rate": 0.0002, |
|
"loss": 1.0618, |
|
"step": 19600 |
|
}, |
|
{ |
|
"epoch": 7.88, |
|
"grad_norm": 3.538470506668091, |
|
"learning_rate": 0.0002, |
|
"loss": 1.0778, |
|
"step": 19700 |
|
}, |
|
{ |
|
"epoch": 7.92, |
|
"grad_norm": 2.578237533569336, |
|
"learning_rate": 0.0002, |
|
"loss": 1.1153, |
|
"step": 19800 |
|
}, |
|
{ |
|
"epoch": 7.96, |
|
"grad_norm": 1.356609582901001, |
|
"learning_rate": 0.0002, |
|
"loss": 1.0597, |
|
"step": 19900 |
|
}, |
|
{ |
|
"epoch": 8.0, |
|
"grad_norm": 0.89506995677948, |
|
"learning_rate": 0.0002, |
|
"loss": 1.0892, |
|
"step": 20000 |
|
}, |
|
{ |
|
"epoch": 8.04, |
|
"grad_norm": 1.5506243705749512, |
|
"learning_rate": 0.0002, |
|
"loss": 0.8659, |
|
"step": 20100 |
|
}, |
|
{ |
|
"epoch": 8.08, |
|
"grad_norm": 1.69996178150177, |
|
"learning_rate": 0.0002, |
|
"loss": 0.9142, |
|
"step": 20200 |
|
}, |
|
{ |
|
"epoch": 8.12, |
|
"grad_norm": 2.0094783306121826, |
|
"learning_rate": 0.0002, |
|
"loss": 0.8807, |
|
"step": 20300 |
|
}, |
|
{ |
|
"epoch": 8.16, |
|
"grad_norm": 1.6503652334213257, |
|
"learning_rate": 0.0002, |
|
"loss": 0.857, |
|
"step": 20400 |
|
}, |
|
{ |
|
"epoch": 8.2, |
|
"grad_norm": 0.933527410030365, |
|
"learning_rate": 0.0002, |
|
"loss": 0.8855, |
|
"step": 20500 |
|
}, |
|
{ |
|
"epoch": 8.24, |
|
"grad_norm": 1.6827526092529297, |
|
"learning_rate": 0.0002, |
|
"loss": 0.9295, |
|
"step": 20600 |
|
}, |
|
{ |
|
"epoch": 8.28, |
|
"grad_norm": 2.89079213142395, |
|
"learning_rate": 0.0002, |
|
"loss": 0.9146, |
|
"step": 20700 |
|
}, |
|
{ |
|
"epoch": 8.32, |
|
"grad_norm": 1.0603892803192139, |
|
"learning_rate": 0.0002, |
|
"loss": 0.9279, |
|
"step": 20800 |
|
}, |
|
{ |
|
"epoch": 8.36, |
|
"grad_norm": 3.3014211654663086, |
|
"learning_rate": 0.0002, |
|
"loss": 0.9406, |
|
"step": 20900 |
|
}, |
|
{ |
|
"epoch": 8.4, |
|
"grad_norm": 0.8049854040145874, |
|
"learning_rate": 0.0002, |
|
"loss": 0.907, |
|
"step": 21000 |
|
}, |
|
{ |
|
"epoch": 8.44, |
|
"grad_norm": 2.991314649581909, |
|
"learning_rate": 0.0002, |
|
"loss": 0.9721, |
|
"step": 21100 |
|
}, |
|
{ |
|
"epoch": 8.48, |
|
"grad_norm": 1.043578028678894, |
|
"learning_rate": 0.0002, |
|
"loss": 0.9557, |
|
"step": 21200 |
|
}, |
|
{ |
|
"epoch": 8.52, |
|
"grad_norm": 1.6888822317123413, |
|
"learning_rate": 0.0002, |
|
"loss": 0.9477, |
|
"step": 21300 |
|
}, |
|
{ |
|
"epoch": 8.56, |
|
"grad_norm": 1.4089540243148804, |
|
"learning_rate": 0.0002, |
|
"loss": 0.9424, |
|
"step": 21400 |
|
}, |
|
{ |
|
"epoch": 8.6, |
|
"grad_norm": 7.123160362243652, |
|
"learning_rate": 0.0002, |
|
"loss": 0.9232, |
|
"step": 21500 |
|
}, |
|
{ |
|
"epoch": 8.64, |
|
"grad_norm": 1.149699091911316, |
|
"learning_rate": 0.0002, |
|
"loss": 0.9189, |
|
"step": 21600 |
|
}, |
|
{ |
|
"epoch": 8.68, |
|
"grad_norm": 1.3870540857315063, |
|
"learning_rate": 0.0002, |
|
"loss": 0.9318, |
|
"step": 21700 |
|
}, |
|
{ |
|
"epoch": 8.72, |
|
"grad_norm": 1.4124248027801514, |
|
"learning_rate": 0.0002, |
|
"loss": 0.9657, |
|
"step": 21800 |
|
}, |
|
{ |
|
"epoch": 8.76, |
|
"grad_norm": 0.6599737405776978, |
|
"learning_rate": 0.0002, |
|
"loss": 0.9493, |
|
"step": 21900 |
|
}, |
|
{ |
|
"epoch": 8.8, |
|
"grad_norm": 1.0088489055633545, |
|
"learning_rate": 0.0002, |
|
"loss": 0.9739, |
|
"step": 22000 |
|
}, |
|
{ |
|
"epoch": 8.84, |
|
"grad_norm": 0.7540724873542786, |
|
"learning_rate": 0.0002, |
|
"loss": 0.9492, |
|
"step": 22100 |
|
}, |
|
{ |
|
"epoch": 8.88, |
|
"grad_norm": 1.0838185548782349, |
|
"learning_rate": 0.0002, |
|
"loss": 1.0105, |
|
"step": 22200 |
|
}, |
|
{ |
|
"epoch": 8.92, |
|
"grad_norm": 0.8989962935447693, |
|
"learning_rate": 0.0002, |
|
"loss": 0.9256, |
|
"step": 22300 |
|
}, |
|
{ |
|
"epoch": 8.96, |
|
"grad_norm": 0.9646226167678833, |
|
"learning_rate": 0.0002, |
|
"loss": 0.9766, |
|
"step": 22400 |
|
}, |
|
{ |
|
"epoch": 9.0, |
|
"grad_norm": 1.0363638401031494, |
|
"learning_rate": 0.0002, |
|
"loss": 0.9676, |
|
"step": 22500 |
|
}, |
|
{ |
|
"epoch": 9.04, |
|
"grad_norm": 1.2631175518035889, |
|
"learning_rate": 0.0002, |
|
"loss": 0.7845, |
|
"step": 22600 |
|
}, |
|
{ |
|
"epoch": 9.08, |
|
"grad_norm": 1.2183212041854858, |
|
"learning_rate": 0.0002, |
|
"loss": 0.7785, |
|
"step": 22700 |
|
}, |
|
{ |
|
"epoch": 9.12, |
|
"grad_norm": 1.4566229581832886, |
|
"learning_rate": 0.0002, |
|
"loss": 0.7972, |
|
"step": 22800 |
|
}, |
|
{ |
|
"epoch": 9.16, |
|
"grad_norm": 1.2739365100860596, |
|
"learning_rate": 0.0002, |
|
"loss": 0.7729, |
|
"step": 22900 |
|
}, |
|
{ |
|
"epoch": 9.2, |
|
"grad_norm": 1.4455575942993164, |
|
"learning_rate": 0.0002, |
|
"loss": 0.8359, |
|
"step": 23000 |
|
}, |
|
{ |
|
"epoch": 9.24, |
|
"grad_norm": 1.1576048135757446, |
|
"learning_rate": 0.0002, |
|
"loss": 0.8324, |
|
"step": 23100 |
|
}, |
|
{ |
|
"epoch": 9.28, |
|
"grad_norm": 1.3177443742752075, |
|
"learning_rate": 0.0002, |
|
"loss": 0.8223, |
|
"step": 23200 |
|
}, |
|
{ |
|
"epoch": 9.32, |
|
"grad_norm": 4.67700719833374, |
|
"learning_rate": 0.0002, |
|
"loss": 0.8154, |
|
"step": 23300 |
|
}, |
|
{ |
|
"epoch": 9.36, |
|
"grad_norm": 1.2279400825500488, |
|
"learning_rate": 0.0002, |
|
"loss": 0.8047, |
|
"step": 23400 |
|
}, |
|
{ |
|
"epoch": 9.4, |
|
"grad_norm": 0.6378070712089539, |
|
"learning_rate": 0.0002, |
|
"loss": 0.8539, |
|
"step": 23500 |
|
}, |
|
{ |
|
"epoch": 9.44, |
|
"grad_norm": 1.7342982292175293, |
|
"learning_rate": 0.0002, |
|
"loss": 0.8405, |
|
"step": 23600 |
|
}, |
|
{ |
|
"epoch": 9.48, |
|
"grad_norm": 1.534493088722229, |
|
"learning_rate": 0.0002, |
|
"loss": 0.8493, |
|
"step": 23700 |
|
}, |
|
{ |
|
"epoch": 9.52, |
|
"grad_norm": 1.0669933557510376, |
|
"learning_rate": 0.0002, |
|
"loss": 0.8931, |
|
"step": 23800 |
|
}, |
|
{ |
|
"epoch": 9.56, |
|
"grad_norm": 1.2402708530426025, |
|
"learning_rate": 0.0002, |
|
"loss": 0.8689, |
|
"step": 23900 |
|
}, |
|
{ |
|
"epoch": 9.6, |
|
"grad_norm": 1.2599835395812988, |
|
"learning_rate": 0.0002, |
|
"loss": 0.8733, |
|
"step": 24000 |
|
}, |
|
{ |
|
"epoch": 9.64, |
|
"grad_norm": 2.5779175758361816, |
|
"learning_rate": 0.0002, |
|
"loss": 0.8934, |
|
"step": 24100 |
|
}, |
|
{ |
|
"epoch": 9.68, |
|
"grad_norm": 1.8797070980072021, |
|
"learning_rate": 0.0002, |
|
"loss": 0.8536, |
|
"step": 24200 |
|
}, |
|
{ |
|
"epoch": 9.72, |
|
"grad_norm": 1.3404995203018188, |
|
"learning_rate": 0.0002, |
|
"loss": 0.9191, |
|
"step": 24300 |
|
}, |
|
{ |
|
"epoch": 9.76, |
|
"grad_norm": 1.4789234399795532, |
|
"learning_rate": 0.0002, |
|
"loss": 0.8198, |
|
"step": 24400 |
|
}, |
|
{ |
|
"epoch": 9.8, |
|
"grad_norm": 0.6044953465461731, |
|
"learning_rate": 0.0002, |
|
"loss": 0.8304, |
|
"step": 24500 |
|
}, |
|
{ |
|
"epoch": 9.84, |
|
"grad_norm": 1.2792354822158813, |
|
"learning_rate": 0.0002, |
|
"loss": 0.9196, |
|
"step": 24600 |
|
}, |
|
{ |
|
"epoch": 9.88, |
|
"grad_norm": 2.047617197036743, |
|
"learning_rate": 0.0002, |
|
"loss": 0.9136, |
|
"step": 24700 |
|
}, |
|
{ |
|
"epoch": 9.92, |
|
"grad_norm": 5.776161193847656, |
|
"learning_rate": 0.0002, |
|
"loss": 0.8715, |
|
"step": 24800 |
|
}, |
|
{ |
|
"epoch": 9.96, |
|
"grad_norm": 1.9059360027313232, |
|
"learning_rate": 0.0002, |
|
"loss": 0.8712, |
|
"step": 24900 |
|
}, |
|
{ |
|
"epoch": 10.0, |
|
"grad_norm": 1.1497496366500854, |
|
"learning_rate": 0.0002, |
|
"loss": 0.8607, |
|
"step": 25000 |
|
}, |
|
{ |
|
"epoch": 10.04, |
|
"grad_norm": 0.6882659792900085, |
|
"learning_rate": 0.0002, |
|
"loss": 0.7556, |
|
"step": 25100 |
|
}, |
|
{ |
|
"epoch": 10.08, |
|
"grad_norm": 0.9709841012954712, |
|
"learning_rate": 0.0002, |
|
"loss": 0.787, |
|
"step": 25200 |
|
}, |
|
{ |
|
"epoch": 10.12, |
|
"grad_norm": 0.9609636664390564, |
|
"learning_rate": 0.0002, |
|
"loss": 0.701, |
|
"step": 25300 |
|
}, |
|
{ |
|
"epoch": 10.16, |
|
"grad_norm": 1.397544026374817, |
|
"learning_rate": 0.0002, |
|
"loss": 0.7548, |
|
"step": 25400 |
|
}, |
|
{ |
|
"epoch": 10.2, |
|
"grad_norm": 2.043348550796509, |
|
"learning_rate": 0.0002, |
|
"loss": 0.7357, |
|
"step": 25500 |
|
}, |
|
{ |
|
"epoch": 10.24, |
|
"grad_norm": 0.7744215726852417, |
|
"learning_rate": 0.0002, |
|
"loss": 0.7481, |
|
"step": 25600 |
|
}, |
|
{ |
|
"epoch": 10.28, |
|
"grad_norm": 0.7471018433570862, |
|
"learning_rate": 0.0002, |
|
"loss": 0.7206, |
|
"step": 25700 |
|
}, |
|
{ |
|
"epoch": 10.32, |
|
"grad_norm": 1.5226207971572876, |
|
"learning_rate": 0.0002, |
|
"loss": 0.7199, |
|
"step": 25800 |
|
}, |
|
{ |
|
"epoch": 10.36, |
|
"grad_norm": 0.9229708313941956, |
|
"learning_rate": 0.0002, |
|
"loss": 0.7449, |
|
"step": 25900 |
|
}, |
|
{ |
|
"epoch": 10.4, |
|
"grad_norm": 0.8783457279205322, |
|
"learning_rate": 0.0002, |
|
"loss": 0.8026, |
|
"step": 26000 |
|
}, |
|
{ |
|
"epoch": 10.44, |
|
"grad_norm": 1.858168125152588, |
|
"learning_rate": 0.0002, |
|
"loss": 0.758, |
|
"step": 26100 |
|
}, |
|
{ |
|
"epoch": 10.48, |
|
"grad_norm": 9.689515113830566, |
|
"learning_rate": 0.0002, |
|
"loss": 0.7394, |
|
"step": 26200 |
|
}, |
|
{ |
|
"epoch": 10.52, |
|
"grad_norm": 1.7214679718017578, |
|
"learning_rate": 0.0002, |
|
"loss": 0.7704, |
|
"step": 26300 |
|
}, |
|
{ |
|
"epoch": 10.56, |
|
"grad_norm": 4.615940570831299, |
|
"learning_rate": 0.0002, |
|
"loss": 0.7957, |
|
"step": 26400 |
|
}, |
|
{ |
|
"epoch": 10.6, |
|
"grad_norm": 1.4114075899124146, |
|
"learning_rate": 0.0002, |
|
"loss": 0.7613, |
|
"step": 26500 |
|
}, |
|
{ |
|
"epoch": 10.64, |
|
"grad_norm": 1.1253297328948975, |
|
"learning_rate": 0.0002, |
|
"loss": 0.8119, |
|
"step": 26600 |
|
}, |
|
{ |
|
"epoch": 10.68, |
|
"grad_norm": 1.9775581359863281, |
|
"learning_rate": 0.0002, |
|
"loss": 0.8083, |
|
"step": 26700 |
|
}, |
|
{ |
|
"epoch": 10.72, |
|
"grad_norm": 0.6344081163406372, |
|
"learning_rate": 0.0002, |
|
"loss": 0.8078, |
|
"step": 26800 |
|
}, |
|
{ |
|
"epoch": 10.76, |
|
"grad_norm": 1.287255048751831, |
|
"learning_rate": 0.0002, |
|
"loss": 0.7732, |
|
"step": 26900 |
|
}, |
|
{ |
|
"epoch": 10.8, |
|
"grad_norm": 0.826394259929657, |
|
"learning_rate": 0.0002, |
|
"loss": 0.7874, |
|
"step": 27000 |
|
}, |
|
{ |
|
"epoch": 10.84, |
|
"grad_norm": 1.9858311414718628, |
|
"learning_rate": 0.0002, |
|
"loss": 0.7672, |
|
"step": 27100 |
|
}, |
|
{ |
|
"epoch": 10.88, |
|
"grad_norm": 2.411587715148926, |
|
"learning_rate": 0.0002, |
|
"loss": 0.7926, |
|
"step": 27200 |
|
}, |
|
{ |
|
"epoch": 10.92, |
|
"grad_norm": 1.1117106676101685, |
|
"learning_rate": 0.0002, |
|
"loss": 0.8031, |
|
"step": 27300 |
|
}, |
|
{ |
|
"epoch": 10.96, |
|
"grad_norm": 0.9955012798309326, |
|
"learning_rate": 0.0002, |
|
"loss": 0.8018, |
|
"step": 27400 |
|
}, |
|
{ |
|
"epoch": 11.0, |
|
"grad_norm": 6.443946838378906, |
|
"learning_rate": 0.0002, |
|
"loss": 0.7974, |
|
"step": 27500 |
|
}, |
|
{ |
|
"epoch": 11.04, |
|
"grad_norm": 0.9066771864891052, |
|
"learning_rate": 0.0002, |
|
"loss": 0.6354, |
|
"step": 27600 |
|
}, |
|
{ |
|
"epoch": 11.08, |
|
"grad_norm": 1.2832303047180176, |
|
"learning_rate": 0.0002, |
|
"loss": 0.6669, |
|
"step": 27700 |
|
}, |
|
{ |
|
"epoch": 11.12, |
|
"grad_norm": 1.3047817945480347, |
|
"learning_rate": 0.0002, |
|
"loss": 0.6408, |
|
"step": 27800 |
|
}, |
|
{ |
|
"epoch": 11.16, |
|
"grad_norm": 1.2970690727233887, |
|
"learning_rate": 0.0002, |
|
"loss": 0.6777, |
|
"step": 27900 |
|
}, |
|
{ |
|
"epoch": 11.2, |
|
"grad_norm": 1.2775633335113525, |
|
"learning_rate": 0.0002, |
|
"loss": 0.6366, |
|
"step": 28000 |
|
}, |
|
{ |
|
"epoch": 11.24, |
|
"grad_norm": 1.1797577142715454, |
|
"learning_rate": 0.0002, |
|
"loss": 0.6618, |
|
"step": 28100 |
|
}, |
|
{ |
|
"epoch": 11.28, |
|
"grad_norm": 0.4641984701156616, |
|
"learning_rate": 0.0002, |
|
"loss": 0.714, |
|
"step": 28200 |
|
}, |
|
{ |
|
"epoch": 11.32, |
|
"grad_norm": 5.440238952636719, |
|
"learning_rate": 0.0002, |
|
"loss": 0.707, |
|
"step": 28300 |
|
}, |
|
{ |
|
"epoch": 11.36, |
|
"grad_norm": 2.7342915534973145, |
|
"learning_rate": 0.0002, |
|
"loss": 0.7155, |
|
"step": 28400 |
|
}, |
|
{ |
|
"epoch": 11.4, |
|
"grad_norm": 1.025922179222107, |
|
"learning_rate": 0.0002, |
|
"loss": 0.6761, |
|
"step": 28500 |
|
}, |
|
{ |
|
"epoch": 11.44, |
|
"grad_norm": 1.4162850379943848, |
|
"learning_rate": 0.0002, |
|
"loss": 0.7079, |
|
"step": 28600 |
|
}, |
|
{ |
|
"epoch": 11.48, |
|
"grad_norm": 1.673281192779541, |
|
"learning_rate": 0.0002, |
|
"loss": 0.6957, |
|
"step": 28700 |
|
}, |
|
{ |
|
"epoch": 11.52, |
|
"grad_norm": 1.6640900373458862, |
|
"learning_rate": 0.0002, |
|
"loss": 0.7616, |
|
"step": 28800 |
|
}, |
|
{ |
|
"epoch": 11.56, |
|
"grad_norm": 0.5658434629440308, |
|
"learning_rate": 0.0002, |
|
"loss": 0.6939, |
|
"step": 28900 |
|
}, |
|
{ |
|
"epoch": 11.6, |
|
"grad_norm": 0.6061828136444092, |
|
"learning_rate": 0.0002, |
|
"loss": 0.7162, |
|
"step": 29000 |
|
}, |
|
{ |
|
"epoch": 11.64, |
|
"grad_norm": 1.6504275798797607, |
|
"learning_rate": 0.0002, |
|
"loss": 0.7266, |
|
"step": 29100 |
|
}, |
|
{ |
|
"epoch": 11.68, |
|
"grad_norm": 2.936732530593872, |
|
"learning_rate": 0.0002, |
|
"loss": 0.7053, |
|
"step": 29200 |
|
}, |
|
{ |
|
"epoch": 11.72, |
|
"grad_norm": 1.2476632595062256, |
|
"learning_rate": 0.0002, |
|
"loss": 0.6966, |
|
"step": 29300 |
|
}, |
|
{ |
|
"epoch": 11.76, |
|
"grad_norm": 0.8274354934692383, |
|
"learning_rate": 0.0002, |
|
"loss": 0.6928, |
|
"step": 29400 |
|
}, |
|
{ |
|
"epoch": 11.8, |
|
"grad_norm": 1.6214333772659302, |
|
"learning_rate": 0.0002, |
|
"loss": 0.7288, |
|
"step": 29500 |
|
}, |
|
{ |
|
"epoch": 11.84, |
|
"grad_norm": 1.2420477867126465, |
|
"learning_rate": 0.0002, |
|
"loss": 0.7501, |
|
"step": 29600 |
|
}, |
|
{ |
|
"epoch": 11.88, |
|
"grad_norm": 1.3407083749771118, |
|
"learning_rate": 0.0002, |
|
"loss": 0.7472, |
|
"step": 29700 |
|
}, |
|
{ |
|
"epoch": 11.92, |
|
"grad_norm": 2.469486951828003, |
|
"learning_rate": 0.0002, |
|
"loss": 0.7077, |
|
"step": 29800 |
|
}, |
|
{ |
|
"epoch": 11.96, |
|
"grad_norm": 2.422231912612915, |
|
"learning_rate": 0.0002, |
|
"loss": 0.7587, |
|
"step": 29900 |
|
}, |
|
{ |
|
"epoch": 12.0, |
|
"grad_norm": 2.655271053314209, |
|
"learning_rate": 0.0002, |
|
"loss": 0.734, |
|
"step": 30000 |
|
}, |
|
{ |
|
"epoch": 12.04, |
|
"grad_norm": 3.5966265201568604, |
|
"learning_rate": 0.0002, |
|
"loss": 0.5812, |
|
"step": 30100 |
|
}, |
|
{ |
|
"epoch": 12.08, |
|
"grad_norm": 0.46480792760849, |
|
"learning_rate": 0.0002, |
|
"loss": 0.589, |
|
"step": 30200 |
|
}, |
|
{ |
|
"epoch": 12.12, |
|
"grad_norm": 0.8438324928283691, |
|
"learning_rate": 0.0002, |
|
"loss": 0.6005, |
|
"step": 30300 |
|
}, |
|
{ |
|
"epoch": 12.16, |
|
"grad_norm": 1.9844694137573242, |
|
"learning_rate": 0.0002, |
|
"loss": 0.6312, |
|
"step": 30400 |
|
}, |
|
{ |
|
"epoch": 12.2, |
|
"grad_norm": 1.1500381231307983, |
|
"learning_rate": 0.0002, |
|
"loss": 0.621, |
|
"step": 30500 |
|
}, |
|
{ |
|
"epoch": 12.24, |
|
"grad_norm": 1.3909460306167603, |
|
"learning_rate": 0.0002, |
|
"loss": 0.6353, |
|
"step": 30600 |
|
}, |
|
{ |
|
"epoch": 12.28, |
|
"grad_norm": 3.504803419113159, |
|
"learning_rate": 0.0002, |
|
"loss": 0.6191, |
|
"step": 30700 |
|
}, |
|
{ |
|
"epoch": 12.32, |
|
"grad_norm": 1.116080641746521, |
|
"learning_rate": 0.0002, |
|
"loss": 0.628, |
|
"step": 30800 |
|
}, |
|
{ |
|
"epoch": 12.36, |
|
"grad_norm": 0.9176204800605774, |
|
"learning_rate": 0.0002, |
|
"loss": 0.6314, |
|
"step": 30900 |
|
}, |
|
{ |
|
"epoch": 12.4, |
|
"grad_norm": 1.0510369539260864, |
|
"learning_rate": 0.0002, |
|
"loss": 0.6532, |
|
"step": 31000 |
|
}, |
|
{ |
|
"epoch": 12.44, |
|
"grad_norm": 0.5242018103599548, |
|
"learning_rate": 0.0002, |
|
"loss": 0.6422, |
|
"step": 31100 |
|
}, |
|
{ |
|
"epoch": 12.48, |
|
"grad_norm": 5.052489280700684, |
|
"learning_rate": 0.0002, |
|
"loss": 0.64, |
|
"step": 31200 |
|
}, |
|
{ |
|
"epoch": 12.52, |
|
"grad_norm": 0.5426860451698303, |
|
"learning_rate": 0.0002, |
|
"loss": 0.6488, |
|
"step": 31300 |
|
}, |
|
{ |
|
"epoch": 12.56, |
|
"grad_norm": 0.7503352165222168, |
|
"learning_rate": 0.0002, |
|
"loss": 0.6764, |
|
"step": 31400 |
|
}, |
|
{ |
|
"epoch": 12.6, |
|
"grad_norm": 2.2711548805236816, |
|
"learning_rate": 0.0002, |
|
"loss": 0.6177, |
|
"step": 31500 |
|
}, |
|
{ |
|
"epoch": 12.64, |
|
"grad_norm": 1.3384021520614624, |
|
"learning_rate": 0.0002, |
|
"loss": 0.6425, |
|
"step": 31600 |
|
}, |
|
{ |
|
"epoch": 12.68, |
|
"grad_norm": 1.098314642906189, |
|
"learning_rate": 0.0002, |
|
"loss": 0.6453, |
|
"step": 31700 |
|
}, |
|
{ |
|
"epoch": 12.72, |
|
"grad_norm": 0.8752951622009277, |
|
"learning_rate": 0.0002, |
|
"loss": 0.6724, |
|
"step": 31800 |
|
}, |
|
{ |
|
"epoch": 12.76, |
|
"grad_norm": 1.700907826423645, |
|
"learning_rate": 0.0002, |
|
"loss": 0.6707, |
|
"step": 31900 |
|
}, |
|
{ |
|
"epoch": 12.8, |
|
"grad_norm": 2.7010860443115234, |
|
"learning_rate": 0.0002, |
|
"loss": 0.6551, |
|
"step": 32000 |
|
}, |
|
{ |
|
"epoch": 12.84, |
|
"grad_norm": 0.9509829878807068, |
|
"learning_rate": 0.0002, |
|
"loss": 0.6766, |
|
"step": 32100 |
|
}, |
|
{ |
|
"epoch": 12.88, |
|
"grad_norm": 1.8936227560043335, |
|
"learning_rate": 0.0002, |
|
"loss": 0.6889, |
|
"step": 32200 |
|
}, |
|
{ |
|
"epoch": 12.92, |
|
"grad_norm": 3.870959997177124, |
|
"learning_rate": 0.0002, |
|
"loss": 0.6526, |
|
"step": 32300 |
|
}, |
|
{ |
|
"epoch": 12.96, |
|
"grad_norm": 0.8048538565635681, |
|
"learning_rate": 0.0002, |
|
"loss": 0.6658, |
|
"step": 32400 |
|
}, |
|
{ |
|
"epoch": 13.0, |
|
"grad_norm": 3.0256259441375732, |
|
"learning_rate": 0.0002, |
|
"loss": 0.6521, |
|
"step": 32500 |
|
}, |
|
{ |
|
"epoch": 13.04, |
|
"grad_norm": 0.5219417810440063, |
|
"learning_rate": 0.0002, |
|
"loss": 0.5415, |
|
"step": 32600 |
|
}, |
|
{ |
|
"epoch": 13.08, |
|
"grad_norm": 0.6729668378829956, |
|
"learning_rate": 0.0002, |
|
"loss": 0.5442, |
|
"step": 32700 |
|
}, |
|
{ |
|
"epoch": 13.12, |
|
"grad_norm": 5.041486740112305, |
|
"learning_rate": 0.0002, |
|
"loss": 0.5794, |
|
"step": 32800 |
|
}, |
|
{ |
|
"epoch": 13.16, |
|
"grad_norm": 0.7740932106971741, |
|
"learning_rate": 0.0002, |
|
"loss": 0.5739, |
|
"step": 32900 |
|
}, |
|
{ |
|
"epoch": 13.2, |
|
"grad_norm": 2.4038619995117188, |
|
"learning_rate": 0.0002, |
|
"loss": 0.5527, |
|
"step": 33000 |
|
} |
|
], |
|
"logging_steps": 100, |
|
"max_steps": 125000, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 50, |
|
"save_steps": 1000, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 8.330326307619963e+17, |
|
"train_batch_size": 1, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|