|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 4.0, |
|
"eval_steps": 10, |
|
"global_step": 1000, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 16.75, |
|
"learning_rate": 1.142857142857143e-06, |
|
"loss": 2.1299, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"eval_loss": 2.1395466327667236, |
|
"eval_runtime": 1.8752, |
|
"eval_samples_per_second": 83.192, |
|
"eval_steps_per_second": 4.266, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 17.125, |
|
"learning_rate": 2.285714285714286e-06, |
|
"loss": 2.0585, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"eval_loss": 1.9696139097213745, |
|
"eval_runtime": 1.8157, |
|
"eval_samples_per_second": 85.918, |
|
"eval_steps_per_second": 4.406, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"grad_norm": 20.0, |
|
"learning_rate": 3.428571428571429e-06, |
|
"loss": 1.7178, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"eval_loss": 1.4725638628005981, |
|
"eval_runtime": 1.828, |
|
"eval_samples_per_second": 85.337, |
|
"eval_steps_per_second": 4.376, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"grad_norm": 12.5, |
|
"learning_rate": 4.571428571428572e-06, |
|
"loss": 1.1311, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"eval_loss": 0.9639529585838318, |
|
"eval_runtime": 1.8321, |
|
"eval_samples_per_second": 85.149, |
|
"eval_steps_per_second": 4.367, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"grad_norm": 7.1875, |
|
"learning_rate": 5.7142857142857145e-06, |
|
"loss": 0.9087, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"eval_loss": 0.8117697238922119, |
|
"eval_runtime": 1.8294, |
|
"eval_samples_per_second": 85.276, |
|
"eval_steps_per_second": 4.373, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"grad_norm": 4.0, |
|
"learning_rate": 6.857142857142858e-06, |
|
"loss": 0.7361, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"eval_loss": 0.7243122458457947, |
|
"eval_runtime": 1.8406, |
|
"eval_samples_per_second": 84.755, |
|
"eval_steps_per_second": 4.346, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.28, |
|
"grad_norm": 5.84375, |
|
"learning_rate": 8.000000000000001e-06, |
|
"loss": 0.6542, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.28, |
|
"eval_loss": 0.6852245330810547, |
|
"eval_runtime": 1.8344, |
|
"eval_samples_per_second": 85.043, |
|
"eval_steps_per_second": 4.361, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.32, |
|
"grad_norm": 4.21875, |
|
"learning_rate": 9.142857142857144e-06, |
|
"loss": 0.6405, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.32, |
|
"eval_loss": 0.6576731204986572, |
|
"eval_runtime": 1.8353, |
|
"eval_samples_per_second": 85.0, |
|
"eval_steps_per_second": 4.359, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.36, |
|
"grad_norm": 4.625, |
|
"learning_rate": 1.0285714285714285e-05, |
|
"loss": 0.6306, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.36, |
|
"eval_loss": 0.6408917307853699, |
|
"eval_runtime": 1.8368, |
|
"eval_samples_per_second": 84.931, |
|
"eval_steps_per_second": 4.355, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"grad_norm": 3.203125, |
|
"learning_rate": 1.1428571428571429e-05, |
|
"loss": 0.5593, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"eval_loss": 0.6218527555465698, |
|
"eval_runtime": 1.8407, |
|
"eval_samples_per_second": 84.75, |
|
"eval_steps_per_second": 4.346, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.44, |
|
"grad_norm": 3.15625, |
|
"learning_rate": 1.2571428571428572e-05, |
|
"loss": 0.6108, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.44, |
|
"eval_loss": 0.6078172922134399, |
|
"eval_runtime": 1.8218, |
|
"eval_samples_per_second": 85.628, |
|
"eval_steps_per_second": 4.391, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.48, |
|
"grad_norm": 2.734375, |
|
"learning_rate": 1.3714285714285716e-05, |
|
"loss": 0.6107, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.48, |
|
"eval_loss": 0.6042583584785461, |
|
"eval_runtime": 1.833, |
|
"eval_samples_per_second": 85.107, |
|
"eval_steps_per_second": 4.364, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.52, |
|
"grad_norm": 6.53125, |
|
"learning_rate": 1.4857142857142858e-05, |
|
"loss": 0.5921, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.52, |
|
"eval_loss": 0.6016473770141602, |
|
"eval_runtime": 1.8301, |
|
"eval_samples_per_second": 85.241, |
|
"eval_steps_per_second": 4.371, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.56, |
|
"grad_norm": 5.40625, |
|
"learning_rate": 1.6000000000000003e-05, |
|
"loss": 0.5577, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.56, |
|
"eval_loss": 0.5969346761703491, |
|
"eval_runtime": 1.8347, |
|
"eval_samples_per_second": 85.029, |
|
"eval_steps_per_second": 4.36, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.6, |
|
"grad_norm": 5.0625, |
|
"learning_rate": 1.7142857142857142e-05, |
|
"loss": 0.5214, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.6, |
|
"eval_loss": 0.5916711688041687, |
|
"eval_runtime": 1.8257, |
|
"eval_samples_per_second": 85.448, |
|
"eval_steps_per_second": 4.382, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.64, |
|
"grad_norm": 3.640625, |
|
"learning_rate": 1.8285714285714288e-05, |
|
"loss": 0.5675, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.64, |
|
"eval_loss": 0.5914089679718018, |
|
"eval_runtime": 1.824, |
|
"eval_samples_per_second": 85.525, |
|
"eval_steps_per_second": 4.386, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.68, |
|
"grad_norm": 3.046875, |
|
"learning_rate": 1.942857142857143e-05, |
|
"loss": 0.5718, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.68, |
|
"eval_loss": 0.5912750363349915, |
|
"eval_runtime": 1.8251, |
|
"eval_samples_per_second": 85.476, |
|
"eval_steps_per_second": 4.383, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.72, |
|
"grad_norm": 5.875, |
|
"learning_rate": 1.9999502669559432e-05, |
|
"loss": 0.5099, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.72, |
|
"eval_loss": 0.5953446626663208, |
|
"eval_runtime": 1.8315, |
|
"eval_samples_per_second": 85.174, |
|
"eval_steps_per_second": 4.368, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.76, |
|
"grad_norm": 5.90625, |
|
"learning_rate": 1.9995524322835035e-05, |
|
"loss": 0.5531, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.76, |
|
"eval_loss": 0.5920645594596863, |
|
"eval_runtime": 1.8189, |
|
"eval_samples_per_second": 85.767, |
|
"eval_steps_per_second": 4.398, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.8, |
|
"grad_norm": 5.78125, |
|
"learning_rate": 1.9987569212189224e-05, |
|
"loss": 0.5936, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.8, |
|
"eval_loss": 0.5868185758590698, |
|
"eval_runtime": 1.8197, |
|
"eval_samples_per_second": 85.727, |
|
"eval_steps_per_second": 4.396, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.84, |
|
"grad_norm": 3.078125, |
|
"learning_rate": 1.9975640502598243e-05, |
|
"loss": 0.5256, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.84, |
|
"eval_loss": 0.5831221342086792, |
|
"eval_runtime": 1.8324, |
|
"eval_samples_per_second": 85.133, |
|
"eval_steps_per_second": 4.366, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.88, |
|
"grad_norm": 5.09375, |
|
"learning_rate": 1.9959742939952393e-05, |
|
"loss": 0.5436, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.88, |
|
"eval_loss": 0.5828912854194641, |
|
"eval_runtime": 1.8214, |
|
"eval_samples_per_second": 85.65, |
|
"eval_steps_per_second": 4.392, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.92, |
|
"grad_norm": 4.3125, |
|
"learning_rate": 1.9939882849167853e-05, |
|
"loss": 0.5338, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.92, |
|
"eval_loss": 0.5832617282867432, |
|
"eval_runtime": 1.8287, |
|
"eval_samples_per_second": 85.307, |
|
"eval_steps_per_second": 4.375, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.96, |
|
"grad_norm": 3.375, |
|
"learning_rate": 1.9916068131670302e-05, |
|
"loss": 0.5825, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.96, |
|
"eval_loss": 0.5794676542282104, |
|
"eval_runtime": 1.8254, |
|
"eval_samples_per_second": 85.461, |
|
"eval_steps_per_second": 4.383, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"grad_norm": 2.53125, |
|
"learning_rate": 1.9888308262251286e-05, |
|
"loss": 0.5602, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"eval_loss": 0.5799536108970642, |
|
"eval_runtime": 1.8258, |
|
"eval_samples_per_second": 85.441, |
|
"eval_steps_per_second": 4.382, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 1.04, |
|
"grad_norm": 3.375, |
|
"learning_rate": 1.985661428529863e-05, |
|
"loss": 0.4275, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 1.04, |
|
"eval_loss": 0.6025771498680115, |
|
"eval_runtime": 1.8242, |
|
"eval_samples_per_second": 85.517, |
|
"eval_steps_per_second": 4.386, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 1.08, |
|
"grad_norm": 2.15625, |
|
"learning_rate": 1.982099881040239e-05, |
|
"loss": 0.4108, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 1.08, |
|
"eval_loss": 0.5985576510429382, |
|
"eval_runtime": 1.8242, |
|
"eval_samples_per_second": 85.515, |
|
"eval_steps_per_second": 4.385, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 1.12, |
|
"grad_norm": 2.9375, |
|
"learning_rate": 1.9781476007338058e-05, |
|
"loss": 0.418, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 1.12, |
|
"eval_loss": 0.5975014567375183, |
|
"eval_runtime": 1.8198, |
|
"eval_samples_per_second": 85.722, |
|
"eval_steps_per_second": 4.396, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 1.16, |
|
"grad_norm": 1.9140625, |
|
"learning_rate": 1.9738061600429062e-05, |
|
"loss": 0.3805, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 1.16, |
|
"eval_loss": 0.6011523008346558, |
|
"eval_runtime": 1.8201, |
|
"eval_samples_per_second": 85.708, |
|
"eval_steps_per_second": 4.395, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 1.2, |
|
"grad_norm": 2.109375, |
|
"learning_rate": 1.969077286229078e-05, |
|
"loss": 0.3967, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 1.2, |
|
"eval_loss": 0.6029513478279114, |
|
"eval_runtime": 1.8226, |
|
"eval_samples_per_second": 85.593, |
|
"eval_steps_per_second": 4.389, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 1.24, |
|
"grad_norm": 3.078125, |
|
"learning_rate": 1.9639628606958535e-05, |
|
"loss": 0.4397, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 1.24, |
|
"eval_loss": 0.5951108932495117, |
|
"eval_runtime": 1.8172, |
|
"eval_samples_per_second": 85.844, |
|
"eval_steps_per_second": 4.402, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 1.28, |
|
"grad_norm": 2.515625, |
|
"learning_rate": 1.9584649182402358e-05, |
|
"loss": 0.4374, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 1.28, |
|
"eval_loss": 0.5968344211578369, |
|
"eval_runtime": 1.8254, |
|
"eval_samples_per_second": 85.46, |
|
"eval_steps_per_second": 4.383, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 1.32, |
|
"grad_norm": 2.953125, |
|
"learning_rate": 1.9525856462431463e-05, |
|
"loss": 0.4249, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 1.32, |
|
"eval_loss": 0.5980938673019409, |
|
"eval_runtime": 1.8248, |
|
"eval_samples_per_second": 85.49, |
|
"eval_steps_per_second": 4.384, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 1.36, |
|
"grad_norm": 2.109375, |
|
"learning_rate": 1.9463273837991643e-05, |
|
"loss": 0.4135, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 1.36, |
|
"eval_loss": 0.5925901532173157, |
|
"eval_runtime": 1.8236, |
|
"eval_samples_per_second": 85.544, |
|
"eval_steps_per_second": 4.387, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 1.4, |
|
"grad_norm": 3.390625, |
|
"learning_rate": 1.9396926207859085e-05, |
|
"loss": 0.425, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 1.4, |
|
"eval_loss": 0.5934858322143555, |
|
"eval_runtime": 1.8225, |
|
"eval_samples_per_second": 85.597, |
|
"eval_steps_per_second": 4.39, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 1.44, |
|
"grad_norm": 2.4375, |
|
"learning_rate": 1.9326839968734278e-05, |
|
"loss": 0.4261, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 1.44, |
|
"eval_loss": 0.5896275639533997, |
|
"eval_runtime": 1.8188, |
|
"eval_samples_per_second": 85.773, |
|
"eval_steps_per_second": 4.399, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 1.48, |
|
"grad_norm": 3.0625, |
|
"learning_rate": 1.9253043004739967e-05, |
|
"loss": 0.4163, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 1.48, |
|
"eval_loss": 0.5982385277748108, |
|
"eval_runtime": 1.8212, |
|
"eval_samples_per_second": 85.656, |
|
"eval_steps_per_second": 4.393, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 1.52, |
|
"grad_norm": 2.375, |
|
"learning_rate": 1.917556467632734e-05, |
|
"loss": 0.4424, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 1.52, |
|
"eval_loss": 0.5906324982643127, |
|
"eval_runtime": 1.8214, |
|
"eval_samples_per_second": 85.646, |
|
"eval_steps_per_second": 4.392, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 1.56, |
|
"grad_norm": 2.171875, |
|
"learning_rate": 1.9094435808594823e-05, |
|
"loss": 0.3827, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 1.56, |
|
"eval_loss": 0.6001599431037903, |
|
"eval_runtime": 1.8232, |
|
"eval_samples_per_second": 85.566, |
|
"eval_steps_per_second": 4.388, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 1.6, |
|
"grad_norm": 2.0625, |
|
"learning_rate": 1.900968867902419e-05, |
|
"loss": 0.4036, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 1.6, |
|
"eval_loss": 0.5959637761116028, |
|
"eval_runtime": 1.8277, |
|
"eval_samples_per_second": 85.354, |
|
"eval_steps_per_second": 4.377, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 1.64, |
|
"grad_norm": 2.171875, |
|
"learning_rate": 1.8921357004638837e-05, |
|
"loss": 0.3885, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 1.64, |
|
"eval_loss": 0.593512237071991, |
|
"eval_runtime": 1.8262, |
|
"eval_samples_per_second": 85.424, |
|
"eval_steps_per_second": 4.381, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 1.68, |
|
"grad_norm": 3.0, |
|
"learning_rate": 1.8829475928589272e-05, |
|
"loss": 0.4268, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 1.68, |
|
"eval_loss": 0.5908737182617188, |
|
"eval_runtime": 1.8249, |
|
"eval_samples_per_second": 85.484, |
|
"eval_steps_per_second": 4.384, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 1.72, |
|
"grad_norm": 2.34375, |
|
"learning_rate": 1.87340820061713e-05, |
|
"loss": 0.4051, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 1.72, |
|
"eval_loss": 0.5885254740715027, |
|
"eval_runtime": 1.8213, |
|
"eval_samples_per_second": 85.652, |
|
"eval_steps_per_second": 4.392, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 1.76, |
|
"grad_norm": 3.40625, |
|
"learning_rate": 1.8635213190282312e-05, |
|
"loss": 0.4012, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 1.76, |
|
"eval_loss": 0.5890469551086426, |
|
"eval_runtime": 1.8253, |
|
"eval_samples_per_second": 85.467, |
|
"eval_steps_per_second": 4.383, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 1.8, |
|
"grad_norm": 3.796875, |
|
"learning_rate": 1.8532908816321557e-05, |
|
"loss": 0.4283, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 1.8, |
|
"eval_loss": 0.587235152721405, |
|
"eval_runtime": 1.819, |
|
"eval_samples_per_second": 85.761, |
|
"eval_steps_per_second": 4.398, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 1.84, |
|
"grad_norm": 2.8125, |
|
"learning_rate": 1.8427209586540392e-05, |
|
"loss": 0.4044, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 1.84, |
|
"eval_loss": 0.5868382453918457, |
|
"eval_runtime": 1.8214, |
|
"eval_samples_per_second": 85.649, |
|
"eval_steps_per_second": 4.392, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 1.88, |
|
"grad_norm": 3.28125, |
|
"learning_rate": 1.8318157553848694e-05, |
|
"loss": 0.4342, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 1.88, |
|
"eval_loss": 0.5887174606323242, |
|
"eval_runtime": 1.8229, |
|
"eval_samples_per_second": 85.58, |
|
"eval_steps_per_second": 4.389, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 1.92, |
|
"grad_norm": 2.3125, |
|
"learning_rate": 1.8205796105083917e-05, |
|
"loss": 0.4291, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 1.92, |
|
"eval_loss": 0.5855108499526978, |
|
"eval_runtime": 1.8227, |
|
"eval_samples_per_second": 85.589, |
|
"eval_steps_per_second": 4.389, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 1.96, |
|
"grad_norm": 2.15625, |
|
"learning_rate": 1.8090169943749477e-05, |
|
"loss": 0.4303, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 1.96, |
|
"eval_loss": 0.5847751498222351, |
|
"eval_runtime": 1.8226, |
|
"eval_samples_per_second": 85.594, |
|
"eval_steps_per_second": 4.389, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"grad_norm": 3.015625, |
|
"learning_rate": 1.7971325072229227e-05, |
|
"loss": 0.4211, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"eval_loss": 0.5896024107933044, |
|
"eval_runtime": 1.8212, |
|
"eval_samples_per_second": 85.657, |
|
"eval_steps_per_second": 4.393, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 2.04, |
|
"grad_norm": 2.59375, |
|
"learning_rate": 1.7849308773485226e-05, |
|
"loss": 0.214, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 2.04, |
|
"eval_loss": 0.669939398765564, |
|
"eval_runtime": 1.8254, |
|
"eval_samples_per_second": 85.461, |
|
"eval_steps_per_second": 4.383, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 2.08, |
|
"grad_norm": 2.390625, |
|
"learning_rate": 1.7724169592245996e-05, |
|
"loss": 0.2126, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 2.08, |
|
"eval_loss": 0.6375145316123962, |
|
"eval_runtime": 1.8249, |
|
"eval_samples_per_second": 85.486, |
|
"eval_steps_per_second": 4.384, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 2.12, |
|
"grad_norm": 2.25, |
|
"learning_rate": 1.7595957315692782e-05, |
|
"loss": 0.2108, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 2.12, |
|
"eval_loss": 0.6472862958908081, |
|
"eval_runtime": 1.8227, |
|
"eval_samples_per_second": 85.589, |
|
"eval_steps_per_second": 4.389, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 2.16, |
|
"grad_norm": 2.390625, |
|
"learning_rate": 1.7464722953651504e-05, |
|
"loss": 0.2154, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 2.16, |
|
"eval_loss": 0.6398993134498596, |
|
"eval_runtime": 1.8233, |
|
"eval_samples_per_second": 85.56, |
|
"eval_steps_per_second": 4.388, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 2.2, |
|
"grad_norm": 1.7265625, |
|
"learning_rate": 1.7330518718298263e-05, |
|
"loss": 0.2127, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 2.2, |
|
"eval_loss": 0.6527624130249023, |
|
"eval_runtime": 1.8317, |
|
"eval_samples_per_second": 85.168, |
|
"eval_steps_per_second": 4.368, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 2.24, |
|
"grad_norm": 2.40625, |
|
"learning_rate": 1.7193398003386514e-05, |
|
"loss": 0.2106, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 2.24, |
|
"eval_loss": 0.6582281589508057, |
|
"eval_runtime": 1.8389, |
|
"eval_samples_per_second": 84.835, |
|
"eval_steps_per_second": 4.351, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 2.28, |
|
"grad_norm": 2.015625, |
|
"learning_rate": 1.705341536300409e-05, |
|
"loss": 0.1955, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 2.28, |
|
"eval_loss": 0.6634351015090942, |
|
"eval_runtime": 1.8422, |
|
"eval_samples_per_second": 84.68, |
|
"eval_steps_per_second": 4.343, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 2.32, |
|
"grad_norm": 2.65625, |
|
"learning_rate": 1.691062648986865e-05, |
|
"loss": 0.1913, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 2.32, |
|
"eval_loss": 0.6663269400596619, |
|
"eval_runtime": 1.8357, |
|
"eval_samples_per_second": 84.983, |
|
"eval_steps_per_second": 4.358, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 2.36, |
|
"grad_norm": 2.453125, |
|
"learning_rate": 1.6765088193170055e-05, |
|
"loss": 0.226, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 2.36, |
|
"eval_loss": 0.6499953866004944, |
|
"eval_runtime": 1.839, |
|
"eval_samples_per_second": 84.83, |
|
"eval_steps_per_second": 4.35, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 2.4, |
|
"grad_norm": 2.140625, |
|
"learning_rate": 1.6616858375968596e-05, |
|
"loss": 0.2094, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 2.4, |
|
"eval_loss": 0.6462793350219727, |
|
"eval_runtime": 1.8441, |
|
"eval_samples_per_second": 84.592, |
|
"eval_steps_per_second": 4.338, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 2.44, |
|
"grad_norm": 2.171875, |
|
"learning_rate": 1.6465996012157996e-05, |
|
"loss": 0.205, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 2.44, |
|
"eval_loss": 0.6545242667198181, |
|
"eval_runtime": 1.8396, |
|
"eval_samples_per_second": 84.8, |
|
"eval_steps_per_second": 4.349, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 2.48, |
|
"grad_norm": 2.609375, |
|
"learning_rate": 1.631256112300239e-05, |
|
"loss": 0.2099, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 2.48, |
|
"eval_loss": 0.6629713773727417, |
|
"eval_runtime": 1.8435, |
|
"eval_samples_per_second": 84.621, |
|
"eval_steps_per_second": 4.34, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 2.52, |
|
"grad_norm": 2.265625, |
|
"learning_rate": 1.6156614753256583e-05, |
|
"loss": 0.2197, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 2.52, |
|
"eval_loss": 0.657893717288971, |
|
"eval_runtime": 1.8357, |
|
"eval_samples_per_second": 84.98, |
|
"eval_steps_per_second": 4.358, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 2.56, |
|
"grad_norm": 2.1875, |
|
"learning_rate": 1.599821894687914e-05, |
|
"loss": 0.2135, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 2.56, |
|
"eval_loss": 0.6627128720283508, |
|
"eval_runtime": 1.8361, |
|
"eval_samples_per_second": 84.964, |
|
"eval_steps_per_second": 4.357, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 2.6, |
|
"grad_norm": 2.453125, |
|
"learning_rate": 1.5837436722347902e-05, |
|
"loss": 0.2234, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 2.6, |
|
"eval_loss": 0.6546624302864075, |
|
"eval_runtime": 1.8329, |
|
"eval_samples_per_second": 85.112, |
|
"eval_steps_per_second": 4.365, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 2.64, |
|
"grad_norm": 2.34375, |
|
"learning_rate": 1.567433204758782e-05, |
|
"loss": 0.2104, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 2.64, |
|
"eval_loss": 0.6604743599891663, |
|
"eval_runtime": 1.8296, |
|
"eval_samples_per_second": 85.263, |
|
"eval_steps_per_second": 4.372, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 2.68, |
|
"grad_norm": 2.78125, |
|
"learning_rate": 1.5508969814521026e-05, |
|
"loss": 0.2148, |
|
"step": 670 |
|
}, |
|
{ |
|
"epoch": 2.68, |
|
"eval_loss": 0.6577239632606506, |
|
"eval_runtime": 1.8328, |
|
"eval_samples_per_second": 85.115, |
|
"eval_steps_per_second": 4.365, |
|
"step": 670 |
|
}, |
|
{ |
|
"epoch": 2.72, |
|
"grad_norm": 3.078125, |
|
"learning_rate": 1.534141581324929e-05, |
|
"loss": 0.2301, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 2.72, |
|
"eval_loss": 0.6536082029342651, |
|
"eval_runtime": 1.8432, |
|
"eval_samples_per_second": 84.638, |
|
"eval_steps_per_second": 4.34, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 2.76, |
|
"grad_norm": 2.375, |
|
"learning_rate": 1.5171736705879127e-05, |
|
"loss": 0.216, |
|
"step": 690 |
|
}, |
|
{ |
|
"epoch": 2.76, |
|
"eval_loss": 0.650008499622345, |
|
"eval_runtime": 1.8464, |
|
"eval_samples_per_second": 84.49, |
|
"eval_steps_per_second": 4.333, |
|
"step": 690 |
|
}, |
|
{ |
|
"epoch": 2.8, |
|
"grad_norm": 2.84375, |
|
"learning_rate": 1.5000000000000002e-05, |
|
"loss": 0.2006, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 2.8, |
|
"eval_loss": 0.6558033227920532, |
|
"eval_runtime": 1.8156, |
|
"eval_samples_per_second": 85.924, |
|
"eval_steps_per_second": 4.406, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 2.84, |
|
"grad_norm": 2.234375, |
|
"learning_rate": 1.482627402182611e-05, |
|
"loss": 0.2158, |
|
"step": 710 |
|
}, |
|
{ |
|
"epoch": 2.84, |
|
"eval_loss": 0.6513029336929321, |
|
"eval_runtime": 1.8338, |
|
"eval_samples_per_second": 85.068, |
|
"eval_steps_per_second": 4.362, |
|
"step": 710 |
|
}, |
|
{ |
|
"epoch": 2.88, |
|
"grad_norm": 2.15625, |
|
"learning_rate": 1.4650627889012507e-05, |
|
"loss": 0.2103, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 2.88, |
|
"eval_loss": 0.6524425745010376, |
|
"eval_runtime": 1.842, |
|
"eval_samples_per_second": 84.692, |
|
"eval_steps_per_second": 4.343, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 2.92, |
|
"grad_norm": 2.546875, |
|
"learning_rate": 1.4473131483156326e-05, |
|
"loss": 0.2196, |
|
"step": 730 |
|
}, |
|
{ |
|
"epoch": 2.92, |
|
"eval_loss": 0.6501613259315491, |
|
"eval_runtime": 1.8283, |
|
"eval_samples_per_second": 85.325, |
|
"eval_steps_per_second": 4.376, |
|
"step": 730 |
|
}, |
|
{ |
|
"epoch": 2.96, |
|
"grad_norm": 2.609375, |
|
"learning_rate": 1.4293855421994094e-05, |
|
"loss": 0.2061, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 2.96, |
|
"eval_loss": 0.6513486504554749, |
|
"eval_runtime": 1.8281, |
|
"eval_samples_per_second": 85.335, |
|
"eval_steps_per_second": 4.376, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 3.0, |
|
"grad_norm": 2.140625, |
|
"learning_rate": 1.4112871031306118e-05, |
|
"loss": 0.2223, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 3.0, |
|
"eval_loss": 0.6523801684379578, |
|
"eval_runtime": 1.8268, |
|
"eval_samples_per_second": 85.394, |
|
"eval_steps_per_second": 4.379, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 3.04, |
|
"grad_norm": 2.25, |
|
"learning_rate": 1.3930250316539237e-05, |
|
"loss": 0.1221, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 3.04, |
|
"eval_loss": 0.7312780618667603, |
|
"eval_runtime": 1.8299, |
|
"eval_samples_per_second": 85.251, |
|
"eval_steps_per_second": 4.372, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 3.08, |
|
"grad_norm": 2.59375, |
|
"learning_rate": 1.3746065934159123e-05, |
|
"loss": 0.1157, |
|
"step": 770 |
|
}, |
|
{ |
|
"epoch": 3.08, |
|
"eval_loss": 0.7543404698371887, |
|
"eval_runtime": 1.8272, |
|
"eval_samples_per_second": 85.375, |
|
"eval_steps_per_second": 4.378, |
|
"step": 770 |
|
}, |
|
{ |
|
"epoch": 3.12, |
|
"grad_norm": 1.9765625, |
|
"learning_rate": 1.356039116274357e-05, |
|
"loss": 0.1138, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 3.12, |
|
"eval_loss": 0.726587176322937, |
|
"eval_runtime": 1.8234, |
|
"eval_samples_per_second": 85.557, |
|
"eval_steps_per_second": 4.388, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 3.16, |
|
"grad_norm": 2.078125, |
|
"learning_rate": 1.3373299873828303e-05, |
|
"loss": 0.123, |
|
"step": 790 |
|
}, |
|
{ |
|
"epoch": 3.16, |
|
"eval_loss": 0.7310608625411987, |
|
"eval_runtime": 1.8252, |
|
"eval_samples_per_second": 85.47, |
|
"eval_steps_per_second": 4.383, |
|
"step": 790 |
|
}, |
|
{ |
|
"epoch": 3.2, |
|
"grad_norm": 1.390625, |
|
"learning_rate": 1.3184866502516846e-05, |
|
"loss": 0.111, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 3.2, |
|
"eval_loss": 0.7380203008651733, |
|
"eval_runtime": 1.8245, |
|
"eval_samples_per_second": 85.505, |
|
"eval_steps_per_second": 4.385, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 3.24, |
|
"grad_norm": 1.7578125, |
|
"learning_rate": 1.2995166017866194e-05, |
|
"loss": 0.1197, |
|
"step": 810 |
|
}, |
|
{ |
|
"epoch": 3.24, |
|
"eval_loss": 0.735602080821991, |
|
"eval_runtime": 1.8271, |
|
"eval_samples_per_second": 85.381, |
|
"eval_steps_per_second": 4.379, |
|
"step": 810 |
|
}, |
|
{ |
|
"epoch": 3.28, |
|
"grad_norm": 1.65625, |
|
"learning_rate": 1.2804273893060028e-05, |
|
"loss": 0.1253, |
|
"step": 820 |
|
}, |
|
{ |
|
"epoch": 3.28, |
|
"eval_loss": 0.7375702261924744, |
|
"eval_runtime": 1.8333, |
|
"eval_samples_per_second": 85.094, |
|
"eval_steps_per_second": 4.364, |
|
"step": 820 |
|
}, |
|
{ |
|
"epoch": 3.32, |
|
"grad_norm": 2.34375, |
|
"learning_rate": 1.2612266075381385e-05, |
|
"loss": 0.1179, |
|
"step": 830 |
|
}, |
|
{ |
|
"epoch": 3.32, |
|
"eval_loss": 0.7395501732826233, |
|
"eval_runtime": 1.8275, |
|
"eval_samples_per_second": 85.364, |
|
"eval_steps_per_second": 4.378, |
|
"step": 830 |
|
}, |
|
{ |
|
"epoch": 3.36, |
|
"grad_norm": 1.8984375, |
|
"learning_rate": 1.2419218955996677e-05, |
|
"loss": 0.1296, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 3.36, |
|
"eval_loss": 0.7340117692947388, |
|
"eval_runtime": 1.8276, |
|
"eval_samples_per_second": 85.358, |
|
"eval_steps_per_second": 4.377, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 3.4, |
|
"grad_norm": 2.234375, |
|
"learning_rate": 1.2225209339563144e-05, |
|
"loss": 0.1192, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 3.4, |
|
"eval_loss": 0.7283806800842285, |
|
"eval_runtime": 1.8268, |
|
"eval_samples_per_second": 85.394, |
|
"eval_steps_per_second": 4.379, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 3.44, |
|
"grad_norm": 1.7109375, |
|
"learning_rate": 1.2030314413671763e-05, |
|
"loss": 0.1151, |
|
"step": 860 |
|
}, |
|
{ |
|
"epoch": 3.44, |
|
"eval_loss": 0.7252658605575562, |
|
"eval_runtime": 1.8274, |
|
"eval_samples_per_second": 85.368, |
|
"eval_steps_per_second": 4.378, |
|
"step": 860 |
|
}, |
|
{ |
|
"epoch": 3.48, |
|
"grad_norm": 1.6640625, |
|
"learning_rate": 1.1834611718137825e-05, |
|
"loss": 0.1168, |
|
"step": 870 |
|
}, |
|
{ |
|
"epoch": 3.48, |
|
"eval_loss": 0.7386046648025513, |
|
"eval_runtime": 1.8276, |
|
"eval_samples_per_second": 85.358, |
|
"eval_steps_per_second": 4.377, |
|
"step": 870 |
|
}, |
|
{ |
|
"epoch": 3.52, |
|
"grad_norm": 3.859375, |
|
"learning_rate": 1.1638179114151378e-05, |
|
"loss": 0.12, |
|
"step": 880 |
|
}, |
|
{ |
|
"epoch": 3.52, |
|
"eval_loss": 0.75035160779953, |
|
"eval_runtime": 1.8285, |
|
"eval_samples_per_second": 85.315, |
|
"eval_steps_per_second": 4.375, |
|
"step": 880 |
|
}, |
|
{ |
|
"epoch": 3.56, |
|
"grad_norm": 1.4140625, |
|
"learning_rate": 1.1441094753299802e-05, |
|
"loss": 0.1301, |
|
"step": 890 |
|
}, |
|
{ |
|
"epoch": 3.56, |
|
"eval_loss": 0.7399159073829651, |
|
"eval_runtime": 1.8281, |
|
"eval_samples_per_second": 85.335, |
|
"eval_steps_per_second": 4.376, |
|
"step": 890 |
|
}, |
|
{ |
|
"epoch": 3.6, |
|
"grad_norm": 1.7734375, |
|
"learning_rate": 1.1243437046474854e-05, |
|
"loss": 0.1159, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 3.6, |
|
"eval_loss": 0.734505295753479, |
|
"eval_runtime": 1.8287, |
|
"eval_samples_per_second": 85.307, |
|
"eval_steps_per_second": 4.375, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 3.64, |
|
"grad_norm": 2.125, |
|
"learning_rate": 1.1045284632676535e-05, |
|
"loss": 0.1217, |
|
"step": 910 |
|
}, |
|
{ |
|
"epoch": 3.64, |
|
"eval_loss": 0.7360817193984985, |
|
"eval_runtime": 1.8327, |
|
"eval_samples_per_second": 85.118, |
|
"eval_steps_per_second": 4.365, |
|
"step": 910 |
|
}, |
|
{ |
|
"epoch": 3.68, |
|
"grad_norm": 1.78125, |
|
"learning_rate": 1.0846716347726233e-05, |
|
"loss": 0.1164, |
|
"step": 920 |
|
}, |
|
{ |
|
"epoch": 3.68, |
|
"eval_loss": 0.7412881255149841, |
|
"eval_runtime": 1.8259, |
|
"eval_samples_per_second": 85.439, |
|
"eval_steps_per_second": 4.382, |
|
"step": 920 |
|
}, |
|
{ |
|
"epoch": 3.72, |
|
"grad_norm": 1.3984375, |
|
"learning_rate": 1.0647811192901518e-05, |
|
"loss": 0.1167, |
|
"step": 930 |
|
}, |
|
{ |
|
"epoch": 3.72, |
|
"eval_loss": 0.7428140044212341, |
|
"eval_runtime": 1.8329, |
|
"eval_samples_per_second": 85.111, |
|
"eval_steps_per_second": 4.365, |
|
"step": 930 |
|
}, |
|
{ |
|
"epoch": 3.76, |
|
"grad_norm": 1.796875, |
|
"learning_rate": 1.044864830350515e-05, |
|
"loss": 0.122, |
|
"step": 940 |
|
}, |
|
{ |
|
"epoch": 3.76, |
|
"eval_loss": 0.7407277226448059, |
|
"eval_runtime": 1.8317, |
|
"eval_samples_per_second": 85.165, |
|
"eval_steps_per_second": 4.367, |
|
"step": 940 |
|
}, |
|
{ |
|
"epoch": 3.8, |
|
"grad_norm": 1.3203125, |
|
"learning_rate": 1.0249306917380731e-05, |
|
"loss": 0.1174, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 3.8, |
|
"eval_loss": 0.739768385887146, |
|
"eval_runtime": 1.8255, |
|
"eval_samples_per_second": 85.457, |
|
"eval_steps_per_second": 4.382, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 3.84, |
|
"grad_norm": 1.625, |
|
"learning_rate": 1.0049866343387582e-05, |
|
"loss": 0.1152, |
|
"step": 960 |
|
}, |
|
{ |
|
"epoch": 3.84, |
|
"eval_loss": 0.7400333881378174, |
|
"eval_runtime": 1.827, |
|
"eval_samples_per_second": 85.384, |
|
"eval_steps_per_second": 4.379, |
|
"step": 960 |
|
}, |
|
{ |
|
"epoch": 3.88, |
|
"grad_norm": 1.90625, |
|
"learning_rate": 9.850405929847367e-06, |
|
"loss": 0.1265, |
|
"step": 970 |
|
}, |
|
{ |
|
"epoch": 3.88, |
|
"eval_loss": 0.7430944442749023, |
|
"eval_runtime": 1.8297, |
|
"eval_samples_per_second": 85.261, |
|
"eval_steps_per_second": 4.372, |
|
"step": 970 |
|
}, |
|
{ |
|
"epoch": 3.92, |
|
"grad_norm": 1.4609375, |
|
"learning_rate": 9.651005032974994e-06, |
|
"loss": 0.1183, |
|
"step": 980 |
|
}, |
|
{ |
|
"epoch": 3.92, |
|
"eval_loss": 0.7510712742805481, |
|
"eval_runtime": 1.8296, |
|
"eval_samples_per_second": 85.264, |
|
"eval_steps_per_second": 4.373, |
|
"step": 980 |
|
}, |
|
{ |
|
"epoch": 3.96, |
|
"grad_norm": 2.078125, |
|
"learning_rate": 9.4517429853064e-06, |
|
"loss": 0.1169, |
|
"step": 990 |
|
}, |
|
{ |
|
"epoch": 3.96, |
|
"eval_loss": 0.7500400543212891, |
|
"eval_runtime": 1.8299, |
|
"eval_samples_per_second": 85.252, |
|
"eval_steps_per_second": 4.372, |
|
"step": 990 |
|
}, |
|
{ |
|
"epoch": 4.0, |
|
"grad_norm": 1.8359375, |
|
"learning_rate": 9.252699064135759e-06, |
|
"loss": 0.1191, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 4.0, |
|
"eval_loss": 0.7431495785713196, |
|
"eval_runtime": 1.8402, |
|
"eval_samples_per_second": 84.774, |
|
"eval_steps_per_second": 4.347, |
|
"step": 1000 |
|
} |
|
], |
|
"logging_steps": 10, |
|
"max_steps": 1750, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 7, |
|
"save_steps": 500, |
|
"total_flos": 9.48348331795415e+16, |
|
"train_batch_size": 4, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|