|
{ |
|
"best_metric": 0.970888078212738, |
|
"best_model_checkpoint": "miner_id_24/checkpoint-150", |
|
"epoch": 2.0, |
|
"eval_steps": 25, |
|
"global_step": 172, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.011627906976744186, |
|
"grad_norm": 2.9615676403045654, |
|
"learning_rate": 2.9999999999999997e-05, |
|
"loss": 3.7306, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.011627906976744186, |
|
"eval_loss": 1.230737328529358, |
|
"eval_runtime": 7.9791, |
|
"eval_samples_per_second": 6.266, |
|
"eval_steps_per_second": 0.877, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.023255813953488372, |
|
"grad_norm": 2.557929754257202, |
|
"learning_rate": 5.9999999999999995e-05, |
|
"loss": 3.564, |
|
"step": 2 |
|
}, |
|
{ |
|
"epoch": 0.03488372093023256, |
|
"grad_norm": 2.1619441509246826, |
|
"learning_rate": 8.999999999999999e-05, |
|
"loss": 3.744, |
|
"step": 3 |
|
}, |
|
{ |
|
"epoch": 0.046511627906976744, |
|
"grad_norm": 1.1338863372802734, |
|
"learning_rate": 0.00011999999999999999, |
|
"loss": 5.1335, |
|
"step": 4 |
|
}, |
|
{ |
|
"epoch": 0.05813953488372093, |
|
"grad_norm": 1.0265060663223267, |
|
"learning_rate": 0.00015, |
|
"loss": 4.7372, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.06976744186046512, |
|
"grad_norm": 1.4262174367904663, |
|
"learning_rate": 0.00017999999999999998, |
|
"loss": 5.1326, |
|
"step": 6 |
|
}, |
|
{ |
|
"epoch": 0.08139534883720931, |
|
"grad_norm": 1.4468605518341064, |
|
"learning_rate": 0.00020999999999999998, |
|
"loss": 5.1196, |
|
"step": 7 |
|
}, |
|
{ |
|
"epoch": 0.09302325581395349, |
|
"grad_norm": 1.4061644077301025, |
|
"learning_rate": 0.00023999999999999998, |
|
"loss": 5.5418, |
|
"step": 8 |
|
}, |
|
{ |
|
"epoch": 0.10465116279069768, |
|
"grad_norm": 1.197460651397705, |
|
"learning_rate": 0.00027, |
|
"loss": 4.9912, |
|
"step": 9 |
|
}, |
|
{ |
|
"epoch": 0.11627906976744186, |
|
"grad_norm": 1.332681655883789, |
|
"learning_rate": 0.0003, |
|
"loss": 5.1568, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.12790697674418605, |
|
"grad_norm": 1.0973302125930786, |
|
"learning_rate": 0.00029997179556727515, |
|
"loss": 5.2029, |
|
"step": 11 |
|
}, |
|
{ |
|
"epoch": 0.13953488372093023, |
|
"grad_norm": 1.9083119630813599, |
|
"learning_rate": 0.0002998871928756345, |
|
"loss": 4.6768, |
|
"step": 12 |
|
}, |
|
{ |
|
"epoch": 0.1511627906976744, |
|
"grad_norm": 1.5310242176055908, |
|
"learning_rate": 0.00029974622374069024, |
|
"loss": 4.4159, |
|
"step": 13 |
|
}, |
|
{ |
|
"epoch": 0.16279069767441862, |
|
"grad_norm": 0.9514783620834351, |
|
"learning_rate": 0.0002995489411751688, |
|
"loss": 4.3375, |
|
"step": 14 |
|
}, |
|
{ |
|
"epoch": 0.1744186046511628, |
|
"grad_norm": 1.3517793416976929, |
|
"learning_rate": 0.0002992954193689748, |
|
"loss": 4.7541, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.18604651162790697, |
|
"grad_norm": 1.600083827972412, |
|
"learning_rate": 0.00029898575366129145, |
|
"loss": 5.5065, |
|
"step": 16 |
|
}, |
|
{ |
|
"epoch": 0.19767441860465115, |
|
"grad_norm": 0.972209095954895, |
|
"learning_rate": 0.00029862006050472675, |
|
"loss": 4.8114, |
|
"step": 17 |
|
}, |
|
{ |
|
"epoch": 0.20930232558139536, |
|
"grad_norm": 0.8742607831954956, |
|
"learning_rate": 0.0002981984774215213, |
|
"loss": 4.5777, |
|
"step": 18 |
|
}, |
|
{ |
|
"epoch": 0.22093023255813954, |
|
"grad_norm": 1.0211178064346313, |
|
"learning_rate": 0.0002977211629518312, |
|
"loss": 5.0349, |
|
"step": 19 |
|
}, |
|
{ |
|
"epoch": 0.23255813953488372, |
|
"grad_norm": 1.19241464138031, |
|
"learning_rate": 0.00029718829659410766, |
|
"loss": 5.1001, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.2441860465116279, |
|
"grad_norm": 2.0463316440582275, |
|
"learning_rate": 0.00029660007873759533, |
|
"loss": 5.5808, |
|
"step": 21 |
|
}, |
|
{ |
|
"epoch": 0.2558139534883721, |
|
"grad_norm": 5.027365207672119, |
|
"learning_rate": 0.00029595673058697357, |
|
"loss": 3.2185, |
|
"step": 22 |
|
}, |
|
{ |
|
"epoch": 0.26744186046511625, |
|
"grad_norm": 1.7920422554016113, |
|
"learning_rate": 0.00029525849407917087, |
|
"loss": 3.5772, |
|
"step": 23 |
|
}, |
|
{ |
|
"epoch": 0.27906976744186046, |
|
"grad_norm": 2.2887630462646484, |
|
"learning_rate": 0.000294505631792382, |
|
"loss": 2.6486, |
|
"step": 24 |
|
}, |
|
{ |
|
"epoch": 0.29069767441860467, |
|
"grad_norm": 1.6574441194534302, |
|
"learning_rate": 0.00029369842684732334, |
|
"loss": 3.5684, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.29069767441860467, |
|
"eval_loss": 1.0823593139648438, |
|
"eval_runtime": 7.799, |
|
"eval_samples_per_second": 6.411, |
|
"eval_steps_per_second": 0.898, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.3023255813953488, |
|
"grad_norm": 0.8784985542297363, |
|
"learning_rate": 0.00029283718280076227, |
|
"loss": 4.3739, |
|
"step": 26 |
|
}, |
|
{ |
|
"epoch": 0.313953488372093, |
|
"grad_norm": 1.098872423171997, |
|
"learning_rate": 0.00029192222353136254, |
|
"loss": 4.6181, |
|
"step": 27 |
|
}, |
|
{ |
|
"epoch": 0.32558139534883723, |
|
"grad_norm": 0.9574615359306335, |
|
"learning_rate": 0.0002909538931178862, |
|
"loss": 4.7032, |
|
"step": 28 |
|
}, |
|
{ |
|
"epoch": 0.3372093023255814, |
|
"grad_norm": 1.1205692291259766, |
|
"learning_rate": 0.0002899325557098001, |
|
"loss": 4.9195, |
|
"step": 29 |
|
}, |
|
{ |
|
"epoch": 0.3488372093023256, |
|
"grad_norm": 0.9449394941329956, |
|
"learning_rate": 0.00028885859539033357, |
|
"loss": 4.2665, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.36046511627906974, |
|
"grad_norm": 1.0518043041229248, |
|
"learning_rate": 0.0002877324160320411, |
|
"loss": 4.6985, |
|
"step": 31 |
|
}, |
|
{ |
|
"epoch": 0.37209302325581395, |
|
"grad_norm": 0.9868097901344299, |
|
"learning_rate": 0.000286554441144922, |
|
"loss": 4.8123, |
|
"step": 32 |
|
}, |
|
{ |
|
"epoch": 0.38372093023255816, |
|
"grad_norm": 0.8500082492828369, |
|
"learning_rate": 0.00028532511371715566, |
|
"loss": 4.5633, |
|
"step": 33 |
|
}, |
|
{ |
|
"epoch": 0.3953488372093023, |
|
"grad_norm": 0.9240705370903015, |
|
"learning_rate": 0.0002840448960485118, |
|
"loss": 4.4635, |
|
"step": 34 |
|
}, |
|
{ |
|
"epoch": 0.4069767441860465, |
|
"grad_norm": 0.8523270487785339, |
|
"learning_rate": 0.00028271426957649865, |
|
"loss": 4.1489, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.4186046511627907, |
|
"grad_norm": 0.9723194241523743, |
|
"learning_rate": 0.00028133373469531363, |
|
"loss": 4.0247, |
|
"step": 36 |
|
}, |
|
{ |
|
"epoch": 0.43023255813953487, |
|
"grad_norm": 1.9848984479904175, |
|
"learning_rate": 0.0002799038105676658, |
|
"loss": 4.588, |
|
"step": 37 |
|
}, |
|
{ |
|
"epoch": 0.4418604651162791, |
|
"grad_norm": 0.8572053909301758, |
|
"learning_rate": 0.00027842503492953995, |
|
"loss": 4.6031, |
|
"step": 38 |
|
}, |
|
{ |
|
"epoch": 0.45348837209302323, |
|
"grad_norm": 0.9035416841506958, |
|
"learning_rate": 0.0002768979638879761, |
|
"loss": 4.8538, |
|
"step": 39 |
|
}, |
|
{ |
|
"epoch": 0.46511627906976744, |
|
"grad_norm": 1.053252935409546, |
|
"learning_rate": 0.00027532317171194046, |
|
"loss": 4.9722, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.47674418604651164, |
|
"grad_norm": 1.071892261505127, |
|
"learning_rate": 0.000273701250616366, |
|
"loss": 5.0359, |
|
"step": 41 |
|
}, |
|
{ |
|
"epoch": 0.4883720930232558, |
|
"grad_norm": 2.722141981124878, |
|
"learning_rate": 0.0002720328105394451, |
|
"loss": 6.2443, |
|
"step": 42 |
|
}, |
|
{ |
|
"epoch": 0.5, |
|
"grad_norm": 2.1856935024261475, |
|
"learning_rate": 0.00027031847891325657, |
|
"loss": 3.1716, |
|
"step": 43 |
|
}, |
|
{ |
|
"epoch": 0.5116279069767442, |
|
"grad_norm": 1.6351940631866455, |
|
"learning_rate": 0.00026855890042781387, |
|
"loss": 2.7951, |
|
"step": 44 |
|
}, |
|
{ |
|
"epoch": 0.5232558139534884, |
|
"grad_norm": 1.7642279863357544, |
|
"learning_rate": 0.000266754736788624, |
|
"loss": 3.6811, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.5348837209302325, |
|
"grad_norm": 1.665910243988037, |
|
"learning_rate": 0.00026490666646784665, |
|
"loss": 4.2234, |
|
"step": 46 |
|
}, |
|
{ |
|
"epoch": 0.5465116279069767, |
|
"grad_norm": 1.4838505983352661, |
|
"learning_rate": 0.00026301538444914907, |
|
"loss": 4.4947, |
|
"step": 47 |
|
}, |
|
{ |
|
"epoch": 0.5581395348837209, |
|
"grad_norm": 1.082304835319519, |
|
"learning_rate": 0.00026108160196635066, |
|
"loss": 4.6096, |
|
"step": 48 |
|
}, |
|
{ |
|
"epoch": 0.5697674418604651, |
|
"grad_norm": 0.9804915189743042, |
|
"learning_rate": 0.0002591060462359573, |
|
"loss": 4.3917, |
|
"step": 49 |
|
}, |
|
{ |
|
"epoch": 0.5813953488372093, |
|
"grad_norm": 1.103212594985962, |
|
"learning_rate": 0.00025708946018368484, |
|
"loss": 4.2729, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.5813953488372093, |
|
"eval_loss": 1.018844723701477, |
|
"eval_runtime": 8.0091, |
|
"eval_samples_per_second": 6.243, |
|
"eval_steps_per_second": 0.874, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.5930232558139535, |
|
"grad_norm": 0.9773282408714294, |
|
"learning_rate": 0.00025503260216507527, |
|
"loss": 4.8289, |
|
"step": 51 |
|
}, |
|
{ |
|
"epoch": 0.6046511627906976, |
|
"grad_norm": 0.9617090225219727, |
|
"learning_rate": 0.00025293624568031, |
|
"loss": 4.4117, |
|
"step": 52 |
|
}, |
|
{ |
|
"epoch": 0.6162790697674418, |
|
"grad_norm": 2.6547250747680664, |
|
"learning_rate": 0.00025080117908332834, |
|
"loss": 4.1853, |
|
"step": 53 |
|
}, |
|
{ |
|
"epoch": 0.627906976744186, |
|
"grad_norm": 0.9078519940376282, |
|
"learning_rate": 0.00024862820528535954, |
|
"loss": 4.1129, |
|
"step": 54 |
|
}, |
|
{ |
|
"epoch": 0.6395348837209303, |
|
"grad_norm": 0.9886384606361389, |
|
"learning_rate": 0.0002464181414529809, |
|
"loss": 3.9609, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 0.6511627906976745, |
|
"grad_norm": 0.9917184710502625, |
|
"learning_rate": 0.0002441718187008148, |
|
"loss": 3.9052, |
|
"step": 56 |
|
}, |
|
{ |
|
"epoch": 0.6627906976744186, |
|
"grad_norm": 0.8498942255973816, |
|
"learning_rate": 0.0002418900817789804, |
|
"loss": 4.3063, |
|
"step": 57 |
|
}, |
|
{ |
|
"epoch": 0.6744186046511628, |
|
"grad_norm": 3.341484308242798, |
|
"learning_rate": 0.00023957378875541792, |
|
"loss": 5.0247, |
|
"step": 58 |
|
}, |
|
{ |
|
"epoch": 0.686046511627907, |
|
"grad_norm": 0.8067652583122253, |
|
"learning_rate": 0.00023722381069320398, |
|
"loss": 4.6544, |
|
"step": 59 |
|
}, |
|
{ |
|
"epoch": 0.6976744186046512, |
|
"grad_norm": 0.8337781429290771, |
|
"learning_rate": 0.00023484103132298079, |
|
"loss": 4.5483, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.7093023255813954, |
|
"grad_norm": 0.9353996515274048, |
|
"learning_rate": 0.0002324263467106209, |
|
"loss": 4.5697, |
|
"step": 61 |
|
}, |
|
{ |
|
"epoch": 0.7209302325581395, |
|
"grad_norm": 1.1387176513671875, |
|
"learning_rate": 0.0002299806649202537, |
|
"loss": 4.8516, |
|
"step": 62 |
|
}, |
|
{ |
|
"epoch": 0.7325581395348837, |
|
"grad_norm": 2.2056901454925537, |
|
"learning_rate": 0.00022750490567277943, |
|
"loss": 5.6958, |
|
"step": 63 |
|
}, |
|
{ |
|
"epoch": 0.7441860465116279, |
|
"grad_norm": 1.9957636594772339, |
|
"learning_rate": 0.000225, |
|
"loss": 2.8987, |
|
"step": 64 |
|
}, |
|
{ |
|
"epoch": 0.7558139534883721, |
|
"grad_norm": 2.1281590461730957, |
|
"learning_rate": 0.00022246688989449576, |
|
"loss": 3.1358, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 0.7674418604651163, |
|
"grad_norm": 1.0050021409988403, |
|
"learning_rate": 0.00021990652795538082, |
|
"loss": 3.127, |
|
"step": 66 |
|
}, |
|
{ |
|
"epoch": 0.7790697674418605, |
|
"grad_norm": 0.9873821139335632, |
|
"learning_rate": 0.00021731987703006933, |
|
"loss": 4.0994, |
|
"step": 67 |
|
}, |
|
{ |
|
"epoch": 0.7906976744186046, |
|
"grad_norm": 1.039175271987915, |
|
"learning_rate": 0.00021470790985218802, |
|
"loss": 4.3436, |
|
"step": 68 |
|
}, |
|
{ |
|
"epoch": 0.8023255813953488, |
|
"grad_norm": 0.9042471647262573, |
|
"learning_rate": 0.00021207160867577087, |
|
"loss": 4.0852, |
|
"step": 69 |
|
}, |
|
{ |
|
"epoch": 0.813953488372093, |
|
"grad_norm": 0.8749189972877502, |
|
"learning_rate": 0.0002094119649058735, |
|
"loss": 4.5285, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.8255813953488372, |
|
"grad_norm": 0.9549452662467957, |
|
"learning_rate": 0.00020672997872574637, |
|
"loss": 4.3908, |
|
"step": 71 |
|
}, |
|
{ |
|
"epoch": 0.8372093023255814, |
|
"grad_norm": 1.4416093826293945, |
|
"learning_rate": 0.00020402665872070654, |
|
"loss": 4.1939, |
|
"step": 72 |
|
}, |
|
{ |
|
"epoch": 0.8488372093023255, |
|
"grad_norm": 0.9574504494667053, |
|
"learning_rate": 0.00020130302149885031, |
|
"loss": 4.4245, |
|
"step": 73 |
|
}, |
|
{ |
|
"epoch": 0.8604651162790697, |
|
"grad_norm": 0.8751915097236633, |
|
"learning_rate": 0.00019856009130874816, |
|
"loss": 4.207, |
|
"step": 74 |
|
}, |
|
{ |
|
"epoch": 0.872093023255814, |
|
"grad_norm": 0.8442956209182739, |
|
"learning_rate": 0.00019579889965426698, |
|
"loss": 3.7247, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.872093023255814, |
|
"eval_loss": 1.0017675161361694, |
|
"eval_runtime": 7.7953, |
|
"eval_samples_per_second": 6.414, |
|
"eval_steps_per_second": 0.898, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.8837209302325582, |
|
"grad_norm": 0.7961885929107666, |
|
"learning_rate": 0.00019302048490666353, |
|
"loss": 3.855, |
|
"step": 76 |
|
}, |
|
{ |
|
"epoch": 0.8953488372093024, |
|
"grad_norm": 0.8288469314575195, |
|
"learning_rate": 0.0001902258919140956, |
|
"loss": 4.3016, |
|
"step": 77 |
|
}, |
|
{ |
|
"epoch": 0.9069767441860465, |
|
"grad_norm": 0.9873271584510803, |
|
"learning_rate": 0.00018741617160869718, |
|
"loss": 4.0653, |
|
"step": 78 |
|
}, |
|
{ |
|
"epoch": 0.9186046511627907, |
|
"grad_norm": 0.8603689074516296, |
|
"learning_rate": 0.00018459238061136602, |
|
"loss": 4.3487, |
|
"step": 79 |
|
}, |
|
{ |
|
"epoch": 0.9302325581395349, |
|
"grad_norm": 0.8512229323387146, |
|
"learning_rate": 0.00018175558083441162, |
|
"loss": 4.331, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.9418604651162791, |
|
"grad_norm": 0.9621251821517944, |
|
"learning_rate": 0.00017890683908221346, |
|
"loss": 4.4871, |
|
"step": 81 |
|
}, |
|
{ |
|
"epoch": 0.9534883720930233, |
|
"grad_norm": 1.079795479774475, |
|
"learning_rate": 0.00017604722665003956, |
|
"loss": 4.9358, |
|
"step": 82 |
|
}, |
|
{ |
|
"epoch": 0.9651162790697675, |
|
"grad_norm": 1.1401747465133667, |
|
"learning_rate": 0.00017317781892117607, |
|
"loss": 5.0393, |
|
"step": 83 |
|
}, |
|
{ |
|
"epoch": 0.9767441860465116, |
|
"grad_norm": 2.458475112915039, |
|
"learning_rate": 0.00017029969496251966, |
|
"loss": 5.3823, |
|
"step": 84 |
|
}, |
|
{ |
|
"epoch": 0.9883720930232558, |
|
"grad_norm": 1.0386871099472046, |
|
"learning_rate": 0.00016741393711878453, |
|
"loss": 3.8578, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"grad_norm": 1.7885408401489258, |
|
"learning_rate": 0.00016452163060547687, |
|
"loss": 4.6428, |
|
"step": 86 |
|
}, |
|
{ |
|
"epoch": 1.0116279069767442, |
|
"grad_norm": 1.0790536403656006, |
|
"learning_rate": 0.00016162386310078963, |
|
"loss": 2.3767, |
|
"step": 87 |
|
}, |
|
{ |
|
"epoch": 1.0232558139534884, |
|
"grad_norm": 0.8506271839141846, |
|
"learning_rate": 0.00015872172433657134, |
|
"loss": 2.6958, |
|
"step": 88 |
|
}, |
|
{ |
|
"epoch": 1.0348837209302326, |
|
"grad_norm": 0.7386451959609985, |
|
"learning_rate": 0.0001558163056885225, |
|
"loss": 3.0156, |
|
"step": 89 |
|
}, |
|
{ |
|
"epoch": 1.0465116279069768, |
|
"grad_norm": 0.901319682598114, |
|
"learning_rate": 0.00015290869976577364, |
|
"loss": 3.9778, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 1.058139534883721, |
|
"grad_norm": 0.9542390704154968, |
|
"learning_rate": 0.00015, |
|
"loss": 3.7751, |
|
"step": 91 |
|
}, |
|
{ |
|
"epoch": 1.069767441860465, |
|
"grad_norm": 0.9419402480125427, |
|
"learning_rate": 0.00014709130023422633, |
|
"loss": 4.0309, |
|
"step": 92 |
|
}, |
|
{ |
|
"epoch": 1.0813953488372092, |
|
"grad_norm": 0.8811500668525696, |
|
"learning_rate": 0.00014418369431147746, |
|
"loss": 4.2052, |
|
"step": 93 |
|
}, |
|
{ |
|
"epoch": 1.0930232558139534, |
|
"grad_norm": 0.9012645483016968, |
|
"learning_rate": 0.00014127827566342863, |
|
"loss": 3.7495, |
|
"step": 94 |
|
}, |
|
{ |
|
"epoch": 1.1046511627906976, |
|
"grad_norm": 0.9330968260765076, |
|
"learning_rate": 0.00013837613689921037, |
|
"loss": 4.3093, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 1.1162790697674418, |
|
"grad_norm": 0.9980186820030212, |
|
"learning_rate": 0.00013547836939452313, |
|
"loss": 4.2077, |
|
"step": 96 |
|
}, |
|
{ |
|
"epoch": 1.127906976744186, |
|
"grad_norm": 0.8667058944702148, |
|
"learning_rate": 0.00013258606288121542, |
|
"loss": 3.4672, |
|
"step": 97 |
|
}, |
|
{ |
|
"epoch": 1.1395348837209303, |
|
"grad_norm": 1.185315489768982, |
|
"learning_rate": 0.00012970030503748036, |
|
"loss": 3.9651, |
|
"step": 98 |
|
}, |
|
{ |
|
"epoch": 1.1511627906976745, |
|
"grad_norm": 0.883290708065033, |
|
"learning_rate": 0.00012682218107882393, |
|
"loss": 3.8345, |
|
"step": 99 |
|
}, |
|
{ |
|
"epoch": 1.1627906976744187, |
|
"grad_norm": 0.7864513993263245, |
|
"learning_rate": 0.00012395277334996044, |
|
"loss": 3.7654, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 1.1627906976744187, |
|
"eval_loss": 0.991995632648468, |
|
"eval_runtime": 8.0088, |
|
"eval_samples_per_second": 6.243, |
|
"eval_steps_per_second": 0.874, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 1.1744186046511629, |
|
"grad_norm": 0.8056565523147583, |
|
"learning_rate": 0.0001210931609177865, |
|
"loss": 3.8797, |
|
"step": 101 |
|
}, |
|
{ |
|
"epoch": 1.1860465116279069, |
|
"grad_norm": 1.6551860570907593, |
|
"learning_rate": 0.00011824441916558842, |
|
"loss": 4.1864, |
|
"step": 102 |
|
}, |
|
{ |
|
"epoch": 1.197674418604651, |
|
"grad_norm": 0.8464885354042053, |
|
"learning_rate": 0.00011540761938863397, |
|
"loss": 4.1092, |
|
"step": 103 |
|
}, |
|
{ |
|
"epoch": 1.2093023255813953, |
|
"grad_norm": 0.9470563530921936, |
|
"learning_rate": 0.00011258382839130281, |
|
"loss": 4.2251, |
|
"step": 104 |
|
}, |
|
{ |
|
"epoch": 1.2209302325581395, |
|
"grad_norm": 1.0200443267822266, |
|
"learning_rate": 0.00010977410808590436, |
|
"loss": 4.2479, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 1.2325581395348837, |
|
"grad_norm": 1.1114517450332642, |
|
"learning_rate": 0.0001069795150933365, |
|
"loss": 4.3392, |
|
"step": 106 |
|
}, |
|
{ |
|
"epoch": 1.244186046511628, |
|
"grad_norm": 2.389307975769043, |
|
"learning_rate": 0.00010420110034573304, |
|
"loss": 4.9816, |
|
"step": 107 |
|
}, |
|
{ |
|
"epoch": 1.255813953488372, |
|
"grad_norm": 1.2697150707244873, |
|
"learning_rate": 0.00010143990869125184, |
|
"loss": 2.6297, |
|
"step": 108 |
|
}, |
|
{ |
|
"epoch": 1.2674418604651163, |
|
"grad_norm": 0.9846199154853821, |
|
"learning_rate": 9.869697850114969e-05, |
|
"loss": 2.7252, |
|
"step": 109 |
|
}, |
|
{ |
|
"epoch": 1.2790697674418605, |
|
"grad_norm": 1.0952304601669312, |
|
"learning_rate": 9.597334127929346e-05, |
|
"loss": 2.7719, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 1.2906976744186047, |
|
"grad_norm": 0.9392086267471313, |
|
"learning_rate": 9.327002127425363e-05, |
|
"loss": 3.5918, |
|
"step": 111 |
|
}, |
|
{ |
|
"epoch": 1.302325581395349, |
|
"grad_norm": 0.9461237192153931, |
|
"learning_rate": 9.058803509412646e-05, |
|
"loss": 3.874, |
|
"step": 112 |
|
}, |
|
{ |
|
"epoch": 1.3139534883720931, |
|
"grad_norm": 0.9763650894165039, |
|
"learning_rate": 8.792839132422913e-05, |
|
"loss": 4.1091, |
|
"step": 113 |
|
}, |
|
{ |
|
"epoch": 1.3255813953488373, |
|
"grad_norm": 0.9497533440589905, |
|
"learning_rate": 8.529209014781201e-05, |
|
"loss": 3.8025, |
|
"step": 114 |
|
}, |
|
{ |
|
"epoch": 1.3372093023255813, |
|
"grad_norm": 1.0650036334991455, |
|
"learning_rate": 8.268012296993067e-05, |
|
"loss": 4.2753, |
|
"step": 115 |
|
}, |
|
{ |
|
"epoch": 1.3488372093023255, |
|
"grad_norm": 1.0653228759765625, |
|
"learning_rate": 8.009347204461921e-05, |
|
"loss": 4.0739, |
|
"step": 116 |
|
}, |
|
{ |
|
"epoch": 1.3604651162790697, |
|
"grad_norm": 1.1015644073486328, |
|
"learning_rate": 7.753311010550421e-05, |
|
"loss": 4.076, |
|
"step": 117 |
|
}, |
|
{ |
|
"epoch": 1.372093023255814, |
|
"grad_norm": 1.0751556158065796, |
|
"learning_rate": 7.500000000000002e-05, |
|
"loss": 4.0752, |
|
"step": 118 |
|
}, |
|
{ |
|
"epoch": 1.3837209302325582, |
|
"grad_norm": 0.9765347838401794, |
|
"learning_rate": 7.249509432722056e-05, |
|
"loss": 3.8782, |
|
"step": 119 |
|
}, |
|
{ |
|
"epoch": 1.3953488372093024, |
|
"grad_norm": 1.0646424293518066, |
|
"learning_rate": 7.001933507974633e-05, |
|
"loss": 3.7195, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 1.4069767441860466, |
|
"grad_norm": 0.9464161992073059, |
|
"learning_rate": 6.75736532893791e-05, |
|
"loss": 3.7267, |
|
"step": 121 |
|
}, |
|
{ |
|
"epoch": 1.4186046511627908, |
|
"grad_norm": 1.1027458906173706, |
|
"learning_rate": 6.515896867701923e-05, |
|
"loss": 4.2282, |
|
"step": 122 |
|
}, |
|
{ |
|
"epoch": 1.4302325581395348, |
|
"grad_norm": 0.9523140788078308, |
|
"learning_rate": 6.277618930679598e-05, |
|
"loss": 4.1755, |
|
"step": 123 |
|
}, |
|
{ |
|
"epoch": 1.441860465116279, |
|
"grad_norm": 0.9763263463973999, |
|
"learning_rate": 6.04262112445821e-05, |
|
"loss": 4.1322, |
|
"step": 124 |
|
}, |
|
{ |
|
"epoch": 1.4534883720930232, |
|
"grad_norm": 0.9657738208770752, |
|
"learning_rate": 5.8109918221019566e-05, |
|
"loss": 4.2455, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 1.4534883720930232, |
|
"eval_loss": 0.9675049781799316, |
|
"eval_runtime": 8.0071, |
|
"eval_samples_per_second": 6.244, |
|
"eval_steps_per_second": 0.874, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 1.4651162790697674, |
|
"grad_norm": 1.0695807933807373, |
|
"learning_rate": 5.582818129918524e-05, |
|
"loss": 4.2064, |
|
"step": 126 |
|
}, |
|
{ |
|
"epoch": 1.4767441860465116, |
|
"grad_norm": 1.2192351818084717, |
|
"learning_rate": 5.358185854701909e-05, |
|
"loss": 4.4307, |
|
"step": 127 |
|
}, |
|
{ |
|
"epoch": 1.4883720930232558, |
|
"grad_norm": 2.163886308670044, |
|
"learning_rate": 5.137179471464047e-05, |
|
"loss": 4.5119, |
|
"step": 128 |
|
}, |
|
{ |
|
"epoch": 1.5, |
|
"grad_norm": 1.2242028713226318, |
|
"learning_rate": 4.9198820916671634e-05, |
|
"loss": 2.6797, |
|
"step": 129 |
|
}, |
|
{ |
|
"epoch": 1.5116279069767442, |
|
"grad_norm": 1.0933971405029297, |
|
"learning_rate": 4.706375431968997e-05, |
|
"loss": 2.4876, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 1.5232558139534884, |
|
"grad_norm": 1.007104754447937, |
|
"learning_rate": 4.4967397834924724e-05, |
|
"loss": 3.1256, |
|
"step": 131 |
|
}, |
|
{ |
|
"epoch": 1.5348837209302326, |
|
"grad_norm": 0.9222277402877808, |
|
"learning_rate": 4.2910539816315164e-05, |
|
"loss": 3.4608, |
|
"step": 132 |
|
}, |
|
{ |
|
"epoch": 1.5465116279069768, |
|
"grad_norm": 1.0027642250061035, |
|
"learning_rate": 4.089395376404269e-05, |
|
"loss": 3.692, |
|
"step": 133 |
|
}, |
|
{ |
|
"epoch": 1.558139534883721, |
|
"grad_norm": 1.139125943183899, |
|
"learning_rate": 3.891839803364934e-05, |
|
"loss": 4.2357, |
|
"step": 134 |
|
}, |
|
{ |
|
"epoch": 1.5697674418604652, |
|
"grad_norm": 1.028533697128296, |
|
"learning_rate": 3.698461555085089e-05, |
|
"loss": 4.1098, |
|
"step": 135 |
|
}, |
|
{ |
|
"epoch": 1.5813953488372094, |
|
"grad_norm": 1.124190092086792, |
|
"learning_rate": 3.509333353215331e-05, |
|
"loss": 4.0042, |
|
"step": 136 |
|
}, |
|
{ |
|
"epoch": 1.5930232558139537, |
|
"grad_norm": 1.042450189590454, |
|
"learning_rate": 3.324526321137599e-05, |
|
"loss": 3.7289, |
|
"step": 137 |
|
}, |
|
{ |
|
"epoch": 1.6046511627906976, |
|
"grad_norm": 1.0798102617263794, |
|
"learning_rate": 3.144109957218612e-05, |
|
"loss": 3.8191, |
|
"step": 138 |
|
}, |
|
{ |
|
"epoch": 1.6162790697674418, |
|
"grad_norm": 1.1347808837890625, |
|
"learning_rate": 2.9681521086743422e-05, |
|
"loss": 3.9489, |
|
"step": 139 |
|
}, |
|
{ |
|
"epoch": 1.627906976744186, |
|
"grad_norm": 1.0719928741455078, |
|
"learning_rate": 2.7967189460554872e-05, |
|
"loss": 4.3926, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 1.6395348837209303, |
|
"grad_norm": 0.9701864719390869, |
|
"learning_rate": 2.629874938363398e-05, |
|
"loss": 3.552, |
|
"step": 141 |
|
}, |
|
{ |
|
"epoch": 1.6511627906976745, |
|
"grad_norm": 0.8670133948326111, |
|
"learning_rate": 2.4676828288059558e-05, |
|
"loss": 3.6456, |
|
"step": 142 |
|
}, |
|
{ |
|
"epoch": 1.6627906976744184, |
|
"grad_norm": 0.9276437759399414, |
|
"learning_rate": 2.3102036112023836e-05, |
|
"loss": 3.3793, |
|
"step": 143 |
|
}, |
|
{ |
|
"epoch": 1.6744186046511627, |
|
"grad_norm": 1.150744915008545, |
|
"learning_rate": 2.1574965070460043e-05, |
|
"loss": 3.6836, |
|
"step": 144 |
|
}, |
|
{ |
|
"epoch": 1.6860465116279069, |
|
"grad_norm": 0.9892753958702087, |
|
"learning_rate": 2.009618943233419e-05, |
|
"loss": 4.1351, |
|
"step": 145 |
|
}, |
|
{ |
|
"epoch": 1.697674418604651, |
|
"grad_norm": 0.9877732396125793, |
|
"learning_rate": 1.8666265304686383e-05, |
|
"loss": 4.105, |
|
"step": 146 |
|
}, |
|
{ |
|
"epoch": 1.7093023255813953, |
|
"grad_norm": 1.0883934497833252, |
|
"learning_rate": 1.7285730423501327e-05, |
|
"loss": 4.1906, |
|
"step": 147 |
|
}, |
|
{ |
|
"epoch": 1.7209302325581395, |
|
"grad_norm": 1.1771783828735352, |
|
"learning_rate": 1.5955103951488173e-05, |
|
"loss": 4.368, |
|
"step": 148 |
|
}, |
|
{ |
|
"epoch": 1.7325581395348837, |
|
"grad_norm": 2.0767550468444824, |
|
"learning_rate": 1.467488628284434e-05, |
|
"loss": 5.0929, |
|
"step": 149 |
|
}, |
|
{ |
|
"epoch": 1.744186046511628, |
|
"grad_norm": 0.9449374079704285, |
|
"learning_rate": 1.3445558855078014e-05, |
|
"loss": 2.7604, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 1.744186046511628, |
|
"eval_loss": 0.970888078212738, |
|
"eval_runtime": 8.0092, |
|
"eval_samples_per_second": 6.243, |
|
"eval_steps_per_second": 0.874, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 1.755813953488372, |
|
"grad_norm": 1.1184589862823486, |
|
"learning_rate": 1.2267583967958916e-05, |
|
"loss": 2.6164, |
|
"step": 151 |
|
}, |
|
{ |
|
"epoch": 1.7674418604651163, |
|
"grad_norm": 0.9397222995758057, |
|
"learning_rate": 1.1141404609666449e-05, |
|
"loss": 3.2114, |
|
"step": 152 |
|
}, |
|
{ |
|
"epoch": 1.7790697674418605, |
|
"grad_norm": 0.8842024803161621, |
|
"learning_rate": 1.0067444290199917e-05, |
|
"loss": 3.6752, |
|
"step": 153 |
|
}, |
|
{ |
|
"epoch": 1.7906976744186047, |
|
"grad_norm": 0.9894306659698486, |
|
"learning_rate": 9.046106882113751e-06, |
|
"loss": 3.8772, |
|
"step": 154 |
|
}, |
|
{ |
|
"epoch": 1.802325581395349, |
|
"grad_norm": 0.9347633123397827, |
|
"learning_rate": 8.07777646863746e-06, |
|
"loss": 3.9717, |
|
"step": 155 |
|
}, |
|
{ |
|
"epoch": 1.8139534883720931, |
|
"grad_norm": 1.0060522556304932, |
|
"learning_rate": 7.1628171992377025e-06, |
|
"loss": 4.3449, |
|
"step": 156 |
|
}, |
|
{ |
|
"epoch": 1.8255813953488373, |
|
"grad_norm": 0.9990627765655518, |
|
"learning_rate": 6.301573152676664e-06, |
|
"loss": 4.1012, |
|
"step": 157 |
|
}, |
|
{ |
|
"epoch": 1.8372093023255816, |
|
"grad_norm": 1.0378891229629517, |
|
"learning_rate": 5.494368207617949e-06, |
|
"loss": 4.0339, |
|
"step": 158 |
|
}, |
|
{ |
|
"epoch": 1.8488372093023255, |
|
"grad_norm": 1.0531872510910034, |
|
"learning_rate": 4.741505920829131e-06, |
|
"loss": 4.4799, |
|
"step": 159 |
|
}, |
|
{ |
|
"epoch": 1.8604651162790697, |
|
"grad_norm": 0.9710641503334045, |
|
"learning_rate": 4.043269413026429e-06, |
|
"loss": 3.6334, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 1.872093023255814, |
|
"grad_norm": 0.9414262771606445, |
|
"learning_rate": 3.3999212624046646e-06, |
|
"loss": 3.8207, |
|
"step": 161 |
|
}, |
|
{ |
|
"epoch": 1.8837209302325582, |
|
"grad_norm": 0.8727117776870728, |
|
"learning_rate": 2.811703405892296e-06, |
|
"loss": 3.6237, |
|
"step": 162 |
|
}, |
|
{ |
|
"epoch": 1.8953488372093024, |
|
"grad_norm": 0.8367646932601929, |
|
"learning_rate": 2.2788370481687965e-06, |
|
"loss": 3.5522, |
|
"step": 163 |
|
}, |
|
{ |
|
"epoch": 1.9069767441860463, |
|
"grad_norm": 0.9638428688049316, |
|
"learning_rate": 1.801522578478648e-06, |
|
"loss": 3.7745, |
|
"step": 164 |
|
}, |
|
{ |
|
"epoch": 1.9186046511627906, |
|
"grad_norm": 1.643754005432129, |
|
"learning_rate": 1.3799394952732024e-06, |
|
"loss": 4.3574, |
|
"step": 165 |
|
}, |
|
{ |
|
"epoch": 1.9302325581395348, |
|
"grad_norm": 0.9748329520225525, |
|
"learning_rate": 1.0142463387085464e-06, |
|
"loss": 4.2242, |
|
"step": 166 |
|
}, |
|
{ |
|
"epoch": 1.941860465116279, |
|
"grad_norm": 0.938869297504425, |
|
"learning_rate": 7.045806310251257e-07, |
|
"loss": 4.0438, |
|
"step": 167 |
|
}, |
|
{ |
|
"epoch": 1.9534883720930232, |
|
"grad_norm": 1.0137250423431396, |
|
"learning_rate": 4.510588248311964e-07, |
|
"loss": 4.342, |
|
"step": 168 |
|
}, |
|
{ |
|
"epoch": 1.9651162790697674, |
|
"grad_norm": 1.2118498086929321, |
|
"learning_rate": 2.5377625930977363e-07, |
|
"loss": 4.5787, |
|
"step": 169 |
|
}, |
|
{ |
|
"epoch": 1.9767441860465116, |
|
"grad_norm": 2.177273750305176, |
|
"learning_rate": 1.1280712436549378e-07, |
|
"loss": 4.6907, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 1.9883720930232558, |
|
"grad_norm": 0.9952888488769531, |
|
"learning_rate": 2.8204432724798775e-08, |
|
"loss": 3.898, |
|
"step": 171 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"grad_norm": 2.449486017227173, |
|
"learning_rate": 0.0, |
|
"loss": 5.0474, |
|
"step": 172 |
|
} |
|
], |
|
"logging_steps": 1, |
|
"max_steps": 172, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 2, |
|
"save_steps": 50, |
|
"stateful_callbacks": { |
|
"EarlyStoppingCallback": { |
|
"args": { |
|
"early_stopping_patience": 1, |
|
"early_stopping_threshold": 0.0 |
|
}, |
|
"attributes": { |
|
"early_stopping_patience_counter": 0 |
|
} |
|
}, |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 7.485613329088512e+16, |
|
"train_batch_size": 8, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|