{ "best_metric": 0.970888078212738, "best_model_checkpoint": "miner_id_24/checkpoint-150", "epoch": 2.0, "eval_steps": 25, "global_step": 172, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.011627906976744186, "grad_norm": 2.9615676403045654, "learning_rate": 2.9999999999999997e-05, "loss": 3.7306, "step": 1 }, { "epoch": 0.011627906976744186, "eval_loss": 1.230737328529358, "eval_runtime": 7.9791, "eval_samples_per_second": 6.266, "eval_steps_per_second": 0.877, "step": 1 }, { "epoch": 0.023255813953488372, "grad_norm": 2.557929754257202, "learning_rate": 5.9999999999999995e-05, "loss": 3.564, "step": 2 }, { "epoch": 0.03488372093023256, "grad_norm": 2.1619441509246826, "learning_rate": 8.999999999999999e-05, "loss": 3.744, "step": 3 }, { "epoch": 0.046511627906976744, "grad_norm": 1.1338863372802734, "learning_rate": 0.00011999999999999999, "loss": 5.1335, "step": 4 }, { "epoch": 0.05813953488372093, "grad_norm": 1.0265060663223267, "learning_rate": 0.00015, "loss": 4.7372, "step": 5 }, { "epoch": 0.06976744186046512, "grad_norm": 1.4262174367904663, "learning_rate": 0.00017999999999999998, "loss": 5.1326, "step": 6 }, { "epoch": 0.08139534883720931, "grad_norm": 1.4468605518341064, "learning_rate": 0.00020999999999999998, "loss": 5.1196, "step": 7 }, { "epoch": 0.09302325581395349, "grad_norm": 1.4061644077301025, "learning_rate": 0.00023999999999999998, "loss": 5.5418, "step": 8 }, { "epoch": 0.10465116279069768, "grad_norm": 1.197460651397705, "learning_rate": 0.00027, "loss": 4.9912, "step": 9 }, { "epoch": 0.11627906976744186, "grad_norm": 1.332681655883789, "learning_rate": 0.0003, "loss": 5.1568, "step": 10 }, { "epoch": 0.12790697674418605, "grad_norm": 1.0973302125930786, "learning_rate": 0.00029997179556727515, "loss": 5.2029, "step": 11 }, { "epoch": 0.13953488372093023, "grad_norm": 1.9083119630813599, "learning_rate": 0.0002998871928756345, "loss": 4.6768, "step": 12 }, { "epoch": 0.1511627906976744, "grad_norm": 1.5310242176055908, "learning_rate": 0.00029974622374069024, "loss": 4.4159, "step": 13 }, { "epoch": 0.16279069767441862, "grad_norm": 0.9514783620834351, "learning_rate": 0.0002995489411751688, "loss": 4.3375, "step": 14 }, { "epoch": 0.1744186046511628, "grad_norm": 1.3517793416976929, "learning_rate": 0.0002992954193689748, "loss": 4.7541, "step": 15 }, { "epoch": 0.18604651162790697, "grad_norm": 1.600083827972412, "learning_rate": 0.00029898575366129145, "loss": 5.5065, "step": 16 }, { "epoch": 0.19767441860465115, "grad_norm": 0.972209095954895, "learning_rate": 0.00029862006050472675, "loss": 4.8114, "step": 17 }, { "epoch": 0.20930232558139536, "grad_norm": 0.8742607831954956, "learning_rate": 0.0002981984774215213, "loss": 4.5777, "step": 18 }, { "epoch": 0.22093023255813954, "grad_norm": 1.0211178064346313, "learning_rate": 0.0002977211629518312, "loss": 5.0349, "step": 19 }, { "epoch": 0.23255813953488372, "grad_norm": 1.19241464138031, "learning_rate": 0.00029718829659410766, "loss": 5.1001, "step": 20 }, { "epoch": 0.2441860465116279, "grad_norm": 2.0463316440582275, "learning_rate": 0.00029660007873759533, "loss": 5.5808, "step": 21 }, { "epoch": 0.2558139534883721, "grad_norm": 5.027365207672119, "learning_rate": 0.00029595673058697357, "loss": 3.2185, "step": 22 }, { "epoch": 0.26744186046511625, "grad_norm": 1.7920422554016113, "learning_rate": 0.00029525849407917087, "loss": 3.5772, "step": 23 }, { "epoch": 0.27906976744186046, "grad_norm": 2.2887630462646484, "learning_rate": 0.000294505631792382, "loss": 2.6486, "step": 24 }, { "epoch": 0.29069767441860467, "grad_norm": 1.6574441194534302, "learning_rate": 0.00029369842684732334, "loss": 3.5684, "step": 25 }, { "epoch": 0.29069767441860467, "eval_loss": 1.0823593139648438, "eval_runtime": 7.799, "eval_samples_per_second": 6.411, "eval_steps_per_second": 0.898, "step": 25 }, { "epoch": 0.3023255813953488, "grad_norm": 0.8784985542297363, "learning_rate": 0.00029283718280076227, "loss": 4.3739, "step": 26 }, { "epoch": 0.313953488372093, "grad_norm": 1.098872423171997, "learning_rate": 0.00029192222353136254, "loss": 4.6181, "step": 27 }, { "epoch": 0.32558139534883723, "grad_norm": 0.9574615359306335, "learning_rate": 0.0002909538931178862, "loss": 4.7032, "step": 28 }, { "epoch": 0.3372093023255814, "grad_norm": 1.1205692291259766, "learning_rate": 0.0002899325557098001, "loss": 4.9195, "step": 29 }, { "epoch": 0.3488372093023256, "grad_norm": 0.9449394941329956, "learning_rate": 0.00028885859539033357, "loss": 4.2665, "step": 30 }, { "epoch": 0.36046511627906974, "grad_norm": 1.0518043041229248, "learning_rate": 0.0002877324160320411, "loss": 4.6985, "step": 31 }, { "epoch": 0.37209302325581395, "grad_norm": 0.9868097901344299, "learning_rate": 0.000286554441144922, "loss": 4.8123, "step": 32 }, { "epoch": 0.38372093023255816, "grad_norm": 0.8500082492828369, "learning_rate": 0.00028532511371715566, "loss": 4.5633, "step": 33 }, { "epoch": 0.3953488372093023, "grad_norm": 0.9240705370903015, "learning_rate": 0.0002840448960485118, "loss": 4.4635, "step": 34 }, { "epoch": 0.4069767441860465, "grad_norm": 0.8523270487785339, "learning_rate": 0.00028271426957649865, "loss": 4.1489, "step": 35 }, { "epoch": 0.4186046511627907, "grad_norm": 0.9723194241523743, "learning_rate": 0.00028133373469531363, "loss": 4.0247, "step": 36 }, { "epoch": 0.43023255813953487, "grad_norm": 1.9848984479904175, "learning_rate": 0.0002799038105676658, "loss": 4.588, "step": 37 }, { "epoch": 0.4418604651162791, "grad_norm": 0.8572053909301758, "learning_rate": 0.00027842503492953995, "loss": 4.6031, "step": 38 }, { "epoch": 0.45348837209302323, "grad_norm": 0.9035416841506958, "learning_rate": 0.0002768979638879761, "loss": 4.8538, "step": 39 }, { "epoch": 0.46511627906976744, "grad_norm": 1.053252935409546, "learning_rate": 0.00027532317171194046, "loss": 4.9722, "step": 40 }, { "epoch": 0.47674418604651164, "grad_norm": 1.071892261505127, "learning_rate": 0.000273701250616366, "loss": 5.0359, "step": 41 }, { "epoch": 0.4883720930232558, "grad_norm": 2.722141981124878, "learning_rate": 0.0002720328105394451, "loss": 6.2443, "step": 42 }, { "epoch": 0.5, "grad_norm": 2.1856935024261475, "learning_rate": 0.00027031847891325657, "loss": 3.1716, "step": 43 }, { "epoch": 0.5116279069767442, "grad_norm": 1.6351940631866455, "learning_rate": 0.00026855890042781387, "loss": 2.7951, "step": 44 }, { "epoch": 0.5232558139534884, "grad_norm": 1.7642279863357544, "learning_rate": 0.000266754736788624, "loss": 3.6811, "step": 45 }, { "epoch": 0.5348837209302325, "grad_norm": 1.665910243988037, "learning_rate": 0.00026490666646784665, "loss": 4.2234, "step": 46 }, { "epoch": 0.5465116279069767, "grad_norm": 1.4838505983352661, "learning_rate": 0.00026301538444914907, "loss": 4.4947, "step": 47 }, { "epoch": 0.5581395348837209, "grad_norm": 1.082304835319519, "learning_rate": 0.00026108160196635066, "loss": 4.6096, "step": 48 }, { "epoch": 0.5697674418604651, "grad_norm": 0.9804915189743042, "learning_rate": 0.0002591060462359573, "loss": 4.3917, "step": 49 }, { "epoch": 0.5813953488372093, "grad_norm": 1.103212594985962, "learning_rate": 0.00025708946018368484, "loss": 4.2729, "step": 50 }, { "epoch": 0.5813953488372093, "eval_loss": 1.018844723701477, "eval_runtime": 8.0091, "eval_samples_per_second": 6.243, "eval_steps_per_second": 0.874, "step": 50 }, { "epoch": 0.5930232558139535, "grad_norm": 0.9773282408714294, "learning_rate": 0.00025503260216507527, "loss": 4.8289, "step": 51 }, { "epoch": 0.6046511627906976, "grad_norm": 0.9617090225219727, "learning_rate": 0.00025293624568031, "loss": 4.4117, "step": 52 }, { "epoch": 0.6162790697674418, "grad_norm": 2.6547250747680664, "learning_rate": 0.00025080117908332834, "loss": 4.1853, "step": 53 }, { "epoch": 0.627906976744186, "grad_norm": 0.9078519940376282, "learning_rate": 0.00024862820528535954, "loss": 4.1129, "step": 54 }, { "epoch": 0.6395348837209303, "grad_norm": 0.9886384606361389, "learning_rate": 0.0002464181414529809, "loss": 3.9609, "step": 55 }, { "epoch": 0.6511627906976745, "grad_norm": 0.9917184710502625, "learning_rate": 0.0002441718187008148, "loss": 3.9052, "step": 56 }, { "epoch": 0.6627906976744186, "grad_norm": 0.8498942255973816, "learning_rate": 0.0002418900817789804, "loss": 4.3063, "step": 57 }, { "epoch": 0.6744186046511628, "grad_norm": 3.341484308242798, "learning_rate": 0.00023957378875541792, "loss": 5.0247, "step": 58 }, { "epoch": 0.686046511627907, "grad_norm": 0.8067652583122253, "learning_rate": 0.00023722381069320398, "loss": 4.6544, "step": 59 }, { "epoch": 0.6976744186046512, "grad_norm": 0.8337781429290771, "learning_rate": 0.00023484103132298079, "loss": 4.5483, "step": 60 }, { "epoch": 0.7093023255813954, "grad_norm": 0.9353996515274048, "learning_rate": 0.0002324263467106209, "loss": 4.5697, "step": 61 }, { "epoch": 0.7209302325581395, "grad_norm": 1.1387176513671875, "learning_rate": 0.0002299806649202537, "loss": 4.8516, "step": 62 }, { "epoch": 0.7325581395348837, "grad_norm": 2.2056901454925537, "learning_rate": 0.00022750490567277943, "loss": 5.6958, "step": 63 }, { "epoch": 0.7441860465116279, "grad_norm": 1.9957636594772339, "learning_rate": 0.000225, "loss": 2.8987, "step": 64 }, { "epoch": 0.7558139534883721, "grad_norm": 2.1281590461730957, "learning_rate": 0.00022246688989449576, "loss": 3.1358, "step": 65 }, { "epoch": 0.7674418604651163, "grad_norm": 1.0050021409988403, "learning_rate": 0.00021990652795538082, "loss": 3.127, "step": 66 }, { "epoch": 0.7790697674418605, "grad_norm": 0.9873821139335632, "learning_rate": 0.00021731987703006933, "loss": 4.0994, "step": 67 }, { "epoch": 0.7906976744186046, "grad_norm": 1.039175271987915, "learning_rate": 0.00021470790985218802, "loss": 4.3436, "step": 68 }, { "epoch": 0.8023255813953488, "grad_norm": 0.9042471647262573, "learning_rate": 0.00021207160867577087, "loss": 4.0852, "step": 69 }, { "epoch": 0.813953488372093, "grad_norm": 0.8749189972877502, "learning_rate": 0.0002094119649058735, "loss": 4.5285, "step": 70 }, { "epoch": 0.8255813953488372, "grad_norm": 0.9549452662467957, "learning_rate": 0.00020672997872574637, "loss": 4.3908, "step": 71 }, { "epoch": 0.8372093023255814, "grad_norm": 1.4416093826293945, "learning_rate": 0.00020402665872070654, "loss": 4.1939, "step": 72 }, { "epoch": 0.8488372093023255, "grad_norm": 0.9574504494667053, "learning_rate": 0.00020130302149885031, "loss": 4.4245, "step": 73 }, { "epoch": 0.8604651162790697, "grad_norm": 0.8751915097236633, "learning_rate": 0.00019856009130874816, "loss": 4.207, "step": 74 }, { "epoch": 0.872093023255814, "grad_norm": 0.8442956209182739, "learning_rate": 0.00019579889965426698, "loss": 3.7247, "step": 75 }, { "epoch": 0.872093023255814, "eval_loss": 1.0017675161361694, "eval_runtime": 7.7953, "eval_samples_per_second": 6.414, "eval_steps_per_second": 0.898, "step": 75 }, { "epoch": 0.8837209302325582, "grad_norm": 0.7961885929107666, "learning_rate": 0.00019302048490666353, "loss": 3.855, "step": 76 }, { "epoch": 0.8953488372093024, "grad_norm": 0.8288469314575195, "learning_rate": 0.0001902258919140956, "loss": 4.3016, "step": 77 }, { "epoch": 0.9069767441860465, "grad_norm": 0.9873271584510803, "learning_rate": 0.00018741617160869718, "loss": 4.0653, "step": 78 }, { "epoch": 0.9186046511627907, "grad_norm": 0.8603689074516296, "learning_rate": 0.00018459238061136602, "loss": 4.3487, "step": 79 }, { "epoch": 0.9302325581395349, "grad_norm": 0.8512229323387146, "learning_rate": 0.00018175558083441162, "loss": 4.331, "step": 80 }, { "epoch": 0.9418604651162791, "grad_norm": 0.9621251821517944, "learning_rate": 0.00017890683908221346, "loss": 4.4871, "step": 81 }, { "epoch": 0.9534883720930233, "grad_norm": 1.079795479774475, "learning_rate": 0.00017604722665003956, "loss": 4.9358, "step": 82 }, { "epoch": 0.9651162790697675, "grad_norm": 1.1401747465133667, "learning_rate": 0.00017317781892117607, "loss": 5.0393, "step": 83 }, { "epoch": 0.9767441860465116, "grad_norm": 2.458475112915039, "learning_rate": 0.00017029969496251966, "loss": 5.3823, "step": 84 }, { "epoch": 0.9883720930232558, "grad_norm": 1.0386871099472046, "learning_rate": 0.00016741393711878453, "loss": 3.8578, "step": 85 }, { "epoch": 1.0, "grad_norm": 1.7885408401489258, "learning_rate": 0.00016452163060547687, "loss": 4.6428, "step": 86 }, { "epoch": 1.0116279069767442, "grad_norm": 1.0790536403656006, "learning_rate": 0.00016162386310078963, "loss": 2.3767, "step": 87 }, { "epoch": 1.0232558139534884, "grad_norm": 0.8506271839141846, "learning_rate": 0.00015872172433657134, "loss": 2.6958, "step": 88 }, { "epoch": 1.0348837209302326, "grad_norm": 0.7386451959609985, "learning_rate": 0.0001558163056885225, "loss": 3.0156, "step": 89 }, { "epoch": 1.0465116279069768, "grad_norm": 0.901319682598114, "learning_rate": 0.00015290869976577364, "loss": 3.9778, "step": 90 }, { "epoch": 1.058139534883721, "grad_norm": 0.9542390704154968, "learning_rate": 0.00015, "loss": 3.7751, "step": 91 }, { "epoch": 1.069767441860465, "grad_norm": 0.9419402480125427, "learning_rate": 0.00014709130023422633, "loss": 4.0309, "step": 92 }, { "epoch": 1.0813953488372092, "grad_norm": 0.8811500668525696, "learning_rate": 0.00014418369431147746, "loss": 4.2052, "step": 93 }, { "epoch": 1.0930232558139534, "grad_norm": 0.9012645483016968, "learning_rate": 0.00014127827566342863, "loss": 3.7495, "step": 94 }, { "epoch": 1.1046511627906976, "grad_norm": 0.9330968260765076, "learning_rate": 0.00013837613689921037, "loss": 4.3093, "step": 95 }, { "epoch": 1.1162790697674418, "grad_norm": 0.9980186820030212, "learning_rate": 0.00013547836939452313, "loss": 4.2077, "step": 96 }, { "epoch": 1.127906976744186, "grad_norm": 0.8667058944702148, "learning_rate": 0.00013258606288121542, "loss": 3.4672, "step": 97 }, { "epoch": 1.1395348837209303, "grad_norm": 1.185315489768982, "learning_rate": 0.00012970030503748036, "loss": 3.9651, "step": 98 }, { "epoch": 1.1511627906976745, "grad_norm": 0.883290708065033, "learning_rate": 0.00012682218107882393, "loss": 3.8345, "step": 99 }, { "epoch": 1.1627906976744187, "grad_norm": 0.7864513993263245, "learning_rate": 0.00012395277334996044, "loss": 3.7654, "step": 100 }, { "epoch": 1.1627906976744187, "eval_loss": 0.991995632648468, "eval_runtime": 8.0088, "eval_samples_per_second": 6.243, "eval_steps_per_second": 0.874, "step": 100 }, { "epoch": 1.1744186046511629, "grad_norm": 0.8056565523147583, "learning_rate": 0.0001210931609177865, "loss": 3.8797, "step": 101 }, { "epoch": 1.1860465116279069, "grad_norm": 1.6551860570907593, "learning_rate": 0.00011824441916558842, "loss": 4.1864, "step": 102 }, { "epoch": 1.197674418604651, "grad_norm": 0.8464885354042053, "learning_rate": 0.00011540761938863397, "loss": 4.1092, "step": 103 }, { "epoch": 1.2093023255813953, "grad_norm": 0.9470563530921936, "learning_rate": 0.00011258382839130281, "loss": 4.2251, "step": 104 }, { "epoch": 1.2209302325581395, "grad_norm": 1.0200443267822266, "learning_rate": 0.00010977410808590436, "loss": 4.2479, "step": 105 }, { "epoch": 1.2325581395348837, "grad_norm": 1.1114517450332642, "learning_rate": 0.0001069795150933365, "loss": 4.3392, "step": 106 }, { "epoch": 1.244186046511628, "grad_norm": 2.389307975769043, "learning_rate": 0.00010420110034573304, "loss": 4.9816, "step": 107 }, { "epoch": 1.255813953488372, "grad_norm": 1.2697150707244873, "learning_rate": 0.00010143990869125184, "loss": 2.6297, "step": 108 }, { "epoch": 1.2674418604651163, "grad_norm": 0.9846199154853821, "learning_rate": 9.869697850114969e-05, "loss": 2.7252, "step": 109 }, { "epoch": 1.2790697674418605, "grad_norm": 1.0952304601669312, "learning_rate": 9.597334127929346e-05, "loss": 2.7719, "step": 110 }, { "epoch": 1.2906976744186047, "grad_norm": 0.9392086267471313, "learning_rate": 9.327002127425363e-05, "loss": 3.5918, "step": 111 }, { "epoch": 1.302325581395349, "grad_norm": 0.9461237192153931, "learning_rate": 9.058803509412646e-05, "loss": 3.874, "step": 112 }, { "epoch": 1.3139534883720931, "grad_norm": 0.9763650894165039, "learning_rate": 8.792839132422913e-05, "loss": 4.1091, "step": 113 }, { "epoch": 1.3255813953488373, "grad_norm": 0.9497533440589905, "learning_rate": 8.529209014781201e-05, "loss": 3.8025, "step": 114 }, { "epoch": 1.3372093023255813, "grad_norm": 1.0650036334991455, "learning_rate": 8.268012296993067e-05, "loss": 4.2753, "step": 115 }, { "epoch": 1.3488372093023255, "grad_norm": 1.0653228759765625, "learning_rate": 8.009347204461921e-05, "loss": 4.0739, "step": 116 }, { "epoch": 1.3604651162790697, "grad_norm": 1.1015644073486328, "learning_rate": 7.753311010550421e-05, "loss": 4.076, "step": 117 }, { "epoch": 1.372093023255814, "grad_norm": 1.0751556158065796, "learning_rate": 7.500000000000002e-05, "loss": 4.0752, "step": 118 }, { "epoch": 1.3837209302325582, "grad_norm": 0.9765347838401794, "learning_rate": 7.249509432722056e-05, "loss": 3.8782, "step": 119 }, { "epoch": 1.3953488372093024, "grad_norm": 1.0646424293518066, "learning_rate": 7.001933507974633e-05, "loss": 3.7195, "step": 120 }, { "epoch": 1.4069767441860466, "grad_norm": 0.9464161992073059, "learning_rate": 6.75736532893791e-05, "loss": 3.7267, "step": 121 }, { "epoch": 1.4186046511627908, "grad_norm": 1.1027458906173706, "learning_rate": 6.515896867701923e-05, "loss": 4.2282, "step": 122 }, { "epoch": 1.4302325581395348, "grad_norm": 0.9523140788078308, "learning_rate": 6.277618930679598e-05, "loss": 4.1755, "step": 123 }, { "epoch": 1.441860465116279, "grad_norm": 0.9763263463973999, "learning_rate": 6.04262112445821e-05, "loss": 4.1322, "step": 124 }, { "epoch": 1.4534883720930232, "grad_norm": 0.9657738208770752, "learning_rate": 5.8109918221019566e-05, "loss": 4.2455, "step": 125 }, { "epoch": 1.4534883720930232, "eval_loss": 0.9675049781799316, "eval_runtime": 8.0071, "eval_samples_per_second": 6.244, "eval_steps_per_second": 0.874, "step": 125 }, { "epoch": 1.4651162790697674, "grad_norm": 1.0695807933807373, "learning_rate": 5.582818129918524e-05, "loss": 4.2064, "step": 126 }, { "epoch": 1.4767441860465116, "grad_norm": 1.2192351818084717, "learning_rate": 5.358185854701909e-05, "loss": 4.4307, "step": 127 }, { "epoch": 1.4883720930232558, "grad_norm": 2.163886308670044, "learning_rate": 5.137179471464047e-05, "loss": 4.5119, "step": 128 }, { "epoch": 1.5, "grad_norm": 1.2242028713226318, "learning_rate": 4.9198820916671634e-05, "loss": 2.6797, "step": 129 }, { "epoch": 1.5116279069767442, "grad_norm": 1.0933971405029297, "learning_rate": 4.706375431968997e-05, "loss": 2.4876, "step": 130 }, { "epoch": 1.5232558139534884, "grad_norm": 1.007104754447937, "learning_rate": 4.4967397834924724e-05, "loss": 3.1256, "step": 131 }, { "epoch": 1.5348837209302326, "grad_norm": 0.9222277402877808, "learning_rate": 4.2910539816315164e-05, "loss": 3.4608, "step": 132 }, { "epoch": 1.5465116279069768, "grad_norm": 1.0027642250061035, "learning_rate": 4.089395376404269e-05, "loss": 3.692, "step": 133 }, { "epoch": 1.558139534883721, "grad_norm": 1.139125943183899, "learning_rate": 3.891839803364934e-05, "loss": 4.2357, "step": 134 }, { "epoch": 1.5697674418604652, "grad_norm": 1.028533697128296, "learning_rate": 3.698461555085089e-05, "loss": 4.1098, "step": 135 }, { "epoch": 1.5813953488372094, "grad_norm": 1.124190092086792, "learning_rate": 3.509333353215331e-05, "loss": 4.0042, "step": 136 }, { "epoch": 1.5930232558139537, "grad_norm": 1.042450189590454, "learning_rate": 3.324526321137599e-05, "loss": 3.7289, "step": 137 }, { "epoch": 1.6046511627906976, "grad_norm": 1.0798102617263794, "learning_rate": 3.144109957218612e-05, "loss": 3.8191, "step": 138 }, { "epoch": 1.6162790697674418, "grad_norm": 1.1347808837890625, "learning_rate": 2.9681521086743422e-05, "loss": 3.9489, "step": 139 }, { "epoch": 1.627906976744186, "grad_norm": 1.0719928741455078, "learning_rate": 2.7967189460554872e-05, "loss": 4.3926, "step": 140 }, { "epoch": 1.6395348837209303, "grad_norm": 0.9701864719390869, "learning_rate": 2.629874938363398e-05, "loss": 3.552, "step": 141 }, { "epoch": 1.6511627906976745, "grad_norm": 0.8670133948326111, "learning_rate": 2.4676828288059558e-05, "loss": 3.6456, "step": 142 }, { "epoch": 1.6627906976744184, "grad_norm": 0.9276437759399414, "learning_rate": 2.3102036112023836e-05, "loss": 3.3793, "step": 143 }, { "epoch": 1.6744186046511627, "grad_norm": 1.150744915008545, "learning_rate": 2.1574965070460043e-05, "loss": 3.6836, "step": 144 }, { "epoch": 1.6860465116279069, "grad_norm": 0.9892753958702087, "learning_rate": 2.009618943233419e-05, "loss": 4.1351, "step": 145 }, { "epoch": 1.697674418604651, "grad_norm": 0.9877732396125793, "learning_rate": 1.8666265304686383e-05, "loss": 4.105, "step": 146 }, { "epoch": 1.7093023255813953, "grad_norm": 1.0883934497833252, "learning_rate": 1.7285730423501327e-05, "loss": 4.1906, "step": 147 }, { "epoch": 1.7209302325581395, "grad_norm": 1.1771783828735352, "learning_rate": 1.5955103951488173e-05, "loss": 4.368, "step": 148 }, { "epoch": 1.7325581395348837, "grad_norm": 2.0767550468444824, "learning_rate": 1.467488628284434e-05, "loss": 5.0929, "step": 149 }, { "epoch": 1.744186046511628, "grad_norm": 0.9449374079704285, "learning_rate": 1.3445558855078014e-05, "loss": 2.7604, "step": 150 }, { "epoch": 1.744186046511628, "eval_loss": 0.970888078212738, "eval_runtime": 8.0092, "eval_samples_per_second": 6.243, "eval_steps_per_second": 0.874, "step": 150 }, { "epoch": 1.755813953488372, "grad_norm": 1.1184589862823486, "learning_rate": 1.2267583967958916e-05, "loss": 2.6164, "step": 151 }, { "epoch": 1.7674418604651163, "grad_norm": 0.9397222995758057, "learning_rate": 1.1141404609666449e-05, "loss": 3.2114, "step": 152 }, { "epoch": 1.7790697674418605, "grad_norm": 0.8842024803161621, "learning_rate": 1.0067444290199917e-05, "loss": 3.6752, "step": 153 }, { "epoch": 1.7906976744186047, "grad_norm": 0.9894306659698486, "learning_rate": 9.046106882113751e-06, "loss": 3.8772, "step": 154 }, { "epoch": 1.802325581395349, "grad_norm": 0.9347633123397827, "learning_rate": 8.07777646863746e-06, "loss": 3.9717, "step": 155 }, { "epoch": 1.8139534883720931, "grad_norm": 1.0060522556304932, "learning_rate": 7.1628171992377025e-06, "loss": 4.3449, "step": 156 }, { "epoch": 1.8255813953488373, "grad_norm": 0.9990627765655518, "learning_rate": 6.301573152676664e-06, "loss": 4.1012, "step": 157 }, { "epoch": 1.8372093023255816, "grad_norm": 1.0378891229629517, "learning_rate": 5.494368207617949e-06, "loss": 4.0339, "step": 158 }, { "epoch": 1.8488372093023255, "grad_norm": 1.0531872510910034, "learning_rate": 4.741505920829131e-06, "loss": 4.4799, "step": 159 }, { "epoch": 1.8604651162790697, "grad_norm": 0.9710641503334045, "learning_rate": 4.043269413026429e-06, "loss": 3.6334, "step": 160 }, { "epoch": 1.872093023255814, "grad_norm": 0.9414262771606445, "learning_rate": 3.3999212624046646e-06, "loss": 3.8207, "step": 161 }, { "epoch": 1.8837209302325582, "grad_norm": 0.8727117776870728, "learning_rate": 2.811703405892296e-06, "loss": 3.6237, "step": 162 }, { "epoch": 1.8953488372093024, "grad_norm": 0.8367646932601929, "learning_rate": 2.2788370481687965e-06, "loss": 3.5522, "step": 163 }, { "epoch": 1.9069767441860463, "grad_norm": 0.9638428688049316, "learning_rate": 1.801522578478648e-06, "loss": 3.7745, "step": 164 }, { "epoch": 1.9186046511627906, "grad_norm": 1.643754005432129, "learning_rate": 1.3799394952732024e-06, "loss": 4.3574, "step": 165 }, { "epoch": 1.9302325581395348, "grad_norm": 0.9748329520225525, "learning_rate": 1.0142463387085464e-06, "loss": 4.2242, "step": 166 }, { "epoch": 1.941860465116279, "grad_norm": 0.938869297504425, "learning_rate": 7.045806310251257e-07, "loss": 4.0438, "step": 167 }, { "epoch": 1.9534883720930232, "grad_norm": 1.0137250423431396, "learning_rate": 4.510588248311964e-07, "loss": 4.342, "step": 168 }, { "epoch": 1.9651162790697674, "grad_norm": 1.2118498086929321, "learning_rate": 2.5377625930977363e-07, "loss": 4.5787, "step": 169 }, { "epoch": 1.9767441860465116, "grad_norm": 2.177273750305176, "learning_rate": 1.1280712436549378e-07, "loss": 4.6907, "step": 170 }, { "epoch": 1.9883720930232558, "grad_norm": 0.9952888488769531, "learning_rate": 2.8204432724798775e-08, "loss": 3.898, "step": 171 }, { "epoch": 2.0, "grad_norm": 2.449486017227173, "learning_rate": 0.0, "loss": 5.0474, "step": 172 } ], "logging_steps": 1, "max_steps": 172, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 50, "stateful_callbacks": { "EarlyStoppingCallback": { "args": { "early_stopping_patience": 1, "early_stopping_threshold": 0.0 }, "attributes": { "early_stopping_patience_counter": 0 } }, "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 7.485613329088512e+16, "train_batch_size": 8, "trial_name": null, "trial_params": null }