{ "best_metric": null, "best_model_checkpoint": null, "epoch": 3.9535759096612297, "eval_steps": 100, "global_step": 1592, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "grad_norm": 0.1427878886461258, "learning_rate": 2e-05, "loss": 1.3359, "step": 1 }, { "epoch": 0.0, "eval_loss": 1.324710488319397, "eval_runtime": 82.0652, "eval_samples_per_second": 31.67, "eval_steps_per_second": 31.67, "step": 1 }, { "epoch": 0.01, "grad_norm": 0.13537771999835968, "learning_rate": 4e-05, "loss": 1.2865, "step": 2 }, { "epoch": 0.01, "grad_norm": 0.14623422920703888, "learning_rate": 6e-05, "loss": 1.3192, "step": 3 }, { "epoch": 0.01, "grad_norm": 0.15388095378875732, "learning_rate": 8e-05, "loss": 1.3244, "step": 4 }, { "epoch": 0.01, "grad_norm": 0.1628686636686325, "learning_rate": 0.0001, "loss": 1.3, "step": 5 }, { "epoch": 0.02, "grad_norm": 0.20623335242271423, "learning_rate": 0.00012, "loss": 1.244, "step": 6 }, { "epoch": 0.02, "grad_norm": 0.1510678231716156, "learning_rate": 0.00014, "loss": 1.2799, "step": 7 }, { "epoch": 0.02, "grad_norm": 0.15237094461917877, "learning_rate": 0.00016, "loss": 1.2979, "step": 8 }, { "epoch": 0.02, "grad_norm": 0.15166334807872772, "learning_rate": 0.00018, "loss": 1.28, "step": 9 }, { "epoch": 0.03, "grad_norm": 0.17794868350028992, "learning_rate": 0.0002, "loss": 1.2298, "step": 10 }, { "epoch": 0.03, "grad_norm": 0.25811436772346497, "learning_rate": 0.0001999998028228211, "loss": 1.1909, "step": 11 }, { "epoch": 0.03, "grad_norm": 0.19142530858516693, "learning_rate": 0.000199999211292062, "loss": 1.178, "step": 12 }, { "epoch": 0.03, "grad_norm": 0.1891462802886963, "learning_rate": 0.00019999822541005537, "loss": 1.1173, "step": 13 }, { "epoch": 0.04, "grad_norm": 0.17077742516994476, "learning_rate": 0.00019999684518068916, "loss": 1.2092, "step": 14 }, { "epoch": 0.04, "grad_norm": 0.15135815739631653, "learning_rate": 0.00019999507060940625, "loss": 1.1439, "step": 15 }, { "epoch": 0.04, "grad_norm": 0.1767009049654007, "learning_rate": 0.00019999290170320485, "loss": 1.1408, "step": 16 }, { "epoch": 0.04, "grad_norm": 0.1310850977897644, "learning_rate": 0.00019999033847063811, "loss": 1.2369, "step": 17 }, { "epoch": 0.05, "grad_norm": 0.12432192265987396, "learning_rate": 0.00019998738092181421, "loss": 1.152, "step": 18 }, { "epoch": 0.05, "grad_norm": 0.12430022656917572, "learning_rate": 0.00019998402906839643, "loss": 1.2111, "step": 19 }, { "epoch": 0.05, "grad_norm": 0.12175025045871735, "learning_rate": 0.00019998028292360286, "loss": 1.1686, "step": 20 }, { "epoch": 0.05, "grad_norm": 0.11878372728824615, "learning_rate": 0.0001999761425022067, "loss": 1.2452, "step": 21 }, { "epoch": 0.06, "grad_norm": 0.11329779773950577, "learning_rate": 0.00019997160782053578, "loss": 1.0964, "step": 22 }, { "epoch": 0.06, "grad_norm": 0.11987729370594025, "learning_rate": 0.00019996667889647288, "loss": 1.1809, "step": 23 }, { "epoch": 0.06, "grad_norm": 0.12245086580514908, "learning_rate": 0.00019996135574945544, "loss": 1.1138, "step": 24 }, { "epoch": 0.06, "grad_norm": 0.1399640142917633, "learning_rate": 0.00019995563840047542, "loss": 1.184, "step": 25 }, { "epoch": 0.07, "grad_norm": 0.13597123324871063, "learning_rate": 0.00019994952687207954, "loss": 1.1872, "step": 26 }, { "epoch": 0.07, "grad_norm": 0.13976556062698364, "learning_rate": 0.00019994302118836883, "loss": 1.1685, "step": 27 }, { "epoch": 0.07, "grad_norm": 0.13106240332126617, "learning_rate": 0.00019993612137499876, "loss": 1.1872, "step": 28 }, { "epoch": 0.07, "grad_norm": 0.12896399199962616, "learning_rate": 0.00019992882745917902, "loss": 1.1462, "step": 29 }, { "epoch": 0.08, "grad_norm": 0.13873620331287384, "learning_rate": 0.00019992113946967353, "loss": 1.1742, "step": 30 }, { "epoch": 0.08, "grad_norm": 0.14103546738624573, "learning_rate": 0.00019991305743680013, "loss": 1.1245, "step": 31 }, { "epoch": 0.08, "grad_norm": 0.1377720981836319, "learning_rate": 0.00019990458139243077, "loss": 1.2045, "step": 32 }, { "epoch": 0.08, "grad_norm": 0.13191157579421997, "learning_rate": 0.000199895711369991, "loss": 1.1716, "step": 33 }, { "epoch": 0.09, "grad_norm": 0.13426551222801208, "learning_rate": 0.00019988644740446022, "loss": 1.1382, "step": 34 }, { "epoch": 0.09, "grad_norm": 0.13733097910881042, "learning_rate": 0.00019987678953237127, "loss": 1.1677, "step": 35 }, { "epoch": 0.09, "grad_norm": 0.12618272006511688, "learning_rate": 0.00019986673779181033, "loss": 1.2195, "step": 36 }, { "epoch": 0.09, "grad_norm": 0.13636991381645203, "learning_rate": 0.00019985629222241694, "loss": 1.1577, "step": 37 }, { "epoch": 0.1, "grad_norm": 0.13234035670757294, "learning_rate": 0.0001998454528653836, "loss": 1.1089, "step": 38 }, { "epoch": 0.1, "grad_norm": 0.1395445317029953, "learning_rate": 0.00019983421976345586, "loss": 1.139, "step": 39 }, { "epoch": 0.1, "grad_norm": 0.1284484714269638, "learning_rate": 0.0001998225929609319, "loss": 1.117, "step": 40 }, { "epoch": 0.1, "grad_norm": 0.13304275274276733, "learning_rate": 0.00019981057250366253, "loss": 1.161, "step": 41 }, { "epoch": 0.11, "grad_norm": 0.13184913992881775, "learning_rate": 0.00019979815843905097, "loss": 1.1826, "step": 42 }, { "epoch": 0.11, "grad_norm": 0.12830235064029694, "learning_rate": 0.0001997853508160526, "loss": 1.0739, "step": 43 }, { "epoch": 0.11, "grad_norm": 0.1346379965543747, "learning_rate": 0.0001997721496851748, "loss": 1.191, "step": 44 }, { "epoch": 0.11, "grad_norm": 0.13036642968654633, "learning_rate": 0.00019975855509847686, "loss": 1.1361, "step": 45 }, { "epoch": 0.12, "grad_norm": 0.12707848846912384, "learning_rate": 0.00019974456710956964, "loss": 1.101, "step": 46 }, { "epoch": 0.12, "grad_norm": 0.12984970211982727, "learning_rate": 0.00019973018577361536, "loss": 1.1085, "step": 47 }, { "epoch": 0.12, "grad_norm": 0.12627972662448883, "learning_rate": 0.00019971541114732741, "loss": 1.1607, "step": 48 }, { "epoch": 0.12, "grad_norm": 0.13074152171611786, "learning_rate": 0.00019970024328897022, "loss": 1.1004, "step": 49 }, { "epoch": 0.13, "grad_norm": 0.1309152990579605, "learning_rate": 0.0001996846822583589, "loss": 1.1378, "step": 50 }, { "epoch": 0.13, "grad_norm": 0.1303664743900299, "learning_rate": 0.000199668728116859, "loss": 1.0956, "step": 51 }, { "epoch": 0.13, "grad_norm": 0.13290388882160187, "learning_rate": 0.00019965238092738643, "loss": 1.1264, "step": 52 }, { "epoch": 0.13, "grad_norm": 0.12805409729480743, "learning_rate": 0.00019963564075440703, "loss": 1.183, "step": 53 }, { "epoch": 0.14, "grad_norm": 0.1399564892053604, "learning_rate": 0.0001996185076639364, "loss": 1.1102, "step": 54 }, { "epoch": 0.14, "grad_norm": 0.12978173792362213, "learning_rate": 0.00019960098172353962, "loss": 1.1634, "step": 55 }, { "epoch": 0.14, "grad_norm": 0.13925811648368835, "learning_rate": 0.00019958306300233098, "loss": 1.0636, "step": 56 }, { "epoch": 0.14, "grad_norm": 0.13258852064609528, "learning_rate": 0.00019956475157097378, "loss": 1.1428, "step": 57 }, { "epoch": 0.15, "grad_norm": 0.1285356879234314, "learning_rate": 0.00019954604750167993, "loss": 1.1664, "step": 58 }, { "epoch": 0.15, "grad_norm": 0.1321210116147995, "learning_rate": 0.00019952695086820975, "loss": 1.1419, "step": 59 }, { "epoch": 0.15, "grad_norm": 0.14086973667144775, "learning_rate": 0.00019950746174587163, "loss": 1.1827, "step": 60 }, { "epoch": 0.15, "grad_norm": 0.1311366856098175, "learning_rate": 0.0001994875802115218, "loss": 1.1971, "step": 61 }, { "epoch": 0.16, "grad_norm": 0.14063993096351624, "learning_rate": 0.0001994673063435639, "loss": 1.1945, "step": 62 }, { "epoch": 0.16, "grad_norm": 0.12695981562137604, "learning_rate": 0.00019944664022194885, "loss": 1.0385, "step": 63 }, { "epoch": 0.16, "grad_norm": 0.14170674979686737, "learning_rate": 0.0001994255819281744, "loss": 1.0883, "step": 64 }, { "epoch": 0.16, "grad_norm": 0.13162197172641754, "learning_rate": 0.0001994041315452849, "loss": 1.153, "step": 65 }, { "epoch": 0.17, "grad_norm": 0.1326906979084015, "learning_rate": 0.0001993822891578708, "loss": 1.1186, "step": 66 }, { "epoch": 0.17, "grad_norm": 0.13306689262390137, "learning_rate": 0.00019936005485206851, "loss": 1.1587, "step": 67 }, { "epoch": 0.17, "grad_norm": 0.13625258207321167, "learning_rate": 0.00019933742871556, "loss": 1.1339, "step": 68 }, { "epoch": 0.17, "grad_norm": 0.13773800432682037, "learning_rate": 0.00019931441083757245, "loss": 1.1944, "step": 69 }, { "epoch": 0.18, "grad_norm": 0.15291447937488556, "learning_rate": 0.00019929100130887782, "loss": 1.1028, "step": 70 }, { "epoch": 0.18, "grad_norm": 0.15140767395496368, "learning_rate": 0.0001992672002217926, "loss": 1.1896, "step": 71 }, { "epoch": 0.18, "grad_norm": 0.1344233751296997, "learning_rate": 0.0001992430076701775, "loss": 1.0561, "step": 72 }, { "epoch": 0.18, "grad_norm": 0.13877920806407928, "learning_rate": 0.0001992184237494368, "loss": 1.1108, "step": 73 }, { "epoch": 0.19, "grad_norm": 0.1359027922153473, "learning_rate": 0.00019919344855651833, "loss": 1.1563, "step": 74 }, { "epoch": 0.19, "grad_norm": 0.14610135555267334, "learning_rate": 0.0001991680821899128, "loss": 1.1299, "step": 75 }, { "epoch": 0.19, "grad_norm": 0.14259958267211914, "learning_rate": 0.00019914232474965365, "loss": 1.1021, "step": 76 }, { "epoch": 0.19, "grad_norm": 0.14158602058887482, "learning_rate": 0.00019911617633731638, "loss": 1.0787, "step": 77 }, { "epoch": 0.2, "grad_norm": 0.1418074518442154, "learning_rate": 0.00019908963705601846, "loss": 1.1359, "step": 78 }, { "epoch": 0.2, "grad_norm": 0.12850767374038696, "learning_rate": 0.0001990627070104187, "loss": 1.1373, "step": 79 }, { "epoch": 0.2, "grad_norm": 0.1312914341688156, "learning_rate": 0.0001990353863067169, "loss": 1.0832, "step": 80 }, { "epoch": 0.2, "grad_norm": 0.13280583918094635, "learning_rate": 0.0001990076750526534, "loss": 1.0462, "step": 81 }, { "epoch": 0.21, "grad_norm": 0.13617292046546936, "learning_rate": 0.00019897957335750878, "loss": 1.1059, "step": 82 }, { "epoch": 0.21, "grad_norm": 0.15030132234096527, "learning_rate": 0.00019895108133210335, "loss": 1.0761, "step": 83 }, { "epoch": 0.21, "grad_norm": 0.14291270077228546, "learning_rate": 0.00019892219908879653, "loss": 1.1217, "step": 84 }, { "epoch": 0.21, "grad_norm": 0.1685461699962616, "learning_rate": 0.00019889292674148682, "loss": 1.1607, "step": 85 }, { "epoch": 0.22, "grad_norm": 0.13756121695041656, "learning_rate": 0.00019886326440561093, "loss": 1.0914, "step": 86 }, { "epoch": 0.22, "grad_norm": 0.13901358842849731, "learning_rate": 0.0001988332121981436, "loss": 1.1234, "step": 87 }, { "epoch": 0.22, "grad_norm": 0.13816247880458832, "learning_rate": 0.00019880277023759702, "loss": 1.1583, "step": 88 }, { "epoch": 0.22, "grad_norm": 0.13309679925441742, "learning_rate": 0.00019877193864402038, "loss": 1.163, "step": 89 }, { "epoch": 0.23, "grad_norm": 0.13356180489063263, "learning_rate": 0.0001987407175389994, "loss": 1.1301, "step": 90 }, { "epoch": 0.23, "grad_norm": 0.1388397067785263, "learning_rate": 0.00019870910704565588, "loss": 1.1326, "step": 91 }, { "epoch": 0.23, "grad_norm": 0.13303454220294952, "learning_rate": 0.0001986771072886472, "loss": 1.0779, "step": 92 }, { "epoch": 0.23, "grad_norm": 0.1316283941268921, "learning_rate": 0.00019864471839416576, "loss": 1.0935, "step": 93 }, { "epoch": 0.24, "grad_norm": 0.1348309963941574, "learning_rate": 0.00019861194048993863, "loss": 1.1816, "step": 94 }, { "epoch": 0.24, "grad_norm": 0.1341564655303955, "learning_rate": 0.00019857877370522685, "loss": 1.1187, "step": 95 }, { "epoch": 0.24, "grad_norm": 0.13689687848091125, "learning_rate": 0.0001985452181708251, "loss": 1.1637, "step": 96 }, { "epoch": 0.24, "grad_norm": 0.13348707556724548, "learning_rate": 0.0001985112740190611, "loss": 1.1026, "step": 97 }, { "epoch": 0.25, "grad_norm": 0.13700643181800842, "learning_rate": 0.00019847694138379506, "loss": 1.1508, "step": 98 }, { "epoch": 0.25, "grad_norm": 0.13654476404190063, "learning_rate": 0.00019844222040041928, "loss": 1.1668, "step": 99 }, { "epoch": 0.25, "grad_norm": 0.15331624448299408, "learning_rate": 0.0001984071112058574, "loss": 1.1121, "step": 100 }, { "epoch": 0.25, "eval_loss": 1.1294280290603638, "eval_runtime": 81.6595, "eval_samples_per_second": 31.827, "eval_steps_per_second": 31.827, "step": 100 }, { "epoch": 0.25, "grad_norm": 0.14425526559352875, "learning_rate": 0.0001983716139385641, "loss": 1.1447, "step": 101 }, { "epoch": 0.26, "grad_norm": 0.13741208612918854, "learning_rate": 0.00019833572873852444, "loss": 1.1001, "step": 102 }, { "epoch": 0.26, "grad_norm": 0.1282232254743576, "learning_rate": 0.0001982994557472532, "loss": 1.1199, "step": 103 }, { "epoch": 0.26, "grad_norm": 0.13605354726314545, "learning_rate": 0.00019826279510779454, "loss": 1.154, "step": 104 }, { "epoch": 0.26, "grad_norm": 0.13503985106945038, "learning_rate": 0.00019822574696472126, "loss": 1.0565, "step": 105 }, { "epoch": 0.27, "grad_norm": 0.13878273963928223, "learning_rate": 0.00019818831146413434, "loss": 1.106, "step": 106 }, { "epoch": 0.27, "grad_norm": 0.141740083694458, "learning_rate": 0.00019815048875366234, "loss": 1.0848, "step": 107 }, { "epoch": 0.27, "grad_norm": 0.13799507915973663, "learning_rate": 0.0001981122789824607, "loss": 1.1582, "step": 108 }, { "epoch": 0.27, "grad_norm": 0.1441466212272644, "learning_rate": 0.0001980736823012114, "loss": 1.0787, "step": 109 }, { "epoch": 0.28, "grad_norm": 0.1377534121274948, "learning_rate": 0.0001980346988621221, "loss": 1.1092, "step": 110 }, { "epoch": 0.28, "grad_norm": 0.1400901973247528, "learning_rate": 0.00019799532881892564, "loss": 1.0549, "step": 111 }, { "epoch": 0.28, "grad_norm": 0.13621239364147186, "learning_rate": 0.00019795557232687956, "loss": 1.0991, "step": 112 }, { "epoch": 0.28, "grad_norm": 0.1324262171983719, "learning_rate": 0.0001979154295427653, "loss": 1.0583, "step": 113 }, { "epoch": 0.29, "grad_norm": 0.13273654878139496, "learning_rate": 0.0001978749006248877, "loss": 1.1504, "step": 114 }, { "epoch": 0.29, "grad_norm": 0.14279481768608093, "learning_rate": 0.00019783398573307428, "loss": 1.0941, "step": 115 }, { "epoch": 0.29, "grad_norm": 0.1432316154241562, "learning_rate": 0.00019779268502867473, "loss": 1.1111, "step": 116 }, { "epoch": 0.29, "grad_norm": 0.14505276083946228, "learning_rate": 0.00019775099867456013, "loss": 1.0941, "step": 117 }, { "epoch": 0.3, "grad_norm": 0.13935014605522156, "learning_rate": 0.0001977089268351225, "loss": 1.0597, "step": 118 }, { "epoch": 0.3, "grad_norm": 0.14532430469989777, "learning_rate": 0.0001976664696762739, "loss": 1.1116, "step": 119 }, { "epoch": 0.3, "grad_norm": 0.14096760749816895, "learning_rate": 0.00019762362736544607, "loss": 1.1381, "step": 120 }, { "epoch": 0.3, "grad_norm": 0.1470746099948883, "learning_rate": 0.00019758040007158948, "loss": 1.1215, "step": 121 }, { "epoch": 0.31, "grad_norm": 0.13610850274562836, "learning_rate": 0.00019753678796517282, "loss": 1.136, "step": 122 }, { "epoch": 0.31, "grad_norm": 0.1399529129266739, "learning_rate": 0.00019749279121818235, "loss": 1.1035, "step": 123 }, { "epoch": 0.31, "grad_norm": 0.13626012206077576, "learning_rate": 0.00019744841000412123, "loss": 1.1248, "step": 124 }, { "epoch": 0.31, "grad_norm": 0.13053762912750244, "learning_rate": 0.0001974036444980086, "loss": 1.1286, "step": 125 }, { "epoch": 0.32, "grad_norm": 0.14427675306797028, "learning_rate": 0.00019735849487637929, "loss": 1.2792, "step": 126 }, { "epoch": 0.32, "grad_norm": 0.14464688301086426, "learning_rate": 0.0001973129613172827, "loss": 1.1091, "step": 127 }, { "epoch": 0.32, "grad_norm": 0.12712322175502777, "learning_rate": 0.0001972670440002825, "loss": 1.1219, "step": 128 }, { "epoch": 0.32, "grad_norm": 0.13343971967697144, "learning_rate": 0.00019722074310645553, "loss": 1.1296, "step": 129 }, { "epoch": 0.33, "grad_norm": 0.15525247156620026, "learning_rate": 0.00019717405881839145, "loss": 1.159, "step": 130 }, { "epoch": 0.33, "grad_norm": 0.12908953428268433, "learning_rate": 0.0001971269913201918, "loss": 1.0821, "step": 131 }, { "epoch": 0.33, "grad_norm": 0.24165280163288116, "learning_rate": 0.00019707954079746927, "loss": 1.1388, "step": 132 }, { "epoch": 0.33, "grad_norm": 0.1432817280292511, "learning_rate": 0.00019703170743734706, "loss": 1.1184, "step": 133 }, { "epoch": 0.34, "grad_norm": 0.14007362723350525, "learning_rate": 0.00019698349142845814, "loss": 1.1576, "step": 134 }, { "epoch": 0.34, "grad_norm": 0.14235983788967133, "learning_rate": 0.00019693489296094443, "loss": 1.0847, "step": 135 }, { "epoch": 0.34, "grad_norm": 0.1430092453956604, "learning_rate": 0.00019688591222645607, "loss": 1.1562, "step": 136 }, { "epoch": 0.34, "grad_norm": 0.13986627757549286, "learning_rate": 0.00019683654941815077, "loss": 1.124, "step": 137 }, { "epoch": 0.35, "grad_norm": 0.13933469355106354, "learning_rate": 0.00019678680473069293, "loss": 1.1001, "step": 138 }, { "epoch": 0.35, "grad_norm": 0.13476844131946564, "learning_rate": 0.00019673667836025283, "loss": 1.1186, "step": 139 }, { "epoch": 0.35, "grad_norm": 0.13418316841125488, "learning_rate": 0.00019668617050450603, "loss": 1.1309, "step": 140 }, { "epoch": 0.35, "grad_norm": 0.12794847786426544, "learning_rate": 0.00019663528136263246, "loss": 1.1142, "step": 141 }, { "epoch": 0.36, "grad_norm": 0.1326293647289276, "learning_rate": 0.00019658401113531565, "loss": 1.0503, "step": 142 }, { "epoch": 0.36, "grad_norm": 0.14793147146701813, "learning_rate": 0.000196532360024742, "loss": 1.2104, "step": 143 }, { "epoch": 0.36, "grad_norm": 0.13718444108963013, "learning_rate": 0.00019648032823459994, "loss": 1.1685, "step": 144 }, { "epoch": 0.36, "grad_norm": 0.14404018223285675, "learning_rate": 0.00019642791597007902, "loss": 1.09, "step": 145 }, { "epoch": 0.37, "grad_norm": 0.14241506159305573, "learning_rate": 0.00019637512343786937, "loss": 1.1355, "step": 146 }, { "epoch": 0.37, "grad_norm": 0.14581352472305298, "learning_rate": 0.00019632195084616063, "loss": 1.1005, "step": 147 }, { "epoch": 0.37, "grad_norm": 0.14792676270008087, "learning_rate": 0.00019626839840464119, "loss": 1.1168, "step": 148 }, { "epoch": 0.37, "grad_norm": 0.1484677940607071, "learning_rate": 0.00019621446632449744, "loss": 1.1138, "step": 149 }, { "epoch": 0.38, "grad_norm": 0.15315671265125275, "learning_rate": 0.0001961601548184129, "loss": 1.1636, "step": 150 }, { "epoch": 0.38, "grad_norm": 0.14746810495853424, "learning_rate": 0.0001961054641005674, "loss": 1.0881, "step": 151 }, { "epoch": 0.38, "grad_norm": 0.1407732516527176, "learning_rate": 0.00019605039438663614, "loss": 1.0347, "step": 152 }, { "epoch": 0.38, "grad_norm": 0.14150719344615936, "learning_rate": 0.0001959949458937889, "loss": 1.1112, "step": 153 }, { "epoch": 0.39, "grad_norm": 0.16782569885253906, "learning_rate": 0.0001959391188406893, "loss": 1.0496, "step": 154 }, { "epoch": 0.39, "grad_norm": 0.1452791690826416, "learning_rate": 0.0001958829134474937, "loss": 1.1185, "step": 155 }, { "epoch": 0.39, "grad_norm": 0.145284965634346, "learning_rate": 0.00019582632993585052, "loss": 1.1431, "step": 156 }, { "epoch": 0.39, "grad_norm": 0.15500612556934357, "learning_rate": 0.00019576936852889936, "loss": 1.1679, "step": 157 }, { "epoch": 0.4, "grad_norm": 0.1416521966457367, "learning_rate": 0.00019571202945126994, "loss": 1.1322, "step": 158 }, { "epoch": 0.4, "grad_norm": 0.1465340405702591, "learning_rate": 0.00019565431292908146, "loss": 1.0693, "step": 159 }, { "epoch": 0.4, "grad_norm": 0.13601765036582947, "learning_rate": 0.0001955962191899415, "loss": 1.0676, "step": 160 }, { "epoch": 0.4, "grad_norm": 0.14759162068367004, "learning_rate": 0.0001955377484629453, "loss": 1.0506, "step": 161 }, { "epoch": 0.41, "grad_norm": 0.14839032292366028, "learning_rate": 0.00019547890097867468, "loss": 1.1245, "step": 162 }, { "epoch": 0.41, "grad_norm": 0.1440214365720749, "learning_rate": 0.0001954196769691973, "loss": 1.1672, "step": 163 }, { "epoch": 0.41, "grad_norm": 0.1372719258069992, "learning_rate": 0.00019536007666806556, "loss": 1.1084, "step": 164 }, { "epoch": 0.41, "grad_norm": 0.14372558891773224, "learning_rate": 0.00019530010031031586, "loss": 1.1679, "step": 165 }, { "epoch": 0.42, "grad_norm": 0.13789264857769012, "learning_rate": 0.00019523974813246767, "loss": 1.1253, "step": 166 }, { "epoch": 0.42, "grad_norm": 0.14368915557861328, "learning_rate": 0.0001951790203725223, "loss": 1.085, "step": 167 }, { "epoch": 0.42, "grad_norm": 0.1380469799041748, "learning_rate": 0.00019511791726996243, "loss": 1.1379, "step": 168 }, { "epoch": 0.42, "grad_norm": 0.13288158178329468, "learning_rate": 0.00019505643906575073, "loss": 1.113, "step": 169 }, { "epoch": 0.43, "grad_norm": 0.1390606164932251, "learning_rate": 0.0001949945860023292, "loss": 1.095, "step": 170 }, { "epoch": 0.43, "grad_norm": 0.14271940290927887, "learning_rate": 0.0001949323583236181, "loss": 1.1063, "step": 171 }, { "epoch": 0.43, "grad_norm": 0.13795693218708038, "learning_rate": 0.00019486975627501502, "loss": 1.0628, "step": 172 }, { "epoch": 0.43, "grad_norm": 0.14073535799980164, "learning_rate": 0.0001948067801033938, "loss": 1.1192, "step": 173 }, { "epoch": 0.44, "grad_norm": 0.138822540640831, "learning_rate": 0.0001947434300571038, "loss": 1.1299, "step": 174 }, { "epoch": 0.44, "grad_norm": 0.13592712581157684, "learning_rate": 0.0001946797063859686, "loss": 1.0868, "step": 175 }, { "epoch": 0.44, "grad_norm": 0.1379610300064087, "learning_rate": 0.00019461560934128533, "loss": 1.069, "step": 176 }, { "epoch": 0.44, "grad_norm": 0.14286787807941437, "learning_rate": 0.00019455113917582346, "loss": 1.139, "step": 177 }, { "epoch": 0.45, "grad_norm": 0.14168201386928558, "learning_rate": 0.0001944862961438239, "loss": 1.1405, "step": 178 }, { "epoch": 0.45, "grad_norm": 0.1345077008008957, "learning_rate": 0.000194421080500998, "loss": 1.1039, "step": 179 }, { "epoch": 0.45, "grad_norm": 0.1363426297903061, "learning_rate": 0.00019435549250452645, "loss": 1.1056, "step": 180 }, { "epoch": 0.45, "grad_norm": 0.14109478890895844, "learning_rate": 0.00019428953241305838, "loss": 1.0927, "step": 181 }, { "epoch": 0.46, "grad_norm": 0.14332321286201477, "learning_rate": 0.0001942232004867103, "loss": 1.0305, "step": 182 }, { "epoch": 0.46, "grad_norm": 0.15956294536590576, "learning_rate": 0.00019415649698706507, "loss": 1.1245, "step": 183 }, { "epoch": 0.46, "grad_norm": 0.14164718985557556, "learning_rate": 0.0001940894221771708, "loss": 1.0963, "step": 184 }, { "epoch": 0.46, "grad_norm": 0.14296875894069672, "learning_rate": 0.00019402197632153992, "loss": 1.0853, "step": 185 }, { "epoch": 0.47, "grad_norm": 0.12994709610939026, "learning_rate": 0.00019395415968614813, "loss": 1.0503, "step": 186 }, { "epoch": 0.47, "grad_norm": 0.1399766504764557, "learning_rate": 0.00019388597253843334, "loss": 1.0623, "step": 187 }, { "epoch": 0.47, "grad_norm": 0.14874404668807983, "learning_rate": 0.00019381741514729443, "loss": 1.0885, "step": 188 }, { "epoch": 0.47, "grad_norm": 0.1453857719898224, "learning_rate": 0.00019374848778309055, "loss": 1.1702, "step": 189 }, { "epoch": 0.48, "grad_norm": 0.14976643025875092, "learning_rate": 0.0001936791907176397, "loss": 1.0834, "step": 190 }, { "epoch": 0.48, "grad_norm": 0.1418897956609726, "learning_rate": 0.00019360952422421793, "loss": 1.0918, "step": 191 }, { "epoch": 0.48, "grad_norm": 0.14602817595005035, "learning_rate": 0.00019353948857755803, "loss": 1.0825, "step": 192 }, { "epoch": 0.48, "grad_norm": 0.14669157564640045, "learning_rate": 0.00019346908405384867, "loss": 1.0973, "step": 193 }, { "epoch": 0.49, "grad_norm": 0.14327263832092285, "learning_rate": 0.00019339831093073318, "loss": 1.1191, "step": 194 }, { "epoch": 0.49, "grad_norm": 0.13806897401809692, "learning_rate": 0.0001933271694873084, "loss": 1.1504, "step": 195 }, { "epoch": 0.49, "grad_norm": 0.13992969691753387, "learning_rate": 0.00019325566000412376, "loss": 1.0865, "step": 196 }, { "epoch": 0.49, "grad_norm": 0.14395759999752045, "learning_rate": 0.00019318378276318, "loss": 1.1204, "step": 197 }, { "epoch": 0.5, "grad_norm": 0.1409691572189331, "learning_rate": 0.0001931115380479281, "loss": 1.0766, "step": 198 }, { "epoch": 0.5, "grad_norm": 0.1448824405670166, "learning_rate": 0.00019303892614326836, "loss": 1.1741, "step": 199 }, { "epoch": 0.5, "grad_norm": 0.142364963889122, "learning_rate": 0.00019296594733554892, "loss": 1.1716, "step": 200 }, { "epoch": 0.5, "eval_loss": 1.109603762626648, "eval_runtime": 81.7249, "eval_samples_per_second": 31.802, "eval_steps_per_second": 31.802, "step": 200 }, { "epoch": 0.5, "grad_norm": 0.1372615098953247, "learning_rate": 0.00019289260191256483, "loss": 1.1084, "step": 201 }, { "epoch": 0.51, "grad_norm": 0.13863563537597656, "learning_rate": 0.0001928188901635571, "loss": 1.0546, "step": 202 }, { "epoch": 0.51, "grad_norm": 0.13055531680583954, "learning_rate": 0.00019274481237921114, "loss": 1.018, "step": 203 }, { "epoch": 0.51, "grad_norm": 0.14135099947452545, "learning_rate": 0.00019267036885165588, "loss": 1.1131, "step": 204 }, { "epoch": 0.51, "grad_norm": 0.14308464527130127, "learning_rate": 0.0001925955598744627, "loss": 1.0723, "step": 205 }, { "epoch": 0.52, "grad_norm": 0.13907764852046967, "learning_rate": 0.00019252038574264405, "loss": 1.1607, "step": 206 }, { "epoch": 0.52, "grad_norm": 0.13771073520183563, "learning_rate": 0.00019244484675265232, "loss": 1.172, "step": 207 }, { "epoch": 0.52, "grad_norm": 0.13774815201759338, "learning_rate": 0.00019236894320237894, "loss": 1.0622, "step": 208 }, { "epoch": 0.52, "grad_norm": 0.1426474153995514, "learning_rate": 0.0001922926753911527, "loss": 1.0368, "step": 209 }, { "epoch": 0.53, "grad_norm": 0.1380661278963089, "learning_rate": 0.00019221604361973919, "loss": 1.0873, "step": 210 }, { "epoch": 0.53, "grad_norm": 0.14044702053070068, "learning_rate": 0.00019213904819033903, "loss": 1.0901, "step": 211 }, { "epoch": 0.53, "grad_norm": 0.1415887176990509, "learning_rate": 0.00019206168940658712, "loss": 1.1061, "step": 212 }, { "epoch": 0.53, "grad_norm": 0.1580592840909958, "learning_rate": 0.00019198396757355118, "loss": 1.1073, "step": 213 }, { "epoch": 0.54, "grad_norm": 0.14094668626785278, "learning_rate": 0.00019190588299773062, "loss": 1.1781, "step": 214 }, { "epoch": 0.54, "grad_norm": 0.14229640364646912, "learning_rate": 0.00019182743598705542, "loss": 1.1095, "step": 215 }, { "epoch": 0.54, "grad_norm": 0.140314981341362, "learning_rate": 0.00019174862685088472, "loss": 1.1534, "step": 216 }, { "epoch": 0.54, "grad_norm": 0.160028338432312, "learning_rate": 0.00019166945590000584, "loss": 1.087, "step": 217 }, { "epoch": 0.55, "grad_norm": 0.14278572797775269, "learning_rate": 0.0001915899234466328, "loss": 1.1583, "step": 218 }, { "epoch": 0.55, "grad_norm": 0.13695856928825378, "learning_rate": 0.0001915100298044054, "loss": 1.1151, "step": 219 }, { "epoch": 0.55, "grad_norm": 0.14235751330852509, "learning_rate": 0.00019142977528838762, "loss": 1.1111, "step": 220 }, { "epoch": 0.55, "grad_norm": 0.15174664556980133, "learning_rate": 0.00019134916021506666, "loss": 1.1438, "step": 221 }, { "epoch": 0.56, "grad_norm": 0.15249325335025787, "learning_rate": 0.0001912681849023516, "loss": 1.1575, "step": 222 }, { "epoch": 0.56, "grad_norm": 0.14303787052631378, "learning_rate": 0.00019118684966957207, "loss": 1.1302, "step": 223 }, { "epoch": 0.56, "grad_norm": 0.1405183970928192, "learning_rate": 0.00019110515483747716, "loss": 1.1157, "step": 224 }, { "epoch": 0.56, "grad_norm": 0.1475205421447754, "learning_rate": 0.00019102310072823393, "loss": 1.1175, "step": 225 }, { "epoch": 0.57, "grad_norm": 0.14406634867191315, "learning_rate": 0.0001909406876654264, "loss": 1.0578, "step": 226 }, { "epoch": 0.57, "grad_norm": 0.13999773561954498, "learning_rate": 0.00019085791597405404, "loss": 1.0865, "step": 227 }, { "epoch": 0.57, "grad_norm": 0.1409848928451538, "learning_rate": 0.00019077478598053063, "loss": 1.1297, "step": 228 }, { "epoch": 0.57, "grad_norm": 0.14548417925834656, "learning_rate": 0.00019069129801268294, "loss": 1.1524, "step": 229 }, { "epoch": 0.58, "grad_norm": 0.13622736930847168, "learning_rate": 0.00019060745239974936, "loss": 1.0744, "step": 230 }, { "epoch": 0.58, "grad_norm": 0.14302954077720642, "learning_rate": 0.0001905232494723788, "loss": 1.1469, "step": 231 }, { "epoch": 0.58, "grad_norm": 0.15202221274375916, "learning_rate": 0.0001904386895626291, "loss": 1.0693, "step": 232 }, { "epoch": 0.58, "grad_norm": 0.14072120189666748, "learning_rate": 0.00019035377300396597, "loss": 1.0584, "step": 233 }, { "epoch": 0.59, "grad_norm": 0.13941141963005066, "learning_rate": 0.00019026850013126157, "loss": 1.1257, "step": 234 }, { "epoch": 0.59, "grad_norm": 0.1389845460653305, "learning_rate": 0.0001901828712807932, "loss": 1.0003, "step": 235 }, { "epoch": 0.59, "grad_norm": 0.1431329846382141, "learning_rate": 0.0001900968867902419, "loss": 1.0795, "step": 236 }, { "epoch": 0.59, "grad_norm": 0.15022633969783783, "learning_rate": 0.00019001054699869133, "loss": 1.1427, "step": 237 }, { "epoch": 0.6, "grad_norm": 0.1578160673379898, "learning_rate": 0.00018992385224662623, "loss": 1.13, "step": 238 }, { "epoch": 0.6, "grad_norm": 0.13778769969940186, "learning_rate": 0.00018983680287593105, "loss": 1.0739, "step": 239 }, { "epoch": 0.6, "grad_norm": 0.1454969048500061, "learning_rate": 0.00018974939922988883, "loss": 1.0864, "step": 240 }, { "epoch": 0.6, "grad_norm": 0.13545964658260345, "learning_rate": 0.00018966164165317966, "loss": 1.0169, "step": 241 }, { "epoch": 0.61, "grad_norm": 0.13648608326911926, "learning_rate": 0.00018957353049187936, "loss": 1.0732, "step": 242 }, { "epoch": 0.61, "grad_norm": 0.14080677926540375, "learning_rate": 0.00018948506609345813, "loss": 1.0579, "step": 243 }, { "epoch": 0.61, "grad_norm": 0.14503297209739685, "learning_rate": 0.00018939624880677918, "loss": 1.0755, "step": 244 }, { "epoch": 0.61, "grad_norm": 0.15316741168498993, "learning_rate": 0.00018930707898209733, "loss": 1.0885, "step": 245 }, { "epoch": 0.62, "grad_norm": 0.14839263260364532, "learning_rate": 0.0001892175569710577, "loss": 1.121, "step": 246 }, { "epoch": 0.62, "grad_norm": 0.13919925689697266, "learning_rate": 0.00018912768312669424, "loss": 1.1039, "step": 247 }, { "epoch": 0.62, "grad_norm": 0.13975974917411804, "learning_rate": 0.00018903745780342839, "loss": 1.1454, "step": 248 }, { "epoch": 0.62, "grad_norm": 0.13851100206375122, "learning_rate": 0.0001889468813570676, "loss": 1.0905, "step": 249 }, { "epoch": 0.63, "grad_norm": 0.14839564263820648, "learning_rate": 0.00018885595414480405, "loss": 1.1002, "step": 250 }, { "epoch": 0.63, "grad_norm": 0.1421942263841629, "learning_rate": 0.00018876467652521317, "loss": 1.093, "step": 251 }, { "epoch": 0.63, "grad_norm": 0.14453786611557007, "learning_rate": 0.0001886730488582522, "loss": 1.0278, "step": 252 }, { "epoch": 0.63, "grad_norm": 0.13856688141822815, "learning_rate": 0.0001885810715052589, "loss": 1.079, "step": 253 }, { "epoch": 0.64, "grad_norm": 0.14092479646205902, "learning_rate": 0.00018848874482894993, "loss": 1.0608, "step": 254 }, { "epoch": 0.64, "grad_norm": 0.14616413414478302, "learning_rate": 0.0001883960691934196, "loss": 1.1097, "step": 255 }, { "epoch": 0.64, "grad_norm": 0.1410474181175232, "learning_rate": 0.00018830304496413822, "loss": 1.0577, "step": 256 }, { "epoch": 0.64, "grad_norm": 0.15473878383636475, "learning_rate": 0.000188209672507951, "loss": 1.1453, "step": 257 }, { "epoch": 0.65, "grad_norm": 0.14370983839035034, "learning_rate": 0.00018811595219307622, "loss": 1.1732, "step": 258 }, { "epoch": 0.65, "grad_norm": 0.14861780405044556, "learning_rate": 0.00018802188438910405, "loss": 1.1471, "step": 259 }, { "epoch": 0.65, "grad_norm": 0.1523188352584839, "learning_rate": 0.000187927469466995, "loss": 1.129, "step": 260 }, { "epoch": 0.65, "grad_norm": 0.14366289973258972, "learning_rate": 0.00018783270779907838, "loss": 1.0792, "step": 261 }, { "epoch": 0.66, "grad_norm": 0.1363295018672943, "learning_rate": 0.00018773759975905098, "loss": 0.9848, "step": 262 }, { "epoch": 0.66, "grad_norm": 0.1438857764005661, "learning_rate": 0.00018764214572197552, "loss": 1.1371, "step": 263 }, { "epoch": 0.66, "grad_norm": 0.13751162588596344, "learning_rate": 0.00018754634606427914, "loss": 1.0557, "step": 264 }, { "epoch": 0.66, "grad_norm": 0.1384708732366562, "learning_rate": 0.00018745020116375197, "loss": 1.0664, "step": 265 }, { "epoch": 0.67, "grad_norm": 0.14196960628032684, "learning_rate": 0.00018735371139954558, "loss": 1.0828, "step": 266 }, { "epoch": 0.67, "grad_norm": 0.15374121069908142, "learning_rate": 0.00018725687715217163, "loss": 1.073, "step": 267 }, { "epoch": 0.67, "grad_norm": 0.14955537021160126, "learning_rate": 0.0001871596988035001, "loss": 1.1444, "step": 268 }, { "epoch": 0.68, "grad_norm": 0.13760650157928467, "learning_rate": 0.00018706217673675811, "loss": 1.088, "step": 269 }, { "epoch": 0.68, "grad_norm": 0.17072008550167084, "learning_rate": 0.00018696431133652817, "loss": 1.07, "step": 270 }, { "epoch": 0.68, "grad_norm": 0.14745061099529266, "learning_rate": 0.00018686610298874676, "loss": 1.1105, "step": 271 }, { "epoch": 0.68, "grad_norm": 0.14695587754249573, "learning_rate": 0.00018676755208070275, "loss": 1.0612, "step": 272 }, { "epoch": 0.69, "grad_norm": 0.15686020255088806, "learning_rate": 0.00018666865900103597, "loss": 1.0933, "step": 273 }, { "epoch": 0.69, "grad_norm": 0.14162233471870422, "learning_rate": 0.00018656942413973555, "loss": 1.0832, "step": 274 }, { "epoch": 0.69, "grad_norm": 0.14662939310073853, "learning_rate": 0.00018646984788813856, "loss": 1.1175, "step": 275 }, { "epoch": 0.69, "grad_norm": 0.13886839151382446, "learning_rate": 0.0001863699306389282, "loss": 1.1221, "step": 276 }, { "epoch": 0.7, "grad_norm": 0.13897326588630676, "learning_rate": 0.00018626967278613253, "loss": 1.0767, "step": 277 }, { "epoch": 0.7, "grad_norm": 0.13283655047416687, "learning_rate": 0.0001861690747251228, "loss": 1.1397, "step": 278 }, { "epoch": 0.7, "grad_norm": 0.14036604762077332, "learning_rate": 0.0001860681368526118, "loss": 1.0965, "step": 279 }, { "epoch": 0.7, "grad_norm": 0.1449379026889801, "learning_rate": 0.00018596685956665245, "loss": 1.1262, "step": 280 }, { "epoch": 0.71, "grad_norm": 0.14264287054538727, "learning_rate": 0.00018586524326663615, "loss": 1.1317, "step": 281 }, { "epoch": 0.71, "grad_norm": 0.14677459001541138, "learning_rate": 0.00018576328835329117, "loss": 1.0785, "step": 282 }, { "epoch": 0.71, "grad_norm": 0.14834077656269073, "learning_rate": 0.00018566099522868119, "loss": 1.0892, "step": 283 }, { "epoch": 0.71, "grad_norm": 0.15325355529785156, "learning_rate": 0.00018555836429620358, "loss": 1.0843, "step": 284 }, { "epoch": 0.72, "grad_norm": 0.14825651049613953, "learning_rate": 0.00018545539596058795, "loss": 1.1288, "step": 285 }, { "epoch": 0.72, "grad_norm": 0.14722499251365662, "learning_rate": 0.00018535209062789433, "loss": 1.1391, "step": 286 }, { "epoch": 0.72, "grad_norm": 0.14388781785964966, "learning_rate": 0.00018524844870551185, "loss": 1.1013, "step": 287 }, { "epoch": 0.72, "grad_norm": 0.1455835998058319, "learning_rate": 0.00018514447060215698, "loss": 1.0811, "step": 288 }, { "epoch": 0.73, "grad_norm": 0.14625433087348938, "learning_rate": 0.00018504015672787184, "loss": 1.0854, "step": 289 }, { "epoch": 0.73, "grad_norm": 0.13978470861911774, "learning_rate": 0.00018493550749402278, "loss": 1.1398, "step": 290 }, { "epoch": 0.73, "grad_norm": 0.1447162628173828, "learning_rate": 0.00018483052331329857, "loss": 1.0553, "step": 291 }, { "epoch": 0.73, "grad_norm": 0.13894303143024445, "learning_rate": 0.00018472520459970898, "loss": 1.0305, "step": 292 }, { "epoch": 0.74, "grad_norm": 0.1372181624174118, "learning_rate": 0.00018461955176858285, "loss": 1.021, "step": 293 }, { "epoch": 0.74, "grad_norm": 0.14599645137786865, "learning_rate": 0.0001845135652365668, "loss": 1.0808, "step": 294 }, { "epoch": 0.74, "grad_norm": 0.1599220335483551, "learning_rate": 0.00018440724542162328, "loss": 1.1143, "step": 295 }, { "epoch": 0.74, "grad_norm": 0.1450476050376892, "learning_rate": 0.00018430059274302917, "loss": 1.0508, "step": 296 }, { "epoch": 0.75, "grad_norm": 0.1439283937215805, "learning_rate": 0.00018419360762137395, "loss": 1.0592, "step": 297 }, { "epoch": 0.75, "grad_norm": 0.1410531848669052, "learning_rate": 0.00018408629047855804, "loss": 1.0632, "step": 298 }, { "epoch": 0.75, "grad_norm": 0.1468774974346161, "learning_rate": 0.00018397864173779133, "loss": 1.056, "step": 299 }, { "epoch": 0.75, "grad_norm": 0.1467033177614212, "learning_rate": 0.00018387066182359133, "loss": 1.1122, "step": 300 }, { "epoch": 0.75, "eval_loss": 1.0955116748809814, "eval_runtime": 81.7775, "eval_samples_per_second": 31.781, "eval_steps_per_second": 31.781, "step": 300 }, { "epoch": 0.76, "grad_norm": 0.14950688183307648, "learning_rate": 0.00018376235116178148, "loss": 1.0698, "step": 301 }, { "epoch": 0.76, "grad_norm": 0.142381951212883, "learning_rate": 0.00018365371017948964, "loss": 1.0528, "step": 302 }, { "epoch": 0.76, "grad_norm": 0.1410701423883438, "learning_rate": 0.0001835447393051463, "loss": 1.0785, "step": 303 }, { "epoch": 0.76, "grad_norm": 0.14708860218524933, "learning_rate": 0.00018343543896848273, "loss": 1.0142, "step": 304 }, { "epoch": 0.77, "grad_norm": 0.1467617303133011, "learning_rate": 0.00018332580960052965, "loss": 1.0973, "step": 305 }, { "epoch": 0.77, "grad_norm": 0.15761792659759521, "learning_rate": 0.00018321585163361527, "loss": 1.1745, "step": 306 }, { "epoch": 0.77, "grad_norm": 0.13972119987010956, "learning_rate": 0.00018310556550136357, "loss": 1.0832, "step": 307 }, { "epoch": 0.77, "grad_norm": 0.1481141895055771, "learning_rate": 0.00018299495163869275, "loss": 1.1573, "step": 308 }, { "epoch": 0.78, "grad_norm": 0.14397870004177094, "learning_rate": 0.0001828840104818134, "loss": 1.171, "step": 309 }, { "epoch": 0.78, "grad_norm": 0.14765049517154694, "learning_rate": 0.0001827727424682268, "loss": 1.0544, "step": 310 }, { "epoch": 0.78, "grad_norm": 0.14956365525722504, "learning_rate": 0.00018266114803672318, "loss": 1.1755, "step": 311 }, { "epoch": 0.78, "grad_norm": 0.15122386813163757, "learning_rate": 0.00018254922762738008, "loss": 1.1547, "step": 312 }, { "epoch": 0.79, "grad_norm": 0.14254115521907806, "learning_rate": 0.00018243698168156054, "loss": 1.1075, "step": 313 }, { "epoch": 0.79, "grad_norm": 0.14294452965259552, "learning_rate": 0.00018232441064191125, "loss": 1.1419, "step": 314 }, { "epoch": 0.79, "grad_norm": 0.14777772128582, "learning_rate": 0.0001822115149523611, "loss": 1.1662, "step": 315 }, { "epoch": 0.79, "grad_norm": 0.14944781363010406, "learning_rate": 0.0001820982950581191, "loss": 1.1497, "step": 316 }, { "epoch": 0.8, "grad_norm": 0.1466801017522812, "learning_rate": 0.00018198475140567287, "loss": 1.1374, "step": 317 }, { "epoch": 0.8, "grad_norm": 0.15346656739711761, "learning_rate": 0.00018187088444278674, "loss": 1.1356, "step": 318 }, { "epoch": 0.8, "grad_norm": 0.15271005034446716, "learning_rate": 0.00018175669461850005, "loss": 1.0845, "step": 319 }, { "epoch": 0.8, "grad_norm": 0.14452996850013733, "learning_rate": 0.00018164218238312535, "loss": 1.1162, "step": 320 }, { "epoch": 0.81, "grad_norm": 0.14632536470890045, "learning_rate": 0.00018152734818824658, "loss": 1.0187, "step": 321 }, { "epoch": 0.81, "grad_norm": 0.14935997128486633, "learning_rate": 0.00018141219248671745, "loss": 1.1167, "step": 322 }, { "epoch": 0.81, "grad_norm": 0.14043933153152466, "learning_rate": 0.0001812967157326595, "loss": 1.0044, "step": 323 }, { "epoch": 0.81, "grad_norm": 0.14850106835365295, "learning_rate": 0.00018118091838146029, "loss": 1.1226, "step": 324 }, { "epoch": 0.82, "grad_norm": 0.14655061066150665, "learning_rate": 0.00018106480088977172, "loss": 1.0508, "step": 325 }, { "epoch": 0.82, "grad_norm": 0.14721763134002686, "learning_rate": 0.00018094836371550824, "loss": 1.0659, "step": 326 }, { "epoch": 0.82, "grad_norm": 0.1433349996805191, "learning_rate": 0.00018083160731784486, "loss": 1.147, "step": 327 }, { "epoch": 0.82, "grad_norm": 0.13528144359588623, "learning_rate": 0.00018071453215721554, "loss": 1.0388, "step": 328 }, { "epoch": 0.83, "grad_norm": 0.15466062724590302, "learning_rate": 0.0001805971386953113, "loss": 1.0649, "step": 329 }, { "epoch": 0.83, "grad_norm": 0.15163114666938782, "learning_rate": 0.00018047942739507836, "loss": 1.1454, "step": 330 }, { "epoch": 0.83, "grad_norm": 0.14693276584148407, "learning_rate": 0.0001803613987207163, "loss": 1.1137, "step": 331 }, { "epoch": 0.83, "grad_norm": 0.14229321479797363, "learning_rate": 0.00018024305313767646, "loss": 1.0153, "step": 332 }, { "epoch": 0.84, "grad_norm": 0.13863018155097961, "learning_rate": 0.00018012439111265974, "loss": 1.0491, "step": 333 }, { "epoch": 0.84, "grad_norm": 0.1422068327665329, "learning_rate": 0.000180005413113615, "loss": 1.0952, "step": 334 }, { "epoch": 0.84, "grad_norm": 0.1419857293367386, "learning_rate": 0.00017988611960973713, "loss": 1.0532, "step": 335 }, { "epoch": 0.84, "grad_norm": 0.1446901261806488, "learning_rate": 0.00017976651107146533, "loss": 1.0477, "step": 336 }, { "epoch": 0.85, "grad_norm": 0.14558811485767365, "learning_rate": 0.00017964658797048108, "loss": 1.1481, "step": 337 }, { "epoch": 0.85, "grad_norm": 0.15488363802433014, "learning_rate": 0.0001795263507797063, "loss": 1.1302, "step": 338 }, { "epoch": 0.85, "grad_norm": 0.14942613244056702, "learning_rate": 0.00017940579997330165, "loss": 1.0698, "step": 339 }, { "epoch": 0.85, "grad_norm": 0.14417564868927002, "learning_rate": 0.00017928493602666445, "loss": 1.0867, "step": 340 }, { "epoch": 0.86, "grad_norm": 0.14839497208595276, "learning_rate": 0.0001791637594164269, "loss": 1.0124, "step": 341 }, { "epoch": 0.86, "grad_norm": 0.1415972113609314, "learning_rate": 0.00017904227062045437, "loss": 1.0958, "step": 342 }, { "epoch": 0.86, "grad_norm": 0.143202543258667, "learning_rate": 0.00017892047011784312, "loss": 1.0808, "step": 343 }, { "epoch": 0.86, "grad_norm": 0.14291773736476898, "learning_rate": 0.00017879835838891875, "loss": 1.1386, "step": 344 }, { "epoch": 0.87, "grad_norm": 0.1504325121641159, "learning_rate": 0.00017867593591523422, "loss": 1.0804, "step": 345 }, { "epoch": 0.87, "grad_norm": 0.1444767862558365, "learning_rate": 0.00017855320317956784, "loss": 1.1207, "step": 346 }, { "epoch": 0.87, "grad_norm": 0.14493699371814728, "learning_rate": 0.00017843016066592158, "loss": 1.0954, "step": 347 }, { "epoch": 0.87, "grad_norm": 0.14571166038513184, "learning_rate": 0.00017830680885951887, "loss": 1.0676, "step": 348 }, { "epoch": 0.88, "grad_norm": 0.14583171904087067, "learning_rate": 0.000178183148246803, "loss": 1.0674, "step": 349 }, { "epoch": 0.88, "grad_norm": 0.15080390870571136, "learning_rate": 0.00017805917931543492, "loss": 1.0757, "step": 350 }, { "epoch": 0.88, "grad_norm": 0.14790864288806915, "learning_rate": 0.00017793490255429157, "loss": 1.1005, "step": 351 }, { "epoch": 0.88, "grad_norm": 0.14861677587032318, "learning_rate": 0.00017781031845346375, "loss": 1.0645, "step": 352 }, { "epoch": 0.89, "grad_norm": 0.15099036693572998, "learning_rate": 0.00017768542750425426, "loss": 1.1306, "step": 353 }, { "epoch": 0.89, "grad_norm": 0.14353971183300018, "learning_rate": 0.00017756023019917607, "loss": 1.0834, "step": 354 }, { "epoch": 0.89, "grad_norm": 0.14582550525665283, "learning_rate": 0.00017743472703195015, "loss": 1.0722, "step": 355 }, { "epoch": 0.89, "grad_norm": 0.14268234372138977, "learning_rate": 0.00017730891849750377, "loss": 1.092, "step": 356 }, { "epoch": 0.9, "grad_norm": 0.1424105316400528, "learning_rate": 0.00017718280509196828, "loss": 1.1355, "step": 357 }, { "epoch": 0.9, "grad_norm": 0.13972117006778717, "learning_rate": 0.0001770563873126775, "loss": 1.0318, "step": 358 }, { "epoch": 0.9, "grad_norm": 0.14622163772583008, "learning_rate": 0.00017692966565816532, "loss": 1.0985, "step": 359 }, { "epoch": 0.9, "grad_norm": 0.13956372439861298, "learning_rate": 0.0001768026406281642, "loss": 1.102, "step": 360 }, { "epoch": 0.91, "grad_norm": 0.14042189717292786, "learning_rate": 0.0001766753127236029, "loss": 1.0284, "step": 361 }, { "epoch": 0.91, "grad_norm": 0.14376944303512573, "learning_rate": 0.00017654768244660448, "loss": 1.1452, "step": 362 }, { "epoch": 0.91, "grad_norm": 0.14055544137954712, "learning_rate": 0.00017641975030048454, "loss": 1.0306, "step": 363 }, { "epoch": 0.91, "grad_norm": 0.14599303901195526, "learning_rate": 0.00017629151678974907, "loss": 1.0838, "step": 364 }, { "epoch": 0.92, "grad_norm": 0.1528831571340561, "learning_rate": 0.00017616298242009251, "loss": 1.1293, "step": 365 }, { "epoch": 0.92, "grad_norm": 0.1404455453157425, "learning_rate": 0.00017603414769839577, "loss": 1.0425, "step": 366 }, { "epoch": 0.92, "grad_norm": 0.14992842078208923, "learning_rate": 0.00017590501313272415, "loss": 1.0928, "step": 367 }, { "epoch": 0.92, "grad_norm": 0.14540541172027588, "learning_rate": 0.00017577557923232546, "loss": 1.0366, "step": 368 }, { "epoch": 0.93, "grad_norm": 0.1451583057641983, "learning_rate": 0.00017564584650762793, "loss": 1.1108, "step": 369 }, { "epoch": 0.93, "grad_norm": 0.155447855591774, "learning_rate": 0.00017551581547023819, "loss": 1.1394, "step": 370 }, { "epoch": 0.93, "grad_norm": 0.1441376656293869, "learning_rate": 0.0001753854866329393, "loss": 1.0264, "step": 371 }, { "epoch": 0.93, "grad_norm": 0.13875485956668854, "learning_rate": 0.00017525486050968875, "loss": 1.0672, "step": 372 }, { "epoch": 0.94, "grad_norm": 0.14158080518245697, "learning_rate": 0.00017512393761561632, "loss": 1.053, "step": 373 }, { "epoch": 0.94, "grad_norm": 0.15505361557006836, "learning_rate": 0.00017499271846702213, "loss": 1.0713, "step": 374 }, { "epoch": 0.94, "grad_norm": 0.14172373712062836, "learning_rate": 0.0001748612035813747, "loss": 1.0544, "step": 375 }, { "epoch": 0.94, "grad_norm": 0.14016349613666534, "learning_rate": 0.00017472939347730856, "loss": 1.0382, "step": 376 }, { "epoch": 0.95, "grad_norm": 0.15148378908634186, "learning_rate": 0.00017459728867462275, "loss": 1.1218, "step": 377 }, { "epoch": 0.95, "grad_norm": 0.1416306346654892, "learning_rate": 0.0001744648896942782, "loss": 1.0895, "step": 378 }, { "epoch": 0.95, "grad_norm": 0.14276988804340363, "learning_rate": 0.00017433219705839616, "loss": 1.0991, "step": 379 }, { "epoch": 0.95, "grad_norm": 0.13922327756881714, "learning_rate": 0.00017419921129025576, "loss": 1.0883, "step": 380 }, { "epoch": 0.96, "grad_norm": 0.1479676216840744, "learning_rate": 0.00017406593291429217, "loss": 1.1083, "step": 381 }, { "epoch": 0.96, "grad_norm": 0.14659778773784637, "learning_rate": 0.0001739323624560945, "loss": 1.0863, "step": 382 }, { "epoch": 0.96, "grad_norm": 0.14685633778572083, "learning_rate": 0.00017379850044240368, "loss": 1.1075, "step": 383 }, { "epoch": 0.96, "grad_norm": 0.14316044747829437, "learning_rate": 0.00017366434740111037, "loss": 1.0584, "step": 384 }, { "epoch": 0.97, "grad_norm": 0.14292864501476288, "learning_rate": 0.00017352990386125292, "loss": 1.1002, "step": 385 }, { "epoch": 0.97, "grad_norm": 0.14412067830562592, "learning_rate": 0.00017339517035301532, "loss": 1.0671, "step": 386 }, { "epoch": 0.97, "grad_norm": 0.14292089641094208, "learning_rate": 0.000173260147407725, "loss": 1.0958, "step": 387 }, { "epoch": 0.97, "grad_norm": 0.1490335911512375, "learning_rate": 0.00017312483555785086, "loss": 1.1074, "step": 388 }, { "epoch": 0.98, "grad_norm": 0.14249826967716217, "learning_rate": 0.00017298923533700107, "loss": 1.1546, "step": 389 }, { "epoch": 0.98, "grad_norm": 0.14555396139621735, "learning_rate": 0.000172853347279921, "loss": 1.076, "step": 390 }, { "epoch": 0.98, "grad_norm": 0.14374902844429016, "learning_rate": 0.00017271717192249116, "loss": 1.0767, "step": 391 }, { "epoch": 0.98, "grad_norm": 0.14903804659843445, "learning_rate": 0.00017258070980172494, "loss": 1.0969, "step": 392 }, { "epoch": 0.99, "grad_norm": 0.1533229798078537, "learning_rate": 0.00017244396145576672, "loss": 1.1206, "step": 393 }, { "epoch": 0.99, "grad_norm": 0.14720167219638824, "learning_rate": 0.0001723069274238895, "loss": 1.0655, "step": 394 }, { "epoch": 0.99, "grad_norm": 0.14380764961242676, "learning_rate": 0.00017216960824649303, "loss": 1.0123, "step": 395 }, { "epoch": 0.99, "grad_norm": 0.14513961970806122, "learning_rate": 0.0001720320044651014, "loss": 1.0196, "step": 396 }, { "epoch": 1.0, "grad_norm": 0.14310909807682037, "learning_rate": 0.0001718941166223612, "loss": 1.0278, "step": 397 }, { "epoch": 1.0, "grad_norm": 0.14312389492988586, "learning_rate": 0.00017175594526203905, "loss": 1.0649, "step": 398 }, { "epoch": 1.0, "grad_norm": 0.1408112645149231, "learning_rate": 0.00017161749092901984, "loss": 1.0793, "step": 399 }, { "epoch": 1.0, "grad_norm": 0.14593806862831116, "learning_rate": 0.00017147875416930416, "loss": 1.0474, "step": 400 }, { "epoch": 1.0, "eval_loss": 1.083612322807312, "eval_runtime": 81.6893, "eval_samples_per_second": 31.816, "eval_steps_per_second": 31.816, "step": 400 }, { "epoch": 1.01, "grad_norm": 0.14213843643665314, "learning_rate": 0.00017133973553000654, "loss": 1.0476, "step": 401 }, { "epoch": 1.01, "grad_norm": 0.14211952686309814, "learning_rate": 0.00017120043555935298, "loss": 1.0386, "step": 402 }, { "epoch": 1.01, "grad_norm": 0.15638479590415955, "learning_rate": 0.00017106085480667903, "loss": 1.1145, "step": 403 }, { "epoch": 1.01, "grad_norm": 0.1525896191596985, "learning_rate": 0.00017092099382242748, "loss": 1.1124, "step": 404 }, { "epoch": 1.0, "grad_norm": 0.13780884444713593, "learning_rate": 0.0001707808531581462, "loss": 1.0208, "step": 405 }, { "epoch": 1.01, "grad_norm": 0.13917113840579987, "learning_rate": 0.00017064043336648599, "loss": 1.0143, "step": 406 }, { "epoch": 1.01, "grad_norm": 0.14122170209884644, "learning_rate": 0.00017049973500119845, "loss": 0.9977, "step": 407 }, { "epoch": 1.01, "grad_norm": 0.14243052899837494, "learning_rate": 0.0001703587586171337, "loss": 0.9933, "step": 408 }, { "epoch": 1.01, "grad_norm": 0.14186780154705048, "learning_rate": 0.0001702175047702382, "loss": 0.9567, "step": 409 }, { "epoch": 1.02, "grad_norm": 0.1524883359670639, "learning_rate": 0.00017007597401755276, "loss": 0.9874, "step": 410 }, { "epoch": 1.02, "grad_norm": 0.15759988129138947, "learning_rate": 0.00016993416691720998, "loss": 1.0292, "step": 411 }, { "epoch": 1.02, "grad_norm": 0.15617264807224274, "learning_rate": 0.00016979208402843237, "loss": 1.0168, "step": 412 }, { "epoch": 1.02, "grad_norm": 0.15921927988529205, "learning_rate": 0.00016964972591153, "loss": 1.0209, "step": 413 }, { "epoch": 1.03, "grad_norm": 0.1540677845478058, "learning_rate": 0.00016950709312789833, "loss": 1.0013, "step": 414 }, { "epoch": 1.03, "grad_norm": 0.156731516122818, "learning_rate": 0.00016936418624001592, "loss": 1.0171, "step": 415 }, { "epoch": 1.03, "grad_norm": 0.15679331123828888, "learning_rate": 0.00016922100581144228, "loss": 1.0137, "step": 416 }, { "epoch": 1.03, "grad_norm": 0.15117546916007996, "learning_rate": 0.00016907755240681577, "loss": 0.9041, "step": 417 }, { "epoch": 1.04, "grad_norm": 0.1581723839044571, "learning_rate": 0.00016893382659185105, "loss": 0.9891, "step": 418 }, { "epoch": 1.04, "grad_norm": 0.15231919288635254, "learning_rate": 0.00016878982893333717, "loss": 0.9626, "step": 419 }, { "epoch": 1.04, "grad_norm": 0.15532514452934265, "learning_rate": 0.00016864555999913518, "loss": 0.9639, "step": 420 }, { "epoch": 1.04, "grad_norm": 0.16158603131771088, "learning_rate": 0.00016850102035817588, "loss": 1.0156, "step": 421 }, { "epoch": 1.05, "grad_norm": 0.16860714554786682, "learning_rate": 0.0001683562105804577, "loss": 1.0279, "step": 422 }, { "epoch": 1.05, "grad_norm": 0.1704617142677307, "learning_rate": 0.00016821113123704424, "loss": 1.0261, "step": 423 }, { "epoch": 1.05, "grad_norm": 0.16520226001739502, "learning_rate": 0.00016806578290006225, "loss": 1.0307, "step": 424 }, { "epoch": 1.05, "grad_norm": 0.16199736297130585, "learning_rate": 0.00016792016614269924, "loss": 0.9764, "step": 425 }, { "epoch": 1.06, "grad_norm": 0.16184571385383606, "learning_rate": 0.0001677742815392012, "loss": 0.9958, "step": 426 }, { "epoch": 1.06, "grad_norm": 0.16386933624744415, "learning_rate": 0.00016762812966487044, "loss": 1.0221, "step": 427 }, { "epoch": 1.06, "grad_norm": 0.17046724259853363, "learning_rate": 0.00016748171109606328, "loss": 1.029, "step": 428 }, { "epoch": 1.06, "grad_norm": 0.1638820469379425, "learning_rate": 0.00016733502641018766, "loss": 1.0175, "step": 429 }, { "epoch": 1.07, "grad_norm": 0.16480222344398499, "learning_rate": 0.00016718807618570106, "loss": 1.033, "step": 430 }, { "epoch": 1.07, "grad_norm": 0.1661783903837204, "learning_rate": 0.00016704086100210815, "loss": 0.9379, "step": 431 }, { "epoch": 1.07, "grad_norm": 0.15570427477359772, "learning_rate": 0.00016689338143995833, "loss": 0.9877, "step": 432 }, { "epoch": 1.07, "grad_norm": 0.170819491147995, "learning_rate": 0.00016674563808084377, "loss": 1.0738, "step": 433 }, { "epoch": 1.08, "grad_norm": 0.16349053382873535, "learning_rate": 0.00016659763150739677, "loss": 0.9474, "step": 434 }, { "epoch": 1.08, "grad_norm": 0.1703306883573532, "learning_rate": 0.0001664493623032877, "loss": 1.054, "step": 435 }, { "epoch": 1.08, "grad_norm": 0.1705269068479538, "learning_rate": 0.00016630083105322266, "loss": 1.0175, "step": 436 }, { "epoch": 1.08, "grad_norm": 0.15883858501911163, "learning_rate": 0.00016615203834294119, "loss": 1.0414, "step": 437 }, { "epoch": 1.09, "grad_norm": 0.17120327055454254, "learning_rate": 0.00016600298475921365, "loss": 1.0222, "step": 438 }, { "epoch": 1.09, "grad_norm": 0.1668461114168167, "learning_rate": 0.00016585367088983946, "loss": 0.9212, "step": 439 }, { "epoch": 1.09, "grad_norm": 0.178915336728096, "learning_rate": 0.00016570409732364437, "loss": 1.0167, "step": 440 }, { "epoch": 1.09, "grad_norm": 0.171407088637352, "learning_rate": 0.00016555426465047823, "loss": 0.9693, "step": 441 }, { "epoch": 1.1, "grad_norm": 0.1687992811203003, "learning_rate": 0.0001654041734612127, "loss": 1.0257, "step": 442 }, { "epoch": 1.1, "grad_norm": 0.17136409878730774, "learning_rate": 0.00016525382434773894, "loss": 0.9874, "step": 443 }, { "epoch": 1.1, "grad_norm": 0.1806887686252594, "learning_rate": 0.00016510321790296525, "loss": 1.0684, "step": 444 }, { "epoch": 1.1, "grad_norm": 0.17648373544216156, "learning_rate": 0.00016495235472081468, "loss": 0.9867, "step": 445 }, { "epoch": 1.11, "grad_norm": 0.17426486313343048, "learning_rate": 0.00016480123539622281, "loss": 1.0439, "step": 446 }, { "epoch": 1.11, "grad_norm": 0.17550793290138245, "learning_rate": 0.0001646498605251352, "loss": 1.0127, "step": 447 }, { "epoch": 1.11, "grad_norm": 0.1805875450372696, "learning_rate": 0.00016449823070450531, "loss": 1.0317, "step": 448 }, { "epoch": 1.11, "grad_norm": 0.17466574907302856, "learning_rate": 0.00016434634653229199, "loss": 0.9713, "step": 449 }, { "epoch": 1.12, "grad_norm": 0.16918793320655823, "learning_rate": 0.00016419420860745699, "loss": 1.0376, "step": 450 }, { "epoch": 1.12, "grad_norm": 0.16672617197036743, "learning_rate": 0.00016404181752996289, "loss": 0.9211, "step": 451 }, { "epoch": 1.12, "grad_norm": 0.17270368337631226, "learning_rate": 0.00016388917390077054, "loss": 0.987, "step": 452 }, { "epoch": 1.12, "grad_norm": 0.16792818903923035, "learning_rate": 0.0001637362783218368, "loss": 0.9782, "step": 453 }, { "epoch": 1.13, "grad_norm": 0.1800449639558792, "learning_rate": 0.00016358313139611195, "loss": 0.9747, "step": 454 }, { "epoch": 1.13, "grad_norm": 0.17128407955169678, "learning_rate": 0.0001634297337275376, "loss": 1.0312, "step": 455 }, { "epoch": 1.13, "grad_norm": 0.17059966921806335, "learning_rate": 0.0001632760859210442, "loss": 1.0075, "step": 456 }, { "epoch": 1.13, "grad_norm": 0.18244986236095428, "learning_rate": 0.0001631221885825485, "loss": 1.0161, "step": 457 }, { "epoch": 1.14, "grad_norm": 0.17219580709934235, "learning_rate": 0.00016296804231895142, "loss": 1.0105, "step": 458 }, { "epoch": 1.14, "grad_norm": 0.1736789494752884, "learning_rate": 0.0001628136477381354, "loss": 1.0128, "step": 459 }, { "epoch": 1.14, "grad_norm": 0.2108864039182663, "learning_rate": 0.00016265900544896225, "loss": 0.9926, "step": 460 }, { "epoch": 1.14, "grad_norm": 0.16976673901081085, "learning_rate": 0.00016250411606127054, "loss": 0.9633, "step": 461 }, { "epoch": 1.15, "grad_norm": 0.1719416379928589, "learning_rate": 0.00016234898018587337, "loss": 1.0222, "step": 462 }, { "epoch": 1.15, "grad_norm": 0.17205439507961273, "learning_rate": 0.00016219359843455577, "loss": 1.0328, "step": 463 }, { "epoch": 1.15, "grad_norm": 0.17340464890003204, "learning_rate": 0.0001620379714200725, "loss": 0.9781, "step": 464 }, { "epoch": 1.15, "grad_norm": 0.17654834687709808, "learning_rate": 0.00016188209975614542, "loss": 1.0151, "step": 465 }, { "epoch": 1.16, "grad_norm": 0.17264829576015472, "learning_rate": 0.00016172598405746124, "loss": 0.9525, "step": 466 }, { "epoch": 1.16, "grad_norm": 0.16847053170204163, "learning_rate": 0.00016156962493966908, "loss": 0.9202, "step": 467 }, { "epoch": 1.16, "grad_norm": 0.18013043701648712, "learning_rate": 0.00016141302301937786, "loss": 1.0383, "step": 468 }, { "epoch": 1.16, "grad_norm": 0.17866036295890808, "learning_rate": 0.0001612561789141541, "loss": 0.9682, "step": 469 }, { "epoch": 1.17, "grad_norm": 0.17272624373435974, "learning_rate": 0.0001610990932425194, "loss": 1.0254, "step": 470 }, { "epoch": 1.17, "grad_norm": 0.18053527176380157, "learning_rate": 0.00016094176662394792, "loss": 1.0435, "step": 471 }, { "epoch": 1.17, "grad_norm": 0.17645591497421265, "learning_rate": 0.00016078419967886402, "loss": 0.9929, "step": 472 }, { "epoch": 1.17, "grad_norm": 0.17896148562431335, "learning_rate": 0.00016062639302863986, "loss": 0.9597, "step": 473 }, { "epoch": 1.18, "grad_norm": 0.1784675121307373, "learning_rate": 0.0001604683472955928, "loss": 0.9877, "step": 474 }, { "epoch": 1.18, "grad_norm": 0.18384787440299988, "learning_rate": 0.00016031006310298306, "loss": 0.98, "step": 475 }, { "epoch": 1.18, "grad_norm": 0.17336387932300568, "learning_rate": 0.00016015154107501133, "loss": 0.9813, "step": 476 }, { "epoch": 1.18, "grad_norm": 0.1778045892715454, "learning_rate": 0.00015999278183681604, "loss": 0.9327, "step": 477 }, { "epoch": 1.19, "grad_norm": 0.17641645669937134, "learning_rate": 0.00015983378601447127, "loss": 0.9955, "step": 478 }, { "epoch": 1.19, "grad_norm": 0.18100661039352417, "learning_rate": 0.00015967455423498387, "loss": 1.0304, "step": 479 }, { "epoch": 1.19, "grad_norm": 0.17939269542694092, "learning_rate": 0.0001595150871262914, "loss": 0.9129, "step": 480 }, { "epoch": 1.19, "grad_norm": 0.18178121745586395, "learning_rate": 0.00015935538531725927, "loss": 1.0567, "step": 481 }, { "epoch": 1.2, "grad_norm": 0.18156662583351135, "learning_rate": 0.00015919544943767856, "loss": 0.9731, "step": 482 }, { "epoch": 1.2, "grad_norm": 0.18265368044376373, "learning_rate": 0.00015903528011826335, "loss": 1.0253, "step": 483 }, { "epoch": 1.2, "grad_norm": 0.16867631673812866, "learning_rate": 0.00015887487799064838, "loss": 0.967, "step": 484 }, { "epoch": 1.2, "grad_norm": 0.181188702583313, "learning_rate": 0.0001587142436873864, "loss": 1.0113, "step": 485 }, { "epoch": 1.21, "grad_norm": 0.17186175286769867, "learning_rate": 0.00015855337784194577, "loss": 0.9987, "step": 486 }, { "epoch": 1.21, "grad_norm": 0.16855312883853912, "learning_rate": 0.000158392281088708, "loss": 0.9623, "step": 487 }, { "epoch": 1.21, "grad_norm": 0.1724013239145279, "learning_rate": 0.00015823095406296514, "loss": 0.922, "step": 488 }, { "epoch": 1.21, "grad_norm": 0.18288518488407135, "learning_rate": 0.00015806939740091734, "loss": 0.9884, "step": 489 }, { "epoch": 1.22, "grad_norm": 0.17419768869876862, "learning_rate": 0.00015790761173967036, "loss": 0.9246, "step": 490 }, { "epoch": 1.22, "grad_norm": 0.1798882633447647, "learning_rate": 0.00015774559771723298, "loss": 0.9276, "step": 491 }, { "epoch": 1.22, "grad_norm": 0.18484486639499664, "learning_rate": 0.00015758335597251458, "loss": 0.9967, "step": 492 }, { "epoch": 1.22, "grad_norm": 0.17431318759918213, "learning_rate": 0.00015742088714532247, "loss": 0.9672, "step": 493 }, { "epoch": 1.23, "grad_norm": 0.1722385287284851, "learning_rate": 0.00015725819187635968, "loss": 0.9561, "step": 494 }, { "epoch": 1.23, "grad_norm": 0.19427751004695892, "learning_rate": 0.00015709527080722202, "loss": 0.969, "step": 495 }, { "epoch": 1.23, "grad_norm": 0.1689085215330124, "learning_rate": 0.00015693212458039584, "loss": 0.9618, "step": 496 }, { "epoch": 1.23, "grad_norm": 0.1696721762418747, "learning_rate": 0.00015676875383925534, "loss": 0.9686, "step": 497 }, { "epoch": 1.24, "grad_norm": 0.17037516832351685, "learning_rate": 0.00015660515922806027, "loss": 0.956, "step": 498 }, { "epoch": 1.24, "grad_norm": 0.17930398881435394, "learning_rate": 0.000156441341391953, "loss": 0.983, "step": 499 }, { "epoch": 1.24, "grad_norm": 0.18172559142112732, "learning_rate": 0.00015627730097695638, "loss": 1.0447, "step": 500 }, { "epoch": 1.24, "eval_loss": 1.0872775316238403, "eval_runtime": 81.628, "eval_samples_per_second": 31.84, "eval_steps_per_second": 31.84, "step": 500 }, { "epoch": 1.24, "grad_norm": 0.179900661110878, "learning_rate": 0.0001561130386299709, "loss": 0.9864, "step": 501 }, { "epoch": 1.25, "grad_norm": 0.1860770583152771, "learning_rate": 0.0001559485549987723, "loss": 0.9963, "step": 502 }, { "epoch": 1.25, "grad_norm": 0.17942041158676147, "learning_rate": 0.00015578385073200895, "loss": 1.0004, "step": 503 }, { "epoch": 1.25, "grad_norm": 0.17420290410518646, "learning_rate": 0.0001556189264791992, "loss": 1.002, "step": 504 }, { "epoch": 1.25, "grad_norm": 0.17478443682193756, "learning_rate": 0.00015545378289072922, "loss": 0.9624, "step": 505 }, { "epoch": 1.26, "grad_norm": 0.18624065816402435, "learning_rate": 0.0001552884206178498, "loss": 1.0315, "step": 506 }, { "epoch": 1.26, "grad_norm": 0.17450089752674103, "learning_rate": 0.00015512284031267437, "loss": 0.9906, "step": 507 }, { "epoch": 1.26, "grad_norm": 0.1746608465909958, "learning_rate": 0.00015495704262817597, "loss": 0.9898, "step": 508 }, { "epoch": 1.26, "grad_norm": 0.17796628177165985, "learning_rate": 0.00015479102821818507, "loss": 1.0194, "step": 509 }, { "epoch": 1.27, "grad_norm": 0.17470288276672363, "learning_rate": 0.0001546247977373867, "loss": 0.9309, "step": 510 }, { "epoch": 1.27, "grad_norm": 0.17829464375972748, "learning_rate": 0.000154458351841318, "loss": 1.0141, "step": 511 }, { "epoch": 1.27, "grad_norm": 0.17732754349708557, "learning_rate": 0.00015429169118636566, "loss": 0.9817, "step": 512 }, { "epoch": 1.27, "grad_norm": 0.1795651614665985, "learning_rate": 0.00015412481642976318, "loss": 0.9709, "step": 513 }, { "epoch": 1.28, "grad_norm": 0.17974676191806793, "learning_rate": 0.00015395772822958845, "loss": 1.0243, "step": 514 }, { "epoch": 1.28, "grad_norm": 0.18511098623275757, "learning_rate": 0.0001537904272447611, "loss": 1.0001, "step": 515 }, { "epoch": 1.28, "grad_norm": 0.1780577152967453, "learning_rate": 0.00015362291413503984, "loss": 0.9829, "step": 516 }, { "epoch": 1.28, "grad_norm": 0.17798136174678802, "learning_rate": 0.0001534551895610199, "loss": 0.9659, "step": 517 }, { "epoch": 1.29, "grad_norm": 0.1870565563440323, "learning_rate": 0.00015328725418413045, "loss": 0.9749, "step": 518 }, { "epoch": 1.29, "grad_norm": 0.18744368851184845, "learning_rate": 0.00015311910866663196, "loss": 1.015, "step": 519 }, { "epoch": 1.29, "grad_norm": 0.18052896857261658, "learning_rate": 0.00015295075367161367, "loss": 1.0313, "step": 520 }, { "epoch": 1.29, "grad_norm": 0.1779204159975052, "learning_rate": 0.00015278218986299074, "loss": 0.9496, "step": 521 }, { "epoch": 1.3, "grad_norm": 0.1824800670146942, "learning_rate": 0.00015261341790550196, "loss": 1.0281, "step": 522 }, { "epoch": 1.3, "grad_norm": 0.19057531654834747, "learning_rate": 0.0001524444384647069, "loss": 1.0271, "step": 523 }, { "epoch": 1.3, "grad_norm": 0.19244614243507385, "learning_rate": 0.0001522752522069833, "loss": 0.9907, "step": 524 }, { "epoch": 1.3, "grad_norm": 0.17696735262870789, "learning_rate": 0.0001521058597995246, "loss": 0.9331, "step": 525 }, { "epoch": 1.31, "grad_norm": 0.17268431186676025, "learning_rate": 0.00015193626191033712, "loss": 0.9427, "step": 526 }, { "epoch": 1.31, "grad_norm": 0.18662290275096893, "learning_rate": 0.0001517664592082375, "loss": 1.0074, "step": 527 }, { "epoch": 1.31, "grad_norm": 0.17090214788913727, "learning_rate": 0.0001515964523628501, "loss": 0.9608, "step": 528 }, { "epoch": 1.31, "grad_norm": 0.1795254349708557, "learning_rate": 0.00015142624204460435, "loss": 0.9439, "step": 529 }, { "epoch": 1.32, "grad_norm": 0.18272066116333008, "learning_rate": 0.00015125582892473204, "loss": 0.9828, "step": 530 }, { "epoch": 1.32, "grad_norm": 0.2021034061908722, "learning_rate": 0.00015108521367526479, "loss": 1.0375, "step": 531 }, { "epoch": 1.32, "grad_norm": 0.18685071170330048, "learning_rate": 0.00015091439696903115, "loss": 1.0026, "step": 532 }, { "epoch": 1.32, "grad_norm": 0.17936167120933533, "learning_rate": 0.00015074337947965435, "loss": 0.9296, "step": 533 }, { "epoch": 1.33, "grad_norm": 0.18303433060646057, "learning_rate": 0.00015057216188154928, "loss": 0.9416, "step": 534 }, { "epoch": 1.33, "grad_norm": 0.18212522566318512, "learning_rate": 0.00015040074484992, "loss": 0.9812, "step": 535 }, { "epoch": 1.33, "grad_norm": 0.17352260649204254, "learning_rate": 0.00015022912906075702, "loss": 0.9766, "step": 536 }, { "epoch": 1.33, "grad_norm": 0.17948494851589203, "learning_rate": 0.0001500573151908347, "loss": 1.006, "step": 537 }, { "epoch": 1.34, "grad_norm": 0.18391214311122894, "learning_rate": 0.00014988530391770856, "loss": 1.0484, "step": 538 }, { "epoch": 1.34, "grad_norm": 0.1719055324792862, "learning_rate": 0.00014971309591971252, "loss": 0.964, "step": 539 }, { "epoch": 1.34, "grad_norm": 0.1985386312007904, "learning_rate": 0.00014954069187595633, "loss": 1.0035, "step": 540 }, { "epoch": 1.34, "grad_norm": 0.18530823290348053, "learning_rate": 0.0001493680924663228, "loss": 1.0089, "step": 541 }, { "epoch": 1.35, "grad_norm": 0.18150845170021057, "learning_rate": 0.00014919529837146528, "loss": 1.0586, "step": 542 }, { "epoch": 1.35, "grad_norm": 0.19130894541740417, "learning_rate": 0.00014902231027280486, "loss": 1.0152, "step": 543 }, { "epoch": 1.35, "grad_norm": 0.1798924058675766, "learning_rate": 0.0001488491288525275, "loss": 0.9548, "step": 544 }, { "epoch": 1.35, "grad_norm": 0.17213404178619385, "learning_rate": 0.0001486757547935818, "loss": 1.0226, "step": 545 }, { "epoch": 1.36, "grad_norm": 0.18383356928825378, "learning_rate": 0.0001485021887796759, "loss": 1.0291, "step": 546 }, { "epoch": 1.36, "grad_norm": 0.19143284857273102, "learning_rate": 0.0001483284314952749, "loss": 1.0055, "step": 547 }, { "epoch": 1.36, "grad_norm": 0.19124020636081696, "learning_rate": 0.00014815448362559826, "loss": 1.0231, "step": 548 }, { "epoch": 1.36, "grad_norm": 0.18096496164798737, "learning_rate": 0.00014798034585661695, "loss": 1.0152, "step": 549 }, { "epoch": 1.37, "grad_norm": 0.17621304094791412, "learning_rate": 0.00014780601887505088, "loss": 0.9718, "step": 550 }, { "epoch": 1.37, "grad_norm": 0.18995219469070435, "learning_rate": 0.00014763150336836604, "loss": 1.0052, "step": 551 }, { "epoch": 1.37, "grad_norm": 0.19126906991004944, "learning_rate": 0.00014745680002477203, "loss": 0.9409, "step": 552 }, { "epoch": 1.37, "grad_norm": 0.17537294328212738, "learning_rate": 0.00014728190953321903, "loss": 1.0021, "step": 553 }, { "epoch": 1.38, "grad_norm": 0.18963244557380676, "learning_rate": 0.00014710683258339536, "loss": 1.0154, "step": 554 }, { "epoch": 1.38, "grad_norm": 0.17940685153007507, "learning_rate": 0.00014693156986572456, "loss": 0.9898, "step": 555 }, { "epoch": 1.38, "grad_norm": 0.19598953425884247, "learning_rate": 0.0001467561220713628, "loss": 1.0479, "step": 556 }, { "epoch": 1.38, "grad_norm": 0.18346156179904938, "learning_rate": 0.00014658048989219614, "loss": 1.0076, "step": 557 }, { "epoch": 1.39, "grad_norm": 0.17553867399692535, "learning_rate": 0.0001464046740208377, "loss": 0.9696, "step": 558 }, { "epoch": 1.39, "grad_norm": 0.1788376122713089, "learning_rate": 0.00014622867515062503, "loss": 0.9788, "step": 559 }, { "epoch": 1.39, "grad_norm": 0.17731797695159912, "learning_rate": 0.00014605249397561736, "loss": 1.003, "step": 560 }, { "epoch": 1.39, "grad_norm": 0.17706608772277832, "learning_rate": 0.00014587613119059284, "loss": 1.0055, "step": 561 }, { "epoch": 1.4, "grad_norm": 0.168448805809021, "learning_rate": 0.00014569958749104575, "loss": 0.9516, "step": 562 }, { "epoch": 1.4, "grad_norm": 0.18675707280635834, "learning_rate": 0.0001455228635731839, "loss": 0.9837, "step": 563 }, { "epoch": 1.4, "grad_norm": 0.17538242042064667, "learning_rate": 0.00014534596013392575, "loss": 1.0367, "step": 564 }, { "epoch": 1.4, "grad_norm": 0.17501141130924225, "learning_rate": 0.00014516887787089774, "loss": 0.9733, "step": 565 }, { "epoch": 1.41, "grad_norm": 0.1874341070652008, "learning_rate": 0.00014499161748243147, "loss": 1.0206, "step": 566 }, { "epoch": 1.41, "grad_norm": 0.1980811208486557, "learning_rate": 0.00014481417966756102, "loss": 1.0289, "step": 567 }, { "epoch": 1.41, "grad_norm": 0.18807095289230347, "learning_rate": 0.0001446365651260201, "loss": 1.0205, "step": 568 }, { "epoch": 1.41, "grad_norm": 0.1855577528476715, "learning_rate": 0.00014445877455823946, "loss": 1.0497, "step": 569 }, { "epoch": 1.42, "grad_norm": 0.18725629150867462, "learning_rate": 0.00014428080866534396, "loss": 1.0326, "step": 570 }, { "epoch": 1.42, "grad_norm": 0.19902606308460236, "learning_rate": 0.0001441026681491498, "loss": 1.0252, "step": 571 }, { "epoch": 1.42, "grad_norm": 0.19441325962543488, "learning_rate": 0.00014392435371216185, "loss": 1.0191, "step": 572 }, { "epoch": 1.42, "grad_norm": 0.18167538940906525, "learning_rate": 0.00014374586605757095, "loss": 1.029, "step": 573 }, { "epoch": 1.43, "grad_norm": 0.1809268742799759, "learning_rate": 0.0001435672058892509, "loss": 0.975, "step": 574 }, { "epoch": 1.43, "grad_norm": 0.18132343888282776, "learning_rate": 0.00014338837391175582, "loss": 0.9688, "step": 575 }, { "epoch": 1.43, "grad_norm": 0.1733206808567047, "learning_rate": 0.00014320937083031748, "loss": 0.958, "step": 576 }, { "epoch": 1.43, "grad_norm": 0.1799648404121399, "learning_rate": 0.00014303019735084226, "loss": 0.9842, "step": 577 }, { "epoch": 1.44, "grad_norm": 0.1771499365568161, "learning_rate": 0.0001428508541799086, "loss": 1.0048, "step": 578 }, { "epoch": 1.44, "grad_norm": 0.1818363070487976, "learning_rate": 0.00014267134202476417, "loss": 1.0374, "step": 579 }, { "epoch": 1.44, "grad_norm": 0.1858426034450531, "learning_rate": 0.0001424916615933229, "loss": 0.9952, "step": 580 }, { "epoch": 1.44, "grad_norm": 0.19056333601474762, "learning_rate": 0.00014231181359416247, "loss": 1.0125, "step": 581 }, { "epoch": 1.45, "grad_norm": 0.179644376039505, "learning_rate": 0.00014213179873652127, "loss": 0.9194, "step": 582 }, { "epoch": 1.45, "grad_norm": 0.177077516913414, "learning_rate": 0.0001419516177302957, "loss": 0.991, "step": 583 }, { "epoch": 1.45, "grad_norm": 0.18390731513500214, "learning_rate": 0.00014177127128603745, "loss": 0.9921, "step": 584 }, { "epoch": 1.45, "grad_norm": 0.1845334768295288, "learning_rate": 0.00014159076011495061, "loss": 0.993, "step": 585 }, { "epoch": 1.46, "grad_norm": 0.1941182017326355, "learning_rate": 0.0001414100849288888, "loss": 0.9864, "step": 586 }, { "epoch": 1.46, "grad_norm": 0.17679093778133392, "learning_rate": 0.00014122924644035249, "loss": 1.0078, "step": 587 }, { "epoch": 1.46, "grad_norm": 0.1847458928823471, "learning_rate": 0.00014104824536248614, "loss": 1.0043, "step": 588 }, { "epoch": 1.46, "grad_norm": 0.1811904013156891, "learning_rate": 0.00014086708240907542, "loss": 0.9493, "step": 589 }, { "epoch": 1.47, "grad_norm": 0.18393242359161377, "learning_rate": 0.00014068575829454436, "loss": 1.0019, "step": 590 }, { "epoch": 1.47, "grad_norm": 0.17711445689201355, "learning_rate": 0.0001405042737339524, "loss": 0.9666, "step": 591 }, { "epoch": 1.47, "grad_norm": 0.18920022249221802, "learning_rate": 0.00014032262944299194, "loss": 0.9579, "step": 592 }, { "epoch": 1.47, "grad_norm": 0.18185077607631683, "learning_rate": 0.00014014082613798503, "loss": 1.0523, "step": 593 }, { "epoch": 1.48, "grad_norm": 0.19337935745716095, "learning_rate": 0.00013995886453588104, "loss": 0.9841, "step": 594 }, { "epoch": 1.48, "grad_norm": 0.1859455108642578, "learning_rate": 0.00013977674535425337, "loss": 1.0389, "step": 595 }, { "epoch": 1.48, "grad_norm": 0.17890392243862152, "learning_rate": 0.00013959446931129704, "loss": 1.0308, "step": 596 }, { "epoch": 1.48, "grad_norm": 0.1741844266653061, "learning_rate": 0.00013941203712582553, "loss": 1.0466, "step": 597 }, { "epoch": 1.49, "grad_norm": 0.19279837608337402, "learning_rate": 0.0001392294495172681, "loss": 0.9952, "step": 598 }, { "epoch": 1.49, "grad_norm": 0.19602486491203308, "learning_rate": 0.00013904670720566698, "loss": 1.0273, "step": 599 }, { "epoch": 1.49, "grad_norm": 0.18000701069831848, "learning_rate": 0.0001388638109116744, "loss": 1.0131, "step": 600 }, { "epoch": 1.49, "eval_loss": 1.080866813659668, "eval_runtime": 81.6407, "eval_samples_per_second": 31.835, "eval_steps_per_second": 31.835, "step": 600 }, { "epoch": 1.49, "grad_norm": 0.18183240294456482, "learning_rate": 0.0001386807613565499, "loss": 0.9962, "step": 601 }, { "epoch": 1.5, "grad_norm": 0.1762516349554062, "learning_rate": 0.00013849755926215735, "loss": 1.0288, "step": 602 }, { "epoch": 1.5, "grad_norm": 0.17683060467243195, "learning_rate": 0.00013831420535096223, "loss": 0.9464, "step": 603 }, { "epoch": 1.5, "grad_norm": 0.1796884983778, "learning_rate": 0.00013813070034602863, "loss": 1.0294, "step": 604 }, { "epoch": 1.5, "grad_norm": 0.1921350210905075, "learning_rate": 0.00013794704497101655, "loss": 1.0216, "step": 605 }, { "epoch": 1.51, "grad_norm": 0.18306772410869598, "learning_rate": 0.00013776323995017898, "loss": 1.0552, "step": 606 }, { "epoch": 1.51, "grad_norm": 0.18202297389507294, "learning_rate": 0.000137579286008359, "loss": 0.9735, "step": 607 }, { "epoch": 1.51, "grad_norm": 0.18103723227977753, "learning_rate": 0.00013739518387098705, "loss": 0.9673, "step": 608 }, { "epoch": 1.51, "grad_norm": 0.17903882265090942, "learning_rate": 0.0001372109342640779, "loss": 0.9405, "step": 609 }, { "epoch": 1.52, "grad_norm": 0.18169891834259033, "learning_rate": 0.0001370265379142279, "loss": 0.9595, "step": 610 }, { "epoch": 1.52, "grad_norm": 0.18569333851337433, "learning_rate": 0.00013684199554861207, "loss": 0.9859, "step": 611 }, { "epoch": 1.52, "grad_norm": 0.18026390671730042, "learning_rate": 0.0001366573078949813, "loss": 0.9804, "step": 612 }, { "epoch": 1.52, "grad_norm": 0.18330590426921844, "learning_rate": 0.00013647247568165938, "loss": 0.9623, "step": 613 }, { "epoch": 1.53, "grad_norm": 0.18787868320941925, "learning_rate": 0.00013628749963754026, "loss": 0.977, "step": 614 }, { "epoch": 1.53, "grad_norm": 0.17502212524414062, "learning_rate": 0.00013610238049208495, "loss": 0.9615, "step": 615 }, { "epoch": 1.53, "grad_norm": 0.18354558944702148, "learning_rate": 0.0001359171189753189, "loss": 0.9493, "step": 616 }, { "epoch": 1.53, "grad_norm": 0.18860042095184326, "learning_rate": 0.00013573171581782897, "loss": 1.0698, "step": 617 }, { "epoch": 1.54, "grad_norm": 0.1900940239429474, "learning_rate": 0.00013554617175076062, "loss": 0.961, "step": 618 }, { "epoch": 1.54, "grad_norm": 0.18823568522930145, "learning_rate": 0.00013536048750581494, "loss": 0.9106, "step": 619 }, { "epoch": 1.54, "grad_norm": 0.18658524751663208, "learning_rate": 0.0001351746638152458, "loss": 0.9161, "step": 620 }, { "epoch": 1.54, "grad_norm": 0.18179596960544586, "learning_rate": 0.00013498870141185712, "loss": 0.9394, "step": 621 }, { "epoch": 1.55, "grad_norm": 0.18801775574684143, "learning_rate": 0.00013480260102899966, "loss": 0.9827, "step": 622 }, { "epoch": 1.55, "grad_norm": 0.18649117648601532, "learning_rate": 0.00013461636340056843, "loss": 0.9565, "step": 623 }, { "epoch": 1.55, "grad_norm": 0.1857774257659912, "learning_rate": 0.0001344299892609996, "loss": 1.0292, "step": 624 }, { "epoch": 1.55, "grad_norm": 0.1910741627216339, "learning_rate": 0.00013424347934526772, "loss": 1.0411, "step": 625 }, { "epoch": 1.56, "grad_norm": 0.19100044667720795, "learning_rate": 0.00013405683438888282, "loss": 1.0071, "step": 626 }, { "epoch": 1.56, "grad_norm": 0.17907825112342834, "learning_rate": 0.00013387005512788733, "loss": 1.0374, "step": 627 }, { "epoch": 1.56, "grad_norm": 0.1795564442873001, "learning_rate": 0.00013368314229885347, "loss": 1.0094, "step": 628 }, { "epoch": 1.56, "grad_norm": 0.17529642581939697, "learning_rate": 0.00013349609663888015, "loss": 0.9316, "step": 629 }, { "epoch": 1.57, "grad_norm": 0.18285749852657318, "learning_rate": 0.00013330891888559002, "loss": 0.9878, "step": 630 }, { "epoch": 1.57, "grad_norm": 0.18477262556552887, "learning_rate": 0.00013312160977712668, "loss": 1.0027, "step": 631 }, { "epoch": 1.57, "grad_norm": 0.1869228482246399, "learning_rate": 0.00013293417005215188, "loss": 1.0269, "step": 632 }, { "epoch": 1.57, "grad_norm": 0.19262288510799408, "learning_rate": 0.00013274660044984224, "loss": 1.0839, "step": 633 }, { "epoch": 1.58, "grad_norm": 0.18182508647441864, "learning_rate": 0.0001325589017098867, "loss": 0.9953, "step": 634 }, { "epoch": 1.58, "grad_norm": 0.21832676231861115, "learning_rate": 0.0001323710745724834, "loss": 1.028, "step": 635 }, { "epoch": 1.58, "grad_norm": 0.18413691222667694, "learning_rate": 0.00013218311977833687, "loss": 1.0081, "step": 636 }, { "epoch": 1.58, "grad_norm": 0.182253897190094, "learning_rate": 0.00013199503806865504, "loss": 0.9492, "step": 637 }, { "epoch": 1.59, "grad_norm": 0.19804389774799347, "learning_rate": 0.0001318068301851463, "loss": 0.9859, "step": 638 }, { "epoch": 1.59, "grad_norm": 0.1846335232257843, "learning_rate": 0.00013161849687001666, "loss": 0.9594, "step": 639 }, { "epoch": 1.59, "grad_norm": 0.18544115126132965, "learning_rate": 0.00013143003886596669, "loss": 1.0116, "step": 640 }, { "epoch": 1.59, "grad_norm": 0.1846534013748169, "learning_rate": 0.00013124145691618884, "loss": 1.0081, "step": 641 }, { "epoch": 1.6, "grad_norm": 0.17868997156620026, "learning_rate": 0.0001310527517643642, "loss": 0.9044, "step": 642 }, { "epoch": 1.6, "grad_norm": 0.18729160726070404, "learning_rate": 0.00013086392415465972, "loss": 0.9888, "step": 643 }, { "epoch": 1.6, "grad_norm": 0.1919986605644226, "learning_rate": 0.00013067497483172538, "loss": 1.0277, "step": 644 }, { "epoch": 1.6, "grad_norm": 0.20795708894729614, "learning_rate": 0.00013048590454069108, "loss": 0.8709, "step": 645 }, { "epoch": 1.61, "grad_norm": 0.19611623883247375, "learning_rate": 0.00013029671402716366, "loss": 0.984, "step": 646 }, { "epoch": 1.61, "grad_norm": 0.19515739381313324, "learning_rate": 0.0001301074040372242, "loss": 0.9985, "step": 647 }, { "epoch": 1.61, "grad_norm": 0.1995517462491989, "learning_rate": 0.00012991797531742492, "loss": 1.034, "step": 648 }, { "epoch": 1.61, "grad_norm": 0.18805646896362305, "learning_rate": 0.00012972842861478618, "loss": 0.9625, "step": 649 }, { "epoch": 1.62, "grad_norm": 0.19192944467067719, "learning_rate": 0.00012953876467679373, "loss": 1.0583, "step": 650 }, { "epoch": 1.62, "grad_norm": 0.19570088386535645, "learning_rate": 0.0001293489842513955, "loss": 0.9634, "step": 651 }, { "epoch": 1.62, "grad_norm": 0.19576574862003326, "learning_rate": 0.00012915908808699893, "loss": 1.0172, "step": 652 }, { "epoch": 1.62, "grad_norm": 0.17955078184604645, "learning_rate": 0.0001289690769324678, "loss": 0.9849, "step": 653 }, { "epoch": 1.63, "grad_norm": 0.18549513816833496, "learning_rate": 0.00012877895153711935, "loss": 0.9527, "step": 654 }, { "epoch": 1.63, "grad_norm": 0.19443288445472717, "learning_rate": 0.0001285887126507214, "loss": 1.0151, "step": 655 }, { "epoch": 1.63, "grad_norm": 0.17947880923748016, "learning_rate": 0.00012839836102348926, "loss": 0.9655, "step": 656 }, { "epoch": 1.63, "grad_norm": 0.18537116050720215, "learning_rate": 0.00012820789740608293, "loss": 0.9429, "step": 657 }, { "epoch": 1.64, "grad_norm": 0.19015100598335266, "learning_rate": 0.00012801732254960388, "loss": 1.0355, "step": 658 }, { "epoch": 1.64, "grad_norm": 0.18511660397052765, "learning_rate": 0.00012782663720559246, "loss": 1.0473, "step": 659 }, { "epoch": 1.64, "grad_norm": 0.18822525441646576, "learning_rate": 0.00012763584212602453, "loss": 0.9671, "step": 660 }, { "epoch": 1.64, "grad_norm": 0.18707570433616638, "learning_rate": 0.0001274449380633089, "loss": 1.0481, "step": 661 }, { "epoch": 1.65, "grad_norm": 0.1918199360370636, "learning_rate": 0.00012725392577028402, "loss": 1.0062, "step": 662 }, { "epoch": 1.65, "grad_norm": 0.19667948782444, "learning_rate": 0.00012706280600021522, "loss": 0.9817, "step": 663 }, { "epoch": 1.65, "grad_norm": 0.1822723001241684, "learning_rate": 0.0001268715795067916, "loss": 0.9716, "step": 664 }, { "epoch": 1.65, "grad_norm": 0.1914030760526657, "learning_rate": 0.00012668024704412317, "loss": 1.0209, "step": 665 }, { "epoch": 1.66, "grad_norm": 0.187057226896286, "learning_rate": 0.00012648880936673787, "loss": 1.0381, "step": 666 }, { "epoch": 1.66, "grad_norm": 0.18619103729724884, "learning_rate": 0.00012629726722957846, "loss": 1.0432, "step": 667 }, { "epoch": 1.66, "grad_norm": 0.19731828570365906, "learning_rate": 0.00012610562138799978, "loss": 1.0611, "step": 668 }, { "epoch": 1.66, "grad_norm": 0.1894959807395935, "learning_rate": 0.00012591387259776551, "loss": 0.9914, "step": 669 }, { "epoch": 1.67, "grad_norm": 0.1772470325231552, "learning_rate": 0.00012572202161504543, "loss": 0.9843, "step": 670 }, { "epoch": 1.67, "grad_norm": 0.18182332813739777, "learning_rate": 0.00012553006919641214, "loss": 0.949, "step": 671 }, { "epoch": 1.67, "grad_norm": 0.1846974790096283, "learning_rate": 0.00012533801609883842, "loss": 0.9959, "step": 672 }, { "epoch": 1.68, "grad_norm": 0.18767496943473816, "learning_rate": 0.0001251458630796941, "loss": 0.9466, "step": 673 }, { "epoch": 1.68, "grad_norm": 0.18881787359714508, "learning_rate": 0.00012495361089674285, "loss": 0.9637, "step": 674 }, { "epoch": 1.68, "grad_norm": 0.1902247816324234, "learning_rate": 0.00012476126030813963, "loss": 0.9985, "step": 675 }, { "epoch": 1.68, "grad_norm": 0.18302756547927856, "learning_rate": 0.00012456881207242732, "loss": 0.95, "step": 676 }, { "epoch": 1.69, "grad_norm": 0.18244938552379608, "learning_rate": 0.000124376266948534, "loss": 0.9918, "step": 677 }, { "epoch": 1.69, "grad_norm": 0.19507256150245667, "learning_rate": 0.00012418362569576965, "loss": 1.0055, "step": 678 }, { "epoch": 1.69, "grad_norm": 0.19234226644039154, "learning_rate": 0.0001239908890738235, "loss": 1.0511, "step": 679 }, { "epoch": 1.69, "grad_norm": 0.19556111097335815, "learning_rate": 0.00012379805784276082, "loss": 0.9981, "step": 680 }, { "epoch": 1.7, "grad_norm": 0.19322308897972107, "learning_rate": 0.00012360513276301997, "loss": 0.9603, "step": 681 }, { "epoch": 1.7, "grad_norm": 0.1905602067708969, "learning_rate": 0.0001234121145954094, "loss": 0.9937, "step": 682 }, { "epoch": 1.7, "grad_norm": 0.19340857863426208, "learning_rate": 0.00012321900410110464, "loss": 0.9996, "step": 683 }, { "epoch": 1.7, "grad_norm": 0.181385800242424, "learning_rate": 0.00012302580204164541, "loss": 0.9563, "step": 684 }, { "epoch": 1.71, "grad_norm": 0.19400039315223694, "learning_rate": 0.00012283250917893244, "loss": 1.0732, "step": 685 }, { "epoch": 1.71, "grad_norm": 0.1877606064081192, "learning_rate": 0.0001226391262752245, "loss": 1.0057, "step": 686 }, { "epoch": 1.71, "grad_norm": 0.18977177143096924, "learning_rate": 0.00012244565409313547, "loss": 0.9898, "step": 687 }, { "epoch": 1.71, "grad_norm": 0.19174890220165253, "learning_rate": 0.00012225209339563145, "loss": 1.0449, "step": 688 }, { "epoch": 1.72, "grad_norm": 0.18353353440761566, "learning_rate": 0.0001220584449460274, "loss": 0.9952, "step": 689 }, { "epoch": 1.72, "grad_norm": 0.18639762699604034, "learning_rate": 0.00012186470950798445, "loss": 0.9693, "step": 690 }, { "epoch": 1.72, "grad_norm": 0.1900029480457306, "learning_rate": 0.00012167088784550673, "loss": 0.9574, "step": 691 }, { "epoch": 1.72, "grad_norm": 0.18529686331748962, "learning_rate": 0.00012147698072293842, "loss": 1.0299, "step": 692 }, { "epoch": 1.73, "grad_norm": 0.1907936930656433, "learning_rate": 0.00012128298890496072, "loss": 0.9557, "step": 693 }, { "epoch": 1.73, "grad_norm": 0.1865403652191162, "learning_rate": 0.00012108891315658879, "loss": 0.946, "step": 694 }, { "epoch": 1.73, "grad_norm": 0.18556007742881775, "learning_rate": 0.00012089475424316883, "loss": 1.0129, "step": 695 }, { "epoch": 1.73, "grad_norm": 0.1845078021287918, "learning_rate": 0.00012070051293037492, "loss": 0.9436, "step": 696 }, { "epoch": 1.74, "grad_norm": 0.18208341300487518, "learning_rate": 0.00012050618998420624, "loss": 0.9985, "step": 697 }, { "epoch": 1.74, "grad_norm": 0.19252164661884308, "learning_rate": 0.00012031178617098371, "loss": 1.0147, "step": 698 }, { "epoch": 1.74, "grad_norm": 0.1972821056842804, "learning_rate": 0.00012011730225734723, "loss": 1.0548, "step": 699 }, { "epoch": 1.74, "grad_norm": 0.18477863073349, "learning_rate": 0.00011992273901025269, "loss": 0.9847, "step": 700 }, { "epoch": 1.74, "eval_loss": 1.0762046575546265, "eval_runtime": 81.6492, "eval_samples_per_second": 31.831, "eval_steps_per_second": 31.831, "step": 700 }, { "epoch": 1.75, "grad_norm": 0.19482113420963287, "learning_rate": 0.00011972809719696864, "loss": 0.9685, "step": 701 }, { "epoch": 1.75, "grad_norm": 0.19040922820568085, "learning_rate": 0.0001195333775850736, "loss": 1.0528, "step": 702 }, { "epoch": 1.75, "grad_norm": 0.19116735458374023, "learning_rate": 0.00011933858094245281, "loss": 0.983, "step": 703 }, { "epoch": 1.75, "grad_norm": 0.17496508359909058, "learning_rate": 0.00011914370803729533, "loss": 0.936, "step": 704 }, { "epoch": 1.76, "grad_norm": 0.1774684637784958, "learning_rate": 0.00011894875963809098, "loss": 1.001, "step": 705 }, { "epoch": 1.76, "grad_norm": 0.1926085203886032, "learning_rate": 0.00011875373651362727, "loss": 1.0406, "step": 706 }, { "epoch": 1.76, "grad_norm": 0.18313874304294586, "learning_rate": 0.00011855863943298631, "loss": 0.9501, "step": 707 }, { "epoch": 1.76, "grad_norm": 0.18082866072654724, "learning_rate": 0.00011836346916554205, "loss": 0.9773, "step": 708 }, { "epoch": 1.77, "grad_norm": 0.1892704963684082, "learning_rate": 0.00011816822648095687, "loss": 0.9879, "step": 709 }, { "epoch": 1.77, "grad_norm": 0.1928127110004425, "learning_rate": 0.00011797291214917881, "loss": 1.0106, "step": 710 }, { "epoch": 1.77, "grad_norm": 0.191785529255867, "learning_rate": 0.00011777752694043849, "loss": 0.9633, "step": 711 }, { "epoch": 1.77, "grad_norm": 0.18815581500530243, "learning_rate": 0.00011758207162524598, "loss": 1.0087, "step": 712 }, { "epoch": 1.78, "grad_norm": 0.19140002131462097, "learning_rate": 0.00011738654697438782, "loss": 1.022, "step": 713 }, { "epoch": 1.78, "grad_norm": 0.18412011861801147, "learning_rate": 0.00011719095375892396, "loss": 0.9177, "step": 714 }, { "epoch": 1.78, "grad_norm": 0.19803179800510406, "learning_rate": 0.00011699529275018484, "loss": 1.056, "step": 715 }, { "epoch": 1.78, "grad_norm": 0.18873557448387146, "learning_rate": 0.00011679956471976814, "loss": 0.9664, "step": 716 }, { "epoch": 1.79, "grad_norm": 0.1954958438873291, "learning_rate": 0.00011660377043953588, "loss": 0.9837, "step": 717 }, { "epoch": 1.79, "grad_norm": 0.1911032795906067, "learning_rate": 0.0001164079106816113, "loss": 1.0281, "step": 718 }, { "epoch": 1.79, "grad_norm": 0.19415371119976044, "learning_rate": 0.00011621198621837593, "loss": 0.9596, "step": 719 }, { "epoch": 1.79, "grad_norm": 0.1977900266647339, "learning_rate": 0.00011601599782246646, "loss": 0.9503, "step": 720 }, { "epoch": 1.8, "grad_norm": 0.1874951422214508, "learning_rate": 0.0001158199462667716, "loss": 1.0024, "step": 721 }, { "epoch": 1.8, "grad_norm": 0.1944780796766281, "learning_rate": 0.00011562383232442926, "loss": 0.9805, "step": 722 }, { "epoch": 1.8, "grad_norm": 0.18960687518119812, "learning_rate": 0.00011542765676882325, "loss": 1.0155, "step": 723 }, { "epoch": 1.8, "grad_norm": 0.1834162324666977, "learning_rate": 0.0001152314203735805, "loss": 0.9558, "step": 724 }, { "epoch": 1.81, "grad_norm": 0.1892080008983612, "learning_rate": 0.00011503512391256776, "loss": 1.0202, "step": 725 }, { "epoch": 1.81, "grad_norm": 0.19285555183887482, "learning_rate": 0.00011483876815988867, "loss": 0.986, "step": 726 }, { "epoch": 1.81, "grad_norm": 0.1912676841020584, "learning_rate": 0.00011464235388988067, "loss": 1.0215, "step": 727 }, { "epoch": 1.81, "grad_norm": 0.18774007260799408, "learning_rate": 0.00011444588187711205, "loss": 0.9133, "step": 728 }, { "epoch": 1.82, "grad_norm": 0.18041113018989563, "learning_rate": 0.0001142493528963787, "loss": 0.9651, "step": 729 }, { "epoch": 1.82, "grad_norm": 0.18634317815303802, "learning_rate": 0.00011405276772270126, "loss": 1.0167, "step": 730 }, { "epoch": 1.82, "grad_norm": 0.18424159288406372, "learning_rate": 0.0001138561271313219, "loss": 0.9602, "step": 731 }, { "epoch": 1.82, "grad_norm": 0.18384714424610138, "learning_rate": 0.0001136594318977014, "loss": 0.9298, "step": 732 }, { "epoch": 1.83, "grad_norm": 0.19117358326911926, "learning_rate": 0.00011346268279751595, "loss": 0.9123, "step": 733 }, { "epoch": 1.83, "grad_norm": 0.18405017256736755, "learning_rate": 0.0001132658806066542, "loss": 0.9986, "step": 734 }, { "epoch": 1.83, "grad_norm": 0.1914985477924347, "learning_rate": 0.00011306902610121419, "loss": 0.9518, "step": 735 }, { "epoch": 1.83, "grad_norm": 0.1904747486114502, "learning_rate": 0.00011287212005750024, "loss": 0.9891, "step": 736 }, { "epoch": 1.84, "grad_norm": 0.1916552037000656, "learning_rate": 0.00011267516325201985, "loss": 0.9616, "step": 737 }, { "epoch": 1.84, "grad_norm": 0.18625429272651672, "learning_rate": 0.00011247815646148087, "loss": 0.9592, "step": 738 }, { "epoch": 1.84, "grad_norm": 0.1944790482521057, "learning_rate": 0.00011228110046278808, "loss": 0.9469, "step": 739 }, { "epoch": 1.84, "grad_norm": 0.20122645795345306, "learning_rate": 0.00011208399603304047, "loss": 0.9849, "step": 740 }, { "epoch": 1.85, "grad_norm": 0.19067947566509247, "learning_rate": 0.00011188684394952789, "loss": 1.0099, "step": 741 }, { "epoch": 1.85, "grad_norm": 0.18489985167980194, "learning_rate": 0.00011168964498972818, "loss": 0.9669, "step": 742 }, { "epoch": 1.85, "grad_norm": 0.1892281025648117, "learning_rate": 0.00011149239993130403, "loss": 0.9674, "step": 743 }, { "epoch": 1.85, "grad_norm": 0.1811356395483017, "learning_rate": 0.00011129510955209996, "loss": 1.0119, "step": 744 }, { "epoch": 1.86, "grad_norm": 0.19581769406795502, "learning_rate": 0.00011109777463013915, "loss": 0.9978, "step": 745 }, { "epoch": 1.86, "grad_norm": 0.19298292696475983, "learning_rate": 0.00011090039594362045, "loss": 0.9971, "step": 746 }, { "epoch": 1.86, "grad_norm": 0.1880626529455185, "learning_rate": 0.00011070297427091534, "loss": 1.0108, "step": 747 }, { "epoch": 1.86, "grad_norm": 0.1833215206861496, "learning_rate": 0.00011050551039056479, "loss": 0.9353, "step": 748 }, { "epoch": 1.87, "grad_norm": 0.18261606991291046, "learning_rate": 0.0001103080050812762, "loss": 0.9607, "step": 749 }, { "epoch": 1.87, "grad_norm": 0.1790233999490738, "learning_rate": 0.00011011045912192035, "loss": 0.9579, "step": 750 }, { "epoch": 1.87, "grad_norm": 0.20333704352378845, "learning_rate": 0.00010991287329152838, "loss": 1.0136, "step": 751 }, { "epoch": 1.87, "grad_norm": 0.18839126825332642, "learning_rate": 0.0001097152483692886, "loss": 0.992, "step": 752 }, { "epoch": 1.88, "grad_norm": 0.1932857632637024, "learning_rate": 0.00010951758513454351, "loss": 0.9098, "step": 753 }, { "epoch": 1.88, "grad_norm": 0.19326822459697723, "learning_rate": 0.00010931988436678666, "loss": 0.9718, "step": 754 }, { "epoch": 1.88, "grad_norm": 0.19290626049041748, "learning_rate": 0.00010912214684565967, "loss": 0.9569, "step": 755 }, { "epoch": 1.88, "grad_norm": 0.1982078105211258, "learning_rate": 0.00010892437335094912, "loss": 0.929, "step": 756 }, { "epoch": 1.89, "grad_norm": 0.18881501257419586, "learning_rate": 0.00010872656466258328, "loss": 1.0139, "step": 757 }, { "epoch": 1.89, "grad_norm": 0.18985024094581604, "learning_rate": 0.00010852872156062946, "loss": 0.9946, "step": 758 }, { "epoch": 1.89, "grad_norm": 0.19749155640602112, "learning_rate": 0.00010833084482529048, "loss": 1.0356, "step": 759 }, { "epoch": 1.89, "grad_norm": 0.19211384654045105, "learning_rate": 0.00010813293523690191, "loss": 0.9779, "step": 760 }, { "epoch": 1.9, "grad_norm": 0.19262412190437317, "learning_rate": 0.0001079349935759288, "loss": 0.9665, "step": 761 }, { "epoch": 1.9, "grad_norm": 0.18871724605560303, "learning_rate": 0.00010773702062296273, "loss": 0.9511, "step": 762 }, { "epoch": 1.9, "grad_norm": 0.18119603395462036, "learning_rate": 0.00010753901715871866, "loss": 0.9482, "step": 763 }, { "epoch": 1.9, "grad_norm": 0.18349209427833557, "learning_rate": 0.00010734098396403192, "loss": 0.9386, "step": 764 }, { "epoch": 1.91, "grad_norm": 0.19208337366580963, "learning_rate": 0.00010714292181985498, "loss": 0.9473, "step": 765 }, { "epoch": 1.91, "grad_norm": 0.18588630855083466, "learning_rate": 0.00010694483150725458, "loss": 1.0278, "step": 766 }, { "epoch": 1.91, "grad_norm": 0.18634718656539917, "learning_rate": 0.00010674671380740851, "loss": 1.0387, "step": 767 }, { "epoch": 1.91, "grad_norm": 0.18514113128185272, "learning_rate": 0.00010654856950160253, "loss": 0.9557, "step": 768 }, { "epoch": 1.92, "grad_norm": 0.18085001409053802, "learning_rate": 0.00010635039937122733, "loss": 0.9689, "step": 769 }, { "epoch": 1.92, "grad_norm": 0.18852289021015167, "learning_rate": 0.00010615220419777548, "loss": 1.0444, "step": 770 }, { "epoch": 1.92, "grad_norm": 0.19260498881340027, "learning_rate": 0.00010595398476283827, "loss": 0.9204, "step": 771 }, { "epoch": 1.92, "grad_norm": 0.19677571952342987, "learning_rate": 0.00010575574184810269, "loss": 1.0183, "step": 772 }, { "epoch": 1.93, "grad_norm": 0.19709721207618713, "learning_rate": 0.00010555747623534831, "loss": 1.011, "step": 773 }, { "epoch": 1.93, "grad_norm": 0.18773804605007172, "learning_rate": 0.0001053591887064442, "loss": 0.9834, "step": 774 }, { "epoch": 1.93, "grad_norm": 0.19036594033241272, "learning_rate": 0.0001051608800433459, "loss": 0.9657, "step": 775 }, { "epoch": 1.93, "grad_norm": 0.1866806596517563, "learning_rate": 0.00010496255102809223, "loss": 0.9609, "step": 776 }, { "epoch": 1.94, "grad_norm": 0.1847977638244629, "learning_rate": 0.00010476420244280232, "loss": 0.9814, "step": 777 }, { "epoch": 1.94, "grad_norm": 0.19136272370815277, "learning_rate": 0.00010456583506967248, "loss": 1.0256, "step": 778 }, { "epoch": 1.94, "grad_norm": 0.22890682518482208, "learning_rate": 0.00010436744969097306, "loss": 0.9979, "step": 779 }, { "epoch": 1.94, "grad_norm": 0.19637508690357208, "learning_rate": 0.00010416904708904548, "loss": 0.9841, "step": 780 }, { "epoch": 1.95, "grad_norm": 0.1934499442577362, "learning_rate": 0.000103970628046299, "loss": 0.9251, "step": 781 }, { "epoch": 1.95, "grad_norm": 0.1859968602657318, "learning_rate": 0.00010377219334520783, "loss": 0.9702, "step": 782 }, { "epoch": 1.95, "grad_norm": 0.18776066601276398, "learning_rate": 0.00010357374376830775, "loss": 0.95, "step": 783 }, { "epoch": 1.95, "grad_norm": 0.19182752072811127, "learning_rate": 0.00010337528009819344, "loss": 0.9476, "step": 784 }, { "epoch": 1.96, "grad_norm": 0.19188746809959412, "learning_rate": 0.00010317680311751496, "loss": 1.0165, "step": 785 }, { "epoch": 1.96, "grad_norm": 0.18225421011447906, "learning_rate": 0.00010297831360897492, "loss": 0.9593, "step": 786 }, { "epoch": 1.96, "grad_norm": 0.1944630891084671, "learning_rate": 0.00010277981235532541, "loss": 0.9439, "step": 787 }, { "epoch": 1.96, "grad_norm": 0.1944238543510437, "learning_rate": 0.00010258130013936474, "loss": 1.0166, "step": 788 }, { "epoch": 1.97, "grad_norm": 0.18848265707492828, "learning_rate": 0.00010238277774393448, "loss": 0.9808, "step": 789 }, { "epoch": 1.97, "grad_norm": 0.1884046196937561, "learning_rate": 0.00010218424595191631, "loss": 1.0332, "step": 790 }, { "epoch": 1.97, "grad_norm": 0.1906522959470749, "learning_rate": 0.00010198570554622909, "loss": 0.9361, "step": 791 }, { "epoch": 1.97, "grad_norm": 0.1847391128540039, "learning_rate": 0.00010178715730982549, "loss": 0.9522, "step": 792 }, { "epoch": 1.98, "grad_norm": 0.18664193153381348, "learning_rate": 0.00010158860202568916, "loss": 0.9834, "step": 793 }, { "epoch": 1.98, "grad_norm": 0.19117935001850128, "learning_rate": 0.00010139004047683151, "loss": 0.9931, "step": 794 }, { "epoch": 1.98, "grad_norm": 0.1847536265850067, "learning_rate": 0.0001011914734462887, "loss": 1.0131, "step": 795 }, { "epoch": 1.98, "grad_norm": 0.18716172873973846, "learning_rate": 0.00010099290171711841, "loss": 0.948, "step": 796 }, { "epoch": 1.99, "grad_norm": 0.18498627841472626, "learning_rate": 0.00010079432607239692, "loss": 0.9776, "step": 797 }, { "epoch": 1.99, "grad_norm": 0.18583077192306519, "learning_rate": 0.00010059574729521595, "loss": 0.9272, "step": 798 }, { "epoch": 1.99, "grad_norm": 0.18982268869876862, "learning_rate": 0.00010039716616867957, "loss": 1.0097, "step": 799 }, { "epoch": 1.99, "grad_norm": 0.17928728461265564, "learning_rate": 0.0001001985834759011, "loss": 0.9584, "step": 800 }, { "epoch": 1.99, "eval_loss": 1.0697418451309204, "eval_runtime": 81.6952, "eval_samples_per_second": 31.813, "eval_steps_per_second": 31.813, "step": 800 }, { "epoch": 2.0, "grad_norm": 0.18747156858444214, "learning_rate": 0.0001, "loss": 0.9594, "step": 801 }, { "epoch": 2.0, "grad_norm": 0.19288472831249237, "learning_rate": 9.980141652409895e-05, "loss": 0.9338, "step": 802 }, { "epoch": 2.0, "grad_norm": 0.18970318138599396, "learning_rate": 9.960283383132045e-05, "loss": 0.9898, "step": 803 }, { "epoch": 2.0, "grad_norm": 0.1925521343946457, "learning_rate": 9.940425270478407e-05, "loss": 0.9461, "step": 804 }, { "epoch": 2.01, "grad_norm": 0.18187780678272247, "learning_rate": 9.920567392760312e-05, "loss": 0.944, "step": 805 }, { "epoch": 2.01, "grad_norm": 0.19572007656097412, "learning_rate": 9.900709828288164e-05, "loss": 0.9885, "step": 806 }, { "epoch": 2.01, "grad_norm": 0.1877673715353012, "learning_rate": 9.880852655371134e-05, "loss": 0.9392, "step": 807 }, { "epoch": 2.0, "grad_norm": 0.1854715347290039, "learning_rate": 9.860995952316851e-05, "loss": 1.0065, "step": 808 }, { "epoch": 2.0, "grad_norm": 0.18235796689987183, "learning_rate": 9.841139797431087e-05, "loss": 0.9312, "step": 809 }, { "epoch": 2.01, "grad_norm": 0.17826877534389496, "learning_rate": 9.821284269017455e-05, "loss": 0.8701, "step": 810 }, { "epoch": 2.01, "grad_norm": 0.18416720628738403, "learning_rate": 9.801429445377094e-05, "loss": 0.8408, "step": 811 }, { "epoch": 2.01, "grad_norm": 0.17920765280723572, "learning_rate": 9.781575404808371e-05, "loss": 0.8983, "step": 812 }, { "epoch": 2.01, "grad_norm": 0.183628648519516, "learning_rate": 9.761722225606557e-05, "loss": 0.8709, "step": 813 }, { "epoch": 2.02, "grad_norm": 0.18957631289958954, "learning_rate": 9.741869986063526e-05, "loss": 0.8804, "step": 814 }, { "epoch": 2.02, "grad_norm": 0.2053278684616089, "learning_rate": 9.722018764467461e-05, "loss": 0.8398, "step": 815 }, { "epoch": 2.02, "grad_norm": 0.19788406789302826, "learning_rate": 9.702168639102509e-05, "loss": 0.9097, "step": 816 }, { "epoch": 2.02, "grad_norm": 0.20031650364398956, "learning_rate": 9.682319688248509e-05, "loss": 0.8645, "step": 817 }, { "epoch": 2.03, "grad_norm": 0.19214944541454315, "learning_rate": 9.662471990180657e-05, "loss": 0.8217, "step": 818 }, { "epoch": 2.03, "grad_norm": 0.2092873752117157, "learning_rate": 9.642625623169226e-05, "loss": 0.8243, "step": 819 }, { "epoch": 2.03, "grad_norm": 0.2160460352897644, "learning_rate": 9.622780665479222e-05, "loss": 0.8859, "step": 820 }, { "epoch": 2.03, "grad_norm": 0.2088296115398407, "learning_rate": 9.602937195370099e-05, "loss": 0.8526, "step": 821 }, { "epoch": 2.04, "grad_norm": 0.2098212093114853, "learning_rate": 9.583095291095453e-05, "loss": 0.8604, "step": 822 }, { "epoch": 2.04, "grad_norm": 0.21992380917072296, "learning_rate": 9.563255030902697e-05, "loss": 0.9074, "step": 823 }, { "epoch": 2.04, "grad_norm": 0.21225717663764954, "learning_rate": 9.543416493032757e-05, "loss": 0.8701, "step": 824 }, { "epoch": 2.04, "grad_norm": 0.214600071310997, "learning_rate": 9.523579755719769e-05, "loss": 0.8925, "step": 825 }, { "epoch": 2.05, "grad_norm": 0.21339939534664154, "learning_rate": 9.503744897190778e-05, "loss": 0.9007, "step": 826 }, { "epoch": 2.05, "grad_norm": 0.20003697276115417, "learning_rate": 9.483911995665414e-05, "loss": 0.8921, "step": 827 }, { "epoch": 2.05, "grad_norm": 0.20841309428215027, "learning_rate": 9.464081129355586e-05, "loss": 0.9281, "step": 828 }, { "epoch": 2.05, "grad_norm": 0.21467553079128265, "learning_rate": 9.444252376465171e-05, "loss": 0.8974, "step": 829 }, { "epoch": 2.06, "grad_norm": 0.20813807845115662, "learning_rate": 9.424425815189733e-05, "loss": 0.8384, "step": 830 }, { "epoch": 2.06, "grad_norm": 0.2062835544347763, "learning_rate": 9.404601523716175e-05, "loss": 0.89, "step": 831 }, { "epoch": 2.06, "grad_norm": 0.21031899750232697, "learning_rate": 9.384779580222453e-05, "loss": 0.8344, "step": 832 }, { "epoch": 2.06, "grad_norm": 0.2135552167892456, "learning_rate": 9.364960062877268e-05, "loss": 0.875, "step": 833 }, { "epoch": 2.07, "grad_norm": 0.21999678015708923, "learning_rate": 9.345143049839749e-05, "loss": 0.8554, "step": 834 }, { "epoch": 2.07, "grad_norm": 0.21540965139865875, "learning_rate": 9.325328619259151e-05, "loss": 0.8411, "step": 835 }, { "epoch": 2.07, "grad_norm": 0.21845032274723053, "learning_rate": 9.305516849274541e-05, "loss": 0.8716, "step": 836 }, { "epoch": 2.07, "grad_norm": 0.21824797987937927, "learning_rate": 9.285707818014502e-05, "loss": 0.8517, "step": 837 }, { "epoch": 2.08, "grad_norm": 0.20518356561660767, "learning_rate": 9.265901603596811e-05, "loss": 0.8687, "step": 838 }, { "epoch": 2.08, "grad_norm": 0.22265198826789856, "learning_rate": 9.246098284128133e-05, "loss": 0.8336, "step": 839 }, { "epoch": 2.08, "grad_norm": 0.21480637788772583, "learning_rate": 9.226297937703728e-05, "loss": 0.8317, "step": 840 }, { "epoch": 2.08, "grad_norm": 0.22485694289207458, "learning_rate": 9.206500642407123e-05, "loss": 0.9148, "step": 841 }, { "epoch": 2.09, "grad_norm": 0.23164290189743042, "learning_rate": 9.186706476309812e-05, "loss": 0.9167, "step": 842 }, { "epoch": 2.09, "grad_norm": 0.21595808863639832, "learning_rate": 9.166915517470953e-05, "loss": 0.8332, "step": 843 }, { "epoch": 2.09, "grad_norm": 0.2094314992427826, "learning_rate": 9.147127843937055e-05, "loss": 0.9095, "step": 844 }, { "epoch": 2.09, "grad_norm": 0.20858456194400787, "learning_rate": 9.127343533741673e-05, "loss": 0.8165, "step": 845 }, { "epoch": 2.1, "grad_norm": 0.2048875093460083, "learning_rate": 9.107562664905093e-05, "loss": 0.8217, "step": 846 }, { "epoch": 2.1, "grad_norm": 0.22322624921798706, "learning_rate": 9.087785315434034e-05, "loss": 0.8802, "step": 847 }, { "epoch": 2.1, "grad_norm": 0.21676169335842133, "learning_rate": 9.068011563321336e-05, "loss": 0.9194, "step": 848 }, { "epoch": 2.1, "grad_norm": 0.21484245359897614, "learning_rate": 9.048241486545653e-05, "loss": 0.8414, "step": 849 }, { "epoch": 2.11, "grad_norm": 0.21517163515090942, "learning_rate": 9.028475163071141e-05, "loss": 0.8892, "step": 850 }, { "epoch": 2.11, "grad_norm": 0.21394751965999603, "learning_rate": 9.008712670847164e-05, "loss": 0.8815, "step": 851 }, { "epoch": 2.11, "grad_norm": 0.22271205484867096, "learning_rate": 8.988954087807968e-05, "loss": 0.8445, "step": 852 }, { "epoch": 2.11, "grad_norm": 0.21329210698604584, "learning_rate": 8.969199491872384e-05, "loss": 0.8545, "step": 853 }, { "epoch": 2.12, "grad_norm": 0.21919883787631989, "learning_rate": 8.949448960943524e-05, "loss": 0.894, "step": 854 }, { "epoch": 2.12, "grad_norm": 0.22663602232933044, "learning_rate": 8.929702572908468e-05, "loss": 0.9052, "step": 855 }, { "epoch": 2.12, "grad_norm": 0.22025009989738464, "learning_rate": 8.909960405637958e-05, "loss": 0.8843, "step": 856 }, { "epoch": 2.12, "grad_norm": 0.2537056505680084, "learning_rate": 8.890222536986085e-05, "loss": 0.8879, "step": 857 }, { "epoch": 2.13, "grad_norm": 0.2221290022134781, "learning_rate": 8.870489044790006e-05, "loss": 0.9292, "step": 858 }, { "epoch": 2.13, "grad_norm": 0.2207321673631668, "learning_rate": 8.8507600068696e-05, "loss": 0.8372, "step": 859 }, { "epoch": 2.13, "grad_norm": 0.22196346521377563, "learning_rate": 8.831035501027186e-05, "loss": 0.9012, "step": 860 }, { "epoch": 2.13, "grad_norm": 0.22689001262187958, "learning_rate": 8.811315605047212e-05, "loss": 0.8297, "step": 861 }, { "epoch": 2.14, "grad_norm": 0.22168859839439392, "learning_rate": 8.791600396695954e-05, "loss": 0.8708, "step": 862 }, { "epoch": 2.14, "grad_norm": 0.24377846717834473, "learning_rate": 8.771889953721193e-05, "loss": 0.8795, "step": 863 }, { "epoch": 2.14, "grad_norm": 0.21760863065719604, "learning_rate": 8.752184353851916e-05, "loss": 0.9101, "step": 864 }, { "epoch": 2.14, "grad_norm": 0.22010110318660736, "learning_rate": 8.732483674798013e-05, "loss": 0.8257, "step": 865 }, { "epoch": 2.15, "grad_norm": 0.22590592503547668, "learning_rate": 8.712787994249979e-05, "loss": 0.8307, "step": 866 }, { "epoch": 2.15, "grad_norm": 0.23051592707633972, "learning_rate": 8.693097389878584e-05, "loss": 0.8684, "step": 867 }, { "epoch": 2.15, "grad_norm": 0.236350417137146, "learning_rate": 8.673411939334581e-05, "loss": 0.9198, "step": 868 }, { "epoch": 2.15, "grad_norm": 0.21821850538253784, "learning_rate": 8.653731720248406e-05, "loss": 0.827, "step": 869 }, { "epoch": 2.16, "grad_norm": 0.22644954919815063, "learning_rate": 8.634056810229862e-05, "loss": 0.8043, "step": 870 }, { "epoch": 2.16, "grad_norm": 0.23422570526599884, "learning_rate": 8.614387286867814e-05, "loss": 0.8717, "step": 871 }, { "epoch": 2.16, "grad_norm": 0.22765624523162842, "learning_rate": 8.594723227729875e-05, "loss": 0.8231, "step": 872 }, { "epoch": 2.16, "grad_norm": 0.22841888666152954, "learning_rate": 8.575064710362131e-05, "loss": 0.8835, "step": 873 }, { "epoch": 2.17, "grad_norm": 0.2258443683385849, "learning_rate": 8.555411812288798e-05, "loss": 0.9019, "step": 874 }, { "epoch": 2.17, "grad_norm": 0.21592696011066437, "learning_rate": 8.535764611011938e-05, "loss": 0.8337, "step": 875 }, { "epoch": 2.17, "grad_norm": 0.22289562225341797, "learning_rate": 8.516123184011135e-05, "loss": 0.8484, "step": 876 }, { "epoch": 2.17, "grad_norm": 0.2233499139547348, "learning_rate": 8.496487608743225e-05, "loss": 0.8179, "step": 877 }, { "epoch": 2.18, "grad_norm": 0.23362183570861816, "learning_rate": 8.47685796264195e-05, "loss": 0.9047, "step": 878 }, { "epoch": 2.18, "grad_norm": 0.23437048494815826, "learning_rate": 8.457234323117675e-05, "loss": 0.9417, "step": 879 }, { "epoch": 2.18, "grad_norm": 0.22312527894973755, "learning_rate": 8.437616767557077e-05, "loss": 0.8303, "step": 880 }, { "epoch": 2.18, "grad_norm": 0.2268453687429428, "learning_rate": 8.418005373322841e-05, "loss": 0.8701, "step": 881 }, { "epoch": 2.19, "grad_norm": 0.22004881501197815, "learning_rate": 8.398400217753357e-05, "loss": 0.871, "step": 882 }, { "epoch": 2.19, "grad_norm": 0.22518005967140198, "learning_rate": 8.378801378162407e-05, "loss": 0.8727, "step": 883 }, { "epoch": 2.19, "grad_norm": 0.2279236763715744, "learning_rate": 8.359208931838871e-05, "loss": 0.8439, "step": 884 }, { "epoch": 2.19, "grad_norm": 0.22399497032165527, "learning_rate": 8.339622956046417e-05, "loss": 0.8659, "step": 885 }, { "epoch": 2.2, "grad_norm": 0.223841592669487, "learning_rate": 8.320043528023188e-05, "loss": 0.8873, "step": 886 }, { "epoch": 2.2, "grad_norm": 0.21393626928329468, "learning_rate": 8.300470724981517e-05, "loss": 0.819, "step": 887 }, { "epoch": 2.2, "grad_norm": 0.21786653995513916, "learning_rate": 8.280904624107606e-05, "loss": 0.8836, "step": 888 }, { "epoch": 2.2, "grad_norm": 0.22029568254947662, "learning_rate": 8.261345302561223e-05, "loss": 0.8588, "step": 889 }, { "epoch": 2.21, "grad_norm": 0.22604787349700928, "learning_rate": 8.241792837475405e-05, "loss": 0.8482, "step": 890 }, { "epoch": 2.21, "grad_norm": 0.23287111520767212, "learning_rate": 8.222247305956153e-05, "loss": 0.896, "step": 891 }, { "epoch": 2.21, "grad_norm": 0.2144191414117813, "learning_rate": 8.202708785082121e-05, "loss": 0.8683, "step": 892 }, { "epoch": 2.21, "grad_norm": 0.23104549944400787, "learning_rate": 8.183177351904318e-05, "loss": 0.8505, "step": 893 }, { "epoch": 2.22, "grad_norm": 0.2190437614917755, "learning_rate": 8.163653083445799e-05, "loss": 0.8358, "step": 894 }, { "epoch": 2.22, "grad_norm": 0.24076685309410095, "learning_rate": 8.144136056701371e-05, "loss": 0.8632, "step": 895 }, { "epoch": 2.22, "grad_norm": 0.23634831607341766, "learning_rate": 8.124626348637279e-05, "loss": 0.8675, "step": 896 }, { "epoch": 2.22, "grad_norm": 0.24546468257904053, "learning_rate": 8.105124036190901e-05, "loss": 0.9333, "step": 897 }, { "epoch": 2.23, "grad_norm": 0.23648321628570557, "learning_rate": 8.085629196270469e-05, "loss": 0.8685, "step": 898 }, { "epoch": 2.23, "grad_norm": 0.22194604575634003, "learning_rate": 8.066141905754723e-05, "loss": 0.8685, "step": 899 }, { "epoch": 2.23, "grad_norm": 0.22531919181346893, "learning_rate": 8.046662241492645e-05, "loss": 0.8514, "step": 900 }, { "epoch": 2.23, "eval_loss": 1.0966360569000244, "eval_runtime": 81.6957, "eval_samples_per_second": 31.813, "eval_steps_per_second": 31.813, "step": 900 }, { "epoch": 2.23, "grad_norm": 0.2201288640499115, "learning_rate": 8.027190280303137e-05, "loss": 0.8848, "step": 901 }, { "epoch": 2.24, "grad_norm": 0.23173043131828308, "learning_rate": 8.007726098974734e-05, "loss": 0.9054, "step": 902 }, { "epoch": 2.24, "grad_norm": 0.22361530363559723, "learning_rate": 7.988269774265278e-05, "loss": 0.8482, "step": 903 }, { "epoch": 2.24, "grad_norm": 0.227571040391922, "learning_rate": 7.96882138290163e-05, "loss": 0.8468, "step": 904 }, { "epoch": 2.24, "grad_norm": 0.2319188416004181, "learning_rate": 7.949381001579378e-05, "loss": 0.9162, "step": 905 }, { "epoch": 2.25, "grad_norm": 0.21636833250522614, "learning_rate": 7.929948706962508e-05, "loss": 0.8132, "step": 906 }, { "epoch": 2.25, "grad_norm": 0.23730331659317017, "learning_rate": 7.910524575683122e-05, "loss": 0.9081, "step": 907 }, { "epoch": 2.25, "grad_norm": 0.2545856833457947, "learning_rate": 7.891108684341121e-05, "loss": 0.8963, "step": 908 }, { "epoch": 2.25, "grad_norm": 0.22913877665996552, "learning_rate": 7.871701109503929e-05, "loss": 0.865, "step": 909 }, { "epoch": 2.26, "grad_norm": 0.21807163953781128, "learning_rate": 7.852301927706159e-05, "loss": 0.8544, "step": 910 }, { "epoch": 2.26, "grad_norm": 0.22247523069381714, "learning_rate": 7.83291121544933e-05, "loss": 0.8338, "step": 911 }, { "epoch": 2.26, "grad_norm": 0.23280644416809082, "learning_rate": 7.813529049201556e-05, "loss": 0.8507, "step": 912 }, { "epoch": 2.26, "grad_norm": 0.23217426240444183, "learning_rate": 7.794155505397261e-05, "loss": 0.8885, "step": 913 }, { "epoch": 2.27, "grad_norm": 0.23153464496135712, "learning_rate": 7.774790660436858e-05, "loss": 0.8608, "step": 914 }, { "epoch": 2.27, "grad_norm": 0.23713816702365875, "learning_rate": 7.755434590686452e-05, "loss": 0.8647, "step": 915 }, { "epoch": 2.27, "grad_norm": 0.23218050599098206, "learning_rate": 7.736087372477554e-05, "loss": 0.8713, "step": 916 }, { "epoch": 2.27, "grad_norm": 0.2232307493686676, "learning_rate": 7.71674908210676e-05, "loss": 0.857, "step": 917 }, { "epoch": 2.28, "grad_norm": 0.23642230033874512, "learning_rate": 7.69741979583546e-05, "loss": 0.8837, "step": 918 }, { "epoch": 2.28, "grad_norm": 0.22629418969154358, "learning_rate": 7.678099589889534e-05, "loss": 0.8818, "step": 919 }, { "epoch": 2.28, "grad_norm": 0.22235704958438873, "learning_rate": 7.658788540459062e-05, "loss": 0.861, "step": 920 }, { "epoch": 2.28, "grad_norm": 0.23001636564731598, "learning_rate": 7.639486723698006e-05, "loss": 0.9049, "step": 921 }, { "epoch": 2.29, "grad_norm": 0.23310962319374084, "learning_rate": 7.620194215723919e-05, "loss": 0.8823, "step": 922 }, { "epoch": 2.29, "grad_norm": 0.2352224886417389, "learning_rate": 7.600911092617651e-05, "loss": 0.9145, "step": 923 }, { "epoch": 2.29, "grad_norm": 0.23719394207000732, "learning_rate": 7.581637430423037e-05, "loss": 0.8488, "step": 924 }, { "epoch": 2.29, "grad_norm": 0.22607235610485077, "learning_rate": 7.562373305146604e-05, "loss": 0.8455, "step": 925 }, { "epoch": 2.3, "grad_norm": 0.2291191816329956, "learning_rate": 7.543118792757266e-05, "loss": 0.8635, "step": 926 }, { "epoch": 2.3, "grad_norm": 0.23770484328269958, "learning_rate": 7.523873969186039e-05, "loss": 0.8383, "step": 927 }, { "epoch": 2.3, "grad_norm": 0.24489553272724152, "learning_rate": 7.504638910325717e-05, "loss": 0.9295, "step": 928 }, { "epoch": 2.3, "grad_norm": 0.24724024534225464, "learning_rate": 7.485413692030596e-05, "loss": 0.8608, "step": 929 }, { "epoch": 2.31, "grad_norm": 0.2722989320755005, "learning_rate": 7.466198390116158e-05, "loss": 0.8665, "step": 930 }, { "epoch": 2.31, "grad_norm": 0.23387101292610168, "learning_rate": 7.446993080358789e-05, "loss": 0.8605, "step": 931 }, { "epoch": 2.31, "grad_norm": 0.2289327085018158, "learning_rate": 7.427797838495463e-05, "loss": 0.879, "step": 932 }, { "epoch": 2.31, "grad_norm": 0.22585996985435486, "learning_rate": 7.408612740223448e-05, "loss": 0.8726, "step": 933 }, { "epoch": 2.32, "grad_norm": 0.23639705777168274, "learning_rate": 7.389437861200024e-05, "loss": 0.8741, "step": 934 }, { "epoch": 2.32, "grad_norm": 0.23411568999290466, "learning_rate": 7.370273277042156e-05, "loss": 0.8082, "step": 935 }, { "epoch": 2.32, "grad_norm": 0.22799403965473175, "learning_rate": 7.35111906332622e-05, "loss": 0.9161, "step": 936 }, { "epoch": 2.32, "grad_norm": 0.23831185698509216, "learning_rate": 7.331975295587687e-05, "loss": 0.876, "step": 937 }, { "epoch": 2.33, "grad_norm": 0.22817480564117432, "learning_rate": 7.312842049320844e-05, "loss": 0.921, "step": 938 }, { "epoch": 2.33, "grad_norm": 0.24017876386642456, "learning_rate": 7.293719399978482e-05, "loss": 0.9114, "step": 939 }, { "epoch": 2.33, "grad_norm": 0.22485961019992828, "learning_rate": 7.2746074229716e-05, "loss": 0.842, "step": 940 }, { "epoch": 2.33, "grad_norm": 0.23912273347377777, "learning_rate": 7.25550619366911e-05, "loss": 0.8769, "step": 941 }, { "epoch": 2.34, "grad_norm": 0.23127301037311554, "learning_rate": 7.236415787397548e-05, "loss": 0.8836, "step": 942 }, { "epoch": 2.34, "grad_norm": 0.22729326784610748, "learning_rate": 7.217336279440761e-05, "loss": 0.8359, "step": 943 }, { "epoch": 2.34, "grad_norm": 0.24395537376403809, "learning_rate": 7.198267745039612e-05, "loss": 0.9073, "step": 944 }, { "epoch": 2.34, "grad_norm": 0.22521136701107025, "learning_rate": 7.179210259391709e-05, "loss": 0.8923, "step": 945 }, { "epoch": 2.35, "grad_norm": 0.21926188468933105, "learning_rate": 7.160163897651075e-05, "loss": 0.8695, "step": 946 }, { "epoch": 2.35, "grad_norm": 0.23410847783088684, "learning_rate": 7.141128734927863e-05, "loss": 0.8842, "step": 947 }, { "epoch": 2.35, "grad_norm": 0.23125596344470978, "learning_rate": 7.122104846288064e-05, "loss": 0.7969, "step": 948 }, { "epoch": 2.35, "grad_norm": 0.23090513050556183, "learning_rate": 7.103092306753222e-05, "loss": 0.8205, "step": 949 }, { "epoch": 2.36, "grad_norm": 0.23909111320972443, "learning_rate": 7.08409119130011e-05, "loss": 0.9612, "step": 950 }, { "epoch": 2.36, "grad_norm": 0.22639630734920502, "learning_rate": 7.065101574860449e-05, "loss": 0.8638, "step": 951 }, { "epoch": 2.36, "grad_norm": 0.240586057305336, "learning_rate": 7.04612353232063e-05, "loss": 0.8854, "step": 952 }, { "epoch": 2.36, "grad_norm": 0.22903908789157867, "learning_rate": 7.027157138521383e-05, "loss": 0.8789, "step": 953 }, { "epoch": 2.37, "grad_norm": 0.23088420927524567, "learning_rate": 7.008202468257514e-05, "loss": 0.8621, "step": 954 }, { "epoch": 2.37, "grad_norm": 0.23314878344535828, "learning_rate": 6.989259596277582e-05, "loss": 0.836, "step": 955 }, { "epoch": 2.37, "grad_norm": 0.23400847613811493, "learning_rate": 6.970328597283637e-05, "loss": 0.8466, "step": 956 }, { "epoch": 2.37, "grad_norm": 0.24101755023002625, "learning_rate": 6.951409545930895e-05, "loss": 0.8814, "step": 957 }, { "epoch": 2.38, "grad_norm": 0.22932396829128265, "learning_rate": 6.932502516827461e-05, "loss": 0.89, "step": 958 }, { "epoch": 2.38, "grad_norm": 0.23362137377262115, "learning_rate": 6.913607584534026e-05, "loss": 0.8795, "step": 959 }, { "epoch": 2.38, "grad_norm": 0.23562505841255188, "learning_rate": 6.894724823563583e-05, "loss": 0.8497, "step": 960 }, { "epoch": 2.38, "grad_norm": 0.24299654364585876, "learning_rate": 6.875854308381118e-05, "loss": 0.8842, "step": 961 }, { "epoch": 2.39, "grad_norm": 0.2539837062358856, "learning_rate": 6.85699611340333e-05, "loss": 0.881, "step": 962 }, { "epoch": 2.39, "grad_norm": 0.22038055956363678, "learning_rate": 6.838150312998338e-05, "loss": 0.7901, "step": 963 }, { "epoch": 2.39, "grad_norm": 0.24034848809242249, "learning_rate": 6.819316981485372e-05, "loss": 0.7992, "step": 964 }, { "epoch": 2.39, "grad_norm": 0.23470063507556915, "learning_rate": 6.800496193134498e-05, "loss": 0.8451, "step": 965 }, { "epoch": 2.4, "grad_norm": 0.2265550196170807, "learning_rate": 6.781688022166311e-05, "loss": 0.8209, "step": 966 }, { "epoch": 2.4, "grad_norm": 0.22969329357147217, "learning_rate": 6.76289254275166e-05, "loss": 0.7919, "step": 967 }, { "epoch": 2.4, "grad_norm": 0.21824900805950165, "learning_rate": 6.744109829011332e-05, "loss": 0.7832, "step": 968 }, { "epoch": 2.4, "grad_norm": 0.23124177753925323, "learning_rate": 6.725339955015777e-05, "loss": 0.8228, "step": 969 }, { "epoch": 2.41, "grad_norm": 0.2427556961774826, "learning_rate": 6.706582994784814e-05, "loss": 0.8742, "step": 970 }, { "epoch": 2.41, "grad_norm": 0.2298479974269867, "learning_rate": 6.687839022287332e-05, "loss": 0.8546, "step": 971 }, { "epoch": 2.41, "grad_norm": 0.23871837556362152, "learning_rate": 6.669108111441003e-05, "loss": 0.9039, "step": 972 }, { "epoch": 2.41, "grad_norm": 0.2351294755935669, "learning_rate": 6.650390336111989e-05, "loss": 0.8459, "step": 973 }, { "epoch": 2.42, "grad_norm": 0.22315853834152222, "learning_rate": 6.631685770114654e-05, "loss": 0.8429, "step": 974 }, { "epoch": 2.42, "grad_norm": 0.23592588305473328, "learning_rate": 6.61299448721127e-05, "loss": 0.858, "step": 975 }, { "epoch": 2.42, "grad_norm": 0.23219649493694305, "learning_rate": 6.594316561111724e-05, "loss": 0.8142, "step": 976 }, { "epoch": 2.42, "grad_norm": 0.2380094975233078, "learning_rate": 6.57565206547323e-05, "loss": 0.8822, "step": 977 }, { "epoch": 2.43, "grad_norm": 0.2358466237783432, "learning_rate": 6.557001073900044e-05, "loss": 0.8647, "step": 978 }, { "epoch": 2.43, "grad_norm": 0.2390095740556717, "learning_rate": 6.538363659943162e-05, "loss": 0.8555, "step": 979 }, { "epoch": 2.43, "grad_norm": 0.23388570547103882, "learning_rate": 6.519739897100034e-05, "loss": 0.8834, "step": 980 }, { "epoch": 2.43, "grad_norm": 0.22735805809497833, "learning_rate": 6.50112985881429e-05, "loss": 0.8118, "step": 981 }, { "epoch": 2.44, "grad_norm": 0.23097509145736694, "learning_rate": 6.482533618475422e-05, "loss": 0.8806, "step": 982 }, { "epoch": 2.44, "grad_norm": 0.24261382222175598, "learning_rate": 6.46395124941851e-05, "loss": 0.8686, "step": 983 }, { "epoch": 2.44, "grad_norm": 0.24292969703674316, "learning_rate": 6.445382824923938e-05, "loss": 0.8828, "step": 984 }, { "epoch": 2.44, "grad_norm": 0.2404637634754181, "learning_rate": 6.426828418217104e-05, "loss": 0.8477, "step": 985 }, { "epoch": 2.45, "grad_norm": 0.23497383296489716, "learning_rate": 6.408288102468113e-05, "loss": 0.8443, "step": 986 }, { "epoch": 2.45, "grad_norm": 0.22989152371883392, "learning_rate": 6.38976195079151e-05, "loss": 0.904, "step": 987 }, { "epoch": 2.45, "grad_norm": 0.32862499356269836, "learning_rate": 6.371250036245976e-05, "loss": 0.8985, "step": 988 }, { "epoch": 2.45, "grad_norm": 0.23098599910736084, "learning_rate": 6.352752431834063e-05, "loss": 0.8491, "step": 989 }, { "epoch": 2.46, "grad_norm": 0.2405708283185959, "learning_rate": 6.334269210501875e-05, "loss": 0.8633, "step": 990 }, { "epoch": 2.46, "grad_norm": 0.24079830944538116, "learning_rate": 6.315800445138796e-05, "loss": 0.8733, "step": 991 }, { "epoch": 2.46, "grad_norm": 0.24686433374881744, "learning_rate": 6.297346208577213e-05, "loss": 0.8782, "step": 992 }, { "epoch": 2.46, "grad_norm": 0.24181082844734192, "learning_rate": 6.278906573592213e-05, "loss": 0.88, "step": 993 }, { "epoch": 2.47, "grad_norm": 0.22755932807922363, "learning_rate": 6.260481612901299e-05, "loss": 0.8649, "step": 994 }, { "epoch": 2.47, "grad_norm": 0.22607646882534027, "learning_rate": 6.2420713991641e-05, "loss": 0.8275, "step": 995 }, { "epoch": 2.47, "grad_norm": 0.24847641587257385, "learning_rate": 6.223676004982105e-05, "loss": 0.8119, "step": 996 }, { "epoch": 2.47, "grad_norm": 0.24037522077560425, "learning_rate": 6.205295502898348e-05, "loss": 0.8569, "step": 997 }, { "epoch": 2.48, "grad_norm": 0.23675377666950226, "learning_rate": 6.18692996539714e-05, "loss": 0.8486, "step": 998 }, { "epoch": 2.48, "grad_norm": 0.22981135547161102, "learning_rate": 6.168579464903779e-05, "loss": 0.8692, "step": 999 }, { "epoch": 2.48, "grad_norm": 0.24225787818431854, "learning_rate": 6.150244073784266e-05, "loss": 0.9217, "step": 1000 }, { "epoch": 2.48, "eval_loss": 1.0994617938995361, "eval_runtime": 81.6867, "eval_samples_per_second": 31.817, "eval_steps_per_second": 31.817, "step": 1000 }, { "epoch": 2.48, "grad_norm": 0.23787778615951538, "learning_rate": 6.131923864345012e-05, "loss": 0.8524, "step": 1001 }, { "epoch": 2.49, "grad_norm": 0.24544064700603485, "learning_rate": 6.113618908832561e-05, "loss": 0.8633, "step": 1002 }, { "epoch": 2.49, "grad_norm": 0.23733370006084442, "learning_rate": 6.095329279433304e-05, "loss": 0.8377, "step": 1003 }, { "epoch": 2.49, "grad_norm": 0.2332485020160675, "learning_rate": 6.0770550482731924e-05, "loss": 0.8691, "step": 1004 }, { "epoch": 2.49, "grad_norm": 0.24771836400032043, "learning_rate": 6.058796287417451e-05, "loss": 0.899, "step": 1005 }, { "epoch": 2.5, "grad_norm": 0.2370125651359558, "learning_rate": 6.0405530688702986e-05, "loss": 0.9022, "step": 1006 }, { "epoch": 2.5, "grad_norm": 0.24643033742904663, "learning_rate": 6.022325464574665e-05, "loss": 0.8361, "step": 1007 }, { "epoch": 2.5, "grad_norm": 0.23578715324401855, "learning_rate": 6.0041135464119024e-05, "loss": 0.8887, "step": 1008 }, { "epoch": 2.51, "grad_norm": 0.24807056784629822, "learning_rate": 5.9859173862014985e-05, "loss": 0.8856, "step": 1009 }, { "epoch": 2.51, "grad_norm": 0.23163636028766632, "learning_rate": 5.9677370557008104e-05, "loss": 0.8757, "step": 1010 }, { "epoch": 2.51, "grad_norm": 0.23828206956386566, "learning_rate": 5.9495726266047605e-05, "loss": 0.8544, "step": 1011 }, { "epoch": 2.51, "grad_norm": 0.23905439674854279, "learning_rate": 5.9314241705455674e-05, "loss": 0.7964, "step": 1012 }, { "epoch": 2.52, "grad_norm": 0.23195435106754303, "learning_rate": 5.9132917590924564e-05, "loss": 0.8351, "step": 1013 }, { "epoch": 2.52, "grad_norm": 0.23809070885181427, "learning_rate": 5.895175463751385e-05, "loss": 0.8169, "step": 1014 }, { "epoch": 2.52, "grad_norm": 0.22984261810779572, "learning_rate": 5.877075355964754e-05, "loss": 0.8237, "step": 1015 }, { "epoch": 2.52, "grad_norm": 0.2386801838874817, "learning_rate": 5.858991507111122e-05, "loss": 0.8129, "step": 1016 }, { "epoch": 2.53, "grad_norm": 0.23563861846923828, "learning_rate": 5.84092398850494e-05, "loss": 0.8052, "step": 1017 }, { "epoch": 2.53, "grad_norm": 0.2533442974090576, "learning_rate": 5.8228728713962543e-05, "loss": 0.8839, "step": 1018 }, { "epoch": 2.53, "grad_norm": 0.24461986124515533, "learning_rate": 5.8048382269704305e-05, "loss": 0.8754, "step": 1019 }, { "epoch": 2.53, "grad_norm": 0.23056775331497192, "learning_rate": 5.786820126347876e-05, "loss": 0.8227, "step": 1020 }, { "epoch": 2.54, "grad_norm": 0.22662828862667084, "learning_rate": 5.768818640583755e-05, "loss": 0.8322, "step": 1021 }, { "epoch": 2.54, "grad_norm": 0.23077984154224396, "learning_rate": 5.750833840667711e-05, "loss": 0.7937, "step": 1022 }, { "epoch": 2.54, "grad_norm": 0.23161981999874115, "learning_rate": 5.7328657975235864e-05, "loss": 0.8493, "step": 1023 }, { "epoch": 2.54, "grad_norm": 0.23408883810043335, "learning_rate": 5.7149145820091385e-05, "loss": 0.848, "step": 1024 }, { "epoch": 2.55, "grad_norm": 0.24241715669631958, "learning_rate": 5.696980264915777e-05, "loss": 0.8525, "step": 1025 }, { "epoch": 2.55, "grad_norm": 0.23030132055282593, "learning_rate": 5.6790629169682564e-05, "loss": 0.8419, "step": 1026 }, { "epoch": 2.55, "grad_norm": 0.23942214250564575, "learning_rate": 5.6611626088244194e-05, "loss": 0.8917, "step": 1027 }, { "epoch": 2.55, "grad_norm": 0.24789659678936005, "learning_rate": 5.6432794110749134e-05, "loss": 0.9304, "step": 1028 }, { "epoch": 2.56, "grad_norm": 0.2257944494485855, "learning_rate": 5.625413394242907e-05, "loss": 0.8732, "step": 1029 }, { "epoch": 2.56, "grad_norm": 0.23342953622341156, "learning_rate": 5.607564628783817e-05, "loss": 0.8425, "step": 1030 }, { "epoch": 2.56, "grad_norm": 0.2391550987958908, "learning_rate": 5.589733185085022e-05, "loss": 0.844, "step": 1031 }, { "epoch": 2.56, "grad_norm": 0.2298198938369751, "learning_rate": 5.571919133465605e-05, "loss": 0.8093, "step": 1032 }, { "epoch": 2.57, "grad_norm": 0.22968660295009613, "learning_rate": 5.5541225441760524e-05, "loss": 0.8567, "step": 1033 }, { "epoch": 2.57, "grad_norm": 0.23968294262886047, "learning_rate": 5.5363434873979903e-05, "loss": 0.8456, "step": 1034 }, { "epoch": 2.57, "grad_norm": 0.23312266170978546, "learning_rate": 5.518582033243902e-05, "loss": 0.8736, "step": 1035 }, { "epoch": 2.57, "grad_norm": 0.23636901378631592, "learning_rate": 5.500838251756857e-05, "loss": 0.8193, "step": 1036 }, { "epoch": 2.58, "grad_norm": 0.2466956079006195, "learning_rate": 5.4831122129102307e-05, "loss": 0.8624, "step": 1037 }, { "epoch": 2.58, "grad_norm": 0.2368500530719757, "learning_rate": 5.465403986607426e-05, "loss": 0.9215, "step": 1038 }, { "epoch": 2.58, "grad_norm": 0.2406860888004303, "learning_rate": 5.447713642681612e-05, "loss": 0.8701, "step": 1039 }, { "epoch": 2.58, "grad_norm": 0.2592841684818268, "learning_rate": 5.430041250895428e-05, "loss": 0.8584, "step": 1040 }, { "epoch": 2.59, "grad_norm": 0.23043441772460938, "learning_rate": 5.4123868809407206e-05, "loss": 0.8487, "step": 1041 }, { "epoch": 2.59, "grad_norm": 0.25117915868759155, "learning_rate": 5.3947506024382665e-05, "loss": 0.8997, "step": 1042 }, { "epoch": 2.59, "grad_norm": 0.232276052236557, "learning_rate": 5.377132484937499e-05, "loss": 0.879, "step": 1043 }, { "epoch": 2.59, "grad_norm": 0.23870497941970825, "learning_rate": 5.359532597916233e-05, "loss": 0.8555, "step": 1044 }, { "epoch": 2.6, "grad_norm": 0.24226155877113342, "learning_rate": 5.341951010780386e-05, "loss": 0.9121, "step": 1045 }, { "epoch": 2.6, "grad_norm": 0.23830334842205048, "learning_rate": 5.324387792863719e-05, "loss": 0.8793, "step": 1046 }, { "epoch": 2.6, "grad_norm": 0.24411016702651978, "learning_rate": 5.306843013427545e-05, "loss": 0.8638, "step": 1047 }, { "epoch": 2.6, "grad_norm": 0.23800359666347504, "learning_rate": 5.289316741660466e-05, "loss": 0.819, "step": 1048 }, { "epoch": 2.61, "grad_norm": 0.2342299073934555, "learning_rate": 5.271809046678094e-05, "loss": 0.8437, "step": 1049 }, { "epoch": 2.61, "grad_norm": 0.23816512525081635, "learning_rate": 5.254319997522796e-05, "loss": 0.8115, "step": 1050 }, { "epoch": 2.61, "grad_norm": 0.26241835951805115, "learning_rate": 5.236849663163399e-05, "loss": 0.8486, "step": 1051 }, { "epoch": 2.61, "grad_norm": 0.24685335159301758, "learning_rate": 5.21939811249492e-05, "loss": 0.8306, "step": 1052 }, { "epoch": 2.62, "grad_norm": 0.2531743347644806, "learning_rate": 5.201965414338308e-05, "loss": 0.9084, "step": 1053 }, { "epoch": 2.62, "grad_norm": 0.24715054035186768, "learning_rate": 5.1845516374401784e-05, "loss": 0.8725, "step": 1054 }, { "epoch": 2.62, "grad_norm": 0.24543942511081696, "learning_rate": 5.1671568504725135e-05, "loss": 0.859, "step": 1055 }, { "epoch": 2.62, "grad_norm": 0.2586332857608795, "learning_rate": 5.14978112203241e-05, "loss": 0.8582, "step": 1056 }, { "epoch": 2.63, "grad_norm": 0.23888130486011505, "learning_rate": 5.1324245206418184e-05, "loss": 0.8681, "step": 1057 }, { "epoch": 2.63, "grad_norm": 0.2790679633617401, "learning_rate": 5.11508711474725e-05, "loss": 0.9189, "step": 1058 }, { "epoch": 2.63, "grad_norm": 0.2425728142261505, "learning_rate": 5.097768972719522e-05, "loss": 0.8832, "step": 1059 }, { "epoch": 2.63, "grad_norm": 0.25885894894599915, "learning_rate": 5.080470162853472e-05, "loss": 0.8812, "step": 1060 }, { "epoch": 2.64, "grad_norm": 0.25290602445602417, "learning_rate": 5.063190753367721e-05, "loss": 0.8626, "step": 1061 }, { "epoch": 2.64, "grad_norm": 0.23895353078842163, "learning_rate": 5.0459308124043715e-05, "loss": 0.8645, "step": 1062 }, { "epoch": 2.64, "grad_norm": 0.2505248486995697, "learning_rate": 5.028690408028748e-05, "loss": 0.8445, "step": 1063 }, { "epoch": 2.64, "grad_norm": 0.24904492497444153, "learning_rate": 5.0114696082291425e-05, "loss": 0.8551, "step": 1064 }, { "epoch": 2.65, "grad_norm": 0.235503688454628, "learning_rate": 4.9942684809165284e-05, "loss": 0.83, "step": 1065 }, { "epoch": 2.65, "grad_norm": 0.24133411049842834, "learning_rate": 4.9770870939242986e-05, "loss": 0.863, "step": 1066 }, { "epoch": 2.65, "grad_norm": 0.23815608024597168, "learning_rate": 4.959925515008002e-05, "loss": 0.8487, "step": 1067 }, { "epoch": 2.65, "grad_norm": 0.23583517968654633, "learning_rate": 4.942783811845074e-05, "loss": 0.91, "step": 1068 }, { "epoch": 2.66, "grad_norm": 0.23426254093647003, "learning_rate": 4.9256620520345675e-05, "loss": 0.8264, "step": 1069 }, { "epoch": 2.66, "grad_norm": 0.24411818385124207, "learning_rate": 4.908560303096887e-05, "loss": 0.8203, "step": 1070 }, { "epoch": 2.66, "grad_norm": 0.24334770441055298, "learning_rate": 4.891478632473524e-05, "loss": 0.8984, "step": 1071 }, { "epoch": 2.66, "grad_norm": 0.23765143752098083, "learning_rate": 4.874417107526795e-05, "loss": 0.8344, "step": 1072 }, { "epoch": 2.67, "grad_norm": 0.23669512569904327, "learning_rate": 4.857375795539566e-05, "loss": 0.8571, "step": 1073 }, { "epoch": 2.67, "grad_norm": 0.2507469356060028, "learning_rate": 4.840354763714991e-05, "loss": 0.8609, "step": 1074 }, { "epoch": 2.67, "grad_norm": 0.2642061412334442, "learning_rate": 4.823354079176253e-05, "loss": 0.8545, "step": 1075 }, { "epoch": 2.67, "grad_norm": 0.2463250607252121, "learning_rate": 4.8063738089662926e-05, "loss": 0.8988, "step": 1076 }, { "epoch": 2.68, "grad_norm": 0.24115216732025146, "learning_rate": 4.7894140200475435e-05, "loss": 0.8035, "step": 1077 }, { "epoch": 2.68, "grad_norm": 0.24064919352531433, "learning_rate": 4.772474779301669e-05, "loss": 0.8105, "step": 1078 }, { "epoch": 2.68, "grad_norm": 0.23809556663036346, "learning_rate": 4.755556153529311e-05, "loss": 0.8661, "step": 1079 }, { "epoch": 2.68, "grad_norm": 0.2361900955438614, "learning_rate": 4.738658209449805e-05, "loss": 0.8703, "step": 1080 }, { "epoch": 2.69, "grad_norm": 0.24005629122257233, "learning_rate": 4.7217810137009274e-05, "loss": 0.8274, "step": 1081 }, { "epoch": 2.69, "grad_norm": 0.24326114356517792, "learning_rate": 4.704924632838636e-05, "loss": 0.8717, "step": 1082 }, { "epoch": 2.69, "grad_norm": 0.24389806389808655, "learning_rate": 4.688089133336805e-05, "loss": 0.8709, "step": 1083 }, { "epoch": 2.69, "grad_norm": 0.25470393896102905, "learning_rate": 4.671274581586958e-05, "loss": 0.9047, "step": 1084 }, { "epoch": 2.7, "grad_norm": 0.23721614480018616, "learning_rate": 4.654481043898011e-05, "loss": 0.8582, "step": 1085 }, { "epoch": 2.7, "grad_norm": 0.2489212304353714, "learning_rate": 4.637708586496018e-05, "loss": 0.8728, "step": 1086 }, { "epoch": 2.7, "grad_norm": 0.25072595477104187, "learning_rate": 4.6209572755238905e-05, "loss": 0.8573, "step": 1087 }, { "epoch": 2.7, "grad_norm": 0.23837445676326752, "learning_rate": 4.604227177041156e-05, "loss": 0.8527, "step": 1088 }, { "epoch": 2.71, "grad_norm": 0.2543460428714752, "learning_rate": 4.5875183570236815e-05, "loss": 0.879, "step": 1089 }, { "epoch": 2.71, "grad_norm": 0.24079710245132446, "learning_rate": 4.570830881363439e-05, "loss": 0.7991, "step": 1090 }, { "epoch": 2.71, "grad_norm": 0.24399738013744354, "learning_rate": 4.554164815868204e-05, "loss": 0.8833, "step": 1091 }, { "epoch": 2.71, "grad_norm": 0.22862033545970917, "learning_rate": 4.537520226261333e-05, "loss": 0.8312, "step": 1092 }, { "epoch": 2.72, "grad_norm": 0.247565358877182, "learning_rate": 4.5208971781814955e-05, "loss": 0.8847, "step": 1093 }, { "epoch": 2.72, "grad_norm": 0.24801327288150787, "learning_rate": 4.5042957371824057e-05, "loss": 0.8261, "step": 1094 }, { "epoch": 2.72, "grad_norm": 0.24174731969833374, "learning_rate": 4.487715968732568e-05, "loss": 0.8684, "step": 1095 }, { "epoch": 2.72, "grad_norm": 0.2395808845758438, "learning_rate": 4.471157938215017e-05, "loss": 0.8125, "step": 1096 }, { "epoch": 2.73, "grad_norm": 0.24519509077072144, "learning_rate": 4.454621710927077e-05, "loss": 0.8945, "step": 1097 }, { "epoch": 2.73, "grad_norm": 0.23781876266002655, "learning_rate": 4.438107352080076e-05, "loss": 0.8322, "step": 1098 }, { "epoch": 2.73, "grad_norm": 0.24549739062786102, "learning_rate": 4.421614926799108e-05, "loss": 0.879, "step": 1099 }, { "epoch": 2.73, "grad_norm": 0.24997320771217346, "learning_rate": 4.405144500122772e-05, "loss": 0.8732, "step": 1100 }, { "epoch": 2.73, "eval_loss": 1.0963517427444458, "eval_runtime": 81.6065, "eval_samples_per_second": 31.848, "eval_steps_per_second": 31.848, "step": 1100 }, { "epoch": 2.74, "grad_norm": 0.23882664740085602, "learning_rate": 4.388696137002911e-05, "loss": 0.8819, "step": 1101 }, { "epoch": 2.74, "grad_norm": 0.2379320114850998, "learning_rate": 4.372269902304363e-05, "loss": 0.8449, "step": 1102 }, { "epoch": 2.74, "grad_norm": 0.25028368830680847, "learning_rate": 4.355865860804698e-05, "loss": 0.9019, "step": 1103 }, { "epoch": 2.74, "grad_norm": 0.2810663878917694, "learning_rate": 4.339484077193974e-05, "loss": 0.8657, "step": 1104 }, { "epoch": 2.75, "grad_norm": 0.23311807215213776, "learning_rate": 4.323124616074464e-05, "loss": 0.819, "step": 1105 }, { "epoch": 2.75, "grad_norm": 0.24217994511127472, "learning_rate": 4.3067875419604184e-05, "loss": 0.8688, "step": 1106 }, { "epoch": 2.75, "grad_norm": 0.25671547651290894, "learning_rate": 4.2904729192778006e-05, "loss": 0.932, "step": 1107 }, { "epoch": 2.75, "grad_norm": 0.24109551310539246, "learning_rate": 4.2741808123640335e-05, "loss": 0.8474, "step": 1108 }, { "epoch": 2.76, "grad_norm": 0.23516185581684113, "learning_rate": 4.257911285467754e-05, "loss": 0.8328, "step": 1109 }, { "epoch": 2.76, "grad_norm": 0.23598629236221313, "learning_rate": 4.241664402748544e-05, "loss": 0.7996, "step": 1110 }, { "epoch": 2.76, "grad_norm": 0.24916405975818634, "learning_rate": 4.2254402282767034e-05, "loss": 0.8598, "step": 1111 }, { "epoch": 2.76, "grad_norm": 0.25266116857528687, "learning_rate": 4.209238826032965e-05, "loss": 0.8836, "step": 1112 }, { "epoch": 2.77, "grad_norm": 0.24511922895908356, "learning_rate": 4.1930602599082666e-05, "loss": 0.878, "step": 1113 }, { "epoch": 2.77, "grad_norm": 0.2427850365638733, "learning_rate": 4.1769045937034876e-05, "loss": 0.8808, "step": 1114 }, { "epoch": 2.77, "grad_norm": 0.2547648549079895, "learning_rate": 4.1607718911292025e-05, "loss": 0.8838, "step": 1115 }, { "epoch": 2.77, "grad_norm": 0.2500211000442505, "learning_rate": 4.144662215805426e-05, "loss": 0.8743, "step": 1116 }, { "epoch": 2.78, "grad_norm": 0.24311552941799164, "learning_rate": 4.1285756312613654e-05, "loss": 0.8771, "step": 1117 }, { "epoch": 2.78, "grad_norm": 0.22430969774723053, "learning_rate": 4.1125122009351634e-05, "loss": 0.7847, "step": 1118 }, { "epoch": 2.78, "grad_norm": 0.23909464478492737, "learning_rate": 4.096471988173667e-05, "loss": 0.8544, "step": 1119 }, { "epoch": 2.78, "grad_norm": 0.24670055508613586, "learning_rate": 4.080455056232147e-05, "loss": 0.8567, "step": 1120 }, { "epoch": 2.79, "grad_norm": 0.2429244965314865, "learning_rate": 4.064461468274077e-05, "loss": 0.8565, "step": 1121 }, { "epoch": 2.79, "grad_norm": 0.2525031864643097, "learning_rate": 4.048491287370863e-05, "loss": 0.857, "step": 1122 }, { "epoch": 2.79, "grad_norm": 0.24017855525016785, "learning_rate": 4.0325445765016145e-05, "loss": 0.8675, "step": 1123 }, { "epoch": 2.79, "grad_norm": 0.24993157386779785, "learning_rate": 4.016621398552877e-05, "loss": 0.9149, "step": 1124 }, { "epoch": 2.8, "grad_norm": 0.24595491588115692, "learning_rate": 4.000721816318395e-05, "loss": 0.8199, "step": 1125 }, { "epoch": 2.8, "grad_norm": 0.24830754101276398, "learning_rate": 3.9848458924988684e-05, "loss": 0.8721, "step": 1126 }, { "epoch": 2.8, "grad_norm": 0.4584636688232422, "learning_rate": 3.9689936897016944e-05, "loss": 0.7831, "step": 1127 }, { "epoch": 2.8, "grad_norm": 0.24669432640075684, "learning_rate": 3.953165270440721e-05, "loss": 0.8892, "step": 1128 }, { "epoch": 2.81, "grad_norm": 0.24245108664035797, "learning_rate": 3.937360697136019e-05, "loss": 0.8853, "step": 1129 }, { "epoch": 2.81, "grad_norm": 0.23646272718906403, "learning_rate": 3.921580032113602e-05, "loss": 0.7994, "step": 1130 }, { "epoch": 2.81, "grad_norm": 0.24627800285816193, "learning_rate": 3.905823337605213e-05, "loss": 0.8379, "step": 1131 }, { "epoch": 2.81, "grad_norm": 0.23912878334522247, "learning_rate": 3.8900906757480614e-05, "loss": 0.856, "step": 1132 }, { "epoch": 2.82, "grad_norm": 0.2569670081138611, "learning_rate": 3.874382108584591e-05, "loss": 0.887, "step": 1133 }, { "epoch": 2.82, "grad_norm": 0.2441360503435135, "learning_rate": 3.858697698062217e-05, "loss": 0.8101, "step": 1134 }, { "epoch": 2.82, "grad_norm": 0.25435709953308105, "learning_rate": 3.843037506033096e-05, "loss": 0.8839, "step": 1135 }, { "epoch": 2.82, "grad_norm": 0.23224297165870667, "learning_rate": 3.8274015942538745e-05, "loss": 0.7814, "step": 1136 }, { "epoch": 2.83, "grad_norm": 0.24714259803295135, "learning_rate": 3.8117900243854595e-05, "loss": 0.8391, "step": 1137 }, { "epoch": 2.83, "grad_norm": 0.2544543147087097, "learning_rate": 3.7962028579927555e-05, "loss": 0.8488, "step": 1138 }, { "epoch": 2.83, "grad_norm": 0.24433031678199768, "learning_rate": 3.780640156544424e-05, "loss": 0.8364, "step": 1139 }, { "epoch": 2.83, "grad_norm": 0.24626778066158295, "learning_rate": 3.7651019814126654e-05, "loss": 0.8359, "step": 1140 }, { "epoch": 2.84, "grad_norm": 0.23161056637763977, "learning_rate": 3.749588393872947e-05, "loss": 0.7973, "step": 1141 }, { "epoch": 2.84, "grad_norm": 0.24299640953540802, "learning_rate": 3.734099455103779e-05, "loss": 0.8663, "step": 1142 }, { "epoch": 2.84, "grad_norm": 0.23688755929470062, "learning_rate": 3.71863522618646e-05, "loss": 0.8788, "step": 1143 }, { "epoch": 2.84, "grad_norm": 0.24079696834087372, "learning_rate": 3.7031957681048604e-05, "loss": 0.825, "step": 1144 }, { "epoch": 2.85, "grad_norm": 0.26706093549728394, "learning_rate": 3.68778114174515e-05, "loss": 0.9043, "step": 1145 }, { "epoch": 2.85, "grad_norm": 0.25147339701652527, "learning_rate": 3.6723914078955825e-05, "loss": 0.8592, "step": 1146 }, { "epoch": 2.85, "grad_norm": 0.24465493857860565, "learning_rate": 3.65702662724624e-05, "loss": 0.8805, "step": 1147 }, { "epoch": 2.85, "grad_norm": 0.2561103105545044, "learning_rate": 3.64168686038881e-05, "loss": 0.8335, "step": 1148 }, { "epoch": 2.86, "grad_norm": 0.23355357348918915, "learning_rate": 3.626372167816326e-05, "loss": 0.8079, "step": 1149 }, { "epoch": 2.86, "grad_norm": 0.24586178362369537, "learning_rate": 3.6110826099229453e-05, "loss": 0.8381, "step": 1150 }, { "epoch": 2.86, "grad_norm": 0.26193350553512573, "learning_rate": 3.595818247003713e-05, "loss": 0.8652, "step": 1151 }, { "epoch": 2.86, "grad_norm": 0.2508774399757385, "learning_rate": 3.580579139254303e-05, "loss": 0.9423, "step": 1152 }, { "epoch": 2.87, "grad_norm": 0.24596548080444336, "learning_rate": 3.565365346770805e-05, "loss": 0.8543, "step": 1153 }, { "epoch": 2.87, "grad_norm": 0.258883535861969, "learning_rate": 3.550176929549468e-05, "loss": 0.8808, "step": 1154 }, { "epoch": 2.87, "grad_norm": 0.2447265088558197, "learning_rate": 3.535013947486481e-05, "loss": 0.8675, "step": 1155 }, { "epoch": 2.87, "grad_norm": 0.23408272862434387, "learning_rate": 3.5198764603777235e-05, "loss": 0.8649, "step": 1156 }, { "epoch": 2.88, "grad_norm": 0.26081281900405884, "learning_rate": 3.50476452791853e-05, "loss": 0.8724, "step": 1157 }, { "epoch": 2.88, "grad_norm": 0.2497384399175644, "learning_rate": 3.489678209703475e-05, "loss": 0.8791, "step": 1158 }, { "epoch": 2.88, "grad_norm": 0.2521859407424927, "learning_rate": 3.4746175652261056e-05, "loss": 0.8444, "step": 1159 }, { "epoch": 2.88, "grad_norm": 0.2415734827518463, "learning_rate": 3.459582653878731e-05, "loss": 0.8615, "step": 1160 }, { "epoch": 2.89, "grad_norm": 0.2494700402021408, "learning_rate": 3.44457353495218e-05, "loss": 0.8484, "step": 1161 }, { "epoch": 2.89, "grad_norm": 0.23962122201919556, "learning_rate": 3.429590267635565e-05, "loss": 0.8546, "step": 1162 }, { "epoch": 2.89, "grad_norm": 0.24284565448760986, "learning_rate": 3.414632911016056e-05, "loss": 0.9345, "step": 1163 }, { "epoch": 2.89, "grad_norm": 0.24653640389442444, "learning_rate": 3.399701524078635e-05, "loss": 0.8454, "step": 1164 }, { "epoch": 2.9, "grad_norm": 0.2546091675758362, "learning_rate": 3.3847961657058845e-05, "loss": 0.8595, "step": 1165 }, { "epoch": 2.9, "grad_norm": 0.2380194067955017, "learning_rate": 3.369916894677733e-05, "loss": 0.8321, "step": 1166 }, { "epoch": 2.9, "grad_norm": 0.25154367089271545, "learning_rate": 3.355063769671232e-05, "loss": 0.9244, "step": 1167 }, { "epoch": 2.9, "grad_norm": 0.23600707948207855, "learning_rate": 3.340236849260324e-05, "loss": 0.8587, "step": 1168 }, { "epoch": 2.91, "grad_norm": 0.24559392035007477, "learning_rate": 3.325436191915628e-05, "loss": 0.874, "step": 1169 }, { "epoch": 2.91, "grad_norm": 0.24205893278121948, "learning_rate": 3.31066185600417e-05, "loss": 0.8207, "step": 1170 }, { "epoch": 2.91, "grad_norm": 0.2409772425889969, "learning_rate": 3.2959138997891905e-05, "loss": 0.8255, "step": 1171 }, { "epoch": 2.91, "grad_norm": 0.23933902382850647, "learning_rate": 3.281192381429894e-05, "loss": 0.891, "step": 1172 }, { "epoch": 2.92, "grad_norm": 0.23304079473018646, "learning_rate": 3.2664973589812364e-05, "loss": 0.7923, "step": 1173 }, { "epoch": 2.92, "grad_norm": 0.25415170192718506, "learning_rate": 3.251828890393677e-05, "loss": 0.8304, "step": 1174 }, { "epoch": 2.92, "grad_norm": 0.2426324039697647, "learning_rate": 3.237187033512956e-05, "loss": 0.8494, "step": 1175 }, { "epoch": 2.92, "grad_norm": 0.25308680534362793, "learning_rate": 3.222571846079881e-05, "loss": 0.8844, "step": 1176 }, { "epoch": 2.93, "grad_norm": 0.2543047368526459, "learning_rate": 3.207983385730081e-05, "loss": 0.9307, "step": 1177 }, { "epoch": 2.93, "grad_norm": 0.24869924783706665, "learning_rate": 3.193421709993779e-05, "loss": 0.8162, "step": 1178 }, { "epoch": 2.93, "grad_norm": 0.25677043199539185, "learning_rate": 3.178886876295578e-05, "loss": 0.8811, "step": 1179 }, { "epoch": 2.93, "grad_norm": 0.24882106482982635, "learning_rate": 3.1643789419542324e-05, "loss": 0.878, "step": 1180 }, { "epoch": 2.94, "grad_norm": 0.24929676949977875, "learning_rate": 3.149897964182413e-05, "loss": 0.8805, "step": 1181 }, { "epoch": 2.94, "grad_norm": 0.24904274940490723, "learning_rate": 3.135444000086485e-05, "loss": 0.8397, "step": 1182 }, { "epoch": 2.94, "grad_norm": 0.24811436235904694, "learning_rate": 3.121017106666283e-05, "loss": 0.8535, "step": 1183 }, { "epoch": 2.94, "grad_norm": 0.23216772079467773, "learning_rate": 3.1066173408148955e-05, "loss": 0.8212, "step": 1184 }, { "epoch": 2.95, "grad_norm": 0.24234245717525482, "learning_rate": 3.092244759318424e-05, "loss": 0.8568, "step": 1185 }, { "epoch": 2.95, "grad_norm": 0.24297314882278442, "learning_rate": 3.077899418855772e-05, "loss": 0.8578, "step": 1186 }, { "epoch": 2.95, "grad_norm": 0.2410762459039688, "learning_rate": 3.063581375998412e-05, "loss": 0.8617, "step": 1187 }, { "epoch": 2.95, "grad_norm": 0.2403203397989273, "learning_rate": 3.04929068721017e-05, "loss": 0.8991, "step": 1188 }, { "epoch": 2.96, "grad_norm": 0.2315671145915985, "learning_rate": 3.0350274088470022e-05, "loss": 0.8145, "step": 1189 }, { "epoch": 2.96, "grad_norm": 0.24772675335407257, "learning_rate": 3.0207915971567624e-05, "loss": 0.8133, "step": 1190 }, { "epoch": 2.96, "grad_norm": 0.24197424948215485, "learning_rate": 3.006583308279003e-05, "loss": 0.8382, "step": 1191 }, { "epoch": 2.96, "grad_norm": 0.24510471522808075, "learning_rate": 2.992402598244727e-05, "loss": 0.8618, "step": 1192 }, { "epoch": 2.97, "grad_norm": 0.25491824746131897, "learning_rate": 2.9782495229761808e-05, "loss": 0.923, "step": 1193 }, { "epoch": 2.97, "grad_norm": 0.23867730796337128, "learning_rate": 2.9641241382866348e-05, "loss": 0.8735, "step": 1194 }, { "epoch": 2.97, "grad_norm": 0.24734465777873993, "learning_rate": 2.9500264998801584e-05, "loss": 0.8237, "step": 1195 }, { "epoch": 2.97, "grad_norm": 0.28391632437705994, "learning_rate": 2.9359566633514037e-05, "loss": 0.9401, "step": 1196 }, { "epoch": 2.98, "grad_norm": 0.24533286690711975, "learning_rate": 2.9219146841853807e-05, "loss": 0.8287, "step": 1197 }, { "epoch": 2.98, "grad_norm": 0.24379463493824005, "learning_rate": 2.907900617757252e-05, "loss": 0.7804, "step": 1198 }, { "epoch": 2.98, "grad_norm": 0.25721925497055054, "learning_rate": 2.893914519332097e-05, "loss": 0.8606, "step": 1199 }, { "epoch": 2.98, "grad_norm": 0.26963183283805847, "learning_rate": 2.879956444064703e-05, "loss": 0.9226, "step": 1200 }, { "epoch": 2.98, "eval_loss": 1.0951114892959595, "eval_runtime": 81.6207, "eval_samples_per_second": 31.842, "eval_steps_per_second": 31.842, "step": 1200 }, { "epoch": 2.99, "grad_norm": 0.2419532984495163, "learning_rate": 2.8660264469993502e-05, "loss": 0.8379, "step": 1201 }, { "epoch": 2.99, "grad_norm": 0.25379815697669983, "learning_rate": 2.8521245830695864e-05, "loss": 0.8785, "step": 1202 }, { "epoch": 2.99, "grad_norm": 0.2557288408279419, "learning_rate": 2.83825090709802e-05, "loss": 0.8932, "step": 1203 }, { "epoch": 2.99, "grad_norm": 0.2442978322505951, "learning_rate": 2.8244054737960935e-05, "loss": 0.864, "step": 1204 }, { "epoch": 3.0, "grad_norm": 0.2575509548187256, "learning_rate": 2.810588337763881e-05, "loss": 0.9002, "step": 1205 }, { "epoch": 3.0, "grad_norm": 0.25195080041885376, "learning_rate": 2.7967995534898596e-05, "loss": 0.8713, "step": 1206 }, { "epoch": 3.0, "grad_norm": 0.2624851167201996, "learning_rate": 2.783039175350699e-05, "loss": 0.8785, "step": 1207 }, { "epoch": 3.0, "grad_norm": 0.2531834542751312, "learning_rate": 2.7693072576110514e-05, "loss": 0.8689, "step": 1208 }, { "epoch": 3.01, "grad_norm": 0.25603699684143066, "learning_rate": 2.755603854423332e-05, "loss": 0.8793, "step": 1209 }, { "epoch": 3.01, "grad_norm": 0.2399108111858368, "learning_rate": 2.7419290198275095e-05, "loss": 0.8078, "step": 1210 }, { "epoch": 3.01, "grad_norm": 0.24585610628128052, "learning_rate": 2.728282807750886e-05, "loss": 0.8346, "step": 1211 }, { "epoch": 3.01, "grad_norm": 0.27591878175735474, "learning_rate": 2.7146652720079003e-05, "loss": 0.8093, "step": 1212 }, { "epoch": 3.0, "grad_norm": 0.243075892329216, "learning_rate": 2.7010764662998933e-05, "loss": 0.8278, "step": 1213 }, { "epoch": 3.01, "grad_norm": 0.23767191171646118, "learning_rate": 2.6875164442149147e-05, "loss": 0.8666, "step": 1214 }, { "epoch": 3.01, "grad_norm": 0.23902587592601776, "learning_rate": 2.6739852592274995e-05, "loss": 0.7874, "step": 1215 }, { "epoch": 3.01, "grad_norm": 0.2440321147441864, "learning_rate": 2.6604829646984686e-05, "loss": 0.7921, "step": 1216 }, { "epoch": 3.01, "grad_norm": 0.2384667545557022, "learning_rate": 2.6470096138747126e-05, "loss": 0.7427, "step": 1217 }, { "epoch": 3.02, "grad_norm": 0.23948150873184204, "learning_rate": 2.6335652598889683e-05, "loss": 0.757, "step": 1218 }, { "epoch": 3.02, "grad_norm": 0.25211286544799805, "learning_rate": 2.620149955759633e-05, "loss": 0.8402, "step": 1219 }, { "epoch": 3.02, "grad_norm": 0.24333299696445465, "learning_rate": 2.60676375439055e-05, "loss": 0.7975, "step": 1220 }, { "epoch": 3.02, "grad_norm": 0.24414288997650146, "learning_rate": 2.5934067085707834e-05, "loss": 0.726, "step": 1221 }, { "epoch": 3.03, "grad_norm": 0.2554560899734497, "learning_rate": 2.5800788709744227e-05, "loss": 0.7947, "step": 1222 }, { "epoch": 3.03, "grad_norm": 0.26515471935272217, "learning_rate": 2.5667802941603834e-05, "loss": 0.8416, "step": 1223 }, { "epoch": 3.03, "grad_norm": 0.25791820883750916, "learning_rate": 2.5535110305721776e-05, "loss": 0.8175, "step": 1224 }, { "epoch": 3.03, "grad_norm": 0.26911717653274536, "learning_rate": 2.540271132537729e-05, "loss": 0.7658, "step": 1225 }, { "epoch": 3.04, "grad_norm": 0.2754501402378082, "learning_rate": 2.5270606522691443e-05, "loss": 0.7682, "step": 1226 }, { "epoch": 3.04, "grad_norm": 0.2647700905799866, "learning_rate": 2.5138796418625343e-05, "loss": 0.7402, "step": 1227 }, { "epoch": 3.04, "grad_norm": 0.2651618421077728, "learning_rate": 2.500728153297788e-05, "loss": 0.7649, "step": 1228 }, { "epoch": 3.04, "grad_norm": 0.6237964630126953, "learning_rate": 2.4876062384383714e-05, "loss": 0.7701, "step": 1229 }, { "epoch": 3.05, "grad_norm": 0.27707546949386597, "learning_rate": 2.4745139490311254e-05, "loss": 0.7654, "step": 1230 }, { "epoch": 3.05, "grad_norm": 0.26269418001174927, "learning_rate": 2.46145133670607e-05, "loss": 0.722, "step": 1231 }, { "epoch": 3.05, "grad_norm": 0.27101683616638184, "learning_rate": 2.4484184529761834e-05, "loss": 0.8106, "step": 1232 }, { "epoch": 3.05, "grad_norm": 0.2758999168872833, "learning_rate": 2.43541534923721e-05, "loss": 0.7717, "step": 1233 }, { "epoch": 3.06, "grad_norm": 0.27700987458229065, "learning_rate": 2.4224420767674562e-05, "loss": 0.8284, "step": 1234 }, { "epoch": 3.06, "grad_norm": 0.2708686292171478, "learning_rate": 2.409498686727587e-05, "loss": 0.8002, "step": 1235 }, { "epoch": 3.06, "grad_norm": 0.2619761526584625, "learning_rate": 2.3965852301604254e-05, "loss": 0.7471, "step": 1236 }, { "epoch": 3.06, "grad_norm": 0.26751142740249634, "learning_rate": 2.3837017579907472e-05, "loss": 0.7378, "step": 1237 }, { "epoch": 3.07, "grad_norm": 0.2663439214229584, "learning_rate": 2.370848321025093e-05, "loss": 0.7832, "step": 1238 }, { "epoch": 3.07, "grad_norm": 0.2717893421649933, "learning_rate": 2.3580249699515467e-05, "loss": 0.8168, "step": 1239 }, { "epoch": 3.07, "grad_norm": 0.26105281710624695, "learning_rate": 2.345231755339554e-05, "loss": 0.7418, "step": 1240 }, { "epoch": 3.07, "grad_norm": 0.28485798835754395, "learning_rate": 2.332468727639713e-05, "loss": 0.7687, "step": 1241 }, { "epoch": 3.08, "grad_norm": 0.27356845140457153, "learning_rate": 2.3197359371835802e-05, "loss": 0.7767, "step": 1242 }, { "epoch": 3.08, "grad_norm": 0.25320693850517273, "learning_rate": 2.30703343418347e-05, "loss": 0.7811, "step": 1243 }, { "epoch": 3.08, "grad_norm": 0.30001091957092285, "learning_rate": 2.2943612687322525e-05, "loss": 0.8288, "step": 1244 }, { "epoch": 3.08, "grad_norm": 0.2742901146411896, "learning_rate": 2.2817194908031712e-05, "loss": 0.7172, "step": 1245 }, { "epoch": 3.09, "grad_norm": 0.25700873136520386, "learning_rate": 2.2691081502496246e-05, "loss": 0.7552, "step": 1246 }, { "epoch": 3.09, "grad_norm": 0.256821870803833, "learning_rate": 2.2565272968049844e-05, "loss": 0.7701, "step": 1247 }, { "epoch": 3.09, "grad_norm": 0.26405730843544006, "learning_rate": 2.243976980082394e-05, "loss": 0.7769, "step": 1248 }, { "epoch": 3.09, "grad_norm": 0.2745378017425537, "learning_rate": 2.2314572495745746e-05, "loss": 0.7855, "step": 1249 }, { "epoch": 3.1, "grad_norm": 0.27247995138168335, "learning_rate": 2.218968154653629e-05, "loss": 0.7956, "step": 1250 }, { "epoch": 3.1, "grad_norm": 0.2709554135799408, "learning_rate": 2.2065097445708437e-05, "loss": 0.7819, "step": 1251 }, { "epoch": 3.1, "grad_norm": 0.24753987789154053, "learning_rate": 2.194082068456509e-05, "loss": 0.6953, "step": 1252 }, { "epoch": 3.1, "grad_norm": 0.26115405559539795, "learning_rate": 2.181685175319702e-05, "loss": 0.7945, "step": 1253 }, { "epoch": 3.11, "grad_norm": 0.2646583020687103, "learning_rate": 2.169319114048114e-05, "loss": 0.7281, "step": 1254 }, { "epoch": 3.11, "grad_norm": 0.2585940659046173, "learning_rate": 2.1569839334078422e-05, "loss": 0.7027, "step": 1255 }, { "epoch": 3.11, "grad_norm": 0.2597663998603821, "learning_rate": 2.1446796820432167e-05, "loss": 0.8161, "step": 1256 }, { "epoch": 3.11, "grad_norm": 0.2645276188850403, "learning_rate": 2.1324064084765815e-05, "loss": 0.7877, "step": 1257 }, { "epoch": 3.12, "grad_norm": 0.2637201249599457, "learning_rate": 2.1201641611081246e-05, "loss": 0.751, "step": 1258 }, { "epoch": 3.12, "grad_norm": 0.2551720440387726, "learning_rate": 2.10795298821569e-05, "loss": 0.7361, "step": 1259 }, { "epoch": 3.12, "grad_norm": 0.2742619514465332, "learning_rate": 2.0957729379545655e-05, "loss": 0.8284, "step": 1260 }, { "epoch": 3.12, "grad_norm": 0.2725660800933838, "learning_rate": 2.0836240583573098e-05, "loss": 0.7309, "step": 1261 }, { "epoch": 3.13, "grad_norm": 0.25940898060798645, "learning_rate": 2.0715063973335568e-05, "loss": 0.7646, "step": 1262 }, { "epoch": 3.13, "grad_norm": 0.2767871916294098, "learning_rate": 2.0594200026698363e-05, "loss": 0.7446, "step": 1263 }, { "epoch": 3.13, "grad_norm": 0.27153125405311584, "learning_rate": 2.04736492202937e-05, "loss": 0.776, "step": 1264 }, { "epoch": 3.13, "grad_norm": 0.2660311460494995, "learning_rate": 2.035341202951897e-05, "loss": 0.7533, "step": 1265 }, { "epoch": 3.14, "grad_norm": 0.28477585315704346, "learning_rate": 2.0233488928534673e-05, "loss": 0.7496, "step": 1266 }, { "epoch": 3.14, "grad_norm": 0.26483017206192017, "learning_rate": 2.0113880390262884e-05, "loss": 0.7825, "step": 1267 }, { "epoch": 3.14, "grad_norm": 0.26726511120796204, "learning_rate": 1.9994586886385046e-05, "loss": 0.7564, "step": 1268 }, { "epoch": 3.14, "grad_norm": 0.26417261362075806, "learning_rate": 1.987560888734027e-05, "loss": 0.7548, "step": 1269 }, { "epoch": 3.15, "grad_norm": 0.2672629952430725, "learning_rate": 1.9756946862323535e-05, "loss": 0.7798, "step": 1270 }, { "epoch": 3.15, "grad_norm": 0.2740299701690674, "learning_rate": 1.9638601279283684e-05, "loss": 0.8098, "step": 1271 }, { "epoch": 3.15, "grad_norm": 0.28779342770576477, "learning_rate": 1.9520572604921672e-05, "loss": 0.7795, "step": 1272 }, { "epoch": 3.15, "grad_norm": 0.27534219622612, "learning_rate": 1.9402861304688712e-05, "loss": 0.7533, "step": 1273 }, { "epoch": 3.16, "grad_norm": 0.28089454770088196, "learning_rate": 1.9285467842784467e-05, "loss": 0.7792, "step": 1274 }, { "epoch": 3.16, "grad_norm": 0.28745076060295105, "learning_rate": 1.9168392682155157e-05, "loss": 0.7657, "step": 1275 }, { "epoch": 3.16, "grad_norm": 0.2681332528591156, "learning_rate": 1.9051636284491757e-05, "loss": 0.7899, "step": 1276 }, { "epoch": 3.16, "grad_norm": 0.26951977610588074, "learning_rate": 1.8935199110228275e-05, "loss": 0.7806, "step": 1277 }, { "epoch": 3.17, "grad_norm": 0.2794302701950073, "learning_rate": 1.8819081618539723e-05, "loss": 0.7462, "step": 1278 }, { "epoch": 3.17, "grad_norm": 0.26609373092651367, "learning_rate": 1.8703284267340516e-05, "loss": 0.7968, "step": 1279 }, { "epoch": 3.17, "grad_norm": 0.25456827878952026, "learning_rate": 1.858780751328255e-05, "loss": 0.6933, "step": 1280 }, { "epoch": 3.17, "grad_norm": 0.27659982442855835, "learning_rate": 1.8472651811753428e-05, "loss": 0.8159, "step": 1281 }, { "epoch": 3.18, "grad_norm": 0.27182477712631226, "learning_rate": 1.8357817616874694e-05, "loss": 0.7707, "step": 1282 }, { "epoch": 3.18, "grad_norm": 0.2805425226688385, "learning_rate": 1.8243305381499976e-05, "loss": 0.7684, "step": 1283 }, { "epoch": 3.18, "grad_norm": 0.2774295210838318, "learning_rate": 1.8129115557213262e-05, "loss": 0.7787, "step": 1284 }, { "epoch": 3.18, "grad_norm": 0.2643285393714905, "learning_rate": 1.801524859432714e-05, "loss": 0.7573, "step": 1285 }, { "epoch": 3.19, "grad_norm": 0.2701185345649719, "learning_rate": 1.7901704941880914e-05, "loss": 0.7622, "step": 1286 }, { "epoch": 3.19, "grad_norm": 0.2777290344238281, "learning_rate": 1.7788485047638925e-05, "loss": 0.7977, "step": 1287 }, { "epoch": 3.19, "grad_norm": 0.2658153176307678, "learning_rate": 1.7675589358088763e-05, "loss": 0.8102, "step": 1288 }, { "epoch": 3.19, "grad_norm": 0.2731580138206482, "learning_rate": 1.7563018318439496e-05, "loss": 0.7446, "step": 1289 }, { "epoch": 3.2, "grad_norm": 0.26347973942756653, "learning_rate": 1.745077237261994e-05, "loss": 0.7524, "step": 1290 }, { "epoch": 3.2, "grad_norm": 0.2692364454269409, "learning_rate": 1.7338851963276825e-05, "loss": 0.7287, "step": 1291 }, { "epoch": 3.2, "grad_norm": 0.27053266763687134, "learning_rate": 1.7227257531773223e-05, "loss": 0.772, "step": 1292 }, { "epoch": 3.2, "grad_norm": 0.26587438583374023, "learning_rate": 1.7115989518186615e-05, "loss": 0.6906, "step": 1293 }, { "epoch": 3.21, "grad_norm": 0.2816382050514221, "learning_rate": 1.7005048361307262e-05, "loss": 0.8012, "step": 1294 }, { "epoch": 3.21, "grad_norm": 0.2815219759941101, "learning_rate": 1.6894434498636446e-05, "loss": 0.7996, "step": 1295 }, { "epoch": 3.21, "grad_norm": 0.2698955535888672, "learning_rate": 1.6784148366384754e-05, "loss": 0.7446, "step": 1296 }, { "epoch": 3.21, "grad_norm": 0.26657864451408386, "learning_rate": 1.667419039947037e-05, "loss": 0.7606, "step": 1297 }, { "epoch": 3.22, "grad_norm": 0.28396204113960266, "learning_rate": 1.656456103151728e-05, "loss": 0.7654, "step": 1298 }, { "epoch": 3.22, "grad_norm": 0.2739189863204956, "learning_rate": 1.6455260694853736e-05, "loss": 0.8064, "step": 1299 }, { "epoch": 3.22, "grad_norm": 0.26701492071151733, "learning_rate": 1.6346289820510363e-05, "loss": 0.76, "step": 1300 }, { "epoch": 3.22, "eval_loss": 1.130713939666748, "eval_runtime": 81.5556, "eval_samples_per_second": 31.868, "eval_steps_per_second": 31.868, "step": 1300 }, { "epoch": 3.22, "grad_norm": 0.2754208445549011, "learning_rate": 1.6237648838218532e-05, "loss": 0.8223, "step": 1301 }, { "epoch": 3.23, "grad_norm": 0.26871708035469055, "learning_rate": 1.612933817640868e-05, "loss": 0.7836, "step": 1302 }, { "epoch": 3.23, "grad_norm": 0.2776527404785156, "learning_rate": 1.6021358262208665e-05, "loss": 0.7564, "step": 1303 }, { "epoch": 3.23, "grad_norm": 0.2610631585121155, "learning_rate": 1.5913709521441988e-05, "loss": 0.7565, "step": 1304 }, { "epoch": 3.23, "grad_norm": 0.2923867404460907, "learning_rate": 1.580639237862608e-05, "loss": 0.7564, "step": 1305 }, { "epoch": 3.24, "grad_norm": 0.26441213488578796, "learning_rate": 1.5699407256970833e-05, "loss": 0.7964, "step": 1306 }, { "epoch": 3.24, "grad_norm": 0.27306240797042847, "learning_rate": 1.5592754578376724e-05, "loss": 0.8001, "step": 1307 }, { "epoch": 3.24, "grad_norm": 0.27287718653678894, "learning_rate": 1.5486434763433222e-05, "loss": 0.7363, "step": 1308 }, { "epoch": 3.24, "grad_norm": 0.2708844244480133, "learning_rate": 1.5380448231417144e-05, "loss": 0.7372, "step": 1309 }, { "epoch": 3.25, "grad_norm": 0.27012452483177185, "learning_rate": 1.527479540029104e-05, "loss": 0.7307, "step": 1310 }, { "epoch": 3.25, "grad_norm": 0.2752881944179535, "learning_rate": 1.5169476686701423e-05, "loss": 0.8257, "step": 1311 }, { "epoch": 3.25, "grad_norm": 0.27313387393951416, "learning_rate": 1.5064492505977234e-05, "loss": 0.7556, "step": 1312 }, { "epoch": 3.25, "grad_norm": 0.2661667466163635, "learning_rate": 1.4959843272128172e-05, "loss": 0.7743, "step": 1313 }, { "epoch": 3.26, "grad_norm": 0.26338526606559753, "learning_rate": 1.4855529397843038e-05, "loss": 0.7089, "step": 1314 }, { "epoch": 3.26, "grad_norm": 0.28373342752456665, "learning_rate": 1.4751551294488154e-05, "loss": 0.7911, "step": 1315 }, { "epoch": 3.26, "grad_norm": 0.29280614852905273, "learning_rate": 1.4647909372105672e-05, "loss": 0.8041, "step": 1316 }, { "epoch": 3.26, "grad_norm": 0.26794326305389404, "learning_rate": 1.454460403941207e-05, "loss": 0.7916, "step": 1317 }, { "epoch": 3.27, "grad_norm": 0.2564448416233063, "learning_rate": 1.4441635703796408e-05, "loss": 0.7495, "step": 1318 }, { "epoch": 3.27, "grad_norm": 0.2706497013568878, "learning_rate": 1.433900477131882e-05, "loss": 0.7448, "step": 1319 }, { "epoch": 3.27, "grad_norm": 0.26938024163246155, "learning_rate": 1.4236711646708844e-05, "loss": 0.7526, "step": 1320 }, { "epoch": 3.27, "grad_norm": 0.27030321955680847, "learning_rate": 1.4134756733363886e-05, "loss": 0.7563, "step": 1321 }, { "epoch": 3.28, "grad_norm": 0.25589773058891296, "learning_rate": 1.4033140433347569e-05, "loss": 0.8058, "step": 1322 }, { "epoch": 3.28, "grad_norm": 0.26774731278419495, "learning_rate": 1.3931863147388202e-05, "loss": 0.7614, "step": 1323 }, { "epoch": 3.28, "grad_norm": 0.27598094940185547, "learning_rate": 1.3830925274877216e-05, "loss": 0.7477, "step": 1324 }, { "epoch": 3.28, "grad_norm": 0.2637529671192169, "learning_rate": 1.3730327213867478e-05, "loss": 0.7795, "step": 1325 }, { "epoch": 3.29, "grad_norm": 0.25538310408592224, "learning_rate": 1.363006936107183e-05, "loss": 0.7465, "step": 1326 }, { "epoch": 3.29, "grad_norm": 0.26589810848236084, "learning_rate": 1.3530152111861483e-05, "loss": 0.7319, "step": 1327 }, { "epoch": 3.29, "grad_norm": 0.2793591618537903, "learning_rate": 1.343057586026446e-05, "loss": 0.7761, "step": 1328 }, { "epoch": 3.29, "grad_norm": 0.2663894593715668, "learning_rate": 1.333134099896406e-05, "loss": 0.7459, "step": 1329 }, { "epoch": 3.3, "grad_norm": 0.2803102433681488, "learning_rate": 1.3232447919297274e-05, "loss": 0.8253, "step": 1330 }, { "epoch": 3.3, "grad_norm": 0.26876065135002136, "learning_rate": 1.313389701125325e-05, "loss": 0.753, "step": 1331 }, { "epoch": 3.3, "grad_norm": 0.27501749992370605, "learning_rate": 1.3035688663471834e-05, "loss": 0.7993, "step": 1332 }, { "epoch": 3.3, "grad_norm": 0.2795599102973938, "learning_rate": 1.29378232632419e-05, "loss": 0.7632, "step": 1333 }, { "epoch": 3.31, "grad_norm": 0.28799694776535034, "learning_rate": 1.2840301196499893e-05, "loss": 0.7716, "step": 1334 }, { "epoch": 3.31, "grad_norm": 0.26930999755859375, "learning_rate": 1.2743122847828415e-05, "loss": 0.734, "step": 1335 }, { "epoch": 3.31, "grad_norm": 0.27179402112960815, "learning_rate": 1.2646288600454448e-05, "loss": 0.7641, "step": 1336 }, { "epoch": 3.31, "grad_norm": 0.2897116541862488, "learning_rate": 1.2549798836248072e-05, "loss": 0.7919, "step": 1337 }, { "epoch": 3.32, "grad_norm": 0.2676919996738434, "learning_rate": 1.2453653935720867e-05, "loss": 0.7945, "step": 1338 }, { "epoch": 3.32, "grad_norm": 0.286775678396225, "learning_rate": 1.2357854278024484e-05, "loss": 0.806, "step": 1339 }, { "epoch": 3.32, "grad_norm": 0.2756466865539551, "learning_rate": 1.2262400240949023e-05, "loss": 0.743, "step": 1340 }, { "epoch": 3.32, "grad_norm": 0.28273239731788635, "learning_rate": 1.216729220092162e-05, "loss": 0.803, "step": 1341 }, { "epoch": 3.33, "grad_norm": 0.2863914966583252, "learning_rate": 1.2072530533005012e-05, "loss": 0.8214, "step": 1342 }, { "epoch": 3.33, "grad_norm": 0.2831990122795105, "learning_rate": 1.197811561089598e-05, "loss": 0.7872, "step": 1343 }, { "epoch": 3.33, "grad_norm": 0.26051804423332214, "learning_rate": 1.1884047806923815e-05, "loss": 0.7473, "step": 1344 }, { "epoch": 3.33, "grad_norm": 0.27011704444885254, "learning_rate": 1.1790327492049025e-05, "loss": 0.834, "step": 1345 }, { "epoch": 3.34, "grad_norm": 0.2778359353542328, "learning_rate": 1.169695503586179e-05, "loss": 0.7466, "step": 1346 }, { "epoch": 3.34, "grad_norm": 0.2707320749759674, "learning_rate": 1.1603930806580444e-05, "loss": 0.771, "step": 1347 }, { "epoch": 3.34, "grad_norm": 0.2874731421470642, "learning_rate": 1.1511255171050084e-05, "loss": 0.7668, "step": 1348 }, { "epoch": 3.34, "grad_norm": 0.2955639660358429, "learning_rate": 1.1418928494741087e-05, "loss": 0.8177, "step": 1349 }, { "epoch": 3.35, "grad_norm": 0.2815730571746826, "learning_rate": 1.1326951141747788e-05, "loss": 0.8012, "step": 1350 }, { "epoch": 3.35, "grad_norm": 0.2788692116737366, "learning_rate": 1.1235323474786841e-05, "loss": 0.8392, "step": 1351 }, { "epoch": 3.35, "grad_norm": 0.27602502703666687, "learning_rate": 1.1144045855195973e-05, "loss": 0.7693, "step": 1352 }, { "epoch": 3.35, "grad_norm": 0.28211286664009094, "learning_rate": 1.1053118642932425e-05, "loss": 0.7498, "step": 1353 }, { "epoch": 3.36, "grad_norm": 0.29697540402412415, "learning_rate": 1.0962542196571634e-05, "loss": 0.7986, "step": 1354 }, { "epoch": 3.36, "grad_norm": 0.26737043261528015, "learning_rate": 1.0872316873305766e-05, "loss": 0.7504, "step": 1355 }, { "epoch": 3.36, "grad_norm": 0.2647438943386078, "learning_rate": 1.078244302894229e-05, "loss": 0.7657, "step": 1356 }, { "epoch": 3.36, "grad_norm": 0.27758243680000305, "learning_rate": 1.069292101790268e-05, "loss": 0.7449, "step": 1357 }, { "epoch": 3.37, "grad_norm": 0.29031941294670105, "learning_rate": 1.0603751193220846e-05, "loss": 0.8258, "step": 1358 }, { "epoch": 3.37, "grad_norm": 0.2783955931663513, "learning_rate": 1.0514933906541901e-05, "loss": 0.7852, "step": 1359 }, { "epoch": 3.37, "grad_norm": 0.2836972177028656, "learning_rate": 1.0426469508120662e-05, "loss": 0.7829, "step": 1360 }, { "epoch": 3.37, "grad_norm": 0.316567063331604, "learning_rate": 1.0338358346820353e-05, "loss": 0.7875, "step": 1361 }, { "epoch": 3.38, "grad_norm": 0.28013697266578674, "learning_rate": 1.0250600770111185e-05, "loss": 0.8016, "step": 1362 }, { "epoch": 3.38, "grad_norm": 0.2763884663581848, "learning_rate": 1.0163197124068957e-05, "loss": 0.7796, "step": 1363 }, { "epoch": 3.38, "grad_norm": 0.27770814299583435, "learning_rate": 1.0076147753373789e-05, "loss": 0.7898, "step": 1364 }, { "epoch": 3.38, "grad_norm": 0.27236419916152954, "learning_rate": 9.989453001308657e-06, "loss": 0.722, "step": 1365 }, { "epoch": 3.39, "grad_norm": 0.2883667051792145, "learning_rate": 9.903113209758096e-06, "loss": 0.7246, "step": 1366 }, { "epoch": 3.39, "grad_norm": 0.2697809040546417, "learning_rate": 9.817128719206825e-06, "loss": 0.7397, "step": 1367 }, { "epoch": 3.39, "grad_norm": 0.28278815746307373, "learning_rate": 9.731499868738447e-06, "loss": 0.7335, "step": 1368 }, { "epoch": 3.39, "grad_norm": 0.2717222273349762, "learning_rate": 9.646226996034048e-06, "loss": 0.7905, "step": 1369 }, { "epoch": 3.4, "grad_norm": 0.2660965323448181, "learning_rate": 9.561310437370907e-06, "loss": 0.7718, "step": 1370 }, { "epoch": 3.4, "grad_norm": 0.2756327688694, "learning_rate": 9.476750527621214e-06, "loss": 0.7862, "step": 1371 }, { "epoch": 3.4, "grad_norm": 0.26889774203300476, "learning_rate": 9.392547600250634e-06, "loss": 0.7589, "step": 1372 }, { "epoch": 3.4, "grad_norm": 0.27550622820854187, "learning_rate": 9.308701987317081e-06, "loss": 0.773, "step": 1373 }, { "epoch": 3.41, "grad_norm": 0.2738642990589142, "learning_rate": 9.225214019469385e-06, "loss": 0.754, "step": 1374 }, { "epoch": 3.41, "grad_norm": 0.26347842812538147, "learning_rate": 9.142084025945984e-06, "loss": 0.737, "step": 1375 }, { "epoch": 3.41, "grad_norm": 0.27429768443107605, "learning_rate": 9.059312334573633e-06, "loss": 0.796, "step": 1376 }, { "epoch": 3.41, "grad_norm": 0.28162965178489685, "learning_rate": 8.976899271766092e-06, "loss": 0.8138, "step": 1377 }, { "epoch": 3.42, "grad_norm": 0.27058038115501404, "learning_rate": 8.89484516252287e-06, "loss": 0.7743, "step": 1378 }, { "epoch": 3.42, "grad_norm": 0.2732222080230713, "learning_rate": 8.813150330427945e-06, "loss": 0.7831, "step": 1379 }, { "epoch": 3.42, "grad_norm": 0.2896285057067871, "learning_rate": 8.731815097648433e-06, "loss": 0.7717, "step": 1380 }, { "epoch": 3.42, "grad_norm": 0.27621400356292725, "learning_rate": 8.65083978493334e-06, "loss": 0.755, "step": 1381 }, { "epoch": 3.43, "grad_norm": 0.2736296057701111, "learning_rate": 8.570224711612385e-06, "loss": 0.7568, "step": 1382 }, { "epoch": 3.43, "grad_norm": 0.273482084274292, "learning_rate": 8.489970195594632e-06, "loss": 0.7436, "step": 1383 }, { "epoch": 3.43, "grad_norm": 0.28220561146736145, "learning_rate": 8.410076553367208e-06, "loss": 0.7433, "step": 1384 }, { "epoch": 3.43, "grad_norm": 0.2805227041244507, "learning_rate": 8.330544099994187e-06, "loss": 0.8022, "step": 1385 }, { "epoch": 3.44, "grad_norm": 0.27991506457328796, "learning_rate": 8.251373149115293e-06, "loss": 0.7144, "step": 1386 }, { "epoch": 3.44, "grad_norm": 0.28852060437202454, "learning_rate": 8.172564012944595e-06, "loss": 0.8103, "step": 1387 }, { "epoch": 3.44, "grad_norm": 0.28653353452682495, "learning_rate": 8.094117002269363e-06, "loss": 0.7789, "step": 1388 }, { "epoch": 3.44, "grad_norm": 0.26750099658966064, "learning_rate": 8.016032426448817e-06, "loss": 0.7979, "step": 1389 }, { "epoch": 3.45, "grad_norm": 0.2889355421066284, "learning_rate": 7.938310593412879e-06, "loss": 0.7741, "step": 1390 }, { "epoch": 3.45, "grad_norm": 0.2845998704433441, "learning_rate": 7.860951809660989e-06, "loss": 0.7921, "step": 1391 }, { "epoch": 3.45, "grad_norm": 0.2809029519557953, "learning_rate": 7.783956380260837e-06, "loss": 0.7838, "step": 1392 }, { "epoch": 3.45, "grad_norm": 0.27812841534614563, "learning_rate": 7.70732460884731e-06, "loss": 0.7296, "step": 1393 }, { "epoch": 3.46, "grad_norm": 0.2806148827075958, "learning_rate": 7.631056797621106e-06, "loss": 0.7499, "step": 1394 }, { "epoch": 3.46, "grad_norm": 0.27627986669540405, "learning_rate": 7.5551532473476795e-06, "loss": 0.6947, "step": 1395 }, { "epoch": 3.46, "grad_norm": 0.28309470415115356, "learning_rate": 7.479614257355971e-06, "loss": 0.7499, "step": 1396 }, { "epoch": 3.46, "grad_norm": 0.2808564007282257, "learning_rate": 7.404440125537293e-06, "loss": 0.7625, "step": 1397 }, { "epoch": 3.47, "grad_norm": 0.27724358439445496, "learning_rate": 7.329631148344118e-06, "loss": 0.816, "step": 1398 }, { "epoch": 3.47, "grad_norm": 0.28232425451278687, "learning_rate": 7.255187620788894e-06, "loss": 0.7401, "step": 1399 }, { "epoch": 3.47, "grad_norm": 0.2749876081943512, "learning_rate": 7.181109836442912e-06, "loss": 0.8056, "step": 1400 }, { "epoch": 3.47, "eval_loss": 1.1313637495040894, "eval_runtime": 81.5978, "eval_samples_per_second": 31.851, "eval_steps_per_second": 31.851, "step": 1400 }, { "epoch": 3.47, "grad_norm": 0.2691822350025177, "learning_rate": 7.1073980874351575e-06, "loss": 0.7741, "step": 1401 }, { "epoch": 3.48, "grad_norm": 0.2862139642238617, "learning_rate": 7.034052664451118e-06, "loss": 0.7749, "step": 1402 }, { "epoch": 3.48, "grad_norm": 0.27442795038223267, "learning_rate": 6.961073856731648e-06, "loss": 0.8034, "step": 1403 }, { "epoch": 3.48, "grad_norm": 0.2818858027458191, "learning_rate": 6.88846195207189e-06, "loss": 0.7506, "step": 1404 }, { "epoch": 3.48, "grad_norm": 0.28241288661956787, "learning_rate": 6.816217236820032e-06, "loss": 0.7671, "step": 1405 }, { "epoch": 3.49, "grad_norm": 0.28324833512306213, "learning_rate": 6.7443399958762584e-06, "loss": 0.7817, "step": 1406 }, { "epoch": 3.49, "grad_norm": 0.27306148409843445, "learning_rate": 6.672830512691608e-06, "loss": 0.7091, "step": 1407 }, { "epoch": 3.49, "grad_norm": 0.2793382406234741, "learning_rate": 6.6016890692668364e-06, "loss": 0.7748, "step": 1408 }, { "epoch": 3.49, "grad_norm": 0.2656705975532532, "learning_rate": 6.530915946151339e-06, "loss": 0.8124, "step": 1409 }, { "epoch": 3.5, "grad_norm": 0.2844313979148865, "learning_rate": 6.460511422441984e-06, "loss": 0.7791, "step": 1410 }, { "epoch": 3.5, "grad_norm": 0.27086642384529114, "learning_rate": 6.390475775782101e-06, "loss": 0.7942, "step": 1411 }, { "epoch": 3.5, "grad_norm": 0.27322399616241455, "learning_rate": 6.320809282360319e-06, "loss": 0.7913, "step": 1412 }, { "epoch": 3.5, "grad_norm": 0.2926284074783325, "learning_rate": 6.2515122169094835e-06, "loss": 0.8321, "step": 1413 }, { "epoch": 3.51, "grad_norm": 0.2735247313976288, "learning_rate": 6.1825848527055865e-06, "loss": 0.758, "step": 1414 }, { "epoch": 3.51, "grad_norm": 0.2706631124019623, "learning_rate": 6.114027461566696e-06, "loss": 0.7633, "step": 1415 }, { "epoch": 3.51, "grad_norm": 0.2732473909854889, "learning_rate": 6.04584031385188e-06, "loss": 0.7726, "step": 1416 }, { "epoch": 3.51, "grad_norm": 0.2772659957408905, "learning_rate": 5.978023678460099e-06, "loss": 0.6985, "step": 1417 }, { "epoch": 3.52, "grad_norm": 0.28777289390563965, "learning_rate": 5.910577822829233e-06, "loss": 0.7461, "step": 1418 }, { "epoch": 3.52, "grad_norm": 0.2738015353679657, "learning_rate": 5.843503012934959e-06, "loss": 0.832, "step": 1419 }, { "epoch": 3.52, "grad_norm": 0.269728422164917, "learning_rate": 5.77679951328971e-06, "loss": 0.7603, "step": 1420 }, { "epoch": 3.52, "grad_norm": 0.2839926481246948, "learning_rate": 5.710467586941615e-06, "loss": 0.7431, "step": 1421 }, { "epoch": 3.53, "grad_norm": 0.2668074369430542, "learning_rate": 5.644507495473572e-06, "loss": 0.7568, "step": 1422 }, { "epoch": 3.53, "grad_norm": 0.283124715089798, "learning_rate": 5.5789194990020225e-06, "loss": 0.7723, "step": 1423 }, { "epoch": 3.53, "grad_norm": 0.2843737304210663, "learning_rate": 5.5137038561761115e-06, "loss": 0.8047, "step": 1424 }, { "epoch": 3.53, "grad_norm": 0.2718368172645569, "learning_rate": 5.4488608241765494e-06, "loss": 0.7024, "step": 1425 }, { "epoch": 3.54, "grad_norm": 0.28055357933044434, "learning_rate": 5.3843906587146886e-06, "loss": 0.7997, "step": 1426 }, { "epoch": 3.54, "grad_norm": 0.28059250116348267, "learning_rate": 5.320293614031413e-06, "loss": 0.7806, "step": 1427 }, { "epoch": 3.54, "grad_norm": 0.26555076241493225, "learning_rate": 5.256569942896217e-06, "loss": 0.7708, "step": 1428 }, { "epoch": 3.54, "grad_norm": 0.2872565984725952, "learning_rate": 5.193219896606194e-06, "loss": 0.7687, "step": 1429 }, { "epoch": 3.55, "grad_norm": 0.2674780786037445, "learning_rate": 5.130243724984995e-06, "loss": 0.7507, "step": 1430 }, { "epoch": 3.55, "grad_norm": 0.28814786672592163, "learning_rate": 5.067641676381918e-06, "loss": 0.7567, "step": 1431 }, { "epoch": 3.55, "grad_norm": 0.28274017572402954, "learning_rate": 5.005413997670816e-06, "loss": 0.8348, "step": 1432 }, { "epoch": 3.55, "grad_norm": 0.2761125862598419, "learning_rate": 4.9435609342493025e-06, "loss": 0.7556, "step": 1433 }, { "epoch": 3.56, "grad_norm": 0.27695879340171814, "learning_rate": 4.8820827300376075e-06, "loss": 0.7591, "step": 1434 }, { "epoch": 3.56, "grad_norm": 0.2901512086391449, "learning_rate": 4.820979627477706e-06, "loss": 0.8176, "step": 1435 }, { "epoch": 3.56, "grad_norm": 0.28417840600013733, "learning_rate": 4.760251867532362e-06, "loss": 0.799, "step": 1436 }, { "epoch": 3.56, "grad_norm": 0.28090596199035645, "learning_rate": 4.699899689684129e-06, "loss": 0.8039, "step": 1437 }, { "epoch": 3.57, "grad_norm": 0.27434042096138, "learning_rate": 4.639923331934471e-06, "loss": 0.7967, "step": 1438 }, { "epoch": 3.57, "grad_norm": 0.2743697464466095, "learning_rate": 4.5803230308027356e-06, "loss": 0.7745, "step": 1439 }, { "epoch": 3.57, "grad_norm": 0.2777216136455536, "learning_rate": 4.521099021325336e-06, "loss": 0.7893, "step": 1440 }, { "epoch": 3.57, "grad_norm": 0.2794959545135498, "learning_rate": 4.462251537054718e-06, "loss": 0.8102, "step": 1441 }, { "epoch": 3.58, "grad_norm": 0.29014652967453003, "learning_rate": 4.403780810058511e-06, "loss": 0.7683, "step": 1442 }, { "epoch": 3.58, "grad_norm": 0.2840610444545746, "learning_rate": 4.345687070918559e-06, "loss": 0.7983, "step": 1443 }, { "epoch": 3.58, "grad_norm": 0.28343015909194946, "learning_rate": 4.287970548730069e-06, "loss": 0.7894, "step": 1444 }, { "epoch": 3.58, "grad_norm": 0.2948470115661621, "learning_rate": 4.230631471100655e-06, "loss": 0.7753, "step": 1445 }, { "epoch": 3.59, "grad_norm": 0.27446454763412476, "learning_rate": 4.173670064149482e-06, "loss": 0.7713, "step": 1446 }, { "epoch": 3.59, "grad_norm": 0.27941253781318665, "learning_rate": 4.117086552506322e-06, "loss": 0.7558, "step": 1447 }, { "epoch": 3.59, "grad_norm": 0.2800524830818176, "learning_rate": 4.060881159310725e-06, "loss": 0.767, "step": 1448 }, { "epoch": 3.59, "grad_norm": 0.2823658883571625, "learning_rate": 4.005054106211104e-06, "loss": 0.8294, "step": 1449 }, { "epoch": 3.6, "grad_norm": 0.273548424243927, "learning_rate": 3.949605613363882e-06, "loss": 0.767, "step": 1450 }, { "epoch": 3.6, "grad_norm": 0.2744486927986145, "learning_rate": 3.894535899432606e-06, "loss": 0.7873, "step": 1451 }, { "epoch": 3.6, "grad_norm": 0.2942350208759308, "learning_rate": 3.839845181587098e-06, "loss": 0.8005, "step": 1452 }, { "epoch": 3.6, "grad_norm": 0.2791810631752014, "learning_rate": 3.785533675502584e-06, "loss": 0.7904, "step": 1453 }, { "epoch": 3.61, "grad_norm": 0.2806760370731354, "learning_rate": 3.7316015953588467e-06, "loss": 0.7576, "step": 1454 }, { "epoch": 3.61, "grad_norm": 0.2763613164424896, "learning_rate": 3.6780491538394025e-06, "loss": 0.7792, "step": 1455 }, { "epoch": 3.61, "grad_norm": 0.26908808946609497, "learning_rate": 3.6248765621306414e-06, "loss": 0.7388, "step": 1456 }, { "epoch": 3.61, "grad_norm": 0.2751446068286896, "learning_rate": 3.5720840299209747e-06, "loss": 0.7636, "step": 1457 }, { "epoch": 3.62, "grad_norm": 0.27938565611839294, "learning_rate": 3.519671765400079e-06, "loss": 0.7842, "step": 1458 }, { "epoch": 3.62, "grad_norm": 0.30847108364105225, "learning_rate": 3.467639975257997e-06, "loss": 0.7646, "step": 1459 }, { "epoch": 3.62, "grad_norm": 0.27805548906326294, "learning_rate": 3.4159888646843495e-06, "loss": 0.7568, "step": 1460 }, { "epoch": 3.62, "grad_norm": 0.27638089656829834, "learning_rate": 3.364718637367548e-06, "loss": 0.7849, "step": 1461 }, { "epoch": 3.63, "grad_norm": 0.2811928391456604, "learning_rate": 3.313829495493992e-06, "loss": 0.8197, "step": 1462 }, { "epoch": 3.63, "grad_norm": 0.29005560278892517, "learning_rate": 3.2633216397471966e-06, "loss": 0.7654, "step": 1463 }, { "epoch": 3.63, "grad_norm": 0.2657933533191681, "learning_rate": 3.2131952693070898e-06, "loss": 0.7621, "step": 1464 }, { "epoch": 3.63, "grad_norm": 0.29370948672294617, "learning_rate": 3.1634505818492256e-06, "loss": 0.8145, "step": 1465 }, { "epoch": 3.64, "grad_norm": 0.2741262912750244, "learning_rate": 3.1140877735439387e-06, "loss": 0.7315, "step": 1466 }, { "epoch": 3.64, "grad_norm": 0.2838849127292633, "learning_rate": 3.0651070390556034e-06, "loss": 0.7401, "step": 1467 }, { "epoch": 3.64, "grad_norm": 0.2751835584640503, "learning_rate": 3.0165085715418763e-06, "loss": 0.8034, "step": 1468 }, { "epoch": 3.64, "grad_norm": 0.2787778973579407, "learning_rate": 2.9682925626529522e-06, "loss": 0.77, "step": 1469 }, { "epoch": 3.65, "grad_norm": 0.26870402693748474, "learning_rate": 2.9204592025307566e-06, "loss": 0.7741, "step": 1470 }, { "epoch": 3.65, "grad_norm": 0.2907876968383789, "learning_rate": 2.87300867980822e-06, "loss": 0.7936, "step": 1471 }, { "epoch": 3.65, "grad_norm": 0.2779775857925415, "learning_rate": 2.8259411816085492e-06, "loss": 0.8232, "step": 1472 }, { "epoch": 3.65, "grad_norm": 0.2774050235748291, "learning_rate": 2.7792568935444796e-06, "loss": 0.7313, "step": 1473 }, { "epoch": 3.66, "grad_norm": 0.2906379699707031, "learning_rate": 2.732955999717546e-06, "loss": 0.7736, "step": 1474 }, { "epoch": 3.66, "grad_norm": 0.2775977849960327, "learning_rate": 2.687038682717302e-06, "loss": 0.7589, "step": 1475 }, { "epoch": 3.66, "grad_norm": 0.26822540163993835, "learning_rate": 2.6415051236207355e-06, "loss": 0.7631, "step": 1476 }, { "epoch": 3.66, "grad_norm": 0.30677536129951477, "learning_rate": 2.5963555019913988e-06, "loss": 0.8044, "step": 1477 }, { "epoch": 3.67, "grad_norm": 0.29074108600616455, "learning_rate": 2.551589995878789e-06, "loss": 0.8274, "step": 1478 }, { "epoch": 3.67, "grad_norm": 0.27496954798698425, "learning_rate": 2.5072087818176382e-06, "loss": 0.7334, "step": 1479 }, { "epoch": 3.67, "grad_norm": 0.2761988043785095, "learning_rate": 2.4632120348272003e-06, "loss": 0.776, "step": 1480 }, { "epoch": 3.68, "grad_norm": 0.2803685665130615, "learning_rate": 2.419599928410554e-06, "loss": 0.8236, "step": 1481 }, { "epoch": 3.68, "grad_norm": 0.2737310230731964, "learning_rate": 2.376372634553936e-06, "loss": 0.7394, "step": 1482 }, { "epoch": 3.68, "grad_norm": 0.2803831100463867, "learning_rate": 2.3335303237260853e-06, "loss": 0.771, "step": 1483 }, { "epoch": 3.68, "grad_norm": 0.289489209651947, "learning_rate": 2.291073164877511e-06, "loss": 0.7389, "step": 1484 }, { "epoch": 3.69, "grad_norm": 0.27078330516815186, "learning_rate": 2.24900132543987e-06, "loss": 0.7704, "step": 1485 }, { "epoch": 3.69, "grad_norm": 0.2747279107570648, "learning_rate": 2.207314971325292e-06, "loss": 0.7244, "step": 1486 }, { "epoch": 3.69, "grad_norm": 0.2696608304977417, "learning_rate": 2.166014266925731e-06, "loss": 0.7311, "step": 1487 }, { "epoch": 3.69, "grad_norm": 0.2755465507507324, "learning_rate": 2.125099375112316e-06, "loss": 0.7673, "step": 1488 }, { "epoch": 3.7, "grad_norm": 0.28506776690483093, "learning_rate": 2.0845704572347025e-06, "loss": 0.7479, "step": 1489 }, { "epoch": 3.7, "grad_norm": 0.2692389190196991, "learning_rate": 2.0444276731204415e-06, "loss": 0.7695, "step": 1490 }, { "epoch": 3.7, "grad_norm": 0.2776127755641937, "learning_rate": 2.004671181074369e-06, "loss": 0.7398, "step": 1491 }, { "epoch": 3.7, "grad_norm": 0.2748464345932007, "learning_rate": 1.9653011378779283e-06, "loss": 0.7599, "step": 1492 }, { "epoch": 3.71, "grad_norm": 0.28339067101478577, "learning_rate": 1.9263176987886043e-06, "loss": 0.7723, "step": 1493 }, { "epoch": 3.71, "grad_norm": 0.27863991260528564, "learning_rate": 1.88772101753929e-06, "loss": 0.7492, "step": 1494 }, { "epoch": 3.71, "grad_norm": 0.2686544358730316, "learning_rate": 1.8495112463376874e-06, "loss": 0.7448, "step": 1495 }, { "epoch": 3.71, "grad_norm": 0.2779110372066498, "learning_rate": 1.8116885358656744e-06, "loss": 0.7693, "step": 1496 }, { "epoch": 3.72, "grad_norm": 0.2744112014770508, "learning_rate": 1.7742530352787612e-06, "loss": 0.7515, "step": 1497 }, { "epoch": 3.72, "grad_norm": 0.2754310071468353, "learning_rate": 1.7372048922054906e-06, "loss": 0.7747, "step": 1498 }, { "epoch": 3.72, "grad_norm": 0.28231918811798096, "learning_rate": 1.7005442527468163e-06, "loss": 0.7966, "step": 1499 }, { "epoch": 3.72, "grad_norm": 0.280091792345047, "learning_rate": 1.6642712614755695e-06, "loss": 0.7895, "step": 1500 }, { "epoch": 3.72, "eval_loss": 1.1296623945236206, "eval_runtime": 81.6396, "eval_samples_per_second": 31.835, "eval_steps_per_second": 31.835, "step": 1500 }, { "epoch": 3.73, "grad_norm": 0.2754596471786499, "learning_rate": 1.6283860614358936e-06, "loss": 0.7579, "step": 1501 }, { "epoch": 3.73, "grad_norm": 0.2746996283531189, "learning_rate": 1.5928887941426107e-06, "loss": 0.7708, "step": 1502 }, { "epoch": 3.73, "grad_norm": 0.2860219180583954, "learning_rate": 1.5577795995807554e-06, "loss": 0.7738, "step": 1503 }, { "epoch": 3.73, "grad_norm": 0.28111305832862854, "learning_rate": 1.523058616204942e-06, "loss": 0.7446, "step": 1504 }, { "epoch": 3.74, "grad_norm": 0.2891142964363098, "learning_rate": 1.4887259809389208e-06, "loss": 0.7357, "step": 1505 }, { "epoch": 3.74, "grad_norm": 0.3040487468242645, "learning_rate": 1.4547818291749115e-06, "loss": 0.8158, "step": 1506 }, { "epoch": 3.74, "grad_norm": 0.27055880427360535, "learning_rate": 1.4212262947731703e-06, "loss": 0.7825, "step": 1507 }, { "epoch": 3.74, "grad_norm": 0.2738685607910156, "learning_rate": 1.3880595100613792e-06, "loss": 0.8117, "step": 1508 }, { "epoch": 3.75, "grad_norm": 0.26631131768226624, "learning_rate": 1.3552816058342354e-06, "loss": 0.7624, "step": 1509 }, { "epoch": 3.75, "grad_norm": 0.275269478559494, "learning_rate": 1.3228927113528189e-06, "loss": 0.7361, "step": 1510 }, { "epoch": 3.75, "grad_norm": 0.277022123336792, "learning_rate": 1.290892954344125e-06, "loss": 0.8235, "step": 1511 }, { "epoch": 3.75, "grad_norm": 0.28082889318466187, "learning_rate": 1.2592824610006215e-06, "loss": 0.7851, "step": 1512 }, { "epoch": 3.76, "grad_norm": 0.27698802947998047, "learning_rate": 1.2280613559796595e-06, "loss": 0.7763, "step": 1513 }, { "epoch": 3.76, "grad_norm": 0.2701203227043152, "learning_rate": 1.1972297624030072e-06, "loss": 0.7365, "step": 1514 }, { "epoch": 3.76, "grad_norm": 0.28105244040489197, "learning_rate": 1.1667878018564171e-06, "loss": 0.7405, "step": 1515 }, { "epoch": 3.76, "grad_norm": 0.28660351037979126, "learning_rate": 1.1367355943890823e-06, "loss": 0.8375, "step": 1516 }, { "epoch": 3.77, "grad_norm": 0.2766847312450409, "learning_rate": 1.1070732585132026e-06, "loss": 0.7419, "step": 1517 }, { "epoch": 3.77, "grad_norm": 0.27921485900878906, "learning_rate": 1.0778009112034748e-06, "loss": 0.81, "step": 1518 }, { "epoch": 3.77, "grad_norm": 0.273929625749588, "learning_rate": 1.0489186678966812e-06, "loss": 0.7605, "step": 1519 }, { "epoch": 3.77, "grad_norm": 0.2831816077232361, "learning_rate": 1.0204266424912123e-06, "loss": 0.8009, "step": 1520 }, { "epoch": 3.78, "grad_norm": 0.271957665681839, "learning_rate": 9.923249473466012e-07, "loss": 0.7516, "step": 1521 }, { "epoch": 3.78, "grad_norm": 0.2766115665435791, "learning_rate": 9.64613693283123e-07, "loss": 0.7503, "step": 1522 }, { "epoch": 3.78, "grad_norm": 0.2731226086616516, "learning_rate": 9.372929895813065e-07, "loss": 0.7647, "step": 1523 }, { "epoch": 3.78, "grad_norm": 0.2769404947757721, "learning_rate": 9.103629439815354e-07, "loss": 0.754, "step": 1524 }, { "epoch": 3.79, "grad_norm": 0.27753058075904846, "learning_rate": 8.838236626836138e-07, "loss": 0.7763, "step": 1525 }, { "epoch": 3.79, "grad_norm": 0.2744705379009247, "learning_rate": 8.57675250346368e-07, "loss": 0.7591, "step": 1526 }, { "epoch": 3.79, "grad_norm": 0.2842487394809723, "learning_rate": 8.319178100872016e-07, "loss": 0.7368, "step": 1527 }, { "epoch": 3.79, "grad_norm": 0.26666221022605896, "learning_rate": 8.065514434816845e-07, "loss": 0.7895, "step": 1528 }, { "epoch": 3.8, "grad_norm": 0.2703384459018707, "learning_rate": 7.815762505632096e-07, "loss": 0.7919, "step": 1529 }, { "epoch": 3.8, "grad_norm": 0.2714150846004486, "learning_rate": 7.569923298225146e-07, "loss": 0.7581, "step": 1530 }, { "epoch": 3.8, "grad_norm": 0.2792799174785614, "learning_rate": 7.327997782073936e-07, "loss": 0.7965, "step": 1531 }, { "epoch": 3.8, "grad_norm": 0.2913144528865814, "learning_rate": 7.08998691122198e-07, "loss": 0.7865, "step": 1532 }, { "epoch": 3.81, "grad_norm": 0.2729615569114685, "learning_rate": 6.855891624275801e-07, "loss": 0.7509, "step": 1533 }, { "epoch": 3.81, "grad_norm": 0.2664259672164917, "learning_rate": 6.625712844400056e-07, "loss": 0.7275, "step": 1534 }, { "epoch": 3.81, "grad_norm": 0.2921704053878784, "learning_rate": 6.399451479315088e-07, "loss": 0.8285, "step": 1535 }, { "epoch": 3.81, "grad_norm": 0.26288658380508423, "learning_rate": 6.177108421292266e-07, "loss": 0.7178, "step": 1536 }, { "epoch": 3.82, "grad_norm": 0.2717953026294708, "learning_rate": 5.958684547151095e-07, "loss": 0.767, "step": 1537 }, { "epoch": 3.82, "grad_norm": 0.2692357897758484, "learning_rate": 5.744180718255776e-07, "loss": 0.8129, "step": 1538 }, { "epoch": 3.82, "grad_norm": 0.28223443031311035, "learning_rate": 5.533597780511435e-07, "loss": 0.7437, "step": 1539 }, { "epoch": 3.82, "grad_norm": 0.2923904061317444, "learning_rate": 5.326936564361118e-07, "loss": 0.8025, "step": 1540 }, { "epoch": 3.83, "grad_norm": 0.29196780920028687, "learning_rate": 5.124197884782356e-07, "loss": 0.7651, "step": 1541 }, { "epoch": 3.83, "grad_norm": 0.2696937620639801, "learning_rate": 4.92538254128383e-07, "loss": 0.6992, "step": 1542 }, { "epoch": 3.83, "grad_norm": 0.2914939820766449, "learning_rate": 4.7304913179025965e-07, "loss": 0.8015, "step": 1543 }, { "epoch": 3.83, "grad_norm": 0.2795959413051605, "learning_rate": 4.5395249832007604e-07, "loss": 0.759, "step": 1544 }, { "epoch": 3.84, "grad_norm": 0.2830215096473694, "learning_rate": 4.352484290262249e-07, "loss": 0.7622, "step": 1545 }, { "epoch": 3.84, "grad_norm": 0.2834395170211792, "learning_rate": 4.1693699766902626e-07, "loss": 0.7639, "step": 1546 }, { "epoch": 3.84, "grad_norm": 0.2674468755722046, "learning_rate": 3.9901827646039446e-07, "loss": 0.7031, "step": 1547 }, { "epoch": 3.84, "grad_norm": 0.2810966968536377, "learning_rate": 3.814923360636158e-07, "loss": 0.7395, "step": 1548 }, { "epoch": 3.85, "grad_norm": 0.2885318398475647, "learning_rate": 3.643592455929712e-07, "loss": 0.813, "step": 1549 }, { "epoch": 3.85, "grad_norm": 0.2727202773094177, "learning_rate": 3.4761907261356976e-07, "loss": 0.7284, "step": 1550 }, { "epoch": 3.85, "grad_norm": 0.26994508504867554, "learning_rate": 3.3127188314100444e-07, "loss": 0.7561, "step": 1551 }, { "epoch": 3.85, "grad_norm": 0.28642725944519043, "learning_rate": 3.1531774164111903e-07, "loss": 0.7287, "step": 1552 }, { "epoch": 3.86, "grad_norm": 0.2679294943809509, "learning_rate": 2.997567110297861e-07, "loss": 0.7386, "step": 1553 }, { "epoch": 3.86, "grad_norm": 0.27557045221328735, "learning_rate": 2.8458885267260705e-07, "loss": 0.7623, "step": 1554 }, { "epoch": 3.86, "grad_norm": 0.28410604596138, "learning_rate": 2.6981422638466814e-07, "loss": 0.8263, "step": 1555 }, { "epoch": 3.86, "grad_norm": 0.27892133593559265, "learning_rate": 2.554328904303738e-07, "loss": 0.7985, "step": 1556 }, { "epoch": 3.87, "grad_norm": 0.2855932414531708, "learning_rate": 2.414449015231357e-07, "loss": 0.7673, "step": 1557 }, { "epoch": 3.87, "grad_norm": 0.27998268604278564, "learning_rate": 2.2785031482521758e-07, "loss": 0.8088, "step": 1558 }, { "epoch": 3.87, "grad_norm": 0.2954777777194977, "learning_rate": 2.1464918394743516e-07, "loss": 0.7499, "step": 1559 }, { "epoch": 3.87, "grad_norm": 0.2695976197719574, "learning_rate": 2.0184156094905648e-07, "loss": 0.7803, "step": 1560 }, { "epoch": 3.88, "grad_norm": 0.2727389931678772, "learning_rate": 1.894274963374798e-07, "loss": 0.7405, "step": 1561 }, { "epoch": 3.88, "grad_norm": 0.26949208974838257, "learning_rate": 1.7740703906810042e-07, "loss": 0.7879, "step": 1562 }, { "epoch": 3.88, "grad_norm": 0.27831128239631653, "learning_rate": 1.657802365441441e-07, "loss": 0.7484, "step": 1563 }, { "epoch": 3.88, "grad_norm": 0.26965537667274475, "learning_rate": 1.545471346164007e-07, "loss": 0.7134, "step": 1564 }, { "epoch": 3.89, "grad_norm": 0.2697620987892151, "learning_rate": 1.4370777758307974e-07, "loss": 0.7667, "step": 1565 }, { "epoch": 3.89, "grad_norm": 0.2854253649711609, "learning_rate": 1.3326220818968838e-07, "loss": 0.7328, "step": 1566 }, { "epoch": 3.89, "grad_norm": 0.28360316157341003, "learning_rate": 1.2321046762876487e-07, "loss": 0.7763, "step": 1567 }, { "epoch": 3.89, "grad_norm": 0.28326350450515747, "learning_rate": 1.1355259553978981e-07, "loss": 0.8221, "step": 1568 }, { "epoch": 3.9, "grad_norm": 0.2706443667411804, "learning_rate": 1.0428863000899735e-07, "loss": 0.7627, "step": 1569 }, { "epoch": 3.9, "grad_norm": 0.27884283661842346, "learning_rate": 9.541860756925314e-08, "loss": 0.7344, "step": 1570 }, { "epoch": 3.9, "grad_norm": 0.2777668833732605, "learning_rate": 8.694256319987659e-08, "loss": 0.7544, "step": 1571 }, { "epoch": 3.9, "grad_norm": 0.2982774078845978, "learning_rate": 7.886053032649665e-08, "loss": 0.7994, "step": 1572 }, { "epoch": 3.91, "grad_norm": 0.27645036578178406, "learning_rate": 7.117254082098512e-08, "loss": 0.7258, "step": 1573 }, { "epoch": 3.91, "grad_norm": 0.28957119584083557, "learning_rate": 6.387862500125685e-08, "loss": 0.767, "step": 1574 }, { "epoch": 3.91, "grad_norm": 0.27518337965011597, "learning_rate": 5.697881163118091e-08, "loss": 0.7331, "step": 1575 }, { "epoch": 3.91, "grad_norm": 0.2808379530906677, "learning_rate": 5.047312792046954e-08, "loss": 0.7446, "step": 1576 }, { "epoch": 3.92, "grad_norm": 0.2827063798904419, "learning_rate": 4.4361599524589406e-08, "loss": 0.8012, "step": 1577 }, { "epoch": 3.92, "grad_norm": 0.2789200246334076, "learning_rate": 3.8644250544594975e-08, "loss": 0.8029, "step": 1578 }, { "epoch": 3.92, "grad_norm": 0.2743687033653259, "learning_rate": 3.332110352712858e-08, "loss": 0.8483, "step": 1579 }, { "epoch": 3.92, "grad_norm": 0.28349533677101135, "learning_rate": 2.839217946422057e-08, "loss": 0.8181, "step": 1580 }, { "epoch": 3.93, "grad_norm": 0.28121450543403625, "learning_rate": 2.385749779332258e-08, "loss": 0.7622, "step": 1581 }, { "epoch": 3.93, "grad_norm": 0.2831059694290161, "learning_rate": 1.971707639712994e-08, "loss": 0.8035, "step": 1582 }, { "epoch": 3.93, "grad_norm": 0.29177501797676086, "learning_rate": 1.5970931603592752e-08, "loss": 0.7915, "step": 1583 }, { "epoch": 3.93, "grad_norm": 0.2955121695995331, "learning_rate": 1.2619078185793776e-08, "loss": 0.7879, "step": 1584 }, { "epoch": 3.94, "grad_norm": 0.2781122624874115, "learning_rate": 9.661529361892907e-09, "loss": 0.7315, "step": 1585 }, { "epoch": 3.94, "grad_norm": 0.2738924026489258, "learning_rate": 7.098296795138293e-09, "loss": 0.743, "step": 1586 }, { "epoch": 3.94, "grad_norm": 0.276124507188797, "learning_rate": 4.929390593744199e-09, "loss": 0.7888, "step": 1587 }, { "epoch": 3.94, "grad_norm": 0.2650710940361023, "learning_rate": 3.154819310868806e-09, "loss": 0.7843, "step": 1588 }, { "epoch": 3.95, "grad_norm": 0.2870003879070282, "learning_rate": 1.7745899446364178e-09, "loss": 0.7683, "step": 1589 }, { "epoch": 3.95, "grad_norm": 0.2782718241214752, "learning_rate": 7.887079380153317e-10, "loss": 0.7786, "step": 1590 }, { "epoch": 3.95, "grad_norm": 0.2961905598640442, "learning_rate": 1.9717717889555787e-10, "loss": 0.7908, "step": 1591 }, { "epoch": 3.95, "grad_norm": 0.2717040181159973, "learning_rate": 0.0, "loss": 0.7326, "step": 1592 } ], "logging_steps": 1, "max_steps": 1592, "num_input_tokens_seen": 0, "num_train_epochs": 4, "save_steps": 398, "total_flos": 1.187648639846056e+18, "train_batch_size": 1, "trial_name": null, "trial_params": null }