{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.971014492753623, "eval_steps": 121, "global_step": 966, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.002070393374741201, "grad_norm": 0.09777352213859558, "learning_rate": 1.0000000000000002e-06, "loss": 0.6308, "step": 1 }, { "epoch": 0.002070393374741201, "eval_loss": 0.5235128402709961, "eval_runtime": 93.3308, "eval_samples_per_second": 5.914, "eval_steps_per_second": 0.739, "step": 1 }, { "epoch": 0.004140786749482402, "grad_norm": 0.09036517143249512, "learning_rate": 2.0000000000000003e-06, "loss": 0.4787, "step": 2 }, { "epoch": 0.006211180124223602, "grad_norm": 0.0952724814414978, "learning_rate": 3e-06, "loss": 0.5643, "step": 3 }, { "epoch": 0.008281573498964804, "grad_norm": 0.10640479624271393, "learning_rate": 4.000000000000001e-06, "loss": 0.6445, "step": 4 }, { "epoch": 0.010351966873706004, "grad_norm": 0.0985221341252327, "learning_rate": 5e-06, "loss": 0.5846, "step": 5 }, { "epoch": 0.012422360248447204, "grad_norm": 0.07589095830917358, "learning_rate": 6e-06, "loss": 0.3481, "step": 6 }, { "epoch": 0.014492753623188406, "grad_norm": 0.09868590533733368, "learning_rate": 7e-06, "loss": 0.5721, "step": 7 }, { "epoch": 0.016563146997929608, "grad_norm": 0.0714084655046463, "learning_rate": 8.000000000000001e-06, "loss": 0.3615, "step": 8 }, { "epoch": 0.018633540372670808, "grad_norm": 0.074221171438694, "learning_rate": 9e-06, "loss": 0.4541, "step": 9 }, { "epoch": 0.020703933747412008, "grad_norm": 0.0852167010307312, "learning_rate": 1e-05, "loss": 0.5734, "step": 10 }, { "epoch": 0.022774327122153208, "grad_norm": 0.08485738188028336, "learning_rate": 9.999973002498209e-06, "loss": 0.4918, "step": 11 }, { "epoch": 0.024844720496894408, "grad_norm": 0.09552149474620819, "learning_rate": 9.999892010284378e-06, "loss": 0.5242, "step": 12 }, { "epoch": 0.026915113871635612, "grad_norm": 0.09766828268766403, "learning_rate": 9.999757024233143e-06, "loss": 0.5503, "step": 13 }, { "epoch": 0.028985507246376812, "grad_norm": 0.10151398926973343, "learning_rate": 9.999568045802216e-06, "loss": 0.6607, "step": 14 }, { "epoch": 0.031055900621118012, "grad_norm": 0.09601914882659912, "learning_rate": 9.999325077032382e-06, "loss": 0.6295, "step": 15 }, { "epoch": 0.033126293995859216, "grad_norm": 0.09811417013406754, "learning_rate": 9.999028120547456e-06, "loss": 0.4831, "step": 16 }, { "epoch": 0.035196687370600416, "grad_norm": 0.0988076776266098, "learning_rate": 9.99867717955427e-06, "loss": 0.513, "step": 17 }, { "epoch": 0.037267080745341616, "grad_norm": 0.08319983631372452, "learning_rate": 9.99827225784264e-06, "loss": 0.4423, "step": 18 }, { "epoch": 0.039337474120082816, "grad_norm": 0.08122409880161285, "learning_rate": 9.997813359785313e-06, "loss": 0.5584, "step": 19 }, { "epoch": 0.041407867494824016, "grad_norm": 0.09155779331922531, "learning_rate": 9.99730049033793e-06, "loss": 0.4512, "step": 20 }, { "epoch": 0.043478260869565216, "grad_norm": 0.09295297414064407, "learning_rate": 9.99673365503897e-06, "loss": 0.4382, "step": 21 }, { "epoch": 0.045548654244306416, "grad_norm": 0.12233804911375046, "learning_rate": 9.996112860009689e-06, "loss": 0.6128, "step": 22 }, { "epoch": 0.047619047619047616, "grad_norm": 0.1146979108452797, "learning_rate": 9.995438111954047e-06, "loss": 0.421, "step": 23 }, { "epoch": 0.049689440993788817, "grad_norm": 0.11026836186647415, "learning_rate": 9.994709418158652e-06, "loss": 0.6736, "step": 24 }, { "epoch": 0.051759834368530024, "grad_norm": 0.11445167660713196, "learning_rate": 9.993926786492672e-06, "loss": 0.5125, "step": 25 }, { "epoch": 0.053830227743271224, "grad_norm": 0.0917944386601448, "learning_rate": 9.993090225407743e-06, "loss": 0.436, "step": 26 }, { "epoch": 0.055900621118012424, "grad_norm": 0.09246832877397537, "learning_rate": 9.992199743937888e-06, "loss": 0.4278, "step": 27 }, { "epoch": 0.057971014492753624, "grad_norm": 0.07395437359809875, "learning_rate": 9.991255351699422e-06, "loss": 0.3163, "step": 28 }, { "epoch": 0.060041407867494824, "grad_norm": 0.0859774798154831, "learning_rate": 9.990257058890834e-06, "loss": 0.4385, "step": 29 }, { "epoch": 0.062111801242236024, "grad_norm": 0.11907877773046494, "learning_rate": 9.98920487629269e-06, "loss": 0.5022, "step": 30 }, { "epoch": 0.06418219461697723, "grad_norm": 0.08157500624656677, "learning_rate": 9.988098815267507e-06, "loss": 0.3605, "step": 31 }, { "epoch": 0.06625258799171843, "grad_norm": 0.10693008452653885, "learning_rate": 9.986938887759643e-06, "loss": 0.5329, "step": 32 }, { "epoch": 0.06832298136645963, "grad_norm": 0.11197963356971741, "learning_rate": 9.985725106295154e-06, "loss": 0.6241, "step": 33 }, { "epoch": 0.07039337474120083, "grad_norm": 0.07341015338897705, "learning_rate": 9.98445748398167e-06, "loss": 0.3944, "step": 34 }, { "epoch": 0.07246376811594203, "grad_norm": 0.08865534514188766, "learning_rate": 9.98313603450824e-06, "loss": 0.4709, "step": 35 }, { "epoch": 0.07453416149068323, "grad_norm": 0.09436415880918503, "learning_rate": 9.981760772145201e-06, "loss": 0.4746, "step": 36 }, { "epoch": 0.07660455486542443, "grad_norm": 0.07138030976057053, "learning_rate": 9.980331711744013e-06, "loss": 0.3825, "step": 37 }, { "epoch": 0.07867494824016563, "grad_norm": 0.11378210783004761, "learning_rate": 9.978848868737099e-06, "loss": 0.5328, "step": 38 }, { "epoch": 0.08074534161490683, "grad_norm": 0.10511545836925507, "learning_rate": 9.97731225913768e-06, "loss": 0.6454, "step": 39 }, { "epoch": 0.08281573498964803, "grad_norm": 0.08871125429868698, "learning_rate": 9.975721899539607e-06, "loss": 0.5153, "step": 40 }, { "epoch": 0.08488612836438923, "grad_norm": 0.07444795966148376, "learning_rate": 9.974077807117176e-06, "loss": 0.3639, "step": 41 }, { "epoch": 0.08695652173913043, "grad_norm": 0.06881678849458694, "learning_rate": 9.972379999624935e-06, "loss": 0.3338, "step": 42 }, { "epoch": 0.08902691511387163, "grad_norm": 0.07631891965866089, "learning_rate": 9.970628495397516e-06, "loss": 0.4224, "step": 43 }, { "epoch": 0.09109730848861283, "grad_norm": 0.0697372555732727, "learning_rate": 9.968823313349412e-06, "loss": 0.2783, "step": 44 }, { "epoch": 0.09316770186335403, "grad_norm": 0.07862786948680878, "learning_rate": 9.966964472974783e-06, "loss": 0.3833, "step": 45 }, { "epoch": 0.09523809523809523, "grad_norm": 0.09798941016197205, "learning_rate": 9.96505199434725e-06, "loss": 0.5329, "step": 46 }, { "epoch": 0.09730848861283643, "grad_norm": 0.07815828919410706, "learning_rate": 9.96308589811967e-06, "loss": 0.5146, "step": 47 }, { "epoch": 0.09937888198757763, "grad_norm": 0.10919066518545151, "learning_rate": 9.961066205523917e-06, "loss": 0.5496, "step": 48 }, { "epoch": 0.10144927536231885, "grad_norm": 0.07423596829175949, "learning_rate": 9.958992938370655e-06, "loss": 0.3172, "step": 49 }, { "epoch": 0.10351966873706005, "grad_norm": 0.0891297236084938, "learning_rate": 9.956866119049095e-06, "loss": 0.473, "step": 50 }, { "epoch": 0.10559006211180125, "grad_norm": 0.0813724547624588, "learning_rate": 9.954685770526762e-06, "loss": 0.3897, "step": 51 }, { "epoch": 0.10766045548654245, "grad_norm": 0.0775756984949112, "learning_rate": 9.952451916349242e-06, "loss": 0.3008, "step": 52 }, { "epoch": 0.10973084886128365, "grad_norm": 0.07236716151237488, "learning_rate": 9.950164580639925e-06, "loss": 0.3333, "step": 53 }, { "epoch": 0.11180124223602485, "grad_norm": 0.1051044836640358, "learning_rate": 9.947823788099754e-06, "loss": 0.4174, "step": 54 }, { "epoch": 0.11387163561076605, "grad_norm": 0.06772394478321075, "learning_rate": 9.945429564006945e-06, "loss": 0.4263, "step": 55 }, { "epoch": 0.11594202898550725, "grad_norm": 0.08235827088356018, "learning_rate": 9.942981934216731e-06, "loss": 0.4846, "step": 56 }, { "epoch": 0.11801242236024845, "grad_norm": 0.07849966734647751, "learning_rate": 9.940480925161065e-06, "loss": 0.4597, "step": 57 }, { "epoch": 0.12008281573498965, "grad_norm": 0.08702969551086426, "learning_rate": 9.937926563848345e-06, "loss": 0.468, "step": 58 }, { "epoch": 0.12215320910973085, "grad_norm": 0.08962167054414749, "learning_rate": 9.935318877863123e-06, "loss": 0.4397, "step": 59 }, { "epoch": 0.12422360248447205, "grad_norm": 0.06720270216464996, "learning_rate": 9.9326578953658e-06, "loss": 0.3646, "step": 60 }, { "epoch": 0.12629399585921325, "grad_norm": 0.07080550491809845, "learning_rate": 9.929943645092328e-06, "loss": 0.4352, "step": 61 }, { "epoch": 0.12836438923395446, "grad_norm": 0.10958462953567505, "learning_rate": 9.9271761563539e-06, "loss": 0.5284, "step": 62 }, { "epoch": 0.13043478260869565, "grad_norm": 0.06647852063179016, "learning_rate": 9.924355459036624e-06, "loss": 0.4193, "step": 63 }, { "epoch": 0.13250517598343686, "grad_norm": 0.08806627988815308, "learning_rate": 9.921481583601218e-06, "loss": 0.3681, "step": 64 }, { "epoch": 0.13457556935817805, "grad_norm": 0.08910389989614487, "learning_rate": 9.91855456108266e-06, "loss": 0.5611, "step": 65 }, { "epoch": 0.13664596273291926, "grad_norm": 0.08766467124223709, "learning_rate": 9.915574423089872e-06, "loss": 0.3665, "step": 66 }, { "epoch": 0.13871635610766045, "grad_norm": 0.05857433006167412, "learning_rate": 9.912541201805362e-06, "loss": 0.3057, "step": 67 }, { "epoch": 0.14078674948240166, "grad_norm": 0.07592141628265381, "learning_rate": 9.909454929984894e-06, "loss": 0.3898, "step": 68 }, { "epoch": 0.14285714285714285, "grad_norm": 0.06521393358707428, "learning_rate": 9.906315640957117e-06, "loss": 0.424, "step": 69 }, { "epoch": 0.14492753623188406, "grad_norm": 0.07010532915592194, "learning_rate": 9.903123368623216e-06, "loss": 0.2962, "step": 70 }, { "epoch": 0.14699792960662525, "grad_norm": 0.08857309818267822, "learning_rate": 9.89987814745654e-06, "loss": 0.5392, "step": 71 }, { "epoch": 0.14906832298136646, "grad_norm": 0.11536813527345657, "learning_rate": 9.896580012502238e-06, "loss": 0.4399, "step": 72 }, { "epoch": 0.15113871635610765, "grad_norm": 0.13220086693763733, "learning_rate": 9.89322899937687e-06, "loss": 0.36, "step": 73 }, { "epoch": 0.15320910973084886, "grad_norm": 0.10927895456552505, "learning_rate": 9.889825144268029e-06, "loss": 0.6121, "step": 74 }, { "epoch": 0.15527950310559005, "grad_norm": 0.07254831492900848, "learning_rate": 9.88636848393395e-06, "loss": 0.3819, "step": 75 }, { "epoch": 0.15734989648033126, "grad_norm": 0.06788697838783264, "learning_rate": 9.882859055703109e-06, "loss": 0.2731, "step": 76 }, { "epoch": 0.15942028985507245, "grad_norm": 0.06452120095491409, "learning_rate": 9.879296897473825e-06, "loss": 0.261, "step": 77 }, { "epoch": 0.16149068322981366, "grad_norm": 0.078448086977005, "learning_rate": 9.875682047713847e-06, "loss": 0.3601, "step": 78 }, { "epoch": 0.16356107660455488, "grad_norm": 0.07225798815488815, "learning_rate": 9.87201454545994e-06, "loss": 0.4058, "step": 79 }, { "epoch": 0.16563146997929606, "grad_norm": 0.07582813501358032, "learning_rate": 9.868294430317464e-06, "loss": 0.3832, "step": 80 }, { "epoch": 0.16770186335403728, "grad_norm": 0.08351872116327286, "learning_rate": 9.864521742459943e-06, "loss": 0.4399, "step": 81 }, { "epoch": 0.16977225672877846, "grad_norm": 0.0709993913769722, "learning_rate": 9.860696522628638e-06, "loss": 0.474, "step": 82 }, { "epoch": 0.17184265010351968, "grad_norm": 0.06954403221607208, "learning_rate": 9.856818812132101e-06, "loss": 0.3555, "step": 83 }, { "epoch": 0.17391304347826086, "grad_norm": 0.07412121444940567, "learning_rate": 9.852888652845729e-06, "loss": 0.3702, "step": 84 }, { "epoch": 0.17598343685300208, "grad_norm": 0.059929411858320236, "learning_rate": 9.848906087211317e-06, "loss": 0.335, "step": 85 }, { "epoch": 0.17805383022774326, "grad_norm": 0.08295787125825882, "learning_rate": 9.84487115823659e-06, "loss": 0.4468, "step": 86 }, { "epoch": 0.18012422360248448, "grad_norm": 0.10332628339529037, "learning_rate": 9.840783909494753e-06, "loss": 0.4893, "step": 87 }, { "epoch": 0.18219461697722567, "grad_norm": 0.06922546774148941, "learning_rate": 9.836644385124006e-06, "loss": 0.3423, "step": 88 }, { "epoch": 0.18426501035196688, "grad_norm": 0.0938820093870163, "learning_rate": 9.832452629827079e-06, "loss": 0.5032, "step": 89 }, { "epoch": 0.18633540372670807, "grad_norm": 0.08957293629646301, "learning_rate": 9.828208688870736e-06, "loss": 0.4738, "step": 90 }, { "epoch": 0.18840579710144928, "grad_norm": 0.08262480795383453, "learning_rate": 9.823912608085299e-06, "loss": 0.4135, "step": 91 }, { "epoch": 0.19047619047619047, "grad_norm": 0.10143536329269409, "learning_rate": 9.81956443386415e-06, "loss": 0.4859, "step": 92 }, { "epoch": 0.19254658385093168, "grad_norm": 0.08232709765434265, "learning_rate": 9.815164213163224e-06, "loss": 0.454, "step": 93 }, { "epoch": 0.19461697722567287, "grad_norm": 0.08575092256069183, "learning_rate": 9.810711993500506e-06, "loss": 0.377, "step": 94 }, { "epoch": 0.19668737060041408, "grad_norm": 0.08846532553434372, "learning_rate": 9.806207822955524e-06, "loss": 0.4189, "step": 95 }, { "epoch": 0.19875776397515527, "grad_norm": 0.07108867168426514, "learning_rate": 9.801651750168815e-06, "loss": 0.421, "step": 96 }, { "epoch": 0.20082815734989648, "grad_norm": 0.09010709822177887, "learning_rate": 9.797043824341413e-06, "loss": 0.4374, "step": 97 }, { "epoch": 0.2028985507246377, "grad_norm": 0.11712660640478134, "learning_rate": 9.792384095234312e-06, "loss": 0.4972, "step": 98 }, { "epoch": 0.20496894409937888, "grad_norm": 0.10217002779245377, "learning_rate": 9.787672613167931e-06, "loss": 0.5974, "step": 99 }, { "epoch": 0.2070393374741201, "grad_norm": 0.0857044979929924, "learning_rate": 9.782909429021568e-06, "loss": 0.507, "step": 100 }, { "epoch": 0.20910973084886128, "grad_norm": 0.08360479027032852, "learning_rate": 9.778094594232853e-06, "loss": 0.452, "step": 101 }, { "epoch": 0.2111801242236025, "grad_norm": 0.08019517362117767, "learning_rate": 9.773228160797187e-06, "loss": 0.4235, "step": 102 }, { "epoch": 0.21325051759834368, "grad_norm": 0.08056485652923584, "learning_rate": 9.76831018126719e-06, "loss": 0.453, "step": 103 }, { "epoch": 0.2153209109730849, "grad_norm": 0.06662017107009888, "learning_rate": 9.76334070875213e-06, "loss": 0.3214, "step": 104 }, { "epoch": 0.21739130434782608, "grad_norm": 0.1045277863740921, "learning_rate": 9.75831979691734e-06, "loss": 0.3387, "step": 105 }, { "epoch": 0.2194616977225673, "grad_norm": 0.0656440332531929, "learning_rate": 9.753247499983649e-06, "loss": 0.3272, "step": 106 }, { "epoch": 0.22153209109730848, "grad_norm": 0.09750998765230179, "learning_rate": 9.748123872726801e-06, "loss": 0.4563, "step": 107 }, { "epoch": 0.2236024844720497, "grad_norm": 0.09482847899198532, "learning_rate": 9.742948970476845e-06, "loss": 0.4397, "step": 108 }, { "epoch": 0.22567287784679088, "grad_norm": 0.0843338593840599, "learning_rate": 9.737722849117556e-06, "loss": 0.3945, "step": 109 }, { "epoch": 0.2277432712215321, "grad_norm": 0.07675352692604065, "learning_rate": 9.732445565085823e-06, "loss": 0.3514, "step": 110 }, { "epoch": 0.22981366459627328, "grad_norm": 0.09576563537120819, "learning_rate": 9.72711717537104e-06, "loss": 0.3179, "step": 111 }, { "epoch": 0.2318840579710145, "grad_norm": 0.09575846046209335, "learning_rate": 9.721737737514492e-06, "loss": 0.3947, "step": 112 }, { "epoch": 0.23395445134575568, "grad_norm": 0.06680424511432648, "learning_rate": 9.71630730960873e-06, "loss": 0.2413, "step": 113 }, { "epoch": 0.2360248447204969, "grad_norm": 0.09097225219011307, "learning_rate": 9.71082595029695e-06, "loss": 0.3346, "step": 114 }, { "epoch": 0.23809523809523808, "grad_norm": 0.08269556611776352, "learning_rate": 9.705293718772355e-06, "loss": 0.3557, "step": 115 }, { "epoch": 0.2401656314699793, "grad_norm": 0.06669075042009354, "learning_rate": 9.699710674777519e-06, "loss": 0.2507, "step": 116 }, { "epoch": 0.2422360248447205, "grad_norm": 0.0904952734708786, "learning_rate": 9.694076878603736e-06, "loss": 0.4732, "step": 117 }, { "epoch": 0.2443064182194617, "grad_norm": 0.08195852488279343, "learning_rate": 9.688392391090374e-06, "loss": 0.3463, "step": 118 }, { "epoch": 0.2463768115942029, "grad_norm": 0.0797749012708664, "learning_rate": 9.682657273624221e-06, "loss": 0.417, "step": 119 }, { "epoch": 0.2484472049689441, "grad_norm": 0.07914768904447556, "learning_rate": 9.676871588138812e-06, "loss": 0.3735, "step": 120 }, { "epoch": 0.2505175983436853, "grad_norm": 0.08743813633918762, "learning_rate": 9.671035397113769e-06, "loss": 0.4272, "step": 121 }, { "epoch": 0.2505175983436853, "eval_loss": 0.44131994247436523, "eval_runtime": 93.4201, "eval_samples_per_second": 5.909, "eval_steps_per_second": 0.739, "step": 121 }, { "epoch": 0.2525879917184265, "grad_norm": 0.07742875814437866, "learning_rate": 9.665148763574123e-06, "loss": 0.3742, "step": 122 }, { "epoch": 0.2546583850931677, "grad_norm": 0.07875270396471024, "learning_rate": 9.659211751089636e-06, "loss": 0.2702, "step": 123 }, { "epoch": 0.2567287784679089, "grad_norm": 0.08928168565034866, "learning_rate": 9.653224423774107e-06, "loss": 0.399, "step": 124 }, { "epoch": 0.2587991718426501, "grad_norm": 0.10149750858545303, "learning_rate": 9.647186846284692e-06, "loss": 0.5301, "step": 125 }, { "epoch": 0.2608695652173913, "grad_norm": 0.12193959951400757, "learning_rate": 9.64109908382119e-06, "loss": 0.6185, "step": 126 }, { "epoch": 0.2629399585921325, "grad_norm": 0.0960773155093193, "learning_rate": 9.634961202125355e-06, "loss": 0.4239, "step": 127 }, { "epoch": 0.2650103519668737, "grad_norm": 0.10699175298213959, "learning_rate": 9.628773267480177e-06, "loss": 0.3386, "step": 128 }, { "epoch": 0.2670807453416149, "grad_norm": 0.08573795109987259, "learning_rate": 9.622535346709161e-06, "loss": 0.3367, "step": 129 }, { "epoch": 0.2691511387163561, "grad_norm": 0.09970315545797348, "learning_rate": 9.616247507175624e-06, "loss": 0.4698, "step": 130 }, { "epoch": 0.2712215320910973, "grad_norm": 0.07994365692138672, "learning_rate": 9.609909816781945e-06, "loss": 0.3454, "step": 131 }, { "epoch": 0.2732919254658385, "grad_norm": 0.07029321044683456, "learning_rate": 9.603522343968852e-06, "loss": 0.3227, "step": 132 }, { "epoch": 0.2753623188405797, "grad_norm": 0.10029646009206772, "learning_rate": 9.597085157714666e-06, "loss": 0.4583, "step": 133 }, { "epoch": 0.2774327122153209, "grad_norm": 0.10204445570707321, "learning_rate": 9.590598327534563e-06, "loss": 0.5137, "step": 134 }, { "epoch": 0.2795031055900621, "grad_norm": 0.09162542223930359, "learning_rate": 9.584061923479833e-06, "loss": 0.413, "step": 135 }, { "epoch": 0.2815734989648033, "grad_norm": 0.08620335161685944, "learning_rate": 9.577476016137105e-06, "loss": 0.4518, "step": 136 }, { "epoch": 0.2836438923395445, "grad_norm": 0.07258706539869308, "learning_rate": 9.570840676627593e-06, "loss": 0.3211, "step": 137 }, { "epoch": 0.2857142857142857, "grad_norm": 0.10509861260652542, "learning_rate": 9.56415597660634e-06, "loss": 0.4713, "step": 138 }, { "epoch": 0.28778467908902694, "grad_norm": 0.09548232704401016, "learning_rate": 9.557421988261422e-06, "loss": 0.4447, "step": 139 }, { "epoch": 0.2898550724637681, "grad_norm": 0.08879134804010391, "learning_rate": 9.550638784313187e-06, "loss": 0.4448, "step": 140 }, { "epoch": 0.2919254658385093, "grad_norm": 0.07666925340890884, "learning_rate": 9.543806438013454e-06, "loss": 0.3178, "step": 141 }, { "epoch": 0.2939958592132505, "grad_norm": 0.15115399658679962, "learning_rate": 9.536925023144742e-06, "loss": 0.4872, "step": 142 }, { "epoch": 0.29606625258799174, "grad_norm": 0.08174444735050201, "learning_rate": 9.52999461401945e-06, "loss": 0.2779, "step": 143 }, { "epoch": 0.2981366459627329, "grad_norm": 0.0791732594370842, "learning_rate": 9.523015285479076e-06, "loss": 0.3226, "step": 144 }, { "epoch": 0.3002070393374741, "grad_norm": 0.10522507131099701, "learning_rate": 9.51598711289339e-06, "loss": 0.4394, "step": 145 }, { "epoch": 0.3022774327122153, "grad_norm": 0.08529925346374512, "learning_rate": 9.508910172159635e-06, "loss": 0.3471, "step": 146 }, { "epoch": 0.30434782608695654, "grad_norm": 0.11986136436462402, "learning_rate": 9.501784539701697e-06, "loss": 0.359, "step": 147 }, { "epoch": 0.3064182194616977, "grad_norm": 0.08316604793071747, "learning_rate": 9.494610292469287e-06, "loss": 0.3872, "step": 148 }, { "epoch": 0.3084886128364389, "grad_norm": 0.07519234716892242, "learning_rate": 9.487387507937106e-06, "loss": 0.3103, "step": 149 }, { "epoch": 0.3105590062111801, "grad_norm": 0.11129677295684814, "learning_rate": 9.48011626410401e-06, "loss": 0.5156, "step": 150 }, { "epoch": 0.31262939958592134, "grad_norm": 0.11108915507793427, "learning_rate": 9.472796639492166e-06, "loss": 0.365, "step": 151 }, { "epoch": 0.3146997929606625, "grad_norm": 0.0829719603061676, "learning_rate": 9.465428713146206e-06, "loss": 0.4079, "step": 152 }, { "epoch": 0.3167701863354037, "grad_norm": 0.10608082264661789, "learning_rate": 9.45801256463237e-06, "loss": 0.4702, "step": 153 }, { "epoch": 0.3188405797101449, "grad_norm": 0.08056455105543137, "learning_rate": 9.450548274037652e-06, "loss": 0.2567, "step": 154 }, { "epoch": 0.32091097308488614, "grad_norm": 0.12216547131538391, "learning_rate": 9.443035921968932e-06, "loss": 0.4258, "step": 155 }, { "epoch": 0.32298136645962733, "grad_norm": 0.10286413133144379, "learning_rate": 9.435475589552107e-06, "loss": 0.3515, "step": 156 }, { "epoch": 0.3250517598343685, "grad_norm": 0.10109732300043106, "learning_rate": 9.427867358431209e-06, "loss": 0.4022, "step": 157 }, { "epoch": 0.32712215320910976, "grad_norm": 0.09029930084943771, "learning_rate": 9.420211310767534e-06, "loss": 0.2505, "step": 158 }, { "epoch": 0.32919254658385094, "grad_norm": 0.13366563618183136, "learning_rate": 9.412507529238741e-06, "loss": 0.4932, "step": 159 }, { "epoch": 0.33126293995859213, "grad_norm": 0.08471870422363281, "learning_rate": 9.40475609703798e-06, "loss": 0.3423, "step": 160 }, { "epoch": 0.3333333333333333, "grad_norm": 0.09047792106866837, "learning_rate": 9.396957097872967e-06, "loss": 0.2468, "step": 161 }, { "epoch": 0.33540372670807456, "grad_norm": 0.09556887298822403, "learning_rate": 9.389110615965102e-06, "loss": 0.3965, "step": 162 }, { "epoch": 0.33747412008281574, "grad_norm": 0.10276677459478378, "learning_rate": 9.38121673604855e-06, "loss": 0.4059, "step": 163 }, { "epoch": 0.33954451345755693, "grad_norm": 0.11112170666456223, "learning_rate": 9.37327554336932e-06, "loss": 0.3625, "step": 164 }, { "epoch": 0.3416149068322981, "grad_norm": 0.12287746369838715, "learning_rate": 9.365287123684365e-06, "loss": 0.3374, "step": 165 }, { "epoch": 0.34368530020703936, "grad_norm": 0.12305274605751038, "learning_rate": 9.35725156326063e-06, "loss": 0.3823, "step": 166 }, { "epoch": 0.34575569358178054, "grad_norm": 0.10408051311969757, "learning_rate": 9.34916894887414e-06, "loss": 0.4478, "step": 167 }, { "epoch": 0.34782608695652173, "grad_norm": 0.09180740267038345, "learning_rate": 9.341039367809056e-06, "loss": 0.3882, "step": 168 }, { "epoch": 0.3498964803312629, "grad_norm": 0.09142342954874039, "learning_rate": 9.332862907856722e-06, "loss": 0.312, "step": 169 }, { "epoch": 0.35196687370600416, "grad_norm": 0.14173369109630585, "learning_rate": 9.324639657314742e-06, "loss": 0.4436, "step": 170 }, { "epoch": 0.35403726708074534, "grad_norm": 0.12495898455381393, "learning_rate": 9.316369704986001e-06, "loss": 0.532, "step": 171 }, { "epoch": 0.35610766045548653, "grad_norm": 0.08666636794805527, "learning_rate": 9.308053140177722e-06, "loss": 0.3174, "step": 172 }, { "epoch": 0.3581780538302277, "grad_norm": 0.1289636641740799, "learning_rate": 9.299690052700492e-06, "loss": 0.3586, "step": 173 }, { "epoch": 0.36024844720496896, "grad_norm": 0.11257122457027435, "learning_rate": 9.291280532867301e-06, "loss": 0.48, "step": 174 }, { "epoch": 0.36231884057971014, "grad_norm": 0.12760891020298004, "learning_rate": 9.28282467149256e-06, "loss": 0.5194, "step": 175 }, { "epoch": 0.36438923395445133, "grad_norm": 0.1176360547542572, "learning_rate": 9.27432255989112e-06, "loss": 0.5659, "step": 176 }, { "epoch": 0.36645962732919257, "grad_norm": 0.10507313907146454, "learning_rate": 9.265774289877291e-06, "loss": 0.3186, "step": 177 }, { "epoch": 0.36853002070393376, "grad_norm": 0.08758605271577835, "learning_rate": 9.257179953763846e-06, "loss": 0.2769, "step": 178 }, { "epoch": 0.37060041407867494, "grad_norm": 0.10309769958257675, "learning_rate": 9.248539644361029e-06, "loss": 0.3603, "step": 179 }, { "epoch": 0.37267080745341613, "grad_norm": 0.08716636151075363, "learning_rate": 9.239853454975548e-06, "loss": 0.3524, "step": 180 }, { "epoch": 0.3747412008281574, "grad_norm": 0.09289219230413437, "learning_rate": 9.231121479409567e-06, "loss": 0.2974, "step": 181 }, { "epoch": 0.37681159420289856, "grad_norm": 0.12185970693826675, "learning_rate": 9.222343811959694e-06, "loss": 0.5011, "step": 182 }, { "epoch": 0.37888198757763975, "grad_norm": 0.16153395175933838, "learning_rate": 9.213520547415968e-06, "loss": 0.3844, "step": 183 }, { "epoch": 0.38095238095238093, "grad_norm": 0.09068119525909424, "learning_rate": 9.204651781060832e-06, "loss": 0.2212, "step": 184 }, { "epoch": 0.3830227743271222, "grad_norm": 0.10143032670021057, "learning_rate": 9.195737608668096e-06, "loss": 0.3724, "step": 185 }, { "epoch": 0.38509316770186336, "grad_norm": 0.14434686303138733, "learning_rate": 9.186778126501916e-06, "loss": 0.5017, "step": 186 }, { "epoch": 0.38716356107660455, "grad_norm": 0.0935361459851265, "learning_rate": 9.177773431315748e-06, "loss": 0.2123, "step": 187 }, { "epoch": 0.38923395445134573, "grad_norm": 0.09748780727386475, "learning_rate": 9.168723620351298e-06, "loss": 0.373, "step": 188 }, { "epoch": 0.391304347826087, "grad_norm": 0.10403398424386978, "learning_rate": 9.159628791337483e-06, "loss": 0.2962, "step": 189 }, { "epoch": 0.39337474120082816, "grad_norm": 0.11641765385866165, "learning_rate": 9.150489042489368e-06, "loss": 0.4867, "step": 190 }, { "epoch": 0.39544513457556935, "grad_norm": 0.09807530045509338, "learning_rate": 9.141304472507109e-06, "loss": 0.3982, "step": 191 }, { "epoch": 0.39751552795031053, "grad_norm": 0.1166764423251152, "learning_rate": 9.13207518057488e-06, "loss": 0.384, "step": 192 }, { "epoch": 0.3995859213250518, "grad_norm": 0.11490147560834885, "learning_rate": 9.122801266359815e-06, "loss": 0.4438, "step": 193 }, { "epoch": 0.40165631469979296, "grad_norm": 0.1409396082162857, "learning_rate": 9.113482830010918e-06, "loss": 0.5902, "step": 194 }, { "epoch": 0.40372670807453415, "grad_norm": 0.1281116008758545, "learning_rate": 9.10411997215799e-06, "loss": 0.329, "step": 195 }, { "epoch": 0.4057971014492754, "grad_norm": 0.1288306564092636, "learning_rate": 9.094712793910541e-06, "loss": 0.4421, "step": 196 }, { "epoch": 0.4078674948240166, "grad_norm": 0.1229570135474205, "learning_rate": 9.085261396856694e-06, "loss": 0.5416, "step": 197 }, { "epoch": 0.40993788819875776, "grad_norm": 0.09709765762090683, "learning_rate": 9.075765883062093e-06, "loss": 0.359, "step": 198 }, { "epoch": 0.41200828157349895, "grad_norm": 0.12015578895807266, "learning_rate": 9.0662263550688e-06, "loss": 0.3323, "step": 199 }, { "epoch": 0.4140786749482402, "grad_norm": 0.09695754200220108, "learning_rate": 9.056642915894182e-06, "loss": 0.3129, "step": 200 }, { "epoch": 0.4161490683229814, "grad_norm": 0.09204108268022537, "learning_rate": 9.047015669029808e-06, "loss": 0.2289, "step": 201 }, { "epoch": 0.41821946169772256, "grad_norm": 0.11979050934314728, "learning_rate": 9.037344718440321e-06, "loss": 0.3477, "step": 202 }, { "epoch": 0.42028985507246375, "grad_norm": 0.14150062203407288, "learning_rate": 9.027630168562326e-06, "loss": 0.5484, "step": 203 }, { "epoch": 0.422360248447205, "grad_norm": 0.12129364162683487, "learning_rate": 9.017872124303255e-06, "loss": 0.4362, "step": 204 }, { "epoch": 0.4244306418219462, "grad_norm": 0.09622828662395477, "learning_rate": 9.00807069104023e-06, "loss": 0.2868, "step": 205 }, { "epoch": 0.42650103519668736, "grad_norm": 0.11513911187648773, "learning_rate": 8.99822597461894e-06, "loss": 0.4002, "step": 206 }, { "epoch": 0.42857142857142855, "grad_norm": 0.11842213571071625, "learning_rate": 8.988338081352484e-06, "loss": 0.3668, "step": 207 }, { "epoch": 0.4306418219461698, "grad_norm": 0.10251864790916443, "learning_rate": 8.978407118020226e-06, "loss": 0.3032, "step": 208 }, { "epoch": 0.432712215320911, "grad_norm": 0.12288280576467514, "learning_rate": 8.96843319186665e-06, "loss": 0.4732, "step": 209 }, { "epoch": 0.43478260869565216, "grad_norm": 0.1105843186378479, "learning_rate": 8.958416410600188e-06, "loss": 0.3651, "step": 210 }, { "epoch": 0.43685300207039335, "grad_norm": 0.09893371909856796, "learning_rate": 8.948356882392072e-06, "loss": 0.4154, "step": 211 }, { "epoch": 0.4389233954451346, "grad_norm": 0.12337377667427063, "learning_rate": 8.938254715875152e-06, "loss": 0.4027, "step": 212 }, { "epoch": 0.4409937888198758, "grad_norm": 0.12832386791706085, "learning_rate": 8.928110020142729e-06, "loss": 0.4879, "step": 213 }, { "epoch": 0.44306418219461696, "grad_norm": 0.13107864558696747, "learning_rate": 8.917922904747385e-06, "loss": 0.3745, "step": 214 }, { "epoch": 0.4451345755693582, "grad_norm": 0.11851947754621506, "learning_rate": 8.907693479699783e-06, "loss": 0.2237, "step": 215 }, { "epoch": 0.4472049689440994, "grad_norm": 0.10010041296482086, "learning_rate": 8.897421855467491e-06, "loss": 0.3745, "step": 216 }, { "epoch": 0.4492753623188406, "grad_norm": 0.1251843422651291, "learning_rate": 8.887108142973788e-06, "loss": 0.4469, "step": 217 }, { "epoch": 0.45134575569358176, "grad_norm": 0.08511991798877716, "learning_rate": 8.876752453596462e-06, "loss": 0.2385, "step": 218 }, { "epoch": 0.453416149068323, "grad_norm": 0.14126259088516235, "learning_rate": 8.86635489916661e-06, "loss": 0.5021, "step": 219 }, { "epoch": 0.4554865424430642, "grad_norm": 0.12813101708889008, "learning_rate": 8.85591559196743e-06, "loss": 0.2839, "step": 220 }, { "epoch": 0.4575569358178054, "grad_norm": 0.09878244251012802, "learning_rate": 8.845434644733009e-06, "loss": 0.2601, "step": 221 }, { "epoch": 0.45962732919254656, "grad_norm": 0.12693674862384796, "learning_rate": 8.834912170647102e-06, "loss": 0.3788, "step": 222 }, { "epoch": 0.4616977225672878, "grad_norm": 0.14547109603881836, "learning_rate": 8.824348283341912e-06, "loss": 0.5055, "step": 223 }, { "epoch": 0.463768115942029, "grad_norm": 0.1215929314494133, "learning_rate": 8.813743096896872e-06, "loss": 0.3685, "step": 224 }, { "epoch": 0.4658385093167702, "grad_norm": 0.14049437642097473, "learning_rate": 8.803096725837392e-06, "loss": 0.4663, "step": 225 }, { "epoch": 0.46790890269151136, "grad_norm": 0.12880191206932068, "learning_rate": 8.792409285133644e-06, "loss": 0.3377, "step": 226 }, { "epoch": 0.4699792960662526, "grad_norm": 0.1347607672214508, "learning_rate": 8.781680890199306e-06, "loss": 0.4076, "step": 227 }, { "epoch": 0.4720496894409938, "grad_norm": 0.12177550792694092, "learning_rate": 8.770911656890325e-06, "loss": 0.3779, "step": 228 }, { "epoch": 0.474120082815735, "grad_norm": 0.15446029603481293, "learning_rate": 8.760101701503656e-06, "loss": 0.3923, "step": 229 }, { "epoch": 0.47619047619047616, "grad_norm": 0.11769753694534302, "learning_rate": 8.749251140776016e-06, "loss": 0.3247, "step": 230 }, { "epoch": 0.4782608695652174, "grad_norm": 0.14402779936790466, "learning_rate": 8.73836009188262e-06, "loss": 0.3048, "step": 231 }, { "epoch": 0.4803312629399586, "grad_norm": 0.14113663136959076, "learning_rate": 8.727428672435911e-06, "loss": 0.2981, "step": 232 }, { "epoch": 0.4824016563146998, "grad_norm": 0.15857934951782227, "learning_rate": 8.716457000484296e-06, "loss": 0.6381, "step": 233 }, { "epoch": 0.484472049689441, "grad_norm": 0.12054027616977692, "learning_rate": 8.705445194510868e-06, "loss": 0.3409, "step": 234 }, { "epoch": 0.4865424430641822, "grad_norm": 0.104091577231884, "learning_rate": 8.694393373432129e-06, "loss": 0.3034, "step": 235 }, { "epoch": 0.4886128364389234, "grad_norm": 0.10336735844612122, "learning_rate": 8.6833016565967e-06, "loss": 0.1919, "step": 236 }, { "epoch": 0.4906832298136646, "grad_norm": 0.13413453102111816, "learning_rate": 8.672170163784042e-06, "loss": 0.4726, "step": 237 }, { "epoch": 0.4927536231884058, "grad_norm": 0.15956872701644897, "learning_rate": 8.660999015203152e-06, "loss": 0.4549, "step": 238 }, { "epoch": 0.494824016563147, "grad_norm": 0.10782767832279205, "learning_rate": 8.64978833149127e-06, "loss": 0.2556, "step": 239 }, { "epoch": 0.4968944099378882, "grad_norm": 0.11683110892772675, "learning_rate": 8.638538233712581e-06, "loss": 0.3507, "step": 240 }, { "epoch": 0.4989648033126294, "grad_norm": 0.1023135557770729, "learning_rate": 8.6272488433569e-06, "loss": 0.2694, "step": 241 }, { "epoch": 0.5010351966873706, "grad_norm": 0.11874102801084518, "learning_rate": 8.615920282338355e-06, "loss": 0.3197, "step": 242 }, { "epoch": 0.5010351966873706, "eval_loss": 0.4105847179889679, "eval_runtime": 93.4074, "eval_samples_per_second": 5.91, "eval_steps_per_second": 0.739, "step": 242 }, { "epoch": 0.5031055900621118, "grad_norm": 0.1680690497159958, "learning_rate": 8.60455267299409e-06, "loss": 0.4984, "step": 243 }, { "epoch": 0.505175983436853, "grad_norm": 0.1034797728061676, "learning_rate": 8.593146138082925e-06, "loss": 0.258, "step": 244 }, { "epoch": 0.5072463768115942, "grad_norm": 0.09805410355329514, "learning_rate": 8.581700800784038e-06, "loss": 0.2416, "step": 245 }, { "epoch": 0.5093167701863354, "grad_norm": 0.11391060799360275, "learning_rate": 8.570216784695637e-06, "loss": 0.3188, "step": 246 }, { "epoch": 0.5113871635610766, "grad_norm": 0.12456899881362915, "learning_rate": 8.558694213833618e-06, "loss": 0.3211, "step": 247 }, { "epoch": 0.5134575569358178, "grad_norm": 0.10828563570976257, "learning_rate": 8.54713321263023e-06, "loss": 0.221, "step": 248 }, { "epoch": 0.515527950310559, "grad_norm": 0.09911594539880753, "learning_rate": 8.535533905932739e-06, "loss": 0.2517, "step": 249 }, { "epoch": 0.5175983436853002, "grad_norm": 0.14771407842636108, "learning_rate": 8.52389641900206e-06, "loss": 0.3483, "step": 250 }, { "epoch": 0.5196687370600414, "grad_norm": 0.13070322573184967, "learning_rate": 8.512220877511428e-06, "loss": 0.3345, "step": 251 }, { "epoch": 0.5217391304347826, "grad_norm": 0.18328367173671722, "learning_rate": 8.50050740754502e-06, "loss": 0.4107, "step": 252 }, { "epoch": 0.5238095238095238, "grad_norm": 0.11168783158063889, "learning_rate": 8.488756135596609e-06, "loss": 0.2455, "step": 253 }, { "epoch": 0.525879917184265, "grad_norm": 0.16458994150161743, "learning_rate": 8.476967188568187e-06, "loss": 0.4157, "step": 254 }, { "epoch": 0.5279503105590062, "grad_norm": 0.14378412067890167, "learning_rate": 8.465140693768606e-06, "loss": 0.4946, "step": 255 }, { "epoch": 0.5300207039337475, "grad_norm": 0.14328721165657043, "learning_rate": 8.453276778912186e-06, "loss": 0.3925, "step": 256 }, { "epoch": 0.5320910973084886, "grad_norm": 0.12056344002485275, "learning_rate": 8.441375572117356e-06, "loss": 0.2878, "step": 257 }, { "epoch": 0.5341614906832298, "grad_norm": 0.115929014980793, "learning_rate": 8.429437201905254e-06, "loss": 0.2342, "step": 258 }, { "epoch": 0.5362318840579711, "grad_norm": 0.17692901194095612, "learning_rate": 8.41746179719835e-06, "loss": 0.3761, "step": 259 }, { "epoch": 0.5383022774327122, "grad_norm": 0.14847911894321442, "learning_rate": 8.405449487319049e-06, "loss": 0.4452, "step": 260 }, { "epoch": 0.5403726708074534, "grad_norm": 0.18752224743366241, "learning_rate": 8.393400401988293e-06, "loss": 0.313, "step": 261 }, { "epoch": 0.5424430641821946, "grad_norm": 0.18232041597366333, "learning_rate": 8.38131467132416e-06, "loss": 0.3092, "step": 262 }, { "epoch": 0.5445134575569358, "grad_norm": 0.12929639220237732, "learning_rate": 8.369192425840469e-06, "loss": 0.344, "step": 263 }, { "epoch": 0.546583850931677, "grad_norm": 0.1309351623058319, "learning_rate": 8.357033796445356e-06, "loss": 0.2635, "step": 264 }, { "epoch": 0.5486542443064182, "grad_norm": 0.14819861948490143, "learning_rate": 8.344838914439869e-06, "loss": 0.4324, "step": 265 }, { "epoch": 0.5507246376811594, "grad_norm": 0.10555935651063919, "learning_rate": 8.332607911516545e-06, "loss": 0.3072, "step": 266 }, { "epoch": 0.5527950310559007, "grad_norm": 0.12563839554786682, "learning_rate": 8.320340919757997e-06, "loss": 0.3399, "step": 267 }, { "epoch": 0.5548654244306418, "grad_norm": 0.14571739733219147, "learning_rate": 8.308038071635475e-06, "loss": 0.3716, "step": 268 }, { "epoch": 0.556935817805383, "grad_norm": 0.14644472301006317, "learning_rate": 8.295699500007447e-06, "loss": 0.3165, "step": 269 }, { "epoch": 0.5590062111801242, "grad_norm": 0.33376821875572205, "learning_rate": 8.283325338118154e-06, "loss": 0.4557, "step": 270 }, { "epoch": 0.5610766045548654, "grad_norm": 0.1606837511062622, "learning_rate": 8.27091571959618e-06, "loss": 0.4879, "step": 271 }, { "epoch": 0.5631469979296067, "grad_norm": 0.13942554593086243, "learning_rate": 8.258470778453005e-06, "loss": 0.4332, "step": 272 }, { "epoch": 0.5652173913043478, "grad_norm": 0.14843975007534027, "learning_rate": 8.245990649081559e-06, "loss": 0.317, "step": 273 }, { "epoch": 0.567287784679089, "grad_norm": 0.1546916365623474, "learning_rate": 8.233475466254766e-06, "loss": 0.4053, "step": 274 }, { "epoch": 0.5693581780538303, "grad_norm": 0.15177839994430542, "learning_rate": 8.220925365124092e-06, "loss": 0.3222, "step": 275 }, { "epoch": 0.5714285714285714, "grad_norm": 0.15528550744056702, "learning_rate": 8.208340481218094e-06, "loss": 0.4215, "step": 276 }, { "epoch": 0.5734989648033126, "grad_norm": 0.11311841756105423, "learning_rate": 8.195720950440937e-06, "loss": 0.295, "step": 277 }, { "epoch": 0.5755693581780539, "grad_norm": 0.22601701319217682, "learning_rate": 8.183066909070946e-06, "loss": 0.4662, "step": 278 }, { "epoch": 0.577639751552795, "grad_norm": 0.10438232123851776, "learning_rate": 8.170378493759122e-06, "loss": 0.2396, "step": 279 }, { "epoch": 0.5797101449275363, "grad_norm": 0.09224749356508255, "learning_rate": 8.15765584152767e-06, "loss": 0.1875, "step": 280 }, { "epoch": 0.5817805383022774, "grad_norm": 0.13479924201965332, "learning_rate": 8.144899089768522e-06, "loss": 0.2768, "step": 281 }, { "epoch": 0.5838509316770186, "grad_norm": 0.17148271203041077, "learning_rate": 8.132108376241849e-06, "loss": 0.38, "step": 282 }, { "epoch": 0.5859213250517599, "grad_norm": 0.18072481453418732, "learning_rate": 8.119283839074573e-06, "loss": 0.4015, "step": 283 }, { "epoch": 0.587991718426501, "grad_norm": 0.14164257049560547, "learning_rate": 8.106425616758886e-06, "loss": 0.4656, "step": 284 }, { "epoch": 0.5900621118012422, "grad_norm": 0.19027261435985565, "learning_rate": 8.093533848150736e-06, "loss": 0.3513, "step": 285 }, { "epoch": 0.5921325051759835, "grad_norm": 0.12463188916444778, "learning_rate": 8.08060867246834e-06, "loss": 0.2498, "step": 286 }, { "epoch": 0.5942028985507246, "grad_norm": 0.14109396934509277, "learning_rate": 8.067650229290683e-06, "loss": 0.3645, "step": 287 }, { "epoch": 0.5962732919254659, "grad_norm": 0.1650514006614685, "learning_rate": 8.054658658555998e-06, "loss": 0.4303, "step": 288 }, { "epoch": 0.598343685300207, "grad_norm": 0.18131284415721893, "learning_rate": 8.04163410056027e-06, "loss": 0.2617, "step": 289 }, { "epoch": 0.6004140786749482, "grad_norm": 0.15095612406730652, "learning_rate": 8.028576695955711e-06, "loss": 0.4539, "step": 290 }, { "epoch": 0.6024844720496895, "grad_norm": 0.15879206359386444, "learning_rate": 8.01548658574924e-06, "loss": 0.3165, "step": 291 }, { "epoch": 0.6045548654244306, "grad_norm": 0.12368228286504745, "learning_rate": 8.002363911300966e-06, "loss": 0.2427, "step": 292 }, { "epoch": 0.6066252587991718, "grad_norm": 0.13043078780174255, "learning_rate": 7.989208814322662e-06, "loss": 0.2737, "step": 293 }, { "epoch": 0.6086956521739131, "grad_norm": 0.13512444496154785, "learning_rate": 7.976021436876232e-06, "loss": 0.3112, "step": 294 }, { "epoch": 0.6107660455486542, "grad_norm": 0.1622573733329773, "learning_rate": 7.962801921372168e-06, "loss": 0.4542, "step": 295 }, { "epoch": 0.6128364389233955, "grad_norm": 0.14550381898880005, "learning_rate": 7.949550410568033e-06, "loss": 0.2678, "step": 296 }, { "epoch": 0.6149068322981367, "grad_norm": 0.1805560141801834, "learning_rate": 7.936267047566897e-06, "loss": 0.5813, "step": 297 }, { "epoch": 0.6169772256728778, "grad_norm": 0.14014331996440887, "learning_rate": 7.92295197581581e-06, "loss": 0.3421, "step": 298 }, { "epoch": 0.6190476190476191, "grad_norm": 0.13757413625717163, "learning_rate": 7.909605339104243e-06, "loss": 0.342, "step": 299 }, { "epoch": 0.6211180124223602, "grad_norm": 0.09992540627717972, "learning_rate": 7.89622728156253e-06, "loss": 0.1582, "step": 300 }, { "epoch": 0.6231884057971014, "grad_norm": 0.15090250968933105, "learning_rate": 7.882817947660328e-06, "loss": 0.4279, "step": 301 }, { "epoch": 0.6252587991718427, "grad_norm": 0.22069942951202393, "learning_rate": 7.869377482205042e-06, "loss": 0.403, "step": 302 }, { "epoch": 0.6273291925465838, "grad_norm": 0.21121914684772491, "learning_rate": 7.855906030340268e-06, "loss": 0.3499, "step": 303 }, { "epoch": 0.629399585921325, "grad_norm": 0.16488689184188843, "learning_rate": 7.842403737544226e-06, "loss": 0.3166, "step": 304 }, { "epoch": 0.6314699792960663, "grad_norm": 0.14506235718727112, "learning_rate": 7.828870749628184e-06, "loss": 0.2906, "step": 305 }, { "epoch": 0.6335403726708074, "grad_norm": 0.15652890503406525, "learning_rate": 7.815307212734888e-06, "loss": 0.2071, "step": 306 }, { "epoch": 0.6356107660455487, "grad_norm": 0.17045624554157257, "learning_rate": 7.801713273336984e-06, "loss": 0.3238, "step": 307 }, { "epoch": 0.6376811594202898, "grad_norm": 0.12453598529100418, "learning_rate": 7.788089078235432e-06, "loss": 0.2382, "step": 308 }, { "epoch": 0.639751552795031, "grad_norm": 0.15407682955265045, "learning_rate": 7.774434774557926e-06, "loss": 0.3982, "step": 309 }, { "epoch": 0.6418219461697723, "grad_norm": 0.16916336119174957, "learning_rate": 7.7607505097573e-06, "loss": 0.3729, "step": 310 }, { "epoch": 0.6438923395445134, "grad_norm": 0.16933326423168182, "learning_rate": 7.747036431609938e-06, "loss": 0.3201, "step": 311 }, { "epoch": 0.6459627329192547, "grad_norm": 0.18653111159801483, "learning_rate": 7.733292688214182e-06, "loss": 0.4113, "step": 312 }, { "epoch": 0.6480331262939959, "grad_norm": 0.1700931340456009, "learning_rate": 7.719519427988728e-06, "loss": 0.2422, "step": 313 }, { "epoch": 0.650103519668737, "grad_norm": 0.2505623996257782, "learning_rate": 7.705716799671019e-06, "loss": 0.4215, "step": 314 }, { "epoch": 0.6521739130434783, "grad_norm": 0.1554984301328659, "learning_rate": 7.691884952315654e-06, "loss": 0.2661, "step": 315 }, { "epoch": 0.6542443064182195, "grad_norm": 0.1653880923986435, "learning_rate": 7.678024035292757e-06, "loss": 0.5224, "step": 316 }, { "epoch": 0.6563146997929606, "grad_norm": 0.19526076316833496, "learning_rate": 7.66413419828638e-06, "loss": 0.4749, "step": 317 }, { "epoch": 0.6583850931677019, "grad_norm": 0.11639489978551865, "learning_rate": 7.650215591292888e-06, "loss": 0.2833, "step": 318 }, { "epoch": 0.660455486542443, "grad_norm": 0.15517526865005493, "learning_rate": 7.636268364619328e-06, "loss": 0.2725, "step": 319 }, { "epoch": 0.6625258799171843, "grad_norm": 0.14740225672721863, "learning_rate": 7.622292668881805e-06, "loss": 0.2737, "step": 320 }, { "epoch": 0.6645962732919255, "grad_norm": 0.14966771006584167, "learning_rate": 7.608288655003872e-06, "loss": 0.3643, "step": 321 }, { "epoch": 0.6666666666666666, "grad_norm": 0.19022630155086517, "learning_rate": 7.594256474214883e-06, "loss": 0.4768, "step": 322 }, { "epoch": 0.6687370600414079, "grad_norm": 0.1434745341539383, "learning_rate": 7.580196278048368e-06, "loss": 0.443, "step": 323 }, { "epoch": 0.6708074534161491, "grad_norm": 0.16878576576709747, "learning_rate": 7.566108218340399e-06, "loss": 0.3245, "step": 324 }, { "epoch": 0.6728778467908902, "grad_norm": 0.12783066928386688, "learning_rate": 7.551992447227939e-06, "loss": 0.2576, "step": 325 }, { "epoch": 0.6749482401656315, "grad_norm": 0.131923645734787, "learning_rate": 7.537849117147212e-06, "loss": 0.3536, "step": 326 }, { "epoch": 0.6770186335403726, "grad_norm": 0.16058656573295593, "learning_rate": 7.523678380832049e-06, "loss": 0.3952, "step": 327 }, { "epoch": 0.6790890269151139, "grad_norm": 0.1566784381866455, "learning_rate": 7.509480391312243e-06, "loss": 0.4466, "step": 328 }, { "epoch": 0.6811594202898551, "grad_norm": 0.13159289956092834, "learning_rate": 7.4952553019118915e-06, "loss": 0.3165, "step": 329 }, { "epoch": 0.6832298136645962, "grad_norm": 0.17786771059036255, "learning_rate": 7.481003266247745e-06, "loss": 0.4053, "step": 330 }, { "epoch": 0.6853002070393375, "grad_norm": 0.09457248449325562, "learning_rate": 7.4667244382275486e-06, "loss": 0.1508, "step": 331 }, { "epoch": 0.6873706004140787, "grad_norm": 0.17693917453289032, "learning_rate": 7.452418972048372e-06, "loss": 0.3678, "step": 332 }, { "epoch": 0.6894409937888198, "grad_norm": 0.15471003949642181, "learning_rate": 7.4380870221949604e-06, "loss": 0.2698, "step": 333 }, { "epoch": 0.6915113871635611, "grad_norm": 0.14803005754947662, "learning_rate": 7.4237287434380485e-06, "loss": 0.2693, "step": 334 }, { "epoch": 0.6935817805383023, "grad_norm": 0.1645655333995819, "learning_rate": 7.4093442908326985e-06, "loss": 0.3928, "step": 335 }, { "epoch": 0.6956521739130435, "grad_norm": 0.17979875206947327, "learning_rate": 7.394933819716625e-06, "loss": 0.3584, "step": 336 }, { "epoch": 0.6977225672877847, "grad_norm": 0.1641678512096405, "learning_rate": 7.380497485708512e-06, "loss": 0.3704, "step": 337 }, { "epoch": 0.6997929606625258, "grad_norm": 0.1472330093383789, "learning_rate": 7.366035444706346e-06, "loss": 0.3375, "step": 338 }, { "epoch": 0.7018633540372671, "grad_norm": 0.13386555016040802, "learning_rate": 7.351547852885716e-06, "loss": 0.2683, "step": 339 }, { "epoch": 0.7039337474120083, "grad_norm": 0.17538785934448242, "learning_rate": 7.337034866698138e-06, "loss": 0.3281, "step": 340 }, { "epoch": 0.7060041407867494, "grad_norm": 0.23761358857154846, "learning_rate": 7.322496642869359e-06, "loss": 0.5206, "step": 341 }, { "epoch": 0.7080745341614907, "grad_norm": 0.14608746767044067, "learning_rate": 7.307933338397667e-06, "loss": 0.3623, "step": 342 }, { "epoch": 0.7101449275362319, "grad_norm": 0.15300914645195007, "learning_rate": 7.293345110552199e-06, "loss": 0.3471, "step": 343 }, { "epoch": 0.7122153209109731, "grad_norm": 0.5103980898857117, "learning_rate": 7.278732116871239e-06, "loss": 0.4771, "step": 344 }, { "epoch": 0.7142857142857143, "grad_norm": 0.11499185115098953, "learning_rate": 7.264094515160515e-06, "loss": 0.195, "step": 345 }, { "epoch": 0.7163561076604554, "grad_norm": 0.17889314889907837, "learning_rate": 7.249432463491498e-06, "loss": 0.4114, "step": 346 }, { "epoch": 0.7184265010351967, "grad_norm": 0.20027822256088257, "learning_rate": 7.234746120199695e-06, "loss": 0.3861, "step": 347 }, { "epoch": 0.7204968944099379, "grad_norm": 0.2676618695259094, "learning_rate": 7.220035643882938e-06, "loss": 0.3303, "step": 348 }, { "epoch": 0.722567287784679, "grad_norm": 0.14831817150115967, "learning_rate": 7.205301193399671e-06, "loss": 0.3329, "step": 349 }, { "epoch": 0.7246376811594203, "grad_norm": 0.15850083529949188, "learning_rate": 7.190542927867234e-06, "loss": 0.2805, "step": 350 }, { "epoch": 0.7267080745341615, "grad_norm": 0.17525608837604523, "learning_rate": 7.175761006660151e-06, "loss": 0.4265, "step": 351 }, { "epoch": 0.7287784679089027, "grad_norm": 0.22206392884254456, "learning_rate": 7.160955589408395e-06, "loss": 0.3882, "step": 352 }, { "epoch": 0.7308488612836439, "grad_norm": 0.27820146083831787, "learning_rate": 7.1461268359956806e-06, "loss": 0.4193, "step": 353 }, { "epoch": 0.7329192546583851, "grad_norm": 0.15169090032577515, "learning_rate": 7.131274906557725e-06, "loss": 0.2663, "step": 354 }, { "epoch": 0.7349896480331263, "grad_norm": 0.1922980397939682, "learning_rate": 7.1163999614805255e-06, "loss": 0.2922, "step": 355 }, { "epoch": 0.7370600414078675, "grad_norm": 0.14194855093955994, "learning_rate": 7.101502161398626e-06, "loss": 0.2633, "step": 356 }, { "epoch": 0.7391304347826086, "grad_norm": 0.12898112833499908, "learning_rate": 7.0865816671933765e-06, "loss": 0.1775, "step": 357 }, { "epoch": 0.7412008281573499, "grad_norm": 0.24564723670482635, "learning_rate": 7.0716386399912075e-06, "loss": 0.3405, "step": 358 }, { "epoch": 0.7432712215320911, "grad_norm": 0.1857512891292572, "learning_rate": 7.056673241161881e-06, "loss": 0.3334, "step": 359 }, { "epoch": 0.7453416149068323, "grad_norm": 0.12627875804901123, "learning_rate": 7.041685632316748e-06, "loss": 0.2766, "step": 360 }, { "epoch": 0.7474120082815735, "grad_norm": 0.22340427339076996, "learning_rate": 7.026675975307009e-06, "loss": 0.5199, "step": 361 }, { "epoch": 0.7494824016563147, "grad_norm": 0.17509430646896362, "learning_rate": 7.0116444322219575e-06, "loss": 0.3756, "step": 362 }, { "epoch": 0.7515527950310559, "grad_norm": 0.19094456732273102, "learning_rate": 6.996591165387241e-06, "loss": 0.4757, "step": 363 }, { "epoch": 0.7515527950310559, "eval_loss": 0.3904164433479309, "eval_runtime": 93.4751, "eval_samples_per_second": 5.905, "eval_steps_per_second": 0.738, "step": 363 }, { "epoch": 0.7536231884057971, "grad_norm": 0.21799920499324799, "learning_rate": 6.981516337363099e-06, "loss": 0.2783, "step": 364 }, { "epoch": 0.7556935817805382, "grad_norm": 0.17750999331474304, "learning_rate": 6.966420110942609e-06, "loss": 0.2854, "step": 365 }, { "epoch": 0.7577639751552795, "grad_norm": 0.16166891157627106, "learning_rate": 6.95130264914993e-06, "loss": 0.341, "step": 366 }, { "epoch": 0.7598343685300207, "grad_norm": 0.17185021936893463, "learning_rate": 6.936164115238543e-06, "loss": 0.353, "step": 367 }, { "epoch": 0.7619047619047619, "grad_norm": 0.16961365938186646, "learning_rate": 6.9210046726894885e-06, "loss": 0.3104, "step": 368 }, { "epoch": 0.7639751552795031, "grad_norm": 0.160708487033844, "learning_rate": 6.905824485209598e-06, "loss": 0.3879, "step": 369 }, { "epoch": 0.7660455486542443, "grad_norm": 0.12559418380260468, "learning_rate": 6.890623716729724e-06, "loss": 0.1893, "step": 370 }, { "epoch": 0.7681159420289855, "grad_norm": 0.1993858963251114, "learning_rate": 6.875402531402977e-06, "loss": 0.3177, "step": 371 }, { "epoch": 0.7701863354037267, "grad_norm": 0.1785745769739151, "learning_rate": 6.860161093602949e-06, "loss": 0.3831, "step": 372 }, { "epoch": 0.772256728778468, "grad_norm": 0.15908163785934448, "learning_rate": 6.844899567921938e-06, "loss": 0.2946, "step": 373 }, { "epoch": 0.7743271221532091, "grad_norm": 0.1833140105009079, "learning_rate": 6.829618119169169e-06, "loss": 0.3715, "step": 374 }, { "epoch": 0.7763975155279503, "grad_norm": 0.1361018717288971, "learning_rate": 6.814316912369021e-06, "loss": 0.2445, "step": 375 }, { "epoch": 0.7784679089026915, "grad_norm": 0.1777130365371704, "learning_rate": 6.798996112759233e-06, "loss": 0.3649, "step": 376 }, { "epoch": 0.7805383022774327, "grad_norm": 0.16926829516887665, "learning_rate": 6.783655885789136e-06, "loss": 0.2612, "step": 377 }, { "epoch": 0.782608695652174, "grad_norm": 0.1698054075241089, "learning_rate": 6.768296397117848e-06, "loss": 0.397, "step": 378 }, { "epoch": 0.7846790890269151, "grad_norm": 0.21158775687217712, "learning_rate": 6.7529178126125005e-06, "loss": 0.3234, "step": 379 }, { "epoch": 0.7867494824016563, "grad_norm": 0.16220830380916595, "learning_rate": 6.737520298346438e-06, "loss": 0.4353, "step": 380 }, { "epoch": 0.7888198757763976, "grad_norm": 0.16705025732517242, "learning_rate": 6.722104020597428e-06, "loss": 0.4326, "step": 381 }, { "epoch": 0.7908902691511387, "grad_norm": 0.18142729997634888, "learning_rate": 6.706669145845863e-06, "loss": 0.3197, "step": 382 }, { "epoch": 0.7929606625258799, "grad_norm": 0.17288783192634583, "learning_rate": 6.691215840772971e-06, "loss": 0.316, "step": 383 }, { "epoch": 0.7950310559006211, "grad_norm": 0.13447780907154083, "learning_rate": 6.6757442722590015e-06, "loss": 0.2254, "step": 384 }, { "epoch": 0.7971014492753623, "grad_norm": 0.16198629140853882, "learning_rate": 6.660254607381433e-06, "loss": 0.3463, "step": 385 }, { "epoch": 0.7991718426501035, "grad_norm": 0.16485463082790375, "learning_rate": 6.6447470134131685e-06, "loss": 0.2498, "step": 386 }, { "epoch": 0.8012422360248447, "grad_norm": 0.17164930701255798, "learning_rate": 6.629221657820729e-06, "loss": 0.2875, "step": 387 }, { "epoch": 0.8033126293995859, "grad_norm": 0.16868580877780914, "learning_rate": 6.613678708262439e-06, "loss": 0.2725, "step": 388 }, { "epoch": 0.8053830227743272, "grad_norm": 0.1734611988067627, "learning_rate": 6.598118332586619e-06, "loss": 0.3235, "step": 389 }, { "epoch": 0.8074534161490683, "grad_norm": 0.1865498423576355, "learning_rate": 6.5825406988297815e-06, "loss": 0.3326, "step": 390 }, { "epoch": 0.8095238095238095, "grad_norm": 0.18980611860752106, "learning_rate": 6.566945975214803e-06, "loss": 0.3541, "step": 391 }, { "epoch": 0.8115942028985508, "grad_norm": 0.15975767374038696, "learning_rate": 6.551334330149114e-06, "loss": 0.3307, "step": 392 }, { "epoch": 0.8136645962732919, "grad_norm": 0.2354326993227005, "learning_rate": 6.535705932222881e-06, "loss": 0.3146, "step": 393 }, { "epoch": 0.8157349896480331, "grad_norm": 0.2226390838623047, "learning_rate": 6.520060950207186e-06, "loss": 0.3535, "step": 394 }, { "epoch": 0.8178053830227743, "grad_norm": 0.20899449288845062, "learning_rate": 6.504399553052199e-06, "loss": 0.4009, "step": 395 }, { "epoch": 0.8198757763975155, "grad_norm": 0.17171704769134521, "learning_rate": 6.488721909885359e-06, "loss": 0.3999, "step": 396 }, { "epoch": 0.8219461697722568, "grad_norm": 0.1482180655002594, "learning_rate": 6.4730281900095474e-06, "loss": 0.2778, "step": 397 }, { "epoch": 0.8240165631469979, "grad_norm": 0.1984974592924118, "learning_rate": 6.457318562901257e-06, "loss": 0.3702, "step": 398 }, { "epoch": 0.8260869565217391, "grad_norm": 0.1667233407497406, "learning_rate": 6.44159319820876e-06, "loss": 0.3629, "step": 399 }, { "epoch": 0.8281573498964804, "grad_norm": 0.18199460208415985, "learning_rate": 6.425852265750282e-06, "loss": 0.3925, "step": 400 }, { "epoch": 0.8302277432712215, "grad_norm": 0.18740075826644897, "learning_rate": 6.4100959355121636e-06, "loss": 0.3852, "step": 401 }, { "epoch": 0.8322981366459627, "grad_norm": 0.15155306458473206, "learning_rate": 6.394324377647028e-06, "loss": 0.198, "step": 402 }, { "epoch": 0.8343685300207039, "grad_norm": 0.18929250538349152, "learning_rate": 6.378537762471937e-06, "loss": 0.3827, "step": 403 }, { "epoch": 0.8364389233954451, "grad_norm": 0.15943989157676697, "learning_rate": 6.362736260466561e-06, "loss": 0.3253, "step": 404 }, { "epoch": 0.8385093167701864, "grad_norm": 0.2277781367301941, "learning_rate": 6.3469200422713316e-06, "loss": 0.4614, "step": 405 }, { "epoch": 0.8405797101449275, "grad_norm": 0.13026756048202515, "learning_rate": 6.331089278685599e-06, "loss": 0.2236, "step": 406 }, { "epoch": 0.8426501035196687, "grad_norm": 0.2032948136329651, "learning_rate": 6.315244140665793e-06, "loss": 0.4658, "step": 407 }, { "epoch": 0.84472049689441, "grad_norm": 0.16974839568138123, "learning_rate": 6.299384799323568e-06, "loss": 0.409, "step": 408 }, { "epoch": 0.8467908902691511, "grad_norm": 0.14744219183921814, "learning_rate": 6.283511425923964e-06, "loss": 0.263, "step": 409 }, { "epoch": 0.8488612836438924, "grad_norm": 0.24342601001262665, "learning_rate": 6.267624191883551e-06, "loss": 0.2567, "step": 410 }, { "epoch": 0.8509316770186336, "grad_norm": 0.22432643175125122, "learning_rate": 6.2517232687685815e-06, "loss": 0.3231, "step": 411 }, { "epoch": 0.8530020703933747, "grad_norm": 0.25641682744026184, "learning_rate": 6.235808828293135e-06, "loss": 0.4221, "step": 412 }, { "epoch": 0.855072463768116, "grad_norm": 0.22766365110874176, "learning_rate": 6.2198810423172655e-06, "loss": 0.2003, "step": 413 }, { "epoch": 0.8571428571428571, "grad_norm": 0.15609072148799896, "learning_rate": 6.203940082845144e-06, "loss": 0.4432, "step": 414 }, { "epoch": 0.8592132505175983, "grad_norm": 0.269815593957901, "learning_rate": 6.187986122023207e-06, "loss": 0.4135, "step": 415 }, { "epoch": 0.8612836438923396, "grad_norm": 0.11367934942245483, "learning_rate": 6.172019332138285e-06, "loss": 0.1468, "step": 416 }, { "epoch": 0.8633540372670807, "grad_norm": 0.17702323198318481, "learning_rate": 6.1560398856157554e-06, "loss": 0.3226, "step": 417 }, { "epoch": 0.865424430641822, "grad_norm": 0.18008998036384583, "learning_rate": 6.140047955017672e-06, "loss": 0.4057, "step": 418 }, { "epoch": 0.8674948240165632, "grad_norm": 0.13514308631420135, "learning_rate": 6.1240437130409056e-06, "loss": 0.2623, "step": 419 }, { "epoch": 0.8695652173913043, "grad_norm": 0.1675521731376648, "learning_rate": 6.108027332515276e-06, "loss": 0.2896, "step": 420 }, { "epoch": 0.8716356107660456, "grad_norm": 0.2000652253627777, "learning_rate": 6.091998986401687e-06, "loss": 0.3716, "step": 421 }, { "epoch": 0.8737060041407867, "grad_norm": 0.13207314908504486, "learning_rate": 6.075958847790262e-06, "loss": 0.2428, "step": 422 }, { "epoch": 0.8757763975155279, "grad_norm": 0.1675049066543579, "learning_rate": 6.059907089898468e-06, "loss": 0.2582, "step": 423 }, { "epoch": 0.8778467908902692, "grad_norm": 0.19665411114692688, "learning_rate": 6.043843886069251e-06, "loss": 0.3385, "step": 424 }, { "epoch": 0.8799171842650103, "grad_norm": 0.2043440341949463, "learning_rate": 6.02776940976916e-06, "loss": 0.3633, "step": 425 }, { "epoch": 0.8819875776397516, "grad_norm": 0.19804300367832184, "learning_rate": 6.011683834586474e-06, "loss": 0.3014, "step": 426 }, { "epoch": 0.8840579710144928, "grad_norm": 0.15968140959739685, "learning_rate": 5.995587334229334e-06, "loss": 0.2579, "step": 427 }, { "epoch": 0.8861283643892339, "grad_norm": 0.213617742061615, "learning_rate": 5.979480082523858e-06, "loss": 0.2747, "step": 428 }, { "epoch": 0.8881987577639752, "grad_norm": 0.1532445102930069, "learning_rate": 5.963362253412269e-06, "loss": 0.2513, "step": 429 }, { "epoch": 0.8902691511387164, "grad_norm": 0.15875503420829773, "learning_rate": 5.947234020951015e-06, "loss": 0.2581, "step": 430 }, { "epoch": 0.8923395445134575, "grad_norm": 0.16298717260360718, "learning_rate": 5.931095559308888e-06, "loss": 0.3126, "step": 431 }, { "epoch": 0.8944099378881988, "grad_norm": 0.18198642134666443, "learning_rate": 5.914947042765149e-06, "loss": 0.3726, "step": 432 }, { "epoch": 0.8964803312629399, "grad_norm": 0.1657399982213974, "learning_rate": 5.8987886457076405e-06, "loss": 0.2984, "step": 433 }, { "epoch": 0.8985507246376812, "grad_norm": 0.13853290677070618, "learning_rate": 5.882620542630901e-06, "loss": 0.1906, "step": 434 }, { "epoch": 0.9006211180124224, "grad_norm": 0.20282739400863647, "learning_rate": 5.866442908134291e-06, "loss": 0.3678, "step": 435 }, { "epoch": 0.9026915113871635, "grad_norm": 0.18411196768283844, "learning_rate": 5.850255916920093e-06, "loss": 0.3661, "step": 436 }, { "epoch": 0.9047619047619048, "grad_norm": 0.16809453070163727, "learning_rate": 5.8340597437916395e-06, "loss": 0.4337, "step": 437 }, { "epoch": 0.906832298136646, "grad_norm": 0.24382607638835907, "learning_rate": 5.817854563651415e-06, "loss": 0.2962, "step": 438 }, { "epoch": 0.9089026915113871, "grad_norm": 0.21871711313724518, "learning_rate": 5.80164055149917e-06, "loss": 0.413, "step": 439 }, { "epoch": 0.9109730848861284, "grad_norm": 0.14632663130760193, "learning_rate": 5.785417882430035e-06, "loss": 0.3107, "step": 440 }, { "epoch": 0.9130434782608695, "grad_norm": 0.19092142581939697, "learning_rate": 5.769186731632624e-06, "loss": 0.3483, "step": 441 }, { "epoch": 0.9151138716356108, "grad_norm": 0.2034963071346283, "learning_rate": 5.752947274387147e-06, "loss": 0.4898, "step": 442 }, { "epoch": 0.917184265010352, "grad_norm": 0.15895184874534607, "learning_rate": 5.736699686063515e-06, "loss": 0.2704, "step": 443 }, { "epoch": 0.9192546583850931, "grad_norm": 0.21739298105239868, "learning_rate": 5.720444142119445e-06, "loss": 0.2768, "step": 444 }, { "epoch": 0.9213250517598344, "grad_norm": 0.17576731741428375, "learning_rate": 5.704180818098567e-06, "loss": 0.335, "step": 445 }, { "epoch": 0.9233954451345756, "grad_norm": 0.15474459528923035, "learning_rate": 5.687909889628529e-06, "loss": 0.3155, "step": 446 }, { "epoch": 0.9254658385093167, "grad_norm": 0.2610938847064972, "learning_rate": 5.6716315324191e-06, "loss": 0.4243, "step": 447 }, { "epoch": 0.927536231884058, "grad_norm": 0.18764007091522217, "learning_rate": 5.6553459222602714e-06, "loss": 0.3547, "step": 448 }, { "epoch": 0.9296066252587992, "grad_norm": 0.19586984813213348, "learning_rate": 5.639053235020358e-06, "loss": 0.4074, "step": 449 }, { "epoch": 0.9316770186335404, "grad_norm": 0.16940924525260925, "learning_rate": 5.622753646644102e-06, "loss": 0.2609, "step": 450 }, { "epoch": 0.9337474120082816, "grad_norm": 0.1849866807460785, "learning_rate": 5.606447333150768e-06, "loss": 0.3816, "step": 451 }, { "epoch": 0.9358178053830227, "grad_norm": 0.23204651474952698, "learning_rate": 5.59013447063225e-06, "loss": 0.2354, "step": 452 }, { "epoch": 0.937888198757764, "grad_norm": 0.22019827365875244, "learning_rate": 5.57381523525116e-06, "loss": 0.3601, "step": 453 }, { "epoch": 0.9399585921325052, "grad_norm": 0.2608383595943451, "learning_rate": 5.557489803238934e-06, "loss": 0.3874, "step": 454 }, { "epoch": 0.9420289855072463, "grad_norm": 0.13737766444683075, "learning_rate": 5.541158350893922e-06, "loss": 0.1811, "step": 455 }, { "epoch": 0.9440993788819876, "grad_norm": 0.2050437182188034, "learning_rate": 5.524821054579491e-06, "loss": 0.3328, "step": 456 }, { "epoch": 0.9461697722567288, "grad_norm": 0.1720466911792755, "learning_rate": 5.508478090722116e-06, "loss": 0.3377, "step": 457 }, { "epoch": 0.94824016563147, "grad_norm": 0.16786806285381317, "learning_rate": 5.492129635809473e-06, "loss": 0.319, "step": 458 }, { "epoch": 0.9503105590062112, "grad_norm": 0.16816474497318268, "learning_rate": 5.475775866388542e-06, "loss": 0.3468, "step": 459 }, { "epoch": 0.9523809523809523, "grad_norm": 0.22582465410232544, "learning_rate": 5.459416959063688e-06, "loss": 0.3769, "step": 460 }, { "epoch": 0.9544513457556936, "grad_norm": 0.24350348114967346, "learning_rate": 5.443053090494763e-06, "loss": 0.3339, "step": 461 }, { "epoch": 0.9565217391304348, "grad_norm": 0.17309525609016418, "learning_rate": 5.426684437395196e-06, "loss": 0.2794, "step": 462 }, { "epoch": 0.9585921325051759, "grad_norm": 0.18608134984970093, "learning_rate": 5.4103111765300855e-06, "loss": 0.3433, "step": 463 }, { "epoch": 0.9606625258799172, "grad_norm": 0.16802716255187988, "learning_rate": 5.393933484714284e-06, "loss": 0.2962, "step": 464 }, { "epoch": 0.9627329192546584, "grad_norm": 0.22532106935977936, "learning_rate": 5.377551538810499e-06, "loss": 0.2857, "step": 465 }, { "epoch": 0.9648033126293996, "grad_norm": 0.23656564950942993, "learning_rate": 5.361165515727374e-06, "loss": 0.2993, "step": 466 }, { "epoch": 0.9668737060041408, "grad_norm": 0.1953495591878891, "learning_rate": 5.3447755924175885e-06, "loss": 0.3035, "step": 467 }, { "epoch": 0.968944099378882, "grad_norm": 0.21909025311470032, "learning_rate": 5.328381945875933e-06, "loss": 0.347, "step": 468 }, { "epoch": 0.9710144927536232, "grad_norm": 0.24978509545326233, "learning_rate": 5.311984753137407e-06, "loss": 0.2855, "step": 469 }, { "epoch": 0.9730848861283644, "grad_norm": 0.14834006130695343, "learning_rate": 5.295584191275308e-06, "loss": 0.1876, "step": 470 }, { "epoch": 0.9751552795031055, "grad_norm": 0.16186289489269257, "learning_rate": 5.279180437399316e-06, "loss": 0.225, "step": 471 }, { "epoch": 0.9772256728778468, "grad_norm": 0.206668421626091, "learning_rate": 5.26277366865358e-06, "loss": 0.2826, "step": 472 }, { "epoch": 0.979296066252588, "grad_norm": 0.18108750879764557, "learning_rate": 5.246364062214809e-06, "loss": 0.2851, "step": 473 }, { "epoch": 0.9813664596273292, "grad_norm": 0.1724204570055008, "learning_rate": 5.229951795290353e-06, "loss": 0.3175, "step": 474 }, { "epoch": 0.9834368530020704, "grad_norm": 0.22895722091197968, "learning_rate": 5.213537045116296e-06, "loss": 0.5164, "step": 475 }, { "epoch": 0.9855072463768116, "grad_norm": 0.204612135887146, "learning_rate": 5.197119988955534e-06, "loss": 0.491, "step": 476 }, { "epoch": 0.9875776397515528, "grad_norm": 0.26122862100601196, "learning_rate": 5.180700804095871e-06, "loss": 0.2628, "step": 477 }, { "epoch": 0.989648033126294, "grad_norm": 0.2136368602514267, "learning_rate": 5.164279667848094e-06, "loss": 0.3404, "step": 478 }, { "epoch": 0.9917184265010351, "grad_norm": 0.18253998458385468, "learning_rate": 5.147856757544067e-06, "loss": 0.2777, "step": 479 }, { "epoch": 0.9937888198757764, "grad_norm": 0.19649861752986908, "learning_rate": 5.131432250534809e-06, "loss": 0.3999, "step": 480 }, { "epoch": 0.9958592132505176, "grad_norm": 0.18403136730194092, "learning_rate": 5.1150063241885815e-06, "loss": 0.3778, "step": 481 }, { "epoch": 0.9979296066252588, "grad_norm": 0.19857674837112427, "learning_rate": 5.0985791558889785e-06, "loss": 0.2799, "step": 482 }, { "epoch": 1.0, "grad_norm": 0.13328629732131958, "learning_rate": 5.082150923033001e-06, "loss": 0.1996, "step": 483 }, { "epoch": 1.0020703933747412, "grad_norm": 0.1862826645374298, "learning_rate": 5.065721803029146e-06, "loss": 0.411, "step": 484 }, { "epoch": 1.0020703933747412, "eval_loss": 0.3763667643070221, "eval_runtime": 93.2875, "eval_samples_per_second": 5.917, "eval_steps_per_second": 0.74, "step": 484 }, { "epoch": 1.0041407867494825, "grad_norm": 0.23797769844532013, "learning_rate": 5.049291973295492e-06, "loss": 0.4413, "step": 485 }, { "epoch": 1.0062111801242235, "grad_norm": 0.1646006554365158, "learning_rate": 5.032861611257783e-06, "loss": 0.2354, "step": 486 }, { "epoch": 1.0082815734989647, "grad_norm": 0.1261710673570633, "learning_rate": 5.016430894347513e-06, "loss": 0.2208, "step": 487 }, { "epoch": 1.010351966873706, "grad_norm": 0.16026055812835693, "learning_rate": 5e-06, "loss": 0.2227, "step": 488 }, { "epoch": 1.0124223602484472, "grad_norm": 0.1686556041240692, "learning_rate": 4.983569105652489e-06, "loss": 0.3089, "step": 489 }, { "epoch": 1.0144927536231885, "grad_norm": 0.18485727906227112, "learning_rate": 4.967138388742218e-06, "loss": 0.3351, "step": 490 }, { "epoch": 1.0165631469979297, "grad_norm": 0.24687248468399048, "learning_rate": 4.9507080267045095e-06, "loss": 0.3287, "step": 491 }, { "epoch": 1.0186335403726707, "grad_norm": 0.1875036209821701, "learning_rate": 4.934278196970857e-06, "loss": 0.3356, "step": 492 }, { "epoch": 1.020703933747412, "grad_norm": 0.2408786565065384, "learning_rate": 4.917849076966999e-06, "loss": 0.3066, "step": 493 }, { "epoch": 1.0227743271221532, "grad_norm": 0.17887943983078003, "learning_rate": 4.9014208441110215e-06, "loss": 0.2968, "step": 494 }, { "epoch": 1.0248447204968945, "grad_norm": 0.1789269596338272, "learning_rate": 4.8849936758114184e-06, "loss": 0.3354, "step": 495 }, { "epoch": 1.0269151138716357, "grad_norm": 0.14091640710830688, "learning_rate": 4.868567749465192e-06, "loss": 0.2079, "step": 496 }, { "epoch": 1.0289855072463767, "grad_norm": 0.27091121673583984, "learning_rate": 4.852143242455935e-06, "loss": 0.3013, "step": 497 }, { "epoch": 1.0020703933747412, "grad_norm": 0.23239950835704803, "learning_rate": 4.835720332151907e-06, "loss": 0.4112, "step": 498 }, { "epoch": 1.0041407867494825, "grad_norm": 0.2228323519229889, "learning_rate": 4.819299195904131e-06, "loss": 0.3032, "step": 499 }, { "epoch": 1.0062111801242235, "grad_norm": 0.2078544944524765, "learning_rate": 4.802880011044467e-06, "loss": 0.2802, "step": 500 }, { "epoch": 1.0082815734989647, "grad_norm": 0.20137901604175568, "learning_rate": 4.786462954883706e-06, "loss": 0.3043, "step": 501 }, { "epoch": 1.010351966873706, "grad_norm": 0.22740206122398376, "learning_rate": 4.770048204709648e-06, "loss": 0.3051, "step": 502 }, { "epoch": 1.0124223602484472, "grad_norm": 0.18020185828208923, "learning_rate": 4.753635937785191e-06, "loss": 0.2257, "step": 503 }, { "epoch": 1.0144927536231885, "grad_norm": 0.18203116953372955, "learning_rate": 4.73722633134642e-06, "loss": 0.2019, "step": 504 }, { "epoch": 1.0165631469979297, "grad_norm": 0.22147925198078156, "learning_rate": 4.720819562600684e-06, "loss": 0.2875, "step": 505 }, { "epoch": 1.0186335403726707, "grad_norm": 0.28298357129096985, "learning_rate": 4.7044158087246926e-06, "loss": 0.3057, "step": 506 }, { "epoch": 1.020703933747412, "grad_norm": 0.25004157423973083, "learning_rate": 4.688015246862595e-06, "loss": 0.2947, "step": 507 }, { "epoch": 1.0227743271221532, "grad_norm": 0.29407742619514465, "learning_rate": 4.67161805412407e-06, "loss": 0.475, "step": 508 }, { "epoch": 1.0248447204968945, "grad_norm": 0.2174006849527359, "learning_rate": 4.655224407582413e-06, "loss": 0.3527, "step": 509 }, { "epoch": 1.0269151138716357, "grad_norm": 0.17737045884132385, "learning_rate": 4.6388344842726266e-06, "loss": 0.2289, "step": 510 }, { "epoch": 1.0289855072463767, "grad_norm": 0.2775312662124634, "learning_rate": 4.622448461189504e-06, "loss": 0.2495, "step": 511 }, { "epoch": 1.031055900621118, "grad_norm": 0.2338017374277115, "learning_rate": 4.606066515285719e-06, "loss": 0.347, "step": 512 }, { "epoch": 1.0331262939958592, "grad_norm": 0.18186333775520325, "learning_rate": 4.589688823469915e-06, "loss": 0.4323, "step": 513 }, { "epoch": 1.0351966873706004, "grad_norm": 0.15205258131027222, "learning_rate": 4.573315562604804e-06, "loss": 0.2887, "step": 514 }, { "epoch": 1.0372670807453417, "grad_norm": 0.19091299176216125, "learning_rate": 4.5569469095052375e-06, "loss": 0.3767, "step": 515 }, { "epoch": 1.039337474120083, "grad_norm": 0.1971992403268814, "learning_rate": 4.540583040936313e-06, "loss": 0.3108, "step": 516 }, { "epoch": 1.041407867494824, "grad_norm": 0.19016583263874054, "learning_rate": 4.5242241336114595e-06, "loss": 0.3295, "step": 517 }, { "epoch": 1.0434782608695652, "grad_norm": 0.19993621110916138, "learning_rate": 4.5078703641905275e-06, "loss": 0.2432, "step": 518 }, { "epoch": 1.0455486542443064, "grad_norm": 0.160321444272995, "learning_rate": 4.491521909277886e-06, "loss": 0.2274, "step": 519 }, { "epoch": 1.0476190476190477, "grad_norm": 0.13464973866939545, "learning_rate": 4.4751789454205105e-06, "loss": 0.1574, "step": 520 }, { "epoch": 1.049689440993789, "grad_norm": 0.2205759882926941, "learning_rate": 4.458841649106079e-06, "loss": 0.1798, "step": 521 }, { "epoch": 1.05175983436853, "grad_norm": 0.1981126219034195, "learning_rate": 4.442510196761068e-06, "loss": 0.2323, "step": 522 }, { "epoch": 1.0538302277432712, "grad_norm": 0.14194904267787933, "learning_rate": 4.42618476474884e-06, "loss": 0.2363, "step": 523 }, { "epoch": 1.0559006211180124, "grad_norm": 0.2177942842245102, "learning_rate": 4.409865529367751e-06, "loss": 0.2997, "step": 524 }, { "epoch": 1.0579710144927537, "grad_norm": 0.18768401443958282, "learning_rate": 4.3935526668492325e-06, "loss": 0.2711, "step": 525 }, { "epoch": 1.060041407867495, "grad_norm": 0.23028354346752167, "learning_rate": 4.377246353355899e-06, "loss": 0.3507, "step": 526 }, { "epoch": 1.062111801242236, "grad_norm": 0.18663090467453003, "learning_rate": 4.3609467649796434e-06, "loss": 0.3046, "step": 527 }, { "epoch": 1.0641821946169772, "grad_norm": 0.18047387897968292, "learning_rate": 4.34465407773973e-06, "loss": 0.2462, "step": 528 }, { "epoch": 1.0662525879917184, "grad_norm": 0.17824740707874298, "learning_rate": 4.328368467580901e-06, "loss": 0.3397, "step": 529 }, { "epoch": 1.0683229813664596, "grad_norm": 0.1294453889131546, "learning_rate": 4.312090110371473e-06, "loss": 0.1294, "step": 530 }, { "epoch": 1.0703933747412009, "grad_norm": 0.1727997064590454, "learning_rate": 4.295819181901436e-06, "loss": 0.2847, "step": 531 }, { "epoch": 1.0724637681159421, "grad_norm": 0.20239372551441193, "learning_rate": 4.279555857880558e-06, "loss": 0.2757, "step": 532 }, { "epoch": 1.0745341614906831, "grad_norm": 0.23195482790470123, "learning_rate": 4.263300313936485e-06, "loss": 0.2639, "step": 533 }, { "epoch": 1.0766045548654244, "grad_norm": 0.1517760008573532, "learning_rate": 4.247052725612853e-06, "loss": 0.2448, "step": 534 }, { "epoch": 1.0786749482401656, "grad_norm": 0.2006104290485382, "learning_rate": 4.2308132683673765e-06, "loss": 0.3653, "step": 535 }, { "epoch": 1.0807453416149069, "grad_norm": 0.20593738555908203, "learning_rate": 4.214582117569966e-06, "loss": 0.3572, "step": 536 }, { "epoch": 1.0828157349896481, "grad_norm": 0.17792487144470215, "learning_rate": 4.198359448500832e-06, "loss": 0.3457, "step": 537 }, { "epoch": 1.0848861283643891, "grad_norm": 0.1635383516550064, "learning_rate": 4.182145436348587e-06, "loss": 0.3257, "step": 538 }, { "epoch": 1.0869565217391304, "grad_norm": 0.2741135060787201, "learning_rate": 4.165940256208361e-06, "loss": 0.3041, "step": 539 }, { "epoch": 1.0890269151138716, "grad_norm": 0.2326912134885788, "learning_rate": 4.1497440830799084e-06, "loss": 0.3032, "step": 540 }, { "epoch": 1.0910973084886129, "grad_norm": 0.18751777708530426, "learning_rate": 4.133557091865711e-06, "loss": 0.2383, "step": 541 }, { "epoch": 1.093167701863354, "grad_norm": 0.16328708827495575, "learning_rate": 4.1173794573691e-06, "loss": 0.2221, "step": 542 }, { "epoch": 1.0952380952380953, "grad_norm": 0.20164957642555237, "learning_rate": 4.1012113542923595e-06, "loss": 0.3223, "step": 543 }, { "epoch": 1.0973084886128364, "grad_norm": 0.1946948915719986, "learning_rate": 4.0850529572348505e-06, "loss": 0.3173, "step": 544 }, { "epoch": 1.0993788819875776, "grad_norm": 0.1779978722333908, "learning_rate": 4.068904440691113e-06, "loss": 0.3294, "step": 545 }, { "epoch": 1.1014492753623188, "grad_norm": 0.20775990188121796, "learning_rate": 4.052765979048986e-06, "loss": 0.2444, "step": 546 }, { "epoch": 1.10351966873706, "grad_norm": 0.1465255469083786, "learning_rate": 4.036637746587732e-06, "loss": 0.1946, "step": 547 }, { "epoch": 1.1055900621118013, "grad_norm": 0.1578349471092224, "learning_rate": 4.0205199174761435e-06, "loss": 0.3166, "step": 548 }, { "epoch": 1.1076604554865424, "grad_norm": 0.16846297681331635, "learning_rate": 4.004412665770667e-06, "loss": 0.2747, "step": 549 }, { "epoch": 1.1097308488612836, "grad_norm": 0.17560021579265594, "learning_rate": 3.988316165413528e-06, "loss": 0.2477, "step": 550 }, { "epoch": 1.1118012422360248, "grad_norm": 0.3064589202404022, "learning_rate": 3.972230590230844e-06, "loss": 0.384, "step": 551 }, { "epoch": 1.113871635610766, "grad_norm": 0.28778812289237976, "learning_rate": 3.956156113930752e-06, "loss": 0.2753, "step": 552 }, { "epoch": 1.1159420289855073, "grad_norm": 0.18141014873981476, "learning_rate": 3.940092910101533e-06, "loss": 0.3229, "step": 553 }, { "epoch": 1.1180124223602483, "grad_norm": 0.2242807000875473, "learning_rate": 3.924041152209739e-06, "loss": 0.3954, "step": 554 }, { "epoch": 1.1200828157349896, "grad_norm": 0.17454344034194946, "learning_rate": 3.9080010135983134e-06, "loss": 0.2391, "step": 555 }, { "epoch": 1.1221532091097308, "grad_norm": 0.22734415531158447, "learning_rate": 3.891972667484726e-06, "loss": 0.3418, "step": 556 }, { "epoch": 1.124223602484472, "grad_norm": 0.16846442222595215, "learning_rate": 3.875956286959096e-06, "loss": 0.3489, "step": 557 }, { "epoch": 1.1262939958592133, "grad_norm": 0.1894855499267578, "learning_rate": 3.859952044982329e-06, "loss": 0.312, "step": 558 }, { "epoch": 1.1283643892339545, "grad_norm": 0.17569231986999512, "learning_rate": 3.843960114384246e-06, "loss": 0.2616, "step": 559 }, { "epoch": 1.1304347826086956, "grad_norm": 0.26564398407936096, "learning_rate": 3.827980667861716e-06, "loss": 0.3152, "step": 560 }, { "epoch": 1.1325051759834368, "grad_norm": 0.19927287101745605, "learning_rate": 3.8120138779767958e-06, "loss": 0.2786, "step": 561 }, { "epoch": 1.134575569358178, "grad_norm": 0.2221873551607132, "learning_rate": 3.7960599171548572e-06, "loss": 0.3006, "step": 562 }, { "epoch": 1.1366459627329193, "grad_norm": 0.22802579402923584, "learning_rate": 3.780118957682736e-06, "loss": 0.4023, "step": 563 }, { "epoch": 1.1387163561076605, "grad_norm": 0.21268118917942047, "learning_rate": 3.764191171706867e-06, "loss": 0.3199, "step": 564 }, { "epoch": 1.1407867494824018, "grad_norm": 0.18768849968910217, "learning_rate": 3.7482767312314206e-06, "loss": 0.2721, "step": 565 }, { "epoch": 1.1428571428571428, "grad_norm": 0.19857840240001678, "learning_rate": 3.732375808116451e-06, "loss": 0.3533, "step": 566 }, { "epoch": 1.144927536231884, "grad_norm": 0.23903526365756989, "learning_rate": 3.7164885740760375e-06, "loss": 0.3196, "step": 567 }, { "epoch": 1.1469979296066253, "grad_norm": 0.21161702275276184, "learning_rate": 3.7006152006764336e-06, "loss": 0.3384, "step": 568 }, { "epoch": 1.1490683229813665, "grad_norm": 0.180229052901268, "learning_rate": 3.684755859334209e-06, "loss": 0.3353, "step": 569 }, { "epoch": 1.1511387163561078, "grad_norm": 0.19796229898929596, "learning_rate": 3.6689107213144025e-06, "loss": 0.3109, "step": 570 }, { "epoch": 1.1532091097308488, "grad_norm": 0.209706112742424, "learning_rate": 3.653079957728671e-06, "loss": 0.2478, "step": 571 }, { "epoch": 1.15527950310559, "grad_norm": 0.24063701927661896, "learning_rate": 3.6372637395334416e-06, "loss": 0.4906, "step": 572 }, { "epoch": 1.1573498964803313, "grad_norm": 0.17047333717346191, "learning_rate": 3.6214622375280637e-06, "loss": 0.2871, "step": 573 }, { "epoch": 1.1594202898550725, "grad_norm": 0.1966450810432434, "learning_rate": 3.6056756223529734e-06, "loss": 0.3539, "step": 574 }, { "epoch": 1.1614906832298137, "grad_norm": 0.33538636565208435, "learning_rate": 3.589904064487837e-06, "loss": 0.3929, "step": 575 }, { "epoch": 1.1635610766045548, "grad_norm": 0.17587830126285553, "learning_rate": 3.574147734249719e-06, "loss": 0.2755, "step": 576 }, { "epoch": 1.165631469979296, "grad_norm": 0.1887330859899521, "learning_rate": 3.5584068017912415e-06, "loss": 0.2438, "step": 577 }, { "epoch": 1.1677018633540373, "grad_norm": 0.17411023378372192, "learning_rate": 3.542681437098745e-06, "loss": 0.2861, "step": 578 }, { "epoch": 1.1697722567287785, "grad_norm": 0.18099556863307953, "learning_rate": 3.526971809990454e-06, "loss": 0.2884, "step": 579 }, { "epoch": 1.1718426501035197, "grad_norm": 0.1913526952266693, "learning_rate": 3.5112780901146426e-06, "loss": 0.2879, "step": 580 }, { "epoch": 1.1739130434782608, "grad_norm": 0.19618435204029083, "learning_rate": 3.4956004469478038e-06, "loss": 0.2704, "step": 581 }, { "epoch": 1.175983436853002, "grad_norm": 0.24511279165744781, "learning_rate": 3.479939049792817e-06, "loss": 0.3838, "step": 582 }, { "epoch": 1.1780538302277432, "grad_norm": 0.2790193259716034, "learning_rate": 3.4642940677771203e-06, "loss": 0.3175, "step": 583 }, { "epoch": 1.1801242236024845, "grad_norm": 0.19622787833213806, "learning_rate": 3.448665669850888e-06, "loss": 0.2721, "step": 584 }, { "epoch": 1.1821946169772257, "grad_norm": 0.2602899968624115, "learning_rate": 3.433054024785199e-06, "loss": 0.4717, "step": 585 }, { "epoch": 1.184265010351967, "grad_norm": 0.2367587685585022, "learning_rate": 3.4174593011702197e-06, "loss": 0.2121, "step": 586 }, { "epoch": 1.186335403726708, "grad_norm": 0.17743825912475586, "learning_rate": 3.4018816674133814e-06, "loss": 0.3188, "step": 587 }, { "epoch": 1.1884057971014492, "grad_norm": 0.18005116283893585, "learning_rate": 3.386321291737563e-06, "loss": 0.222, "step": 588 }, { "epoch": 1.1904761904761905, "grad_norm": 0.17598001658916473, "learning_rate": 3.3707783421792724e-06, "loss": 0.3, "step": 589 }, { "epoch": 1.1925465838509317, "grad_norm": 0.24181915819644928, "learning_rate": 3.3552529865868323e-06, "loss": 0.2492, "step": 590 }, { "epoch": 1.194616977225673, "grad_norm": 0.16726148128509521, "learning_rate": 3.339745392618569e-06, "loss": 0.2398, "step": 591 }, { "epoch": 1.1966873706004142, "grad_norm": 0.20498837530612946, "learning_rate": 3.3242557277410015e-06, "loss": 0.427, "step": 592 }, { "epoch": 1.1987577639751552, "grad_norm": 0.14857308566570282, "learning_rate": 3.3087841592270296e-06, "loss": 0.2312, "step": 593 }, { "epoch": 1.2008281573498965, "grad_norm": 0.20990735292434692, "learning_rate": 3.2933308541541365e-06, "loss": 0.3369, "step": 594 }, { "epoch": 1.2028985507246377, "grad_norm": 0.21872520446777344, "learning_rate": 3.2778959794025735e-06, "loss": 0.3638, "step": 595 }, { "epoch": 1.204968944099379, "grad_norm": 0.20835131406784058, "learning_rate": 3.2624797016535626e-06, "loss": 0.3235, "step": 596 }, { "epoch": 1.2070393374741202, "grad_norm": 0.3120534420013428, "learning_rate": 3.2470821873875003e-06, "loss": 0.4174, "step": 597 }, { "epoch": 1.2091097308488612, "grad_norm": 0.176408052444458, "learning_rate": 3.2317036028821523e-06, "loss": 0.2244, "step": 598 }, { "epoch": 1.2111801242236024, "grad_norm": 0.47249072790145874, "learning_rate": 3.216344114210865e-06, "loss": 0.459, "step": 599 }, { "epoch": 1.2132505175983437, "grad_norm": 0.21692520380020142, "learning_rate": 3.201003887240768e-06, "loss": 0.329, "step": 600 }, { "epoch": 1.215320910973085, "grad_norm": 0.2014329880475998, "learning_rate": 3.185683087630982e-06, "loss": 0.3236, "step": 601 }, { "epoch": 1.2173913043478262, "grad_norm": 0.17368103563785553, "learning_rate": 3.1703818808308327e-06, "loss": 0.2636, "step": 602 }, { "epoch": 1.2194616977225672, "grad_norm": 0.25841301679611206, "learning_rate": 3.1551004320780634e-06, "loss": 0.3909, "step": 603 }, { "epoch": 1.2215320910973084, "grad_norm": 0.15552645921707153, "learning_rate": 3.1398389063970512e-06, "loss": 0.201, "step": 604 }, { "epoch": 1.2236024844720497, "grad_norm": 0.2024988979101181, "learning_rate": 3.124597468597024e-06, "loss": 0.3928, "step": 605 }, { "epoch": 1.2236024844720497, "eval_loss": 0.3698825240135193, "eval_runtime": 93.2967, "eval_samples_per_second": 5.917, "eval_steps_per_second": 0.74, "step": 605 }, { "epoch": 1.225672877846791, "grad_norm": 0.25626295804977417, "learning_rate": 3.1093762832702775e-06, "loss": 0.3436, "step": 606 }, { "epoch": 1.2277432712215322, "grad_norm": 0.21667784452438354, "learning_rate": 3.0941755147904027e-06, "loss": 0.2657, "step": 607 }, { "epoch": 1.2298136645962732, "grad_norm": 0.19471991062164307, "learning_rate": 3.0789953273105123e-06, "loss": 0.2913, "step": 608 }, { "epoch": 1.2318840579710144, "grad_norm": 0.2012849897146225, "learning_rate": 3.063835884761458e-06, "loss": 0.3049, "step": 609 }, { "epoch": 1.2339544513457557, "grad_norm": 0.22813789546489716, "learning_rate": 3.048697350850073e-06, "loss": 0.3013, "step": 610 }, { "epoch": 1.236024844720497, "grad_norm": 0.15448856353759766, "learning_rate": 3.0335798890573944e-06, "loss": 0.331, "step": 611 }, { "epoch": 1.2380952380952381, "grad_norm": 0.21554410457611084, "learning_rate": 3.0184836626369034e-06, "loss": 0.4201, "step": 612 }, { "epoch": 1.2401656314699794, "grad_norm": 0.17196184396743774, "learning_rate": 3.003408834612759e-06, "loss": 0.2679, "step": 613 }, { "epoch": 1.2422360248447206, "grad_norm": 0.1990622580051422, "learning_rate": 2.988355567778043e-06, "loss": 0.2306, "step": 614 }, { "epoch": 1.2443064182194616, "grad_norm": 0.181349977850914, "learning_rate": 2.9733240246929927e-06, "loss": 0.2736, "step": 615 }, { "epoch": 1.2463768115942029, "grad_norm": 0.17092598974704742, "learning_rate": 2.9583143676832526e-06, "loss": 0.3401, "step": 616 }, { "epoch": 1.2484472049689441, "grad_norm": 0.16415558755397797, "learning_rate": 2.9433267588381198e-06, "loss": 0.2277, "step": 617 }, { "epoch": 1.2505175983436854, "grad_norm": 0.16953089833259583, "learning_rate": 2.9283613600087933e-06, "loss": 0.2741, "step": 618 }, { "epoch": 1.2525879917184266, "grad_norm": 0.1386069357395172, "learning_rate": 2.913418332806624e-06, "loss": 0.1662, "step": 619 }, { "epoch": 1.2546583850931676, "grad_norm": 0.17235419154167175, "learning_rate": 2.8984978386013767e-06, "loss": 0.2238, "step": 620 }, { "epoch": 1.2567287784679089, "grad_norm": 0.13924579322338104, "learning_rate": 2.8836000385194753e-06, "loss": 0.1699, "step": 621 }, { "epoch": 1.25879917184265, "grad_norm": 0.17973099648952484, "learning_rate": 2.8687250934422774e-06, "loss": 0.2394, "step": 622 }, { "epoch": 1.2608695652173914, "grad_norm": 0.23071612417697906, "learning_rate": 2.853873164004321e-06, "loss": 0.2409, "step": 623 }, { "epoch": 1.2629399585921326, "grad_norm": 0.17298616468906403, "learning_rate": 2.839044410591606e-06, "loss": 0.3126, "step": 624 }, { "epoch": 1.2650103519668736, "grad_norm": 0.18979336321353912, "learning_rate": 2.824238993339852e-06, "loss": 0.1662, "step": 625 }, { "epoch": 1.2670807453416149, "grad_norm": 0.14998288452625275, "learning_rate": 2.809457072132766e-06, "loss": 0.2216, "step": 626 }, { "epoch": 1.269151138716356, "grad_norm": 0.19262179732322693, "learning_rate": 2.794698806600331e-06, "loss": 0.2759, "step": 627 }, { "epoch": 1.2712215320910973, "grad_norm": 0.2046218365430832, "learning_rate": 2.779964356117063e-06, "loss": 0.4046, "step": 628 }, { "epoch": 1.2732919254658386, "grad_norm": 0.1875535100698471, "learning_rate": 2.765253879800307e-06, "loss": 0.2233, "step": 629 }, { "epoch": 1.2753623188405796, "grad_norm": 0.19307266175746918, "learning_rate": 2.750567536508504e-06, "loss": 0.2949, "step": 630 }, { "epoch": 1.2774327122153208, "grad_norm": 0.21273551881313324, "learning_rate": 2.735905484839488e-06, "loss": 0.3943, "step": 631 }, { "epoch": 1.279503105590062, "grad_norm": 0.19536952674388885, "learning_rate": 2.7212678831287627e-06, "loss": 0.3996, "step": 632 }, { "epoch": 1.2815734989648033, "grad_norm": 0.2063191533088684, "learning_rate": 2.7066548894478013e-06, "loss": 0.3424, "step": 633 }, { "epoch": 1.2836438923395446, "grad_norm": 0.2701599597930908, "learning_rate": 2.692066661602333e-06, "loss": 0.4112, "step": 634 }, { "epoch": 1.2857142857142856, "grad_norm": 0.1624588668346405, "learning_rate": 2.6775033571306425e-06, "loss": 0.2414, "step": 635 }, { "epoch": 1.287784679089027, "grad_norm": 0.18011558055877686, "learning_rate": 2.662965133301862e-06, "loss": 0.2735, "step": 636 }, { "epoch": 1.289855072463768, "grad_norm": 0.28961238265037537, "learning_rate": 2.6484521471142845e-06, "loss": 0.3408, "step": 637 }, { "epoch": 1.2919254658385093, "grad_norm": 0.1710355579853058, "learning_rate": 2.633964555293654e-06, "loss": 0.2626, "step": 638 }, { "epoch": 1.2939958592132506, "grad_norm": 0.1638377606868744, "learning_rate": 2.619502514291489e-06, "loss": 0.2677, "step": 639 }, { "epoch": 1.2960662525879918, "grad_norm": 0.2379457652568817, "learning_rate": 2.605066180283378e-06, "loss": 0.2812, "step": 640 }, { "epoch": 1.298136645962733, "grad_norm": 0.2495289295911789, "learning_rate": 2.590655709167304e-06, "loss": 0.3053, "step": 641 }, { "epoch": 1.300207039337474, "grad_norm": 0.18783414363861084, "learning_rate": 2.576271256561953e-06, "loss": 0.3656, "step": 642 }, { "epoch": 1.3022774327122153, "grad_norm": 0.1957497000694275, "learning_rate": 2.5619129778050395e-06, "loss": 0.3956, "step": 643 }, { "epoch": 1.3043478260869565, "grad_norm": 0.19549976289272308, "learning_rate": 2.5475810279516287e-06, "loss": 0.3395, "step": 644 }, { "epoch": 1.3064182194616978, "grad_norm": 0.17275187373161316, "learning_rate": 2.5332755617724535e-06, "loss": 0.2132, "step": 645 }, { "epoch": 1.308488612836439, "grad_norm": 0.218386709690094, "learning_rate": 2.5189967337522574e-06, "loss": 0.3096, "step": 646 }, { "epoch": 1.31055900621118, "grad_norm": 0.2581380009651184, "learning_rate": 2.5047446980881106e-06, "loss": 0.4621, "step": 647 }, { "epoch": 1.3126293995859213, "grad_norm": 0.1893983781337738, "learning_rate": 2.49051960868776e-06, "loss": 0.2872, "step": 648 }, { "epoch": 1.3146997929606625, "grad_norm": 0.22159774601459503, "learning_rate": 2.476321619167952e-06, "loss": 0.4122, "step": 649 }, { "epoch": 1.3167701863354038, "grad_norm": 0.1909107267856598, "learning_rate": 2.46215088285279e-06, "loss": 0.2927, "step": 650 }, { "epoch": 1.318840579710145, "grad_norm": 0.23836734890937805, "learning_rate": 2.448007552772062e-06, "loss": 0.2825, "step": 651 }, { "epoch": 1.320910973084886, "grad_norm": 0.19211657345294952, "learning_rate": 2.433891781659603e-06, "loss": 0.2591, "step": 652 }, { "epoch": 1.3229813664596273, "grad_norm": 0.16029834747314453, "learning_rate": 2.41980372195163e-06, "loss": 0.2418, "step": 653 }, { "epoch": 1.3250517598343685, "grad_norm": 0.17719939351081848, "learning_rate": 2.4057435257851173e-06, "loss": 0.3019, "step": 654 }, { "epoch": 1.3271221532091098, "grad_norm": 0.17115546762943268, "learning_rate": 2.391711344996128e-06, "loss": 0.2524, "step": 655 }, { "epoch": 1.329192546583851, "grad_norm": 0.19674405455589294, "learning_rate": 2.377707331118196e-06, "loss": 0.3467, "step": 656 }, { "epoch": 1.331262939958592, "grad_norm": 0.19814486801624298, "learning_rate": 2.363731635380673e-06, "loss": 0.3938, "step": 657 }, { "epoch": 1.3333333333333333, "grad_norm": 0.19398008286952972, "learning_rate": 2.349784408707112e-06, "loss": 0.3151, "step": 658 }, { "epoch": 1.3354037267080745, "grad_norm": 0.19555437564849854, "learning_rate": 2.33586580171362e-06, "loss": 0.3578, "step": 659 }, { "epoch": 1.3374741200828157, "grad_norm": 0.19209536910057068, "learning_rate": 2.3219759647072467e-06, "loss": 0.2102, "step": 660 }, { "epoch": 1.339544513457557, "grad_norm": 0.1797213852405548, "learning_rate": 2.3081150476843484e-06, "loss": 0.3129, "step": 661 }, { "epoch": 1.341614906832298, "grad_norm": 0.21602602303028107, "learning_rate": 2.2942832003289823e-06, "loss": 0.3156, "step": 662 }, { "epoch": 1.3436853002070395, "grad_norm": 0.15796010196208954, "learning_rate": 2.280480572011274e-06, "loss": 0.2369, "step": 663 }, { "epoch": 1.3457556935817805, "grad_norm": 0.2077028453350067, "learning_rate": 2.2667073117858185e-06, "loss": 0.3046, "step": 664 }, { "epoch": 1.3478260869565217, "grad_norm": 0.24971678853034973, "learning_rate": 2.252963568390064e-06, "loss": 0.3312, "step": 665 }, { "epoch": 1.349896480331263, "grad_norm": 0.2832271456718445, "learning_rate": 2.2392494902427027e-06, "loss": 0.3522, "step": 666 }, { "epoch": 1.3519668737060042, "grad_norm": 0.1852179914712906, "learning_rate": 2.2255652254420774e-06, "loss": 0.3036, "step": 667 }, { "epoch": 1.3540372670807455, "grad_norm": 0.21870185434818268, "learning_rate": 2.2119109217645697e-06, "loss": 0.4147, "step": 668 }, { "epoch": 1.3561076604554865, "grad_norm": 0.16570810973644257, "learning_rate": 2.1982867266630185e-06, "loss": 0.2208, "step": 669 }, { "epoch": 1.3581780538302277, "grad_norm": 0.1982434242963791, "learning_rate": 2.1846927872651135e-06, "loss": 0.2939, "step": 670 }, { "epoch": 1.360248447204969, "grad_norm": 0.17540566623210907, "learning_rate": 2.171129250371819e-06, "loss": 0.241, "step": 671 }, { "epoch": 1.3623188405797102, "grad_norm": 0.2423764020204544, "learning_rate": 2.1575962624557754e-06, "loss": 0.3772, "step": 672 }, { "epoch": 1.3643892339544514, "grad_norm": 0.15541775524616241, "learning_rate": 2.1440939696597323e-06, "loss": 0.2138, "step": 673 }, { "epoch": 1.3664596273291925, "grad_norm": 0.16889381408691406, "learning_rate": 2.1306225177949584e-06, "loss": 0.3073, "step": 674 }, { "epoch": 1.3685300207039337, "grad_norm": 0.17894522845745087, "learning_rate": 2.1171820523396737e-06, "loss": 0.3637, "step": 675 }, { "epoch": 1.370600414078675, "grad_norm": 0.17433612048625946, "learning_rate": 2.1037727184374705e-06, "loss": 0.3006, "step": 676 }, { "epoch": 1.3726708074534162, "grad_norm": 0.1843237578868866, "learning_rate": 2.09039466089576e-06, "loss": 0.3195, "step": 677 }, { "epoch": 1.3747412008281574, "grad_norm": 0.18528933823108673, "learning_rate": 2.07704802418419e-06, "loss": 0.2284, "step": 678 }, { "epoch": 1.3768115942028984, "grad_norm": 0.28900831937789917, "learning_rate": 2.0637329524331044e-06, "loss": 0.4572, "step": 679 }, { "epoch": 1.3788819875776397, "grad_norm": 0.23907282948493958, "learning_rate": 2.050449589431969e-06, "loss": 0.2616, "step": 680 }, { "epoch": 1.380952380952381, "grad_norm": 0.29129111766815186, "learning_rate": 2.0371980786278346e-06, "loss": 0.4761, "step": 681 }, { "epoch": 1.3830227743271222, "grad_norm": 0.186233788728714, "learning_rate": 2.023978563123771e-06, "loss": 0.3047, "step": 682 }, { "epoch": 1.3850931677018634, "grad_norm": 0.22705401480197906, "learning_rate": 2.010791185677337e-06, "loss": 0.3393, "step": 683 }, { "epoch": 1.3871635610766044, "grad_norm": 0.2112823873758316, "learning_rate": 1.997636088699035e-06, "loss": 0.3082, "step": 684 }, { "epoch": 1.3892339544513457, "grad_norm": 0.2420927882194519, "learning_rate": 1.9845134142507615e-06, "loss": 0.312, "step": 685 }, { "epoch": 1.391304347826087, "grad_norm": 0.18831101059913635, "learning_rate": 1.9714233040442915e-06, "loss": 0.2822, "step": 686 }, { "epoch": 1.3933747412008282, "grad_norm": 0.22394514083862305, "learning_rate": 1.958365899439731e-06, "loss": 0.2924, "step": 687 }, { "epoch": 1.3954451345755694, "grad_norm": 0.23200605809688568, "learning_rate": 1.9453413414440043e-06, "loss": 0.3346, "step": 688 }, { "epoch": 1.3975155279503104, "grad_norm": 0.2842341661453247, "learning_rate": 1.93234977070932e-06, "loss": 0.3364, "step": 689 }, { "epoch": 1.3995859213250519, "grad_norm": 0.339133083820343, "learning_rate": 1.919391327531663e-06, "loss": 0.3318, "step": 690 }, { "epoch": 1.401656314699793, "grad_norm": 0.2594052255153656, "learning_rate": 1.9064661518492666e-06, "loss": 0.3141, "step": 691 }, { "epoch": 1.4037267080745341, "grad_norm": 0.1589246690273285, "learning_rate": 1.8935743832411163e-06, "loss": 0.2184, "step": 692 }, { "epoch": 1.4057971014492754, "grad_norm": 0.21517537534236908, "learning_rate": 1.8807161609254254e-06, "loss": 0.3006, "step": 693 }, { "epoch": 1.4078674948240166, "grad_norm": 0.18755370378494263, "learning_rate": 1.8678916237581524e-06, "loss": 0.3264, "step": 694 }, { "epoch": 1.4099378881987579, "grad_norm": 0.15639418363571167, "learning_rate": 1.8551009102314782e-06, "loss": 0.2196, "step": 695 }, { "epoch": 1.412008281573499, "grad_norm": 0.2891353964805603, "learning_rate": 1.8423441584723312e-06, "loss": 0.2758, "step": 696 }, { "epoch": 1.4140786749482401, "grad_norm": 0.27669522166252136, "learning_rate": 1.8296215062408785e-06, "loss": 0.407, "step": 697 }, { "epoch": 1.4161490683229814, "grad_norm": 0.1550840139389038, "learning_rate": 1.816933090929055e-06, "loss": 0.1668, "step": 698 }, { "epoch": 1.4182194616977226, "grad_norm": 0.21120239794254303, "learning_rate": 1.8042790495590629e-06, "loss": 0.3332, "step": 699 }, { "epoch": 1.4202898550724639, "grad_norm": 0.21207259595394135, "learning_rate": 1.791659518781908e-06, "loss": 0.2253, "step": 700 }, { "epoch": 1.4223602484472049, "grad_norm": 0.15627498924732208, "learning_rate": 1.779074634875908e-06, "loss": 0.1653, "step": 701 }, { "epoch": 1.4244306418219461, "grad_norm": 0.2120233029127121, "learning_rate": 1.7665245337452368e-06, "loss": 0.2852, "step": 702 }, { "epoch": 1.4265010351966874, "grad_norm": 0.25371691584587097, "learning_rate": 1.7540093509184426e-06, "loss": 0.3425, "step": 703 }, { "epoch": 1.4285714285714286, "grad_norm": 0.2664846181869507, "learning_rate": 1.7415292215469948e-06, "loss": 0.4242, "step": 704 }, { "epoch": 1.4306418219461698, "grad_norm": 0.20060203969478607, "learning_rate": 1.7290842804038215e-06, "loss": 0.2486, "step": 705 }, { "epoch": 1.4327122153209109, "grad_norm": 0.17833402752876282, "learning_rate": 1.716674661881848e-06, "loss": 0.322, "step": 706 }, { "epoch": 1.434782608695652, "grad_norm": 0.15655331313610077, "learning_rate": 1.7043004999925561e-06, "loss": 0.27, "step": 707 }, { "epoch": 1.4368530020703933, "grad_norm": 0.3605753481388092, "learning_rate": 1.6919619283645262e-06, "loss": 0.2902, "step": 708 }, { "epoch": 1.4389233954451346, "grad_norm": 0.2352147102355957, "learning_rate": 1.6796590802420054e-06, "loss": 0.3429, "step": 709 }, { "epoch": 1.4409937888198758, "grad_norm": 0.2271387279033661, "learning_rate": 1.667392088483456e-06, "loss": 0.3038, "step": 710 }, { "epoch": 1.4430641821946169, "grad_norm": 0.20507076382637024, "learning_rate": 1.6551610855601336e-06, "loss": 0.3321, "step": 711 }, { "epoch": 1.4451345755693583, "grad_norm": 0.192070871591568, "learning_rate": 1.6429662035546451e-06, "loss": 0.2685, "step": 712 }, { "epoch": 1.4472049689440993, "grad_norm": 0.18269428610801697, "learning_rate": 1.6308075741595313e-06, "loss": 0.2416, "step": 713 }, { "epoch": 1.4492753623188406, "grad_norm": 0.274029403924942, "learning_rate": 1.6186853286758397e-06, "loss": 0.4039, "step": 714 }, { "epoch": 1.4513457556935818, "grad_norm": 0.19074054062366486, "learning_rate": 1.60659959801171e-06, "loss": 0.3047, "step": 715 }, { "epoch": 1.453416149068323, "grad_norm": 0.20888854563236237, "learning_rate": 1.5945505126809524e-06, "loss": 0.3014, "step": 716 }, { "epoch": 1.4554865424430643, "grad_norm": 0.29325804114341736, "learning_rate": 1.5825382028016516e-06, "loss": 0.4245, "step": 717 }, { "epoch": 1.4575569358178053, "grad_norm": 0.23209422826766968, "learning_rate": 1.570562798094747e-06, "loss": 0.2608, "step": 718 }, { "epoch": 1.4596273291925466, "grad_norm": 0.19618046283721924, "learning_rate": 1.5586244278826469e-06, "loss": 0.2987, "step": 719 }, { "epoch": 1.4616977225672878, "grad_norm": 0.24281901121139526, "learning_rate": 1.5467232210878153e-06, "loss": 0.3474, "step": 720 }, { "epoch": 1.463768115942029, "grad_norm": 0.2319604754447937, "learning_rate": 1.5348593062313966e-06, "loss": 0.3492, "step": 721 }, { "epoch": 1.4658385093167703, "grad_norm": 0.19683247804641724, "learning_rate": 1.5230328114318127e-06, "loss": 0.2443, "step": 722 }, { "epoch": 1.4679089026915113, "grad_norm": 0.2371407449245453, "learning_rate": 1.5112438644033917e-06, "loss": 0.3228, "step": 723 }, { "epoch": 1.4699792960662525, "grad_norm": 0.19741837680339813, "learning_rate": 1.4994925924549797e-06, "loss": 0.2199, "step": 724 }, { "epoch": 1.4720496894409938, "grad_norm": 0.1965048760175705, "learning_rate": 1.4877791224885729e-06, "loss": 0.2673, "step": 725 }, { "epoch": 1.474120082815735, "grad_norm": 0.20242713391780853, "learning_rate": 1.4761035809979395e-06, "loss": 0.2455, "step": 726 }, { "epoch": 1.474120082815735, "eval_loss": 0.36622509360313416, "eval_runtime": 93.169, "eval_samples_per_second": 5.925, "eval_steps_per_second": 0.741, "step": 726 }, { "epoch": 1.4761904761904763, "grad_norm": 0.22379763424396515, "learning_rate": 1.4644660940672628e-06, "loss": 0.2963, "step": 727 }, { "epoch": 1.4782608695652173, "grad_norm": 0.20382773876190186, "learning_rate": 1.452866787369771e-06, "loss": 0.3634, "step": 728 }, { "epoch": 1.4803312629399585, "grad_norm": 0.1743982583284378, "learning_rate": 1.4413057861663843e-06, "loss": 0.2676, "step": 729 }, { "epoch": 1.4824016563146998, "grad_norm": 0.21269293129444122, "learning_rate": 1.4297832153043657e-06, "loss": 0.3427, "step": 730 }, { "epoch": 1.484472049689441, "grad_norm": 0.15802772343158722, "learning_rate": 1.418299199215963e-06, "loss": 0.1784, "step": 731 }, { "epoch": 1.4865424430641823, "grad_norm": 0.1924794614315033, "learning_rate": 1.4068538619170763e-06, "loss": 0.2559, "step": 732 }, { "epoch": 1.4886128364389233, "grad_norm": 0.19855406880378723, "learning_rate": 1.3954473270059104e-06, "loss": 0.3105, "step": 733 }, { "epoch": 1.4906832298136645, "grad_norm": 0.17971496284008026, "learning_rate": 1.3840797176616467e-06, "loss": 0.1931, "step": 734 }, { "epoch": 1.4927536231884058, "grad_norm": 0.23061345517635345, "learning_rate": 1.3727511566431024e-06, "loss": 0.3635, "step": 735 }, { "epoch": 1.494824016563147, "grad_norm": 0.21202407777309418, "learning_rate": 1.3614617662874197e-06, "loss": 0.3096, "step": 736 }, { "epoch": 1.4968944099378882, "grad_norm": 0.19486172497272491, "learning_rate": 1.3502116685087303e-06, "loss": 0.2196, "step": 737 }, { "epoch": 1.4989648033126293, "grad_norm": 0.3029009699821472, "learning_rate": 1.3390009847968505e-06, "loss": 0.2623, "step": 738 }, { "epoch": 1.5010351966873707, "grad_norm": 0.21870052814483643, "learning_rate": 1.3278298362159598e-06, "loss": 0.3429, "step": 739 }, { "epoch": 1.5031055900621118, "grad_norm": 0.21567311882972717, "learning_rate": 1.316698343403302e-06, "loss": 0.2831, "step": 740 }, { "epoch": 1.505175983436853, "grad_norm": 0.20199137926101685, "learning_rate": 1.305606626567873e-06, "loss": 0.3005, "step": 741 }, { "epoch": 1.5072463768115942, "grad_norm": 0.17147856950759888, "learning_rate": 1.2945548054891322e-06, "loss": 0.235, "step": 742 }, { "epoch": 1.5093167701863353, "grad_norm": 0.1645573228597641, "learning_rate": 1.283542999515704e-06, "loss": 0.1795, "step": 743 }, { "epoch": 1.5113871635610767, "grad_norm": 0.1960628777742386, "learning_rate": 1.27257132756409e-06, "loss": 0.2717, "step": 744 }, { "epoch": 1.5134575569358177, "grad_norm": 0.2242807298898697, "learning_rate": 1.2616399081173802e-06, "loss": 0.2269, "step": 745 }, { "epoch": 1.515527950310559, "grad_norm": 0.20267319679260254, "learning_rate": 1.2507488592239848e-06, "loss": 0.2729, "step": 746 }, { "epoch": 1.5175983436853002, "grad_norm": 0.16490796208381653, "learning_rate": 1.2398982984963454e-06, "loss": 0.1948, "step": 747 }, { "epoch": 1.5196687370600412, "grad_norm": 0.21590249240398407, "learning_rate": 1.2290883431096778e-06, "loss": 0.4181, "step": 748 }, { "epoch": 1.5217391304347827, "grad_norm": 0.1928826868534088, "learning_rate": 1.2183191098006948e-06, "loss": 0.2692, "step": 749 }, { "epoch": 1.5238095238095237, "grad_norm": 0.2254112809896469, "learning_rate": 1.2075907148663579e-06, "loss": 0.2489, "step": 750 }, { "epoch": 1.525879917184265, "grad_norm": 0.20881828665733337, "learning_rate": 1.196903274162609e-06, "loss": 0.2952, "step": 751 }, { "epoch": 1.5279503105590062, "grad_norm": 0.1981811821460724, "learning_rate": 1.186256903103129e-06, "loss": 0.2278, "step": 752 }, { "epoch": 1.5300207039337475, "grad_norm": 0.1390255093574524, "learning_rate": 1.1756517166580883e-06, "loss": 0.1711, "step": 753 }, { "epoch": 1.5320910973084887, "grad_norm": 0.20273476839065552, "learning_rate": 1.1650878293528994e-06, "loss": 0.3948, "step": 754 }, { "epoch": 1.5341614906832297, "grad_norm": 0.2850019633769989, "learning_rate": 1.1545653552669928e-06, "loss": 0.3879, "step": 755 }, { "epoch": 1.5362318840579712, "grad_norm": 0.1915498822927475, "learning_rate": 1.1440844080325703e-06, "loss": 0.289, "step": 756 }, { "epoch": 1.5383022774327122, "grad_norm": 0.2492920160293579, "learning_rate": 1.133645100833391e-06, "loss": 0.3461, "step": 757 }, { "epoch": 1.5403726708074534, "grad_norm": 0.1911374032497406, "learning_rate": 1.1232475464035386e-06, "loss": 0.2607, "step": 758 }, { "epoch": 1.5424430641821947, "grad_norm": 0.18775556981563568, "learning_rate": 1.1128918570262137e-06, "loss": 0.3516, "step": 759 }, { "epoch": 1.5445134575569357, "grad_norm": 0.24565483629703522, "learning_rate": 1.10257814453251e-06, "loss": 0.2813, "step": 760 }, { "epoch": 1.5465838509316772, "grad_norm": 0.20192737877368927, "learning_rate": 1.0923065203002199e-06, "loss": 0.3487, "step": 761 }, { "epoch": 1.5486542443064182, "grad_norm": 0.2754063606262207, "learning_rate": 1.0820770952526155e-06, "loss": 0.2709, "step": 762 }, { "epoch": 1.5507246376811594, "grad_norm": 0.19039547443389893, "learning_rate": 1.0718899798572713e-06, "loss": 0.3251, "step": 763 }, { "epoch": 1.5527950310559007, "grad_norm": 0.19267532229423523, "learning_rate": 1.0617452841248494e-06, "loss": 0.2611, "step": 764 }, { "epoch": 1.5548654244306417, "grad_norm": 0.2473609745502472, "learning_rate": 1.0516431176079296e-06, "loss": 0.2622, "step": 765 }, { "epoch": 1.5569358178053831, "grad_norm": 0.19303014874458313, "learning_rate": 1.0415835893998116e-06, "loss": 0.2595, "step": 766 }, { "epoch": 1.5590062111801242, "grad_norm": 0.24576401710510254, "learning_rate": 1.0315668081333519e-06, "loss": 0.3033, "step": 767 }, { "epoch": 1.5610766045548654, "grad_norm": 0.15718652307987213, "learning_rate": 1.0215928819797744e-06, "loss": 0.2193, "step": 768 }, { "epoch": 1.5631469979296067, "grad_norm": 0.22884951531887054, "learning_rate": 1.0116619186475185e-06, "loss": 0.2461, "step": 769 }, { "epoch": 1.5652173913043477, "grad_norm": 0.14403100311756134, "learning_rate": 1.0017740253810608e-06, "loss": 0.12, "step": 770 }, { "epoch": 1.5672877846790891, "grad_norm": 0.2506249248981476, "learning_rate": 9.919293089597715e-07, "loss": 0.2952, "step": 771 }, { "epoch": 1.5693581780538302, "grad_norm": 0.23279593884944916, "learning_rate": 9.821278756967467e-07, "loss": 0.2258, "step": 772 }, { "epoch": 1.5714285714285714, "grad_norm": 0.4160136282444, "learning_rate": 9.723698314376733e-07, "loss": 0.4735, "step": 773 }, { "epoch": 1.5734989648033126, "grad_norm": 0.22142386436462402, "learning_rate": 9.62655281559679e-07, "loss": 0.2525, "step": 774 }, { "epoch": 1.5755693581780539, "grad_norm": 0.20253802835941315, "learning_rate": 9.529843309701925e-07, "loss": 0.3334, "step": 775 }, { "epoch": 1.5776397515527951, "grad_norm": 0.17480939626693726, "learning_rate": 9.433570841058187e-07, "loss": 0.2547, "step": 776 }, { "epoch": 1.5797101449275361, "grad_norm": 0.1820262223482132, "learning_rate": 9.337736449312007e-07, "loss": 0.2979, "step": 777 }, { "epoch": 1.5817805383022774, "grad_norm": 0.23739475011825562, "learning_rate": 9.242341169379077e-07, "loss": 0.3483, "step": 778 }, { "epoch": 1.5838509316770186, "grad_norm": 0.2090044617652893, "learning_rate": 9.14738603143307e-07, "loss": 0.3042, "step": 779 }, { "epoch": 1.5859213250517599, "grad_norm": 0.33553430438041687, "learning_rate": 9.052872060894613e-07, "loss": 0.266, "step": 780 }, { "epoch": 1.587991718426501, "grad_norm": 0.19441604614257812, "learning_rate": 8.958800278420116e-07, "loss": 0.3754, "step": 781 }, { "epoch": 1.5900621118012421, "grad_norm": 0.18226413428783417, "learning_rate": 8.865171699890835e-07, "loss": 0.2289, "step": 782 }, { "epoch": 1.5921325051759836, "grad_norm": 0.1726485639810562, "learning_rate": 8.77198733640186e-07, "loss": 0.2424, "step": 783 }, { "epoch": 1.5942028985507246, "grad_norm": 0.19749626517295837, "learning_rate": 8.679248194251211e-07, "loss": 0.2693, "step": 784 }, { "epoch": 1.5962732919254659, "grad_norm": 0.3344188332557678, "learning_rate": 8.586955274928926e-07, "loss": 0.4119, "step": 785 }, { "epoch": 1.598343685300207, "grad_norm": 0.1941257119178772, "learning_rate": 8.495109575106331e-07, "loss": 0.2691, "step": 786 }, { "epoch": 1.6004140786749481, "grad_norm": 0.23682045936584473, "learning_rate": 8.403712086625176e-07, "loss": 0.399, "step": 787 }, { "epoch": 1.6024844720496896, "grad_norm": 0.17793674767017365, "learning_rate": 8.312763796487038e-07, "loss": 0.2604, "step": 788 }, { "epoch": 1.6045548654244306, "grad_norm": 0.20170265436172485, "learning_rate": 8.22226568684254e-07, "loss": 0.2313, "step": 789 }, { "epoch": 1.6066252587991718, "grad_norm": 0.22592318058013916, "learning_rate": 8.132218734980852e-07, "loss": 0.2609, "step": 790 }, { "epoch": 1.608695652173913, "grad_norm": 0.1931549608707428, "learning_rate": 8.042623913319048e-07, "loss": 0.2652, "step": 791 }, { "epoch": 1.610766045548654, "grad_norm": 0.2012760043144226, "learning_rate": 7.953482189391687e-07, "loss": 0.2863, "step": 792 }, { "epoch": 1.6128364389233956, "grad_norm": 0.17883974313735962, "learning_rate": 7.864794525840325e-07, "loss": 0.2197, "step": 793 }, { "epoch": 1.6149068322981366, "grad_norm": 0.2195083647966385, "learning_rate": 7.776561880403072e-07, "loss": 0.3113, "step": 794 }, { "epoch": 1.6169772256728778, "grad_norm": 0.27091073989868164, "learning_rate": 7.688785205904359e-07, "loss": 0.3241, "step": 795 }, { "epoch": 1.619047619047619, "grad_norm": 0.2131056785583496, "learning_rate": 7.601465450244528e-07, "loss": 0.3203, "step": 796 }, { "epoch": 1.62111801242236, "grad_norm": 0.16343261301517487, "learning_rate": 7.514603556389716e-07, "loss": 0.1965, "step": 797 }, { "epoch": 1.6231884057971016, "grad_norm": 0.17004933953285217, "learning_rate": 7.42820046236154e-07, "loss": 0.2833, "step": 798 }, { "epoch": 1.6252587991718426, "grad_norm": 0.17998455464839935, "learning_rate": 7.342257101227112e-07, "loss": 0.2284, "step": 799 }, { "epoch": 1.6273291925465838, "grad_norm": 0.16908283531665802, "learning_rate": 7.256774401088817e-07, "loss": 0.2292, "step": 800 }, { "epoch": 1.629399585921325, "grad_norm": 0.2141779363155365, "learning_rate": 7.171753285074424e-07, "loss": 0.332, "step": 801 }, { "epoch": 1.6314699792960663, "grad_norm": 0.26314640045166016, "learning_rate": 7.087194671326986e-07, "loss": 0.4573, "step": 802 }, { "epoch": 1.6335403726708075, "grad_norm": 0.20474179089069366, "learning_rate": 7.003099472995084e-07, "loss": 0.3609, "step": 803 }, { "epoch": 1.6356107660455486, "grad_norm": 0.2234301120042801, "learning_rate": 6.91946859822279e-07, "loss": 0.315, "step": 804 }, { "epoch": 1.6376811594202898, "grad_norm": 0.22140094637870789, "learning_rate": 6.83630295014e-07, "loss": 0.3776, "step": 805 }, { "epoch": 1.639751552795031, "grad_norm": 0.35324928164482117, "learning_rate": 6.753603426852589e-07, "loss": 0.3518, "step": 806 }, { "epoch": 1.6418219461697723, "grad_norm": 0.23616626858711243, "learning_rate": 6.67137092143279e-07, "loss": 0.2792, "step": 807 }, { "epoch": 1.6438923395445135, "grad_norm": 0.21780608594417572, "learning_rate": 6.589606321909464e-07, "loss": 0.3164, "step": 808 }, { "epoch": 1.6459627329192545, "grad_norm": 0.224593847990036, "learning_rate": 6.508310511258603e-07, "loss": 0.2601, "step": 809 }, { "epoch": 1.648033126293996, "grad_norm": 0.22506175935268402, "learning_rate": 6.427484367393699e-07, "loss": 0.3016, "step": 810 }, { "epoch": 1.650103519668737, "grad_norm": 0.2013794630765915, "learning_rate": 6.347128763156363e-07, "loss": 0.2155, "step": 811 }, { "epoch": 1.6521739130434783, "grad_norm": 0.210536926984787, "learning_rate": 6.267244566306801e-07, "loss": 0.3181, "step": 812 }, { "epoch": 1.6542443064182195, "grad_norm": 0.2421325147151947, "learning_rate": 6.187832639514524e-07, "loss": 0.2719, "step": 813 }, { "epoch": 1.6563146997929605, "grad_norm": 0.2348569929599762, "learning_rate": 6.108893840348995e-07, "loss": 0.3543, "step": 814 }, { "epoch": 1.658385093167702, "grad_norm": 0.20567581057548523, "learning_rate": 6.030429021270346e-07, "loss": 0.3187, "step": 815 }, { "epoch": 1.660455486542443, "grad_norm": 0.22527620196342468, "learning_rate": 5.952439029620222e-07, "loss": 0.3088, "step": 816 }, { "epoch": 1.6625258799171843, "grad_norm": 0.15716291964054108, "learning_rate": 5.874924707612595e-07, "loss": 0.2427, "step": 817 }, { "epoch": 1.6645962732919255, "grad_norm": 0.2059607058763504, "learning_rate": 5.797886892324695e-07, "loss": 0.3054, "step": 818 }, { "epoch": 1.6666666666666665, "grad_norm": 0.21156099438667297, "learning_rate": 5.721326415687928e-07, "loss": 0.3218, "step": 819 }, { "epoch": 1.668737060041408, "grad_norm": 0.2147274613380432, "learning_rate": 5.645244104478947e-07, "loss": 0.2965, "step": 820 }, { "epoch": 1.670807453416149, "grad_norm": 0.24699914455413818, "learning_rate": 5.569640780310681e-07, "loss": 0.3731, "step": 821 }, { "epoch": 1.6728778467908902, "grad_norm": 0.1968562752008438, "learning_rate": 5.494517259623478e-07, "loss": 0.2637, "step": 822 }, { "epoch": 1.6749482401656315, "grad_norm": 0.19729909300804138, "learning_rate": 5.419874353676302e-07, "loss": 0.2993, "step": 823 }, { "epoch": 1.6770186335403725, "grad_norm": 0.21764995157718658, "learning_rate": 5.34571286853795e-07, "loss": 0.4701, "step": 824 }, { "epoch": 1.679089026915114, "grad_norm": 0.2567196190357208, "learning_rate": 5.272033605078336e-07, "loss": 0.306, "step": 825 }, { "epoch": 1.681159420289855, "grad_norm": 0.19084686040878296, "learning_rate": 5.198837358959901e-07, "loss": 0.3407, "step": 826 }, { "epoch": 1.6832298136645962, "grad_norm": 0.18046824634075165, "learning_rate": 5.126124920628939e-07, "loss": 0.1684, "step": 827 }, { "epoch": 1.6853002070393375, "grad_norm": 0.2360648661851883, "learning_rate": 5.05389707530714e-07, "loss": 0.3395, "step": 828 }, { "epoch": 1.6873706004140787, "grad_norm": 0.2442813515663147, "learning_rate": 4.982154602983042e-07, "loss": 0.3327, "step": 829 }, { "epoch": 1.68944099378882, "grad_norm": 0.24826480448246002, "learning_rate": 4.91089827840367e-07, "loss": 0.3129, "step": 830 }, { "epoch": 1.691511387163561, "grad_norm": 0.35874247550964355, "learning_rate": 4.840128871066107e-07, "loss": 0.3476, "step": 831 }, { "epoch": 1.6935817805383024, "grad_norm": 0.2496299296617508, "learning_rate": 4.769847145209244e-07, "loss": 0.4227, "step": 832 }, { "epoch": 1.6956521739130435, "grad_norm": 0.20718629658222198, "learning_rate": 4.700053859805498e-07, "loss": 0.2448, "step": 833 }, { "epoch": 1.6977225672877847, "grad_norm": 0.1962181031703949, "learning_rate": 4.6307497685525894e-07, "loss": 0.3321, "step": 834 }, { "epoch": 1.699792960662526, "grad_norm": 0.20894816517829895, "learning_rate": 4.5619356198654654e-07, "loss": 0.3279, "step": 835 }, { "epoch": 1.701863354037267, "grad_norm": 0.20937037467956543, "learning_rate": 4.4936121568681546e-07, "loss": 0.2815, "step": 836 }, { "epoch": 1.7039337474120084, "grad_norm": 0.220305398106575, "learning_rate": 4.425780117385797e-07, "loss": 0.2257, "step": 837 }, { "epoch": 1.7060041407867494, "grad_norm": 0.2656574249267578, "learning_rate": 4.3584402339366174e-07, "loss": 0.3553, "step": 838 }, { "epoch": 1.7080745341614907, "grad_norm": 0.23722606897354126, "learning_rate": 4.291593233724084e-07, "loss": 0.4194, "step": 839 }, { "epoch": 1.710144927536232, "grad_norm": 0.21551856398582458, "learning_rate": 4.225239838628981e-07, "loss": 0.3319, "step": 840 }, { "epoch": 1.712215320910973, "grad_norm": 0.24776722490787506, "learning_rate": 4.159380765201687e-07, "loss": 0.3984, "step": 841 }, { "epoch": 1.7142857142857144, "grad_norm": 0.20069292187690735, "learning_rate": 4.0940167246543595e-07, "loss": 0.3096, "step": 842 }, { "epoch": 1.7163561076604554, "grad_norm": 0.19585564732551575, "learning_rate": 4.0291484228533574e-07, "loss": 0.2744, "step": 843 }, { "epoch": 1.7184265010351967, "grad_norm": 0.2340921312570572, "learning_rate": 3.964776560311484e-07, "loss": 0.3995, "step": 844 }, { "epoch": 1.720496894409938, "grad_norm": 0.18698614835739136, "learning_rate": 3.900901832180548e-07, "loss": 0.2305, "step": 845 }, { "epoch": 1.722567287784679, "grad_norm": 0.16317184269428253, "learning_rate": 3.8375249282437743e-07, "loss": 0.1814, "step": 846 }, { "epoch": 1.7246376811594204, "grad_norm": 0.1703299582004547, "learning_rate": 3.774646532908405e-07, "loss": 0.2401, "step": 847 }, { "epoch": 1.7246376811594204, "eval_loss": 0.36501544713974, "eval_runtime": 93.4662, "eval_samples_per_second": 5.906, "eval_steps_per_second": 0.738, "step": 847 }, { "epoch": 1.7267080745341614, "grad_norm": 0.22787785530090332, "learning_rate": 3.71226732519826e-07, "loss": 0.2781, "step": 848 }, { "epoch": 1.7287784679089027, "grad_norm": 0.2101709544658661, "learning_rate": 3.6503879787464614e-07, "loss": 0.2191, "step": 849 }, { "epoch": 1.730848861283644, "grad_norm": 0.18964454531669617, "learning_rate": 3.589009161788104e-07, "loss": 0.2831, "step": 850 }, { "epoch": 1.7329192546583851, "grad_norm": 0.2441266030073166, "learning_rate": 3.5281315371530953e-07, "loss": 0.3778, "step": 851 }, { "epoch": 1.7349896480331264, "grad_norm": 0.20503799617290497, "learning_rate": 3.4677557622589175e-07, "loss": 0.3275, "step": 852 }, { "epoch": 1.7370600414078674, "grad_norm": 0.28412970900535583, "learning_rate": 3.407882489103642e-07, "loss": 0.2926, "step": 853 }, { "epoch": 1.7391304347826086, "grad_norm": 0.20368856191635132, "learning_rate": 3.3485123642587657e-07, "loss": 0.2754, "step": 854 }, { "epoch": 1.74120082815735, "grad_norm": 0.18978296220302582, "learning_rate": 3.28964602886232e-07, "loss": 0.3035, "step": 855 }, { "epoch": 1.7432712215320911, "grad_norm": 0.26132866740226746, "learning_rate": 3.2312841186118937e-07, "loss": 0.3749, "step": 856 }, { "epoch": 1.7453416149068324, "grad_norm": 0.21782605350017548, "learning_rate": 3.1734272637578113e-07, "loss": 0.2398, "step": 857 }, { "epoch": 1.7474120082815734, "grad_norm": 0.24689531326293945, "learning_rate": 3.116076089096265e-07, "loss": 0.3806, "step": 858 }, { "epoch": 1.7494824016563149, "grad_norm": 0.27699175477027893, "learning_rate": 3.05923121396266e-07, "loss": 0.4021, "step": 859 }, { "epoch": 1.7515527950310559, "grad_norm": 0.263994425535202, "learning_rate": 3.0028932522248256e-07, "loss": 0.3092, "step": 860 }, { "epoch": 1.7536231884057971, "grad_norm": 0.21080216765403748, "learning_rate": 2.9470628122764575e-07, "loss": 0.4134, "step": 861 }, { "epoch": 1.7556935817805384, "grad_norm": 0.21831892430782318, "learning_rate": 2.8917404970305096e-07, "loss": 0.2616, "step": 862 }, { "epoch": 1.7577639751552794, "grad_norm": 0.22182129323482513, "learning_rate": 2.8369269039127124e-07, "loss": 0.3379, "step": 863 }, { "epoch": 1.7598343685300208, "grad_norm": 0.20161934196949005, "learning_rate": 2.782622624855097e-07, "loss": 0.3677, "step": 864 }, { "epoch": 1.7619047619047619, "grad_norm": 0.20959687232971191, "learning_rate": 2.728828246289605e-07, "loss": 0.328, "step": 865 }, { "epoch": 1.763975155279503, "grad_norm": 0.22280235588550568, "learning_rate": 2.6755443491417786e-07, "loss": 0.341, "step": 866 }, { "epoch": 1.7660455486542443, "grad_norm": 0.19911250472068787, "learning_rate": 2.622771508824451e-07, "loss": 0.2035, "step": 867 }, { "epoch": 1.7681159420289854, "grad_norm": 0.2472066879272461, "learning_rate": 2.570510295231571e-07, "loss": 0.3076, "step": 868 }, { "epoch": 1.7701863354037268, "grad_norm": 0.2838335335254669, "learning_rate": 2.518761272732012e-07, "loss": 0.3175, "step": 869 }, { "epoch": 1.7722567287784678, "grad_norm": 0.1958410143852234, "learning_rate": 2.467525000163523e-07, "loss": 0.328, "step": 870 }, { "epoch": 1.774327122153209, "grad_norm": 0.21654951572418213, "learning_rate": 2.416802030826626e-07, "loss": 0.3124, "step": 871 }, { "epoch": 1.7763975155279503, "grad_norm": 0.21129873394966125, "learning_rate": 2.36659291247871e-07, "loss": 0.3465, "step": 872 }, { "epoch": 1.7784679089026914, "grad_norm": 0.25625911355018616, "learning_rate": 2.316898187328087e-07, "loss": 0.3733, "step": 873 }, { "epoch": 1.7805383022774328, "grad_norm": 0.20912493765354156, "learning_rate": 2.2677183920281342e-07, "loss": 0.2978, "step": 874 }, { "epoch": 1.7826086956521738, "grad_norm": 0.1701553463935852, "learning_rate": 2.2190540576714825e-07, "loss": 0.2677, "step": 875 }, { "epoch": 1.784679089026915, "grad_norm": 0.1535242795944214, "learning_rate": 2.1709057097843266e-07, "loss": 0.1532, "step": 876 }, { "epoch": 1.7867494824016563, "grad_norm": 0.19895993173122406, "learning_rate": 2.123273868320691e-07, "loss": 0.3049, "step": 877 }, { "epoch": 1.7888198757763976, "grad_norm": 0.15035057067871094, "learning_rate": 2.0761590476568893e-07, "loss": 0.2621, "step": 878 }, { "epoch": 1.7908902691511388, "grad_norm": 0.21586264669895172, "learning_rate": 2.029561756585885e-07, "loss": 0.3422, "step": 879 }, { "epoch": 1.7929606625258798, "grad_norm": 0.18950589001178741, "learning_rate": 1.9834824983118673e-07, "loss": 0.2516, "step": 880 }, { "epoch": 1.795031055900621, "grad_norm": 0.163870170712471, "learning_rate": 1.9379217704447728e-07, "loss": 0.2275, "step": 881 }, { "epoch": 1.7971014492753623, "grad_norm": 0.206380695104599, "learning_rate": 1.892880064994934e-07, "loss": 0.305, "step": 882 }, { "epoch": 1.7991718426501035, "grad_norm": 0.17586331069469452, "learning_rate": 1.8483578683677783e-07, "loss": 0.2527, "step": 883 }, { "epoch": 1.8012422360248448, "grad_norm": 0.18862633407115936, "learning_rate": 1.8043556613585143e-07, "loss": 0.2868, "step": 884 }, { "epoch": 1.8033126293995858, "grad_norm": 0.2271614372730255, "learning_rate": 1.7608739191470214e-07, "loss": 0.3503, "step": 885 }, { "epoch": 1.8053830227743273, "grad_norm": 0.20493744313716888, "learning_rate": 1.7179131112926628e-07, "loss": 0.2666, "step": 886 }, { "epoch": 1.8074534161490683, "grad_norm": 0.2284647822380066, "learning_rate": 1.67547370172923e-07, "loss": 0.2872, "step": 887 }, { "epoch": 1.8095238095238095, "grad_norm": 0.18809698522090912, "learning_rate": 1.6335561487599406e-07, "loss": 0.2607, "step": 888 }, { "epoch": 1.8115942028985508, "grad_norm": 0.23940181732177734, "learning_rate": 1.592160905052481e-07, "loss": 0.3169, "step": 889 }, { "epoch": 1.8136645962732918, "grad_norm": 0.16910064220428467, "learning_rate": 1.551288417634106e-07, "loss": 0.1735, "step": 890 }, { "epoch": 1.8157349896480333, "grad_norm": 0.21867364645004272, "learning_rate": 1.5109391278868568e-07, "loss": 0.3319, "step": 891 }, { "epoch": 1.8178053830227743, "grad_norm": 0.22258295118808746, "learning_rate": 1.471113471542712e-07, "loss": 0.2971, "step": 892 }, { "epoch": 1.8198757763975155, "grad_norm": 0.19150149822235107, "learning_rate": 1.4318118786789958e-07, "loss": 0.4353, "step": 893 }, { "epoch": 1.8219461697722568, "grad_norm": 0.21853388845920563, "learning_rate": 1.3930347737136195e-07, "loss": 0.2208, "step": 894 }, { "epoch": 1.8240165631469978, "grad_norm": 0.13483737409114838, "learning_rate": 1.3547825754005762e-07, "loss": 0.142, "step": 895 }, { "epoch": 1.8260869565217392, "grad_norm": 0.1933605819940567, "learning_rate": 1.3170556968253756e-07, "loss": 0.3566, "step": 896 }, { "epoch": 1.8281573498964803, "grad_norm": 0.17729544639587402, "learning_rate": 1.2798545454006106e-07, "loss": 0.204, "step": 897 }, { "epoch": 1.8302277432712215, "grad_norm": 0.2337586134672165, "learning_rate": 1.2431795228615372e-07, "loss": 0.2763, "step": 898 }, { "epoch": 1.8322981366459627, "grad_norm": 0.2700212597846985, "learning_rate": 1.2070310252617567e-07, "loss": 0.3541, "step": 899 }, { "epoch": 1.8343685300207038, "grad_norm": 0.247345969080925, "learning_rate": 1.1714094429689127e-07, "loss": 0.3693, "step": 900 }, { "epoch": 1.8364389233954452, "grad_norm": 0.21636363863945007, "learning_rate": 1.1363151606605117e-07, "loss": 0.3488, "step": 901 }, { "epoch": 1.8385093167701863, "grad_norm": 0.19248615205287933, "learning_rate": 1.1017485573197151e-07, "loss": 0.346, "step": 902 }, { "epoch": 1.8405797101449275, "grad_norm": 0.2149479240179062, "learning_rate": 1.0677100062313095e-07, "loss": 0.2206, "step": 903 }, { "epoch": 1.8426501035196687, "grad_norm": 0.1936839520931244, "learning_rate": 1.0341998749776316e-07, "loss": 0.2105, "step": 904 }, { "epoch": 1.84472049689441, "grad_norm": 0.24326792359352112, "learning_rate": 1.0012185254346052e-07, "loss": 0.3633, "step": 905 }, { "epoch": 1.8467908902691512, "grad_norm": 0.17744815349578857, "learning_rate": 9.687663137678605e-08, "loss": 0.3085, "step": 906 }, { "epoch": 1.8488612836438922, "grad_norm": 0.18408702313899994, "learning_rate": 9.368435904288431e-08, "loss": 0.2346, "step": 907 }, { "epoch": 1.8509316770186337, "grad_norm": 0.27747613191604614, "learning_rate": 9.054507001510727e-08, "loss": 0.3265, "step": 908 }, { "epoch": 1.8530020703933747, "grad_norm": 0.1913614422082901, "learning_rate": 8.745879819463843e-08, "loss": 0.2061, "step": 909 }, { "epoch": 1.855072463768116, "grad_norm": 0.28982409834861755, "learning_rate": 8.442557691013042e-08, "loss": 0.5099, "step": 910 }, { "epoch": 1.8571428571428572, "grad_norm": 0.2792413830757141, "learning_rate": 8.144543891734135e-08, "loss": 0.4041, "step": 911 }, { "epoch": 1.8592132505175982, "grad_norm": 0.22993594408035278, "learning_rate": 7.851841639878399e-08, "loss": 0.3219, "step": 912 }, { "epoch": 1.8612836438923397, "grad_norm": 0.31364697217941284, "learning_rate": 7.564454096337659e-08, "loss": 0.377, "step": 913 }, { "epoch": 1.8633540372670807, "grad_norm": 0.1780468225479126, "learning_rate": 7.282384364610207e-08, "loss": 0.3827, "step": 914 }, { "epoch": 1.865424430641822, "grad_norm": 0.2630448341369629, "learning_rate": 7.005635490767216e-08, "loss": 0.4095, "step": 915 }, { "epoch": 1.8674948240165632, "grad_norm": 0.15141789615154266, "learning_rate": 6.734210463420099e-08, "loss": 0.1393, "step": 916 }, { "epoch": 1.8695652173913042, "grad_norm": 0.20141035318374634, "learning_rate": 6.46811221368776e-08, "loss": 0.3595, "step": 917 }, { "epoch": 1.8716356107660457, "grad_norm": 0.2934320867061615, "learning_rate": 6.207343615165562e-08, "loss": 0.3961, "step": 918 }, { "epoch": 1.8737060041407867, "grad_norm": 0.2153390347957611, "learning_rate": 5.951907483893626e-08, "loss": 0.2565, "step": 919 }, { "epoch": 1.875776397515528, "grad_norm": 0.33150872588157654, "learning_rate": 5.701806578327029e-08, "loss": 0.392, "step": 920 }, { "epoch": 1.8778467908902692, "grad_norm": 0.26579582691192627, "learning_rate": 5.4570435993055446e-08, "loss": 0.2612, "step": 921 }, { "epoch": 1.8799171842650102, "grad_norm": 0.1782764196395874, "learning_rate": 5.21762119002478e-08, "loss": 0.1443, "step": 922 }, { "epoch": 1.8819875776397517, "grad_norm": 0.347116619348526, "learning_rate": 4.9835419360076406e-08, "loss": 0.3098, "step": 923 }, { "epoch": 1.8840579710144927, "grad_norm": 0.20360776782035828, "learning_rate": 4.7548083650759134e-08, "loss": 0.2233, "step": 924 }, { "epoch": 1.886128364389234, "grad_norm": 0.18242114782333374, "learning_rate": 4.531422947323838e-08, "loss": 0.247, "step": 925 }, { "epoch": 1.8881987577639752, "grad_norm": 0.25464263558387756, "learning_rate": 4.31338809509052e-08, "loss": 0.3339, "step": 926 }, { "epoch": 1.8902691511387164, "grad_norm": 0.1700025200843811, "learning_rate": 4.10070616293462e-08, "loss": 0.2131, "step": 927 }, { "epoch": 1.8923395445134576, "grad_norm": 0.14673881232738495, "learning_rate": 3.8933794476083143e-08, "loss": 0.153, "step": 928 }, { "epoch": 1.8944099378881987, "grad_norm": 0.22516903281211853, "learning_rate": 3.691410188033151e-08, "loss": 0.251, "step": 929 }, { "epoch": 1.89648033126294, "grad_norm": 0.16170381009578705, "learning_rate": 3.494800565275125e-08, "loss": 0.2359, "step": 930 }, { "epoch": 1.8985507246376812, "grad_norm": 0.20099857449531555, "learning_rate": 3.303552702521806e-08, "loss": 0.3992, "step": 931 }, { "epoch": 1.9006211180124224, "grad_norm": 0.22314739227294922, "learning_rate": 3.1176686650589147e-08, "loss": 0.2455, "step": 932 }, { "epoch": 1.9026915113871636, "grad_norm": 0.236553356051445, "learning_rate": 2.93715046024845e-08, "loss": 0.2531, "step": 933 }, { "epoch": 1.9047619047619047, "grad_norm": 0.21454867720603943, "learning_rate": 2.7620000375064848e-08, "loss": 0.275, "step": 934 }, { "epoch": 1.9068322981366461, "grad_norm": 0.24925491213798523, "learning_rate": 2.592219288282627e-08, "loss": 0.3424, "step": 935 }, { "epoch": 1.9089026915113871, "grad_norm": 0.21076105535030365, "learning_rate": 2.4278100460393138e-08, "loss": 0.3642, "step": 936 }, { "epoch": 1.9109730848861284, "grad_norm": 0.1848498284816742, "learning_rate": 2.2687740862320506e-08, "loss": 0.2639, "step": 937 }, { "epoch": 1.9130434782608696, "grad_norm": 0.21795926988124847, "learning_rate": 2.115113126290258e-08, "loss": 0.4229, "step": 938 }, { "epoch": 1.9151138716356106, "grad_norm": 0.2156882882118225, "learning_rate": 1.966828825598843e-08, "loss": 0.2438, "step": 939 }, { "epoch": 1.917184265010352, "grad_norm": 0.24736322462558746, "learning_rate": 1.8239227854799368e-08, "loss": 0.4286, "step": 940 }, { "epoch": 1.9192546583850931, "grad_norm": 0.2588346004486084, "learning_rate": 1.686396549176128e-08, "loss": 0.2705, "step": 941 }, { "epoch": 1.9213250517598344, "grad_norm": 0.19903713464736938, "learning_rate": 1.554251601833201e-08, "loss": 0.1754, "step": 942 }, { "epoch": 1.9233954451345756, "grad_norm": 0.21781648695468903, "learning_rate": 1.4274893704845916e-08, "loss": 0.2451, "step": 943 }, { "epoch": 1.9254658385093166, "grad_norm": 0.23910443484783173, "learning_rate": 1.3061112240357887e-08, "loss": 0.3731, "step": 944 }, { "epoch": 1.927536231884058, "grad_norm": 0.3607318699359894, "learning_rate": 1.1901184732493465e-08, "loss": 0.3392, "step": 945 }, { "epoch": 1.9296066252587991, "grad_norm": 0.18589964509010315, "learning_rate": 1.0795123707312283e-08, "loss": 0.3498, "step": 946 }, { "epoch": 1.9316770186335404, "grad_norm": 0.20047861337661743, "learning_rate": 9.742941109166515e-09, "loss": 0.3472, "step": 947 }, { "epoch": 1.9337474120082816, "grad_norm": 0.1777239441871643, "learning_rate": 8.744648300578196e-09, "loss": 0.3169, "step": 948 }, { "epoch": 1.9358178053830226, "grad_norm": 0.22908566892147064, "learning_rate": 7.80025606211099e-09, "loss": 0.3575, "step": 949 }, { "epoch": 1.937888198757764, "grad_norm": 0.20297054946422577, "learning_rate": 6.9097745922580564e-09, "loss": 0.3494, "step": 950 }, { "epoch": 1.939958592132505, "grad_norm": 0.25517526268959045, "learning_rate": 6.073213507328812e-09, "loss": 0.2926, "step": 951 }, { "epoch": 1.9420289855072463, "grad_norm": 0.21190893650054932, "learning_rate": 5.2905818413478975e-09, "loss": 0.309, "step": 952 }, { "epoch": 1.9440993788819876, "grad_norm": 0.22347129881381989, "learning_rate": 4.5618880459541435e-09, "loss": 0.2969, "step": 953 }, { "epoch": 1.9461697722567288, "grad_norm": 0.23573170602321625, "learning_rate": 3.887139990313427e-09, "loss": 0.2966, "step": 954 }, { "epoch": 1.94824016563147, "grad_norm": 0.18915671110153198, "learning_rate": 3.2663449610298435e-09, "loss": 0.2483, "step": 955 }, { "epoch": 1.950310559006211, "grad_norm": 0.2534366250038147, "learning_rate": 2.699509662069666e-09, "loss": 0.2978, "step": 956 }, { "epoch": 1.9523809523809523, "grad_norm": 0.17050714790821075, "learning_rate": 2.1866402146875077e-09, "loss": 0.2408, "step": 957 }, { "epoch": 1.9544513457556936, "grad_norm": 0.2743813097476959, "learning_rate": 1.7277421573608234e-09, "loss": 0.3852, "step": 958 }, { "epoch": 1.9565217391304348, "grad_norm": 0.20148064196109772, "learning_rate": 1.3228204457299555e-09, "loss": 0.2587, "step": 959 }, { "epoch": 1.958592132505176, "grad_norm": 0.2194848358631134, "learning_rate": 9.71879452545399e-10, "loss": 0.2841, "step": 960 }, { "epoch": 1.960662525879917, "grad_norm": 0.2698388993740082, "learning_rate": 6.749229676183965e-10, "loss": 0.4823, "step": 961 }, { "epoch": 1.9627329192546585, "grad_norm": 0.23014101386070251, "learning_rate": 4.3195419778319095e-10, "loss": 0.3208, "step": 962 }, { "epoch": 1.9648033126293996, "grad_norm": 0.16867674887180328, "learning_rate": 2.429757668587218e-10, "loss": 0.1936, "step": 963 }, { "epoch": 1.9668737060041408, "grad_norm": 0.18036042153835297, "learning_rate": 1.0798971562364647e-10, "loss": 0.2331, "step": 964 }, { "epoch": 1.968944099378882, "grad_norm": 0.1979045867919922, "learning_rate": 2.6997501792469515e-11, "loss": 0.286, "step": 965 }, { "epoch": 1.971014492753623, "grad_norm": 0.1963033676147461, "learning_rate": 0.0, "loss": 0.2669, "step": 966 } ], "logging_steps": 1, "max_steps": 966, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 242, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 5.343597735784219e+18, "train_batch_size": 1, "trial_name": null, "trial_params": null }