{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.980132450331126, "eval_steps": 500, "global_step": 225, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.013245033112582781, "grad_norm": 0.8096176088552035, "learning_rate": 8.695652173913044e-06, "loss": 1.2541, "step": 1 }, { "epoch": 0.026490066225165563, "grad_norm": 0.8050822017472643, "learning_rate": 1.739130434782609e-05, "loss": 1.227, "step": 2 }, { "epoch": 0.039735099337748346, "grad_norm": 0.7944772711887119, "learning_rate": 2.608695652173913e-05, "loss": 1.2415, "step": 3 }, { "epoch": 0.052980132450331126, "grad_norm": 0.7598134845438774, "learning_rate": 3.478260869565218e-05, "loss": 1.1949, "step": 4 }, { "epoch": 0.06622516556291391, "grad_norm": 0.7683127560022982, "learning_rate": 4.347826086956522e-05, "loss": 1.2093, "step": 5 }, { "epoch": 0.07947019867549669, "grad_norm": 0.5344525931760804, "learning_rate": 5.217391304347826e-05, "loss": 1.1036, "step": 6 }, { "epoch": 0.09271523178807947, "grad_norm": 0.4587044664340658, "learning_rate": 6.086956521739131e-05, "loss": 1.0166, "step": 7 }, { "epoch": 0.10596026490066225, "grad_norm": 0.4868625164917359, "learning_rate": 6.956521739130436e-05, "loss": 0.955, "step": 8 }, { "epoch": 0.11920529801324503, "grad_norm": 0.5418471125188639, "learning_rate": 7.82608695652174e-05, "loss": 0.8997, "step": 9 }, { "epoch": 0.13245033112582782, "grad_norm": 0.5223349521251892, "learning_rate": 8.695652173913044e-05, "loss": 0.8113, "step": 10 }, { "epoch": 0.1456953642384106, "grad_norm": 0.4786982568033246, "learning_rate": 9.565217391304348e-05, "loss": 0.7325, "step": 11 }, { "epoch": 0.15894039735099338, "grad_norm": 0.46957216029807536, "learning_rate": 0.00010434782608695653, "loss": 0.6606, "step": 12 }, { "epoch": 0.17218543046357615, "grad_norm": 0.38029367288689914, "learning_rate": 0.00011304347826086956, "loss": 0.5808, "step": 13 }, { "epoch": 0.18543046357615894, "grad_norm": 0.24720582418095602, "learning_rate": 0.00012173913043478263, "loss": 0.5613, "step": 14 }, { "epoch": 0.1986754966887417, "grad_norm": 0.23099067802861695, "learning_rate": 0.00013043478260869567, "loss": 0.5391, "step": 15 }, { "epoch": 0.2119205298013245, "grad_norm": 0.20957820248410008, "learning_rate": 0.0001391304347826087, "loss": 0.539, "step": 16 }, { "epoch": 0.2251655629139073, "grad_norm": 0.21711931182463448, "learning_rate": 0.00014782608695652173, "loss": 0.5268, "step": 17 }, { "epoch": 0.23841059602649006, "grad_norm": 0.1951790595421549, "learning_rate": 0.0001565217391304348, "loss": 0.4963, "step": 18 }, { "epoch": 0.25165562913907286, "grad_norm": 0.1826409685431601, "learning_rate": 0.00016521739130434784, "loss": 0.4952, "step": 19 }, { "epoch": 0.26490066225165565, "grad_norm": 0.14373385619543355, "learning_rate": 0.00017391304347826088, "loss": 0.4837, "step": 20 }, { "epoch": 0.2781456953642384, "grad_norm": 0.12173908533781636, "learning_rate": 0.00018260869565217392, "loss": 0.4634, "step": 21 }, { "epoch": 0.2913907284768212, "grad_norm": 0.12297735060498352, "learning_rate": 0.00019130434782608697, "loss": 0.4573, "step": 22 }, { "epoch": 0.304635761589404, "grad_norm": 0.10994270746188307, "learning_rate": 0.0002, "loss": 0.4683, "step": 23 }, { "epoch": 0.31788079470198677, "grad_norm": 0.11351044281096902, "learning_rate": 0.00019998790632601496, "loss": 0.4322, "step": 24 }, { "epoch": 0.33112582781456956, "grad_norm": 0.11243087776192183, "learning_rate": 0.00019995162822919883, "loss": 0.4516, "step": 25 }, { "epoch": 0.3443708609271523, "grad_norm": 0.11510175208476785, "learning_rate": 0.00019989117448426108, "loss": 0.4499, "step": 26 }, { "epoch": 0.3576158940397351, "grad_norm": 0.11693433753737806, "learning_rate": 0.00019980655971335945, "loss": 0.4542, "step": 27 }, { "epoch": 0.3708609271523179, "grad_norm": 0.11467246423231502, "learning_rate": 0.00019969780438256293, "loss": 0.4337, "step": 28 }, { "epoch": 0.3841059602649007, "grad_norm": 0.11115653137915112, "learning_rate": 0.0001995649347969019, "loss": 0.4263, "step": 29 }, { "epoch": 0.3973509933774834, "grad_norm": 0.11024786542483019, "learning_rate": 0.00019940798309400526, "loss": 0.4342, "step": 30 }, { "epoch": 0.4105960264900662, "grad_norm": 0.10312580553142063, "learning_rate": 0.00019922698723632767, "loss": 0.4267, "step": 31 }, { "epoch": 0.423841059602649, "grad_norm": 0.11074151337400631, "learning_rate": 0.00019902199100196697, "loss": 0.4286, "step": 32 }, { "epoch": 0.4370860927152318, "grad_norm": 0.09029943151079976, "learning_rate": 0.0001987930439740757, "loss": 0.4152, "step": 33 }, { "epoch": 0.4503311258278146, "grad_norm": 0.09101826700354056, "learning_rate": 0.00019854020152886814, "loss": 0.4313, "step": 34 }, { "epoch": 0.46357615894039733, "grad_norm": 0.0914630983642065, "learning_rate": 0.00019826352482222638, "loss": 0.4117, "step": 35 }, { "epoch": 0.4768211920529801, "grad_norm": 0.09219697877770537, "learning_rate": 0.00019796308077490817, "loss": 0.4175, "step": 36 }, { "epoch": 0.4900662251655629, "grad_norm": 0.08852002864296264, "learning_rate": 0.00019763894205636072, "loss": 0.4041, "step": 37 }, { "epoch": 0.5033112582781457, "grad_norm": 0.08580676378486166, "learning_rate": 0.00019729118706714375, "loss": 0.404, "step": 38 }, { "epoch": 0.5165562913907285, "grad_norm": 0.08598698501328113, "learning_rate": 0.00019691989991996663, "loss": 0.4087, "step": 39 }, { "epoch": 0.5298013245033113, "grad_norm": 0.08961053716539952, "learning_rate": 0.00019652517041934356, "loss": 0.4014, "step": 40 }, { "epoch": 0.543046357615894, "grad_norm": 0.08443482401797175, "learning_rate": 0.00019610709403987246, "loss": 0.4137, "step": 41 }, { "epoch": 0.5562913907284768, "grad_norm": 0.08466021640310874, "learning_rate": 0.00019566577190314197, "loss": 0.4071, "step": 42 }, { "epoch": 0.5695364238410596, "grad_norm": 0.08784527020927076, "learning_rate": 0.00019520131075327298, "loss": 0.4061, "step": 43 }, { "epoch": 0.5827814569536424, "grad_norm": 0.08325332082087357, "learning_rate": 0.00019471382293110003, "loss": 0.3957, "step": 44 }, { "epoch": 0.5960264900662252, "grad_norm": 0.08614805595781429, "learning_rate": 0.0001942034263469989, "loss": 0.4053, "step": 45 }, { "epoch": 0.609271523178808, "grad_norm": 0.07902174863469037, "learning_rate": 0.00019367024445236754, "loss": 0.3987, "step": 46 }, { "epoch": 0.6225165562913907, "grad_norm": 0.08133695710941313, "learning_rate": 0.00019311440620976597, "loss": 0.3942, "step": 47 }, { "epoch": 0.6357615894039735, "grad_norm": 0.08276360028919133, "learning_rate": 0.00019253604606172417, "loss": 0.3951, "step": 48 }, { "epoch": 0.6490066225165563, "grad_norm": 0.08194802489692825, "learning_rate": 0.00019193530389822363, "loss": 0.3917, "step": 49 }, { "epoch": 0.6622516556291391, "grad_norm": 0.08159974959706186, "learning_rate": 0.00019131232502286188, "loss": 0.3934, "step": 50 }, { "epoch": 0.6754966887417219, "grad_norm": 0.08170998905157066, "learning_rate": 0.00019066726011770726, "loss": 0.3851, "step": 51 }, { "epoch": 0.6887417218543046, "grad_norm": 0.08020907094953274, "learning_rate": 0.00019000026520685302, "loss": 0.3893, "step": 52 }, { "epoch": 0.7019867549668874, "grad_norm": 0.08034981466771474, "learning_rate": 0.00018931150161867916, "loss": 0.381, "step": 53 }, { "epoch": 0.7152317880794702, "grad_norm": 0.08444845993593682, "learning_rate": 0.00018860113594683148, "loss": 0.3915, "step": 54 }, { "epoch": 0.7284768211920529, "grad_norm": 0.08015215412606266, "learning_rate": 0.00018786934000992688, "loss": 0.3833, "step": 55 }, { "epoch": 0.7417218543046358, "grad_norm": 0.08464858931007045, "learning_rate": 0.00018711629080999504, "loss": 0.3826, "step": 56 }, { "epoch": 0.7549668874172185, "grad_norm": 0.08291520407405459, "learning_rate": 0.00018634217048966637, "loss": 0.3738, "step": 57 }, { "epoch": 0.7682119205298014, "grad_norm": 0.08660040487398858, "learning_rate": 0.0001855471662881164, "loss": 0.3856, "step": 58 }, { "epoch": 0.7814569536423841, "grad_norm": 0.0857196214995308, "learning_rate": 0.00018473147049577774, "loss": 0.3779, "step": 59 }, { "epoch": 0.7947019867549668, "grad_norm": 0.07987880371713715, "learning_rate": 0.00018389528040783012, "loss": 0.3766, "step": 60 }, { "epoch": 0.8079470198675497, "grad_norm": 0.08369440099668185, "learning_rate": 0.00018303879827647975, "loss": 0.3835, "step": 61 }, { "epoch": 0.8211920529801324, "grad_norm": 0.08373532556639413, "learning_rate": 0.00018216223126204007, "loss": 0.3745, "step": 62 }, { "epoch": 0.8344370860927153, "grad_norm": 0.08073536197157054, "learning_rate": 0.00018126579138282503, "loss": 0.3687, "step": 63 }, { "epoch": 0.847682119205298, "grad_norm": 0.08284465509601228, "learning_rate": 0.00018034969546386757, "loss": 0.3787, "step": 64 }, { "epoch": 0.8609271523178808, "grad_norm": 0.0842934427371451, "learning_rate": 0.00017941416508447536, "loss": 0.3873, "step": 65 }, { "epoch": 0.8741721854304636, "grad_norm": 0.08355593713327628, "learning_rate": 0.0001784594265246366, "loss": 0.3778, "step": 66 }, { "epoch": 0.8874172185430463, "grad_norm": 0.08950539941436171, "learning_rate": 0.000177485710710289, "loss": 0.3727, "step": 67 }, { "epoch": 0.9006622516556292, "grad_norm": 0.08710263548451828, "learning_rate": 0.00017649325315746478, "loss": 0.3808, "step": 68 }, { "epoch": 0.9139072847682119, "grad_norm": 0.0887614198652171, "learning_rate": 0.00017548229391532572, "loss": 0.3789, "step": 69 }, { "epoch": 0.9271523178807947, "grad_norm": 0.08666661250569707, "learning_rate": 0.0001744530775081015, "loss": 0.3732, "step": 70 }, { "epoch": 0.9403973509933775, "grad_norm": 0.0849525268450149, "learning_rate": 0.00017340585287594604, "loss": 0.3712, "step": 71 }, { "epoch": 0.9536423841059603, "grad_norm": 0.08625788315304235, "learning_rate": 0.00017234087331472497, "loss": 0.3597, "step": 72 }, { "epoch": 0.9668874172185431, "grad_norm": 0.07851130512605926, "learning_rate": 0.00017125839641475072, "loss": 0.3639, "step": 73 }, { "epoch": 0.9801324503311258, "grad_norm": 0.08964240238751611, "learning_rate": 0.00017015868399847768, "loss": 0.3844, "step": 74 }, { "epoch": 0.9933774834437086, "grad_norm": 0.08516340365396252, "learning_rate": 0.0001690420020571747, "loss": 0.372, "step": 75 }, { "epoch": 0.9933774834437086, "eval_loss": 0.3703567683696747, "eval_runtime": 46.123, "eval_samples_per_second": 21.941, "eval_steps_per_second": 0.694, "step": 75 }, { "epoch": 1.0066225165562914, "grad_norm": 0.07944382362889917, "learning_rate": 0.0001679086206865886, "loss": 0.3697, "step": 76 }, { "epoch": 1.0198675496688743, "grad_norm": 0.08265930361903498, "learning_rate": 0.00016675881402161536, "loss": 0.3551, "step": 77 }, { "epoch": 1.033112582781457, "grad_norm": 0.08703614399996357, "learning_rate": 0.000165592860169994, "loss": 0.3442, "step": 78 }, { "epoch": 1.0463576158940397, "grad_norm": 0.08916319509375828, "learning_rate": 0.0001644110411450398, "loss": 0.365, "step": 79 }, { "epoch": 1.0596026490066226, "grad_norm": 0.08703848127871557, "learning_rate": 0.00016321364279743266, "loss": 0.3611, "step": 80 }, { "epoch": 1.0728476821192052, "grad_norm": 0.09052558000694078, "learning_rate": 0.00016200095474607753, "loss": 0.3615, "step": 81 }, { "epoch": 1.086092715231788, "grad_norm": 0.08918100371610707, "learning_rate": 0.0001607732703080532, "loss": 0.342, "step": 82 }, { "epoch": 1.099337748344371, "grad_norm": 0.08576575268439565, "learning_rate": 0.0001595308864276666, "loss": 0.3598, "step": 83 }, { "epoch": 1.1125827814569536, "grad_norm": 0.08585017464402006, "learning_rate": 0.0001582741036046301, "loss": 0.3504, "step": 84 }, { "epoch": 1.1258278145695364, "grad_norm": 0.08593452414859805, "learning_rate": 0.00015700322582137827, "loss": 0.3432, "step": 85 }, { "epoch": 1.1390728476821192, "grad_norm": 0.08731970510720415, "learning_rate": 0.00015571856046954285, "loss": 0.3457, "step": 86 }, { "epoch": 1.152317880794702, "grad_norm": 0.0921843418842424, "learning_rate": 0.00015442041827560274, "loss": 0.3507, "step": 87 }, { "epoch": 1.1655629139072847, "grad_norm": 0.09651961400159455, "learning_rate": 0.00015310911322572753, "loss": 0.3596, "step": 88 }, { "epoch": 1.1788079470198676, "grad_norm": 0.08524005048376013, "learning_rate": 0.00015178496248983254, "loss": 0.3554, "step": 89 }, { "epoch": 1.1920529801324504, "grad_norm": 0.08859594152270273, "learning_rate": 0.000150448286344864, "loss": 0.3551, "step": 90 }, { "epoch": 1.205298013245033, "grad_norm": 0.0924808469627539, "learning_rate": 0.00014909940809733222, "loss": 0.3525, "step": 91 }, { "epoch": 1.218543046357616, "grad_norm": 0.08644059805052462, "learning_rate": 0.00014773865400511272, "loss": 0.3503, "step": 92 }, { "epoch": 1.2317880794701987, "grad_norm": 0.09131894341880005, "learning_rate": 0.00014636635319853275, "loss": 0.3571, "step": 93 }, { "epoch": 1.2450331125827814, "grad_norm": 0.08393682045402433, "learning_rate": 0.0001449828376007636, "loss": 0.3476, "step": 94 }, { "epoch": 1.2582781456953642, "grad_norm": 0.08696313045637266, "learning_rate": 0.00014358844184753712, "loss": 0.3594, "step": 95 }, { "epoch": 1.271523178807947, "grad_norm": 0.09458041630505085, "learning_rate": 0.00014218350320620624, "loss": 0.3626, "step": 96 }, { "epoch": 1.2847682119205297, "grad_norm": 0.08823303635376296, "learning_rate": 0.00014076836149416887, "loss": 0.3499, "step": 97 }, { "epoch": 1.2980132450331126, "grad_norm": 0.09294675372857181, "learning_rate": 0.00013934335899667527, "loss": 0.3539, "step": 98 }, { "epoch": 1.3112582781456954, "grad_norm": 0.08824268036877034, "learning_rate": 0.00013790884038403795, "loss": 0.3514, "step": 99 }, { "epoch": 1.3245033112582782, "grad_norm": 0.08535480262896947, "learning_rate": 0.00013646515262826552, "loss": 0.345, "step": 100 }, { "epoch": 1.3377483443708609, "grad_norm": 0.08847562725166169, "learning_rate": 0.00013501264491913906, "loss": 0.3616, "step": 101 }, { "epoch": 1.3509933774834437, "grad_norm": 0.08859058434854095, "learning_rate": 0.0001335516685797525, "loss": 0.3562, "step": 102 }, { "epoch": 1.3642384105960264, "grad_norm": 0.08715025975746184, "learning_rate": 0.00013208257698153677, "loss": 0.3455, "step": 103 }, { "epoch": 1.3774834437086092, "grad_norm": 0.0853594568437305, "learning_rate": 0.00013060572545878875, "loss": 0.346, "step": 104 }, { "epoch": 1.390728476821192, "grad_norm": 0.08722491192064814, "learning_rate": 0.00012912147122272523, "loss": 0.3555, "step": 105 }, { "epoch": 1.403973509933775, "grad_norm": 0.0871433664730764, "learning_rate": 0.00012763017327508305, "loss": 0.3556, "step": 106 }, { "epoch": 1.4172185430463577, "grad_norm": 0.08803547541904783, "learning_rate": 0.00012613219232128608, "loss": 0.3534, "step": 107 }, { "epoch": 1.4304635761589404, "grad_norm": 0.09122226233927531, "learning_rate": 0.00012462789068320017, "loss": 0.3569, "step": 108 }, { "epoch": 1.4437086092715232, "grad_norm": 0.09822341257641279, "learning_rate": 0.000123117632211497, "loss": 0.3633, "step": 109 }, { "epoch": 1.4569536423841059, "grad_norm": 0.09270090775666746, "learning_rate": 0.00012160178219764837, "loss": 0.3453, "step": 110 }, { "epoch": 1.4701986754966887, "grad_norm": 0.08925565696630358, "learning_rate": 0.00012008070728557186, "loss": 0.3508, "step": 111 }, { "epoch": 1.4834437086092715, "grad_norm": 0.09170653617303556, "learning_rate": 0.00011855477538294935, "loss": 0.3534, "step": 112 }, { "epoch": 1.4966887417218544, "grad_norm": 0.08583635619816832, "learning_rate": 0.00011702435557223987, "loss": 0.3463, "step": 113 }, { "epoch": 1.5099337748344372, "grad_norm": 0.08058809711878263, "learning_rate": 0.00011548981802140848, "loss": 0.3477, "step": 114 }, { "epoch": 1.5231788079470199, "grad_norm": 0.09093533643868798, "learning_rate": 0.00011395153389439233, "loss": 0.3512, "step": 115 }, { "epoch": 1.5364238410596025, "grad_norm": 0.09171376470501859, "learning_rate": 0.00011240987526132594, "loss": 0.3544, "step": 116 }, { "epoch": 1.5496688741721854, "grad_norm": 0.08586078909940174, "learning_rate": 0.00011086521500854745, "loss": 0.3694, "step": 117 }, { "epoch": 1.5629139072847682, "grad_norm": 0.08632019045566638, "learning_rate": 0.00010931792674840718, "loss": 0.3453, "step": 118 }, { "epoch": 1.576158940397351, "grad_norm": 0.09269094674353331, "learning_rate": 0.00010776838472890065, "loss": 0.3587, "step": 119 }, { "epoch": 1.589403973509934, "grad_norm": 0.08779002368050795, "learning_rate": 0.00010621696374314807, "loss": 0.3478, "step": 120 }, { "epoch": 1.6026490066225165, "grad_norm": 0.08586261022719192, "learning_rate": 0.00010466403903874176, "loss": 0.341, "step": 121 }, { "epoch": 1.6158940397350994, "grad_norm": 0.08611577193250892, "learning_rate": 0.0001031099862269837, "loss": 0.3558, "step": 122 }, { "epoch": 1.629139072847682, "grad_norm": 0.09316621499512412, "learning_rate": 0.0001015551811920351, "loss": 0.3541, "step": 123 }, { "epoch": 1.6423841059602649, "grad_norm": 0.08404147766450029, "learning_rate": 0.0001, "loss": 0.3489, "step": 124 }, { "epoch": 1.6556291390728477, "grad_norm": 0.08524287111150772, "learning_rate": 9.844481880796491e-05, "loss": 0.3541, "step": 125 }, { "epoch": 1.6688741721854305, "grad_norm": 0.08369196863657465, "learning_rate": 9.689001377301633e-05, "loss": 0.3421, "step": 126 }, { "epoch": 1.6821192052980134, "grad_norm": 0.08831018354579961, "learning_rate": 9.533596096125825e-05, "loss": 0.3484, "step": 127 }, { "epoch": 1.695364238410596, "grad_norm": 0.08931583825994703, "learning_rate": 9.378303625685195e-05, "loss": 0.3418, "step": 128 }, { "epoch": 1.7086092715231787, "grad_norm": 0.0920976409870365, "learning_rate": 9.223161527109937e-05, "loss": 0.3477, "step": 129 }, { "epoch": 1.7218543046357615, "grad_norm": 0.0866166191323527, "learning_rate": 9.068207325159284e-05, "loss": 0.3422, "step": 130 }, { "epoch": 1.7350993377483444, "grad_norm": 0.08394672431065998, "learning_rate": 8.913478499145254e-05, "loss": 0.337, "step": 131 }, { "epoch": 1.7483443708609272, "grad_norm": 0.08368403453651165, "learning_rate": 8.759012473867407e-05, "loss": 0.3487, "step": 132 }, { "epoch": 1.76158940397351, "grad_norm": 0.08503534775674756, "learning_rate": 8.604846610560771e-05, "loss": 0.3463, "step": 133 }, { "epoch": 1.7748344370860927, "grad_norm": 0.08495442186575057, "learning_rate": 8.451018197859153e-05, "loss": 0.3506, "step": 134 }, { "epoch": 1.7880794701986755, "grad_norm": 0.08766338307723749, "learning_rate": 8.297564442776014e-05, "loss": 0.3423, "step": 135 }, { "epoch": 1.8013245033112582, "grad_norm": 0.08162961612606438, "learning_rate": 8.144522461705067e-05, "loss": 0.3316, "step": 136 }, { "epoch": 1.814569536423841, "grad_norm": 0.08852249330426205, "learning_rate": 7.991929271442817e-05, "loss": 0.3483, "step": 137 }, { "epoch": 1.8278145695364238, "grad_norm": 0.08788889130608463, "learning_rate": 7.839821780235168e-05, "loss": 0.3554, "step": 138 }, { "epoch": 1.8410596026490067, "grad_norm": 0.08567621661342421, "learning_rate": 7.688236778850306e-05, "loss": 0.3333, "step": 139 }, { "epoch": 1.8543046357615895, "grad_norm": 0.09025227183243908, "learning_rate": 7.537210931679987e-05, "loss": 0.3461, "step": 140 }, { "epoch": 1.8675496688741722, "grad_norm": 0.0887176743957205, "learning_rate": 7.386780767871397e-05, "loss": 0.3459, "step": 141 }, { "epoch": 1.8807947019867548, "grad_norm": 0.08665996940712498, "learning_rate": 7.236982672491698e-05, "loss": 0.3539, "step": 142 }, { "epoch": 1.8940397350993377, "grad_norm": 0.08608862013105582, "learning_rate": 7.087852877727481e-05, "loss": 0.3418, "step": 143 }, { "epoch": 1.9072847682119205, "grad_norm": 0.08420947731369693, "learning_rate": 6.939427454121128e-05, "loss": 0.3385, "step": 144 }, { "epoch": 1.9205298013245033, "grad_norm": 0.08687771570570416, "learning_rate": 6.791742301846326e-05, "loss": 0.3484, "step": 145 }, { "epoch": 1.9337748344370862, "grad_norm": 0.09001811775951214, "learning_rate": 6.644833142024751e-05, "loss": 0.3482, "step": 146 }, { "epoch": 1.9470198675496688, "grad_norm": 0.08461468347282106, "learning_rate": 6.498735508086093e-05, "loss": 0.3384, "step": 147 }, { "epoch": 1.9602649006622517, "grad_norm": 0.08353611993941902, "learning_rate": 6.35348473717345e-05, "loss": 0.343, "step": 148 }, { "epoch": 1.9735099337748343, "grad_norm": 0.0834738694275141, "learning_rate": 6.209115961596208e-05, "loss": 0.3431, "step": 149 }, { "epoch": 1.9867549668874172, "grad_norm": 0.08599845820919347, "learning_rate": 6.065664100332478e-05, "loss": 0.3381, "step": 150 }, { "epoch": 2.0, "grad_norm": 0.08781968968497832, "learning_rate": 5.923163850583113e-05, "loss": 0.3361, "step": 151 }, { "epoch": 2.0, "eval_loss": 0.35156726837158203, "eval_runtime": 38.8035, "eval_samples_per_second": 26.08, "eval_steps_per_second": 0.825, "step": 151 }, { "epoch": 2.013245033112583, "grad_norm": 0.08189131042429836, "learning_rate": 5.781649679379378e-05, "loss": 0.3168, "step": 152 }, { "epoch": 2.0264900662251657, "grad_norm": 0.08590965338671859, "learning_rate": 5.6411558152462894e-05, "loss": 0.3327, "step": 153 }, { "epoch": 2.0397350993377485, "grad_norm": 0.08632653329140866, "learning_rate": 5.501716239923642e-05, "loss": 0.331, "step": 154 }, { "epoch": 2.052980132450331, "grad_norm": 0.08516842826462703, "learning_rate": 5.363364680146725e-05, "loss": 0.3306, "step": 155 }, { "epoch": 2.066225165562914, "grad_norm": 0.08496401039658237, "learning_rate": 5.226134599488728e-05, "loss": 0.3248, "step": 156 }, { "epoch": 2.0794701986754967, "grad_norm": 0.08826525390483432, "learning_rate": 5.090059190266779e-05, "loss": 0.3308, "step": 157 }, { "epoch": 2.0927152317880795, "grad_norm": 0.08487280637626197, "learning_rate": 4.955171365513603e-05, "loss": 0.3211, "step": 158 }, { "epoch": 2.1059602649006623, "grad_norm": 0.09382764910639449, "learning_rate": 4.821503751016746e-05, "loss": 0.3354, "step": 159 }, { "epoch": 2.119205298013245, "grad_norm": 0.08732672940741114, "learning_rate": 4.689088677427249e-05, "loss": 0.3315, "step": 160 }, { "epoch": 2.1324503311258276, "grad_norm": 0.09541697755263766, "learning_rate": 4.5579581724397255e-05, "loss": 0.3373, "step": 161 }, { "epoch": 2.1456953642384105, "grad_norm": 0.08867554361971618, "learning_rate": 4.428143953045717e-05, "loss": 0.3383, "step": 162 }, { "epoch": 2.1589403973509933, "grad_norm": 0.09288456090060858, "learning_rate": 4.2996774178621736e-05, "loss": 0.331, "step": 163 }, { "epoch": 2.172185430463576, "grad_norm": 0.08808813047917079, "learning_rate": 4.172589639536991e-05, "loss": 0.3223, "step": 164 }, { "epoch": 2.185430463576159, "grad_norm": 0.09275105554751231, "learning_rate": 4.046911357233343e-05, "loss": 0.3301, "step": 165 }, { "epoch": 2.198675496688742, "grad_norm": 0.09353735027294084, "learning_rate": 3.922672969194686e-05, "loss": 0.3295, "step": 166 }, { "epoch": 2.2119205298013247, "grad_norm": 0.09234588799290942, "learning_rate": 3.79990452539225e-05, "loss": 0.3214, "step": 167 }, { "epoch": 2.225165562913907, "grad_norm": 0.09179773375765557, "learning_rate": 3.678635720256737e-05, "loss": 0.3241, "step": 168 }, { "epoch": 2.23841059602649, "grad_norm": 0.08971692725792768, "learning_rate": 3.558895885496023e-05, "loss": 0.3175, "step": 169 }, { "epoch": 2.251655629139073, "grad_norm": 0.08939100980866099, "learning_rate": 3.440713983000601e-05, "loss": 0.3252, "step": 170 }, { "epoch": 2.2649006622516556, "grad_norm": 0.09306831321980909, "learning_rate": 3.324118597838464e-05, "loss": 0.3225, "step": 171 }, { "epoch": 2.2781456953642385, "grad_norm": 0.09091774211009096, "learning_rate": 3.209137931341143e-05, "loss": 0.3215, "step": 172 }, { "epoch": 2.2913907284768213, "grad_norm": 0.08998835153295978, "learning_rate": 3.0957997942825336e-05, "loss": 0.3332, "step": 173 }, { "epoch": 2.304635761589404, "grad_norm": 0.08999871518726542, "learning_rate": 2.9841316001522347e-05, "loss": 0.3265, "step": 174 }, { "epoch": 2.3178807947019866, "grad_norm": 0.08874688997641272, "learning_rate": 2.874160358524931e-05, "loss": 0.328, "step": 175 }, { "epoch": 2.3311258278145695, "grad_norm": 0.08979245895359222, "learning_rate": 2.7659126685275027e-05, "loss": 0.3288, "step": 176 }, { "epoch": 2.3443708609271523, "grad_norm": 0.09322170086883196, "learning_rate": 2.659414712405398e-05, "loss": 0.3264, "step": 177 }, { "epoch": 2.357615894039735, "grad_norm": 0.0873785964065595, "learning_rate": 2.5546922491898495e-05, "loss": 0.3283, "step": 178 }, { "epoch": 2.370860927152318, "grad_norm": 0.09137697607964013, "learning_rate": 2.451770608467432e-05, "loss": 0.3265, "step": 179 }, { "epoch": 2.384105960264901, "grad_norm": 0.08934971281847022, "learning_rate": 2.3506746842535242e-05, "loss": 0.3197, "step": 180 }, { "epoch": 2.3973509933774833, "grad_norm": 0.09226380851297578, "learning_rate": 2.251428928971102e-05, "loss": 0.3303, "step": 181 }, { "epoch": 2.410596026490066, "grad_norm": 0.08813038828978075, "learning_rate": 2.1540573475363402e-05, "loss": 0.3147, "step": 182 }, { "epoch": 2.423841059602649, "grad_norm": 0.09148478249319783, "learning_rate": 2.058583491552465e-05, "loss": 0.3304, "step": 183 }, { "epoch": 2.437086092715232, "grad_norm": 0.08970976007155415, "learning_rate": 1.9650304536132426e-05, "loss": 0.3142, "step": 184 }, { "epoch": 2.4503311258278146, "grad_norm": 0.0914061480835884, "learning_rate": 1.8734208617174988e-05, "loss": 0.3332, "step": 185 }, { "epoch": 2.4635761589403975, "grad_norm": 0.09223482668849642, "learning_rate": 1.783776873795994e-05, "loss": 0.3235, "step": 186 }, { "epoch": 2.47682119205298, "grad_norm": 0.09218058790384615, "learning_rate": 1.696120172352025e-05, "loss": 0.3281, "step": 187 }, { "epoch": 2.4900662251655628, "grad_norm": 0.09120288324314661, "learning_rate": 1.6104719592169902e-05, "loss": 0.323, "step": 188 }, { "epoch": 2.5033112582781456, "grad_norm": 0.09425838170079778, "learning_rate": 1.526852950422226e-05, "loss": 0.3214, "step": 189 }, { "epoch": 2.5165562913907285, "grad_norm": 0.09259911612664488, "learning_rate": 1.4452833711883628e-05, "loss": 0.3172, "step": 190 }, { "epoch": 2.5298013245033113, "grad_norm": 0.08967866399346999, "learning_rate": 1.3657829510333654e-05, "loss": 0.314, "step": 191 }, { "epoch": 2.543046357615894, "grad_norm": 0.09263981141490185, "learning_rate": 1.2883709190004955e-05, "loss": 0.3306, "step": 192 }, { "epoch": 2.556291390728477, "grad_norm": 0.0924041757651034, "learning_rate": 1.2130659990073146e-05, "loss": 0.3238, "step": 193 }, { "epoch": 2.5695364238410594, "grad_norm": 0.08680414784000516, "learning_rate": 1.1398864053168534e-05, "loss": 0.3172, "step": 194 }, { "epoch": 2.5827814569536423, "grad_norm": 0.08927214818010673, "learning_rate": 1.0688498381320855e-05, "loss": 0.3148, "step": 195 }, { "epoch": 2.596026490066225, "grad_norm": 0.09039528377033235, "learning_rate": 9.999734793146998e-06, "loss": 0.3212, "step": 196 }, { "epoch": 2.609271523178808, "grad_norm": 0.08907654916187858, "learning_rate": 9.332739882292752e-06, "loss": 0.3124, "step": 197 }, { "epoch": 2.622516556291391, "grad_norm": 0.09035973348094353, "learning_rate": 8.687674977138116e-06, "loss": 0.3246, "step": 198 }, { "epoch": 2.6357615894039736, "grad_norm": 0.08737713823497803, "learning_rate": 8.064696101776358e-06, "loss": 0.3143, "step": 199 }, { "epoch": 2.6490066225165565, "grad_norm": 0.08814135175802748, "learning_rate": 7.463953938275858e-06, "loss": 0.3094, "step": 200 }, { "epoch": 2.662251655629139, "grad_norm": 0.08889240634697596, "learning_rate": 6.8855937902340576e-06, "loss": 0.3214, "step": 201 }, { "epoch": 2.6754966887417218, "grad_norm": 0.09012485234682949, "learning_rate": 6.329755547632499e-06, "loss": 0.3169, "step": 202 }, { "epoch": 2.6887417218543046, "grad_norm": 0.09076602960863962, "learning_rate": 5.7965736530010916e-06, "loss": 0.3218, "step": 203 }, { "epoch": 2.7019867549668874, "grad_norm": 0.09128692637997875, "learning_rate": 5.286177068899989e-06, "loss": 0.3224, "step": 204 }, { "epoch": 2.7152317880794703, "grad_norm": 0.08980696390068593, "learning_rate": 4.798689246727006e-06, "loss": 0.3255, "step": 205 }, { "epoch": 2.7284768211920527, "grad_norm": 0.08721555286082, "learning_rate": 4.3342280968580285e-06, "loss": 0.3056, "step": 206 }, { "epoch": 2.741721854304636, "grad_norm": 0.09013962844918878, "learning_rate": 3.892905960127546e-06, "loss": 0.3198, "step": 207 }, { "epoch": 2.7549668874172184, "grad_norm": 0.09102568370124482, "learning_rate": 3.4748295806564356e-06, "loss": 0.3192, "step": 208 }, { "epoch": 2.7682119205298013, "grad_norm": 0.09384836363080047, "learning_rate": 3.0801000800333877e-06, "loss": 0.3269, "step": 209 }, { "epoch": 2.781456953642384, "grad_norm": 0.09126268422899254, "learning_rate": 2.708812932856253e-06, "loss": 0.3302, "step": 210 }, { "epoch": 2.794701986754967, "grad_norm": 0.08781813338797502, "learning_rate": 2.3610579436393e-06, "loss": 0.3272, "step": 211 }, { "epoch": 2.80794701986755, "grad_norm": 0.09110065248669541, "learning_rate": 2.036919225091827e-06, "loss": 0.3206, "step": 212 }, { "epoch": 2.821192052980132, "grad_norm": 0.09086421544518553, "learning_rate": 1.7364751777736332e-06, "loss": 0.3245, "step": 213 }, { "epoch": 2.8344370860927155, "grad_norm": 0.08855581117736014, "learning_rate": 1.459798471131868e-06, "loss": 0.3118, "step": 214 }, { "epoch": 2.847682119205298, "grad_norm": 0.08936995804191887, "learning_rate": 1.2069560259243328e-06, "loss": 0.3215, "step": 215 }, { "epoch": 2.8609271523178808, "grad_norm": 0.0921595910113618, "learning_rate": 9.780089980330642e-07, "loss": 0.3174, "step": 216 }, { "epoch": 2.8741721854304636, "grad_norm": 0.08711718437070236, "learning_rate": 7.730127636723539e-07, "loss": 0.3177, "step": 217 }, { "epoch": 2.8874172185430464, "grad_norm": 0.09131775721484407, "learning_rate": 5.920169059947411e-07, "loss": 0.3232, "step": 218 }, { "epoch": 2.9006622516556293, "grad_norm": 0.08947994407470564, "learning_rate": 4.3506520309813947e-07, "loss": 0.3204, "step": 219 }, { "epoch": 2.9139072847682117, "grad_norm": 0.08743216843583222, "learning_rate": 3.0219561743707326e-07, "loss": 0.3231, "step": 220 }, { "epoch": 2.9271523178807946, "grad_norm": 0.09204563273581286, "learning_rate": 1.9344028664056713e-07, "loss": 0.3206, "step": 221 }, { "epoch": 2.9403973509933774, "grad_norm": 0.08928755161531188, "learning_rate": 1.0882551573891953e-07, "loss": 0.3258, "step": 222 }, { "epoch": 2.9536423841059603, "grad_norm": 0.09055680073868443, "learning_rate": 4.837177080119215e-08, "loss": 0.3207, "step": 223 }, { "epoch": 2.966887417218543, "grad_norm": 0.0882029082304654, "learning_rate": 1.209367398504746e-08, "loss": 0.314, "step": 224 }, { "epoch": 2.980132450331126, "grad_norm": 0.09307741342290024, "learning_rate": 0.0, "loss": 0.3346, "step": 225 }, { "epoch": 2.980132450331126, "eval_loss": 0.3478808104991913, "eval_runtime": 37.4367, "eval_samples_per_second": 27.032, "eval_steps_per_second": 0.855, "step": 225 }, { "epoch": 2.980132450331126, "step": 225, "total_flos": 1.002324572158034e+17, "train_loss": 0.3962253777186076, "train_runtime": 3220.2895, "train_samples_per_second": 8.951, "train_steps_per_second": 0.07 } ], "logging_steps": 1, "max_steps": 225, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.002324572158034e+17, "train_batch_size": 8, "trial_name": null, "trial_params": null }