{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.9381931732529343, "eval_steps": 500, "global_step": 4500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.004307095940562076, "grad_norm": 9.035698890686035, "learning_rate": 5.730659025787966e-07, "loss": 1.2789, "step": 10 }, { "epoch": 0.008614191881124151, "grad_norm": 6.42362642288208, "learning_rate": 1.1461318051575932e-06, "loss": 1.1797, "step": 20 }, { "epoch": 0.012921287821686228, "grad_norm": 1.6813161373138428, "learning_rate": 1.7191977077363897e-06, "loss": 0.9736, "step": 30 }, { "epoch": 0.017228383762248303, "grad_norm": 0.9371042251586914, "learning_rate": 2.2922636103151864e-06, "loss": 0.871, "step": 40 }, { "epoch": 0.02153547970281038, "grad_norm": 0.7695503234863281, "learning_rate": 2.865329512893983e-06, "loss": 0.8025, "step": 50 }, { "epoch": 0.025842575643372456, "grad_norm": 0.716374397277832, "learning_rate": 3.4383954154727795e-06, "loss": 0.7698, "step": 60 }, { "epoch": 0.030149671583934532, "grad_norm": 0.6864265203475952, "learning_rate": 4.011461318051576e-06, "loss": 0.7701, "step": 70 }, { "epoch": 0.034456767524496605, "grad_norm": 0.7581704258918762, "learning_rate": 4.584527220630373e-06, "loss": 0.7619, "step": 80 }, { "epoch": 0.03876386346505868, "grad_norm": 0.6616791486740112, "learning_rate": 5.157593123209169e-06, "loss": 0.7296, "step": 90 }, { "epoch": 0.04307095940562076, "grad_norm": 0.6397051811218262, "learning_rate": 5.730659025787966e-06, "loss": 0.7453, "step": 100 }, { "epoch": 0.047378055346182835, "grad_norm": 0.6572911143302917, "learning_rate": 6.303724928366762e-06, "loss": 0.767, "step": 110 }, { "epoch": 0.05168515128674491, "grad_norm": 0.669222354888916, "learning_rate": 6.876790830945559e-06, "loss": 0.7369, "step": 120 }, { "epoch": 0.05599224722730699, "grad_norm": 0.6517964601516724, "learning_rate": 7.449856733524356e-06, "loss": 0.7186, "step": 130 }, { "epoch": 0.060299343167869064, "grad_norm": 0.6209223866462708, "learning_rate": 8.022922636103152e-06, "loss": 0.7155, "step": 140 }, { "epoch": 0.06460643910843114, "grad_norm": 0.6591508388519287, "learning_rate": 8.595988538681949e-06, "loss": 0.7289, "step": 150 }, { "epoch": 0.06891353504899321, "grad_norm": 0.5842370390892029, "learning_rate": 9.169054441260746e-06, "loss": 0.7183, "step": 160 }, { "epoch": 0.0732206309895553, "grad_norm": 0.7117204070091248, "learning_rate": 9.742120343839543e-06, "loss": 0.7192, "step": 170 }, { "epoch": 0.07752772693011736, "grad_norm": 0.6163178086280823, "learning_rate": 1.0315186246418338e-05, "loss": 0.7193, "step": 180 }, { "epoch": 0.08183482287067945, "grad_norm": 0.5932906270027161, "learning_rate": 1.0888252148997137e-05, "loss": 0.714, "step": 190 }, { "epoch": 0.08614191881124152, "grad_norm": 0.5982919335365295, "learning_rate": 1.1461318051575932e-05, "loss": 0.7058, "step": 200 }, { "epoch": 0.0904490147518036, "grad_norm": 0.6208463907241821, "learning_rate": 1.2034383954154729e-05, "loss": 0.7189, "step": 210 }, { "epoch": 0.09475611069236567, "grad_norm": 0.5887411236763, "learning_rate": 1.2607449856733524e-05, "loss": 0.7249, "step": 220 }, { "epoch": 0.09906320663292775, "grad_norm": 0.5963988900184631, "learning_rate": 1.3180515759312323e-05, "loss": 0.7293, "step": 230 }, { "epoch": 0.10337030257348982, "grad_norm": 0.5715692043304443, "learning_rate": 1.3753581661891118e-05, "loss": 0.6845, "step": 240 }, { "epoch": 0.1076773985140519, "grad_norm": 
0.639398455619812, "learning_rate": 1.4326647564469915e-05, "loss": 0.6994, "step": 250 }, { "epoch": 0.11198449445461398, "grad_norm": 0.6884477734565735, "learning_rate": 1.4899713467048712e-05, "loss": 0.7126, "step": 260 }, { "epoch": 0.11629159039517606, "grad_norm": 0.6021578907966614, "learning_rate": 1.5472779369627507e-05, "loss": 0.7215, "step": 270 }, { "epoch": 0.12059868633573813, "grad_norm": 0.6716468930244446, "learning_rate": 1.6045845272206304e-05, "loss": 0.6969, "step": 280 }, { "epoch": 0.1249057822763002, "grad_norm": 0.5783571600914001, "learning_rate": 1.66189111747851e-05, "loss": 0.7111, "step": 290 }, { "epoch": 0.12921287821686228, "grad_norm": 0.5546681880950928, "learning_rate": 1.7191977077363898e-05, "loss": 0.7, "step": 300 }, { "epoch": 0.13351997415742436, "grad_norm": 0.5409330129623413, "learning_rate": 1.7765042979942695e-05, "loss": 0.696, "step": 310 }, { "epoch": 0.13782707009798642, "grad_norm": 0.5752865672111511, "learning_rate": 1.833810888252149e-05, "loss": 0.6883, "step": 320 }, { "epoch": 0.1421341660385485, "grad_norm": 0.6340565085411072, "learning_rate": 1.891117478510029e-05, "loss": 0.6881, "step": 330 }, { "epoch": 0.1464412619791106, "grad_norm": 0.5298891067504883, "learning_rate": 1.9484240687679085e-05, "loss": 0.6935, "step": 340 }, { "epoch": 0.15074835791967267, "grad_norm": 0.5659753680229187, "learning_rate": 1.9999998871916207e-05, "loss": 0.7103, "step": 350 }, { "epoch": 0.15505545386023473, "grad_norm": 0.6017744541168213, "learning_rate": 1.999986350216883e-05, "loss": 0.6855, "step": 360 }, { "epoch": 0.1593625498007968, "grad_norm": 0.5426760911941528, "learning_rate": 1.999950251916212e-05, "loss": 0.6914, "step": 370 }, { "epoch": 0.1636696457413589, "grad_norm": 0.5532637238502502, "learning_rate": 1.999891593104044e-05, "loss": 0.6895, "step": 380 }, { "epoch": 0.16797674168192098, "grad_norm": 0.5581168532371521, "learning_rate": 1.9998103751038177e-05, "loss": 0.6897, "step": 390 }, { "epoch": 0.17228383762248303, "grad_norm": 0.5208210945129395, "learning_rate": 1.9997065997479442e-05, "loss": 0.6889, "step": 400 }, { "epoch": 0.17659093356304512, "grad_norm": 0.5863595604896545, "learning_rate": 1.9995802693777644e-05, "loss": 0.6905, "step": 410 }, { "epoch": 0.1808980295036072, "grad_norm": 0.5605342984199524, "learning_rate": 1.9994313868434988e-05, "loss": 0.6815, "step": 420 }, { "epoch": 0.18520512544416926, "grad_norm": 0.5580301880836487, "learning_rate": 1.9992599555041798e-05, "loss": 0.7067, "step": 430 }, { "epoch": 0.18951222138473134, "grad_norm": 0.558312177658081, "learning_rate": 1.999065979227579e-05, "loss": 0.7061, "step": 440 }, { "epoch": 0.19381931732529342, "grad_norm": 0.5273975133895874, "learning_rate": 1.998849462390118e-05, "loss": 0.6905, "step": 450 }, { "epoch": 0.1981264132658555, "grad_norm": 0.4772217571735382, "learning_rate": 1.9986104098767703e-05, "loss": 0.686, "step": 460 }, { "epoch": 0.20243350920641756, "grad_norm": 0.5336763858795166, "learning_rate": 1.9983488270809515e-05, "loss": 0.6861, "step": 470 }, { "epoch": 0.20674060514697964, "grad_norm": 0.4961983859539032, "learning_rate": 1.9980647199043966e-05, "loss": 0.6882, "step": 480 }, { "epoch": 0.21104770108754173, "grad_norm": 0.5408128499984741, "learning_rate": 1.9977580947570275e-05, "loss": 0.7001, "step": 490 }, { "epoch": 0.2153547970281038, "grad_norm": 0.5350680351257324, "learning_rate": 1.997428958556809e-05, "loss": 0.6931, "step": 500 }, { "epoch": 0.21966189296866587, "grad_norm": 
0.5455281734466553, "learning_rate": 1.9970773187295917e-05, "loss": 0.6919, "step": 510 }, { "epoch": 0.22396898890922795, "grad_norm": 0.524664580821991, "learning_rate": 1.9967031832089438e-05, "loss": 0.6738, "step": 520 }, { "epoch": 0.22827608484979003, "grad_norm": 0.48598727583885193, "learning_rate": 1.9963065604359746e-05, "loss": 0.6678, "step": 530 }, { "epoch": 0.23258318079035212, "grad_norm": 0.5560494065284729, "learning_rate": 1.9958874593591418e-05, "loss": 0.694, "step": 540 }, { "epoch": 0.23689027673091417, "grad_norm": 0.5516777038574219, "learning_rate": 1.99544588943405e-05, "loss": 0.6715, "step": 550 }, { "epoch": 0.24119737267147626, "grad_norm": 0.5097941756248474, "learning_rate": 1.9949818606232393e-05, "loss": 0.6782, "step": 560 }, { "epoch": 0.24550446861203834, "grad_norm": 0.5353350639343262, "learning_rate": 1.9944953833959567e-05, "loss": 0.6904, "step": 570 }, { "epoch": 0.2498115645526004, "grad_norm": 0.5160298943519592, "learning_rate": 1.9939864687279237e-05, "loss": 0.6756, "step": 580 }, { "epoch": 0.2541186604931625, "grad_norm": 0.5377163887023926, "learning_rate": 1.993455128101087e-05, "loss": 0.712, "step": 590 }, { "epoch": 0.25842575643372456, "grad_norm": 0.47318100929260254, "learning_rate": 1.992901373503359e-05, "loss": 0.6648, "step": 600 }, { "epoch": 0.2627328523742866, "grad_norm": 0.4977729916572571, "learning_rate": 1.992325217428348e-05, "loss": 0.6893, "step": 610 }, { "epoch": 0.26703994831484873, "grad_norm": 0.5569038391113281, "learning_rate": 1.991726672875077e-05, "loss": 0.6876, "step": 620 }, { "epoch": 0.2713470442554108, "grad_norm": 0.544884443283081, "learning_rate": 1.9911057533476884e-05, "loss": 0.6736, "step": 630 }, { "epoch": 0.27565414019597284, "grad_norm": 0.5159808993339539, "learning_rate": 1.9904624728551417e-05, "loss": 0.674, "step": 640 }, { "epoch": 0.27996123613653495, "grad_norm": 0.48680537939071655, "learning_rate": 1.989796845910896e-05, "loss": 0.6903, "step": 650 }, { "epoch": 0.284268332077097, "grad_norm": 0.527867317199707, "learning_rate": 1.9891088875325827e-05, "loss": 0.6693, "step": 660 }, { "epoch": 0.2885754280176591, "grad_norm": 0.5441365838050842, "learning_rate": 1.988398613241666e-05, "loss": 0.6721, "step": 670 }, { "epoch": 0.2928825239582212, "grad_norm": 0.5693966150283813, "learning_rate": 1.9876660390630954e-05, "loss": 0.6684, "step": 680 }, { "epoch": 0.29718961989878323, "grad_norm": 0.5607503652572632, "learning_rate": 1.986911181524941e-05, "loss": 0.6783, "step": 690 }, { "epoch": 0.30149671583934534, "grad_norm": 0.5421719551086426, "learning_rate": 1.9861340576580225e-05, "loss": 0.6658, "step": 700 }, { "epoch": 0.3058038117799074, "grad_norm": 0.497612863779068, "learning_rate": 1.9853346849955236e-05, "loss": 0.6816, "step": 710 }, { "epoch": 0.31011090772046945, "grad_norm": 0.5503632426261902, "learning_rate": 1.984513081572598e-05, "loss": 0.6663, "step": 720 }, { "epoch": 0.31441800366103156, "grad_norm": 0.5319767594337463, "learning_rate": 1.983669265925961e-05, "loss": 0.6513, "step": 730 }, { "epoch": 0.3187250996015936, "grad_norm": 0.5350950956344604, "learning_rate": 1.9828032570934726e-05, "loss": 0.6699, "step": 740 }, { "epoch": 0.3230321955421557, "grad_norm": 0.5330127477645874, "learning_rate": 1.9819150746137067e-05, "loss": 0.6786, "step": 750 }, { "epoch": 0.3273392914827178, "grad_norm": 0.4740910232067108, "learning_rate": 1.981004738525512e-05, "loss": 0.6867, "step": 760 }, { "epoch": 0.33164638742327984, "grad_norm": 0.5131900906562805, 
"learning_rate": 1.980072269367557e-05, "loss": 0.6618, "step": 770 }, { "epoch": 0.33595348336384195, "grad_norm": 0.4712623059749603, "learning_rate": 1.97911768817787e-05, "loss": 0.6863, "step": 780 }, { "epoch": 0.340260579304404, "grad_norm": 0.5240254998207092, "learning_rate": 1.9781410164933626e-05, "loss": 0.6941, "step": 790 }, { "epoch": 0.34456767524496607, "grad_norm": 0.5192612409591675, "learning_rate": 1.9771422763493434e-05, "loss": 0.6726, "step": 800 }, { "epoch": 0.3488747711855282, "grad_norm": 0.4864448010921478, "learning_rate": 1.9761214902790217e-05, "loss": 0.6541, "step": 810 }, { "epoch": 0.35318186712609023, "grad_norm": 0.5248873829841614, "learning_rate": 1.9750786813129995e-05, "loss": 0.6713, "step": 820 }, { "epoch": 0.3574889630666523, "grad_norm": 0.5010212659835815, "learning_rate": 1.9740138729787505e-05, "loss": 0.6793, "step": 830 }, { "epoch": 0.3617960590072144, "grad_norm": 0.4966225326061249, "learning_rate": 1.9729270893000913e-05, "loss": 0.6692, "step": 840 }, { "epoch": 0.36610315494777645, "grad_norm": 0.48576685786247253, "learning_rate": 1.9718183547966366e-05, "loss": 0.6812, "step": 850 }, { "epoch": 0.3704102508883385, "grad_norm": 0.5232109427452087, "learning_rate": 1.9706876944832486e-05, "loss": 0.6567, "step": 860 }, { "epoch": 0.3747173468289006, "grad_norm": 0.4847777485847473, "learning_rate": 1.9695351338694713e-05, "loss": 0.6638, "step": 870 }, { "epoch": 0.3790244427694627, "grad_norm": 0.49412795901298523, "learning_rate": 1.9683606989589553e-05, "loss": 0.6731, "step": 880 }, { "epoch": 0.3833315387100248, "grad_norm": 0.5143546462059021, "learning_rate": 1.9671644162488716e-05, "loss": 0.6779, "step": 890 }, { "epoch": 0.38763863465058684, "grad_norm": 0.5516107082366943, "learning_rate": 1.965946312729312e-05, "loss": 0.6798, "step": 900 }, { "epoch": 0.3919457305911489, "grad_norm": 0.5140990018844604, "learning_rate": 1.9647064158826825e-05, "loss": 0.6473, "step": 910 }, { "epoch": 0.396252826531711, "grad_norm": 0.4911974370479584, "learning_rate": 1.9634447536830815e-05, "loss": 0.6565, "step": 920 }, { "epoch": 0.40055992247227307, "grad_norm": 0.4995877742767334, "learning_rate": 1.9621613545956703e-05, "loss": 0.6514, "step": 930 }, { "epoch": 0.4048670184128351, "grad_norm": 0.48752328753471375, "learning_rate": 1.9608562475760287e-05, "loss": 0.6751, "step": 940 }, { "epoch": 0.40917411435339723, "grad_norm": 0.4956004321575165, "learning_rate": 1.9595294620695036e-05, "loss": 0.6492, "step": 950 }, { "epoch": 0.4134812102939593, "grad_norm": 0.48215603828430176, "learning_rate": 1.958181028010544e-05, "loss": 0.6741, "step": 960 }, { "epoch": 0.4177883062345214, "grad_norm": 0.48835939168930054, "learning_rate": 1.9568109758220253e-05, "loss": 0.6638, "step": 970 }, { "epoch": 0.42209540217508346, "grad_norm": 0.47754788398742676, "learning_rate": 1.9554193364145635e-05, "loss": 0.6657, "step": 980 }, { "epoch": 0.4264024981156455, "grad_norm": 0.5080917477607727, "learning_rate": 1.9540061411858172e-05, "loss": 0.6675, "step": 990 }, { "epoch": 0.4307095940562076, "grad_norm": 0.4634297788143158, "learning_rate": 1.9525714220197802e-05, "loss": 0.6693, "step": 1000 }, { "epoch": 0.4350166899967697, "grad_norm": 0.4760366678237915, "learning_rate": 1.951115211286061e-05, "loss": 0.6721, "step": 1010 }, { "epoch": 0.43932378593733173, "grad_norm": 0.5227916836738586, "learning_rate": 1.9496375418391525e-05, "loss": 0.6691, "step": 1020 }, { "epoch": 0.44363088187789385, "grad_norm": 0.5157990455627441, 
"learning_rate": 1.948138447017692e-05, "loss": 0.6774, "step": 1030 }, { "epoch": 0.4479379778184559, "grad_norm": 0.49596408009529114, "learning_rate": 1.9466179606437087e-05, "loss": 0.6313, "step": 1040 }, { "epoch": 0.45224507375901796, "grad_norm": 0.47041237354278564, "learning_rate": 1.945076117021859e-05, "loss": 0.6724, "step": 1050 }, { "epoch": 0.45655216969958007, "grad_norm": 0.5206364989280701, "learning_rate": 1.9435129509386538e-05, "loss": 0.6843, "step": 1060 }, { "epoch": 0.4608592656401421, "grad_norm": 0.5067657828330994, "learning_rate": 1.9419284976616745e-05, "loss": 0.6649, "step": 1070 }, { "epoch": 0.46516636158070424, "grad_norm": 1.3445152044296265, "learning_rate": 1.9403227929387756e-05, "loss": 0.6548, "step": 1080 }, { "epoch": 0.4694734575212663, "grad_norm": 0.5465224385261536, "learning_rate": 1.93869587299728e-05, "loss": 0.6427, "step": 1090 }, { "epoch": 0.47378055346182835, "grad_norm": 0.49137911200523376, "learning_rate": 1.9370477745431587e-05, "loss": 0.6519, "step": 1100 }, { "epoch": 0.47808764940239046, "grad_norm": 0.48190736770629883, "learning_rate": 1.935378534760206e-05, "loss": 0.6615, "step": 1110 }, { "epoch": 0.4823947453429525, "grad_norm": 0.4869353771209717, "learning_rate": 1.9336881913091992e-05, "loss": 0.65, "step": 1120 }, { "epoch": 0.48670184128351457, "grad_norm": 0.4473590552806854, "learning_rate": 1.931976782327048e-05, "loss": 0.6821, "step": 1130 }, { "epoch": 0.4910089372240767, "grad_norm": 0.4703207314014435, "learning_rate": 1.9302443464259352e-05, "loss": 0.657, "step": 1140 }, { "epoch": 0.49531603316463874, "grad_norm": 0.48172295093536377, "learning_rate": 1.9284909226924457e-05, "loss": 0.6581, "step": 1150 }, { "epoch": 0.4996231291052008, "grad_norm": 0.4986841082572937, "learning_rate": 1.9267165506866835e-05, "loss": 0.664, "step": 1160 }, { "epoch": 0.5039302250457629, "grad_norm": 0.4936910569667816, "learning_rate": 1.9249212704413803e-05, "loss": 0.6409, "step": 1170 }, { "epoch": 0.508237320986325, "grad_norm": 0.48618724942207336, "learning_rate": 1.9231051224609918e-05, "loss": 0.6566, "step": 1180 }, { "epoch": 0.512544416926887, "grad_norm": 0.5300356149673462, "learning_rate": 1.921268147720784e-05, "loss": 0.6533, "step": 1190 }, { "epoch": 0.5168515128674491, "grad_norm": 0.4799743890762329, "learning_rate": 1.919410387665908e-05, "loss": 0.6677, "step": 1200 }, { "epoch": 0.5211586088080112, "grad_norm": 0.5317394137382507, "learning_rate": 1.9175318842104667e-05, "loss": 0.6464, "step": 1210 }, { "epoch": 0.5254657047485732, "grad_norm": 0.49199768900871277, "learning_rate": 1.9156326797365665e-05, "loss": 0.6655, "step": 1220 }, { "epoch": 0.5297728006891353, "grad_norm": 0.4916874170303345, "learning_rate": 1.913712817093364e-05, "loss": 0.6372, "step": 1230 }, { "epoch": 0.5340798966296975, "grad_norm": 0.48562970757484436, "learning_rate": 1.9117723395960972e-05, "loss": 0.6639, "step": 1240 }, { "epoch": 0.5383869925702595, "grad_norm": 0.5152992010116577, "learning_rate": 1.909811291025109e-05, "loss": 0.6609, "step": 1250 }, { "epoch": 0.5426940885108216, "grad_norm": 0.48352181911468506, "learning_rate": 1.907829715624859e-05, "loss": 0.6726, "step": 1260 }, { "epoch": 0.5470011844513837, "grad_norm": 0.5064017176628113, "learning_rate": 1.905827658102926e-05, "loss": 0.6698, "step": 1270 }, { "epoch": 0.5513082803919457, "grad_norm": 0.46494290232658386, "learning_rate": 1.9038051636289997e-05, "loss": 0.68, "step": 1280 }, { "epoch": 0.5556153763325078, "grad_norm": 
0.4788792133331299, "learning_rate": 1.9017622778338585e-05, "loss": 0.6501, "step": 1290 }, { "epoch": 0.5599224722730699, "grad_norm": 0.4712987542152405, "learning_rate": 1.8996990468083448e-05, "loss": 0.6488, "step": 1300 }, { "epoch": 0.5642295682136319, "grad_norm": 0.4997137784957886, "learning_rate": 1.8976155171023216e-05, "loss": 0.6518, "step": 1310 }, { "epoch": 0.568536664154194, "grad_norm": 0.5003030896186829, "learning_rate": 1.895511735723623e-05, "loss": 0.6317, "step": 1320 }, { "epoch": 0.5728437600947561, "grad_norm": 0.4551664888858795, "learning_rate": 1.8933877501369944e-05, "loss": 0.6634, "step": 1330 }, { "epoch": 0.5771508560353182, "grad_norm": 0.532534122467041, "learning_rate": 1.891243608263021e-05, "loss": 0.6656, "step": 1340 }, { "epoch": 0.5814579519758802, "grad_norm": 0.47166600823402405, "learning_rate": 1.889079358477047e-05, "loss": 0.657, "step": 1350 }, { "epoch": 0.5857650479164423, "grad_norm": 0.45552805066108704, "learning_rate": 1.8868950496080832e-05, "loss": 0.6652, "step": 1360 }, { "epoch": 0.5900721438570045, "grad_norm": 0.5267536044120789, "learning_rate": 1.884690730937707e-05, "loss": 0.6463, "step": 1370 }, { "epoch": 0.5943792397975665, "grad_norm": 0.49093228578567505, "learning_rate": 1.882466452198949e-05, "loss": 0.6604, "step": 1380 }, { "epoch": 0.5986863357381286, "grad_norm": 0.5105960369110107, "learning_rate": 1.880222263575172e-05, "loss": 0.6457, "step": 1390 }, { "epoch": 0.6029934316786907, "grad_norm": 0.47326135635375977, "learning_rate": 1.8779582156989384e-05, "loss": 0.6464, "step": 1400 }, { "epoch": 0.6073005276192527, "grad_norm": 0.4910115599632263, "learning_rate": 1.875674359650867e-05, "loss": 0.6547, "step": 1410 }, { "epoch": 0.6116076235598148, "grad_norm": 0.48352956771850586, "learning_rate": 1.873370746958482e-05, "loss": 0.654, "step": 1420 }, { "epoch": 0.6159147195003769, "grad_norm": 0.4722056984901428, "learning_rate": 1.871047429595049e-05, "loss": 0.6372, "step": 1430 }, { "epoch": 0.6202218154409389, "grad_norm": 0.4340212345123291, "learning_rate": 1.868704459978405e-05, "loss": 0.6507, "step": 1440 }, { "epoch": 0.624528911381501, "grad_norm": 0.48497867584228516, "learning_rate": 1.8663418909697723e-05, "loss": 0.6349, "step": 1450 }, { "epoch": 0.6288360073220631, "grad_norm": 0.4707370102405548, "learning_rate": 1.863959775872567e-05, "loss": 0.6445, "step": 1460 }, { "epoch": 0.6331431032626251, "grad_norm": 0.5151925683021545, "learning_rate": 1.861558168431199e-05, "loss": 0.6493, "step": 1470 }, { "epoch": 0.6374501992031872, "grad_norm": 0.47226110100746155, "learning_rate": 1.8591371228298554e-05, "loss": 0.6211, "step": 1480 }, { "epoch": 0.6417572951437494, "grad_norm": 0.48166829347610474, "learning_rate": 1.856696693691281e-05, "loss": 0.6476, "step": 1490 }, { "epoch": 0.6460643910843114, "grad_norm": 0.5039719343185425, "learning_rate": 1.8542369360755448e-05, "loss": 0.636, "step": 1500 }, { "epoch": 0.6503714870248735, "grad_norm": 0.45818519592285156, "learning_rate": 1.8517579054787974e-05, "loss": 0.658, "step": 1510 }, { "epoch": 0.6546785829654356, "grad_norm": 0.4803057014942169, "learning_rate": 1.8492596578320194e-05, "loss": 0.6468, "step": 1520 }, { "epoch": 0.6589856789059977, "grad_norm": 0.480227530002594, "learning_rate": 1.8467422494997593e-05, "loss": 0.641, "step": 1530 }, { "epoch": 0.6632927748465597, "grad_norm": 0.49187588691711426, "learning_rate": 1.844205737278863e-05, "loss": 0.6572, "step": 1540 }, { "epoch": 0.6675998707871218, "grad_norm": 
0.49701517820358276, "learning_rate": 1.84165017839719e-05, "loss": 0.6567, "step": 1550 }, { "epoch": 0.6719069667276839, "grad_norm": 0.48368483781814575, "learning_rate": 1.8390756305123246e-05, "loss": 0.669, "step": 1560 }, { "epoch": 0.6762140626682459, "grad_norm": 0.5007254481315613, "learning_rate": 1.836482151710273e-05, "loss": 0.6448, "step": 1570 }, { "epoch": 0.680521158608808, "grad_norm": 0.44526585936546326, "learning_rate": 1.8338698005041556e-05, "loss": 0.6386, "step": 1580 }, { "epoch": 0.6848282545493701, "grad_norm": 0.4812663197517395, "learning_rate": 1.8312386358328828e-05, "loss": 0.6447, "step": 1590 }, { "epoch": 0.6891353504899321, "grad_norm": 0.4910503029823303, "learning_rate": 1.828588717059829e-05, "loss": 0.6449, "step": 1600 }, { "epoch": 0.6934424464304942, "grad_norm": 0.47431930899620056, "learning_rate": 1.8259201039714914e-05, "loss": 0.6372, "step": 1610 }, { "epoch": 0.6977495423710564, "grad_norm": 0.5024338364601135, "learning_rate": 1.8232328567761416e-05, "loss": 0.6433, "step": 1620 }, { "epoch": 0.7020566383116184, "grad_norm": 0.47510799765586853, "learning_rate": 1.820527036102467e-05, "loss": 0.6601, "step": 1630 }, { "epoch": 0.7063637342521805, "grad_norm": 0.47990313172340393, "learning_rate": 1.8178027029982027e-05, "loss": 0.6463, "step": 1640 }, { "epoch": 0.7106708301927426, "grad_norm": 0.5117030739784241, "learning_rate": 1.8150599189287553e-05, "loss": 0.6455, "step": 1650 }, { "epoch": 0.7149779261333046, "grad_norm": 0.4917861819267273, "learning_rate": 1.8122987457758147e-05, "loss": 0.6688, "step": 1660 }, { "epoch": 0.7192850220738667, "grad_norm": 0.49872297048568726, "learning_rate": 1.8095192458359588e-05, "loss": 0.6513, "step": 1670 }, { "epoch": 0.7235921180144288, "grad_norm": 0.47510796785354614, "learning_rate": 1.806721481819247e-05, "loss": 0.649, "step": 1680 }, { "epoch": 0.7278992139549908, "grad_norm": 0.4924173057079315, "learning_rate": 1.8039055168478074e-05, "loss": 0.6177, "step": 1690 }, { "epoch": 0.7322063098955529, "grad_norm": 0.4918348789215088, "learning_rate": 1.8010714144544104e-05, "loss": 0.6543, "step": 1700 }, { "epoch": 0.736513405836115, "grad_norm": 0.45298415422439575, "learning_rate": 1.7982192385810372e-05, "loss": 0.6367, "step": 1710 }, { "epoch": 0.740820501776677, "grad_norm": 0.46879851818084717, "learning_rate": 1.795349053577435e-05, "loss": 0.6414, "step": 1720 }, { "epoch": 0.7451275977172391, "grad_norm": 0.4573706388473511, "learning_rate": 1.7924609241996672e-05, "loss": 0.628, "step": 1730 }, { "epoch": 0.7494346936578012, "grad_norm": 0.46929094195365906, "learning_rate": 1.7895549156086514e-05, "loss": 0.6478, "step": 1740 }, { "epoch": 0.7537417895983634, "grad_norm": 0.5428628325462341, "learning_rate": 1.78663109336869e-05, "loss": 0.6405, "step": 1750 }, { "epoch": 0.7580488855389254, "grad_norm": 0.47853079438209534, "learning_rate": 1.78368952344599e-05, "loss": 0.6442, "step": 1760 }, { "epoch": 0.7623559814794875, "grad_norm": 0.46747061610221863, "learning_rate": 1.7807302722071742e-05, "loss": 0.6369, "step": 1770 }, { "epoch": 0.7666630774200496, "grad_norm": 0.5107671022415161, "learning_rate": 1.7777534064177864e-05, "loss": 0.6322, "step": 1780 }, { "epoch": 0.7709701733606116, "grad_norm": 0.5013517141342163, "learning_rate": 1.7747589932407826e-05, "loss": 0.6384, "step": 1790 }, { "epoch": 0.7752772693011737, "grad_norm": 0.5039073824882507, "learning_rate": 1.7717471002350162e-05, "loss": 0.6504, "step": 1800 }, { "epoch": 0.7795843652417358, 
"grad_norm": 0.4767347276210785, "learning_rate": 1.7687177953537148e-05, "loss": 0.645, "step": 1810 }, { "epoch": 0.7838914611822978, "grad_norm": 0.4766087532043457, "learning_rate": 1.7656711469429464e-05, "loss": 0.6249, "step": 1820 }, { "epoch": 0.7881985571228599, "grad_norm": 0.5031486749649048, "learning_rate": 1.7626072237400764e-05, "loss": 0.6263, "step": 1830 }, { "epoch": 0.792505653063422, "grad_norm": 0.444658488035202, "learning_rate": 1.759526094872219e-05, "loss": 0.6561, "step": 1840 }, { "epoch": 0.796812749003984, "grad_norm": 0.5070600509643555, "learning_rate": 1.7564278298546758e-05, "loss": 0.6477, "step": 1850 }, { "epoch": 0.8011198449445461, "grad_norm": 0.45487794280052185, "learning_rate": 1.753312498589367e-05, "loss": 0.6257, "step": 1860 }, { "epoch": 0.8054269408851082, "grad_norm": 0.4745471477508545, "learning_rate": 1.7501801713632568e-05, "loss": 0.6586, "step": 1870 }, { "epoch": 0.8097340368256702, "grad_norm": 0.4743909537792206, "learning_rate": 1.7470309188467645e-05, "loss": 0.6255, "step": 1880 }, { "epoch": 0.8140411327662324, "grad_norm": 0.5165956020355225, "learning_rate": 1.7438648120921736e-05, "loss": 0.6592, "step": 1890 }, { "epoch": 0.8183482287067945, "grad_norm": 0.455861359834671, "learning_rate": 1.740681922532025e-05, "loss": 0.6467, "step": 1900 }, { "epoch": 0.8226553246473565, "grad_norm": 0.468013733625412, "learning_rate": 1.7374823219775073e-05, "loss": 0.6382, "step": 1910 }, { "epoch": 0.8269624205879186, "grad_norm": 0.46119919419288635, "learning_rate": 1.7342660826168374e-05, "loss": 0.6437, "step": 1920 }, { "epoch": 0.8312695165284807, "grad_norm": 0.4399983286857605, "learning_rate": 1.73103327701363e-05, "loss": 0.6379, "step": 1930 }, { "epoch": 0.8355766124690428, "grad_norm": 0.46829739212989807, "learning_rate": 1.7277839781052617e-05, "loss": 0.6402, "step": 1940 }, { "epoch": 0.8398837084096048, "grad_norm": 0.5193459987640381, "learning_rate": 1.7245182592012248e-05, "loss": 0.6348, "step": 1950 }, { "epoch": 0.8441908043501669, "grad_norm": 0.5310715436935425, "learning_rate": 1.7212361939814735e-05, "loss": 0.6351, "step": 1960 }, { "epoch": 0.848497900290729, "grad_norm": 0.4883059561252594, "learning_rate": 1.7179378564947615e-05, "loss": 0.6401, "step": 1970 }, { "epoch": 0.852804996231291, "grad_norm": 0.5028474926948547, "learning_rate": 1.7146233211569723e-05, "loss": 0.6559, "step": 1980 }, { "epoch": 0.8571120921718531, "grad_norm": 0.48668941855430603, "learning_rate": 1.7112926627494385e-05, "loss": 0.6572, "step": 1990 }, { "epoch": 0.8614191881124152, "grad_norm": 0.4668605327606201, "learning_rate": 1.7079459564172555e-05, "loss": 0.6321, "step": 2000 }, { "epoch": 0.8657262840529772, "grad_norm": 0.4556910991668701, "learning_rate": 1.7045832776675863e-05, "loss": 0.6268, "step": 2010 }, { "epoch": 0.8700333799935394, "grad_norm": 0.45260846614837646, "learning_rate": 1.701204702367958e-05, "loss": 0.6271, "step": 2020 }, { "epoch": 0.8743404759341015, "grad_norm": 0.4828309714794159, "learning_rate": 1.6978103067445494e-05, "loss": 0.6351, "step": 2030 }, { "epoch": 0.8786475718746635, "grad_norm": 0.4691152274608612, "learning_rate": 1.6944001673804723e-05, "loss": 0.6512, "step": 2040 }, { "epoch": 0.8829546678152256, "grad_norm": 0.4812765419483185, "learning_rate": 1.6909743612140417e-05, "loss": 0.6335, "step": 2050 }, { "epoch": 0.8872617637557877, "grad_norm": 0.4415755867958069, "learning_rate": 1.687532965537043e-05, "loss": 0.6541, "step": 2060 }, { "epoch": 0.8915688596963497, 
"grad_norm": 0.4993227422237396, "learning_rate": 1.6840760579929846e-05, "loss": 0.6318, "step": 2070 }, { "epoch": 0.8958759556369118, "grad_norm": 0.4628779888153076, "learning_rate": 1.6806037165753498e-05, "loss": 0.6369, "step": 2080 }, { "epoch": 0.9001830515774739, "grad_norm": 0.5235878229141235, "learning_rate": 1.677116019625834e-05, "loss": 0.6415, "step": 2090 }, { "epoch": 0.9044901475180359, "grad_norm": 0.4750138819217682, "learning_rate": 1.6736130458325793e-05, "loss": 0.6101, "step": 2100 }, { "epoch": 0.908797243458598, "grad_norm": 0.5292583107948303, "learning_rate": 1.6700948742283977e-05, "loss": 0.6248, "step": 2110 }, { "epoch": 0.9131043393991601, "grad_norm": 0.45959070324897766, "learning_rate": 1.6665615841889885e-05, "loss": 0.6339, "step": 2120 }, { "epoch": 0.9174114353397222, "grad_norm": 0.48287901282310486, "learning_rate": 1.6630132554311486e-05, "loss": 0.6161, "step": 2130 }, { "epoch": 0.9217185312802842, "grad_norm": 0.4725618064403534, "learning_rate": 1.6594499680109722e-05, "loss": 0.627, "step": 2140 }, { "epoch": 0.9260256272208464, "grad_norm": 0.4820912778377533, "learning_rate": 1.6558718023220457e-05, "loss": 0.6399, "step": 2150 }, { "epoch": 0.9303327231614085, "grad_norm": 0.48815685510635376, "learning_rate": 1.6522788390936328e-05, "loss": 0.6437, "step": 2160 }, { "epoch": 0.9346398191019705, "grad_norm": 0.4747340679168701, "learning_rate": 1.648671159388855e-05, "loss": 0.6455, "step": 2170 }, { "epoch": 0.9389469150425326, "grad_norm": 0.4894673526287079, "learning_rate": 1.6450488446028612e-05, "loss": 0.6545, "step": 2180 }, { "epoch": 0.9432540109830947, "grad_norm": 0.4756160080432892, "learning_rate": 1.641411976460991e-05, "loss": 0.6498, "step": 2190 }, { "epoch": 0.9475611069236567, "grad_norm": 0.45228078961372375, "learning_rate": 1.637760637016932e-05, "loss": 0.6438, "step": 2200 }, { "epoch": 0.9518682028642188, "grad_norm": 0.49898287653923035, "learning_rate": 1.6340949086508676e-05, "loss": 0.6518, "step": 2210 }, { "epoch": 0.9561752988047809, "grad_norm": 0.4354493021965027, "learning_rate": 1.6304148740676204e-05, "loss": 0.6125, "step": 2220 }, { "epoch": 0.9604823947453429, "grad_norm": 0.45118704438209534, "learning_rate": 1.6267206162947823e-05, "loss": 0.6146, "step": 2230 }, { "epoch": 0.964789490685905, "grad_norm": 0.4822487533092499, "learning_rate": 1.6230122186808443e-05, "loss": 0.6425, "step": 2240 }, { "epoch": 0.9690965866264671, "grad_norm": 0.490903377532959, "learning_rate": 1.619289764893317e-05, "loss": 0.6353, "step": 2250 }, { "epoch": 0.9734036825670291, "grad_norm": 0.4738866686820984, "learning_rate": 1.615553338916839e-05, "loss": 0.6315, "step": 2260 }, { "epoch": 0.9777107785075912, "grad_norm": 0.46285027265548706, "learning_rate": 1.6118030250512863e-05, "loss": 0.6501, "step": 2270 }, { "epoch": 0.9820178744481534, "grad_norm": 0.46414172649383545, "learning_rate": 1.6080389079098657e-05, "loss": 0.6501, "step": 2280 }, { "epoch": 0.9863249703887154, "grad_norm": 0.5042113661766052, "learning_rate": 1.604261072417211e-05, "loss": 0.6319, "step": 2290 }, { "epoch": 0.9906320663292775, "grad_norm": 0.43653419613838196, "learning_rate": 1.600469603807464e-05, "loss": 0.6461, "step": 2300 }, { "epoch": 0.9949391622698396, "grad_norm": 0.4572006165981293, "learning_rate": 1.5966645876223505e-05, "loss": 0.6477, "step": 2310 }, { "epoch": 0.9992462582104016, "grad_norm": 0.43867436051368713, "learning_rate": 1.5928461097092532e-05, "loss": 0.6288, "step": 2320 }, { "epoch": 
1.0035533541509638, "grad_norm": 0.5620077848434448, "learning_rate": 1.589014256219273e-05, "loss": 0.5378, "step": 2330 }, { "epoch": 1.0078604500915258, "grad_norm": 0.4836018681526184, "learning_rate": 1.5851691136052842e-05, "loss": 0.5421, "step": 2340 }, { "epoch": 1.0121675460320878, "grad_norm": 0.49632197618484497, "learning_rate": 1.581310768619988e-05, "loss": 0.5237, "step": 2350 }, { "epoch": 1.01647464197265, "grad_norm": 0.49445948004722595, "learning_rate": 1.5774393083139513e-05, "loss": 0.5313, "step": 2360 }, { "epoch": 1.020781737913212, "grad_norm": 0.5299666523933411, "learning_rate": 1.5735548200336435e-05, "loss": 0.5326, "step": 2370 }, { "epoch": 1.025088833853774, "grad_norm": 0.5012844204902649, "learning_rate": 1.569657391419468e-05, "loss": 0.5401, "step": 2380 }, { "epoch": 1.0293959297943363, "grad_norm": 0.4741289019584656, "learning_rate": 1.565747110403781e-05, "loss": 0.5052, "step": 2390 }, { "epoch": 1.0337030257348983, "grad_norm": 0.4950823485851288, "learning_rate": 1.5618240652089123e-05, "loss": 0.5294, "step": 2400 }, { "epoch": 1.0380101216754603, "grad_norm": 0.4934958517551422, "learning_rate": 1.557888344345171e-05, "loss": 0.5278, "step": 2410 }, { "epoch": 1.0423172176160225, "grad_norm": 0.467101514339447, "learning_rate": 1.5539400366088503e-05, "loss": 0.504, "step": 2420 }, { "epoch": 1.0466243135565845, "grad_norm": 0.5479716062545776, "learning_rate": 1.5499792310802238e-05, "loss": 0.5256, "step": 2430 }, { "epoch": 1.0509314094971465, "grad_norm": 0.4706737697124481, "learning_rate": 1.5460060171215362e-05, "loss": 0.5251, "step": 2440 }, { "epoch": 1.0552385054377087, "grad_norm": 0.5142565965652466, "learning_rate": 1.5420204843749857e-05, "loss": 0.5333, "step": 2450 }, { "epoch": 1.0595456013782707, "grad_norm": 0.5430694222450256, "learning_rate": 1.5380227227607032e-05, "loss": 0.5391, "step": 2460 }, { "epoch": 1.0638526973188327, "grad_norm": 0.4780258536338806, "learning_rate": 1.5340128224747225e-05, "loss": 0.5338, "step": 2470 }, { "epoch": 1.068159793259395, "grad_norm": 0.47647717595100403, "learning_rate": 1.5299908739869464e-05, "loss": 0.5178, "step": 2480 }, { "epoch": 1.072466889199957, "grad_norm": 0.5330241918563843, "learning_rate": 1.525956968039103e-05, "loss": 0.5027, "step": 2490 }, { "epoch": 1.076773985140519, "grad_norm": 0.4681854546070099, "learning_rate": 1.5219111956427027e-05, "loss": 0.5315, "step": 2500 }, { "epoch": 1.0810810810810811, "grad_norm": 0.5060921311378479, "learning_rate": 1.5178536480769803e-05, "loss": 0.5103, "step": 2510 }, { "epoch": 1.0853881770216431, "grad_norm": 0.497199147939682, "learning_rate": 1.5137844168868391e-05, "loss": 0.5302, "step": 2520 }, { "epoch": 1.0896952729622051, "grad_norm": 0.4658927321434021, "learning_rate": 1.5097035938807834e-05, "loss": 0.5196, "step": 2530 }, { "epoch": 1.0940023689027674, "grad_norm": 0.5109249353408813, "learning_rate": 1.5056112711288475e-05, "loss": 0.5099, "step": 2540 }, { "epoch": 1.0983094648433294, "grad_norm": 0.5212246775627136, "learning_rate": 1.5015075409605189e-05, "loss": 0.4911, "step": 2550 }, { "epoch": 1.1026165607838914, "grad_norm": 0.47850698232650757, "learning_rate": 1.497392495962656e-05, "loss": 0.5225, "step": 2560 }, { "epoch": 1.1069236567244536, "grad_norm": 0.4982755184173584, "learning_rate": 1.4932662289773969e-05, "loss": 0.5278, "step": 2570 }, { "epoch": 1.1112307526650156, "grad_norm": 0.49975791573524475, "learning_rate": 1.4891288331000668e-05, "loss": 0.5261, "step": 2580 }, { "epoch": 
1.1155378486055776, "grad_norm": 0.5002388954162598, "learning_rate": 1.484980401677077e-05, "loss": 0.5313, "step": 2590 }, { "epoch": 1.1198449445461398, "grad_norm": 0.4950617253780365, "learning_rate": 1.4808210283038183e-05, "loss": 0.5286, "step": 2600 }, { "epoch": 1.1241520404867018, "grad_norm": 0.49831753969192505, "learning_rate": 1.47665080682255e-05, "loss": 0.5133, "step": 2610 }, { "epoch": 1.128459136427264, "grad_norm": 0.6730148792266846, "learning_rate": 1.4724698313202825e-05, "loss": 0.5224, "step": 2620 }, { "epoch": 1.132766232367826, "grad_norm": 0.5355139374732971, "learning_rate": 1.4682781961266546e-05, "loss": 0.5188, "step": 2630 }, { "epoch": 1.137073328308388, "grad_norm": 0.5199829936027527, "learning_rate": 1.4640759958118045e-05, "loss": 0.5121, "step": 2640 }, { "epoch": 1.14138042424895, "grad_norm": 0.5292408466339111, "learning_rate": 1.4598633251842373e-05, "loss": 0.5267, "step": 2650 }, { "epoch": 1.1456875201895123, "grad_norm": 0.5363121032714844, "learning_rate": 1.4556402792886856e-05, "loss": 0.5147, "step": 2660 }, { "epoch": 1.1499946161300743, "grad_norm": 0.5359490513801575, "learning_rate": 1.4514069534039649e-05, "loss": 0.5155, "step": 2670 }, { "epoch": 1.1543017120706365, "grad_norm": 0.4707220792770386, "learning_rate": 1.4471634430408244e-05, "loss": 0.5419, "step": 2680 }, { "epoch": 1.1586088080111985, "grad_norm": 0.4798811376094818, "learning_rate": 1.4429098439397901e-05, "loss": 0.5152, "step": 2690 }, { "epoch": 1.1629159039517605, "grad_norm": 0.4730081260204315, "learning_rate": 1.4386462520690087e-05, "loss": 0.5283, "step": 2700 }, { "epoch": 1.1672229998923225, "grad_norm": 0.524276614189148, "learning_rate": 1.4343727636220785e-05, "loss": 0.5087, "step": 2710 }, { "epoch": 1.1715300958328847, "grad_norm": 0.5093454122543335, "learning_rate": 1.430089475015882e-05, "loss": 0.5371, "step": 2720 }, { "epoch": 1.1758371917734467, "grad_norm": 0.5228180289268494, "learning_rate": 1.4257964828884077e-05, "loss": 0.5121, "step": 2730 }, { "epoch": 1.180144287714009, "grad_norm": 0.5263434052467346, "learning_rate": 1.4214938840965729e-05, "loss": 0.5104, "step": 2740 }, { "epoch": 1.184451383654571, "grad_norm": 0.5519675612449646, "learning_rate": 1.417181775714036e-05, "loss": 0.5081, "step": 2750 }, { "epoch": 1.188758479595133, "grad_norm": 0.48901626467704773, "learning_rate": 1.4128602550290078e-05, "loss": 0.5332, "step": 2760 }, { "epoch": 1.1930655755356951, "grad_norm": 0.5022098422050476, "learning_rate": 1.4085294195420563e-05, "loss": 0.5267, "step": 2770 }, { "epoch": 1.1973726714762571, "grad_norm": 0.5244942307472229, "learning_rate": 1.4041893669639053e-05, "loss": 0.5309, "step": 2780 }, { "epoch": 1.2016797674168191, "grad_norm": 0.5060109496116638, "learning_rate": 1.399840195213233e-05, "loss": 0.509, "step": 2790 }, { "epoch": 1.2059868633573814, "grad_norm": 0.48709142208099365, "learning_rate": 1.3954820024144595e-05, "loss": 0.5249, "step": 2800 }, { "epoch": 1.2102939592979434, "grad_norm": 0.48755279183387756, "learning_rate": 1.3911148868955357e-05, "loss": 0.5216, "step": 2810 }, { "epoch": 1.2146010552385054, "grad_norm": 0.4871668219566345, "learning_rate": 1.3867389471857229e-05, "loss": 0.5199, "step": 2820 }, { "epoch": 1.2189081511790676, "grad_norm": 0.5313363671302795, "learning_rate": 1.3823542820133706e-05, "loss": 0.5146, "step": 2830 }, { "epoch": 1.2232152471196296, "grad_norm": 0.48473960161209106, "learning_rate": 1.3779609903036894e-05, "loss": 0.5126, "step": 2840 }, { "epoch": 
1.2275223430601916, "grad_norm": 0.5411814451217651, "learning_rate": 1.3735591711765189e-05, "loss": 0.5186, "step": 2850 }, { "epoch": 1.2318294390007538, "grad_norm": 0.5286210775375366, "learning_rate": 1.3691489239440899e-05, "loss": 0.513, "step": 2860 }, { "epoch": 1.2361365349413158, "grad_norm": 0.47112423181533813, "learning_rate": 1.3647303481087858e-05, "loss": 0.5268, "step": 2870 }, { "epoch": 1.2404436308818778, "grad_norm": 0.5465208888053894, "learning_rate": 1.3603035433608977e-05, "loss": 0.5109, "step": 2880 }, { "epoch": 1.24475072682244, "grad_norm": 0.4758882522583008, "learning_rate": 1.3558686095763732e-05, "loss": 0.5307, "step": 2890 }, { "epoch": 1.249057822763002, "grad_norm": 0.5721794962882996, "learning_rate": 1.3514256468145645e-05, "loss": 0.5104, "step": 2900 }, { "epoch": 1.2533649187035643, "grad_norm": 0.5125982761383057, "learning_rate": 1.3469747553159714e-05, "loss": 0.5278, "step": 2910 }, { "epoch": 1.2576720146441263, "grad_norm": 0.5272653698921204, "learning_rate": 1.342516035499978e-05, "loss": 0.5276, "step": 2920 }, { "epoch": 1.2619791105846883, "grad_norm": 0.5423816442489624, "learning_rate": 1.3380495879625884e-05, "loss": 0.5408, "step": 2930 }, { "epoch": 1.2662862065252503, "grad_norm": 0.4817509055137634, "learning_rate": 1.333575513474157e-05, "loss": 0.5152, "step": 2940 }, { "epoch": 1.2705933024658125, "grad_norm": 0.5113592147827148, "learning_rate": 1.3290939129771143e-05, "loss": 0.5397, "step": 2950 }, { "epoch": 1.2749003984063745, "grad_norm": 0.5106224417686462, "learning_rate": 1.3246048875836898e-05, "loss": 0.5269, "step": 2960 }, { "epoch": 1.2792074943469367, "grad_norm": 0.5446826219558716, "learning_rate": 1.3201085385736313e-05, "loss": 0.5252, "step": 2970 }, { "epoch": 1.2835145902874987, "grad_norm": 0.484943151473999, "learning_rate": 1.3156049673919184e-05, "loss": 0.525, "step": 2980 }, { "epoch": 1.2878216862280607, "grad_norm": 0.5692194700241089, "learning_rate": 1.3110942756464764e-05, "loss": 0.5197, "step": 2990 }, { "epoch": 1.2921287821686227, "grad_norm": 0.5009827017784119, "learning_rate": 1.3065765651058802e-05, "loss": 0.5325, "step": 3000 }, { "epoch": 1.296435878109185, "grad_norm": 0.4953298568725586, "learning_rate": 1.3020519376970613e-05, "loss": 0.5095, "step": 3010 }, { "epoch": 1.300742974049747, "grad_norm": 0.5116891264915466, "learning_rate": 1.2975204955030068e-05, "loss": 0.5263, "step": 3020 }, { "epoch": 1.3050500699903091, "grad_norm": 0.4844088554382324, "learning_rate": 1.2929823407604567e-05, "loss": 0.5113, "step": 3030 }, { "epoch": 1.3093571659308711, "grad_norm": 0.4732029438018799, "learning_rate": 1.2884375758575967e-05, "loss": 0.532, "step": 3040 }, { "epoch": 1.3136642618714331, "grad_norm": 0.5469485521316528, "learning_rate": 1.2838863033317484e-05, "loss": 0.519, "step": 3050 }, { "epoch": 1.3179713578119951, "grad_norm": 0.4888254702091217, "learning_rate": 1.2793286258670565e-05, "loss": 0.5097, "step": 3060 }, { "epoch": 1.3222784537525574, "grad_norm": 0.5359517335891724, "learning_rate": 1.2747646462921717e-05, "loss": 0.5246, "step": 3070 }, { "epoch": 1.3265855496931194, "grad_norm": 0.5013801455497742, "learning_rate": 1.2701944675779299e-05, "loss": 0.524, "step": 3080 }, { "epoch": 1.3308926456336816, "grad_norm": 0.49307557940483093, "learning_rate": 1.2656181928350301e-05, "loss": 0.5403, "step": 3090 }, { "epoch": 1.3351997415742436, "grad_norm": 0.47625210881233215, "learning_rate": 1.2610359253117078e-05, "loss": 0.5275, "step": 3100 }, { "epoch": 
1.3395068375148056, "grad_norm": 0.5096368789672852, "learning_rate": 1.2564477683914053e-05, "loss": 0.5231, "step": 3110 }, { "epoch": 1.3438139334553676, "grad_norm": 0.4992668926715851, "learning_rate": 1.2518538255904389e-05, "loss": 0.5235, "step": 3120 }, { "epoch": 1.3481210293959298, "grad_norm": 0.491062194108963, "learning_rate": 1.2472542005556647e-05, "loss": 0.5432, "step": 3130 }, { "epoch": 1.3524281253364918, "grad_norm": 0.48666131496429443, "learning_rate": 1.2426489970621385e-05, "loss": 0.531, "step": 3140 }, { "epoch": 1.356735221277054, "grad_norm": 0.4706876575946808, "learning_rate": 1.2380383190107757e-05, "loss": 0.5188, "step": 3150 }, { "epoch": 1.361042317217616, "grad_norm": 0.4910385310649872, "learning_rate": 1.2334222704260063e-05, "loss": 0.5106, "step": 3160 }, { "epoch": 1.365349413158178, "grad_norm": 0.506514847278595, "learning_rate": 1.2288009554534291e-05, "loss": 0.5292, "step": 3170 }, { "epoch": 1.36965650909874, "grad_norm": 0.49671700596809387, "learning_rate": 1.2241744783574596e-05, "loss": 0.5284, "step": 3180 }, { "epoch": 1.3739636050393023, "grad_norm": 0.4892718195915222, "learning_rate": 1.219542943518981e-05, "loss": 0.5215, "step": 3190 }, { "epoch": 1.3782707009798643, "grad_norm": 0.5412102937698364, "learning_rate": 1.2149064554329864e-05, "loss": 0.5256, "step": 3200 }, { "epoch": 1.3825777969204265, "grad_norm": 0.4869970679283142, "learning_rate": 1.2102651187062227e-05, "loss": 0.5218, "step": 3210 }, { "epoch": 1.3868848928609885, "grad_norm": 0.5195066332817078, "learning_rate": 1.2056190380548299e-05, "loss": 0.5269, "step": 3220 }, { "epoch": 1.3911919888015505, "grad_norm": 0.5343438982963562, "learning_rate": 1.2009683183019788e-05, "loss": 0.5301, "step": 3230 }, { "epoch": 1.3954990847421127, "grad_norm": 0.522270679473877, "learning_rate": 1.1963130643755055e-05, "loss": 0.545, "step": 3240 }, { "epoch": 1.3998061806826747, "grad_norm": 0.501485288143158, "learning_rate": 1.191653381305545e-05, "loss": 0.5253, "step": 3250 }, { "epoch": 1.4041132766232367, "grad_norm": 0.5288712382316589, "learning_rate": 1.186989374222161e-05, "loss": 0.5181, "step": 3260 }, { "epoch": 1.408420372563799, "grad_norm": 0.5131502151489258, "learning_rate": 1.1823211483529733e-05, "loss": 0.5138, "step": 3270 }, { "epoch": 1.412727468504361, "grad_norm": 0.4853404462337494, "learning_rate": 1.1776488090207852e-05, "loss": 0.5319, "step": 3280 }, { "epoch": 1.417034564444923, "grad_norm": 0.5093010663986206, "learning_rate": 1.1729724616412062e-05, "loss": 0.5155, "step": 3290 }, { "epoch": 1.4213416603854852, "grad_norm": 0.5078168511390686, "learning_rate": 1.1682922117202736e-05, "loss": 0.5206, "step": 3300 }, { "epoch": 1.4256487563260472, "grad_norm": 0.5315324664115906, "learning_rate": 1.163608164852073e-05, "loss": 0.5314, "step": 3310 }, { "epoch": 1.4299558522666094, "grad_norm": 0.4705192446708679, "learning_rate": 1.1589204267163545e-05, "loss": 0.4966, "step": 3320 }, { "epoch": 1.4342629482071714, "grad_norm": 0.48757535219192505, "learning_rate": 1.15422910307615e-05, "loss": 0.5299, "step": 3330 }, { "epoch": 1.4385700441477334, "grad_norm": 0.5582148432731628, "learning_rate": 1.1495342997753864e-05, "loss": 0.5201, "step": 3340 }, { "epoch": 1.4428771400882954, "grad_norm": 0.5134326219558716, "learning_rate": 1.1448361227364963e-05, "loss": 0.5061, "step": 3350 }, { "epoch": 1.4471842360288576, "grad_norm": 0.5316387414932251, "learning_rate": 1.1401346779580303e-05, "loss": 0.5145, "step": 3360 }, { "epoch": 
1.4514913319694196, "grad_norm": 0.5328738689422607, "learning_rate": 1.1354300715122637e-05, "loss": 0.5288, "step": 3370 }, { "epoch": 1.4557984279099818, "grad_norm": 0.5279168486595154, "learning_rate": 1.1307224095428058e-05, "loss": 0.5031, "step": 3380 }, { "epoch": 1.4601055238505438, "grad_norm": 0.5049686431884766, "learning_rate": 1.1260117982622021e-05, "loss": 0.5004, "step": 3390 }, { "epoch": 1.4644126197911058, "grad_norm": 0.47000184655189514, "learning_rate": 1.1212983439495392e-05, "loss": 0.5267, "step": 3400 }, { "epoch": 1.4687197157316678, "grad_norm": 0.49505382776260376, "learning_rate": 1.1165821529480483e-05, "loss": 0.5278, "step": 3410 }, { "epoch": 1.47302681167223, "grad_norm": 0.568454384803772, "learning_rate": 1.1118633316627037e-05, "loss": 0.5116, "step": 3420 }, { "epoch": 1.477333907612792, "grad_norm": 0.5094279646873474, "learning_rate": 1.1071419865578241e-05, "loss": 0.5181, "step": 3430 }, { "epoch": 1.4816410035533543, "grad_norm": 0.5605435371398926, "learning_rate": 1.1024182241546686e-05, "loss": 0.5191, "step": 3440 }, { "epoch": 1.4859480994939163, "grad_norm": 0.49941274523735046, "learning_rate": 1.097692151029036e-05, "loss": 0.5036, "step": 3450 }, { "epoch": 1.4902551954344783, "grad_norm": 0.5064433813095093, "learning_rate": 1.0929638738088571e-05, "loss": 0.5195, "step": 3460 }, { "epoch": 1.4945622913750403, "grad_norm": 0.5021061301231384, "learning_rate": 1.088233499171792e-05, "loss": 0.522, "step": 3470 }, { "epoch": 1.4988693873156025, "grad_norm": 0.5188096761703491, "learning_rate": 1.0835011338428217e-05, "loss": 0.5156, "step": 3480 }, { "epoch": 1.5031764832561645, "grad_norm": 0.6124559640884399, "learning_rate": 1.0787668845918393e-05, "loss": 0.5145, "step": 3490 }, { "epoch": 1.5074835791967267, "grad_norm": 0.48937344551086426, "learning_rate": 1.074030858231244e-05, "loss": 0.515, "step": 3500 }, { "epoch": 1.5117906751372887, "grad_norm": 0.518526017665863, "learning_rate": 1.0692931616135283e-05, "loss": 0.505, "step": 3510 }, { "epoch": 1.5160977710778507, "grad_norm": 0.5395667552947998, "learning_rate": 1.0645539016288686e-05, "loss": 0.5076, "step": 3520 }, { "epoch": 1.5204048670184127, "grad_norm": 0.495190292596817, "learning_rate": 1.059813185202714e-05, "loss": 0.523, "step": 3530 }, { "epoch": 1.524711962958975, "grad_norm": 0.49644342064857483, "learning_rate": 1.055071119293373e-05, "loss": 0.5038, "step": 3540 }, { "epoch": 1.5290190588995372, "grad_norm": 0.483696848154068, "learning_rate": 1.0503278108896e-05, "loss": 0.5103, "step": 3550 }, { "epoch": 1.5333261548400992, "grad_norm": 0.5149986147880554, "learning_rate": 1.0455833670081831e-05, "loss": 0.5402, "step": 3560 }, { "epoch": 1.5376332507806612, "grad_norm": 0.4734952449798584, "learning_rate": 1.0408378946915282e-05, "loss": 0.5292, "step": 3570 }, { "epoch": 1.5419403467212232, "grad_norm": 0.5490080118179321, "learning_rate": 1.0360915010052443e-05, "loss": 0.5155, "step": 3580 }, { "epoch": 1.5462474426617852, "grad_norm": 0.5176838636398315, "learning_rate": 1.0313442930357278e-05, "loss": 0.5111, "step": 3590 }, { "epoch": 1.5505545386023474, "grad_norm": 0.5659157633781433, "learning_rate": 1.026596377887747e-05, "loss": 0.5152, "step": 3600 }, { "epoch": 1.5548616345429096, "grad_norm": 0.5195504426956177, "learning_rate": 1.0218478626820256e-05, "loss": 0.5178, "step": 3610 }, { "epoch": 1.5591687304834716, "grad_norm": 0.533338189125061, "learning_rate": 1.0170988545528248e-05, "loss": 0.5138, "step": 3620 }, { "epoch": 
1.5634758264240336, "grad_norm": 0.5108840465545654, "learning_rate": 1.0123494606455278e-05, "loss": 0.5273, "step": 3630 }, { "epoch": 1.5677829223645956, "grad_norm": 0.4785379469394684, "learning_rate": 1.0075997881142208e-05, "loss": 0.5071, "step": 3640 }, { "epoch": 1.5720900183051576, "grad_norm": 0.49497827887535095, "learning_rate": 1.0028499441192765e-05, "loss": 0.5132, "step": 3650 }, { "epoch": 1.5763971142457198, "grad_norm": 0.5214102864265442, "learning_rate": 9.981000358249368e-06, "loss": 0.5133, "step": 3660 }, { "epoch": 1.580704210186282, "grad_norm": 0.47462400794029236, "learning_rate": 9.933501703968928e-06, "loss": 0.5226, "step": 3670 }, { "epoch": 1.585011306126844, "grad_norm": 0.4743979275226593, "learning_rate": 9.8860045499987e-06, "loss": 0.5219, "step": 3680 }, { "epoch": 1.589318402067406, "grad_norm": 0.5265910625457764, "learning_rate": 9.838509967952076e-06, "loss": 0.4945, "step": 3690 }, { "epoch": 1.593625498007968, "grad_norm": 0.5075172185897827, "learning_rate": 9.791019029384437e-06, "loss": 0.5175, "step": 3700 }, { "epoch": 1.59793259394853, "grad_norm": 0.5206677913665771, "learning_rate": 9.743532805768948e-06, "loss": 0.5188, "step": 3710 }, { "epoch": 1.6022396898890923, "grad_norm": 0.4802674651145935, "learning_rate": 9.696052368472406e-06, "loss": 0.5064, "step": 3720 }, { "epoch": 1.6065467858296545, "grad_norm": 0.5289535522460938, "learning_rate": 9.648578788731044e-06, "loss": 0.5281, "step": 3730 }, { "epoch": 1.6108538817702165, "grad_norm": 0.47722700238227844, "learning_rate": 9.601113137626394e-06, "loss": 0.5151, "step": 3740 }, { "epoch": 1.6151609777107785, "grad_norm": 0.4994152784347534, "learning_rate": 9.553656486061098e-06, "loss": 0.52, "step": 3750 }, { "epoch": 1.6194680736513405, "grad_norm": 0.48130089044570923, "learning_rate": 9.506209904734753e-06, "loss": 0.5336, "step": 3760 }, { "epoch": 1.6237751695919027, "grad_norm": 0.48449528217315674, "learning_rate": 9.45877446411976e-06, "loss": 0.5252, "step": 3770 }, { "epoch": 1.6280822655324647, "grad_norm": 0.5411643981933594, "learning_rate": 9.411351234437163e-06, "loss": 0.5187, "step": 3780 }, { "epoch": 1.632389361473027, "grad_norm": 0.5133873820304871, "learning_rate": 9.363941285632507e-06, "loss": 0.5217, "step": 3790 }, { "epoch": 1.636696457413589, "grad_norm": 0.5814666748046875, "learning_rate": 9.3165456873517e-06, "loss": 0.5, "step": 3800 }, { "epoch": 1.641003553354151, "grad_norm": 0.52715665102005, "learning_rate": 9.269165508916883e-06, "loss": 0.5184, "step": 3810 }, { "epoch": 1.645310649294713, "grad_norm": 0.48196879029273987, "learning_rate": 9.221801819302288e-06, "loss": 0.5191, "step": 3820 }, { "epoch": 1.6496177452352752, "grad_norm": 0.49397778511047363, "learning_rate": 9.174455687110142e-06, "loss": 0.5013, "step": 3830 }, { "epoch": 1.6539248411758372, "grad_norm": 0.5037091970443726, "learning_rate": 9.127128180546548e-06, "loss": 0.5298, "step": 3840 }, { "epoch": 1.6582319371163994, "grad_norm": 0.5031833052635193, "learning_rate": 9.079820367397384e-06, "loss": 0.4929, "step": 3850 }, { "epoch": 1.6625390330569614, "grad_norm": 0.5380353927612305, "learning_rate": 9.032533315004207e-06, "loss": 0.4968, "step": 3860 }, { "epoch": 1.6668461289975234, "grad_norm": 0.5191226005554199, "learning_rate": 8.98526809024018e-06, "loss": 0.5267, "step": 3870 }, { "epoch": 1.6711532249380854, "grad_norm": 0.5179468393325806, "learning_rate": 8.938025759486007e-06, "loss": 0.5159, "step": 3880 }, { "epoch": 1.6754603208786476, 
"grad_norm": 0.4779166579246521, "learning_rate": 8.89080738860585e-06, "loss": 0.5211, "step": 3890 }, { "epoch": 1.6797674168192096, "grad_norm": 0.5136571526527405, "learning_rate": 8.843614042923318e-06, "loss": 0.5003, "step": 3900 }, { "epoch": 1.6840745127597718, "grad_norm": 0.540773332118988, "learning_rate": 8.796446787197383e-06, "loss": 0.5131, "step": 3910 }, { "epoch": 1.6883816087003338, "grad_norm": 0.5126665234565735, "learning_rate": 8.749306685598409e-06, "loss": 0.5093, "step": 3920 }, { "epoch": 1.6926887046408958, "grad_norm": 0.47659188508987427, "learning_rate": 8.702194801684112e-06, "loss": 0.5158, "step": 3930 }, { "epoch": 1.6969958005814578, "grad_norm": 0.47945475578308105, "learning_rate": 8.655112198375564e-06, "loss": 0.5026, "step": 3940 }, { "epoch": 1.70130289652202, "grad_norm": 0.4939498007297516, "learning_rate": 8.60805993793323e-06, "loss": 0.5099, "step": 3950 }, { "epoch": 1.7056099924625823, "grad_norm": 0.5328351259231567, "learning_rate": 8.561039081932975e-06, "loss": 0.52, "step": 3960 }, { "epoch": 1.7099170884031443, "grad_norm": 0.49865198135375977, "learning_rate": 8.514050691242145e-06, "loss": 0.5077, "step": 3970 }, { "epoch": 1.7142241843437063, "grad_norm": 0.49807870388031006, "learning_rate": 8.467095825995605e-06, "loss": 0.4976, "step": 3980 }, { "epoch": 1.7185312802842683, "grad_norm": 0.5023031234741211, "learning_rate": 8.420175545571837e-06, "loss": 0.5233, "step": 3990 }, { "epoch": 1.7228383762248303, "grad_norm": 0.49054110050201416, "learning_rate": 8.373290908569026e-06, "loss": 0.5115, "step": 4000 }, { "epoch": 1.7271454721653925, "grad_norm": 0.47637811303138733, "learning_rate": 8.32644297278119e-06, "loss": 0.5103, "step": 4010 }, { "epoch": 1.7314525681059547, "grad_norm": 0.5239661931991577, "learning_rate": 8.279632795174304e-06, "loss": 0.5161, "step": 4020 }, { "epoch": 1.7357596640465167, "grad_norm": 0.5000544190406799, "learning_rate": 8.232861431862457e-06, "loss": 0.5113, "step": 4030 }, { "epoch": 1.7400667599870787, "grad_norm": 0.5361005067825317, "learning_rate": 8.186129938084028e-06, "loss": 0.5137, "step": 4040 }, { "epoch": 1.7443738559276407, "grad_norm": 0.48270535469055176, "learning_rate": 8.139439368177868e-06, "loss": 0.5116, "step": 4050 }, { "epoch": 1.7486809518682027, "grad_norm": 0.48645904660224915, "learning_rate": 8.092790775559522e-06, "loss": 0.517, "step": 4060 }, { "epoch": 1.752988047808765, "grad_norm": 0.4865799844264984, "learning_rate": 8.046185212697459e-06, "loss": 0.5202, "step": 4070 }, { "epoch": 1.7572951437493272, "grad_norm": 0.5095897912979126, "learning_rate": 7.999623731089327e-06, "loss": 0.5186, "step": 4080 }, { "epoch": 1.7616022396898892, "grad_norm": 0.49918055534362793, "learning_rate": 7.953107381238226e-06, "loss": 0.5091, "step": 4090 }, { "epoch": 1.7659093356304512, "grad_norm": 0.5209227204322815, "learning_rate": 7.906637212629011e-06, "loss": 0.5098, "step": 4100 }, { "epoch": 1.7702164315710132, "grad_norm": 0.5320930480957031, "learning_rate": 7.860214273704614e-06, "loss": 0.5172, "step": 4110 }, { "epoch": 1.7745235275115752, "grad_norm": 0.4841155707836151, "learning_rate": 7.813839611842387e-06, "loss": 0.4851, "step": 4120 }, { "epoch": 1.7788306234521374, "grad_norm": 0.5300472378730774, "learning_rate": 7.767514273330473e-06, "loss": 0.4953, "step": 4130 }, { "epoch": 1.7831377193926996, "grad_norm": 0.5021957159042358, "learning_rate": 7.721239303344201e-06, "loss": 0.5112, "step": 4140 }, { "epoch": 1.7874448153332616, "grad_norm": 
0.498737096786499, "learning_rate": 7.675015745922499e-06, "loss": 0.5045, "step": 4150 }, { "epoch": 1.7917519112738236, "grad_norm": 0.4690532684326172, "learning_rate": 7.628844643944349e-06, "loss": 0.5102, "step": 4160 }, { "epoch": 1.7960590072143856, "grad_norm": 0.5077162384986877, "learning_rate": 7.582727039105255e-06, "loss": 0.5105, "step": 4170 }, { "epoch": 1.8003661031549478, "grad_norm": 0.47492554783821106, "learning_rate": 7.536663971893724e-06, "loss": 0.5008, "step": 4180 }, { "epoch": 1.8046731990955098, "grad_norm": 0.5036799907684326, "learning_rate": 7.4906564815678205e-06, "loss": 0.5179, "step": 4190 }, { "epoch": 1.808980295036072, "grad_norm": 0.5044455528259277, "learning_rate": 7.444705606131697e-06, "loss": 0.5171, "step": 4200 }, { "epoch": 1.813287390976634, "grad_norm": 0.5645790696144104, "learning_rate": 7.39881238231218e-06, "loss": 0.5111, "step": 4210 }, { "epoch": 1.817594486917196, "grad_norm": 0.4966265857219696, "learning_rate": 7.352977845535387e-06, "loss": 0.5144, "step": 4220 }, { "epoch": 1.821901582857758, "grad_norm": 0.5225628614425659, "learning_rate": 7.307203029903354e-06, "loss": 0.5115, "step": 4230 }, { "epoch": 1.8262086787983203, "grad_norm": 0.5282090902328491, "learning_rate": 7.261488968170713e-06, "loss": 0.5251, "step": 4240 }, { "epoch": 1.8305157747388823, "grad_norm": 0.5346629023551941, "learning_rate": 7.21583669172139e-06, "loss": 0.5042, "step": 4250 }, { "epoch": 1.8348228706794445, "grad_norm": 0.5141210556030273, "learning_rate": 7.170247230545335e-06, "loss": 0.5199, "step": 4260 }, { "epoch": 1.8391299666200065, "grad_norm": 0.5251668691635132, "learning_rate": 7.124721613215275e-06, "loss": 0.4936, "step": 4270 }, { "epoch": 1.8434370625605685, "grad_norm": 0.5125293731689453, "learning_rate": 7.079260866863523e-06, "loss": 0.5161, "step": 4280 }, { "epoch": 1.8477441585011305, "grad_norm": 0.4881208837032318, "learning_rate": 7.033866017158797e-06, "loss": 0.5142, "step": 4290 }, { "epoch": 1.8520512544416927, "grad_norm": 0.5215027928352356, "learning_rate": 6.9885380882830735e-06, "loss": 0.5097, "step": 4300 }, { "epoch": 1.8563583503822547, "grad_norm": 0.4931368827819824, "learning_rate": 6.943278102908491e-06, "loss": 0.5123, "step": 4310 }, { "epoch": 1.860665446322817, "grad_norm": 0.5080362558364868, "learning_rate": 6.898087082174267e-06, "loss": 0.5093, "step": 4320 }, { "epoch": 1.864972542263379, "grad_norm": 0.537807285785675, "learning_rate": 6.852966045663671e-06, "loss": 0.5245, "step": 4330 }, { "epoch": 1.869279638203941, "grad_norm": 0.5395597815513611, "learning_rate": 6.807916011381008e-06, "loss": 0.5016, "step": 4340 }, { "epoch": 1.873586734144503, "grad_norm": 0.48623430728912354, "learning_rate": 6.762937995728663e-06, "loss": 0.4962, "step": 4350 }, { "epoch": 1.8778938300850652, "grad_norm": 0.5058403611183167, "learning_rate": 6.718033013484147e-06, "loss": 0.5401, "step": 4360 }, { "epoch": 1.8822009260256274, "grad_norm": 0.5220633149147034, "learning_rate": 6.673202077777239e-06, "loss": 0.5112, "step": 4370 }, { "epoch": 1.8865080219661894, "grad_norm": 0.5163370966911316, "learning_rate": 6.6284462000670924e-06, "loss": 0.5231, "step": 4380 }, { "epoch": 1.8908151179067514, "grad_norm": 0.508660614490509, "learning_rate": 6.583766390119437e-06, "loss": 0.5304, "step": 4390 }, { "epoch": 1.8951222138473134, "grad_norm": 0.568144679069519, "learning_rate": 6.539163655983786e-06, "loss": 0.5086, "step": 4400 }, { "epoch": 1.8994293097878754, "grad_norm": 0.5001341700553894, 
"learning_rate": 6.494639003970701e-06, "loss": 0.5084, "step": 4410 }, { "epoch": 1.9037364057284376, "grad_norm": 0.5228297710418701, "learning_rate": 6.450193438629078e-06, "loss": 0.504, "step": 4420 }, { "epoch": 1.9080435016689998, "grad_norm": 0.4816001057624817, "learning_rate": 6.40582796272349e-06, "loss": 0.5102, "step": 4430 }, { "epoch": 1.9123505976095618, "grad_norm": 0.5058324933052063, "learning_rate": 6.361543577211566e-06, "loss": 0.524, "step": 4440 }, { "epoch": 1.9166576935501238, "grad_norm": 0.5428106188774109, "learning_rate": 6.317341281221392e-06, "loss": 0.5082, "step": 4450 }, { "epoch": 1.9209647894906858, "grad_norm": 0.5131290555000305, "learning_rate": 6.273222072028991e-06, "loss": 0.5316, "step": 4460 }, { "epoch": 1.9252718854312478, "grad_norm": 0.5238609910011292, "learning_rate": 6.2291869450358074e-06, "loss": 0.5021, "step": 4470 }, { "epoch": 1.92957898137181, "grad_norm": 0.4843258261680603, "learning_rate": 6.1852368937462585e-06, "loss": 0.5048, "step": 4480 }, { "epoch": 1.9338860773123723, "grad_norm": 0.5138316750526428, "learning_rate": 6.141372909745307e-06, "loss": 0.5352, "step": 4490 }, { "epoch": 1.9381931732529343, "grad_norm": 0.49319642782211304, "learning_rate": 6.097595982676103e-06, "loss": 0.5065, "step": 4500 } ], "logging_steps": 10, "max_steps": 6963, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 300, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 4861580908953600.0, "train_batch_size": 16, "trial_name": null, "trial_params": null }