{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 1670, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0005988023952095808, "grad_norm": 5.398237705230713, "learning_rate": 1.1976047904191619e-06, "loss": 5.2288, "step": 1 }, { "epoch": 0.0029940119760479044, "grad_norm": 4.768432140350342, "learning_rate": 5.9880239520958085e-06, "loss": 5.1682, "step": 5 }, { "epoch": 0.005988023952095809, "grad_norm": 4.144229412078857, "learning_rate": 1.1976047904191617e-05, "loss": 5.1486, "step": 10 }, { "epoch": 0.008982035928143712, "grad_norm": 3.3602042198181152, "learning_rate": 1.7964071856287426e-05, "loss": 5.0451, "step": 15 }, { "epoch": 0.011976047904191617, "grad_norm": 3.0493969917297363, "learning_rate": 2.3952095808383234e-05, "loss": 4.9569, "step": 20 }, { "epoch": 0.014970059880239521, "grad_norm": 2.564634323120117, "learning_rate": 2.994011976047904e-05, "loss": 4.8593, "step": 25 }, { "epoch": 0.017964071856287425, "grad_norm": 2.1028244495391846, "learning_rate": 3.592814371257485e-05, "loss": 4.7325, "step": 30 }, { "epoch": 0.020958083832335328, "grad_norm": 2.0943355560302734, "learning_rate": 4.191616766467066e-05, "loss": 4.6032, "step": 35 }, { "epoch": 0.023952095808383235, "grad_norm": 2.1831929683685303, "learning_rate": 4.790419161676647e-05, "loss": 4.5291, "step": 40 }, { "epoch": 0.02694610778443114, "grad_norm": 2.2486143112182617, "learning_rate": 5.389221556886228e-05, "loss": 4.4485, "step": 45 }, { "epoch": 0.029940119760479042, "grad_norm": 1.977081537246704, "learning_rate": 5.988023952095808e-05, "loss": 4.3569, "step": 50 }, { "epoch": 0.03293413173652695, "grad_norm": 2.1440491676330566, "learning_rate": 6.58682634730539e-05, "loss": 4.2673, "step": 55 }, { "epoch": 0.03592814371257485, "grad_norm": 1.845775842666626, "learning_rate": 7.18562874251497e-05, "loss": 4.1614, "step": 60 }, { "epoch": 0.038922155688622756, "grad_norm": 2.0755035877227783, "learning_rate": 7.784431137724552e-05, "loss": 4.1784, "step": 65 }, { "epoch": 0.041916167664670656, "grad_norm": 2.0788209438323975, "learning_rate": 8.383233532934131e-05, "loss": 4.0587, "step": 70 }, { "epoch": 0.04491017964071856, "grad_norm": 2.0177366733551025, "learning_rate": 8.982035928143712e-05, "loss": 4.0414, "step": 75 }, { "epoch": 0.04790419161676647, "grad_norm": 1.9424874782562256, "learning_rate": 9.580838323353294e-05, "loss": 3.9817, "step": 80 }, { "epoch": 0.05089820359281437, "grad_norm": 2.1245999336242676, "learning_rate": 0.00010179640718562875, "loss": 3.8902, "step": 85 }, { "epoch": 0.05389221556886228, "grad_norm": 1.7750043869018555, "learning_rate": 0.00010778443113772456, "loss": 3.8991, "step": 90 }, { "epoch": 0.05688622754491018, "grad_norm": 1.888543725013733, "learning_rate": 0.00011377245508982037, "loss": 3.8088, "step": 95 }, { "epoch": 0.059880239520958084, "grad_norm": 1.696926236152649, "learning_rate": 0.00011976047904191617, "loss": 3.814, "step": 100 }, { "epoch": 0.06287425149700598, "grad_norm": 1.7164498567581177, "learning_rate": 0.00012574850299401196, "loss": 3.7896, "step": 105 }, { "epoch": 0.0658682634730539, "grad_norm": 1.924892544746399, "learning_rate": 0.0001317365269461078, "loss": 3.733, "step": 110 }, { "epoch": 0.0688622754491018, "grad_norm": 2.0697178840637207, "learning_rate": 0.00013772455089820359, "loss": 3.7365, "step": 115 }, { "epoch": 0.0718562874251497, "grad_norm": 1.8498303890228271, "learning_rate": 0.0001437125748502994, "loss": 3.6889, "step": 120 }, { "epoch": 0.0748502994011976, "grad_norm": 1.799263596534729, "learning_rate": 0.0001497005988023952, "loss": 3.6785, "step": 125 }, { "epoch": 0.07784431137724551, "grad_norm": 1.8041021823883057, "learning_rate": 0.00015568862275449103, "loss": 3.6193, "step": 130 }, { "epoch": 0.08083832335329341, "grad_norm": 2.032904863357544, "learning_rate": 0.00016167664670658683, "loss": 3.5986, "step": 135 }, { "epoch": 0.08383233532934131, "grad_norm": 2.044524908065796, "learning_rate": 0.00016766467065868263, "loss": 3.549, "step": 140 }, { "epoch": 0.08682634730538923, "grad_norm": 2.168320655822754, "learning_rate": 0.00017365269461077845, "loss": 3.5249, "step": 145 }, { "epoch": 0.08982035928143713, "grad_norm": 1.8781085014343262, "learning_rate": 0.00017964071856287425, "loss": 3.5475, "step": 150 }, { "epoch": 0.09281437125748503, "grad_norm": 1.6960855722427368, "learning_rate": 0.00018562874251497007, "loss": 3.5204, "step": 155 }, { "epoch": 0.09580838323353294, "grad_norm": 1.9243298768997192, "learning_rate": 0.00019161676646706587, "loss": 3.5206, "step": 160 }, { "epoch": 0.09880239520958084, "grad_norm": 2.000701904296875, "learning_rate": 0.0001976047904191617, "loss": 3.5163, "step": 165 }, { "epoch": 0.10179640718562874, "grad_norm": 1.6321606636047363, "learning_rate": 0.00019999803395762152, "loss": 3.4382, "step": 170 }, { "epoch": 0.10479041916167664, "grad_norm": 1.5973315238952637, "learning_rate": 0.00019998601953415373, "loss": 3.4415, "step": 175 }, { "epoch": 0.10778443113772455, "grad_norm": 1.6465246677398682, "learning_rate": 0.00019996308424365594, "loss": 3.4091, "step": 180 }, { "epoch": 0.11077844311377245, "grad_norm": 1.6946362257003784, "learning_rate": 0.00019992923059121106, "loss": 3.3752, "step": 185 }, { "epoch": 0.11377245508982035, "grad_norm": 1.602065086364746, "learning_rate": 0.0001998844622744483, "loss": 3.415, "step": 190 }, { "epoch": 0.11676646706586827, "grad_norm": 1.8944158554077148, "learning_rate": 0.0001998287841831396, "loss": 3.3535, "step": 195 }, { "epoch": 0.11976047904191617, "grad_norm": 2.0829012393951416, "learning_rate": 0.00019976220239866562, "loss": 3.3339, "step": 200 }, { "epoch": 0.12275449101796407, "grad_norm": 1.8381649255752563, "learning_rate": 0.00019968472419335106, "loss": 3.3609, "step": 205 }, { "epoch": 0.12574850299401197, "grad_norm": 1.864918828010559, "learning_rate": 0.00019959635802967087, "loss": 3.3285, "step": 210 }, { "epoch": 0.12874251497005987, "grad_norm": 2.0552523136138916, "learning_rate": 0.00019949711355932566, "loss": 3.3141, "step": 215 }, { "epoch": 0.1317365269461078, "grad_norm": 1.4688948392868042, "learning_rate": 0.0001993870016221875, "loss": 3.3043, "step": 220 }, { "epoch": 0.1347305389221557, "grad_norm": 1.7277065515518188, "learning_rate": 0.000199266034245116, "loss": 3.3307, "step": 225 }, { "epoch": 0.1377245508982036, "grad_norm": 1.6926337480545044, "learning_rate": 0.0001991342246406448, "loss": 3.302, "step": 230 }, { "epoch": 0.1407185628742515, "grad_norm": 1.7383888959884644, "learning_rate": 0.00019899158720553824, "loss": 3.2949, "step": 235 }, { "epoch": 0.1437125748502994, "grad_norm": 1.4267903566360474, "learning_rate": 0.00019883813751921903, "loss": 3.2932, "step": 240 }, { "epoch": 0.1467065868263473, "grad_norm": 1.4597861766815186, "learning_rate": 0.00019867389234206654, "loss": 3.2629, "step": 245 }, { "epoch": 0.1497005988023952, "grad_norm": 1.6646181344985962, "learning_rate": 0.00019849886961358621, "loss": 3.2334, "step": 250 }, { "epoch": 0.15269461077844312, "grad_norm": 1.46786367893219, "learning_rate": 0.0001983130884504501, "loss": 3.2368, "step": 255 }, { "epoch": 0.15568862275449102, "grad_norm": 1.3595575094223022, "learning_rate": 0.00019811656914440885, "loss": 3.2124, "step": 260 }, { "epoch": 0.15868263473053892, "grad_norm": 1.7106595039367676, "learning_rate": 0.0001979093331600754, "loss": 3.2399, "step": 265 }, { "epoch": 0.16167664670658682, "grad_norm": 1.7180885076522827, "learning_rate": 0.0001976914031325806, "loss": 3.1848, "step": 270 }, { "epoch": 0.16467065868263472, "grad_norm": 1.5397311449050903, "learning_rate": 0.0001974628028651007, "loss": 3.1918, "step": 275 }, { "epoch": 0.16766467065868262, "grad_norm": 1.2814067602157593, "learning_rate": 0.00019722355732625774, "loss": 3.1661, "step": 280 }, { "epoch": 0.17065868263473055, "grad_norm": 1.3572536706924438, "learning_rate": 0.0001969736926473921, "loss": 3.2363, "step": 285 }, { "epoch": 0.17365269461077845, "grad_norm": 1.1298632621765137, "learning_rate": 0.0001967132361197086, "loss": 3.1441, "step": 290 }, { "epoch": 0.17664670658682635, "grad_norm": 1.3632957935333252, "learning_rate": 0.00019644221619129548, "loss": 3.1402, "step": 295 }, { "epoch": 0.17964071856287425, "grad_norm": 1.4946106672286987, "learning_rate": 0.00019616066246401717, "loss": 3.1352, "step": 300 }, { "epoch": 0.18263473053892215, "grad_norm": 1.3740743398666382, "learning_rate": 0.00019586860569028124, "loss": 3.1609, "step": 305 }, { "epoch": 0.18562874251497005, "grad_norm": 1.359784483909607, "learning_rate": 0.0001955660777696793, "loss": 3.1191, "step": 310 }, { "epoch": 0.18862275449101795, "grad_norm": 1.0663578510284424, "learning_rate": 0.00019525311174550285, "loss": 3.1508, "step": 315 }, { "epoch": 0.19161676646706588, "grad_norm": 1.2853511571884155, "learning_rate": 0.00019492974180113426, "loss": 3.1197, "step": 320 }, { "epoch": 0.19461077844311378, "grad_norm": 1.2358075380325317, "learning_rate": 0.00019459600325631303, "loss": 3.0737, "step": 325 }, { "epoch": 0.19760479041916168, "grad_norm": 1.2823344469070435, "learning_rate": 0.0001942519325632781, "loss": 3.1486, "step": 330 }, { "epoch": 0.20059880239520958, "grad_norm": 1.0905367136001587, "learning_rate": 0.00019389756730278627, "loss": 3.1254, "step": 335 }, { "epoch": 0.20359281437125748, "grad_norm": 1.1666399240493774, "learning_rate": 0.00019353294618000758, "loss": 3.0982, "step": 340 }, { "epoch": 0.20658682634730538, "grad_norm": 1.352597951889038, "learning_rate": 0.00019315810902029786, "loss": 3.1079, "step": 345 }, { "epoch": 0.20958083832335328, "grad_norm": 1.5166900157928467, "learning_rate": 0.00019277309676484858, "loss": 3.1142, "step": 350 }, { "epoch": 0.2125748502994012, "grad_norm": 1.1076050996780396, "learning_rate": 0.0001923779514662154, "loss": 3.0817, "step": 355 }, { "epoch": 0.2155688622754491, "grad_norm": 1.1077985763549805, "learning_rate": 0.00019197271628372482, "loss": 3.0804, "step": 360 }, { "epoch": 0.218562874251497, "grad_norm": 1.0608789920806885, "learning_rate": 0.00019155743547876023, "loss": 3.0823, "step": 365 }, { "epoch": 0.2215568862275449, "grad_norm": 1.2678635120391846, "learning_rate": 0.00019113215440992752, "loss": 3.0845, "step": 370 }, { "epoch": 0.2245508982035928, "grad_norm": 1.1210081577301025, "learning_rate": 0.0001906969195281007, "loss": 3.066, "step": 375 }, { "epoch": 0.2275449101796407, "grad_norm": 1.3343095779418945, "learning_rate": 0.00019025177837134858, "loss": 3.0512, "step": 380 }, { "epoch": 0.23053892215568864, "grad_norm": 1.1427243947982788, "learning_rate": 0.00018979677955974228, "loss": 3.073, "step": 385 }, { "epoch": 0.23353293413173654, "grad_norm": 1.2000013589859009, "learning_rate": 0.0001893319727900448, "loss": 3.0551, "step": 390 }, { "epoch": 0.23652694610778444, "grad_norm": 1.0942010879516602, "learning_rate": 0.0001888574088302831, "loss": 3.0414, "step": 395 }, { "epoch": 0.23952095808383234, "grad_norm": 0.9435099363327026, "learning_rate": 0.00018837313951420272, "loss": 3.0231, "step": 400 }, { "epoch": 0.24251497005988024, "grad_norm": 1.0394734144210815, "learning_rate": 0.00018787921773560657, "loss": 3.0175, "step": 405 }, { "epoch": 0.24550898203592814, "grad_norm": 0.9441319704055786, "learning_rate": 0.00018737569744257756, "loss": 3.0385, "step": 410 }, { "epoch": 0.24850299401197604, "grad_norm": 1.020687460899353, "learning_rate": 0.00018686263363158602, "loss": 3.0505, "step": 415 }, { "epoch": 0.25149700598802394, "grad_norm": 0.9998067617416382, "learning_rate": 0.0001863400823414831, "loss": 3.0358, "step": 420 }, { "epoch": 0.25449101796407186, "grad_norm": 1.0903741121292114, "learning_rate": 0.00018580810064737965, "loss": 3.0195, "step": 425 }, { "epoch": 0.25748502994011974, "grad_norm": 1.0015404224395752, "learning_rate": 0.00018526674665441257, "loss": 3.0362, "step": 430 }, { "epoch": 0.26047904191616766, "grad_norm": 0.9371241927146912, "learning_rate": 0.00018471607949139803, "loss": 3.0123, "step": 435 }, { "epoch": 0.2634730538922156, "grad_norm": 0.9184331893920898, "learning_rate": 0.00018415615930437337, "loss": 3.0075, "step": 440 }, { "epoch": 0.26646706586826346, "grad_norm": 1.0484634637832642, "learning_rate": 0.00018358704725002768, "loss": 3.0029, "step": 445 }, { "epoch": 0.2694610778443114, "grad_norm": 1.0423493385314941, "learning_rate": 0.000183008805489022, "loss": 2.9913, "step": 450 }, { "epoch": 0.27245508982035926, "grad_norm": 1.2088091373443604, "learning_rate": 0.00018242149717919993, "loss": 2.9834, "step": 455 }, { "epoch": 0.2754491017964072, "grad_norm": 0.8996514678001404, "learning_rate": 0.0001818251864686893, "loss": 2.972, "step": 460 }, { "epoch": 0.27844311377245506, "grad_norm": 1.014440894126892, "learning_rate": 0.00018121993848889552, "loss": 2.9687, "step": 465 }, { "epoch": 0.281437125748503, "grad_norm": 0.8995974063873291, "learning_rate": 0.00018060581934738784, "loss": 2.9317, "step": 470 }, { "epoch": 0.2844311377245509, "grad_norm": 1.0253899097442627, "learning_rate": 0.00017998289612067864, "loss": 2.9695, "step": 475 }, { "epoch": 0.2874251497005988, "grad_norm": 0.954319179058075, "learning_rate": 0.00017935123684689733, "loss": 2.9884, "step": 480 }, { "epoch": 0.2904191616766467, "grad_norm": 0.8713873028755188, "learning_rate": 0.00017871091051835874, "loss": 2.9689, "step": 485 }, { "epoch": 0.2934131736526946, "grad_norm": 0.9465665817260742, "learning_rate": 0.00017806198707402752, "loss": 2.9073, "step": 490 }, { "epoch": 0.2964071856287425, "grad_norm": 0.9693202972412109, "learning_rate": 0.00017740453739187922, "loss": 2.953, "step": 495 }, { "epoch": 0.2994011976047904, "grad_norm": 0.9308465719223022, "learning_rate": 0.0001767386332811587, "loss": 2.9439, "step": 500 }, { "epoch": 0.3023952095808383, "grad_norm": 0.8614184260368347, "learning_rate": 0.0001760643474745368, "loss": 2.9567, "step": 505 }, { "epoch": 0.30538922155688625, "grad_norm": 0.913870632648468, "learning_rate": 0.00017538175362016622, "loss": 2.9447, "step": 510 }, { "epoch": 0.3083832335329341, "grad_norm": 0.8058397769927979, "learning_rate": 0.00017469092627363738, "loss": 2.9677, "step": 515 }, { "epoch": 0.31137724550898205, "grad_norm": 0.7762190699577332, "learning_rate": 0.00017399194088983511, "loss": 2.9242, "step": 520 }, { "epoch": 0.3143712574850299, "grad_norm": 0.8444433808326721, "learning_rate": 0.0001732848738146973, "loss": 2.9612, "step": 525 }, { "epoch": 0.31736526946107785, "grad_norm": 0.7365703582763672, "learning_rate": 0.00017256980227687595, "loss": 2.9629, "step": 530 }, { "epoch": 0.3203592814371258, "grad_norm": 0.841096818447113, "learning_rate": 0.00017184680437930198, "loss": 2.9349, "step": 535 }, { "epoch": 0.32335329341317365, "grad_norm": 0.7778092622756958, "learning_rate": 0.00017111595909065466, "loss": 2.9147, "step": 540 }, { "epoch": 0.3263473053892216, "grad_norm": 0.8155597448348999, "learning_rate": 0.00017037734623673615, "loss": 2.9255, "step": 545 }, { "epoch": 0.32934131736526945, "grad_norm": 0.693092942237854, "learning_rate": 0.00016963104649175272, "loss": 2.902, "step": 550 }, { "epoch": 0.3323353293413174, "grad_norm": 0.8276283144950867, "learning_rate": 0.0001688771413695032, "loss": 2.9414, "step": 555 }, { "epoch": 0.33532934131736525, "grad_norm": 0.8656878471374512, "learning_rate": 0.00016811571321447566, "loss": 2.9105, "step": 560 }, { "epoch": 0.3383233532934132, "grad_norm": 0.713503897190094, "learning_rate": 0.00016734684519285344, "loss": 2.9138, "step": 565 }, { "epoch": 0.3413173652694611, "grad_norm": 0.7443877458572388, "learning_rate": 0.00016657062128343144, "loss": 2.8801, "step": 570 }, { "epoch": 0.344311377245509, "grad_norm": 0.8257678151130676, "learning_rate": 0.00016578712626844365, "loss": 2.8803, "step": 575 }, { "epoch": 0.3473053892215569, "grad_norm": 0.8291301131248474, "learning_rate": 0.00016499644572430278, "loss": 2.9076, "step": 580 }, { "epoch": 0.3502994011976048, "grad_norm": 0.738528311252594, "learning_rate": 0.0001641986660122534, "loss": 2.8978, "step": 585 }, { "epoch": 0.3532934131736527, "grad_norm": 0.7673987150192261, "learning_rate": 0.00016339387426893918, "loss": 2.9111, "step": 590 }, { "epoch": 0.3562874251497006, "grad_norm": 0.8196045756340027, "learning_rate": 0.0001625821583968855, "loss": 2.8467, "step": 595 }, { "epoch": 0.3592814371257485, "grad_norm": 0.7220514416694641, "learning_rate": 0.00016176360705489823, "loss": 2.8808, "step": 600 }, { "epoch": 0.36227544910179643, "grad_norm": 0.8424475193023682, "learning_rate": 0.00016093830964838035, "loss": 2.858, "step": 605 }, { "epoch": 0.3652694610778443, "grad_norm": 0.7498836517333984, "learning_rate": 0.00016010635631956652, "loss": 2.8987, "step": 610 }, { "epoch": 0.36826347305389223, "grad_norm": 0.7145126461982727, "learning_rate": 0.0001592678379376775, "loss": 2.8769, "step": 615 }, { "epoch": 0.3712574850299401, "grad_norm": 0.6803230047225952, "learning_rate": 0.0001584228460889949, "loss": 2.8812, "step": 620 }, { "epoch": 0.37425149700598803, "grad_norm": 0.6997132897377014, "learning_rate": 0.00015757147306685808, "loss": 2.8886, "step": 625 }, { "epoch": 0.3772455089820359, "grad_norm": 0.7119357585906982, "learning_rate": 0.00015671381186158312, "loss": 2.9011, "step": 630 }, { "epoch": 0.38023952095808383, "grad_norm": 0.8835166096687317, "learning_rate": 0.00015584995615030634, "loss": 2.8876, "step": 635 }, { "epoch": 0.38323353293413176, "grad_norm": 0.7940571904182434, "learning_rate": 0.0001549800002867524, "loss": 2.8739, "step": 640 }, { "epoch": 0.38622754491017963, "grad_norm": 0.7138169407844543, "learning_rate": 0.00015410403929092857, "loss": 2.869, "step": 645 }, { "epoch": 0.38922155688622756, "grad_norm": 0.711258053779602, "learning_rate": 0.00015322216883874643, "loss": 2.878, "step": 650 }, { "epoch": 0.39221556886227543, "grad_norm": 0.730831503868103, "learning_rate": 0.0001523344852515716, "loss": 2.8573, "step": 655 }, { "epoch": 0.39520958083832336, "grad_norm": 0.7164433598518372, "learning_rate": 0.00015144108548570322, "loss": 2.8375, "step": 660 }, { "epoch": 0.39820359281437123, "grad_norm": 0.7521623969078064, "learning_rate": 0.000150542067121784, "loss": 2.8714, "step": 665 }, { "epoch": 0.40119760479041916, "grad_norm": 0.6829126477241516, "learning_rate": 0.00014963752835414203, "loss": 2.8614, "step": 670 }, { "epoch": 0.4041916167664671, "grad_norm": 0.6588369011878967, "learning_rate": 0.00014872756798006576, "loss": 2.8498, "step": 675 }, { "epoch": 0.40718562874251496, "grad_norm": 0.670486569404602, "learning_rate": 0.00014781228538901267, "loss": 2.853, "step": 680 }, { "epoch": 0.4101796407185629, "grad_norm": 0.6251630783081055, "learning_rate": 0.00014689178055175394, "loss": 2.8709, "step": 685 }, { "epoch": 0.41317365269461076, "grad_norm": 0.6421228647232056, "learning_rate": 0.00014596615400945496, "loss": 2.8585, "step": 690 }, { "epoch": 0.4161676646706587, "grad_norm": 0.7128710746765137, "learning_rate": 0.0001450355068626939, "loss": 2.8333, "step": 695 }, { "epoch": 0.41916167664670656, "grad_norm": 0.7639265060424805, "learning_rate": 0.0001440999407604192, "loss": 2.8495, "step": 700 }, { "epoch": 0.4221556886227545, "grad_norm": 0.7937913537025452, "learning_rate": 0.00014315955788884698, "loss": 2.8667, "step": 705 }, { "epoch": 0.4251497005988024, "grad_norm": 0.7219135761260986, "learning_rate": 0.00014221446096029992, "loss": 2.8137, "step": 710 }, { "epoch": 0.4281437125748503, "grad_norm": 0.6893067359924316, "learning_rate": 0.00014126475320198843, "loss": 2.8147, "step": 715 }, { "epoch": 0.4311377245508982, "grad_norm": 0.7299759387969971, "learning_rate": 0.00014031053834473613, "loss": 2.8157, "step": 720 }, { "epoch": 0.4341317365269461, "grad_norm": 0.6203035116195679, "learning_rate": 0.00013935192061164956, "loss": 2.842, "step": 725 }, { "epoch": 0.437125748502994, "grad_norm": 0.67593914270401, "learning_rate": 0.0001383890047067348, "loss": 2.8281, "step": 730 }, { "epoch": 0.44011976047904194, "grad_norm": 0.629625678062439, "learning_rate": 0.0001374218958034612, "loss": 2.8321, "step": 735 }, { "epoch": 0.4431137724550898, "grad_norm": 0.7435698509216309, "learning_rate": 0.0001364506995332739, "loss": 2.8069, "step": 740 }, { "epoch": 0.44610778443113774, "grad_norm": 0.7014979124069214, "learning_rate": 0.00013547552197405632, "loss": 2.8271, "step": 745 }, { "epoch": 0.4491017964071856, "grad_norm": 0.6280860900878906, "learning_rate": 0.00013449646963854396, "loss": 2.7851, "step": 750 }, { "epoch": 0.45209580838323354, "grad_norm": 0.6429287195205688, "learning_rate": 0.00013351364946269072, "loss": 2.8342, "step": 755 }, { "epoch": 0.4550898203592814, "grad_norm": 0.6879150867462158, "learning_rate": 0.00013252716879398884, "loss": 2.8017, "step": 760 }, { "epoch": 0.45808383233532934, "grad_norm": 0.6519134044647217, "learning_rate": 0.00013153713537974394, "loss": 2.8272, "step": 765 }, { "epoch": 0.46107784431137727, "grad_norm": 0.7050421833992004, "learning_rate": 0.00013054365735530664, "loss": 2.8046, "step": 770 }, { "epoch": 0.46407185628742514, "grad_norm": 0.6754117012023926, "learning_rate": 0.00012954684323226136, "loss": 2.7996, "step": 775 }, { "epoch": 0.46706586826347307, "grad_norm": 0.6119791269302368, "learning_rate": 0.00012854680188657437, "loss": 2.825, "step": 780 }, { "epoch": 0.47005988023952094, "grad_norm": 0.7737504839897156, "learning_rate": 0.00012754364254670192, "loss": 2.8229, "step": 785 }, { "epoch": 0.47305389221556887, "grad_norm": 0.6356379985809326, "learning_rate": 0.00012653747478165987, "loss": 2.7901, "step": 790 }, { "epoch": 0.47604790419161674, "grad_norm": 0.679063081741333, "learning_rate": 0.0001255284084890562, "loss": 2.7847, "step": 795 }, { "epoch": 0.47904191616766467, "grad_norm": 0.6137480139732361, "learning_rate": 0.0001245165538830873, "loss": 2.7921, "step": 800 }, { "epoch": 0.4820359281437126, "grad_norm": 0.6218224167823792, "learning_rate": 0.00012350202148250037, "loss": 2.8368, "step": 805 }, { "epoch": 0.48502994011976047, "grad_norm": 0.6735543012619019, "learning_rate": 0.0001224849220985218, "loss": 2.7808, "step": 810 }, { "epoch": 0.4880239520958084, "grad_norm": 0.5928522944450378, "learning_rate": 0.00012146536682275387, "loss": 2.7922, "step": 815 }, { "epoch": 0.49101796407185627, "grad_norm": 0.640355110168457, "learning_rate": 0.00012044346701504128, "loss": 2.8388, "step": 820 }, { "epoch": 0.4940119760479042, "grad_norm": 0.5893163084983826, "learning_rate": 0.00011941933429130758, "loss": 2.8005, "step": 825 }, { "epoch": 0.49700598802395207, "grad_norm": 0.5506909489631653, "learning_rate": 0.0001183930805113643, "loss": 2.8018, "step": 830 }, { "epoch": 0.5, "grad_norm": 0.6417153477668762, "learning_rate": 0.00011736481776669306, "loss": 2.7829, "step": 835 }, { "epoch": 0.5029940119760479, "grad_norm": 0.6764044761657715, "learning_rate": 0.00011633465836820243, "loss": 2.7853, "step": 840 }, { "epoch": 0.5059880239520959, "grad_norm": 0.6103654503822327, "learning_rate": 0.00011530271483396115, "loss": 2.7922, "step": 845 }, { "epoch": 0.5089820359281437, "grad_norm": 0.6200462579727173, "learning_rate": 0.00011426909987690819, "loss": 2.798, "step": 850 }, { "epoch": 0.5119760479041916, "grad_norm": 0.556036114692688, "learning_rate": 0.00011323392639254193, "loss": 2.823, "step": 855 }, { "epoch": 0.5149700598802395, "grad_norm": 0.5561334490776062, "learning_rate": 0.00011219730744658921, "loss": 2.7946, "step": 860 }, { "epoch": 0.5179640718562875, "grad_norm": 0.6291623711585999, "learning_rate": 0.00011115935626265594, "loss": 2.769, "step": 865 }, { "epoch": 0.5209580838323353, "grad_norm": 0.622279703617096, "learning_rate": 0.00011012018620986028, "loss": 2.7905, "step": 870 }, { "epoch": 0.5239520958083832, "grad_norm": 0.5651618242263794, "learning_rate": 0.00010907991079045006, "loss": 2.7611, "step": 875 }, { "epoch": 0.5269461077844312, "grad_norm": 0.5883045792579651, "learning_rate": 0.00010803864362740562, "loss": 2.7853, "step": 880 }, { "epoch": 0.5299401197604791, "grad_norm": 0.5984832048416138, "learning_rate": 0.00010699649845202934, "loss": 2.7686, "step": 885 }, { "epoch": 0.5329341317365269, "grad_norm": 0.631034791469574, "learning_rate": 0.00010595358909152378, "loss": 2.7649, "step": 890 }, { "epoch": 0.5359281437125748, "grad_norm": 0.5855838656425476, "learning_rate": 0.00010491002945655861, "loss": 2.7972, "step": 895 }, { "epoch": 0.5389221556886228, "grad_norm": 0.644307017326355, "learning_rate": 0.00010386593352882909, "loss": 2.7894, "step": 900 }, { "epoch": 0.5419161676646707, "grad_norm": 0.6458478569984436, "learning_rate": 0.0001028214153486066, "loss": 2.7875, "step": 905 }, { "epoch": 0.5449101796407185, "grad_norm": 0.6464332342147827, "learning_rate": 0.00010177658900228249, "loss": 2.7967, "step": 910 }, { "epoch": 0.5479041916167665, "grad_norm": 0.564663827419281, "learning_rate": 0.0001007315686099072, "loss": 2.8149, "step": 915 }, { "epoch": 0.5508982035928144, "grad_norm": 0.669620931148529, "learning_rate": 9.96864683127257e-05, "loss": 2.773, "step": 920 }, { "epoch": 0.5538922155688623, "grad_norm": 0.6294126510620117, "learning_rate": 9.864140226071053e-05, "loss": 2.7909, "step": 925 }, { "epoch": 0.5568862275449101, "grad_norm": 0.5996381044387817, "learning_rate": 9.759648460009376e-05, "loss": 2.7736, "step": 930 }, { "epoch": 0.5598802395209581, "grad_norm": 0.6062589287757874, "learning_rate": 9.655182946089956e-05, "loss": 2.7693, "step": 935 }, { "epoch": 0.562874251497006, "grad_norm": 0.6130212545394897, "learning_rate": 9.550755094447848e-05, "loss": 2.7422, "step": 940 }, { "epoch": 0.5658682634730539, "grad_norm": 0.6118180155754089, "learning_rate": 9.446376311104494e-05, "loss": 2.7847, "step": 945 }, { "epoch": 0.5688622754491018, "grad_norm": 0.6395862698554993, "learning_rate": 9.342057996721894e-05, "loss": 2.7557, "step": 950 }, { "epoch": 0.5718562874251497, "grad_norm": 0.5906466841697693, "learning_rate": 9.237811545357392e-05, "loss": 2.7821, "step": 955 }, { "epoch": 0.5748502994011976, "grad_norm": 0.6405777335166931, "learning_rate": 9.133648343219168e-05, "loss": 2.7561, "step": 960 }, { "epoch": 0.5778443113772455, "grad_norm": 0.537714421749115, "learning_rate": 9.029579767422592e-05, "loss": 2.7624, "step": 965 }, { "epoch": 0.5808383233532934, "grad_norm": 0.5861092805862427, "learning_rate": 8.925617184747584e-05, "loss": 2.7885, "step": 970 }, { "epoch": 0.5838323353293413, "grad_norm": 0.5366087555885315, "learning_rate": 8.821771950397066e-05, "loss": 2.7717, "step": 975 }, { "epoch": 0.5868263473053892, "grad_norm": 0.5149749517440796, "learning_rate": 8.718055406756714e-05, "loss": 2.7722, "step": 980 }, { "epoch": 0.5898203592814372, "grad_norm": 0.6070750951766968, "learning_rate": 8.614478882156103e-05, "loss": 2.7747, "step": 985 }, { "epoch": 0.592814371257485, "grad_norm": 0.5566183924674988, "learning_rate": 8.51105368963137e-05, "loss": 2.7363, "step": 990 }, { "epoch": 0.5958083832335329, "grad_norm": 0.5821384191513062, "learning_rate": 8.407791125689578e-05, "loss": 2.7286, "step": 995 }, { "epoch": 0.5988023952095808, "grad_norm": 0.5972596406936646, "learning_rate": 8.30470246907484e-05, "loss": 2.8008, "step": 1000 }, { "epoch": 0.6017964071856288, "grad_norm": 0.5653948187828064, "learning_rate": 8.201798979536437e-05, "loss": 2.7381, "step": 1005 }, { "epoch": 0.6047904191616766, "grad_norm": 0.5987522602081299, "learning_rate": 8.099091896598964e-05, "loss": 2.7171, "step": 1010 }, { "epoch": 0.6077844311377245, "grad_norm": 0.6162002086639404, "learning_rate": 7.996592438334728e-05, "loss": 2.7595, "step": 1015 }, { "epoch": 0.6107784431137725, "grad_norm": 0.5301627516746521, "learning_rate": 7.894311800138432e-05, "loss": 2.7199, "step": 1020 }, { "epoch": 0.6137724550898204, "grad_norm": 0.5902780890464783, "learning_rate": 7.792261153504402e-05, "loss": 2.7465, "step": 1025 }, { "epoch": 0.6167664670658682, "grad_norm": 0.5662907958030701, "learning_rate": 7.690451644806372e-05, "loss": 2.7463, "step": 1030 }, { "epoch": 0.6197604790419161, "grad_norm": 0.5153141617774963, "learning_rate": 7.588894394080045e-05, "loss": 2.7497, "step": 1035 }, { "epoch": 0.6227544910179641, "grad_norm": 0.5608941316604614, "learning_rate": 7.487600493808513e-05, "loss": 2.7587, "step": 1040 }, { "epoch": 0.625748502994012, "grad_norm": 0.5900591015815735, "learning_rate": 7.386581007710693e-05, "loss": 2.721, "step": 1045 }, { "epoch": 0.6287425149700598, "grad_norm": 0.5923062562942505, "learning_rate": 7.285846969532907e-05, "loss": 2.7245, "step": 1050 }, { "epoch": 0.6317365269461078, "grad_norm": 0.5622250437736511, "learning_rate": 7.185409381843727e-05, "loss": 2.7514, "step": 1055 }, { "epoch": 0.6347305389221557, "grad_norm": 0.5878713130950928, "learning_rate": 7.085279214832233e-05, "loss": 2.7361, "step": 1060 }, { "epoch": 0.6377245508982036, "grad_norm": 0.5391808748245239, "learning_rate": 6.985467405109815e-05, "loss": 2.7708, "step": 1065 }, { "epoch": 0.6407185628742516, "grad_norm": 0.5266147255897522, "learning_rate": 6.885984854515623e-05, "loss": 2.739, "step": 1070 }, { "epoch": 0.6437125748502994, "grad_norm": 0.5293008089065552, "learning_rate": 6.786842428925821e-05, "loss": 2.7338, "step": 1075 }, { "epoch": 0.6467065868263473, "grad_norm": 0.5374533534049988, "learning_rate": 6.688050957066787e-05, "loss": 2.7521, "step": 1080 }, { "epoch": 0.6497005988023952, "grad_norm": 0.519822895526886, "learning_rate": 6.58962122933234e-05, "loss": 2.7509, "step": 1085 }, { "epoch": 0.6526946107784432, "grad_norm": 0.4745498597621918, "learning_rate": 6.491563996605198e-05, "loss": 2.6944, "step": 1090 }, { "epoch": 0.655688622754491, "grad_norm": 0.5428311228752136, "learning_rate": 6.393889969082691e-05, "loss": 2.746, "step": 1095 }, { "epoch": 0.6586826347305389, "grad_norm": 0.6130102276802063, "learning_rate": 6.29660981510697e-05, "loss": 2.7142, "step": 1100 }, { "epoch": 0.6616766467065869, "grad_norm": 0.565981388092041, "learning_rate": 6.199734159999769e-05, "loss": 2.7377, "step": 1105 }, { "epoch": 0.6646706586826348, "grad_norm": 0.5873706340789795, "learning_rate": 6.103273584901856e-05, "loss": 2.7707, "step": 1110 }, { "epoch": 0.6676646706586826, "grad_norm": 0.6199204325675964, "learning_rate": 6.007238625617333e-05, "loss": 2.7288, "step": 1115 }, { "epoch": 0.6706586826347305, "grad_norm": 0.5575565695762634, "learning_rate": 5.911639771462858e-05, "loss": 2.7454, "step": 1120 }, { "epoch": 0.6736526946107785, "grad_norm": 0.5518234372138977, "learning_rate": 5.8164874641219735e-05, "loss": 2.7345, "step": 1125 }, { "epoch": 0.6766467065868264, "grad_norm": 0.5617517828941345, "learning_rate": 5.721792096504611e-05, "loss": 2.756, "step": 1130 }, { "epoch": 0.6796407185628742, "grad_norm": 0.5457236170768738, "learning_rate": 5.627564011611961e-05, "loss": 2.7492, "step": 1135 }, { "epoch": 0.6826347305389222, "grad_norm": 0.5588122606277466, "learning_rate": 5.5338135014067395e-05, "loss": 2.7385, "step": 1140 }, { "epoch": 0.6856287425149701, "grad_norm": 0.5922512412071228, "learning_rate": 5.440550805689075e-05, "loss": 2.7128, "step": 1145 }, { "epoch": 0.688622754491018, "grad_norm": 0.5327529311180115, "learning_rate": 5.3477861109780835e-05, "loss": 2.7507, "step": 1150 }, { "epoch": 0.6916167664670658, "grad_norm": 0.5234887003898621, "learning_rate": 5.255529549399234e-05, "loss": 2.77, "step": 1155 }, { "epoch": 0.6946107784431138, "grad_norm": 0.5305109024047852, "learning_rate": 5.163791197577714e-05, "loss": 2.7765, "step": 1160 }, { "epoch": 0.6976047904191617, "grad_norm": 0.5282033085823059, "learning_rate": 5.0725810755377825e-05, "loss": 2.7518, "step": 1165 }, { "epoch": 0.7005988023952096, "grad_norm": 0.5210415124893188, "learning_rate": 4.9819091456083644e-05, "loss": 2.7493, "step": 1170 }, { "epoch": 0.7035928143712575, "grad_norm": 0.560530424118042, "learning_rate": 4.891785311334923e-05, "loss": 2.7429, "step": 1175 }, { "epoch": 0.7065868263473054, "grad_norm": 0.5484561324119568, "learning_rate": 4.8022194163977494e-05, "loss": 2.7131, "step": 1180 }, { "epoch": 0.7095808383233533, "grad_norm": 0.5590568780899048, "learning_rate": 4.713221243536816e-05, "loss": 2.715, "step": 1185 }, { "epoch": 0.7125748502994012, "grad_norm": 0.5286269783973694, "learning_rate": 4.6248005134832394e-05, "loss": 2.7006, "step": 1190 }, { "epoch": 0.7155688622754491, "grad_norm": 0.5053289532661438, "learning_rate": 4.5369668838975597e-05, "loss": 2.7375, "step": 1195 }, { "epoch": 0.718562874251497, "grad_norm": 0.5476385354995728, "learning_rate": 4.449729948314894e-05, "loss": 2.7402, "step": 1200 }, { "epoch": 0.7215568862275449, "grad_norm": 0.5178956985473633, "learning_rate": 4.363099235097087e-05, "loss": 2.7, "step": 1205 }, { "epoch": 0.7245508982035929, "grad_norm": 0.5388510227203369, "learning_rate": 4.277084206391989e-05, "loss": 2.7014, "step": 1210 }, { "epoch": 0.7275449101796407, "grad_norm": 0.510368824005127, "learning_rate": 4.191694257099962e-05, "loss": 2.7267, "step": 1215 }, { "epoch": 0.7305389221556886, "grad_norm": 0.563225269317627, "learning_rate": 4.10693871384773e-05, "loss": 2.7277, "step": 1220 }, { "epoch": 0.7335329341317365, "grad_norm": 0.5411152243614197, "learning_rate": 4.022826833969692e-05, "loss": 2.7442, "step": 1225 }, { "epoch": 0.7365269461077845, "grad_norm": 0.5123066902160645, "learning_rate": 3.93936780449679e-05, "loss": 2.7042, "step": 1230 }, { "epoch": 0.7395209580838323, "grad_norm": 0.5124099254608154, "learning_rate": 3.856570741153087e-05, "loss": 2.7288, "step": 1235 }, { "epoch": 0.7425149700598802, "grad_norm": 0.5201666355133057, "learning_rate": 3.774444687360082e-05, "loss": 2.7266, "step": 1240 }, { "epoch": 0.7455089820359282, "grad_norm": 0.5414671301841736, "learning_rate": 3.692998613248977e-05, "loss": 2.7237, "step": 1245 }, { "epoch": 0.7485029940119761, "grad_norm": 0.5279539823532104, "learning_rate": 3.6122414146809014e-05, "loss": 2.7114, "step": 1250 }, { "epoch": 0.7514970059880239, "grad_norm": 0.5688903331756592, "learning_rate": 3.532181912275301e-05, "loss": 2.7434, "step": 1255 }, { "epoch": 0.7544910179640718, "grad_norm": 0.5434580445289612, "learning_rate": 3.4528288504464844e-05, "loss": 2.7369, "step": 1260 }, { "epoch": 0.7574850299401198, "grad_norm": 0.5412757992744446, "learning_rate": 3.3741908964485414e-05, "loss": 2.7672, "step": 1265 }, { "epoch": 0.7604790419161677, "grad_norm": 0.5109795331954956, "learning_rate": 3.296276639428665e-05, "loss": 2.6753, "step": 1270 }, { "epoch": 0.7634730538922155, "grad_norm": 0.5179950594902039, "learning_rate": 3.21909458948901e-05, "loss": 2.7301, "step": 1275 }, { "epoch": 0.7664670658682635, "grad_norm": 0.5099706649780273, "learning_rate": 3.1426531767572e-05, "loss": 2.7018, "step": 1280 }, { "epoch": 0.7694610778443114, "grad_norm": 0.5369354486465454, "learning_rate": 3.0669607504655326e-05, "loss": 2.7473, "step": 1285 }, { "epoch": 0.7724550898203593, "grad_norm": 0.45971599221229553, "learning_rate": 2.9920255780390617e-05, "loss": 2.7035, "step": 1290 }, { "epoch": 0.7754491017964071, "grad_norm": 0.49725160002708435, "learning_rate": 2.917855844192584e-05, "loss": 2.7258, "step": 1295 }, { "epoch": 0.7784431137724551, "grad_norm": 0.48002126812934875, "learning_rate": 2.8444596500366825e-05, "loss": 2.6843, "step": 1300 }, { "epoch": 0.781437125748503, "grad_norm": 0.47459498047828674, "learning_rate": 2.7718450121928918e-05, "loss": 2.7133, "step": 1305 }, { "epoch": 0.7844311377245509, "grad_norm": 0.5074918270111084, "learning_rate": 2.7000198619180794e-05, "loss": 2.7191, "step": 1310 }, { "epoch": 0.7874251497005988, "grad_norm": 0.48583555221557617, "learning_rate": 2.6289920442381722e-05, "loss": 2.7248, "step": 1315 }, { "epoch": 0.7904191616766467, "grad_norm": 0.5015809535980225, "learning_rate": 2.5587693170912875e-05, "loss": 2.7373, "step": 1320 }, { "epoch": 0.7934131736526946, "grad_norm": 0.49757909774780273, "learning_rate": 2.4893593504803826e-05, "loss": 2.7156, "step": 1325 }, { "epoch": 0.7964071856287425, "grad_norm": 0.48436880111694336, "learning_rate": 2.4207697256355145e-05, "loss": 2.7378, "step": 1330 }, { "epoch": 0.7994011976047904, "grad_norm": 0.5281265377998352, "learning_rate": 2.353007934185768e-05, "loss": 2.7332, "step": 1335 }, { "epoch": 0.8023952095808383, "grad_norm": 0.5300716161727905, "learning_rate": 2.2860813773410106e-05, "loss": 2.663, "step": 1340 }, { "epoch": 0.8053892215568862, "grad_norm": 0.429849237203598, "learning_rate": 2.2199973650834906e-05, "loss": 2.6993, "step": 1345 }, { "epoch": 0.8083832335329342, "grad_norm": 0.5136377811431885, "learning_rate": 2.154763115369419e-05, "loss": 2.7297, "step": 1350 }, { "epoch": 0.811377245508982, "grad_norm": 0.523758053779602, "learning_rate": 2.0903857533405958e-05, "loss": 2.7227, "step": 1355 }, { "epoch": 0.8143712574850299, "grad_norm": 0.5091609954833984, "learning_rate": 2.026872310546165e-05, "loss": 2.7174, "step": 1360 }, { "epoch": 0.8173652694610778, "grad_norm": 0.5340681672096252, "learning_rate": 1.9642297241746142e-05, "loss": 2.706, "step": 1365 }, { "epoch": 0.8203592814371258, "grad_norm": 0.4736381471157074, "learning_rate": 1.902464836296054e-05, "loss": 2.7244, "step": 1370 }, { "epoch": 0.8233532934131736, "grad_norm": 0.5413190126419067, "learning_rate": 1.841584393114919e-05, "loss": 2.682, "step": 1375 }, { "epoch": 0.8263473053892215, "grad_norm": 0.45962581038475037, "learning_rate": 1.7815950442330963e-05, "loss": 2.6946, "step": 1380 }, { "epoch": 0.8293413173652695, "grad_norm": 0.5146113634109497, "learning_rate": 1.7225033419236503e-05, "loss": 2.6919, "step": 1385 }, { "epoch": 0.8323353293413174, "grad_norm": 0.48548585176467896, "learning_rate": 1.6643157404151467e-05, "loss": 2.7278, "step": 1390 }, { "epoch": 0.8353293413173652, "grad_norm": 0.46142202615737915, "learning_rate": 1.6070385951866952e-05, "loss": 2.7241, "step": 1395 }, { "epoch": 0.8383233532934131, "grad_norm": 0.5457370281219482, "learning_rate": 1.5506781622737942e-05, "loss": 2.7047, "step": 1400 }, { "epoch": 0.8413173652694611, "grad_norm": 0.48171794414520264, "learning_rate": 1.4952405975850026e-05, "loss": 2.6901, "step": 1405 }, { "epoch": 0.844311377245509, "grad_norm": 0.5125330090522766, "learning_rate": 1.4407319562295762e-05, "loss": 2.7052, "step": 1410 }, { "epoch": 0.8473053892215568, "grad_norm": 0.4971305727958679, "learning_rate": 1.387158191856105e-05, "loss": 2.7405, "step": 1415 }, { "epoch": 0.8502994011976048, "grad_norm": 0.539444625377655, "learning_rate": 1.3345251560022288e-05, "loss": 2.6869, "step": 1420 }, { "epoch": 0.8532934131736527, "grad_norm": 0.5079681873321533, "learning_rate": 1.2828385974555202e-05, "loss": 2.7132, "step": 1425 }, { "epoch": 0.8562874251497006, "grad_norm": 0.5335781574249268, "learning_rate": 1.2321041616255614e-05, "loss": 2.7378, "step": 1430 }, { "epoch": 0.8592814371257484, "grad_norm": 0.4930953085422516, "learning_rate": 1.1823273899273435e-05, "loss": 2.678, "step": 1435 }, { "epoch": 0.8622754491017964, "grad_norm": 0.4556470513343811, "learning_rate": 1.1335137191760093e-05, "loss": 2.6918, "step": 1440 }, { "epoch": 0.8652694610778443, "grad_norm": 0.49691587686538696, "learning_rate": 1.0856684809930151e-05, "loss": 2.6948, "step": 1445 }, { "epoch": 0.8682634730538922, "grad_norm": 0.48159509897232056, "learning_rate": 1.0387969012238064e-05, "loss": 2.694, "step": 1450 }, { "epoch": 0.8712574850299402, "grad_norm": 0.4907771050930023, "learning_rate": 9.929040993670114e-06, "loss": 2.724, "step": 1455 }, { "epoch": 0.874251497005988, "grad_norm": 0.4428238570690155, "learning_rate": 9.47995088015281e-06, "loss": 2.6853, "step": 1460 }, { "epoch": 0.8772455089820359, "grad_norm": 0.4720156192779541, "learning_rate": 9.040747723077902e-06, "loss": 2.7465, "step": 1465 }, { "epoch": 0.8802395209580839, "grad_norm": 0.4877772629261017, "learning_rate": 8.61147949394483e-06, "loss": 2.7284, "step": 1470 }, { "epoch": 0.8832335329341318, "grad_norm": 0.48317137360572815, "learning_rate": 8.192193079121002e-06, "loss": 2.7261, "step": 1475 }, { "epoch": 0.8862275449101796, "grad_norm": 0.48045286536216736, "learning_rate": 7.782934274720777e-06, "loss": 2.6818, "step": 1480 }, { "epoch": 0.8892215568862275, "grad_norm": 0.47768479585647583, "learning_rate": 7.3837477816033896e-06, "loss": 2.692, "step": 1485 }, { "epoch": 0.8922155688622755, "grad_norm": 0.5294510722160339, "learning_rate": 6.994677200490507e-06, "loss": 2.6831, "step": 1490 }, { "epoch": 0.8952095808383234, "grad_norm": 0.48493650555610657, "learning_rate": 6.615765027204102e-06, "loss": 2.6695, "step": 1495 }, { "epoch": 0.8982035928143712, "grad_norm": 0.5145286917686462, "learning_rate": 6.247052648024765e-06, "loss": 2.6966, "step": 1500 }, { "epoch": 0.9011976047904192, "grad_norm": 0.4654744565486908, "learning_rate": 5.888580335171368e-06, "loss": 2.7247, "step": 1505 }, { "epoch": 0.9041916167664671, "grad_norm": 0.4812209904193878, "learning_rate": 5.540387242402434e-06, "loss": 2.7261, "step": 1510 }, { "epoch": 0.907185628742515, "grad_norm": 0.4804396331310272, "learning_rate": 5.20251140073953e-06, "loss": 2.692, "step": 1515 }, { "epoch": 0.9101796407185628, "grad_norm": 0.4617965519428253, "learning_rate": 4.874989714313449e-06, "loss": 2.7015, "step": 1520 }, { "epoch": 0.9131736526946108, "grad_norm": 0.4632161259651184, "learning_rate": 4.5578579563333e-06, "loss": 2.6971, "step": 1525 }, { "epoch": 0.9161676646706587, "grad_norm": 0.530784010887146, "learning_rate": 4.251150765179291e-06, "loss": 2.7031, "step": 1530 }, { "epoch": 0.9191616766467066, "grad_norm": 0.49502164125442505, "learning_rate": 3.954901640619368e-06, "loss": 2.6704, "step": 1535 }, { "epoch": 0.9221556886227545, "grad_norm": 0.4807766079902649, "learning_rate": 3.6691429401502053e-06, "loss": 2.6889, "step": 1540 }, { "epoch": 0.9251497005988024, "grad_norm": 0.49187591671943665, "learning_rate": 3.3939058754630882e-06, "loss": 2.7056, "step": 1545 }, { "epoch": 0.9281437125748503, "grad_norm": 0.5204874277114868, "learning_rate": 3.1292205090347248e-06, "loss": 2.7579, "step": 1550 }, { "epoch": 0.9311377245508982, "grad_norm": 0.5218049883842468, "learning_rate": 2.875115750843771e-06, "loss": 2.722, "step": 1555 }, { "epoch": 0.9341317365269461, "grad_norm": 0.4557659327983856, "learning_rate": 2.6316193552131884e-06, "loss": 2.7076, "step": 1560 }, { "epoch": 0.937125748502994, "grad_norm": 0.4934738874435425, "learning_rate": 2.398757917778727e-06, "loss": 2.7231, "step": 1565 }, { "epoch": 0.9401197604790419, "grad_norm": 0.48129838705062866, "learning_rate": 2.176556872584168e-06, "loss": 2.7231, "step": 1570 }, { "epoch": 0.9431137724550899, "grad_norm": 0.4986419975757599, "learning_rate": 1.965040489303194e-06, "loss": 2.7142, "step": 1575 }, { "epoch": 0.9461077844311377, "grad_norm": 0.48590248823165894, "learning_rate": 1.7642318705886286e-06, "loss": 2.7517, "step": 1580 }, { "epoch": 0.9491017964071856, "grad_norm": 0.4665396809577942, "learning_rate": 1.574152949549057e-06, "loss": 2.7028, "step": 1585 }, { "epoch": 0.9520958083832335, "grad_norm": 0.4755544364452362, "learning_rate": 1.3948244873532078e-06, "loss": 2.6893, "step": 1590 }, { "epoch": 0.9550898203592815, "grad_norm": 0.44289329648017883, "learning_rate": 1.226266070962323e-06, "loss": 2.7409, "step": 1595 }, { "epoch": 0.9580838323353293, "grad_norm": 0.5039628744125366, "learning_rate": 1.0684961109908353e-06, "loss": 2.6975, "step": 1600 }, { "epoch": 0.9610778443113772, "grad_norm": 0.47264382243156433, "learning_rate": 9.21531839695411e-07, "loss": 2.6629, "step": 1605 }, { "epoch": 0.9640718562874252, "grad_norm": 0.4993440806865692, "learning_rate": 7.853893090928654e-07, "loss": 2.7854, "step": 1610 }, { "epoch": 0.9670658682634731, "grad_norm": 0.4866315722465515, "learning_rate": 6.600833892068336e-07, "loss": 2.7192, "step": 1615 }, { "epoch": 0.9700598802395209, "grad_norm": 0.42745715379714966, "learning_rate": 5.456277664436127e-07, "loss": 2.7148, "step": 1620 }, { "epoch": 0.9730538922155688, "grad_norm": 0.49950963258743286, "learning_rate": 4.4203494209733576e-07, "loss": 2.743, "step": 1625 }, { "epoch": 0.9760479041916168, "grad_norm": 0.45195409655570984, "learning_rate": 3.4931623098445334e-07, "loss": 2.7275, "step": 1630 }, { "epoch": 0.9790419161676647, "grad_norm": 0.47945547103881836, "learning_rate": 2.674817602079327e-07, "loss": 2.725, "step": 1635 }, { "epoch": 0.9820359281437125, "grad_norm": 0.48270925879478455, "learning_rate": 1.965404680511207e-07, "loss": 2.6793, "step": 1640 }, { "epoch": 0.9850299401197605, "grad_norm": 0.4643472731113434, "learning_rate": 1.3650010300150228e-07, "loss": 2.7078, "step": 1645 }, { "epoch": 0.9880239520958084, "grad_norm": 0.4867922365665436, "learning_rate": 8.736722290429988e-08, "loss": 2.7148, "step": 1650 }, { "epoch": 0.9910179640718563, "grad_norm": 0.5043019652366638, "learning_rate": 4.9147194246290664e-08, "loss": 2.7255, "step": 1655 }, { "epoch": 0.9940119760479041, "grad_norm": 0.4517767131328583, "learning_rate": 2.1844191569597716e-08, "loss": 2.6949, "step": 1660 }, { "epoch": 0.9970059880239521, "grad_norm": 0.4739466905593872, "learning_rate": 5.461197015765862e-09, "loss": 2.7152, "step": 1665 }, { "epoch": 1.0, "grad_norm": 0.46331822872161865, "learning_rate": 0.0, "loss": 2.7246, "step": 1670 }, { "epoch": 1.0, "step": 1670, "total_flos": 1.396344619008e+16, "train_loss": 2.9739012581145694, "train_runtime": 9187.7213, "train_samples_per_second": 11.629, "train_steps_per_second": 0.182 } ], "logging_steps": 5, "max_steps": 1670, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.396344619008e+16, "train_batch_size": 16, "trial_name": null, "trial_params": null }