{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.982608695652174, "eval_steps": 87, "global_step": 690, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.002898550724637681, "grad_norm": 0.44052618741989136, "learning_rate": 1.0000000000000002e-06, "loss": 1.4473, "step": 1 }, { "epoch": 0.002898550724637681, "eval_loss": 1.4117156267166138, "eval_runtime": 46.1446, "eval_samples_per_second": 5.548, "eval_steps_per_second": 0.693, "step": 1 }, { "epoch": 0.005797101449275362, "grad_norm": 0.4932183027267456, "learning_rate": 2.0000000000000003e-06, "loss": 1.3923, "step": 2 }, { "epoch": 0.008695652173913044, "grad_norm": 0.4844379723072052, "learning_rate": 3e-06, "loss": 1.4468, "step": 3 }, { "epoch": 0.011594202898550725, "grad_norm": 0.5023930668830872, "learning_rate": 4.000000000000001e-06, "loss": 1.3773, "step": 4 }, { "epoch": 0.014492753623188406, "grad_norm": 0.483876496553421, "learning_rate": 5e-06, "loss": 1.4103, "step": 5 }, { "epoch": 0.017391304347826087, "grad_norm": 0.4460753798484802, "learning_rate": 6e-06, "loss": 1.4707, "step": 6 }, { "epoch": 0.020289855072463767, "grad_norm": 0.4342319369316101, "learning_rate": 7e-06, "loss": 1.3563, "step": 7 }, { "epoch": 0.02318840579710145, "grad_norm": 0.479257196187973, "learning_rate": 8.000000000000001e-06, "loss": 1.414, "step": 8 }, { "epoch": 0.02608695652173913, "grad_norm": 0.5028970241546631, "learning_rate": 9e-06, "loss": 1.4601, "step": 9 }, { "epoch": 0.028985507246376812, "grad_norm": 0.49131453037261963, "learning_rate": 1e-05, "loss": 1.4364, "step": 10 }, { "epoch": 0.03188405797101449, "grad_norm": 0.5517832040786743, "learning_rate": 9.999946639344475e-06, "loss": 1.4873, "step": 11 }, { "epoch": 0.034782608695652174, "grad_norm": 0.5310211181640625, "learning_rate": 9.99978655851684e-06, "loss": 1.4346, "step": 12 }, { "epoch": 0.03768115942028986, "grad_norm": 0.4639141857624054, "learning_rate": 9.999519760933905e-06, "loss": 1.4402, "step": 13 }, { "epoch": 0.04057971014492753, "grad_norm": 0.47811073064804077, "learning_rate": 9.999146252290264e-06, "loss": 1.4106, "step": 14 }, { "epoch": 0.043478260869565216, "grad_norm": 0.5223386883735657, "learning_rate": 9.998666040558187e-06, "loss": 1.3732, "step": 15 }, { "epoch": 0.0463768115942029, "grad_norm": 0.5601791143417358, "learning_rate": 9.998079135987437e-06, "loss": 1.4166, "step": 16 }, { "epoch": 0.04927536231884058, "grad_norm": 0.5459745526313782, "learning_rate": 9.997385551105061e-06, "loss": 1.4501, "step": 17 }, { "epoch": 0.05217391304347826, "grad_norm": 0.6155043244361877, "learning_rate": 9.996585300715117e-06, "loss": 1.3987, "step": 18 }, { "epoch": 0.05507246376811594, "grad_norm": 0.539135754108429, "learning_rate": 9.995678401898354e-06, "loss": 1.3943, "step": 19 }, { "epoch": 0.057971014492753624, "grad_norm": 0.5232663154602051, "learning_rate": 9.994664874011864e-06, "loss": 1.3742, "step": 20 }, { "epoch": 0.06086956521739131, "grad_norm": 0.4995758533477783, "learning_rate": 9.993544738688647e-06, "loss": 1.3969, "step": 21 }, { "epoch": 0.06376811594202898, "grad_norm": 0.5397970080375671, "learning_rate": 9.992318019837171e-06, "loss": 1.3238, "step": 22 }, { "epoch": 0.06666666666666667, "grad_norm": 0.5533668994903564, "learning_rate": 9.990984743640839e-06, "loss": 1.3717, "step": 23 }, { "epoch": 0.06956521739130435, "grad_norm": 0.5304050445556641, "learning_rate": 9.989544938557453e-06, "loss": 1.3565, "step": 24 }, { "epoch": 0.07246376811594203, "grad_norm": 0.5658550262451172, "learning_rate": 9.987998635318586e-06, "loss": 1.3075, "step": 25 }, { "epoch": 0.07536231884057971, "grad_norm": 0.5798805952072144, "learning_rate": 9.98634586692894e-06, "loss": 1.4202, "step": 26 }, { "epoch": 0.0782608695652174, "grad_norm": 0.49352607131004333, "learning_rate": 9.984586668665641e-06, "loss": 1.3172, "step": 27 }, { "epoch": 0.08115942028985507, "grad_norm": 0.576454222202301, "learning_rate": 9.982721078077474e-06, "loss": 1.3633, "step": 28 }, { "epoch": 0.08405797101449275, "grad_norm": 0.5843266248703003, "learning_rate": 9.980749134984094e-06, "loss": 1.3031, "step": 29 }, { "epoch": 0.08695652173913043, "grad_norm": 0.5863199234008789, "learning_rate": 9.978670881475173e-06, "loss": 1.3228, "step": 30 }, { "epoch": 0.08985507246376812, "grad_norm": 0.6071418523788452, "learning_rate": 9.9764863619095e-06, "loss": 1.3277, "step": 31 }, { "epoch": 0.0927536231884058, "grad_norm": 0.5361754298210144, "learning_rate": 9.97419562291403e-06, "loss": 1.3189, "step": 32 }, { "epoch": 0.09565217391304348, "grad_norm": 0.6043053865432739, "learning_rate": 9.971798713382896e-06, "loss": 1.2567, "step": 33 }, { "epoch": 0.09855072463768116, "grad_norm": 0.4795907139778137, "learning_rate": 9.96929568447637e-06, "loss": 1.33, "step": 34 }, { "epoch": 0.10144927536231885, "grad_norm": 0.5752019882202148, "learning_rate": 9.96668658961975e-06, "loss": 1.1915, "step": 35 }, { "epoch": 0.10434782608695652, "grad_norm": 0.47888195514678955, "learning_rate": 9.963971484502247e-06, "loss": 1.2753, "step": 36 }, { "epoch": 0.1072463768115942, "grad_norm": 0.5371452569961548, "learning_rate": 9.96115042707577e-06, "loss": 1.2659, "step": 37 }, { "epoch": 0.11014492753623188, "grad_norm": 0.6198606491088867, "learning_rate": 9.958223477553715e-06, "loss": 1.2166, "step": 38 }, { "epoch": 0.11304347826086956, "grad_norm": 0.4718591272830963, "learning_rate": 9.955190698409656e-06, "loss": 1.2708, "step": 39 }, { "epoch": 0.11594202898550725, "grad_norm": 0.5691114068031311, "learning_rate": 9.952052154376027e-06, "loss": 1.2074, "step": 40 }, { "epoch": 0.11884057971014493, "grad_norm": 0.515771210193634, "learning_rate": 9.948807912442735e-06, "loss": 1.1958, "step": 41 }, { "epoch": 0.12173913043478261, "grad_norm": 0.6830301880836487, "learning_rate": 9.945458041855732e-06, "loss": 1.2992, "step": 42 }, { "epoch": 0.1246376811594203, "grad_norm": 0.5583641529083252, "learning_rate": 9.94200261411553e-06, "loss": 1.2654, "step": 43 }, { "epoch": 0.12753623188405797, "grad_norm": 0.5985351800918579, "learning_rate": 9.938441702975689e-06, "loss": 1.2064, "step": 44 }, { "epoch": 0.13043478260869565, "grad_norm": 0.5092725157737732, "learning_rate": 9.93477538444123e-06, "loss": 1.1477, "step": 45 }, { "epoch": 0.13333333333333333, "grad_norm": 0.5719948410987854, "learning_rate": 9.931003736767013e-06, "loss": 1.3045, "step": 46 }, { "epoch": 0.13623188405797101, "grad_norm": 0.5000984072685242, "learning_rate": 9.92712684045608e-06, "loss": 1.2954, "step": 47 }, { "epoch": 0.1391304347826087, "grad_norm": 0.6268609762191772, "learning_rate": 9.923144778257918e-06, "loss": 1.2742, "step": 48 }, { "epoch": 0.14202898550724638, "grad_norm": 0.5395749807357788, "learning_rate": 9.91905763516671e-06, "loss": 1.1651, "step": 49 }, { "epoch": 0.14492753623188406, "grad_norm": 0.6797102689743042, "learning_rate": 9.91486549841951e-06, "loss": 1.2083, "step": 50 }, { "epoch": 0.14782608695652175, "grad_norm": 0.554821252822876, "learning_rate": 9.91056845749438e-06, "loss": 1.1623, "step": 51 }, { "epoch": 0.15072463768115943, "grad_norm": 0.6033896803855896, "learning_rate": 9.906166604108494e-06, "loss": 1.2135, "step": 52 }, { "epoch": 0.1536231884057971, "grad_norm": 0.568701446056366, "learning_rate": 9.901660032216159e-06, "loss": 1.1956, "step": 53 }, { "epoch": 0.1565217391304348, "grad_norm": 0.6862343549728394, "learning_rate": 9.89704883800683e-06, "loss": 1.1992, "step": 54 }, { "epoch": 0.15942028985507245, "grad_norm": 0.49399352073669434, "learning_rate": 9.892333119903045e-06, "loss": 1.1711, "step": 55 }, { "epoch": 0.16231884057971013, "grad_norm": 0.5683416724205017, "learning_rate": 9.887512978558329e-06, "loss": 1.2608, "step": 56 }, { "epoch": 0.16521739130434782, "grad_norm": 0.4855175018310547, "learning_rate": 9.88258851685504e-06, "loss": 1.1652, "step": 57 }, { "epoch": 0.1681159420289855, "grad_norm": 0.5765471458435059, "learning_rate": 9.877559839902185e-06, "loss": 1.2653, "step": 58 }, { "epoch": 0.17101449275362318, "grad_norm": 0.5921582579612732, "learning_rate": 9.872427055033156e-06, "loss": 1.1191, "step": 59 }, { "epoch": 0.17391304347826086, "grad_norm": 0.5046260356903076, "learning_rate": 9.867190271803466e-06, "loss": 1.1824, "step": 60 }, { "epoch": 0.17681159420289855, "grad_norm": 0.5180432796478271, "learning_rate": 9.861849601988384e-06, "loss": 1.1736, "step": 61 }, { "epoch": 0.17971014492753623, "grad_norm": 0.65400230884552, "learning_rate": 9.85640515958057e-06, "loss": 1.1129, "step": 62 }, { "epoch": 0.1826086956521739, "grad_norm": 0.5726003646850586, "learning_rate": 9.85085706078763e-06, "loss": 1.1567, "step": 63 }, { "epoch": 0.1855072463768116, "grad_norm": 0.5297178030014038, "learning_rate": 9.845205424029639e-06, "loss": 1.101, "step": 64 }, { "epoch": 0.18840579710144928, "grad_norm": 0.5242377519607544, "learning_rate": 9.839450369936615e-06, "loss": 1.174, "step": 65 }, { "epoch": 0.19130434782608696, "grad_norm": 0.5277882218360901, "learning_rate": 9.833592021345938e-06, "loss": 1.1772, "step": 66 }, { "epoch": 0.19420289855072465, "grad_norm": 0.5334244966506958, "learning_rate": 9.827630503299741e-06, "loss": 1.1722, "step": 67 }, { "epoch": 0.19710144927536233, "grad_norm": 0.6054286360740662, "learning_rate": 9.821565943042225e-06, "loss": 1.2022, "step": 68 }, { "epoch": 0.2, "grad_norm": 0.5691675543785095, "learning_rate": 9.815398470016957e-06, "loss": 1.1256, "step": 69 }, { "epoch": 0.2028985507246377, "grad_norm": 0.4579974114894867, "learning_rate": 9.809128215864096e-06, "loss": 1.1548, "step": 70 }, { "epoch": 0.20579710144927535, "grad_norm": 0.605627715587616, "learning_rate": 9.802755314417592e-06, "loss": 1.0972, "step": 71 }, { "epoch": 0.20869565217391303, "grad_norm": 0.5655208826065063, "learning_rate": 9.796279901702326e-06, "loss": 1.0902, "step": 72 }, { "epoch": 0.21159420289855072, "grad_norm": 0.570743978023529, "learning_rate": 9.789702115931202e-06, "loss": 1.0654, "step": 73 }, { "epoch": 0.2144927536231884, "grad_norm": 0.7513704895973206, "learning_rate": 9.783022097502204e-06, "loss": 1.1348, "step": 74 }, { "epoch": 0.21739130434782608, "grad_norm": 0.592363715171814, "learning_rate": 9.776239988995401e-06, "loss": 1.1733, "step": 75 }, { "epoch": 0.22028985507246376, "grad_norm": 0.5394357442855835, "learning_rate": 9.76935593516989e-06, "loss": 1.1313, "step": 76 }, { "epoch": 0.22318840579710145, "grad_norm": 0.598983108997345, "learning_rate": 9.762370082960727e-06, "loss": 1.1077, "step": 77 }, { "epoch": 0.22608695652173913, "grad_norm": 0.5635719895362854, "learning_rate": 9.755282581475769e-06, "loss": 1.0393, "step": 78 }, { "epoch": 0.2289855072463768, "grad_norm": 0.5638449788093567, "learning_rate": 9.748093581992506e-06, "loss": 1.1126, "step": 79 }, { "epoch": 0.2318840579710145, "grad_norm": 0.5267054438591003, "learning_rate": 9.74080323795483e-06, "loss": 1.108, "step": 80 }, { "epoch": 0.23478260869565218, "grad_norm": 0.69565749168396, "learning_rate": 9.733411704969754e-06, "loss": 1.1065, "step": 81 }, { "epoch": 0.23768115942028986, "grad_norm": 0.5769387483596802, "learning_rate": 9.7259191408041e-06, "loss": 1.0892, "step": 82 }, { "epoch": 0.24057971014492754, "grad_norm": 0.4646681845188141, "learning_rate": 9.718325705381115e-06, "loss": 1.0984, "step": 83 }, { "epoch": 0.24347826086956523, "grad_norm": 0.5441101789474487, "learning_rate": 9.710631560777082e-06, "loss": 1.134, "step": 84 }, { "epoch": 0.2463768115942029, "grad_norm": 0.6711792349815369, "learning_rate": 9.702836871217838e-06, "loss": 1.118, "step": 85 }, { "epoch": 0.2492753623188406, "grad_norm": 0.6086435914039612, "learning_rate": 9.694941803075285e-06, "loss": 1.1332, "step": 86 }, { "epoch": 0.25217391304347825, "grad_norm": 0.6047069430351257, "learning_rate": 9.686946524863821e-06, "loss": 1.0948, "step": 87 }, { "epoch": 0.25217391304347825, "eval_loss": 1.093648910522461, "eval_runtime": 46.2827, "eval_samples_per_second": 5.531, "eval_steps_per_second": 0.691, "step": 87 }, { "epoch": 0.25507246376811593, "grad_norm": 0.5494099259376526, "learning_rate": 9.678851207236764e-06, "loss": 1.0677, "step": 88 }, { "epoch": 0.2579710144927536, "grad_norm": 0.6029177308082581, "learning_rate": 9.670656022982696e-06, "loss": 1.1122, "step": 89 }, { "epoch": 0.2608695652173913, "grad_norm": 0.6882422566413879, "learning_rate": 9.66236114702178e-06, "loss": 1.131, "step": 90 }, { "epoch": 0.263768115942029, "grad_norm": 0.5858222246170044, "learning_rate": 9.65396675640202e-06, "loss": 1.0904, "step": 91 }, { "epoch": 0.26666666666666666, "grad_norm": 0.6096974611282349, "learning_rate": 9.645473030295496e-06, "loss": 1.1001, "step": 92 }, { "epoch": 0.26956521739130435, "grad_norm": 0.5705183148384094, "learning_rate": 9.636880149994518e-06, "loss": 1.1159, "step": 93 }, { "epoch": 0.27246376811594203, "grad_norm": 0.5896604061126709, "learning_rate": 9.628188298907782e-06, "loss": 1.0236, "step": 94 }, { "epoch": 0.2753623188405797, "grad_norm": 0.6060263514518738, "learning_rate": 9.619397662556434e-06, "loss": 1.0991, "step": 95 }, { "epoch": 0.2782608695652174, "grad_norm": 0.6302357316017151, "learning_rate": 9.610508428570122e-06, "loss": 1.073, "step": 96 }, { "epoch": 0.2811594202898551, "grad_norm": 0.6086059212684631, "learning_rate": 9.601520786682989e-06, "loss": 1.1556, "step": 97 }, { "epoch": 0.28405797101449276, "grad_norm": 0.5601389408111572, "learning_rate": 9.592434928729617e-06, "loss": 1.0691, "step": 98 }, { "epoch": 0.28695652173913044, "grad_norm": 0.6236623525619507, "learning_rate": 9.583251048640941e-06, "loss": 1.0293, "step": 99 }, { "epoch": 0.2898550724637681, "grad_norm": 0.661264181137085, "learning_rate": 9.573969342440107e-06, "loss": 1.0597, "step": 100 }, { "epoch": 0.2927536231884058, "grad_norm": 0.5187559127807617, "learning_rate": 9.564590008238284e-06, "loss": 1.0152, "step": 101 }, { "epoch": 0.2956521739130435, "grad_norm": 0.7033849358558655, "learning_rate": 9.555113246230443e-06, "loss": 1.0583, "step": 102 }, { "epoch": 0.2985507246376812, "grad_norm": 0.6243430376052856, "learning_rate": 9.545539258691076e-06, "loss": 1.0415, "step": 103 }, { "epoch": 0.30144927536231886, "grad_norm": 0.7448285222053528, "learning_rate": 9.535868249969882e-06, "loss": 1.1665, "step": 104 }, { "epoch": 0.30434782608695654, "grad_norm": 0.7407688498497009, "learning_rate": 9.52610042648741e-06, "loss": 1.0805, "step": 105 }, { "epoch": 0.3072463768115942, "grad_norm": 0.6399569511413574, "learning_rate": 9.516235996730645e-06, "loss": 1.0622, "step": 106 }, { "epoch": 0.3101449275362319, "grad_norm": 0.6391183733940125, "learning_rate": 9.50627517124856e-06, "loss": 1.0988, "step": 107 }, { "epoch": 0.3130434782608696, "grad_norm": 0.6799684166908264, "learning_rate": 9.496218162647629e-06, "loss": 1.0667, "step": 108 }, { "epoch": 0.3159420289855073, "grad_norm": 0.6955932378768921, "learning_rate": 9.486065185587278e-06, "loss": 1.0475, "step": 109 }, { "epoch": 0.3188405797101449, "grad_norm": 0.6768685579299927, "learning_rate": 9.475816456775313e-06, "loss": 1.0906, "step": 110 }, { "epoch": 0.3217391304347826, "grad_norm": 0.6448860168457031, "learning_rate": 9.465472194963287e-06, "loss": 1.0725, "step": 111 }, { "epoch": 0.32463768115942027, "grad_norm": 0.654137909412384, "learning_rate": 9.45503262094184e-06, "loss": 1.0477, "step": 112 }, { "epoch": 0.32753623188405795, "grad_norm": 0.5668336749076843, "learning_rate": 9.444497957535975e-06, "loss": 1.0419, "step": 113 }, { "epoch": 0.33043478260869563, "grad_norm": 0.8345162868499756, "learning_rate": 9.43386842960031e-06, "loss": 1.1125, "step": 114 }, { "epoch": 0.3333333333333333, "grad_norm": 0.5995410084724426, "learning_rate": 9.423144264014278e-06, "loss": 1.048, "step": 115 }, { "epoch": 0.336231884057971, "grad_norm": 0.6526032090187073, "learning_rate": 9.41232568967728e-06, "loss": 1.0868, "step": 116 }, { "epoch": 0.3391304347826087, "grad_norm": 0.7131723165512085, "learning_rate": 9.401412937503802e-06, "loss": 1.0154, "step": 117 }, { "epoch": 0.34202898550724636, "grad_norm": 0.7425084114074707, "learning_rate": 9.39040624041849e-06, "loss": 1.1046, "step": 118 }, { "epoch": 0.34492753623188405, "grad_norm": 0.6741538643836975, "learning_rate": 9.379305833351174e-06, "loss": 1.0884, "step": 119 }, { "epoch": 0.34782608695652173, "grad_norm": 0.6611533164978027, "learning_rate": 9.368111953231849e-06, "loss": 1.1291, "step": 120 }, { "epoch": 0.3507246376811594, "grad_norm": 0.6605979204177856, "learning_rate": 9.35682483898563e-06, "loss": 1.0354, "step": 121 }, { "epoch": 0.3536231884057971, "grad_norm": 0.7649601101875305, "learning_rate": 9.345444731527642e-06, "loss": 1.0705, "step": 122 }, { "epoch": 0.3565217391304348, "grad_norm": 0.6104558110237122, "learning_rate": 9.333971873757885e-06, "loss": 1.0221, "step": 123 }, { "epoch": 0.35942028985507246, "grad_norm": 0.5945985913276672, "learning_rate": 9.32240651055604e-06, "loss": 1.0352, "step": 124 }, { "epoch": 0.36231884057971014, "grad_norm": 0.7351408004760742, "learning_rate": 9.310748888776254e-06, "loss": 1.0283, "step": 125 }, { "epoch": 0.3652173913043478, "grad_norm": 0.6751654148101807, "learning_rate": 9.298999257241862e-06, "loss": 1.1355, "step": 126 }, { "epoch": 0.3681159420289855, "grad_norm": 0.6744984984397888, "learning_rate": 9.287157866740082e-06, "loss": 1.097, "step": 127 }, { "epoch": 0.3710144927536232, "grad_norm": 0.6096031665802002, "learning_rate": 9.275224970016656e-06, "loss": 0.9879, "step": 128 }, { "epoch": 0.3739130434782609, "grad_norm": 0.6282311081886292, "learning_rate": 9.263200821770462e-06, "loss": 1.0088, "step": 129 }, { "epoch": 0.37681159420289856, "grad_norm": 0.6340439319610596, "learning_rate": 9.251085678648072e-06, "loss": 1.0314, "step": 130 }, { "epoch": 0.37971014492753624, "grad_norm": 0.6008773446083069, "learning_rate": 9.238879799238278e-06, "loss": 1.0304, "step": 131 }, { "epoch": 0.3826086956521739, "grad_norm": 0.83261638879776, "learning_rate": 9.22658344406657e-06, "loss": 1.0767, "step": 132 }, { "epoch": 0.3855072463768116, "grad_norm": 0.6942703127861023, "learning_rate": 9.214196875589577e-06, "loss": 1.0238, "step": 133 }, { "epoch": 0.3884057971014493, "grad_norm": 0.6649532914161682, "learning_rate": 9.201720358189464e-06, "loss": 1.0353, "step": 134 }, { "epoch": 0.391304347826087, "grad_norm": 0.6827482581138611, "learning_rate": 9.189154158168293e-06, "loss": 1.0123, "step": 135 }, { "epoch": 0.39420289855072466, "grad_norm": 0.8225923776626587, "learning_rate": 9.176498543742328e-06, "loss": 1.0894, "step": 136 }, { "epoch": 0.39710144927536234, "grad_norm": 0.7622413635253906, "learning_rate": 9.163753785036324e-06, "loss": 1.0987, "step": 137 }, { "epoch": 0.4, "grad_norm": 0.729880690574646, "learning_rate": 9.150920154077753e-06, "loss": 1.0686, "step": 138 }, { "epoch": 0.4028985507246377, "grad_norm": 0.5569338798522949, "learning_rate": 9.137997924791e-06, "loss": 1.0554, "step": 139 }, { "epoch": 0.4057971014492754, "grad_norm": 0.7127766013145447, "learning_rate": 9.124987372991512e-06, "loss": 1.0878, "step": 140 }, { "epoch": 0.40869565217391307, "grad_norm": 0.6865119338035583, "learning_rate": 9.11188877637992e-06, "loss": 1.078, "step": 141 }, { "epoch": 0.4115942028985507, "grad_norm": 0.7496594786643982, "learning_rate": 9.098702414536107e-06, "loss": 1.1678, "step": 142 }, { "epoch": 0.4144927536231884, "grad_norm": 0.7547608017921448, "learning_rate": 9.085428568913233e-06, "loss": 1.0282, "step": 143 }, { "epoch": 0.41739130434782606, "grad_norm": 0.6696781516075134, "learning_rate": 9.072067522831743e-06, "loss": 1.0529, "step": 144 }, { "epoch": 0.42028985507246375, "grad_norm": 0.6223747134208679, "learning_rate": 9.058619561473308e-06, "loss": 1.0101, "step": 145 }, { "epoch": 0.42318840579710143, "grad_norm": 0.6682969331741333, "learning_rate": 9.045084971874738e-06, "loss": 1.0669, "step": 146 }, { "epoch": 0.4260869565217391, "grad_norm": 0.702489972114563, "learning_rate": 9.031464042921866e-06, "loss": 1.0696, "step": 147 }, { "epoch": 0.4289855072463768, "grad_norm": 0.6877920031547546, "learning_rate": 9.017757065343368e-06, "loss": 1.0181, "step": 148 }, { "epoch": 0.4318840579710145, "grad_norm": 0.7262343168258667, "learning_rate": 9.003964331704574e-06, "loss": 1.0869, "step": 149 }, { "epoch": 0.43478260869565216, "grad_norm": 0.6435033082962036, "learning_rate": 8.990086136401199e-06, "loss": 1.0943, "step": 150 }, { "epoch": 0.43768115942028984, "grad_norm": 0.8294116854667664, "learning_rate": 8.976122775653087e-06, "loss": 1.0053, "step": 151 }, { "epoch": 0.4405797101449275, "grad_norm": 0.7582129240036011, "learning_rate": 8.96207454749787e-06, "loss": 1.0255, "step": 152 }, { "epoch": 0.4434782608695652, "grad_norm": 0.7421862483024597, "learning_rate": 8.947941751784614e-06, "loss": 0.995, "step": 153 }, { "epoch": 0.4463768115942029, "grad_norm": 0.6562067866325378, "learning_rate": 8.933724690167417e-06, "loss": 1.0051, "step": 154 }, { "epoch": 0.4492753623188406, "grad_norm": 0.7008780241012573, "learning_rate": 8.91942366609897e-06, "loss": 1.0224, "step": 155 }, { "epoch": 0.45217391304347826, "grad_norm": 0.8320948481559753, "learning_rate": 8.905038984824079e-06, "loss": 1.0867, "step": 156 }, { "epoch": 0.45507246376811594, "grad_norm": 0.7078688740730286, "learning_rate": 8.890570953373152e-06, "loss": 1.0233, "step": 157 }, { "epoch": 0.4579710144927536, "grad_norm": 0.602080225944519, "learning_rate": 8.87601988055565e-06, "loss": 1.033, "step": 158 }, { "epoch": 0.4608695652173913, "grad_norm": 0.6947946548461914, "learning_rate": 8.861386076953485e-06, "loss": 1.0056, "step": 159 }, { "epoch": 0.463768115942029, "grad_norm": 0.7520703673362732, "learning_rate": 8.846669854914395e-06, "loss": 1.0129, "step": 160 }, { "epoch": 0.4666666666666667, "grad_norm": 0.8198053240776062, "learning_rate": 8.831871528545286e-06, "loss": 1.0554, "step": 161 }, { "epoch": 0.46956521739130436, "grad_norm": 0.8595309257507324, "learning_rate": 8.816991413705515e-06, "loss": 0.9769, "step": 162 }, { "epoch": 0.47246376811594204, "grad_norm": 0.7658084034919739, "learning_rate": 8.802029828000157e-06, "loss": 1.0942, "step": 163 }, { "epoch": 0.4753623188405797, "grad_norm": 0.779561460018158, "learning_rate": 8.786987090773214e-06, "loss": 1.0526, "step": 164 }, { "epoch": 0.4782608695652174, "grad_norm": 0.7491458654403687, "learning_rate": 8.771863523100821e-06, "loss": 1.076, "step": 165 }, { "epoch": 0.4811594202898551, "grad_norm": 0.7698597311973572, "learning_rate": 8.756659447784367e-06, "loss": 1.0513, "step": 166 }, { "epoch": 0.48405797101449277, "grad_norm": 0.7076740860939026, "learning_rate": 8.741375189343625e-06, "loss": 0.952, "step": 167 }, { "epoch": 0.48695652173913045, "grad_norm": 0.8549159169197083, "learning_rate": 8.726011074009813e-06, "loss": 1.0062, "step": 168 }, { "epoch": 0.48985507246376814, "grad_norm": 0.7257103323936462, "learning_rate": 8.71056742971864e-06, "loss": 1.0124, "step": 169 }, { "epoch": 0.4927536231884058, "grad_norm": 0.6643837094306946, "learning_rate": 8.695044586103297e-06, "loss": 1.0646, "step": 170 }, { "epoch": 0.4956521739130435, "grad_norm": 0.6454336643218994, "learning_rate": 8.679442874487427e-06, "loss": 1.0482, "step": 171 }, { "epoch": 0.4985507246376812, "grad_norm": 0.6484606266021729, "learning_rate": 8.663762627878059e-06, "loss": 1.0361, "step": 172 }, { "epoch": 0.5014492753623189, "grad_norm": 0.8437646627426147, "learning_rate": 8.64800418095848e-06, "loss": 1.1064, "step": 173 }, { "epoch": 0.5043478260869565, "grad_norm": 0.8865697979927063, "learning_rate": 8.632167870081122e-06, "loss": 1.0187, "step": 174 }, { "epoch": 0.5043478260869565, "eval_loss": 1.0253716707229614, "eval_runtime": 46.4716, "eval_samples_per_second": 5.509, "eval_steps_per_second": 0.689, "step": 174 }, { "epoch": 0.5072463768115942, "grad_norm": 0.6522702574729919, "learning_rate": 8.616254033260351e-06, "loss": 1.0466, "step": 175 }, { "epoch": 0.5101449275362319, "grad_norm": 0.7485548257827759, "learning_rate": 8.600263010165275e-06, "loss": 1.051, "step": 176 }, { "epoch": 0.5130434782608696, "grad_norm": 0.7864269614219666, "learning_rate": 8.584195142112482e-06, "loss": 0.9823, "step": 177 }, { "epoch": 0.5159420289855072, "grad_norm": 0.669228732585907, "learning_rate": 8.568050772058763e-06, "loss": 0.9959, "step": 178 }, { "epoch": 0.518840579710145, "grad_norm": 0.7351509928703308, "learning_rate": 8.551830244593785e-06, "loss": 1.0523, "step": 179 }, { "epoch": 0.5217391304347826, "grad_norm": 0.6464654207229614, "learning_rate": 8.535533905932739e-06, "loss": 1.0576, "step": 180 }, { "epoch": 0.5246376811594203, "grad_norm": 0.6708983182907104, "learning_rate": 8.519162103908951e-06, "loss": 1.0036, "step": 181 }, { "epoch": 0.527536231884058, "grad_norm": 0.6712408661842346, "learning_rate": 8.502715187966455e-06, "loss": 0.9567, "step": 182 }, { "epoch": 0.5304347826086957, "grad_norm": 0.8165604472160339, "learning_rate": 8.48619350915254e-06, "loss": 1.0074, "step": 183 }, { "epoch": 0.5333333333333333, "grad_norm": 0.8015124797821045, "learning_rate": 8.469597420110249e-06, "loss": 1.04, "step": 184 }, { "epoch": 0.5362318840579711, "grad_norm": 0.6764898896217346, "learning_rate": 8.452927275070858e-06, "loss": 1.0259, "step": 185 }, { "epoch": 0.5391304347826087, "grad_norm": 0.7508796453475952, "learning_rate": 8.436183429846314e-06, "loss": 1.0153, "step": 186 }, { "epoch": 0.5420289855072464, "grad_norm": 0.7400704026222229, "learning_rate": 8.41936624182164e-06, "loss": 1.0302, "step": 187 }, { "epoch": 0.5449275362318841, "grad_norm": 0.7747941017150879, "learning_rate": 8.402476069947309e-06, "loss": 1.0516, "step": 188 }, { "epoch": 0.5478260869565217, "grad_norm": 0.6391712427139282, "learning_rate": 8.385513274731574e-06, "loss": 0.9144, "step": 189 }, { "epoch": 0.5507246376811594, "grad_norm": 0.7723587155342102, "learning_rate": 8.368478218232787e-06, "loss": 1.038, "step": 190 }, { "epoch": 0.553623188405797, "grad_norm": 0.6703996062278748, "learning_rate": 8.351371264051659e-06, "loss": 0.9767, "step": 191 }, { "epoch": 0.5565217391304348, "grad_norm": 0.6496030688285828, "learning_rate": 8.334192777323508e-06, "loss": 1.0139, "step": 192 }, { "epoch": 0.5594202898550724, "grad_norm": 0.9179766178131104, "learning_rate": 8.316943124710457e-06, "loss": 1.0217, "step": 193 }, { "epoch": 0.5623188405797102, "grad_norm": 0.739105761051178, "learning_rate": 8.299622674393615e-06, "loss": 1.0097, "step": 194 }, { "epoch": 0.5652173913043478, "grad_norm": 0.6799715757369995, "learning_rate": 8.282231796065215e-06, "loss": 0.9814, "step": 195 }, { "epoch": 0.5681159420289855, "grad_norm": 0.7482266426086426, "learning_rate": 8.264770860920722e-06, "loss": 0.9651, "step": 196 }, { "epoch": 0.5710144927536231, "grad_norm": 0.7226840853691101, "learning_rate": 8.247240241650918e-06, "loss": 1.0257, "step": 197 }, { "epoch": 0.5739130434782609, "grad_norm": 0.8682334423065186, "learning_rate": 8.229640312433938e-06, "loss": 0.9359, "step": 198 }, { "epoch": 0.5768115942028985, "grad_norm": 0.7574880123138428, "learning_rate": 8.21197144892728e-06, "loss": 1.0316, "step": 199 }, { "epoch": 0.5797101449275363, "grad_norm": 0.6719037890434265, "learning_rate": 8.194234028259806e-06, "loss": 0.9718, "step": 200 }, { "epoch": 0.5826086956521739, "grad_norm": 0.7872765064239502, "learning_rate": 8.176428429023674e-06, "loss": 1.0055, "step": 201 }, { "epoch": 0.5855072463768116, "grad_norm": 0.8982404470443726, "learning_rate": 8.158555031266255e-06, "loss": 1.0763, "step": 202 }, { "epoch": 0.5884057971014492, "grad_norm": 0.7265183925628662, "learning_rate": 8.140614216482046e-06, "loss": 0.9921, "step": 203 }, { "epoch": 0.591304347826087, "grad_norm": 0.7971622943878174, "learning_rate": 8.122606367604497e-06, "loss": 0.9986, "step": 204 }, { "epoch": 0.5942028985507246, "grad_norm": 0.689160943031311, "learning_rate": 8.104531868997858e-06, "loss": 0.9896, "step": 205 }, { "epoch": 0.5971014492753624, "grad_norm": 0.8191243410110474, "learning_rate": 8.086391106448965e-06, "loss": 1.0141, "step": 206 }, { "epoch": 0.6, "grad_norm": 0.860882043838501, "learning_rate": 8.068184467159014e-06, "loss": 0.9608, "step": 207 }, { "epoch": 0.6028985507246377, "grad_norm": 0.7216934561729431, "learning_rate": 8.049912339735284e-06, "loss": 0.9898, "step": 208 }, { "epoch": 0.6057971014492753, "grad_norm": 0.685965359210968, "learning_rate": 8.031575114182856e-06, "loss": 0.9532, "step": 209 }, { "epoch": 0.6086956521739131, "grad_norm": 0.6752814054489136, "learning_rate": 8.013173181896283e-06, "loss": 1.0043, "step": 210 }, { "epoch": 0.6115942028985507, "grad_norm": 0.815260112285614, "learning_rate": 7.994706935651228e-06, "loss": 1.0049, "step": 211 }, { "epoch": 0.6144927536231884, "grad_norm": 0.729771077632904, "learning_rate": 7.976176769596095e-06, "loss": 1.0003, "step": 212 }, { "epoch": 0.6173913043478261, "grad_norm": 0.6407178044319153, "learning_rate": 7.957583079243607e-06, "loss": 1.0197, "step": 213 }, { "epoch": 0.6202898550724638, "grad_norm": 0.6758530735969543, "learning_rate": 7.938926261462366e-06, "loss": 1.0632, "step": 214 }, { "epoch": 0.6231884057971014, "grad_norm": 0.7678017616271973, "learning_rate": 7.920206714468383e-06, "loss": 1.004, "step": 215 }, { "epoch": 0.6260869565217392, "grad_norm": 0.6864491105079651, "learning_rate": 7.90142483781658e-06, "loss": 0.9798, "step": 216 }, { "epoch": 0.6289855072463768, "grad_norm": 0.7141516804695129, "learning_rate": 7.882581032392252e-06, "loss": 0.9969, "step": 217 }, { "epoch": 0.6318840579710145, "grad_norm": 0.7497020363807678, "learning_rate": 7.863675700402527e-06, "loss": 0.9951, "step": 218 }, { "epoch": 0.6347826086956522, "grad_norm": 0.7010701894760132, "learning_rate": 7.844709245367766e-06, "loss": 1.0164, "step": 219 }, { "epoch": 0.6376811594202898, "grad_norm": 0.8556409478187561, "learning_rate": 7.82568207211296e-06, "loss": 1.0079, "step": 220 }, { "epoch": 0.6405797101449275, "grad_norm": 0.8755605816841125, "learning_rate": 7.806594586759083e-06, "loss": 1.0401, "step": 221 }, { "epoch": 0.6434782608695652, "grad_norm": 0.7478286623954773, "learning_rate": 7.787447196714428e-06, "loss": 0.9966, "step": 222 }, { "epoch": 0.6463768115942029, "grad_norm": 0.6972207427024841, "learning_rate": 7.768240310665909e-06, "loss": 1.0277, "step": 223 }, { "epoch": 0.6492753623188405, "grad_norm": 0.7753648161888123, "learning_rate": 7.748974338570337e-06, "loss": 1.0531, "step": 224 }, { "epoch": 0.6521739130434783, "grad_norm": 0.8420187830924988, "learning_rate": 7.729649691645673e-06, "loss": 1.0101, "step": 225 }, { "epoch": 0.6550724637681159, "grad_norm": 0.7467186450958252, "learning_rate": 7.710266782362248e-06, "loss": 1.086, "step": 226 }, { "epoch": 0.6579710144927536, "grad_norm": 0.679282009601593, "learning_rate": 7.69082602443396e-06, "loss": 1.0756, "step": 227 }, { "epoch": 0.6608695652173913, "grad_norm": 0.8682421445846558, "learning_rate": 7.671327832809442e-06, "loss": 1.0337, "step": 228 }, { "epoch": 0.663768115942029, "grad_norm": 0.9190111756324768, "learning_rate": 7.651772623663212e-06, "loss": 1.0412, "step": 229 }, { "epoch": 0.6666666666666666, "grad_norm": 0.7419721484184265, "learning_rate": 7.63216081438678e-06, "loss": 0.9895, "step": 230 }, { "epoch": 0.6695652173913044, "grad_norm": 0.7735477685928345, "learning_rate": 7.612492823579744e-06, "loss": 1.0109, "step": 231 }, { "epoch": 0.672463768115942, "grad_norm": 0.6718391180038452, "learning_rate": 7.5927690710408606e-06, "loss": 1.0699, "step": 232 }, { "epoch": 0.6753623188405797, "grad_norm": 0.8104904890060425, "learning_rate": 7.572989977759073e-06, "loss": 0.9957, "step": 233 }, { "epoch": 0.6782608695652174, "grad_norm": 0.8718286752700806, "learning_rate": 7.553155965904535e-06, "loss": 0.9674, "step": 234 }, { "epoch": 0.6811594202898551, "grad_norm": 0.727627158164978, "learning_rate": 7.533267458819597e-06, "loss": 1.0256, "step": 235 }, { "epoch": 0.6840579710144927, "grad_norm": 0.6747854948043823, "learning_rate": 7.513324881009769e-06, "loss": 0.9956, "step": 236 }, { "epoch": 0.6869565217391305, "grad_norm": 0.8896199464797974, "learning_rate": 7.49332865813466e-06, "loss": 1.052, "step": 237 }, { "epoch": 0.6898550724637681, "grad_norm": 0.8011343479156494, "learning_rate": 7.473279216998896e-06, "loss": 0.9809, "step": 238 }, { "epoch": 0.6927536231884058, "grad_norm": 0.7936311960220337, "learning_rate": 7.453176985543002e-06, "loss": 0.9491, "step": 239 }, { "epoch": 0.6956521739130435, "grad_norm": 0.783686101436615, "learning_rate": 7.4330223928342814e-06, "loss": 1.0627, "step": 240 }, { "epoch": 0.6985507246376812, "grad_norm": 0.6777355670928955, "learning_rate": 7.412815869057644e-06, "loss": 0.9836, "step": 241 }, { "epoch": 0.7014492753623188, "grad_norm": 0.8609856367111206, "learning_rate": 7.392557845506433e-06, "loss": 1.0383, "step": 242 }, { "epoch": 0.7043478260869566, "grad_norm": 0.7346140146255493, "learning_rate": 7.372248754573213e-06, "loss": 1.0237, "step": 243 }, { "epoch": 0.7072463768115942, "grad_norm": 0.8134037852287292, "learning_rate": 7.351889029740548e-06, "loss": 1.0051, "step": 244 }, { "epoch": 0.7101449275362319, "grad_norm": 0.7623313069343567, "learning_rate": 7.33147910557174e-06, "loss": 0.966, "step": 245 }, { "epoch": 0.7130434782608696, "grad_norm": 0.8289423584938049, "learning_rate": 7.311019417701567e-06, "loss": 1.0162, "step": 246 }, { "epoch": 0.7159420289855073, "grad_norm": 0.6778679490089417, "learning_rate": 7.290510402826967e-06, "loss": 1.042, "step": 247 }, { "epoch": 0.7188405797101449, "grad_norm": 0.7705609798431396, "learning_rate": 7.269952498697734e-06, "loss": 0.9979, "step": 248 }, { "epoch": 0.7217391304347827, "grad_norm": 0.8417146801948547, "learning_rate": 7.249346144107165e-06, "loss": 0.9937, "step": 249 }, { "epoch": 0.7246376811594203, "grad_norm": 0.6634312868118286, "learning_rate": 7.2286917788826926e-06, "loss": 1.0299, "step": 250 }, { "epoch": 0.7275362318840579, "grad_norm": 0.7162610292434692, "learning_rate": 7.207989843876505e-06, "loss": 0.9627, "step": 251 }, { "epoch": 0.7304347826086957, "grad_norm": 0.886674165725708, "learning_rate": 7.187240780956133e-06, "loss": 0.9804, "step": 252 }, { "epoch": 0.7333333333333333, "grad_norm": 0.8589048385620117, "learning_rate": 7.166445032995013e-06, "loss": 0.9972, "step": 253 }, { "epoch": 0.736231884057971, "grad_norm": 0.792225182056427, "learning_rate": 7.145603043863045e-06, "loss": 1.0047, "step": 254 }, { "epoch": 0.7391304347826086, "grad_norm": 0.7787736654281616, "learning_rate": 7.124715258417111e-06, "loss": 0.974, "step": 255 }, { "epoch": 0.7420289855072464, "grad_norm": 0.7716973423957825, "learning_rate": 7.103782122491577e-06, "loss": 0.9476, "step": 256 }, { "epoch": 0.744927536231884, "grad_norm": 0.8235695958137512, "learning_rate": 7.082804082888787e-06, "loss": 1.0303, "step": 257 }, { "epoch": 0.7478260869565218, "grad_norm": 0.8061054944992065, "learning_rate": 7.061781587369518e-06, "loss": 1.0254, "step": 258 }, { "epoch": 0.7507246376811594, "grad_norm": 0.8522235751152039, "learning_rate": 7.040715084643429e-06, "loss": 1.0196, "step": 259 }, { "epoch": 0.7536231884057971, "grad_norm": 0.8005476593971252, "learning_rate": 7.019605024359475e-06, "loss": 1.052, "step": 260 }, { "epoch": 0.7565217391304347, "grad_norm": 0.9044481515884399, "learning_rate": 6.998451857096321e-06, "loss": 1.04, "step": 261 }, { "epoch": 0.7565217391304347, "eval_loss": 0.9999631643295288, "eval_runtime": 46.2792, "eval_samples_per_second": 5.532, "eval_steps_per_second": 0.691, "step": 261 }, { "epoch": 0.7594202898550725, "grad_norm": 0.6946824193000793, "learning_rate": 6.977256034352713e-06, "loss": 0.9869, "step": 262 }, { "epoch": 0.7623188405797101, "grad_norm": 0.8048357963562012, "learning_rate": 6.956018008537852e-06, "loss": 0.9773, "step": 263 }, { "epoch": 0.7652173913043478, "grad_norm": 0.7211609482765198, "learning_rate": 6.934738232961728e-06, "loss": 0.9727, "step": 264 }, { "epoch": 0.7681159420289855, "grad_norm": 0.7225235104560852, "learning_rate": 6.913417161825449e-06, "loss": 1.0209, "step": 265 }, { "epoch": 0.7710144927536232, "grad_norm": 0.6443622708320618, "learning_rate": 6.892055250211552e-06, "loss": 1.0398, "step": 266 }, { "epoch": 0.7739130434782608, "grad_norm": 0.8570783138275146, "learning_rate": 6.8706529540742775e-06, "loss": 0.9883, "step": 267 }, { "epoch": 0.7768115942028986, "grad_norm": 0.9808831810951233, "learning_rate": 6.849210730229846e-06, "loss": 1.0847, "step": 268 }, { "epoch": 0.7797101449275362, "grad_norm": 0.8551820516586304, "learning_rate": 6.827729036346706e-06, "loss": 0.9621, "step": 269 }, { "epoch": 0.782608695652174, "grad_norm": 0.8964309692382812, "learning_rate": 6.806208330935766e-06, "loss": 0.9886, "step": 270 }, { "epoch": 0.7855072463768116, "grad_norm": 0.8737574219703674, "learning_rate": 6.784649073340601e-06, "loss": 1.0019, "step": 271 }, { "epoch": 0.7884057971014493, "grad_norm": 0.7480164170265198, "learning_rate": 6.763051723727663e-06, "loss": 0.9987, "step": 272 }, { "epoch": 0.7913043478260869, "grad_norm": 0.7155961990356445, "learning_rate": 6.741416743076443e-06, "loss": 1.0043, "step": 273 }, { "epoch": 0.7942028985507247, "grad_norm": 0.8288201093673706, "learning_rate": 6.719744593169642e-06, "loss": 0.9703, "step": 274 }, { "epoch": 0.7971014492753623, "grad_norm": 0.7403139472007751, "learning_rate": 6.698035736583307e-06, "loss": 0.9453, "step": 275 }, { "epoch": 0.8, "grad_norm": 0.7977936863899231, "learning_rate": 6.67629063667697e-06, "loss": 1.0091, "step": 276 }, { "epoch": 0.8028985507246377, "grad_norm": 0.8381959795951843, "learning_rate": 6.6545097575837405e-06, "loss": 1.0001, "step": 277 }, { "epoch": 0.8057971014492754, "grad_norm": 0.7988629937171936, "learning_rate": 6.6326935642004165e-06, "loss": 1.0053, "step": 278 }, { "epoch": 0.808695652173913, "grad_norm": 0.8848451375961304, "learning_rate": 6.610842522177549e-06, "loss": 1.021, "step": 279 }, { "epoch": 0.8115942028985508, "grad_norm": 0.8423268795013428, "learning_rate": 6.588957097909509e-06, "loss": 1.0245, "step": 280 }, { "epoch": 0.8144927536231884, "grad_norm": 0.6828733682632446, "learning_rate": 6.567037758524529e-06, "loss": 0.9966, "step": 281 }, { "epoch": 0.8173913043478261, "grad_norm": 0.8118813633918762, "learning_rate": 6.545084971874738e-06, "loss": 0.9777, "step": 282 }, { "epoch": 0.8202898550724638, "grad_norm": 0.8288912773132324, "learning_rate": 6.5230992065261685e-06, "loss": 1.0158, "step": 283 }, { "epoch": 0.8231884057971014, "grad_norm": 0.7110708951950073, "learning_rate": 6.501080931748764e-06, "loss": 0.9331, "step": 284 }, { "epoch": 0.8260869565217391, "grad_norm": 0.767749011516571, "learning_rate": 6.4790306175063535e-06, "loss": 0.8917, "step": 285 }, { "epoch": 0.8289855072463768, "grad_norm": 0.8519418835639954, "learning_rate": 6.456948734446624e-06, "loss": 1.0296, "step": 286 }, { "epoch": 0.8318840579710145, "grad_norm": 0.7988749742507935, "learning_rate": 6.43483575389108e-06, "loss": 0.9296, "step": 287 }, { "epoch": 0.8347826086956521, "grad_norm": 0.8312949538230896, "learning_rate": 6.412692147824976e-06, "loss": 1.0632, "step": 288 }, { "epoch": 0.8376811594202899, "grad_norm": 0.9024953246116638, "learning_rate": 6.390518388887246e-06, "loss": 1.0013, "step": 289 }, { "epoch": 0.8405797101449275, "grad_norm": 0.6774289011955261, "learning_rate": 6.368314950360416e-06, "loss": 0.954, "step": 290 }, { "epoch": 0.8434782608695652, "grad_norm": 0.739329993724823, "learning_rate": 6.3460823061604984e-06, "loss": 0.9453, "step": 291 }, { "epoch": 0.8463768115942029, "grad_norm": 0.7888621687889099, "learning_rate": 6.323820930826879e-06, "loss": 0.9672, "step": 292 }, { "epoch": 0.8492753623188406, "grad_norm": 0.7777626514434814, "learning_rate": 6.301531299512195e-06, "loss": 1.0118, "step": 293 }, { "epoch": 0.8521739130434782, "grad_norm": 0.8532302975654602, "learning_rate": 6.279213887972179e-06, "loss": 0.9837, "step": 294 }, { "epoch": 0.855072463768116, "grad_norm": 0.8223821520805359, "learning_rate": 6.2568691725555144e-06, "loss": 0.9786, "step": 295 }, { "epoch": 0.8579710144927536, "grad_norm": 0.7102084755897522, "learning_rate": 6.234497630193666e-06, "loss": 0.9634, "step": 296 }, { "epoch": 0.8608695652173913, "grad_norm": 0.7488099932670593, "learning_rate": 6.2120997383907015e-06, "loss": 1.0271, "step": 297 }, { "epoch": 0.863768115942029, "grad_norm": 0.755387008190155, "learning_rate": 6.189675975213094e-06, "loss": 1.0068, "step": 298 }, { "epoch": 0.8666666666666667, "grad_norm": 0.7323296666145325, "learning_rate": 6.1672268192795285e-06, "loss": 1.0177, "step": 299 }, { "epoch": 0.8695652173913043, "grad_norm": 0.7505559325218201, "learning_rate": 6.144752749750671e-06, "loss": 1.0031, "step": 300 }, { "epoch": 0.8724637681159421, "grad_norm": 0.8251679539680481, "learning_rate": 6.122254246318957e-06, "loss": 1.0281, "step": 301 }, { "epoch": 0.8753623188405797, "grad_norm": 0.7030305862426758, "learning_rate": 6.099731789198344e-06, "loss": 0.977, "step": 302 }, { "epoch": 0.8782608695652174, "grad_norm": 0.872175931930542, "learning_rate": 6.077185859114059e-06, "loss": 1.0279, "step": 303 }, { "epoch": 0.881159420289855, "grad_norm": 0.6906105279922485, "learning_rate": 6.05461693729235e-06, "loss": 0.9747, "step": 304 }, { "epoch": 0.8840579710144928, "grad_norm": 0.8041731119155884, "learning_rate": 6.0320255054501985e-06, "loss": 0.9706, "step": 305 }, { "epoch": 0.8869565217391304, "grad_norm": 0.9219099283218384, "learning_rate": 6.009412045785051e-06, "loss": 1.0192, "step": 306 }, { "epoch": 0.8898550724637682, "grad_norm": 0.5931650996208191, "learning_rate": 5.986777040964521e-06, "loss": 1.0064, "step": 307 }, { "epoch": 0.8927536231884058, "grad_norm": 0.9496859908103943, "learning_rate": 5.964120974116085e-06, "loss": 1.0138, "step": 308 }, { "epoch": 0.8956521739130435, "grad_norm": 0.719667375087738, "learning_rate": 5.941444328816775e-06, "loss": 1.0213, "step": 309 }, { "epoch": 0.8985507246376812, "grad_norm": 0.8299076557159424, "learning_rate": 5.918747589082853e-06, "loss": 0.9931, "step": 310 }, { "epoch": 0.9014492753623189, "grad_norm": 0.8233078718185425, "learning_rate": 5.896031239359485e-06, "loss": 0.9789, "step": 311 }, { "epoch": 0.9043478260869565, "grad_norm": 0.6814295649528503, "learning_rate": 5.8732957645103946e-06, "loss": 1.0711, "step": 312 }, { "epoch": 0.9072463768115943, "grad_norm": 0.786590039730072, "learning_rate": 5.85054164980752e-06, "loss": 1.0282, "step": 313 }, { "epoch": 0.9101449275362319, "grad_norm": 0.7114934921264648, "learning_rate": 5.82776938092065e-06, "loss": 1.0125, "step": 314 }, { "epoch": 0.9130434782608695, "grad_norm": 0.8856657147407532, "learning_rate": 5.804979443907065e-06, "loss": 1.0325, "step": 315 }, { "epoch": 0.9159420289855073, "grad_norm": 0.9123273491859436, "learning_rate": 5.782172325201155e-06, "loss": 1.0696, "step": 316 }, { "epoch": 0.9188405797101449, "grad_norm": 0.7296032905578613, "learning_rate": 5.7593485116040425e-06, "loss": 1.0004, "step": 317 }, { "epoch": 0.9217391304347826, "grad_norm": 0.8410807847976685, "learning_rate": 5.736508490273189e-06, "loss": 0.9547, "step": 318 }, { "epoch": 0.9246376811594202, "grad_norm": 1.0709190368652344, "learning_rate": 5.713652748711997e-06, "loss": 0.9583, "step": 319 }, { "epoch": 0.927536231884058, "grad_norm": 0.6270896196365356, "learning_rate": 5.690781774759412e-06, "loss": 1.0024, "step": 320 }, { "epoch": 0.9304347826086956, "grad_norm": 0.7849041223526001, "learning_rate": 5.667896056579495e-06, "loss": 0.9477, "step": 321 }, { "epoch": 0.9333333333333333, "grad_norm": 0.7513189315795898, "learning_rate": 5.644996082651018e-06, "loss": 0.9937, "step": 322 }, { "epoch": 0.936231884057971, "grad_norm": 0.8150386214256287, "learning_rate": 5.622082341757027e-06, "loss": 1.0589, "step": 323 }, { "epoch": 0.9391304347826087, "grad_norm": 0.8518944978713989, "learning_rate": 5.5991553229744166e-06, "loss": 1.0393, "step": 324 }, { "epoch": 0.9420289855072463, "grad_norm": 0.814802885055542, "learning_rate": 5.576215515663489e-06, "loss": 1.0186, "step": 325 }, { "epoch": 0.9449275362318841, "grad_norm": 0.9456635117530823, "learning_rate": 5.553263409457504e-06, "loss": 0.9657, "step": 326 }, { "epoch": 0.9478260869565217, "grad_norm": 0.7259712815284729, "learning_rate": 5.530299494252238e-06, "loss": 1.0066, "step": 327 }, { "epoch": 0.9507246376811594, "grad_norm": 0.7462155818939209, "learning_rate": 5.507324260195516e-06, "loss": 0.9246, "step": 328 }, { "epoch": 0.9536231884057971, "grad_norm": 0.9022188782691956, "learning_rate": 5.484338197676757e-06, "loss": 0.9624, "step": 329 }, { "epoch": 0.9565217391304348, "grad_norm": 0.8874835968017578, "learning_rate": 5.46134179731651e-06, "loss": 0.9851, "step": 330 }, { "epoch": 0.9594202898550724, "grad_norm": 0.7534209489822388, "learning_rate": 5.4383355499559734e-06, "loss": 0.9761, "step": 331 }, { "epoch": 0.9623188405797102, "grad_norm": 0.9121699929237366, "learning_rate": 5.41531994664652e-06, "loss": 0.9994, "step": 332 }, { "epoch": 0.9652173913043478, "grad_norm": 0.774753212928772, "learning_rate": 5.392295478639226e-06, "loss": 1.0218, "step": 333 }, { "epoch": 0.9681159420289855, "grad_norm": 0.7575943470001221, "learning_rate": 5.36926263737437e-06, "loss": 0.9855, "step": 334 }, { "epoch": 0.9710144927536232, "grad_norm": 0.8202754259109497, "learning_rate": 5.346221914470959e-06, "loss": 1.0112, "step": 335 }, { "epoch": 0.9739130434782609, "grad_norm": 0.8952569961547852, "learning_rate": 5.323173801716222e-06, "loss": 0.9722, "step": 336 }, { "epoch": 0.9768115942028985, "grad_norm": 0.7153046727180481, "learning_rate": 5.300118791055122e-06, "loss": 0.9847, "step": 337 }, { "epoch": 0.9797101449275363, "grad_norm": 0.7900391221046448, "learning_rate": 5.27705737457985e-06, "loss": 1.0324, "step": 338 }, { "epoch": 0.9826086956521739, "grad_norm": 0.8250629305839539, "learning_rate": 5.253990044519329e-06, "loss": 0.9764, "step": 339 }, { "epoch": 0.9855072463768116, "grad_norm": 0.8809992671012878, "learning_rate": 5.230917293228699e-06, "loss": 1.0198, "step": 340 }, { "epoch": 0.9884057971014493, "grad_norm": 0.7209755778312683, "learning_rate": 5.207839613178814e-06, "loss": 1.0253, "step": 341 }, { "epoch": 0.991304347826087, "grad_norm": 0.8488002419471741, "learning_rate": 5.184757496945726e-06, "loss": 0.9333, "step": 342 }, { "epoch": 0.9942028985507246, "grad_norm": 0.8114776611328125, "learning_rate": 5.161671437200179e-06, "loss": 1.0026, "step": 343 }, { "epoch": 0.9971014492753624, "grad_norm": 0.8550688028335571, "learning_rate": 5.138581926697083e-06, "loss": 1.0057, "step": 344 }, { "epoch": 1.0, "grad_norm": 0.9187963008880615, "learning_rate": 5.115489458265006e-06, "loss": 1.0037, "step": 345 }, { "epoch": 1.0028985507246377, "grad_norm": 0.8499656915664673, "learning_rate": 5.09239452479565e-06, "loss": 0.9793, "step": 346 }, { "epoch": 1.0057971014492753, "grad_norm": 0.9663048982620239, "learning_rate": 5.0692976192333295e-06, "loss": 0.9337, "step": 347 }, { "epoch": 1.008695652173913, "grad_norm": 0.8095614910125732, "learning_rate": 5.046199234564455e-06, "loss": 0.9461, "step": 348 }, { "epoch": 1.008695652173913, "eval_loss": 0.9858289361000061, "eval_runtime": 46.4396, "eval_samples_per_second": 5.513, "eval_steps_per_second": 0.689, "step": 348 }, { "epoch": 1.0115942028985507, "grad_norm": 0.839413046836853, "learning_rate": 5.0230998638070024e-06, "loss": 0.9702, "step": 349 }, { "epoch": 1.0144927536231885, "grad_norm": 0.8220239877700806, "learning_rate": 5e-06, "loss": 0.9403, "step": 350 }, { "epoch": 1.017391304347826, "grad_norm": 0.8942255973815918, "learning_rate": 4.976900136192998e-06, "loss": 0.9763, "step": 351 }, { "epoch": 1.0028985507246377, "grad_norm": 0.785389244556427, "learning_rate": 4.953800765435547e-06, "loss": 1.0033, "step": 352 }, { "epoch": 1.0057971014492753, "grad_norm": 0.9310470223426819, "learning_rate": 4.930702380766671e-06, "loss": 0.9569, "step": 353 }, { "epoch": 1.008695652173913, "grad_norm": 0.9420292377471924, "learning_rate": 4.907605475204352e-06, "loss": 1.0085, "step": 354 }, { "epoch": 1.0115942028985507, "grad_norm": 0.8762017488479614, "learning_rate": 4.8845105417349955e-06, "loss": 1.0225, "step": 355 }, { "epoch": 1.0144927536231885, "grad_norm": 0.8962522149085999, "learning_rate": 4.861418073302919e-06, "loss": 0.9543, "step": 356 }, { "epoch": 1.017391304347826, "grad_norm": 0.8070088028907776, "learning_rate": 4.838328562799824e-06, "loss": 0.9334, "step": 357 }, { "epoch": 1.0202898550724637, "grad_norm": 0.8407843708992004, "learning_rate": 4.815242503054277e-06, "loss": 0.9499, "step": 358 }, { "epoch": 1.0231884057971015, "grad_norm": 0.8197099566459656, "learning_rate": 4.79216038682119e-06, "loss": 1.0039, "step": 359 }, { "epoch": 1.0260869565217392, "grad_norm": 0.7919727563858032, "learning_rate": 4.7690827067713035e-06, "loss": 0.9731, "step": 360 }, { "epoch": 1.0289855072463767, "grad_norm": 0.7514965534210205, "learning_rate": 4.746009955480672e-06, "loss": 0.9124, "step": 361 }, { "epoch": 1.0318840579710145, "grad_norm": 0.7958142757415771, "learning_rate": 4.7229426254201504e-06, "loss": 0.9836, "step": 362 }, { "epoch": 1.0347826086956522, "grad_norm": 0.9223296642303467, "learning_rate": 4.69988120894488e-06, "loss": 1.0372, "step": 363 }, { "epoch": 1.03768115942029, "grad_norm": 0.7448701858520508, "learning_rate": 4.676826198283779e-06, "loss": 0.9189, "step": 364 }, { "epoch": 1.0405797101449274, "grad_norm": 0.731107771396637, "learning_rate": 4.653778085529043e-06, "loss": 0.9632, "step": 365 }, { "epoch": 1.0434782608695652, "grad_norm": 0.8460220694541931, "learning_rate": 4.630737362625631e-06, "loss": 0.9794, "step": 366 }, { "epoch": 1.046376811594203, "grad_norm": 0.8166036605834961, "learning_rate": 4.6077045213607765e-06, "loss": 0.9976, "step": 367 }, { "epoch": 1.0492753623188407, "grad_norm": 0.6962491869926453, "learning_rate": 4.584680053353481e-06, "loss": 0.9374, "step": 368 }, { "epoch": 1.0521739130434782, "grad_norm": 0.8353239893913269, "learning_rate": 4.561664450044029e-06, "loss": 0.991, "step": 369 }, { "epoch": 1.055072463768116, "grad_norm": 0.8190463781356812, "learning_rate": 4.53865820268349e-06, "loss": 0.9971, "step": 370 }, { "epoch": 1.0579710144927537, "grad_norm": 0.904393196105957, "learning_rate": 4.515661802323244e-06, "loss": 0.9548, "step": 371 }, { "epoch": 1.0608695652173914, "grad_norm": 0.7582879066467285, "learning_rate": 4.492675739804486e-06, "loss": 0.934, "step": 372 }, { "epoch": 1.063768115942029, "grad_norm": 0.7787836194038391, "learning_rate": 4.4697005057477634e-06, "loss": 0.973, "step": 373 }, { "epoch": 1.0666666666666667, "grad_norm": 0.7273504137992859, "learning_rate": 4.446736590542497e-06, "loss": 1.0166, "step": 374 }, { "epoch": 1.0695652173913044, "grad_norm": 0.7512848377227783, "learning_rate": 4.4237844843365126e-06, "loss": 0.9951, "step": 375 }, { "epoch": 1.0724637681159421, "grad_norm": 0.8715952038764954, "learning_rate": 4.400844677025585e-06, "loss": 1.0384, "step": 376 }, { "epoch": 1.0753623188405796, "grad_norm": 1.1643601655960083, "learning_rate": 4.377917658242975e-06, "loss": 0.9725, "step": 377 }, { "epoch": 1.0782608695652174, "grad_norm": 1.0170421600341797, "learning_rate": 4.355003917348985e-06, "loss": 0.9877, "step": 378 }, { "epoch": 1.0811594202898551, "grad_norm": 0.8441584706306458, "learning_rate": 4.332103943420507e-06, "loss": 0.9795, "step": 379 }, { "epoch": 1.0840579710144929, "grad_norm": 0.9508838057518005, "learning_rate": 4.309218225240591e-06, "loss": 1.0274, "step": 380 }, { "epoch": 1.0869565217391304, "grad_norm": 0.9078054428100586, "learning_rate": 4.286347251288004e-06, "loss": 1.0117, "step": 381 }, { "epoch": 1.0898550724637681, "grad_norm": 1.056804895401001, "learning_rate": 4.263491509726812e-06, "loss": 0.9588, "step": 382 }, { "epoch": 1.0927536231884059, "grad_norm": 0.8957586288452148, "learning_rate": 4.240651488395958e-06, "loss": 0.9644, "step": 383 }, { "epoch": 1.0956521739130434, "grad_norm": 0.9251319169998169, "learning_rate": 4.217827674798845e-06, "loss": 0.9764, "step": 384 }, { "epoch": 1.098550724637681, "grad_norm": 0.8325505256652832, "learning_rate": 4.195020556092935e-06, "loss": 0.987, "step": 385 }, { "epoch": 1.1014492753623188, "grad_norm": 0.8144704699516296, "learning_rate": 4.17223061907935e-06, "loss": 0.9898, "step": 386 }, { "epoch": 1.1043478260869566, "grad_norm": 0.8545647859573364, "learning_rate": 4.14945835019248e-06, "loss": 0.9214, "step": 387 }, { "epoch": 1.107246376811594, "grad_norm": 0.8896581530570984, "learning_rate": 4.126704235489606e-06, "loss": 0.9432, "step": 388 }, { "epoch": 1.1101449275362318, "grad_norm": 0.8762820959091187, "learning_rate": 4.103968760640516e-06, "loss": 0.9754, "step": 389 }, { "epoch": 1.1130434782608696, "grad_norm": 0.7869084477424622, "learning_rate": 4.081252410917148e-06, "loss": 0.9655, "step": 390 }, { "epoch": 1.1159420289855073, "grad_norm": 0.9484694600105286, "learning_rate": 4.058555671183227e-06, "loss": 0.9461, "step": 391 }, { "epoch": 1.1188405797101448, "grad_norm": 0.8366033434867859, "learning_rate": 4.035879025883916e-06, "loss": 0.9745, "step": 392 }, { "epoch": 1.1217391304347826, "grad_norm": 0.8974631428718567, "learning_rate": 4.013222959035481e-06, "loss": 1.003, "step": 393 }, { "epoch": 1.1246376811594203, "grad_norm": 0.9970961809158325, "learning_rate": 3.99058795421495e-06, "loss": 0.9548, "step": 394 }, { "epoch": 1.127536231884058, "grad_norm": 0.8342113494873047, "learning_rate": 3.967974494549803e-06, "loss": 0.8879, "step": 395 }, { "epoch": 1.1304347826086956, "grad_norm": 0.7740679383277893, "learning_rate": 3.945383062707652e-06, "loss": 1.0181, "step": 396 }, { "epoch": 1.1333333333333333, "grad_norm": 0.8080225586891174, "learning_rate": 3.922814140885942e-06, "loss": 0.9629, "step": 397 }, { "epoch": 1.136231884057971, "grad_norm": 0.745694637298584, "learning_rate": 3.9002682108016585e-06, "loss": 0.9725, "step": 398 }, { "epoch": 1.1391304347826088, "grad_norm": 0.93767249584198, "learning_rate": 3.8777457536810446e-06, "loss": 0.9411, "step": 399 }, { "epoch": 1.1420289855072463, "grad_norm": 0.7331735491752625, "learning_rate": 3.855247250249331e-06, "loss": 0.9187, "step": 400 }, { "epoch": 1.144927536231884, "grad_norm": 1.1504460573196411, "learning_rate": 3.832773180720475e-06, "loss": 1.0038, "step": 401 }, { "epoch": 1.1478260869565218, "grad_norm": 0.7792490124702454, "learning_rate": 3.8103240247869077e-06, "loss": 0.9583, "step": 402 }, { "epoch": 1.1507246376811595, "grad_norm": 0.8607194423675537, "learning_rate": 3.7879002616093015e-06, "loss": 0.9608, "step": 403 }, { "epoch": 1.153623188405797, "grad_norm": 0.7470278143882751, "learning_rate": 3.765502369806334e-06, "loss": 1.0097, "step": 404 }, { "epoch": 1.1565217391304348, "grad_norm": 0.8549491763114929, "learning_rate": 3.743130827444487e-06, "loss": 0.9707, "step": 405 }, { "epoch": 1.1594202898550725, "grad_norm": 0.8472537398338318, "learning_rate": 3.720786112027822e-06, "loss": 0.9746, "step": 406 }, { "epoch": 1.1623188405797102, "grad_norm": 0.7988584637641907, "learning_rate": 3.6984687004878052e-06, "loss": 0.9883, "step": 407 }, { "epoch": 1.1652173913043478, "grad_norm": 0.823165774345398, "learning_rate": 3.6761790691731207e-06, "loss": 1.013, "step": 408 }, { "epoch": 1.1681159420289855, "grad_norm": 0.7537344694137573, "learning_rate": 3.6539176938395037e-06, "loss": 1.0081, "step": 409 }, { "epoch": 1.1710144927536232, "grad_norm": 0.7858260273933411, "learning_rate": 3.6316850496395863e-06, "loss": 0.9688, "step": 410 }, { "epoch": 1.1739130434782608, "grad_norm": 0.8715892434120178, "learning_rate": 3.609481611112755e-06, "loss": 1.0181, "step": 411 }, { "epoch": 1.1768115942028985, "grad_norm": 0.816693127155304, "learning_rate": 3.587307852175025e-06, "loss": 0.9505, "step": 412 }, { "epoch": 1.1797101449275362, "grad_norm": 0.9773905277252197, "learning_rate": 3.5651642461089207e-06, "loss": 0.9745, "step": 413 }, { "epoch": 1.182608695652174, "grad_norm": 0.7822540998458862, "learning_rate": 3.5430512655533774e-06, "loss": 0.9977, "step": 414 }, { "epoch": 1.1855072463768117, "grad_norm": 0.9197254180908203, "learning_rate": 3.5209693824936486e-06, "loss": 0.9955, "step": 415 }, { "epoch": 1.1884057971014492, "grad_norm": 0.8545462489128113, "learning_rate": 3.498919068251237e-06, "loss": 1.0544, "step": 416 }, { "epoch": 1.191304347826087, "grad_norm": 0.8395746350288391, "learning_rate": 3.476900793473832e-06, "loss": 0.9757, "step": 417 }, { "epoch": 1.1942028985507247, "grad_norm": 0.8740842938423157, "learning_rate": 3.4549150281252635e-06, "loss": 0.9468, "step": 418 }, { "epoch": 1.1971014492753622, "grad_norm": 0.7521042823791504, "learning_rate": 3.4329622414754728e-06, "loss": 0.9432, "step": 419 }, { "epoch": 1.2, "grad_norm": 0.713711142539978, "learning_rate": 3.4110429020904924e-06, "loss": 0.9838, "step": 420 }, { "epoch": 1.2028985507246377, "grad_norm": 0.8481893539428711, "learning_rate": 3.3891574778224524e-06, "loss": 0.9489, "step": 421 }, { "epoch": 1.2057971014492754, "grad_norm": 0.863029420375824, "learning_rate": 3.3673064357995844e-06, "loss": 1.0462, "step": 422 }, { "epoch": 1.208695652173913, "grad_norm": 0.8649914860725403, "learning_rate": 3.3454902424162603e-06, "loss": 1.0085, "step": 423 }, { "epoch": 1.2115942028985507, "grad_norm": 0.8374588489532471, "learning_rate": 3.3237093633230323e-06, "loss": 1.0425, "step": 424 }, { "epoch": 1.2144927536231884, "grad_norm": 0.9396947026252747, "learning_rate": 3.301964263416693e-06, "loss": 1.0303, "step": 425 }, { "epoch": 1.2173913043478262, "grad_norm": 0.8101410865783691, "learning_rate": 3.2802554068303595e-06, "loss": 0.9747, "step": 426 }, { "epoch": 1.2202898550724637, "grad_norm": 0.9860018491744995, "learning_rate": 3.2585832569235576e-06, "loss": 0.9533, "step": 427 }, { "epoch": 1.2231884057971014, "grad_norm": 0.950383186340332, "learning_rate": 3.236948276272337e-06, "loss": 0.9562, "step": 428 }, { "epoch": 1.2260869565217392, "grad_norm": 0.8197913765907288, "learning_rate": 3.2153509266593984e-06, "loss": 0.9588, "step": 429 }, { "epoch": 1.228985507246377, "grad_norm": 0.8033617734909058, "learning_rate": 3.1937916690642356e-06, "loss": 1.0014, "step": 430 }, { "epoch": 1.2318840579710144, "grad_norm": 0.8451259732246399, "learning_rate": 3.1722709636532944e-06, "loss": 0.9428, "step": 431 }, { "epoch": 1.2347826086956522, "grad_norm": 0.7560276985168457, "learning_rate": 3.150789269770155e-06, "loss": 1.002, "step": 432 }, { "epoch": 1.23768115942029, "grad_norm": 0.918804943561554, "learning_rate": 3.1293470459257237e-06, "loss": 0.9653, "step": 433 }, { "epoch": 1.2405797101449276, "grad_norm": 0.8339065313339233, "learning_rate": 3.107944749788449e-06, "loss": 0.9407, "step": 434 }, { "epoch": 1.2434782608695651, "grad_norm": 0.7564199566841125, "learning_rate": 3.0865828381745515e-06, "loss": 1.012, "step": 435 }, { "epoch": 1.2434782608695651, "eval_loss": 0.9773865938186646, "eval_runtime": 46.2701, "eval_samples_per_second": 5.533, "eval_steps_per_second": 0.692, "step": 435 }, { "epoch": 1.2463768115942029, "grad_norm": 0.7768362164497375, "learning_rate": 3.0652617670382745e-06, "loss": 0.9642, "step": 436 }, { "epoch": 1.2492753623188406, "grad_norm": 0.8295703530311584, "learning_rate": 3.04398199146215e-06, "loss": 1.0002, "step": 437 }, { "epoch": 1.2521739130434781, "grad_norm": 0.8403414487838745, "learning_rate": 3.0227439656472878e-06, "loss": 0.9772, "step": 438 }, { "epoch": 1.2550724637681159, "grad_norm": 0.8178934454917908, "learning_rate": 3.0015481429036807e-06, "loss": 1.0126, "step": 439 }, { "epoch": 1.2579710144927536, "grad_norm": 0.8231812119483948, "learning_rate": 2.980394975640526e-06, "loss": 0.9118, "step": 440 }, { "epoch": 1.2608695652173914, "grad_norm": 0.8780835270881653, "learning_rate": 2.9592849153565727e-06, "loss": 0.9549, "step": 441 }, { "epoch": 1.263768115942029, "grad_norm": 1.000675916671753, "learning_rate": 2.9382184126304834e-06, "loss": 1.0483, "step": 442 }, { "epoch": 1.2666666666666666, "grad_norm": 0.8840986490249634, "learning_rate": 2.917195917111215e-06, "loss": 0.9931, "step": 443 }, { "epoch": 1.2695652173913043, "grad_norm": 0.8707259297370911, "learning_rate": 2.8962178775084267e-06, "loss": 0.8975, "step": 444 }, { "epoch": 1.272463768115942, "grad_norm": 0.7439221739768982, "learning_rate": 2.8752847415828923e-06, "loss": 0.9453, "step": 445 }, { "epoch": 1.2753623188405796, "grad_norm": 0.9899610280990601, "learning_rate": 2.8543969561369556e-06, "loss": 0.9426, "step": 446 }, { "epoch": 1.2782608695652173, "grad_norm": 0.9144057035446167, "learning_rate": 2.8335549670049866e-06, "loss": 0.9453, "step": 447 }, { "epoch": 1.281159420289855, "grad_norm": 0.9034680128097534, "learning_rate": 2.812759219043869e-06, "loss": 0.9258, "step": 448 }, { "epoch": 1.2840579710144928, "grad_norm": 0.9689735174179077, "learning_rate": 2.7920101561234954e-06, "loss": 0.993, "step": 449 }, { "epoch": 1.2869565217391306, "grad_norm": 0.6610868573188782, "learning_rate": 2.771308221117309e-06, "loss": 0.9506, "step": 450 }, { "epoch": 1.289855072463768, "grad_norm": 0.829849362373352, "learning_rate": 2.750653855892836e-06, "loss": 0.9609, "step": 451 }, { "epoch": 1.2927536231884058, "grad_norm": 0.7730438709259033, "learning_rate": 2.7300475013022666e-06, "loss": 0.9859, "step": 452 }, { "epoch": 1.2956521739130435, "grad_norm": 0.925363302230835, "learning_rate": 2.7094895971730326e-06, "loss": 1.0286, "step": 453 }, { "epoch": 1.298550724637681, "grad_norm": 0.886048436164856, "learning_rate": 2.6889805822984348e-06, "loss": 0.952, "step": 454 }, { "epoch": 1.3014492753623188, "grad_norm": 1.1092323064804077, "learning_rate": 2.668520894428259e-06, "loss": 1.0032, "step": 455 }, { "epoch": 1.3043478260869565, "grad_norm": 0.7811794877052307, "learning_rate": 2.648110970259454e-06, "loss": 0.9296, "step": 456 }, { "epoch": 1.3072463768115943, "grad_norm": 0.8023120164871216, "learning_rate": 2.6277512454267874e-06, "loss": 0.9304, "step": 457 }, { "epoch": 1.310144927536232, "grad_norm": 0.7649518251419067, "learning_rate": 2.607442154493568e-06, "loss": 0.9441, "step": 458 }, { "epoch": 1.3130434782608695, "grad_norm": 0.8725413680076599, "learning_rate": 2.5871841309423557e-06, "loss": 0.9637, "step": 459 }, { "epoch": 1.3159420289855073, "grad_norm": 0.7210862636566162, "learning_rate": 2.5669776071657194e-06, "loss": 0.9869, "step": 460 }, { "epoch": 1.318840579710145, "grad_norm": 0.8270391821861267, "learning_rate": 2.546823014456998e-06, "loss": 0.9164, "step": 461 }, { "epoch": 1.3217391304347825, "grad_norm": 0.829223096370697, "learning_rate": 2.526720783001107e-06, "loss": 1.0128, "step": 462 }, { "epoch": 1.3246376811594203, "grad_norm": 0.9681026935577393, "learning_rate": 2.506671341865341e-06, "loss": 0.9768, "step": 463 }, { "epoch": 1.327536231884058, "grad_norm": 0.840314507484436, "learning_rate": 2.486675118990233e-06, "loss": 0.9359, "step": 464 }, { "epoch": 1.3304347826086955, "grad_norm": 0.659677267074585, "learning_rate": 2.466732541180404e-06, "loss": 0.965, "step": 465 }, { "epoch": 1.3333333333333333, "grad_norm": 0.9055850505828857, "learning_rate": 2.4468440340954664e-06, "loss": 0.9557, "step": 466 }, { "epoch": 1.336231884057971, "grad_norm": 0.8318009972572327, "learning_rate": 2.4270100222409275e-06, "loss": 0.9111, "step": 467 }, { "epoch": 1.3391304347826087, "grad_norm": 0.9112004041671753, "learning_rate": 2.4072309289591394e-06, "loss": 0.9243, "step": 468 }, { "epoch": 1.3420289855072465, "grad_norm": 0.8032493591308594, "learning_rate": 2.387507176420256e-06, "loss": 0.9228, "step": 469 }, { "epoch": 1.344927536231884, "grad_norm": 0.662981390953064, "learning_rate": 2.3678391856132203e-06, "loss": 0.9778, "step": 470 }, { "epoch": 1.3478260869565217, "grad_norm": 0.8368533849716187, "learning_rate": 2.348227376336789e-06, "loss": 1.0145, "step": 471 }, { "epoch": 1.3507246376811595, "grad_norm": 0.9046915769577026, "learning_rate": 2.328672167190558e-06, "loss": 0.9393, "step": 472 }, { "epoch": 1.353623188405797, "grad_norm": 0.9030489921569824, "learning_rate": 2.3091739755660425e-06, "loss": 0.9636, "step": 473 }, { "epoch": 1.3565217391304347, "grad_norm": 0.8339246511459351, "learning_rate": 2.289733217637753e-06, "loss": 0.9395, "step": 474 }, { "epoch": 1.3594202898550725, "grad_norm": 0.7877910733222961, "learning_rate": 2.2703503083543288e-06, "loss": 0.9454, "step": 475 }, { "epoch": 1.3623188405797102, "grad_norm": 0.9808143377304077, "learning_rate": 2.2510256614296638e-06, "loss": 0.9968, "step": 476 }, { "epoch": 1.365217391304348, "grad_norm": 1.2518080472946167, "learning_rate": 2.2317596893340924e-06, "loss": 0.9732, "step": 477 }, { "epoch": 1.3681159420289855, "grad_norm": 0.8053367137908936, "learning_rate": 2.2125528032855727e-06, "loss": 0.9803, "step": 478 }, { "epoch": 1.3710144927536232, "grad_norm": 0.9491231441497803, "learning_rate": 2.1934054132409183e-06, "loss": 0.9332, "step": 479 }, { "epoch": 1.373913043478261, "grad_norm": 0.7503049373626709, "learning_rate": 2.174317927887041e-06, "loss": 0.9591, "step": 480 }, { "epoch": 1.3768115942028984, "grad_norm": 0.819608211517334, "learning_rate": 2.1552907546322356e-06, "loss": 0.9795, "step": 481 }, { "epoch": 1.3797101449275362, "grad_norm": 0.8053436279296875, "learning_rate": 2.136324299597474e-06, "loss": 1.0053, "step": 482 }, { "epoch": 1.382608695652174, "grad_norm": 0.7377948760986328, "learning_rate": 2.11741896760775e-06, "loss": 1.0277, "step": 483 }, { "epoch": 1.3855072463768117, "grad_norm": 0.865705668926239, "learning_rate": 2.098575162183422e-06, "loss": 0.9952, "step": 484 }, { "epoch": 1.3884057971014494, "grad_norm": 0.8623892664909363, "learning_rate": 2.0797932855316183e-06, "loss": 1.0304, "step": 485 }, { "epoch": 1.391304347826087, "grad_norm": 0.803113579750061, "learning_rate": 2.061073738537635e-06, "loss": 0.993, "step": 486 }, { "epoch": 1.3942028985507247, "grad_norm": 0.7748633623123169, "learning_rate": 2.0424169207563954e-06, "loss": 0.9103, "step": 487 }, { "epoch": 1.3971014492753624, "grad_norm": 0.9022510051727295, "learning_rate": 2.023823230403907e-06, "loss": 0.9125, "step": 488 }, { "epoch": 1.4, "grad_norm": 0.8588757514953613, "learning_rate": 2.005293064348773e-06, "loss": 1.0259, "step": 489 }, { "epoch": 1.4028985507246376, "grad_norm": 0.8985849618911743, "learning_rate": 1.9868268181037186e-06, "loss": 0.9839, "step": 490 }, { "epoch": 1.4057971014492754, "grad_norm": 0.8959106802940369, "learning_rate": 1.968424885817143e-06, "loss": 0.9752, "step": 491 }, { "epoch": 1.4086956521739131, "grad_norm": 0.9213183522224426, "learning_rate": 1.9500876602647167e-06, "loss": 0.9053, "step": 492 }, { "epoch": 1.4115942028985506, "grad_norm": 0.8219558596611023, "learning_rate": 1.931815532840987e-06, "loss": 0.9522, "step": 493 }, { "epoch": 1.4144927536231884, "grad_norm": 0.8716898560523987, "learning_rate": 1.913608893551036e-06, "loss": 0.9858, "step": 494 }, { "epoch": 1.4173913043478261, "grad_norm": 0.9072102904319763, "learning_rate": 1.8954681310021434e-06, "loss": 0.9382, "step": 495 }, { "epoch": 1.4202898550724639, "grad_norm": 0.8592570424079895, "learning_rate": 1.8773936323955055e-06, "loss": 1.0004, "step": 496 }, { "epoch": 1.4231884057971014, "grad_norm": 0.8882102966308594, "learning_rate": 1.8593857835179557e-06, "loss": 0.9862, "step": 497 }, { "epoch": 1.4260869565217391, "grad_norm": 0.851216197013855, "learning_rate": 1.8414449687337467e-06, "loss": 1.0109, "step": 498 }, { "epoch": 1.4289855072463769, "grad_norm": 0.7851223349571228, "learning_rate": 1.8235715709763285e-06, "loss": 0.9404, "step": 499 }, { "epoch": 1.4318840579710144, "grad_norm": 0.7435230612754822, "learning_rate": 1.8057659717401948e-06, "loss": 1.0388, "step": 500 }, { "epoch": 1.434782608695652, "grad_norm": 0.795467734336853, "learning_rate": 1.7880285510727197e-06, "loss": 1.0, "step": 501 }, { "epoch": 1.4376811594202898, "grad_norm": 0.8847975730895996, "learning_rate": 1.7703596875660645e-06, "loss": 1.0182, "step": 502 }, { "epoch": 1.4405797101449276, "grad_norm": 1.0256052017211914, "learning_rate": 1.7527597583490825e-06, "loss": 0.9573, "step": 503 }, { "epoch": 1.4434782608695653, "grad_norm": 0.7743212580680847, "learning_rate": 1.7352291390792798e-06, "loss": 0.9831, "step": 504 }, { "epoch": 1.4463768115942028, "grad_norm": 0.9608955979347229, "learning_rate": 1.7177682039347875e-06, "loss": 0.9683, "step": 505 }, { "epoch": 1.4492753623188406, "grad_norm": 0.899786651134491, "learning_rate": 1.7003773256063882e-06, "loss": 1.0373, "step": 506 }, { "epoch": 1.4521739130434783, "grad_norm": 0.933459997177124, "learning_rate": 1.6830568752895455e-06, "loss": 1.0065, "step": 507 }, { "epoch": 1.4550724637681158, "grad_norm": 0.7607547640800476, "learning_rate": 1.6658072226764949e-06, "loss": 0.9652, "step": 508 }, { "epoch": 1.4579710144927536, "grad_norm": 0.7857306599617004, "learning_rate": 1.6486287359483422e-06, "loss": 0.9943, "step": 509 }, { "epoch": 1.4608695652173913, "grad_norm": 0.9342886209487915, "learning_rate": 1.6315217817672142e-06, "loss": 1.028, "step": 510 }, { "epoch": 1.463768115942029, "grad_norm": 1.0333482027053833, "learning_rate": 1.614486725268426e-06, "loss": 0.9296, "step": 511 }, { "epoch": 1.4666666666666668, "grad_norm": 0.7788994908332825, "learning_rate": 1.5975239300526924e-06, "loss": 0.9871, "step": 512 }, { "epoch": 1.4695652173913043, "grad_norm": 0.764268159866333, "learning_rate": 1.5806337581783593e-06, "loss": 0.9603, "step": 513 }, { "epoch": 1.472463768115942, "grad_norm": 0.9053126573562622, "learning_rate": 1.5638165701536866e-06, "loss": 1.003, "step": 514 }, { "epoch": 1.4753623188405798, "grad_norm": 0.890696108341217, "learning_rate": 1.5470727249291423e-06, "loss": 0.9894, "step": 515 }, { "epoch": 1.4782608695652173, "grad_norm": 0.755885124206543, "learning_rate": 1.5304025798897521e-06, "loss": 0.9355, "step": 516 }, { "epoch": 1.481159420289855, "grad_norm": 0.8839924931526184, "learning_rate": 1.5138064908474603e-06, "loss": 0.9879, "step": 517 }, { "epoch": 1.4840579710144928, "grad_norm": 0.919336199760437, "learning_rate": 1.4972848120335453e-06, "loss": 1.042, "step": 518 }, { "epoch": 1.4869565217391305, "grad_norm": 1.0073022842407227, "learning_rate": 1.4808378960910502e-06, "loss": 1.0537, "step": 519 }, { "epoch": 1.4898550724637682, "grad_norm": 0.9994317293167114, "learning_rate": 1.4644660940672628e-06, "loss": 1.042, "step": 520 }, { "epoch": 1.4927536231884058, "grad_norm": 0.8237168788909912, "learning_rate": 1.448169755406218e-06, "loss": 0.9449, "step": 521 }, { "epoch": 1.4956521739130435, "grad_norm": 0.8838447332382202, "learning_rate": 1.4319492279412388e-06, "loss": 0.9789, "step": 522 }, { "epoch": 1.4956521739130435, "eval_loss": 0.9736447334289551, "eval_runtime": 46.3906, "eval_samples_per_second": 5.518, "eval_steps_per_second": 0.69, "step": 522 }, { "epoch": 1.4985507246376812, "grad_norm": 0.7661985754966736, "learning_rate": 1.4158048578875211e-06, "loss": 0.9991, "step": 523 }, { "epoch": 1.5014492753623188, "grad_norm": 0.8049348592758179, "learning_rate": 1.399736989834728e-06, "loss": 0.9455, "step": 524 }, { "epoch": 1.5043478260869565, "grad_norm": 0.8575480580329895, "learning_rate": 1.383745966739652e-06, "loss": 0.9764, "step": 525 }, { "epoch": 1.5072463768115942, "grad_norm": 0.7336897253990173, "learning_rate": 1.3678321299188802e-06, "loss": 0.9613, "step": 526 }, { "epoch": 1.5101449275362318, "grad_norm": 0.8718299865722656, "learning_rate": 1.351995819041521e-06, "loss": 0.9923, "step": 527 }, { "epoch": 1.5130434782608697, "grad_norm": 0.9166209101676941, "learning_rate": 1.336237372121944e-06, "loss": 1.069, "step": 528 }, { "epoch": 1.5159420289855072, "grad_norm": 0.9382581114768982, "learning_rate": 1.320557125512575e-06, "loss": 0.9671, "step": 529 }, { "epoch": 1.518840579710145, "grad_norm": 0.8037452101707458, "learning_rate": 1.3049554138967052e-06, "loss": 0.9395, "step": 530 }, { "epoch": 1.5217391304347827, "grad_norm": 0.6627395749092102, "learning_rate": 1.289432570281361e-06, "loss": 0.9025, "step": 531 }, { "epoch": 1.5246376811594202, "grad_norm": 0.7865214943885803, "learning_rate": 1.2739889259901866e-06, "loss": 0.9021, "step": 532 }, { "epoch": 1.527536231884058, "grad_norm": 0.8900570273399353, "learning_rate": 1.258624810656376e-06, "loss": 0.946, "step": 533 }, { "epoch": 1.5304347826086957, "grad_norm": 0.8942597508430481, "learning_rate": 1.2433405522156334e-06, "loss": 1.0141, "step": 534 }, { "epoch": 1.5333333333333332, "grad_norm": 0.8667037487030029, "learning_rate": 1.2281364768991804e-06, "loss": 1.0092, "step": 535 }, { "epoch": 1.5362318840579712, "grad_norm": 0.7895119190216064, "learning_rate": 1.213012909226786e-06, "loss": 0.9251, "step": 536 }, { "epoch": 1.5391304347826087, "grad_norm": 0.8225801587104797, "learning_rate": 1.1979701719998454e-06, "loss": 0.9449, "step": 537 }, { "epoch": 1.5420289855072464, "grad_norm": 0.8342156410217285, "learning_rate": 1.1830085862944851e-06, "loss": 0.9676, "step": 538 }, { "epoch": 1.5449275362318842, "grad_norm": 0.7941964864730835, "learning_rate": 1.1681284714547147e-06, "loss": 0.9907, "step": 539 }, { "epoch": 1.5478260869565217, "grad_norm": 0.9655299782752991, "learning_rate": 1.1533301450856054e-06, "loss": 1.0126, "step": 540 }, { "epoch": 1.5507246376811594, "grad_norm": 0.8632703423500061, "learning_rate": 1.1386139230465176e-06, "loss": 0.9452, "step": 541 }, { "epoch": 1.5536231884057972, "grad_norm": 0.8908371329307556, "learning_rate": 1.1239801194443507e-06, "loss": 0.9821, "step": 542 }, { "epoch": 1.5565217391304347, "grad_norm": 0.873409628868103, "learning_rate": 1.1094290466268493e-06, "loss": 0.969, "step": 543 }, { "epoch": 1.5594202898550724, "grad_norm": 0.8888543844223022, "learning_rate": 1.0949610151759233e-06, "loss": 0.9593, "step": 544 }, { "epoch": 1.5623188405797102, "grad_norm": 0.7646573781967163, "learning_rate": 1.0805763339010329e-06, "loss": 0.9287, "step": 545 }, { "epoch": 1.5652173913043477, "grad_norm": 0.835421085357666, "learning_rate": 1.066275309832584e-06, "loss": 0.9732, "step": 546 }, { "epoch": 1.5681159420289856, "grad_norm": 0.9228112697601318, "learning_rate": 1.0520582482153874e-06, "loss": 0.9675, "step": 547 }, { "epoch": 1.5710144927536231, "grad_norm": 0.7750451564788818, "learning_rate": 1.037925452502131e-06, "loss": 0.9938, "step": 548 }, { "epoch": 1.5739130434782609, "grad_norm": 0.8366883397102356, "learning_rate": 1.0238772243469153e-06, "loss": 0.962, "step": 549 }, { "epoch": 1.5768115942028986, "grad_norm": 0.933855414390564, "learning_rate": 1.0099138635988026e-06, "loss": 0.9732, "step": 550 }, { "epoch": 1.5797101449275361, "grad_norm": 0.9288073778152466, "learning_rate": 9.960356682954293e-07, "loss": 0.9958, "step": 551 }, { "epoch": 1.5826086956521739, "grad_norm": 0.7197360992431641, "learning_rate": 9.822429346566314e-07, "loss": 0.9266, "step": 552 }, { "epoch": 1.5855072463768116, "grad_norm": 0.8900216817855835, "learning_rate": 9.685359570781344e-07, "loss": 1.0006, "step": 553 }, { "epoch": 1.5884057971014491, "grad_norm": 0.7970424294471741, "learning_rate": 9.549150281252633e-07, "loss": 0.968, "step": 554 }, { "epoch": 1.591304347826087, "grad_norm": 0.9357386231422424, "learning_rate": 9.41380438526694e-07, "loss": 1.0361, "step": 555 }, { "epoch": 1.5942028985507246, "grad_norm": 0.740880012512207, "learning_rate": 9.279324771682586e-07, "loss": 0.9492, "step": 556 }, { "epoch": 1.5971014492753624, "grad_norm": 0.9611430764198303, "learning_rate": 9.145714310867676e-07, "loss": 0.9559, "step": 557 }, { "epoch": 1.6, "grad_norm": 0.9163907170295715, "learning_rate": 9.01297585463895e-07, "loss": 1.0112, "step": 558 }, { "epoch": 1.6028985507246376, "grad_norm": 0.9926815032958984, "learning_rate": 8.881112236200795e-07, "loss": 1.0813, "step": 559 }, { "epoch": 1.6057971014492753, "grad_norm": 0.8820666074752808, "learning_rate": 8.750126270084891e-07, "loss": 0.9911, "step": 560 }, { "epoch": 1.608695652173913, "grad_norm": 0.817694365978241, "learning_rate": 8.620020752090008e-07, "loss": 0.9162, "step": 561 }, { "epoch": 1.6115942028985506, "grad_norm": 0.9005435109138489, "learning_rate": 8.490798459222477e-07, "loss": 1.015, "step": 562 }, { "epoch": 1.6144927536231886, "grad_norm": 0.8248128890991211, "learning_rate": 8.362462149636757e-07, "loss": 0.9976, "step": 563 }, { "epoch": 1.617391304347826, "grad_norm": 0.8286884427070618, "learning_rate": 8.235014562576732e-07, "loss": 0.992, "step": 564 }, { "epoch": 1.6202898550724638, "grad_norm": 0.8723387718200684, "learning_rate": 8.108458418317089e-07, "loss": 0.9381, "step": 565 }, { "epoch": 1.6231884057971016, "grad_norm": 0.9833754897117615, "learning_rate": 7.98279641810537e-07, "loss": 0.9435, "step": 566 }, { "epoch": 1.626086956521739, "grad_norm": 0.9212725162506104, "learning_rate": 7.858031244104247e-07, "loss": 0.9611, "step": 567 }, { "epoch": 1.6289855072463768, "grad_norm": 0.852350115776062, "learning_rate": 7.734165559334327e-07, "loss": 0.9064, "step": 568 }, { "epoch": 1.6318840579710145, "grad_norm": 0.8955137729644775, "learning_rate": 7.611202007617241e-07, "loss": 0.9547, "step": 569 }, { "epoch": 1.634782608695652, "grad_norm": 0.8889902830123901, "learning_rate": 7.489143213519301e-07, "loss": 0.9533, "step": 570 }, { "epoch": 1.6376811594202898, "grad_norm": 0.9037710428237915, "learning_rate": 7.367991782295392e-07, "loss": 0.9213, "step": 571 }, { "epoch": 1.6405797101449275, "grad_norm": 0.8594886064529419, "learning_rate": 7.24775029983345e-07, "loss": 0.9765, "step": 572 }, { "epoch": 1.643478260869565, "grad_norm": 0.7082343101501465, "learning_rate": 7.128421332599189e-07, "loss": 0.9871, "step": 573 }, { "epoch": 1.646376811594203, "grad_norm": 0.878217339515686, "learning_rate": 7.010007427581378e-07, "loss": 0.9366, "step": 574 }, { "epoch": 1.6492753623188405, "grad_norm": 0.9462459087371826, "learning_rate": 6.892511112237472e-07, "loss": 0.9505, "step": 575 }, { "epoch": 1.6521739130434783, "grad_norm": 0.7900387644767761, "learning_rate": 6.775934894439606e-07, "loss": 0.9554, "step": 576 }, { "epoch": 1.655072463768116, "grad_norm": 0.8542242050170898, "learning_rate": 6.66028126242117e-07, "loss": 0.9331, "step": 577 }, { "epoch": 1.6579710144927535, "grad_norm": 0.9795560836791992, "learning_rate": 6.545552684723583e-07, "loss": 0.9203, "step": 578 }, { "epoch": 1.6608695652173913, "grad_norm": 0.7833444476127625, "learning_rate": 6.431751610143716e-07, "loss": 0.9977, "step": 579 }, { "epoch": 1.663768115942029, "grad_norm": 0.8404137492179871, "learning_rate": 6.318880467681527e-07, "loss": 0.9981, "step": 580 }, { "epoch": 1.6666666666666665, "grad_norm": 0.9158584475517273, "learning_rate": 6.206941666488287e-07, "loss": 0.9584, "step": 581 }, { "epoch": 1.6695652173913045, "grad_norm": 0.7720228433609009, "learning_rate": 6.095937595815104e-07, "loss": 0.9284, "step": 582 }, { "epoch": 1.672463768115942, "grad_norm": 0.9077423214912415, "learning_rate": 5.985870624961993e-07, "loss": 1.0104, "step": 583 }, { "epoch": 1.6753623188405797, "grad_norm": 0.7142834663391113, "learning_rate": 5.876743103227217e-07, "loss": 0.9617, "step": 584 }, { "epoch": 1.6782608695652175, "grad_norm": 0.9244917035102844, "learning_rate": 5.768557359857241e-07, "loss": 0.9534, "step": 585 }, { "epoch": 1.681159420289855, "grad_norm": 0.8961134552955627, "learning_rate": 5.661315703996905e-07, "loss": 0.9462, "step": 586 }, { "epoch": 1.6840579710144927, "grad_norm": 0.9584707021713257, "learning_rate": 5.555020424640267e-07, "loss": 0.9483, "step": 587 }, { "epoch": 1.6869565217391305, "grad_norm": 0.8094743490219116, "learning_rate": 5.449673790581611e-07, "loss": 0.9564, "step": 588 }, { "epoch": 1.689855072463768, "grad_norm": 0.886703610420227, "learning_rate": 5.345278050367142e-07, "loss": 1.0153, "step": 589 }, { "epoch": 1.692753623188406, "grad_norm": 0.9125918745994568, "learning_rate": 5.241835432246888e-07, "loss": 0.9749, "step": 590 }, { "epoch": 1.6956521739130435, "grad_norm": 0.8972467184066772, "learning_rate": 5.139348144127237e-07, "loss": 1.0084, "step": 591 }, { "epoch": 1.6985507246376812, "grad_norm": 0.7566870450973511, "learning_rate": 5.037818373523723e-07, "loss": 0.9932, "step": 592 }, { "epoch": 1.701449275362319, "grad_norm": 0.8601511716842651, "learning_rate": 4.937248287514407e-07, "loss": 0.9747, "step": 593 }, { "epoch": 1.7043478260869565, "grad_norm": 0.8272446393966675, "learning_rate": 4.837640032693558e-07, "loss": 1.0065, "step": 594 }, { "epoch": 1.7072463768115942, "grad_norm": 0.7029653191566467, "learning_rate": 4.738995735125895e-07, "loss": 0.9384, "step": 595 }, { "epoch": 1.710144927536232, "grad_norm": 0.913718044757843, "learning_rate": 4.641317500301173e-07, "loss": 0.9563, "step": 596 }, { "epoch": 1.7130434782608694, "grad_norm": 0.9736040830612183, "learning_rate": 4.5446074130892525e-07, "loss": 0.9455, "step": 597 }, { "epoch": 1.7159420289855074, "grad_norm": 0.8182763457298279, "learning_rate": 4.448867537695578e-07, "loss": 0.944, "step": 598 }, { "epoch": 1.718840579710145, "grad_norm": 0.8536428213119507, "learning_rate": 4.3540999176171717e-07, "loss": 0.9029, "step": 599 }, { "epoch": 1.7217391304347827, "grad_norm": 0.8713299036026001, "learning_rate": 4.2603065755989493e-07, "loss": 0.9448, "step": 600 }, { "epoch": 1.7246376811594204, "grad_norm": 0.9857087135314941, "learning_rate": 4.167489513590611e-07, "loss": 1.0004, "step": 601 }, { "epoch": 1.727536231884058, "grad_norm": 0.9195379018783569, "learning_rate": 4.0756507127038494e-07, "loss": 1.0247, "step": 602 }, { "epoch": 1.7304347826086957, "grad_norm": 0.8422645926475525, "learning_rate": 3.984792133170129e-07, "loss": 1.0087, "step": 603 }, { "epoch": 1.7333333333333334, "grad_norm": 0.8902682662010193, "learning_rate": 3.894915714298775e-07, "loss": 0.8793, "step": 604 }, { "epoch": 1.736231884057971, "grad_norm": 0.8859000205993652, "learning_rate": 3.8060233744356634e-07, "loss": 1.0018, "step": 605 }, { "epoch": 1.7391304347826086, "grad_norm": 0.8340051174163818, "learning_rate": 3.71811701092219e-07, "loss": 0.9534, "step": 606 }, { "epoch": 1.7420289855072464, "grad_norm": 0.8677003979682922, "learning_rate": 3.6311985000548223e-07, "loss": 0.9525, "step": 607 }, { "epoch": 1.744927536231884, "grad_norm": 0.932613730430603, "learning_rate": 3.5452696970450674e-07, "loss": 0.9257, "step": 608 }, { "epoch": 1.7478260869565219, "grad_norm": 0.9657606482505798, "learning_rate": 3.4603324359798016e-07, "loss": 1.0033, "step": 609 }, { "epoch": 1.7478260869565219, "eval_loss": 0.9723503589630127, "eval_runtime": 46.2237, "eval_samples_per_second": 5.538, "eval_steps_per_second": 0.692, "step": 609 }, { "epoch": 1.7507246376811594, "grad_norm": 0.860346257686615, "learning_rate": 3.3763885297822153e-07, "loss": 0.986, "step": 610 }, { "epoch": 1.7536231884057971, "grad_norm": 0.8614711165428162, "learning_rate": 3.293439770173046e-07, "loss": 0.9976, "step": 611 }, { "epoch": 1.7565217391304349, "grad_norm": 0.7311533689498901, "learning_rate": 3.2114879276323783e-07, "loss": 0.908, "step": 612 }, { "epoch": 1.7594202898550724, "grad_norm": 0.9412534236907959, "learning_rate": 3.130534751361808e-07, "loss": 0.977, "step": 613 }, { "epoch": 1.76231884057971, "grad_norm": 0.911098062992096, "learning_rate": 3.0505819692471797e-07, "loss": 0.9387, "step": 614 }, { "epoch": 1.7652173913043478, "grad_norm": 0.8363705277442932, "learning_rate": 2.9716312878216194e-07, "loss": 0.9538, "step": 615 }, { "epoch": 1.7681159420289854, "grad_norm": 0.9569475650787354, "learning_rate": 2.893684392229185e-07, "loss": 0.998, "step": 616 }, { "epoch": 1.7710144927536233, "grad_norm": 0.8830727338790894, "learning_rate": 2.8167429461888496e-07, "loss": 0.9277, "step": 617 }, { "epoch": 1.7739130434782608, "grad_norm": 0.9968934059143066, "learning_rate": 2.7408085919590265e-07, "loss": 1.0167, "step": 618 }, { "epoch": 1.7768115942028986, "grad_norm": 0.7348361611366272, "learning_rate": 2.6658829503024566e-07, "loss": 0.9224, "step": 619 }, { "epoch": 1.7797101449275363, "grad_norm": 0.9676991701126099, "learning_rate": 2.5919676204517073e-07, "loss": 0.9808, "step": 620 }, { "epoch": 1.7826086956521738, "grad_norm": 0.8737136125564575, "learning_rate": 2.5190641800749424e-07, "loss": 0.9436, "step": 621 }, { "epoch": 1.7855072463768116, "grad_norm": 0.8523948192596436, "learning_rate": 2.447174185242324e-07, "loss": 0.952, "step": 622 }, { "epoch": 1.7884057971014493, "grad_norm": 0.7342602610588074, "learning_rate": 2.3762991703927375e-07, "loss": 0.9682, "step": 623 }, { "epoch": 1.7913043478260868, "grad_norm": 1.044270634651184, "learning_rate": 2.3064406483010947e-07, "loss": 0.9725, "step": 624 }, { "epoch": 1.7942028985507248, "grad_norm": 0.9236974120140076, "learning_rate": 2.237600110046001e-07, "loss": 0.951, "step": 625 }, { "epoch": 1.7971014492753623, "grad_norm": 0.7988727688789368, "learning_rate": 2.1697790249779638e-07, "loss": 0.8851, "step": 626 }, { "epoch": 1.8, "grad_norm": 0.7906875014305115, "learning_rate": 2.102978840687997e-07, "loss": 0.9162, "step": 627 }, { "epoch": 1.8028985507246378, "grad_norm": 0.7702775001525879, "learning_rate": 2.0372009829767558e-07, "loss": 0.9614, "step": 628 }, { "epoch": 1.8057971014492753, "grad_norm": 0.9317652583122253, "learning_rate": 1.9724468558240838e-07, "loss": 0.9105, "step": 629 }, { "epoch": 1.808695652173913, "grad_norm": 0.855368435382843, "learning_rate": 1.908717841359048e-07, "loss": 1.0019, "step": 630 }, { "epoch": 1.8115942028985508, "grad_norm": 0.761951744556427, "learning_rate": 1.8460152998304393e-07, "loss": 0.9267, "step": 631 }, { "epoch": 1.8144927536231883, "grad_norm": 0.8468912839889526, "learning_rate": 1.7843405695777582e-07, "loss": 1.0065, "step": 632 }, { "epoch": 1.8173913043478263, "grad_norm": 0.889159619808197, "learning_rate": 1.7236949670026037e-07, "loss": 0.9332, "step": 633 }, { "epoch": 1.8202898550724638, "grad_norm": 0.8339653015136719, "learning_rate": 1.664079786540629e-07, "loss": 0.9851, "step": 634 }, { "epoch": 1.8231884057971013, "grad_norm": 0.7670577764511108, "learning_rate": 1.6054963006338742e-07, "loss": 0.9354, "step": 635 }, { "epoch": 1.8260869565217392, "grad_norm": 0.8923590183258057, "learning_rate": 1.547945759703623e-07, "loss": 1.0162, "step": 636 }, { "epoch": 1.8289855072463768, "grad_norm": 0.7903847098350525, "learning_rate": 1.491429392123711e-07, "loss": 0.979, "step": 637 }, { "epoch": 1.8318840579710145, "grad_norm": 0.9351047873497009, "learning_rate": 1.435948404194304e-07, "loss": 0.9458, "step": 638 }, { "epoch": 1.8347826086956522, "grad_norm": 0.8081286549568176, "learning_rate": 1.3815039801161723e-07, "loss": 0.9246, "step": 639 }, { "epoch": 1.8376811594202898, "grad_norm": 0.752216100692749, "learning_rate": 1.328097281965357e-07, "loss": 0.9758, "step": 640 }, { "epoch": 1.8405797101449275, "grad_norm": 0.9659929871559143, "learning_rate": 1.2757294496684447e-07, "loss": 1.0107, "step": 641 }, { "epoch": 1.8434782608695652, "grad_norm": 1.0376217365264893, "learning_rate": 1.22440160097817e-07, "loss": 0.9631, "step": 642 }, { "epoch": 1.8463768115942027, "grad_norm": 0.9361832141876221, "learning_rate": 1.1741148314495965e-07, "loss": 0.9867, "step": 643 }, { "epoch": 1.8492753623188407, "grad_norm": 0.8664498329162598, "learning_rate": 1.1248702144167123e-07, "loss": 0.9703, "step": 644 }, { "epoch": 1.8521739130434782, "grad_norm": 0.9653159379959106, "learning_rate": 1.0766688009695548e-07, "loss": 0.9662, "step": 645 }, { "epoch": 1.855072463768116, "grad_norm": 1.0553069114685059, "learning_rate": 1.0295116199317057e-07, "loss": 0.9745, "step": 646 }, { "epoch": 1.8579710144927537, "grad_norm": 0.9453853964805603, "learning_rate": 9.833996778384259e-08, "loss": 0.9802, "step": 647 }, { "epoch": 1.8608695652173912, "grad_norm": 0.7949392795562744, "learning_rate": 9.383339589150776e-08, "loss": 0.9173, "step": 648 }, { "epoch": 1.863768115942029, "grad_norm": 0.7941511273384094, "learning_rate": 8.943154250562025e-08, "loss": 0.9633, "step": 649 }, { "epoch": 1.8666666666666667, "grad_norm": 0.8360518217086792, "learning_rate": 8.513450158049109e-08, "loss": 0.9565, "step": 650 }, { "epoch": 1.8695652173913042, "grad_norm": 0.9996237754821777, "learning_rate": 8.094236483329022e-08, "loss": 0.9999, "step": 651 }, { "epoch": 1.8724637681159422, "grad_norm": 0.7493065595626831, "learning_rate": 7.685522174208205e-08, "loss": 0.9733, "step": 652 }, { "epoch": 1.8753623188405797, "grad_norm": 0.8603729605674744, "learning_rate": 7.287315954392137e-08, "loss": 0.9624, "step": 653 }, { "epoch": 1.8782608695652174, "grad_norm": 0.7145766615867615, "learning_rate": 6.899626323298714e-08, "loss": 1.0049, "step": 654 }, { "epoch": 1.8811594202898552, "grad_norm": 0.9684036374092102, "learning_rate": 6.522461555877213e-08, "loss": 0.9562, "step": 655 }, { "epoch": 1.8840579710144927, "grad_norm": 0.8989734053611755, "learning_rate": 6.15582970243117e-08, "loss": 1.0268, "step": 656 }, { "epoch": 1.8869565217391304, "grad_norm": 0.9243214726448059, "learning_rate": 5.799738588447068e-08, "loss": 0.9643, "step": 657 }, { "epoch": 1.8898550724637682, "grad_norm": 0.9879785776138306, "learning_rate": 5.454195814427021e-08, "loss": 0.9417, "step": 658 }, { "epoch": 1.8927536231884057, "grad_norm": 0.9754204154014587, "learning_rate": 5.119208755726579e-08, "loss": 1.063, "step": 659 }, { "epoch": 1.8956521739130436, "grad_norm": 0.7662235498428345, "learning_rate": 4.794784562397459e-08, "loss": 0.9799, "step": 660 }, { "epoch": 1.8985507246376812, "grad_norm": 0.8312128782272339, "learning_rate": 4.4809301590345576e-08, "loss": 0.9671, "step": 661 }, { "epoch": 1.901449275362319, "grad_norm": 0.8354112505912781, "learning_rate": 4.177652244628627e-08, "loss": 0.9688, "step": 662 }, { "epoch": 1.9043478260869566, "grad_norm": 0.9401686191558838, "learning_rate": 3.884957292422997e-08, "loss": 0.9989, "step": 663 }, { "epoch": 1.9072463768115941, "grad_norm": 0.8864877820014954, "learning_rate": 3.602851549775521e-08, "loss": 1.0094, "step": 664 }, { "epoch": 1.9101449275362319, "grad_norm": 0.9440781474113464, "learning_rate": 3.3313410380250157e-08, "loss": 0.9544, "step": 665 }, { "epoch": 1.9130434782608696, "grad_norm": 1.0098837614059448, "learning_rate": 3.0704315523631956e-08, "loss": 0.9209, "step": 666 }, { "epoch": 1.9159420289855071, "grad_norm": 0.9735342860221863, "learning_rate": 2.8201286617103863e-08, "loss": 1.0385, "step": 667 }, { "epoch": 1.9188405797101449, "grad_norm": 0.9122427105903625, "learning_rate": 2.5804377085972278e-08, "loss": 0.9844, "step": 668 }, { "epoch": 1.9217391304347826, "grad_norm": 0.8491829633712769, "learning_rate": 2.351363809050211e-08, "loss": 1.0045, "step": 669 }, { "epoch": 1.9246376811594201, "grad_norm": 0.83339524269104, "learning_rate": 2.1329118524827662e-08, "loss": 0.9844, "step": 670 }, { "epoch": 1.927536231884058, "grad_norm": 0.9295774102210999, "learning_rate": 1.9250865015906784e-08, "loss": 1.0247, "step": 671 }, { "epoch": 1.9304347826086956, "grad_norm": 0.8484298586845398, "learning_rate": 1.7278921922527224e-08, "loss": 1.0195, "step": 672 }, { "epoch": 1.9333333333333333, "grad_norm": 0.8862564563751221, "learning_rate": 1.541333133436018e-08, "loss": 0.9827, "step": 673 }, { "epoch": 1.936231884057971, "grad_norm": 0.8401779532432556, "learning_rate": 1.3654133071059894e-08, "loss": 1.0295, "step": 674 }, { "epoch": 1.9391304347826086, "grad_norm": 0.8818807005882263, "learning_rate": 1.200136468141544e-08, "loss": 0.9554, "step": 675 }, { "epoch": 1.9420289855072463, "grad_norm": 0.8366807699203491, "learning_rate": 1.0455061442548597e-08, "loss": 0.9771, "step": 676 }, { "epoch": 1.944927536231884, "grad_norm": 0.8115973472595215, "learning_rate": 9.015256359161118e-09, "loss": 1.0364, "step": 677 }, { "epoch": 1.9478260869565216, "grad_norm": 0.925413191318512, "learning_rate": 7.681980162830283e-09, "loss": 1.0026, "step": 678 }, { "epoch": 1.9507246376811596, "grad_norm": 0.8799839615821838, "learning_rate": 6.455261311352767e-09, "loss": 1.0164, "step": 679 }, { "epoch": 1.953623188405797, "grad_norm": 0.8579555153846741, "learning_rate": 5.3351259881379016e-09, "loss": 0.9775, "step": 680 }, { "epoch": 1.9565217391304348, "grad_norm": 0.8572901487350464, "learning_rate": 4.321598101647007e-09, "loss": 0.9926, "step": 681 }, { "epoch": 1.9594202898550726, "grad_norm": 0.7731289863586426, "learning_rate": 3.41469928488547e-09, "loss": 1.0126, "step": 682 }, { "epoch": 1.96231884057971, "grad_norm": 0.937656581401825, "learning_rate": 2.6144488949392253e-09, "loss": 0.9443, "step": 683 }, { "epoch": 1.9652173913043478, "grad_norm": 0.8993798494338989, "learning_rate": 1.9208640125628618e-09, "loss": 0.946, "step": 684 }, { "epoch": 1.9681159420289855, "grad_norm": 0.9831903576850891, "learning_rate": 1.3339594418138036e-09, "loss": 0.9799, "step": 685 }, { "epoch": 1.971014492753623, "grad_norm": 0.9224021434783936, "learning_rate": 8.537477097364522e-10, "loss": 0.9299, "step": 686 }, { "epoch": 1.973913043478261, "grad_norm": 0.8220890760421753, "learning_rate": 4.802390660968437e-10, "loss": 1.0307, "step": 687 }, { "epoch": 1.9768115942028985, "grad_norm": 1.0893397331237793, "learning_rate": 2.1344148316060352e-10, "loss": 0.9523, "step": 688 }, { "epoch": 1.9797101449275363, "grad_norm": 0.8536267280578613, "learning_rate": 5.336065552641323e-11, "loss": 0.9675, "step": 689 }, { "epoch": 1.982608695652174, "grad_norm": 0.8123190999031067, "learning_rate": 0.0, "loss": 0.9576, "step": 690 } ], "logging_steps": 1, "max_steps": 690, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 173, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 3.816855525560156e+18, "train_batch_size": 1, "trial_name": null, "trial_params": null }