diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,4927 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.982608695652174, + "eval_steps": 87, + "global_step": 690, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.002898550724637681, + "grad_norm": 0.44052618741989136, + "learning_rate": 1.0000000000000002e-06, + "loss": 1.4473, + "step": 1 + }, + { + "epoch": 0.002898550724637681, + "eval_loss": 1.4117156267166138, + "eval_runtime": 46.1446, + "eval_samples_per_second": 5.548, + "eval_steps_per_second": 0.693, + "step": 1 + }, + { + "epoch": 0.005797101449275362, + "grad_norm": 0.4932183027267456, + "learning_rate": 2.0000000000000003e-06, + "loss": 1.3923, + "step": 2 + }, + { + "epoch": 0.008695652173913044, + "grad_norm": 0.4844379723072052, + "learning_rate": 3e-06, + "loss": 1.4468, + "step": 3 + }, + { + "epoch": 0.011594202898550725, + "grad_norm": 0.5023930668830872, + "learning_rate": 4.000000000000001e-06, + "loss": 1.3773, + "step": 4 + }, + { + "epoch": 0.014492753623188406, + "grad_norm": 0.483876496553421, + "learning_rate": 5e-06, + "loss": 1.4103, + "step": 5 + }, + { + "epoch": 0.017391304347826087, + "grad_norm": 0.4460753798484802, + "learning_rate": 6e-06, + "loss": 1.4707, + "step": 6 + }, + { + "epoch": 0.020289855072463767, + "grad_norm": 0.4342319369316101, + "learning_rate": 7e-06, + "loss": 1.3563, + "step": 7 + }, + { + "epoch": 0.02318840579710145, + "grad_norm": 0.479257196187973, + "learning_rate": 8.000000000000001e-06, + "loss": 1.414, + "step": 8 + }, + { + "epoch": 0.02608695652173913, + "grad_norm": 0.5028970241546631, + "learning_rate": 9e-06, + "loss": 1.4601, + "step": 9 + }, + { + "epoch": 0.028985507246376812, + "grad_norm": 0.49131453037261963, + "learning_rate": 1e-05, + "loss": 1.4364, + "step": 10 + }, + { + "epoch": 0.03188405797101449, + "grad_norm": 0.5517832040786743, + "learning_rate": 9.999946639344475e-06, + "loss": 1.4873, + "step": 11 + }, + { + "epoch": 0.034782608695652174, + "grad_norm": 0.5310211181640625, + "learning_rate": 9.99978655851684e-06, + "loss": 1.4346, + "step": 12 + }, + { + "epoch": 0.03768115942028986, + "grad_norm": 0.4639141857624054, + "learning_rate": 9.999519760933905e-06, + "loss": 1.4402, + "step": 13 + }, + { + "epoch": 0.04057971014492753, + "grad_norm": 0.47811073064804077, + "learning_rate": 9.999146252290264e-06, + "loss": 1.4106, + "step": 14 + }, + { + "epoch": 0.043478260869565216, + "grad_norm": 0.5223386883735657, + "learning_rate": 9.998666040558187e-06, + "loss": 1.3732, + "step": 15 + }, + { + "epoch": 0.0463768115942029, + "grad_norm": 0.5601791143417358, + "learning_rate": 9.998079135987437e-06, + "loss": 1.4166, + "step": 16 + }, + { + "epoch": 0.04927536231884058, + "grad_norm": 0.5459745526313782, + "learning_rate": 9.997385551105061e-06, + "loss": 1.4501, + "step": 17 + }, + { + "epoch": 0.05217391304347826, + "grad_norm": 0.6155043244361877, + "learning_rate": 9.996585300715117e-06, + "loss": 1.3987, + "step": 18 + }, + { + "epoch": 0.05507246376811594, + "grad_norm": 0.539135754108429, + "learning_rate": 9.995678401898354e-06, + "loss": 1.3943, + "step": 19 + }, + { + "epoch": 0.057971014492753624, + "grad_norm": 0.5232663154602051, + "learning_rate": 9.994664874011864e-06, + "loss": 1.3742, + "step": 20 + }, + { + "epoch": 0.06086956521739131, + "grad_norm": 0.4995758533477783, + "learning_rate": 9.993544738688647e-06, + "loss": 1.3969, + "step": 21 + }, + { + "epoch": 0.06376811594202898, + "grad_norm": 0.5397970080375671, + "learning_rate": 9.992318019837171e-06, + "loss": 1.3238, + "step": 22 + }, + { + "epoch": 0.06666666666666667, + "grad_norm": 0.5533668994903564, + "learning_rate": 9.990984743640839e-06, + "loss": 1.3717, + "step": 23 + }, + { + "epoch": 0.06956521739130435, + "grad_norm": 0.5304050445556641, + "learning_rate": 9.989544938557453e-06, + "loss": 1.3565, + "step": 24 + }, + { + "epoch": 0.07246376811594203, + "grad_norm": 0.5658550262451172, + "learning_rate": 9.987998635318586e-06, + "loss": 1.3075, + "step": 25 + }, + { + "epoch": 0.07536231884057971, + "grad_norm": 0.5798805952072144, + "learning_rate": 9.98634586692894e-06, + "loss": 1.4202, + "step": 26 + }, + { + "epoch": 0.0782608695652174, + "grad_norm": 0.49352607131004333, + "learning_rate": 9.984586668665641e-06, + "loss": 1.3172, + "step": 27 + }, + { + "epoch": 0.08115942028985507, + "grad_norm": 0.576454222202301, + "learning_rate": 9.982721078077474e-06, + "loss": 1.3633, + "step": 28 + }, + { + "epoch": 0.08405797101449275, + "grad_norm": 0.5843266248703003, + "learning_rate": 9.980749134984094e-06, + "loss": 1.3031, + "step": 29 + }, + { + "epoch": 0.08695652173913043, + "grad_norm": 0.5863199234008789, + "learning_rate": 9.978670881475173e-06, + "loss": 1.3228, + "step": 30 + }, + { + "epoch": 0.08985507246376812, + "grad_norm": 0.6071418523788452, + "learning_rate": 9.9764863619095e-06, + "loss": 1.3277, + "step": 31 + }, + { + "epoch": 0.0927536231884058, + "grad_norm": 0.5361754298210144, + "learning_rate": 9.97419562291403e-06, + "loss": 1.3189, + "step": 32 + }, + { + "epoch": 0.09565217391304348, + "grad_norm": 0.6043053865432739, + "learning_rate": 9.971798713382896e-06, + "loss": 1.2567, + "step": 33 + }, + { + "epoch": 0.09855072463768116, + "grad_norm": 0.4795907139778137, + "learning_rate": 9.96929568447637e-06, + "loss": 1.33, + "step": 34 + }, + { + "epoch": 0.10144927536231885, + "grad_norm": 0.5752019882202148, + "learning_rate": 9.96668658961975e-06, + "loss": 1.1915, + "step": 35 + }, + { + "epoch": 0.10434782608695652, + "grad_norm": 0.47888195514678955, + "learning_rate": 9.963971484502247e-06, + "loss": 1.2753, + "step": 36 + }, + { + "epoch": 0.1072463768115942, + "grad_norm": 0.5371452569961548, + "learning_rate": 9.96115042707577e-06, + "loss": 1.2659, + "step": 37 + }, + { + "epoch": 0.11014492753623188, + "grad_norm": 0.6198606491088867, + "learning_rate": 9.958223477553715e-06, + "loss": 1.2166, + "step": 38 + }, + { + "epoch": 0.11304347826086956, + "grad_norm": 0.4718591272830963, + "learning_rate": 9.955190698409656e-06, + "loss": 1.2708, + "step": 39 + }, + { + "epoch": 0.11594202898550725, + "grad_norm": 0.5691114068031311, + "learning_rate": 9.952052154376027e-06, + "loss": 1.2074, + "step": 40 + }, + { + "epoch": 0.11884057971014493, + "grad_norm": 0.515771210193634, + "learning_rate": 9.948807912442735e-06, + "loss": 1.1958, + "step": 41 + }, + { + "epoch": 0.12173913043478261, + "grad_norm": 0.6830301880836487, + "learning_rate": 9.945458041855732e-06, + "loss": 1.2992, + "step": 42 + }, + { + "epoch": 0.1246376811594203, + "grad_norm": 0.5583641529083252, + "learning_rate": 9.94200261411553e-06, + "loss": 1.2654, + "step": 43 + }, + { + "epoch": 0.12753623188405797, + "grad_norm": 0.5985351800918579, + "learning_rate": 9.938441702975689e-06, + "loss": 1.2064, + "step": 44 + }, + { + "epoch": 0.13043478260869565, + "grad_norm": 0.5092725157737732, + "learning_rate": 9.93477538444123e-06, + "loss": 1.1477, + "step": 45 + }, + { + "epoch": 0.13333333333333333, + "grad_norm": 0.5719948410987854, + "learning_rate": 9.931003736767013e-06, + "loss": 1.3045, + "step": 46 + }, + { + "epoch": 0.13623188405797101, + "grad_norm": 0.5000984072685242, + "learning_rate": 9.92712684045608e-06, + "loss": 1.2954, + "step": 47 + }, + { + "epoch": 0.1391304347826087, + "grad_norm": 0.6268609762191772, + "learning_rate": 9.923144778257918e-06, + "loss": 1.2742, + "step": 48 + }, + { + "epoch": 0.14202898550724638, + "grad_norm": 0.5395749807357788, + "learning_rate": 9.91905763516671e-06, + "loss": 1.1651, + "step": 49 + }, + { + "epoch": 0.14492753623188406, + "grad_norm": 0.6797102689743042, + "learning_rate": 9.91486549841951e-06, + "loss": 1.2083, + "step": 50 + }, + { + "epoch": 0.14782608695652175, + "grad_norm": 0.554821252822876, + "learning_rate": 9.91056845749438e-06, + "loss": 1.1623, + "step": 51 + }, + { + "epoch": 0.15072463768115943, + "grad_norm": 0.6033896803855896, + "learning_rate": 9.906166604108494e-06, + "loss": 1.2135, + "step": 52 + }, + { + "epoch": 0.1536231884057971, + "grad_norm": 0.568701446056366, + "learning_rate": 9.901660032216159e-06, + "loss": 1.1956, + "step": 53 + }, + { + "epoch": 0.1565217391304348, + "grad_norm": 0.6862343549728394, + "learning_rate": 9.89704883800683e-06, + "loss": 1.1992, + "step": 54 + }, + { + "epoch": 0.15942028985507245, + "grad_norm": 0.49399352073669434, + "learning_rate": 9.892333119903045e-06, + "loss": 1.1711, + "step": 55 + }, + { + "epoch": 0.16231884057971013, + "grad_norm": 0.5683416724205017, + "learning_rate": 9.887512978558329e-06, + "loss": 1.2608, + "step": 56 + }, + { + "epoch": 0.16521739130434782, + "grad_norm": 0.4855175018310547, + "learning_rate": 9.88258851685504e-06, + "loss": 1.1652, + "step": 57 + }, + { + "epoch": 0.1681159420289855, + "grad_norm": 0.5765471458435059, + "learning_rate": 9.877559839902185e-06, + "loss": 1.2653, + "step": 58 + }, + { + "epoch": 0.17101449275362318, + "grad_norm": 0.5921582579612732, + "learning_rate": 9.872427055033156e-06, + "loss": 1.1191, + "step": 59 + }, + { + "epoch": 0.17391304347826086, + "grad_norm": 0.5046260356903076, + "learning_rate": 9.867190271803466e-06, + "loss": 1.1824, + "step": 60 + }, + { + "epoch": 0.17681159420289855, + "grad_norm": 0.5180432796478271, + "learning_rate": 9.861849601988384e-06, + "loss": 1.1736, + "step": 61 + }, + { + "epoch": 0.17971014492753623, + "grad_norm": 0.65400230884552, + "learning_rate": 9.85640515958057e-06, + "loss": 1.1129, + "step": 62 + }, + { + "epoch": 0.1826086956521739, + "grad_norm": 0.5726003646850586, + "learning_rate": 9.85085706078763e-06, + "loss": 1.1567, + "step": 63 + }, + { + "epoch": 0.1855072463768116, + "grad_norm": 0.5297178030014038, + "learning_rate": 9.845205424029639e-06, + "loss": 1.101, + "step": 64 + }, + { + "epoch": 0.18840579710144928, + "grad_norm": 0.5242377519607544, + "learning_rate": 9.839450369936615e-06, + "loss": 1.174, + "step": 65 + }, + { + "epoch": 0.19130434782608696, + "grad_norm": 0.5277882218360901, + "learning_rate": 9.833592021345938e-06, + "loss": 1.1772, + "step": 66 + }, + { + "epoch": 0.19420289855072465, + "grad_norm": 0.5334244966506958, + "learning_rate": 9.827630503299741e-06, + "loss": 1.1722, + "step": 67 + }, + { + "epoch": 0.19710144927536233, + "grad_norm": 0.6054286360740662, + "learning_rate": 9.821565943042225e-06, + "loss": 1.2022, + "step": 68 + }, + { + "epoch": 0.2, + "grad_norm": 0.5691675543785095, + "learning_rate": 9.815398470016957e-06, + "loss": 1.1256, + "step": 69 + }, + { + "epoch": 0.2028985507246377, + "grad_norm": 0.4579974114894867, + "learning_rate": 9.809128215864096e-06, + "loss": 1.1548, + "step": 70 + }, + { + "epoch": 0.20579710144927535, + "grad_norm": 0.605627715587616, + "learning_rate": 9.802755314417592e-06, + "loss": 1.0972, + "step": 71 + }, + { + "epoch": 0.20869565217391303, + "grad_norm": 0.5655208826065063, + "learning_rate": 9.796279901702326e-06, + "loss": 1.0902, + "step": 72 + }, + { + "epoch": 0.21159420289855072, + "grad_norm": 0.570743978023529, + "learning_rate": 9.789702115931202e-06, + "loss": 1.0654, + "step": 73 + }, + { + "epoch": 0.2144927536231884, + "grad_norm": 0.7513704895973206, + "learning_rate": 9.783022097502204e-06, + "loss": 1.1348, + "step": 74 + }, + { + "epoch": 0.21739130434782608, + "grad_norm": 0.592363715171814, + "learning_rate": 9.776239988995401e-06, + "loss": 1.1733, + "step": 75 + }, + { + "epoch": 0.22028985507246376, + "grad_norm": 0.5394357442855835, + "learning_rate": 9.76935593516989e-06, + "loss": 1.1313, + "step": 76 + }, + { + "epoch": 0.22318840579710145, + "grad_norm": 0.598983108997345, + "learning_rate": 9.762370082960727e-06, + "loss": 1.1077, + "step": 77 + }, + { + "epoch": 0.22608695652173913, + "grad_norm": 0.5635719895362854, + "learning_rate": 9.755282581475769e-06, + "loss": 1.0393, + "step": 78 + }, + { + "epoch": 0.2289855072463768, + "grad_norm": 0.5638449788093567, + "learning_rate": 9.748093581992506e-06, + "loss": 1.1126, + "step": 79 + }, + { + "epoch": 0.2318840579710145, + "grad_norm": 0.5267054438591003, + "learning_rate": 9.74080323795483e-06, + "loss": 1.108, + "step": 80 + }, + { + "epoch": 0.23478260869565218, + "grad_norm": 0.69565749168396, + "learning_rate": 9.733411704969754e-06, + "loss": 1.1065, + "step": 81 + }, + { + "epoch": 0.23768115942028986, + "grad_norm": 0.5769387483596802, + "learning_rate": 9.7259191408041e-06, + "loss": 1.0892, + "step": 82 + }, + { + "epoch": 0.24057971014492754, + "grad_norm": 0.4646681845188141, + "learning_rate": 9.718325705381115e-06, + "loss": 1.0984, + "step": 83 + }, + { + "epoch": 0.24347826086956523, + "grad_norm": 0.5441101789474487, + "learning_rate": 9.710631560777082e-06, + "loss": 1.134, + "step": 84 + }, + { + "epoch": 0.2463768115942029, + "grad_norm": 0.6711792349815369, + "learning_rate": 9.702836871217838e-06, + "loss": 1.118, + "step": 85 + }, + { + "epoch": 0.2492753623188406, + "grad_norm": 0.6086435914039612, + "learning_rate": 9.694941803075285e-06, + "loss": 1.1332, + "step": 86 + }, + { + "epoch": 0.25217391304347825, + "grad_norm": 0.6047069430351257, + "learning_rate": 9.686946524863821e-06, + "loss": 1.0948, + "step": 87 + }, + { + "epoch": 0.25217391304347825, + "eval_loss": 1.093648910522461, + "eval_runtime": 46.2827, + "eval_samples_per_second": 5.531, + "eval_steps_per_second": 0.691, + "step": 87 + }, + { + "epoch": 0.25507246376811593, + "grad_norm": 0.5494099259376526, + "learning_rate": 9.678851207236764e-06, + "loss": 1.0677, + "step": 88 + }, + { + "epoch": 0.2579710144927536, + "grad_norm": 0.6029177308082581, + "learning_rate": 9.670656022982696e-06, + "loss": 1.1122, + "step": 89 + }, + { + "epoch": 0.2608695652173913, + "grad_norm": 0.6882422566413879, + "learning_rate": 9.66236114702178e-06, + "loss": 1.131, + "step": 90 + }, + { + "epoch": 0.263768115942029, + "grad_norm": 0.5858222246170044, + "learning_rate": 9.65396675640202e-06, + "loss": 1.0904, + "step": 91 + }, + { + "epoch": 0.26666666666666666, + "grad_norm": 0.6096974611282349, + "learning_rate": 9.645473030295496e-06, + "loss": 1.1001, + "step": 92 + }, + { + "epoch": 0.26956521739130435, + "grad_norm": 0.5705183148384094, + "learning_rate": 9.636880149994518e-06, + "loss": 1.1159, + "step": 93 + }, + { + "epoch": 0.27246376811594203, + "grad_norm": 0.5896604061126709, + "learning_rate": 9.628188298907782e-06, + "loss": 1.0236, + "step": 94 + }, + { + "epoch": 0.2753623188405797, + "grad_norm": 0.6060263514518738, + "learning_rate": 9.619397662556434e-06, + "loss": 1.0991, + "step": 95 + }, + { + "epoch": 0.2782608695652174, + "grad_norm": 0.6302357316017151, + "learning_rate": 9.610508428570122e-06, + "loss": 1.073, + "step": 96 + }, + { + "epoch": 0.2811594202898551, + "grad_norm": 0.6086059212684631, + "learning_rate": 9.601520786682989e-06, + "loss": 1.1556, + "step": 97 + }, + { + "epoch": 0.28405797101449276, + "grad_norm": 0.5601389408111572, + "learning_rate": 9.592434928729617e-06, + "loss": 1.0691, + "step": 98 + }, + { + "epoch": 0.28695652173913044, + "grad_norm": 0.6236623525619507, + "learning_rate": 9.583251048640941e-06, + "loss": 1.0293, + "step": 99 + }, + { + "epoch": 0.2898550724637681, + "grad_norm": 0.661264181137085, + "learning_rate": 9.573969342440107e-06, + "loss": 1.0597, + "step": 100 + }, + { + "epoch": 0.2927536231884058, + "grad_norm": 0.5187559127807617, + "learning_rate": 9.564590008238284e-06, + "loss": 1.0152, + "step": 101 + }, + { + "epoch": 0.2956521739130435, + "grad_norm": 0.7033849358558655, + "learning_rate": 9.555113246230443e-06, + "loss": 1.0583, + "step": 102 + }, + { + "epoch": 0.2985507246376812, + "grad_norm": 0.6243430376052856, + "learning_rate": 9.545539258691076e-06, + "loss": 1.0415, + "step": 103 + }, + { + "epoch": 0.30144927536231886, + "grad_norm": 0.7448285222053528, + "learning_rate": 9.535868249969882e-06, + "loss": 1.1665, + "step": 104 + }, + { + "epoch": 0.30434782608695654, + "grad_norm": 0.7407688498497009, + "learning_rate": 9.52610042648741e-06, + "loss": 1.0805, + "step": 105 + }, + { + "epoch": 0.3072463768115942, + "grad_norm": 0.6399569511413574, + "learning_rate": 9.516235996730645e-06, + "loss": 1.0622, + "step": 106 + }, + { + "epoch": 0.3101449275362319, + "grad_norm": 0.6391183733940125, + "learning_rate": 9.50627517124856e-06, + "loss": 1.0988, + "step": 107 + }, + { + "epoch": 0.3130434782608696, + "grad_norm": 0.6799684166908264, + "learning_rate": 9.496218162647629e-06, + "loss": 1.0667, + "step": 108 + }, + { + "epoch": 0.3159420289855073, + "grad_norm": 0.6955932378768921, + "learning_rate": 9.486065185587278e-06, + "loss": 1.0475, + "step": 109 + }, + { + "epoch": 0.3188405797101449, + "grad_norm": 0.6768685579299927, + "learning_rate": 9.475816456775313e-06, + "loss": 1.0906, + "step": 110 + }, + { + "epoch": 0.3217391304347826, + "grad_norm": 0.6448860168457031, + "learning_rate": 9.465472194963287e-06, + "loss": 1.0725, + "step": 111 + }, + { + "epoch": 0.32463768115942027, + "grad_norm": 0.654137909412384, + "learning_rate": 9.45503262094184e-06, + "loss": 1.0477, + "step": 112 + }, + { + "epoch": 0.32753623188405795, + "grad_norm": 0.5668336749076843, + "learning_rate": 9.444497957535975e-06, + "loss": 1.0419, + "step": 113 + }, + { + "epoch": 0.33043478260869563, + "grad_norm": 0.8345162868499756, + "learning_rate": 9.43386842960031e-06, + "loss": 1.1125, + "step": 114 + }, + { + "epoch": 0.3333333333333333, + "grad_norm": 0.5995410084724426, + "learning_rate": 9.423144264014278e-06, + "loss": 1.048, + "step": 115 + }, + { + "epoch": 0.336231884057971, + "grad_norm": 0.6526032090187073, + "learning_rate": 9.41232568967728e-06, + "loss": 1.0868, + "step": 116 + }, + { + "epoch": 0.3391304347826087, + "grad_norm": 0.7131723165512085, + "learning_rate": 9.401412937503802e-06, + "loss": 1.0154, + "step": 117 + }, + { + "epoch": 0.34202898550724636, + "grad_norm": 0.7425084114074707, + "learning_rate": 9.39040624041849e-06, + "loss": 1.1046, + "step": 118 + }, + { + "epoch": 0.34492753623188405, + "grad_norm": 0.6741538643836975, + "learning_rate": 9.379305833351174e-06, + "loss": 1.0884, + "step": 119 + }, + { + "epoch": 0.34782608695652173, + "grad_norm": 0.6611533164978027, + "learning_rate": 9.368111953231849e-06, + "loss": 1.1291, + "step": 120 + }, + { + "epoch": 0.3507246376811594, + "grad_norm": 0.6605979204177856, + "learning_rate": 9.35682483898563e-06, + "loss": 1.0354, + "step": 121 + }, + { + "epoch": 0.3536231884057971, + "grad_norm": 0.7649601101875305, + "learning_rate": 9.345444731527642e-06, + "loss": 1.0705, + "step": 122 + }, + { + "epoch": 0.3565217391304348, + "grad_norm": 0.6104558110237122, + "learning_rate": 9.333971873757885e-06, + "loss": 1.0221, + "step": 123 + }, + { + "epoch": 0.35942028985507246, + "grad_norm": 0.5945985913276672, + "learning_rate": 9.32240651055604e-06, + "loss": 1.0352, + "step": 124 + }, + { + "epoch": 0.36231884057971014, + "grad_norm": 0.7351408004760742, + "learning_rate": 9.310748888776254e-06, + "loss": 1.0283, + "step": 125 + }, + { + "epoch": 0.3652173913043478, + "grad_norm": 0.6751654148101807, + "learning_rate": 9.298999257241862e-06, + "loss": 1.1355, + "step": 126 + }, + { + "epoch": 0.3681159420289855, + "grad_norm": 0.6744984984397888, + "learning_rate": 9.287157866740082e-06, + "loss": 1.097, + "step": 127 + }, + { + "epoch": 0.3710144927536232, + "grad_norm": 0.6096031665802002, + "learning_rate": 9.275224970016656e-06, + "loss": 0.9879, + "step": 128 + }, + { + "epoch": 0.3739130434782609, + "grad_norm": 0.6282311081886292, + "learning_rate": 9.263200821770462e-06, + "loss": 1.0088, + "step": 129 + }, + { + "epoch": 0.37681159420289856, + "grad_norm": 0.6340439319610596, + "learning_rate": 9.251085678648072e-06, + "loss": 1.0314, + "step": 130 + }, + { + "epoch": 0.37971014492753624, + "grad_norm": 0.6008773446083069, + "learning_rate": 9.238879799238278e-06, + "loss": 1.0304, + "step": 131 + }, + { + "epoch": 0.3826086956521739, + "grad_norm": 0.83261638879776, + "learning_rate": 9.22658344406657e-06, + "loss": 1.0767, + "step": 132 + }, + { + "epoch": 0.3855072463768116, + "grad_norm": 0.6942703127861023, + "learning_rate": 9.214196875589577e-06, + "loss": 1.0238, + "step": 133 + }, + { + "epoch": 0.3884057971014493, + "grad_norm": 0.6649532914161682, + "learning_rate": 9.201720358189464e-06, + "loss": 1.0353, + "step": 134 + }, + { + "epoch": 0.391304347826087, + "grad_norm": 0.6827482581138611, + "learning_rate": 9.189154158168293e-06, + "loss": 1.0123, + "step": 135 + }, + { + "epoch": 0.39420289855072466, + "grad_norm": 0.8225923776626587, + "learning_rate": 9.176498543742328e-06, + "loss": 1.0894, + "step": 136 + }, + { + "epoch": 0.39710144927536234, + "grad_norm": 0.7622413635253906, + "learning_rate": 9.163753785036324e-06, + "loss": 1.0987, + "step": 137 + }, + { + "epoch": 0.4, + "grad_norm": 0.729880690574646, + "learning_rate": 9.150920154077753e-06, + "loss": 1.0686, + "step": 138 + }, + { + "epoch": 0.4028985507246377, + "grad_norm": 0.5569338798522949, + "learning_rate": 9.137997924791e-06, + "loss": 1.0554, + "step": 139 + }, + { + "epoch": 0.4057971014492754, + "grad_norm": 0.7127766013145447, + "learning_rate": 9.124987372991512e-06, + "loss": 1.0878, + "step": 140 + }, + { + "epoch": 0.40869565217391307, + "grad_norm": 0.6865119338035583, + "learning_rate": 9.11188877637992e-06, + "loss": 1.078, + "step": 141 + }, + { + "epoch": 0.4115942028985507, + "grad_norm": 0.7496594786643982, + "learning_rate": 9.098702414536107e-06, + "loss": 1.1678, + "step": 142 + }, + { + "epoch": 0.4144927536231884, + "grad_norm": 0.7547608017921448, + "learning_rate": 9.085428568913233e-06, + "loss": 1.0282, + "step": 143 + }, + { + "epoch": 0.41739130434782606, + "grad_norm": 0.6696781516075134, + "learning_rate": 9.072067522831743e-06, + "loss": 1.0529, + "step": 144 + }, + { + "epoch": 0.42028985507246375, + "grad_norm": 0.6223747134208679, + "learning_rate": 9.058619561473308e-06, + "loss": 1.0101, + "step": 145 + }, + { + "epoch": 0.42318840579710143, + "grad_norm": 0.6682969331741333, + "learning_rate": 9.045084971874738e-06, + "loss": 1.0669, + "step": 146 + }, + { + "epoch": 0.4260869565217391, + "grad_norm": 0.702489972114563, + "learning_rate": 9.031464042921866e-06, + "loss": 1.0696, + "step": 147 + }, + { + "epoch": 0.4289855072463768, + "grad_norm": 0.6877920031547546, + "learning_rate": 9.017757065343368e-06, + "loss": 1.0181, + "step": 148 + }, + { + "epoch": 0.4318840579710145, + "grad_norm": 0.7262343168258667, + "learning_rate": 9.003964331704574e-06, + "loss": 1.0869, + "step": 149 + }, + { + "epoch": 0.43478260869565216, + "grad_norm": 0.6435033082962036, + "learning_rate": 8.990086136401199e-06, + "loss": 1.0943, + "step": 150 + }, + { + "epoch": 0.43768115942028984, + "grad_norm": 0.8294116854667664, + "learning_rate": 8.976122775653087e-06, + "loss": 1.0053, + "step": 151 + }, + { + "epoch": 0.4405797101449275, + "grad_norm": 0.7582129240036011, + "learning_rate": 8.96207454749787e-06, + "loss": 1.0255, + "step": 152 + }, + { + "epoch": 0.4434782608695652, + "grad_norm": 0.7421862483024597, + "learning_rate": 8.947941751784614e-06, + "loss": 0.995, + "step": 153 + }, + { + "epoch": 0.4463768115942029, + "grad_norm": 0.6562067866325378, + "learning_rate": 8.933724690167417e-06, + "loss": 1.0051, + "step": 154 + }, + { + "epoch": 0.4492753623188406, + "grad_norm": 0.7008780241012573, + "learning_rate": 8.91942366609897e-06, + "loss": 1.0224, + "step": 155 + }, + { + "epoch": 0.45217391304347826, + "grad_norm": 0.8320948481559753, + "learning_rate": 8.905038984824079e-06, + "loss": 1.0867, + "step": 156 + }, + { + "epoch": 0.45507246376811594, + "grad_norm": 0.7078688740730286, + "learning_rate": 8.890570953373152e-06, + "loss": 1.0233, + "step": 157 + }, + { + "epoch": 0.4579710144927536, + "grad_norm": 0.602080225944519, + "learning_rate": 8.87601988055565e-06, + "loss": 1.033, + "step": 158 + }, + { + "epoch": 0.4608695652173913, + "grad_norm": 0.6947946548461914, + "learning_rate": 8.861386076953485e-06, + "loss": 1.0056, + "step": 159 + }, + { + "epoch": 0.463768115942029, + "grad_norm": 0.7520703673362732, + "learning_rate": 8.846669854914395e-06, + "loss": 1.0129, + "step": 160 + }, + { + "epoch": 0.4666666666666667, + "grad_norm": 0.8198053240776062, + "learning_rate": 8.831871528545286e-06, + "loss": 1.0554, + "step": 161 + }, + { + "epoch": 0.46956521739130436, + "grad_norm": 0.8595309257507324, + "learning_rate": 8.816991413705515e-06, + "loss": 0.9769, + "step": 162 + }, + { + "epoch": 0.47246376811594204, + "grad_norm": 0.7658084034919739, + "learning_rate": 8.802029828000157e-06, + "loss": 1.0942, + "step": 163 + }, + { + "epoch": 0.4753623188405797, + "grad_norm": 0.779561460018158, + "learning_rate": 8.786987090773214e-06, + "loss": 1.0526, + "step": 164 + }, + { + "epoch": 0.4782608695652174, + "grad_norm": 0.7491458654403687, + "learning_rate": 8.771863523100821e-06, + "loss": 1.076, + "step": 165 + }, + { + "epoch": 0.4811594202898551, + "grad_norm": 0.7698597311973572, + "learning_rate": 8.756659447784367e-06, + "loss": 1.0513, + "step": 166 + }, + { + "epoch": 0.48405797101449277, + "grad_norm": 0.7076740860939026, + "learning_rate": 8.741375189343625e-06, + "loss": 0.952, + "step": 167 + }, + { + "epoch": 0.48695652173913045, + "grad_norm": 0.8549159169197083, + "learning_rate": 8.726011074009813e-06, + "loss": 1.0062, + "step": 168 + }, + { + "epoch": 0.48985507246376814, + "grad_norm": 0.7257103323936462, + "learning_rate": 8.71056742971864e-06, + "loss": 1.0124, + "step": 169 + }, + { + "epoch": 0.4927536231884058, + "grad_norm": 0.6643837094306946, + "learning_rate": 8.695044586103297e-06, + "loss": 1.0646, + "step": 170 + }, + { + "epoch": 0.4956521739130435, + "grad_norm": 0.6454336643218994, + "learning_rate": 8.679442874487427e-06, + "loss": 1.0482, + "step": 171 + }, + { + "epoch": 0.4985507246376812, + "grad_norm": 0.6484606266021729, + "learning_rate": 8.663762627878059e-06, + "loss": 1.0361, + "step": 172 + }, + { + "epoch": 0.5014492753623189, + "grad_norm": 0.8437646627426147, + "learning_rate": 8.64800418095848e-06, + "loss": 1.1064, + "step": 173 + }, + { + "epoch": 0.5043478260869565, + "grad_norm": 0.8865697979927063, + "learning_rate": 8.632167870081122e-06, + "loss": 1.0187, + "step": 174 + }, + { + "epoch": 0.5043478260869565, + "eval_loss": 1.0253716707229614, + "eval_runtime": 46.4716, + "eval_samples_per_second": 5.509, + "eval_steps_per_second": 0.689, + "step": 174 + }, + { + "epoch": 0.5072463768115942, + "grad_norm": 0.6522702574729919, + "learning_rate": 8.616254033260351e-06, + "loss": 1.0466, + "step": 175 + }, + { + "epoch": 0.5101449275362319, + "grad_norm": 0.7485548257827759, + "learning_rate": 8.600263010165275e-06, + "loss": 1.051, + "step": 176 + }, + { + "epoch": 0.5130434782608696, + "grad_norm": 0.7864269614219666, + "learning_rate": 8.584195142112482e-06, + "loss": 0.9823, + "step": 177 + }, + { + "epoch": 0.5159420289855072, + "grad_norm": 0.669228732585907, + "learning_rate": 8.568050772058763e-06, + "loss": 0.9959, + "step": 178 + }, + { + "epoch": 0.518840579710145, + "grad_norm": 0.7351509928703308, + "learning_rate": 8.551830244593785e-06, + "loss": 1.0523, + "step": 179 + }, + { + "epoch": 0.5217391304347826, + "grad_norm": 0.6464654207229614, + "learning_rate": 8.535533905932739e-06, + "loss": 1.0576, + "step": 180 + }, + { + "epoch": 0.5246376811594203, + "grad_norm": 0.6708983182907104, + "learning_rate": 8.519162103908951e-06, + "loss": 1.0036, + "step": 181 + }, + { + "epoch": 0.527536231884058, + "grad_norm": 0.6712408661842346, + "learning_rate": 8.502715187966455e-06, + "loss": 0.9567, + "step": 182 + }, + { + "epoch": 0.5304347826086957, + "grad_norm": 0.8165604472160339, + "learning_rate": 8.48619350915254e-06, + "loss": 1.0074, + "step": 183 + }, + { + "epoch": 0.5333333333333333, + "grad_norm": 0.8015124797821045, + "learning_rate": 8.469597420110249e-06, + "loss": 1.04, + "step": 184 + }, + { + "epoch": 0.5362318840579711, + "grad_norm": 0.6764898896217346, + "learning_rate": 8.452927275070858e-06, + "loss": 1.0259, + "step": 185 + }, + { + "epoch": 0.5391304347826087, + "grad_norm": 0.7508796453475952, + "learning_rate": 8.436183429846314e-06, + "loss": 1.0153, + "step": 186 + }, + { + "epoch": 0.5420289855072464, + "grad_norm": 0.7400704026222229, + "learning_rate": 8.41936624182164e-06, + "loss": 1.0302, + "step": 187 + }, + { + "epoch": 0.5449275362318841, + "grad_norm": 0.7747941017150879, + "learning_rate": 8.402476069947309e-06, + "loss": 1.0516, + "step": 188 + }, + { + "epoch": 0.5478260869565217, + "grad_norm": 0.6391712427139282, + "learning_rate": 8.385513274731574e-06, + "loss": 0.9144, + "step": 189 + }, + { + "epoch": 0.5507246376811594, + "grad_norm": 0.7723587155342102, + "learning_rate": 8.368478218232787e-06, + "loss": 1.038, + "step": 190 + }, + { + "epoch": 0.553623188405797, + "grad_norm": 0.6703996062278748, + "learning_rate": 8.351371264051659e-06, + "loss": 0.9767, + "step": 191 + }, + { + "epoch": 0.5565217391304348, + "grad_norm": 0.6496030688285828, + "learning_rate": 8.334192777323508e-06, + "loss": 1.0139, + "step": 192 + }, + { + "epoch": 0.5594202898550724, + "grad_norm": 0.9179766178131104, + "learning_rate": 8.316943124710457e-06, + "loss": 1.0217, + "step": 193 + }, + { + "epoch": 0.5623188405797102, + "grad_norm": 0.739105761051178, + "learning_rate": 8.299622674393615e-06, + "loss": 1.0097, + "step": 194 + }, + { + "epoch": 0.5652173913043478, + "grad_norm": 0.6799715757369995, + "learning_rate": 8.282231796065215e-06, + "loss": 0.9814, + "step": 195 + }, + { + "epoch": 0.5681159420289855, + "grad_norm": 0.7482266426086426, + "learning_rate": 8.264770860920722e-06, + "loss": 0.9651, + "step": 196 + }, + { + "epoch": 0.5710144927536231, + "grad_norm": 0.7226840853691101, + "learning_rate": 8.247240241650918e-06, + "loss": 1.0257, + "step": 197 + }, + { + "epoch": 0.5739130434782609, + "grad_norm": 0.8682334423065186, + "learning_rate": 8.229640312433938e-06, + "loss": 0.9359, + "step": 198 + }, + { + "epoch": 0.5768115942028985, + "grad_norm": 0.7574880123138428, + "learning_rate": 8.21197144892728e-06, + "loss": 1.0316, + "step": 199 + }, + { + "epoch": 0.5797101449275363, + "grad_norm": 0.6719037890434265, + "learning_rate": 8.194234028259806e-06, + "loss": 0.9718, + "step": 200 + }, + { + "epoch": 0.5826086956521739, + "grad_norm": 0.7872765064239502, + "learning_rate": 8.176428429023674e-06, + "loss": 1.0055, + "step": 201 + }, + { + "epoch": 0.5855072463768116, + "grad_norm": 0.8982404470443726, + "learning_rate": 8.158555031266255e-06, + "loss": 1.0763, + "step": 202 + }, + { + "epoch": 0.5884057971014492, + "grad_norm": 0.7265183925628662, + "learning_rate": 8.140614216482046e-06, + "loss": 0.9921, + "step": 203 + }, + { + "epoch": 0.591304347826087, + "grad_norm": 0.7971622943878174, + "learning_rate": 8.122606367604497e-06, + "loss": 0.9986, + "step": 204 + }, + { + "epoch": 0.5942028985507246, + "grad_norm": 0.689160943031311, + "learning_rate": 8.104531868997858e-06, + "loss": 0.9896, + "step": 205 + }, + { + "epoch": 0.5971014492753624, + "grad_norm": 0.8191243410110474, + "learning_rate": 8.086391106448965e-06, + "loss": 1.0141, + "step": 206 + }, + { + "epoch": 0.6, + "grad_norm": 0.860882043838501, + "learning_rate": 8.068184467159014e-06, + "loss": 0.9608, + "step": 207 + }, + { + "epoch": 0.6028985507246377, + "grad_norm": 0.7216934561729431, + "learning_rate": 8.049912339735284e-06, + "loss": 0.9898, + "step": 208 + }, + { + "epoch": 0.6057971014492753, + "grad_norm": 0.685965359210968, + "learning_rate": 8.031575114182856e-06, + "loss": 0.9532, + "step": 209 + }, + { + "epoch": 0.6086956521739131, + "grad_norm": 0.6752814054489136, + "learning_rate": 8.013173181896283e-06, + "loss": 1.0043, + "step": 210 + }, + { + "epoch": 0.6115942028985507, + "grad_norm": 0.815260112285614, + "learning_rate": 7.994706935651228e-06, + "loss": 1.0049, + "step": 211 + }, + { + "epoch": 0.6144927536231884, + "grad_norm": 0.729771077632904, + "learning_rate": 7.976176769596095e-06, + "loss": 1.0003, + "step": 212 + }, + { + "epoch": 0.6173913043478261, + "grad_norm": 0.6407178044319153, + "learning_rate": 7.957583079243607e-06, + "loss": 1.0197, + "step": 213 + }, + { + "epoch": 0.6202898550724638, + "grad_norm": 0.6758530735969543, + "learning_rate": 7.938926261462366e-06, + "loss": 1.0632, + "step": 214 + }, + { + "epoch": 0.6231884057971014, + "grad_norm": 0.7678017616271973, + "learning_rate": 7.920206714468383e-06, + "loss": 1.004, + "step": 215 + }, + { + "epoch": 0.6260869565217392, + "grad_norm": 0.6864491105079651, + "learning_rate": 7.90142483781658e-06, + "loss": 0.9798, + "step": 216 + }, + { + "epoch": 0.6289855072463768, + "grad_norm": 0.7141516804695129, + "learning_rate": 7.882581032392252e-06, + "loss": 0.9969, + "step": 217 + }, + { + "epoch": 0.6318840579710145, + "grad_norm": 0.7497020363807678, + "learning_rate": 7.863675700402527e-06, + "loss": 0.9951, + "step": 218 + }, + { + "epoch": 0.6347826086956522, + "grad_norm": 0.7010701894760132, + "learning_rate": 7.844709245367766e-06, + "loss": 1.0164, + "step": 219 + }, + { + "epoch": 0.6376811594202898, + "grad_norm": 0.8556409478187561, + "learning_rate": 7.82568207211296e-06, + "loss": 1.0079, + "step": 220 + }, + { + "epoch": 0.6405797101449275, + "grad_norm": 0.8755605816841125, + "learning_rate": 7.806594586759083e-06, + "loss": 1.0401, + "step": 221 + }, + { + "epoch": 0.6434782608695652, + "grad_norm": 0.7478286623954773, + "learning_rate": 7.787447196714428e-06, + "loss": 0.9966, + "step": 222 + }, + { + "epoch": 0.6463768115942029, + "grad_norm": 0.6972207427024841, + "learning_rate": 7.768240310665909e-06, + "loss": 1.0277, + "step": 223 + }, + { + "epoch": 0.6492753623188405, + "grad_norm": 0.7753648161888123, + "learning_rate": 7.748974338570337e-06, + "loss": 1.0531, + "step": 224 + }, + { + "epoch": 0.6521739130434783, + "grad_norm": 0.8420187830924988, + "learning_rate": 7.729649691645673e-06, + "loss": 1.0101, + "step": 225 + }, + { + "epoch": 0.6550724637681159, + "grad_norm": 0.7467186450958252, + "learning_rate": 7.710266782362248e-06, + "loss": 1.086, + "step": 226 + }, + { + "epoch": 0.6579710144927536, + "grad_norm": 0.679282009601593, + "learning_rate": 7.69082602443396e-06, + "loss": 1.0756, + "step": 227 + }, + { + "epoch": 0.6608695652173913, + "grad_norm": 0.8682421445846558, + "learning_rate": 7.671327832809442e-06, + "loss": 1.0337, + "step": 228 + }, + { + "epoch": 0.663768115942029, + "grad_norm": 0.9190111756324768, + "learning_rate": 7.651772623663212e-06, + "loss": 1.0412, + "step": 229 + }, + { + "epoch": 0.6666666666666666, + "grad_norm": 0.7419721484184265, + "learning_rate": 7.63216081438678e-06, + "loss": 0.9895, + "step": 230 + }, + { + "epoch": 0.6695652173913044, + "grad_norm": 0.7735477685928345, + "learning_rate": 7.612492823579744e-06, + "loss": 1.0109, + "step": 231 + }, + { + "epoch": 0.672463768115942, + "grad_norm": 0.6718391180038452, + "learning_rate": 7.5927690710408606e-06, + "loss": 1.0699, + "step": 232 + }, + { + "epoch": 0.6753623188405797, + "grad_norm": 0.8104904890060425, + "learning_rate": 7.572989977759073e-06, + "loss": 0.9957, + "step": 233 + }, + { + "epoch": 0.6782608695652174, + "grad_norm": 0.8718286752700806, + "learning_rate": 7.553155965904535e-06, + "loss": 0.9674, + "step": 234 + }, + { + "epoch": 0.6811594202898551, + "grad_norm": 0.727627158164978, + "learning_rate": 7.533267458819597e-06, + "loss": 1.0256, + "step": 235 + }, + { + "epoch": 0.6840579710144927, + "grad_norm": 0.6747854948043823, + "learning_rate": 7.513324881009769e-06, + "loss": 0.9956, + "step": 236 + }, + { + "epoch": 0.6869565217391305, + "grad_norm": 0.8896199464797974, + "learning_rate": 7.49332865813466e-06, + "loss": 1.052, + "step": 237 + }, + { + "epoch": 0.6898550724637681, + "grad_norm": 0.8011343479156494, + "learning_rate": 7.473279216998896e-06, + "loss": 0.9809, + "step": 238 + }, + { + "epoch": 0.6927536231884058, + "grad_norm": 0.7936311960220337, + "learning_rate": 7.453176985543002e-06, + "loss": 0.9491, + "step": 239 + }, + { + "epoch": 0.6956521739130435, + "grad_norm": 0.783686101436615, + "learning_rate": 7.4330223928342814e-06, + "loss": 1.0627, + "step": 240 + }, + { + "epoch": 0.6985507246376812, + "grad_norm": 0.6777355670928955, + "learning_rate": 7.412815869057644e-06, + "loss": 0.9836, + "step": 241 + }, + { + "epoch": 0.7014492753623188, + "grad_norm": 0.8609856367111206, + "learning_rate": 7.392557845506433e-06, + "loss": 1.0383, + "step": 242 + }, + { + "epoch": 0.7043478260869566, + "grad_norm": 0.7346140146255493, + "learning_rate": 7.372248754573213e-06, + "loss": 1.0237, + "step": 243 + }, + { + "epoch": 0.7072463768115942, + "grad_norm": 0.8134037852287292, + "learning_rate": 7.351889029740548e-06, + "loss": 1.0051, + "step": 244 + }, + { + "epoch": 0.7101449275362319, + "grad_norm": 0.7623313069343567, + "learning_rate": 7.33147910557174e-06, + "loss": 0.966, + "step": 245 + }, + { + "epoch": 0.7130434782608696, + "grad_norm": 0.8289423584938049, + "learning_rate": 7.311019417701567e-06, + "loss": 1.0162, + "step": 246 + }, + { + "epoch": 0.7159420289855073, + "grad_norm": 0.6778679490089417, + "learning_rate": 7.290510402826967e-06, + "loss": 1.042, + "step": 247 + }, + { + "epoch": 0.7188405797101449, + "grad_norm": 0.7705609798431396, + "learning_rate": 7.269952498697734e-06, + "loss": 0.9979, + "step": 248 + }, + { + "epoch": 0.7217391304347827, + "grad_norm": 0.8417146801948547, + "learning_rate": 7.249346144107165e-06, + "loss": 0.9937, + "step": 249 + }, + { + "epoch": 0.7246376811594203, + "grad_norm": 0.6634312868118286, + "learning_rate": 7.2286917788826926e-06, + "loss": 1.0299, + "step": 250 + }, + { + "epoch": 0.7275362318840579, + "grad_norm": 0.7162610292434692, + "learning_rate": 7.207989843876505e-06, + "loss": 0.9627, + "step": 251 + }, + { + "epoch": 0.7304347826086957, + "grad_norm": 0.886674165725708, + "learning_rate": 7.187240780956133e-06, + "loss": 0.9804, + "step": 252 + }, + { + "epoch": 0.7333333333333333, + "grad_norm": 0.8589048385620117, + "learning_rate": 7.166445032995013e-06, + "loss": 0.9972, + "step": 253 + }, + { + "epoch": 0.736231884057971, + "grad_norm": 0.792225182056427, + "learning_rate": 7.145603043863045e-06, + "loss": 1.0047, + "step": 254 + }, + { + "epoch": 0.7391304347826086, + "grad_norm": 0.7787736654281616, + "learning_rate": 7.124715258417111e-06, + "loss": 0.974, + "step": 255 + }, + { + "epoch": 0.7420289855072464, + "grad_norm": 0.7716973423957825, + "learning_rate": 7.103782122491577e-06, + "loss": 0.9476, + "step": 256 + }, + { + "epoch": 0.744927536231884, + "grad_norm": 0.8235695958137512, + "learning_rate": 7.082804082888787e-06, + "loss": 1.0303, + "step": 257 + }, + { + "epoch": 0.7478260869565218, + "grad_norm": 0.8061054944992065, + "learning_rate": 7.061781587369518e-06, + "loss": 1.0254, + "step": 258 + }, + { + "epoch": 0.7507246376811594, + "grad_norm": 0.8522235751152039, + "learning_rate": 7.040715084643429e-06, + "loss": 1.0196, + "step": 259 + }, + { + "epoch": 0.7536231884057971, + "grad_norm": 0.8005476593971252, + "learning_rate": 7.019605024359475e-06, + "loss": 1.052, + "step": 260 + }, + { + "epoch": 0.7565217391304347, + "grad_norm": 0.9044481515884399, + "learning_rate": 6.998451857096321e-06, + "loss": 1.04, + "step": 261 + }, + { + "epoch": 0.7565217391304347, + "eval_loss": 0.9999631643295288, + "eval_runtime": 46.2792, + "eval_samples_per_second": 5.532, + "eval_steps_per_second": 0.691, + "step": 261 + }, + { + "epoch": 0.7594202898550725, + "grad_norm": 0.6946824193000793, + "learning_rate": 6.977256034352713e-06, + "loss": 0.9869, + "step": 262 + }, + { + "epoch": 0.7623188405797101, + "grad_norm": 0.8048357963562012, + "learning_rate": 6.956018008537852e-06, + "loss": 0.9773, + "step": 263 + }, + { + "epoch": 0.7652173913043478, + "grad_norm": 0.7211609482765198, + "learning_rate": 6.934738232961728e-06, + "loss": 0.9727, + "step": 264 + }, + { + "epoch": 0.7681159420289855, + "grad_norm": 0.7225235104560852, + "learning_rate": 6.913417161825449e-06, + "loss": 1.0209, + "step": 265 + }, + { + "epoch": 0.7710144927536232, + "grad_norm": 0.6443622708320618, + "learning_rate": 6.892055250211552e-06, + "loss": 1.0398, + "step": 266 + }, + { + "epoch": 0.7739130434782608, + "grad_norm": 0.8570783138275146, + "learning_rate": 6.8706529540742775e-06, + "loss": 0.9883, + "step": 267 + }, + { + "epoch": 0.7768115942028986, + "grad_norm": 0.9808831810951233, + "learning_rate": 6.849210730229846e-06, + "loss": 1.0847, + "step": 268 + }, + { + "epoch": 0.7797101449275362, + "grad_norm": 0.8551820516586304, + "learning_rate": 6.827729036346706e-06, + "loss": 0.9621, + "step": 269 + }, + { + "epoch": 0.782608695652174, + "grad_norm": 0.8964309692382812, + "learning_rate": 6.806208330935766e-06, + "loss": 0.9886, + "step": 270 + }, + { + "epoch": 0.7855072463768116, + "grad_norm": 0.8737574219703674, + "learning_rate": 6.784649073340601e-06, + "loss": 1.0019, + "step": 271 + }, + { + "epoch": 0.7884057971014493, + "grad_norm": 0.7480164170265198, + "learning_rate": 6.763051723727663e-06, + "loss": 0.9987, + "step": 272 + }, + { + "epoch": 0.7913043478260869, + "grad_norm": 0.7155961990356445, + "learning_rate": 6.741416743076443e-06, + "loss": 1.0043, + "step": 273 + }, + { + "epoch": 0.7942028985507247, + "grad_norm": 0.8288201093673706, + "learning_rate": 6.719744593169642e-06, + "loss": 0.9703, + "step": 274 + }, + { + "epoch": 0.7971014492753623, + "grad_norm": 0.7403139472007751, + "learning_rate": 6.698035736583307e-06, + "loss": 0.9453, + "step": 275 + }, + { + "epoch": 0.8, + "grad_norm": 0.7977936863899231, + "learning_rate": 6.67629063667697e-06, + "loss": 1.0091, + "step": 276 + }, + { + "epoch": 0.8028985507246377, + "grad_norm": 0.8381959795951843, + "learning_rate": 6.6545097575837405e-06, + "loss": 1.0001, + "step": 277 + }, + { + "epoch": 0.8057971014492754, + "grad_norm": 0.7988629937171936, + "learning_rate": 6.6326935642004165e-06, + "loss": 1.0053, + "step": 278 + }, + { + "epoch": 0.808695652173913, + "grad_norm": 0.8848451375961304, + "learning_rate": 6.610842522177549e-06, + "loss": 1.021, + "step": 279 + }, + { + "epoch": 0.8115942028985508, + "grad_norm": 0.8423268795013428, + "learning_rate": 6.588957097909509e-06, + "loss": 1.0245, + "step": 280 + }, + { + "epoch": 0.8144927536231884, + "grad_norm": 0.6828733682632446, + "learning_rate": 6.567037758524529e-06, + "loss": 0.9966, + "step": 281 + }, + { + "epoch": 0.8173913043478261, + "grad_norm": 0.8118813633918762, + "learning_rate": 6.545084971874738e-06, + "loss": 0.9777, + "step": 282 + }, + { + "epoch": 0.8202898550724638, + "grad_norm": 0.8288912773132324, + "learning_rate": 6.5230992065261685e-06, + "loss": 1.0158, + "step": 283 + }, + { + "epoch": 0.8231884057971014, + "grad_norm": 0.7110708951950073, + "learning_rate": 6.501080931748764e-06, + "loss": 0.9331, + "step": 284 + }, + { + "epoch": 0.8260869565217391, + "grad_norm": 0.767749011516571, + "learning_rate": 6.4790306175063535e-06, + "loss": 0.8917, + "step": 285 + }, + { + "epoch": 0.8289855072463768, + "grad_norm": 0.8519418835639954, + "learning_rate": 6.456948734446624e-06, + "loss": 1.0296, + "step": 286 + }, + { + "epoch": 0.8318840579710145, + "grad_norm": 0.7988749742507935, + "learning_rate": 6.43483575389108e-06, + "loss": 0.9296, + "step": 287 + }, + { + "epoch": 0.8347826086956521, + "grad_norm": 0.8312949538230896, + "learning_rate": 6.412692147824976e-06, + "loss": 1.0632, + "step": 288 + }, + { + "epoch": 0.8376811594202899, + "grad_norm": 0.9024953246116638, + "learning_rate": 6.390518388887246e-06, + "loss": 1.0013, + "step": 289 + }, + { + "epoch": 0.8405797101449275, + "grad_norm": 0.6774289011955261, + "learning_rate": 6.368314950360416e-06, + "loss": 0.954, + "step": 290 + }, + { + "epoch": 0.8434782608695652, + "grad_norm": 0.739329993724823, + "learning_rate": 6.3460823061604984e-06, + "loss": 0.9453, + "step": 291 + }, + { + "epoch": 0.8463768115942029, + "grad_norm": 0.7888621687889099, + "learning_rate": 6.323820930826879e-06, + "loss": 0.9672, + "step": 292 + }, + { + "epoch": 0.8492753623188406, + "grad_norm": 0.7777626514434814, + "learning_rate": 6.301531299512195e-06, + "loss": 1.0118, + "step": 293 + }, + { + "epoch": 0.8521739130434782, + "grad_norm": 0.8532302975654602, + "learning_rate": 6.279213887972179e-06, + "loss": 0.9837, + "step": 294 + }, + { + "epoch": 0.855072463768116, + "grad_norm": 0.8223821520805359, + "learning_rate": 6.2568691725555144e-06, + "loss": 0.9786, + "step": 295 + }, + { + "epoch": 0.8579710144927536, + "grad_norm": 0.7102084755897522, + "learning_rate": 6.234497630193666e-06, + "loss": 0.9634, + "step": 296 + }, + { + "epoch": 0.8608695652173913, + "grad_norm": 0.7488099932670593, + "learning_rate": 6.2120997383907015e-06, + "loss": 1.0271, + "step": 297 + }, + { + "epoch": 0.863768115942029, + "grad_norm": 0.755387008190155, + "learning_rate": 6.189675975213094e-06, + "loss": 1.0068, + "step": 298 + }, + { + "epoch": 0.8666666666666667, + "grad_norm": 0.7323296666145325, + "learning_rate": 6.1672268192795285e-06, + "loss": 1.0177, + "step": 299 + }, + { + "epoch": 0.8695652173913043, + "grad_norm": 0.7505559325218201, + "learning_rate": 6.144752749750671e-06, + "loss": 1.0031, + "step": 300 + }, + { + "epoch": 0.8724637681159421, + "grad_norm": 0.8251679539680481, + "learning_rate": 6.122254246318957e-06, + "loss": 1.0281, + "step": 301 + }, + { + "epoch": 0.8753623188405797, + "grad_norm": 0.7030305862426758, + "learning_rate": 6.099731789198344e-06, + "loss": 0.977, + "step": 302 + }, + { + "epoch": 0.8782608695652174, + "grad_norm": 0.872175931930542, + "learning_rate": 6.077185859114059e-06, + "loss": 1.0279, + "step": 303 + }, + { + "epoch": 0.881159420289855, + "grad_norm": 0.6906105279922485, + "learning_rate": 6.05461693729235e-06, + "loss": 0.9747, + "step": 304 + }, + { + "epoch": 0.8840579710144928, + "grad_norm": 0.8041731119155884, + "learning_rate": 6.0320255054501985e-06, + "loss": 0.9706, + "step": 305 + }, + { + "epoch": 0.8869565217391304, + "grad_norm": 0.9219099283218384, + "learning_rate": 6.009412045785051e-06, + "loss": 1.0192, + "step": 306 + }, + { + "epoch": 0.8898550724637682, + "grad_norm": 0.5931650996208191, + "learning_rate": 5.986777040964521e-06, + "loss": 1.0064, + "step": 307 + }, + { + "epoch": 0.8927536231884058, + "grad_norm": 0.9496859908103943, + "learning_rate": 5.964120974116085e-06, + "loss": 1.0138, + "step": 308 + }, + { + "epoch": 0.8956521739130435, + "grad_norm": 0.719667375087738, + "learning_rate": 5.941444328816775e-06, + "loss": 1.0213, + "step": 309 + }, + { + "epoch": 0.8985507246376812, + "grad_norm": 0.8299076557159424, + "learning_rate": 5.918747589082853e-06, + "loss": 0.9931, + "step": 310 + }, + { + "epoch": 0.9014492753623189, + "grad_norm": 0.8233078718185425, + "learning_rate": 5.896031239359485e-06, + "loss": 0.9789, + "step": 311 + }, + { + "epoch": 0.9043478260869565, + "grad_norm": 0.6814295649528503, + "learning_rate": 5.8732957645103946e-06, + "loss": 1.0711, + "step": 312 + }, + { + "epoch": 0.9072463768115943, + "grad_norm": 0.786590039730072, + "learning_rate": 5.85054164980752e-06, + "loss": 1.0282, + "step": 313 + }, + { + "epoch": 0.9101449275362319, + "grad_norm": 0.7114934921264648, + "learning_rate": 5.82776938092065e-06, + "loss": 1.0125, + "step": 314 + }, + { + "epoch": 0.9130434782608695, + "grad_norm": 0.8856657147407532, + "learning_rate": 5.804979443907065e-06, + "loss": 1.0325, + "step": 315 + }, + { + "epoch": 0.9159420289855073, + "grad_norm": 0.9123273491859436, + "learning_rate": 5.782172325201155e-06, + "loss": 1.0696, + "step": 316 + }, + { + "epoch": 0.9188405797101449, + "grad_norm": 0.7296032905578613, + "learning_rate": 5.7593485116040425e-06, + "loss": 1.0004, + "step": 317 + }, + { + "epoch": 0.9217391304347826, + "grad_norm": 0.8410807847976685, + "learning_rate": 5.736508490273189e-06, + "loss": 0.9547, + "step": 318 + }, + { + "epoch": 0.9246376811594202, + "grad_norm": 1.0709190368652344, + "learning_rate": 5.713652748711997e-06, + "loss": 0.9583, + "step": 319 + }, + { + "epoch": 0.927536231884058, + "grad_norm": 0.6270896196365356, + "learning_rate": 5.690781774759412e-06, + "loss": 1.0024, + "step": 320 + }, + { + "epoch": 0.9304347826086956, + "grad_norm": 0.7849041223526001, + "learning_rate": 5.667896056579495e-06, + "loss": 0.9477, + "step": 321 + }, + { + "epoch": 0.9333333333333333, + "grad_norm": 0.7513189315795898, + "learning_rate": 5.644996082651018e-06, + "loss": 0.9937, + "step": 322 + }, + { + "epoch": 0.936231884057971, + "grad_norm": 0.8150386214256287, + "learning_rate": 5.622082341757027e-06, + "loss": 1.0589, + "step": 323 + }, + { + "epoch": 0.9391304347826087, + "grad_norm": 0.8518944978713989, + "learning_rate": 5.5991553229744166e-06, + "loss": 1.0393, + "step": 324 + }, + { + "epoch": 0.9420289855072463, + "grad_norm": 0.814802885055542, + "learning_rate": 5.576215515663489e-06, + "loss": 1.0186, + "step": 325 + }, + { + "epoch": 0.9449275362318841, + "grad_norm": 0.9456635117530823, + "learning_rate": 5.553263409457504e-06, + "loss": 0.9657, + "step": 326 + }, + { + "epoch": 0.9478260869565217, + "grad_norm": 0.7259712815284729, + "learning_rate": 5.530299494252238e-06, + "loss": 1.0066, + "step": 327 + }, + { + "epoch": 0.9507246376811594, + "grad_norm": 0.7462155818939209, + "learning_rate": 5.507324260195516e-06, + "loss": 0.9246, + "step": 328 + }, + { + "epoch": 0.9536231884057971, + "grad_norm": 0.9022188782691956, + "learning_rate": 5.484338197676757e-06, + "loss": 0.9624, + "step": 329 + }, + { + "epoch": 0.9565217391304348, + "grad_norm": 0.8874835968017578, + "learning_rate": 5.46134179731651e-06, + "loss": 0.9851, + "step": 330 + }, + { + "epoch": 0.9594202898550724, + "grad_norm": 0.7534209489822388, + "learning_rate": 5.4383355499559734e-06, + "loss": 0.9761, + "step": 331 + }, + { + "epoch": 0.9623188405797102, + "grad_norm": 0.9121699929237366, + "learning_rate": 5.41531994664652e-06, + "loss": 0.9994, + "step": 332 + }, + { + "epoch": 0.9652173913043478, + "grad_norm": 0.774753212928772, + "learning_rate": 5.392295478639226e-06, + "loss": 1.0218, + "step": 333 + }, + { + "epoch": 0.9681159420289855, + "grad_norm": 0.7575943470001221, + "learning_rate": 5.36926263737437e-06, + "loss": 0.9855, + "step": 334 + }, + { + "epoch": 0.9710144927536232, + "grad_norm": 0.8202754259109497, + "learning_rate": 5.346221914470959e-06, + "loss": 1.0112, + "step": 335 + }, + { + "epoch": 0.9739130434782609, + "grad_norm": 0.8952569961547852, + "learning_rate": 5.323173801716222e-06, + "loss": 0.9722, + "step": 336 + }, + { + "epoch": 0.9768115942028985, + "grad_norm": 0.7153046727180481, + "learning_rate": 5.300118791055122e-06, + "loss": 0.9847, + "step": 337 + }, + { + "epoch": 0.9797101449275363, + "grad_norm": 0.7900391221046448, + "learning_rate": 5.27705737457985e-06, + "loss": 1.0324, + "step": 338 + }, + { + "epoch": 0.9826086956521739, + "grad_norm": 0.8250629305839539, + "learning_rate": 5.253990044519329e-06, + "loss": 0.9764, + "step": 339 + }, + { + "epoch": 0.9855072463768116, + "grad_norm": 0.8809992671012878, + "learning_rate": 5.230917293228699e-06, + "loss": 1.0198, + "step": 340 + }, + { + "epoch": 0.9884057971014493, + "grad_norm": 0.7209755778312683, + "learning_rate": 5.207839613178814e-06, + "loss": 1.0253, + "step": 341 + }, + { + "epoch": 0.991304347826087, + "grad_norm": 0.8488002419471741, + "learning_rate": 5.184757496945726e-06, + "loss": 0.9333, + "step": 342 + }, + { + "epoch": 0.9942028985507246, + "grad_norm": 0.8114776611328125, + "learning_rate": 5.161671437200179e-06, + "loss": 1.0026, + "step": 343 + }, + { + "epoch": 0.9971014492753624, + "grad_norm": 0.8550688028335571, + "learning_rate": 5.138581926697083e-06, + "loss": 1.0057, + "step": 344 + }, + { + "epoch": 1.0, + "grad_norm": 0.9187963008880615, + "learning_rate": 5.115489458265006e-06, + "loss": 1.0037, + "step": 345 + }, + { + "epoch": 1.0028985507246377, + "grad_norm": 0.8499656915664673, + "learning_rate": 5.09239452479565e-06, + "loss": 0.9793, + "step": 346 + }, + { + "epoch": 1.0057971014492753, + "grad_norm": 0.9663048982620239, + "learning_rate": 5.0692976192333295e-06, + "loss": 0.9337, + "step": 347 + }, + { + "epoch": 1.008695652173913, + "grad_norm": 0.8095614910125732, + "learning_rate": 5.046199234564455e-06, + "loss": 0.9461, + "step": 348 + }, + { + "epoch": 1.008695652173913, + "eval_loss": 0.9858289361000061, + "eval_runtime": 46.4396, + "eval_samples_per_second": 5.513, + "eval_steps_per_second": 0.689, + "step": 348 + }, + { + "epoch": 1.0115942028985507, + "grad_norm": 0.839413046836853, + "learning_rate": 5.0230998638070024e-06, + "loss": 0.9702, + "step": 349 + }, + { + "epoch": 1.0144927536231885, + "grad_norm": 0.8220239877700806, + "learning_rate": 5e-06, + "loss": 0.9403, + "step": 350 + }, + { + "epoch": 1.017391304347826, + "grad_norm": 0.8942255973815918, + "learning_rate": 4.976900136192998e-06, + "loss": 0.9763, + "step": 351 + }, + { + "epoch": 1.0028985507246377, + "grad_norm": 0.785389244556427, + "learning_rate": 4.953800765435547e-06, + "loss": 1.0033, + "step": 352 + }, + { + "epoch": 1.0057971014492753, + "grad_norm": 0.9310470223426819, + "learning_rate": 4.930702380766671e-06, + "loss": 0.9569, + "step": 353 + }, + { + "epoch": 1.008695652173913, + "grad_norm": 0.9420292377471924, + "learning_rate": 4.907605475204352e-06, + "loss": 1.0085, + "step": 354 + }, + { + "epoch": 1.0115942028985507, + "grad_norm": 0.8762017488479614, + "learning_rate": 4.8845105417349955e-06, + "loss": 1.0225, + "step": 355 + }, + { + "epoch": 1.0144927536231885, + "grad_norm": 0.8962522149085999, + "learning_rate": 4.861418073302919e-06, + "loss": 0.9543, + "step": 356 + }, + { + "epoch": 1.017391304347826, + "grad_norm": 0.8070088028907776, + "learning_rate": 4.838328562799824e-06, + "loss": 0.9334, + "step": 357 + }, + { + "epoch": 1.0202898550724637, + "grad_norm": 0.8407843708992004, + "learning_rate": 4.815242503054277e-06, + "loss": 0.9499, + "step": 358 + }, + { + "epoch": 1.0231884057971015, + "grad_norm": 0.8197099566459656, + "learning_rate": 4.79216038682119e-06, + "loss": 1.0039, + "step": 359 + }, + { + "epoch": 1.0260869565217392, + "grad_norm": 0.7919727563858032, + "learning_rate": 4.7690827067713035e-06, + "loss": 0.9731, + "step": 360 + }, + { + "epoch": 1.0289855072463767, + "grad_norm": 0.7514965534210205, + "learning_rate": 4.746009955480672e-06, + "loss": 0.9124, + "step": 361 + }, + { + "epoch": 1.0318840579710145, + "grad_norm": 0.7958142757415771, + "learning_rate": 4.7229426254201504e-06, + "loss": 0.9836, + "step": 362 + }, + { + "epoch": 1.0347826086956522, + "grad_norm": 0.9223296642303467, + "learning_rate": 4.69988120894488e-06, + "loss": 1.0372, + "step": 363 + }, + { + "epoch": 1.03768115942029, + "grad_norm": 0.7448701858520508, + "learning_rate": 4.676826198283779e-06, + "loss": 0.9189, + "step": 364 + }, + { + "epoch": 1.0405797101449274, + "grad_norm": 0.731107771396637, + "learning_rate": 4.653778085529043e-06, + "loss": 0.9632, + "step": 365 + }, + { + "epoch": 1.0434782608695652, + "grad_norm": 0.8460220694541931, + "learning_rate": 4.630737362625631e-06, + "loss": 0.9794, + "step": 366 + }, + { + "epoch": 1.046376811594203, + "grad_norm": 0.8166036605834961, + "learning_rate": 4.6077045213607765e-06, + "loss": 0.9976, + "step": 367 + }, + { + "epoch": 1.0492753623188407, + "grad_norm": 0.6962491869926453, + "learning_rate": 4.584680053353481e-06, + "loss": 0.9374, + "step": 368 + }, + { + "epoch": 1.0521739130434782, + "grad_norm": 0.8353239893913269, + "learning_rate": 4.561664450044029e-06, + "loss": 0.991, + "step": 369 + }, + { + "epoch": 1.055072463768116, + "grad_norm": 0.8190463781356812, + "learning_rate": 4.53865820268349e-06, + "loss": 0.9971, + "step": 370 + }, + { + "epoch": 1.0579710144927537, + "grad_norm": 0.904393196105957, + "learning_rate": 4.515661802323244e-06, + "loss": 0.9548, + "step": 371 + }, + { + "epoch": 1.0608695652173914, + "grad_norm": 0.7582879066467285, + "learning_rate": 4.492675739804486e-06, + "loss": 0.934, + "step": 372 + }, + { + "epoch": 1.063768115942029, + "grad_norm": 0.7787836194038391, + "learning_rate": 4.4697005057477634e-06, + "loss": 0.973, + "step": 373 + }, + { + "epoch": 1.0666666666666667, + "grad_norm": 0.7273504137992859, + "learning_rate": 4.446736590542497e-06, + "loss": 1.0166, + "step": 374 + }, + { + "epoch": 1.0695652173913044, + "grad_norm": 0.7512848377227783, + "learning_rate": 4.4237844843365126e-06, + "loss": 0.9951, + "step": 375 + }, + { + "epoch": 1.0724637681159421, + "grad_norm": 0.8715952038764954, + "learning_rate": 4.400844677025585e-06, + "loss": 1.0384, + "step": 376 + }, + { + "epoch": 1.0753623188405796, + "grad_norm": 1.1643601655960083, + "learning_rate": 4.377917658242975e-06, + "loss": 0.9725, + "step": 377 + }, + { + "epoch": 1.0782608695652174, + "grad_norm": 1.0170421600341797, + "learning_rate": 4.355003917348985e-06, + "loss": 0.9877, + "step": 378 + }, + { + "epoch": 1.0811594202898551, + "grad_norm": 0.8441584706306458, + "learning_rate": 4.332103943420507e-06, + "loss": 0.9795, + "step": 379 + }, + { + "epoch": 1.0840579710144929, + "grad_norm": 0.9508838057518005, + "learning_rate": 4.309218225240591e-06, + "loss": 1.0274, + "step": 380 + }, + { + "epoch": 1.0869565217391304, + "grad_norm": 0.9078054428100586, + "learning_rate": 4.286347251288004e-06, + "loss": 1.0117, + "step": 381 + }, + { + "epoch": 1.0898550724637681, + "grad_norm": 1.056804895401001, + "learning_rate": 4.263491509726812e-06, + "loss": 0.9588, + "step": 382 + }, + { + "epoch": 1.0927536231884059, + "grad_norm": 0.8957586288452148, + "learning_rate": 4.240651488395958e-06, + "loss": 0.9644, + "step": 383 + }, + { + "epoch": 1.0956521739130434, + "grad_norm": 0.9251319169998169, + "learning_rate": 4.217827674798845e-06, + "loss": 0.9764, + "step": 384 + }, + { + "epoch": 1.098550724637681, + "grad_norm": 0.8325505256652832, + "learning_rate": 4.195020556092935e-06, + "loss": 0.987, + "step": 385 + }, + { + "epoch": 1.1014492753623188, + "grad_norm": 0.8144704699516296, + "learning_rate": 4.17223061907935e-06, + "loss": 0.9898, + "step": 386 + }, + { + "epoch": 1.1043478260869566, + "grad_norm": 0.8545647859573364, + "learning_rate": 4.14945835019248e-06, + "loss": 0.9214, + "step": 387 + }, + { + "epoch": 1.107246376811594, + "grad_norm": 0.8896581530570984, + "learning_rate": 4.126704235489606e-06, + "loss": 0.9432, + "step": 388 + }, + { + "epoch": 1.1101449275362318, + "grad_norm": 0.8762820959091187, + "learning_rate": 4.103968760640516e-06, + "loss": 0.9754, + "step": 389 + }, + { + "epoch": 1.1130434782608696, + "grad_norm": 0.7869084477424622, + "learning_rate": 4.081252410917148e-06, + "loss": 0.9655, + "step": 390 + }, + { + "epoch": 1.1159420289855073, + "grad_norm": 0.9484694600105286, + "learning_rate": 4.058555671183227e-06, + "loss": 0.9461, + "step": 391 + }, + { + "epoch": 1.1188405797101448, + "grad_norm": 0.8366033434867859, + "learning_rate": 4.035879025883916e-06, + "loss": 0.9745, + "step": 392 + }, + { + "epoch": 1.1217391304347826, + "grad_norm": 0.8974631428718567, + "learning_rate": 4.013222959035481e-06, + "loss": 1.003, + "step": 393 + }, + { + "epoch": 1.1246376811594203, + "grad_norm": 0.9970961809158325, + "learning_rate": 3.99058795421495e-06, + "loss": 0.9548, + "step": 394 + }, + { + "epoch": 1.127536231884058, + "grad_norm": 0.8342113494873047, + "learning_rate": 3.967974494549803e-06, + "loss": 0.8879, + "step": 395 + }, + { + "epoch": 1.1304347826086956, + "grad_norm": 0.7740679383277893, + "learning_rate": 3.945383062707652e-06, + "loss": 1.0181, + "step": 396 + }, + { + "epoch": 1.1333333333333333, + "grad_norm": 0.8080225586891174, + "learning_rate": 3.922814140885942e-06, + "loss": 0.9629, + "step": 397 + }, + { + "epoch": 1.136231884057971, + "grad_norm": 0.745694637298584, + "learning_rate": 3.9002682108016585e-06, + "loss": 0.9725, + "step": 398 + }, + { + "epoch": 1.1391304347826088, + "grad_norm": 0.93767249584198, + "learning_rate": 3.8777457536810446e-06, + "loss": 0.9411, + "step": 399 + }, + { + "epoch": 1.1420289855072463, + "grad_norm": 0.7331735491752625, + "learning_rate": 3.855247250249331e-06, + "loss": 0.9187, + "step": 400 + }, + { + "epoch": 1.144927536231884, + "grad_norm": 1.1504460573196411, + "learning_rate": 3.832773180720475e-06, + "loss": 1.0038, + "step": 401 + }, + { + "epoch": 1.1478260869565218, + "grad_norm": 0.7792490124702454, + "learning_rate": 3.8103240247869077e-06, + "loss": 0.9583, + "step": 402 + }, + { + "epoch": 1.1507246376811595, + "grad_norm": 0.8607194423675537, + "learning_rate": 3.7879002616093015e-06, + "loss": 0.9608, + "step": 403 + }, + { + "epoch": 1.153623188405797, + "grad_norm": 0.7470278143882751, + "learning_rate": 3.765502369806334e-06, + "loss": 1.0097, + "step": 404 + }, + { + "epoch": 1.1565217391304348, + "grad_norm": 0.8549491763114929, + "learning_rate": 3.743130827444487e-06, + "loss": 0.9707, + "step": 405 + }, + { + "epoch": 1.1594202898550725, + "grad_norm": 0.8472537398338318, + "learning_rate": 3.720786112027822e-06, + "loss": 0.9746, + "step": 406 + }, + { + "epoch": 1.1623188405797102, + "grad_norm": 0.7988584637641907, + "learning_rate": 3.6984687004878052e-06, + "loss": 0.9883, + "step": 407 + }, + { + "epoch": 1.1652173913043478, + "grad_norm": 0.823165774345398, + "learning_rate": 3.6761790691731207e-06, + "loss": 1.013, + "step": 408 + }, + { + "epoch": 1.1681159420289855, + "grad_norm": 0.7537344694137573, + "learning_rate": 3.6539176938395037e-06, + "loss": 1.0081, + "step": 409 + }, + { + "epoch": 1.1710144927536232, + "grad_norm": 0.7858260273933411, + "learning_rate": 3.6316850496395863e-06, + "loss": 0.9688, + "step": 410 + }, + { + "epoch": 1.1739130434782608, + "grad_norm": 0.8715892434120178, + "learning_rate": 3.609481611112755e-06, + "loss": 1.0181, + "step": 411 + }, + { + "epoch": 1.1768115942028985, + "grad_norm": 0.816693127155304, + "learning_rate": 3.587307852175025e-06, + "loss": 0.9505, + "step": 412 + }, + { + "epoch": 1.1797101449275362, + "grad_norm": 0.9773905277252197, + "learning_rate": 3.5651642461089207e-06, + "loss": 0.9745, + "step": 413 + }, + { + "epoch": 1.182608695652174, + "grad_norm": 0.7822540998458862, + "learning_rate": 3.5430512655533774e-06, + "loss": 0.9977, + "step": 414 + }, + { + "epoch": 1.1855072463768117, + "grad_norm": 0.9197254180908203, + "learning_rate": 3.5209693824936486e-06, + "loss": 0.9955, + "step": 415 + }, + { + "epoch": 1.1884057971014492, + "grad_norm": 0.8545462489128113, + "learning_rate": 3.498919068251237e-06, + "loss": 1.0544, + "step": 416 + }, + { + "epoch": 1.191304347826087, + "grad_norm": 0.8395746350288391, + "learning_rate": 3.476900793473832e-06, + "loss": 0.9757, + "step": 417 + }, + { + "epoch": 1.1942028985507247, + "grad_norm": 0.8740842938423157, + "learning_rate": 3.4549150281252635e-06, + "loss": 0.9468, + "step": 418 + }, + { + "epoch": 1.1971014492753622, + "grad_norm": 0.7521042823791504, + "learning_rate": 3.4329622414754728e-06, + "loss": 0.9432, + "step": 419 + }, + { + "epoch": 1.2, + "grad_norm": 0.713711142539978, + "learning_rate": 3.4110429020904924e-06, + "loss": 0.9838, + "step": 420 + }, + { + "epoch": 1.2028985507246377, + "grad_norm": 0.8481893539428711, + "learning_rate": 3.3891574778224524e-06, + "loss": 0.9489, + "step": 421 + }, + { + "epoch": 1.2057971014492754, + "grad_norm": 0.863029420375824, + "learning_rate": 3.3673064357995844e-06, + "loss": 1.0462, + "step": 422 + }, + { + "epoch": 1.208695652173913, + "grad_norm": 0.8649914860725403, + "learning_rate": 3.3454902424162603e-06, + "loss": 1.0085, + "step": 423 + }, + { + "epoch": 1.2115942028985507, + "grad_norm": 0.8374588489532471, + "learning_rate": 3.3237093633230323e-06, + "loss": 1.0425, + "step": 424 + }, + { + "epoch": 1.2144927536231884, + "grad_norm": 0.9396947026252747, + "learning_rate": 3.301964263416693e-06, + "loss": 1.0303, + "step": 425 + }, + { + "epoch": 1.2173913043478262, + "grad_norm": 0.8101410865783691, + "learning_rate": 3.2802554068303595e-06, + "loss": 0.9747, + "step": 426 + }, + { + "epoch": 1.2202898550724637, + "grad_norm": 0.9860018491744995, + "learning_rate": 3.2585832569235576e-06, + "loss": 0.9533, + "step": 427 + }, + { + "epoch": 1.2231884057971014, + "grad_norm": 0.950383186340332, + "learning_rate": 3.236948276272337e-06, + "loss": 0.9562, + "step": 428 + }, + { + "epoch": 1.2260869565217392, + "grad_norm": 0.8197913765907288, + "learning_rate": 3.2153509266593984e-06, + "loss": 0.9588, + "step": 429 + }, + { + "epoch": 1.228985507246377, + "grad_norm": 0.8033617734909058, + "learning_rate": 3.1937916690642356e-06, + "loss": 1.0014, + "step": 430 + }, + { + "epoch": 1.2318840579710144, + "grad_norm": 0.8451259732246399, + "learning_rate": 3.1722709636532944e-06, + "loss": 0.9428, + "step": 431 + }, + { + "epoch": 1.2347826086956522, + "grad_norm": 0.7560276985168457, + "learning_rate": 3.150789269770155e-06, + "loss": 1.002, + "step": 432 + }, + { + "epoch": 1.23768115942029, + "grad_norm": 0.918804943561554, + "learning_rate": 3.1293470459257237e-06, + "loss": 0.9653, + "step": 433 + }, + { + "epoch": 1.2405797101449276, + "grad_norm": 0.8339065313339233, + "learning_rate": 3.107944749788449e-06, + "loss": 0.9407, + "step": 434 + }, + { + "epoch": 1.2434782608695651, + "grad_norm": 0.7564199566841125, + "learning_rate": 3.0865828381745515e-06, + "loss": 1.012, + "step": 435 + }, + { + "epoch": 1.2434782608695651, + "eval_loss": 0.9773865938186646, + "eval_runtime": 46.2701, + "eval_samples_per_second": 5.533, + "eval_steps_per_second": 0.692, + "step": 435 + }, + { + "epoch": 1.2463768115942029, + "grad_norm": 0.7768362164497375, + "learning_rate": 3.0652617670382745e-06, + "loss": 0.9642, + "step": 436 + }, + { + "epoch": 1.2492753623188406, + "grad_norm": 0.8295703530311584, + "learning_rate": 3.04398199146215e-06, + "loss": 1.0002, + "step": 437 + }, + { + "epoch": 1.2521739130434781, + "grad_norm": 0.8403414487838745, + "learning_rate": 3.0227439656472878e-06, + "loss": 0.9772, + "step": 438 + }, + { + "epoch": 1.2550724637681159, + "grad_norm": 0.8178934454917908, + "learning_rate": 3.0015481429036807e-06, + "loss": 1.0126, + "step": 439 + }, + { + "epoch": 1.2579710144927536, + "grad_norm": 0.8231812119483948, + "learning_rate": 2.980394975640526e-06, + "loss": 0.9118, + "step": 440 + }, + { + "epoch": 1.2608695652173914, + "grad_norm": 0.8780835270881653, + "learning_rate": 2.9592849153565727e-06, + "loss": 0.9549, + "step": 441 + }, + { + "epoch": 1.263768115942029, + "grad_norm": 1.000675916671753, + "learning_rate": 2.9382184126304834e-06, + "loss": 1.0483, + "step": 442 + }, + { + "epoch": 1.2666666666666666, + "grad_norm": 0.8840986490249634, + "learning_rate": 2.917195917111215e-06, + "loss": 0.9931, + "step": 443 + }, + { + "epoch": 1.2695652173913043, + "grad_norm": 0.8707259297370911, + "learning_rate": 2.8962178775084267e-06, + "loss": 0.8975, + "step": 444 + }, + { + "epoch": 1.272463768115942, + "grad_norm": 0.7439221739768982, + "learning_rate": 2.8752847415828923e-06, + "loss": 0.9453, + "step": 445 + }, + { + "epoch": 1.2753623188405796, + "grad_norm": 0.9899610280990601, + "learning_rate": 2.8543969561369556e-06, + "loss": 0.9426, + "step": 446 + }, + { + "epoch": 1.2782608695652173, + "grad_norm": 0.9144057035446167, + "learning_rate": 2.8335549670049866e-06, + "loss": 0.9453, + "step": 447 + }, + { + "epoch": 1.281159420289855, + "grad_norm": 0.9034680128097534, + "learning_rate": 2.812759219043869e-06, + "loss": 0.9258, + "step": 448 + }, + { + "epoch": 1.2840579710144928, + "grad_norm": 0.9689735174179077, + "learning_rate": 2.7920101561234954e-06, + "loss": 0.993, + "step": 449 + }, + { + "epoch": 1.2869565217391306, + "grad_norm": 0.6610868573188782, + "learning_rate": 2.771308221117309e-06, + "loss": 0.9506, + "step": 450 + }, + { + "epoch": 1.289855072463768, + "grad_norm": 0.829849362373352, + "learning_rate": 2.750653855892836e-06, + "loss": 0.9609, + "step": 451 + }, + { + "epoch": 1.2927536231884058, + "grad_norm": 0.7730438709259033, + "learning_rate": 2.7300475013022666e-06, + "loss": 0.9859, + "step": 452 + }, + { + "epoch": 1.2956521739130435, + "grad_norm": 0.925363302230835, + "learning_rate": 2.7094895971730326e-06, + "loss": 1.0286, + "step": 453 + }, + { + "epoch": 1.298550724637681, + "grad_norm": 0.886048436164856, + "learning_rate": 2.6889805822984348e-06, + "loss": 0.952, + "step": 454 + }, + { + "epoch": 1.3014492753623188, + "grad_norm": 1.1092323064804077, + "learning_rate": 2.668520894428259e-06, + "loss": 1.0032, + "step": 455 + }, + { + "epoch": 1.3043478260869565, + "grad_norm": 0.7811794877052307, + "learning_rate": 2.648110970259454e-06, + "loss": 0.9296, + "step": 456 + }, + { + "epoch": 1.3072463768115943, + "grad_norm": 0.8023120164871216, + "learning_rate": 2.6277512454267874e-06, + "loss": 0.9304, + "step": 457 + }, + { + "epoch": 1.310144927536232, + "grad_norm": 0.7649518251419067, + "learning_rate": 2.607442154493568e-06, + "loss": 0.9441, + "step": 458 + }, + { + "epoch": 1.3130434782608695, + "grad_norm": 0.8725413680076599, + "learning_rate": 2.5871841309423557e-06, + "loss": 0.9637, + "step": 459 + }, + { + "epoch": 1.3159420289855073, + "grad_norm": 0.7210862636566162, + "learning_rate": 2.5669776071657194e-06, + "loss": 0.9869, + "step": 460 + }, + { + "epoch": 1.318840579710145, + "grad_norm": 0.8270391821861267, + "learning_rate": 2.546823014456998e-06, + "loss": 0.9164, + "step": 461 + }, + { + "epoch": 1.3217391304347825, + "grad_norm": 0.829223096370697, + "learning_rate": 2.526720783001107e-06, + "loss": 1.0128, + "step": 462 + }, + { + "epoch": 1.3246376811594203, + "grad_norm": 0.9681026935577393, + "learning_rate": 2.506671341865341e-06, + "loss": 0.9768, + "step": 463 + }, + { + "epoch": 1.327536231884058, + "grad_norm": 0.840314507484436, + "learning_rate": 2.486675118990233e-06, + "loss": 0.9359, + "step": 464 + }, + { + "epoch": 1.3304347826086955, + "grad_norm": 0.659677267074585, + "learning_rate": 2.466732541180404e-06, + "loss": 0.965, + "step": 465 + }, + { + "epoch": 1.3333333333333333, + "grad_norm": 0.9055850505828857, + "learning_rate": 2.4468440340954664e-06, + "loss": 0.9557, + "step": 466 + }, + { + "epoch": 1.336231884057971, + "grad_norm": 0.8318009972572327, + "learning_rate": 2.4270100222409275e-06, + "loss": 0.9111, + "step": 467 + }, + { + "epoch": 1.3391304347826087, + "grad_norm": 0.9112004041671753, + "learning_rate": 2.4072309289591394e-06, + "loss": 0.9243, + "step": 468 + }, + { + "epoch": 1.3420289855072465, + "grad_norm": 0.8032493591308594, + "learning_rate": 2.387507176420256e-06, + "loss": 0.9228, + "step": 469 + }, + { + "epoch": 1.344927536231884, + "grad_norm": 0.662981390953064, + "learning_rate": 2.3678391856132203e-06, + "loss": 0.9778, + "step": 470 + }, + { + "epoch": 1.3478260869565217, + "grad_norm": 0.8368533849716187, + "learning_rate": 2.348227376336789e-06, + "loss": 1.0145, + "step": 471 + }, + { + "epoch": 1.3507246376811595, + "grad_norm": 0.9046915769577026, + "learning_rate": 2.328672167190558e-06, + "loss": 0.9393, + "step": 472 + }, + { + "epoch": 1.353623188405797, + "grad_norm": 0.9030489921569824, + "learning_rate": 2.3091739755660425e-06, + "loss": 0.9636, + "step": 473 + }, + { + "epoch": 1.3565217391304347, + "grad_norm": 0.8339246511459351, + "learning_rate": 2.289733217637753e-06, + "loss": 0.9395, + "step": 474 + }, + { + "epoch": 1.3594202898550725, + "grad_norm": 0.7877910733222961, + "learning_rate": 2.2703503083543288e-06, + "loss": 0.9454, + "step": 475 + }, + { + "epoch": 1.3623188405797102, + "grad_norm": 0.9808143377304077, + "learning_rate": 2.2510256614296638e-06, + "loss": 0.9968, + "step": 476 + }, + { + "epoch": 1.365217391304348, + "grad_norm": 1.2518080472946167, + "learning_rate": 2.2317596893340924e-06, + "loss": 0.9732, + "step": 477 + }, + { + "epoch": 1.3681159420289855, + "grad_norm": 0.8053367137908936, + "learning_rate": 2.2125528032855727e-06, + "loss": 0.9803, + "step": 478 + }, + { + "epoch": 1.3710144927536232, + "grad_norm": 0.9491231441497803, + "learning_rate": 2.1934054132409183e-06, + "loss": 0.9332, + "step": 479 + }, + { + "epoch": 1.373913043478261, + "grad_norm": 0.7503049373626709, + "learning_rate": 2.174317927887041e-06, + "loss": 0.9591, + "step": 480 + }, + { + "epoch": 1.3768115942028984, + "grad_norm": 0.819608211517334, + "learning_rate": 2.1552907546322356e-06, + "loss": 0.9795, + "step": 481 + }, + { + "epoch": 1.3797101449275362, + "grad_norm": 0.8053436279296875, + "learning_rate": 2.136324299597474e-06, + "loss": 1.0053, + "step": 482 + }, + { + "epoch": 1.382608695652174, + "grad_norm": 0.7377948760986328, + "learning_rate": 2.11741896760775e-06, + "loss": 1.0277, + "step": 483 + }, + { + "epoch": 1.3855072463768117, + "grad_norm": 0.865705668926239, + "learning_rate": 2.098575162183422e-06, + "loss": 0.9952, + "step": 484 + }, + { + "epoch": 1.3884057971014494, + "grad_norm": 0.8623892664909363, + "learning_rate": 2.0797932855316183e-06, + "loss": 1.0304, + "step": 485 + }, + { + "epoch": 1.391304347826087, + "grad_norm": 0.803113579750061, + "learning_rate": 2.061073738537635e-06, + "loss": 0.993, + "step": 486 + }, + { + "epoch": 1.3942028985507247, + "grad_norm": 0.7748633623123169, + "learning_rate": 2.0424169207563954e-06, + "loss": 0.9103, + "step": 487 + }, + { + "epoch": 1.3971014492753624, + "grad_norm": 0.9022510051727295, + "learning_rate": 2.023823230403907e-06, + "loss": 0.9125, + "step": 488 + }, + { + "epoch": 1.4, + "grad_norm": 0.8588757514953613, + "learning_rate": 2.005293064348773e-06, + "loss": 1.0259, + "step": 489 + }, + { + "epoch": 1.4028985507246376, + "grad_norm": 0.8985849618911743, + "learning_rate": 1.9868268181037186e-06, + "loss": 0.9839, + "step": 490 + }, + { + "epoch": 1.4057971014492754, + "grad_norm": 0.8959106802940369, + "learning_rate": 1.968424885817143e-06, + "loss": 0.9752, + "step": 491 + }, + { + "epoch": 1.4086956521739131, + "grad_norm": 0.9213183522224426, + "learning_rate": 1.9500876602647167e-06, + "loss": 0.9053, + "step": 492 + }, + { + "epoch": 1.4115942028985506, + "grad_norm": 0.8219558596611023, + "learning_rate": 1.931815532840987e-06, + "loss": 0.9522, + "step": 493 + }, + { + "epoch": 1.4144927536231884, + "grad_norm": 0.8716898560523987, + "learning_rate": 1.913608893551036e-06, + "loss": 0.9858, + "step": 494 + }, + { + "epoch": 1.4173913043478261, + "grad_norm": 0.9072102904319763, + "learning_rate": 1.8954681310021434e-06, + "loss": 0.9382, + "step": 495 + }, + { + "epoch": 1.4202898550724639, + "grad_norm": 0.8592570424079895, + "learning_rate": 1.8773936323955055e-06, + "loss": 1.0004, + "step": 496 + }, + { + "epoch": 1.4231884057971014, + "grad_norm": 0.8882102966308594, + "learning_rate": 1.8593857835179557e-06, + "loss": 0.9862, + "step": 497 + }, + { + "epoch": 1.4260869565217391, + "grad_norm": 0.851216197013855, + "learning_rate": 1.8414449687337467e-06, + "loss": 1.0109, + "step": 498 + }, + { + "epoch": 1.4289855072463769, + "grad_norm": 0.7851223349571228, + "learning_rate": 1.8235715709763285e-06, + "loss": 0.9404, + "step": 499 + }, + { + "epoch": 1.4318840579710144, + "grad_norm": 0.7435230612754822, + "learning_rate": 1.8057659717401948e-06, + "loss": 1.0388, + "step": 500 + }, + { + "epoch": 1.434782608695652, + "grad_norm": 0.795467734336853, + "learning_rate": 1.7880285510727197e-06, + "loss": 1.0, + "step": 501 + }, + { + "epoch": 1.4376811594202898, + "grad_norm": 0.8847975730895996, + "learning_rate": 1.7703596875660645e-06, + "loss": 1.0182, + "step": 502 + }, + { + "epoch": 1.4405797101449276, + "grad_norm": 1.0256052017211914, + "learning_rate": 1.7527597583490825e-06, + "loss": 0.9573, + "step": 503 + }, + { + "epoch": 1.4434782608695653, + "grad_norm": 0.7743212580680847, + "learning_rate": 1.7352291390792798e-06, + "loss": 0.9831, + "step": 504 + }, + { + "epoch": 1.4463768115942028, + "grad_norm": 0.9608955979347229, + "learning_rate": 1.7177682039347875e-06, + "loss": 0.9683, + "step": 505 + }, + { + "epoch": 1.4492753623188406, + "grad_norm": 0.899786651134491, + "learning_rate": 1.7003773256063882e-06, + "loss": 1.0373, + "step": 506 + }, + { + "epoch": 1.4521739130434783, + "grad_norm": 0.933459997177124, + "learning_rate": 1.6830568752895455e-06, + "loss": 1.0065, + "step": 507 + }, + { + "epoch": 1.4550724637681158, + "grad_norm": 0.7607547640800476, + "learning_rate": 1.6658072226764949e-06, + "loss": 0.9652, + "step": 508 + }, + { + "epoch": 1.4579710144927536, + "grad_norm": 0.7857306599617004, + "learning_rate": 1.6486287359483422e-06, + "loss": 0.9943, + "step": 509 + }, + { + "epoch": 1.4608695652173913, + "grad_norm": 0.9342886209487915, + "learning_rate": 1.6315217817672142e-06, + "loss": 1.028, + "step": 510 + }, + { + "epoch": 1.463768115942029, + "grad_norm": 1.0333482027053833, + "learning_rate": 1.614486725268426e-06, + "loss": 0.9296, + "step": 511 + }, + { + "epoch": 1.4666666666666668, + "grad_norm": 0.7788994908332825, + "learning_rate": 1.5975239300526924e-06, + "loss": 0.9871, + "step": 512 + }, + { + "epoch": 1.4695652173913043, + "grad_norm": 0.764268159866333, + "learning_rate": 1.5806337581783593e-06, + "loss": 0.9603, + "step": 513 + }, + { + "epoch": 1.472463768115942, + "grad_norm": 0.9053126573562622, + "learning_rate": 1.5638165701536866e-06, + "loss": 1.003, + "step": 514 + }, + { + "epoch": 1.4753623188405798, + "grad_norm": 0.890696108341217, + "learning_rate": 1.5470727249291423e-06, + "loss": 0.9894, + "step": 515 + }, + { + "epoch": 1.4782608695652173, + "grad_norm": 0.755885124206543, + "learning_rate": 1.5304025798897521e-06, + "loss": 0.9355, + "step": 516 + }, + { + "epoch": 1.481159420289855, + "grad_norm": 0.8839924931526184, + "learning_rate": 1.5138064908474603e-06, + "loss": 0.9879, + "step": 517 + }, + { + "epoch": 1.4840579710144928, + "grad_norm": 0.919336199760437, + "learning_rate": 1.4972848120335453e-06, + "loss": 1.042, + "step": 518 + }, + { + "epoch": 1.4869565217391305, + "grad_norm": 1.0073022842407227, + "learning_rate": 1.4808378960910502e-06, + "loss": 1.0537, + "step": 519 + }, + { + "epoch": 1.4898550724637682, + "grad_norm": 0.9994317293167114, + "learning_rate": 1.4644660940672628e-06, + "loss": 1.042, + "step": 520 + }, + { + "epoch": 1.4927536231884058, + "grad_norm": 0.8237168788909912, + "learning_rate": 1.448169755406218e-06, + "loss": 0.9449, + "step": 521 + }, + { + "epoch": 1.4956521739130435, + "grad_norm": 0.8838447332382202, + "learning_rate": 1.4319492279412388e-06, + "loss": 0.9789, + "step": 522 + }, + { + "epoch": 1.4956521739130435, + "eval_loss": 0.9736447334289551, + "eval_runtime": 46.3906, + "eval_samples_per_second": 5.518, + "eval_steps_per_second": 0.69, + "step": 522 + }, + { + "epoch": 1.4985507246376812, + "grad_norm": 0.7661985754966736, + "learning_rate": 1.4158048578875211e-06, + "loss": 0.9991, + "step": 523 + }, + { + "epoch": 1.5014492753623188, + "grad_norm": 0.8049348592758179, + "learning_rate": 1.399736989834728e-06, + "loss": 0.9455, + "step": 524 + }, + { + "epoch": 1.5043478260869565, + "grad_norm": 0.8575480580329895, + "learning_rate": 1.383745966739652e-06, + "loss": 0.9764, + "step": 525 + }, + { + "epoch": 1.5072463768115942, + "grad_norm": 0.7336897253990173, + "learning_rate": 1.3678321299188802e-06, + "loss": 0.9613, + "step": 526 + }, + { + "epoch": 1.5101449275362318, + "grad_norm": 0.8718299865722656, + "learning_rate": 1.351995819041521e-06, + "loss": 0.9923, + "step": 527 + }, + { + "epoch": 1.5130434782608697, + "grad_norm": 0.9166209101676941, + "learning_rate": 1.336237372121944e-06, + "loss": 1.069, + "step": 528 + }, + { + "epoch": 1.5159420289855072, + "grad_norm": 0.9382581114768982, + "learning_rate": 1.320557125512575e-06, + "loss": 0.9671, + "step": 529 + }, + { + "epoch": 1.518840579710145, + "grad_norm": 0.8037452101707458, + "learning_rate": 1.3049554138967052e-06, + "loss": 0.9395, + "step": 530 + }, + { + "epoch": 1.5217391304347827, + "grad_norm": 0.6627395749092102, + "learning_rate": 1.289432570281361e-06, + "loss": 0.9025, + "step": 531 + }, + { + "epoch": 1.5246376811594202, + "grad_norm": 0.7865214943885803, + "learning_rate": 1.2739889259901866e-06, + "loss": 0.9021, + "step": 532 + }, + { + "epoch": 1.527536231884058, + "grad_norm": 0.8900570273399353, + "learning_rate": 1.258624810656376e-06, + "loss": 0.946, + "step": 533 + }, + { + "epoch": 1.5304347826086957, + "grad_norm": 0.8942597508430481, + "learning_rate": 1.2433405522156334e-06, + "loss": 1.0141, + "step": 534 + }, + { + "epoch": 1.5333333333333332, + "grad_norm": 0.8667037487030029, + "learning_rate": 1.2281364768991804e-06, + "loss": 1.0092, + "step": 535 + }, + { + "epoch": 1.5362318840579712, + "grad_norm": 0.7895119190216064, + "learning_rate": 1.213012909226786e-06, + "loss": 0.9251, + "step": 536 + }, + { + "epoch": 1.5391304347826087, + "grad_norm": 0.8225801587104797, + "learning_rate": 1.1979701719998454e-06, + "loss": 0.9449, + "step": 537 + }, + { + "epoch": 1.5420289855072464, + "grad_norm": 0.8342156410217285, + "learning_rate": 1.1830085862944851e-06, + "loss": 0.9676, + "step": 538 + }, + { + "epoch": 1.5449275362318842, + "grad_norm": 0.7941964864730835, + "learning_rate": 1.1681284714547147e-06, + "loss": 0.9907, + "step": 539 + }, + { + "epoch": 1.5478260869565217, + "grad_norm": 0.9655299782752991, + "learning_rate": 1.1533301450856054e-06, + "loss": 1.0126, + "step": 540 + }, + { + "epoch": 1.5507246376811594, + "grad_norm": 0.8632703423500061, + "learning_rate": 1.1386139230465176e-06, + "loss": 0.9452, + "step": 541 + }, + { + "epoch": 1.5536231884057972, + "grad_norm": 0.8908371329307556, + "learning_rate": 1.1239801194443507e-06, + "loss": 0.9821, + "step": 542 + }, + { + "epoch": 1.5565217391304347, + "grad_norm": 0.873409628868103, + "learning_rate": 1.1094290466268493e-06, + "loss": 0.969, + "step": 543 + }, + { + "epoch": 1.5594202898550724, + "grad_norm": 0.8888543844223022, + "learning_rate": 1.0949610151759233e-06, + "loss": 0.9593, + "step": 544 + }, + { + "epoch": 1.5623188405797102, + "grad_norm": 0.7646573781967163, + "learning_rate": 1.0805763339010329e-06, + "loss": 0.9287, + "step": 545 + }, + { + "epoch": 1.5652173913043477, + "grad_norm": 0.835421085357666, + "learning_rate": 1.066275309832584e-06, + "loss": 0.9732, + "step": 546 + }, + { + "epoch": 1.5681159420289856, + "grad_norm": 0.9228112697601318, + "learning_rate": 1.0520582482153874e-06, + "loss": 0.9675, + "step": 547 + }, + { + "epoch": 1.5710144927536231, + "grad_norm": 0.7750451564788818, + "learning_rate": 1.037925452502131e-06, + "loss": 0.9938, + "step": 548 + }, + { + "epoch": 1.5739130434782609, + "grad_norm": 0.8366883397102356, + "learning_rate": 1.0238772243469153e-06, + "loss": 0.962, + "step": 549 + }, + { + "epoch": 1.5768115942028986, + "grad_norm": 0.933855414390564, + "learning_rate": 1.0099138635988026e-06, + "loss": 0.9732, + "step": 550 + }, + { + "epoch": 1.5797101449275361, + "grad_norm": 0.9288073778152466, + "learning_rate": 9.960356682954293e-07, + "loss": 0.9958, + "step": 551 + }, + { + "epoch": 1.5826086956521739, + "grad_norm": 0.7197360992431641, + "learning_rate": 9.822429346566314e-07, + "loss": 0.9266, + "step": 552 + }, + { + "epoch": 1.5855072463768116, + "grad_norm": 0.8900216817855835, + "learning_rate": 9.685359570781344e-07, + "loss": 1.0006, + "step": 553 + }, + { + "epoch": 1.5884057971014491, + "grad_norm": 0.7970424294471741, + "learning_rate": 9.549150281252633e-07, + "loss": 0.968, + "step": 554 + }, + { + "epoch": 1.591304347826087, + "grad_norm": 0.9357386231422424, + "learning_rate": 9.41380438526694e-07, + "loss": 1.0361, + "step": 555 + }, + { + "epoch": 1.5942028985507246, + "grad_norm": 0.740880012512207, + "learning_rate": 9.279324771682586e-07, + "loss": 0.9492, + "step": 556 + }, + { + "epoch": 1.5971014492753624, + "grad_norm": 0.9611430764198303, + "learning_rate": 9.145714310867676e-07, + "loss": 0.9559, + "step": 557 + }, + { + "epoch": 1.6, + "grad_norm": 0.9163907170295715, + "learning_rate": 9.01297585463895e-07, + "loss": 1.0112, + "step": 558 + }, + { + "epoch": 1.6028985507246376, + "grad_norm": 0.9926815032958984, + "learning_rate": 8.881112236200795e-07, + "loss": 1.0813, + "step": 559 + }, + { + "epoch": 1.6057971014492753, + "grad_norm": 0.8820666074752808, + "learning_rate": 8.750126270084891e-07, + "loss": 0.9911, + "step": 560 + }, + { + "epoch": 1.608695652173913, + "grad_norm": 0.817694365978241, + "learning_rate": 8.620020752090008e-07, + "loss": 0.9162, + "step": 561 + }, + { + "epoch": 1.6115942028985506, + "grad_norm": 0.9005435109138489, + "learning_rate": 8.490798459222477e-07, + "loss": 1.015, + "step": 562 + }, + { + "epoch": 1.6144927536231886, + "grad_norm": 0.8248128890991211, + "learning_rate": 8.362462149636757e-07, + "loss": 0.9976, + "step": 563 + }, + { + "epoch": 1.617391304347826, + "grad_norm": 0.8286884427070618, + "learning_rate": 8.235014562576732e-07, + "loss": 0.992, + "step": 564 + }, + { + "epoch": 1.6202898550724638, + "grad_norm": 0.8723387718200684, + "learning_rate": 8.108458418317089e-07, + "loss": 0.9381, + "step": 565 + }, + { + "epoch": 1.6231884057971016, + "grad_norm": 0.9833754897117615, + "learning_rate": 7.98279641810537e-07, + "loss": 0.9435, + "step": 566 + }, + { + "epoch": 1.626086956521739, + "grad_norm": 0.9212725162506104, + "learning_rate": 7.858031244104247e-07, + "loss": 0.9611, + "step": 567 + }, + { + "epoch": 1.6289855072463768, + "grad_norm": 0.852350115776062, + "learning_rate": 7.734165559334327e-07, + "loss": 0.9064, + "step": 568 + }, + { + "epoch": 1.6318840579710145, + "grad_norm": 0.8955137729644775, + "learning_rate": 7.611202007617241e-07, + "loss": 0.9547, + "step": 569 + }, + { + "epoch": 1.634782608695652, + "grad_norm": 0.8889902830123901, + "learning_rate": 7.489143213519301e-07, + "loss": 0.9533, + "step": 570 + }, + { + "epoch": 1.6376811594202898, + "grad_norm": 0.9037710428237915, + "learning_rate": 7.367991782295392e-07, + "loss": 0.9213, + "step": 571 + }, + { + "epoch": 1.6405797101449275, + "grad_norm": 0.8594886064529419, + "learning_rate": 7.24775029983345e-07, + "loss": 0.9765, + "step": 572 + }, + { + "epoch": 1.643478260869565, + "grad_norm": 0.7082343101501465, + "learning_rate": 7.128421332599189e-07, + "loss": 0.9871, + "step": 573 + }, + { + "epoch": 1.646376811594203, + "grad_norm": 0.878217339515686, + "learning_rate": 7.010007427581378e-07, + "loss": 0.9366, + "step": 574 + }, + { + "epoch": 1.6492753623188405, + "grad_norm": 0.9462459087371826, + "learning_rate": 6.892511112237472e-07, + "loss": 0.9505, + "step": 575 + }, + { + "epoch": 1.6521739130434783, + "grad_norm": 0.7900387644767761, + "learning_rate": 6.775934894439606e-07, + "loss": 0.9554, + "step": 576 + }, + { + "epoch": 1.655072463768116, + "grad_norm": 0.8542242050170898, + "learning_rate": 6.66028126242117e-07, + "loss": 0.9331, + "step": 577 + }, + { + "epoch": 1.6579710144927535, + "grad_norm": 0.9795560836791992, + "learning_rate": 6.545552684723583e-07, + "loss": 0.9203, + "step": 578 + }, + { + "epoch": 1.6608695652173913, + "grad_norm": 0.7833444476127625, + "learning_rate": 6.431751610143716e-07, + "loss": 0.9977, + "step": 579 + }, + { + "epoch": 1.663768115942029, + "grad_norm": 0.8404137492179871, + "learning_rate": 6.318880467681527e-07, + "loss": 0.9981, + "step": 580 + }, + { + "epoch": 1.6666666666666665, + "grad_norm": 0.9158584475517273, + "learning_rate": 6.206941666488287e-07, + "loss": 0.9584, + "step": 581 + }, + { + "epoch": 1.6695652173913045, + "grad_norm": 0.7720228433609009, + "learning_rate": 6.095937595815104e-07, + "loss": 0.9284, + "step": 582 + }, + { + "epoch": 1.672463768115942, + "grad_norm": 0.9077423214912415, + "learning_rate": 5.985870624961993e-07, + "loss": 1.0104, + "step": 583 + }, + { + "epoch": 1.6753623188405797, + "grad_norm": 0.7142834663391113, + "learning_rate": 5.876743103227217e-07, + "loss": 0.9617, + "step": 584 + }, + { + "epoch": 1.6782608695652175, + "grad_norm": 0.9244917035102844, + "learning_rate": 5.768557359857241e-07, + "loss": 0.9534, + "step": 585 + }, + { + "epoch": 1.681159420289855, + "grad_norm": 0.8961134552955627, + "learning_rate": 5.661315703996905e-07, + "loss": 0.9462, + "step": 586 + }, + { + "epoch": 1.6840579710144927, + "grad_norm": 0.9584707021713257, + "learning_rate": 5.555020424640267e-07, + "loss": 0.9483, + "step": 587 + }, + { + "epoch": 1.6869565217391305, + "grad_norm": 0.8094743490219116, + "learning_rate": 5.449673790581611e-07, + "loss": 0.9564, + "step": 588 + }, + { + "epoch": 1.689855072463768, + "grad_norm": 0.886703610420227, + "learning_rate": 5.345278050367142e-07, + "loss": 1.0153, + "step": 589 + }, + { + "epoch": 1.692753623188406, + "grad_norm": 0.9125918745994568, + "learning_rate": 5.241835432246888e-07, + "loss": 0.9749, + "step": 590 + }, + { + "epoch": 1.6956521739130435, + "grad_norm": 0.8972467184066772, + "learning_rate": 5.139348144127237e-07, + "loss": 1.0084, + "step": 591 + }, + { + "epoch": 1.6985507246376812, + "grad_norm": 0.7566870450973511, + "learning_rate": 5.037818373523723e-07, + "loss": 0.9932, + "step": 592 + }, + { + "epoch": 1.701449275362319, + "grad_norm": 0.8601511716842651, + "learning_rate": 4.937248287514407e-07, + "loss": 0.9747, + "step": 593 + }, + { + "epoch": 1.7043478260869565, + "grad_norm": 0.8272446393966675, + "learning_rate": 4.837640032693558e-07, + "loss": 1.0065, + "step": 594 + }, + { + "epoch": 1.7072463768115942, + "grad_norm": 0.7029653191566467, + "learning_rate": 4.738995735125895e-07, + "loss": 0.9384, + "step": 595 + }, + { + "epoch": 1.710144927536232, + "grad_norm": 0.913718044757843, + "learning_rate": 4.641317500301173e-07, + "loss": 0.9563, + "step": 596 + }, + { + "epoch": 1.7130434782608694, + "grad_norm": 0.9736040830612183, + "learning_rate": 4.5446074130892525e-07, + "loss": 0.9455, + "step": 597 + }, + { + "epoch": 1.7159420289855074, + "grad_norm": 0.8182763457298279, + "learning_rate": 4.448867537695578e-07, + "loss": 0.944, + "step": 598 + }, + { + "epoch": 1.718840579710145, + "grad_norm": 0.8536428213119507, + "learning_rate": 4.3540999176171717e-07, + "loss": 0.9029, + "step": 599 + }, + { + "epoch": 1.7217391304347827, + "grad_norm": 0.8713299036026001, + "learning_rate": 4.2603065755989493e-07, + "loss": 0.9448, + "step": 600 + }, + { + "epoch": 1.7246376811594204, + "grad_norm": 0.9857087135314941, + "learning_rate": 4.167489513590611e-07, + "loss": 1.0004, + "step": 601 + }, + { + "epoch": 1.727536231884058, + "grad_norm": 0.9195379018783569, + "learning_rate": 4.0756507127038494e-07, + "loss": 1.0247, + "step": 602 + }, + { + "epoch": 1.7304347826086957, + "grad_norm": 0.8422645926475525, + "learning_rate": 3.984792133170129e-07, + "loss": 1.0087, + "step": 603 + }, + { + "epoch": 1.7333333333333334, + "grad_norm": 0.8902682662010193, + "learning_rate": 3.894915714298775e-07, + "loss": 0.8793, + "step": 604 + }, + { + "epoch": 1.736231884057971, + "grad_norm": 0.8859000205993652, + "learning_rate": 3.8060233744356634e-07, + "loss": 1.0018, + "step": 605 + }, + { + "epoch": 1.7391304347826086, + "grad_norm": 0.8340051174163818, + "learning_rate": 3.71811701092219e-07, + "loss": 0.9534, + "step": 606 + }, + { + "epoch": 1.7420289855072464, + "grad_norm": 0.8677003979682922, + "learning_rate": 3.6311985000548223e-07, + "loss": 0.9525, + "step": 607 + }, + { + "epoch": 1.744927536231884, + "grad_norm": 0.932613730430603, + "learning_rate": 3.5452696970450674e-07, + "loss": 0.9257, + "step": 608 + }, + { + "epoch": 1.7478260869565219, + "grad_norm": 0.9657606482505798, + "learning_rate": 3.4603324359798016e-07, + "loss": 1.0033, + "step": 609 + }, + { + "epoch": 1.7478260869565219, + "eval_loss": 0.9723503589630127, + "eval_runtime": 46.2237, + "eval_samples_per_second": 5.538, + "eval_steps_per_second": 0.692, + "step": 609 + }, + { + "epoch": 1.7507246376811594, + "grad_norm": 0.860346257686615, + "learning_rate": 3.3763885297822153e-07, + "loss": 0.986, + "step": 610 + }, + { + "epoch": 1.7536231884057971, + "grad_norm": 0.8614711165428162, + "learning_rate": 3.293439770173046e-07, + "loss": 0.9976, + "step": 611 + }, + { + "epoch": 1.7565217391304349, + "grad_norm": 0.7311533689498901, + "learning_rate": 3.2114879276323783e-07, + "loss": 0.908, + "step": 612 + }, + { + "epoch": 1.7594202898550724, + "grad_norm": 0.9412534236907959, + "learning_rate": 3.130534751361808e-07, + "loss": 0.977, + "step": 613 + }, + { + "epoch": 1.76231884057971, + "grad_norm": 0.911098062992096, + "learning_rate": 3.0505819692471797e-07, + "loss": 0.9387, + "step": 614 + }, + { + "epoch": 1.7652173913043478, + "grad_norm": 0.8363705277442932, + "learning_rate": 2.9716312878216194e-07, + "loss": 0.9538, + "step": 615 + }, + { + "epoch": 1.7681159420289854, + "grad_norm": 0.9569475650787354, + "learning_rate": 2.893684392229185e-07, + "loss": 0.998, + "step": 616 + }, + { + "epoch": 1.7710144927536233, + "grad_norm": 0.8830727338790894, + "learning_rate": 2.8167429461888496e-07, + "loss": 0.9277, + "step": 617 + }, + { + "epoch": 1.7739130434782608, + "grad_norm": 0.9968934059143066, + "learning_rate": 2.7408085919590265e-07, + "loss": 1.0167, + "step": 618 + }, + { + "epoch": 1.7768115942028986, + "grad_norm": 0.7348361611366272, + "learning_rate": 2.6658829503024566e-07, + "loss": 0.9224, + "step": 619 + }, + { + "epoch": 1.7797101449275363, + "grad_norm": 0.9676991701126099, + "learning_rate": 2.5919676204517073e-07, + "loss": 0.9808, + "step": 620 + }, + { + "epoch": 1.7826086956521738, + "grad_norm": 0.8737136125564575, + "learning_rate": 2.5190641800749424e-07, + "loss": 0.9436, + "step": 621 + }, + { + "epoch": 1.7855072463768116, + "grad_norm": 0.8523948192596436, + "learning_rate": 2.447174185242324e-07, + "loss": 0.952, + "step": 622 + }, + { + "epoch": 1.7884057971014493, + "grad_norm": 0.7342602610588074, + "learning_rate": 2.3762991703927375e-07, + "loss": 0.9682, + "step": 623 + }, + { + "epoch": 1.7913043478260868, + "grad_norm": 1.044270634651184, + "learning_rate": 2.3064406483010947e-07, + "loss": 0.9725, + "step": 624 + }, + { + "epoch": 1.7942028985507248, + "grad_norm": 0.9236974120140076, + "learning_rate": 2.237600110046001e-07, + "loss": 0.951, + "step": 625 + }, + { + "epoch": 1.7971014492753623, + "grad_norm": 0.7988727688789368, + "learning_rate": 2.1697790249779638e-07, + "loss": 0.8851, + "step": 626 + }, + { + "epoch": 1.8, + "grad_norm": 0.7906875014305115, + "learning_rate": 2.102978840687997e-07, + "loss": 0.9162, + "step": 627 + }, + { + "epoch": 1.8028985507246378, + "grad_norm": 0.7702775001525879, + "learning_rate": 2.0372009829767558e-07, + "loss": 0.9614, + "step": 628 + }, + { + "epoch": 1.8057971014492753, + "grad_norm": 0.9317652583122253, + "learning_rate": 1.9724468558240838e-07, + "loss": 0.9105, + "step": 629 + }, + { + "epoch": 1.808695652173913, + "grad_norm": 0.855368435382843, + "learning_rate": 1.908717841359048e-07, + "loss": 1.0019, + "step": 630 + }, + { + "epoch": 1.8115942028985508, + "grad_norm": 0.761951744556427, + "learning_rate": 1.8460152998304393e-07, + "loss": 0.9267, + "step": 631 + }, + { + "epoch": 1.8144927536231883, + "grad_norm": 0.8468912839889526, + "learning_rate": 1.7843405695777582e-07, + "loss": 1.0065, + "step": 632 + }, + { + "epoch": 1.8173913043478263, + "grad_norm": 0.889159619808197, + "learning_rate": 1.7236949670026037e-07, + "loss": 0.9332, + "step": 633 + }, + { + "epoch": 1.8202898550724638, + "grad_norm": 0.8339653015136719, + "learning_rate": 1.664079786540629e-07, + "loss": 0.9851, + "step": 634 + }, + { + "epoch": 1.8231884057971013, + "grad_norm": 0.7670577764511108, + "learning_rate": 1.6054963006338742e-07, + "loss": 0.9354, + "step": 635 + }, + { + "epoch": 1.8260869565217392, + "grad_norm": 0.8923590183258057, + "learning_rate": 1.547945759703623e-07, + "loss": 1.0162, + "step": 636 + }, + { + "epoch": 1.8289855072463768, + "grad_norm": 0.7903847098350525, + "learning_rate": 1.491429392123711e-07, + "loss": 0.979, + "step": 637 + }, + { + "epoch": 1.8318840579710145, + "grad_norm": 0.9351047873497009, + "learning_rate": 1.435948404194304e-07, + "loss": 0.9458, + "step": 638 + }, + { + "epoch": 1.8347826086956522, + "grad_norm": 0.8081286549568176, + "learning_rate": 1.3815039801161723e-07, + "loss": 0.9246, + "step": 639 + }, + { + "epoch": 1.8376811594202898, + "grad_norm": 0.752216100692749, + "learning_rate": 1.328097281965357e-07, + "loss": 0.9758, + "step": 640 + }, + { + "epoch": 1.8405797101449275, + "grad_norm": 0.9659929871559143, + "learning_rate": 1.2757294496684447e-07, + "loss": 1.0107, + "step": 641 + }, + { + "epoch": 1.8434782608695652, + "grad_norm": 1.0376217365264893, + "learning_rate": 1.22440160097817e-07, + "loss": 0.9631, + "step": 642 + }, + { + "epoch": 1.8463768115942027, + "grad_norm": 0.9361832141876221, + "learning_rate": 1.1741148314495965e-07, + "loss": 0.9867, + "step": 643 + }, + { + "epoch": 1.8492753623188407, + "grad_norm": 0.8664498329162598, + "learning_rate": 1.1248702144167123e-07, + "loss": 0.9703, + "step": 644 + }, + { + "epoch": 1.8521739130434782, + "grad_norm": 0.9653159379959106, + "learning_rate": 1.0766688009695548e-07, + "loss": 0.9662, + "step": 645 + }, + { + "epoch": 1.855072463768116, + "grad_norm": 1.0553069114685059, + "learning_rate": 1.0295116199317057e-07, + "loss": 0.9745, + "step": 646 + }, + { + "epoch": 1.8579710144927537, + "grad_norm": 0.9453853964805603, + "learning_rate": 9.833996778384259e-08, + "loss": 0.9802, + "step": 647 + }, + { + "epoch": 1.8608695652173912, + "grad_norm": 0.7949392795562744, + "learning_rate": 9.383339589150776e-08, + "loss": 0.9173, + "step": 648 + }, + { + "epoch": 1.863768115942029, + "grad_norm": 0.7941511273384094, + "learning_rate": 8.943154250562025e-08, + "loss": 0.9633, + "step": 649 + }, + { + "epoch": 1.8666666666666667, + "grad_norm": 0.8360518217086792, + "learning_rate": 8.513450158049109e-08, + "loss": 0.9565, + "step": 650 + }, + { + "epoch": 1.8695652173913042, + "grad_norm": 0.9996237754821777, + "learning_rate": 8.094236483329022e-08, + "loss": 0.9999, + "step": 651 + }, + { + "epoch": 1.8724637681159422, + "grad_norm": 0.7493065595626831, + "learning_rate": 7.685522174208205e-08, + "loss": 0.9733, + "step": 652 + }, + { + "epoch": 1.8753623188405797, + "grad_norm": 0.8603729605674744, + "learning_rate": 7.287315954392137e-08, + "loss": 0.9624, + "step": 653 + }, + { + "epoch": 1.8782608695652174, + "grad_norm": 0.7145766615867615, + "learning_rate": 6.899626323298714e-08, + "loss": 1.0049, + "step": 654 + }, + { + "epoch": 1.8811594202898552, + "grad_norm": 0.9684036374092102, + "learning_rate": 6.522461555877213e-08, + "loss": 0.9562, + "step": 655 + }, + { + "epoch": 1.8840579710144927, + "grad_norm": 0.8989734053611755, + "learning_rate": 6.15582970243117e-08, + "loss": 1.0268, + "step": 656 + }, + { + "epoch": 1.8869565217391304, + "grad_norm": 0.9243214726448059, + "learning_rate": 5.799738588447068e-08, + "loss": 0.9643, + "step": 657 + }, + { + "epoch": 1.8898550724637682, + "grad_norm": 0.9879785776138306, + "learning_rate": 5.454195814427021e-08, + "loss": 0.9417, + "step": 658 + }, + { + "epoch": 1.8927536231884057, + "grad_norm": 0.9754204154014587, + "learning_rate": 5.119208755726579e-08, + "loss": 1.063, + "step": 659 + }, + { + "epoch": 1.8956521739130436, + "grad_norm": 0.7662235498428345, + "learning_rate": 4.794784562397459e-08, + "loss": 0.9799, + "step": 660 + }, + { + "epoch": 1.8985507246376812, + "grad_norm": 0.8312128782272339, + "learning_rate": 4.4809301590345576e-08, + "loss": 0.9671, + "step": 661 + }, + { + "epoch": 1.901449275362319, + "grad_norm": 0.8354112505912781, + "learning_rate": 4.177652244628627e-08, + "loss": 0.9688, + "step": 662 + }, + { + "epoch": 1.9043478260869566, + "grad_norm": 0.9401686191558838, + "learning_rate": 3.884957292422997e-08, + "loss": 0.9989, + "step": 663 + }, + { + "epoch": 1.9072463768115941, + "grad_norm": 0.8864877820014954, + "learning_rate": 3.602851549775521e-08, + "loss": 1.0094, + "step": 664 + }, + { + "epoch": 1.9101449275362319, + "grad_norm": 0.9440781474113464, + "learning_rate": 3.3313410380250157e-08, + "loss": 0.9544, + "step": 665 + }, + { + "epoch": 1.9130434782608696, + "grad_norm": 1.0098837614059448, + "learning_rate": 3.0704315523631956e-08, + "loss": 0.9209, + "step": 666 + }, + { + "epoch": 1.9159420289855071, + "grad_norm": 0.9735342860221863, + "learning_rate": 2.8201286617103863e-08, + "loss": 1.0385, + "step": 667 + }, + { + "epoch": 1.9188405797101449, + "grad_norm": 0.9122427105903625, + "learning_rate": 2.5804377085972278e-08, + "loss": 0.9844, + "step": 668 + }, + { + "epoch": 1.9217391304347826, + "grad_norm": 0.8491829633712769, + "learning_rate": 2.351363809050211e-08, + "loss": 1.0045, + "step": 669 + }, + { + "epoch": 1.9246376811594201, + "grad_norm": 0.83339524269104, + "learning_rate": 2.1329118524827662e-08, + "loss": 0.9844, + "step": 670 + }, + { + "epoch": 1.927536231884058, + "grad_norm": 0.9295774102210999, + "learning_rate": 1.9250865015906784e-08, + "loss": 1.0247, + "step": 671 + }, + { + "epoch": 1.9304347826086956, + "grad_norm": 0.8484298586845398, + "learning_rate": 1.7278921922527224e-08, + "loss": 1.0195, + "step": 672 + }, + { + "epoch": 1.9333333333333333, + "grad_norm": 0.8862564563751221, + "learning_rate": 1.541333133436018e-08, + "loss": 0.9827, + "step": 673 + }, + { + "epoch": 1.936231884057971, + "grad_norm": 0.8401779532432556, + "learning_rate": 1.3654133071059894e-08, + "loss": 1.0295, + "step": 674 + }, + { + "epoch": 1.9391304347826086, + "grad_norm": 0.8818807005882263, + "learning_rate": 1.200136468141544e-08, + "loss": 0.9554, + "step": 675 + }, + { + "epoch": 1.9420289855072463, + "grad_norm": 0.8366807699203491, + "learning_rate": 1.0455061442548597e-08, + "loss": 0.9771, + "step": 676 + }, + { + "epoch": 1.944927536231884, + "grad_norm": 0.8115973472595215, + "learning_rate": 9.015256359161118e-09, + "loss": 1.0364, + "step": 677 + }, + { + "epoch": 1.9478260869565216, + "grad_norm": 0.925413191318512, + "learning_rate": 7.681980162830283e-09, + "loss": 1.0026, + "step": 678 + }, + { + "epoch": 1.9507246376811596, + "grad_norm": 0.8799839615821838, + "learning_rate": 6.455261311352767e-09, + "loss": 1.0164, + "step": 679 + }, + { + "epoch": 1.953623188405797, + "grad_norm": 0.8579555153846741, + "learning_rate": 5.3351259881379016e-09, + "loss": 0.9775, + "step": 680 + }, + { + "epoch": 1.9565217391304348, + "grad_norm": 0.8572901487350464, + "learning_rate": 4.321598101647007e-09, + "loss": 0.9926, + "step": 681 + }, + { + "epoch": 1.9594202898550726, + "grad_norm": 0.7731289863586426, + "learning_rate": 3.41469928488547e-09, + "loss": 1.0126, + "step": 682 + }, + { + "epoch": 1.96231884057971, + "grad_norm": 0.937656581401825, + "learning_rate": 2.6144488949392253e-09, + "loss": 0.9443, + "step": 683 + }, + { + "epoch": 1.9652173913043478, + "grad_norm": 0.8993798494338989, + "learning_rate": 1.9208640125628618e-09, + "loss": 0.946, + "step": 684 + }, + { + "epoch": 1.9681159420289855, + "grad_norm": 0.9831903576850891, + "learning_rate": 1.3339594418138036e-09, + "loss": 0.9799, + "step": 685 + }, + { + "epoch": 1.971014492753623, + "grad_norm": 0.9224021434783936, + "learning_rate": 8.537477097364522e-10, + "loss": 0.9299, + "step": 686 + }, + { + "epoch": 1.973913043478261, + "grad_norm": 0.8220890760421753, + "learning_rate": 4.802390660968437e-10, + "loss": 1.0307, + "step": 687 + }, + { + "epoch": 1.9768115942028985, + "grad_norm": 1.0893397331237793, + "learning_rate": 2.1344148316060352e-10, + "loss": 0.9523, + "step": 688 + }, + { + "epoch": 1.9797101449275363, + "grad_norm": 0.8536267280578613, + "learning_rate": 5.336065552641323e-11, + "loss": 0.9675, + "step": 689 + }, + { + "epoch": 1.982608695652174, + "grad_norm": 0.8123190999031067, + "learning_rate": 0.0, + "loss": 0.9576, + "step": 690 + } + ], + "logging_steps": 1, + "max_steps": 690, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 173, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 3.816855525560156e+18, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}