{ "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 500, "global_step": 1320, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0022727272727272726, "grad_norm": null, "learning_rate": 0.0001, "loss": 3.1855, "step": 1 }, { "epoch": 0.004545454545454545, "grad_norm": null, "learning_rate": 0.0001, "loss": 4.3007, "step": 2 }, { "epoch": 0.006818181818181818, "grad_norm": null, "learning_rate": 0.0001, "loss": 4.3188, "step": 3 }, { "epoch": 0.00909090909090909, "grad_norm": 26.366512298583984, "learning_rate": 9.992424242424244e-05, "loss": 3.6582, "step": 4 }, { "epoch": 0.011363636363636364, "grad_norm": 29.344751358032227, "learning_rate": 9.984848484848486e-05, "loss": 4.2964, "step": 5 }, { "epoch": 0.013636363636363636, "grad_norm": 29.519277572631836, "learning_rate": 9.977272727272728e-05, "loss": 4.0004, "step": 6 }, { "epoch": 0.015909090909090907, "grad_norm": 24.204898834228516, "learning_rate": 9.96969696969697e-05, "loss": 3.2453, "step": 7 }, { "epoch": 0.01818181818181818, "grad_norm": 23.69887351989746, "learning_rate": 9.962121212121213e-05, "loss": 2.7972, "step": 8 }, { "epoch": 0.020454545454545454, "grad_norm": 52.371498107910156, "learning_rate": 9.954545454545455e-05, "loss": 2.5971, "step": 9 }, { "epoch": 0.022727272727272728, "grad_norm": 41.59567642211914, "learning_rate": 9.946969696969698e-05, "loss": 3.3081, "step": 10 }, { "epoch": 0.025, "grad_norm": 27.913963317871094, "learning_rate": 9.939393939393939e-05, "loss": 3.5977, "step": 11 }, { "epoch": 0.02727272727272727, "grad_norm": 21.261117935180664, "learning_rate": 9.931818181818182e-05, "loss": 3.3403, "step": 12 }, { "epoch": 0.029545454545454545, "grad_norm": 20.344589233398438, "learning_rate": 9.924242424242425e-05, "loss": 2.0478, "step": 13 }, { "epoch": 0.031818181818181815, "grad_norm": 32.50373077392578, "learning_rate": 9.916666666666667e-05, "loss": 3.0773, 
"step": 14 }, { "epoch": 0.03409090909090909, "grad_norm": 21.426048278808594, "learning_rate": 9.909090909090911e-05, "loss": 2.8572, "step": 15 }, { "epoch": 0.03636363636363636, "grad_norm": 27.847314834594727, "learning_rate": 9.901515151515151e-05, "loss": 3.129, "step": 16 }, { "epoch": 0.038636363636363635, "grad_norm": 23.516616821289062, "learning_rate": 9.893939393939395e-05, "loss": 3.3971, "step": 17 }, { "epoch": 0.04090909090909091, "grad_norm": 29.170352935791016, "learning_rate": 9.886363636363637e-05, "loss": 3.6325, "step": 18 }, { "epoch": 0.04318181818181818, "grad_norm": 21.103153228759766, "learning_rate": 9.87878787878788e-05, "loss": 2.7935, "step": 19 }, { "epoch": 0.045454545454545456, "grad_norm": 25.863285064697266, "learning_rate": 9.871212121212122e-05, "loss": 2.0675, "step": 20 }, { "epoch": 0.04772727272727273, "grad_norm": 25.554828643798828, "learning_rate": 9.863636363636364e-05, "loss": 2.8331, "step": 21 }, { "epoch": 0.05, "grad_norm": 26.424827575683594, "learning_rate": 9.856060606060607e-05, "loss": 4.0934, "step": 22 }, { "epoch": 0.05227272727272727, "grad_norm": 40.84152603149414, "learning_rate": 9.848484848484849e-05, "loss": 2.7315, "step": 23 }, { "epoch": 0.05454545454545454, "grad_norm": 17.789630889892578, "learning_rate": 9.840909090909092e-05, "loss": 2.5798, "step": 24 }, { "epoch": 0.056818181818181816, "grad_norm": 15.23817253112793, "learning_rate": 9.833333333333333e-05, "loss": 2.3981, "step": 25 }, { "epoch": 0.05909090909090909, "grad_norm": 17.333356857299805, "learning_rate": 9.825757575757576e-05, "loss": 2.0097, "step": 26 }, { "epoch": 0.06136363636363636, "grad_norm": 17.358461380004883, "learning_rate": 9.818181818181818e-05, "loss": 1.5636, "step": 27 }, { "epoch": 0.06363636363636363, "grad_norm": 15.479598999023438, "learning_rate": 9.810606060606061e-05, "loss": 2.3064, "step": 28 }, { "epoch": 0.0659090909090909, "grad_norm": 18.889394760131836, "learning_rate": 9.803030303030303e-05, "loss": 
1.6592, "step": 29 }, { "epoch": 0.06818181818181818, "grad_norm": 19.264772415161133, "learning_rate": 9.795454545454545e-05, "loss": 2.9327, "step": 30 }, { "epoch": 0.07045454545454545, "grad_norm": 19.369556427001953, "learning_rate": 9.787878787878789e-05, "loss": 3.2685, "step": 31 }, { "epoch": 0.07272727272727272, "grad_norm": 20.017459869384766, "learning_rate": 9.78030303030303e-05, "loss": 3.4532, "step": 32 }, { "epoch": 0.075, "grad_norm": 18.956012725830078, "learning_rate": 9.772727272727274e-05, "loss": 2.2143, "step": 33 }, { "epoch": 0.07727272727272727, "grad_norm": 15.438785552978516, "learning_rate": 9.765151515151516e-05, "loss": 2.407, "step": 34 }, { "epoch": 0.07954545454545454, "grad_norm": 22.79155921936035, "learning_rate": 9.757575757575758e-05, "loss": 3.1064, "step": 35 }, { "epoch": 0.08181818181818182, "grad_norm": 15.908382415771484, "learning_rate": 9.75e-05, "loss": 2.9192, "step": 36 }, { "epoch": 0.08409090909090909, "grad_norm": 21.536775588989258, "learning_rate": 9.742424242424243e-05, "loss": 2.8127, "step": 37 }, { "epoch": 0.08636363636363636, "grad_norm": 19.644390106201172, "learning_rate": 9.734848484848485e-05, "loss": 1.704, "step": 38 }, { "epoch": 0.08863636363636364, "grad_norm": 20.067602157592773, "learning_rate": 9.727272727272728e-05, "loss": 2.3733, "step": 39 }, { "epoch": 0.09090909090909091, "grad_norm": 16.551055908203125, "learning_rate": 9.71969696969697e-05, "loss": 2.2413, "step": 40 }, { "epoch": 0.09318181818181819, "grad_norm": 18.292987823486328, "learning_rate": 9.712121212121212e-05, "loss": 2.6103, "step": 41 }, { "epoch": 0.09545454545454546, "grad_norm": 15.751124382019043, "learning_rate": 9.704545454545456e-05, "loss": 1.5648, "step": 42 }, { "epoch": 0.09772727272727273, "grad_norm": 25.068395614624023, "learning_rate": 9.696969696969698e-05, "loss": 1.8226, "step": 43 }, { "epoch": 0.1, "grad_norm": 25.069040298461914, "learning_rate": 9.689393939393941e-05, "loss": 3.8825, "step": 44 }, 
{ "epoch": 0.10227272727272728, "grad_norm": 20.751232147216797, "learning_rate": 9.681818181818181e-05, "loss": 2.9331, "step": 45 }, { "epoch": 0.10454545454545454, "grad_norm": 23.918386459350586, "learning_rate": 9.674242424242425e-05, "loss": 3.0365, "step": 46 }, { "epoch": 0.10681818181818181, "grad_norm": 16.94843864440918, "learning_rate": 9.666666666666667e-05, "loss": 2.0012, "step": 47 }, { "epoch": 0.10909090909090909, "grad_norm": 38.2060432434082, "learning_rate": 9.65909090909091e-05, "loss": 2.4087, "step": 48 }, { "epoch": 0.11136363636363636, "grad_norm": 15.836068153381348, "learning_rate": 9.651515151515152e-05, "loss": 2.7204, "step": 49 }, { "epoch": 0.11363636363636363, "grad_norm": 20.13130760192871, "learning_rate": 9.643939393939394e-05, "loss": 1.8803, "step": 50 }, { "epoch": 0.1159090909090909, "grad_norm": 21.58964729309082, "learning_rate": 9.636363636363637e-05, "loss": 2.2448, "step": 51 }, { "epoch": 0.11818181818181818, "grad_norm": 15.996927261352539, "learning_rate": 9.628787878787879e-05, "loss": 2.456, "step": 52 }, { "epoch": 0.12045454545454545, "grad_norm": 15.738017082214355, "learning_rate": 9.621212121212123e-05, "loss": 2.0494, "step": 53 }, { "epoch": 0.12272727272727273, "grad_norm": 20.54029655456543, "learning_rate": 9.613636363636363e-05, "loss": 2.8584, "step": 54 }, { "epoch": 0.125, "grad_norm": 20.11783790588379, "learning_rate": 9.606060606060606e-05, "loss": 2.9836, "step": 55 }, { "epoch": 0.12727272727272726, "grad_norm": 15.297281265258789, "learning_rate": 9.598484848484848e-05, "loss": 1.8828, "step": 56 }, { "epoch": 0.12954545454545455, "grad_norm": 15.26744270324707, "learning_rate": 9.590909090909092e-05, "loss": 1.2548, "step": 57 }, { "epoch": 0.1318181818181818, "grad_norm": 18.839954376220703, "learning_rate": 9.583333333333334e-05, "loss": 3.7553, "step": 58 }, { "epoch": 0.1340909090909091, "grad_norm": 17.30214500427246, "learning_rate": 9.575757575757576e-05, "loss": 2.2297, "step": 59 }, { 
"epoch": 0.13636363636363635, "grad_norm": 25.153942108154297, "learning_rate": 9.568181818181819e-05, "loss": 2.6817, "step": 60 }, { "epoch": 0.13863636363636364, "grad_norm": 17.55406379699707, "learning_rate": 9.560606060606061e-05, "loss": 2.8551, "step": 61 }, { "epoch": 0.1409090909090909, "grad_norm": null, "learning_rate": 9.560606060606061e-05, "loss": 2.4352, "step": 62 }, { "epoch": 0.1431818181818182, "grad_norm": 18.4881649017334, "learning_rate": 9.553030303030304e-05, "loss": 2.1839, "step": 63 }, { "epoch": 0.14545454545454545, "grad_norm": 15.114643096923828, "learning_rate": 9.545454545454546e-05, "loss": 1.768, "step": 64 }, { "epoch": 0.14772727272727273, "grad_norm": 17.272735595703125, "learning_rate": 9.537878787878788e-05, "loss": 2.4241, "step": 65 }, { "epoch": 0.15, "grad_norm": 18.25682258605957, "learning_rate": 9.53030303030303e-05, "loss": 1.8703, "step": 66 }, { "epoch": 0.15227272727272728, "grad_norm": 20.255084991455078, "learning_rate": 9.522727272727273e-05, "loss": 2.3706, "step": 67 }, { "epoch": 0.15454545454545454, "grad_norm": 16.153093338012695, "learning_rate": 9.515151515151515e-05, "loss": 2.3896, "step": 68 }, { "epoch": 0.15681818181818183, "grad_norm": 14.229001998901367, "learning_rate": 9.507575757575759e-05, "loss": 2.5261, "step": 69 }, { "epoch": 0.1590909090909091, "grad_norm": 14.036202430725098, "learning_rate": 9.5e-05, "loss": 1.8918, "step": 70 }, { "epoch": 0.16136363636363638, "grad_norm": 16.262582778930664, "learning_rate": 9.492424242424242e-05, "loss": 2.7854, "step": 71 }, { "epoch": 0.16363636363636364, "grad_norm": 17.119918823242188, "learning_rate": 9.484848484848486e-05, "loss": 2.1371, "step": 72 }, { "epoch": 0.16590909090909092, "grad_norm": 19.72575569152832, "learning_rate": 9.477272727272728e-05, "loss": 2.6801, "step": 73 }, { "epoch": 0.16818181818181818, "grad_norm": 17.036550521850586, "learning_rate": 9.469696969696971e-05, "loss": 2.6403, "step": 74 }, { "epoch": 
0.17045454545454544, "grad_norm": 14.31810188293457, "learning_rate": 9.462121212121212e-05, "loss": 1.9865, "step": 75 }, { "epoch": 0.17272727272727273, "grad_norm": 18.39834213256836, "learning_rate": 9.454545454545455e-05, "loss": 2.418, "step": 76 }, { "epoch": 0.175, "grad_norm": 18.37046241760254, "learning_rate": 9.446969696969697e-05, "loss": 2.2905, "step": 77 }, { "epoch": 0.17727272727272728, "grad_norm": 14.999472618103027, "learning_rate": 9.43939393939394e-05, "loss": 2.2521, "step": 78 }, { "epoch": 0.17954545454545454, "grad_norm": 11.88487434387207, "learning_rate": 9.431818181818182e-05, "loss": 2.2871, "step": 79 }, { "epoch": 0.18181818181818182, "grad_norm": 21.745532989501953, "learning_rate": 9.424242424242424e-05, "loss": 2.6415, "step": 80 }, { "epoch": 0.18409090909090908, "grad_norm": 13.109172821044922, "learning_rate": 9.416666666666667e-05, "loss": 1.9554, "step": 81 }, { "epoch": 0.18636363636363637, "grad_norm": 17.222652435302734, "learning_rate": 9.40909090909091e-05, "loss": 2.0914, "step": 82 }, { "epoch": 0.18863636363636363, "grad_norm": 17.833839416503906, "learning_rate": 9.401515151515153e-05, "loss": 1.6032, "step": 83 }, { "epoch": 0.19090909090909092, "grad_norm": 22.737525939941406, "learning_rate": 9.393939393939395e-05, "loss": 3.5915, "step": 84 }, { "epoch": 0.19318181818181818, "grad_norm": 14.926959037780762, "learning_rate": 9.386363636363637e-05, "loss": 2.2499, "step": 85 }, { "epoch": 0.19545454545454546, "grad_norm": 13.586040496826172, "learning_rate": 9.378787878787879e-05, "loss": 1.8228, "step": 86 }, { "epoch": 0.19772727272727272, "grad_norm": 19.175617218017578, "learning_rate": 9.371212121212122e-05, "loss": 2.7846, "step": 87 }, { "epoch": 0.2, "grad_norm": 21.078235626220703, "learning_rate": 9.363636363636364e-05, "loss": 2.7906, "step": 88 }, { "epoch": 0.20227272727272727, "grad_norm": 17.618940353393555, "learning_rate": 9.356060606060606e-05, "loss": 2.3022, "step": 89 }, { "epoch": 
0.20454545454545456, "grad_norm": 16.79983139038086, "learning_rate": 9.348484848484849e-05, "loss": 1.8126, "step": 90 }, { "epoch": 0.20681818181818182, "grad_norm": 20.444580078125, "learning_rate": 9.340909090909091e-05, "loss": 2.055, "step": 91 }, { "epoch": 0.20909090909090908, "grad_norm": 18.694856643676758, "learning_rate": 9.333333333333334e-05, "loss": 2.6534, "step": 92 }, { "epoch": 0.21136363636363636, "grad_norm": 11.254834175109863, "learning_rate": 9.325757575757576e-05, "loss": 1.6695, "step": 93 }, { "epoch": 0.21363636363636362, "grad_norm": 14.369203567504883, "learning_rate": 9.318181818181818e-05, "loss": 2.3469, "step": 94 }, { "epoch": 0.2159090909090909, "grad_norm": 17.27039909362793, "learning_rate": 9.31060606060606e-05, "loss": 1.9188, "step": 95 }, { "epoch": 0.21818181818181817, "grad_norm": 12.644415855407715, "learning_rate": 9.303030303030303e-05, "loss": 1.3295, "step": 96 }, { "epoch": 0.22045454545454546, "grad_norm": 20.46677589416504, "learning_rate": 9.295454545454545e-05, "loss": 2.4697, "step": 97 }, { "epoch": 0.22272727272727272, "grad_norm": 15.218058586120605, "learning_rate": 9.287878787878789e-05, "loss": 2.4472, "step": 98 }, { "epoch": 0.225, "grad_norm": 14.982362747192383, "learning_rate": 9.280303030303031e-05, "loss": 1.881, "step": 99 }, { "epoch": 0.22727272727272727, "grad_norm": 20.168306350708008, "learning_rate": 9.272727272727273e-05, "loss": 1.6077, "step": 100 }, { "epoch": 0.22954545454545455, "grad_norm": 13.462889671325684, "learning_rate": 9.265151515151516e-05, "loss": 1.6057, "step": 101 }, { "epoch": 0.2318181818181818, "grad_norm": 12.3695068359375, "learning_rate": 9.257575757575758e-05, "loss": 2.0871, "step": 102 }, { "epoch": 0.2340909090909091, "grad_norm": 15.381841659545898, "learning_rate": 9.250000000000001e-05, "loss": 2.0592, "step": 103 }, { "epoch": 0.23636363636363636, "grad_norm": 18.213014602661133, "learning_rate": 9.242424242424242e-05, "loss": 2.2397, "step": 104 }, { 
"epoch": 0.23863636363636365, "grad_norm": 19.589962005615234, "learning_rate": 9.234848484848485e-05, "loss": 2.8305, "step": 105 }, { "epoch": 0.2409090909090909, "grad_norm": 21.765127182006836, "learning_rate": 9.227272727272727e-05, "loss": 1.8691, "step": 106 }, { "epoch": 0.2431818181818182, "grad_norm": 21.66250228881836, "learning_rate": 9.21969696969697e-05, "loss": 2.7176, "step": 107 }, { "epoch": 0.24545454545454545, "grad_norm": 16.438037872314453, "learning_rate": 9.212121212121214e-05, "loss": 3.0262, "step": 108 }, { "epoch": 0.24772727272727274, "grad_norm": 18.32391357421875, "learning_rate": 9.204545454545454e-05, "loss": 2.4011, "step": 109 }, { "epoch": 0.25, "grad_norm": 18.3424015045166, "learning_rate": 9.196969696969698e-05, "loss": 3.3481, "step": 110 }, { "epoch": 0.25227272727272726, "grad_norm": 12.168206214904785, "learning_rate": 9.18939393939394e-05, "loss": 1.5084, "step": 111 }, { "epoch": 0.2545454545454545, "grad_norm": 16.183521270751953, "learning_rate": 9.181818181818183e-05, "loss": 3.3444, "step": 112 }, { "epoch": 0.25681818181818183, "grad_norm": 17.887187957763672, "learning_rate": 9.174242424242425e-05, "loss": 2.4529, "step": 113 }, { "epoch": 0.2590909090909091, "grad_norm": 18.000579833984375, "learning_rate": 9.166666666666667e-05, "loss": 2.3228, "step": 114 }, { "epoch": 0.26136363636363635, "grad_norm": 15.579062461853027, "learning_rate": 9.159090909090909e-05, "loss": 3.2008, "step": 115 }, { "epoch": 0.2636363636363636, "grad_norm": 14.111518859863281, "learning_rate": 9.151515151515152e-05, "loss": 2.2286, "step": 116 }, { "epoch": 0.26590909090909093, "grad_norm": 13.755249977111816, "learning_rate": 9.143939393939395e-05, "loss": 1.9561, "step": 117 }, { "epoch": 0.2681818181818182, "grad_norm": 14.665258407592773, "learning_rate": 9.136363636363637e-05, "loss": 2.5016, "step": 118 }, { "epoch": 0.27045454545454545, "grad_norm": 14.470067024230957, "learning_rate": 9.128787878787879e-05, "loss": 2.3301, 
"step": 119 }, { "epoch": 0.2727272727272727, "grad_norm": 15.108169555664062, "learning_rate": 9.121212121212121e-05, "loss": 2.6079, "step": 120 }, { "epoch": 0.275, "grad_norm": 15.080549240112305, "learning_rate": 9.113636363636365e-05, "loss": 2.6349, "step": 121 }, { "epoch": 0.2772727272727273, "grad_norm": 17.71773910522461, "learning_rate": 9.106060606060606e-05, "loss": 1.9447, "step": 122 }, { "epoch": 0.27954545454545454, "grad_norm": 11.128664016723633, "learning_rate": 9.098484848484848e-05, "loss": 2.2076, "step": 123 }, { "epoch": 0.2818181818181818, "grad_norm": 19.131866455078125, "learning_rate": 9.090909090909092e-05, "loss": 1.5932, "step": 124 }, { "epoch": 0.2840909090909091, "grad_norm": 11.3361177444458, "learning_rate": 9.083333333333334e-05, "loss": 2.5923, "step": 125 }, { "epoch": 0.2863636363636364, "grad_norm": 16.97115707397461, "learning_rate": 9.075757575757577e-05, "loss": 1.828, "step": 126 }, { "epoch": 0.28863636363636364, "grad_norm": 11.52206802368164, "learning_rate": 9.068181818181819e-05, "loss": 2.3389, "step": 127 }, { "epoch": 0.2909090909090909, "grad_norm": 18.27076530456543, "learning_rate": 9.060606060606061e-05, "loss": 3.1892, "step": 128 }, { "epoch": 0.29318181818181815, "grad_norm": 15.098003387451172, "learning_rate": 9.053030303030303e-05, "loss": 2.3429, "step": 129 }, { "epoch": 0.29545454545454547, "grad_norm": 13.432772636413574, "learning_rate": 9.045454545454546e-05, "loss": 1.7032, "step": 130 }, { "epoch": 0.29772727272727273, "grad_norm": 21.96811866760254, "learning_rate": 9.037878787878788e-05, "loss": 3.3135, "step": 131 }, { "epoch": 0.3, "grad_norm": 17.522789001464844, "learning_rate": 9.030303030303031e-05, "loss": 2.0827, "step": 132 }, { "epoch": 0.30227272727272725, "grad_norm": 16.18021011352539, "learning_rate": 9.022727272727273e-05, "loss": 2.6956, "step": 133 }, { "epoch": 0.30454545454545456, "grad_norm": 17.834138870239258, "learning_rate": 9.015151515151515e-05, "loss": 2.3929, 
"step": 134 }, { "epoch": 0.3068181818181818, "grad_norm": 18.146596908569336, "learning_rate": 9.007575757575759e-05, "loss": 3.0074, "step": 135 }, { "epoch": 0.3090909090909091, "grad_norm": 11.941591262817383, "learning_rate": 9e-05, "loss": 1.6793, "step": 136 }, { "epoch": 0.31136363636363634, "grad_norm": 15.524669647216797, "learning_rate": 8.992424242424244e-05, "loss": 2.3193, "step": 137 }, { "epoch": 0.31363636363636366, "grad_norm": 17.986879348754883, "learning_rate": 8.984848484848484e-05, "loss": 3.1335, "step": 138 }, { "epoch": 0.3159090909090909, "grad_norm": 19.568361282348633, "learning_rate": 8.977272727272728e-05, "loss": 2.6232, "step": 139 }, { "epoch": 0.3181818181818182, "grad_norm": 15.213788986206055, "learning_rate": 8.96969696969697e-05, "loss": 1.6936, "step": 140 }, { "epoch": 0.32045454545454544, "grad_norm": 16.093795776367188, "learning_rate": 8.962121212121213e-05, "loss": 2.38, "step": 141 }, { "epoch": 0.32272727272727275, "grad_norm": 17.010087966918945, "learning_rate": 8.954545454545455e-05, "loss": 2.0467, "step": 142 }, { "epoch": 0.325, "grad_norm": 20.31732749938965, "learning_rate": 8.946969696969697e-05, "loss": 2.062, "step": 143 }, { "epoch": 0.32727272727272727, "grad_norm": 15.800658226013184, "learning_rate": 8.93939393939394e-05, "loss": 1.4575, "step": 144 }, { "epoch": 0.32954545454545453, "grad_norm": 15.116626739501953, "learning_rate": 8.931818181818182e-05, "loss": 2.314, "step": 145 }, { "epoch": 0.33181818181818185, "grad_norm": 25.464197158813477, "learning_rate": 8.924242424242426e-05, "loss": 2.0073, "step": 146 }, { "epoch": 0.3340909090909091, "grad_norm": 13.291275978088379, "learning_rate": 8.916666666666667e-05, "loss": 2.151, "step": 147 }, { "epoch": 0.33636363636363636, "grad_norm": 13.530828475952148, "learning_rate": 8.90909090909091e-05, "loss": 2.3051, "step": 148 }, { "epoch": 0.3386363636363636, "grad_norm": 15.941877365112305, "learning_rate": 8.901515151515151e-05, "loss": 2.6671, 
"step": 149 }, { "epoch": 0.3409090909090909, "grad_norm": 16.19255828857422, "learning_rate": 8.893939393939395e-05, "loss": 2.4137, "step": 150 }, { "epoch": 0.3431818181818182, "grad_norm": 25.39113998413086, "learning_rate": 8.886363636363637e-05, "loss": 3.1836, "step": 151 }, { "epoch": 0.34545454545454546, "grad_norm": 14.128908157348633, "learning_rate": 8.87878787878788e-05, "loss": 2.4864, "step": 152 }, { "epoch": 0.3477272727272727, "grad_norm": 14.206392288208008, "learning_rate": 8.871212121212122e-05, "loss": 1.3842, "step": 153 }, { "epoch": 0.35, "grad_norm": 11.746234893798828, "learning_rate": 8.863636363636364e-05, "loss": 1.69, "step": 154 }, { "epoch": 0.3522727272727273, "grad_norm": 14.249229431152344, "learning_rate": 8.856060606060607e-05, "loss": 2.962, "step": 155 }, { "epoch": 0.35454545454545455, "grad_norm": 13.884110450744629, "learning_rate": 8.848484848484849e-05, "loss": 1.9429, "step": 156 }, { "epoch": 0.3568181818181818, "grad_norm": 15.577651023864746, "learning_rate": 8.840909090909091e-05, "loss": 2.0814, "step": 157 }, { "epoch": 0.35909090909090907, "grad_norm": 13.055503845214844, "learning_rate": 8.833333333333333e-05, "loss": 2.286, "step": 158 }, { "epoch": 0.3613636363636364, "grad_norm": 14.148711204528809, "learning_rate": 8.825757575757576e-05, "loss": 1.7243, "step": 159 }, { "epoch": 0.36363636363636365, "grad_norm": 18.32880210876465, "learning_rate": 8.818181818181818e-05, "loss": 2.0912, "step": 160 }, { "epoch": 0.3659090909090909, "grad_norm": 19.306982040405273, "learning_rate": 8.810606060606062e-05, "loss": 2.1032, "step": 161 }, { "epoch": 0.36818181818181817, "grad_norm": 18.99219512939453, "learning_rate": 8.803030303030304e-05, "loss": 2.3527, "step": 162 }, { "epoch": 0.3704545454545455, "grad_norm": 14.297601699829102, "learning_rate": 8.795454545454545e-05, "loss": 2.8786, "step": 163 }, { "epoch": 0.37272727272727274, "grad_norm": 19.273303985595703, "learning_rate": 8.787878787878789e-05, "loss": 
2.4364, "step": 164 }, { "epoch": 0.375, "grad_norm": 11.870357513427734, "learning_rate": 8.780303030303031e-05, "loss": 2.1716, "step": 165 }, { "epoch": 0.37727272727272726, "grad_norm": 11.26362133026123, "learning_rate": 8.772727272727274e-05, "loss": 3.1212, "step": 166 }, { "epoch": 0.3795454545454545, "grad_norm": 12.994135856628418, "learning_rate": 8.765151515151515e-05, "loss": 2.4722, "step": 167 }, { "epoch": 0.38181818181818183, "grad_norm": 13.474489212036133, "learning_rate": 8.757575757575758e-05, "loss": 2.9132, "step": 168 }, { "epoch": 0.3840909090909091, "grad_norm": 16.456457138061523, "learning_rate": 8.75e-05, "loss": 2.1006, "step": 169 }, { "epoch": 0.38636363636363635, "grad_norm": 16.236146926879883, "learning_rate": 8.742424242424243e-05, "loss": 2.1458, "step": 170 }, { "epoch": 0.3886363636363636, "grad_norm": 13.122529983520508, "learning_rate": 8.734848484848485e-05, "loss": 2.7045, "step": 171 }, { "epoch": 0.39090909090909093, "grad_norm": 12.385522842407227, "learning_rate": 8.727272727272727e-05, "loss": 2.2677, "step": 172 }, { "epoch": 0.3931818181818182, "grad_norm": 14.4050931930542, "learning_rate": 8.71969696969697e-05, "loss": 1.3401, "step": 173 }, { "epoch": 0.39545454545454545, "grad_norm": 21.25592803955078, "learning_rate": 8.712121212121212e-05, "loss": 1.8591, "step": 174 }, { "epoch": 0.3977272727272727, "grad_norm": 13.744414329528809, "learning_rate": 8.704545454545456e-05, "loss": 1.8915, "step": 175 }, { "epoch": 0.4, "grad_norm": 14.040199279785156, "learning_rate": 8.696969696969698e-05, "loss": 2.1142, "step": 176 }, { "epoch": 0.4022727272727273, "grad_norm": 13.779399871826172, "learning_rate": 8.68939393939394e-05, "loss": 1.6946, "step": 177 }, { "epoch": 0.40454545454545454, "grad_norm": 12.878482818603516, "learning_rate": 8.681818181818182e-05, "loss": 2.0229, "step": 178 }, { "epoch": 0.4068181818181818, "grad_norm": 10.951014518737793, "learning_rate": 8.674242424242425e-05, "loss": 2.2302, "step": 
179 }, { "epoch": 0.4090909090909091, "grad_norm": 15.133676528930664, "learning_rate": 8.666666666666667e-05, "loss": 1.7796, "step": 180 }, { "epoch": 0.4113636363636364, "grad_norm": 11.56503677368164, "learning_rate": 8.65909090909091e-05, "loss": 2.0587, "step": 181 }, { "epoch": 0.41363636363636364, "grad_norm": 12.170353889465332, "learning_rate": 8.651515151515152e-05, "loss": 1.9297, "step": 182 }, { "epoch": 0.4159090909090909, "grad_norm": 14.984827995300293, "learning_rate": 8.643939393939394e-05, "loss": 1.3361, "step": 183 }, { "epoch": 0.41818181818181815, "grad_norm": 12.686882972717285, "learning_rate": 8.636363636363637e-05, "loss": 2.3203, "step": 184 }, { "epoch": 0.42045454545454547, "grad_norm": 19.53303337097168, "learning_rate": 8.628787878787879e-05, "loss": 2.1686, "step": 185 }, { "epoch": 0.42272727272727273, "grad_norm": 13.246541976928711, "learning_rate": 8.621212121212121e-05, "loss": 2.154, "step": 186 }, { "epoch": 0.425, "grad_norm": 18.38794708251953, "learning_rate": 8.613636363636363e-05, "loss": 2.3975, "step": 187 }, { "epoch": 0.42727272727272725, "grad_norm": 19.281801223754883, "learning_rate": 8.606060606060606e-05, "loss": 3.1559, "step": 188 }, { "epoch": 0.42954545454545456, "grad_norm": 16.43345069885254, "learning_rate": 8.598484848484848e-05, "loss": 2.4324, "step": 189 }, { "epoch": 0.4318181818181818, "grad_norm": 22.686885833740234, "learning_rate": 8.590909090909092e-05, "loss": 2.4541, "step": 190 }, { "epoch": 0.4340909090909091, "grad_norm": 16.799205780029297, "learning_rate": 8.583333333333334e-05, "loss": 1.9834, "step": 191 }, { "epoch": 0.43636363636363634, "grad_norm": 12.861906051635742, "learning_rate": 8.575757575757576e-05, "loss": 1.4132, "step": 192 }, { "epoch": 0.43863636363636366, "grad_norm": 14.350102424621582, "learning_rate": 8.568181818181819e-05, "loss": 2.5181, "step": 193 }, { "epoch": 0.4409090909090909, "grad_norm": 9.91285228729248, "learning_rate": 8.560606060606061e-05, "loss": 
1.1131, "step": 194 }, { "epoch": 0.4431818181818182, "grad_norm": 12.768558502197266, "learning_rate": 8.553030303030304e-05, "loss": 1.6889, "step": 195 }, { "epoch": 0.44545454545454544, "grad_norm": 11.671558380126953, "learning_rate": 8.545454545454545e-05, "loss": 2.4559, "step": 196 }, { "epoch": 0.44772727272727275, "grad_norm": 12.10418701171875, "learning_rate": 8.537878787878788e-05, "loss": 2.2951, "step": 197 }, { "epoch": 0.45, "grad_norm": 12.047237396240234, "learning_rate": 8.53030303030303e-05, "loss": 1.7895, "step": 198 }, { "epoch": 0.45227272727272727, "grad_norm": 13.83714485168457, "learning_rate": 8.522727272727273e-05, "loss": 2.1267, "step": 199 }, { "epoch": 0.45454545454545453, "grad_norm": 17.289377212524414, "learning_rate": 8.515151515151515e-05, "loss": 3.4595, "step": 200 }, { "epoch": 0.45681818181818185, "grad_norm": 16.056198120117188, "learning_rate": 8.507575757575757e-05, "loss": 2.2333, "step": 201 }, { "epoch": 0.4590909090909091, "grad_norm": 12.874887466430664, "learning_rate": 8.5e-05, "loss": 2.3555, "step": 202 }, { "epoch": 0.46136363636363636, "grad_norm": 11.859071731567383, "learning_rate": 8.492424242424243e-05, "loss": 2.0893, "step": 203 }, { "epoch": 0.4636363636363636, "grad_norm": 11.99448013305664, "learning_rate": 8.484848484848486e-05, "loss": 2.4165, "step": 204 }, { "epoch": 0.4659090909090909, "grad_norm": 14.352676391601562, "learning_rate": 8.477272727272728e-05, "loss": 2.58, "step": 205 }, { "epoch": 0.4681818181818182, "grad_norm": 10.942952156066895, "learning_rate": 8.46969696969697e-05, "loss": 2.1313, "step": 206 }, { "epoch": 0.47045454545454546, "grad_norm": 13.232431411743164, "learning_rate": 8.462121212121212e-05, "loss": 2.8598, "step": 207 }, { "epoch": 0.4727272727272727, "grad_norm": 14.74603271484375, "learning_rate": 8.454545454545455e-05, "loss": 2.5221, "step": 208 }, { "epoch": 0.475, "grad_norm": 11.541604042053223, "learning_rate": 8.446969696969697e-05, "loss": 2.6656, "step": 
209 }, { "epoch": 0.4772727272727273, "grad_norm": 22.731273651123047, "learning_rate": 8.43939393939394e-05, "loss": 1.9391, "step": 210 }, { "epoch": 0.47954545454545455, "grad_norm": 16.327220916748047, "learning_rate": 8.431818181818182e-05, "loss": 2.1225, "step": 211 }, { "epoch": 0.4818181818181818, "grad_norm": 15.646464347839355, "learning_rate": 8.424242424242424e-05, "loss": 2.1468, "step": 212 }, { "epoch": 0.48409090909090907, "grad_norm": 16.69521141052246, "learning_rate": 8.416666666666668e-05, "loss": 2.4979, "step": 213 }, { "epoch": 0.4863636363636364, "grad_norm": 12.17435073852539, "learning_rate": 8.40909090909091e-05, "loss": 1.915, "step": 214 }, { "epoch": 0.48863636363636365, "grad_norm": 15.295214653015137, "learning_rate": 8.401515151515153e-05, "loss": 2.6765, "step": 215 }, { "epoch": 0.4909090909090909, "grad_norm": 14.532336235046387, "learning_rate": 8.393939393939393e-05, "loss": 2.1649, "step": 216 }, { "epoch": 0.49318181818181817, "grad_norm": 9.738990783691406, "learning_rate": 8.386363636363637e-05, "loss": 1.7751, "step": 217 }, { "epoch": 0.4954545454545455, "grad_norm": 13.893047332763672, "learning_rate": 8.378787878787879e-05, "loss": 2.3839, "step": 218 }, { "epoch": 0.49772727272727274, "grad_norm": 10.604107856750488, "learning_rate": 8.371212121212122e-05, "loss": 1.839, "step": 219 }, { "epoch": 0.5, "grad_norm": 14.21572208404541, "learning_rate": 8.363636363636364e-05, "loss": 2.4181, "step": 220 }, { "epoch": 0.5022727272727273, "grad_norm": 12.247942924499512, "learning_rate": 8.356060606060606e-05, "loss": 1.6214, "step": 221 }, { "epoch": 0.5045454545454545, "grad_norm": 11.43807601928711, "learning_rate": 8.348484848484849e-05, "loss": 1.7002, "step": 222 }, { "epoch": 0.5068181818181818, "grad_norm": 12.532363891601562, "learning_rate": 8.340909090909091e-05, "loss": 1.6798, "step": 223 }, { "epoch": 0.509090909090909, "grad_norm": 21.122955322265625, "learning_rate": 8.333333333333334e-05, "loss": 2.3791, 
"step": 224 }, { "epoch": 0.5113636363636364, "grad_norm": 15.643569946289062, "learning_rate": 8.325757575757575e-05, "loss": 2.2841, "step": 225 }, { "epoch": 0.5136363636363637, "grad_norm": 13.66476821899414, "learning_rate": 8.318181818181818e-05, "loss": 2.7105, "step": 226 }, { "epoch": 0.5159090909090909, "grad_norm": 15.538378715515137, "learning_rate": 8.310606060606062e-05, "loss": 2.5573, "step": 227 }, { "epoch": 0.5181818181818182, "grad_norm": 14.432341575622559, "learning_rate": 8.303030303030304e-05, "loss": 1.6926, "step": 228 }, { "epoch": 0.5204545454545455, "grad_norm": 14.326302528381348, "learning_rate": 8.295454545454547e-05, "loss": 1.9976, "step": 229 }, { "epoch": 0.5227272727272727, "grad_norm": 16.38084602355957, "learning_rate": 8.287878787878787e-05, "loss": 2.8438, "step": 230 }, { "epoch": 0.525, "grad_norm": 14.56826114654541, "learning_rate": 8.280303030303031e-05, "loss": 2.3643, "step": 231 }, { "epoch": 0.5272727272727272, "grad_norm": 10.183893203735352, "learning_rate": 8.272727272727273e-05, "loss": 1.9476, "step": 232 }, { "epoch": 0.5295454545454545, "grad_norm": 15.575922012329102, "learning_rate": 8.265151515151516e-05, "loss": 2.3493, "step": 233 }, { "epoch": 0.5318181818181819, "grad_norm": 12.653141021728516, "learning_rate": 8.257575757575758e-05, "loss": 2.0519, "step": 234 }, { "epoch": 0.5340909090909091, "grad_norm": 12.279047966003418, "learning_rate": 8.25e-05, "loss": 2.0694, "step": 235 }, { "epoch": 0.5363636363636364, "grad_norm": 12.395997047424316, "learning_rate": 8.242424242424243e-05, "loss": 2.1307, "step": 236 }, { "epoch": 0.5386363636363637, "grad_norm": 10.851142883300781, "learning_rate": 8.234848484848485e-05, "loss": 1.9883, "step": 237 }, { "epoch": 0.5409090909090909, "grad_norm": 14.103243827819824, "learning_rate": 8.227272727272729e-05, "loss": 2.6901, "step": 238 }, { "epoch": 0.5431818181818182, "grad_norm": 9.63924789428711, "learning_rate": 8.21969696969697e-05, "loss": 1.2228, 
"step": 239 }, { "epoch": 0.5454545454545454, "grad_norm": 13.430061340332031, "learning_rate": 8.212121212121212e-05, "loss": 1.7877, "step": 240 }, { "epoch": 0.5477272727272727, "grad_norm": 15.428567886352539, "learning_rate": 8.204545454545454e-05, "loss": 2.0201, "step": 241 }, { "epoch": 0.55, "grad_norm": 15.405593872070312, "learning_rate": 8.196969696969698e-05, "loss": 2.8325, "step": 242 }, { "epoch": 0.5522727272727272, "grad_norm": 22.855867385864258, "learning_rate": 8.18939393939394e-05, "loss": 3.045, "step": 243 }, { "epoch": 0.5545454545454546, "grad_norm": 14.374544143676758, "learning_rate": 8.181818181818183e-05, "loss": 2.0002, "step": 244 }, { "epoch": 0.5568181818181818, "grad_norm": 13.37702465057373, "learning_rate": 8.174242424242425e-05, "loss": 1.6496, "step": 245 }, { "epoch": 0.5590909090909091, "grad_norm": 13.321274757385254, "learning_rate": 8.166666666666667e-05, "loss": 1.9746, "step": 246 }, { "epoch": 0.5613636363636364, "grad_norm": 13.79466438293457, "learning_rate": 8.15909090909091e-05, "loss": 2.0699, "step": 247 }, { "epoch": 0.5636363636363636, "grad_norm": 12.355722427368164, "learning_rate": 8.151515151515152e-05, "loss": 2.2207, "step": 248 }, { "epoch": 0.5659090909090909, "grad_norm": 14.220561981201172, "learning_rate": 8.143939393939395e-05, "loss": 2.1695, "step": 249 }, { "epoch": 0.5681818181818182, "grad_norm": 12.587940216064453, "learning_rate": 8.136363636363636e-05, "loss": 1.8604, "step": 250 }, { "epoch": 0.5704545454545454, "grad_norm": 9.54430103302002, "learning_rate": 8.12878787878788e-05, "loss": 1.6446, "step": 251 }, { "epoch": 0.5727272727272728, "grad_norm": 14.440407752990723, "learning_rate": 8.121212121212121e-05, "loss": 2.4646, "step": 252 }, { "epoch": 0.575, "grad_norm": 14.50412368774414, "learning_rate": 8.113636363636365e-05, "loss": 1.5263, "step": 253 }, { "epoch": 0.5772727272727273, "grad_norm": 18.535612106323242, "learning_rate": 8.106060606060607e-05, "loss": 2.7942, "step": 
254 }, { "epoch": 0.5795454545454546, "grad_norm": 11.250702857971191, "learning_rate": 8.098484848484848e-05, "loss": 1.5575, "step": 255 }, { "epoch": 0.5818181818181818, "grad_norm": 12.534632682800293, "learning_rate": 8.090909090909092e-05, "loss": 1.9031, "step": 256 }, { "epoch": 0.5840909090909091, "grad_norm": 14.82848834991455, "learning_rate": 8.083333333333334e-05, "loss": 1.4666, "step": 257 }, { "epoch": 0.5863636363636363, "grad_norm": 15.74230670928955, "learning_rate": 8.075757575757577e-05, "loss": 2.3956, "step": 258 }, { "epoch": 0.5886363636363636, "grad_norm": 13.576948165893555, "learning_rate": 8.068181818181818e-05, "loss": 1.9797, "step": 259 }, { "epoch": 0.5909090909090909, "grad_norm": 12.77927303314209, "learning_rate": 8.060606060606061e-05, "loss": 2.0894, "step": 260 }, { "epoch": 0.5931818181818181, "grad_norm": 17.75493621826172, "learning_rate": 8.053030303030303e-05, "loss": 2.6691, "step": 261 }, { "epoch": 0.5954545454545455, "grad_norm": 12.445291519165039, "learning_rate": 8.045454545454546e-05, "loss": 1.9188, "step": 262 }, { "epoch": 0.5977272727272728, "grad_norm": 12.350727081298828, "learning_rate": 8.037878787878788e-05, "loss": 1.9648, "step": 263 }, { "epoch": 0.6, "grad_norm": 10.37759780883789, "learning_rate": 8.03030303030303e-05, "loss": 1.5221, "step": 264 }, { "epoch": 0.6022727272727273, "grad_norm": 13.281451225280762, "learning_rate": 8.022727272727273e-05, "loss": 3.2337, "step": 265 }, { "epoch": 0.6045454545454545, "grad_norm": 11.684523582458496, "learning_rate": 8.015151515151515e-05, "loss": 1.7641, "step": 266 }, { "epoch": 0.6068181818181818, "grad_norm": 15.161863327026367, "learning_rate": 8.007575757575759e-05, "loss": 3.5694, "step": 267 }, { "epoch": 0.6090909090909091, "grad_norm": 13.221097946166992, "learning_rate": 8e-05, "loss": 2.5334, "step": 268 }, { "epoch": 0.6113636363636363, "grad_norm": 15.834603309631348, "learning_rate": 7.992424242424243e-05, "loss": 2.5292, "step": 269 }, { 
"epoch": 0.6136363636363636, "grad_norm": 15.016695976257324, "learning_rate": 7.984848484848485e-05, "loss": 1.9177, "step": 270 }, { "epoch": 0.615909090909091, "grad_norm": 18.896211624145508, "learning_rate": 7.977272727272728e-05, "loss": 2.2495, "step": 271 }, { "epoch": 0.6181818181818182, "grad_norm": 17.597623825073242, "learning_rate": 7.96969696969697e-05, "loss": 2.1252, "step": 272 }, { "epoch": 0.6204545454545455, "grad_norm": 14.346769332885742, "learning_rate": 7.962121212121213e-05, "loss": 2.0273, "step": 273 }, { "epoch": 0.6227272727272727, "grad_norm": 13.852729797363281, "learning_rate": 7.954545454545455e-05, "loss": 2.7319, "step": 274 }, { "epoch": 0.625, "grad_norm": 12.906790733337402, "learning_rate": 7.946969696969697e-05, "loss": 1.6674, "step": 275 }, { "epoch": 0.6272727272727273, "grad_norm": 10.031960487365723, "learning_rate": 7.93939393939394e-05, "loss": 1.5017, "step": 276 }, { "epoch": 0.6295454545454545, "grad_norm": 12.02971363067627, "learning_rate": 7.931818181818182e-05, "loss": 2.1617, "step": 277 }, { "epoch": 0.6318181818181818, "grad_norm": 12.239229202270508, "learning_rate": 7.924242424242426e-05, "loss": 1.285, "step": 278 }, { "epoch": 0.634090909090909, "grad_norm": 12.207528114318848, "learning_rate": 7.916666666666666e-05, "loss": 1.4661, "step": 279 }, { "epoch": 0.6363636363636364, "grad_norm": 21.659215927124023, "learning_rate": 7.90909090909091e-05, "loss": 1.8808, "step": 280 }, { "epoch": 0.6386363636363637, "grad_norm": 14.419612884521484, "learning_rate": 7.901515151515151e-05, "loss": 2.6502, "step": 281 }, { "epoch": 0.6409090909090909, "grad_norm": NaN, "learning_rate": 7.901515151515151e-05, "loss": 0.0, "step": 282 }, { "epoch": 0.6431818181818182, "grad_norm": 11.444130897521973, "learning_rate": 7.893939393939395e-05, "loss": 1.5987, "step": 283 }, { "epoch": 0.6454545454545455, "grad_norm": 10.316890716552734, "learning_rate": 7.886363636363637e-05, "loss": 1.5173, "step": 284 }, { "epoch": 
0.6477272727272727, "grad_norm": 13.772204399108887, "learning_rate": 7.878787878787879e-05, "loss": 3.0357, "step": 285 }, { "epoch": 0.65, "grad_norm": 12.452784538269043, "learning_rate": 7.871212121212122e-05, "loss": 2.2077, "step": 286 }, { "epoch": 0.6522727272727272, "grad_norm": 15.323153495788574, "learning_rate": 7.863636363636364e-05, "loss": 1.8941, "step": 287 }, { "epoch": 0.6545454545454545, "grad_norm": 10.558858871459961, "learning_rate": 7.856060606060607e-05, "loss": 1.5262, "step": 288 }, { "epoch": 0.6568181818181819, "grad_norm": 15.232844352722168, "learning_rate": 7.848484848484848e-05, "loss": 3.1486, "step": 289 }, { "epoch": 0.6590909090909091, "grad_norm": 11.309487342834473, "learning_rate": 7.840909090909091e-05, "loss": 1.8324, "step": 290 }, { "epoch": 0.6613636363636364, "grad_norm": 11.427604675292969, "learning_rate": 7.833333333333333e-05, "loss": 1.0609, "step": 291 }, { "epoch": 0.6636363636363637, "grad_norm": 15.115833282470703, "learning_rate": 7.825757575757576e-05, "loss": 2.8888, "step": 292 }, { "epoch": 0.6659090909090909, "grad_norm": 14.701318740844727, "learning_rate": 7.818181818181818e-05, "loss": 2.823, "step": 293 }, { "epoch": 0.6681818181818182, "grad_norm": 10.650053024291992, "learning_rate": 7.81060606060606e-05, "loss": 1.8724, "step": 294 }, { "epoch": 0.6704545454545454, "grad_norm": 12.72999382019043, "learning_rate": 7.803030303030304e-05, "loss": 1.9267, "step": 295 }, { "epoch": 0.6727272727272727, "grad_norm": 16.98598861694336, "learning_rate": 7.795454545454546e-05, "loss": 2.325, "step": 296 }, { "epoch": 0.675, "grad_norm": 12.848193168640137, "learning_rate": 7.787878787878789e-05, "loss": 3.1965, "step": 297 }, { "epoch": 0.6772727272727272, "grad_norm": 8.765904426574707, "learning_rate": 7.780303030303031e-05, "loss": 1.8081, "step": 298 }, { "epoch": 0.6795454545454546, "grad_norm": 14.633967399597168, "learning_rate": 7.772727272727273e-05, "loss": 1.8056, "step": 299 }, { "epoch": 
0.6818181818181818, "grad_norm": 9.972925186157227, "learning_rate": 7.765151515151515e-05, "loss": 1.8835, "step": 300 }, { "epoch": 0.6840909090909091, "grad_norm": 11.186135292053223, "learning_rate": 7.757575757575758e-05, "loss": 1.6734, "step": 301 }, { "epoch": 0.6863636363636364, "grad_norm": 15.052450180053711, "learning_rate": 7.75e-05, "loss": 2.2574, "step": 302 }, { "epoch": 0.6886363636363636, "grad_norm": 12.664848327636719, "learning_rate": 7.742424242424243e-05, "loss": 1.5916, "step": 303 }, { "epoch": 0.6909090909090909, "grad_norm": 14.287535667419434, "learning_rate": 7.734848484848485e-05, "loss": 1.8552, "step": 304 }, { "epoch": 0.6931818181818182, "grad_norm": 14.354594230651855, "learning_rate": 7.727272727272727e-05, "loss": 3.0925, "step": 305 }, { "epoch": 0.6954545454545454, "grad_norm": 12.003613471984863, "learning_rate": 7.71969696969697e-05, "loss": 1.6642, "step": 306 }, { "epoch": 0.6977272727272728, "grad_norm": 11.559938430786133, "learning_rate": 7.712121212121212e-05, "loss": 1.5997, "step": 307 }, { "epoch": 0.7, "grad_norm": 13.42446517944336, "learning_rate": 7.704545454545456e-05, "loss": 1.7934, "step": 308 }, { "epoch": 0.7022727272727273, "grad_norm": 11.831766128540039, "learning_rate": 7.696969696969696e-05, "loss": 1.7729, "step": 309 }, { "epoch": 0.7045454545454546, "grad_norm": 11.884734153747559, "learning_rate": 7.68939393939394e-05, "loss": 1.9489, "step": 310 }, { "epoch": 0.7068181818181818, "grad_norm": 15.816669464111328, "learning_rate": 7.681818181818182e-05, "loss": 2.4105, "step": 311 }, { "epoch": 0.7090909090909091, "grad_norm": 12.010058403015137, "learning_rate": 7.674242424242425e-05, "loss": 1.9247, "step": 312 }, { "epoch": 0.7113636363636363, "grad_norm": 9.436304092407227, "learning_rate": 7.666666666666667e-05, "loss": 1.9038, "step": 313 }, { "epoch": 0.7136363636363636, "grad_norm": 9.153775215148926, "learning_rate": 7.659090909090909e-05, "loss": 1.241, "step": 314 }, { "epoch": 
0.7159090909090909, "grad_norm": 13.067652702331543, "learning_rate": 7.651515151515152e-05, "loss": 2.7662, "step": 315 }, { "epoch": 0.7181818181818181, "grad_norm": 16.106948852539062, "learning_rate": 7.643939393939394e-05, "loss": 2.0783, "step": 316 }, { "epoch": 0.7204545454545455, "grad_norm": 13.585596084594727, "learning_rate": 7.636363636363637e-05, "loss": 1.919, "step": 317 }, { "epoch": 0.7227272727272728, "grad_norm": 13.833767890930176, "learning_rate": 7.62878787878788e-05, "loss": 1.1069, "step": 318 }, { "epoch": 0.725, "grad_norm": 12.201956748962402, "learning_rate": 7.621212121212121e-05, "loss": 1.9548, "step": 319 }, { "epoch": 0.7272727272727273, "grad_norm": 15.562934875488281, "learning_rate": 7.613636363636363e-05, "loss": 1.9211, "step": 320 }, { "epoch": 0.7295454545454545, "grad_norm": 14.389630317687988, "learning_rate": 7.606060606060607e-05, "loss": 1.821, "step": 321 }, { "epoch": 0.7318181818181818, "grad_norm": 14.584891319274902, "learning_rate": 7.598484848484849e-05, "loss": 2.5068, "step": 322 }, { "epoch": 0.7340909090909091, "grad_norm": 14.5166654586792, "learning_rate": 7.59090909090909e-05, "loss": 1.9124, "step": 323 }, { "epoch": 0.7363636363636363, "grad_norm": 46.67388916015625, "learning_rate": 7.583333333333334e-05, "loss": 1.6895, "step": 324 }, { "epoch": 0.7386363636363636, "grad_norm": 12.92702865600586, "learning_rate": 7.575757575757576e-05, "loss": 1.7526, "step": 325 }, { "epoch": 0.740909090909091, "grad_norm": 8.52035140991211, "learning_rate": 7.568181818181819e-05, "loss": 1.4144, "step": 326 }, { "epoch": 0.7431818181818182, "grad_norm": 13.630702018737793, "learning_rate": 7.560606060606061e-05, "loss": 2.2018, "step": 327 }, { "epoch": 0.7454545454545455, "grad_norm": 14.379950523376465, "learning_rate": 7.553030303030303e-05, "loss": 2.8618, "step": 328 }, { "epoch": 0.7477272727272727, "grad_norm": 14.78795051574707, "learning_rate": 7.545454545454545e-05, "loss": 1.9749, "step": 329 }, { "epoch": 
0.75, "grad_norm": 10.462140083312988, "learning_rate": 7.537878787878788e-05, "loss": 2.3666, "step": 330 }, { "epoch": 0.7522727272727273, "grad_norm": 11.336270332336426, "learning_rate": 7.530303030303032e-05, "loss": 1.4712, "step": 331 }, { "epoch": 0.7545454545454545, "grad_norm": 17.15682029724121, "learning_rate": 7.522727272727273e-05, "loss": 3.2442, "step": 332 }, { "epoch": 0.7568181818181818, "grad_norm": 14.129326820373535, "learning_rate": 7.515151515151515e-05, "loss": 1.9768, "step": 333 }, { "epoch": 0.759090909090909, "grad_norm": 14.239521026611328, "learning_rate": 7.507575757575757e-05, "loss": 1.9933, "step": 334 }, { "epoch": 0.7613636363636364, "grad_norm": 10.573707580566406, "learning_rate": 7.500000000000001e-05, "loss": 1.3049, "step": 335 }, { "epoch": 0.7636363636363637, "grad_norm": 15.881331443786621, "learning_rate": 7.492424242424243e-05, "loss": 2.7249, "step": 336 }, { "epoch": 0.7659090909090909, "grad_norm": 11.606864929199219, "learning_rate": 7.484848484848486e-05, "loss": 1.4883, "step": 337 }, { "epoch": 0.7681818181818182, "grad_norm": 8.834245681762695, "learning_rate": 7.477272727272727e-05, "loss": 1.3757, "step": 338 }, { "epoch": 0.7704545454545455, "grad_norm": 10.011686325073242, "learning_rate": 7.46969696969697e-05, "loss": 1.4306, "step": 339 }, { "epoch": 0.7727272727272727, "grad_norm": 13.084802627563477, "learning_rate": 7.462121212121213e-05, "loss": 2.1676, "step": 340 }, { "epoch": 0.775, "grad_norm": 12.480827331542969, "learning_rate": 7.454545454545455e-05, "loss": 2.2564, "step": 341 }, { "epoch": 0.7772727272727272, "grad_norm": 12.32083797454834, "learning_rate": 7.446969696969698e-05, "loss": 1.4576, "step": 342 }, { "epoch": 0.7795454545454545, "grad_norm": 13.759376525878906, "learning_rate": 7.439393939393939e-05, "loss": 2.5308, "step": 343 }, { "epoch": 0.7818181818181819, "grad_norm": 17.70578384399414, "learning_rate": 7.431818181818182e-05, "loss": 3.0816, "step": 344 }, { "epoch": 
0.7840909090909091, "grad_norm": 13.809745788574219, "learning_rate": 7.424242424242424e-05, "loss": 2.6903, "step": 345 }, { "epoch": 0.7863636363636364, "grad_norm": 13.484768867492676, "learning_rate": 7.416666666666668e-05, "loss": 1.6094, "step": 346 }, { "epoch": 0.7886363636363637, "grad_norm": 10.424938201904297, "learning_rate": 7.40909090909091e-05, "loss": 1.3566, "step": 347 }, { "epoch": 0.7909090909090909, "grad_norm": 15.058128356933594, "learning_rate": 7.401515151515152e-05, "loss": 1.945, "step": 348 }, { "epoch": 0.7931818181818182, "grad_norm": 11.48098373413086, "learning_rate": 7.393939393939395e-05, "loss": 2.9329, "step": 349 }, { "epoch": 0.7954545454545454, "grad_norm": 15.027339935302734, "learning_rate": 7.386363636363637e-05, "loss": 3.3324, "step": 350 }, { "epoch": 0.7977272727272727, "grad_norm": 12.786996841430664, "learning_rate": 7.37878787878788e-05, "loss": 2.7898, "step": 351 }, { "epoch": 0.8, "grad_norm": 14.68897819519043, "learning_rate": 7.37121212121212e-05, "loss": 2.1318, "step": 352 }, { "epoch": 0.8022727272727272, "grad_norm": 15.081788063049316, "learning_rate": 7.363636363636364e-05, "loss": 2.544, "step": 353 }, { "epoch": 0.8045454545454546, "grad_norm": 13.604434967041016, "learning_rate": 7.356060606060606e-05, "loss": 3.242, "step": 354 }, { "epoch": 0.8068181818181818, "grad_norm": 10.167998313903809, "learning_rate": 7.348484848484849e-05, "loss": 1.7378, "step": 355 }, { "epoch": 0.8090909090909091, "grad_norm": 11.878591537475586, "learning_rate": 7.340909090909091e-05, "loss": 1.9651, "step": 356 }, { "epoch": 0.8113636363636364, "grad_norm": 10.606021881103516, "learning_rate": 7.333333333333333e-05, "loss": 1.6922, "step": 357 }, { "epoch": 0.8136363636363636, "grad_norm": 36.99083709716797, "learning_rate": 7.325757575757576e-05, "loss": 2.7004, "step": 358 }, { "epoch": 0.8159090909090909, "grad_norm": 12.748845100402832, "learning_rate": 7.318181818181818e-05, "loss": 2.0722, "step": 359 }, { 
"epoch": 0.8181818181818182, "grad_norm": 13.374279975891113, "learning_rate": 7.310606060606062e-05, "loss": 2.3361, "step": 360 }, { "epoch": 0.8204545454545454, "grad_norm": 10.289033889770508, "learning_rate": 7.303030303030304e-05, "loss": 1.6377, "step": 361 }, { "epoch": 0.8227272727272728, "grad_norm": 10.585772514343262, "learning_rate": 7.295454545454546e-05, "loss": 1.6941, "step": 362 }, { "epoch": 0.825, "grad_norm": 13.439225196838379, "learning_rate": 7.287878787878788e-05, "loss": 1.9242, "step": 363 }, { "epoch": 0.8272727272727273, "grad_norm": 12.649117469787598, "learning_rate": 7.280303030303031e-05, "loss": 3.5932, "step": 364 }, { "epoch": 0.8295454545454546, "grad_norm": 13.014269828796387, "learning_rate": 7.272727272727273e-05, "loss": 1.6747, "step": 365 }, { "epoch": 0.8318181818181818, "grad_norm": 10.855698585510254, "learning_rate": 7.265151515151516e-05, "loss": 2.2644, "step": 366 }, { "epoch": 0.8340909090909091, "grad_norm": 9.967236518859863, "learning_rate": 7.257575757575758e-05, "loss": 1.7373, "step": 367 }, { "epoch": 0.8363636363636363, "grad_norm": 12.029590606689453, "learning_rate": 7.25e-05, "loss": 1.7012, "step": 368 }, { "epoch": 0.8386363636363636, "grad_norm": 18.046247482299805, "learning_rate": 7.242424242424243e-05, "loss": 2.7507, "step": 369 }, { "epoch": 0.8409090909090909, "grad_norm": 12.02083969116211, "learning_rate": 7.234848484848485e-05, "loss": 1.4928, "step": 370 }, { "epoch": 0.8431818181818181, "grad_norm": 14.034537315368652, "learning_rate": 7.227272727272729e-05, "loss": 1.5557, "step": 371 }, { "epoch": 0.8454545454545455, "grad_norm": 11.5894775390625, "learning_rate": 7.219696969696969e-05, "loss": 2.0848, "step": 372 }, { "epoch": 0.8477272727272728, "grad_norm": 10.489690780639648, "learning_rate": 7.212121212121213e-05, "loss": 2.1963, "step": 373 }, { "epoch": 0.85, "grad_norm": 14.684807777404785, "learning_rate": 7.204545454545454e-05, "loss": 1.6653, "step": 374 }, { "epoch": 
0.8522727272727273, "grad_norm": 10.650580406188965, "learning_rate": 7.196969696969698e-05, "loss": 1.5813, "step": 375 }, { "epoch": 0.8545454545454545, "grad_norm": 14.406346321105957, "learning_rate": 7.18939393939394e-05, "loss": 1.6018, "step": 376 }, { "epoch": 0.8568181818181818, "grad_norm": 10.684210777282715, "learning_rate": 7.181818181818182e-05, "loss": 1.16, "step": 377 }, { "epoch": 0.8590909090909091, "grad_norm": 11.588654518127441, "learning_rate": 7.174242424242425e-05, "loss": 1.52, "step": 378 }, { "epoch": 0.8613636363636363, "grad_norm": 13.342896461486816, "learning_rate": 7.166666666666667e-05, "loss": 1.3069, "step": 379 }, { "epoch": 0.8636363636363636, "grad_norm": 10.33123779296875, "learning_rate": 7.15909090909091e-05, "loss": 2.097, "step": 380 }, { "epoch": 0.865909090909091, "grad_norm": 13.286327362060547, "learning_rate": 7.151515151515152e-05, "loss": 1.6996, "step": 381 }, { "epoch": 0.8681818181818182, "grad_norm": 12.737727165222168, "learning_rate": 7.143939393939394e-05, "loss": 1.8533, "step": 382 }, { "epoch": 0.8704545454545455, "grad_norm": 10.602120399475098, "learning_rate": 7.136363636363636e-05, "loss": 0.9764, "step": 383 }, { "epoch": 0.8727272727272727, "grad_norm": 13.362771034240723, "learning_rate": 7.12878787878788e-05, "loss": 2.6888, "step": 384 }, { "epoch": 0.875, "grad_norm": 15.875019073486328, "learning_rate": 7.121212121212121e-05, "loss": 1.3865, "step": 385 }, { "epoch": 0.8772727272727273, "grad_norm": 11.602843284606934, "learning_rate": 7.113636363636363e-05, "loss": 1.489, "step": 386 }, { "epoch": 0.8795454545454545, "grad_norm": 10.052959442138672, "learning_rate": 7.106060606060607e-05, "loss": 1.423, "step": 387 }, { "epoch": 0.8818181818181818, "grad_norm": 15.898283004760742, "learning_rate": 7.098484848484849e-05, "loss": 2.0401, "step": 388 }, { "epoch": 0.884090909090909, "grad_norm": 14.83981990814209, "learning_rate": 7.090909090909092e-05, "loss": 2.9656, "step": 389 }, { "epoch": 
0.8863636363636364, "grad_norm": 12.542622566223145, "learning_rate": 7.083333333333334e-05, "loss": 1.7818, "step": 390 }, { "epoch": 0.8886363636363637, "grad_norm": 10.65149974822998, "learning_rate": 7.075757575757576e-05, "loss": 1.4115, "step": 391 }, { "epoch": 0.8909090909090909, "grad_norm": 14.208708763122559, "learning_rate": 7.068181818181818e-05, "loss": 2.5107, "step": 392 }, { "epoch": 0.8931818181818182, "grad_norm": 13.435481071472168, "learning_rate": 7.060606060606061e-05, "loss": 2.0141, "step": 393 }, { "epoch": 0.8954545454545455, "grad_norm": 14.987428665161133, "learning_rate": 7.053030303030303e-05, "loss": 1.6295, "step": 394 }, { "epoch": 0.8977272727272727, "grad_norm": 15.590865135192871, "learning_rate": 7.045454545454546e-05, "loss": 2.5029, "step": 395 }, { "epoch": 0.9, "grad_norm": 12.00338077545166, "learning_rate": 7.037878787878788e-05, "loss": 1.5399, "step": 396 }, { "epoch": 0.9022727272727272, "grad_norm": 10.2390718460083, "learning_rate": 7.03030303030303e-05, "loss": 1.2943, "step": 397 }, { "epoch": 0.9045454545454545, "grad_norm": 13.09786319732666, "learning_rate": 7.022727272727274e-05, "loss": 1.951, "step": 398 }, { "epoch": 0.9068181818181819, "grad_norm": 14.016656875610352, "learning_rate": 7.015151515151515e-05, "loss": 2.4783, "step": 399 }, { "epoch": 0.9090909090909091, "grad_norm": 14.135820388793945, "learning_rate": 7.007575757575759e-05, "loss": 1.8109, "step": 400 }, { "epoch": 0.9113636363636364, "grad_norm": 15.545958518981934, "learning_rate": 7e-05, "loss": 2.2156, "step": 401 }, { "epoch": 0.9136363636363637, "grad_norm": 15.512310028076172, "learning_rate": 6.992424242424243e-05, "loss": 1.8199, "step": 402 }, { "epoch": 0.9159090909090909, "grad_norm": 12.54996109008789, "learning_rate": 6.984848484848485e-05, "loss": 2.0134, "step": 403 }, { "epoch": 0.9181818181818182, "grad_norm": 10.554512023925781, "learning_rate": 6.977272727272728e-05, "loss": 1.5173, "step": 404 }, { "epoch": 
0.9204545454545454, "grad_norm": 13.31303882598877, "learning_rate": 6.96969696969697e-05, "loss": 1.7694, "step": 405 }, { "epoch": 0.9227272727272727, "grad_norm": 18.840511322021484, "learning_rate": 6.962121212121212e-05, "loss": 3.0551, "step": 406 }, { "epoch": 0.925, "grad_norm": 13.331717491149902, "learning_rate": 6.954545454545455e-05, "loss": 2.0296, "step": 407 }, { "epoch": 0.9272727272727272, "grad_norm": 11.75788688659668, "learning_rate": 6.946969696969697e-05, "loss": 1.8544, "step": 408 }, { "epoch": 0.9295454545454546, "grad_norm": 14.479559898376465, "learning_rate": 6.93939393939394e-05, "loss": 2.4435, "step": 409 }, { "epoch": 0.9318181818181818, "grad_norm": 14.522322654724121, "learning_rate": 6.931818181818182e-05, "loss": 2.3013, "step": 410 }, { "epoch": 0.9340909090909091, "grad_norm": 12.853972434997559, "learning_rate": 6.924242424242424e-05, "loss": 2.4637, "step": 411 }, { "epoch": 0.9363636363636364, "grad_norm": 10.978107452392578, "learning_rate": 6.916666666666666e-05, "loss": 1.5277, "step": 412 }, { "epoch": 0.9386363636363636, "grad_norm": 14.109042167663574, "learning_rate": 6.90909090909091e-05, "loss": 1.9601, "step": 413 }, { "epoch": 0.9409090909090909, "grad_norm": 10.699783325195312, "learning_rate": 6.901515151515152e-05, "loss": 2.2143, "step": 414 }, { "epoch": 0.9431818181818182, "grad_norm": 10.57825756072998, "learning_rate": 6.893939393939395e-05, "loss": 2.0557, "step": 415 }, { "epoch": 0.9454545454545454, "grad_norm": 12.432737350463867, "learning_rate": 6.886363636363637e-05, "loss": 1.7554, "step": 416 }, { "epoch": 0.9477272727272728, "grad_norm": 12.157960891723633, "learning_rate": 6.878787878787879e-05, "loss": 2.1302, "step": 417 }, { "epoch": 0.95, "grad_norm": 15.89067554473877, "learning_rate": 6.871212121212122e-05, "loss": 2.1424, "step": 418 }, { "epoch": 0.9522727272727273, "grad_norm": 10.453248977661133, "learning_rate": 6.863636363636364e-05, "loss": 1.8215, "step": 419 }, { "epoch": 
0.9545454545454546, "grad_norm": 8.481575012207031, "learning_rate": 6.856060606060606e-05, "loss": 1.5999, "step": 420 }, { "epoch": 0.9568181818181818, "grad_norm": 10.795332908630371, "learning_rate": 6.848484848484848e-05, "loss": 1.4623, "step": 421 }, { "epoch": 0.9590909090909091, "grad_norm": 18.586315155029297, "learning_rate": 6.840909090909091e-05, "loss": 2.1875, "step": 422 }, { "epoch": 0.9613636363636363, "grad_norm": 15.387242317199707, "learning_rate": 6.833333333333333e-05, "loss": 2.1544, "step": 423 }, { "epoch": 0.9636363636363636, "grad_norm": 11.277326583862305, "learning_rate": 6.825757575757576e-05, "loss": 1.8575, "step": 424 }, { "epoch": 0.9659090909090909, "grad_norm": 9.451603889465332, "learning_rate": 6.818181818181818e-05, "loss": 1.6149, "step": 425 }, { "epoch": 0.9681818181818181, "grad_norm": 14.108964920043945, "learning_rate": 6.81060606060606e-05, "loss": 2.0166, "step": 426 }, { "epoch": 0.9704545454545455, "grad_norm": 8.922270774841309, "learning_rate": 6.803030303030304e-05, "loss": 1.3486, "step": 427 }, { "epoch": 0.9727272727272728, "grad_norm": 9.383979797363281, "learning_rate": 6.795454545454546e-05, "loss": 1.0425, "step": 428 }, { "epoch": 0.975, "grad_norm": 13.076512336730957, "learning_rate": 6.787878787878789e-05, "loss": 1.7828, "step": 429 }, { "epoch": 0.9772727272727273, "grad_norm": 14.815391540527344, "learning_rate": 6.78030303030303e-05, "loss": 1.893, "step": 430 }, { "epoch": 0.9795454545454545, "grad_norm": 10.523706436157227, "learning_rate": 6.772727272727273e-05, "loss": 1.5307, "step": 431 }, { "epoch": 0.9818181818181818, "grad_norm": 16.938919067382812, "learning_rate": 6.765151515151515e-05, "loss": 1.9001, "step": 432 }, { "epoch": 0.9840909090909091, "grad_norm": 11.781875610351562, "learning_rate": 6.757575757575758e-05, "loss": 2.183, "step": 433 }, { "epoch": 0.9863636363636363, "grad_norm": 14.539305686950684, "learning_rate": 6.750000000000001e-05, "loss": 2.2021, "step": 434 }, { 
"epoch": 0.9886363636363636, "grad_norm": 15.532546997070312, "learning_rate": 6.742424242424242e-05, "loss": 2.1856, "step": 435 }, { "epoch": 0.990909090909091, "grad_norm": 12.917964935302734, "learning_rate": 6.734848484848485e-05, "loss": 2.8732, "step": 436 }, { "epoch": 0.9931818181818182, "grad_norm": 12.498353958129883, "learning_rate": 6.727272727272727e-05, "loss": 1.9246, "step": 437 }, { "epoch": 0.9954545454545455, "grad_norm": 14.181402206420898, "learning_rate": 6.71969696969697e-05, "loss": 2.3863, "step": 438 }, { "epoch": 0.9977272727272727, "grad_norm": 12.139135360717773, "learning_rate": 6.712121212121213e-05, "loss": 2.5505, "step": 439 }, { "epoch": 1.0, "grad_norm": 18.971040725708008, "learning_rate": 6.704545454545455e-05, "loss": 2.3566, "step": 440 }, { "epoch": 1.0, "eval_f1": 0.8942, "eval_gen_len": 41.6727, "eval_loss": 1.852333426475525, "eval_precision": 0.8938, "eval_recall": 0.8947, "eval_rouge1": 0.4801, "eval_rouge2": 0.2302, "eval_rougeL": 0.4078, "eval_rougeLsum": 0.4472, "eval_runtime": 28.5976, "eval_samples_per_second": 3.846, "eval_steps_per_second": 0.49, "step": 440 }, { "epoch": 1.0022727272727272, "grad_norm": 9.610616683959961, "learning_rate": 6.696969696969696e-05, "loss": 1.3656, "step": 441 }, { "epoch": 1.0045454545454546, "grad_norm": 13.653773307800293, "learning_rate": 6.68939393939394e-05, "loss": 3.0115, "step": 442 }, { "epoch": 1.0068181818181818, "grad_norm": 10.243281364440918, "learning_rate": 6.681818181818183e-05, "loss": 1.7598, "step": 443 }, { "epoch": 1.009090909090909, "grad_norm": 12.79389762878418, "learning_rate": 6.674242424242425e-05, "loss": 1.7768, "step": 444 }, { "epoch": 1.0113636363636365, "grad_norm": 8.748100280761719, "learning_rate": 6.666666666666667e-05, "loss": 1.4368, "step": 445 }, { "epoch": 1.0136363636363637, "grad_norm": 9.42500114440918, "learning_rate": 6.659090909090909e-05, "loss": 1.0754, "step": 446 }, { "epoch": 1.0159090909090909, "grad_norm": 11.976570129394531, 
"learning_rate": 6.651515151515152e-05, "loss": 2.07, "step": 447 }, { "epoch": 1.018181818181818, "grad_norm": 9.448553085327148, "learning_rate": 6.643939393939394e-05, "loss": 1.5004, "step": 448 }, { "epoch": 1.0204545454545455, "grad_norm": 10.295342445373535, "learning_rate": 6.636363636363638e-05, "loss": 1.6393, "step": 449 }, { "epoch": 1.0227272727272727, "grad_norm": 9.445040702819824, "learning_rate": 6.628787878787878e-05, "loss": 1.7432, "step": 450 }, { "epoch": 1.025, "grad_norm": 16.851524353027344, "learning_rate": 6.621212121212121e-05, "loss": 2.2318, "step": 451 }, { "epoch": 1.0272727272727273, "grad_norm": 10.721171379089355, "learning_rate": 6.613636363636365e-05, "loss": 1.7857, "step": 452 }, { "epoch": 1.0295454545454545, "grad_norm": 10.074830055236816, "learning_rate": 6.606060606060607e-05, "loss": 1.5901, "step": 453 }, { "epoch": 1.0318181818181817, "grad_norm": 20.14990234375, "learning_rate": 6.598484848484849e-05, "loss": 2.6518, "step": 454 }, { "epoch": 1.0340909090909092, "grad_norm": 10.911235809326172, "learning_rate": 6.59090909090909e-05, "loss": 2.1865, "step": 455 }, { "epoch": 1.0363636363636364, "grad_norm": 18.03226089477539, "learning_rate": 6.583333333333334e-05, "loss": 2.4383, "step": 456 }, { "epoch": 1.0386363636363636, "grad_norm": 9.279253959655762, "learning_rate": 6.575757575757576e-05, "loss": 0.9629, "step": 457 }, { "epoch": 1.040909090909091, "grad_norm": 11.864253997802734, "learning_rate": 6.568181818181819e-05, "loss": 2.1734, "step": 458 }, { "epoch": 1.0431818181818182, "grad_norm": 13.346138954162598, "learning_rate": 6.560606060606061e-05, "loss": 1.4337, "step": 459 }, { "epoch": 1.0454545454545454, "grad_norm": 8.396434783935547, "learning_rate": 6.553030303030303e-05, "loss": 1.54, "step": 460 }, { "epoch": 1.0477272727272728, "grad_norm": 9.705253601074219, "learning_rate": 6.545454545454546e-05, "loss": 1.9016, "step": 461 }, { "epoch": 1.05, "grad_norm": 9.6156005859375, "learning_rate": 
6.537878787878788e-05, "loss": 1.3029, "step": 462 }, { "epoch": 1.0522727272727272, "grad_norm": 16.548994064331055, "learning_rate": 6.530303030303032e-05, "loss": 3.5641, "step": 463 }, { "epoch": 1.0545454545454545, "grad_norm": 11.045211791992188, "learning_rate": 6.522727272727272e-05, "loss": 1.3876, "step": 464 }, { "epoch": 1.0568181818181819, "grad_norm": 10.465343475341797, "learning_rate": 6.515151515151516e-05, "loss": 1.5871, "step": 465 }, { "epoch": 1.059090909090909, "grad_norm": 10.053452491760254, "learning_rate": 6.507575757575757e-05, "loss": 1.4177, "step": 466 }, { "epoch": 1.0613636363636363, "grad_norm": 12.043208122253418, "learning_rate": 6.500000000000001e-05, "loss": 1.5364, "step": 467 }, { "epoch": 1.0636363636363637, "grad_norm": 11.853958129882812, "learning_rate": 6.492424242424243e-05, "loss": 1.3952, "step": 468 }, { "epoch": 1.065909090909091, "grad_norm": 8.25589656829834, "learning_rate": 6.484848484848485e-05, "loss": 1.5497, "step": 469 }, { "epoch": 1.0681818181818181, "grad_norm": 13.430974960327148, "learning_rate": 6.477272727272728e-05, "loss": 2.4184, "step": 470 }, { "epoch": 1.0704545454545455, "grad_norm": 10.576482772827148, "learning_rate": 6.46969696969697e-05, "loss": 1.4223, "step": 471 }, { "epoch": 1.0727272727272728, "grad_norm": 11.786113739013672, "learning_rate": 6.462121212121213e-05, "loss": 2.0499, "step": 472 }, { "epoch": 1.075, "grad_norm": 12.00688362121582, "learning_rate": 6.454545454545455e-05, "loss": 2.9764, "step": 473 }, { "epoch": 1.0772727272727272, "grad_norm": 10.834086418151855, "learning_rate": 6.446969696969697e-05, "loss": 2.0765, "step": 474 }, { "epoch": 1.0795454545454546, "grad_norm": 10.710877418518066, "learning_rate": 6.439393939393939e-05, "loss": 1.4314, "step": 475 }, { "epoch": 1.0818181818181818, "grad_norm": 12.800888061523438, "learning_rate": 6.431818181818182e-05, "loss": 1.4847, "step": 476 }, { "epoch": 1.084090909090909, "grad_norm": 10.365299224853516, 
"learning_rate": 6.424242424242424e-05, "loss": 1.6775, "step": 477 }, { "epoch": 1.0863636363636364, "grad_norm": 10.344579696655273, "learning_rate": 6.416666666666668e-05, "loss": 2.3473, "step": 478 }, { "epoch": 1.0886363636363636, "grad_norm": 13.791784286499023, "learning_rate": 6.40909090909091e-05, "loss": 2.5763, "step": 479 }, { "epoch": 1.0909090909090908, "grad_norm": 13.133481979370117, "learning_rate": 6.401515151515152e-05, "loss": 1.7025, "step": 480 }, { "epoch": 1.0931818181818183, "grad_norm": 13.444737434387207, "learning_rate": 6.393939393939395e-05, "loss": 2.341, "step": 481 }, { "epoch": 1.0954545454545455, "grad_norm": 15.245584487915039, "learning_rate": 6.386363636363637e-05, "loss": 1.929, "step": 482 }, { "epoch": 1.0977272727272727, "grad_norm": 10.724458694458008, "learning_rate": 6.37878787878788e-05, "loss": 1.4099, "step": 483 }, { "epoch": 1.1, "grad_norm": 11.243814468383789, "learning_rate": 6.371212121212121e-05, "loss": 1.5886, "step": 484 }, { "epoch": 1.1022727272727273, "grad_norm": 11.731426239013672, "learning_rate": 6.363636363636364e-05, "loss": 1.9571, "step": 485 }, { "epoch": 1.1045454545454545, "grad_norm": 10.820639610290527, "learning_rate": 6.356060606060606e-05, "loss": 1.113, "step": 486 }, { "epoch": 1.106818181818182, "grad_norm": 14.63482666015625, "learning_rate": 6.34848484848485e-05, "loss": 1.9765, "step": 487 }, { "epoch": 1.1090909090909091, "grad_norm": 12.746257781982422, "learning_rate": 6.340909090909091e-05, "loss": 1.5906, "step": 488 }, { "epoch": 1.1113636363636363, "grad_norm": 14.916450500488281, "learning_rate": 6.333333333333333e-05, "loss": 1.6616, "step": 489 }, { "epoch": 1.1136363636363635, "grad_norm": 11.509872436523438, "learning_rate": 6.325757575757577e-05, "loss": 2.5105, "step": 490 }, { "epoch": 1.115909090909091, "grad_norm": 11.517654418945312, "learning_rate": 6.318181818181818e-05, "loss": 1.3542, "step": 491 }, { "epoch": 1.1181818181818182, "grad_norm": 
13.984039306640625, "learning_rate": 6.310606060606062e-05, "loss": 2.1356, "step": 492 }, { "epoch": 1.1204545454545454, "grad_norm": 13.018148422241211, "learning_rate": 6.303030303030302e-05, "loss": 1.5024, "step": 493 }, { "epoch": 1.1227272727272728, "grad_norm": 13.609540939331055, "learning_rate": 6.295454545454546e-05, "loss": 2.1359, "step": 494 }, { "epoch": 1.125, "grad_norm": 13.505942344665527, "learning_rate": 6.287878787878788e-05, "loss": 2.8486, "step": 495 }, { "epoch": 1.1272727272727272, "grad_norm": 11.420187950134277, "learning_rate": 6.280303030303031e-05, "loss": 1.5044, "step": 496 }, { "epoch": 1.1295454545454546, "grad_norm": 14.127695083618164, "learning_rate": 6.272727272727273e-05, "loss": 2.6676, "step": 497 }, { "epoch": 1.1318181818181818, "grad_norm": 9.813878059387207, "learning_rate": 6.265151515151515e-05, "loss": 1.4169, "step": 498 }, { "epoch": 1.134090909090909, "grad_norm": 9.80479621887207, "learning_rate": 6.257575757575758e-05, "loss": 1.5349, "step": 499 }, { "epoch": 1.1363636363636362, "grad_norm": 10.739019393920898, "learning_rate": 6.25e-05, "loss": 1.5255, "step": 500 }, { "epoch": 1.1386363636363637, "grad_norm": 11.327676773071289, "learning_rate": 6.242424242424243e-05, "loss": 1.3854, "step": 501 }, { "epoch": 1.1409090909090909, "grad_norm": 9.645312309265137, "learning_rate": 6.234848484848485e-05, "loss": 1.6148, "step": 502 }, { "epoch": 1.143181818181818, "grad_norm": 12.285623550415039, "learning_rate": 6.227272727272727e-05, "loss": 1.9336, "step": 503 }, { "epoch": 1.1454545454545455, "grad_norm": 15.579854011535645, "learning_rate": 6.219696969696969e-05, "loss": 2.1064, "step": 504 }, { "epoch": 1.1477272727272727, "grad_norm": 17.76817512512207, "learning_rate": 6.212121212121213e-05, "loss": 1.4266, "step": 505 }, { "epoch": 1.15, "grad_norm": 10.037004470825195, "learning_rate": 6.204545454545455e-05, "loss": 1.5432, "step": 506 }, { "epoch": 1.1522727272727273, "grad_norm": 10.46380615234375, 
"learning_rate": 6.196969696969698e-05, "loss": 2.1057, "step": 507 }, { "epoch": 1.1545454545454545, "grad_norm": 12.883086204528809, "learning_rate": 6.18939393939394e-05, "loss": 2.1955, "step": 508 }, { "epoch": 1.1568181818181817, "grad_norm": 10.667054176330566, "learning_rate": 6.181818181818182e-05, "loss": 1.8041, "step": 509 }, { "epoch": 1.1590909090909092, "grad_norm": 13.076772689819336, "learning_rate": 6.174242424242425e-05, "loss": 1.9923, "step": 510 }, { "epoch": 1.1613636363636364, "grad_norm": 13.195068359375, "learning_rate": 6.166666666666667e-05, "loss": 2.2575, "step": 511 }, { "epoch": 1.1636363636363636, "grad_norm": 25.86856460571289, "learning_rate": 6.15909090909091e-05, "loss": 0.9713, "step": 512 }, { "epoch": 1.165909090909091, "grad_norm": 13.29697322845459, "learning_rate": 6.151515151515151e-05, "loss": 1.9724, "step": 513 }, { "epoch": 1.1681818181818182, "grad_norm": 11.164151191711426, "learning_rate": 6.143939393939394e-05, "loss": 1.7574, "step": 514 }, { "epoch": 1.1704545454545454, "grad_norm": 11.621664047241211, "learning_rate": 6.136363636363636e-05, "loss": 2.0349, "step": 515 }, { "epoch": 1.1727272727272728, "grad_norm": 13.135611534118652, "learning_rate": 6.12878787878788e-05, "loss": 2.1065, "step": 516 }, { "epoch": 1.175, "grad_norm": 13.730208396911621, "learning_rate": 6.121212121212121e-05, "loss": 2.2205, "step": 517 }, { "epoch": 1.1772727272727272, "grad_norm": 11.453598022460938, "learning_rate": 6.113636363636363e-05, "loss": 2.2924, "step": 518 }, { "epoch": 1.1795454545454545, "grad_norm": 10.924808502197266, "learning_rate": 6.106060606060607e-05, "loss": 1.2283, "step": 519 }, { "epoch": 1.1818181818181819, "grad_norm": 16.08315658569336, "learning_rate": 6.098484848484849e-05, "loss": 2.5927, "step": 520 }, { "epoch": 1.184090909090909, "grad_norm": 8.260347366333008, "learning_rate": 6.090909090909091e-05, "loss": 1.3534, "step": 521 }, { "epoch": 1.1863636363636363, "grad_norm": 12.075833320617676, 
"learning_rate": 6.083333333333333e-05, "loss": 2.0813, "step": 522 }, { "epoch": 1.1886363636363637, "grad_norm": 10.575677871704102, "learning_rate": 6.075757575757576e-05, "loss": 1.4781, "step": 523 }, { "epoch": 1.190909090909091, "grad_norm": 12.236503601074219, "learning_rate": 6.0681818181818185e-05, "loss": 2.003, "step": 524 }, { "epoch": 1.1931818181818181, "grad_norm": 12.172025680541992, "learning_rate": 6.060606060606061e-05, "loss": 1.4951, "step": 525 }, { "epoch": 1.1954545454545455, "grad_norm": 12.456896781921387, "learning_rate": 6.053030303030304e-05, "loss": 1.8737, "step": 526 }, { "epoch": 1.1977272727272728, "grad_norm": 13.824838638305664, "learning_rate": 6.045454545454545e-05, "loss": 1.7923, "step": 527 }, { "epoch": 1.2, "grad_norm": 10.863786697387695, "learning_rate": 6.037878787878788e-05, "loss": 2.144, "step": 528 }, { "epoch": 1.2022727272727272, "grad_norm": 17.319700241088867, "learning_rate": 6.03030303030303e-05, "loss": 1.9297, "step": 529 }, { "epoch": 1.2045454545454546, "grad_norm": 8.89411449432373, "learning_rate": 6.022727272727273e-05, "loss": 1.3583, "step": 530 }, { "epoch": 1.2068181818181818, "grad_norm": 16.971437454223633, "learning_rate": 6.0151515151515156e-05, "loss": 2.5184, "step": 531 }, { "epoch": 1.209090909090909, "grad_norm": 11.486995697021484, "learning_rate": 6.0075757575757575e-05, "loss": 1.5296, "step": 532 }, { "epoch": 1.2113636363636364, "grad_norm": 17.541278839111328, "learning_rate": 6e-05, "loss": 2.108, "step": 533 }, { "epoch": 1.2136363636363636, "grad_norm": 13.599751472473145, "learning_rate": 5.992424242424243e-05, "loss": 2.0622, "step": 534 }, { "epoch": 1.2159090909090908, "grad_norm": 10.884852409362793, "learning_rate": 5.9848484848484854e-05, "loss": 1.5018, "step": 535 }, { "epoch": 1.2181818181818183, "grad_norm": 10.407668113708496, "learning_rate": 5.977272727272728e-05, "loss": 1.5013, "step": 536 }, { "epoch": 1.2204545454545455, "grad_norm": 9.911277770996094, 
"learning_rate": 5.969696969696969e-05, "loss": 1.9855, "step": 537 }, { "epoch": 1.2227272727272727, "grad_norm": 11.939435958862305, "learning_rate": 5.962121212121212e-05, "loss": 2.359, "step": 538 }, { "epoch": 1.225, "grad_norm": 11.17503547668457, "learning_rate": 5.9545454545454546e-05, "loss": 1.4952, "step": 539 }, { "epoch": 1.2272727272727273, "grad_norm": 15.073485374450684, "learning_rate": 5.946969696969697e-05, "loss": 2.1802, "step": 540 }, { "epoch": 1.2295454545454545, "grad_norm": 12.413151741027832, "learning_rate": 5.93939393939394e-05, "loss": 1.9444, "step": 541 }, { "epoch": 1.231818181818182, "grad_norm": 12.741022109985352, "learning_rate": 5.931818181818182e-05, "loss": 1.894, "step": 542 }, { "epoch": 1.2340909090909091, "grad_norm": 11.041027069091797, "learning_rate": 5.9242424242424244e-05, "loss": 1.748, "step": 543 }, { "epoch": 1.2363636363636363, "grad_norm": 10.045198440551758, "learning_rate": 5.916666666666667e-05, "loss": 1.7848, "step": 544 }, { "epoch": 1.2386363636363638, "grad_norm": 10.759014129638672, "learning_rate": 5.90909090909091e-05, "loss": 1.7836, "step": 545 }, { "epoch": 1.240909090909091, "grad_norm": 10.296431541442871, "learning_rate": 5.901515151515152e-05, "loss": 1.088, "step": 546 }, { "epoch": 1.2431818181818182, "grad_norm": 11.159008026123047, "learning_rate": 5.8939393939393936e-05, "loss": 1.3126, "step": 547 }, { "epoch": 1.2454545454545454, "grad_norm": 7.6021270751953125, "learning_rate": 5.886363636363636e-05, "loss": 1.137, "step": 548 }, { "epoch": 1.2477272727272728, "grad_norm": 11.449591636657715, "learning_rate": 5.878787878787879e-05, "loss": 1.7471, "step": 549 }, { "epoch": 1.25, "grad_norm": 14.451662063598633, "learning_rate": 5.871212121212122e-05, "loss": 2.014, "step": 550 }, { "epoch": 1.2522727272727272, "grad_norm": 11.24593448638916, "learning_rate": 5.8636363636363634e-05, "loss": 1.5885, "step": 551 }, { "epoch": 1.2545454545454544, "grad_norm": 10.326696395874023, 
"learning_rate": 5.856060606060606e-05, "loss": 1.5146, "step": 552 }, { "epoch": 1.2568181818181818, "grad_norm": 11.736088752746582, "learning_rate": 5.848484848484849e-05, "loss": 2.1627, "step": 553 }, { "epoch": 1.259090909090909, "grad_norm": 14.25733757019043, "learning_rate": 5.840909090909091e-05, "loss": 1.8419, "step": 554 }, { "epoch": 1.2613636363636362, "grad_norm": 10.154618263244629, "learning_rate": 5.833333333333334e-05, "loss": 1.8319, "step": 555 }, { "epoch": 1.2636363636363637, "grad_norm": 14.464015007019043, "learning_rate": 5.825757575757575e-05, "loss": 1.7117, "step": 556 }, { "epoch": 1.2659090909090909, "grad_norm": 9.713830947875977, "learning_rate": 5.818181818181818e-05, "loss": 1.4495, "step": 557 }, { "epoch": 1.268181818181818, "grad_norm": 21.958648681640625, "learning_rate": 5.810606060606061e-05, "loss": 3.0762, "step": 558 }, { "epoch": 1.2704545454545455, "grad_norm": 11.349808692932129, "learning_rate": 5.803030303030304e-05, "loss": 1.9419, "step": 559 }, { "epoch": 1.2727272727272727, "grad_norm": 12.586771965026855, "learning_rate": 5.7954545454545464e-05, "loss": 2.1826, "step": 560 }, { "epoch": 1.275, "grad_norm": 10.261626243591309, "learning_rate": 5.787878787878788e-05, "loss": 2.0422, "step": 561 }, { "epoch": 1.2772727272727273, "grad_norm": 11.65180492401123, "learning_rate": 5.78030303030303e-05, "loss": 1.5295, "step": 562 }, { "epoch": 1.2795454545454545, "grad_norm": 12.369877815246582, "learning_rate": 5.772727272727273e-05, "loss": 1.8935, "step": 563 }, { "epoch": 1.2818181818181817, "grad_norm": 10.670714378356934, "learning_rate": 5.7651515151515156e-05, "loss": 2.0215, "step": 564 }, { "epoch": 1.2840909090909092, "grad_norm": 13.76659870147705, "learning_rate": 5.757575757575758e-05, "loss": 2.2982, "step": 565 }, { "epoch": 1.2863636363636364, "grad_norm": 9.004195213317871, "learning_rate": 5.7499999999999995e-05, "loss": 1.6066, "step": 566 }, { "epoch": 1.2886363636363636, "grad_norm": 
10.873322486877441, "learning_rate": 5.742424242424243e-05, "loss": 1.5999, "step": 567 }, { "epoch": 1.290909090909091, "grad_norm": 11.641073226928711, "learning_rate": 5.7348484848484854e-05, "loss": 1.3272, "step": 568 }, { "epoch": 1.2931818181818182, "grad_norm": 9.68420124053955, "learning_rate": 5.727272727272728e-05, "loss": 1.413, "step": 569 }, { "epoch": 1.2954545454545454, "grad_norm": 13.477838516235352, "learning_rate": 5.719696969696971e-05, "loss": 2.5129, "step": 570 }, { "epoch": 1.2977272727272728, "grad_norm": 11.720010757446289, "learning_rate": 5.712121212121212e-05, "loss": 2.1576, "step": 571 }, { "epoch": 1.3, "grad_norm": 13.136527061462402, "learning_rate": 5.7045454545454546e-05, "loss": 1.9311, "step": 572 }, { "epoch": 1.3022727272727272, "grad_norm": 8.095415115356445, "learning_rate": 5.696969696969697e-05, "loss": 0.8927, "step": 573 }, { "epoch": 1.3045454545454547, "grad_norm": 11.233893394470215, "learning_rate": 5.68939393939394e-05, "loss": 2.0108, "step": 574 }, { "epoch": 1.3068181818181819, "grad_norm": 11.203099250793457, "learning_rate": 5.6818181818181825e-05, "loss": 1.9241, "step": 575 }, { "epoch": 1.309090909090909, "grad_norm": 9.640209197998047, "learning_rate": 5.6742424242424244e-05, "loss": 1.3841, "step": 576 }, { "epoch": 1.3113636363636363, "grad_norm": 10.882938385009766, "learning_rate": 5.666666666666667e-05, "loss": 1.4184, "step": 577 }, { "epoch": 1.3136363636363637, "grad_norm": 10.470818519592285, "learning_rate": 5.65909090909091e-05, "loss": 2.0096, "step": 578 }, { "epoch": 1.315909090909091, "grad_norm": 12.759695053100586, "learning_rate": 5.651515151515152e-05, "loss": 2.138, "step": 579 }, { "epoch": 1.3181818181818181, "grad_norm": 26.707128524780273, "learning_rate": 5.643939393939395e-05, "loss": 2.8215, "step": 580 }, { "epoch": 1.3204545454545453, "grad_norm": 11.116402626037598, "learning_rate": 5.636363636363636e-05, "loss": 2.4158, "step": 581 }, { "epoch": 1.3227272727272728, 
"grad_norm": 14.136595726013184, "learning_rate": 5.628787878787879e-05, "loss": 1.6545, "step": 582 }, { "epoch": 1.325, "grad_norm": 11.88375473022461, "learning_rate": 5.6212121212121215e-05, "loss": 2.1069, "step": 583 }, { "epoch": 1.3272727272727272, "grad_norm": 11.863356590270996, "learning_rate": 5.613636363636364e-05, "loss": 1.535, "step": 584 }, { "epoch": 1.3295454545454546, "grad_norm": 11.284381866455078, "learning_rate": 5.606060606060606e-05, "loss": 2.3407, "step": 585 }, { "epoch": 1.3318181818181818, "grad_norm": 11.79831600189209, "learning_rate": 5.598484848484849e-05, "loss": 1.6409, "step": 586 }, { "epoch": 1.334090909090909, "grad_norm": 11.130000114440918, "learning_rate": 5.5909090909090913e-05, "loss": 1.6426, "step": 587 }, { "epoch": 1.3363636363636364, "grad_norm": 9.1551513671875, "learning_rate": 5.583333333333334e-05, "loss": 1.8466, "step": 588 }, { "epoch": 1.3386363636363636, "grad_norm": 14.405865669250488, "learning_rate": 5.5757575757575766e-05, "loss": 2.066, "step": 589 }, { "epoch": 1.3409090909090908, "grad_norm": 53.46037673950195, "learning_rate": 5.568181818181818e-05, "loss": 2.4224, "step": 590 }, { "epoch": 1.3431818181818183, "grad_norm": 11.6724271774292, "learning_rate": 5.5606060606060605e-05, "loss": 1.7148, "step": 591 }, { "epoch": 1.3454545454545455, "grad_norm": 15.849516868591309, "learning_rate": 5.553030303030303e-05, "loss": 2.0981, "step": 592 }, { "epoch": 1.3477272727272727, "grad_norm": 13.421188354492188, "learning_rate": 5.545454545454546e-05, "loss": 1.722, "step": 593 }, { "epoch": 1.35, "grad_norm": 14.319283485412598, "learning_rate": 5.5378787878787884e-05, "loss": 1.7284, "step": 594 }, { "epoch": 1.3522727272727273, "grad_norm": 12.210022926330566, "learning_rate": 5.5303030303030304e-05, "loss": 1.4507, "step": 595 }, { "epoch": 1.3545454545454545, "grad_norm": 11.60317325592041, "learning_rate": 5.522727272727273e-05, "loss": 1.7749, "step": 596 }, { "epoch": 1.356818181818182, 
"grad_norm": 12.895737648010254, "learning_rate": 5.5151515151515156e-05, "loss": 1.5555, "step": 597 }, { "epoch": 1.3590909090909091, "grad_norm": 11.198805809020996, "learning_rate": 5.507575757575758e-05, "loss": 1.7624, "step": 598 }, { "epoch": 1.3613636363636363, "grad_norm": 13.309189796447754, "learning_rate": 5.500000000000001e-05, "loss": 1.8765, "step": 599 }, { "epoch": 1.3636363636363638, "grad_norm": 10.177202224731445, "learning_rate": 5.492424242424242e-05, "loss": 1.1895, "step": 600 }, { "epoch": 1.365909090909091, "grad_norm": 11.205484390258789, "learning_rate": 5.484848484848485e-05, "loss": 1.1661, "step": 601 }, { "epoch": 1.3681818181818182, "grad_norm": 12.091497421264648, "learning_rate": 5.4772727272727274e-05, "loss": 1.9972, "step": 602 }, { "epoch": 1.3704545454545456, "grad_norm": 11.2894926071167, "learning_rate": 5.46969696969697e-05, "loss": 1.7121, "step": 603 }, { "epoch": 1.3727272727272728, "grad_norm": 15.034446716308594, "learning_rate": 5.462121212121213e-05, "loss": 2.8078, "step": 604 }, { "epoch": 1.375, "grad_norm": 8.075346946716309, "learning_rate": 5.4545454545454546e-05, "loss": 1.0453, "step": 605 }, { "epoch": 1.3772727272727272, "grad_norm": 10.377656936645508, "learning_rate": 5.446969696969697e-05, "loss": 1.7973, "step": 606 }, { "epoch": 1.3795454545454544, "grad_norm": 10.147284507751465, "learning_rate": 5.43939393939394e-05, "loss": 2.1848, "step": 607 }, { "epoch": 1.3818181818181818, "grad_norm": 11.856623649597168, "learning_rate": 5.4318181818181825e-05, "loss": 1.9857, "step": 608 }, { "epoch": 1.384090909090909, "grad_norm": 10.355262756347656, "learning_rate": 5.424242424242425e-05, "loss": 1.4383, "step": 609 }, { "epoch": 1.3863636363636362, "grad_norm": 9.085455894470215, "learning_rate": 5.4166666666666664e-05, "loss": 1.382, "step": 610 }, { "epoch": 1.3886363636363637, "grad_norm": 13.221922874450684, "learning_rate": 5.409090909090909e-05, "loss": 2.3278, "step": 611 }, { "epoch": 
1.3909090909090909, "grad_norm": 14.725556373596191, "learning_rate": 5.401515151515152e-05, "loss": 2.0181, "step": 612 }, { "epoch": 1.393181818181818, "grad_norm": 11.90503978729248, "learning_rate": 5.393939393939394e-05, "loss": 2.5601, "step": 613 }, { "epoch": 1.3954545454545455, "grad_norm": 10.583837509155273, "learning_rate": 5.386363636363637e-05, "loss": 1.4886, "step": 614 }, { "epoch": 1.3977272727272727, "grad_norm": 12.369796752929688, "learning_rate": 5.378787878787879e-05, "loss": 1.2716, "step": 615 }, { "epoch": 1.4, "grad_norm": 12.412566184997559, "learning_rate": 5.3712121212121215e-05, "loss": 2.0391, "step": 616 }, { "epoch": 1.4022727272727273, "grad_norm": 12.033483505249023, "learning_rate": 5.363636363636364e-05, "loss": 1.2044, "step": 617 }, { "epoch": 1.4045454545454545, "grad_norm": 11.291866302490234, "learning_rate": 5.356060606060607e-05, "loss": 2.3266, "step": 618 }, { "epoch": 1.4068181818181817, "grad_norm": 17.745227813720703, "learning_rate": 5.348484848484848e-05, "loss": 1.7097, "step": 619 }, { "epoch": 1.4090909090909092, "grad_norm": 11.858403205871582, "learning_rate": 5.340909090909091e-05, "loss": 1.9088, "step": 620 }, { "epoch": 1.4113636363636364, "grad_norm": 14.968146324157715, "learning_rate": 5.333333333333333e-05, "loss": 2.009, "step": 621 }, { "epoch": 1.4136363636363636, "grad_norm": 13.16178035736084, "learning_rate": 5.325757575757576e-05, "loss": 1.6262, "step": 622 }, { "epoch": 1.415909090909091, "grad_norm": 11.63772201538086, "learning_rate": 5.3181818181818186e-05, "loss": 1.481, "step": 623 }, { "epoch": 1.4181818181818182, "grad_norm": 13.266715049743652, "learning_rate": 5.3106060606060605e-05, "loss": 2.3015, "step": 624 }, { "epoch": 1.4204545454545454, "grad_norm": 11.690614700317383, "learning_rate": 5.303030303030303e-05, "loss": 1.7226, "step": 625 }, { "epoch": 1.4227272727272728, "grad_norm": 10.599973678588867, "learning_rate": 5.295454545454546e-05, "loss": 1.0261, "step": 626 }, { 
"epoch": 1.425, "grad_norm": 17.117259979248047, "learning_rate": 5.2878787878787884e-05, "loss": 1.7164, "step": 627 }, { "epoch": 1.4272727272727272, "grad_norm": 11.62483024597168, "learning_rate": 5.280303030303031e-05, "loss": 1.3686, "step": 628 }, { "epoch": 1.4295454545454547, "grad_norm": 10.503996849060059, "learning_rate": 5.272727272727272e-05, "loss": 1.6085, "step": 629 }, { "epoch": 1.4318181818181819, "grad_norm": 14.493663787841797, "learning_rate": 5.265151515151515e-05, "loss": 2.0943, "step": 630 }, { "epoch": 1.434090909090909, "grad_norm": 11.125360488891602, "learning_rate": 5.2575757575757576e-05, "loss": 1.8284, "step": 631 }, { "epoch": 1.4363636363636363, "grad_norm": 10.438358306884766, "learning_rate": 5.25e-05, "loss": 2.1436, "step": 632 }, { "epoch": 1.4386363636363637, "grad_norm": 13.013614654541016, "learning_rate": 5.242424242424243e-05, "loss": 1.6999, "step": 633 }, { "epoch": 1.440909090909091, "grad_norm": 14.21478271484375, "learning_rate": 5.234848484848485e-05, "loss": 3.268, "step": 634 }, { "epoch": 1.4431818181818181, "grad_norm": 10.756131172180176, "learning_rate": 5.2272727272727274e-05, "loss": 1.1294, "step": 635 }, { "epoch": 1.4454545454545453, "grad_norm": 14.409692764282227, "learning_rate": 5.21969696969697e-05, "loss": 1.391, "step": 636 }, { "epoch": 1.4477272727272728, "grad_norm": 9.839500427246094, "learning_rate": 5.212121212121213e-05, "loss": 1.4028, "step": 637 }, { "epoch": 1.45, "grad_norm": 13.601579666137695, "learning_rate": 5.204545454545455e-05, "loss": 1.6384, "step": 638 }, { "epoch": 1.4522727272727272, "grad_norm": 12.721500396728516, "learning_rate": 5.1969696969696966e-05, "loss": 1.9382, "step": 639 }, { "epoch": 1.4545454545454546, "grad_norm": 11.373588562011719, "learning_rate": 5.189393939393939e-05, "loss": 2.7324, "step": 640 }, { "epoch": 1.4568181818181818, "grad_norm": 11.873559951782227, "learning_rate": 5.181818181818182e-05, "loss": 1.6583, "step": 641 }, { "epoch": 
1.459090909090909, "grad_norm": 10.649148941040039, "learning_rate": 5.1742424242424245e-05, "loss": 1.7733, "step": 642 }, { "epoch": 1.4613636363636364, "grad_norm": 12.14698314666748, "learning_rate": 5.166666666666667e-05, "loss": 1.6434, "step": 643 }, { "epoch": 1.4636363636363636, "grad_norm": 9.80806827545166, "learning_rate": 5.159090909090909e-05, "loss": 1.9463, "step": 644 }, { "epoch": 1.4659090909090908, "grad_norm": 7.273732662200928, "learning_rate": 5.151515151515152e-05, "loss": 0.8156, "step": 645 }, { "epoch": 1.4681818181818183, "grad_norm": 12.560272216796875, "learning_rate": 5.143939393939394e-05, "loss": 2.2347, "step": 646 }, { "epoch": 1.4704545454545455, "grad_norm": 10.116893768310547, "learning_rate": 5.136363636363637e-05, "loss": 1.2157, "step": 647 }, { "epoch": 1.4727272727272727, "grad_norm": 11.09861946105957, "learning_rate": 5.1287878787878796e-05, "loss": 1.2521, "step": 648 }, { "epoch": 1.475, "grad_norm": 11.454336166381836, "learning_rate": 5.121212121212121e-05, "loss": 1.6148, "step": 649 }, { "epoch": 1.4772727272727273, "grad_norm": 11.669930458068848, "learning_rate": 5.1136363636363635e-05, "loss": 2.4559, "step": 650 }, { "epoch": 1.4795454545454545, "grad_norm": 10.853449821472168, "learning_rate": 5.106060606060606e-05, "loss": 1.6519, "step": 651 }, { "epoch": 1.481818181818182, "grad_norm": 23.87467384338379, "learning_rate": 5.098484848484849e-05, "loss": 3.9198, "step": 652 }, { "epoch": 1.4840909090909091, "grad_norm": 15.731586456298828, "learning_rate": 5.090909090909091e-05, "loss": 2.4425, "step": 653 }, { "epoch": 1.4863636363636363, "grad_norm": 10.91791820526123, "learning_rate": 5.0833333333333333e-05, "loss": 1.4977, "step": 654 }, { "epoch": 1.4886363636363638, "grad_norm": 11.515501022338867, "learning_rate": 5.075757575757576e-05, "loss": 1.4377, "step": 655 }, { "epoch": 1.490909090909091, "grad_norm": 9.79021167755127, "learning_rate": 5.0681818181818186e-05, "loss": 1.208, "step": 656 }, { 
"epoch": 1.4931818181818182, "grad_norm": 7.424502849578857, "learning_rate": 5.060606060606061e-05, "loss": 1.368, "step": 657 }, { "epoch": 1.4954545454545456, "grad_norm": 9.132887840270996, "learning_rate": 5.0530303030303025e-05, "loss": 1.0296, "step": 658 }, { "epoch": 1.4977272727272728, "grad_norm": 14.063539505004883, "learning_rate": 5.045454545454545e-05, "loss": 1.9923, "step": 659 }, { "epoch": 1.5, "grad_norm": 10.994144439697266, "learning_rate": 5.037878787878788e-05, "loss": 1.5963, "step": 660 }, { "epoch": 1.5022727272727274, "grad_norm": 11.193540573120117, "learning_rate": 5.030303030303031e-05, "loss": 2.6418, "step": 661 }, { "epoch": 1.5045454545454544, "grad_norm": 11.344916343688965, "learning_rate": 5.022727272727274e-05, "loss": 0.9847, "step": 662 }, { "epoch": 1.5068181818181818, "grad_norm": 16.028928756713867, "learning_rate": 5.015151515151515e-05, "loss": 2.7095, "step": 663 }, { "epoch": 1.509090909090909, "grad_norm": 10.2492036819458, "learning_rate": 5.0075757575757576e-05, "loss": 1.4351, "step": 664 }, { "epoch": 1.5113636363636362, "grad_norm": 12.819211959838867, "learning_rate": 5e-05, "loss": 2.2236, "step": 665 }, { "epoch": 1.5136363636363637, "grad_norm": 9.43850326538086, "learning_rate": 4.992424242424243e-05, "loss": 0.988, "step": 666 }, { "epoch": 1.5159090909090909, "grad_norm": 12.35922622680664, "learning_rate": 4.984848484848485e-05, "loss": 1.9395, "step": 667 }, { "epoch": 1.518181818181818, "grad_norm": 12.175325393676758, "learning_rate": 4.9772727272727275e-05, "loss": 2.0219, "step": 668 }, { "epoch": 1.5204545454545455, "grad_norm": 16.44111442565918, "learning_rate": 4.9696969696969694e-05, "loss": 1.7191, "step": 669 }, { "epoch": 1.5227272727272727, "grad_norm": 12.413610458374023, "learning_rate": 4.962121212121213e-05, "loss": 2.2003, "step": 670 }, { "epoch": 1.525, "grad_norm": 7.922098159790039, "learning_rate": 4.9545454545454553e-05, "loss": 1.1514, "step": 671 }, { "epoch": 
1.5272727272727273, "grad_norm": 11.402259826660156, "learning_rate": 4.946969696969697e-05, "loss": 1.6611, "step": 672 }, { "epoch": 1.5295454545454545, "grad_norm": 10.548962593078613, "learning_rate": 4.93939393939394e-05, "loss": 1.6242, "step": 673 }, { "epoch": 1.5318181818181817, "grad_norm": 14.536432266235352, "learning_rate": 4.931818181818182e-05, "loss": 2.2415, "step": 674 }, { "epoch": 1.5340909090909092, "grad_norm": 12.954751014709473, "learning_rate": 4.9242424242424245e-05, "loss": 1.8463, "step": 675 }, { "epoch": 1.5363636363636364, "grad_norm": 12.143820762634277, "learning_rate": 4.9166666666666665e-05, "loss": 1.97, "step": 676 }, { "epoch": 1.5386363636363636, "grad_norm": 10.134570121765137, "learning_rate": 4.909090909090909e-05, "loss": 0.9264, "step": 677 }, { "epoch": 1.540909090909091, "grad_norm": 12.558758735656738, "learning_rate": 4.901515151515152e-05, "loss": 1.4608, "step": 678 }, { "epoch": 1.5431818181818182, "grad_norm": 10.165045738220215, "learning_rate": 4.8939393939393944e-05, "loss": 1.3453, "step": 679 }, { "epoch": 1.5454545454545454, "grad_norm": 11.995816230773926, "learning_rate": 4.886363636363637e-05, "loss": 2.1228, "step": 680 }, { "epoch": 1.5477272727272728, "grad_norm": 10.822747230529785, "learning_rate": 4.878787878787879e-05, "loss": 2.0378, "step": 681 }, { "epoch": 1.55, "grad_norm": 16.348892211914062, "learning_rate": 4.8712121212121216e-05, "loss": 1.7209, "step": 682 }, { "epoch": 1.5522727272727272, "grad_norm": 9.395282745361328, "learning_rate": 4.863636363636364e-05, "loss": 1.4529, "step": 683 }, { "epoch": 1.5545454545454547, "grad_norm": 16.89964485168457, "learning_rate": 4.856060606060606e-05, "loss": 2.8833, "step": 684 }, { "epoch": 1.5568181818181817, "grad_norm": 10.703327178955078, "learning_rate": 4.848484848484849e-05, "loss": 1.7938, "step": 685 }, { "epoch": 1.559090909090909, "grad_norm": 19.770193099975586, "learning_rate": 4.840909090909091e-05, "loss": 1.6041, "step": 686 }, { 
"epoch": 1.5613636363636365, "grad_norm": 11.777501106262207, "learning_rate": 4.8333333333333334e-05, "loss": 2.0716, "step": 687 }, { "epoch": 1.5636363636363635, "grad_norm": 10.248165130615234, "learning_rate": 4.825757575757576e-05, "loss": 1.5853, "step": 688 }, { "epoch": 1.565909090909091, "grad_norm": 10.732747077941895, "learning_rate": 4.8181818181818186e-05, "loss": 1.2683, "step": 689 }, { "epoch": 1.5681818181818183, "grad_norm": 11.304749488830566, "learning_rate": 4.810606060606061e-05, "loss": 2.2432, "step": 690 }, { "epoch": 1.5704545454545453, "grad_norm": 13.820841789245605, "learning_rate": 4.803030303030303e-05, "loss": 1.8117, "step": 691 }, { "epoch": 1.5727272727272728, "grad_norm": 9.33556079864502, "learning_rate": 4.795454545454546e-05, "loss": 1.0837, "step": 692 }, { "epoch": 1.575, "grad_norm": 13.970429420471191, "learning_rate": 4.787878787878788e-05, "loss": 2.5927, "step": 693 }, { "epoch": 1.5772727272727272, "grad_norm": 10.840149879455566, "learning_rate": 4.7803030303030304e-05, "loss": 1.8707, "step": 694 }, { "epoch": 1.5795454545454546, "grad_norm": 11.14415168762207, "learning_rate": 4.772727272727273e-05, "loss": 1.6668, "step": 695 }, { "epoch": 1.5818181818181818, "grad_norm": 14.185403823852539, "learning_rate": 4.765151515151515e-05, "loss": 1.6091, "step": 696 }, { "epoch": 1.584090909090909, "grad_norm": 13.565306663513184, "learning_rate": 4.7575757575757576e-05, "loss": 1.8229, "step": 697 }, { "epoch": 1.5863636363636364, "grad_norm": 14.329642295837402, "learning_rate": 4.75e-05, "loss": 1.9366, "step": 698 }, { "epoch": 1.5886363636363636, "grad_norm": 12.332931518554688, "learning_rate": 4.742424242424243e-05, "loss": 1.683, "step": 699 }, { "epoch": 1.5909090909090908, "grad_norm": 10.493454933166504, "learning_rate": 4.7348484848484855e-05, "loss": 1.8994, "step": 700 }, { "epoch": 1.5931818181818183, "grad_norm": 11.809647560119629, "learning_rate": 4.7272727272727275e-05, "loss": 1.509, "step": 701 }, { 
"epoch": 1.5954545454545455, "grad_norm": 12.72128963470459, "learning_rate": 4.71969696969697e-05, "loss": 2.1266, "step": 702 }, { "epoch": 1.5977272727272727, "grad_norm": 13.074295043945312, "learning_rate": 4.712121212121212e-05, "loss": 1.6113, "step": 703 }, { "epoch": 1.6, "grad_norm": 10.254904747009277, "learning_rate": 4.704545454545455e-05, "loss": 2.2737, "step": 704 }, { "epoch": 1.6022727272727273, "grad_norm": 24.574390411376953, "learning_rate": 4.696969696969697e-05, "loss": 2.2779, "step": 705 }, { "epoch": 1.6045454545454545, "grad_norm": 10.441598892211914, "learning_rate": 4.689393939393939e-05, "loss": 1.8209, "step": 706 }, { "epoch": 1.606818181818182, "grad_norm": 12.4207763671875, "learning_rate": 4.681818181818182e-05, "loss": 1.5389, "step": 707 }, { "epoch": 1.6090909090909091, "grad_norm": 15.072708129882812, "learning_rate": 4.6742424242424245e-05, "loss": 1.3703, "step": 708 }, { "epoch": 1.6113636363636363, "grad_norm": 11.555070877075195, "learning_rate": 4.666666666666667e-05, "loss": 1.9363, "step": 709 }, { "epoch": 1.6136363636363638, "grad_norm": 13.27509593963623, "learning_rate": 4.659090909090909e-05, "loss": 1.4334, "step": 710 }, { "epoch": 1.615909090909091, "grad_norm": 12.357429504394531, "learning_rate": 4.651515151515152e-05, "loss": 2.3112, "step": 711 }, { "epoch": 1.6181818181818182, "grad_norm": 19.84957504272461, "learning_rate": 4.6439393939393944e-05, "loss": 1.1851, "step": 712 }, { "epoch": 1.6204545454545456, "grad_norm": 10.689920425415039, "learning_rate": 4.636363636363636e-05, "loss": 1.921, "step": 713 }, { "epoch": 1.6227272727272726, "grad_norm": 10.688066482543945, "learning_rate": 4.628787878787879e-05, "loss": 1.2294, "step": 714 }, { "epoch": 1.625, "grad_norm": 11.80333423614502, "learning_rate": 4.621212121212121e-05, "loss": 2.5255, "step": 715 }, { "epoch": 1.6272727272727274, "grad_norm": 11.181013107299805, "learning_rate": 4.6136363636363635e-05, "loss": 1.2692, "step": 716 }, { "epoch": 
1.6295454545454544, "grad_norm": 11.557047843933105, "learning_rate": 4.606060606060607e-05, "loss": 1.4575, "step": 717 }, { "epoch": 1.6318181818181818, "grad_norm": 13.798693656921387, "learning_rate": 4.598484848484849e-05, "loss": 2.3197, "step": 718 }, { "epoch": 1.634090909090909, "grad_norm": 8.890710830688477, "learning_rate": 4.5909090909090914e-05, "loss": 1.5266, "step": 719 }, { "epoch": 1.6363636363636362, "grad_norm": 10.293892860412598, "learning_rate": 4.5833333333333334e-05, "loss": 1.9222, "step": 720 }, { "epoch": 1.6386363636363637, "grad_norm": 12.959512710571289, "learning_rate": 4.575757575757576e-05, "loss": 1.5771, "step": 721 }, { "epoch": 1.6409090909090909, "grad_norm": 11.565927505493164, "learning_rate": 4.5681818181818186e-05, "loss": 1.5313, "step": 722 }, { "epoch": 1.643181818181818, "grad_norm": 9.419241905212402, "learning_rate": 4.5606060606060606e-05, "loss": 1.4229, "step": 723 }, { "epoch": 1.6454545454545455, "grad_norm": 15.411003112792969, "learning_rate": 4.553030303030303e-05, "loss": 1.8707, "step": 724 }, { "epoch": 1.6477272727272727, "grad_norm": 7.6546711921691895, "learning_rate": 4.545454545454546e-05, "loss": 0.742, "step": 725 }, { "epoch": 1.65, "grad_norm": 13.029730796813965, "learning_rate": 4.5378787878787885e-05, "loss": 1.5179, "step": 726 }, { "epoch": 1.6522727272727273, "grad_norm": 12.853962898254395, "learning_rate": 4.5303030303030304e-05, "loss": 1.8908, "step": 727 }, { "epoch": 1.6545454545454545, "grad_norm": 12.864992141723633, "learning_rate": 4.522727272727273e-05, "loss": 1.7175, "step": 728 }, { "epoch": 1.6568181818181817, "grad_norm": 13.25144100189209, "learning_rate": 4.515151515151516e-05, "loss": 1.7681, "step": 729 }, { "epoch": 1.6590909090909092, "grad_norm": 9.894201278686523, "learning_rate": 4.5075757575757577e-05, "loss": 1.5505, "step": 730 }, { "epoch": 1.6613636363636364, "grad_norm": 16.501630783081055, "learning_rate": 4.5e-05, "loss": 1.4968, "step": 731 }, { "epoch": 
1.6636363636363636, "grad_norm": 10.3342924118042, "learning_rate": 4.492424242424242e-05, "loss": 1.4734, "step": 732 }, { "epoch": 1.665909090909091, "grad_norm": 11.081184387207031, "learning_rate": 4.484848484848485e-05, "loss": 2.6513, "step": 733 }, { "epoch": 1.6681818181818182, "grad_norm": 17.005704879760742, "learning_rate": 4.4772727272727275e-05, "loss": 2.4109, "step": 734 }, { "epoch": 1.6704545454545454, "grad_norm": 11.718207359313965, "learning_rate": 4.46969696969697e-05, "loss": 1.6445, "step": 735 }, { "epoch": 1.6727272727272728, "grad_norm": 12.14245319366455, "learning_rate": 4.462121212121213e-05, "loss": 2.335, "step": 736 }, { "epoch": 1.675, "grad_norm": 10.971789360046387, "learning_rate": 4.454545454545455e-05, "loss": 1.6266, "step": 737 }, { "epoch": 1.6772727272727272, "grad_norm": 17.435321807861328, "learning_rate": 4.4469696969696973e-05, "loss": 2.1164, "step": 738 }, { "epoch": 1.6795454545454547, "grad_norm": 10.45814323425293, "learning_rate": 4.43939393939394e-05, "loss": 1.3992, "step": 739 }, { "epoch": 1.6818181818181817, "grad_norm": 12.788302421569824, "learning_rate": 4.431818181818182e-05, "loss": 2.4001, "step": 740 }, { "epoch": 1.684090909090909, "grad_norm": 14.425982475280762, "learning_rate": 4.4242424242424246e-05, "loss": 2.163, "step": 741 }, { "epoch": 1.6863636363636365, "grad_norm": 9.09310531616211, "learning_rate": 4.4166666666666665e-05, "loss": 1.4595, "step": 742 }, { "epoch": 1.6886363636363635, "grad_norm": 11.336987495422363, "learning_rate": 4.409090909090909e-05, "loss": 2.6262, "step": 743 }, { "epoch": 1.690909090909091, "grad_norm": 11.697134017944336, "learning_rate": 4.401515151515152e-05, "loss": 1.3628, "step": 744 }, { "epoch": 1.6931818181818183, "grad_norm": 8.620695114135742, "learning_rate": 4.3939393939393944e-05, "loss": 1.2893, "step": 745 }, { "epoch": 1.6954545454545453, "grad_norm": 9.322046279907227, "learning_rate": 4.386363636363637e-05, "loss": 1.9579, "step": 746 }, { 
"epoch": 1.6977272727272728, "grad_norm": 11.273119926452637, "learning_rate": 4.378787878787879e-05, "loss": 2.2207, "step": 747 }, { "epoch": 1.7, "grad_norm": 11.111379623413086, "learning_rate": 4.3712121212121216e-05, "loss": 1.4021, "step": 748 }, { "epoch": 1.7022727272727272, "grad_norm": 11.808859825134277, "learning_rate": 4.3636363636363636e-05, "loss": 1.4873, "step": 749 }, { "epoch": 1.7045454545454546, "grad_norm": 14.41899585723877, "learning_rate": 4.356060606060606e-05, "loss": 1.9247, "step": 750 }, { "epoch": 1.7068181818181818, "grad_norm": 9.383740425109863, "learning_rate": 4.348484848484849e-05, "loss": 1.6231, "step": 751 }, { "epoch": 1.709090909090909, "grad_norm": 9.926271438598633, "learning_rate": 4.340909090909091e-05, "loss": 2.2661, "step": 752 }, { "epoch": 1.7113636363636364, "grad_norm": 12.015188217163086, "learning_rate": 4.3333333333333334e-05, "loss": 1.4877, "step": 753 }, { "epoch": 1.7136363636363636, "grad_norm": 12.057700157165527, "learning_rate": 4.325757575757576e-05, "loss": 1.6091, "step": 754 }, { "epoch": 1.7159090909090908, "grad_norm": 8.392674446105957, "learning_rate": 4.318181818181819e-05, "loss": 1.4652, "step": 755 }, { "epoch": 1.7181818181818183, "grad_norm": 7.7269287109375, "learning_rate": 4.3106060606060606e-05, "loss": 1.1991, "step": 756 }, { "epoch": 1.7204545454545455, "grad_norm": 13.280454635620117, "learning_rate": 4.303030303030303e-05, "loss": 1.9597, "step": 757 }, { "epoch": 1.7227272727272727, "grad_norm": 11.144329071044922, "learning_rate": 4.295454545454546e-05, "loss": 1.6052, "step": 758 }, { "epoch": 1.725, "grad_norm": 12.23388385772705, "learning_rate": 4.287878787878788e-05, "loss": 1.5491, "step": 759 }, { "epoch": 1.7272727272727273, "grad_norm": 11.918728828430176, "learning_rate": 4.2803030303030305e-05, "loss": 2.0586, "step": 760 }, { "epoch": 1.7295454545454545, "grad_norm": 7.68416166305542, "learning_rate": 4.2727272727272724e-05, "loss": 1.0501, "step": 761 }, { 
"epoch": 1.731818181818182, "grad_norm": 16.64651870727539, "learning_rate": 4.265151515151515e-05, "loss": 1.9819, "step": 762 }, { "epoch": 1.7340909090909091, "grad_norm": 14.889754295349121, "learning_rate": 4.257575757575758e-05, "loss": 2.5418, "step": 763 }, { "epoch": 1.7363636363636363, "grad_norm": 13.508451461791992, "learning_rate": 4.25e-05, "loss": 1.5028, "step": 764 }, { "epoch": 1.7386363636363638, "grad_norm": 9.541330337524414, "learning_rate": 4.242424242424243e-05, "loss": 1.0183, "step": 765 }, { "epoch": 1.740909090909091, "grad_norm": 13.14413833618164, "learning_rate": 4.234848484848485e-05, "loss": 2.0542, "step": 766 }, { "epoch": 1.7431818181818182, "grad_norm": 12.490581512451172, "learning_rate": 4.2272727272727275e-05, "loss": 1.5971, "step": 767 }, { "epoch": 1.7454545454545456, "grad_norm": 14.117782592773438, "learning_rate": 4.21969696969697e-05, "loss": 3.0207, "step": 768 }, { "epoch": 1.7477272727272726, "grad_norm": 12.968109130859375, "learning_rate": 4.212121212121212e-05, "loss": 1.9058, "step": 769 }, { "epoch": 1.75, "grad_norm": 10.889745712280273, "learning_rate": 4.204545454545455e-05, "loss": 1.535, "step": 770 }, { "epoch": 1.7522727272727274, "grad_norm": 11.901477813720703, "learning_rate": 4.196969696969697e-05, "loss": 1.3743, "step": 771 }, { "epoch": 1.7545454545454544, "grad_norm": 11.466394424438477, "learning_rate": 4.189393939393939e-05, "loss": 2.1364, "step": 772 }, { "epoch": 1.7568181818181818, "grad_norm": 9.973612785339355, "learning_rate": 4.181818181818182e-05, "loss": 1.7472, "step": 773 }, { "epoch": 1.759090909090909, "grad_norm": 11.81697940826416, "learning_rate": 4.1742424242424246e-05, "loss": 1.6475, "step": 774 }, { "epoch": 1.7613636363636362, "grad_norm": 10.81869125366211, "learning_rate": 4.166666666666667e-05, "loss": 2.433, "step": 775 }, { "epoch": 1.7636363636363637, "grad_norm": 15.867783546447754, "learning_rate": 4.159090909090909e-05, "loss": 3.0407, "step": 776 }, { "epoch": 
1.7659090909090909, "grad_norm": 12.047411918640137, "learning_rate": 4.151515151515152e-05, "loss": 1.7651, "step": 777 }, { "epoch": 1.768181818181818, "grad_norm": 11.829177856445312, "learning_rate": 4.143939393939394e-05, "loss": 1.5285, "step": 778 }, { "epoch": 1.7704545454545455, "grad_norm": 13.831562995910645, "learning_rate": 4.1363636363636364e-05, "loss": 2.6372, "step": 779 }, { "epoch": 1.7727272727272727, "grad_norm": 10.6288480758667, "learning_rate": 4.128787878787879e-05, "loss": 1.8006, "step": 780 }, { "epoch": 1.775, "grad_norm": 12.919150352478027, "learning_rate": 4.1212121212121216e-05, "loss": 1.8753, "step": 781 }, { "epoch": 1.7772727272727273, "grad_norm": 14.138745307922363, "learning_rate": 4.113636363636364e-05, "loss": 2.1089, "step": 782 }, { "epoch": 1.7795454545454545, "grad_norm": 8.130454063415527, "learning_rate": 4.106060606060606e-05, "loss": 0.9243, "step": 783 }, { "epoch": 1.7818181818181817, "grad_norm": 13.32907485961914, "learning_rate": 4.098484848484849e-05, "loss": 2.599, "step": 784 }, { "epoch": 1.7840909090909092, "grad_norm": 9.957046508789062, "learning_rate": 4.0909090909090915e-05, "loss": 1.1874, "step": 785 }, { "epoch": 1.7863636363636364, "grad_norm": 10.413941383361816, "learning_rate": 4.0833333333333334e-05, "loss": 1.2206, "step": 786 }, { "epoch": 1.7886363636363636, "grad_norm": 12.38062858581543, "learning_rate": 4.075757575757576e-05, "loss": 1.5484, "step": 787 }, { "epoch": 1.790909090909091, "grad_norm": 10.63827896118164, "learning_rate": 4.068181818181818e-05, "loss": 1.4851, "step": 788 }, { "epoch": 1.7931818181818182, "grad_norm": 10.755563735961914, "learning_rate": 4.0606060606060606e-05, "loss": 2.0725, "step": 789 }, { "epoch": 1.7954545454545454, "grad_norm": 10.352532386779785, "learning_rate": 4.053030303030303e-05, "loss": 1.6825, "step": 790 }, { "epoch": 1.7977272727272728, "grad_norm": 10.303858757019043, "learning_rate": 4.045454545454546e-05, "loss": 1.6771, "step": 791 }, { 
"epoch": 1.8, "grad_norm": 12.914578437805176, "learning_rate": 4.0378787878787885e-05, "loss": 2.0149, "step": 792 }, { "epoch": 1.8022727272727272, "grad_norm": 9.389689445495605, "learning_rate": 4.0303030303030305e-05, "loss": 1.9987, "step": 793 }, { "epoch": 1.8045454545454547, "grad_norm": 13.615360260009766, "learning_rate": 4.022727272727273e-05, "loss": 1.7871, "step": 794 }, { "epoch": 1.8068181818181817, "grad_norm": 12.188302040100098, "learning_rate": 4.015151515151515e-05, "loss": 2.1458, "step": 795 }, { "epoch": 1.809090909090909, "grad_norm": 23.321977615356445, "learning_rate": 4.007575757575758e-05, "loss": 1.5815, "step": 796 }, { "epoch": 1.8113636363636365, "grad_norm": 13.12856674194336, "learning_rate": 4e-05, "loss": 1.9065, "step": 797 }, { "epoch": 1.8136363636363635, "grad_norm": 8.955425262451172, "learning_rate": 3.992424242424242e-05, "loss": 1.4415, "step": 798 }, { "epoch": 1.815909090909091, "grad_norm": 14.052294731140137, "learning_rate": 3.984848484848485e-05, "loss": 2.6913, "step": 799 }, { "epoch": 1.8181818181818183, "grad_norm": 8.688261032104492, "learning_rate": 3.9772727272727275e-05, "loss": 1.6981, "step": 800 }, { "epoch": 1.8204545454545453, "grad_norm": 13.951496124267578, "learning_rate": 3.96969696969697e-05, "loss": 1.5787, "step": 801 }, { "epoch": 1.8227272727272728, "grad_norm": 10.023541450500488, "learning_rate": 3.962121212121213e-05, "loss": 1.9886, "step": 802 }, { "epoch": 1.825, "grad_norm": 8.397741317749023, "learning_rate": 3.954545454545455e-05, "loss": 1.7193, "step": 803 }, { "epoch": 1.8272727272727272, "grad_norm": 10.017319679260254, "learning_rate": 3.9469696969696974e-05, "loss": 1.7097, "step": 804 }, { "epoch": 1.8295454545454546, "grad_norm": 13.632206916809082, "learning_rate": 3.939393939393939e-05, "loss": 2.1469, "step": 805 }, { "epoch": 1.8318181818181818, "grad_norm": 19.315832138061523, "learning_rate": 3.931818181818182e-05, "loss": 2.2873, "step": 806 }, { "epoch": 
1.834090909090909, "grad_norm": 11.273087501525879, "learning_rate": 3.924242424242424e-05, "loss": 1.352, "step": 807 }, { "epoch": 1.8363636363636364, "grad_norm": 12.127049446105957, "learning_rate": 3.9166666666666665e-05, "loss": 1.8422, "step": 808 }, { "epoch": 1.8386363636363636, "grad_norm": 9.968843460083008, "learning_rate": 3.909090909090909e-05, "loss": 1.2724, "step": 809 }, { "epoch": 1.8409090909090908, "grad_norm": 13.883306503295898, "learning_rate": 3.901515151515152e-05, "loss": 2.6822, "step": 810 }, { "epoch": 1.8431818181818183, "grad_norm": 10.443497657775879, "learning_rate": 3.8939393939393944e-05, "loss": 1.2037, "step": 811 }, { "epoch": 1.8454545454545455, "grad_norm": 10.290310859680176, "learning_rate": 3.8863636363636364e-05, "loss": 1.5355, "step": 812 }, { "epoch": 1.8477272727272727, "grad_norm": 9.970185279846191, "learning_rate": 3.878787878787879e-05, "loss": 1.957, "step": 813 }, { "epoch": 1.85, "grad_norm": 10.905329704284668, "learning_rate": 3.8712121212121217e-05, "loss": 1.8562, "step": 814 }, { "epoch": 1.8522727272727273, "grad_norm": 9.466534614562988, "learning_rate": 3.8636363636363636e-05, "loss": 1.4522, "step": 815 }, { "epoch": 1.8545454545454545, "grad_norm": 13.48620891571045, "learning_rate": 3.856060606060606e-05, "loss": 2.1203, "step": 816 }, { "epoch": 1.856818181818182, "grad_norm": 12.107563018798828, "learning_rate": 3.848484848484848e-05, "loss": 1.7011, "step": 817 }, { "epoch": 1.8590909090909091, "grad_norm": 10.786709785461426, "learning_rate": 3.840909090909091e-05, "loss": 1.7418, "step": 818 }, { "epoch": 1.8613636363636363, "grad_norm": 10.853336334228516, "learning_rate": 3.8333333333333334e-05, "loss": 1.4229, "step": 819 }, { "epoch": 1.8636363636363638, "grad_norm": 11.42320442199707, "learning_rate": 3.825757575757576e-05, "loss": 1.6411, "step": 820 }, { "epoch": 1.865909090909091, "grad_norm": 9.623292922973633, "learning_rate": 3.818181818181819e-05, "loss": 2.2372, "step": 821 }, { 
"epoch": 1.8681818181818182, "grad_norm": 19.681766510009766, "learning_rate": 3.810606060606061e-05, "loss": 1.7814, "step": 822 }, { "epoch": 1.8704545454545456, "grad_norm": 11.759204864501953, "learning_rate": 3.803030303030303e-05, "loss": 1.4783, "step": 823 }, { "epoch": 1.8727272727272726, "grad_norm": 11.130982398986816, "learning_rate": 3.795454545454545e-05, "loss": 1.3937, "step": 824 }, { "epoch": 1.875, "grad_norm": 10.193344116210938, "learning_rate": 3.787878787878788e-05, "loss": 1.3912, "step": 825 }, { "epoch": 1.8772727272727274, "grad_norm": 8.412622451782227, "learning_rate": 3.7803030303030305e-05, "loss": 1.3978, "step": 826 }, { "epoch": 1.8795454545454544, "grad_norm": 12.766166687011719, "learning_rate": 3.7727272727272725e-05, "loss": 1.9356, "step": 827 }, { "epoch": 1.8818181818181818, "grad_norm": 11.161136627197266, "learning_rate": 3.765151515151516e-05, "loss": 1.8318, "step": 828 }, { "epoch": 1.884090909090909, "grad_norm": 11.214709281921387, "learning_rate": 3.757575757575758e-05, "loss": 1.4253, "step": 829 }, { "epoch": 1.8863636363636362, "grad_norm": 12.173728942871094, "learning_rate": 3.7500000000000003e-05, "loss": 1.3093, "step": 830 }, { "epoch": 1.8886363636363637, "grad_norm": 12.564881324768066, "learning_rate": 3.742424242424243e-05, "loss": 2.0086, "step": 831 }, { "epoch": 1.8909090909090909, "grad_norm": 10.378774642944336, "learning_rate": 3.734848484848485e-05, "loss": 2.2117, "step": 832 }, { "epoch": 1.893181818181818, "grad_norm": 13.659943580627441, "learning_rate": 3.7272727272727276e-05, "loss": 1.8717, "step": 833 }, { "epoch": 1.8954545454545455, "grad_norm": 10.889350891113281, "learning_rate": 3.7196969696969695e-05, "loss": 2.524, "step": 834 }, { "epoch": 1.8977272727272727, "grad_norm": 20.47830581665039, "learning_rate": 3.712121212121212e-05, "loss": 1.5575, "step": 835 }, { "epoch": 1.9, "grad_norm": 8.377565383911133, "learning_rate": 3.704545454545455e-05, "loss": 1.4985, "step": 836 }, { 
"epoch": 1.9022727272727273, "grad_norm": 14.420267105102539, "learning_rate": 3.6969696969696974e-05, "loss": 2.0562, "step": 837 }, { "epoch": 1.9045454545454545, "grad_norm": 11.469067573547363, "learning_rate": 3.68939393939394e-05, "loss": 1.9261, "step": 838 }, { "epoch": 1.9068181818181817, "grad_norm": 14.95913314819336, "learning_rate": 3.681818181818182e-05, "loss": 1.4905, "step": 839 }, { "epoch": 1.9090909090909092, "grad_norm": 12.481145858764648, "learning_rate": 3.6742424242424246e-05, "loss": 1.3664, "step": 840 }, { "epoch": 1.9113636363636364, "grad_norm": 11.715337753295898, "learning_rate": 3.6666666666666666e-05, "loss": 2.0561, "step": 841 }, { "epoch": 1.9136363636363636, "grad_norm": 12.499181747436523, "learning_rate": 3.659090909090909e-05, "loss": 1.62, "step": 842 }, { "epoch": 1.915909090909091, "grad_norm": 7.448797225952148, "learning_rate": 3.651515151515152e-05, "loss": 0.979, "step": 843 }, { "epoch": 1.9181818181818182, "grad_norm": 11.219677925109863, "learning_rate": 3.643939393939394e-05, "loss": 1.8378, "step": 844 }, { "epoch": 1.9204545454545454, "grad_norm": 11.738428115844727, "learning_rate": 3.6363636363636364e-05, "loss": 2.1477, "step": 845 }, { "epoch": 1.9227272727272728, "grad_norm": 13.800374031066895, "learning_rate": 3.628787878787879e-05, "loss": 2.3644, "step": 846 }, { "epoch": 1.925, "grad_norm": 11.240313529968262, "learning_rate": 3.621212121212122e-05, "loss": 1.6775, "step": 847 }, { "epoch": 1.9272727272727272, "grad_norm": 13.477606773376465, "learning_rate": 3.613636363636364e-05, "loss": 1.3438, "step": 848 }, { "epoch": 1.9295454545454547, "grad_norm": 12.788423538208008, "learning_rate": 3.606060606060606e-05, "loss": 1.7158, "step": 849 }, { "epoch": 1.9318181818181817, "grad_norm": 8.893767356872559, "learning_rate": 3.598484848484849e-05, "loss": 1.4747, "step": 850 }, { "epoch": 1.934090909090909, "grad_norm": 12.053075790405273, "learning_rate": 3.590909090909091e-05, "loss": 1.0121, "step": 
851 }, { "epoch": 1.9363636363636365, "grad_norm": 12.093589782714844, "learning_rate": 3.5833333333333335e-05, "loss": 2.1991, "step": 852 }, { "epoch": 1.9386363636363635, "grad_norm": 9.356278419494629, "learning_rate": 3.575757575757576e-05, "loss": 1.4497, "step": 853 }, { "epoch": 1.940909090909091, "grad_norm": 12.686812400817871, "learning_rate": 3.568181818181818e-05, "loss": 1.5038, "step": 854 }, { "epoch": 1.9431818181818183, "grad_norm": 13.139368057250977, "learning_rate": 3.560606060606061e-05, "loss": 2.9399, "step": 855 }, { "epoch": 1.9454545454545453, "grad_norm": 11.385064125061035, "learning_rate": 3.553030303030303e-05, "loss": 1.4202, "step": 856 }, { "epoch": 1.9477272727272728, "grad_norm": 9.905313491821289, "learning_rate": 3.545454545454546e-05, "loss": 2.5033, "step": 857 }, { "epoch": 1.95, "grad_norm": 9.99422836303711, "learning_rate": 3.537878787878788e-05, "loss": 1.631, "step": 858 }, { "epoch": 1.9522727272727272, "grad_norm": 12.235610961914062, "learning_rate": 3.5303030303030305e-05, "loss": 1.7517, "step": 859 }, { "epoch": 1.9545454545454546, "grad_norm": 13.225701332092285, "learning_rate": 3.522727272727273e-05, "loss": 1.545, "step": 860 }, { "epoch": 1.9568181818181818, "grad_norm": 13.755146980285645, "learning_rate": 3.515151515151515e-05, "loss": 1.6548, "step": 861 }, { "epoch": 1.959090909090909, "grad_norm": 14.235300064086914, "learning_rate": 3.507575757575758e-05, "loss": 2.2791, "step": 862 }, { "epoch": 1.9613636363636364, "grad_norm": 12.734109878540039, "learning_rate": 3.5e-05, "loss": 1.4257, "step": 863 }, { "epoch": 1.9636363636363636, "grad_norm": 12.51075267791748, "learning_rate": 3.492424242424242e-05, "loss": 2.1328, "step": 864 }, { "epoch": 1.9659090909090908, "grad_norm": 12.090396881103516, "learning_rate": 3.484848484848485e-05, "loss": 2.4949, "step": 865 }, { "epoch": 1.9681818181818183, "grad_norm": 9.898470878601074, "learning_rate": 3.4772727272727276e-05, "loss": 1.0122, "step": 866 }, { 
"epoch": 1.9704545454545455, "grad_norm": 12.299036979675293, "learning_rate": 3.46969696969697e-05, "loss": 1.1734, "step": 867 }, { "epoch": 1.9727272727272727, "grad_norm": 10.930243492126465, "learning_rate": 3.462121212121212e-05, "loss": 1.8219, "step": 868 }, { "epoch": 1.975, "grad_norm": 11.0517578125, "learning_rate": 3.454545454545455e-05, "loss": 1.5023, "step": 869 }, { "epoch": 1.9772727272727273, "grad_norm": 11.98909854888916, "learning_rate": 3.4469696969696974e-05, "loss": 1.298, "step": 870 }, { "epoch": 1.9795454545454545, "grad_norm": 12.753129959106445, "learning_rate": 3.4393939393939394e-05, "loss": 1.7147, "step": 871 }, { "epoch": 1.981818181818182, "grad_norm": 71.2451171875, "learning_rate": 3.431818181818182e-05, "loss": 1.3867, "step": 872 }, { "epoch": 1.9840909090909091, "grad_norm": 9.198206901550293, "learning_rate": 3.424242424242424e-05, "loss": 1.2175, "step": 873 }, { "epoch": 1.9863636363636363, "grad_norm": 10.864444732666016, "learning_rate": 3.4166666666666666e-05, "loss": 2.4479, "step": 874 }, { "epoch": 1.9886363636363638, "grad_norm": 12.929604530334473, "learning_rate": 3.409090909090909e-05, "loss": 2.3538, "step": 875 }, { "epoch": 1.990909090909091, "grad_norm": 15.190954208374023, "learning_rate": 3.401515151515152e-05, "loss": 2.7314, "step": 876 }, { "epoch": 1.9931818181818182, "grad_norm": 12.220293045043945, "learning_rate": 3.3939393939393945e-05, "loss": 1.8087, "step": 877 }, { "epoch": 1.9954545454545456, "grad_norm": 13.717775344848633, "learning_rate": 3.3863636363636364e-05, "loss": 2.2791, "step": 878 }, { "epoch": 1.9977272727272726, "grad_norm": 13.53941822052002, "learning_rate": 3.378787878787879e-05, "loss": 1.9205, "step": 879 }, { "epoch": 2.0, "grad_norm": 10.206825256347656, "learning_rate": 3.371212121212121e-05, "loss": 1.2968, "step": 880 }, { "epoch": 2.0, "eval_f1": 0.8929, "eval_gen_len": 41.9091, "eval_loss": 1.7823115587234497, "eval_precision": 0.8925, "eval_recall": 0.8935, 
"eval_rouge1": 0.447, "eval_rouge2": 0.2102, "eval_rougeL": 0.3795, "eval_rougeLsum": 0.4136, "eval_runtime": 29.0339, "eval_samples_per_second": 3.789, "eval_steps_per_second": 0.482, "step": 880 }, { "epoch": 2.0022727272727274, "grad_norm": 9.781706809997559, "learning_rate": 3.3636363636363636e-05, "loss": 1.0468, "step": 881 }, { "epoch": 2.0045454545454544, "grad_norm": 8.61344051361084, "learning_rate": 3.356060606060606e-05, "loss": 1.7286, "step": 882 }, { "epoch": 2.006818181818182, "grad_norm": 11.291481971740723, "learning_rate": 3.348484848484848e-05, "loss": 1.1274, "step": 883 }, { "epoch": 2.0090909090909093, "grad_norm": 11.33132553100586, "learning_rate": 3.3409090909090915e-05, "loss": 1.4992, "step": 884 }, { "epoch": 2.0113636363636362, "grad_norm": 10.342754364013672, "learning_rate": 3.3333333333333335e-05, "loss": 1.7733, "step": 885 }, { "epoch": 2.0136363636363637, "grad_norm": 9.18486499786377, "learning_rate": 3.325757575757576e-05, "loss": 1.7391, "step": 886 }, { "epoch": 2.015909090909091, "grad_norm": 35.923648834228516, "learning_rate": 3.318181818181819e-05, "loss": 1.8191, "step": 887 }, { "epoch": 2.018181818181818, "grad_norm": 10.737150192260742, "learning_rate": 3.310606060606061e-05, "loss": 1.1656, "step": 888 }, { "epoch": 2.0204545454545455, "grad_norm": 7.691224098205566, "learning_rate": 3.303030303030303e-05, "loss": 1.1787, "step": 889 }, { "epoch": 2.022727272727273, "grad_norm": 14.402198791503906, "learning_rate": 3.295454545454545e-05, "loss": 2.1618, "step": 890 }, { "epoch": 2.025, "grad_norm": 9.567869186401367, "learning_rate": 3.287878787878788e-05, "loss": 1.4921, "step": 891 }, { "epoch": 2.0272727272727273, "grad_norm": 12.46391487121582, "learning_rate": 3.2803030303030305e-05, "loss": 2.0986, "step": 892 }, { "epoch": 2.0295454545454548, "grad_norm": 12.333531379699707, "learning_rate": 3.272727272727273e-05, "loss": 1.5944, "step": 893 }, { "epoch": 2.0318181818181817, "grad_norm": 12.140853881835938, 
"learning_rate": 3.265151515151516e-05, "loss": 1.7773, "step": 894 }, { "epoch": 2.034090909090909, "grad_norm": 9.412683486938477, "learning_rate": 3.257575757575758e-05, "loss": 1.2663, "step": 895 }, { "epoch": 2.036363636363636, "grad_norm": 10.711098670959473, "learning_rate": 3.2500000000000004e-05, "loss": 1.6462, "step": 896 }, { "epoch": 2.0386363636363636, "grad_norm": 11.64570426940918, "learning_rate": 3.2424242424242423e-05, "loss": 1.8232, "step": 897 }, { "epoch": 2.040909090909091, "grad_norm": 12.753011703491211, "learning_rate": 3.234848484848485e-05, "loss": 1.9761, "step": 898 }, { "epoch": 2.043181818181818, "grad_norm": 15.42159366607666, "learning_rate": 3.2272727272727276e-05, "loss": 1.5225, "step": 899 }, { "epoch": 2.0454545454545454, "grad_norm": 13.561200141906738, "learning_rate": 3.2196969696969696e-05, "loss": 2.2342, "step": 900 }, { "epoch": 2.047727272727273, "grad_norm": 11.59468936920166, "learning_rate": 3.212121212121212e-05, "loss": 1.3996, "step": 901 }, { "epoch": 2.05, "grad_norm": 12.330318450927734, "learning_rate": 3.204545454545455e-05, "loss": 2.3926, "step": 902 }, { "epoch": 2.0522727272727272, "grad_norm": 15.305580139160156, "learning_rate": 3.1969696969696974e-05, "loss": 2.5056, "step": 903 }, { "epoch": 2.0545454545454547, "grad_norm": 12.250936508178711, "learning_rate": 3.18939393939394e-05, "loss": 2.2595, "step": 904 }, { "epoch": 2.0568181818181817, "grad_norm": 9.258564949035645, "learning_rate": 3.181818181818182e-05, "loss": 1.0952, "step": 905 }, { "epoch": 2.059090909090909, "grad_norm": 10.1191987991333, "learning_rate": 3.174242424242425e-05, "loss": 2.2179, "step": 906 }, { "epoch": 2.0613636363636365, "grad_norm": 12.793285369873047, "learning_rate": 3.1666666666666666e-05, "loss": 1.7858, "step": 907 }, { "epoch": 2.0636363636363635, "grad_norm": 10.188157081604004, "learning_rate": 3.159090909090909e-05, "loss": 1.3631, "step": 908 }, { "epoch": 2.065909090909091, "grad_norm": 
13.256832122802734, "learning_rate": 3.151515151515151e-05, "loss": 2.2464, "step": 909 }, { "epoch": 2.0681818181818183, "grad_norm": 10.160938262939453, "learning_rate": 3.143939393939394e-05, "loss": 1.5204, "step": 910 }, { "epoch": 2.0704545454545453, "grad_norm": 10.945446014404297, "learning_rate": 3.1363636363636365e-05, "loss": 1.6125, "step": 911 }, { "epoch": 2.0727272727272728, "grad_norm": 10.19439697265625, "learning_rate": 3.128787878787879e-05, "loss": 1.5317, "step": 912 }, { "epoch": 2.075, "grad_norm": 9.242986679077148, "learning_rate": 3.121212121212122e-05, "loss": 1.7993, "step": 913 }, { "epoch": 2.077272727272727, "grad_norm": 9.43307113647461, "learning_rate": 3.113636363636364e-05, "loss": 1.4297, "step": 914 }, { "epoch": 2.0795454545454546, "grad_norm": 9.292837142944336, "learning_rate": 3.106060606060606e-05, "loss": 1.1428, "step": 915 }, { "epoch": 2.081818181818182, "grad_norm": 10.290895462036133, "learning_rate": 3.098484848484849e-05, "loss": 1.3587, "step": 916 }, { "epoch": 2.084090909090909, "grad_norm": 12.890341758728027, "learning_rate": 3.090909090909091e-05, "loss": 1.5721, "step": 917 }, { "epoch": 2.0863636363636364, "grad_norm": 9.548102378845215, "learning_rate": 3.0833333333333335e-05, "loss": 1.5717, "step": 918 }, { "epoch": 2.088636363636364, "grad_norm": 11.2235689163208, "learning_rate": 3.0757575757575755e-05, "loss": 1.818, "step": 919 }, { "epoch": 2.090909090909091, "grad_norm": 14.528667449951172, "learning_rate": 3.068181818181818e-05, "loss": 1.6878, "step": 920 }, { "epoch": 2.0931818181818183, "grad_norm": 13.295345306396484, "learning_rate": 3.060606060606061e-05, "loss": 1.8521, "step": 921 }, { "epoch": 2.0954545454545457, "grad_norm": 13.902974128723145, "learning_rate": 3.0530303030303034e-05, "loss": 1.7186, "step": 922 }, { "epoch": 2.0977272727272727, "grad_norm": 8.313849449157715, "learning_rate": 3.0454545454545456e-05, "loss": 0.8988, "step": 923 }, { "epoch": 2.1, "grad_norm": 
11.491289138793945, "learning_rate": 3.037878787878788e-05, "loss": 1.1394, "step": 924 }, { "epoch": 2.102272727272727, "grad_norm": 13.124963760375977, "learning_rate": 3.0303030303030306e-05, "loss": 1.7424, "step": 925 }, { "epoch": 2.1045454545454545, "grad_norm": 8.5538911819458, "learning_rate": 3.0227272727272725e-05, "loss": 1.3577, "step": 926 }, { "epoch": 2.106818181818182, "grad_norm": 12.04502010345459, "learning_rate": 3.015151515151515e-05, "loss": 1.2389, "step": 927 }, { "epoch": 2.109090909090909, "grad_norm": 8.608831405639648, "learning_rate": 3.0075757575757578e-05, "loss": 1.1577, "step": 928 }, { "epoch": 2.1113636363636363, "grad_norm": 14.802834510803223, "learning_rate": 3e-05, "loss": 1.8636, "step": 929 }, { "epoch": 2.1136363636363638, "grad_norm": 9.014802932739258, "learning_rate": 2.9924242424242427e-05, "loss": 0.7823, "step": 930 }, { "epoch": 2.1159090909090907, "grad_norm": 10.007800102233887, "learning_rate": 2.9848484848484847e-05, "loss": 1.7205, "step": 931 }, { "epoch": 2.118181818181818, "grad_norm": 16.067474365234375, "learning_rate": 2.9772727272727273e-05, "loss": 2.443, "step": 932 }, { "epoch": 2.1204545454545456, "grad_norm": 12.624736785888672, "learning_rate": 2.96969696969697e-05, "loss": 1.5536, "step": 933 }, { "epoch": 2.1227272727272726, "grad_norm": 10.400491714477539, "learning_rate": 2.9621212121212122e-05, "loss": 1.2871, "step": 934 }, { "epoch": 2.125, "grad_norm": 11.056097984313965, "learning_rate": 2.954545454545455e-05, "loss": 1.4614, "step": 935 }, { "epoch": 2.1272727272727274, "grad_norm": 9.163816452026367, "learning_rate": 2.9469696969696968e-05, "loss": 1.2918, "step": 936 }, { "epoch": 2.1295454545454544, "grad_norm": 8.908564567565918, "learning_rate": 2.9393939393939394e-05, "loss": 1.2489, "step": 937 }, { "epoch": 2.131818181818182, "grad_norm": 8.402863502502441, "learning_rate": 2.9318181818181817e-05, "loss": 1.4269, "step": 938 }, { "epoch": 2.1340909090909093, "grad_norm": 
10.939780235290527, "learning_rate": 2.9242424242424243e-05, "loss": 1.4199, "step": 939 }, { "epoch": 2.1363636363636362, "grad_norm": 11.758381843566895, "learning_rate": 2.916666666666667e-05, "loss": 1.4597, "step": 940 }, { "epoch": 2.1386363636363637, "grad_norm": 11.411653518676758, "learning_rate": 2.909090909090909e-05, "loss": 2.1611, "step": 941 }, { "epoch": 2.140909090909091, "grad_norm": 11.838427543640137, "learning_rate": 2.901515151515152e-05, "loss": 1.2373, "step": 942 }, { "epoch": 2.143181818181818, "grad_norm": 14.833626747131348, "learning_rate": 2.893939393939394e-05, "loss": 1.9202, "step": 943 }, { "epoch": 2.1454545454545455, "grad_norm": 10.815326690673828, "learning_rate": 2.8863636363636365e-05, "loss": 1.5089, "step": 944 }, { "epoch": 2.147727272727273, "grad_norm": 12.253664016723633, "learning_rate": 2.878787878787879e-05, "loss": 1.3787, "step": 945 }, { "epoch": 2.15, "grad_norm": 13.154531478881836, "learning_rate": 2.8712121212121214e-05, "loss": 1.8925, "step": 946 }, { "epoch": 2.1522727272727273, "grad_norm": 12.020703315734863, "learning_rate": 2.863636363636364e-05, "loss": 1.379, "step": 947 }, { "epoch": 2.1545454545454543, "grad_norm": 10.430608749389648, "learning_rate": 2.856060606060606e-05, "loss": 1.4203, "step": 948 }, { "epoch": 2.1568181818181817, "grad_norm": 8.769074440002441, "learning_rate": 2.8484848484848486e-05, "loss": 1.227, "step": 949 }, { "epoch": 2.159090909090909, "grad_norm": 11.399450302124023, "learning_rate": 2.8409090909090912e-05, "loss": 1.3783, "step": 950 }, { "epoch": 2.161363636363636, "grad_norm": 9.87228012084961, "learning_rate": 2.8333333333333335e-05, "loss": 1.6523, "step": 951 }, { "epoch": 2.1636363636363636, "grad_norm": 15.94421100616455, "learning_rate": 2.825757575757576e-05, "loss": 2.4161, "step": 952 }, { "epoch": 2.165909090909091, "grad_norm": 9.126893043518066, "learning_rate": 2.818181818181818e-05, "loss": 1.2675, "step": 953 }, { "epoch": 2.168181818181818, 
"grad_norm": 15.760127067565918, "learning_rate": 2.8106060606060607e-05, "loss": 2.9231, "step": 954 }, { "epoch": 2.1704545454545454, "grad_norm": 8.999767303466797, "learning_rate": 2.803030303030303e-05, "loss": 1.5147, "step": 955 }, { "epoch": 2.172727272727273, "grad_norm": 12.179048538208008, "learning_rate": 2.7954545454545457e-05, "loss": 1.4017, "step": 956 }, { "epoch": 2.175, "grad_norm": 11.52514934539795, "learning_rate": 2.7878787878787883e-05, "loss": 2.2158, "step": 957 }, { "epoch": 2.1772727272727272, "grad_norm": 14.60074520111084, "learning_rate": 2.7803030303030303e-05, "loss": 1.6378, "step": 958 }, { "epoch": 2.1795454545454547, "grad_norm": 11.505465507507324, "learning_rate": 2.772727272727273e-05, "loss": 1.6039, "step": 959 }, { "epoch": 2.1818181818181817, "grad_norm": 12.141363143920898, "learning_rate": 2.7651515151515152e-05, "loss": 2.6782, "step": 960 }, { "epoch": 2.184090909090909, "grad_norm": 10.89749813079834, "learning_rate": 2.7575757575757578e-05, "loss": 1.4787, "step": 961 }, { "epoch": 2.1863636363636365, "grad_norm": 11.249963760375977, "learning_rate": 2.7500000000000004e-05, "loss": 1.9647, "step": 962 }, { "epoch": 2.1886363636363635, "grad_norm": 9.608443260192871, "learning_rate": 2.7424242424242424e-05, "loss": 0.8747, "step": 963 }, { "epoch": 2.190909090909091, "grad_norm": 9.517485618591309, "learning_rate": 2.734848484848485e-05, "loss": 1.2376, "step": 964 }, { "epoch": 2.1931818181818183, "grad_norm": 9.044648170471191, "learning_rate": 2.7272727272727273e-05, "loss": 0.8014, "step": 965 }, { "epoch": 2.1954545454545453, "grad_norm": 9.988462448120117, "learning_rate": 2.71969696969697e-05, "loss": 1.652, "step": 966 }, { "epoch": 2.1977272727272728, "grad_norm": 8.96922492980957, "learning_rate": 2.7121212121212126e-05, "loss": 0.9484, "step": 967 }, { "epoch": 2.2, "grad_norm": 10.36929702758789, "learning_rate": 2.7045454545454545e-05, "loss": 1.2604, "step": 968 }, { "epoch": 2.202272727272727, 
"grad_norm": 14.008241653442383, "learning_rate": 2.696969696969697e-05, "loss": 2.4898, "step": 969 }, { "epoch": 2.2045454545454546, "grad_norm": 14.017687797546387, "learning_rate": 2.6893939393939394e-05, "loss": 1.8664, "step": 970 }, { "epoch": 2.206818181818182, "grad_norm": 11.672577857971191, "learning_rate": 2.681818181818182e-05, "loss": 1.8917, "step": 971 }, { "epoch": 2.209090909090909, "grad_norm": 11.760181427001953, "learning_rate": 2.674242424242424e-05, "loss": 2.0559, "step": 972 }, { "epoch": 2.2113636363636364, "grad_norm": 13.333674430847168, "learning_rate": 2.6666666666666667e-05, "loss": 1.8072, "step": 973 }, { "epoch": 2.213636363636364, "grad_norm": 9.448116302490234, "learning_rate": 2.6590909090909093e-05, "loss": 1.2764, "step": 974 }, { "epoch": 2.215909090909091, "grad_norm": 11.52153491973877, "learning_rate": 2.6515151515151516e-05, "loss": 1.7083, "step": 975 }, { "epoch": 2.2181818181818183, "grad_norm": 20.444080352783203, "learning_rate": 2.6439393939393942e-05, "loss": 2.2781, "step": 976 }, { "epoch": 2.2204545454545457, "grad_norm": 15.952470779418945, "learning_rate": 2.636363636363636e-05, "loss": 2.0901, "step": 977 }, { "epoch": 2.2227272727272727, "grad_norm": 10.751893997192383, "learning_rate": 2.6287878787878788e-05, "loss": 0.9779, "step": 978 }, { "epoch": 2.225, "grad_norm": 11.89562702178955, "learning_rate": 2.6212121212121214e-05, "loss": 1.7043, "step": 979 }, { "epoch": 2.227272727272727, "grad_norm": 12.013797760009766, "learning_rate": 2.6136363636363637e-05, "loss": 1.4427, "step": 980 }, { "epoch": 2.2295454545454545, "grad_norm": 13.685124397277832, "learning_rate": 2.6060606060606063e-05, "loss": 1.9327, "step": 981 }, { "epoch": 2.231818181818182, "grad_norm": 14.36984920501709, "learning_rate": 2.5984848484848483e-05, "loss": 2.4401, "step": 982 }, { "epoch": 2.234090909090909, "grad_norm": 11.657794952392578, "learning_rate": 2.590909090909091e-05, "loss": 1.5776, "step": 983 }, { "epoch": 
2.2363636363636363, "grad_norm": 9.138626098632812, "learning_rate": 2.5833333333333336e-05, "loss": 1.5954, "step": 984 }, { "epoch": 2.2386363636363638, "grad_norm": 11.275242805480957, "learning_rate": 2.575757575757576e-05, "loss": 1.5874, "step": 985 }, { "epoch": 2.2409090909090907, "grad_norm": 11.694557189941406, "learning_rate": 2.5681818181818185e-05, "loss": 1.2839, "step": 986 }, { "epoch": 2.243181818181818, "grad_norm": 14.328207015991211, "learning_rate": 2.5606060606060604e-05, "loss": 2.3689, "step": 987 }, { "epoch": 2.2454545454545456, "grad_norm": 14.487227439880371, "learning_rate": 2.553030303030303e-05, "loss": 1.5858, "step": 988 }, { "epoch": 2.2477272727272726, "grad_norm": 14.691239356994629, "learning_rate": 2.5454545454545454e-05, "loss": 1.8329, "step": 989 }, { "epoch": 2.25, "grad_norm": 10.622157096862793, "learning_rate": 2.537878787878788e-05, "loss": 1.8422, "step": 990 }, { "epoch": 2.2522727272727274, "grad_norm": 13.788392066955566, "learning_rate": 2.5303030303030306e-05, "loss": 2.0421, "step": 991 }, { "epoch": 2.2545454545454544, "grad_norm": 8.527210235595703, "learning_rate": 2.5227272727272726e-05, "loss": 1.4462, "step": 992 }, { "epoch": 2.256818181818182, "grad_norm": 11.221017837524414, "learning_rate": 2.5151515151515155e-05, "loss": 1.7809, "step": 993 }, { "epoch": 2.2590909090909093, "grad_norm": 15.243719100952148, "learning_rate": 2.5075757575757575e-05, "loss": 1.7409, "step": 994 }, { "epoch": 2.2613636363636362, "grad_norm": 16.965797424316406, "learning_rate": 2.5e-05, "loss": 3.2836, "step": 995 }, { "epoch": 2.2636363636363637, "grad_norm": 10.187609672546387, "learning_rate": 2.4924242424242424e-05, "loss": 1.5489, "step": 996 }, { "epoch": 2.265909090909091, "grad_norm": 9.865535736083984, "learning_rate": 2.4848484848484847e-05, "loss": 2.0742, "step": 997 }, { "epoch": 2.268181818181818, "grad_norm": 11.739052772521973, "learning_rate": 2.4772727272727277e-05, "loss": 1.4237, "step": 998 }, { 
"epoch": 2.2704545454545455, "grad_norm": 13.875876426696777, "learning_rate": 2.46969696969697e-05, "loss": 2.8714, "step": 999 }, { "epoch": 2.2727272727272725, "grad_norm": 11.909977912902832, "learning_rate": 2.4621212121212123e-05, "loss": 1.9434, "step": 1000 }, { "epoch": 2.275, "grad_norm": 13.642827033996582, "learning_rate": 2.4545454545454545e-05, "loss": 1.4233, "step": 1001 }, { "epoch": 2.2772727272727273, "grad_norm": 10.349024772644043, "learning_rate": 2.4469696969696972e-05, "loss": 1.5193, "step": 1002 }, { "epoch": 2.2795454545454543, "grad_norm": 8.302240371704102, "learning_rate": 2.4393939393939395e-05, "loss": 1.0769, "step": 1003 }, { "epoch": 2.2818181818181817, "grad_norm": 9.903936386108398, "learning_rate": 2.431818181818182e-05, "loss": 1.4596, "step": 1004 }, { "epoch": 2.284090909090909, "grad_norm": 7.976583957672119, "learning_rate": 2.4242424242424244e-05, "loss": 1.3187, "step": 1005 }, { "epoch": 2.286363636363636, "grad_norm": 8.382739067077637, "learning_rate": 2.4166666666666667e-05, "loss": 1.1004, "step": 1006 }, { "epoch": 2.2886363636363636, "grad_norm": 9.898600578308105, "learning_rate": 2.4090909090909093e-05, "loss": 1.3482, "step": 1007 }, { "epoch": 2.290909090909091, "grad_norm": 9.736372947692871, "learning_rate": 2.4015151515151516e-05, "loss": 1.0737, "step": 1008 }, { "epoch": 2.293181818181818, "grad_norm": 14.735883712768555, "learning_rate": 2.393939393939394e-05, "loss": 1.9045, "step": 1009 }, { "epoch": 2.2954545454545454, "grad_norm": 16.780405044555664, "learning_rate": 2.3863636363636365e-05, "loss": 1.9355, "step": 1010 }, { "epoch": 2.297727272727273, "grad_norm": 9.181320190429688, "learning_rate": 2.3787878787878788e-05, "loss": 1.4465, "step": 1011 }, { "epoch": 2.3, "grad_norm": 11.207884788513184, "learning_rate": 2.3712121212121214e-05, "loss": 1.6341, "step": 1012 }, { "epoch": 2.3022727272727272, "grad_norm": 12.287393569946289, "learning_rate": 2.3636363636363637e-05, "loss": 1.806, "step": 
1013 }, { "epoch": 2.3045454545454547, "grad_norm": 12.173286437988281, "learning_rate": 2.356060606060606e-05, "loss": 2.2166, "step": 1014 }, { "epoch": 2.3068181818181817, "grad_norm": 13.528629302978516, "learning_rate": 2.3484848484848487e-05, "loss": 1.5679, "step": 1015 }, { "epoch": 2.309090909090909, "grad_norm": 9.217406272888184, "learning_rate": 2.340909090909091e-05, "loss": 1.7179, "step": 1016 }, { "epoch": 2.3113636363636365, "grad_norm": 13.768959999084473, "learning_rate": 2.3333333333333336e-05, "loss": 2.1235, "step": 1017 }, { "epoch": 2.3136363636363635, "grad_norm": 9.60761833190918, "learning_rate": 2.325757575757576e-05, "loss": 1.3526, "step": 1018 }, { "epoch": 2.315909090909091, "grad_norm": 10.336706161499023, "learning_rate": 2.318181818181818e-05, "loss": 1.3543, "step": 1019 }, { "epoch": 2.3181818181818183, "grad_norm": 11.636757850646973, "learning_rate": 2.3106060606060605e-05, "loss": 1.8026, "step": 1020 }, { "epoch": 2.3204545454545453, "grad_norm": 10.546634674072266, "learning_rate": 2.3030303030303034e-05, "loss": 1.9753, "step": 1021 }, { "epoch": 2.3227272727272728, "grad_norm": 13.629782676696777, "learning_rate": 2.2954545454545457e-05, "loss": 1.6927, "step": 1022 }, { "epoch": 2.325, "grad_norm": 13.1149263381958, "learning_rate": 2.287878787878788e-05, "loss": 1.4331, "step": 1023 }, { "epoch": 2.327272727272727, "grad_norm": 10.624835968017578, "learning_rate": 2.2803030303030303e-05, "loss": 1.4769, "step": 1024 }, { "epoch": 2.3295454545454546, "grad_norm": 13.692902565002441, "learning_rate": 2.272727272727273e-05, "loss": 2.7543, "step": 1025 }, { "epoch": 2.331818181818182, "grad_norm": 10.054675102233887, "learning_rate": 2.2651515151515152e-05, "loss": 1.2323, "step": 1026 }, { "epoch": 2.334090909090909, "grad_norm": 14.394067764282227, "learning_rate": 2.257575757575758e-05, "loss": 2.094, "step": 1027 }, { "epoch": 2.3363636363636364, "grad_norm": 10.581347465515137, "learning_rate": 2.25e-05, "loss": 
2.2432, "step": 1028 }, { "epoch": 2.338636363636364, "grad_norm": 9.492446899414062, "learning_rate": 2.2424242424242424e-05, "loss": 1.3964, "step": 1029 }, { "epoch": 2.340909090909091, "grad_norm": 10.887022972106934, "learning_rate": 2.234848484848485e-05, "loss": 2.0411, "step": 1030 }, { "epoch": 2.3431818181818183, "grad_norm": 13.539667129516602, "learning_rate": 2.2272727272727274e-05, "loss": 1.3067, "step": 1031 }, { "epoch": 2.3454545454545457, "grad_norm": 9.191630363464355, "learning_rate": 2.21969696969697e-05, "loss": 1.266, "step": 1032 }, { "epoch": 2.3477272727272727, "grad_norm": 8.683979034423828, "learning_rate": 2.2121212121212123e-05, "loss": 0.8044, "step": 1033 }, { "epoch": 2.35, "grad_norm": 13.170730590820312, "learning_rate": 2.2045454545454546e-05, "loss": 2.2811, "step": 1034 }, { "epoch": 2.3522727272727275, "grad_norm": 11.17111873626709, "learning_rate": 2.1969696969696972e-05, "loss": 1.3998, "step": 1035 }, { "epoch": 2.3545454545454545, "grad_norm": 11.230095863342285, "learning_rate": 2.1893939393939395e-05, "loss": 2.0224, "step": 1036 }, { "epoch": 2.356818181818182, "grad_norm": 11.912615776062012, "learning_rate": 2.1818181818181818e-05, "loss": 1.5619, "step": 1037 }, { "epoch": 2.359090909090909, "grad_norm": 10.748661994934082, "learning_rate": 2.1742424242424244e-05, "loss": 1.924, "step": 1038 }, { "epoch": 2.3613636363636363, "grad_norm": 9.370635032653809, "learning_rate": 2.1666666666666667e-05, "loss": 1.1797, "step": 1039 }, { "epoch": 2.3636363636363638, "grad_norm": 10.01646900177002, "learning_rate": 2.1590909090909093e-05, "loss": 2.1678, "step": 1040 }, { "epoch": 2.3659090909090907, "grad_norm": 9.345016479492188, "learning_rate": 2.1515151515151516e-05, "loss": 1.4512, "step": 1041 }, { "epoch": 2.368181818181818, "grad_norm": 11.185441970825195, "learning_rate": 2.143939393939394e-05, "loss": 1.5958, "step": 1042 }, { "epoch": 2.3704545454545456, "grad_norm": 10.186037063598633, "learning_rate": 
2.1363636363636362e-05, "loss": 0.8744, "step": 1043 }, { "epoch": 2.3727272727272726, "grad_norm": 16.676177978515625, "learning_rate": 2.128787878787879e-05, "loss": 2.0851, "step": 1044 }, { "epoch": 2.375, "grad_norm": 12.497913360595703, "learning_rate": 2.1212121212121215e-05, "loss": 1.4765, "step": 1045 }, { "epoch": 2.3772727272727274, "grad_norm": 7.271422386169434, "learning_rate": 2.1136363636363638e-05, "loss": 1.0424, "step": 1046 }, { "epoch": 2.3795454545454544, "grad_norm": 14.968780517578125, "learning_rate": 2.106060606060606e-05, "loss": 2.1247, "step": 1047 }, { "epoch": 2.381818181818182, "grad_norm": 11.1759672164917, "learning_rate": 2.0984848484848483e-05, "loss": 1.5037, "step": 1048 }, { "epoch": 2.3840909090909093, "grad_norm": 9.880687713623047, "learning_rate": 2.090909090909091e-05, "loss": 0.8131, "step": 1049 }, { "epoch": 2.3863636363636362, "grad_norm": 7.559080123901367, "learning_rate": 2.0833333333333336e-05, "loss": 0.5826, "step": 1050 }, { "epoch": 2.3886363636363637, "grad_norm": 14.357791900634766, "learning_rate": 2.075757575757576e-05, "loss": 2.0945, "step": 1051 }, { "epoch": 2.390909090909091, "grad_norm": 11.396363258361816, "learning_rate": 2.0681818181818182e-05, "loss": 1.1564, "step": 1052 }, { "epoch": 2.393181818181818, "grad_norm": 11.255867958068848, "learning_rate": 2.0606060606060608e-05, "loss": 2.2688, "step": 1053 }, { "epoch": 2.3954545454545455, "grad_norm": 12.590128898620605, "learning_rate": 2.053030303030303e-05, "loss": 2.0123, "step": 1054 }, { "epoch": 2.3977272727272725, "grad_norm": 8.069854736328125, "learning_rate": 2.0454545454545457e-05, "loss": 1.3967, "step": 1055 }, { "epoch": 2.4, "grad_norm": 12.596185684204102, "learning_rate": 2.037878787878788e-05, "loss": 1.6038, "step": 1056 }, { "epoch": 2.4022727272727273, "grad_norm": 10.432991981506348, "learning_rate": 2.0303030303030303e-05, "loss": 1.645, "step": 1057 }, { "epoch": 2.4045454545454543, "grad_norm": 10.639815330505371, 
"learning_rate": 2.022727272727273e-05, "loss": 1.5334, "step": 1058 }, { "epoch": 2.4068181818181817, "grad_norm": 8.867145538330078, "learning_rate": 2.0151515151515152e-05, "loss": 1.2041, "step": 1059 }, { "epoch": 2.409090909090909, "grad_norm": 9.741902351379395, "learning_rate": 2.0075757575757575e-05, "loss": 1.4987, "step": 1060 }, { "epoch": 2.411363636363636, "grad_norm": 9.907489776611328, "learning_rate": 2e-05, "loss": 1.299, "step": 1061 }, { "epoch": 2.4136363636363636, "grad_norm": 8.68997859954834, "learning_rate": 1.9924242424242425e-05, "loss": 1.2559, "step": 1062 }, { "epoch": 2.415909090909091, "grad_norm": 9.990528106689453, "learning_rate": 1.984848484848485e-05, "loss": 2.3812, "step": 1063 }, { "epoch": 2.418181818181818, "grad_norm": 6.777112007141113, "learning_rate": 1.9772727272727274e-05, "loss": 1.0051, "step": 1064 }, { "epoch": 2.4204545454545454, "grad_norm": 13.396077156066895, "learning_rate": 1.9696969696969697e-05, "loss": 2.4201, "step": 1065 }, { "epoch": 2.422727272727273, "grad_norm": 13.596755981445312, "learning_rate": 1.962121212121212e-05, "loss": 2.0457, "step": 1066 }, { "epoch": 2.425, "grad_norm": 10.351893424987793, "learning_rate": 1.9545454545454546e-05, "loss": 1.9791, "step": 1067 }, { "epoch": 2.4272727272727272, "grad_norm": 7.505919933319092, "learning_rate": 1.9469696969696972e-05, "loss": 1.2944, "step": 1068 }, { "epoch": 2.4295454545454547, "grad_norm": 10.136748313903809, "learning_rate": 1.9393939393939395e-05, "loss": 1.2477, "step": 1069 }, { "epoch": 2.4318181818181817, "grad_norm": 8.979276657104492, "learning_rate": 1.9318181818181818e-05, "loss": 0.9829, "step": 1070 }, { "epoch": 2.434090909090909, "grad_norm": 11.097721099853516, "learning_rate": 1.924242424242424e-05, "loss": 1.5509, "step": 1071 }, { "epoch": 2.4363636363636365, "grad_norm": 10.789654731750488, "learning_rate": 1.9166666666666667e-05, "loss": 1.7344, "step": 1072 }, { "epoch": 2.4386363636363635, "grad_norm": 
12.25899887084961, "learning_rate": 1.9090909090909094e-05, "loss": 2.0121, "step": 1073 }, { "epoch": 2.440909090909091, "grad_norm": 11.828030586242676, "learning_rate": 1.9015151515151516e-05, "loss": 1.7356, "step": 1074 }, { "epoch": 2.4431818181818183, "grad_norm": 10.524036407470703, "learning_rate": 1.893939393939394e-05, "loss": 1.402, "step": 1075 }, { "epoch": 2.4454545454545453, "grad_norm": 10.572868347167969, "learning_rate": 1.8863636363636362e-05, "loss": 1.6468, "step": 1076 }, { "epoch": 2.4477272727272728, "grad_norm": 9.194175720214844, "learning_rate": 1.878787878787879e-05, "loss": 1.1557, "step": 1077 }, { "epoch": 2.45, "grad_norm": 11.355244636535645, "learning_rate": 1.8712121212121215e-05, "loss": 1.7729, "step": 1078 }, { "epoch": 2.452272727272727, "grad_norm": 10.380278587341309, "learning_rate": 1.8636363636363638e-05, "loss": 2.3491, "step": 1079 }, { "epoch": 2.4545454545454546, "grad_norm": 9.57583236694336, "learning_rate": 1.856060606060606e-05, "loss": 1.6112, "step": 1080 }, { "epoch": 2.456818181818182, "grad_norm": 12.973028182983398, "learning_rate": 1.8484848484848487e-05, "loss": 1.5272, "step": 1081 }, { "epoch": 2.459090909090909, "grad_norm": 9.473404884338379, "learning_rate": 1.840909090909091e-05, "loss": 1.2366, "step": 1082 }, { "epoch": 2.4613636363636364, "grad_norm": 9.843785285949707, "learning_rate": 1.8333333333333333e-05, "loss": 1.6283, "step": 1083 }, { "epoch": 2.463636363636364, "grad_norm": 13.467684745788574, "learning_rate": 1.825757575757576e-05, "loss": 1.5219, "step": 1084 }, { "epoch": 2.465909090909091, "grad_norm": 8.460468292236328, "learning_rate": 1.8181818181818182e-05, "loss": 0.8931, "step": 1085 }, { "epoch": 2.4681818181818183, "grad_norm": 8.956411361694336, "learning_rate": 1.810606060606061e-05, "loss": 1.1577, "step": 1086 }, { "epoch": 2.4704545454545457, "grad_norm": 10.919206619262695, "learning_rate": 1.803030303030303e-05, "loss": 1.719, "step": 1087 }, { "epoch": 
2.4727272727272727, "grad_norm": 10.65345287322998, "learning_rate": 1.7954545454545454e-05, "loss": 1.5257, "step": 1088 }, { "epoch": 2.475, "grad_norm": 9.616610527038574, "learning_rate": 1.787878787878788e-05, "loss": 1.4704, "step": 1089 }, { "epoch": 2.4772727272727275, "grad_norm": 14.458331108093262, "learning_rate": 1.7803030303030303e-05, "loss": 1.4181, "step": 1090 }, { "epoch": 2.4795454545454545, "grad_norm": 8.37006664276123, "learning_rate": 1.772727272727273e-05, "loss": 1.191, "step": 1091 }, { "epoch": 2.481818181818182, "grad_norm": 13.129170417785645, "learning_rate": 1.7651515151515153e-05, "loss": 1.9966, "step": 1092 }, { "epoch": 2.484090909090909, "grad_norm": 12.65162181854248, "learning_rate": 1.7575757575757576e-05, "loss": 1.7372, "step": 1093 }, { "epoch": 2.4863636363636363, "grad_norm": 12.132272720336914, "learning_rate": 1.75e-05, "loss": 1.9386, "step": 1094 }, { "epoch": 2.4886363636363638, "grad_norm": 11.549707412719727, "learning_rate": 1.7424242424242425e-05, "loss": 1.2838, "step": 1095 }, { "epoch": 2.4909090909090907, "grad_norm": 10.115202903747559, "learning_rate": 1.734848484848485e-05, "loss": 1.7778, "step": 1096 }, { "epoch": 2.493181818181818, "grad_norm": 14.97376823425293, "learning_rate": 1.7272727272727274e-05, "loss": 2.5436, "step": 1097 }, { "epoch": 2.4954545454545456, "grad_norm": 10.270051956176758, "learning_rate": 1.7196969696969697e-05, "loss": 1.3943, "step": 1098 }, { "epoch": 2.4977272727272726, "grad_norm": 11.584896087646484, "learning_rate": 1.712121212121212e-05, "loss": 1.8023, "step": 1099 }, { "epoch": 2.5, "grad_norm": 11.003795623779297, "learning_rate": 1.7045454545454546e-05, "loss": 1.2057, "step": 1100 }, { "epoch": 2.5022727272727274, "grad_norm": 10.495930671691895, "learning_rate": 1.6969696969696972e-05, "loss": 1.7265, "step": 1101 }, { "epoch": 2.5045454545454544, "grad_norm": 10.6824951171875, "learning_rate": 1.6893939393939395e-05, "loss": 1.4241, "step": 1102 }, { "epoch": 
2.506818181818182, "grad_norm": 10.532041549682617, "learning_rate": 1.6818181818181818e-05, "loss": 1.4532, "step": 1103 }, { "epoch": 2.509090909090909, "grad_norm": 8.671700477600098, "learning_rate": 1.674242424242424e-05, "loss": 1.2539, "step": 1104 }, { "epoch": 2.5113636363636362, "grad_norm": 14.828866004943848, "learning_rate": 1.6666666666666667e-05, "loss": 1.4732, "step": 1105 }, { "epoch": 2.5136363636363637, "grad_norm": 11.871790885925293, "learning_rate": 1.6590909090909094e-05, "loss": 1.7559, "step": 1106 }, { "epoch": 2.5159090909090907, "grad_norm": 9.144551277160645, "learning_rate": 1.6515151515151517e-05, "loss": 1.3562, "step": 1107 }, { "epoch": 2.518181818181818, "grad_norm": 9.856282234191895, "learning_rate": 1.643939393939394e-05, "loss": 1.4721, "step": 1108 }, { "epoch": 2.5204545454545455, "grad_norm": 8.48530101776123, "learning_rate": 1.6363636363636366e-05, "loss": 1.0045, "step": 1109 }, { "epoch": 2.5227272727272725, "grad_norm": 16.73642349243164, "learning_rate": 1.628787878787879e-05, "loss": 2.4458, "step": 1110 }, { "epoch": 2.525, "grad_norm": 10.180378913879395, "learning_rate": 1.6212121212121212e-05, "loss": 1.3323, "step": 1111 }, { "epoch": 2.5272727272727273, "grad_norm": 11.56425666809082, "learning_rate": 1.6136363636363638e-05, "loss": 2.0303, "step": 1112 }, { "epoch": 2.5295454545454543, "grad_norm": 14.644630432128906, "learning_rate": 1.606060606060606e-05, "loss": 1.9247, "step": 1113 }, { "epoch": 2.5318181818181817, "grad_norm": 11.767682075500488, "learning_rate": 1.5984848484848487e-05, "loss": 1.7903, "step": 1114 }, { "epoch": 2.534090909090909, "grad_norm": 11.074971199035645, "learning_rate": 1.590909090909091e-05, "loss": 2.0781, "step": 1115 }, { "epoch": 2.536363636363636, "grad_norm": 13.846643447875977, "learning_rate": 1.5833333333333333e-05, "loss": 1.2449, "step": 1116 }, { "epoch": 2.5386363636363636, "grad_norm": 12.496777534484863, "learning_rate": 1.5757575757575756e-05, "loss": 1.287, 
"step": 1117 }, { "epoch": 2.540909090909091, "grad_norm": 8.406025886535645, "learning_rate": 1.5681818181818182e-05, "loss": 1.4133, "step": 1118 }, { "epoch": 2.543181818181818, "grad_norm": 9.715517044067383, "learning_rate": 1.560606060606061e-05, "loss": 1.6738, "step": 1119 }, { "epoch": 2.5454545454545454, "grad_norm": 14.14928913116455, "learning_rate": 1.553030303030303e-05, "loss": 1.9505, "step": 1120 }, { "epoch": 2.547727272727273, "grad_norm": 10.110836029052734, "learning_rate": 1.5454545454545454e-05, "loss": 1.4759, "step": 1121 }, { "epoch": 2.55, "grad_norm": 15.94524097442627, "learning_rate": 1.5378787878787877e-05, "loss": 1.7516, "step": 1122 }, { "epoch": 2.5522727272727272, "grad_norm": 16.20330047607422, "learning_rate": 1.5303030303030304e-05, "loss": 2.1093, "step": 1123 }, { "epoch": 2.5545454545454547, "grad_norm": 8.647255897521973, "learning_rate": 1.5227272727272728e-05, "loss": 1.0308, "step": 1124 }, { "epoch": 2.5568181818181817, "grad_norm": 8.955947875976562, "learning_rate": 1.5151515151515153e-05, "loss": 1.0129, "step": 1125 }, { "epoch": 2.559090909090909, "grad_norm": 12.877582550048828, "learning_rate": 1.5075757575757576e-05, "loss": 1.4853, "step": 1126 }, { "epoch": 2.5613636363636365, "grad_norm": 14.299208641052246, "learning_rate": 1.5e-05, "loss": 2.0464, "step": 1127 }, { "epoch": 2.5636363636363635, "grad_norm": 14.365765571594238, "learning_rate": 1.4924242424242423e-05, "loss": 1.9381, "step": 1128 }, { "epoch": 2.565909090909091, "grad_norm": 10.231593132019043, "learning_rate": 1.484848484848485e-05, "loss": 1.6777, "step": 1129 }, { "epoch": 2.5681818181818183, "grad_norm": 14.259530067443848, "learning_rate": 1.4772727272727274e-05, "loss": 1.6438, "step": 1130 }, { "epoch": 2.5704545454545453, "grad_norm": 13.114981651306152, "learning_rate": 1.4696969696969697e-05, "loss": 1.3336, "step": 1131 }, { "epoch": 2.5727272727272728, "grad_norm": 9.463297843933105, "learning_rate": 1.4621212121212122e-05, 
"loss": 1.203, "step": 1132 }, { "epoch": 2.575, "grad_norm": 9.805520057678223, "learning_rate": 1.4545454545454545e-05, "loss": 1.2487, "step": 1133 }, { "epoch": 2.577272727272727, "grad_norm": 14.853455543518066, "learning_rate": 1.446969696969697e-05, "loss": 1.5734, "step": 1134 }, { "epoch": 2.5795454545454546, "grad_norm": 11.86341381072998, "learning_rate": 1.4393939393939396e-05, "loss": 1.4835, "step": 1135 }, { "epoch": 2.581818181818182, "grad_norm": 11.581096649169922, "learning_rate": 1.431818181818182e-05, "loss": 2.0558, "step": 1136 }, { "epoch": 2.584090909090909, "grad_norm": 12.040521621704102, "learning_rate": 1.4242424242424243e-05, "loss": 1.4117, "step": 1137 }, { "epoch": 2.5863636363636364, "grad_norm": 13.00901985168457, "learning_rate": 1.4166666666666668e-05, "loss": 2.9511, "step": 1138 }, { "epoch": 2.588636363636364, "grad_norm": 9.332910537719727, "learning_rate": 1.409090909090909e-05, "loss": 1.1121, "step": 1139 }, { "epoch": 2.590909090909091, "grad_norm": 10.607443809509277, "learning_rate": 1.4015151515151515e-05, "loss": 1.4706, "step": 1140 }, { "epoch": 2.5931818181818183, "grad_norm": 9.47099494934082, "learning_rate": 1.3939393939393942e-05, "loss": 1.6907, "step": 1141 }, { "epoch": 2.5954545454545457, "grad_norm": 12.868734359741211, "learning_rate": 1.3863636363636364e-05, "loss": 1.334, "step": 1142 }, { "epoch": 2.5977272727272727, "grad_norm": 7.338480472564697, "learning_rate": 1.3787878787878789e-05, "loss": 0.6364, "step": 1143 }, { "epoch": 2.6, "grad_norm": 10.434823989868164, "learning_rate": 1.3712121212121212e-05, "loss": 1.7292, "step": 1144 }, { "epoch": 2.6022727272727275, "grad_norm": 10.510713577270508, "learning_rate": 1.3636363636363637e-05, "loss": 1.555, "step": 1145 }, { "epoch": 2.6045454545454545, "grad_norm": 11.927501678466797, "learning_rate": 1.3560606060606063e-05, "loss": 1.7373, "step": 1146 }, { "epoch": 2.606818181818182, "grad_norm": 8.673569679260254, "learning_rate": 
1.3484848484848486e-05, "loss": 1.3046, "step": 1147 }, { "epoch": 2.6090909090909093, "grad_norm": 9.680171012878418, "learning_rate": 1.340909090909091e-05, "loss": 1.2691, "step": 1148 }, { "epoch": 2.6113636363636363, "grad_norm": 20.66661834716797, "learning_rate": 1.3333333333333333e-05, "loss": 3.1138, "step": 1149 }, { "epoch": 2.6136363636363638, "grad_norm": 59.59333801269531, "learning_rate": 1.3257575757575758e-05, "loss": 1.8486, "step": 1150 }, { "epoch": 2.615909090909091, "grad_norm": 9.416550636291504, "learning_rate": 1.318181818181818e-05, "loss": 1.198, "step": 1151 }, { "epoch": 2.618181818181818, "grad_norm": 11.847350120544434, "learning_rate": 1.3106060606060607e-05, "loss": 1.494, "step": 1152 }, { "epoch": 2.6204545454545456, "grad_norm": 8.2369966506958, "learning_rate": 1.3030303030303032e-05, "loss": 0.8885, "step": 1153 }, { "epoch": 2.6227272727272726, "grad_norm": 13.204099655151367, "learning_rate": 1.2954545454545455e-05, "loss": 1.9838, "step": 1154 }, { "epoch": 2.625, "grad_norm": 11.384471893310547, "learning_rate": 1.287878787878788e-05, "loss": 1.5648, "step": 1155 }, { "epoch": 2.6272727272727274, "grad_norm": 43.95447540283203, "learning_rate": 1.2803030303030302e-05, "loss": 1.6246, "step": 1156 }, { "epoch": 2.6295454545454544, "grad_norm": 12.041752815246582, "learning_rate": 1.2727272727272727e-05, "loss": 1.6404, "step": 1157 }, { "epoch": 2.631818181818182, "grad_norm": 13.470951080322266, "learning_rate": 1.2651515151515153e-05, "loss": 2.1278, "step": 1158 }, { "epoch": 2.634090909090909, "grad_norm": 12.769510269165039, "learning_rate": 1.2575757575757578e-05, "loss": 1.6486, "step": 1159 }, { "epoch": 2.6363636363636362, "grad_norm": 9.455702781677246, "learning_rate": 1.25e-05, "loss": 1.5211, "step": 1160 }, { "epoch": 2.6386363636363637, "grad_norm": 13.590509414672852, "learning_rate": 1.2424242424242424e-05, "loss": 2.081, "step": 1161 }, { "epoch": 2.6409090909090907, "grad_norm": 12.029936790466309, 
"learning_rate": 1.234848484848485e-05, "loss": 1.6036, "step": 1162 }, { "epoch": 2.643181818181818, "grad_norm": 65.75121307373047, "learning_rate": 1.2272727272727273e-05, "loss": 1.5853, "step": 1163 }, { "epoch": 2.6454545454545455, "grad_norm": 13.093693733215332, "learning_rate": 1.2196969696969697e-05, "loss": 1.4623, "step": 1164 }, { "epoch": 2.6477272727272725, "grad_norm": 14.704643249511719, "learning_rate": 1.2121212121212122e-05, "loss": 1.7431, "step": 1165 }, { "epoch": 2.65, "grad_norm": 10.710149765014648, "learning_rate": 1.2045454545454547e-05, "loss": 1.6442, "step": 1166 }, { "epoch": 2.6522727272727273, "grad_norm": 12.05364990234375, "learning_rate": 1.196969696969697e-05, "loss": 2.0733, "step": 1167 }, { "epoch": 2.6545454545454543, "grad_norm": 12.834985733032227, "learning_rate": 1.1893939393939394e-05, "loss": 2.8648, "step": 1168 }, { "epoch": 2.6568181818181817, "grad_norm": 9.302035331726074, "learning_rate": 1.1818181818181819e-05, "loss": 1.1539, "step": 1169 }, { "epoch": 2.659090909090909, "grad_norm": 9.240340232849121, "learning_rate": 1.1742424242424243e-05, "loss": 1.5434, "step": 1170 }, { "epoch": 2.661363636363636, "grad_norm": 14.066667556762695, "learning_rate": 1.1666666666666668e-05, "loss": 1.7866, "step": 1171 }, { "epoch": 2.6636363636363636, "grad_norm": 10.935914039611816, "learning_rate": 1.159090909090909e-05, "loss": 1.4766, "step": 1172 }, { "epoch": 2.665909090909091, "grad_norm": 8.409308433532715, "learning_rate": 1.1515151515151517e-05, "loss": 1.3846, "step": 1173 }, { "epoch": 2.668181818181818, "grad_norm": 10.203055381774902, "learning_rate": 1.143939393939394e-05, "loss": 1.1693, "step": 1174 }, { "epoch": 2.6704545454545454, "grad_norm": 11.417679786682129, "learning_rate": 1.1363636363636365e-05, "loss": 1.9941, "step": 1175 }, { "epoch": 2.672727272727273, "grad_norm": 13.196696281433105, "learning_rate": 1.128787878787879e-05, "loss": 1.8474, "step": 1176 }, { "epoch": 2.675, "grad_norm": 
11.088204383850098, "learning_rate": 1.1212121212121212e-05, "loss": 1.7153, "step": 1177 }, { "epoch": 2.6772727272727272, "grad_norm": 12.048771858215332, "learning_rate": 1.1136363636363637e-05, "loss": 2.5212, "step": 1178 }, { "epoch": 2.6795454545454547, "grad_norm": 13.929719924926758, "learning_rate": 1.1060606060606061e-05, "loss": 2.3728, "step": 1179 }, { "epoch": 2.6818181818181817, "grad_norm": 10.445011138916016, "learning_rate": 1.0984848484848486e-05, "loss": 0.9737, "step": 1180 }, { "epoch": 2.684090909090909, "grad_norm": 14.0521821975708, "learning_rate": 1.0909090909090909e-05, "loss": 1.6476, "step": 1181 }, { "epoch": 2.6863636363636365, "grad_norm": 10.526323318481445, "learning_rate": 1.0833333333333334e-05, "loss": 1.4206, "step": 1182 }, { "epoch": 2.6886363636363635, "grad_norm": 11.84065055847168, "learning_rate": 1.0757575757575758e-05, "loss": 2.5504, "step": 1183 }, { "epoch": 2.690909090909091, "grad_norm": 13.432804107666016, "learning_rate": 1.0681818181818181e-05, "loss": 1.1723, "step": 1184 }, { "epoch": 2.6931818181818183, "grad_norm": 10.570472717285156, "learning_rate": 1.0606060606060607e-05, "loss": 1.3094, "step": 1185 }, { "epoch": 2.6954545454545453, "grad_norm": 9.313067436218262, "learning_rate": 1.053030303030303e-05, "loss": 1.3848, "step": 1186 }, { "epoch": 2.6977272727272728, "grad_norm": 12.77459716796875, "learning_rate": 1.0454545454545455e-05, "loss": 1.9546, "step": 1187 }, { "epoch": 2.7, "grad_norm": 12.23890495300293, "learning_rate": 1.037878787878788e-05, "loss": 1.858, "step": 1188 }, { "epoch": 2.702272727272727, "grad_norm": 10.90783977508545, "learning_rate": 1.0303030303030304e-05, "loss": 1.7215, "step": 1189 }, { "epoch": 2.7045454545454546, "grad_norm": 11.610969543457031, "learning_rate": 1.0227272727272729e-05, "loss": 1.3744, "step": 1190 }, { "epoch": 2.706818181818182, "grad_norm": 13.296714782714844, "learning_rate": 1.0151515151515152e-05, "loss": 1.3959, "step": 1191 }, { "epoch": 
2.709090909090909, "grad_norm": 11.602737426757812, "learning_rate": 1.0075757575757576e-05, "loss": 0.9706, "step": 1192 }, { "epoch": 2.7113636363636364, "grad_norm": 8.904767036437988, "learning_rate": 1e-05, "loss": 1.1206, "step": 1193 }, { "epoch": 2.713636363636364, "grad_norm": 9.719966888427734, "learning_rate": 9.924242424242425e-06, "loss": 1.326, "step": 1194 }, { "epoch": 2.715909090909091, "grad_norm": 11.37736701965332, "learning_rate": 9.848484848484848e-06, "loss": 1.2423, "step": 1195 }, { "epoch": 2.7181818181818183, "grad_norm": 8.89704418182373, "learning_rate": 9.772727272727273e-06, "loss": 1.5434, "step": 1196 }, { "epoch": 2.7204545454545457, "grad_norm": 11.980868339538574, "learning_rate": 9.696969696969698e-06, "loss": 1.9285, "step": 1197 }, { "epoch": 2.7227272727272727, "grad_norm": 20.147335052490234, "learning_rate": 9.62121212121212e-06, "loss": 1.9032, "step": 1198 }, { "epoch": 2.725, "grad_norm": 12.508543014526367, "learning_rate": 9.545454545454547e-06, "loss": 2.549, "step": 1199 }, { "epoch": 2.7272727272727275, "grad_norm": 15.286222457885742, "learning_rate": 9.46969696969697e-06, "loss": 1.7541, "step": 1200 }, { "epoch": 2.7295454545454545, "grad_norm": 9.950079917907715, "learning_rate": 9.393939393939394e-06, "loss": 1.0859, "step": 1201 }, { "epoch": 2.731818181818182, "grad_norm": 9.034377098083496, "learning_rate": 9.318181818181819e-06, "loss": 1.6942, "step": 1202 }, { "epoch": 2.7340909090909093, "grad_norm": 10.347823143005371, "learning_rate": 9.242424242424244e-06, "loss": 0.7853, "step": 1203 }, { "epoch": 2.7363636363636363, "grad_norm": 13.554040908813477, "learning_rate": 9.166666666666666e-06, "loss": 1.6867, "step": 1204 }, { "epoch": 2.7386363636363638, "grad_norm": 12.764242172241211, "learning_rate": 9.090909090909091e-06, "loss": 1.7983, "step": 1205 }, { "epoch": 2.740909090909091, "grad_norm": 13.305977821350098, "learning_rate": 9.015151515151516e-06, "loss": 1.5904, "step": 1206 }, { "epoch": 
2.743181818181818, "grad_norm": 16.118629455566406, "learning_rate": 8.93939393939394e-06, "loss": 1.593, "step": 1207 }, { "epoch": 2.7454545454545456, "grad_norm": 9.158020973205566, "learning_rate": 8.863636363636365e-06, "loss": 1.2809, "step": 1208 }, { "epoch": 2.7477272727272726, "grad_norm": 12.490316390991211, "learning_rate": 8.787878787878788e-06, "loss": 1.5405, "step": 1209 }, { "epoch": 2.75, "grad_norm": 12.778218269348145, "learning_rate": 8.712121212121212e-06, "loss": 1.4892, "step": 1210 }, { "epoch": 2.7522727272727274, "grad_norm": 11.4492826461792, "learning_rate": 8.636363636363637e-06, "loss": 1.2019, "step": 1211 }, { "epoch": 2.7545454545454544, "grad_norm": 13.168742179870605, "learning_rate": 8.56060606060606e-06, "loss": 1.6647, "step": 1212 }, { "epoch": 2.756818181818182, "grad_norm": 10.593256950378418, "learning_rate": 8.484848484848486e-06, "loss": 1.3455, "step": 1213 }, { "epoch": 2.759090909090909, "grad_norm": 12.997807502746582, "learning_rate": 8.409090909090909e-06, "loss": 1.6967, "step": 1214 }, { "epoch": 2.7613636363636362, "grad_norm": 16.37111473083496, "learning_rate": 8.333333333333334e-06, "loss": 1.7001, "step": 1215 }, { "epoch": 2.7636363636363637, "grad_norm": 11.749297142028809, "learning_rate": 8.257575757575758e-06, "loss": 0.9918, "step": 1216 }, { "epoch": 2.7659090909090907, "grad_norm": 9.196391105651855, "learning_rate": 8.181818181818183e-06, "loss": 1.3952, "step": 1217 }, { "epoch": 2.768181818181818, "grad_norm": 7.304767608642578, "learning_rate": 8.106060606060606e-06, "loss": 0.9309, "step": 1218 }, { "epoch": 2.7704545454545455, "grad_norm": 11.371389389038086, "learning_rate": 8.03030303030303e-06, "loss": 2.2034, "step": 1219 }, { "epoch": 2.7727272727272725, "grad_norm": 10.503549575805664, "learning_rate": 7.954545454545455e-06, "loss": 1.0822, "step": 1220 }, { "epoch": 2.775, "grad_norm": 11.071968078613281, "learning_rate": 7.878787878787878e-06, "loss": 1.7071, "step": 1221 }, { "epoch": 
2.7772727272727273, "grad_norm": 11.416297912597656, "learning_rate": 7.803030303030304e-06, "loss": 2.0261, "step": 1222 }, { "epoch": 2.7795454545454543, "grad_norm": 15.829241752624512, "learning_rate": 7.727272727272727e-06, "loss": 2.0085, "step": 1223 }, { "epoch": 2.7818181818181817, "grad_norm": 8.403531074523926, "learning_rate": 7.651515151515152e-06, "loss": 1.2764, "step": 1224 }, { "epoch": 2.784090909090909, "grad_norm": 11.730886459350586, "learning_rate": 7.5757575757575764e-06, "loss": 1.6733, "step": 1225 }, { "epoch": 2.786363636363636, "grad_norm": 13.102418899536133, "learning_rate": 7.5e-06, "loss": 2.139, "step": 1226 }, { "epoch": 2.7886363636363636, "grad_norm": 14.804220199584961, "learning_rate": 7.424242424242425e-06, "loss": 2.1015, "step": 1227 }, { "epoch": 2.790909090909091, "grad_norm": 11.839103698730469, "learning_rate": 7.3484848484848486e-06, "loss": 1.6026, "step": 1228 }, { "epoch": 2.793181818181818, "grad_norm": 17.421327590942383, "learning_rate": 7.272727272727272e-06, "loss": 2.7038, "step": 1229 }, { "epoch": 2.7954545454545454, "grad_norm": 14.81433391571045, "learning_rate": 7.196969696969698e-06, "loss": 1.702, "step": 1230 }, { "epoch": 2.797727272727273, "grad_norm": 7.195108413696289, "learning_rate": 7.1212121212121215e-06, "loss": 0.9022, "step": 1231 }, { "epoch": 2.8, "grad_norm": 9.045830726623535, "learning_rate": 7.045454545454545e-06, "loss": 1.0748, "step": 1232 }, { "epoch": 2.8022727272727272, "grad_norm": 11.995684623718262, "learning_rate": 6.969696969696971e-06, "loss": 2.5776, "step": 1233 }, { "epoch": 2.8045454545454547, "grad_norm": 10.528661727905273, "learning_rate": 6.8939393939393945e-06, "loss": 1.8155, "step": 1234 }, { "epoch": 2.8068181818181817, "grad_norm": 34.72589111328125, "learning_rate": 6.818181818181818e-06, "loss": 2.5481, "step": 1235 }, { "epoch": 2.809090909090909, "grad_norm": 8.032730102539062, "learning_rate": 6.742424242424243e-06, "loss": 0.736, "step": 1236 }, { "epoch": 
2.8113636363636365, "grad_norm": 9.088884353637695, "learning_rate": 6.666666666666667e-06, "loss": 1.6364, "step": 1237 }, { "epoch": 2.8136363636363635, "grad_norm": 9.277338027954102, "learning_rate": 6.59090909090909e-06, "loss": 1.4521, "step": 1238 }, { "epoch": 2.815909090909091, "grad_norm": 12.458305358886719, "learning_rate": 6.515151515151516e-06, "loss": 1.2296, "step": 1239 }, { "epoch": 2.8181818181818183, "grad_norm": 10.594490051269531, "learning_rate": 6.43939393939394e-06, "loss": 1.414, "step": 1240 }, { "epoch": 2.8204545454545453, "grad_norm": 10.604024887084961, "learning_rate": 6.363636363636363e-06, "loss": 1.7017, "step": 1241 }, { "epoch": 2.8227272727272728, "grad_norm": 10.347737312316895, "learning_rate": 6.287878787878789e-06, "loss": 1.2462, "step": 1242 }, { "epoch": 2.825, "grad_norm": 11.151006698608398, "learning_rate": 6.212121212121212e-06, "loss": 1.7713, "step": 1243 }, { "epoch": 2.827272727272727, "grad_norm": 12.432381629943848, "learning_rate": 6.136363636363636e-06, "loss": 2.7927, "step": 1244 }, { "epoch": 2.8295454545454546, "grad_norm": 12.030777931213379, "learning_rate": 6.060606060606061e-06, "loss": 2.1842, "step": 1245 }, { "epoch": 2.831818181818182, "grad_norm": 14.940272331237793, "learning_rate": 5.984848484848485e-06, "loss": 1.6475, "step": 1246 }, { "epoch": 2.834090909090909, "grad_norm": 8.027610778808594, "learning_rate": 5.909090909090909e-06, "loss": 0.948, "step": 1247 }, { "epoch": 2.8363636363636364, "grad_norm": 12.356363296508789, "learning_rate": 5.833333333333334e-06, "loss": 1.6191, "step": 1248 }, { "epoch": 2.838636363636364, "grad_norm": 12.225868225097656, "learning_rate": 5.7575757575757586e-06, "loss": 1.2056, "step": 1249 }, { "epoch": 2.840909090909091, "grad_norm": 11.615985870361328, "learning_rate": 5.681818181818182e-06, "loss": 1.5477, "step": 1250 }, { "epoch": 2.8431818181818183, "grad_norm": 13.92235279083252, "learning_rate": 5.606060606060606e-06, "loss": 2.401, "step": 1251 
}, { "epoch": 2.8454545454545457, "grad_norm": 19.311002731323242, "learning_rate": 5.530303030303031e-06, "loss": 2.2211, "step": 1252 }, { "epoch": 2.8477272727272727, "grad_norm": 9.447689056396484, "learning_rate": 5.4545454545454545e-06, "loss": 1.2734, "step": 1253 }, { "epoch": 2.85, "grad_norm": 10.197713851928711, "learning_rate": 5.378787878787879e-06, "loss": 0.878, "step": 1254 }, { "epoch": 2.8522727272727275, "grad_norm": 14.826508522033691, "learning_rate": 5.303030303030304e-06, "loss": 1.6759, "step": 1255 }, { "epoch": 2.8545454545454545, "grad_norm": 10.666242599487305, "learning_rate": 5.2272727272727274e-06, "loss": 2.1974, "step": 1256 }, { "epoch": 2.856818181818182, "grad_norm": 13.020369529724121, "learning_rate": 5.151515151515152e-06, "loss": 1.4073, "step": 1257 }, { "epoch": 2.8590909090909093, "grad_norm": 14.27531623840332, "learning_rate": 5.075757575757576e-06, "loss": 2.1165, "step": 1258 }, { "epoch": 2.8613636363636363, "grad_norm": 11.82662296295166, "learning_rate": 5e-06, "loss": 0.7765, "step": 1259 }, { "epoch": 2.8636363636363638, "grad_norm": 12.107914924621582, "learning_rate": 4.924242424242424e-06, "loss": 1.2762, "step": 1260 }, { "epoch": 2.865909090909091, "grad_norm": 10.041885375976562, "learning_rate": 4.848484848484849e-06, "loss": 2.1775, "step": 1261 }, { "epoch": 2.868181818181818, "grad_norm": 11.078441619873047, "learning_rate": 4.772727272727273e-06, "loss": 1.6073, "step": 1262 }, { "epoch": 2.8704545454545456, "grad_norm": 9.000492095947266, "learning_rate": 4.696969696969697e-06, "loss": 1.4636, "step": 1263 }, { "epoch": 2.8727272727272726, "grad_norm": 11.069653511047363, "learning_rate": 4.621212121212122e-06, "loss": 1.4654, "step": 1264 }, { "epoch": 2.875, "grad_norm": 9.110404968261719, "learning_rate": 4.5454545454545455e-06, "loss": 1.8338, "step": 1265 }, { "epoch": 2.8772727272727274, "grad_norm": 16.761194229125977, "learning_rate": 4.46969696969697e-06, "loss": 1.0709, "step": 1266 }, { 
"epoch": 2.8795454545454544, "grad_norm": 13.67717170715332, "learning_rate": 4.393939393939394e-06, "loss": 2.0994, "step": 1267 }, { "epoch": 2.881818181818182, "grad_norm": 8.258940696716309, "learning_rate": 4.3181818181818185e-06, "loss": 1.2818, "step": 1268 }, { "epoch": 2.884090909090909, "grad_norm": 12.960264205932617, "learning_rate": 4.242424242424243e-06, "loss": 1.9218, "step": 1269 }, { "epoch": 2.8863636363636362, "grad_norm": 10.886972427368164, "learning_rate": 4.166666666666667e-06, "loss": 1.4611, "step": 1270 }, { "epoch": 2.8886363636363637, "grad_norm": 10.516489028930664, "learning_rate": 4.0909090909090915e-06, "loss": 2.3418, "step": 1271 }, { "epoch": 2.8909090909090907, "grad_norm": 12.977254867553711, "learning_rate": 4.015151515151515e-06, "loss": 1.5361, "step": 1272 }, { "epoch": 2.893181818181818, "grad_norm": 14.605803489685059, "learning_rate": 3.939393939393939e-06, "loss": 1.6679, "step": 1273 }, { "epoch": 2.8954545454545455, "grad_norm": 17.729450225830078, "learning_rate": 3.863636363636364e-06, "loss": 1.468, "step": 1274 }, { "epoch": 2.8977272727272725, "grad_norm": 10.65392780303955, "learning_rate": 3.7878787878787882e-06, "loss": 1.8606, "step": 1275 }, { "epoch": 2.9, "grad_norm": 18.738691329956055, "learning_rate": 3.7121212121212124e-06, "loss": 2.7391, "step": 1276 }, { "epoch": 2.9022727272727273, "grad_norm": 11.129204750061035, "learning_rate": 3.636363636363636e-06, "loss": 1.4911, "step": 1277 }, { "epoch": 2.9045454545454543, "grad_norm": 10.117977142333984, "learning_rate": 3.5606060606060608e-06, "loss": 1.0915, "step": 1278 }, { "epoch": 2.9068181818181817, "grad_norm": 9.391002655029297, "learning_rate": 3.4848484848484854e-06, "loss": 1.1659, "step": 1279 }, { "epoch": 2.909090909090909, "grad_norm": 10.86440372467041, "learning_rate": 3.409090909090909e-06, "loss": 1.4967, "step": 1280 }, { "epoch": 2.911363636363636, "grad_norm": 11.438384056091309, "learning_rate": 3.3333333333333333e-06, "loss": 
1.6597, "step": 1281 }, { "epoch": 2.9136363636363636, "grad_norm": 13.486211776733398, "learning_rate": 3.257575757575758e-06, "loss": 1.947, "step": 1282 }, { "epoch": 2.915909090909091, "grad_norm": 13.491000175476074, "learning_rate": 3.1818181818181817e-06, "loss": 2.4163, "step": 1283 }, { "epoch": 2.918181818181818, "grad_norm": 10.710677146911621, "learning_rate": 3.106060606060606e-06, "loss": 1.8073, "step": 1284 }, { "epoch": 2.9204545454545454, "grad_norm": 12.062322616577148, "learning_rate": 3.0303030303030305e-06, "loss": 1.9969, "step": 1285 }, { "epoch": 2.922727272727273, "grad_norm": 70.31402587890625, "learning_rate": 2.9545454545454547e-06, "loss": 1.3767, "step": 1286 }, { "epoch": 2.925, "grad_norm": 9.519462585449219, "learning_rate": 2.8787878787878793e-06, "loss": 1.4795, "step": 1287 }, { "epoch": 2.9272727272727272, "grad_norm": 13.316557884216309, "learning_rate": 2.803030303030303e-06, "loss": 0.858, "step": 1288 }, { "epoch": 2.9295454545454547, "grad_norm": 11.898123741149902, "learning_rate": 2.7272727272727272e-06, "loss": 1.7807, "step": 1289 }, { "epoch": 2.9318181818181817, "grad_norm": 13.429510116577148, "learning_rate": 2.651515151515152e-06, "loss": 1.7467, "step": 1290 }, { "epoch": 2.934090909090909, "grad_norm": 34.4333381652832, "learning_rate": 2.575757575757576e-06, "loss": 1.6774, "step": 1291 }, { "epoch": 2.9363636363636365, "grad_norm": 8.44999885559082, "learning_rate": 2.5e-06, "loss": 0.8595, "step": 1292 }, { "epoch": 2.9386363636363635, "grad_norm": 9.824548721313477, "learning_rate": 2.4242424242424244e-06, "loss": 1.551, "step": 1293 }, { "epoch": 2.940909090909091, "grad_norm": 10.713866233825684, "learning_rate": 2.3484848484848486e-06, "loss": 1.4604, "step": 1294 }, { "epoch": 2.9431818181818183, "grad_norm": 18.695775985717773, "learning_rate": 2.2727272727272728e-06, "loss": 2.8512, "step": 1295 }, { "epoch": 2.9454545454545453, "grad_norm": 9.289727210998535, "learning_rate": 2.196969696969697e-06, 
"loss": 1.3539, "step": 1296 }, { "epoch": 2.9477272727272728, "grad_norm": 7.917882442474365, "learning_rate": 2.1212121212121216e-06, "loss": 1.2179, "step": 1297 }, { "epoch": 2.95, "grad_norm": 16.269927978515625, "learning_rate": 2.0454545454545457e-06, "loss": 1.8904, "step": 1298 }, { "epoch": 2.952272727272727, "grad_norm": 11.293408393859863, "learning_rate": 1.9696969696969695e-06, "loss": 1.4438, "step": 1299 }, { "epoch": 2.9545454545454546, "grad_norm": 14.2405424118042, "learning_rate": 1.8939393939393941e-06, "loss": 2.2578, "step": 1300 }, { "epoch": 2.956818181818182, "grad_norm": 9.712430953979492, "learning_rate": 1.818181818181818e-06, "loss": 1.1685, "step": 1301 }, { "epoch": 2.959090909090909, "grad_norm": 14.34041690826416, "learning_rate": 1.7424242424242427e-06, "loss": 1.9741, "step": 1302 }, { "epoch": 2.9613636363636364, "grad_norm": 12.20971965789795, "learning_rate": 1.6666666666666667e-06, "loss": 2.283, "step": 1303 }, { "epoch": 2.963636363636364, "grad_norm": 13.051138877868652, "learning_rate": 1.5909090909090908e-06, "loss": 2.3128, "step": 1304 }, { "epoch": 2.965909090909091, "grad_norm": 11.069129943847656, "learning_rate": 1.5151515151515152e-06, "loss": 1.4379, "step": 1305 }, { "epoch": 2.9681818181818183, "grad_norm": 10.655563354492188, "learning_rate": 1.4393939393939396e-06, "loss": 1.4726, "step": 1306 }, { "epoch": 2.9704545454545457, "grad_norm": 9.674460411071777, "learning_rate": 1.3636363636363636e-06, "loss": 1.2689, "step": 1307 }, { "epoch": 2.9727272727272727, "grad_norm": 10.24626636505127, "learning_rate": 1.287878787878788e-06, "loss": 1.2585, "step": 1308 }, { "epoch": 2.975, "grad_norm": 13.117413520812988, "learning_rate": 1.2121212121212122e-06, "loss": 1.8019, "step": 1309 }, { "epoch": 2.9772727272727275, "grad_norm": 11.649164199829102, "learning_rate": 1.1363636363636364e-06, "loss": 1.375, "step": 1310 }, { "epoch": 2.9795454545454545, "grad_norm": 11.054950714111328, "learning_rate": 
1.0606060606060608e-06, "loss": 1.7139, "step": 1311 }, { "epoch": 2.981818181818182, "grad_norm": 9.476350784301758, "learning_rate": 9.848484848484847e-07, "loss": 1.1851, "step": 1312 }, { "epoch": 2.9840909090909093, "grad_norm": 9.467584609985352, "learning_rate": 9.09090909090909e-07, "loss": 1.0272, "step": 1313 }, { "epoch": 2.9863636363636363, "grad_norm": 11.783283233642578, "learning_rate": 8.333333333333333e-07, "loss": 1.886, "step": 1314 }, { "epoch": 2.9886363636363638, "grad_norm": 11.245438575744629, "learning_rate": 7.575757575757576e-07, "loss": 1.2872, "step": 1315 }, { "epoch": 2.990909090909091, "grad_norm": 12.71106243133545, "learning_rate": 6.818181818181818e-07, "loss": 1.3681, "step": 1316 }, { "epoch": 2.993181818181818, "grad_norm": 11.738058090209961, "learning_rate": 6.060606060606061e-07, "loss": 1.9274, "step": 1317 }, { "epoch": 2.9954545454545456, "grad_norm": 12.179485321044922, "learning_rate": 5.303030303030304e-07, "loss": 1.6056, "step": 1318 }, { "epoch": 2.9977272727272726, "grad_norm": 9.123523712158203, "learning_rate": 4.545454545454545e-07, "loss": 1.2402, "step": 1319 }, { "epoch": 3.0, "grad_norm": 17.10702133178711, "learning_rate": 3.787878787878788e-07, "loss": 1.7438, "step": 1320 }, { "epoch": 3.0, "eval_f1": 0.8924, "eval_gen_len": 41.8818, "eval_loss": 1.7954092025756836, "eval_precision": 0.8906, "eval_recall": 0.8943, "eval_rouge1": 0.4651, "eval_rouge2": 0.218, "eval_rougeL": 0.3904, "eval_rougeLsum": 0.4291, "eval_runtime": 28.6293, "eval_samples_per_second": 3.842, "eval_steps_per_second": 0.489, "step": 1320 }, { "epoch": 3.0, "step": 1320, "total_flos": 2659801069854720.0, "train_loss": 1.8849294849868976, "train_runtime": 574.0732, "train_samples_per_second": 4.593, "train_steps_per_second": 2.299 } ], "logging_steps": 1, "max_steps": 1320, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, 
"should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2659801069854720.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }