diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,9330 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 3.0, + "eval_steps": 500, + "global_step": 1320, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0022727272727272726, + "grad_norm": NaN, + "learning_rate": 0.0001, + "loss": 3.1855, + "step": 1 + }, + { + "epoch": 0.004545454545454545, + "grad_norm": Infinity, + "learning_rate": 0.0001, + "loss": 4.3007, + "step": 2 + }, + { + "epoch": 0.006818181818181818, + "grad_norm": Infinity, + "learning_rate": 0.0001, + "loss": 4.3188, + "step": 3 + }, + { + "epoch": 0.00909090909090909, + "grad_norm": 26.366512298583984, + "learning_rate": 9.992424242424244e-05, + "loss": 3.6582, + "step": 4 + }, + { + "epoch": 0.011363636363636364, + "grad_norm": 29.344751358032227, + "learning_rate": 9.984848484848486e-05, + "loss": 4.2964, + "step": 5 + }, + { + "epoch": 0.013636363636363636, + "grad_norm": 29.519277572631836, + "learning_rate": 9.977272727272728e-05, + "loss": 4.0004, + "step": 6 + }, + { + "epoch": 0.015909090909090907, + "grad_norm": 24.204898834228516, + "learning_rate": 9.96969696969697e-05, + "loss": 3.2453, + "step": 7 + }, + { + "epoch": 0.01818181818181818, + "grad_norm": 23.69887351989746, + "learning_rate": 9.962121212121213e-05, + "loss": 2.7972, + "step": 8 + }, + { + "epoch": 0.020454545454545454, + "grad_norm": 52.371498107910156, + "learning_rate": 9.954545454545455e-05, + "loss": 2.5971, + "step": 9 + }, + { + "epoch": 0.022727272727272728, + "grad_norm": 41.59567642211914, + "learning_rate": 9.946969696969698e-05, + "loss": 3.3081, + "step": 10 + }, + { + "epoch": 0.025, + "grad_norm": 27.913963317871094, + "learning_rate": 9.939393939393939e-05, + "loss": 3.5977, + "step": 11 + }, + { + "epoch": 0.02727272727272727, + "grad_norm": 21.261117935180664, + "learning_rate": 9.931818181818182e-05, + "loss": 3.3403, + "step": 12 + }, + { + "epoch": 0.029545454545454545, + "grad_norm": 20.344589233398438, + "learning_rate": 9.924242424242425e-05, + "loss": 2.0478, + "step": 13 + }, + { + "epoch": 0.031818181818181815, + "grad_norm": 32.50373077392578, + "learning_rate": 9.916666666666667e-05, + "loss": 3.0773, + "step": 14 + }, + { + "epoch": 0.03409090909090909, + "grad_norm": 21.426048278808594, + "learning_rate": 9.909090909090911e-05, + "loss": 2.8572, + "step": 15 + }, + { + "epoch": 0.03636363636363636, + "grad_norm": 27.847314834594727, + "learning_rate": 9.901515151515151e-05, + "loss": 3.129, + "step": 16 + }, + { + "epoch": 0.038636363636363635, + "grad_norm": 23.516616821289062, + "learning_rate": 9.893939393939395e-05, + "loss": 3.3971, + "step": 17 + }, + { + "epoch": 0.04090909090909091, + "grad_norm": 29.170352935791016, + "learning_rate": 9.886363636363637e-05, + "loss": 3.6325, + "step": 18 + }, + { + "epoch": 0.04318181818181818, + "grad_norm": 21.103153228759766, + "learning_rate": 9.87878787878788e-05, + "loss": 2.7935, + "step": 19 + }, + { + "epoch": 0.045454545454545456, + "grad_norm": 25.863285064697266, + "learning_rate": 9.871212121212122e-05, + "loss": 2.0675, + "step": 20 + }, + { + "epoch": 0.04772727272727273, + "grad_norm": 25.554828643798828, + "learning_rate": 9.863636363636364e-05, + "loss": 2.8331, + "step": 21 + }, + { + "epoch": 0.05, + "grad_norm": 26.424827575683594, + "learning_rate": 9.856060606060607e-05, + "loss": 4.0934, + "step": 22 + }, + { + "epoch": 0.05227272727272727, + "grad_norm": 40.84152603149414, + "learning_rate": 9.848484848484849e-05, + "loss": 2.7315, + "step": 23 + }, + { + "epoch": 0.05454545454545454, + "grad_norm": 17.789630889892578, + "learning_rate": 9.840909090909092e-05, + "loss": 2.5798, + "step": 24 + }, + { + "epoch": 0.056818181818181816, + "grad_norm": 15.23817253112793, + "learning_rate": 9.833333333333333e-05, + "loss": 2.3981, + "step": 25 + }, + { + "epoch": 0.05909090909090909, + "grad_norm": 17.333356857299805, + "learning_rate": 9.825757575757576e-05, + "loss": 2.0097, + "step": 26 + }, + { + "epoch": 0.06136363636363636, + "grad_norm": 17.358461380004883, + "learning_rate": 9.818181818181818e-05, + "loss": 1.5636, + "step": 27 + }, + { + "epoch": 0.06363636363636363, + "grad_norm": 15.479598999023438, + "learning_rate": 9.810606060606061e-05, + "loss": 2.3064, + "step": 28 + }, + { + "epoch": 0.0659090909090909, + "grad_norm": 18.889394760131836, + "learning_rate": 9.803030303030303e-05, + "loss": 1.6592, + "step": 29 + }, + { + "epoch": 0.06818181818181818, + "grad_norm": 19.264772415161133, + "learning_rate": 9.795454545454545e-05, + "loss": 2.9327, + "step": 30 + }, + { + "epoch": 0.07045454545454545, + "grad_norm": 19.369556427001953, + "learning_rate": 9.787878787878789e-05, + "loss": 3.2685, + "step": 31 + }, + { + "epoch": 0.07272727272727272, + "grad_norm": 20.017459869384766, + "learning_rate": 9.78030303030303e-05, + "loss": 3.4532, + "step": 32 + }, + { + "epoch": 0.075, + "grad_norm": 18.956012725830078, + "learning_rate": 9.772727272727274e-05, + "loss": 2.2143, + "step": 33 + }, + { + "epoch": 0.07727272727272727, + "grad_norm": 15.438785552978516, + "learning_rate": 9.765151515151516e-05, + "loss": 2.407, + "step": 34 + }, + { + "epoch": 0.07954545454545454, + "grad_norm": 22.79155921936035, + "learning_rate": 9.757575757575758e-05, + "loss": 3.1064, + "step": 35 + }, + { + "epoch": 0.08181818181818182, + "grad_norm": 15.908382415771484, + "learning_rate": 9.75e-05, + "loss": 2.9192, + "step": 36 + }, + { + "epoch": 0.08409090909090909, + "grad_norm": 21.536775588989258, + "learning_rate": 9.742424242424243e-05, + "loss": 2.8127, + "step": 37 + }, + { + "epoch": 0.08636363636363636, + "grad_norm": 19.644390106201172, + "learning_rate": 9.734848484848485e-05, + "loss": 1.704, + "step": 38 + }, + { + "epoch": 0.08863636363636364, + "grad_norm": 20.067602157592773, + "learning_rate": 9.727272727272728e-05, + "loss": 2.3733, + "step": 39 + }, + { + "epoch": 0.09090909090909091, + "grad_norm": 16.551055908203125, + "learning_rate": 9.71969696969697e-05, + "loss": 2.2413, + "step": 40 + }, + { + "epoch": 0.09318181818181819, + "grad_norm": 18.292987823486328, + "learning_rate": 9.712121212121212e-05, + "loss": 2.6103, + "step": 41 + }, + { + "epoch": 0.09545454545454546, + "grad_norm": 15.751124382019043, + "learning_rate": 9.704545454545456e-05, + "loss": 1.5648, + "step": 42 + }, + { + "epoch": 0.09772727272727273, + "grad_norm": 25.068395614624023, + "learning_rate": 9.696969696969698e-05, + "loss": 1.8226, + "step": 43 + }, + { + "epoch": 0.1, + "grad_norm": 25.069040298461914, + "learning_rate": 9.689393939393941e-05, + "loss": 3.8825, + "step": 44 + }, + { + "epoch": 0.10227272727272728, + "grad_norm": 20.751232147216797, + "learning_rate": 9.681818181818181e-05, + "loss": 2.9331, + "step": 45 + }, + { + "epoch": 0.10454545454545454, + "grad_norm": 23.918386459350586, + "learning_rate": 9.674242424242425e-05, + "loss": 3.0365, + "step": 46 + }, + { + "epoch": 0.10681818181818181, + "grad_norm": 16.94843864440918, + "learning_rate": 9.666666666666667e-05, + "loss": 2.0012, + "step": 47 + }, + { + "epoch": 0.10909090909090909, + "grad_norm": 38.2060432434082, + "learning_rate": 9.65909090909091e-05, + "loss": 2.4087, + "step": 48 + }, + { + "epoch": 0.11136363636363636, + "grad_norm": 15.836068153381348, + "learning_rate": 9.651515151515152e-05, + "loss": 2.7204, + "step": 49 + }, + { + "epoch": 0.11363636363636363, + "grad_norm": 20.13130760192871, + "learning_rate": 9.643939393939394e-05, + "loss": 1.8803, + "step": 50 + }, + { + "epoch": 0.1159090909090909, + "grad_norm": 21.58964729309082, + "learning_rate": 9.636363636363637e-05, + "loss": 2.2448, + "step": 51 + }, + { + "epoch": 0.11818181818181818, + "grad_norm": 15.996927261352539, + "learning_rate": 9.628787878787879e-05, + "loss": 2.456, + "step": 52 + }, + { + "epoch": 0.12045454545454545, + "grad_norm": 15.738017082214355, + "learning_rate": 9.621212121212123e-05, + "loss": 2.0494, + "step": 53 + }, + { + "epoch": 0.12272727272727273, + "grad_norm": 20.54029655456543, + "learning_rate": 9.613636363636363e-05, + "loss": 2.8584, + "step": 54 + }, + { + "epoch": 0.125, + "grad_norm": 20.11783790588379, + "learning_rate": 9.606060606060606e-05, + "loss": 2.9836, + "step": 55 + }, + { + "epoch": 0.12727272727272726, + "grad_norm": 15.297281265258789, + "learning_rate": 9.598484848484848e-05, + "loss": 1.8828, + "step": 56 + }, + { + "epoch": 0.12954545454545455, + "grad_norm": 15.26744270324707, + "learning_rate": 9.590909090909092e-05, + "loss": 1.2548, + "step": 57 + }, + { + "epoch": 0.1318181818181818, + "grad_norm": 18.839954376220703, + "learning_rate": 9.583333333333334e-05, + "loss": 3.7553, + "step": 58 + }, + { + "epoch": 0.1340909090909091, + "grad_norm": 17.30214500427246, + "learning_rate": 9.575757575757576e-05, + "loss": 2.2297, + "step": 59 + }, + { + "epoch": 0.13636363636363635, + "grad_norm": 25.153942108154297, + "learning_rate": 9.568181818181819e-05, + "loss": 2.6817, + "step": 60 + }, + { + "epoch": 0.13863636363636364, + "grad_norm": 17.55406379699707, + "learning_rate": 9.560606060606061e-05, + "loss": 2.8551, + "step": 61 + }, + { + "epoch": 0.1409090909090909, + "grad_norm": NaN, + "learning_rate": 9.560606060606061e-05, + "loss": 2.4352, + "step": 62 + }, + { + "epoch": 0.1431818181818182, + "grad_norm": 18.4881649017334, + "learning_rate": 9.553030303030304e-05, + "loss": 2.1839, + "step": 63 + }, + { + "epoch": 0.14545454545454545, + "grad_norm": 15.114643096923828, + "learning_rate": 9.545454545454546e-05, + "loss": 1.768, + "step": 64 + }, + { + "epoch": 0.14772727272727273, + "grad_norm": 17.272735595703125, + "learning_rate": 9.537878787878788e-05, + "loss": 2.4241, + "step": 65 + }, + { + "epoch": 0.15, + "grad_norm": 18.25682258605957, + "learning_rate": 9.53030303030303e-05, + "loss": 1.8703, + "step": 66 + }, + { + "epoch": 0.15227272727272728, + "grad_norm": 20.255084991455078, + "learning_rate": 9.522727272727273e-05, + "loss": 2.3706, + "step": 67 + }, + { + "epoch": 0.15454545454545454, + "grad_norm": 16.153093338012695, + "learning_rate": 9.515151515151515e-05, + "loss": 2.3896, + "step": 68 + }, + { + "epoch": 0.15681818181818183, + "grad_norm": 14.229001998901367, + "learning_rate": 9.507575757575759e-05, + "loss": 2.5261, + "step": 69 + }, + { + "epoch": 0.1590909090909091, + "grad_norm": 14.036202430725098, + "learning_rate": 9.5e-05, + "loss": 1.8918, + "step": 70 + }, + { + "epoch": 0.16136363636363638, + "grad_norm": 16.262582778930664, + "learning_rate": 9.492424242424242e-05, + "loss": 2.7854, + "step": 71 + }, + { + "epoch": 0.16363636363636364, + "grad_norm": 17.119918823242188, + "learning_rate": 9.484848484848486e-05, + "loss": 2.1371, + "step": 72 + }, + { + "epoch": 0.16590909090909092, + "grad_norm": 19.72575569152832, + "learning_rate": 9.477272727272728e-05, + "loss": 2.6801, + "step": 73 + }, + { + "epoch": 0.16818181818181818, + "grad_norm": 17.036550521850586, + "learning_rate": 9.469696969696971e-05, + "loss": 2.6403, + "step": 74 + }, + { + "epoch": 0.17045454545454544, + "grad_norm": 14.31810188293457, + "learning_rate": 9.462121212121212e-05, + "loss": 1.9865, + "step": 75 + }, + { + "epoch": 0.17272727272727273, + "grad_norm": 18.39834213256836, + "learning_rate": 9.454545454545455e-05, + "loss": 2.418, + "step": 76 + }, + { + "epoch": 0.175, + "grad_norm": 18.37046241760254, + "learning_rate": 9.446969696969697e-05, + "loss": 2.2905, + "step": 77 + }, + { + "epoch": 0.17727272727272728, + "grad_norm": 14.999472618103027, + "learning_rate": 9.43939393939394e-05, + "loss": 2.2521, + "step": 78 + }, + { + "epoch": 0.17954545454545454, + "grad_norm": 11.88487434387207, + "learning_rate": 9.431818181818182e-05, + "loss": 2.2871, + "step": 79 + }, + { + "epoch": 0.18181818181818182, + "grad_norm": 21.745532989501953, + "learning_rate": 9.424242424242424e-05, + "loss": 2.6415, + "step": 80 + }, + { + "epoch": 0.18409090909090908, + "grad_norm": 13.109172821044922, + "learning_rate": 9.416666666666667e-05, + "loss": 1.9554, + "step": 81 + }, + { + "epoch": 0.18636363636363637, + "grad_norm": 17.222652435302734, + "learning_rate": 9.40909090909091e-05, + "loss": 2.0914, + "step": 82 + }, + { + "epoch": 0.18863636363636363, + "grad_norm": 17.833839416503906, + "learning_rate": 9.401515151515153e-05, + "loss": 1.6032, + "step": 83 + }, + { + "epoch": 0.19090909090909092, + "grad_norm": 22.737525939941406, + "learning_rate": 9.393939393939395e-05, + "loss": 3.5915, + "step": 84 + }, + { + "epoch": 0.19318181818181818, + "grad_norm": 14.926959037780762, + "learning_rate": 9.386363636363637e-05, + "loss": 2.2499, + "step": 85 + }, + { + "epoch": 0.19545454545454546, + "grad_norm": 13.586040496826172, + "learning_rate": 9.378787878787879e-05, + "loss": 1.8228, + "step": 86 + }, + { + "epoch": 0.19772727272727272, + "grad_norm": 19.175617218017578, + "learning_rate": 9.371212121212122e-05, + "loss": 2.7846, + "step": 87 + }, + { + "epoch": 0.2, + "grad_norm": 21.078235626220703, + "learning_rate": 9.363636363636364e-05, + "loss": 2.7906, + "step": 88 + }, + { + "epoch": 0.20227272727272727, + "grad_norm": 17.618940353393555, + "learning_rate": 9.356060606060606e-05, + "loss": 2.3022, + "step": 89 + }, + { + "epoch": 0.20454545454545456, + "grad_norm": 16.79983139038086, + "learning_rate": 9.348484848484849e-05, + "loss": 1.8126, + "step": 90 + }, + { + "epoch": 0.20681818181818182, + "grad_norm": 20.444580078125, + "learning_rate": 9.340909090909091e-05, + "loss": 2.055, + "step": 91 + }, + { + "epoch": 0.20909090909090908, + "grad_norm": 18.694856643676758, + "learning_rate": 9.333333333333334e-05, + "loss": 2.6534, + "step": 92 + }, + { + "epoch": 0.21136363636363636, + "grad_norm": 11.254834175109863, + "learning_rate": 9.325757575757576e-05, + "loss": 1.6695, + "step": 93 + }, + { + "epoch": 0.21363636363636362, + "grad_norm": 14.369203567504883, + "learning_rate": 9.318181818181818e-05, + "loss": 2.3469, + "step": 94 + }, + { + "epoch": 0.2159090909090909, + "grad_norm": 17.27039909362793, + "learning_rate": 9.31060606060606e-05, + "loss": 1.9188, + "step": 95 + }, + { + "epoch": 0.21818181818181817, + "grad_norm": 12.644415855407715, + "learning_rate": 9.303030303030303e-05, + "loss": 1.3295, + "step": 96 + }, + { + "epoch": 0.22045454545454546, + "grad_norm": 20.46677589416504, + "learning_rate": 9.295454545454545e-05, + "loss": 2.4697, + "step": 97 + }, + { + "epoch": 0.22272727272727272, + "grad_norm": 15.218058586120605, + "learning_rate": 9.287878787878789e-05, + "loss": 2.4472, + "step": 98 + }, + { + "epoch": 0.225, + "grad_norm": 14.982362747192383, + "learning_rate": 9.280303030303031e-05, + "loss": 1.881, + "step": 99 + }, + { + "epoch": 0.22727272727272727, + "grad_norm": 20.168306350708008, + "learning_rate": 9.272727272727273e-05, + "loss": 1.6077, + "step": 100 + }, + { + "epoch": 0.22954545454545455, + "grad_norm": 13.462889671325684, + "learning_rate": 9.265151515151516e-05, + "loss": 1.6057, + "step": 101 + }, + { + "epoch": 0.2318181818181818, + "grad_norm": 12.3695068359375, + "learning_rate": 9.257575757575758e-05, + "loss": 2.0871, + "step": 102 + }, + { + "epoch": 0.2340909090909091, + "grad_norm": 15.381841659545898, + "learning_rate": 9.250000000000001e-05, + "loss": 2.0592, + "step": 103 + }, + { + "epoch": 0.23636363636363636, + "grad_norm": 18.213014602661133, + "learning_rate": 9.242424242424242e-05, + "loss": 2.2397, + "step": 104 + }, + { + "epoch": 0.23863636363636365, + "grad_norm": 19.589962005615234, + "learning_rate": 9.234848484848485e-05, + "loss": 2.8305, + "step": 105 + }, + { + "epoch": 0.2409090909090909, + "grad_norm": 21.765127182006836, + "learning_rate": 9.227272727272727e-05, + "loss": 1.8691, + "step": 106 + }, + { + "epoch": 0.2431818181818182, + "grad_norm": 21.66250228881836, + "learning_rate": 9.21969696969697e-05, + "loss": 2.7176, + "step": 107 + }, + { + "epoch": 0.24545454545454545, + "grad_norm": 16.438037872314453, + "learning_rate": 9.212121212121214e-05, + "loss": 3.0262, + "step": 108 + }, + { + "epoch": 0.24772727272727274, + "grad_norm": 18.32391357421875, + "learning_rate": 9.204545454545454e-05, + "loss": 2.4011, + "step": 109 + }, + { + "epoch": 0.25, + "grad_norm": 18.3424015045166, + "learning_rate": 9.196969696969698e-05, + "loss": 3.3481, + "step": 110 + }, + { + "epoch": 0.25227272727272726, + "grad_norm": 12.168206214904785, + "learning_rate": 9.18939393939394e-05, + "loss": 1.5084, + "step": 111 + }, + { + "epoch": 0.2545454545454545, + "grad_norm": 16.183521270751953, + "learning_rate": 9.181818181818183e-05, + "loss": 3.3444, + "step": 112 + }, + { + "epoch": 0.25681818181818183, + "grad_norm": 17.887187957763672, + "learning_rate": 9.174242424242425e-05, + "loss": 2.4529, + "step": 113 + }, + { + "epoch": 0.2590909090909091, + "grad_norm": 18.000579833984375, + "learning_rate": 9.166666666666667e-05, + "loss": 2.3228, + "step": 114 + }, + { + "epoch": 0.26136363636363635, + "grad_norm": 15.579062461853027, + "learning_rate": 9.159090909090909e-05, + "loss": 3.2008, + "step": 115 + }, + { + "epoch": 0.2636363636363636, + "grad_norm": 14.111518859863281, + "learning_rate": 9.151515151515152e-05, + "loss": 2.2286, + "step": 116 + }, + { + "epoch": 0.26590909090909093, + "grad_norm": 13.755249977111816, + "learning_rate": 9.143939393939395e-05, + "loss": 1.9561, + "step": 117 + }, + { + "epoch": 0.2681818181818182, + "grad_norm": 14.665258407592773, + "learning_rate": 9.136363636363637e-05, + "loss": 2.5016, + "step": 118 + }, + { + "epoch": 0.27045454545454545, + "grad_norm": 14.470067024230957, + "learning_rate": 9.128787878787879e-05, + "loss": 2.3301, + "step": 119 + }, + { + "epoch": 0.2727272727272727, + "grad_norm": 15.108169555664062, + "learning_rate": 9.121212121212121e-05, + "loss": 2.6079, + "step": 120 + }, + { + "epoch": 0.275, + "grad_norm": 15.080549240112305, + "learning_rate": 9.113636363636365e-05, + "loss": 2.6349, + "step": 121 + }, + { + "epoch": 0.2772727272727273, + "grad_norm": 17.71773910522461, + "learning_rate": 9.106060606060606e-05, + "loss": 1.9447, + "step": 122 + }, + { + "epoch": 0.27954545454545454, + "grad_norm": 11.128664016723633, + "learning_rate": 9.098484848484848e-05, + "loss": 2.2076, + "step": 123 + }, + { + "epoch": 0.2818181818181818, + "grad_norm": 19.131866455078125, + "learning_rate": 9.090909090909092e-05, + "loss": 1.5932, + "step": 124 + }, + { + "epoch": 0.2840909090909091, + "grad_norm": 11.3361177444458, + "learning_rate": 9.083333333333334e-05, + "loss": 2.5923, + "step": 125 + }, + { + "epoch": 0.2863636363636364, + "grad_norm": 16.97115707397461, + "learning_rate": 9.075757575757577e-05, + "loss": 1.828, + "step": 126 + }, + { + "epoch": 0.28863636363636364, + "grad_norm": 11.52206802368164, + "learning_rate": 9.068181818181819e-05, + "loss": 2.3389, + "step": 127 + }, + { + "epoch": 0.2909090909090909, + "grad_norm": 18.27076530456543, + "learning_rate": 9.060606060606061e-05, + "loss": 3.1892, + "step": 128 + }, + { + "epoch": 0.29318181818181815, + "grad_norm": 15.098003387451172, + "learning_rate": 9.053030303030303e-05, + "loss": 2.3429, + "step": 129 + }, + { + "epoch": 0.29545454545454547, + "grad_norm": 13.432772636413574, + "learning_rate": 9.045454545454546e-05, + "loss": 1.7032, + "step": 130 + }, + { + "epoch": 0.29772727272727273, + "grad_norm": 21.96811866760254, + "learning_rate": 9.037878787878788e-05, + "loss": 3.3135, + "step": 131 + }, + { + "epoch": 0.3, + "grad_norm": 17.522789001464844, + "learning_rate": 9.030303030303031e-05, + "loss": 2.0827, + "step": 132 + }, + { + "epoch": 0.30227272727272725, + "grad_norm": 16.18021011352539, + "learning_rate": 9.022727272727273e-05, + "loss": 2.6956, + "step": 133 + }, + { + "epoch": 0.30454545454545456, + "grad_norm": 17.834138870239258, + "learning_rate": 9.015151515151515e-05, + "loss": 2.3929, + "step": 134 + }, + { + "epoch": 0.3068181818181818, + "grad_norm": 18.146596908569336, + "learning_rate": 9.007575757575759e-05, + "loss": 3.0074, + "step": 135 + }, + { + "epoch": 0.3090909090909091, + "grad_norm": 11.941591262817383, + "learning_rate": 9e-05, + "loss": 1.6793, + "step": 136 + }, + { + "epoch": 0.31136363636363634, + "grad_norm": 15.524669647216797, + "learning_rate": 8.992424242424244e-05, + "loss": 2.3193, + "step": 137 + }, + { + "epoch": 0.31363636363636366, + "grad_norm": 17.986879348754883, + "learning_rate": 8.984848484848484e-05, + "loss": 3.1335, + "step": 138 + }, + { + "epoch": 0.3159090909090909, + "grad_norm": 19.568361282348633, + "learning_rate": 8.977272727272728e-05, + "loss": 2.6232, + "step": 139 + }, + { + "epoch": 0.3181818181818182, + "grad_norm": 15.213788986206055, + "learning_rate": 8.96969696969697e-05, + "loss": 1.6936, + "step": 140 + }, + { + "epoch": 0.32045454545454544, + "grad_norm": 16.093795776367188, + "learning_rate": 8.962121212121213e-05, + "loss": 2.38, + "step": 141 + }, + { + "epoch": 0.32272727272727275, + "grad_norm": 17.010087966918945, + "learning_rate": 8.954545454545455e-05, + "loss": 2.0467, + "step": 142 + }, + { + "epoch": 0.325, + "grad_norm": 20.31732749938965, + "learning_rate": 8.946969696969697e-05, + "loss": 2.062, + "step": 143 + }, + { + "epoch": 0.32727272727272727, + "grad_norm": 15.800658226013184, + "learning_rate": 8.93939393939394e-05, + "loss": 1.4575, + "step": 144 + }, + { + "epoch": 0.32954545454545453, + "grad_norm": 15.116626739501953, + "learning_rate": 8.931818181818182e-05, + "loss": 2.314, + "step": 145 + }, + { + "epoch": 0.33181818181818185, + "grad_norm": 25.464197158813477, + "learning_rate": 8.924242424242426e-05, + "loss": 2.0073, + "step": 146 + }, + { + "epoch": 0.3340909090909091, + "grad_norm": 13.291275978088379, + "learning_rate": 8.916666666666667e-05, + "loss": 2.151, + "step": 147 + }, + { + "epoch": 0.33636363636363636, + "grad_norm": 13.530828475952148, + "learning_rate": 8.90909090909091e-05, + "loss": 2.3051, + "step": 148 + }, + { + "epoch": 0.3386363636363636, + "grad_norm": 15.941877365112305, + "learning_rate": 8.901515151515151e-05, + "loss": 2.6671, + "step": 149 + }, + { + "epoch": 0.3409090909090909, + "grad_norm": 16.19255828857422, + "learning_rate": 8.893939393939395e-05, + "loss": 2.4137, + "step": 150 + }, + { + "epoch": 0.3431818181818182, + "grad_norm": 25.39113998413086, + "learning_rate": 8.886363636363637e-05, + "loss": 3.1836, + "step": 151 + }, + { + "epoch": 0.34545454545454546, + "grad_norm": 14.128908157348633, + "learning_rate": 8.87878787878788e-05, + "loss": 2.4864, + "step": 152 + }, + { + "epoch": 0.3477272727272727, + "grad_norm": 14.206392288208008, + "learning_rate": 8.871212121212122e-05, + "loss": 1.3842, + "step": 153 + }, + { + "epoch": 0.35, + "grad_norm": 11.746234893798828, + "learning_rate": 8.863636363636364e-05, + "loss": 1.69, + "step": 154 + }, + { + "epoch": 0.3522727272727273, + "grad_norm": 14.249229431152344, + "learning_rate": 8.856060606060607e-05, + "loss": 2.962, + "step": 155 + }, + { + "epoch": 0.35454545454545455, + "grad_norm": 13.884110450744629, + "learning_rate": 8.848484848484849e-05, + "loss": 1.9429, + "step": 156 + }, + { + "epoch": 0.3568181818181818, + "grad_norm": 15.577651023864746, + "learning_rate": 8.840909090909091e-05, + "loss": 2.0814, + "step": 157 + }, + { + "epoch": 0.35909090909090907, + "grad_norm": 13.055503845214844, + "learning_rate": 8.833333333333333e-05, + "loss": 2.286, + "step": 158 + }, + { + "epoch": 0.3613636363636364, + "grad_norm": 14.148711204528809, + "learning_rate": 8.825757575757576e-05, + "loss": 1.7243, + "step": 159 + }, + { + "epoch": 0.36363636363636365, + "grad_norm": 18.32880210876465, + "learning_rate": 8.818181818181818e-05, + "loss": 2.0912, + "step": 160 + }, + { + "epoch": 0.3659090909090909, + "grad_norm": 19.306982040405273, + "learning_rate": 8.810606060606062e-05, + "loss": 2.1032, + "step": 161 + }, + { + "epoch": 0.36818181818181817, + "grad_norm": 18.99219512939453, + "learning_rate": 8.803030303030304e-05, + "loss": 2.3527, + "step": 162 + }, + { + "epoch": 0.3704545454545455, + "grad_norm": 14.297601699829102, + "learning_rate": 8.795454545454545e-05, + "loss": 2.8786, + "step": 163 + }, + { + "epoch": 0.37272727272727274, + "grad_norm": 19.273303985595703, + "learning_rate": 8.787878787878789e-05, + "loss": 2.4364, + "step": 164 + }, + { + "epoch": 0.375, + "grad_norm": 11.870357513427734, + "learning_rate": 8.780303030303031e-05, + "loss": 2.1716, + "step": 165 + }, + { + "epoch": 0.37727272727272726, + "grad_norm": 11.26362133026123, + "learning_rate": 8.772727272727274e-05, + "loss": 3.1212, + "step": 166 + }, + { + "epoch": 0.3795454545454545, + "grad_norm": 12.994135856628418, + "learning_rate": 8.765151515151515e-05, + "loss": 2.4722, + "step": 167 + }, + { + "epoch": 0.38181818181818183, + "grad_norm": 13.474489212036133, + "learning_rate": 8.757575757575758e-05, + "loss": 2.9132, + "step": 168 + }, + { + "epoch": 0.3840909090909091, + "grad_norm": 16.456457138061523, + "learning_rate": 8.75e-05, + "loss": 2.1006, + "step": 169 + }, + { + "epoch": 0.38636363636363635, + "grad_norm": 16.236146926879883, + "learning_rate": 8.742424242424243e-05, + "loss": 2.1458, + "step": 170 + }, + { + "epoch": 0.3886363636363636, + "grad_norm": 13.122529983520508, + "learning_rate": 8.734848484848485e-05, + "loss": 2.7045, + "step": 171 + }, + { + "epoch": 0.39090909090909093, + "grad_norm": 12.385522842407227, + "learning_rate": 8.727272727272727e-05, + "loss": 2.2677, + "step": 172 + }, + { + "epoch": 0.3931818181818182, + "grad_norm": 14.4050931930542, + "learning_rate": 8.71969696969697e-05, + "loss": 1.3401, + "step": 173 + }, + { + "epoch": 0.39545454545454545, + "grad_norm": 21.25592803955078, + "learning_rate": 8.712121212121212e-05, + "loss": 1.8591, + "step": 174 + }, + { + "epoch": 0.3977272727272727, + "grad_norm": 13.744414329528809, + "learning_rate": 8.704545454545456e-05, + "loss": 1.8915, + "step": 175 + }, + { + "epoch": 0.4, + "grad_norm": 14.040199279785156, + "learning_rate": 8.696969696969698e-05, + "loss": 2.1142, + "step": 176 + }, + { + "epoch": 0.4022727272727273, + "grad_norm": 13.779399871826172, + "learning_rate": 8.68939393939394e-05, + "loss": 1.6946, + "step": 177 + }, + { + "epoch": 0.40454545454545454, + "grad_norm": 12.878482818603516, + "learning_rate": 8.681818181818182e-05, + "loss": 2.0229, + "step": 178 + }, + { + "epoch": 0.4068181818181818, + "grad_norm": 10.951014518737793, + "learning_rate": 8.674242424242425e-05, + "loss": 2.2302, + "step": 179 + }, + { + "epoch": 0.4090909090909091, + "grad_norm": 15.133676528930664, + "learning_rate": 8.666666666666667e-05, + "loss": 1.7796, + "step": 180 + }, + { + "epoch": 0.4113636363636364, + "grad_norm": 11.56503677368164, + "learning_rate": 8.65909090909091e-05, + "loss": 2.0587, + "step": 181 + }, + { + "epoch": 0.41363636363636364, + "grad_norm": 12.170353889465332, + "learning_rate": 8.651515151515152e-05, + "loss": 1.9297, + "step": 182 + }, + { + "epoch": 0.4159090909090909, + "grad_norm": 14.984827995300293, + "learning_rate": 8.643939393939394e-05, + "loss": 1.3361, + "step": 183 + }, + { + "epoch": 0.41818181818181815, + "grad_norm": 12.686882972717285, + "learning_rate": 8.636363636363637e-05, + "loss": 2.3203, + "step": 184 + }, + { + "epoch": 0.42045454545454547, + "grad_norm": 19.53303337097168, + "learning_rate": 8.628787878787879e-05, + "loss": 2.1686, + "step": 185 + }, + { + "epoch": 0.42272727272727273, + "grad_norm": 13.246541976928711, + "learning_rate": 8.621212121212121e-05, + "loss": 2.154, + "step": 186 + }, + { + "epoch": 0.425, + "grad_norm": 18.38794708251953, + "learning_rate": 8.613636363636363e-05, + "loss": 2.3975, + "step": 187 + }, + { + "epoch": 0.42727272727272725, + "grad_norm": 19.281801223754883, + "learning_rate": 8.606060606060606e-05, + "loss": 3.1559, + "step": 188 + }, + { + "epoch": 0.42954545454545456, + "grad_norm": 16.43345069885254, + "learning_rate": 8.598484848484848e-05, + "loss": 2.4324, + "step": 189 + }, + { + "epoch": 0.4318181818181818, + "grad_norm": 22.686885833740234, + "learning_rate": 8.590909090909092e-05, + "loss": 2.4541, + "step": 190 + }, + { + "epoch": 0.4340909090909091, + "grad_norm": 16.799205780029297, + "learning_rate": 8.583333333333334e-05, + "loss": 1.9834, + "step": 191 + }, + { + "epoch": 0.43636363636363634, + "grad_norm": 12.861906051635742, + "learning_rate": 8.575757575757576e-05, + "loss": 1.4132, + "step": 192 + }, + { + "epoch": 0.43863636363636366, + "grad_norm": 14.350102424621582, + "learning_rate": 8.568181818181819e-05, + "loss": 2.5181, + "step": 193 + }, + { + "epoch": 0.4409090909090909, + "grad_norm": 9.91285228729248, + "learning_rate": 8.560606060606061e-05, + "loss": 1.1131, + "step": 194 + }, + { + "epoch": 0.4431818181818182, + "grad_norm": 12.768558502197266, + "learning_rate": 8.553030303030304e-05, + "loss": 1.6889, + "step": 195 + }, + { + "epoch": 0.44545454545454544, + "grad_norm": 11.671558380126953, + "learning_rate": 8.545454545454545e-05, + "loss": 2.4559, + "step": 196 + }, + { + "epoch": 0.44772727272727275, + "grad_norm": 12.10418701171875, + "learning_rate": 8.537878787878788e-05, + "loss": 2.2951, + "step": 197 + }, + { + "epoch": 0.45, + "grad_norm": 12.047237396240234, + "learning_rate": 8.53030303030303e-05, + "loss": 1.7895, + "step": 198 + }, + { + "epoch": 0.45227272727272727, + "grad_norm": 13.83714485168457, + "learning_rate": 8.522727272727273e-05, + "loss": 2.1267, + "step": 199 + }, + { + "epoch": 0.45454545454545453, + "grad_norm": 17.289377212524414, + "learning_rate": 8.515151515151515e-05, + "loss": 3.4595, + "step": 200 + }, + { + "epoch": 0.45681818181818185, + "grad_norm": 16.056198120117188, + "learning_rate": 8.507575757575757e-05, + "loss": 2.2333, + "step": 201 + }, + { + "epoch": 0.4590909090909091, + "grad_norm": 12.874887466430664, + "learning_rate": 8.5e-05, + "loss": 2.3555, + "step": 202 + }, + { + "epoch": 0.46136363636363636, + "grad_norm": 11.859071731567383, + "learning_rate": 8.492424242424243e-05, + "loss": 2.0893, + "step": 203 + }, + { + "epoch": 0.4636363636363636, + "grad_norm": 11.99448013305664, + "learning_rate": 8.484848484848486e-05, + "loss": 2.4165, + "step": 204 + }, + { + "epoch": 0.4659090909090909, + "grad_norm": 14.352676391601562, + "learning_rate": 8.477272727272728e-05, + "loss": 2.58, + "step": 205 + }, + { + "epoch": 0.4681818181818182, + "grad_norm": 10.942952156066895, + "learning_rate": 8.46969696969697e-05, + "loss": 2.1313, + "step": 206 + }, + { + "epoch": 0.47045454545454546, + "grad_norm": 13.232431411743164, + "learning_rate": 8.462121212121212e-05, + "loss": 2.8598, + "step": 207 + }, + { + "epoch": 0.4727272727272727, + "grad_norm": 14.74603271484375, + "learning_rate": 8.454545454545455e-05, + "loss": 2.5221, + "step": 208 + }, + { + "epoch": 0.475, + "grad_norm": 11.541604042053223, + "learning_rate": 8.446969696969697e-05, + "loss": 2.6656, + "step": 209 + }, + { + "epoch": 0.4772727272727273, + "grad_norm": 22.731273651123047, + "learning_rate": 8.43939393939394e-05, + "loss": 1.9391, + "step": 210 + }, + { + "epoch": 0.47954545454545455, + "grad_norm": 16.327220916748047, + "learning_rate": 8.431818181818182e-05, + "loss": 2.1225, + "step": 211 + }, + { + "epoch": 0.4818181818181818, + "grad_norm": 15.646464347839355, + "learning_rate": 8.424242424242424e-05, + "loss": 2.1468, + "step": 212 + }, + { + "epoch": 0.48409090909090907, + "grad_norm": 16.69521141052246, + "learning_rate": 8.416666666666668e-05, + "loss": 2.4979, + "step": 213 + }, + { + "epoch": 0.4863636363636364, + "grad_norm": 12.17435073852539, + "learning_rate": 8.40909090909091e-05, + "loss": 1.915, + "step": 214 + }, + { + "epoch": 0.48863636363636365, + "grad_norm": 15.295214653015137, + "learning_rate": 8.401515151515153e-05, + "loss": 2.6765, + "step": 215 + }, + { + "epoch": 0.4909090909090909, + "grad_norm": 14.532336235046387, + "learning_rate": 8.393939393939393e-05, + "loss": 2.1649, + "step": 216 + }, + { + "epoch": 0.49318181818181817, + "grad_norm": 9.738990783691406, + "learning_rate": 8.386363636363637e-05, + "loss": 1.7751, + "step": 217 + }, + { + "epoch": 0.4954545454545455, + "grad_norm": 13.893047332763672, + "learning_rate": 8.378787878787879e-05, + "loss": 2.3839, + "step": 218 + }, + { + "epoch": 0.49772727272727274, + "grad_norm": 10.604107856750488, + "learning_rate": 8.371212121212122e-05, + "loss": 1.839, + "step": 219 + }, + { + "epoch": 0.5, + "grad_norm": 14.21572208404541, + "learning_rate": 8.363636363636364e-05, + "loss": 2.4181, + "step": 220 + }, + { + "epoch": 0.5022727272727273, + "grad_norm": 12.247942924499512, + "learning_rate": 8.356060606060606e-05, + "loss": 1.6214, + "step": 221 + }, + { + "epoch": 0.5045454545454545, + "grad_norm": 11.43807601928711, + "learning_rate": 8.348484848484849e-05, + "loss": 1.7002, + "step": 222 + }, + { + "epoch": 0.5068181818181818, + "grad_norm": 12.532363891601562, + "learning_rate": 8.340909090909091e-05, + "loss": 1.6798, + "step": 223 + }, + { + "epoch": 0.509090909090909, + "grad_norm": 21.122955322265625, + "learning_rate": 8.333333333333334e-05, + "loss": 2.3791, + "step": 224 + }, + { + "epoch": 0.5113636363636364, + "grad_norm": 15.643569946289062, + "learning_rate": 8.325757575757575e-05, + "loss": 2.2841, + "step": 225 + }, + { + "epoch": 0.5136363636363637, + "grad_norm": 13.66476821899414, + "learning_rate": 8.318181818181818e-05, + "loss": 2.7105, + "step": 226 + }, + { + "epoch": 0.5159090909090909, + "grad_norm": 15.538378715515137, + "learning_rate": 8.310606060606062e-05, + "loss": 2.5573, + "step": 227 + }, + { + "epoch": 0.5181818181818182, + "grad_norm": 14.432341575622559, + "learning_rate": 8.303030303030304e-05, + "loss": 1.6926, + "step": 228 + }, + { + "epoch": 0.5204545454545455, + "grad_norm": 14.326302528381348, + "learning_rate": 8.295454545454547e-05, + "loss": 1.9976, + "step": 229 + }, + { + "epoch": 0.5227272727272727, + "grad_norm": 16.38084602355957, + "learning_rate": 8.287878787878787e-05, + "loss": 2.8438, + "step": 230 + }, + { + "epoch": 0.525, + "grad_norm": 14.56826114654541, + "learning_rate": 8.280303030303031e-05, + "loss": 2.3643, + "step": 231 + }, + { + "epoch": 0.5272727272727272, + "grad_norm": 10.183893203735352, + "learning_rate": 8.272727272727273e-05, + "loss": 1.9476, + "step": 232 + }, + { + "epoch": 0.5295454545454545, + "grad_norm": 15.575922012329102, + "learning_rate": 8.265151515151516e-05, + "loss": 2.3493, + "step": 233 + }, + { + "epoch": 0.5318181818181819, + "grad_norm": 12.653141021728516, + "learning_rate": 8.257575757575758e-05, + "loss": 2.0519, + "step": 234 + }, + { + "epoch": 0.5340909090909091, + "grad_norm": 12.279047966003418, + "learning_rate": 8.25e-05, + "loss": 2.0694, + "step": 235 + }, + { + "epoch": 0.5363636363636364, + "grad_norm": 12.395997047424316, + "learning_rate": 8.242424242424243e-05, + "loss": 2.1307, + "step": 236 + }, + { + "epoch": 0.5386363636363637, + "grad_norm": 10.851142883300781, + "learning_rate": 8.234848484848485e-05, + "loss": 1.9883, + "step": 237 + }, + { + "epoch": 0.5409090909090909, + "grad_norm": 14.103243827819824, + "learning_rate": 8.227272727272729e-05, + "loss": 2.6901, + "step": 238 + }, + { + "epoch": 0.5431818181818182, + "grad_norm": 9.63924789428711, + "learning_rate": 8.21969696969697e-05, + "loss": 1.2228, + "step": 239 + }, + { + "epoch": 0.5454545454545454, + "grad_norm": 13.430061340332031, + "learning_rate": 8.212121212121212e-05, + "loss": 1.7877, + "step": 240 + }, + { + "epoch": 0.5477272727272727, + "grad_norm": 15.428567886352539, + "learning_rate": 8.204545454545454e-05, + "loss": 2.0201, + "step": 241 + }, + { + "epoch": 0.55, + "grad_norm": 15.405593872070312, + "learning_rate": 8.196969696969698e-05, + "loss": 2.8325, + "step": 242 + }, + { + "epoch": 0.5522727272727272, + "grad_norm": 22.855867385864258, + "learning_rate": 8.18939393939394e-05, + "loss": 3.045, + "step": 243 + }, + { + "epoch": 0.5545454545454546, + "grad_norm": 14.374544143676758, + "learning_rate": 8.181818181818183e-05, + "loss": 2.0002, + "step": 244 + }, + { + "epoch": 0.5568181818181818, + "grad_norm": 13.37702465057373, + "learning_rate": 8.174242424242425e-05, + "loss": 1.6496, + "step": 245 + }, + { + "epoch": 0.5590909090909091, + "grad_norm": 13.321274757385254, + "learning_rate": 8.166666666666667e-05, + "loss": 1.9746, + "step": 246 + }, + { + "epoch": 0.5613636363636364, + "grad_norm": 13.79466438293457, + "learning_rate": 8.15909090909091e-05, + "loss": 2.0699, + "step": 247 + }, + { + "epoch": 0.5636363636363636, + "grad_norm": 12.355722427368164, + "learning_rate": 8.151515151515152e-05, + "loss": 2.2207, + "step": 248 + }, + { + "epoch": 0.5659090909090909, + "grad_norm": 14.220561981201172, + "learning_rate": 8.143939393939395e-05, + "loss": 2.1695, + "step": 249 + }, + { + "epoch": 0.5681818181818182, + "grad_norm": 12.587940216064453, + "learning_rate": 8.136363636363636e-05, + "loss": 1.8604, + "step": 250 + }, + { + "epoch": 0.5704545454545454, + "grad_norm": 9.54430103302002, + "learning_rate": 8.12878787878788e-05, + "loss": 1.6446, + "step": 251 + }, + { + "epoch": 0.5727272727272728, + "grad_norm": 14.440407752990723, + "learning_rate": 8.121212121212121e-05, + "loss": 2.4646, + "step": 252 + }, + { + "epoch": 0.575, + "grad_norm": 14.50412368774414, + "learning_rate": 8.113636363636365e-05, + "loss": 1.5263, + "step": 253 + }, + { + "epoch": 0.5772727272727273, + "grad_norm": 18.535612106323242, + "learning_rate": 8.106060606060607e-05, + "loss": 2.7942, + "step": 254 + }, + { + "epoch": 0.5795454545454546, + "grad_norm": 11.250702857971191, + "learning_rate": 8.098484848484848e-05, + "loss": 1.5575, + "step": 255 + }, + { + "epoch": 0.5818181818181818, + "grad_norm": 12.534632682800293, + "learning_rate": 8.090909090909092e-05, + "loss": 1.9031, + "step": 256 + }, + { + "epoch": 0.5840909090909091, + "grad_norm": 14.82848834991455, + "learning_rate": 8.083333333333334e-05, + "loss": 1.4666, + "step": 257 + }, + { + "epoch": 0.5863636363636363, + "grad_norm": 15.74230670928955, + "learning_rate": 8.075757575757577e-05, + "loss": 2.3956, + "step": 258 + }, + { + "epoch": 0.5886363636363636, + "grad_norm": 13.576948165893555, + "learning_rate": 8.068181818181818e-05, + "loss": 1.9797, + "step": 259 + }, + { + "epoch": 0.5909090909090909, + "grad_norm": 12.77927303314209, + "learning_rate": 8.060606060606061e-05, + "loss": 2.0894, + "step": 260 + }, + { + "epoch": 0.5931818181818181, + "grad_norm": 17.75493621826172, + "learning_rate": 8.053030303030303e-05, + "loss": 2.6691, + "step": 261 + }, + { + "epoch": 0.5954545454545455, + "grad_norm": 12.445291519165039, + "learning_rate": 8.045454545454546e-05, + "loss": 1.9188, + "step": 262 + }, + { + "epoch": 0.5977272727272728, + "grad_norm": 12.350727081298828, + "learning_rate": 8.037878787878788e-05, + "loss": 1.9648, + "step": 263 + }, + { + "epoch": 0.6, + "grad_norm": 10.37759780883789, + "learning_rate": 8.03030303030303e-05, + "loss": 1.5221, + "step": 264 + }, + { + "epoch": 0.6022727272727273, + "grad_norm": 13.281451225280762, + "learning_rate": 8.022727272727273e-05, + "loss": 3.2337, + "step": 265 + }, + { + "epoch": 0.6045454545454545, + "grad_norm": 11.684523582458496, + "learning_rate": 8.015151515151515e-05, + "loss": 1.7641, + "step": 266 + }, + { + "epoch": 0.6068181818181818, + "grad_norm": 15.161863327026367, + "learning_rate": 8.007575757575759e-05, + "loss": 3.5694, + "step": 267 + }, + { + "epoch": 0.6090909090909091, + "grad_norm": 13.221097946166992, + "learning_rate": 8e-05, + "loss": 2.5334, + "step": 268 + }, + { + "epoch": 0.6113636363636363, + "grad_norm": 15.834603309631348, + "learning_rate": 7.992424242424243e-05, + "loss": 2.5292, + "step": 269 + }, + { + "epoch": 0.6136363636363636, + "grad_norm": 15.016695976257324, + "learning_rate": 7.984848484848485e-05, + "loss": 1.9177, + "step": 270 + }, + { + "epoch": 0.615909090909091, + "grad_norm": 18.896211624145508, + "learning_rate": 7.977272727272728e-05, + "loss": 2.2495, + "step": 271 + }, + { + "epoch": 0.6181818181818182, + "grad_norm": 17.597623825073242, + "learning_rate": 7.96969696969697e-05, + "loss": 2.1252, + "step": 272 + }, + { + "epoch": 0.6204545454545455, + "grad_norm": 14.346769332885742, + "learning_rate": 7.962121212121213e-05, + "loss": 2.0273, + "step": 273 + }, + { + "epoch": 0.6227272727272727, + "grad_norm": 13.852729797363281, + "learning_rate": 7.954545454545455e-05, + "loss": 2.7319, + "step": 274 + }, + { + "epoch": 0.625, + "grad_norm": 12.906790733337402, + "learning_rate": 7.946969696969697e-05, + "loss": 1.6674, + "step": 275 + }, + { + "epoch": 0.6272727272727273, + "grad_norm": 10.031960487365723, + "learning_rate": 7.93939393939394e-05, + "loss": 1.5017, + "step": 276 + }, + { + "epoch": 0.6295454545454545, + "grad_norm": 12.02971363067627, + "learning_rate": 7.931818181818182e-05, + "loss": 2.1617, + "step": 277 + }, + { + "epoch": 0.6318181818181818, + "grad_norm": 12.239229202270508, + "learning_rate": 7.924242424242426e-05, + "loss": 1.285, + "step": 278 + }, + { + "epoch": 0.634090909090909, + "grad_norm": 12.207528114318848, + "learning_rate": 7.916666666666666e-05, + "loss": 1.4661, + "step": 279 + }, + { + "epoch": 0.6363636363636364, + "grad_norm": 21.659215927124023, + "learning_rate": 7.90909090909091e-05, + "loss": 1.8808, + "step": 280 + }, + { + "epoch": 0.6386363636363637, + "grad_norm": 14.419612884521484, + "learning_rate": 7.901515151515151e-05, + "loss": 2.6502, + "step": 281 + }, + { + "epoch": 0.6409090909090909, + "grad_norm": NaN, + "learning_rate": 7.901515151515151e-05, + "loss": 0.0, + "step": 282 + }, + { + "epoch": 0.6431818181818182, + "grad_norm": 11.444130897521973, + "learning_rate": 7.893939393939395e-05, + "loss": 1.5987, + "step": 283 + }, + { + "epoch": 0.6454545454545455, + "grad_norm": 10.316890716552734, + "learning_rate": 7.886363636363637e-05, + "loss": 1.5173, + "step": 284 + }, + { + "epoch": 0.6477272727272727, + "grad_norm": 13.772204399108887, + "learning_rate": 7.878787878787879e-05, + "loss": 3.0357, + "step": 285 + }, + { + "epoch": 0.65, + "grad_norm": 12.452784538269043, + "learning_rate": 7.871212121212122e-05, + "loss": 2.2077, + "step": 286 + }, + { + "epoch": 0.6522727272727272, + "grad_norm": 15.323153495788574, + "learning_rate": 7.863636363636364e-05, + "loss": 1.8941, + "step": 287 + }, + { + "epoch": 0.6545454545454545, + "grad_norm": 10.558858871459961, + "learning_rate": 7.856060606060607e-05, + "loss": 1.5262, + "step": 288 + }, + { + "epoch": 0.6568181818181819, + "grad_norm": 15.232844352722168, + "learning_rate": 7.848484848484848e-05, + "loss": 3.1486, + "step": 289 + }, + { + "epoch": 0.6590909090909091, + "grad_norm": 11.309487342834473, + "learning_rate": 7.840909090909091e-05, + "loss": 1.8324, + "step": 290 + }, + { + "epoch": 0.6613636363636364, + "grad_norm": 11.427604675292969, + "learning_rate": 7.833333333333333e-05, + "loss": 1.0609, + "step": 291 + }, + { + "epoch": 0.6636363636363637, + "grad_norm": 15.115833282470703, + "learning_rate": 7.825757575757576e-05, + "loss": 2.8888, + "step": 292 + }, + { + "epoch": 0.6659090909090909, + "grad_norm": 14.701318740844727, + "learning_rate": 7.818181818181818e-05, + "loss": 2.823, + "step": 293 + }, + { + "epoch": 0.6681818181818182, + "grad_norm": 10.650053024291992, + "learning_rate": 7.81060606060606e-05, + "loss": 1.8724, + "step": 294 + }, + { + "epoch": 0.6704545454545454, + "grad_norm": 12.72999382019043, + "learning_rate": 7.803030303030304e-05, + "loss": 1.9267, + "step": 295 + }, + { + "epoch": 0.6727272727272727, + "grad_norm": 16.98598861694336, + "learning_rate": 7.795454545454546e-05, + "loss": 2.325, + "step": 296 + }, + { + "epoch": 0.675, + "grad_norm": 12.848193168640137, + "learning_rate": 7.787878787878789e-05, + "loss": 3.1965, + "step": 297 + }, + { + "epoch": 0.6772727272727272, + "grad_norm": 8.765904426574707, + "learning_rate": 7.780303030303031e-05, + "loss": 1.8081, + "step": 298 + }, + { + "epoch": 0.6795454545454546, + "grad_norm": 14.633967399597168, + "learning_rate": 7.772727272727273e-05, + "loss": 1.8056, + "step": 299 + }, + { + "epoch": 0.6818181818181818, + "grad_norm": 9.972925186157227, + "learning_rate": 7.765151515151515e-05, + "loss": 1.8835, + "step": 300 + }, + { + "epoch": 0.6840909090909091, + "grad_norm": 11.186135292053223, + "learning_rate": 7.757575757575758e-05, + "loss": 1.6734, + "step": 301 + }, + { + "epoch": 0.6863636363636364, + "grad_norm": 15.052450180053711, + "learning_rate": 7.75e-05, + "loss": 2.2574, + "step": 302 + }, + { + "epoch": 0.6886363636363636, + "grad_norm": 12.664848327636719, + "learning_rate": 7.742424242424243e-05, + "loss": 1.5916, + "step": 303 + }, + { + "epoch": 0.6909090909090909, + "grad_norm": 14.287535667419434, + "learning_rate": 7.734848484848485e-05, + "loss": 1.8552, + "step": 304 + }, + { + "epoch": 0.6931818181818182, + "grad_norm": 14.354594230651855, + "learning_rate": 7.727272727272727e-05, + "loss": 3.0925, + "step": 305 + }, + { + "epoch": 0.6954545454545454, + "grad_norm": 12.003613471984863, + "learning_rate": 7.71969696969697e-05, + "loss": 1.6642, + "step": 306 + }, + { + "epoch": 0.6977272727272728, + "grad_norm": 11.559938430786133, + "learning_rate": 7.712121212121212e-05, + "loss": 1.5997, + "step": 307 + }, + { + "epoch": 0.7, + "grad_norm": 13.42446517944336, + "learning_rate": 7.704545454545456e-05, + "loss": 1.7934, + "step": 308 + }, + { + "epoch": 0.7022727272727273, + "grad_norm": 11.831766128540039, + "learning_rate": 7.696969696969696e-05, + "loss": 1.7729, + "step": 309 + }, + { + "epoch": 0.7045454545454546, + "grad_norm": 11.884734153747559, + "learning_rate": 7.68939393939394e-05, + "loss": 1.9489, + "step": 310 + }, + { + "epoch": 0.7068181818181818, + "grad_norm": 15.816669464111328, + "learning_rate": 7.681818181818182e-05, + "loss": 2.4105, + "step": 311 + }, + { + "epoch": 0.7090909090909091, + "grad_norm": 12.010058403015137, + "learning_rate": 7.674242424242425e-05, + "loss": 1.9247, + "step": 312 + }, + { + "epoch": 0.7113636363636363, + "grad_norm": 9.436304092407227, + "learning_rate": 7.666666666666667e-05, + "loss": 1.9038, + "step": 313 + }, + { + "epoch": 0.7136363636363636, + "grad_norm": 9.153775215148926, + "learning_rate": 7.659090909090909e-05, + "loss": 1.241, + "step": 314 + }, + { + "epoch": 0.7159090909090909, + "grad_norm": 13.067652702331543, + "learning_rate": 7.651515151515152e-05, + "loss": 2.7662, + "step": 315 + }, + { + "epoch": 0.7181818181818181, + "grad_norm": 16.106948852539062, + "learning_rate": 7.643939393939394e-05, + "loss": 2.0783, + "step": 316 + }, + { + "epoch": 0.7204545454545455, + "grad_norm": 13.585596084594727, + "learning_rate": 7.636363636363637e-05, + "loss": 1.919, + "step": 317 + }, + { + "epoch": 0.7227272727272728, + "grad_norm": 13.833767890930176, + "learning_rate": 7.62878787878788e-05, + "loss": 1.1069, + "step": 318 + }, + { + "epoch": 0.725, + "grad_norm": 12.201956748962402, + "learning_rate": 7.621212121212121e-05, + "loss": 1.9548, + "step": 319 + }, + { + "epoch": 0.7272727272727273, + "grad_norm": 15.562934875488281, + "learning_rate": 7.613636363636363e-05, + "loss": 1.9211, + "step": 320 + }, + { + "epoch": 0.7295454545454545, + "grad_norm": 14.389630317687988, + "learning_rate": 7.606060606060607e-05, + "loss": 1.821, + "step": 321 + }, + { + "epoch": 0.7318181818181818, + "grad_norm": 14.584891319274902, + "learning_rate": 7.598484848484849e-05, + "loss": 2.5068, + "step": 322 + }, + { + "epoch": 0.7340909090909091, + "grad_norm": 14.5166654586792, + "learning_rate": 7.59090909090909e-05, + "loss": 1.9124, + "step": 323 + }, + { + "epoch": 0.7363636363636363, + "grad_norm": 46.67388916015625, + "learning_rate": 7.583333333333334e-05, + "loss": 1.6895, + "step": 324 + }, + { + "epoch": 0.7386363636363636, + "grad_norm": 12.92702865600586, + "learning_rate": 7.575757575757576e-05, + "loss": 1.7526, + "step": 325 + }, + { + "epoch": 0.740909090909091, + "grad_norm": 8.52035140991211, + "learning_rate": 7.568181818181819e-05, + "loss": 1.4144, + "step": 326 + }, + { + "epoch": 0.7431818181818182, + "grad_norm": 13.630702018737793, + "learning_rate": 7.560606060606061e-05, + "loss": 2.2018, + "step": 327 + }, + { + "epoch": 0.7454545454545455, + "grad_norm": 14.379950523376465, + "learning_rate": 7.553030303030303e-05, + "loss": 2.8618, + "step": 328 + }, + { + "epoch": 0.7477272727272727, + "grad_norm": 14.78795051574707, + "learning_rate": 7.545454545454545e-05, + "loss": 1.9749, + "step": 329 + }, + { + "epoch": 0.75, + "grad_norm": 10.462140083312988, + "learning_rate": 7.537878787878788e-05, + "loss": 2.3666, + "step": 330 + }, + { + "epoch": 0.7522727272727273, + "grad_norm": 11.336270332336426, + "learning_rate": 7.530303030303032e-05, + "loss": 1.4712, + "step": 331 + }, + { + "epoch": 0.7545454545454545, + "grad_norm": 17.15682029724121, + "learning_rate": 7.522727272727273e-05, + "loss": 3.2442, + "step": 332 + }, + { + "epoch": 0.7568181818181818, + "grad_norm": 14.129326820373535, + "learning_rate": 7.515151515151515e-05, + "loss": 1.9768, + "step": 333 + }, + { + "epoch": 0.759090909090909, + "grad_norm": 14.239521026611328, + "learning_rate": 7.507575757575757e-05, + "loss": 1.9933, + "step": 334 + }, + { + "epoch": 0.7613636363636364, + "grad_norm": 10.573707580566406, + "learning_rate": 7.500000000000001e-05, + "loss": 1.3049, + "step": 335 + }, + { + "epoch": 0.7636363636363637, + "grad_norm": 15.881331443786621, + "learning_rate": 7.492424242424243e-05, + "loss": 2.7249, + "step": 336 + }, + { + "epoch": 0.7659090909090909, + "grad_norm": 11.606864929199219, + "learning_rate": 7.484848484848486e-05, + "loss": 1.4883, + "step": 337 + }, + { + "epoch": 0.7681818181818182, + "grad_norm": 8.834245681762695, + "learning_rate": 7.477272727272727e-05, + "loss": 1.3757, + "step": 338 + }, + { + "epoch": 0.7704545454545455, + "grad_norm": 10.011686325073242, + "learning_rate": 7.46969696969697e-05, + "loss": 1.4306, + "step": 339 + }, + { + "epoch": 0.7727272727272727, + "grad_norm": 13.084802627563477, + "learning_rate": 7.462121212121213e-05, + "loss": 2.1676, + "step": 340 + }, + { + "epoch": 0.775, + "grad_norm": 12.480827331542969, + "learning_rate": 7.454545454545455e-05, + "loss": 2.2564, + "step": 341 + }, + { + "epoch": 0.7772727272727272, + "grad_norm": 12.32083797454834, + "learning_rate": 7.446969696969698e-05, + "loss": 1.4576, + "step": 342 + }, + { + "epoch": 0.7795454545454545, + "grad_norm": 13.759376525878906, + "learning_rate": 7.439393939393939e-05, + "loss": 2.5308, + "step": 343 + }, + { + "epoch": 0.7818181818181819, + "grad_norm": 17.70578384399414, + "learning_rate": 7.431818181818182e-05, + "loss": 3.0816, + "step": 344 + }, + { + "epoch": 0.7840909090909091, + "grad_norm": 13.809745788574219, + "learning_rate": 7.424242424242424e-05, + "loss": 2.6903, + "step": 345 + }, + { + "epoch": 0.7863636363636364, + "grad_norm": 13.484768867492676, + "learning_rate": 7.416666666666668e-05, + "loss": 1.6094, + "step": 346 + }, + { + "epoch": 0.7886363636363637, + "grad_norm": 10.424938201904297, + "learning_rate": 7.40909090909091e-05, + "loss": 1.3566, + "step": 347 + }, + { + "epoch": 0.7909090909090909, + "grad_norm": 15.058128356933594, + "learning_rate": 7.401515151515152e-05, + "loss": 1.945, + "step": 348 + }, + { + "epoch": 0.7931818181818182, + "grad_norm": 11.48098373413086, + "learning_rate": 7.393939393939395e-05, + "loss": 2.9329, + "step": 349 + }, + { + "epoch": 0.7954545454545454, + "grad_norm": 15.027339935302734, + "learning_rate": 7.386363636363637e-05, + "loss": 3.3324, + "step": 350 + }, + { + "epoch": 0.7977272727272727, + "grad_norm": 12.786996841430664, + "learning_rate": 7.37878787878788e-05, + "loss": 2.7898, + "step": 351 + }, + { + "epoch": 0.8, + "grad_norm": 14.68897819519043, + "learning_rate": 7.37121212121212e-05, + "loss": 2.1318, + "step": 352 + }, + { + "epoch": 0.8022727272727272, + "grad_norm": 15.081788063049316, + "learning_rate": 7.363636363636364e-05, + "loss": 2.544, + "step": 353 + }, + { + "epoch": 0.8045454545454546, + "grad_norm": 13.604434967041016, + "learning_rate": 7.356060606060606e-05, + "loss": 3.242, + "step": 354 + }, + { + "epoch": 0.8068181818181818, + "grad_norm": 10.167998313903809, + "learning_rate": 7.348484848484849e-05, + "loss": 1.7378, + "step": 355 + }, + { + "epoch": 0.8090909090909091, + "grad_norm": 11.878591537475586, + "learning_rate": 7.340909090909091e-05, + "loss": 1.9651, + "step": 356 + }, + { + "epoch": 0.8113636363636364, + "grad_norm": 10.606021881103516, + "learning_rate": 7.333333333333333e-05, + "loss": 1.6922, + "step": 357 + }, + { + "epoch": 0.8136363636363636, + "grad_norm": 36.99083709716797, + "learning_rate": 7.325757575757576e-05, + "loss": 2.7004, + "step": 358 + }, + { + "epoch": 0.8159090909090909, + "grad_norm": 12.748845100402832, + "learning_rate": 7.318181818181818e-05, + "loss": 2.0722, + "step": 359 + }, + { + "epoch": 0.8181818181818182, + "grad_norm": 13.374279975891113, + "learning_rate": 7.310606060606062e-05, + "loss": 2.3361, + "step": 360 + }, + { + "epoch": 0.8204545454545454, + "grad_norm": 10.289033889770508, + "learning_rate": 7.303030303030304e-05, + "loss": 1.6377, + "step": 361 + }, + { + "epoch": 0.8227272727272728, + "grad_norm": 10.585772514343262, + "learning_rate": 7.295454545454546e-05, + "loss": 1.6941, + "step": 362 + }, + { + "epoch": 0.825, + "grad_norm": 13.439225196838379, + "learning_rate": 7.287878787878788e-05, + "loss": 1.9242, + "step": 363 + }, + { + "epoch": 0.8272727272727273, + "grad_norm": 12.649117469787598, + "learning_rate": 7.280303030303031e-05, + "loss": 3.5932, + "step": 364 + }, + { + "epoch": 0.8295454545454546, + "grad_norm": 13.014269828796387, + "learning_rate": 7.272727272727273e-05, + "loss": 1.6747, + "step": 365 + }, + { + "epoch": 0.8318181818181818, + "grad_norm": 10.855698585510254, + "learning_rate": 7.265151515151516e-05, + "loss": 2.2644, + "step": 366 + }, + { + "epoch": 0.8340909090909091, + "grad_norm": 9.967236518859863, + "learning_rate": 7.257575757575758e-05, + "loss": 1.7373, + "step": 367 + }, + { + "epoch": 0.8363636363636363, + "grad_norm": 12.029590606689453, + "learning_rate": 7.25e-05, + "loss": 1.7012, + "step": 368 + }, + { + "epoch": 0.8386363636363636, + "grad_norm": 18.046247482299805, + "learning_rate": 7.242424242424243e-05, + "loss": 2.7507, + "step": 369 + }, + { + "epoch": 0.8409090909090909, + "grad_norm": 12.02083969116211, + "learning_rate": 7.234848484848485e-05, + "loss": 1.4928, + "step": 370 + }, + { + "epoch": 0.8431818181818181, + "grad_norm": 14.034537315368652, + "learning_rate": 7.227272727272729e-05, + "loss": 1.5557, + "step": 371 + }, + { + "epoch": 0.8454545454545455, + "grad_norm": 11.5894775390625, + "learning_rate": 7.219696969696969e-05, + "loss": 2.0848, + "step": 372 + }, + { + "epoch": 0.8477272727272728, + "grad_norm": 10.489690780639648, + "learning_rate": 7.212121212121213e-05, + "loss": 2.1963, + "step": 373 + }, + { + "epoch": 0.85, + "grad_norm": 14.684807777404785, + "learning_rate": 7.204545454545454e-05, + "loss": 1.6653, + "step": 374 + }, + { + "epoch": 0.8522727272727273, + "grad_norm": 10.650580406188965, + "learning_rate": 7.196969696969698e-05, + "loss": 1.5813, + "step": 375 + }, + { + "epoch": 0.8545454545454545, + "grad_norm": 14.406346321105957, + "learning_rate": 7.18939393939394e-05, + "loss": 1.6018, + "step": 376 + }, + { + "epoch": 0.8568181818181818, + "grad_norm": 10.684210777282715, + "learning_rate": 7.181818181818182e-05, + "loss": 1.16, + "step": 377 + }, + { + "epoch": 0.8590909090909091, + "grad_norm": 11.588654518127441, + "learning_rate": 7.174242424242425e-05, + "loss": 1.52, + "step": 378 + }, + { + "epoch": 0.8613636363636363, + "grad_norm": 13.342896461486816, + "learning_rate": 7.166666666666667e-05, + "loss": 1.3069, + "step": 379 + }, + { + "epoch": 0.8636363636363636, + "grad_norm": 10.33123779296875, + "learning_rate": 7.15909090909091e-05, + "loss": 2.097, + "step": 380 + }, + { + "epoch": 0.865909090909091, + "grad_norm": 13.286327362060547, + "learning_rate": 7.151515151515152e-05, + "loss": 1.6996, + "step": 381 + }, + { + "epoch": 0.8681818181818182, + "grad_norm": 12.737727165222168, + "learning_rate": 7.143939393939394e-05, + "loss": 1.8533, + "step": 382 + }, + { + "epoch": 0.8704545454545455, + "grad_norm": 10.602120399475098, + "learning_rate": 7.136363636363636e-05, + "loss": 0.9764, + "step": 383 + }, + { + "epoch": 0.8727272727272727, + "grad_norm": 13.362771034240723, + "learning_rate": 7.12878787878788e-05, + "loss": 2.6888, + "step": 384 + }, + { + "epoch": 0.875, + "grad_norm": 15.875019073486328, + "learning_rate": 7.121212121212121e-05, + "loss": 1.3865, + "step": 385 + }, + { + "epoch": 0.8772727272727273, + "grad_norm": 11.602843284606934, + "learning_rate": 7.113636363636363e-05, + "loss": 1.489, + "step": 386 + }, + { + "epoch": 0.8795454545454545, + "grad_norm": 10.052959442138672, + "learning_rate": 7.106060606060607e-05, + "loss": 1.423, + "step": 387 + }, + { + "epoch": 0.8818181818181818, + "grad_norm": 15.898283004760742, + "learning_rate": 7.098484848484849e-05, + "loss": 2.0401, + "step": 388 + }, + { + "epoch": 0.884090909090909, + "grad_norm": 14.83981990814209, + "learning_rate": 7.090909090909092e-05, + "loss": 2.9656, + "step": 389 + }, + { + "epoch": 0.8863636363636364, + "grad_norm": 12.542622566223145, + "learning_rate": 7.083333333333334e-05, + "loss": 1.7818, + "step": 390 + }, + { + "epoch": 0.8886363636363637, + "grad_norm": 10.65149974822998, + "learning_rate": 7.075757575757576e-05, + "loss": 1.4115, + "step": 391 + }, + { + "epoch": 0.8909090909090909, + "grad_norm": 14.208708763122559, + "learning_rate": 7.068181818181818e-05, + "loss": 2.5107, + "step": 392 + }, + { + "epoch": 0.8931818181818182, + "grad_norm": 13.435481071472168, + "learning_rate": 7.060606060606061e-05, + "loss": 2.0141, + "step": 393 + }, + { + "epoch": 0.8954545454545455, + "grad_norm": 14.987428665161133, + "learning_rate": 7.053030303030303e-05, + "loss": 1.6295, + "step": 394 + }, + { + "epoch": 0.8977272727272727, + "grad_norm": 15.590865135192871, + "learning_rate": 7.045454545454546e-05, + "loss": 2.5029, + "step": 395 + }, + { + "epoch": 0.9, + "grad_norm": 12.00338077545166, + "learning_rate": 7.037878787878788e-05, + "loss": 1.5399, + "step": 396 + }, + { + "epoch": 0.9022727272727272, + "grad_norm": 10.2390718460083, + "learning_rate": 7.03030303030303e-05, + "loss": 1.2943, + "step": 397 + }, + { + "epoch": 0.9045454545454545, + "grad_norm": 13.09786319732666, + "learning_rate": 7.022727272727274e-05, + "loss": 1.951, + "step": 398 + }, + { + "epoch": 0.9068181818181819, + "grad_norm": 14.016656875610352, + "learning_rate": 7.015151515151515e-05, + "loss": 2.4783, + "step": 399 + }, + { + "epoch": 0.9090909090909091, + "grad_norm": 14.135820388793945, + "learning_rate": 7.007575757575759e-05, + "loss": 1.8109, + "step": 400 + }, + { + "epoch": 0.9113636363636364, + "grad_norm": 15.545958518981934, + "learning_rate": 7e-05, + "loss": 2.2156, + "step": 401 + }, + { + "epoch": 0.9136363636363637, + "grad_norm": 15.512310028076172, + "learning_rate": 6.992424242424243e-05, + "loss": 1.8199, + "step": 402 + }, + { + "epoch": 0.9159090909090909, + "grad_norm": 12.54996109008789, + "learning_rate": 6.984848484848485e-05, + "loss": 2.0134, + "step": 403 + }, + { + "epoch": 0.9181818181818182, + "grad_norm": 10.554512023925781, + "learning_rate": 6.977272727272728e-05, + "loss": 1.5173, + "step": 404 + }, + { + "epoch": 0.9204545454545454, + "grad_norm": 13.31303882598877, + "learning_rate": 6.96969696969697e-05, + "loss": 1.7694, + "step": 405 + }, + { + "epoch": 0.9227272727272727, + "grad_norm": 18.840511322021484, + "learning_rate": 6.962121212121212e-05, + "loss": 3.0551, + "step": 406 + }, + { + "epoch": 0.925, + "grad_norm": 13.331717491149902, + "learning_rate": 6.954545454545455e-05, + "loss": 2.0296, + "step": 407 + }, + { + "epoch": 0.9272727272727272, + "grad_norm": 11.75788688659668, + "learning_rate": 6.946969696969697e-05, + "loss": 1.8544, + "step": 408 + }, + { + "epoch": 0.9295454545454546, + "grad_norm": 14.479559898376465, + "learning_rate": 6.93939393939394e-05, + "loss": 2.4435, + "step": 409 + }, + { + "epoch": 0.9318181818181818, + "grad_norm": 14.522322654724121, + "learning_rate": 6.931818181818182e-05, + "loss": 2.3013, + "step": 410 + }, + { + "epoch": 0.9340909090909091, + "grad_norm": 12.853972434997559, + "learning_rate": 6.924242424242424e-05, + "loss": 2.4637, + "step": 411 + }, + { + "epoch": 0.9363636363636364, + "grad_norm": 10.978107452392578, + "learning_rate": 6.916666666666666e-05, + "loss": 1.5277, + "step": 412 + }, + { + "epoch": 0.9386363636363636, + "grad_norm": 14.109042167663574, + "learning_rate": 6.90909090909091e-05, + "loss": 1.9601, + "step": 413 + }, + { + "epoch": 0.9409090909090909, + "grad_norm": 10.699783325195312, + "learning_rate": 6.901515151515152e-05, + "loss": 2.2143, + "step": 414 + }, + { + "epoch": 0.9431818181818182, + "grad_norm": 10.57825756072998, + "learning_rate": 6.893939393939395e-05, + "loss": 2.0557, + "step": 415 + }, + { + "epoch": 0.9454545454545454, + "grad_norm": 12.432737350463867, + "learning_rate": 6.886363636363637e-05, + "loss": 1.7554, + "step": 416 + }, + { + "epoch": 0.9477272727272728, + "grad_norm": 12.157960891723633, + "learning_rate": 6.878787878787879e-05, + "loss": 2.1302, + "step": 417 + }, + { + "epoch": 0.95, + "grad_norm": 15.89067554473877, + "learning_rate": 6.871212121212122e-05, + "loss": 2.1424, + "step": 418 + }, + { + "epoch": 0.9522727272727273, + "grad_norm": 10.453248977661133, + "learning_rate": 6.863636363636364e-05, + "loss": 1.8215, + "step": 419 + }, + { + "epoch": 0.9545454545454546, + "grad_norm": 8.481575012207031, + "learning_rate": 6.856060606060606e-05, + "loss": 1.5999, + "step": 420 + }, + { + "epoch": 0.9568181818181818, + "grad_norm": 10.795332908630371, + "learning_rate": 6.848484848484848e-05, + "loss": 1.4623, + "step": 421 + }, + { + "epoch": 0.9590909090909091, + "grad_norm": 18.586315155029297, + "learning_rate": 6.840909090909091e-05, + "loss": 2.1875, + "step": 422 + }, + { + "epoch": 0.9613636363636363, + "grad_norm": 15.387242317199707, + "learning_rate": 6.833333333333333e-05, + "loss": 2.1544, + "step": 423 + }, + { + "epoch": 0.9636363636363636, + "grad_norm": 11.277326583862305, + "learning_rate": 6.825757575757576e-05, + "loss": 1.8575, + "step": 424 + }, + { + "epoch": 0.9659090909090909, + "grad_norm": 9.451603889465332, + "learning_rate": 6.818181818181818e-05, + "loss": 1.6149, + "step": 425 + }, + { + "epoch": 0.9681818181818181, + "grad_norm": 14.108964920043945, + "learning_rate": 6.81060606060606e-05, + "loss": 2.0166, + "step": 426 + }, + { + "epoch": 0.9704545454545455, + "grad_norm": 8.922270774841309, + "learning_rate": 6.803030303030304e-05, + "loss": 1.3486, + "step": 427 + }, + { + "epoch": 0.9727272727272728, + "grad_norm": 9.383979797363281, + "learning_rate": 6.795454545454546e-05, + "loss": 1.0425, + "step": 428 + }, + { + "epoch": 0.975, + "grad_norm": 13.076512336730957, + "learning_rate": 6.787878787878789e-05, + "loss": 1.7828, + "step": 429 + }, + { + "epoch": 0.9772727272727273, + "grad_norm": 14.815391540527344, + "learning_rate": 6.78030303030303e-05, + "loss": 1.893, + "step": 430 + }, + { + "epoch": 0.9795454545454545, + "grad_norm": 10.523706436157227, + "learning_rate": 6.772727272727273e-05, + "loss": 1.5307, + "step": 431 + }, + { + "epoch": 0.9818181818181818, + "grad_norm": 16.938919067382812, + "learning_rate": 6.765151515151515e-05, + "loss": 1.9001, + "step": 432 + }, + { + "epoch": 0.9840909090909091, + "grad_norm": 11.781875610351562, + "learning_rate": 6.757575757575758e-05, + "loss": 2.183, + "step": 433 + }, + { + "epoch": 0.9863636363636363, + "grad_norm": 14.539305686950684, + "learning_rate": 6.750000000000001e-05, + "loss": 2.2021, + "step": 434 + }, + { + "epoch": 0.9886363636363636, + "grad_norm": 15.532546997070312, + "learning_rate": 6.742424242424242e-05, + "loss": 2.1856, + "step": 435 + }, + { + "epoch": 0.990909090909091, + "grad_norm": 12.917964935302734, + "learning_rate": 6.734848484848485e-05, + "loss": 2.8732, + "step": 436 + }, + { + "epoch": 0.9931818181818182, + "grad_norm": 12.498353958129883, + "learning_rate": 6.727272727272727e-05, + "loss": 1.9246, + "step": 437 + }, + { + "epoch": 0.9954545454545455, + "grad_norm": 14.181402206420898, + "learning_rate": 6.71969696969697e-05, + "loss": 2.3863, + "step": 438 + }, + { + "epoch": 0.9977272727272727, + "grad_norm": 12.139135360717773, + "learning_rate": 6.712121212121213e-05, + "loss": 2.5505, + "step": 439 + }, + { + "epoch": 1.0, + "grad_norm": 18.971040725708008, + "learning_rate": 6.704545454545455e-05, + "loss": 2.3566, + "step": 440 + }, + { + "epoch": 1.0, + "eval_f1": 0.8942, + "eval_gen_len": 41.6727, + "eval_loss": 1.852333426475525, + "eval_precision": 0.8938, + "eval_recall": 0.8947, + "eval_rouge1": 0.4801, + "eval_rouge2": 0.2302, + "eval_rougeL": 0.4078, + "eval_rougeLsum": 0.4472, + "eval_runtime": 28.5976, + "eval_samples_per_second": 3.846, + "eval_steps_per_second": 0.49, + "step": 440 + }, + { + "epoch": 1.0022727272727272, + "grad_norm": 9.610616683959961, + "learning_rate": 6.696969696969696e-05, + "loss": 1.3656, + "step": 441 + }, + { + "epoch": 1.0045454545454546, + "grad_norm": 13.653773307800293, + "learning_rate": 6.68939393939394e-05, + "loss": 3.0115, + "step": 442 + }, + { + "epoch": 1.0068181818181818, + "grad_norm": 10.243281364440918, + "learning_rate": 6.681818181818183e-05, + "loss": 1.7598, + "step": 443 + }, + { + "epoch": 1.009090909090909, + "grad_norm": 12.79389762878418, + "learning_rate": 6.674242424242425e-05, + "loss": 1.7768, + "step": 444 + }, + { + "epoch": 1.0113636363636365, + "grad_norm": 8.748100280761719, + "learning_rate": 6.666666666666667e-05, + "loss": 1.4368, + "step": 445 + }, + { + "epoch": 1.0136363636363637, + "grad_norm": 9.42500114440918, + "learning_rate": 6.659090909090909e-05, + "loss": 1.0754, + "step": 446 + }, + { + "epoch": 1.0159090909090909, + "grad_norm": 11.976570129394531, + "learning_rate": 6.651515151515152e-05, + "loss": 2.07, + "step": 447 + }, + { + "epoch": 1.018181818181818, + "grad_norm": 9.448553085327148, + "learning_rate": 6.643939393939394e-05, + "loss": 1.5004, + "step": 448 + }, + { + "epoch": 1.0204545454545455, + "grad_norm": 10.295342445373535, + "learning_rate": 6.636363636363638e-05, + "loss": 1.6393, + "step": 449 + }, + { + "epoch": 1.0227272727272727, + "grad_norm": 9.445040702819824, + "learning_rate": 6.628787878787878e-05, + "loss": 1.7432, + "step": 450 + }, + { + "epoch": 1.025, + "grad_norm": 16.851524353027344, + "learning_rate": 6.621212121212121e-05, + "loss": 2.2318, + "step": 451 + }, + { + "epoch": 1.0272727272727273, + "grad_norm": 10.721171379089355, + "learning_rate": 6.613636363636365e-05, + "loss": 1.7857, + "step": 452 + }, + { + "epoch": 1.0295454545454545, + "grad_norm": 10.074830055236816, + "learning_rate": 6.606060606060607e-05, + "loss": 1.5901, + "step": 453 + }, + { + "epoch": 1.0318181818181817, + "grad_norm": 20.14990234375, + "learning_rate": 6.598484848484849e-05, + "loss": 2.6518, + "step": 454 + }, + { + "epoch": 1.0340909090909092, + "grad_norm": 10.911235809326172, + "learning_rate": 6.59090909090909e-05, + "loss": 2.1865, + "step": 455 + }, + { + "epoch": 1.0363636363636364, + "grad_norm": 18.03226089477539, + "learning_rate": 6.583333333333334e-05, + "loss": 2.4383, + "step": 456 + }, + { + "epoch": 1.0386363636363636, + "grad_norm": 9.279253959655762, + "learning_rate": 6.575757575757576e-05, + "loss": 0.9629, + "step": 457 + }, + { + "epoch": 1.040909090909091, + "grad_norm": 11.864253997802734, + "learning_rate": 6.568181818181819e-05, + "loss": 2.1734, + "step": 458 + }, + { + "epoch": 1.0431818181818182, + "grad_norm": 13.346138954162598, + "learning_rate": 6.560606060606061e-05, + "loss": 1.4337, + "step": 459 + }, + { + "epoch": 1.0454545454545454, + "grad_norm": 8.396434783935547, + "learning_rate": 6.553030303030303e-05, + "loss": 1.54, + "step": 460 + }, + { + "epoch": 1.0477272727272728, + "grad_norm": 9.705253601074219, + "learning_rate": 6.545454545454546e-05, + "loss": 1.9016, + "step": 461 + }, + { + "epoch": 1.05, + "grad_norm": 9.6156005859375, + "learning_rate": 6.537878787878788e-05, + "loss": 1.3029, + "step": 462 + }, + { + "epoch": 1.0522727272727272, + "grad_norm": 16.548994064331055, + "learning_rate": 6.530303030303032e-05, + "loss": 3.5641, + "step": 463 + }, + { + "epoch": 1.0545454545454545, + "grad_norm": 11.045211791992188, + "learning_rate": 6.522727272727272e-05, + "loss": 1.3876, + "step": 464 + }, + { + "epoch": 1.0568181818181819, + "grad_norm": 10.465343475341797, + "learning_rate": 6.515151515151516e-05, + "loss": 1.5871, + "step": 465 + }, + { + "epoch": 1.059090909090909, + "grad_norm": 10.053452491760254, + "learning_rate": 6.507575757575757e-05, + "loss": 1.4177, + "step": 466 + }, + { + "epoch": 1.0613636363636363, + "grad_norm": 12.043208122253418, + "learning_rate": 6.500000000000001e-05, + "loss": 1.5364, + "step": 467 + }, + { + "epoch": 1.0636363636363637, + "grad_norm": 11.853958129882812, + "learning_rate": 6.492424242424243e-05, + "loss": 1.3952, + "step": 468 + }, + { + "epoch": 1.065909090909091, + "grad_norm": 8.25589656829834, + "learning_rate": 6.484848484848485e-05, + "loss": 1.5497, + "step": 469 + }, + { + "epoch": 1.0681818181818181, + "grad_norm": 13.430974960327148, + "learning_rate": 6.477272727272728e-05, + "loss": 2.4184, + "step": 470 + }, + { + "epoch": 1.0704545454545455, + "grad_norm": 10.576482772827148, + "learning_rate": 6.46969696969697e-05, + "loss": 1.4223, + "step": 471 + }, + { + "epoch": 1.0727272727272728, + "grad_norm": 11.786113739013672, + "learning_rate": 6.462121212121213e-05, + "loss": 2.0499, + "step": 472 + }, + { + "epoch": 1.075, + "grad_norm": 12.00688362121582, + "learning_rate": 6.454545454545455e-05, + "loss": 2.9764, + "step": 473 + }, + { + "epoch": 1.0772727272727272, + "grad_norm": 10.834086418151855, + "learning_rate": 6.446969696969697e-05, + "loss": 2.0765, + "step": 474 + }, + { + "epoch": 1.0795454545454546, + "grad_norm": 10.710877418518066, + "learning_rate": 6.439393939393939e-05, + "loss": 1.4314, + "step": 475 + }, + { + "epoch": 1.0818181818181818, + "grad_norm": 12.800888061523438, + "learning_rate": 6.431818181818182e-05, + "loss": 1.4847, + "step": 476 + }, + { + "epoch": 1.084090909090909, + "grad_norm": 10.365299224853516, + "learning_rate": 6.424242424242424e-05, + "loss": 1.6775, + "step": 477 + }, + { + "epoch": 1.0863636363636364, + "grad_norm": 10.344579696655273, + "learning_rate": 6.416666666666668e-05, + "loss": 2.3473, + "step": 478 + }, + { + "epoch": 1.0886363636363636, + "grad_norm": 13.791784286499023, + "learning_rate": 6.40909090909091e-05, + "loss": 2.5763, + "step": 479 + }, + { + "epoch": 1.0909090909090908, + "grad_norm": 13.133481979370117, + "learning_rate": 6.401515151515152e-05, + "loss": 1.7025, + "step": 480 + }, + { + "epoch": 1.0931818181818183, + "grad_norm": 13.444737434387207, + "learning_rate": 6.393939393939395e-05, + "loss": 2.341, + "step": 481 + }, + { + "epoch": 1.0954545454545455, + "grad_norm": 15.245584487915039, + "learning_rate": 6.386363636363637e-05, + "loss": 1.929, + "step": 482 + }, + { + "epoch": 1.0977272727272727, + "grad_norm": 10.724458694458008, + "learning_rate": 6.37878787878788e-05, + "loss": 1.4099, + "step": 483 + }, + { + "epoch": 1.1, + "grad_norm": 11.243814468383789, + "learning_rate": 6.371212121212121e-05, + "loss": 1.5886, + "step": 484 + }, + { + "epoch": 1.1022727272727273, + "grad_norm": 11.731426239013672, + "learning_rate": 6.363636363636364e-05, + "loss": 1.9571, + "step": 485 + }, + { + "epoch": 1.1045454545454545, + "grad_norm": 10.820639610290527, + "learning_rate": 6.356060606060606e-05, + "loss": 1.113, + "step": 486 + }, + { + "epoch": 1.106818181818182, + "grad_norm": 14.63482666015625, + "learning_rate": 6.34848484848485e-05, + "loss": 1.9765, + "step": 487 + }, + { + "epoch": 1.1090909090909091, + "grad_norm": 12.746257781982422, + "learning_rate": 6.340909090909091e-05, + "loss": 1.5906, + "step": 488 + }, + { + "epoch": 1.1113636363636363, + "grad_norm": 14.916450500488281, + "learning_rate": 6.333333333333333e-05, + "loss": 1.6616, + "step": 489 + }, + { + "epoch": 1.1136363636363635, + "grad_norm": 11.509872436523438, + "learning_rate": 6.325757575757577e-05, + "loss": 2.5105, + "step": 490 + }, + { + "epoch": 1.115909090909091, + "grad_norm": 11.517654418945312, + "learning_rate": 6.318181818181818e-05, + "loss": 1.3542, + "step": 491 + }, + { + "epoch": 1.1181818181818182, + "grad_norm": 13.984039306640625, + "learning_rate": 6.310606060606062e-05, + "loss": 2.1356, + "step": 492 + }, + { + "epoch": 1.1204545454545454, + "grad_norm": 13.018148422241211, + "learning_rate": 6.303030303030302e-05, + "loss": 1.5024, + "step": 493 + }, + { + "epoch": 1.1227272727272728, + "grad_norm": 13.609540939331055, + "learning_rate": 6.295454545454546e-05, + "loss": 2.1359, + "step": 494 + }, + { + "epoch": 1.125, + "grad_norm": 13.505942344665527, + "learning_rate": 6.287878787878788e-05, + "loss": 2.8486, + "step": 495 + }, + { + "epoch": 1.1272727272727272, + "grad_norm": 11.420187950134277, + "learning_rate": 6.280303030303031e-05, + "loss": 1.5044, + "step": 496 + }, + { + "epoch": 1.1295454545454546, + "grad_norm": 14.127695083618164, + "learning_rate": 6.272727272727273e-05, + "loss": 2.6676, + "step": 497 + }, + { + "epoch": 1.1318181818181818, + "grad_norm": 9.813878059387207, + "learning_rate": 6.265151515151515e-05, + "loss": 1.4169, + "step": 498 + }, + { + "epoch": 1.134090909090909, + "grad_norm": 9.80479621887207, + "learning_rate": 6.257575757575758e-05, + "loss": 1.5349, + "step": 499 + }, + { + "epoch": 1.1363636363636362, + "grad_norm": 10.739019393920898, + "learning_rate": 6.25e-05, + "loss": 1.5255, + "step": 500 + }, + { + "epoch": 1.1386363636363637, + "grad_norm": 11.327676773071289, + "learning_rate": 6.242424242424243e-05, + "loss": 1.3854, + "step": 501 + }, + { + "epoch": 1.1409090909090909, + "grad_norm": 9.645312309265137, + "learning_rate": 6.234848484848485e-05, + "loss": 1.6148, + "step": 502 + }, + { + "epoch": 1.143181818181818, + "grad_norm": 12.285623550415039, + "learning_rate": 6.227272727272727e-05, + "loss": 1.9336, + "step": 503 + }, + { + "epoch": 1.1454545454545455, + "grad_norm": 15.579854011535645, + "learning_rate": 6.219696969696969e-05, + "loss": 2.1064, + "step": 504 + }, + { + "epoch": 1.1477272727272727, + "grad_norm": 17.76817512512207, + "learning_rate": 6.212121212121213e-05, + "loss": 1.4266, + "step": 505 + }, + { + "epoch": 1.15, + "grad_norm": 10.037004470825195, + "learning_rate": 6.204545454545455e-05, + "loss": 1.5432, + "step": 506 + }, + { + "epoch": 1.1522727272727273, + "grad_norm": 10.46380615234375, + "learning_rate": 6.196969696969698e-05, + "loss": 2.1057, + "step": 507 + }, + { + "epoch": 1.1545454545454545, + "grad_norm": 12.883086204528809, + "learning_rate": 6.18939393939394e-05, + "loss": 2.1955, + "step": 508 + }, + { + "epoch": 1.1568181818181817, + "grad_norm": 10.667054176330566, + "learning_rate": 6.181818181818182e-05, + "loss": 1.8041, + "step": 509 + }, + { + "epoch": 1.1590909090909092, + "grad_norm": 13.076772689819336, + "learning_rate": 6.174242424242425e-05, + "loss": 1.9923, + "step": 510 + }, + { + "epoch": 1.1613636363636364, + "grad_norm": 13.195068359375, + "learning_rate": 6.166666666666667e-05, + "loss": 2.2575, + "step": 511 + }, + { + "epoch": 1.1636363636363636, + "grad_norm": 25.86856460571289, + "learning_rate": 6.15909090909091e-05, + "loss": 0.9713, + "step": 512 + }, + { + "epoch": 1.165909090909091, + "grad_norm": 13.29697322845459, + "learning_rate": 6.151515151515151e-05, + "loss": 1.9724, + "step": 513 + }, + { + "epoch": 1.1681818181818182, + "grad_norm": 11.164151191711426, + "learning_rate": 6.143939393939394e-05, + "loss": 1.7574, + "step": 514 + }, + { + "epoch": 1.1704545454545454, + "grad_norm": 11.621664047241211, + "learning_rate": 6.136363636363636e-05, + "loss": 2.0349, + "step": 515 + }, + { + "epoch": 1.1727272727272728, + "grad_norm": 13.135611534118652, + "learning_rate": 6.12878787878788e-05, + "loss": 2.1065, + "step": 516 + }, + { + "epoch": 1.175, + "grad_norm": 13.730208396911621, + "learning_rate": 6.121212121212121e-05, + "loss": 2.2205, + "step": 517 + }, + { + "epoch": 1.1772727272727272, + "grad_norm": 11.453598022460938, + "learning_rate": 6.113636363636363e-05, + "loss": 2.2924, + "step": 518 + }, + { + "epoch": 1.1795454545454545, + "grad_norm": 10.924808502197266, + "learning_rate": 6.106060606060607e-05, + "loss": 1.2283, + "step": 519 + }, + { + "epoch": 1.1818181818181819, + "grad_norm": 16.08315658569336, + "learning_rate": 6.098484848484849e-05, + "loss": 2.5927, + "step": 520 + }, + { + "epoch": 1.184090909090909, + "grad_norm": 8.260347366333008, + "learning_rate": 6.090909090909091e-05, + "loss": 1.3534, + "step": 521 + }, + { + "epoch": 1.1863636363636363, + "grad_norm": 12.075833320617676, + "learning_rate": 6.083333333333333e-05, + "loss": 2.0813, + "step": 522 + }, + { + "epoch": 1.1886363636363637, + "grad_norm": 10.575677871704102, + "learning_rate": 6.075757575757576e-05, + "loss": 1.4781, + "step": 523 + }, + { + "epoch": 1.190909090909091, + "grad_norm": 12.236503601074219, + "learning_rate": 6.0681818181818185e-05, + "loss": 2.003, + "step": 524 + }, + { + "epoch": 1.1931818181818181, + "grad_norm": 12.172025680541992, + "learning_rate": 6.060606060606061e-05, + "loss": 1.4951, + "step": 525 + }, + { + "epoch": 1.1954545454545455, + "grad_norm": 12.456896781921387, + "learning_rate": 6.053030303030304e-05, + "loss": 1.8737, + "step": 526 + }, + { + "epoch": 1.1977272727272728, + "grad_norm": 13.824838638305664, + "learning_rate": 6.045454545454545e-05, + "loss": 1.7923, + "step": 527 + }, + { + "epoch": 1.2, + "grad_norm": 10.863786697387695, + "learning_rate": 6.037878787878788e-05, + "loss": 2.144, + "step": 528 + }, + { + "epoch": 1.2022727272727272, + "grad_norm": 17.319700241088867, + "learning_rate": 6.03030303030303e-05, + "loss": 1.9297, + "step": 529 + }, + { + "epoch": 1.2045454545454546, + "grad_norm": 8.89411449432373, + "learning_rate": 6.022727272727273e-05, + "loss": 1.3583, + "step": 530 + }, + { + "epoch": 1.2068181818181818, + "grad_norm": 16.971437454223633, + "learning_rate": 6.0151515151515156e-05, + "loss": 2.5184, + "step": 531 + }, + { + "epoch": 1.209090909090909, + "grad_norm": 11.486995697021484, + "learning_rate": 6.0075757575757575e-05, + "loss": 1.5296, + "step": 532 + }, + { + "epoch": 1.2113636363636364, + "grad_norm": 17.541278839111328, + "learning_rate": 6e-05, + "loss": 2.108, + "step": 533 + }, + { + "epoch": 1.2136363636363636, + "grad_norm": 13.599751472473145, + "learning_rate": 5.992424242424243e-05, + "loss": 2.0622, + "step": 534 + }, + { + "epoch": 1.2159090909090908, + "grad_norm": 10.884852409362793, + "learning_rate": 5.9848484848484854e-05, + "loss": 1.5018, + "step": 535 + }, + { + "epoch": 1.2181818181818183, + "grad_norm": 10.407668113708496, + "learning_rate": 5.977272727272728e-05, + "loss": 1.5013, + "step": 536 + }, + { + "epoch": 1.2204545454545455, + "grad_norm": 9.911277770996094, + "learning_rate": 5.969696969696969e-05, + "loss": 1.9855, + "step": 537 + }, + { + "epoch": 1.2227272727272727, + "grad_norm": 11.939435958862305, + "learning_rate": 5.962121212121212e-05, + "loss": 2.359, + "step": 538 + }, + { + "epoch": 1.225, + "grad_norm": 11.17503547668457, + "learning_rate": 5.9545454545454546e-05, + "loss": 1.4952, + "step": 539 + }, + { + "epoch": 1.2272727272727273, + "grad_norm": 15.073485374450684, + "learning_rate": 5.946969696969697e-05, + "loss": 2.1802, + "step": 540 + }, + { + "epoch": 1.2295454545454545, + "grad_norm": 12.413151741027832, + "learning_rate": 5.93939393939394e-05, + "loss": 1.9444, + "step": 541 + }, + { + "epoch": 1.231818181818182, + "grad_norm": 12.741022109985352, + "learning_rate": 5.931818181818182e-05, + "loss": 1.894, + "step": 542 + }, + { + "epoch": 1.2340909090909091, + "grad_norm": 11.041027069091797, + "learning_rate": 5.9242424242424244e-05, + "loss": 1.748, + "step": 543 + }, + { + "epoch": 1.2363636363636363, + "grad_norm": 10.045198440551758, + "learning_rate": 5.916666666666667e-05, + "loss": 1.7848, + "step": 544 + }, + { + "epoch": 1.2386363636363638, + "grad_norm": 10.759014129638672, + "learning_rate": 5.90909090909091e-05, + "loss": 1.7836, + "step": 545 + }, + { + "epoch": 1.240909090909091, + "grad_norm": 10.296431541442871, + "learning_rate": 5.901515151515152e-05, + "loss": 1.088, + "step": 546 + }, + { + "epoch": 1.2431818181818182, + "grad_norm": 11.159008026123047, + "learning_rate": 5.8939393939393936e-05, + "loss": 1.3126, + "step": 547 + }, + { + "epoch": 1.2454545454545454, + "grad_norm": 7.6021270751953125, + "learning_rate": 5.886363636363636e-05, + "loss": 1.137, + "step": 548 + }, + { + "epoch": 1.2477272727272728, + "grad_norm": 11.449591636657715, + "learning_rate": 5.878787878787879e-05, + "loss": 1.7471, + "step": 549 + }, + { + "epoch": 1.25, + "grad_norm": 14.451662063598633, + "learning_rate": 5.871212121212122e-05, + "loss": 2.014, + "step": 550 + }, + { + "epoch": 1.2522727272727272, + "grad_norm": 11.24593448638916, + "learning_rate": 5.8636363636363634e-05, + "loss": 1.5885, + "step": 551 + }, + { + "epoch": 1.2545454545454544, + "grad_norm": 10.326696395874023, + "learning_rate": 5.856060606060606e-05, + "loss": 1.5146, + "step": 552 + }, + { + "epoch": 1.2568181818181818, + "grad_norm": 11.736088752746582, + "learning_rate": 5.848484848484849e-05, + "loss": 2.1627, + "step": 553 + }, + { + "epoch": 1.259090909090909, + "grad_norm": 14.25733757019043, + "learning_rate": 5.840909090909091e-05, + "loss": 1.8419, + "step": 554 + }, + { + "epoch": 1.2613636363636362, + "grad_norm": 10.154618263244629, + "learning_rate": 5.833333333333334e-05, + "loss": 1.8319, + "step": 555 + }, + { + "epoch": 1.2636363636363637, + "grad_norm": 14.464015007019043, + "learning_rate": 5.825757575757575e-05, + "loss": 1.7117, + "step": 556 + }, + { + "epoch": 1.2659090909090909, + "grad_norm": 9.713830947875977, + "learning_rate": 5.818181818181818e-05, + "loss": 1.4495, + "step": 557 + }, + { + "epoch": 1.268181818181818, + "grad_norm": 21.958648681640625, + "learning_rate": 5.810606060606061e-05, + "loss": 3.0762, + "step": 558 + }, + { + "epoch": 1.2704545454545455, + "grad_norm": 11.349808692932129, + "learning_rate": 5.803030303030304e-05, + "loss": 1.9419, + "step": 559 + }, + { + "epoch": 1.2727272727272727, + "grad_norm": 12.586771965026855, + "learning_rate": 5.7954545454545464e-05, + "loss": 2.1826, + "step": 560 + }, + { + "epoch": 1.275, + "grad_norm": 10.261626243591309, + "learning_rate": 5.787878787878788e-05, + "loss": 2.0422, + "step": 561 + }, + { + "epoch": 1.2772727272727273, + "grad_norm": 11.65180492401123, + "learning_rate": 5.78030303030303e-05, + "loss": 1.5295, + "step": 562 + }, + { + "epoch": 1.2795454545454545, + "grad_norm": 12.369877815246582, + "learning_rate": 5.772727272727273e-05, + "loss": 1.8935, + "step": 563 + }, + { + "epoch": 1.2818181818181817, + "grad_norm": 10.670714378356934, + "learning_rate": 5.7651515151515156e-05, + "loss": 2.0215, + "step": 564 + }, + { + "epoch": 1.2840909090909092, + "grad_norm": 13.76659870147705, + "learning_rate": 5.757575757575758e-05, + "loss": 2.2982, + "step": 565 + }, + { + "epoch": 1.2863636363636364, + "grad_norm": 9.004195213317871, + "learning_rate": 5.7499999999999995e-05, + "loss": 1.6066, + "step": 566 + }, + { + "epoch": 1.2886363636363636, + "grad_norm": 10.873322486877441, + "learning_rate": 5.742424242424243e-05, + "loss": 1.5999, + "step": 567 + }, + { + "epoch": 1.290909090909091, + "grad_norm": 11.641073226928711, + "learning_rate": 5.7348484848484854e-05, + "loss": 1.3272, + "step": 568 + }, + { + "epoch": 1.2931818181818182, + "grad_norm": 9.68420124053955, + "learning_rate": 5.727272727272728e-05, + "loss": 1.413, + "step": 569 + }, + { + "epoch": 1.2954545454545454, + "grad_norm": 13.477838516235352, + "learning_rate": 5.719696969696971e-05, + "loss": 2.5129, + "step": 570 + }, + { + "epoch": 1.2977272727272728, + "grad_norm": 11.720010757446289, + "learning_rate": 5.712121212121212e-05, + "loss": 2.1576, + "step": 571 + }, + { + "epoch": 1.3, + "grad_norm": 13.136527061462402, + "learning_rate": 5.7045454545454546e-05, + "loss": 1.9311, + "step": 572 + }, + { + "epoch": 1.3022727272727272, + "grad_norm": 8.095415115356445, + "learning_rate": 5.696969696969697e-05, + "loss": 0.8927, + "step": 573 + }, + { + "epoch": 1.3045454545454547, + "grad_norm": 11.233893394470215, + "learning_rate": 5.68939393939394e-05, + "loss": 2.0108, + "step": 574 + }, + { + "epoch": 1.3068181818181819, + "grad_norm": 11.203099250793457, + "learning_rate": 5.6818181818181825e-05, + "loss": 1.9241, + "step": 575 + }, + { + "epoch": 1.309090909090909, + "grad_norm": 9.640209197998047, + "learning_rate": 5.6742424242424244e-05, + "loss": 1.3841, + "step": 576 + }, + { + "epoch": 1.3113636363636363, + "grad_norm": 10.882938385009766, + "learning_rate": 5.666666666666667e-05, + "loss": 1.4184, + "step": 577 + }, + { + "epoch": 1.3136363636363637, + "grad_norm": 10.470818519592285, + "learning_rate": 5.65909090909091e-05, + "loss": 2.0096, + "step": 578 + }, + { + "epoch": 1.315909090909091, + "grad_norm": 12.759695053100586, + "learning_rate": 5.651515151515152e-05, + "loss": 2.138, + "step": 579 + }, + { + "epoch": 1.3181818181818181, + "grad_norm": 26.707128524780273, + "learning_rate": 5.643939393939395e-05, + "loss": 2.8215, + "step": 580 + }, + { + "epoch": 1.3204545454545453, + "grad_norm": 11.116402626037598, + "learning_rate": 5.636363636363636e-05, + "loss": 2.4158, + "step": 581 + }, + { + "epoch": 1.3227272727272728, + "grad_norm": 14.136595726013184, + "learning_rate": 5.628787878787879e-05, + "loss": 1.6545, + "step": 582 + }, + { + "epoch": 1.325, + "grad_norm": 11.88375473022461, + "learning_rate": 5.6212121212121215e-05, + "loss": 2.1069, + "step": 583 + }, + { + "epoch": 1.3272727272727272, + "grad_norm": 11.863356590270996, + "learning_rate": 5.613636363636364e-05, + "loss": 1.535, + "step": 584 + }, + { + "epoch": 1.3295454545454546, + "grad_norm": 11.284381866455078, + "learning_rate": 5.606060606060606e-05, + "loss": 2.3407, + "step": 585 + }, + { + "epoch": 1.3318181818181818, + "grad_norm": 11.79831600189209, + "learning_rate": 5.598484848484849e-05, + "loss": 1.6409, + "step": 586 + }, + { + "epoch": 1.334090909090909, + "grad_norm": 11.130000114440918, + "learning_rate": 5.5909090909090913e-05, + "loss": 1.6426, + "step": 587 + }, + { + "epoch": 1.3363636363636364, + "grad_norm": 9.1551513671875, + "learning_rate": 5.583333333333334e-05, + "loss": 1.8466, + "step": 588 + }, + { + "epoch": 1.3386363636363636, + "grad_norm": 14.405865669250488, + "learning_rate": 5.5757575757575766e-05, + "loss": 2.066, + "step": 589 + }, + { + "epoch": 1.3409090909090908, + "grad_norm": 53.46037673950195, + "learning_rate": 5.568181818181818e-05, + "loss": 2.4224, + "step": 590 + }, + { + "epoch": 1.3431818181818183, + "grad_norm": 11.6724271774292, + "learning_rate": 5.5606060606060605e-05, + "loss": 1.7148, + "step": 591 + }, + { + "epoch": 1.3454545454545455, + "grad_norm": 15.849516868591309, + "learning_rate": 5.553030303030303e-05, + "loss": 2.0981, + "step": 592 + }, + { + "epoch": 1.3477272727272727, + "grad_norm": 13.421188354492188, + "learning_rate": 5.545454545454546e-05, + "loss": 1.722, + "step": 593 + }, + { + "epoch": 1.35, + "grad_norm": 14.319283485412598, + "learning_rate": 5.5378787878787884e-05, + "loss": 1.7284, + "step": 594 + }, + { + "epoch": 1.3522727272727273, + "grad_norm": 12.210022926330566, + "learning_rate": 5.5303030303030304e-05, + "loss": 1.4507, + "step": 595 + }, + { + "epoch": 1.3545454545454545, + "grad_norm": 11.60317325592041, + "learning_rate": 5.522727272727273e-05, + "loss": 1.7749, + "step": 596 + }, + { + "epoch": 1.356818181818182, + "grad_norm": 12.895737648010254, + "learning_rate": 5.5151515151515156e-05, + "loss": 1.5555, + "step": 597 + }, + { + "epoch": 1.3590909090909091, + "grad_norm": 11.198805809020996, + "learning_rate": 5.507575757575758e-05, + "loss": 1.7624, + "step": 598 + }, + { + "epoch": 1.3613636363636363, + "grad_norm": 13.309189796447754, + "learning_rate": 5.500000000000001e-05, + "loss": 1.8765, + "step": 599 + }, + { + "epoch": 1.3636363636363638, + "grad_norm": 10.177202224731445, + "learning_rate": 5.492424242424242e-05, + "loss": 1.1895, + "step": 600 + }, + { + "epoch": 1.365909090909091, + "grad_norm": 11.205484390258789, + "learning_rate": 5.484848484848485e-05, + "loss": 1.1661, + "step": 601 + }, + { + "epoch": 1.3681818181818182, + "grad_norm": 12.091497421264648, + "learning_rate": 5.4772727272727274e-05, + "loss": 1.9972, + "step": 602 + }, + { + "epoch": 1.3704545454545456, + "grad_norm": 11.2894926071167, + "learning_rate": 5.46969696969697e-05, + "loss": 1.7121, + "step": 603 + }, + { + "epoch": 1.3727272727272728, + "grad_norm": 15.034446716308594, + "learning_rate": 5.462121212121213e-05, + "loss": 2.8078, + "step": 604 + }, + { + "epoch": 1.375, + "grad_norm": 8.075346946716309, + "learning_rate": 5.4545454545454546e-05, + "loss": 1.0453, + "step": 605 + }, + { + "epoch": 1.3772727272727272, + "grad_norm": 10.377656936645508, + "learning_rate": 5.446969696969697e-05, + "loss": 1.7973, + "step": 606 + }, + { + "epoch": 1.3795454545454544, + "grad_norm": 10.147284507751465, + "learning_rate": 5.43939393939394e-05, + "loss": 2.1848, + "step": 607 + }, + { + "epoch": 1.3818181818181818, + "grad_norm": 11.856623649597168, + "learning_rate": 5.4318181818181825e-05, + "loss": 1.9857, + "step": 608 + }, + { + "epoch": 1.384090909090909, + "grad_norm": 10.355262756347656, + "learning_rate": 5.424242424242425e-05, + "loss": 1.4383, + "step": 609 + }, + { + "epoch": 1.3863636363636362, + "grad_norm": 9.085455894470215, + "learning_rate": 5.4166666666666664e-05, + "loss": 1.382, + "step": 610 + }, + { + "epoch": 1.3886363636363637, + "grad_norm": 13.221922874450684, + "learning_rate": 5.409090909090909e-05, + "loss": 2.3278, + "step": 611 + }, + { + "epoch": 1.3909090909090909, + "grad_norm": 14.725556373596191, + "learning_rate": 5.401515151515152e-05, + "loss": 2.0181, + "step": 612 + }, + { + "epoch": 1.393181818181818, + "grad_norm": 11.90503978729248, + "learning_rate": 5.393939393939394e-05, + "loss": 2.5601, + "step": 613 + }, + { + "epoch": 1.3954545454545455, + "grad_norm": 10.583837509155273, + "learning_rate": 5.386363636363637e-05, + "loss": 1.4886, + "step": 614 + }, + { + "epoch": 1.3977272727272727, + "grad_norm": 12.369796752929688, + "learning_rate": 5.378787878787879e-05, + "loss": 1.2716, + "step": 615 + }, + { + "epoch": 1.4, + "grad_norm": 12.412566184997559, + "learning_rate": 5.3712121212121215e-05, + "loss": 2.0391, + "step": 616 + }, + { + "epoch": 1.4022727272727273, + "grad_norm": 12.033483505249023, + "learning_rate": 5.363636363636364e-05, + "loss": 1.2044, + "step": 617 + }, + { + "epoch": 1.4045454545454545, + "grad_norm": 11.291866302490234, + "learning_rate": 5.356060606060607e-05, + "loss": 2.3266, + "step": 618 + }, + { + "epoch": 1.4068181818181817, + "grad_norm": 17.745227813720703, + "learning_rate": 5.348484848484848e-05, + "loss": 1.7097, + "step": 619 + }, + { + "epoch": 1.4090909090909092, + "grad_norm": 11.858403205871582, + "learning_rate": 5.340909090909091e-05, + "loss": 1.9088, + "step": 620 + }, + { + "epoch": 1.4113636363636364, + "grad_norm": 14.968146324157715, + "learning_rate": 5.333333333333333e-05, + "loss": 2.009, + "step": 621 + }, + { + "epoch": 1.4136363636363636, + "grad_norm": 13.16178035736084, + "learning_rate": 5.325757575757576e-05, + "loss": 1.6262, + "step": 622 + }, + { + "epoch": 1.415909090909091, + "grad_norm": 11.63772201538086, + "learning_rate": 5.3181818181818186e-05, + "loss": 1.481, + "step": 623 + }, + { + "epoch": 1.4181818181818182, + "grad_norm": 13.266715049743652, + "learning_rate": 5.3106060606060605e-05, + "loss": 2.3015, + "step": 624 + }, + { + "epoch": 1.4204545454545454, + "grad_norm": 11.690614700317383, + "learning_rate": 5.303030303030303e-05, + "loss": 1.7226, + "step": 625 + }, + { + "epoch": 1.4227272727272728, + "grad_norm": 10.599973678588867, + "learning_rate": 5.295454545454546e-05, + "loss": 1.0261, + "step": 626 + }, + { + "epoch": 1.425, + "grad_norm": 17.117259979248047, + "learning_rate": 5.2878787878787884e-05, + "loss": 1.7164, + "step": 627 + }, + { + "epoch": 1.4272727272727272, + "grad_norm": 11.62483024597168, + "learning_rate": 5.280303030303031e-05, + "loss": 1.3686, + "step": 628 + }, + { + "epoch": 1.4295454545454547, + "grad_norm": 10.503996849060059, + "learning_rate": 5.272727272727272e-05, + "loss": 1.6085, + "step": 629 + }, + { + "epoch": 1.4318181818181819, + "grad_norm": 14.493663787841797, + "learning_rate": 5.265151515151515e-05, + "loss": 2.0943, + "step": 630 + }, + { + "epoch": 1.434090909090909, + "grad_norm": 11.125360488891602, + "learning_rate": 5.2575757575757576e-05, + "loss": 1.8284, + "step": 631 + }, + { + "epoch": 1.4363636363636363, + "grad_norm": 10.438358306884766, + "learning_rate": 5.25e-05, + "loss": 2.1436, + "step": 632 + }, + { + "epoch": 1.4386363636363637, + "grad_norm": 13.013614654541016, + "learning_rate": 5.242424242424243e-05, + "loss": 1.6999, + "step": 633 + }, + { + "epoch": 1.440909090909091, + "grad_norm": 14.21478271484375, + "learning_rate": 5.234848484848485e-05, + "loss": 3.268, + "step": 634 + }, + { + "epoch": 1.4431818181818181, + "grad_norm": 10.756131172180176, + "learning_rate": 5.2272727272727274e-05, + "loss": 1.1294, + "step": 635 + }, + { + "epoch": 1.4454545454545453, + "grad_norm": 14.409692764282227, + "learning_rate": 5.21969696969697e-05, + "loss": 1.391, + "step": 636 + }, + { + "epoch": 1.4477272727272728, + "grad_norm": 9.839500427246094, + "learning_rate": 5.212121212121213e-05, + "loss": 1.4028, + "step": 637 + }, + { + "epoch": 1.45, + "grad_norm": 13.601579666137695, + "learning_rate": 5.204545454545455e-05, + "loss": 1.6384, + "step": 638 + }, + { + "epoch": 1.4522727272727272, + "grad_norm": 12.721500396728516, + "learning_rate": 5.1969696969696966e-05, + "loss": 1.9382, + "step": 639 + }, + { + "epoch": 1.4545454545454546, + "grad_norm": 11.373588562011719, + "learning_rate": 5.189393939393939e-05, + "loss": 2.7324, + "step": 640 + }, + { + "epoch": 1.4568181818181818, + "grad_norm": 11.873559951782227, + "learning_rate": 5.181818181818182e-05, + "loss": 1.6583, + "step": 641 + }, + { + "epoch": 1.459090909090909, + "grad_norm": 10.649148941040039, + "learning_rate": 5.1742424242424245e-05, + "loss": 1.7733, + "step": 642 + }, + { + "epoch": 1.4613636363636364, + "grad_norm": 12.14698314666748, + "learning_rate": 5.166666666666667e-05, + "loss": 1.6434, + "step": 643 + }, + { + "epoch": 1.4636363636363636, + "grad_norm": 9.80806827545166, + "learning_rate": 5.159090909090909e-05, + "loss": 1.9463, + "step": 644 + }, + { + "epoch": 1.4659090909090908, + "grad_norm": 7.273732662200928, + "learning_rate": 5.151515151515152e-05, + "loss": 0.8156, + "step": 645 + }, + { + "epoch": 1.4681818181818183, + "grad_norm": 12.560272216796875, + "learning_rate": 5.143939393939394e-05, + "loss": 2.2347, + "step": 646 + }, + { + "epoch": 1.4704545454545455, + "grad_norm": 10.116893768310547, + "learning_rate": 5.136363636363637e-05, + "loss": 1.2157, + "step": 647 + }, + { + "epoch": 1.4727272727272727, + "grad_norm": 11.09861946105957, + "learning_rate": 5.1287878787878796e-05, + "loss": 1.2521, + "step": 648 + }, + { + "epoch": 1.475, + "grad_norm": 11.454336166381836, + "learning_rate": 5.121212121212121e-05, + "loss": 1.6148, + "step": 649 + }, + { + "epoch": 1.4772727272727273, + "grad_norm": 11.669930458068848, + "learning_rate": 5.1136363636363635e-05, + "loss": 2.4559, + "step": 650 + }, + { + "epoch": 1.4795454545454545, + "grad_norm": 10.853449821472168, + "learning_rate": 5.106060606060606e-05, + "loss": 1.6519, + "step": 651 + }, + { + "epoch": 1.481818181818182, + "grad_norm": 23.87467384338379, + "learning_rate": 5.098484848484849e-05, + "loss": 3.9198, + "step": 652 + }, + { + "epoch": 1.4840909090909091, + "grad_norm": 15.731586456298828, + "learning_rate": 5.090909090909091e-05, + "loss": 2.4425, + "step": 653 + }, + { + "epoch": 1.4863636363636363, + "grad_norm": 10.91791820526123, + "learning_rate": 5.0833333333333333e-05, + "loss": 1.4977, + "step": 654 + }, + { + "epoch": 1.4886363636363638, + "grad_norm": 11.515501022338867, + "learning_rate": 5.075757575757576e-05, + "loss": 1.4377, + "step": 655 + }, + { + "epoch": 1.490909090909091, + "grad_norm": 9.79021167755127, + "learning_rate": 5.0681818181818186e-05, + "loss": 1.208, + "step": 656 + }, + { + "epoch": 1.4931818181818182, + "grad_norm": 7.424502849578857, + "learning_rate": 5.060606060606061e-05, + "loss": 1.368, + "step": 657 + }, + { + "epoch": 1.4954545454545456, + "grad_norm": 9.132887840270996, + "learning_rate": 5.0530303030303025e-05, + "loss": 1.0296, + "step": 658 + }, + { + "epoch": 1.4977272727272728, + "grad_norm": 14.063539505004883, + "learning_rate": 5.045454545454545e-05, + "loss": 1.9923, + "step": 659 + }, + { + "epoch": 1.5, + "grad_norm": 10.994144439697266, + "learning_rate": 5.037878787878788e-05, + "loss": 1.5963, + "step": 660 + }, + { + "epoch": 1.5022727272727274, + "grad_norm": 11.193540573120117, + "learning_rate": 5.030303030303031e-05, + "loss": 2.6418, + "step": 661 + }, + { + "epoch": 1.5045454545454544, + "grad_norm": 11.344916343688965, + "learning_rate": 5.022727272727274e-05, + "loss": 0.9847, + "step": 662 + }, + { + "epoch": 1.5068181818181818, + "grad_norm": 16.028928756713867, + "learning_rate": 5.015151515151515e-05, + "loss": 2.7095, + "step": 663 + }, + { + "epoch": 1.509090909090909, + "grad_norm": 10.2492036819458, + "learning_rate": 5.0075757575757576e-05, + "loss": 1.4351, + "step": 664 + }, + { + "epoch": 1.5113636363636362, + "grad_norm": 12.819211959838867, + "learning_rate": 5e-05, + "loss": 2.2236, + "step": 665 + }, + { + "epoch": 1.5136363636363637, + "grad_norm": 9.43850326538086, + "learning_rate": 4.992424242424243e-05, + "loss": 0.988, + "step": 666 + }, + { + "epoch": 1.5159090909090909, + "grad_norm": 12.35922622680664, + "learning_rate": 4.984848484848485e-05, + "loss": 1.9395, + "step": 667 + }, + { + "epoch": 1.518181818181818, + "grad_norm": 12.175325393676758, + "learning_rate": 4.9772727272727275e-05, + "loss": 2.0219, + "step": 668 + }, + { + "epoch": 1.5204545454545455, + "grad_norm": 16.44111442565918, + "learning_rate": 4.9696969696969694e-05, + "loss": 1.7191, + "step": 669 + }, + { + "epoch": 1.5227272727272727, + "grad_norm": 12.413610458374023, + "learning_rate": 4.962121212121213e-05, + "loss": 2.2003, + "step": 670 + }, + { + "epoch": 1.525, + "grad_norm": 7.922098159790039, + "learning_rate": 4.9545454545454553e-05, + "loss": 1.1514, + "step": 671 + }, + { + "epoch": 1.5272727272727273, + "grad_norm": 11.402259826660156, + "learning_rate": 4.946969696969697e-05, + "loss": 1.6611, + "step": 672 + }, + { + "epoch": 1.5295454545454545, + "grad_norm": 10.548962593078613, + "learning_rate": 4.93939393939394e-05, + "loss": 1.6242, + "step": 673 + }, + { + "epoch": 1.5318181818181817, + "grad_norm": 14.536432266235352, + "learning_rate": 4.931818181818182e-05, + "loss": 2.2415, + "step": 674 + }, + { + "epoch": 1.5340909090909092, + "grad_norm": 12.954751014709473, + "learning_rate": 4.9242424242424245e-05, + "loss": 1.8463, + "step": 675 + }, + { + "epoch": 1.5363636363636364, + "grad_norm": 12.143820762634277, + "learning_rate": 4.9166666666666665e-05, + "loss": 1.97, + "step": 676 + }, + { + "epoch": 1.5386363636363636, + "grad_norm": 10.134570121765137, + "learning_rate": 4.909090909090909e-05, + "loss": 0.9264, + "step": 677 + }, + { + "epoch": 1.540909090909091, + "grad_norm": 12.558758735656738, + "learning_rate": 4.901515151515152e-05, + "loss": 1.4608, + "step": 678 + }, + { + "epoch": 1.5431818181818182, + "grad_norm": 10.165045738220215, + "learning_rate": 4.8939393939393944e-05, + "loss": 1.3453, + "step": 679 + }, + { + "epoch": 1.5454545454545454, + "grad_norm": 11.995816230773926, + "learning_rate": 4.886363636363637e-05, + "loss": 2.1228, + "step": 680 + }, + { + "epoch": 1.5477272727272728, + "grad_norm": 10.822747230529785, + "learning_rate": 4.878787878787879e-05, + "loss": 2.0378, + "step": 681 + }, + { + "epoch": 1.55, + "grad_norm": 16.348892211914062, + "learning_rate": 4.8712121212121216e-05, + "loss": 1.7209, + "step": 682 + }, + { + "epoch": 1.5522727272727272, + "grad_norm": 9.395282745361328, + "learning_rate": 4.863636363636364e-05, + "loss": 1.4529, + "step": 683 + }, + { + "epoch": 1.5545454545454547, + "grad_norm": 16.89964485168457, + "learning_rate": 4.856060606060606e-05, + "loss": 2.8833, + "step": 684 + }, + { + "epoch": 1.5568181818181817, + "grad_norm": 10.703327178955078, + "learning_rate": 4.848484848484849e-05, + "loss": 1.7938, + "step": 685 + }, + { + "epoch": 1.559090909090909, + "grad_norm": 19.770193099975586, + "learning_rate": 4.840909090909091e-05, + "loss": 1.6041, + "step": 686 + }, + { + "epoch": 1.5613636363636365, + "grad_norm": 11.777501106262207, + "learning_rate": 4.8333333333333334e-05, + "loss": 2.0716, + "step": 687 + }, + { + "epoch": 1.5636363636363635, + "grad_norm": 10.248165130615234, + "learning_rate": 4.825757575757576e-05, + "loss": 1.5853, + "step": 688 + }, + { + "epoch": 1.565909090909091, + "grad_norm": 10.732747077941895, + "learning_rate": 4.8181818181818186e-05, + "loss": 1.2683, + "step": 689 + }, + { + "epoch": 1.5681818181818183, + "grad_norm": 11.304749488830566, + "learning_rate": 4.810606060606061e-05, + "loss": 2.2432, + "step": 690 + }, + { + "epoch": 1.5704545454545453, + "grad_norm": 13.820841789245605, + "learning_rate": 4.803030303030303e-05, + "loss": 1.8117, + "step": 691 + }, + { + "epoch": 1.5727272727272728, + "grad_norm": 9.33556079864502, + "learning_rate": 4.795454545454546e-05, + "loss": 1.0837, + "step": 692 + }, + { + "epoch": 1.575, + "grad_norm": 13.970429420471191, + "learning_rate": 4.787878787878788e-05, + "loss": 2.5927, + "step": 693 + }, + { + "epoch": 1.5772727272727272, + "grad_norm": 10.840149879455566, + "learning_rate": 4.7803030303030304e-05, + "loss": 1.8707, + "step": 694 + }, + { + "epoch": 1.5795454545454546, + "grad_norm": 11.14415168762207, + "learning_rate": 4.772727272727273e-05, + "loss": 1.6668, + "step": 695 + }, + { + "epoch": 1.5818181818181818, + "grad_norm": 14.185403823852539, + "learning_rate": 4.765151515151515e-05, + "loss": 1.6091, + "step": 696 + }, + { + "epoch": 1.584090909090909, + "grad_norm": 13.565306663513184, + "learning_rate": 4.7575757575757576e-05, + "loss": 1.8229, + "step": 697 + }, + { + "epoch": 1.5863636363636364, + "grad_norm": 14.329642295837402, + "learning_rate": 4.75e-05, + "loss": 1.9366, + "step": 698 + }, + { + "epoch": 1.5886363636363636, + "grad_norm": 12.332931518554688, + "learning_rate": 4.742424242424243e-05, + "loss": 1.683, + "step": 699 + }, + { + "epoch": 1.5909090909090908, + "grad_norm": 10.493454933166504, + "learning_rate": 4.7348484848484855e-05, + "loss": 1.8994, + "step": 700 + }, + { + "epoch": 1.5931818181818183, + "grad_norm": 11.809647560119629, + "learning_rate": 4.7272727272727275e-05, + "loss": 1.509, + "step": 701 + }, + { + "epoch": 1.5954545454545455, + "grad_norm": 12.72128963470459, + "learning_rate": 4.71969696969697e-05, + "loss": 2.1266, + "step": 702 + }, + { + "epoch": 1.5977272727272727, + "grad_norm": 13.074295043945312, + "learning_rate": 4.712121212121212e-05, + "loss": 1.6113, + "step": 703 + }, + { + "epoch": 1.6, + "grad_norm": 10.254904747009277, + "learning_rate": 4.704545454545455e-05, + "loss": 2.2737, + "step": 704 + }, + { + "epoch": 1.6022727272727273, + "grad_norm": 24.574390411376953, + "learning_rate": 4.696969696969697e-05, + "loss": 2.2779, + "step": 705 + }, + { + "epoch": 1.6045454545454545, + "grad_norm": 10.441598892211914, + "learning_rate": 4.689393939393939e-05, + "loss": 1.8209, + "step": 706 + }, + { + "epoch": 1.606818181818182, + "grad_norm": 12.4207763671875, + "learning_rate": 4.681818181818182e-05, + "loss": 1.5389, + "step": 707 + }, + { + "epoch": 1.6090909090909091, + "grad_norm": 15.072708129882812, + "learning_rate": 4.6742424242424245e-05, + "loss": 1.3703, + "step": 708 + }, + { + "epoch": 1.6113636363636363, + "grad_norm": 11.555070877075195, + "learning_rate": 4.666666666666667e-05, + "loss": 1.9363, + "step": 709 + }, + { + "epoch": 1.6136363636363638, + "grad_norm": 13.27509593963623, + "learning_rate": 4.659090909090909e-05, + "loss": 1.4334, + "step": 710 + }, + { + "epoch": 1.615909090909091, + "grad_norm": 12.357429504394531, + "learning_rate": 4.651515151515152e-05, + "loss": 2.3112, + "step": 711 + }, + { + "epoch": 1.6181818181818182, + "grad_norm": 19.84957504272461, + "learning_rate": 4.6439393939393944e-05, + "loss": 1.1851, + "step": 712 + }, + { + "epoch": 1.6204545454545456, + "grad_norm": 10.689920425415039, + "learning_rate": 4.636363636363636e-05, + "loss": 1.921, + "step": 713 + }, + { + "epoch": 1.6227272727272726, + "grad_norm": 10.688066482543945, + "learning_rate": 4.628787878787879e-05, + "loss": 1.2294, + "step": 714 + }, + { + "epoch": 1.625, + "grad_norm": 11.80333423614502, + "learning_rate": 4.621212121212121e-05, + "loss": 2.5255, + "step": 715 + }, + { + "epoch": 1.6272727272727274, + "grad_norm": 11.181013107299805, + "learning_rate": 4.6136363636363635e-05, + "loss": 1.2692, + "step": 716 + }, + { + "epoch": 1.6295454545454544, + "grad_norm": 11.557047843933105, + "learning_rate": 4.606060606060607e-05, + "loss": 1.4575, + "step": 717 + }, + { + "epoch": 1.6318181818181818, + "grad_norm": 13.798693656921387, + "learning_rate": 4.598484848484849e-05, + "loss": 2.3197, + "step": 718 + }, + { + "epoch": 1.634090909090909, + "grad_norm": 8.890710830688477, + "learning_rate": 4.5909090909090914e-05, + "loss": 1.5266, + "step": 719 + }, + { + "epoch": 1.6363636363636362, + "grad_norm": 10.293892860412598, + "learning_rate": 4.5833333333333334e-05, + "loss": 1.9222, + "step": 720 + }, + { + "epoch": 1.6386363636363637, + "grad_norm": 12.959512710571289, + "learning_rate": 4.575757575757576e-05, + "loss": 1.5771, + "step": 721 + }, + { + "epoch": 1.6409090909090909, + "grad_norm": 11.565927505493164, + "learning_rate": 4.5681818181818186e-05, + "loss": 1.5313, + "step": 722 + }, + { + "epoch": 1.643181818181818, + "grad_norm": 9.419241905212402, + "learning_rate": 4.5606060606060606e-05, + "loss": 1.4229, + "step": 723 + }, + { + "epoch": 1.6454545454545455, + "grad_norm": 15.411003112792969, + "learning_rate": 4.553030303030303e-05, + "loss": 1.8707, + "step": 724 + }, + { + "epoch": 1.6477272727272727, + "grad_norm": 7.6546711921691895, + "learning_rate": 4.545454545454546e-05, + "loss": 0.742, + "step": 725 + }, + { + "epoch": 1.65, + "grad_norm": 13.029730796813965, + "learning_rate": 4.5378787878787885e-05, + "loss": 1.5179, + "step": 726 + }, + { + "epoch": 1.6522727272727273, + "grad_norm": 12.853962898254395, + "learning_rate": 4.5303030303030304e-05, + "loss": 1.8908, + "step": 727 + }, + { + "epoch": 1.6545454545454545, + "grad_norm": 12.864992141723633, + "learning_rate": 4.522727272727273e-05, + "loss": 1.7175, + "step": 728 + }, + { + "epoch": 1.6568181818181817, + "grad_norm": 13.25144100189209, + "learning_rate": 4.515151515151516e-05, + "loss": 1.7681, + "step": 729 + }, + { + "epoch": 1.6590909090909092, + "grad_norm": 9.894201278686523, + "learning_rate": 4.5075757575757577e-05, + "loss": 1.5505, + "step": 730 + }, + { + "epoch": 1.6613636363636364, + "grad_norm": 16.501630783081055, + "learning_rate": 4.5e-05, + "loss": 1.4968, + "step": 731 + }, + { + "epoch": 1.6636363636363636, + "grad_norm": 10.3342924118042, + "learning_rate": 4.492424242424242e-05, + "loss": 1.4734, + "step": 732 + }, + { + "epoch": 1.665909090909091, + "grad_norm": 11.081184387207031, + "learning_rate": 4.484848484848485e-05, + "loss": 2.6513, + "step": 733 + }, + { + "epoch": 1.6681818181818182, + "grad_norm": 17.005704879760742, + "learning_rate": 4.4772727272727275e-05, + "loss": 2.4109, + "step": 734 + }, + { + "epoch": 1.6704545454545454, + "grad_norm": 11.718207359313965, + "learning_rate": 4.46969696969697e-05, + "loss": 1.6445, + "step": 735 + }, + { + "epoch": 1.6727272727272728, + "grad_norm": 12.14245319366455, + "learning_rate": 4.462121212121213e-05, + "loss": 2.335, + "step": 736 + }, + { + "epoch": 1.675, + "grad_norm": 10.971789360046387, + "learning_rate": 4.454545454545455e-05, + "loss": 1.6266, + "step": 737 + }, + { + "epoch": 1.6772727272727272, + "grad_norm": 17.435321807861328, + "learning_rate": 4.4469696969696973e-05, + "loss": 2.1164, + "step": 738 + }, + { + "epoch": 1.6795454545454547, + "grad_norm": 10.45814323425293, + "learning_rate": 4.43939393939394e-05, + "loss": 1.3992, + "step": 739 + }, + { + "epoch": 1.6818181818181817, + "grad_norm": 12.788302421569824, + "learning_rate": 4.431818181818182e-05, + "loss": 2.4001, + "step": 740 + }, + { + "epoch": 1.684090909090909, + "grad_norm": 14.425982475280762, + "learning_rate": 4.4242424242424246e-05, + "loss": 2.163, + "step": 741 + }, + { + "epoch": 1.6863636363636365, + "grad_norm": 9.09310531616211, + "learning_rate": 4.4166666666666665e-05, + "loss": 1.4595, + "step": 742 + }, + { + "epoch": 1.6886363636363635, + "grad_norm": 11.336987495422363, + "learning_rate": 4.409090909090909e-05, + "loss": 2.6262, + "step": 743 + }, + { + "epoch": 1.690909090909091, + "grad_norm": 11.697134017944336, + "learning_rate": 4.401515151515152e-05, + "loss": 1.3628, + "step": 744 + }, + { + "epoch": 1.6931818181818183, + "grad_norm": 8.620695114135742, + "learning_rate": 4.3939393939393944e-05, + "loss": 1.2893, + "step": 745 + }, + { + "epoch": 1.6954545454545453, + "grad_norm": 9.322046279907227, + "learning_rate": 4.386363636363637e-05, + "loss": 1.9579, + "step": 746 + }, + { + "epoch": 1.6977272727272728, + "grad_norm": 11.273119926452637, + "learning_rate": 4.378787878787879e-05, + "loss": 2.2207, + "step": 747 + }, + { + "epoch": 1.7, + "grad_norm": 11.111379623413086, + "learning_rate": 4.3712121212121216e-05, + "loss": 1.4021, + "step": 748 + }, + { + "epoch": 1.7022727272727272, + "grad_norm": 11.808859825134277, + "learning_rate": 4.3636363636363636e-05, + "loss": 1.4873, + "step": 749 + }, + { + "epoch": 1.7045454545454546, + "grad_norm": 14.41899585723877, + "learning_rate": 4.356060606060606e-05, + "loss": 1.9247, + "step": 750 + }, + { + "epoch": 1.7068181818181818, + "grad_norm": 9.383740425109863, + "learning_rate": 4.348484848484849e-05, + "loss": 1.6231, + "step": 751 + }, + { + "epoch": 1.709090909090909, + "grad_norm": 9.926271438598633, + "learning_rate": 4.340909090909091e-05, + "loss": 2.2661, + "step": 752 + }, + { + "epoch": 1.7113636363636364, + "grad_norm": 12.015188217163086, + "learning_rate": 4.3333333333333334e-05, + "loss": 1.4877, + "step": 753 + }, + { + "epoch": 1.7136363636363636, + "grad_norm": 12.057700157165527, + "learning_rate": 4.325757575757576e-05, + "loss": 1.6091, + "step": 754 + }, + { + "epoch": 1.7159090909090908, + "grad_norm": 8.392674446105957, + "learning_rate": 4.318181818181819e-05, + "loss": 1.4652, + "step": 755 + }, + { + "epoch": 1.7181818181818183, + "grad_norm": 7.7269287109375, + "learning_rate": 4.3106060606060606e-05, + "loss": 1.1991, + "step": 756 + }, + { + "epoch": 1.7204545454545455, + "grad_norm": 13.280454635620117, + "learning_rate": 4.303030303030303e-05, + "loss": 1.9597, + "step": 757 + }, + { + "epoch": 1.7227272727272727, + "grad_norm": 11.144329071044922, + "learning_rate": 4.295454545454546e-05, + "loss": 1.6052, + "step": 758 + }, + { + "epoch": 1.725, + "grad_norm": 12.23388385772705, + "learning_rate": 4.287878787878788e-05, + "loss": 1.5491, + "step": 759 + }, + { + "epoch": 1.7272727272727273, + "grad_norm": 11.918728828430176, + "learning_rate": 4.2803030303030305e-05, + "loss": 2.0586, + "step": 760 + }, + { + "epoch": 1.7295454545454545, + "grad_norm": 7.68416166305542, + "learning_rate": 4.2727272727272724e-05, + "loss": 1.0501, + "step": 761 + }, + { + "epoch": 1.731818181818182, + "grad_norm": 16.64651870727539, + "learning_rate": 4.265151515151515e-05, + "loss": 1.9819, + "step": 762 + }, + { + "epoch": 1.7340909090909091, + "grad_norm": 14.889754295349121, + "learning_rate": 4.257575757575758e-05, + "loss": 2.5418, + "step": 763 + }, + { + "epoch": 1.7363636363636363, + "grad_norm": 13.508451461791992, + "learning_rate": 4.25e-05, + "loss": 1.5028, + "step": 764 + }, + { + "epoch": 1.7386363636363638, + "grad_norm": 9.541330337524414, + "learning_rate": 4.242424242424243e-05, + "loss": 1.0183, + "step": 765 + }, + { + "epoch": 1.740909090909091, + "grad_norm": 13.14413833618164, + "learning_rate": 4.234848484848485e-05, + "loss": 2.0542, + "step": 766 + }, + { + "epoch": 1.7431818181818182, + "grad_norm": 12.490581512451172, + "learning_rate": 4.2272727272727275e-05, + "loss": 1.5971, + "step": 767 + }, + { + "epoch": 1.7454545454545456, + "grad_norm": 14.117782592773438, + "learning_rate": 4.21969696969697e-05, + "loss": 3.0207, + "step": 768 + }, + { + "epoch": 1.7477272727272726, + "grad_norm": 12.968109130859375, + "learning_rate": 4.212121212121212e-05, + "loss": 1.9058, + "step": 769 + }, + { + "epoch": 1.75, + "grad_norm": 10.889745712280273, + "learning_rate": 4.204545454545455e-05, + "loss": 1.535, + "step": 770 + }, + { + "epoch": 1.7522727272727274, + "grad_norm": 11.901477813720703, + "learning_rate": 4.196969696969697e-05, + "loss": 1.3743, + "step": 771 + }, + { + "epoch": 1.7545454545454544, + "grad_norm": 11.466394424438477, + "learning_rate": 4.189393939393939e-05, + "loss": 2.1364, + "step": 772 + }, + { + "epoch": 1.7568181818181818, + "grad_norm": 9.973612785339355, + "learning_rate": 4.181818181818182e-05, + "loss": 1.7472, + "step": 773 + }, + { + "epoch": 1.759090909090909, + "grad_norm": 11.81697940826416, + "learning_rate": 4.1742424242424246e-05, + "loss": 1.6475, + "step": 774 + }, + { + "epoch": 1.7613636363636362, + "grad_norm": 10.81869125366211, + "learning_rate": 4.166666666666667e-05, + "loss": 2.433, + "step": 775 + }, + { + "epoch": 1.7636363636363637, + "grad_norm": 15.867783546447754, + "learning_rate": 4.159090909090909e-05, + "loss": 3.0407, + "step": 776 + }, + { + "epoch": 1.7659090909090909, + "grad_norm": 12.047411918640137, + "learning_rate": 4.151515151515152e-05, + "loss": 1.7651, + "step": 777 + }, + { + "epoch": 1.768181818181818, + "grad_norm": 11.829177856445312, + "learning_rate": 4.143939393939394e-05, + "loss": 1.5285, + "step": 778 + }, + { + "epoch": 1.7704545454545455, + "grad_norm": 13.831562995910645, + "learning_rate": 4.1363636363636364e-05, + "loss": 2.6372, + "step": 779 + }, + { + "epoch": 1.7727272727272727, + "grad_norm": 10.6288480758667, + "learning_rate": 4.128787878787879e-05, + "loss": 1.8006, + "step": 780 + }, + { + "epoch": 1.775, + "grad_norm": 12.919150352478027, + "learning_rate": 4.1212121212121216e-05, + "loss": 1.8753, + "step": 781 + }, + { + "epoch": 1.7772727272727273, + "grad_norm": 14.138745307922363, + "learning_rate": 4.113636363636364e-05, + "loss": 2.1089, + "step": 782 + }, + { + "epoch": 1.7795454545454545, + "grad_norm": 8.130454063415527, + "learning_rate": 4.106060606060606e-05, + "loss": 0.9243, + "step": 783 + }, + { + "epoch": 1.7818181818181817, + "grad_norm": 13.32907485961914, + "learning_rate": 4.098484848484849e-05, + "loss": 2.599, + "step": 784 + }, + { + "epoch": 1.7840909090909092, + "grad_norm": 9.957046508789062, + "learning_rate": 4.0909090909090915e-05, + "loss": 1.1874, + "step": 785 + }, + { + "epoch": 1.7863636363636364, + "grad_norm": 10.413941383361816, + "learning_rate": 4.0833333333333334e-05, + "loss": 1.2206, + "step": 786 + }, + { + "epoch": 1.7886363636363636, + "grad_norm": 12.38062858581543, + "learning_rate": 4.075757575757576e-05, + "loss": 1.5484, + "step": 787 + }, + { + "epoch": 1.790909090909091, + "grad_norm": 10.63827896118164, + "learning_rate": 4.068181818181818e-05, + "loss": 1.4851, + "step": 788 + }, + { + "epoch": 1.7931818181818182, + "grad_norm": 10.755563735961914, + "learning_rate": 4.0606060606060606e-05, + "loss": 2.0725, + "step": 789 + }, + { + "epoch": 1.7954545454545454, + "grad_norm": 10.352532386779785, + "learning_rate": 4.053030303030303e-05, + "loss": 1.6825, + "step": 790 + }, + { + "epoch": 1.7977272727272728, + "grad_norm": 10.303858757019043, + "learning_rate": 4.045454545454546e-05, + "loss": 1.6771, + "step": 791 + }, + { + "epoch": 1.8, + "grad_norm": 12.914578437805176, + "learning_rate": 4.0378787878787885e-05, + "loss": 2.0149, + "step": 792 + }, + { + "epoch": 1.8022727272727272, + "grad_norm": 9.389689445495605, + "learning_rate": 4.0303030303030305e-05, + "loss": 1.9987, + "step": 793 + }, + { + "epoch": 1.8045454545454547, + "grad_norm": 13.615360260009766, + "learning_rate": 4.022727272727273e-05, + "loss": 1.7871, + "step": 794 + }, + { + "epoch": 1.8068181818181817, + "grad_norm": 12.188302040100098, + "learning_rate": 4.015151515151515e-05, + "loss": 2.1458, + "step": 795 + }, + { + "epoch": 1.809090909090909, + "grad_norm": 23.321977615356445, + "learning_rate": 4.007575757575758e-05, + "loss": 1.5815, + "step": 796 + }, + { + "epoch": 1.8113636363636365, + "grad_norm": 13.12856674194336, + "learning_rate": 4e-05, + "loss": 1.9065, + "step": 797 + }, + { + "epoch": 1.8136363636363635, + "grad_norm": 8.955425262451172, + "learning_rate": 3.992424242424242e-05, + "loss": 1.4415, + "step": 798 + }, + { + "epoch": 1.815909090909091, + "grad_norm": 14.052294731140137, + "learning_rate": 3.984848484848485e-05, + "loss": 2.6913, + "step": 799 + }, + { + "epoch": 1.8181818181818183, + "grad_norm": 8.688261032104492, + "learning_rate": 3.9772727272727275e-05, + "loss": 1.6981, + "step": 800 + }, + { + "epoch": 1.8204545454545453, + "grad_norm": 13.951496124267578, + "learning_rate": 3.96969696969697e-05, + "loss": 1.5787, + "step": 801 + }, + { + "epoch": 1.8227272727272728, + "grad_norm": 10.023541450500488, + "learning_rate": 3.962121212121213e-05, + "loss": 1.9886, + "step": 802 + }, + { + "epoch": 1.825, + "grad_norm": 8.397741317749023, + "learning_rate": 3.954545454545455e-05, + "loss": 1.7193, + "step": 803 + }, + { + "epoch": 1.8272727272727272, + "grad_norm": 10.017319679260254, + "learning_rate": 3.9469696969696974e-05, + "loss": 1.7097, + "step": 804 + }, + { + "epoch": 1.8295454545454546, + "grad_norm": 13.632206916809082, + "learning_rate": 3.939393939393939e-05, + "loss": 2.1469, + "step": 805 + }, + { + "epoch": 1.8318181818181818, + "grad_norm": 19.315832138061523, + "learning_rate": 3.931818181818182e-05, + "loss": 2.2873, + "step": 806 + }, + { + "epoch": 1.834090909090909, + "grad_norm": 11.273087501525879, + "learning_rate": 3.924242424242424e-05, + "loss": 1.352, + "step": 807 + }, + { + "epoch": 1.8363636363636364, + "grad_norm": 12.127049446105957, + "learning_rate": 3.9166666666666665e-05, + "loss": 1.8422, + "step": 808 + }, + { + "epoch": 1.8386363636363636, + "grad_norm": 9.968843460083008, + "learning_rate": 3.909090909090909e-05, + "loss": 1.2724, + "step": 809 + }, + { + "epoch": 1.8409090909090908, + "grad_norm": 13.883306503295898, + "learning_rate": 3.901515151515152e-05, + "loss": 2.6822, + "step": 810 + }, + { + "epoch": 1.8431818181818183, + "grad_norm": 10.443497657775879, + "learning_rate": 3.8939393939393944e-05, + "loss": 1.2037, + "step": 811 + }, + { + "epoch": 1.8454545454545455, + "grad_norm": 10.290310859680176, + "learning_rate": 3.8863636363636364e-05, + "loss": 1.5355, + "step": 812 + }, + { + "epoch": 1.8477272727272727, + "grad_norm": 9.970185279846191, + "learning_rate": 3.878787878787879e-05, + "loss": 1.957, + "step": 813 + }, + { + "epoch": 1.85, + "grad_norm": 10.905329704284668, + "learning_rate": 3.8712121212121217e-05, + "loss": 1.8562, + "step": 814 + }, + { + "epoch": 1.8522727272727273, + "grad_norm": 9.466534614562988, + "learning_rate": 3.8636363636363636e-05, + "loss": 1.4522, + "step": 815 + }, + { + "epoch": 1.8545454545454545, + "grad_norm": 13.48620891571045, + "learning_rate": 3.856060606060606e-05, + "loss": 2.1203, + "step": 816 + }, + { + "epoch": 1.856818181818182, + "grad_norm": 12.107563018798828, + "learning_rate": 3.848484848484848e-05, + "loss": 1.7011, + "step": 817 + }, + { + "epoch": 1.8590909090909091, + "grad_norm": 10.786709785461426, + "learning_rate": 3.840909090909091e-05, + "loss": 1.7418, + "step": 818 + }, + { + "epoch": 1.8613636363636363, + "grad_norm": 10.853336334228516, + "learning_rate": 3.8333333333333334e-05, + "loss": 1.4229, + "step": 819 + }, + { + "epoch": 1.8636363636363638, + "grad_norm": 11.42320442199707, + "learning_rate": 3.825757575757576e-05, + "loss": 1.6411, + "step": 820 + }, + { + "epoch": 1.865909090909091, + "grad_norm": 9.623292922973633, + "learning_rate": 3.818181818181819e-05, + "loss": 2.2372, + "step": 821 + }, + { + "epoch": 1.8681818181818182, + "grad_norm": 19.681766510009766, + "learning_rate": 3.810606060606061e-05, + "loss": 1.7814, + "step": 822 + }, + { + "epoch": 1.8704545454545456, + "grad_norm": 11.759204864501953, + "learning_rate": 3.803030303030303e-05, + "loss": 1.4783, + "step": 823 + }, + { + "epoch": 1.8727272727272726, + "grad_norm": 11.130982398986816, + "learning_rate": 3.795454545454545e-05, + "loss": 1.3937, + "step": 824 + }, + { + "epoch": 1.875, + "grad_norm": 10.193344116210938, + "learning_rate": 3.787878787878788e-05, + "loss": 1.3912, + "step": 825 + }, + { + "epoch": 1.8772727272727274, + "grad_norm": 8.412622451782227, + "learning_rate": 3.7803030303030305e-05, + "loss": 1.3978, + "step": 826 + }, + { + "epoch": 1.8795454545454544, + "grad_norm": 12.766166687011719, + "learning_rate": 3.7727272727272725e-05, + "loss": 1.9356, + "step": 827 + }, + { + "epoch": 1.8818181818181818, + "grad_norm": 11.161136627197266, + "learning_rate": 3.765151515151516e-05, + "loss": 1.8318, + "step": 828 + }, + { + "epoch": 1.884090909090909, + "grad_norm": 11.214709281921387, + "learning_rate": 3.757575757575758e-05, + "loss": 1.4253, + "step": 829 + }, + { + "epoch": 1.8863636363636362, + "grad_norm": 12.173728942871094, + "learning_rate": 3.7500000000000003e-05, + "loss": 1.3093, + "step": 830 + }, + { + "epoch": 1.8886363636363637, + "grad_norm": 12.564881324768066, + "learning_rate": 3.742424242424243e-05, + "loss": 2.0086, + "step": 831 + }, + { + "epoch": 1.8909090909090909, + "grad_norm": 10.378774642944336, + "learning_rate": 3.734848484848485e-05, + "loss": 2.2117, + "step": 832 + }, + { + "epoch": 1.893181818181818, + "grad_norm": 13.659943580627441, + "learning_rate": 3.7272727272727276e-05, + "loss": 1.8717, + "step": 833 + }, + { + "epoch": 1.8954545454545455, + "grad_norm": 10.889350891113281, + "learning_rate": 3.7196969696969695e-05, + "loss": 2.524, + "step": 834 + }, + { + "epoch": 1.8977272727272727, + "grad_norm": 20.47830581665039, + "learning_rate": 3.712121212121212e-05, + "loss": 1.5575, + "step": 835 + }, + { + "epoch": 1.9, + "grad_norm": 8.377565383911133, + "learning_rate": 3.704545454545455e-05, + "loss": 1.4985, + "step": 836 + }, + { + "epoch": 1.9022727272727273, + "grad_norm": 14.420267105102539, + "learning_rate": 3.6969696969696974e-05, + "loss": 2.0562, + "step": 837 + }, + { + "epoch": 1.9045454545454545, + "grad_norm": 11.469067573547363, + "learning_rate": 3.68939393939394e-05, + "loss": 1.9261, + "step": 838 + }, + { + "epoch": 1.9068181818181817, + "grad_norm": 14.95913314819336, + "learning_rate": 3.681818181818182e-05, + "loss": 1.4905, + "step": 839 + }, + { + "epoch": 1.9090909090909092, + "grad_norm": 12.481145858764648, + "learning_rate": 3.6742424242424246e-05, + "loss": 1.3664, + "step": 840 + }, + { + "epoch": 1.9113636363636364, + "grad_norm": 11.715337753295898, + "learning_rate": 3.6666666666666666e-05, + "loss": 2.0561, + "step": 841 + }, + { + "epoch": 1.9136363636363636, + "grad_norm": 12.499181747436523, + "learning_rate": 3.659090909090909e-05, + "loss": 1.62, + "step": 842 + }, + { + "epoch": 1.915909090909091, + "grad_norm": 7.448797225952148, + "learning_rate": 3.651515151515152e-05, + "loss": 0.979, + "step": 843 + }, + { + "epoch": 1.9181818181818182, + "grad_norm": 11.219677925109863, + "learning_rate": 3.643939393939394e-05, + "loss": 1.8378, + "step": 844 + }, + { + "epoch": 1.9204545454545454, + "grad_norm": 11.738428115844727, + "learning_rate": 3.6363636363636364e-05, + "loss": 2.1477, + "step": 845 + }, + { + "epoch": 1.9227272727272728, + "grad_norm": 13.800374031066895, + "learning_rate": 3.628787878787879e-05, + "loss": 2.3644, + "step": 846 + }, + { + "epoch": 1.925, + "grad_norm": 11.240313529968262, + "learning_rate": 3.621212121212122e-05, + "loss": 1.6775, + "step": 847 + }, + { + "epoch": 1.9272727272727272, + "grad_norm": 13.477606773376465, + "learning_rate": 3.613636363636364e-05, + "loss": 1.3438, + "step": 848 + }, + { + "epoch": 1.9295454545454547, + "grad_norm": 12.788423538208008, + "learning_rate": 3.606060606060606e-05, + "loss": 1.7158, + "step": 849 + }, + { + "epoch": 1.9318181818181817, + "grad_norm": 8.893767356872559, + "learning_rate": 3.598484848484849e-05, + "loss": 1.4747, + "step": 850 + }, + { + "epoch": 1.934090909090909, + "grad_norm": 12.053075790405273, + "learning_rate": 3.590909090909091e-05, + "loss": 1.0121, + "step": 851 + }, + { + "epoch": 1.9363636363636365, + "grad_norm": 12.093589782714844, + "learning_rate": 3.5833333333333335e-05, + "loss": 2.1991, + "step": 852 + }, + { + "epoch": 1.9386363636363635, + "grad_norm": 9.356278419494629, + "learning_rate": 3.575757575757576e-05, + "loss": 1.4497, + "step": 853 + }, + { + "epoch": 1.940909090909091, + "grad_norm": 12.686812400817871, + "learning_rate": 3.568181818181818e-05, + "loss": 1.5038, + "step": 854 + }, + { + "epoch": 1.9431818181818183, + "grad_norm": 13.139368057250977, + "learning_rate": 3.560606060606061e-05, + "loss": 2.9399, + "step": 855 + }, + { + "epoch": 1.9454545454545453, + "grad_norm": 11.385064125061035, + "learning_rate": 3.553030303030303e-05, + "loss": 1.4202, + "step": 856 + }, + { + "epoch": 1.9477272727272728, + "grad_norm": 9.905313491821289, + "learning_rate": 3.545454545454546e-05, + "loss": 2.5033, + "step": 857 + }, + { + "epoch": 1.95, + "grad_norm": 9.99422836303711, + "learning_rate": 3.537878787878788e-05, + "loss": 1.631, + "step": 858 + }, + { + "epoch": 1.9522727272727272, + "grad_norm": 12.235610961914062, + "learning_rate": 3.5303030303030305e-05, + "loss": 1.7517, + "step": 859 + }, + { + "epoch": 1.9545454545454546, + "grad_norm": 13.225701332092285, + "learning_rate": 3.522727272727273e-05, + "loss": 1.545, + "step": 860 + }, + { + "epoch": 1.9568181818181818, + "grad_norm": 13.755146980285645, + "learning_rate": 3.515151515151515e-05, + "loss": 1.6548, + "step": 861 + }, + { + "epoch": 1.959090909090909, + "grad_norm": 14.235300064086914, + "learning_rate": 3.507575757575758e-05, + "loss": 2.2791, + "step": 862 + }, + { + "epoch": 1.9613636363636364, + "grad_norm": 12.734109878540039, + "learning_rate": 3.5e-05, + "loss": 1.4257, + "step": 863 + }, + { + "epoch": 1.9636363636363636, + "grad_norm": 12.51075267791748, + "learning_rate": 3.492424242424242e-05, + "loss": 2.1328, + "step": 864 + }, + { + "epoch": 1.9659090909090908, + "grad_norm": 12.090396881103516, + "learning_rate": 3.484848484848485e-05, + "loss": 2.4949, + "step": 865 + }, + { + "epoch": 1.9681818181818183, + "grad_norm": 9.898470878601074, + "learning_rate": 3.4772727272727276e-05, + "loss": 1.0122, + "step": 866 + }, + { + "epoch": 1.9704545454545455, + "grad_norm": 12.299036979675293, + "learning_rate": 3.46969696969697e-05, + "loss": 1.1734, + "step": 867 + }, + { + "epoch": 1.9727272727272727, + "grad_norm": 10.930243492126465, + "learning_rate": 3.462121212121212e-05, + "loss": 1.8219, + "step": 868 + }, + { + "epoch": 1.975, + "grad_norm": 11.0517578125, + "learning_rate": 3.454545454545455e-05, + "loss": 1.5023, + "step": 869 + }, + { + "epoch": 1.9772727272727273, + "grad_norm": 11.98909854888916, + "learning_rate": 3.4469696969696974e-05, + "loss": 1.298, + "step": 870 + }, + { + "epoch": 1.9795454545454545, + "grad_norm": 12.753129959106445, + "learning_rate": 3.4393939393939394e-05, + "loss": 1.7147, + "step": 871 + }, + { + "epoch": 1.981818181818182, + "grad_norm": 71.2451171875, + "learning_rate": 3.431818181818182e-05, + "loss": 1.3867, + "step": 872 + }, + { + "epoch": 1.9840909090909091, + "grad_norm": 9.198206901550293, + "learning_rate": 3.424242424242424e-05, + "loss": 1.2175, + "step": 873 + }, + { + "epoch": 1.9863636363636363, + "grad_norm": 10.864444732666016, + "learning_rate": 3.4166666666666666e-05, + "loss": 2.4479, + "step": 874 + }, + { + "epoch": 1.9886363636363638, + "grad_norm": 12.929604530334473, + "learning_rate": 3.409090909090909e-05, + "loss": 2.3538, + "step": 875 + }, + { + "epoch": 1.990909090909091, + "grad_norm": 15.190954208374023, + "learning_rate": 3.401515151515152e-05, + "loss": 2.7314, + "step": 876 + }, + { + "epoch": 1.9931818181818182, + "grad_norm": 12.220293045043945, + "learning_rate": 3.3939393939393945e-05, + "loss": 1.8087, + "step": 877 + }, + { + "epoch": 1.9954545454545456, + "grad_norm": 13.717775344848633, + "learning_rate": 3.3863636363636364e-05, + "loss": 2.2791, + "step": 878 + }, + { + "epoch": 1.9977272727272726, + "grad_norm": 13.53941822052002, + "learning_rate": 3.378787878787879e-05, + "loss": 1.9205, + "step": 879 + }, + { + "epoch": 2.0, + "grad_norm": 10.206825256347656, + "learning_rate": 3.371212121212121e-05, + "loss": 1.2968, + "step": 880 + }, + { + "epoch": 2.0, + "eval_f1": 0.8929, + "eval_gen_len": 41.9091, + "eval_loss": 1.7823115587234497, + "eval_precision": 0.8925, + "eval_recall": 0.8935, + "eval_rouge1": 0.447, + "eval_rouge2": 0.2102, + "eval_rougeL": 0.3795, + "eval_rougeLsum": 0.4136, + "eval_runtime": 29.0339, + "eval_samples_per_second": 3.789, + "eval_steps_per_second": 0.482, + "step": 880 + }, + { + "epoch": 2.0022727272727274, + "grad_norm": 9.781706809997559, + "learning_rate": 3.3636363636363636e-05, + "loss": 1.0468, + "step": 881 + }, + { + "epoch": 2.0045454545454544, + "grad_norm": 8.61344051361084, + "learning_rate": 3.356060606060606e-05, + "loss": 1.7286, + "step": 882 + }, + { + "epoch": 2.006818181818182, + "grad_norm": 11.291481971740723, + "learning_rate": 3.348484848484848e-05, + "loss": 1.1274, + "step": 883 + }, + { + "epoch": 2.0090909090909093, + "grad_norm": 11.33132553100586, + "learning_rate": 3.3409090909090915e-05, + "loss": 1.4992, + "step": 884 + }, + { + "epoch": 2.0113636363636362, + "grad_norm": 10.342754364013672, + "learning_rate": 3.3333333333333335e-05, + "loss": 1.7733, + "step": 885 + }, + { + "epoch": 2.0136363636363637, + "grad_norm": 9.18486499786377, + "learning_rate": 3.325757575757576e-05, + "loss": 1.7391, + "step": 886 + }, + { + "epoch": 2.015909090909091, + "grad_norm": 35.923648834228516, + "learning_rate": 3.318181818181819e-05, + "loss": 1.8191, + "step": 887 + }, + { + "epoch": 2.018181818181818, + "grad_norm": 10.737150192260742, + "learning_rate": 3.310606060606061e-05, + "loss": 1.1656, + "step": 888 + }, + { + "epoch": 2.0204545454545455, + "grad_norm": 7.691224098205566, + "learning_rate": 3.303030303030303e-05, + "loss": 1.1787, + "step": 889 + }, + { + "epoch": 2.022727272727273, + "grad_norm": 14.402198791503906, + "learning_rate": 3.295454545454545e-05, + "loss": 2.1618, + "step": 890 + }, + { + "epoch": 2.025, + "grad_norm": 9.567869186401367, + "learning_rate": 3.287878787878788e-05, + "loss": 1.4921, + "step": 891 + }, + { + "epoch": 2.0272727272727273, + "grad_norm": 12.46391487121582, + "learning_rate": 3.2803030303030305e-05, + "loss": 2.0986, + "step": 892 + }, + { + "epoch": 2.0295454545454548, + "grad_norm": 12.333531379699707, + "learning_rate": 3.272727272727273e-05, + "loss": 1.5944, + "step": 893 + }, + { + "epoch": 2.0318181818181817, + "grad_norm": 12.140853881835938, + "learning_rate": 3.265151515151516e-05, + "loss": 1.7773, + "step": 894 + }, + { + "epoch": 2.034090909090909, + "grad_norm": 9.412683486938477, + "learning_rate": 3.257575757575758e-05, + "loss": 1.2663, + "step": 895 + }, + { + "epoch": 2.036363636363636, + "grad_norm": 10.711098670959473, + "learning_rate": 3.2500000000000004e-05, + "loss": 1.6462, + "step": 896 + }, + { + "epoch": 2.0386363636363636, + "grad_norm": 11.64570426940918, + "learning_rate": 3.2424242424242423e-05, + "loss": 1.8232, + "step": 897 + }, + { + "epoch": 2.040909090909091, + "grad_norm": 12.753011703491211, + "learning_rate": 3.234848484848485e-05, + "loss": 1.9761, + "step": 898 + }, + { + "epoch": 2.043181818181818, + "grad_norm": 15.42159366607666, + "learning_rate": 3.2272727272727276e-05, + "loss": 1.5225, + "step": 899 + }, + { + "epoch": 2.0454545454545454, + "grad_norm": 13.561200141906738, + "learning_rate": 3.2196969696969696e-05, + "loss": 2.2342, + "step": 900 + }, + { + "epoch": 2.047727272727273, + "grad_norm": 11.59468936920166, + "learning_rate": 3.212121212121212e-05, + "loss": 1.3996, + "step": 901 + }, + { + "epoch": 2.05, + "grad_norm": 12.330318450927734, + "learning_rate": 3.204545454545455e-05, + "loss": 2.3926, + "step": 902 + }, + { + "epoch": 2.0522727272727272, + "grad_norm": 15.305580139160156, + "learning_rate": 3.1969696969696974e-05, + "loss": 2.5056, + "step": 903 + }, + { + "epoch": 2.0545454545454547, + "grad_norm": 12.250936508178711, + "learning_rate": 3.18939393939394e-05, + "loss": 2.2595, + "step": 904 + }, + { + "epoch": 2.0568181818181817, + "grad_norm": 9.258564949035645, + "learning_rate": 3.181818181818182e-05, + "loss": 1.0952, + "step": 905 + }, + { + "epoch": 2.059090909090909, + "grad_norm": 10.1191987991333, + "learning_rate": 3.174242424242425e-05, + "loss": 2.2179, + "step": 906 + }, + { + "epoch": 2.0613636363636365, + "grad_norm": 12.793285369873047, + "learning_rate": 3.1666666666666666e-05, + "loss": 1.7858, + "step": 907 + }, + { + "epoch": 2.0636363636363635, + "grad_norm": 10.188157081604004, + "learning_rate": 3.159090909090909e-05, + "loss": 1.3631, + "step": 908 + }, + { + "epoch": 2.065909090909091, + "grad_norm": 13.256832122802734, + "learning_rate": 3.151515151515151e-05, + "loss": 2.2464, + "step": 909 + }, + { + "epoch": 2.0681818181818183, + "grad_norm": 10.160938262939453, + "learning_rate": 3.143939393939394e-05, + "loss": 1.5204, + "step": 910 + }, + { + "epoch": 2.0704545454545453, + "grad_norm": 10.945446014404297, + "learning_rate": 3.1363636363636365e-05, + "loss": 1.6125, + "step": 911 + }, + { + "epoch": 2.0727272727272728, + "grad_norm": 10.19439697265625, + "learning_rate": 3.128787878787879e-05, + "loss": 1.5317, + "step": 912 + }, + { + "epoch": 2.075, + "grad_norm": 9.242986679077148, + "learning_rate": 3.121212121212122e-05, + "loss": 1.7993, + "step": 913 + }, + { + "epoch": 2.077272727272727, + "grad_norm": 9.43307113647461, + "learning_rate": 3.113636363636364e-05, + "loss": 1.4297, + "step": 914 + }, + { + "epoch": 2.0795454545454546, + "grad_norm": 9.292837142944336, + "learning_rate": 3.106060606060606e-05, + "loss": 1.1428, + "step": 915 + }, + { + "epoch": 2.081818181818182, + "grad_norm": 10.290895462036133, + "learning_rate": 3.098484848484849e-05, + "loss": 1.3587, + "step": 916 + }, + { + "epoch": 2.084090909090909, + "grad_norm": 12.890341758728027, + "learning_rate": 3.090909090909091e-05, + "loss": 1.5721, + "step": 917 + }, + { + "epoch": 2.0863636363636364, + "grad_norm": 9.548102378845215, + "learning_rate": 3.0833333333333335e-05, + "loss": 1.5717, + "step": 918 + }, + { + "epoch": 2.088636363636364, + "grad_norm": 11.2235689163208, + "learning_rate": 3.0757575757575755e-05, + "loss": 1.818, + "step": 919 + }, + { + "epoch": 2.090909090909091, + "grad_norm": 14.528667449951172, + "learning_rate": 3.068181818181818e-05, + "loss": 1.6878, + "step": 920 + }, + { + "epoch": 2.0931818181818183, + "grad_norm": 13.295345306396484, + "learning_rate": 3.060606060606061e-05, + "loss": 1.8521, + "step": 921 + }, + { + "epoch": 2.0954545454545457, + "grad_norm": 13.902974128723145, + "learning_rate": 3.0530303030303034e-05, + "loss": 1.7186, + "step": 922 + }, + { + "epoch": 2.0977272727272727, + "grad_norm": 8.313849449157715, + "learning_rate": 3.0454545454545456e-05, + "loss": 0.8988, + "step": 923 + }, + { + "epoch": 2.1, + "grad_norm": 11.491289138793945, + "learning_rate": 3.037878787878788e-05, + "loss": 1.1394, + "step": 924 + }, + { + "epoch": 2.102272727272727, + "grad_norm": 13.124963760375977, + "learning_rate": 3.0303030303030306e-05, + "loss": 1.7424, + "step": 925 + }, + { + "epoch": 2.1045454545454545, + "grad_norm": 8.5538911819458, + "learning_rate": 3.0227272727272725e-05, + "loss": 1.3577, + "step": 926 + }, + { + "epoch": 2.106818181818182, + "grad_norm": 12.04502010345459, + "learning_rate": 3.015151515151515e-05, + "loss": 1.2389, + "step": 927 + }, + { + "epoch": 2.109090909090909, + "grad_norm": 8.608831405639648, + "learning_rate": 3.0075757575757578e-05, + "loss": 1.1577, + "step": 928 + }, + { + "epoch": 2.1113636363636363, + "grad_norm": 14.802834510803223, + "learning_rate": 3e-05, + "loss": 1.8636, + "step": 929 + }, + { + "epoch": 2.1136363636363638, + "grad_norm": 9.014802932739258, + "learning_rate": 2.9924242424242427e-05, + "loss": 0.7823, + "step": 930 + }, + { + "epoch": 2.1159090909090907, + "grad_norm": 10.007800102233887, + "learning_rate": 2.9848484848484847e-05, + "loss": 1.7205, + "step": 931 + }, + { + "epoch": 2.118181818181818, + "grad_norm": 16.067474365234375, + "learning_rate": 2.9772727272727273e-05, + "loss": 2.443, + "step": 932 + }, + { + "epoch": 2.1204545454545456, + "grad_norm": 12.624736785888672, + "learning_rate": 2.96969696969697e-05, + "loss": 1.5536, + "step": 933 + }, + { + "epoch": 2.1227272727272726, + "grad_norm": 10.400491714477539, + "learning_rate": 2.9621212121212122e-05, + "loss": 1.2871, + "step": 934 + }, + { + "epoch": 2.125, + "grad_norm": 11.056097984313965, + "learning_rate": 2.954545454545455e-05, + "loss": 1.4614, + "step": 935 + }, + { + "epoch": 2.1272727272727274, + "grad_norm": 9.163816452026367, + "learning_rate": 2.9469696969696968e-05, + "loss": 1.2918, + "step": 936 + }, + { + "epoch": 2.1295454545454544, + "grad_norm": 8.908564567565918, + "learning_rate": 2.9393939393939394e-05, + "loss": 1.2489, + "step": 937 + }, + { + "epoch": 2.131818181818182, + "grad_norm": 8.402863502502441, + "learning_rate": 2.9318181818181817e-05, + "loss": 1.4269, + "step": 938 + }, + { + "epoch": 2.1340909090909093, + "grad_norm": 10.939780235290527, + "learning_rate": 2.9242424242424243e-05, + "loss": 1.4199, + "step": 939 + }, + { + "epoch": 2.1363636363636362, + "grad_norm": 11.758381843566895, + "learning_rate": 2.916666666666667e-05, + "loss": 1.4597, + "step": 940 + }, + { + "epoch": 2.1386363636363637, + "grad_norm": 11.411653518676758, + "learning_rate": 2.909090909090909e-05, + "loss": 2.1611, + "step": 941 + }, + { + "epoch": 2.140909090909091, + "grad_norm": 11.838427543640137, + "learning_rate": 2.901515151515152e-05, + "loss": 1.2373, + "step": 942 + }, + { + "epoch": 2.143181818181818, + "grad_norm": 14.833626747131348, + "learning_rate": 2.893939393939394e-05, + "loss": 1.9202, + "step": 943 + }, + { + "epoch": 2.1454545454545455, + "grad_norm": 10.815326690673828, + "learning_rate": 2.8863636363636365e-05, + "loss": 1.5089, + "step": 944 + }, + { + "epoch": 2.147727272727273, + "grad_norm": 12.253664016723633, + "learning_rate": 2.878787878787879e-05, + "loss": 1.3787, + "step": 945 + }, + { + "epoch": 2.15, + "grad_norm": 13.154531478881836, + "learning_rate": 2.8712121212121214e-05, + "loss": 1.8925, + "step": 946 + }, + { + "epoch": 2.1522727272727273, + "grad_norm": 12.020703315734863, + "learning_rate": 2.863636363636364e-05, + "loss": 1.379, + "step": 947 + }, + { + "epoch": 2.1545454545454543, + "grad_norm": 10.430608749389648, + "learning_rate": 2.856060606060606e-05, + "loss": 1.4203, + "step": 948 + }, + { + "epoch": 2.1568181818181817, + "grad_norm": 8.769074440002441, + "learning_rate": 2.8484848484848486e-05, + "loss": 1.227, + "step": 949 + }, + { + "epoch": 2.159090909090909, + "grad_norm": 11.399450302124023, + "learning_rate": 2.8409090909090912e-05, + "loss": 1.3783, + "step": 950 + }, + { + "epoch": 2.161363636363636, + "grad_norm": 9.87228012084961, + "learning_rate": 2.8333333333333335e-05, + "loss": 1.6523, + "step": 951 + }, + { + "epoch": 2.1636363636363636, + "grad_norm": 15.94421100616455, + "learning_rate": 2.825757575757576e-05, + "loss": 2.4161, + "step": 952 + }, + { + "epoch": 2.165909090909091, + "grad_norm": 9.126893043518066, + "learning_rate": 2.818181818181818e-05, + "loss": 1.2675, + "step": 953 + }, + { + "epoch": 2.168181818181818, + "grad_norm": 15.760127067565918, + "learning_rate": 2.8106060606060607e-05, + "loss": 2.9231, + "step": 954 + }, + { + "epoch": 2.1704545454545454, + "grad_norm": 8.999767303466797, + "learning_rate": 2.803030303030303e-05, + "loss": 1.5147, + "step": 955 + }, + { + "epoch": 2.172727272727273, + "grad_norm": 12.179048538208008, + "learning_rate": 2.7954545454545457e-05, + "loss": 1.4017, + "step": 956 + }, + { + "epoch": 2.175, + "grad_norm": 11.52514934539795, + "learning_rate": 2.7878787878787883e-05, + "loss": 2.2158, + "step": 957 + }, + { + "epoch": 2.1772727272727272, + "grad_norm": 14.60074520111084, + "learning_rate": 2.7803030303030303e-05, + "loss": 1.6378, + "step": 958 + }, + { + "epoch": 2.1795454545454547, + "grad_norm": 11.505465507507324, + "learning_rate": 2.772727272727273e-05, + "loss": 1.6039, + "step": 959 + }, + { + "epoch": 2.1818181818181817, + "grad_norm": 12.141363143920898, + "learning_rate": 2.7651515151515152e-05, + "loss": 2.6782, + "step": 960 + }, + { + "epoch": 2.184090909090909, + "grad_norm": 10.89749813079834, + "learning_rate": 2.7575757575757578e-05, + "loss": 1.4787, + "step": 961 + }, + { + "epoch": 2.1863636363636365, + "grad_norm": 11.249963760375977, + "learning_rate": 2.7500000000000004e-05, + "loss": 1.9647, + "step": 962 + }, + { + "epoch": 2.1886363636363635, + "grad_norm": 9.608443260192871, + "learning_rate": 2.7424242424242424e-05, + "loss": 0.8747, + "step": 963 + }, + { + "epoch": 2.190909090909091, + "grad_norm": 9.517485618591309, + "learning_rate": 2.734848484848485e-05, + "loss": 1.2376, + "step": 964 + }, + { + "epoch": 2.1931818181818183, + "grad_norm": 9.044648170471191, + "learning_rate": 2.7272727272727273e-05, + "loss": 0.8014, + "step": 965 + }, + { + "epoch": 2.1954545454545453, + "grad_norm": 9.988462448120117, + "learning_rate": 2.71969696969697e-05, + "loss": 1.652, + "step": 966 + }, + { + "epoch": 2.1977272727272728, + "grad_norm": 8.96922492980957, + "learning_rate": 2.7121212121212126e-05, + "loss": 0.9484, + "step": 967 + }, + { + "epoch": 2.2, + "grad_norm": 10.36929702758789, + "learning_rate": 2.7045454545454545e-05, + "loss": 1.2604, + "step": 968 + }, + { + "epoch": 2.202272727272727, + "grad_norm": 14.008241653442383, + "learning_rate": 2.696969696969697e-05, + "loss": 2.4898, + "step": 969 + }, + { + "epoch": 2.2045454545454546, + "grad_norm": 14.017687797546387, + "learning_rate": 2.6893939393939394e-05, + "loss": 1.8664, + "step": 970 + }, + { + "epoch": 2.206818181818182, + "grad_norm": 11.672577857971191, + "learning_rate": 2.681818181818182e-05, + "loss": 1.8917, + "step": 971 + }, + { + "epoch": 2.209090909090909, + "grad_norm": 11.760181427001953, + "learning_rate": 2.674242424242424e-05, + "loss": 2.0559, + "step": 972 + }, + { + "epoch": 2.2113636363636364, + "grad_norm": 13.333674430847168, + "learning_rate": 2.6666666666666667e-05, + "loss": 1.8072, + "step": 973 + }, + { + "epoch": 2.213636363636364, + "grad_norm": 9.448116302490234, + "learning_rate": 2.6590909090909093e-05, + "loss": 1.2764, + "step": 974 + }, + { + "epoch": 2.215909090909091, + "grad_norm": 11.52153491973877, + "learning_rate": 2.6515151515151516e-05, + "loss": 1.7083, + "step": 975 + }, + { + "epoch": 2.2181818181818183, + "grad_norm": 20.444080352783203, + "learning_rate": 2.6439393939393942e-05, + "loss": 2.2781, + "step": 976 + }, + { + "epoch": 2.2204545454545457, + "grad_norm": 15.952470779418945, + "learning_rate": 2.636363636363636e-05, + "loss": 2.0901, + "step": 977 + }, + { + "epoch": 2.2227272727272727, + "grad_norm": 10.751893997192383, + "learning_rate": 2.6287878787878788e-05, + "loss": 0.9779, + "step": 978 + }, + { + "epoch": 2.225, + "grad_norm": 11.89562702178955, + "learning_rate": 2.6212121212121214e-05, + "loss": 1.7043, + "step": 979 + }, + { + "epoch": 2.227272727272727, + "grad_norm": 12.013797760009766, + "learning_rate": 2.6136363636363637e-05, + "loss": 1.4427, + "step": 980 + }, + { + "epoch": 2.2295454545454545, + "grad_norm": 13.685124397277832, + "learning_rate": 2.6060606060606063e-05, + "loss": 1.9327, + "step": 981 + }, + { + "epoch": 2.231818181818182, + "grad_norm": 14.36984920501709, + "learning_rate": 2.5984848484848483e-05, + "loss": 2.4401, + "step": 982 + }, + { + "epoch": 2.234090909090909, + "grad_norm": 11.657794952392578, + "learning_rate": 2.590909090909091e-05, + "loss": 1.5776, + "step": 983 + }, + { + "epoch": 2.2363636363636363, + "grad_norm": 9.138626098632812, + "learning_rate": 2.5833333333333336e-05, + "loss": 1.5954, + "step": 984 + }, + { + "epoch": 2.2386363636363638, + "grad_norm": 11.275242805480957, + "learning_rate": 2.575757575757576e-05, + "loss": 1.5874, + "step": 985 + }, + { + "epoch": 2.2409090909090907, + "grad_norm": 11.694557189941406, + "learning_rate": 2.5681818181818185e-05, + "loss": 1.2839, + "step": 986 + }, + { + "epoch": 2.243181818181818, + "grad_norm": 14.328207015991211, + "learning_rate": 2.5606060606060604e-05, + "loss": 2.3689, + "step": 987 + }, + { + "epoch": 2.2454545454545456, + "grad_norm": 14.487227439880371, + "learning_rate": 2.553030303030303e-05, + "loss": 1.5858, + "step": 988 + }, + { + "epoch": 2.2477272727272726, + "grad_norm": 14.691239356994629, + "learning_rate": 2.5454545454545454e-05, + "loss": 1.8329, + "step": 989 + }, + { + "epoch": 2.25, + "grad_norm": 10.622157096862793, + "learning_rate": 2.537878787878788e-05, + "loss": 1.8422, + "step": 990 + }, + { + "epoch": 2.2522727272727274, + "grad_norm": 13.788392066955566, + "learning_rate": 2.5303030303030306e-05, + "loss": 2.0421, + "step": 991 + }, + { + "epoch": 2.2545454545454544, + "grad_norm": 8.527210235595703, + "learning_rate": 2.5227272727272726e-05, + "loss": 1.4462, + "step": 992 + }, + { + "epoch": 2.256818181818182, + "grad_norm": 11.221017837524414, + "learning_rate": 2.5151515151515155e-05, + "loss": 1.7809, + "step": 993 + }, + { + "epoch": 2.2590909090909093, + "grad_norm": 15.243719100952148, + "learning_rate": 2.5075757575757575e-05, + "loss": 1.7409, + "step": 994 + }, + { + "epoch": 2.2613636363636362, + "grad_norm": 16.965797424316406, + "learning_rate": 2.5e-05, + "loss": 3.2836, + "step": 995 + }, + { + "epoch": 2.2636363636363637, + "grad_norm": 10.187609672546387, + "learning_rate": 2.4924242424242424e-05, + "loss": 1.5489, + "step": 996 + }, + { + "epoch": 2.265909090909091, + "grad_norm": 9.865535736083984, + "learning_rate": 2.4848484848484847e-05, + "loss": 2.0742, + "step": 997 + }, + { + "epoch": 2.268181818181818, + "grad_norm": 11.739052772521973, + "learning_rate": 2.4772727272727277e-05, + "loss": 1.4237, + "step": 998 + }, + { + "epoch": 2.2704545454545455, + "grad_norm": 13.875876426696777, + "learning_rate": 2.46969696969697e-05, + "loss": 2.8714, + "step": 999 + }, + { + "epoch": 2.2727272727272725, + "grad_norm": 11.909977912902832, + "learning_rate": 2.4621212121212123e-05, + "loss": 1.9434, + "step": 1000 + }, + { + "epoch": 2.275, + "grad_norm": 13.642827033996582, + "learning_rate": 2.4545454545454545e-05, + "loss": 1.4233, + "step": 1001 + }, + { + "epoch": 2.2772727272727273, + "grad_norm": 10.349024772644043, + "learning_rate": 2.4469696969696972e-05, + "loss": 1.5193, + "step": 1002 + }, + { + "epoch": 2.2795454545454543, + "grad_norm": 8.302240371704102, + "learning_rate": 2.4393939393939395e-05, + "loss": 1.0769, + "step": 1003 + }, + { + "epoch": 2.2818181818181817, + "grad_norm": 9.903936386108398, + "learning_rate": 2.431818181818182e-05, + "loss": 1.4596, + "step": 1004 + }, + { + "epoch": 2.284090909090909, + "grad_norm": 7.976583957672119, + "learning_rate": 2.4242424242424244e-05, + "loss": 1.3187, + "step": 1005 + }, + { + "epoch": 2.286363636363636, + "grad_norm": 8.382739067077637, + "learning_rate": 2.4166666666666667e-05, + "loss": 1.1004, + "step": 1006 + }, + { + "epoch": 2.2886363636363636, + "grad_norm": 9.898600578308105, + "learning_rate": 2.4090909090909093e-05, + "loss": 1.3482, + "step": 1007 + }, + { + "epoch": 2.290909090909091, + "grad_norm": 9.736372947692871, + "learning_rate": 2.4015151515151516e-05, + "loss": 1.0737, + "step": 1008 + }, + { + "epoch": 2.293181818181818, + "grad_norm": 14.735883712768555, + "learning_rate": 2.393939393939394e-05, + "loss": 1.9045, + "step": 1009 + }, + { + "epoch": 2.2954545454545454, + "grad_norm": 16.780405044555664, + "learning_rate": 2.3863636363636365e-05, + "loss": 1.9355, + "step": 1010 + }, + { + "epoch": 2.297727272727273, + "grad_norm": 9.181320190429688, + "learning_rate": 2.3787878787878788e-05, + "loss": 1.4465, + "step": 1011 + }, + { + "epoch": 2.3, + "grad_norm": 11.207884788513184, + "learning_rate": 2.3712121212121214e-05, + "loss": 1.6341, + "step": 1012 + }, + { + "epoch": 2.3022727272727272, + "grad_norm": 12.287393569946289, + "learning_rate": 2.3636363636363637e-05, + "loss": 1.806, + "step": 1013 + }, + { + "epoch": 2.3045454545454547, + "grad_norm": 12.173286437988281, + "learning_rate": 2.356060606060606e-05, + "loss": 2.2166, + "step": 1014 + }, + { + "epoch": 2.3068181818181817, + "grad_norm": 13.528629302978516, + "learning_rate": 2.3484848484848487e-05, + "loss": 1.5679, + "step": 1015 + }, + { + "epoch": 2.309090909090909, + "grad_norm": 9.217406272888184, + "learning_rate": 2.340909090909091e-05, + "loss": 1.7179, + "step": 1016 + }, + { + "epoch": 2.3113636363636365, + "grad_norm": 13.768959999084473, + "learning_rate": 2.3333333333333336e-05, + "loss": 2.1235, + "step": 1017 + }, + { + "epoch": 2.3136363636363635, + "grad_norm": 9.60761833190918, + "learning_rate": 2.325757575757576e-05, + "loss": 1.3526, + "step": 1018 + }, + { + "epoch": 2.315909090909091, + "grad_norm": 10.336706161499023, + "learning_rate": 2.318181818181818e-05, + "loss": 1.3543, + "step": 1019 + }, + { + "epoch": 2.3181818181818183, + "grad_norm": 11.636757850646973, + "learning_rate": 2.3106060606060605e-05, + "loss": 1.8026, + "step": 1020 + }, + { + "epoch": 2.3204545454545453, + "grad_norm": 10.546634674072266, + "learning_rate": 2.3030303030303034e-05, + "loss": 1.9753, + "step": 1021 + }, + { + "epoch": 2.3227272727272728, + "grad_norm": 13.629782676696777, + "learning_rate": 2.2954545454545457e-05, + "loss": 1.6927, + "step": 1022 + }, + { + "epoch": 2.325, + "grad_norm": 13.1149263381958, + "learning_rate": 2.287878787878788e-05, + "loss": 1.4331, + "step": 1023 + }, + { + "epoch": 2.327272727272727, + "grad_norm": 10.624835968017578, + "learning_rate": 2.2803030303030303e-05, + "loss": 1.4769, + "step": 1024 + }, + { + "epoch": 2.3295454545454546, + "grad_norm": 13.692902565002441, + "learning_rate": 2.272727272727273e-05, + "loss": 2.7543, + "step": 1025 + }, + { + "epoch": 2.331818181818182, + "grad_norm": 10.054675102233887, + "learning_rate": 2.2651515151515152e-05, + "loss": 1.2323, + "step": 1026 + }, + { + "epoch": 2.334090909090909, + "grad_norm": 14.394067764282227, + "learning_rate": 2.257575757575758e-05, + "loss": 2.094, + "step": 1027 + }, + { + "epoch": 2.3363636363636364, + "grad_norm": 10.581347465515137, + "learning_rate": 2.25e-05, + "loss": 2.2432, + "step": 1028 + }, + { + "epoch": 2.338636363636364, + "grad_norm": 9.492446899414062, + "learning_rate": 2.2424242424242424e-05, + "loss": 1.3964, + "step": 1029 + }, + { + "epoch": 2.340909090909091, + "grad_norm": 10.887022972106934, + "learning_rate": 2.234848484848485e-05, + "loss": 2.0411, + "step": 1030 + }, + { + "epoch": 2.3431818181818183, + "grad_norm": 13.539667129516602, + "learning_rate": 2.2272727272727274e-05, + "loss": 1.3067, + "step": 1031 + }, + { + "epoch": 2.3454545454545457, + "grad_norm": 9.191630363464355, + "learning_rate": 2.21969696969697e-05, + "loss": 1.266, + "step": 1032 + }, + { + "epoch": 2.3477272727272727, + "grad_norm": 8.683979034423828, + "learning_rate": 2.2121212121212123e-05, + "loss": 0.8044, + "step": 1033 + }, + { + "epoch": 2.35, + "grad_norm": 13.170730590820312, + "learning_rate": 2.2045454545454546e-05, + "loss": 2.2811, + "step": 1034 + }, + { + "epoch": 2.3522727272727275, + "grad_norm": 11.17111873626709, + "learning_rate": 2.1969696969696972e-05, + "loss": 1.3998, + "step": 1035 + }, + { + "epoch": 2.3545454545454545, + "grad_norm": 11.230095863342285, + "learning_rate": 2.1893939393939395e-05, + "loss": 2.0224, + "step": 1036 + }, + { + "epoch": 2.356818181818182, + "grad_norm": 11.912615776062012, + "learning_rate": 2.1818181818181818e-05, + "loss": 1.5619, + "step": 1037 + }, + { + "epoch": 2.359090909090909, + "grad_norm": 10.748661994934082, + "learning_rate": 2.1742424242424244e-05, + "loss": 1.924, + "step": 1038 + }, + { + "epoch": 2.3613636363636363, + "grad_norm": 9.370635032653809, + "learning_rate": 2.1666666666666667e-05, + "loss": 1.1797, + "step": 1039 + }, + { + "epoch": 2.3636363636363638, + "grad_norm": 10.01646900177002, + "learning_rate": 2.1590909090909093e-05, + "loss": 2.1678, + "step": 1040 + }, + { + "epoch": 2.3659090909090907, + "grad_norm": 9.345016479492188, + "learning_rate": 2.1515151515151516e-05, + "loss": 1.4512, + "step": 1041 + }, + { + "epoch": 2.368181818181818, + "grad_norm": 11.185441970825195, + "learning_rate": 2.143939393939394e-05, + "loss": 1.5958, + "step": 1042 + }, + { + "epoch": 2.3704545454545456, + "grad_norm": 10.186037063598633, + "learning_rate": 2.1363636363636362e-05, + "loss": 0.8744, + "step": 1043 + }, + { + "epoch": 2.3727272727272726, + "grad_norm": 16.676177978515625, + "learning_rate": 2.128787878787879e-05, + "loss": 2.0851, + "step": 1044 + }, + { + "epoch": 2.375, + "grad_norm": 12.497913360595703, + "learning_rate": 2.1212121212121215e-05, + "loss": 1.4765, + "step": 1045 + }, + { + "epoch": 2.3772727272727274, + "grad_norm": 7.271422386169434, + "learning_rate": 2.1136363636363638e-05, + "loss": 1.0424, + "step": 1046 + }, + { + "epoch": 2.3795454545454544, + "grad_norm": 14.968780517578125, + "learning_rate": 2.106060606060606e-05, + "loss": 2.1247, + "step": 1047 + }, + { + "epoch": 2.381818181818182, + "grad_norm": 11.1759672164917, + "learning_rate": 2.0984848484848483e-05, + "loss": 1.5037, + "step": 1048 + }, + { + "epoch": 2.3840909090909093, + "grad_norm": 9.880687713623047, + "learning_rate": 2.090909090909091e-05, + "loss": 0.8131, + "step": 1049 + }, + { + "epoch": 2.3863636363636362, + "grad_norm": 7.559080123901367, + "learning_rate": 2.0833333333333336e-05, + "loss": 0.5826, + "step": 1050 + }, + { + "epoch": 2.3886363636363637, + "grad_norm": 14.357791900634766, + "learning_rate": 2.075757575757576e-05, + "loss": 2.0945, + "step": 1051 + }, + { + "epoch": 2.390909090909091, + "grad_norm": 11.396363258361816, + "learning_rate": 2.0681818181818182e-05, + "loss": 1.1564, + "step": 1052 + }, + { + "epoch": 2.393181818181818, + "grad_norm": 11.255867958068848, + "learning_rate": 2.0606060606060608e-05, + "loss": 2.2688, + "step": 1053 + }, + { + "epoch": 2.3954545454545455, + "grad_norm": 12.590128898620605, + "learning_rate": 2.053030303030303e-05, + "loss": 2.0123, + "step": 1054 + }, + { + "epoch": 2.3977272727272725, + "grad_norm": 8.069854736328125, + "learning_rate": 2.0454545454545457e-05, + "loss": 1.3967, + "step": 1055 + }, + { + "epoch": 2.4, + "grad_norm": 12.596185684204102, + "learning_rate": 2.037878787878788e-05, + "loss": 1.6038, + "step": 1056 + }, + { + "epoch": 2.4022727272727273, + "grad_norm": 10.432991981506348, + "learning_rate": 2.0303030303030303e-05, + "loss": 1.645, + "step": 1057 + }, + { + "epoch": 2.4045454545454543, + "grad_norm": 10.639815330505371, + "learning_rate": 2.022727272727273e-05, + "loss": 1.5334, + "step": 1058 + }, + { + "epoch": 2.4068181818181817, + "grad_norm": 8.867145538330078, + "learning_rate": 2.0151515151515152e-05, + "loss": 1.2041, + "step": 1059 + }, + { + "epoch": 2.409090909090909, + "grad_norm": 9.741902351379395, + "learning_rate": 2.0075757575757575e-05, + "loss": 1.4987, + "step": 1060 + }, + { + "epoch": 2.411363636363636, + "grad_norm": 9.907489776611328, + "learning_rate": 2e-05, + "loss": 1.299, + "step": 1061 + }, + { + "epoch": 2.4136363636363636, + "grad_norm": 8.68997859954834, + "learning_rate": 1.9924242424242425e-05, + "loss": 1.2559, + "step": 1062 + }, + { + "epoch": 2.415909090909091, + "grad_norm": 9.990528106689453, + "learning_rate": 1.984848484848485e-05, + "loss": 2.3812, + "step": 1063 + }, + { + "epoch": 2.418181818181818, + "grad_norm": 6.777112007141113, + "learning_rate": 1.9772727272727274e-05, + "loss": 1.0051, + "step": 1064 + }, + { + "epoch": 2.4204545454545454, + "grad_norm": 13.396077156066895, + "learning_rate": 1.9696969696969697e-05, + "loss": 2.4201, + "step": 1065 + }, + { + "epoch": 2.422727272727273, + "grad_norm": 13.596755981445312, + "learning_rate": 1.962121212121212e-05, + "loss": 2.0457, + "step": 1066 + }, + { + "epoch": 2.425, + "grad_norm": 10.351893424987793, + "learning_rate": 1.9545454545454546e-05, + "loss": 1.9791, + "step": 1067 + }, + { + "epoch": 2.4272727272727272, + "grad_norm": 7.505919933319092, + "learning_rate": 1.9469696969696972e-05, + "loss": 1.2944, + "step": 1068 + }, + { + "epoch": 2.4295454545454547, + "grad_norm": 10.136748313903809, + "learning_rate": 1.9393939393939395e-05, + "loss": 1.2477, + "step": 1069 + }, + { + "epoch": 2.4318181818181817, + "grad_norm": 8.979276657104492, + "learning_rate": 1.9318181818181818e-05, + "loss": 0.9829, + "step": 1070 + }, + { + "epoch": 2.434090909090909, + "grad_norm": 11.097721099853516, + "learning_rate": 1.924242424242424e-05, + "loss": 1.5509, + "step": 1071 + }, + { + "epoch": 2.4363636363636365, + "grad_norm": 10.789654731750488, + "learning_rate": 1.9166666666666667e-05, + "loss": 1.7344, + "step": 1072 + }, + { + "epoch": 2.4386363636363635, + "grad_norm": 12.25899887084961, + "learning_rate": 1.9090909090909094e-05, + "loss": 2.0121, + "step": 1073 + }, + { + "epoch": 2.440909090909091, + "grad_norm": 11.828030586242676, + "learning_rate": 1.9015151515151516e-05, + "loss": 1.7356, + "step": 1074 + }, + { + "epoch": 2.4431818181818183, + "grad_norm": 10.524036407470703, + "learning_rate": 1.893939393939394e-05, + "loss": 1.402, + "step": 1075 + }, + { + "epoch": 2.4454545454545453, + "grad_norm": 10.572868347167969, + "learning_rate": 1.8863636363636362e-05, + "loss": 1.6468, + "step": 1076 + }, + { + "epoch": 2.4477272727272728, + "grad_norm": 9.194175720214844, + "learning_rate": 1.878787878787879e-05, + "loss": 1.1557, + "step": 1077 + }, + { + "epoch": 2.45, + "grad_norm": 11.355244636535645, + "learning_rate": 1.8712121212121215e-05, + "loss": 1.7729, + "step": 1078 + }, + { + "epoch": 2.452272727272727, + "grad_norm": 10.380278587341309, + "learning_rate": 1.8636363636363638e-05, + "loss": 2.3491, + "step": 1079 + }, + { + "epoch": 2.4545454545454546, + "grad_norm": 9.57583236694336, + "learning_rate": 1.856060606060606e-05, + "loss": 1.6112, + "step": 1080 + }, + { + "epoch": 2.456818181818182, + "grad_norm": 12.973028182983398, + "learning_rate": 1.8484848484848487e-05, + "loss": 1.5272, + "step": 1081 + }, + { + "epoch": 2.459090909090909, + "grad_norm": 9.473404884338379, + "learning_rate": 1.840909090909091e-05, + "loss": 1.2366, + "step": 1082 + }, + { + "epoch": 2.4613636363636364, + "grad_norm": 9.843785285949707, + "learning_rate": 1.8333333333333333e-05, + "loss": 1.6283, + "step": 1083 + }, + { + "epoch": 2.463636363636364, + "grad_norm": 13.467684745788574, + "learning_rate": 1.825757575757576e-05, + "loss": 1.5219, + "step": 1084 + }, + { + "epoch": 2.465909090909091, + "grad_norm": 8.460468292236328, + "learning_rate": 1.8181818181818182e-05, + "loss": 0.8931, + "step": 1085 + }, + { + "epoch": 2.4681818181818183, + "grad_norm": 8.956411361694336, + "learning_rate": 1.810606060606061e-05, + "loss": 1.1577, + "step": 1086 + }, + { + "epoch": 2.4704545454545457, + "grad_norm": 10.919206619262695, + "learning_rate": 1.803030303030303e-05, + "loss": 1.719, + "step": 1087 + }, + { + "epoch": 2.4727272727272727, + "grad_norm": 10.65345287322998, + "learning_rate": 1.7954545454545454e-05, + "loss": 1.5257, + "step": 1088 + }, + { + "epoch": 2.475, + "grad_norm": 9.616610527038574, + "learning_rate": 1.787878787878788e-05, + "loss": 1.4704, + "step": 1089 + }, + { + "epoch": 2.4772727272727275, + "grad_norm": 14.458331108093262, + "learning_rate": 1.7803030303030303e-05, + "loss": 1.4181, + "step": 1090 + }, + { + "epoch": 2.4795454545454545, + "grad_norm": 8.37006664276123, + "learning_rate": 1.772727272727273e-05, + "loss": 1.191, + "step": 1091 + }, + { + "epoch": 2.481818181818182, + "grad_norm": 13.129170417785645, + "learning_rate": 1.7651515151515153e-05, + "loss": 1.9966, + "step": 1092 + }, + { + "epoch": 2.484090909090909, + "grad_norm": 12.65162181854248, + "learning_rate": 1.7575757575757576e-05, + "loss": 1.7372, + "step": 1093 + }, + { + "epoch": 2.4863636363636363, + "grad_norm": 12.132272720336914, + "learning_rate": 1.75e-05, + "loss": 1.9386, + "step": 1094 + }, + { + "epoch": 2.4886363636363638, + "grad_norm": 11.549707412719727, + "learning_rate": 1.7424242424242425e-05, + "loss": 1.2838, + "step": 1095 + }, + { + "epoch": 2.4909090909090907, + "grad_norm": 10.115202903747559, + "learning_rate": 1.734848484848485e-05, + "loss": 1.7778, + "step": 1096 + }, + { + "epoch": 2.493181818181818, + "grad_norm": 14.97376823425293, + "learning_rate": 1.7272727272727274e-05, + "loss": 2.5436, + "step": 1097 + }, + { + "epoch": 2.4954545454545456, + "grad_norm": 10.270051956176758, + "learning_rate": 1.7196969696969697e-05, + "loss": 1.3943, + "step": 1098 + }, + { + "epoch": 2.4977272727272726, + "grad_norm": 11.584896087646484, + "learning_rate": 1.712121212121212e-05, + "loss": 1.8023, + "step": 1099 + }, + { + "epoch": 2.5, + "grad_norm": 11.003795623779297, + "learning_rate": 1.7045454545454546e-05, + "loss": 1.2057, + "step": 1100 + }, + { + "epoch": 2.5022727272727274, + "grad_norm": 10.495930671691895, + "learning_rate": 1.6969696969696972e-05, + "loss": 1.7265, + "step": 1101 + }, + { + "epoch": 2.5045454545454544, + "grad_norm": 10.6824951171875, + "learning_rate": 1.6893939393939395e-05, + "loss": 1.4241, + "step": 1102 + }, + { + "epoch": 2.506818181818182, + "grad_norm": 10.532041549682617, + "learning_rate": 1.6818181818181818e-05, + "loss": 1.4532, + "step": 1103 + }, + { + "epoch": 2.509090909090909, + "grad_norm": 8.671700477600098, + "learning_rate": 1.674242424242424e-05, + "loss": 1.2539, + "step": 1104 + }, + { + "epoch": 2.5113636363636362, + "grad_norm": 14.828866004943848, + "learning_rate": 1.6666666666666667e-05, + "loss": 1.4732, + "step": 1105 + }, + { + "epoch": 2.5136363636363637, + "grad_norm": 11.871790885925293, + "learning_rate": 1.6590909090909094e-05, + "loss": 1.7559, + "step": 1106 + }, + { + "epoch": 2.5159090909090907, + "grad_norm": 9.144551277160645, + "learning_rate": 1.6515151515151517e-05, + "loss": 1.3562, + "step": 1107 + }, + { + "epoch": 2.518181818181818, + "grad_norm": 9.856282234191895, + "learning_rate": 1.643939393939394e-05, + "loss": 1.4721, + "step": 1108 + }, + { + "epoch": 2.5204545454545455, + "grad_norm": 8.48530101776123, + "learning_rate": 1.6363636363636366e-05, + "loss": 1.0045, + "step": 1109 + }, + { + "epoch": 2.5227272727272725, + "grad_norm": 16.73642349243164, + "learning_rate": 1.628787878787879e-05, + "loss": 2.4458, + "step": 1110 + }, + { + "epoch": 2.525, + "grad_norm": 10.180378913879395, + "learning_rate": 1.6212121212121212e-05, + "loss": 1.3323, + "step": 1111 + }, + { + "epoch": 2.5272727272727273, + "grad_norm": 11.56425666809082, + "learning_rate": 1.6136363636363638e-05, + "loss": 2.0303, + "step": 1112 + }, + { + "epoch": 2.5295454545454543, + "grad_norm": 14.644630432128906, + "learning_rate": 1.606060606060606e-05, + "loss": 1.9247, + "step": 1113 + }, + { + "epoch": 2.5318181818181817, + "grad_norm": 11.767682075500488, + "learning_rate": 1.5984848484848487e-05, + "loss": 1.7903, + "step": 1114 + }, + { + "epoch": 2.534090909090909, + "grad_norm": 11.074971199035645, + "learning_rate": 1.590909090909091e-05, + "loss": 2.0781, + "step": 1115 + }, + { + "epoch": 2.536363636363636, + "grad_norm": 13.846643447875977, + "learning_rate": 1.5833333333333333e-05, + "loss": 1.2449, + "step": 1116 + }, + { + "epoch": 2.5386363636363636, + "grad_norm": 12.496777534484863, + "learning_rate": 1.5757575757575756e-05, + "loss": 1.287, + "step": 1117 + }, + { + "epoch": 2.540909090909091, + "grad_norm": 8.406025886535645, + "learning_rate": 1.5681818181818182e-05, + "loss": 1.4133, + "step": 1118 + }, + { + "epoch": 2.543181818181818, + "grad_norm": 9.715517044067383, + "learning_rate": 1.560606060606061e-05, + "loss": 1.6738, + "step": 1119 + }, + { + "epoch": 2.5454545454545454, + "grad_norm": 14.14928913116455, + "learning_rate": 1.553030303030303e-05, + "loss": 1.9505, + "step": 1120 + }, + { + "epoch": 2.547727272727273, + "grad_norm": 10.110836029052734, + "learning_rate": 1.5454545454545454e-05, + "loss": 1.4759, + "step": 1121 + }, + { + "epoch": 2.55, + "grad_norm": 15.94524097442627, + "learning_rate": 1.5378787878787877e-05, + "loss": 1.7516, + "step": 1122 + }, + { + "epoch": 2.5522727272727272, + "grad_norm": 16.20330047607422, + "learning_rate": 1.5303030303030304e-05, + "loss": 2.1093, + "step": 1123 + }, + { + "epoch": 2.5545454545454547, + "grad_norm": 8.647255897521973, + "learning_rate": 1.5227272727272728e-05, + "loss": 1.0308, + "step": 1124 + }, + { + "epoch": 2.5568181818181817, + "grad_norm": 8.955947875976562, + "learning_rate": 1.5151515151515153e-05, + "loss": 1.0129, + "step": 1125 + }, + { + "epoch": 2.559090909090909, + "grad_norm": 12.877582550048828, + "learning_rate": 1.5075757575757576e-05, + "loss": 1.4853, + "step": 1126 + }, + { + "epoch": 2.5613636363636365, + "grad_norm": 14.299208641052246, + "learning_rate": 1.5e-05, + "loss": 2.0464, + "step": 1127 + }, + { + "epoch": 2.5636363636363635, + "grad_norm": 14.365765571594238, + "learning_rate": 1.4924242424242423e-05, + "loss": 1.9381, + "step": 1128 + }, + { + "epoch": 2.565909090909091, + "grad_norm": 10.231593132019043, + "learning_rate": 1.484848484848485e-05, + "loss": 1.6777, + "step": 1129 + }, + { + "epoch": 2.5681818181818183, + "grad_norm": 14.259530067443848, + "learning_rate": 1.4772727272727274e-05, + "loss": 1.6438, + "step": 1130 + }, + { + "epoch": 2.5704545454545453, + "grad_norm": 13.114981651306152, + "learning_rate": 1.4696969696969697e-05, + "loss": 1.3336, + "step": 1131 + }, + { + "epoch": 2.5727272727272728, + "grad_norm": 9.463297843933105, + "learning_rate": 1.4621212121212122e-05, + "loss": 1.203, + "step": 1132 + }, + { + "epoch": 2.575, + "grad_norm": 9.805520057678223, + "learning_rate": 1.4545454545454545e-05, + "loss": 1.2487, + "step": 1133 + }, + { + "epoch": 2.577272727272727, + "grad_norm": 14.853455543518066, + "learning_rate": 1.446969696969697e-05, + "loss": 1.5734, + "step": 1134 + }, + { + "epoch": 2.5795454545454546, + "grad_norm": 11.86341381072998, + "learning_rate": 1.4393939393939396e-05, + "loss": 1.4835, + "step": 1135 + }, + { + "epoch": 2.581818181818182, + "grad_norm": 11.581096649169922, + "learning_rate": 1.431818181818182e-05, + "loss": 2.0558, + "step": 1136 + }, + { + "epoch": 2.584090909090909, + "grad_norm": 12.040521621704102, + "learning_rate": 1.4242424242424243e-05, + "loss": 1.4117, + "step": 1137 + }, + { + "epoch": 2.5863636363636364, + "grad_norm": 13.00901985168457, + "learning_rate": 1.4166666666666668e-05, + "loss": 2.9511, + "step": 1138 + }, + { + "epoch": 2.588636363636364, + "grad_norm": 9.332910537719727, + "learning_rate": 1.409090909090909e-05, + "loss": 1.1121, + "step": 1139 + }, + { + "epoch": 2.590909090909091, + "grad_norm": 10.607443809509277, + "learning_rate": 1.4015151515151515e-05, + "loss": 1.4706, + "step": 1140 + }, + { + "epoch": 2.5931818181818183, + "grad_norm": 9.47099494934082, + "learning_rate": 1.3939393939393942e-05, + "loss": 1.6907, + "step": 1141 + }, + { + "epoch": 2.5954545454545457, + "grad_norm": 12.868734359741211, + "learning_rate": 1.3863636363636364e-05, + "loss": 1.334, + "step": 1142 + }, + { + "epoch": 2.5977272727272727, + "grad_norm": 7.338480472564697, + "learning_rate": 1.3787878787878789e-05, + "loss": 0.6364, + "step": 1143 + }, + { + "epoch": 2.6, + "grad_norm": 10.434823989868164, + "learning_rate": 1.3712121212121212e-05, + "loss": 1.7292, + "step": 1144 + }, + { + "epoch": 2.6022727272727275, + "grad_norm": 10.510713577270508, + "learning_rate": 1.3636363636363637e-05, + "loss": 1.555, + "step": 1145 + }, + { + "epoch": 2.6045454545454545, + "grad_norm": 11.927501678466797, + "learning_rate": 1.3560606060606063e-05, + "loss": 1.7373, + "step": 1146 + }, + { + "epoch": 2.606818181818182, + "grad_norm": 8.673569679260254, + "learning_rate": 1.3484848484848486e-05, + "loss": 1.3046, + "step": 1147 + }, + { + "epoch": 2.6090909090909093, + "grad_norm": 9.680171012878418, + "learning_rate": 1.340909090909091e-05, + "loss": 1.2691, + "step": 1148 + }, + { + "epoch": 2.6113636363636363, + "grad_norm": 20.66661834716797, + "learning_rate": 1.3333333333333333e-05, + "loss": 3.1138, + "step": 1149 + }, + { + "epoch": 2.6136363636363638, + "grad_norm": 59.59333801269531, + "learning_rate": 1.3257575757575758e-05, + "loss": 1.8486, + "step": 1150 + }, + { + "epoch": 2.615909090909091, + "grad_norm": 9.416550636291504, + "learning_rate": 1.318181818181818e-05, + "loss": 1.198, + "step": 1151 + }, + { + "epoch": 2.618181818181818, + "grad_norm": 11.847350120544434, + "learning_rate": 1.3106060606060607e-05, + "loss": 1.494, + "step": 1152 + }, + { + "epoch": 2.6204545454545456, + "grad_norm": 8.2369966506958, + "learning_rate": 1.3030303030303032e-05, + "loss": 0.8885, + "step": 1153 + }, + { + "epoch": 2.6227272727272726, + "grad_norm": 13.204099655151367, + "learning_rate": 1.2954545454545455e-05, + "loss": 1.9838, + "step": 1154 + }, + { + "epoch": 2.625, + "grad_norm": 11.384471893310547, + "learning_rate": 1.287878787878788e-05, + "loss": 1.5648, + "step": 1155 + }, + { + "epoch": 2.6272727272727274, + "grad_norm": 43.95447540283203, + "learning_rate": 1.2803030303030302e-05, + "loss": 1.6246, + "step": 1156 + }, + { + "epoch": 2.6295454545454544, + "grad_norm": 12.041752815246582, + "learning_rate": 1.2727272727272727e-05, + "loss": 1.6404, + "step": 1157 + }, + { + "epoch": 2.631818181818182, + "grad_norm": 13.470951080322266, + "learning_rate": 1.2651515151515153e-05, + "loss": 2.1278, + "step": 1158 + }, + { + "epoch": 2.634090909090909, + "grad_norm": 12.769510269165039, + "learning_rate": 1.2575757575757578e-05, + "loss": 1.6486, + "step": 1159 + }, + { + "epoch": 2.6363636363636362, + "grad_norm": 9.455702781677246, + "learning_rate": 1.25e-05, + "loss": 1.5211, + "step": 1160 + }, + { + "epoch": 2.6386363636363637, + "grad_norm": 13.590509414672852, + "learning_rate": 1.2424242424242424e-05, + "loss": 2.081, + "step": 1161 + }, + { + "epoch": 2.6409090909090907, + "grad_norm": 12.029936790466309, + "learning_rate": 1.234848484848485e-05, + "loss": 1.6036, + "step": 1162 + }, + { + "epoch": 2.643181818181818, + "grad_norm": 65.75121307373047, + "learning_rate": 1.2272727272727273e-05, + "loss": 1.5853, + "step": 1163 + }, + { + "epoch": 2.6454545454545455, + "grad_norm": 13.093693733215332, + "learning_rate": 1.2196969696969697e-05, + "loss": 1.4623, + "step": 1164 + }, + { + "epoch": 2.6477272727272725, + "grad_norm": 14.704643249511719, + "learning_rate": 1.2121212121212122e-05, + "loss": 1.7431, + "step": 1165 + }, + { + "epoch": 2.65, + "grad_norm": 10.710149765014648, + "learning_rate": 1.2045454545454547e-05, + "loss": 1.6442, + "step": 1166 + }, + { + "epoch": 2.6522727272727273, + "grad_norm": 12.05364990234375, + "learning_rate": 1.196969696969697e-05, + "loss": 2.0733, + "step": 1167 + }, + { + "epoch": 2.6545454545454543, + "grad_norm": 12.834985733032227, + "learning_rate": 1.1893939393939394e-05, + "loss": 2.8648, + "step": 1168 + }, + { + "epoch": 2.6568181818181817, + "grad_norm": 9.302035331726074, + "learning_rate": 1.1818181818181819e-05, + "loss": 1.1539, + "step": 1169 + }, + { + "epoch": 2.659090909090909, + "grad_norm": 9.240340232849121, + "learning_rate": 1.1742424242424243e-05, + "loss": 1.5434, + "step": 1170 + }, + { + "epoch": 2.661363636363636, + "grad_norm": 14.066667556762695, + "learning_rate": 1.1666666666666668e-05, + "loss": 1.7866, + "step": 1171 + }, + { + "epoch": 2.6636363636363636, + "grad_norm": 10.935914039611816, + "learning_rate": 1.159090909090909e-05, + "loss": 1.4766, + "step": 1172 + }, + { + "epoch": 2.665909090909091, + "grad_norm": 8.409308433532715, + "learning_rate": 1.1515151515151517e-05, + "loss": 1.3846, + "step": 1173 + }, + { + "epoch": 2.668181818181818, + "grad_norm": 10.203055381774902, + "learning_rate": 1.143939393939394e-05, + "loss": 1.1693, + "step": 1174 + }, + { + "epoch": 2.6704545454545454, + "grad_norm": 11.417679786682129, + "learning_rate": 1.1363636363636365e-05, + "loss": 1.9941, + "step": 1175 + }, + { + "epoch": 2.672727272727273, + "grad_norm": 13.196696281433105, + "learning_rate": 1.128787878787879e-05, + "loss": 1.8474, + "step": 1176 + }, + { + "epoch": 2.675, + "grad_norm": 11.088204383850098, + "learning_rate": 1.1212121212121212e-05, + "loss": 1.7153, + "step": 1177 + }, + { + "epoch": 2.6772727272727272, + "grad_norm": 12.048771858215332, + "learning_rate": 1.1136363636363637e-05, + "loss": 2.5212, + "step": 1178 + }, + { + "epoch": 2.6795454545454547, + "grad_norm": 13.929719924926758, + "learning_rate": 1.1060606060606061e-05, + "loss": 2.3728, + "step": 1179 + }, + { + "epoch": 2.6818181818181817, + "grad_norm": 10.445011138916016, + "learning_rate": 1.0984848484848486e-05, + "loss": 0.9737, + "step": 1180 + }, + { + "epoch": 2.684090909090909, + "grad_norm": 14.0521821975708, + "learning_rate": 1.0909090909090909e-05, + "loss": 1.6476, + "step": 1181 + }, + { + "epoch": 2.6863636363636365, + "grad_norm": 10.526323318481445, + "learning_rate": 1.0833333333333334e-05, + "loss": 1.4206, + "step": 1182 + }, + { + "epoch": 2.6886363636363635, + "grad_norm": 11.84065055847168, + "learning_rate": 1.0757575757575758e-05, + "loss": 2.5504, + "step": 1183 + }, + { + "epoch": 2.690909090909091, + "grad_norm": 13.432804107666016, + "learning_rate": 1.0681818181818181e-05, + "loss": 1.1723, + "step": 1184 + }, + { + "epoch": 2.6931818181818183, + "grad_norm": 10.570472717285156, + "learning_rate": 1.0606060606060607e-05, + "loss": 1.3094, + "step": 1185 + }, + { + "epoch": 2.6954545454545453, + "grad_norm": 9.313067436218262, + "learning_rate": 1.053030303030303e-05, + "loss": 1.3848, + "step": 1186 + }, + { + "epoch": 2.6977272727272728, + "grad_norm": 12.77459716796875, + "learning_rate": 1.0454545454545455e-05, + "loss": 1.9546, + "step": 1187 + }, + { + "epoch": 2.7, + "grad_norm": 12.23890495300293, + "learning_rate": 1.037878787878788e-05, + "loss": 1.858, + "step": 1188 + }, + { + "epoch": 2.702272727272727, + "grad_norm": 10.90783977508545, + "learning_rate": 1.0303030303030304e-05, + "loss": 1.7215, + "step": 1189 + }, + { + "epoch": 2.7045454545454546, + "grad_norm": 11.610969543457031, + "learning_rate": 1.0227272727272729e-05, + "loss": 1.3744, + "step": 1190 + }, + { + "epoch": 2.706818181818182, + "grad_norm": 13.296714782714844, + "learning_rate": 1.0151515151515152e-05, + "loss": 1.3959, + "step": 1191 + }, + { + "epoch": 2.709090909090909, + "grad_norm": 11.602737426757812, + "learning_rate": 1.0075757575757576e-05, + "loss": 0.9706, + "step": 1192 + }, + { + "epoch": 2.7113636363636364, + "grad_norm": 8.904767036437988, + "learning_rate": 1e-05, + "loss": 1.1206, + "step": 1193 + }, + { + "epoch": 2.713636363636364, + "grad_norm": 9.719966888427734, + "learning_rate": 9.924242424242425e-06, + "loss": 1.326, + "step": 1194 + }, + { + "epoch": 2.715909090909091, + "grad_norm": 11.37736701965332, + "learning_rate": 9.848484848484848e-06, + "loss": 1.2423, + "step": 1195 + }, + { + "epoch": 2.7181818181818183, + "grad_norm": 8.89704418182373, + "learning_rate": 9.772727272727273e-06, + "loss": 1.5434, + "step": 1196 + }, + { + "epoch": 2.7204545454545457, + "grad_norm": 11.980868339538574, + "learning_rate": 9.696969696969698e-06, + "loss": 1.9285, + "step": 1197 + }, + { + "epoch": 2.7227272727272727, + "grad_norm": 20.147335052490234, + "learning_rate": 9.62121212121212e-06, + "loss": 1.9032, + "step": 1198 + }, + { + "epoch": 2.725, + "grad_norm": 12.508543014526367, + "learning_rate": 9.545454545454547e-06, + "loss": 2.549, + "step": 1199 + }, + { + "epoch": 2.7272727272727275, + "grad_norm": 15.286222457885742, + "learning_rate": 9.46969696969697e-06, + "loss": 1.7541, + "step": 1200 + }, + { + "epoch": 2.7295454545454545, + "grad_norm": 9.950079917907715, + "learning_rate": 9.393939393939394e-06, + "loss": 1.0859, + "step": 1201 + }, + { + "epoch": 2.731818181818182, + "grad_norm": 9.034377098083496, + "learning_rate": 9.318181818181819e-06, + "loss": 1.6942, + "step": 1202 + }, + { + "epoch": 2.7340909090909093, + "grad_norm": 10.347823143005371, + "learning_rate": 9.242424242424244e-06, + "loss": 0.7853, + "step": 1203 + }, + { + "epoch": 2.7363636363636363, + "grad_norm": 13.554040908813477, + "learning_rate": 9.166666666666666e-06, + "loss": 1.6867, + "step": 1204 + }, + { + "epoch": 2.7386363636363638, + "grad_norm": 12.764242172241211, + "learning_rate": 9.090909090909091e-06, + "loss": 1.7983, + "step": 1205 + }, + { + "epoch": 2.740909090909091, + "grad_norm": 13.305977821350098, + "learning_rate": 9.015151515151516e-06, + "loss": 1.5904, + "step": 1206 + }, + { + "epoch": 2.743181818181818, + "grad_norm": 16.118629455566406, + "learning_rate": 8.93939393939394e-06, + "loss": 1.593, + "step": 1207 + }, + { + "epoch": 2.7454545454545456, + "grad_norm": 9.158020973205566, + "learning_rate": 8.863636363636365e-06, + "loss": 1.2809, + "step": 1208 + }, + { + "epoch": 2.7477272727272726, + "grad_norm": 12.490316390991211, + "learning_rate": 8.787878787878788e-06, + "loss": 1.5405, + "step": 1209 + }, + { + "epoch": 2.75, + "grad_norm": 12.778218269348145, + "learning_rate": 8.712121212121212e-06, + "loss": 1.4892, + "step": 1210 + }, + { + "epoch": 2.7522727272727274, + "grad_norm": 11.4492826461792, + "learning_rate": 8.636363636363637e-06, + "loss": 1.2019, + "step": 1211 + }, + { + "epoch": 2.7545454545454544, + "grad_norm": 13.168742179870605, + "learning_rate": 8.56060606060606e-06, + "loss": 1.6647, + "step": 1212 + }, + { + "epoch": 2.756818181818182, + "grad_norm": 10.593256950378418, + "learning_rate": 8.484848484848486e-06, + "loss": 1.3455, + "step": 1213 + }, + { + "epoch": 2.759090909090909, + "grad_norm": 12.997807502746582, + "learning_rate": 8.409090909090909e-06, + "loss": 1.6967, + "step": 1214 + }, + { + "epoch": 2.7613636363636362, + "grad_norm": 16.37111473083496, + "learning_rate": 8.333333333333334e-06, + "loss": 1.7001, + "step": 1215 + }, + { + "epoch": 2.7636363636363637, + "grad_norm": 11.749297142028809, + "learning_rate": 8.257575757575758e-06, + "loss": 0.9918, + "step": 1216 + }, + { + "epoch": 2.7659090909090907, + "grad_norm": 9.196391105651855, + "learning_rate": 8.181818181818183e-06, + "loss": 1.3952, + "step": 1217 + }, + { + "epoch": 2.768181818181818, + "grad_norm": 7.304767608642578, + "learning_rate": 8.106060606060606e-06, + "loss": 0.9309, + "step": 1218 + }, + { + "epoch": 2.7704545454545455, + "grad_norm": 11.371389389038086, + "learning_rate": 8.03030303030303e-06, + "loss": 2.2034, + "step": 1219 + }, + { + "epoch": 2.7727272727272725, + "grad_norm": 10.503549575805664, + "learning_rate": 7.954545454545455e-06, + "loss": 1.0822, + "step": 1220 + }, + { + "epoch": 2.775, + "grad_norm": 11.071968078613281, + "learning_rate": 7.878787878787878e-06, + "loss": 1.7071, + "step": 1221 + }, + { + "epoch": 2.7772727272727273, + "grad_norm": 11.416297912597656, + "learning_rate": 7.803030303030304e-06, + "loss": 2.0261, + "step": 1222 + }, + { + "epoch": 2.7795454545454543, + "grad_norm": 15.829241752624512, + "learning_rate": 7.727272727272727e-06, + "loss": 2.0085, + "step": 1223 + }, + { + "epoch": 2.7818181818181817, + "grad_norm": 8.403531074523926, + "learning_rate": 7.651515151515152e-06, + "loss": 1.2764, + "step": 1224 + }, + { + "epoch": 2.784090909090909, + "grad_norm": 11.730886459350586, + "learning_rate": 7.5757575757575764e-06, + "loss": 1.6733, + "step": 1225 + }, + { + "epoch": 2.786363636363636, + "grad_norm": 13.102418899536133, + "learning_rate": 7.5e-06, + "loss": 2.139, + "step": 1226 + }, + { + "epoch": 2.7886363636363636, + "grad_norm": 14.804220199584961, + "learning_rate": 7.424242424242425e-06, + "loss": 2.1015, + "step": 1227 + }, + { + "epoch": 2.790909090909091, + "grad_norm": 11.839103698730469, + "learning_rate": 7.3484848484848486e-06, + "loss": 1.6026, + "step": 1228 + }, + { + "epoch": 2.793181818181818, + "grad_norm": 17.421327590942383, + "learning_rate": 7.272727272727272e-06, + "loss": 2.7038, + "step": 1229 + }, + { + "epoch": 2.7954545454545454, + "grad_norm": 14.81433391571045, + "learning_rate": 7.196969696969698e-06, + "loss": 1.702, + "step": 1230 + }, + { + "epoch": 2.797727272727273, + "grad_norm": 7.195108413696289, + "learning_rate": 7.1212121212121215e-06, + "loss": 0.9022, + "step": 1231 + }, + { + "epoch": 2.8, + "grad_norm": 9.045830726623535, + "learning_rate": 7.045454545454545e-06, + "loss": 1.0748, + "step": 1232 + }, + { + "epoch": 2.8022727272727272, + "grad_norm": 11.995684623718262, + "learning_rate": 6.969696969696971e-06, + "loss": 2.5776, + "step": 1233 + }, + { + "epoch": 2.8045454545454547, + "grad_norm": 10.528661727905273, + "learning_rate": 6.8939393939393945e-06, + "loss": 1.8155, + "step": 1234 + }, + { + "epoch": 2.8068181818181817, + "grad_norm": 34.72589111328125, + "learning_rate": 6.818181818181818e-06, + "loss": 2.5481, + "step": 1235 + }, + { + "epoch": 2.809090909090909, + "grad_norm": 8.032730102539062, + "learning_rate": 6.742424242424243e-06, + "loss": 0.736, + "step": 1236 + }, + { + "epoch": 2.8113636363636365, + "grad_norm": 9.088884353637695, + "learning_rate": 6.666666666666667e-06, + "loss": 1.6364, + "step": 1237 + }, + { + "epoch": 2.8136363636363635, + "grad_norm": 9.277338027954102, + "learning_rate": 6.59090909090909e-06, + "loss": 1.4521, + "step": 1238 + }, + { + "epoch": 2.815909090909091, + "grad_norm": 12.458305358886719, + "learning_rate": 6.515151515151516e-06, + "loss": 1.2296, + "step": 1239 + }, + { + "epoch": 2.8181818181818183, + "grad_norm": 10.594490051269531, + "learning_rate": 6.43939393939394e-06, + "loss": 1.414, + "step": 1240 + }, + { + "epoch": 2.8204545454545453, + "grad_norm": 10.604024887084961, + "learning_rate": 6.363636363636363e-06, + "loss": 1.7017, + "step": 1241 + }, + { + "epoch": 2.8227272727272728, + "grad_norm": 10.347737312316895, + "learning_rate": 6.287878787878789e-06, + "loss": 1.2462, + "step": 1242 + }, + { + "epoch": 2.825, + "grad_norm": 11.151006698608398, + "learning_rate": 6.212121212121212e-06, + "loss": 1.7713, + "step": 1243 + }, + { + "epoch": 2.827272727272727, + "grad_norm": 12.432381629943848, + "learning_rate": 6.136363636363636e-06, + "loss": 2.7927, + "step": 1244 + }, + { + "epoch": 2.8295454545454546, + "grad_norm": 12.030777931213379, + "learning_rate": 6.060606060606061e-06, + "loss": 2.1842, + "step": 1245 + }, + { + "epoch": 2.831818181818182, + "grad_norm": 14.940272331237793, + "learning_rate": 5.984848484848485e-06, + "loss": 1.6475, + "step": 1246 + }, + { + "epoch": 2.834090909090909, + "grad_norm": 8.027610778808594, + "learning_rate": 5.909090909090909e-06, + "loss": 0.948, + "step": 1247 + }, + { + "epoch": 2.8363636363636364, + "grad_norm": 12.356363296508789, + "learning_rate": 5.833333333333334e-06, + "loss": 1.6191, + "step": 1248 + }, + { + "epoch": 2.838636363636364, + "grad_norm": 12.225868225097656, + "learning_rate": 5.7575757575757586e-06, + "loss": 1.2056, + "step": 1249 + }, + { + "epoch": 2.840909090909091, + "grad_norm": 11.615985870361328, + "learning_rate": 5.681818181818182e-06, + "loss": 1.5477, + "step": 1250 + }, + { + "epoch": 2.8431818181818183, + "grad_norm": 13.92235279083252, + "learning_rate": 5.606060606060606e-06, + "loss": 2.401, + "step": 1251 + }, + { + "epoch": 2.8454545454545457, + "grad_norm": 19.311002731323242, + "learning_rate": 5.530303030303031e-06, + "loss": 2.2211, + "step": 1252 + }, + { + "epoch": 2.8477272727272727, + "grad_norm": 9.447689056396484, + "learning_rate": 5.4545454545454545e-06, + "loss": 1.2734, + "step": 1253 + }, + { + "epoch": 2.85, + "grad_norm": 10.197713851928711, + "learning_rate": 5.378787878787879e-06, + "loss": 0.878, + "step": 1254 + }, + { + "epoch": 2.8522727272727275, + "grad_norm": 14.826508522033691, + "learning_rate": 5.303030303030304e-06, + "loss": 1.6759, + "step": 1255 + }, + { + "epoch": 2.8545454545454545, + "grad_norm": 10.666242599487305, + "learning_rate": 5.2272727272727274e-06, + "loss": 2.1974, + "step": 1256 + }, + { + "epoch": 2.856818181818182, + "grad_norm": 13.020369529724121, + "learning_rate": 5.151515151515152e-06, + "loss": 1.4073, + "step": 1257 + }, + { + "epoch": 2.8590909090909093, + "grad_norm": 14.27531623840332, + "learning_rate": 5.075757575757576e-06, + "loss": 2.1165, + "step": 1258 + }, + { + "epoch": 2.8613636363636363, + "grad_norm": 11.82662296295166, + "learning_rate": 5e-06, + "loss": 0.7765, + "step": 1259 + }, + { + "epoch": 2.8636363636363638, + "grad_norm": 12.107914924621582, + "learning_rate": 4.924242424242424e-06, + "loss": 1.2762, + "step": 1260 + }, + { + "epoch": 2.865909090909091, + "grad_norm": 10.041885375976562, + "learning_rate": 4.848484848484849e-06, + "loss": 2.1775, + "step": 1261 + }, + { + "epoch": 2.868181818181818, + "grad_norm": 11.078441619873047, + "learning_rate": 4.772727272727273e-06, + "loss": 1.6073, + "step": 1262 + }, + { + "epoch": 2.8704545454545456, + "grad_norm": 9.000492095947266, + "learning_rate": 4.696969696969697e-06, + "loss": 1.4636, + "step": 1263 + }, + { + "epoch": 2.8727272727272726, + "grad_norm": 11.069653511047363, + "learning_rate": 4.621212121212122e-06, + "loss": 1.4654, + "step": 1264 + }, + { + "epoch": 2.875, + "grad_norm": 9.110404968261719, + "learning_rate": 4.5454545454545455e-06, + "loss": 1.8338, + "step": 1265 + }, + { + "epoch": 2.8772727272727274, + "grad_norm": 16.761194229125977, + "learning_rate": 4.46969696969697e-06, + "loss": 1.0709, + "step": 1266 + }, + { + "epoch": 2.8795454545454544, + "grad_norm": 13.67717170715332, + "learning_rate": 4.393939393939394e-06, + "loss": 2.0994, + "step": 1267 + }, + { + "epoch": 2.881818181818182, + "grad_norm": 8.258940696716309, + "learning_rate": 4.3181818181818185e-06, + "loss": 1.2818, + "step": 1268 + }, + { + "epoch": 2.884090909090909, + "grad_norm": 12.960264205932617, + "learning_rate": 4.242424242424243e-06, + "loss": 1.9218, + "step": 1269 + }, + { + "epoch": 2.8863636363636362, + "grad_norm": 10.886972427368164, + "learning_rate": 4.166666666666667e-06, + "loss": 1.4611, + "step": 1270 + }, + { + "epoch": 2.8886363636363637, + "grad_norm": 10.516489028930664, + "learning_rate": 4.0909090909090915e-06, + "loss": 2.3418, + "step": 1271 + }, + { + "epoch": 2.8909090909090907, + "grad_norm": 12.977254867553711, + "learning_rate": 4.015151515151515e-06, + "loss": 1.5361, + "step": 1272 + }, + { + "epoch": 2.893181818181818, + "grad_norm": 14.605803489685059, + "learning_rate": 3.939393939393939e-06, + "loss": 1.6679, + "step": 1273 + }, + { + "epoch": 2.8954545454545455, + "grad_norm": 17.729450225830078, + "learning_rate": 3.863636363636364e-06, + "loss": 1.468, + "step": 1274 + }, + { + "epoch": 2.8977272727272725, + "grad_norm": 10.65392780303955, + "learning_rate": 3.7878787878787882e-06, + "loss": 1.8606, + "step": 1275 + }, + { + "epoch": 2.9, + "grad_norm": 18.738691329956055, + "learning_rate": 3.7121212121212124e-06, + "loss": 2.7391, + "step": 1276 + }, + { + "epoch": 2.9022727272727273, + "grad_norm": 11.129204750061035, + "learning_rate": 3.636363636363636e-06, + "loss": 1.4911, + "step": 1277 + }, + { + "epoch": 2.9045454545454543, + "grad_norm": 10.117977142333984, + "learning_rate": 3.5606060606060608e-06, + "loss": 1.0915, + "step": 1278 + }, + { + "epoch": 2.9068181818181817, + "grad_norm": 9.391002655029297, + "learning_rate": 3.4848484848484854e-06, + "loss": 1.1659, + "step": 1279 + }, + { + "epoch": 2.909090909090909, + "grad_norm": 10.86440372467041, + "learning_rate": 3.409090909090909e-06, + "loss": 1.4967, + "step": 1280 + }, + { + "epoch": 2.911363636363636, + "grad_norm": 11.438384056091309, + "learning_rate": 3.3333333333333333e-06, + "loss": 1.6597, + "step": 1281 + }, + { + "epoch": 2.9136363636363636, + "grad_norm": 13.486211776733398, + "learning_rate": 3.257575757575758e-06, + "loss": 1.947, + "step": 1282 + }, + { + "epoch": 2.915909090909091, + "grad_norm": 13.491000175476074, + "learning_rate": 3.1818181818181817e-06, + "loss": 2.4163, + "step": 1283 + }, + { + "epoch": 2.918181818181818, + "grad_norm": 10.710677146911621, + "learning_rate": 3.106060606060606e-06, + "loss": 1.8073, + "step": 1284 + }, + { + "epoch": 2.9204545454545454, + "grad_norm": 12.062322616577148, + "learning_rate": 3.0303030303030305e-06, + "loss": 1.9969, + "step": 1285 + }, + { + "epoch": 2.922727272727273, + "grad_norm": 70.31402587890625, + "learning_rate": 2.9545454545454547e-06, + "loss": 1.3767, + "step": 1286 + }, + { + "epoch": 2.925, + "grad_norm": 9.519462585449219, + "learning_rate": 2.8787878787878793e-06, + "loss": 1.4795, + "step": 1287 + }, + { + "epoch": 2.9272727272727272, + "grad_norm": 13.316557884216309, + "learning_rate": 2.803030303030303e-06, + "loss": 0.858, + "step": 1288 + }, + { + "epoch": 2.9295454545454547, + "grad_norm": 11.898123741149902, + "learning_rate": 2.7272727272727272e-06, + "loss": 1.7807, + "step": 1289 + }, + { + "epoch": 2.9318181818181817, + "grad_norm": 13.429510116577148, + "learning_rate": 2.651515151515152e-06, + "loss": 1.7467, + "step": 1290 + }, + { + "epoch": 2.934090909090909, + "grad_norm": 34.4333381652832, + "learning_rate": 2.575757575757576e-06, + "loss": 1.6774, + "step": 1291 + }, + { + "epoch": 2.9363636363636365, + "grad_norm": 8.44999885559082, + "learning_rate": 2.5e-06, + "loss": 0.8595, + "step": 1292 + }, + { + "epoch": 2.9386363636363635, + "grad_norm": 9.824548721313477, + "learning_rate": 2.4242424242424244e-06, + "loss": 1.551, + "step": 1293 + }, + { + "epoch": 2.940909090909091, + "grad_norm": 10.713866233825684, + "learning_rate": 2.3484848484848486e-06, + "loss": 1.4604, + "step": 1294 + }, + { + "epoch": 2.9431818181818183, + "grad_norm": 18.695775985717773, + "learning_rate": 2.2727272727272728e-06, + "loss": 2.8512, + "step": 1295 + }, + { + "epoch": 2.9454545454545453, + "grad_norm": 9.289727210998535, + "learning_rate": 2.196969696969697e-06, + "loss": 1.3539, + "step": 1296 + }, + { + "epoch": 2.9477272727272728, + "grad_norm": 7.917882442474365, + "learning_rate": 2.1212121212121216e-06, + "loss": 1.2179, + "step": 1297 + }, + { + "epoch": 2.95, + "grad_norm": 16.269927978515625, + "learning_rate": 2.0454545454545457e-06, + "loss": 1.8904, + "step": 1298 + }, + { + "epoch": 2.952272727272727, + "grad_norm": 11.293408393859863, + "learning_rate": 1.9696969696969695e-06, + "loss": 1.4438, + "step": 1299 + }, + { + "epoch": 2.9545454545454546, + "grad_norm": 14.2405424118042, + "learning_rate": 1.8939393939393941e-06, + "loss": 2.2578, + "step": 1300 + }, + { + "epoch": 2.956818181818182, + "grad_norm": 9.712430953979492, + "learning_rate": 1.818181818181818e-06, + "loss": 1.1685, + "step": 1301 + }, + { + "epoch": 2.959090909090909, + "grad_norm": 14.34041690826416, + "learning_rate": 1.7424242424242427e-06, + "loss": 1.9741, + "step": 1302 + }, + { + "epoch": 2.9613636363636364, + "grad_norm": 12.20971965789795, + "learning_rate": 1.6666666666666667e-06, + "loss": 2.283, + "step": 1303 + }, + { + "epoch": 2.963636363636364, + "grad_norm": 13.051138877868652, + "learning_rate": 1.5909090909090908e-06, + "loss": 2.3128, + "step": 1304 + }, + { + "epoch": 2.965909090909091, + "grad_norm": 11.069129943847656, + "learning_rate": 1.5151515151515152e-06, + "loss": 1.4379, + "step": 1305 + }, + { + "epoch": 2.9681818181818183, + "grad_norm": 10.655563354492188, + "learning_rate": 1.4393939393939396e-06, + "loss": 1.4726, + "step": 1306 + }, + { + "epoch": 2.9704545454545457, + "grad_norm": 9.674460411071777, + "learning_rate": 1.3636363636363636e-06, + "loss": 1.2689, + "step": 1307 + }, + { + "epoch": 2.9727272727272727, + "grad_norm": 10.24626636505127, + "learning_rate": 1.287878787878788e-06, + "loss": 1.2585, + "step": 1308 + }, + { + "epoch": 2.975, + "grad_norm": 13.117413520812988, + "learning_rate": 1.2121212121212122e-06, + "loss": 1.8019, + "step": 1309 + }, + { + "epoch": 2.9772727272727275, + "grad_norm": 11.649164199829102, + "learning_rate": 1.1363636363636364e-06, + "loss": 1.375, + "step": 1310 + }, + { + "epoch": 2.9795454545454545, + "grad_norm": 11.054950714111328, + "learning_rate": 1.0606060606060608e-06, + "loss": 1.7139, + "step": 1311 + }, + { + "epoch": 2.981818181818182, + "grad_norm": 9.476350784301758, + "learning_rate": 9.848484848484847e-07, + "loss": 1.1851, + "step": 1312 + }, + { + "epoch": 2.9840909090909093, + "grad_norm": 9.467584609985352, + "learning_rate": 9.09090909090909e-07, + "loss": 1.0272, + "step": 1313 + }, + { + "epoch": 2.9863636363636363, + "grad_norm": 11.783283233642578, + "learning_rate": 8.333333333333333e-07, + "loss": 1.886, + "step": 1314 + }, + { + "epoch": 2.9886363636363638, + "grad_norm": 11.245438575744629, + "learning_rate": 7.575757575757576e-07, + "loss": 1.2872, + "step": 1315 + }, + { + "epoch": 2.990909090909091, + "grad_norm": 12.71106243133545, + "learning_rate": 6.818181818181818e-07, + "loss": 1.3681, + "step": 1316 + }, + { + "epoch": 2.993181818181818, + "grad_norm": 11.738058090209961, + "learning_rate": 6.060606060606061e-07, + "loss": 1.9274, + "step": 1317 + }, + { + "epoch": 2.9954545454545456, + "grad_norm": 12.179485321044922, + "learning_rate": 5.303030303030304e-07, + "loss": 1.6056, + "step": 1318 + }, + { + "epoch": 2.9977272727272726, + "grad_norm": 9.123523712158203, + "learning_rate": 4.545454545454545e-07, + "loss": 1.2402, + "step": 1319 + }, + { + "epoch": 3.0, + "grad_norm": 17.10702133178711, + "learning_rate": 3.787878787878788e-07, + "loss": 1.7438, + "step": 1320 + }, + { + "epoch": 3.0, + "eval_f1": 0.8924, + "eval_gen_len": 41.8818, + "eval_loss": 1.7954092025756836, + "eval_precision": 0.8906, + "eval_recall": 0.8943, + "eval_rouge1": 0.4651, + "eval_rouge2": 0.218, + "eval_rougeL": 0.3904, + "eval_rougeLsum": 0.4291, + "eval_runtime": 28.6293, + "eval_samples_per_second": 3.842, + "eval_steps_per_second": 0.489, + "step": 1320 + }, + { + "epoch": 3.0, + "step": 1320, + "total_flos": 2659801069854720.0, + "train_loss": 1.8849294849868976, + "train_runtime": 574.0732, + "train_samples_per_second": 4.593, + "train_steps_per_second": 2.299 + } + ], + "logging_steps": 1, + "max_steps": 1320, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 2659801069854720.0, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +}