{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.0, "eval_steps": 37, "global_step": 290, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.006896551724137931, "grad_norm": 2.651252269744873, "learning_rate": 7.5e-07, "loss": 4.7891, "step": 1 }, { "epoch": 0.006896551724137931, "eval_loss": 3.3850624561309814, "eval_runtime": 4.5511, "eval_samples_per_second": 1.758, "eval_steps_per_second": 1.758, "step": 1 }, { "epoch": 0.013793103448275862, "grad_norm": 2.064390182495117, "learning_rate": 1.5e-06, "loss": 2.9792, "step": 2 }, { "epoch": 0.020689655172413793, "grad_norm": 1.254639983177185, "learning_rate": 2.25e-06, "loss": 3.0642, "step": 3 }, { "epoch": 0.027586206896551724, "grad_norm": 1.5396596193313599, "learning_rate": 3e-06, "loss": 3.249, "step": 4 }, { "epoch": 0.034482758620689655, "grad_norm": 2.1187007427215576, "learning_rate": 3.75e-06, "loss": 3.4551, "step": 5 }, { "epoch": 0.041379310344827586, "grad_norm": 3.609483003616333, "learning_rate": 4.5e-06, "loss": 6.0963, "step": 6 }, { "epoch": 0.04827586206896552, "grad_norm": 2.3606605529785156, "learning_rate": 5.25e-06, "loss": 4.5106, "step": 7 }, { "epoch": 0.05517241379310345, "grad_norm": 1.5973587036132812, "learning_rate": 6e-06, "loss": 3.3045, "step": 8 }, { "epoch": 0.06206896551724138, "grad_norm": 1.4481266736984253, "learning_rate": 6.750000000000001e-06, "loss": 3.5009, "step": 9 }, { "epoch": 0.06896551724137931, "grad_norm": 2.194455862045288, "learning_rate": 7.5e-06, "loss": 5.0099, "step": 10 }, { "epoch": 0.07586206896551724, "grad_norm": 2.1214380264282227, "learning_rate": 8.25e-06, "loss": 3.7116, "step": 11 }, { "epoch": 0.08275862068965517, "grad_norm": 1.501364827156067, "learning_rate": 9e-06, "loss": 3.3266, "step": 12 }, { "epoch": 0.0896551724137931, "grad_norm": 2.078202962875366, "learning_rate": 9.75e-06, "loss": 3.4969, "step": 13 }, { "epoch": 0.09655172413793103, "grad_norm": 1.9757059812545776, "learning_rate": 1.05e-05, "loss": 3.3945, "step": 14 }, { "epoch": 0.10344827586206896, "grad_norm": 2.730952739715576, "learning_rate": 1.125e-05, "loss": 3.5526, "step": 15 }, { "epoch": 0.1103448275862069, "grad_norm": 3.307832717895508, "learning_rate": 1.2e-05, "loss": 4.7907, "step": 16 }, { "epoch": 0.11724137931034483, "grad_norm": 2.1818623542785645, "learning_rate": 1.275e-05, "loss": 3.9304, "step": 17 }, { "epoch": 0.12413793103448276, "grad_norm": 3.2467010021209717, "learning_rate": 1.3500000000000001e-05, "loss": 4.9399, "step": 18 }, { "epoch": 0.1310344827586207, "grad_norm": 1.985250473022461, "learning_rate": 1.4249999999999999e-05, "loss": 2.9249, "step": 19 }, { "epoch": 0.13793103448275862, "grad_norm": 2.669128894805908, "learning_rate": 1.5e-05, "loss": 3.33, "step": 20 }, { "epoch": 0.14482758620689656, "grad_norm": 2.3027143478393555, "learning_rate": 1.575e-05, "loss": 2.6643, "step": 21 }, { "epoch": 0.15172413793103448, "grad_norm": 3.2221450805664062, "learning_rate": 1.65e-05, "loss": 3.6538, "step": 22 }, { "epoch": 0.15862068965517243, "grad_norm": 4.213558197021484, "learning_rate": 1.725e-05, "loss": 3.5537, "step": 23 }, { "epoch": 0.16551724137931034, "grad_norm": 1.8173574209213257, "learning_rate": 1.8e-05, "loss": 2.9893, "step": 24 }, { "epoch": 0.1724137931034483, "grad_norm": 2.863757610321045, "learning_rate": 1.8750000000000002e-05, "loss": 2.487, "step": 25 }, { "epoch": 0.1793103448275862, "grad_norm": 2.650667905807495, "learning_rate": 1.95e-05, "loss": 2.4155, "step": 26 }, { "epoch": 0.18620689655172415, "grad_norm": 2.5865488052368164, "learning_rate": 2.025e-05, "loss": 2.6802, "step": 27 }, { "epoch": 0.19310344827586207, "grad_norm": 2.5942306518554688, "learning_rate": 2.1e-05, "loss": 2.6337, "step": 28 }, { "epoch": 0.2, "grad_norm": 4.72637414932251, "learning_rate": 2.175e-05, "loss": 4.4921, "step": 29 }, { "epoch": 0.20689655172413793, "grad_norm": 3.370011806488037, "learning_rate": 2.25e-05, "loss": 2.8945, "step": 30 }, { "epoch": 0.21379310344827587, "grad_norm": 2.9828832149505615, "learning_rate": 2.3250000000000003e-05, "loss": 3.031, "step": 31 }, { "epoch": 0.2206896551724138, "grad_norm": 2.9294815063476562, "learning_rate": 2.4e-05, "loss": 2.7564, "step": 32 }, { "epoch": 0.22758620689655173, "grad_norm": 2.9029836654663086, "learning_rate": 2.475e-05, "loss": 2.635, "step": 33 }, { "epoch": 0.23448275862068965, "grad_norm": 3.5925707817077637, "learning_rate": 2.55e-05, "loss": 2.9502, "step": 34 }, { "epoch": 0.2413793103448276, "grad_norm": 4.757285118103027, "learning_rate": 2.625e-05, "loss": 2.0215, "step": 35 }, { "epoch": 0.2482758620689655, "grad_norm": 12.417799949645996, "learning_rate": 2.7000000000000002e-05, "loss": 4.3027, "step": 36 }, { "epoch": 0.25517241379310346, "grad_norm": 4.293732166290283, "learning_rate": 2.7750000000000004e-05, "loss": 1.8095, "step": 37 }, { "epoch": 0.25517241379310346, "eval_loss": 2.7702932357788086, "eval_runtime": 4.4922, "eval_samples_per_second": 1.781, "eval_steps_per_second": 1.781, "step": 37 }, { "epoch": 0.2620689655172414, "grad_norm": 4.490789890289307, "learning_rate": 2.8499999999999998e-05, "loss": 2.2294, "step": 38 }, { "epoch": 0.2689655172413793, "grad_norm": 6.464705467224121, "learning_rate": 2.925e-05, "loss": 1.8365, "step": 39 }, { "epoch": 0.27586206896551724, "grad_norm": 4.536788463592529, "learning_rate": 3e-05, "loss": 2.3919, "step": 40 }, { "epoch": 0.2827586206896552, "grad_norm": 4.616312503814697, "learning_rate": 2.9999842246463293e-05, "loss": 2.7667, "step": 41 }, { "epoch": 0.2896551724137931, "grad_norm": 9.268322944641113, "learning_rate": 2.9999368989171332e-05, "loss": 1.6036, "step": 42 }, { "epoch": 0.296551724137931, "grad_norm": 9.249737739562988, "learning_rate": 2.9998580238078518e-05, "loss": 3.5214, "step": 43 }, { "epoch": 0.30344827586206896, "grad_norm": 5.715229034423828, "learning_rate": 2.9997476009775285e-05, "loss": 2.8424, "step": 44 }, { "epoch": 0.3103448275862069, "grad_norm": 11.085472106933594, "learning_rate": 2.999605632748776e-05, "loss": 3.2402, "step": 45 }, { "epoch": 0.31724137931034485, "grad_norm": 9.64968204498291, "learning_rate": 2.999432122107726e-05, "loss": 2.5156, "step": 46 }, { "epoch": 0.32413793103448274, "grad_norm": 5.387156963348389, "learning_rate": 2.9992270727039674e-05, "loss": 2.5335, "step": 47 }, { "epoch": 0.3310344827586207, "grad_norm": 8.309078216552734, "learning_rate": 2.9989904888504697e-05, "loss": 3.9152, "step": 48 }, { "epoch": 0.33793103448275863, "grad_norm": 8.13180923461914, "learning_rate": 2.998722375523491e-05, "loss": 2.6522, "step": 49 }, { "epoch": 0.3448275862068966, "grad_norm": 5.079596042633057, "learning_rate": 2.9984227383624753e-05, "loss": 2.1673, "step": 50 }, { "epoch": 0.35172413793103446, "grad_norm": 4.810355186462402, "learning_rate": 2.9980915836699322e-05, "loss": 2.4192, "step": 51 }, { "epoch": 0.3586206896551724, "grad_norm": 5.691514015197754, "learning_rate": 2.9977289184113038e-05, "loss": 2.1087, "step": 52 }, { "epoch": 0.36551724137931035, "grad_norm": 6.280345439910889, "learning_rate": 2.997334750214822e-05, "loss": 2.1365, "step": 53 }, { "epoch": 0.3724137931034483, "grad_norm": 5.447160243988037, "learning_rate": 2.9969090873713425e-05, "loss": 2.3364, "step": 54 }, { "epoch": 0.3793103448275862, "grad_norm": 9.810458183288574, "learning_rate": 2.9964519388341753e-05, "loss": 2.6459, "step": 55 }, { "epoch": 0.38620689655172413, "grad_norm": 8.42009449005127, "learning_rate": 2.9959633142188928e-05, "loss": 2.449, "step": 56 }, { "epoch": 0.3931034482758621, "grad_norm": 4.881720066070557, "learning_rate": 2.99544322380313e-05, "loss": 2.1418, "step": 57 }, { "epoch": 0.4, "grad_norm": 5.82330322265625, "learning_rate": 2.994891678526368e-05, "loss": 1.7012, "step": 58 }, { "epoch": 0.4068965517241379, "grad_norm": 7.9598917961120605, "learning_rate": 2.994308689989702e-05, "loss": 1.935, "step": 59 }, { "epoch": 0.41379310344827586, "grad_norm": 4.582525253295898, "learning_rate": 2.9936942704555988e-05, "loss": 2.3673, "step": 60 }, { "epoch": 0.4206896551724138, "grad_norm": 5.968487739562988, "learning_rate": 2.9930484328476392e-05, "loss": 1.3351, "step": 61 }, { "epoch": 0.42758620689655175, "grad_norm": 6.217311859130859, "learning_rate": 2.992371190750246e-05, "loss": 2.8033, "step": 62 }, { "epoch": 0.43448275862068964, "grad_norm": 15.824493408203125, "learning_rate": 2.9916625584083965e-05, "loss": 3.5775, "step": 63 }, { "epoch": 0.4413793103448276, "grad_norm": 6.833436965942383, "learning_rate": 2.990922550727326e-05, "loss": 3.1572, "step": 64 }, { "epoch": 0.4482758620689655, "grad_norm": 7.139071464538574, "learning_rate": 2.9901511832722107e-05, "loss": 2.5818, "step": 65 }, { "epoch": 0.45517241379310347, "grad_norm": 5.222311973571777, "learning_rate": 2.989348472267844e-05, "loss": 1.6886, "step": 66 }, { "epoch": 0.46206896551724136, "grad_norm": 5.626909255981445, "learning_rate": 2.988514434598292e-05, "loss": 2.3763, "step": 67 }, { "epoch": 0.4689655172413793, "grad_norm": 4.76120138168335, "learning_rate": 2.9876490878065402e-05, "loss": 2.3714, "step": 68 }, { "epoch": 0.47586206896551725, "grad_norm": 4.187699317932129, "learning_rate": 2.9867524500941255e-05, "loss": 1.5454, "step": 69 }, { "epoch": 0.4827586206896552, "grad_norm": 6.578094959259033, "learning_rate": 2.9858245403207488e-05, "loss": 2.9233, "step": 70 }, { "epoch": 0.4896551724137931, "grad_norm": 21.87733268737793, "learning_rate": 2.9848653780038844e-05, "loss": 3.232, "step": 71 }, { "epoch": 0.496551724137931, "grad_norm": 8.493062973022461, "learning_rate": 2.9838749833183647e-05, "loss": 2.1025, "step": 72 }, { "epoch": 0.503448275862069, "grad_norm": 9.408147811889648, "learning_rate": 2.9828533770959584e-05, "loss": 1.8301, "step": 73 }, { "epoch": 0.5103448275862069, "grad_norm": 5.230013847351074, "learning_rate": 2.9818005808249323e-05, "loss": 1.6428, "step": 74 }, { "epoch": 0.5103448275862069, "eval_loss": 2.1590590476989746, "eval_runtime": 4.4994, "eval_samples_per_second": 1.778, "eval_steps_per_second": 1.778, "step": 74 }, { "epoch": 0.5172413793103449, "grad_norm": 5.788967132568359, "learning_rate": 2.9807166166495966e-05, "loss": 0.9333, "step": 75 }, { "epoch": 0.5241379310344828, "grad_norm": 5.982523441314697, "learning_rate": 2.979601507369843e-05, "loss": 1.8066, "step": 76 }, { "epoch": 0.5310344827586206, "grad_norm": 4.951727867126465, "learning_rate": 2.978455276440662e-05, "loss": 1.3418, "step": 77 }, { "epoch": 0.5379310344827586, "grad_norm": 5.705328941345215, "learning_rate": 2.977277947971652e-05, "loss": 1.1168, "step": 78 }, { "epoch": 0.5448275862068965, "grad_norm": 10.673117637634277, "learning_rate": 2.9760695467265096e-05, "loss": 1.9509, "step": 79 }, { "epoch": 0.5517241379310345, "grad_norm": 7.005712985992432, "learning_rate": 2.9748300981225112e-05, "loss": 2.8095, "step": 80 }, { "epoch": 0.5586206896551724, "grad_norm": 10.768919944763184, "learning_rate": 2.9735596282299767e-05, "loss": 1.1537, "step": 81 }, { "epoch": 0.5655172413793104, "grad_norm": 5.258718967437744, "learning_rate": 2.9722581637717225e-05, "loss": 2.0656, "step": 82 }, { "epoch": 0.5724137931034483, "grad_norm": 6.1790971755981445, "learning_rate": 2.9709257321224973e-05, "loss": 2.4952, "step": 83 }, { "epoch": 0.5793103448275863, "grad_norm": 5.152263164520264, "learning_rate": 2.9695623613084094e-05, "loss": 1.631, "step": 84 }, { "epoch": 0.5862068965517241, "grad_norm": 9.117701530456543, "learning_rate": 2.9681680800063333e-05, "loss": 1.5723, "step": 85 }, { "epoch": 0.593103448275862, "grad_norm": 15.927285194396973, "learning_rate": 2.966742917543311e-05, "loss": 4.0567, "step": 86 }, { "epoch": 0.6, "grad_norm": 7.300537109375, "learning_rate": 2.9652869038959308e-05, "loss": 2.0857, "step": 87 }, { "epoch": 0.6068965517241379, "grad_norm": 5.633571147918701, "learning_rate": 2.9638000696897004e-05, "loss": 1.5046, "step": 88 }, { "epoch": 0.6137931034482759, "grad_norm": 5.7234625816345215, "learning_rate": 2.9622824461984e-05, "loss": 2.1678, "step": 89 }, { "epoch": 0.6206896551724138, "grad_norm": 5.1384124755859375, "learning_rate": 2.9607340653434263e-05, "loss": 1.2331, "step": 90 }, { "epoch": 0.6275862068965518, "grad_norm": 9.393917083740234, "learning_rate": 2.9591549596931196e-05, "loss": 2.3239, "step": 91 }, { "epoch": 0.6344827586206897, "grad_norm": 29.67850112915039, "learning_rate": 2.957545162462081e-05, "loss": 2.7711, "step": 92 }, { "epoch": 0.6413793103448275, "grad_norm": 11.757678985595703, "learning_rate": 2.95590470751047e-05, "loss": 0.862, "step": 93 }, { "epoch": 0.6482758620689655, "grad_norm": 5.055336952209473, "learning_rate": 2.954233629343297e-05, "loss": 0.7167, "step": 94 }, { "epoch": 0.6551724137931034, "grad_norm": 6.484320640563965, "learning_rate": 2.9525319631096936e-05, "loss": 1.518, "step": 95 }, { "epoch": 0.6620689655172414, "grad_norm": 21.282947540283203, "learning_rate": 2.950799744602176e-05, "loss": 1.6637, "step": 96 }, { "epoch": 0.6689655172413793, "grad_norm": 4.83950662612915, "learning_rate": 2.94903701025589e-05, "loss": 1.8401, "step": 97 }, { "epoch": 0.6758620689655173, "grad_norm": 8.747105598449707, "learning_rate": 2.9472437971478456e-05, "loss": 2.6096, "step": 98 }, { "epoch": 0.6827586206896552, "grad_norm": 5.483854293823242, "learning_rate": 2.9454201429961377e-05, "loss": 1.1973, "step": 99 }, { "epoch": 0.6896551724137931, "grad_norm": 8.147703170776367, "learning_rate": 2.9435660861591523e-05, "loss": 2.2331, "step": 100 }, { "epoch": 0.696551724137931, "grad_norm": 10.85132884979248, "learning_rate": 2.9416816656347585e-05, "loss": 1.3165, "step": 101 }, { "epoch": 0.7034482758620689, "grad_norm": 9.249916076660156, "learning_rate": 2.9397669210594905e-05, "loss": 2.4115, "step": 102 }, { "epoch": 0.7103448275862069, "grad_norm": 8.229886054992676, "learning_rate": 2.9378218927077116e-05, "loss": 1.3861, "step": 103 }, { "epoch": 0.7172413793103448, "grad_norm": 5.1249613761901855, "learning_rate": 2.9358466214907692e-05, "loss": 1.3119, "step": 104 }, { "epoch": 0.7241379310344828, "grad_norm": 7.522292137145996, "learning_rate": 2.9338411489561327e-05, "loss": 2.3285, "step": 105 }, { "epoch": 0.7310344827586207, "grad_norm": 8.781840324401855, "learning_rate": 2.931805517286519e-05, "loss": 2.3725, "step": 106 }, { "epoch": 0.7379310344827587, "grad_norm": 8.974847793579102, "learning_rate": 2.929739769299009e-05, "loss": 2.7679, "step": 107 }, { "epoch": 0.7448275862068966, "grad_norm": 13.757139205932617, "learning_rate": 2.927643948444142e-05, "loss": 2.8373, "step": 108 }, { "epoch": 0.7517241379310344, "grad_norm": 6.372509956359863, "learning_rate": 2.925518098805005e-05, "loss": 0.3335, "step": 109 }, { "epoch": 0.7586206896551724, "grad_norm": 18.51079750061035, "learning_rate": 2.9233622650963046e-05, "loss": 3.7183, "step": 110 }, { "epoch": 0.7655172413793103, "grad_norm": 7.130232810974121, "learning_rate": 2.921176492663426e-05, "loss": 2.7397, "step": 111 }, { "epoch": 0.7655172413793103, "eval_loss": 2.076895236968994, "eval_runtime": 4.5121, "eval_samples_per_second": 1.773, "eval_steps_per_second": 1.773, "step": 111 }, { "epoch": 0.7724137931034483, "grad_norm": 5.484634876251221, "learning_rate": 2.9189608274814813e-05, "loss": 1.0504, "step": 112 }, { "epoch": 0.7793103448275862, "grad_norm": 7.2361979484558105, "learning_rate": 2.916715316154339e-05, "loss": 1.2353, "step": 113 }, { "epoch": 0.7862068965517242, "grad_norm": 5.681639194488525, "learning_rate": 2.9144400059136457e-05, "loss": 1.5471, "step": 114 }, { "epoch": 0.7931034482758621, "grad_norm": 8.158857345581055, "learning_rate": 2.9121349446178338e-05, "loss": 2.4476, "step": 115 }, { "epoch": 0.8, "grad_norm": 6.511702537536621, "learning_rate": 2.909800180751112e-05, "loss": 1.1023, "step": 116 }, { "epoch": 0.8068965517241379, "grad_norm": 9.999719619750977, "learning_rate": 2.907435763422449e-05, "loss": 0.5992, "step": 117 }, { "epoch": 0.8137931034482758, "grad_norm": 7.8701171875, "learning_rate": 2.9050417423645374e-05, "loss": 1.6095, "step": 118 }, { "epoch": 0.8206896551724138, "grad_norm": 6.189757823944092, "learning_rate": 2.9026181679327483e-05, "loss": 1.5167, "step": 119 }, { "epoch": 0.8275862068965517, "grad_norm": 17.398225784301758, "learning_rate": 2.9001650911040744e-05, "loss": 1.3394, "step": 120 }, { "epoch": 0.8344827586206897, "grad_norm": 13.492450714111328, "learning_rate": 2.897682563476054e-05, "loss": 5.1631, "step": 121 }, { "epoch": 0.8413793103448276, "grad_norm": 6.701801776885986, "learning_rate": 2.8951706372656898e-05, "loss": 1.4963, "step": 122 }, { "epoch": 0.8482758620689655, "grad_norm": 11.134561538696289, "learning_rate": 2.8926293653083475e-05, "loss": 2.0888, "step": 123 }, { "epoch": 0.8551724137931035, "grad_norm": 5.3243608474731445, "learning_rate": 2.890058801056645e-05, "loss": 1.9287, "step": 124 }, { "epoch": 0.8620689655172413, "grad_norm": 5.231749057769775, "learning_rate": 2.8874589985793298e-05, "loss": 1.543, "step": 125 }, { "epoch": 0.8689655172413793, "grad_norm": 29.016874313354492, "learning_rate": 2.88483001256014e-05, "loss": 3.8339, "step": 126 }, { "epoch": 0.8758620689655172, "grad_norm": 5.226491451263428, "learning_rate": 2.8821718982966544e-05, "loss": 1.189, "step": 127 }, { "epoch": 0.8827586206896552, "grad_norm": 5.349326133728027, "learning_rate": 2.87948471169913e-05, "loss": 0.4606, "step": 128 }, { "epoch": 0.8896551724137931, "grad_norm": 8.602606773376465, "learning_rate": 2.8767685092893244e-05, "loss": 3.0269, "step": 129 }, { "epoch": 0.896551724137931, "grad_norm": 8.23015308380127, "learning_rate": 2.874023348199311e-05, "loss": 1.2428, "step": 130 }, { "epoch": 0.903448275862069, "grad_norm": 5.327670097351074, "learning_rate": 2.8712492861702712e-05, "loss": 0.9737, "step": 131 }, { "epoch": 0.9103448275862069, "grad_norm": 4.516807556152344, "learning_rate": 2.868446381551285e-05, "loss": 1.2708, "step": 132 }, { "epoch": 0.9172413793103448, "grad_norm": 7.308916091918945, "learning_rate": 2.865614693298101e-05, "loss": 2.5184, "step": 133 }, { "epoch": 0.9241379310344827, "grad_norm": 7.566427230834961, "learning_rate": 2.8627542809718972e-05, "loss": 1.4915, "step": 134 }, { "epoch": 0.9310344827586207, "grad_norm": 6.399407863616943, "learning_rate": 2.8598652047380292e-05, "loss": 1.9207, "step": 135 }, { "epoch": 0.9379310344827586, "grad_norm": 5.2886786460876465, "learning_rate": 2.8569475253647624e-05, "loss": 1.0044, "step": 136 }, { "epoch": 0.9448275862068966, "grad_norm": 4.2925310134887695, "learning_rate": 2.854001304221995e-05, "loss": 1.6515, "step": 137 }, { "epoch": 0.9517241379310345, "grad_norm": 3.5389490127563477, "learning_rate": 2.8510266032799688e-05, "loss": 0.624, "step": 138 }, { "epoch": 0.9586206896551724, "grad_norm": 5.259653568267822, "learning_rate": 2.8480234851079622e-05, "loss": 1.5153, "step": 139 }, { "epoch": 0.9655172413793104, "grad_norm": 7.633677959442139, "learning_rate": 2.8449920128729772e-05, "loss": 1.6288, "step": 140 }, { "epoch": 0.9724137931034482, "grad_norm": 7.951615333557129, "learning_rate": 2.841932250338409e-05, "loss": 2.4153, "step": 141 }, { "epoch": 0.9793103448275862, "grad_norm": 4.297111511230469, "learning_rate": 2.8388442618627063e-05, "loss": 0.4714, "step": 142 }, { "epoch": 0.9862068965517241, "grad_norm": 5.56075382232666, "learning_rate": 2.8357281123980153e-05, "loss": 1.3144, "step": 143 }, { "epoch": 0.993103448275862, "grad_norm": 7.056431293487549, "learning_rate": 2.8325838674888168e-05, "loss": 2.9863, "step": 144 }, { "epoch": 1.0, "grad_norm": 5.265207290649414, "learning_rate": 2.829411593270545e-05, "loss": 1.8668, "step": 145 }, { "epoch": 1.006896551724138, "grad_norm": 6.738914966583252, "learning_rate": 2.826211356468196e-05, "loss": 1.6213, "step": 146 }, { "epoch": 1.013793103448276, "grad_norm": 19.16160774230957, "learning_rate": 2.822983224394926e-05, "loss": 1.2061, "step": 147 }, { "epoch": 1.0206896551724138, "grad_norm": 6.643246173858643, "learning_rate": 2.8197272649506363e-05, "loss": 1.936, "step": 148 }, { "epoch": 1.0206896551724138, "eval_loss": 1.9735311269760132, "eval_runtime": 4.4975, "eval_samples_per_second": 1.779, "eval_steps_per_second": 1.779, "step": 148 }, { "epoch": 1.0275862068965518, "grad_norm": 5.578238010406494, "learning_rate": 2.8164435466205423e-05, "loss": 1.7389, "step": 149 }, { "epoch": 1.0344827586206897, "grad_norm": 10.583780288696289, "learning_rate": 2.8131321384737344e-05, "loss": 0.4309, "step": 150 }, { "epoch": 1.0413793103448277, "grad_norm": 7.088948726654053, "learning_rate": 2.809793110161725e-05, "loss": 2.5514, "step": 151 }, { "epoch": 1.0482758620689656, "grad_norm": 5.819264888763428, "learning_rate": 2.8064265319169854e-05, "loss": 1.3072, "step": 152 }, { "epoch": 1.0551724137931036, "grad_norm": 6.2393364906311035, "learning_rate": 2.803032474551465e-05, "loss": 1.2324, "step": 153 }, { "epoch": 1.0620689655172413, "grad_norm": 4.304983139038086, "learning_rate": 2.799611009455104e-05, "loss": 0.1901, "step": 154 }, { "epoch": 1.0689655172413792, "grad_norm": 5.9356865882873535, "learning_rate": 2.7961622085943317e-05, "loss": 1.7746, "step": 155 }, { "epoch": 1.0758620689655172, "grad_norm": 5.441093444824219, "learning_rate": 2.792686144510553e-05, "loss": 1.9543, "step": 156 }, { "epoch": 1.0827586206896551, "grad_norm": 6.648105144500732, "learning_rate": 2.789182890318621e-05, "loss": 2.7205, "step": 157 }, { "epoch": 1.089655172413793, "grad_norm": 44.886863708496094, "learning_rate": 2.785652519705301e-05, "loss": 3.465, "step": 158 }, { "epoch": 1.096551724137931, "grad_norm": 5.482683181762695, "learning_rate": 2.78209510692772e-05, "loss": 1.4479, "step": 159 }, { "epoch": 1.103448275862069, "grad_norm": 7.18867826461792, "learning_rate": 2.778510726811804e-05, "loss": 2.3233, "step": 160 }, { "epoch": 1.110344827586207, "grad_norm": 10.170475006103516, "learning_rate": 2.7748994547507052e-05, "loss": 1.4287, "step": 161 }, { "epoch": 1.1172413793103448, "grad_norm": 16.519548416137695, "learning_rate": 2.7712613667032156e-05, "loss": 2.0423, "step": 162 }, { "epoch": 1.1241379310344828, "grad_norm": 6.548756122589111, "learning_rate": 2.7675965391921692e-05, "loss": 0.8162, "step": 163 }, { "epoch": 1.1310344827586207, "grad_norm": 10.879814147949219, "learning_rate": 2.763905049302833e-05, "loss": 2.0672, "step": 164 }, { "epoch": 1.1379310344827587, "grad_norm": 6.071365833282471, "learning_rate": 2.7601869746812855e-05, "loss": 1.5196, "step": 165 }, { "epoch": 1.1448275862068966, "grad_norm": 6.293059349060059, "learning_rate": 2.7564423935327817e-05, "loss": 0.7617, "step": 166 }, { "epoch": 1.1517241379310346, "grad_norm": 8.047264099121094, "learning_rate": 2.7526713846201118e-05, "loss": 0.4147, "step": 167 }, { "epoch": 1.1586206896551725, "grad_norm": 3.7810680866241455, "learning_rate": 2.7488740272619413e-05, "loss": 1.0664, "step": 168 }, { "epoch": 1.1655172413793102, "grad_norm": 6.20160436630249, "learning_rate": 2.7450504013311443e-05, "loss": 1.9879, "step": 169 }, { "epoch": 1.1724137931034484, "grad_norm": 6.690729141235352, "learning_rate": 2.7412005872531222e-05, "loss": 1.4968, "step": 170 }, { "epoch": 1.1793103448275861, "grad_norm": 9.275552749633789, "learning_rate": 2.737324666004113e-05, "loss": 4.3534, "step": 171 }, { "epoch": 1.186206896551724, "grad_norm": 5.3003106117248535, "learning_rate": 2.7334227191094885e-05, "loss": 0.9354, "step": 172 }, { "epoch": 1.193103448275862, "grad_norm": 6.302150726318359, "learning_rate": 2.729494828642038e-05, "loss": 1.2665, "step": 173 }, { "epoch": 1.2, "grad_norm": 7.4209303855896, "learning_rate": 2.7255410772202435e-05, "loss": 1.5308, "step": 174 }, { "epoch": 1.206896551724138, "grad_norm": 8.516342163085938, "learning_rate": 2.7215615480065415e-05, "loss": 1.1144, "step": 175 }, { "epoch": 1.2137931034482758, "grad_norm": 4.906225204467773, "learning_rate": 2.7175563247055723e-05, "loss": 2.0357, "step": 176 }, { "epoch": 1.2206896551724138, "grad_norm": 5.213315010070801, "learning_rate": 2.7135254915624213e-05, "loss": 1.6, "step": 177 }, { "epoch": 1.2275862068965517, "grad_norm": 6.571521282196045, "learning_rate": 2.709469133360847e-05, "loss": 1.4624, "step": 178 }, { "epoch": 1.2344827586206897, "grad_norm": 3.46243953704834, "learning_rate": 2.7053873354214957e-05, "loss": 0.3171, "step": 179 }, { "epoch": 1.2413793103448276, "grad_norm": 5.865048885345459, "learning_rate": 2.7012801836001098e-05, "loss": 1.9204, "step": 180 }, { "epoch": 1.2482758620689656, "grad_norm": 6.989517688751221, "learning_rate": 2.6971477642857185e-05, "loss": 1.518, "step": 181 }, { "epoch": 1.2551724137931035, "grad_norm": 4.894804000854492, "learning_rate": 2.6929901643988237e-05, "loss": 0.8945, "step": 182 }, { "epoch": 1.2620689655172415, "grad_norm": 4.024252414703369, "learning_rate": 2.6888074713895705e-05, "loss": 0.9237, "step": 183 }, { "epoch": 1.2689655172413792, "grad_norm": 6.457338333129883, "learning_rate": 2.6845997732359074e-05, "loss": 2.3232, "step": 184 }, { "epoch": 1.2758620689655173, "grad_norm": 7.112627029418945, "learning_rate": 2.680367158441736e-05, "loss": 0.2386, "step": 185 }, { "epoch": 1.2758620689655173, "eval_loss": 1.9213309288024902, "eval_runtime": 4.497, "eval_samples_per_second": 1.779, "eval_steps_per_second": 1.779, "step": 185 }, { "epoch": 1.282758620689655, "grad_norm": 15.904428482055664, "learning_rate": 2.676109716035051e-05, "loss": 1.914, "step": 186 }, { "epoch": 1.2896551724137932, "grad_norm": 6.568241596221924, "learning_rate": 2.6718275355660643e-05, "loss": 0.952, "step": 187 }, { "epoch": 1.296551724137931, "grad_norm": 10.139225959777832, "learning_rate": 2.667520707105325e-05, "loss": 2.1507, "step": 188 }, { "epoch": 1.303448275862069, "grad_norm": 6.300340175628662, "learning_rate": 2.6631893212418227e-05, "loss": 1.8095, "step": 189 }, { "epoch": 1.3103448275862069, "grad_norm": 4.754003047943115, "learning_rate": 2.6588334690810826e-05, "loss": 1.2223, "step": 190 }, { "epoch": 1.3172413793103448, "grad_norm": 5.433348655700684, "learning_rate": 2.654453242243249e-05, "loss": 0.9192, "step": 191 }, { "epoch": 1.3241379310344827, "grad_norm": 14.02868366241455, "learning_rate": 2.6500487328611584e-05, "loss": 1.642, "step": 192 }, { "epoch": 1.3310344827586207, "grad_norm": 4.654847621917725, "learning_rate": 2.645620033578402e-05, "loss": 1.6479, "step": 193 }, { "epoch": 1.3379310344827586, "grad_norm": 6.280045509338379, "learning_rate": 2.6411672375473768e-05, "loss": 1.9089, "step": 194 }, { "epoch": 1.3448275862068966, "grad_norm": 6.455976486206055, "learning_rate": 2.6366904384273252e-05, "loss": 0.8846, "step": 195 }, { "epoch": 1.3517241379310345, "grad_norm": 13.289530754089355, "learning_rate": 2.6321897303823665e-05, "loss": 1.32, "step": 196 }, { "epoch": 1.3586206896551725, "grad_norm": 16.279457092285156, "learning_rate": 2.6276652080795157e-05, "loss": 0.7443, "step": 197 }, { "epoch": 1.3655172413793104, "grad_norm": 5.546213626861572, "learning_rate": 2.6231169666866928e-05, "loss": 1.777, "step": 198 }, { "epoch": 1.3724137931034484, "grad_norm": 6.5098748207092285, "learning_rate": 2.6185451018707188e-05, "loss": 0.1505, "step": 199 }, { "epoch": 1.3793103448275863, "grad_norm": 7.808483123779297, "learning_rate": 2.613949709795307e-05, "loss": 2.0242, "step": 200 }, { "epoch": 1.386206896551724, "grad_norm": 19.0113582611084, "learning_rate": 2.6093308871190376e-05, "loss": 0.5734, "step": 201 }, { "epoch": 1.3931034482758622, "grad_norm": 5.4320268630981445, "learning_rate": 2.6046887309933252e-05, "loss": 1.9298, "step": 202 }, { "epoch": 1.4, "grad_norm": 5.639513969421387, "learning_rate": 2.6000233390603764e-05, "loss": 2.1008, "step": 203 }, { "epoch": 1.4068965517241379, "grad_norm": 6.506295680999756, "learning_rate": 2.595334809451135e-05, "loss": 2.0629, "step": 204 }, { "epoch": 1.4137931034482758, "grad_norm": 6.257269382476807, "learning_rate": 2.590623240783217e-05, "loss": 0.9089, "step": 205 }, { "epoch": 1.4206896551724137, "grad_norm": 7.4428534507751465, "learning_rate": 2.5858887321588403e-05, "loss": 1.6967, "step": 206 }, { "epoch": 1.4275862068965517, "grad_norm": 7.234190464019775, "learning_rate": 2.5811313831627343e-05, "loss": 1.1008, "step": 207 }, { "epoch": 1.4344827586206896, "grad_norm": 4.542428493499756, "learning_rate": 2.5763512938600496e-05, "loss": 1.0296, "step": 208 }, { "epoch": 1.4413793103448276, "grad_norm": 4.9460978507995605, "learning_rate": 2.5715485647942526e-05, "loss": 1.2251, "step": 209 }, { "epoch": 1.4482758620689655, "grad_norm": 8.297338485717773, "learning_rate": 2.566723296985009e-05, "loss": 1.244, "step": 210 }, { "epoch": 1.4551724137931035, "grad_norm": 5.6764326095581055, "learning_rate": 2.561875591926061e-05, "loss": 2.075, "step": 211 }, { "epoch": 1.4620689655172414, "grad_norm": 7.905850410461426, "learning_rate": 2.55700555158309e-05, "loss": 1.1372, "step": 212 }, { "epoch": 1.4689655172413794, "grad_norm": 7.2166523933410645, "learning_rate": 2.552113278391575e-05, "loss": 0.8753, "step": 213 }, { "epoch": 1.4758620689655173, "grad_norm": 16.44755744934082, "learning_rate": 2.5471988752546358e-05, "loss": 1.3347, "step": 214 }, { "epoch": 1.4827586206896552, "grad_norm": 6.070692539215088, "learning_rate": 2.542262445540869e-05, "loss": 1.7499, "step": 215 }, { "epoch": 1.489655172413793, "grad_norm": 7.000082492828369, "learning_rate": 2.5373040930821747e-05, "loss": 2.1478, "step": 216 }, { "epoch": 1.4965517241379311, "grad_norm": 7.349568843841553, "learning_rate": 2.5323239221715704e-05, "loss": 1.4035, "step": 217 }, { "epoch": 1.5034482758620689, "grad_norm": 6.242449760437012, "learning_rate": 2.5273220375609993e-05, "loss": 1.4238, "step": 218 }, { "epoch": 1.510344827586207, "grad_norm": 5.620000839233398, "learning_rate": 2.5222985444591268e-05, "loss": 1.3079, "step": 219 }, { "epoch": 1.5172413793103448, "grad_norm": 4.874967098236084, "learning_rate": 2.5172535485291263e-05, "loss": 0.5889, "step": 220 }, { "epoch": 1.524137931034483, "grad_norm": 13.453874588012695, "learning_rate": 2.5121871558864588e-05, "loss": 3.2711, "step": 221 }, { "epoch": 1.5310344827586206, "grad_norm": 24.48921775817871, "learning_rate": 2.5070994730966375e-05, "loss": 2.8233, "step": 222 }, { "epoch": 1.5310344827586206, "eval_loss": 1.8946478366851807, "eval_runtime": 4.54, "eval_samples_per_second": 1.762, "eval_steps_per_second": 1.762, "step": 222 }, { "epoch": 1.5379310344827586, "grad_norm": 4.243319034576416, "learning_rate": 2.5019906071729905e-05, "loss": 0.4053, "step": 223 }, { "epoch": 1.5448275862068965, "grad_norm": 9.057891845703125, "learning_rate": 2.496860665574406e-05, "loss": 2.5334, "step": 224 }, { "epoch": 1.5517241379310345, "grad_norm": 8.277237892150879, "learning_rate": 2.4917097562030756e-05, "loss": 2.1032, "step": 225 }, { "epoch": 1.5586206896551724, "grad_norm": 4.527651786804199, "learning_rate": 2.4865379874022212e-05, "loss": 0.6529, "step": 226 }, { "epoch": 1.5655172413793104, "grad_norm": 4.352250099182129, "learning_rate": 2.4813454679538192e-05, "loss": 0.2521, "step": 227 }, { "epoch": 1.5724137931034483, "grad_norm": 5.923490524291992, "learning_rate": 2.4761323070763103e-05, "loss": 1.8706, "step": 228 }, { "epoch": 1.5793103448275863, "grad_norm": 6.146652698516846, "learning_rate": 2.4708986144223038e-05, "loss": 1.4523, "step": 229 }, { "epoch": 1.5862068965517242, "grad_norm": 8.59493350982666, "learning_rate": 2.4656445000762695e-05, "loss": 2.0485, "step": 230 }, { "epoch": 1.593103448275862, "grad_norm": 14.89201545715332, "learning_rate": 2.4603700745522238e-05, "loss": 1.6968, "step": 231 }, { "epoch": 1.6, "grad_norm": 7.599427700042725, "learning_rate": 2.455075448791403e-05, "loss": 0.5584, "step": 232 }, { "epoch": 1.6068965517241378, "grad_norm": 6.137027263641357, "learning_rate": 2.4497607341599338e-05, "loss": 1.2578, "step": 233 }, { "epoch": 1.613793103448276, "grad_norm": 13.683157920837402, "learning_rate": 2.444426042446486e-05, "loss": 0.3959, "step": 234 }, { "epoch": 1.6206896551724137, "grad_norm": 6.342459201812744, "learning_rate": 2.439071485859924e-05, "loss": 1.5216, "step": 235 }, { "epoch": 1.6275862068965519, "grad_norm": 4.803065776824951, "learning_rate": 2.433697177026947e-05, "loss": 1.2224, "step": 236 }, { "epoch": 1.6344827586206896, "grad_norm": 19.011320114135742, "learning_rate": 2.4283032289897184e-05, "loss": 2.417, "step": 237 }, { "epoch": 1.6413793103448275, "grad_norm": 13.680153846740723, "learning_rate": 2.4228897552034885e-05, "loss": 1.0688, "step": 238 }, { "epoch": 1.6482758620689655, "grad_norm": 5.316374778747559, "learning_rate": 2.417456869534209e-05, "loss": 1.804, "step": 239 }, { "epoch": 1.6551724137931034, "grad_norm": 4.965254783630371, "learning_rate": 2.4120046862561367e-05, "loss": 1.0666, "step": 240 }, { "epoch": 1.6620689655172414, "grad_norm": 5.786849498748779, "learning_rate": 2.406533320049431e-05, "loss": 1.4944, "step": 241 }, { "epoch": 1.6689655172413793, "grad_norm": 6.54045295715332, "learning_rate": 2.4010428859977416e-05, "loss": 0.9506, "step": 242 }, { "epoch": 1.6758620689655173, "grad_norm": 8.71989631652832, "learning_rate": 2.3955334995857866e-05, "loss": 1.8664, "step": 243 }, { "epoch": 1.6827586206896552, "grad_norm": 7.771244525909424, "learning_rate": 2.3900052766969252e-05, "loss": 2.7829, "step": 244 }, { "epoch": 1.6896551724137931, "grad_norm": 13.747180938720703, "learning_rate": 2.3844583336107192e-05, "loss": 0.6658, "step": 245 }, { "epoch": 1.6965517241379309, "grad_norm": 8.790626525878906, "learning_rate": 2.378892787000487e-05, "loss": 1.6198, "step": 246 }, { "epoch": 1.703448275862069, "grad_norm": 6.457479000091553, "learning_rate": 2.37330875393085e-05, "loss": 2.0938, "step": 247 }, { "epoch": 1.7103448275862068, "grad_norm": 3.5511231422424316, "learning_rate": 2.3677063518552706e-05, "loss": 1.3691, "step": 248 }, { "epoch": 1.717241379310345, "grad_norm": 4.779417037963867, "learning_rate": 2.3620856986135807e-05, "loss": 1.4682, "step": 249 }, { "epoch": 1.7241379310344827, "grad_norm": 10.626018524169922, "learning_rate": 2.356446912429504e-05, "loss": 1.7604, "step": 250 }, { "epoch": 1.7310344827586208, "grad_norm": 9.10898208618164, "learning_rate": 2.3507901119081694e-05, "loss": 0.4235, "step": 251 }, { "epoch": 1.7379310344827585, "grad_norm": 6.013878345489502, "learning_rate": 2.3451154160336145e-05, "loss": 1.2361, "step": 252 }, { "epoch": 1.7448275862068967, "grad_norm": 6.5278401374816895, "learning_rate": 2.3394229441662863e-05, "loss": 2.053, "step": 253 }, { "epoch": 1.7517241379310344, "grad_norm": 6.2794623374938965, "learning_rate": 2.3337128160405262e-05, "loss": 0.772, "step": 254 }, { "epoch": 1.7586206896551724, "grad_norm": 7.884958744049072, "learning_rate": 2.3279851517620567e-05, "loss": 1.3663, "step": 255 }, { "epoch": 1.7655172413793103, "grad_norm": 6.92486047744751, "learning_rate": 2.322240071805449e-05, "loss": 1.3793, "step": 256 }, { "epoch": 1.7724137931034483, "grad_norm": 6.688532829284668, "learning_rate": 2.3164776970115952e-05, "loss": 2.1226, "step": 257 }, { "epoch": 1.7793103448275862, "grad_norm": 5.486979007720947, "learning_rate": 2.310698148585162e-05, "loss": 1.8741, "step": 258 }, { "epoch": 1.7862068965517242, "grad_norm": 4.790956497192383, "learning_rate": 2.3049015480920432e-05, "loss": 1.1631, "step": 259 }, { "epoch": 1.7862068965517242, "eval_loss": 1.9128004312515259, "eval_runtime": 4.4978, "eval_samples_per_second": 1.779, "eval_steps_per_second": 1.779, "step": 259 }, { "epoch": 1.793103448275862, "grad_norm": 7.004488945007324, "learning_rate": 2.299088017456803e-05, "loss": 1.5968, "step": 260 }, { "epoch": 1.8, "grad_norm": 9.21987533569336, "learning_rate": 2.29325767896011e-05, "loss": 1.7426, "step": 261 }, { "epoch": 1.806896551724138, "grad_norm": 7.149972915649414, "learning_rate": 2.2874106552361672e-05, "loss": 1.2684, "step": 262 }, { "epoch": 1.8137931034482757, "grad_norm": 4.868863105773926, "learning_rate": 2.2815470692701305e-05, "loss": 0.6387, "step": 263 }, { "epoch": 1.8206896551724139, "grad_norm": 7.658527851104736, "learning_rate": 2.2756670443955236e-05, "loss": 1.2035, "step": 264 }, { "epoch": 1.8275862068965516, "grad_norm": 4.457070827484131, "learning_rate": 2.2697707042916415e-05, "loss": 1.492, "step": 265 }, { "epoch": 1.8344827586206898, "grad_norm": 3.999518394470215, "learning_rate": 2.2638581729809522e-05, "loss": 0.3712, "step": 266 }, { "epoch": 1.8413793103448275, "grad_norm": 5.254199504852295, "learning_rate": 2.2579295748264856e-05, "loss": 1.5827, "step": 267 }, { "epoch": 1.8482758620689657, "grad_norm": 11.072293281555176, "learning_rate": 2.2519850345292192e-05, "loss": 1.5692, "step": 268 }, { "epoch": 1.8551724137931034, "grad_norm": 9.097825050354004, "learning_rate": 2.2460246771254525e-05, "loss": 2.0791, "step": 269 }, { "epoch": 1.8620689655172413, "grad_norm": 15.463027954101562, "learning_rate": 2.2400486279841812e-05, "loss": 2.0508, "step": 270 }, { "epoch": 1.8689655172413793, "grad_norm": 8.5389404296875, "learning_rate": 2.2340570128044567e-05, "loss": 0.9089, "step": 271 }, { "epoch": 1.8758620689655172, "grad_norm": 7.47584867477417, "learning_rate": 2.228049957612744e-05, "loss": 1.6641, "step": 272 }, { "epoch": 1.8827586206896552, "grad_norm": 5.4179253578186035, "learning_rate": 2.2220275887602688e-05, "loss": 1.9974, "step": 273 }, { "epoch": 1.889655172413793, "grad_norm": 20.747957229614258, "learning_rate": 2.2159900329203642e-05, "loss": 3.0228, "step": 274 }, { "epoch": 1.896551724137931, "grad_norm": 8.873815536499023, "learning_rate": 2.2099374170858004e-05, "loss": 1.7806, "step": 275 }, { "epoch": 1.903448275862069, "grad_norm": 5.834484100341797, "learning_rate": 2.2038698685661188e-05, "loss": 1.2638, "step": 276 }, { "epoch": 1.910344827586207, "grad_norm": 5.626514911651611, "learning_rate": 2.197787514984951e-05, "loss": 1.2453, "step": 277 }, { "epoch": 1.9172413793103447, "grad_norm": 5.62680196762085, "learning_rate": 2.1916904842773355e-05, "loss": 1.6145, "step": 278 }, { "epoch": 1.9241379310344828, "grad_norm": 10.727760314941406, "learning_rate": 2.1855789046870265e-05, "loss": 0.7303, "step": 279 }, { "epoch": 1.9310344827586206, "grad_norm": 4.86237907409668, "learning_rate": 2.1794529047637962e-05, "loss": 0.8277, "step": 280 }, { "epoch": 1.9379310344827587, "grad_norm": 5.81272554397583, "learning_rate": 2.1733126133607333e-05, "loss": 1.3026, "step": 281 }, { "epoch": 1.9448275862068964, "grad_norm": 7.906366348266602, "learning_rate": 2.1671581596315277e-05, "loss": 1.6814, "step": 282 }, { "epoch": 1.9517241379310346, "grad_norm": 4.704112529754639, "learning_rate": 2.160989673027759e-05, "loss": 0.6912, "step": 283 }, { "epoch": 1.9586206896551723, "grad_norm": 4.953239440917969, "learning_rate": 2.154807283296171e-05, "loss": 0.1418, "step": 284 }, { "epoch": 1.9655172413793105, "grad_norm": 5.499814033508301, "learning_rate": 2.1486111204759425e-05, "loss": 1.0075, "step": 285 }, { "epoch": 1.9724137931034482, "grad_norm": 5.502691268920898, "learning_rate": 2.1424013148959535e-05, "loss": 1.6252, "step": 286 }, { "epoch": 1.9793103448275862, "grad_norm": 9.957716941833496, "learning_rate": 2.1361779971720438e-05, "loss": 1.0677, "step": 287 }, { "epoch": 1.986206896551724, "grad_norm": 4.3258280754089355, "learning_rate": 2.129941298204263e-05, "loss": 1.0192, "step": 288 }, { "epoch": 1.993103448275862, "grad_norm": 7.173874855041504, "learning_rate": 2.1236913491741212e-05, "loss": 0.1295, "step": 289 }, { "epoch": 2.0, "grad_norm": 5.360743045806885, "learning_rate": 2.117428281541827e-05, "loss": 0.8945, "step": 290 } ], "logging_steps": 1, "max_steps": 725, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 145, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 2.4562668925845504e+16, "train_batch_size": 1, "trial_name": null, "trial_params": null }