{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9998199819981998, "eval_steps": 695, "global_step": 2777, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00036003600360036, "grad_norm": 5.391221523284912, "learning_rate": 4.000000000000001e-06, "loss": 11.6744, "step": 1 }, { "epoch": 0.00036003600360036, "eval_loss": 11.619186401367188, "eval_runtime": 101.0372, "eval_samples_per_second": 46.3, "eval_steps_per_second": 11.58, "step": 1 }, { "epoch": 0.00072007200720072, "grad_norm": 4.955807209014893, "learning_rate": 8.000000000000001e-06, "loss": 11.5784, "step": 2 }, { "epoch": 0.00108010801080108, "grad_norm": 6.398324966430664, "learning_rate": 1.2e-05, "loss": 11.8558, "step": 3 }, { "epoch": 0.00144014401440144, "grad_norm": 5.072609901428223, "learning_rate": 1.6000000000000003e-05, "loss": 11.4906, "step": 4 }, { "epoch": 0.0018001800180018, "grad_norm": 6.004848957061768, "learning_rate": 2e-05, "loss": 11.8837, "step": 5 }, { "epoch": 0.00216021602160216, "grad_norm": 4.884363174438477, "learning_rate": 2.4e-05, "loss": 11.68, "step": 6 }, { "epoch": 0.0025202520252025204, "grad_norm": 5.704855442047119, "learning_rate": 2.8000000000000003e-05, "loss": 11.68, "step": 7 }, { "epoch": 0.00288028802880288, "grad_norm": 5.609814643859863, "learning_rate": 3.2000000000000005e-05, "loss": 11.5502, "step": 8 }, { "epoch": 0.0032403240324032404, "grad_norm": 5.240819931030273, "learning_rate": 3.6e-05, "loss": 11.7237, "step": 9 }, { "epoch": 0.0036003600360036, "grad_norm": 5.8036932945251465, "learning_rate": 4e-05, "loss": 11.7074, "step": 10 }, { "epoch": 0.0039603960396039604, "grad_norm": 6.281662464141846, "learning_rate": 4.4000000000000006e-05, "loss": 11.4896, "step": 11 }, { "epoch": 0.00432043204320432, "grad_norm": 5.4554829597473145, "learning_rate": 4.8e-05, "loss": 11.6188, "step": 12 }, { "epoch": 0.00468046804680468, "grad_norm": 5.878482341766357, "learning_rate": 5.2000000000000004e-05, "loss": 11.3579, "step": 13 }, { "epoch": 0.005040504050405041, "grad_norm": 5.315092086791992, "learning_rate": 5.6000000000000006e-05, "loss": 11.3018, "step": 14 }, { "epoch": 0.0054005400540054005, "grad_norm": 5.732967853546143, "learning_rate": 6e-05, "loss": 11.4945, "step": 15 }, { "epoch": 0.00576057605760576, "grad_norm": 4.960424900054932, "learning_rate": 6.400000000000001e-05, "loss": 11.2206, "step": 16 }, { "epoch": 0.006120612061206121, "grad_norm": 5.72969913482666, "learning_rate": 6.800000000000001e-05, "loss": 11.2825, "step": 17 }, { "epoch": 0.006480648064806481, "grad_norm": 5.419050693511963, "learning_rate": 7.2e-05, "loss": 10.9409, "step": 18 }, { "epoch": 0.006840684068406841, "grad_norm": 5.5492987632751465, "learning_rate": 7.6e-05, "loss": 11.4013, "step": 19 }, { "epoch": 0.0072007200720072, "grad_norm": 5.730124473571777, "learning_rate": 8e-05, "loss": 11.2501, "step": 20 }, { "epoch": 0.007560756075607561, "grad_norm": 6.309354305267334, "learning_rate": 8.4e-05, "loss": 11.0562, "step": 21 }, { "epoch": 0.007920792079207921, "grad_norm": 5.861652374267578, "learning_rate": 8.800000000000001e-05, "loss": 11.0803, "step": 22 }, { "epoch": 0.008280828082808282, "grad_norm": 6.468849182128906, "learning_rate": 9.200000000000001e-05, "loss": 11.2922, "step": 23 }, { "epoch": 0.00864086408640864, "grad_norm": 7.292991638183594, "learning_rate": 9.6e-05, "loss": 10.8244, "step": 24 }, { "epoch": 0.009000900090009001, "grad_norm": 9.702167510986328, "learning_rate": 0.0001, "loss": 10.7772, "step": 25 }, { "epoch": 0.00936093609360936, "grad_norm": 5.527791976928711, "learning_rate": 0.00010400000000000001, "loss": 11.0536, "step": 26 }, { "epoch": 0.00972097209720972, "grad_norm": 5.660224437713623, "learning_rate": 0.00010800000000000001, "loss": 11.3104, "step": 27 }, { "epoch": 0.010081008100810081, "grad_norm": 5.601504802703857, "learning_rate": 0.00011200000000000001, "loss": 10.9429, "step": 28 }, { "epoch": 0.01044104410441044, "grad_norm": 5.513530731201172, "learning_rate": 0.000116, "loss": 10.9418, "step": 29 }, { "epoch": 0.010801080108010801, "grad_norm": 5.222030162811279, "learning_rate": 0.00012, "loss": 10.6514, "step": 30 }, { "epoch": 0.011161116111611162, "grad_norm": 5.038498401641846, "learning_rate": 0.000124, "loss": 10.8715, "step": 31 }, { "epoch": 0.01152115211521152, "grad_norm": 5.638880729675293, "learning_rate": 0.00012800000000000002, "loss": 10.3996, "step": 32 }, { "epoch": 0.011881188118811881, "grad_norm": 5.269554615020752, "learning_rate": 0.000132, "loss": 10.3747, "step": 33 }, { "epoch": 0.012241224122412242, "grad_norm": 5.762288570404053, "learning_rate": 0.00013600000000000003, "loss": 10.5589, "step": 34 }, { "epoch": 0.012601260126012601, "grad_norm": 5.092434406280518, "learning_rate": 0.00014, "loss": 10.2383, "step": 35 }, { "epoch": 0.012961296129612962, "grad_norm": 5.491393089294434, "learning_rate": 0.000144, "loss": 10.0614, "step": 36 }, { "epoch": 0.01332133213321332, "grad_norm": 5.155731201171875, "learning_rate": 0.000148, "loss": 9.8436, "step": 37 }, { "epoch": 0.013681368136813681, "grad_norm": 5.45465087890625, "learning_rate": 0.000152, "loss": 10.14, "step": 38 }, { "epoch": 0.014041404140414042, "grad_norm": 5.7989606857299805, "learning_rate": 0.00015600000000000002, "loss": 9.9675, "step": 39 }, { "epoch": 0.0144014401440144, "grad_norm": 5.706261157989502, "learning_rate": 0.00016, "loss": 9.5643, "step": 40 }, { "epoch": 0.014761476147614761, "grad_norm": 5.091904640197754, "learning_rate": 0.000164, "loss": 9.5091, "step": 41 }, { "epoch": 0.015121512151215122, "grad_norm": 5.238409996032715, "learning_rate": 0.000168, "loss": 9.3674, "step": 42 }, { "epoch": 0.015481548154815481, "grad_norm": 4.824737071990967, "learning_rate": 0.000172, "loss": 9.5337, "step": 43 }, { "epoch": 0.015841584158415842, "grad_norm": 4.952434539794922, "learning_rate": 0.00017600000000000002, "loss": 9.2853, "step": 44 }, { "epoch": 0.016201620162016202, "grad_norm": 5.05830192565918, "learning_rate": 0.00018, "loss": 9.5244, "step": 45 }, { "epoch": 0.016561656165616563, "grad_norm": 4.598903656005859, "learning_rate": 0.00018400000000000003, "loss": 9.1549, "step": 46 }, { "epoch": 0.01692169216921692, "grad_norm": 5.076130390167236, "learning_rate": 0.000188, "loss": 9.4544, "step": 47 }, { "epoch": 0.01728172817281728, "grad_norm": 4.568606853485107, "learning_rate": 0.000192, "loss": 9.3589, "step": 48 }, { "epoch": 0.01764176417641764, "grad_norm": 6.102865219116211, "learning_rate": 0.000196, "loss": 9.3574, "step": 49 }, { "epoch": 0.018001800180018002, "grad_norm": 6.295242786407471, "learning_rate": 0.0002, "loss": 9.2564, "step": 50 }, { "epoch": 0.018361836183618363, "grad_norm": 4.133947849273682, "learning_rate": 0.0001999999336410622, "loss": 9.0434, "step": 51 }, { "epoch": 0.01872187218721872, "grad_norm": 3.816929817199707, "learning_rate": 0.00019999973456433681, "loss": 8.9869, "step": 52 }, { "epoch": 0.01908190819081908, "grad_norm": 3.87251877784729, "learning_rate": 0.00019999940277008808, "loss": 8.9041, "step": 53 }, { "epoch": 0.01944194419441944, "grad_norm": 3.925431728363037, "learning_rate": 0.00019999893825875637, "loss": 8.8908, "step": 54 }, { "epoch": 0.019801980198019802, "grad_norm": 4.555037975311279, "learning_rate": 0.00019999834103095812, "loss": 8.9972, "step": 55 }, { "epoch": 0.020162016201620163, "grad_norm": 4.029503345489502, "learning_rate": 0.00019999761108748597, "loss": 8.9023, "step": 56 }, { "epoch": 0.020522052205220524, "grad_norm": 3.193443536758423, "learning_rate": 0.00019999674842930876, "loss": 8.6223, "step": 57 }, { "epoch": 0.02088208820882088, "grad_norm": 3.635911226272583, "learning_rate": 0.0001999957530575713, "loss": 8.4718, "step": 58 }, { "epoch": 0.02124212421242124, "grad_norm": 3.919508218765259, "learning_rate": 0.00019999462497359466, "loss": 8.4637, "step": 59 }, { "epoch": 0.021602160216021602, "grad_norm": 3.876338481903076, "learning_rate": 0.000199993364178876, "loss": 8.576, "step": 60 }, { "epoch": 0.021962196219621963, "grad_norm": 3.777447462081909, "learning_rate": 0.00019999197067508865, "loss": 8.3571, "step": 61 }, { "epoch": 0.022322232223222323, "grad_norm": 3.5673623085021973, "learning_rate": 0.000199990444464082, "loss": 8.5479, "step": 62 }, { "epoch": 0.02268226822682268, "grad_norm": 3.583082914352417, "learning_rate": 0.00019998878554788166, "loss": 8.1977, "step": 63 }, { "epoch": 0.02304230423042304, "grad_norm": 2.835787773132324, "learning_rate": 0.00019998699392868922, "loss": 8.2202, "step": 64 }, { "epoch": 0.023402340234023402, "grad_norm": 3.372305154800415, "learning_rate": 0.00019998506960888256, "loss": 8.3326, "step": 65 }, { "epoch": 0.023762376237623763, "grad_norm": 3.7524638175964355, "learning_rate": 0.00019998301259101554, "loss": 8.2353, "step": 66 }, { "epoch": 0.024122412241224123, "grad_norm": 3.233792304992676, "learning_rate": 0.00019998082287781826, "loss": 7.8709, "step": 67 }, { "epoch": 0.024482448244824484, "grad_norm": 3.251406192779541, "learning_rate": 0.0001999785004721968, "loss": 8.3691, "step": 68 }, { "epoch": 0.02484248424842484, "grad_norm": 3.2626826763153076, "learning_rate": 0.00019997604537723342, "loss": 8.1906, "step": 69 }, { "epoch": 0.025202520252025202, "grad_norm": 3.216583490371704, "learning_rate": 0.00019997345759618647, "loss": 7.9592, "step": 70 }, { "epoch": 0.025562556255625563, "grad_norm": 3.309875011444092, "learning_rate": 0.0001999707371324904, "loss": 8.0691, "step": 71 }, { "epoch": 0.025922592259225923, "grad_norm": 3.656224250793457, "learning_rate": 0.00019996788398975578, "loss": 8.0343, "step": 72 }, { "epoch": 0.026282628262826284, "grad_norm": 3.816601276397705, "learning_rate": 0.00019996489817176918, "loss": 8.3206, "step": 73 }, { "epoch": 0.02664266426642664, "grad_norm": 4.16778564453125, "learning_rate": 0.00019996177968249334, "loss": 7.9706, "step": 74 }, { "epoch": 0.027002700270027002, "grad_norm": 5.989762306213379, "learning_rate": 0.0001999585285260671, "loss": 8.1316, "step": 75 }, { "epoch": 0.027362736273627362, "grad_norm": 6.801653861999512, "learning_rate": 0.00019995514470680527, "loss": 8.0789, "step": 76 }, { "epoch": 0.027722772277227723, "grad_norm": 6.341710090637207, "learning_rate": 0.00019995162822919883, "loss": 7.5837, "step": 77 }, { "epoch": 0.028082808280828084, "grad_norm": 4.6270575523376465, "learning_rate": 0.0001999479790979147, "loss": 7.8758, "step": 78 }, { "epoch": 0.028442844284428444, "grad_norm": 3.671124219894409, "learning_rate": 0.00019994419731779602, "loss": 7.8061, "step": 79 }, { "epoch": 0.0288028802880288, "grad_norm": 3.308453321456909, "learning_rate": 0.0001999402828938618, "loss": 8.104, "step": 80 }, { "epoch": 0.029162916291629162, "grad_norm": 2.890606641769409, "learning_rate": 0.00019993623583130723, "loss": 7.624, "step": 81 }, { "epoch": 0.029522952295229523, "grad_norm": 3.6852352619171143, "learning_rate": 0.0001999320561355035, "loss": 7.7895, "step": 82 }, { "epoch": 0.029882988298829884, "grad_norm": 3.132948160171509, "learning_rate": 0.00019992774381199778, "loss": 7.9111, "step": 83 }, { "epoch": 0.030243024302430244, "grad_norm": 3.1692230701446533, "learning_rate": 0.00019992329886651331, "loss": 7.3854, "step": 84 }, { "epoch": 0.0306030603060306, "grad_norm": 2.8339498043060303, "learning_rate": 0.00019991872130494933, "loss": 7.4969, "step": 85 }, { "epoch": 0.030963096309630962, "grad_norm": 2.557858943939209, "learning_rate": 0.00019991401113338104, "loss": 7.5541, "step": 86 }, { "epoch": 0.031323132313231326, "grad_norm": 3.0587706565856934, "learning_rate": 0.00019990916835805974, "loss": 7.1242, "step": 87 }, { "epoch": 0.031683168316831684, "grad_norm": 3.4932024478912354, "learning_rate": 0.00019990419298541263, "loss": 7.0832, "step": 88 }, { "epoch": 0.03204320432043204, "grad_norm": 3.3262085914611816, "learning_rate": 0.00019989908502204292, "loss": 7.0744, "step": 89 }, { "epoch": 0.032403240324032405, "grad_norm": 2.8556482791900635, "learning_rate": 0.00019989384447472984, "loss": 7.3702, "step": 90 }, { "epoch": 0.03276327632763276, "grad_norm": 3.247647762298584, "learning_rate": 0.00019988847135042842, "loss": 7.356, "step": 91 }, { "epoch": 0.033123312331233126, "grad_norm": 2.9567129611968994, "learning_rate": 0.00019988296565626987, "loss": 7.0572, "step": 92 }, { "epoch": 0.03348334833483348, "grad_norm": 3.328278064727783, "learning_rate": 0.00019987732739956115, "loss": 6.9327, "step": 93 }, { "epoch": 0.03384338433843384, "grad_norm": 3.0802903175354004, "learning_rate": 0.00019987155658778529, "loss": 7.3882, "step": 94 }, { "epoch": 0.034203420342034205, "grad_norm": 3.4182589054107666, "learning_rate": 0.00019986565322860115, "loss": 7.3887, "step": 95 }, { "epoch": 0.03456345634563456, "grad_norm": 3.3240511417388916, "learning_rate": 0.00019985961732984356, "loss": 7.045, "step": 96 }, { "epoch": 0.034923492349234926, "grad_norm": 3.699915647506714, "learning_rate": 0.00019985344889952327, "loss": 7.2765, "step": 97 }, { "epoch": 0.03528352835283528, "grad_norm": 3.5944244861602783, "learning_rate": 0.00019984714794582683, "loss": 7.3357, "step": 98 }, { "epoch": 0.03564356435643564, "grad_norm": 4.225754261016846, "learning_rate": 0.00019984071447711675, "loss": 7.1458, "step": 99 }, { "epoch": 0.036003600360036005, "grad_norm": 6.09434700012207, "learning_rate": 0.0001998341485019314, "loss": 7.3447, "step": 100 }, { "epoch": 0.03636363636363636, "grad_norm": 4.437989711761475, "learning_rate": 0.000199827450028985, "loss": 7.1732, "step": 101 }, { "epoch": 0.036723672367236726, "grad_norm": 4.548201560974121, "learning_rate": 0.00019982061906716764, "loss": 7.2764, "step": 102 }, { "epoch": 0.03708370837083708, "grad_norm": 3.839555263519287, "learning_rate": 0.00019981365562554522, "loss": 7.3635, "step": 103 }, { "epoch": 0.03744374437443744, "grad_norm": 3.7629542350769043, "learning_rate": 0.00019980655971335945, "loss": 7.5026, "step": 104 }, { "epoch": 0.037803780378037805, "grad_norm": 3.7448863983154297, "learning_rate": 0.00019979933134002789, "loss": 7.0869, "step": 105 }, { "epoch": 0.03816381638163816, "grad_norm": 4.550889492034912, "learning_rate": 0.00019979197051514386, "loss": 6.6809, "step": 106 }, { "epoch": 0.038523852385238526, "grad_norm": 3.9035253524780273, "learning_rate": 0.00019978447724847652, "loss": 6.9597, "step": 107 }, { "epoch": 0.03888388838883888, "grad_norm": 3.1347122192382812, "learning_rate": 0.00019977685154997082, "loss": 6.6319, "step": 108 }, { "epoch": 0.03924392439243925, "grad_norm": 3.7817418575286865, "learning_rate": 0.0001997690934297473, "loss": 6.8094, "step": 109 }, { "epoch": 0.039603960396039604, "grad_norm": 2.7304024696350098, "learning_rate": 0.00019976120289810247, "loss": 6.7951, "step": 110 }, { "epoch": 0.03996399639963996, "grad_norm": 3.387932300567627, "learning_rate": 0.00019975317996550845, "loss": 6.7153, "step": 111 }, { "epoch": 0.040324032403240326, "grad_norm": 2.864016532897949, "learning_rate": 0.0001997450246426131, "loss": 6.7663, "step": 112 }, { "epoch": 0.04068406840684068, "grad_norm": 2.8113808631896973, "learning_rate": 0.00019973673694024, "loss": 6.629, "step": 113 }, { "epoch": 0.04104410441044105, "grad_norm": 2.6250040531158447, "learning_rate": 0.00019972831686938843, "loss": 6.2432, "step": 114 }, { "epoch": 0.041404140414041404, "grad_norm": 2.955963134765625, "learning_rate": 0.00019971976444123327, "loss": 6.7115, "step": 115 }, { "epoch": 0.04176417641764176, "grad_norm": 3.0330076217651367, "learning_rate": 0.00019971107966712518, "loss": 6.6801, "step": 116 }, { "epoch": 0.042124212421242126, "grad_norm": 3.377650260925293, "learning_rate": 0.00019970226255859038, "loss": 6.523, "step": 117 }, { "epoch": 0.04248424842484248, "grad_norm": 3.515782117843628, "learning_rate": 0.00019969331312733076, "loss": 6.7592, "step": 118 }, { "epoch": 0.04284428442844285, "grad_norm": 3.7806248664855957, "learning_rate": 0.0001996842313852238, "loss": 6.6889, "step": 119 }, { "epoch": 0.043204320432043204, "grad_norm": 3.435762405395508, "learning_rate": 0.0001996750173443226, "loss": 6.7071, "step": 120 }, { "epoch": 0.04356435643564356, "grad_norm": 3.39365553855896, "learning_rate": 0.00019966567101685587, "loss": 6.6396, "step": 121 }, { "epoch": 0.043924392439243926, "grad_norm": 3.594801425933838, "learning_rate": 0.0001996561924152278, "loss": 6.435, "step": 122 }, { "epoch": 0.04428442844284428, "grad_norm": 4.025651454925537, "learning_rate": 0.00019964658155201829, "loss": 7.0821, "step": 123 }, { "epoch": 0.04464446444644465, "grad_norm": 3.463228464126587, "learning_rate": 0.00019963683843998253, "loss": 6.5565, "step": 124 }, { "epoch": 0.045004500450045004, "grad_norm": 4.928868770599365, "learning_rate": 0.00019962696309205148, "loss": 6.7665, "step": 125 }, { "epoch": 0.04536453645364536, "grad_norm": 4.847606658935547, "learning_rate": 0.00019961695552133145, "loss": 6.4155, "step": 126 }, { "epoch": 0.045724572457245725, "grad_norm": 6.5078654289245605, "learning_rate": 0.00019960681574110426, "loss": 6.878, "step": 127 }, { "epoch": 0.04608460846084608, "grad_norm": 4.644542217254639, "learning_rate": 0.0001995965437648273, "loss": 6.4635, "step": 128 }, { "epoch": 0.04644464446444645, "grad_norm": 4.330336093902588, "learning_rate": 0.00019958613960613318, "loss": 6.434, "step": 129 }, { "epoch": 0.046804680468046804, "grad_norm": 3.3238542079925537, "learning_rate": 0.00019957560327883017, "loss": 6.1268, "step": 130 }, { "epoch": 0.04716471647164717, "grad_norm": 3.5520670413970947, "learning_rate": 0.0001995649347969019, "loss": 6.4838, "step": 131 }, { "epoch": 0.047524752475247525, "grad_norm": 3.1387791633605957, "learning_rate": 0.0001995541341745072, "loss": 6.042, "step": 132 }, { "epoch": 0.04788478847884788, "grad_norm": 5.484352111816406, "learning_rate": 0.0001995432014259806, "loss": 6.2728, "step": 133 }, { "epoch": 0.04824482448244825, "grad_norm": 3.3683643341064453, "learning_rate": 0.00019953213656583168, "loss": 6.0557, "step": 134 }, { "epoch": 0.048604860486048604, "grad_norm": 2.7083218097686768, "learning_rate": 0.00019952093960874556, "loss": 6.4812, "step": 135 }, { "epoch": 0.04896489648964897, "grad_norm": 2.7084672451019287, "learning_rate": 0.00019950961056958258, "loss": 6.6694, "step": 136 }, { "epoch": 0.049324932493249325, "grad_norm": 2.790422201156616, "learning_rate": 0.00019949814946337838, "loss": 6.2798, "step": 137 }, { "epoch": 0.04968496849684968, "grad_norm": 3.1344246864318848, "learning_rate": 0.00019948655630534396, "loss": 6.1944, "step": 138 }, { "epoch": 0.05004500450045005, "grad_norm": 3.3408470153808594, "learning_rate": 0.00019947483111086545, "loss": 6.3483, "step": 139 }, { "epoch": 0.050405040504050404, "grad_norm": 2.659766435623169, "learning_rate": 0.00019946297389550433, "loss": 6.2585, "step": 140 }, { "epoch": 0.05076507650765077, "grad_norm": 2.8162424564361572, "learning_rate": 0.0001994509846749972, "loss": 6.349, "step": 141 }, { "epoch": 0.051125112511251125, "grad_norm": 3.114379644393921, "learning_rate": 0.0001994388634652559, "loss": 6.4585, "step": 142 }, { "epoch": 0.05148514851485148, "grad_norm": 3.2588369846343994, "learning_rate": 0.00019942661028236745, "loss": 6.2732, "step": 143 }, { "epoch": 0.051845184518451846, "grad_norm": 3.0048773288726807, "learning_rate": 0.00019941422514259402, "loss": 6.4056, "step": 144 }, { "epoch": 0.052205220522052204, "grad_norm": 2.687142848968506, "learning_rate": 0.00019940170806237293, "loss": 6.118, "step": 145 }, { "epoch": 0.05256525652565257, "grad_norm": 2.4336202144622803, "learning_rate": 0.00019938905905831654, "loss": 6.3721, "step": 146 }, { "epoch": 0.052925292529252925, "grad_norm": 3.150280475616455, "learning_rate": 0.00019937627814721237, "loss": 5.7968, "step": 147 }, { "epoch": 0.05328532853285328, "grad_norm": 3.0133798122406006, "learning_rate": 0.00019936336534602295, "loss": 6.0878, "step": 148 }, { "epoch": 0.053645364536453646, "grad_norm": 3.5488665103912354, "learning_rate": 0.0001993503206718859, "loss": 6.6781, "step": 149 }, { "epoch": 0.054005400540054004, "grad_norm": 4.942127704620361, "learning_rate": 0.0001993371441421138, "loss": 6.2772, "step": 150 }, { "epoch": 0.05436543654365437, "grad_norm": 4.3673200607299805, "learning_rate": 0.00019932383577419432, "loss": 6.272, "step": 151 }, { "epoch": 0.054725472547254725, "grad_norm": 2.8859446048736572, "learning_rate": 0.00019931039558578997, "loss": 5.7987, "step": 152 }, { "epoch": 0.05508550855085508, "grad_norm": 3.215345859527588, "learning_rate": 0.00019929682359473834, "loss": 5.9682, "step": 153 }, { "epoch": 0.055445544554455446, "grad_norm": 3.1671488285064697, "learning_rate": 0.00019928311981905184, "loss": 6.6487, "step": 154 }, { "epoch": 0.0558055805580558, "grad_norm": 3.035339832305908, "learning_rate": 0.00019926928427691786, "loss": 6.3195, "step": 155 }, { "epoch": 0.05616561656165617, "grad_norm": 2.819296360015869, "learning_rate": 0.00019925531698669862, "loss": 6.2373, "step": 156 }, { "epoch": 0.056525652565256525, "grad_norm": 2.6943399906158447, "learning_rate": 0.00019924121796693127, "loss": 5.7856, "step": 157 }, { "epoch": 0.05688568856885689, "grad_norm": 3.4273905754089355, "learning_rate": 0.00019922698723632767, "loss": 6.406, "step": 158 }, { "epoch": 0.057245724572457246, "grad_norm": 3.285869836807251, "learning_rate": 0.00019921262481377455, "loss": 6.297, "step": 159 }, { "epoch": 0.0576057605760576, "grad_norm": 3.145364761352539, "learning_rate": 0.0001991981307183334, "loss": 5.9435, "step": 160 }, { "epoch": 0.05796579657965797, "grad_norm": 3.239286184310913, "learning_rate": 0.0001991835049692405, "loss": 5.8419, "step": 161 }, { "epoch": 0.058325832583258325, "grad_norm": 2.5348026752471924, "learning_rate": 0.00019916874758590684, "loss": 5.8359, "step": 162 }, { "epoch": 0.05868586858685869, "grad_norm": 2.9281060695648193, "learning_rate": 0.0001991538585879181, "loss": 5.6654, "step": 163 }, { "epoch": 0.059045904590459046, "grad_norm": 3.6080162525177, "learning_rate": 0.0001991388379950346, "loss": 6.3303, "step": 164 }, { "epoch": 0.0594059405940594, "grad_norm": 3.5625193119049072, "learning_rate": 0.00019912368582719142, "loss": 5.9362, "step": 165 }, { "epoch": 0.05976597659765977, "grad_norm": 2.861238956451416, "learning_rate": 0.00019910840210449817, "loss": 5.6185, "step": 166 }, { "epoch": 0.060126012601260125, "grad_norm": 3.1427133083343506, "learning_rate": 0.00019909298684723904, "loss": 5.9749, "step": 167 }, { "epoch": 0.06048604860486049, "grad_norm": 2.6581568717956543, "learning_rate": 0.0001990774400758729, "loss": 5.7056, "step": 168 }, { "epoch": 0.060846084608460846, "grad_norm": 2.240708351135254, "learning_rate": 0.00019906176181103304, "loss": 6.0279, "step": 169 }, { "epoch": 0.0612061206120612, "grad_norm": 2.948249578475952, "learning_rate": 0.00019904595207352737, "loss": 5.6673, "step": 170 }, { "epoch": 0.06156615661566157, "grad_norm": 2.8593263626098633, "learning_rate": 0.00019903001088433816, "loss": 5.6589, "step": 171 }, { "epoch": 0.061926192619261924, "grad_norm": 2.995178699493408, "learning_rate": 0.0001990139382646223, "loss": 5.6627, "step": 172 }, { "epoch": 0.06228622862286229, "grad_norm": 3.6065990924835205, "learning_rate": 0.000198997734235711, "loss": 6.2448, "step": 173 }, { "epoch": 0.06264626462646265, "grad_norm": 4.687837600708008, "learning_rate": 0.00019898139881910986, "loss": 5.9976, "step": 174 }, { "epoch": 0.063006300630063, "grad_norm": 4.329146862030029, "learning_rate": 0.00019896493203649897, "loss": 6.4935, "step": 175 }, { "epoch": 0.06336633663366337, "grad_norm": 3.881782054901123, "learning_rate": 0.00019894833390973266, "loss": 5.8349, "step": 176 }, { "epoch": 0.06372637263726373, "grad_norm": 4.233585834503174, "learning_rate": 0.00019893160446083963, "loss": 5.9879, "step": 177 }, { "epoch": 0.06408640864086408, "grad_norm": 3.393862247467041, "learning_rate": 0.0001989147437120228, "loss": 5.8239, "step": 178 }, { "epoch": 0.06444644464446445, "grad_norm": 3.3493380546569824, "learning_rate": 0.00019889775168565943, "loss": 6.1353, "step": 179 }, { "epoch": 0.06480648064806481, "grad_norm": 3.0439703464508057, "learning_rate": 0.000198880628404301, "loss": 6.0866, "step": 180 }, { "epoch": 0.06516651665166516, "grad_norm": 3.1560416221618652, "learning_rate": 0.0001988633738906731, "loss": 5.6327, "step": 181 }, { "epoch": 0.06552655265526552, "grad_norm": 4.7591657638549805, "learning_rate": 0.00019884598816767563, "loss": 5.839, "step": 182 }, { "epoch": 0.06588658865886589, "grad_norm": 2.586634874343872, "learning_rate": 0.0001988284712583825, "loss": 5.4748, "step": 183 }, { "epoch": 0.06624662466246625, "grad_norm": 2.2929999828338623, "learning_rate": 0.0001988108231860418, "loss": 5.8326, "step": 184 }, { "epoch": 0.0666066606660666, "grad_norm": 2.576805830001831, "learning_rate": 0.0001987930439740757, "loss": 5.8293, "step": 185 }, { "epoch": 0.06696669666966697, "grad_norm": 3.108707904815674, "learning_rate": 0.0001987751336460803, "loss": 5.9231, "step": 186 }, { "epoch": 0.06732673267326733, "grad_norm": 2.368267059326172, "learning_rate": 0.00019875709222582594, "loss": 5.6469, "step": 187 }, { "epoch": 0.06768676867686768, "grad_norm": 3.020864486694336, "learning_rate": 0.0001987389197372567, "loss": 5.8781, "step": 188 }, { "epoch": 0.06804680468046805, "grad_norm": 2.432720184326172, "learning_rate": 0.00019872061620449078, "loss": 6.2033, "step": 189 }, { "epoch": 0.06840684068406841, "grad_norm": 2.4212331771850586, "learning_rate": 0.00019870218165182025, "loss": 5.7962, "step": 190 }, { "epoch": 0.06876687668766877, "grad_norm": 2.8409509658813477, "learning_rate": 0.00019868361610371097, "loss": 5.6254, "step": 191 }, { "epoch": 0.06912691269126912, "grad_norm": 2.1944754123687744, "learning_rate": 0.00019866491958480284, "loss": 5.5786, "step": 192 }, { "epoch": 0.06948694869486949, "grad_norm": 3.18087100982666, "learning_rate": 0.00019864609211990946, "loss": 5.561, "step": 193 }, { "epoch": 0.06984698469846985, "grad_norm": 2.7998907566070557, "learning_rate": 0.0001986271337340182, "loss": 5.8962, "step": 194 }, { "epoch": 0.0702070207020702, "grad_norm": 2.61316180229187, "learning_rate": 0.00019860804445229023, "loss": 5.6438, "step": 195 }, { "epoch": 0.07056705670567057, "grad_norm": 2.3283307552337646, "learning_rate": 0.0001985888243000605, "loss": 5.7617, "step": 196 }, { "epoch": 0.07092709270927093, "grad_norm": 3.0777363777160645, "learning_rate": 0.00019856947330283752, "loss": 5.5723, "step": 197 }, { "epoch": 0.07128712871287128, "grad_norm": 3.0738508701324463, "learning_rate": 0.00019854999148630355, "loss": 5.8281, "step": 198 }, { "epoch": 0.07164716471647165, "grad_norm": 3.864041328430176, "learning_rate": 0.00019853037887631448, "loss": 6.1932, "step": 199 }, { "epoch": 0.07200720072007201, "grad_norm": 4.276803493499756, "learning_rate": 0.0001985106354988997, "loss": 6.0477, "step": 200 }, { "epoch": 0.07236723672367237, "grad_norm": 4.5397748947143555, "learning_rate": 0.0001984907613802622, "loss": 5.4879, "step": 201 }, { "epoch": 0.07272727272727272, "grad_norm": 3.2944319248199463, "learning_rate": 0.0001984707565467785, "loss": 6.1137, "step": 202 }, { "epoch": 0.07308730873087309, "grad_norm": 3.5978901386260986, "learning_rate": 0.0001984506210249986, "loss": 5.8026, "step": 203 }, { "epoch": 0.07344734473447345, "grad_norm": 3.680521249771118, "learning_rate": 0.00019843035484164593, "loss": 5.7198, "step": 204 }, { "epoch": 0.0738073807380738, "grad_norm": 2.7733376026153564, "learning_rate": 0.00019840995802361734, "loss": 5.8044, "step": 205 }, { "epoch": 0.07416741674167417, "grad_norm": 3.0227551460266113, "learning_rate": 0.00019838943059798304, "loss": 5.625, "step": 206 }, { "epoch": 0.07452745274527453, "grad_norm": 2.693079710006714, "learning_rate": 0.00019836877259198662, "loss": 5.3865, "step": 207 }, { "epoch": 0.07488748874887488, "grad_norm": 2.970263957977295, "learning_rate": 0.00019834798403304494, "loss": 5.4135, "step": 208 }, { "epoch": 0.07524752475247524, "grad_norm": 2.4294893741607666, "learning_rate": 0.0001983270649487481, "loss": 5.59, "step": 209 }, { "epoch": 0.07560756075607561, "grad_norm": 2.7756547927856445, "learning_rate": 0.0001983060153668595, "loss": 5.3611, "step": 210 }, { "epoch": 0.07596759675967597, "grad_norm": 2.610945224761963, "learning_rate": 0.00019828483531531568, "loss": 5.6271, "step": 211 }, { "epoch": 0.07632763276327632, "grad_norm": 3.3113486766815186, "learning_rate": 0.00019826352482222638, "loss": 5.4726, "step": 212 }, { "epoch": 0.07668766876687669, "grad_norm": 2.1098732948303223, "learning_rate": 0.0001982420839158744, "loss": 5.2566, "step": 213 }, { "epoch": 0.07704770477047705, "grad_norm": 2.361694812774658, "learning_rate": 0.0001982205126247157, "loss": 5.3923, "step": 214 }, { "epoch": 0.0774077407740774, "grad_norm": 2.2943015098571777, "learning_rate": 0.00019819881097737915, "loss": 5.6276, "step": 215 }, { "epoch": 0.07776777677767777, "grad_norm": 2.6616368293762207, "learning_rate": 0.0001981769790026668, "loss": 5.4411, "step": 216 }, { "epoch": 0.07812781278127813, "grad_norm": 2.558661937713623, "learning_rate": 0.00019815501672955358, "loss": 5.0324, "step": 217 }, { "epoch": 0.0784878487848785, "grad_norm": 3.133481025695801, "learning_rate": 0.00019813292418718732, "loss": 5.8089, "step": 218 }, { "epoch": 0.07884788478847884, "grad_norm": 2.8957459926605225, "learning_rate": 0.0001981107014048888, "loss": 5.6448, "step": 219 }, { "epoch": 0.07920792079207921, "grad_norm": 2.435612916946411, "learning_rate": 0.00019808834841215158, "loss": 5.5601, "step": 220 }, { "epoch": 0.07956795679567957, "grad_norm": 2.545100450515747, "learning_rate": 0.0001980658652386421, "loss": 5.6624, "step": 221 }, { "epoch": 0.07992799279927992, "grad_norm": 2.8909523487091064, "learning_rate": 0.00019804325191419956, "loss": 5.9231, "step": 222 }, { "epoch": 0.08028802880288029, "grad_norm": 2.783823251724243, "learning_rate": 0.00019802050846883592, "loss": 6.2122, "step": 223 }, { "epoch": 0.08064806480648065, "grad_norm": 3.4865949153900146, "learning_rate": 0.0001979976349327357, "loss": 6.0217, "step": 224 }, { "epoch": 0.081008100810081, "grad_norm": 4.174108028411865, "learning_rate": 0.00019797463133625626, "loss": 6.0016, "step": 225 }, { "epoch": 0.08136813681368137, "grad_norm": 3.6121280193328857, "learning_rate": 0.00019795149770992745, "loss": 5.5763, "step": 226 }, { "epoch": 0.08172817281728173, "grad_norm": 3.4251668453216553, "learning_rate": 0.00019792823408445174, "loss": 5.8605, "step": 227 }, { "epoch": 0.0820882088208821, "grad_norm": 3.3870620727539062, "learning_rate": 0.0001979048404907041, "loss": 5.8316, "step": 228 }, { "epoch": 0.08244824482448244, "grad_norm": 1.994755506515503, "learning_rate": 0.000197881316959732, "loss": 5.5797, "step": 229 }, { "epoch": 0.08280828082808281, "grad_norm": 2.855531692504883, "learning_rate": 0.00019785766352275542, "loss": 5.6349, "step": 230 }, { "epoch": 0.08316831683168317, "grad_norm": 2.65555739402771, "learning_rate": 0.00019783388021116664, "loss": 5.4636, "step": 231 }, { "epoch": 0.08352835283528352, "grad_norm": 2.921633243560791, "learning_rate": 0.00019780996705653044, "loss": 5.417, "step": 232 }, { "epoch": 0.08388838883888389, "grad_norm": 3.474546194076538, "learning_rate": 0.00019778592409058378, "loss": 5.3859, "step": 233 }, { "epoch": 0.08424842484248425, "grad_norm": 2.511760711669922, "learning_rate": 0.00019776175134523597, "loss": 5.5061, "step": 234 }, { "epoch": 0.0846084608460846, "grad_norm": 2.2720072269439697, "learning_rate": 0.00019773744885256863, "loss": 5.2613, "step": 235 }, { "epoch": 0.08496849684968497, "grad_norm": 2.45263409614563, "learning_rate": 0.0001977130166448355, "loss": 5.7824, "step": 236 }, { "epoch": 0.08532853285328533, "grad_norm": 2.810452461242676, "learning_rate": 0.0001976884547544624, "loss": 5.6098, "step": 237 }, { "epoch": 0.0856885688568857, "grad_norm": 2.936281204223633, "learning_rate": 0.00019766376321404746, "loss": 5.4772, "step": 238 }, { "epoch": 0.08604860486048604, "grad_norm": 3.000941276550293, "learning_rate": 0.00019763894205636072, "loss": 5.4838, "step": 239 }, { "epoch": 0.08640864086408641, "grad_norm": 1.9919918775558472, "learning_rate": 0.00019761399131434427, "loss": 5.1271, "step": 240 }, { "epoch": 0.08676867686768677, "grad_norm": 2.818498373031616, "learning_rate": 0.00019758891102111226, "loss": 5.3386, "step": 241 }, { "epoch": 0.08712871287128712, "grad_norm": 2.278205156326294, "learning_rate": 0.00019756370120995066, "loss": 5.5661, "step": 242 }, { "epoch": 0.08748874887488749, "grad_norm": 2.9450225830078125, "learning_rate": 0.00019753836191431742, "loss": 5.3917, "step": 243 }, { "epoch": 0.08784878487848785, "grad_norm": 2.6072285175323486, "learning_rate": 0.00019751289316784237, "loss": 5.1402, "step": 244 }, { "epoch": 0.08820882088208822, "grad_norm": 1.9053361415863037, "learning_rate": 0.000197487295004327, "loss": 5.4664, "step": 245 }, { "epoch": 0.08856885688568857, "grad_norm": 2.451340436935425, "learning_rate": 0.00019746156745774468, "loss": 5.5551, "step": 246 }, { "epoch": 0.08892889288928893, "grad_norm": 2.6773860454559326, "learning_rate": 0.0001974357105622405, "loss": 5.5615, "step": 247 }, { "epoch": 0.0892889288928893, "grad_norm": 2.6578361988067627, "learning_rate": 0.00019740972435213115, "loss": 5.8491, "step": 248 }, { "epoch": 0.08964896489648964, "grad_norm": 3.2765934467315674, "learning_rate": 0.00019738360886190496, "loss": 5.8359, "step": 249 }, { "epoch": 0.09000900090009001, "grad_norm": 4.44022274017334, "learning_rate": 0.0001973573641262219, "loss": 6.3776, "step": 250 }, { "epoch": 0.09036903690369037, "grad_norm": 3.5060126781463623, "learning_rate": 0.00019733099017991341, "loss": 5.2258, "step": 251 }, { "epoch": 0.09072907290729072, "grad_norm": 3.3604626655578613, "learning_rate": 0.00019730448705798239, "loss": 5.5046, "step": 252 }, { "epoch": 0.09108910891089109, "grad_norm": 2.7966156005859375, "learning_rate": 0.00019727785479560327, "loss": 5.6467, "step": 253 }, { "epoch": 0.09144914491449145, "grad_norm": 2.827324628829956, "learning_rate": 0.0001972510934281218, "loss": 5.5496, "step": 254 }, { "epoch": 0.09180918091809182, "grad_norm": 2.5747945308685303, "learning_rate": 0.0001972242029910551, "loss": 5.1269, "step": 255 }, { "epoch": 0.09216921692169217, "grad_norm": 2.47976016998291, "learning_rate": 0.0001971971835200916, "loss": 5.5411, "step": 256 }, { "epoch": 0.09252925292529253, "grad_norm": 2.3428852558135986, "learning_rate": 0.00019717003505109095, "loss": 5.1775, "step": 257 }, { "epoch": 0.0928892889288929, "grad_norm": 3.0765557289123535, "learning_rate": 0.00019714275762008405, "loss": 5.3954, "step": 258 }, { "epoch": 0.09324932493249324, "grad_norm": 2.549563407897949, "learning_rate": 0.0001971153512632729, "loss": 5.3875, "step": 259 }, { "epoch": 0.09360936093609361, "grad_norm": 2.8717353343963623, "learning_rate": 0.00019708781601703065, "loss": 5.047, "step": 260 }, { "epoch": 0.09396939693969397, "grad_norm": 2.2203121185302734, "learning_rate": 0.00019706015191790145, "loss": 5.2141, "step": 261 }, { "epoch": 0.09432943294329434, "grad_norm": 2.383986473083496, "learning_rate": 0.00019703235900260055, "loss": 5.429, "step": 262 }, { "epoch": 0.09468946894689469, "grad_norm": 2.2943928241729736, "learning_rate": 0.00019700443730801413, "loss": 5.4756, "step": 263 }, { "epoch": 0.09504950495049505, "grad_norm": 1.895622730255127, "learning_rate": 0.0001969763868711992, "loss": 5.1072, "step": 264 }, { "epoch": 0.09540954095409541, "grad_norm": 1.9407848119735718, "learning_rate": 0.0001969482077293838, "loss": 5.0003, "step": 265 }, { "epoch": 0.09576957695769577, "grad_norm": 2.3450191020965576, "learning_rate": 0.00019691989991996663, "loss": 5.3993, "step": 266 }, { "epoch": 0.09612961296129613, "grad_norm": 2.7572624683380127, "learning_rate": 0.00019689146348051719, "loss": 5.648, "step": 267 }, { "epoch": 0.0964896489648965, "grad_norm": 2.0007078647613525, "learning_rate": 0.00019686289844877579, "loss": 5.0566, "step": 268 }, { "epoch": 0.09684968496849684, "grad_norm": 2.1224796772003174, "learning_rate": 0.00019683420486265327, "loss": 4.9926, "step": 269 }, { "epoch": 0.09720972097209721, "grad_norm": 1.9481003284454346, "learning_rate": 0.00019680538276023118, "loss": 5.3721, "step": 270 }, { "epoch": 0.09756975697569757, "grad_norm": 2.778371810913086, "learning_rate": 0.0001967764321797616, "loss": 5.43, "step": 271 }, { "epoch": 0.09792979297929794, "grad_norm": 2.592320442199707, "learning_rate": 0.0001967473531596671, "loss": 5.4041, "step": 272 }, { "epoch": 0.09828982898289829, "grad_norm": 2.72464656829834, "learning_rate": 0.00019671814573854078, "loss": 5.9389, "step": 273 }, { "epoch": 0.09864986498649865, "grad_norm": 3.2835888862609863, "learning_rate": 0.00019668880995514604, "loss": 5.5321, "step": 274 }, { "epoch": 0.09900990099009901, "grad_norm": 4.350769519805908, "learning_rate": 0.00019665934584841682, "loss": 6.0003, "step": 275 }, { "epoch": 0.09936993699369936, "grad_norm": 3.744112968444824, "learning_rate": 0.00019662975345745713, "loss": 5.7387, "step": 276 }, { "epoch": 0.09972997299729973, "grad_norm": 2.7531681060791016, "learning_rate": 0.00019660003282154147, "loss": 5.0919, "step": 277 }, { "epoch": 0.1000900090009001, "grad_norm": 2.2056615352630615, "learning_rate": 0.00019657018398011434, "loss": 5.1607, "step": 278 }, { "epoch": 0.10045004500450044, "grad_norm": 3.7169909477233887, "learning_rate": 0.0001965402069727906, "loss": 5.37, "step": 279 }, { "epoch": 0.10081008100810081, "grad_norm": 4.994881629943848, "learning_rate": 0.00019651010183935498, "loss": 5.4116, "step": 280 }, { "epoch": 0.10117011701170117, "grad_norm": 2.110886335372925, "learning_rate": 0.00019647986861976246, "loss": 5.6185, "step": 281 }, { "epoch": 0.10153015301530154, "grad_norm": 2.4211294651031494, "learning_rate": 0.00019644950735413788, "loss": 5.2895, "step": 282 }, { "epoch": 0.10189018901890189, "grad_norm": 2.3990745544433594, "learning_rate": 0.0001964190180827761, "loss": 5.5752, "step": 283 }, { "epoch": 0.10225022502250225, "grad_norm": 2.2132537364959717, "learning_rate": 0.00019638840084614182, "loss": 4.953, "step": 284 }, { "epoch": 0.10261026102610261, "grad_norm": 2.9952592849731445, "learning_rate": 0.00019635765568486955, "loss": 5.2637, "step": 285 }, { "epoch": 0.10297029702970296, "grad_norm": 2.621925115585327, "learning_rate": 0.00019632678263976368, "loss": 5.3269, "step": 286 }, { "epoch": 0.10333033303330333, "grad_norm": 1.9161827564239502, "learning_rate": 0.0001962957817517982, "loss": 5.2743, "step": 287 }, { "epoch": 0.10369036903690369, "grad_norm": 2.8809425830841064, "learning_rate": 0.00019626465306211687, "loss": 4.8225, "step": 288 }, { "epoch": 0.10405040504050406, "grad_norm": 2.3898348808288574, "learning_rate": 0.00019623339661203301, "loss": 5.2633, "step": 289 }, { "epoch": 0.10441044104410441, "grad_norm": 2.4710440635681152, "learning_rate": 0.00019620201244302952, "loss": 5.1064, "step": 290 }, { "epoch": 0.10477047704770477, "grad_norm": 2.3487277030944824, "learning_rate": 0.00019617050059675878, "loss": 5.3717, "step": 291 }, { "epoch": 0.10513051305130514, "grad_norm": 2.8060383796691895, "learning_rate": 0.0001961388611150427, "loss": 5.0828, "step": 292 }, { "epoch": 0.10549054905490549, "grad_norm": 1.8839240074157715, "learning_rate": 0.00019610709403987246, "loss": 4.8968, "step": 293 }, { "epoch": 0.10585058505850585, "grad_norm": 2.4121265411376953, "learning_rate": 0.00019607519941340867, "loss": 5.3503, "step": 294 }, { "epoch": 0.10621062106210621, "grad_norm": 2.379385232925415, "learning_rate": 0.00019604317727798124, "loss": 5.4793, "step": 295 }, { "epoch": 0.10657065706570656, "grad_norm": 2.5868208408355713, "learning_rate": 0.00019601102767608923, "loss": 5.4413, "step": 296 }, { "epoch": 0.10693069306930693, "grad_norm": 2.258801221847534, "learning_rate": 0.00019597875065040094, "loss": 5.3289, "step": 297 }, { "epoch": 0.10729072907290729, "grad_norm": 3.434406042098999, "learning_rate": 0.0001959463462437537, "loss": 5.8155, "step": 298 }, { "epoch": 0.10765076507650766, "grad_norm": 2.431286573410034, "learning_rate": 0.00019591381449915397, "loss": 5.5256, "step": 299 }, { "epoch": 0.10801080108010801, "grad_norm": 2.626918315887451, "learning_rate": 0.0001958811554597772, "loss": 5.6643, "step": 300 }, { "epoch": 0.10837083708370837, "grad_norm": 4.818332672119141, "learning_rate": 0.00019584836916896781, "loss": 5.7093, "step": 301 }, { "epoch": 0.10873087308730874, "grad_norm": 2.928095579147339, "learning_rate": 0.000195815455670239, "loss": 5.4955, "step": 302 }, { "epoch": 0.10909090909090909, "grad_norm": 2.9136202335357666, "learning_rate": 0.0001957824150072729, "loss": 5.2354, "step": 303 }, { "epoch": 0.10945094509450945, "grad_norm": 2.9085636138916016, "learning_rate": 0.0001957492472239204, "loss": 5.5329, "step": 304 }, { "epoch": 0.10981098109810981, "grad_norm": 1.9821232557296753, "learning_rate": 0.00019571595236420102, "loss": 5.2906, "step": 305 }, { "epoch": 0.11017101710171016, "grad_norm": 2.7555158138275146, "learning_rate": 0.00019568253047230302, "loss": 5.003, "step": 306 }, { "epoch": 0.11053105310531053, "grad_norm": 2.532947063446045, "learning_rate": 0.00019564898159258324, "loss": 5.206, "step": 307 }, { "epoch": 0.11089108910891089, "grad_norm": 2.4232444763183594, "learning_rate": 0.00019561530576956703, "loss": 5.0053, "step": 308 }, { "epoch": 0.11125112511251126, "grad_norm": 3.1183736324310303, "learning_rate": 0.00019558150304794822, "loss": 5.078, "step": 309 }, { "epoch": 0.1116111611161116, "grad_norm": 2.4659345149993896, "learning_rate": 0.00019554757347258907, "loss": 5.1274, "step": 310 }, { "epoch": 0.11197119711971197, "grad_norm": 2.3671152591705322, "learning_rate": 0.0001955135170885202, "loss": 5.4931, "step": 311 }, { "epoch": 0.11233123312331234, "grad_norm": 2.0349647998809814, "learning_rate": 0.0001954793339409405, "loss": 5.1148, "step": 312 }, { "epoch": 0.11269126912691269, "grad_norm": 2.184335470199585, "learning_rate": 0.00019544502407521712, "loss": 5.6701, "step": 313 }, { "epoch": 0.11305130513051305, "grad_norm": 2.2550289630889893, "learning_rate": 0.00019541058753688538, "loss": 5.3741, "step": 314 }, { "epoch": 0.11341134113411341, "grad_norm": 2.894897937774658, "learning_rate": 0.00019537602437164875, "loss": 5.1487, "step": 315 }, { "epoch": 0.11377137713771378, "grad_norm": 2.2867188453674316, "learning_rate": 0.0001953413346253787, "loss": 4.9861, "step": 316 }, { "epoch": 0.11413141314131413, "grad_norm": 2.5983200073242188, "learning_rate": 0.00019530651834411474, "loss": 5.1494, "step": 317 }, { "epoch": 0.11449144914491449, "grad_norm": 2.7470312118530273, "learning_rate": 0.0001952715755740643, "loss": 5.0505, "step": 318 }, { "epoch": 0.11485148514851486, "grad_norm": 2.3131463527679443, "learning_rate": 0.00019523650636160268, "loss": 4.8398, "step": 319 }, { "epoch": 0.1152115211521152, "grad_norm": 2.4200801849365234, "learning_rate": 0.00019520131075327298, "loss": 5.3922, "step": 320 }, { "epoch": 0.11557155715571557, "grad_norm": 1.904872179031372, "learning_rate": 0.0001951659887957861, "loss": 5.5282, "step": 321 }, { "epoch": 0.11593159315931593, "grad_norm": 1.897890329360962, "learning_rate": 0.00019513054053602055, "loss": 5.37, "step": 322 }, { "epoch": 0.11629162916291629, "grad_norm": 2.7623322010040283, "learning_rate": 0.00019509496602102252, "loss": 5.7574, "step": 323 }, { "epoch": 0.11665166516651665, "grad_norm": 4.012531757354736, "learning_rate": 0.00019505926529800576, "loss": 5.5631, "step": 324 }, { "epoch": 0.11701170117011701, "grad_norm": 3.5521020889282227, "learning_rate": 0.00019502343841435151, "loss": 5.8678, "step": 325 }, { "epoch": 0.11737173717371738, "grad_norm": 3.4806554317474365, "learning_rate": 0.00019498748541760846, "loss": 5.2348, "step": 326 }, { "epoch": 0.11773177317731773, "grad_norm": 3.5824429988861084, "learning_rate": 0.00019495140635549261, "loss": 5.5141, "step": 327 }, { "epoch": 0.11809180918091809, "grad_norm": 2.9857945442199707, "learning_rate": 0.00019491520127588738, "loss": 5.2804, "step": 328 }, { "epoch": 0.11845184518451846, "grad_norm": 1.9040719270706177, "learning_rate": 0.00019487887022684336, "loss": 5.0913, "step": 329 }, { "epoch": 0.1188118811881188, "grad_norm": 2.358011245727539, "learning_rate": 0.00019484241325657835, "loss": 5.3166, "step": 330 }, { "epoch": 0.11917191719171917, "grad_norm": 2.452613115310669, "learning_rate": 0.00019480583041347726, "loss": 5.2579, "step": 331 }, { "epoch": 0.11953195319531953, "grad_norm": 1.885299563407898, "learning_rate": 0.0001947691217460921, "loss": 5.2208, "step": 332 }, { "epoch": 0.1198919891989199, "grad_norm": 2.377636671066284, "learning_rate": 0.00019473228730314179, "loss": 5.3096, "step": 333 }, { "epoch": 0.12025202520252025, "grad_norm": 2.386121988296509, "learning_rate": 0.00019469532713351222, "loss": 5.1569, "step": 334 }, { "epoch": 0.12061206120612061, "grad_norm": 1.9236092567443848, "learning_rate": 0.00019465824128625617, "loss": 5.1291, "step": 335 }, { "epoch": 0.12097209720972098, "grad_norm": 2.0662996768951416, "learning_rate": 0.00019462102981059317, "loss": 5.4015, "step": 336 }, { "epoch": 0.12133213321332133, "grad_norm": 2.1840786933898926, "learning_rate": 0.00019458369275590954, "loss": 5.1743, "step": 337 }, { "epoch": 0.12169216921692169, "grad_norm": 2.220691204071045, "learning_rate": 0.00019454623017175812, "loss": 5.0971, "step": 338 }, { "epoch": 0.12205220522052206, "grad_norm": 1.8053256273269653, "learning_rate": 0.00019450864210785858, "loss": 5.347, "step": 339 }, { "epoch": 0.1224122412241224, "grad_norm": 1.9502235651016235, "learning_rate": 0.0001944709286140969, "loss": 5.2748, "step": 340 }, { "epoch": 0.12277227722772277, "grad_norm": 1.7731366157531738, "learning_rate": 0.0001944330897405257, "loss": 5.2389, "step": 341 }, { "epoch": 0.12313231323132313, "grad_norm": 2.857713460922241, "learning_rate": 0.00019439512553736394, "loss": 5.5584, "step": 342 }, { "epoch": 0.1234923492349235, "grad_norm": 1.5755183696746826, "learning_rate": 0.00019435703605499683, "loss": 5.0005, "step": 343 }, { "epoch": 0.12385238523852385, "grad_norm": 2.1318392753601074, "learning_rate": 0.00019431882134397598, "loss": 4.8136, "step": 344 }, { "epoch": 0.12421242124212421, "grad_norm": 2.3851094245910645, "learning_rate": 0.0001942804814550191, "loss": 5.2057, "step": 345 }, { "epoch": 0.12457245724572458, "grad_norm": 2.685013771057129, "learning_rate": 0.0001942420164390101, "loss": 5.5869, "step": 346 }, { "epoch": 0.12493249324932493, "grad_norm": 2.550307035446167, "learning_rate": 0.0001942034263469989, "loss": 5.068, "step": 347 }, { "epoch": 0.1252925292529253, "grad_norm": 2.6020565032958984, "learning_rate": 0.00019416471123020156, "loss": 5.5365, "step": 348 }, { "epoch": 0.12565256525652566, "grad_norm": 2.915011405944824, "learning_rate": 0.0001941258711399998, "loss": 5.5553, "step": 349 }, { "epoch": 0.126012601260126, "grad_norm": 3.451205015182495, "learning_rate": 0.00019408690612794148, "loss": 5.6268, "step": 350 }, { "epoch": 0.12637263726372638, "grad_norm": 4.140198707580566, "learning_rate": 0.00019404781624574011, "loss": 5.5532, "step": 351 }, { "epoch": 0.12673267326732673, "grad_norm": 2.5888521671295166, "learning_rate": 0.00019400860154527493, "loss": 5.0767, "step": 352 }, { "epoch": 0.12709270927092708, "grad_norm": 3.074704170227051, "learning_rate": 0.00019396926207859084, "loss": 5.1876, "step": 353 }, { "epoch": 0.12745274527452746, "grad_norm": 2.2525579929351807, "learning_rate": 0.0001939297978978984, "loss": 4.954, "step": 354 }, { "epoch": 0.1278127812781278, "grad_norm": 2.2643632888793945, "learning_rate": 0.0001938902090555736, "loss": 5.5216, "step": 355 }, { "epoch": 0.12817281728172816, "grad_norm": 2.2976651191711426, "learning_rate": 0.00019385049560415794, "loss": 5.3145, "step": 356 }, { "epoch": 0.12853285328532854, "grad_norm": 2.773254156112671, "learning_rate": 0.00019381065759635822, "loss": 5.2893, "step": 357 }, { "epoch": 0.1288928892889289, "grad_norm": 1.7148758172988892, "learning_rate": 0.0001937706950850466, "loss": 4.8919, "step": 358 }, { "epoch": 0.12925292529252924, "grad_norm": 1.769167184829712, "learning_rate": 0.00019373060812326052, "loss": 5.1463, "step": 359 }, { "epoch": 0.12961296129612962, "grad_norm": 2.7855257987976074, "learning_rate": 0.00019369039676420252, "loss": 5.3797, "step": 360 }, { "epoch": 0.12997299729972997, "grad_norm": 1.7530555725097656, "learning_rate": 0.00019365006106124028, "loss": 5.0662, "step": 361 }, { "epoch": 0.13033303330333032, "grad_norm": 1.963545560836792, "learning_rate": 0.00019360960106790643, "loss": 5.0004, "step": 362 }, { "epoch": 0.1306930693069307, "grad_norm": 1.887229561805725, "learning_rate": 0.0001935690168378987, "loss": 5.1733, "step": 363 }, { "epoch": 0.13105310531053105, "grad_norm": 1.974086046218872, "learning_rate": 0.00019352830842507958, "loss": 5.1651, "step": 364 }, { "epoch": 0.13141314131413143, "grad_norm": 2.0009772777557373, "learning_rate": 0.00019348747588347637, "loss": 5.1493, "step": 365 }, { "epoch": 0.13177317731773178, "grad_norm": 1.8167927265167236, "learning_rate": 0.0001934465192672812, "loss": 4.9338, "step": 366 }, { "epoch": 0.13213321332133213, "grad_norm": 1.8077260255813599, "learning_rate": 0.0001934054386308508, "loss": 4.8103, "step": 367 }, { "epoch": 0.1324932493249325, "grad_norm": 2.6427648067474365, "learning_rate": 0.00019336423402870653, "loss": 5.4321, "step": 368 }, { "epoch": 0.13285328532853286, "grad_norm": 2.079245090484619, "learning_rate": 0.00019332290551553425, "loss": 5.3425, "step": 369 }, { "epoch": 0.1332133213321332, "grad_norm": 1.9009953737258911, "learning_rate": 0.00019328145314618432, "loss": 5.3169, "step": 370 }, { "epoch": 0.13357335733573358, "grad_norm": 1.8705310821533203, "learning_rate": 0.0001932398769756714, "loss": 4.9403, "step": 371 }, { "epoch": 0.13393339333933393, "grad_norm": 1.8321329355239868, "learning_rate": 0.0001931981770591745, "loss": 5.1256, "step": 372 }, { "epoch": 0.13429342934293428, "grad_norm": 2.4803407192230225, "learning_rate": 0.0001931563534520369, "loss": 5.7172, "step": 373 }, { "epoch": 0.13465346534653466, "grad_norm": 2.4806675910949707, "learning_rate": 0.00019311440620976597, "loss": 5.2023, "step": 374 }, { "epoch": 0.135013501350135, "grad_norm": 2.764505386352539, "learning_rate": 0.00019307233538803323, "loss": 5.7188, "step": 375 }, { "epoch": 0.13537353735373536, "grad_norm": 3.1487159729003906, "learning_rate": 0.0001930301410426741, "loss": 5.4085, "step": 376 }, { "epoch": 0.13573357335733574, "grad_norm": 4.365496635437012, "learning_rate": 0.00019298782322968815, "loss": 5.181, "step": 377 }, { "epoch": 0.1360936093609361, "grad_norm": 4.676678657531738, "learning_rate": 0.0001929453820052386, "loss": 4.8029, "step": 378 }, { "epoch": 0.13645364536453644, "grad_norm": 2.482147455215454, "learning_rate": 0.00019290281742565256, "loss": 5.2323, "step": 379 }, { "epoch": 0.13681368136813682, "grad_norm": 2.6805992126464844, "learning_rate": 0.0001928601295474208, "loss": 4.7856, "step": 380 }, { "epoch": 0.13717371737173717, "grad_norm": 2.0542752742767334, "learning_rate": 0.00019281731842719782, "loss": 4.8101, "step": 381 }, { "epoch": 0.13753375337533755, "grad_norm": 1.3544633388519287, "learning_rate": 0.0001927743841218016, "loss": 4.9205, "step": 382 }, { "epoch": 0.1378937893789379, "grad_norm": 1.80148184299469, "learning_rate": 0.00019273132668821364, "loss": 5.2472, "step": 383 }, { "epoch": 0.13825382538253825, "grad_norm": 1.7780407667160034, "learning_rate": 0.00019268814618357886, "loss": 5.1249, "step": 384 }, { "epoch": 0.13861386138613863, "grad_norm": 1.7747406959533691, "learning_rate": 0.00019264484266520547, "loss": 5.0265, "step": 385 }, { "epoch": 0.13897389738973898, "grad_norm": 2.0032706260681152, "learning_rate": 0.00019260141619056507, "loss": 4.8149, "step": 386 }, { "epoch": 0.13933393339333933, "grad_norm": 1.8248376846313477, "learning_rate": 0.00019255786681729225, "loss": 5.0561, "step": 387 }, { "epoch": 0.1396939693969397, "grad_norm": 2.1387648582458496, "learning_rate": 0.0001925141946031849, "loss": 5.2459, "step": 388 }, { "epoch": 0.14005400540054005, "grad_norm": 1.6618605852127075, "learning_rate": 0.0001924703996062038, "loss": 5.5427, "step": 389 }, { "epoch": 0.1404140414041404, "grad_norm": 1.8091217279434204, "learning_rate": 0.00019242648188447272, "loss": 5.3355, "step": 390 }, { "epoch": 0.14077407740774078, "grad_norm": 1.5371900796890259, "learning_rate": 0.0001923824414962784, "loss": 5.0075, "step": 391 }, { "epoch": 0.14113411341134113, "grad_norm": 1.6160904169082642, "learning_rate": 0.00019233827850007027, "loss": 5.0782, "step": 392 }, { "epoch": 0.14149414941494148, "grad_norm": 2.6607415676116943, "learning_rate": 0.0001922939929544605, "loss": 4.9883, "step": 393 }, { "epoch": 0.14185418541854186, "grad_norm": 2.1520042419433594, "learning_rate": 0.00019224958491822396, "loss": 4.8165, "step": 394 }, { "epoch": 0.1422142214221422, "grad_norm": 1.8465698957443237, "learning_rate": 0.000192205054450298, "loss": 4.9328, "step": 395 }, { "epoch": 0.14257425742574256, "grad_norm": 2.0882647037506104, "learning_rate": 0.00019216040160978262, "loss": 5.0565, "step": 396 }, { "epoch": 0.14293429342934294, "grad_norm": 1.8768270015716553, "learning_rate": 0.00019211562645594002, "loss": 4.9836, "step": 397 }, { "epoch": 0.1432943294329433, "grad_norm": 2.7058422565460205, "learning_rate": 0.00019207072904819486, "loss": 5.5061, "step": 398 }, { "epoch": 0.14365436543654364, "grad_norm": 2.3394227027893066, "learning_rate": 0.000192025709446134, "loss": 5.3377, "step": 399 }, { "epoch": 0.14401440144014402, "grad_norm": 2.7763686180114746, "learning_rate": 0.00019198056770950656, "loss": 5.6518, "step": 400 }, { "epoch": 0.14437443744374437, "grad_norm": 3.3545148372650146, "learning_rate": 0.00019193530389822363, "loss": 5.2922, "step": 401 }, { "epoch": 0.14473447344734475, "grad_norm": 1.994736671447754, "learning_rate": 0.00019188991807235844, "loss": 5.0987, "step": 402 }, { "epoch": 0.1450945094509451, "grad_norm": 2.1159956455230713, "learning_rate": 0.00019184441029214608, "loss": 4.9078, "step": 403 }, { "epoch": 0.14545454545454545, "grad_norm": 1.9915326833724976, "learning_rate": 0.00019179878061798347, "loss": 4.8249, "step": 404 }, { "epoch": 0.14581458145814583, "grad_norm": 2.081965208053589, "learning_rate": 0.00019175302911042936, "loss": 4.949, "step": 405 }, { "epoch": 0.14617461746174618, "grad_norm": 1.8846912384033203, "learning_rate": 0.0001917071558302042, "loss": 4.7634, "step": 406 }, { "epoch": 0.14653465346534653, "grad_norm": 2.0804243087768555, "learning_rate": 0.00019166116083819002, "loss": 5.2972, "step": 407 }, { "epoch": 0.1468946894689469, "grad_norm": 1.6286754608154297, "learning_rate": 0.0001916150441954304, "loss": 5.024, "step": 408 }, { "epoch": 0.14725472547254725, "grad_norm": 1.9272427558898926, "learning_rate": 0.00019156880596313033, "loss": 4.902, "step": 409 }, { "epoch": 0.1476147614761476, "grad_norm": 2.1995272636413574, "learning_rate": 0.0001915224462026563, "loss": 5.2471, "step": 410 }, { "epoch": 0.14797479747974798, "grad_norm": 1.864698052406311, "learning_rate": 0.0001914759649755359, "loss": 5.1977, "step": 411 }, { "epoch": 0.14833483348334833, "grad_norm": 1.937286615371704, "learning_rate": 0.0001914293623434581, "loss": 4.7627, "step": 412 }, { "epoch": 0.14869486948694868, "grad_norm": 1.9285743236541748, "learning_rate": 0.00019138263836827288, "loss": 4.8136, "step": 413 }, { "epoch": 0.14905490549054906, "grad_norm": 1.9823040962219238, "learning_rate": 0.00019133579311199133, "loss": 5.1382, "step": 414 }, { "epoch": 0.1494149414941494, "grad_norm": 1.8620997667312622, "learning_rate": 0.00019128882663678546, "loss": 4.9124, "step": 415 }, { "epoch": 0.14977497749774976, "grad_norm": 2.0542571544647217, "learning_rate": 0.00019124173900498818, "loss": 4.6567, "step": 416 }, { "epoch": 0.15013501350135014, "grad_norm": 1.7971733808517456, "learning_rate": 0.00019119453027909323, "loss": 4.8622, "step": 417 }, { "epoch": 0.1504950495049505, "grad_norm": 1.7473721504211426, "learning_rate": 0.00019114720052175498, "loss": 5.1331, "step": 418 }, { "epoch": 0.15085508550855087, "grad_norm": 2.0732085704803467, "learning_rate": 0.0001910997497957885, "loss": 5.1333, "step": 419 }, { "epoch": 0.15121512151215122, "grad_norm": 1.8084912300109863, "learning_rate": 0.0001910521781641694, "loss": 4.8632, "step": 420 }, { "epoch": 0.15157515751575157, "grad_norm": 2.067760705947876, "learning_rate": 0.0001910044856900337, "loss": 5.2845, "step": 421 }, { "epoch": 0.15193519351935195, "grad_norm": 2.5814943313598633, "learning_rate": 0.0001909566724366779, "loss": 5.398, "step": 422 }, { "epoch": 0.1522952295229523, "grad_norm": 2.2594242095947266, "learning_rate": 0.0001909087384675587, "loss": 5.25, "step": 423 }, { "epoch": 0.15265526552655265, "grad_norm": 2.8528106212615967, "learning_rate": 0.000190860683846293, "loss": 5.6809, "step": 424 }, { "epoch": 0.15301530153015303, "grad_norm": 2.9201619625091553, "learning_rate": 0.00019081250863665794, "loss": 5.5664, "step": 425 }, { "epoch": 0.15337533753375338, "grad_norm": 3.6063146591186523, "learning_rate": 0.00019076421290259058, "loss": 5.3827, "step": 426 }, { "epoch": 0.15373537353735373, "grad_norm": 2.526179313659668, "learning_rate": 0.00019071579670818808, "loss": 5.106, "step": 427 }, { "epoch": 0.1540954095409541, "grad_norm": 2.3812053203582764, "learning_rate": 0.00019066726011770726, "loss": 4.8439, "step": 428 }, { "epoch": 0.15445544554455445, "grad_norm": 2.1316640377044678, "learning_rate": 0.00019061860319556496, "loss": 5.3182, "step": 429 }, { "epoch": 0.1548154815481548, "grad_norm": 2.0962913036346436, "learning_rate": 0.00019056982600633755, "loss": 4.9082, "step": 430 }, { "epoch": 0.15517551755175518, "grad_norm": 1.7051233053207397, "learning_rate": 0.0001905209286147611, "loss": 5.1091, "step": 431 }, { "epoch": 0.15553555355535553, "grad_norm": 1.8145262002944946, "learning_rate": 0.00019047191108573125, "loss": 4.7386, "step": 432 }, { "epoch": 0.15589558955895588, "grad_norm": 2.0659284591674805, "learning_rate": 0.00019042277348430288, "loss": 5.0365, "step": 433 }, { "epoch": 0.15625562556255626, "grad_norm": 2.1042418479919434, "learning_rate": 0.0001903735158756905, "loss": 5.0352, "step": 434 }, { "epoch": 0.1566156615661566, "grad_norm": 1.6454683542251587, "learning_rate": 0.00019032413832526773, "loss": 4.6965, "step": 435 }, { "epoch": 0.156975697569757, "grad_norm": 2.451984167098999, "learning_rate": 0.00019027464089856736, "loss": 5.3104, "step": 436 }, { "epoch": 0.15733573357335734, "grad_norm": 1.5949114561080933, "learning_rate": 0.00019022502366128135, "loss": 4.9479, "step": 437 }, { "epoch": 0.1576957695769577, "grad_norm": 1.6969925165176392, "learning_rate": 0.00019017528667926068, "loss": 5.0748, "step": 438 }, { "epoch": 0.15805580558055807, "grad_norm": 1.8219711780548096, "learning_rate": 0.00019012543001851518, "loss": 4.8468, "step": 439 }, { "epoch": 0.15841584158415842, "grad_norm": 1.819425344467163, "learning_rate": 0.00019007545374521355, "loss": 4.6628, "step": 440 }, { "epoch": 0.15877587758775877, "grad_norm": 1.949058175086975, "learning_rate": 0.0001900253579256832, "loss": 5.1057, "step": 441 }, { "epoch": 0.15913591359135915, "grad_norm": 1.5444921255111694, "learning_rate": 0.00018997514262641035, "loss": 5.0297, "step": 442 }, { "epoch": 0.1594959495949595, "grad_norm": 1.449641227722168, "learning_rate": 0.00018992480791403958, "loss": 4.9619, "step": 443 }, { "epoch": 0.15985598559855985, "grad_norm": 2.433344841003418, "learning_rate": 0.00018987435385537404, "loss": 5.3115, "step": 444 }, { "epoch": 0.16021602160216022, "grad_norm": 1.7659337520599365, "learning_rate": 0.00018982378051737538, "loss": 4.9919, "step": 445 }, { "epoch": 0.16057605760576057, "grad_norm": 1.6800904273986816, "learning_rate": 0.0001897730879671634, "loss": 4.7254, "step": 446 }, { "epoch": 0.16093609360936093, "grad_norm": 3.3446006774902344, "learning_rate": 0.00018972227627201617, "loss": 5.5292, "step": 447 }, { "epoch": 0.1612961296129613, "grad_norm": 3.71976375579834, "learning_rate": 0.0001896713454993699, "loss": 5.5853, "step": 448 }, { "epoch": 0.16165616561656165, "grad_norm": 2.7482266426086426, "learning_rate": 0.00018962029571681886, "loss": 5.6693, "step": 449 }, { "epoch": 0.162016201620162, "grad_norm": 3.5169460773468018, "learning_rate": 0.00018956912699211517, "loss": 5.8031, "step": 450 }, { "epoch": 0.16237623762376238, "grad_norm": 3.7753636837005615, "learning_rate": 0.00018951783939316893, "loss": 4.818, "step": 451 }, { "epoch": 0.16273627362736273, "grad_norm": 2.4180219173431396, "learning_rate": 0.00018946643298804793, "loss": 4.6077, "step": 452 }, { "epoch": 0.1630963096309631, "grad_norm": 2.407536745071411, "learning_rate": 0.0001894149078449777, "loss": 5.2159, "step": 453 }, { "epoch": 0.16345634563456346, "grad_norm": 1.8214914798736572, "learning_rate": 0.00018936326403234125, "loss": 5.1427, "step": 454 }, { "epoch": 0.1638163816381638, "grad_norm": 1.9131064414978027, "learning_rate": 0.00018931150161867916, "loss": 4.7128, "step": 455 }, { "epoch": 0.1641764176417642, "grad_norm": 1.8776406049728394, "learning_rate": 0.00018925962067268946, "loss": 4.9725, "step": 456 }, { "epoch": 0.16453645364536454, "grad_norm": 2.3590633869171143, "learning_rate": 0.0001892076212632274, "loss": 4.8201, "step": 457 }, { "epoch": 0.1648964896489649, "grad_norm": 2.2605080604553223, "learning_rate": 0.0001891555034593055, "loss": 4.8645, "step": 458 }, { "epoch": 0.16525652565256527, "grad_norm": 1.9868052005767822, "learning_rate": 0.00018910326733009337, "loss": 5.4631, "step": 459 }, { "epoch": 0.16561656165616562, "grad_norm": 1.6422542333602905, "learning_rate": 0.00018905091294491776, "loss": 5.1917, "step": 460 }, { "epoch": 0.16597659765976597, "grad_norm": 1.5032970905303955, "learning_rate": 0.00018899844037326225, "loss": 4.7756, "step": 461 }, { "epoch": 0.16633663366336635, "grad_norm": 1.892916202545166, "learning_rate": 0.00018894584968476733, "loss": 4.9898, "step": 462 }, { "epoch": 0.1666966696669667, "grad_norm": 2.54939341545105, "learning_rate": 0.00018889314094923024, "loss": 5.1087, "step": 463 }, { "epoch": 0.16705670567056705, "grad_norm": 1.70566987991333, "learning_rate": 0.0001888403142366049, "loss": 4.8872, "step": 464 }, { "epoch": 0.16741674167416742, "grad_norm": 1.6707229614257812, "learning_rate": 0.00018878736961700182, "loss": 5.0393, "step": 465 }, { "epoch": 0.16777677767776777, "grad_norm": 2.0254170894622803, "learning_rate": 0.00018873430716068792, "loss": 5.0949, "step": 466 }, { "epoch": 0.16813681368136812, "grad_norm": 1.7791615724563599, "learning_rate": 0.00018868112693808665, "loss": 5.0134, "step": 467 }, { "epoch": 0.1684968496849685, "grad_norm": 2.100054979324341, "learning_rate": 0.00018862782901977754, "loss": 5.0241, "step": 468 }, { "epoch": 0.16885688568856885, "grad_norm": 2.047647714614868, "learning_rate": 0.0001885744134764966, "loss": 5.1617, "step": 469 }, { "epoch": 0.1692169216921692, "grad_norm": 1.9202643632888794, "learning_rate": 0.00018852088037913577, "loss": 5.1913, "step": 470 }, { "epoch": 0.16957695769576958, "grad_norm": 1.5470832586288452, "learning_rate": 0.00018846722979874297, "loss": 4.935, "step": 471 }, { "epoch": 0.16993699369936993, "grad_norm": 1.6920344829559326, "learning_rate": 0.00018841346180652213, "loss": 5.2467, "step": 472 }, { "epoch": 0.1702970297029703, "grad_norm": 2.0190439224243164, "learning_rate": 0.00018835957647383303, "loss": 5.4482, "step": 473 }, { "epoch": 0.17065706570657066, "grad_norm": 2.1805849075317383, "learning_rate": 0.0001883055738721911, "loss": 5.2908, "step": 474 }, { "epoch": 0.171017101710171, "grad_norm": 2.504727363586426, "learning_rate": 0.00018825145407326747, "loss": 5.1893, "step": 475 }, { "epoch": 0.1713771377137714, "grad_norm": 3.133061647415161, "learning_rate": 0.00018819721714888877, "loss": 5.0389, "step": 476 }, { "epoch": 0.17173717371737174, "grad_norm": 2.387793779373169, "learning_rate": 0.00018814286317103714, "loss": 4.963, "step": 477 }, { "epoch": 0.1720972097209721, "grad_norm": 2.892192840576172, "learning_rate": 0.00018808839221184999, "loss": 5.3117, "step": 478 }, { "epoch": 0.17245724572457247, "grad_norm": 2.097123861312866, "learning_rate": 0.00018803380434362, "loss": 5.2016, "step": 479 }, { "epoch": 0.17281728172817282, "grad_norm": 1.3749529123306274, "learning_rate": 0.00018797909963879503, "loss": 4.8276, "step": 480 }, { "epoch": 0.17317731773177317, "grad_norm": 2.303584575653076, "learning_rate": 0.00018792427816997803, "loss": 5.1623, "step": 481 }, { "epoch": 0.17353735373537355, "grad_norm": 4.135527610778809, "learning_rate": 0.00018786934000992688, "loss": 4.7037, "step": 482 }, { "epoch": 0.1738973897389739, "grad_norm": 1.7476530075073242, "learning_rate": 0.00018781428523155435, "loss": 4.74, "step": 483 }, { "epoch": 0.17425742574257425, "grad_norm": 1.6134731769561768, "learning_rate": 0.00018775911390792795, "loss": 5.1493, "step": 484 }, { "epoch": 0.17461746174617462, "grad_norm": 1.7763808965682983, "learning_rate": 0.00018770382611226987, "loss": 4.8619, "step": 485 }, { "epoch": 0.17497749774977497, "grad_norm": 1.4742817878723145, "learning_rate": 0.00018764842191795698, "loss": 4.8855, "step": 486 }, { "epoch": 0.17533753375337532, "grad_norm": 1.5574274063110352, "learning_rate": 0.00018759290139852048, "loss": 5.187, "step": 487 }, { "epoch": 0.1756975697569757, "grad_norm": 1.4653335809707642, "learning_rate": 0.000187537264627646, "loss": 4.8548, "step": 488 }, { "epoch": 0.17605760576057605, "grad_norm": 1.5503312349319458, "learning_rate": 0.0001874815116791736, "loss": 5.0649, "step": 489 }, { "epoch": 0.17641764176417643, "grad_norm": 1.7193348407745361, "learning_rate": 0.00018742564262709725, "loss": 4.667, "step": 490 }, { "epoch": 0.17677767776777678, "grad_norm": 1.4068859815597534, "learning_rate": 0.00018736965754556528, "loss": 4.5735, "step": 491 }, { "epoch": 0.17713771377137713, "grad_norm": 2.0566108226776123, "learning_rate": 0.00018731355650887985, "loss": 5.4348, "step": 492 }, { "epoch": 0.1774977497749775, "grad_norm": 1.428653359413147, "learning_rate": 0.00018725733959149712, "loss": 4.8304, "step": 493 }, { "epoch": 0.17785778577857786, "grad_norm": 2.3756580352783203, "learning_rate": 0.00018720100686802694, "loss": 5.3095, "step": 494 }, { "epoch": 0.1782178217821782, "grad_norm": 1.7651077508926392, "learning_rate": 0.00018714455841323287, "loss": 5.0266, "step": 495 }, { "epoch": 0.1785778577857786, "grad_norm": 2.093381643295288, "learning_rate": 0.00018708799430203218, "loss": 5.1646, "step": 496 }, { "epoch": 0.17893789378937894, "grad_norm": 1.4486280679702759, "learning_rate": 0.00018703131460949554, "loss": 4.9302, "step": 497 }, { "epoch": 0.1792979297929793, "grad_norm": 2.9753105640411377, "learning_rate": 0.000186974519410847, "loss": 5.7727, "step": 498 }, { "epoch": 0.17965796579657967, "grad_norm": 3.126038074493408, "learning_rate": 0.00018691760878146395, "loss": 5.5132, "step": 499 }, { "epoch": 0.18001800180018002, "grad_norm": 2.3378102779388428, "learning_rate": 0.00018686058279687698, "loss": 5.5611, "step": 500 }, { "epoch": 0.18037803780378037, "grad_norm": 3.1730830669403076, "learning_rate": 0.0001868034415327698, "loss": 5.0738, "step": 501 }, { "epoch": 0.18073807380738074, "grad_norm": 2.214451789855957, "learning_rate": 0.000186746185064979, "loss": 4.9771, "step": 502 }, { "epoch": 0.1810981098109811, "grad_norm": 2.729839324951172, "learning_rate": 0.00018668881346949417, "loss": 5.0892, "step": 503 }, { "epoch": 0.18145814581458145, "grad_norm": 1.8582335710525513, "learning_rate": 0.00018663132682245772, "loss": 4.8183, "step": 504 }, { "epoch": 0.18181818181818182, "grad_norm": 1.577508807182312, "learning_rate": 0.0001865737252001647, "loss": 4.7524, "step": 505 }, { "epoch": 0.18217821782178217, "grad_norm": 1.4923256635665894, "learning_rate": 0.00018651600867906272, "loss": 4.7477, "step": 506 }, { "epoch": 0.18253825382538255, "grad_norm": 1.3686710596084595, "learning_rate": 0.00018645817733575193, "loss": 4.8028, "step": 507 }, { "epoch": 0.1828982898289829, "grad_norm": 1.701985239982605, "learning_rate": 0.00018640023124698486, "loss": 4.9122, "step": 508 }, { "epoch": 0.18325832583258325, "grad_norm": 1.4300390481948853, "learning_rate": 0.00018634217048966637, "loss": 4.9376, "step": 509 }, { "epoch": 0.18361836183618363, "grad_norm": 2.832374095916748, "learning_rate": 0.0001862839951408534, "loss": 5.0504, "step": 510 }, { "epoch": 0.18397839783978398, "grad_norm": 1.3927937746047974, "learning_rate": 0.0001862257052777551, "loss": 4.7502, "step": 511 }, { "epoch": 0.18433843384338433, "grad_norm": 2.047490358352661, "learning_rate": 0.0001861673009777325, "loss": 4.6108, "step": 512 }, { "epoch": 0.1846984698469847, "grad_norm": 1.9925168752670288, "learning_rate": 0.00018610878231829854, "loss": 4.907, "step": 513 }, { "epoch": 0.18505850585058506, "grad_norm": 3.167588710784912, "learning_rate": 0.00018605014937711796, "loss": 5.2115, "step": 514 }, { "epoch": 0.1854185418541854, "grad_norm": 1.751932144165039, "learning_rate": 0.00018599140223200716, "loss": 5.0556, "step": 515 }, { "epoch": 0.1857785778577858, "grad_norm": 1.883099913597107, "learning_rate": 0.0001859325409609341, "loss": 5.3705, "step": 516 }, { "epoch": 0.18613861386138614, "grad_norm": 1.7942845821380615, "learning_rate": 0.00018587356564201817, "loss": 5.2473, "step": 517 }, { "epoch": 0.1864986498649865, "grad_norm": 2.2205777168273926, "learning_rate": 0.0001858144763535302, "loss": 4.8881, "step": 518 }, { "epoch": 0.18685868586858687, "grad_norm": 1.5663084983825684, "learning_rate": 0.0001857552731738922, "loss": 5.0613, "step": 519 }, { "epoch": 0.18721872187218722, "grad_norm": 1.7280100584030151, "learning_rate": 0.00018569595618167745, "loss": 4.5994, "step": 520 }, { "epoch": 0.18757875787578757, "grad_norm": 1.6075050830841064, "learning_rate": 0.00018563652545561013, "loss": 5.1569, "step": 521 }, { "epoch": 0.18793879387938794, "grad_norm": 1.9055190086364746, "learning_rate": 0.00018557698107456549, "loss": 5.1687, "step": 522 }, { "epoch": 0.1882988298829883, "grad_norm": 2.2929351329803467, "learning_rate": 0.00018551732311756952, "loss": 5.1761, "step": 523 }, { "epoch": 0.18865886588658867, "grad_norm": 2.500159502029419, "learning_rate": 0.000185457551663799, "loss": 5.6979, "step": 524 }, { "epoch": 0.18901890189018902, "grad_norm": 2.326322555541992, "learning_rate": 0.00018539766679258134, "loss": 5.1247, "step": 525 }, { "epoch": 0.18937893789378937, "grad_norm": 6.278741359710693, "learning_rate": 0.0001853376685833945, "loss": 4.9737, "step": 526 }, { "epoch": 0.18973897389738975, "grad_norm": 2.955718994140625, "learning_rate": 0.00018527755711586678, "loss": 5.1884, "step": 527 }, { "epoch": 0.1900990099009901, "grad_norm": 1.8568588495254517, "learning_rate": 0.0001852173324697769, "loss": 5.0796, "step": 528 }, { "epoch": 0.19045904590459045, "grad_norm": 1.538630723953247, "learning_rate": 0.00018515699472505364, "loss": 4.8754, "step": 529 }, { "epoch": 0.19081908190819083, "grad_norm": 1.7481509447097778, "learning_rate": 0.00018509654396177609, "loss": 5.1271, "step": 530 }, { "epoch": 0.19117911791179118, "grad_norm": 1.5548195838928223, "learning_rate": 0.00018503598026017312, "loss": 4.8736, "step": 531 }, { "epoch": 0.19153915391539153, "grad_norm": 1.5747162103652954, "learning_rate": 0.00018497530370062363, "loss": 4.922, "step": 532 }, { "epoch": 0.1918991899189919, "grad_norm": 1.2808769941329956, "learning_rate": 0.00018491451436365627, "loss": 4.6306, "step": 533 }, { "epoch": 0.19225922592259226, "grad_norm": 1.9267456531524658, "learning_rate": 0.00018485361232994932, "loss": 5.0321, "step": 534 }, { "epoch": 0.1926192619261926, "grad_norm": 1.491786241531372, "learning_rate": 0.0001847925976803307, "loss": 4.7556, "step": 535 }, { "epoch": 0.192979297929793, "grad_norm": 1.279565691947937, "learning_rate": 0.00018473147049577774, "loss": 4.8633, "step": 536 }, { "epoch": 0.19333933393339334, "grad_norm": 1.7372647523880005, "learning_rate": 0.00018467023085741717, "loss": 5.1584, "step": 537 }, { "epoch": 0.1936993699369937, "grad_norm": 1.3435121774673462, "learning_rate": 0.0001846088788465249, "loss": 4.7903, "step": 538 }, { "epoch": 0.19405940594059407, "grad_norm": 1.2509280443191528, "learning_rate": 0.00018454741454452603, "loss": 4.8515, "step": 539 }, { "epoch": 0.19441944194419442, "grad_norm": 1.8514949083328247, "learning_rate": 0.0001844858380329947, "loss": 4.8086, "step": 540 }, { "epoch": 0.19477947794779477, "grad_norm": 1.8898829221725464, "learning_rate": 0.00018442414939365387, "loss": 4.7823, "step": 541 }, { "epoch": 0.19513951395139514, "grad_norm": 1.3015843629837036, "learning_rate": 0.00018436234870837547, "loss": 4.6961, "step": 542 }, { "epoch": 0.1954995499549955, "grad_norm": 1.529765248298645, "learning_rate": 0.00018430043605918006, "loss": 4.7827, "step": 543 }, { "epoch": 0.19585958595859587, "grad_norm": 1.641842007637024, "learning_rate": 0.00018423841152823673, "loss": 5.2028, "step": 544 }, { "epoch": 0.19621962196219622, "grad_norm": 1.861567497253418, "learning_rate": 0.00018417627519786315, "loss": 5.013, "step": 545 }, { "epoch": 0.19657965796579657, "grad_norm": 1.7786455154418945, "learning_rate": 0.00018411402715052538, "loss": 5.1648, "step": 546 }, { "epoch": 0.19693969396939695, "grad_norm": 1.5268160104751587, "learning_rate": 0.00018405166746883762, "loss": 5.0003, "step": 547 }, { "epoch": 0.1972997299729973, "grad_norm": 1.5869791507720947, "learning_rate": 0.00018398919623556238, "loss": 5.0791, "step": 548 }, { "epoch": 0.19765976597659765, "grad_norm": 1.8240504264831543, "learning_rate": 0.00018392661353361015, "loss": 5.1986, "step": 549 }, { "epoch": 0.19801980198019803, "grad_norm": 1.9763150215148926, "learning_rate": 0.00018386391944603934, "loss": 5.4118, "step": 550 }, { "epoch": 0.19837983798379838, "grad_norm": 3.0309135913848877, "learning_rate": 0.0001838011140560562, "loss": 5.1273, "step": 551 }, { "epoch": 0.19873987398739873, "grad_norm": 2.879659414291382, "learning_rate": 0.00018373819744701476, "loss": 4.9732, "step": 552 }, { "epoch": 0.1990999099909991, "grad_norm": 2.4230196475982666, "learning_rate": 0.00018367516970241657, "loss": 4.8701, "step": 553 }, { "epoch": 0.19945994599459946, "grad_norm": 1.527785301208496, "learning_rate": 0.00018361203090591071, "loss": 4.9533, "step": 554 }, { "epoch": 0.1998199819981998, "grad_norm": 2.2586495876312256, "learning_rate": 0.00018354878114129367, "loss": 5.0682, "step": 555 }, { "epoch": 0.2001800180018002, "grad_norm": 1.911534309387207, "learning_rate": 0.00018348542049250916, "loss": 4.5963, "step": 556 }, { "epoch": 0.20054005400540054, "grad_norm": 1.68331778049469, "learning_rate": 0.00018342194904364813, "loss": 4.9191, "step": 557 }, { "epoch": 0.2009000900090009, "grad_norm": 1.540486216545105, "learning_rate": 0.00018335836687894853, "loss": 5.1383, "step": 558 }, { "epoch": 0.20126012601260126, "grad_norm": 2.277985095977783, "learning_rate": 0.00018329467408279522, "loss": 5.1415, "step": 559 }, { "epoch": 0.20162016201620162, "grad_norm": 1.8936793804168701, "learning_rate": 0.00018323087073971993, "loss": 4.5897, "step": 560 }, { "epoch": 0.201980198019802, "grad_norm": 1.8286054134368896, "learning_rate": 0.00018316695693440117, "loss": 4.6607, "step": 561 }, { "epoch": 0.20234023402340234, "grad_norm": 1.3202015161514282, "learning_rate": 0.00018310293275166392, "loss": 5.0079, "step": 562 }, { "epoch": 0.2027002700270027, "grad_norm": 1.8645577430725098, "learning_rate": 0.00018303879827647975, "loss": 4.7252, "step": 563 }, { "epoch": 0.20306030603060307, "grad_norm": 1.383417010307312, "learning_rate": 0.00018297455359396657, "loss": 5.0536, "step": 564 }, { "epoch": 0.20342034203420342, "grad_norm": 1.9294304847717285, "learning_rate": 0.0001829101987893885, "loss": 4.9231, "step": 565 }, { "epoch": 0.20378037803780377, "grad_norm": 1.5366313457489014, "learning_rate": 0.00018284573394815597, "loss": 4.7901, "step": 566 }, { "epoch": 0.20414041404140415, "grad_norm": 1.1666430234909058, "learning_rate": 0.00018278115915582526, "loss": 4.4916, "step": 567 }, { "epoch": 0.2045004500450045, "grad_norm": 1.4304466247558594, "learning_rate": 0.0001827164744980987, "loss": 4.8673, "step": 568 }, { "epoch": 0.20486048604860485, "grad_norm": 1.5400853157043457, "learning_rate": 0.00018265168006082437, "loss": 5.1139, "step": 569 }, { "epoch": 0.20522052205220523, "grad_norm": 1.3901218175888062, "learning_rate": 0.0001825867759299961, "loss": 5.3105, "step": 570 }, { "epoch": 0.20558055805580558, "grad_norm": 1.6825261116027832, "learning_rate": 0.00018252176219175328, "loss": 4.8088, "step": 571 }, { "epoch": 0.20594059405940593, "grad_norm": 1.423314094543457, "learning_rate": 0.00018245663893238075, "loss": 4.9159, "step": 572 }, { "epoch": 0.2063006300630063, "grad_norm": 2.1610755920410156, "learning_rate": 0.00018239140623830868, "loss": 5.0322, "step": 573 }, { "epoch": 0.20666066606660666, "grad_norm": 2.3829901218414307, "learning_rate": 0.00018232606419611255, "loss": 5.305, "step": 574 }, { "epoch": 0.207020702070207, "grad_norm": 2.3431599140167236, "learning_rate": 0.00018226061289251298, "loss": 5.3596, "step": 575 }, { "epoch": 0.20738073807380739, "grad_norm": 2.5606956481933594, "learning_rate": 0.00018219505241437545, "loss": 5.3337, "step": 576 }, { "epoch": 0.20774077407740774, "grad_norm": 1.3641765117645264, "learning_rate": 0.00018212938284871047, "loss": 4.5617, "step": 577 }, { "epoch": 0.20810081008100811, "grad_norm": 1.8042774200439453, "learning_rate": 0.00018206360428267332, "loss": 4.8936, "step": 578 }, { "epoch": 0.20846084608460846, "grad_norm": 1.7920327186584473, "learning_rate": 0.0001819977168035639, "loss": 5.0555, "step": 579 }, { "epoch": 0.20882088208820881, "grad_norm": 1.3233819007873535, "learning_rate": 0.0001819317204988267, "loss": 4.9664, "step": 580 }, { "epoch": 0.2091809180918092, "grad_norm": 1.3599638938903809, "learning_rate": 0.00018186561545605054, "loss": 5.1357, "step": 581 }, { "epoch": 0.20954095409540954, "grad_norm": 1.2919039726257324, "learning_rate": 0.0001817994017629687, "loss": 5.0696, "step": 582 }, { "epoch": 0.2099009900990099, "grad_norm": 1.4741486310958862, "learning_rate": 0.00018173307950745854, "loss": 4.8335, "step": 583 }, { "epoch": 0.21026102610261027, "grad_norm": 1.8232271671295166, "learning_rate": 0.0001816666487775416, "loss": 4.9422, "step": 584 }, { "epoch": 0.21062106210621062, "grad_norm": 1.6007291078567505, "learning_rate": 0.0001816001096613833, "loss": 4.8473, "step": 585 }, { "epoch": 0.21098109810981097, "grad_norm": 1.5359485149383545, "learning_rate": 0.00018153346224729293, "loss": 5.0639, "step": 586 }, { "epoch": 0.21134113411341135, "grad_norm": 1.622120976448059, "learning_rate": 0.00018146670662372354, "loss": 4.9315, "step": 587 }, { "epoch": 0.2117011701170117, "grad_norm": 2.3048367500305176, "learning_rate": 0.00018139984287927175, "loss": 4.8382, "step": 588 }, { "epoch": 0.21206120612061205, "grad_norm": 1.7604765892028809, "learning_rate": 0.00018133287110267776, "loss": 4.776, "step": 589 }, { "epoch": 0.21242124212421243, "grad_norm": 1.3824963569641113, "learning_rate": 0.00018126579138282503, "loss": 5.1347, "step": 590 }, { "epoch": 0.21278127812781278, "grad_norm": 2.024390935897827, "learning_rate": 0.00018119860380874037, "loss": 5.0018, "step": 591 }, { "epoch": 0.21314131413141313, "grad_norm": 1.3187147378921509, "learning_rate": 0.00018113130846959368, "loss": 5.1671, "step": 592 }, { "epoch": 0.2135013501350135, "grad_norm": 1.140762209892273, "learning_rate": 0.00018106390545469795, "loss": 4.675, "step": 593 }, { "epoch": 0.21386138613861386, "grad_norm": 2.119356870651245, "learning_rate": 0.00018099639485350897, "loss": 4.651, "step": 594 }, { "epoch": 0.21422142214221424, "grad_norm": 1.6680924892425537, "learning_rate": 0.0001809287767556254, "loss": 4.718, "step": 595 }, { "epoch": 0.21458145814581459, "grad_norm": 1.3462631702423096, "learning_rate": 0.00018086105125078857, "loss": 5.0576, "step": 596 }, { "epoch": 0.21494149414941494, "grad_norm": 1.4861905574798584, "learning_rate": 0.00018079321842888227, "loss": 4.9842, "step": 597 }, { "epoch": 0.2153015301530153, "grad_norm": 1.6816765069961548, "learning_rate": 0.00018072527837993284, "loss": 5.3918, "step": 598 }, { "epoch": 0.21566156615661566, "grad_norm": 2.004573345184326, "learning_rate": 0.00018065723119410884, "loss": 5.0587, "step": 599 }, { "epoch": 0.21602160216021601, "grad_norm": 3.218261480331421, "learning_rate": 0.00018058907696172108, "loss": 5.5396, "step": 600 }, { "epoch": 0.2163816381638164, "grad_norm": 1.584768533706665, "learning_rate": 0.00018052081577322234, "loss": 4.9708, "step": 601 }, { "epoch": 0.21674167416741674, "grad_norm": 1.6932294368743896, "learning_rate": 0.0001804524477192075, "loss": 4.8287, "step": 602 }, { "epoch": 0.2171017101710171, "grad_norm": 1.7169231176376343, "learning_rate": 0.00018038397289041314, "loss": 4.551, "step": 603 }, { "epoch": 0.21746174617461747, "grad_norm": 1.5970929861068726, "learning_rate": 0.0001803153913777176, "loss": 4.8561, "step": 604 }, { "epoch": 0.21782178217821782, "grad_norm": 1.2572895288467407, "learning_rate": 0.00018024670327214084, "loss": 4.9702, "step": 605 }, { "epoch": 0.21818181818181817, "grad_norm": 1.986138105392456, "learning_rate": 0.00018017790866484422, "loss": 5.0378, "step": 606 }, { "epoch": 0.21854185418541855, "grad_norm": 1.478252649307251, "learning_rate": 0.00018010900764713048, "loss": 4.7493, "step": 607 }, { "epoch": 0.2189018901890189, "grad_norm": 1.4298301935195923, "learning_rate": 0.0001800400003104436, "loss": 5.3147, "step": 608 }, { "epoch": 0.21926192619261925, "grad_norm": 1.337350845336914, "learning_rate": 0.00017997088674636872, "loss": 4.6894, "step": 609 }, { "epoch": 0.21962196219621963, "grad_norm": 1.3257286548614502, "learning_rate": 0.00017990166704663177, "loss": 5.0221, "step": 610 }, { "epoch": 0.21998199819981998, "grad_norm": 1.4358315467834473, "learning_rate": 0.00017983234130309968, "loss": 5.0326, "step": 611 }, { "epoch": 0.22034203420342033, "grad_norm": 1.6882402896881104, "learning_rate": 0.00017976290960778024, "loss": 4.895, "step": 612 }, { "epoch": 0.2207020702070207, "grad_norm": 1.3076093196868896, "learning_rate": 0.00017969337205282155, "loss": 4.8874, "step": 613 }, { "epoch": 0.22106210621062106, "grad_norm": 1.3435124158859253, "learning_rate": 0.00017962372873051252, "loss": 4.9925, "step": 614 }, { "epoch": 0.22142214221422143, "grad_norm": 2.2377662658691406, "learning_rate": 0.00017955397973328215, "loss": 5.0888, "step": 615 }, { "epoch": 0.22178217821782178, "grad_norm": 1.777191400527954, "learning_rate": 0.00017948412515369995, "loss": 4.8241, "step": 616 }, { "epoch": 0.22214221422142214, "grad_norm": 1.478715419769287, "learning_rate": 0.00017941416508447536, "loss": 4.507, "step": 617 }, { "epoch": 0.2225022502250225, "grad_norm": 1.3011283874511719, "learning_rate": 0.00017934409961845791, "loss": 4.7552, "step": 618 }, { "epoch": 0.22286228622862286, "grad_norm": 1.2801713943481445, "learning_rate": 0.00017927392884863703, "loss": 4.5152, "step": 619 }, { "epoch": 0.2232223222322232, "grad_norm": 1.8236119747161865, "learning_rate": 0.00017920365286814183, "loss": 5.012, "step": 620 }, { "epoch": 0.2235823582358236, "grad_norm": 1.5958244800567627, "learning_rate": 0.00017913327177024115, "loss": 5.091, "step": 621 }, { "epoch": 0.22394239423942394, "grad_norm": 1.5439236164093018, "learning_rate": 0.00017906278564834324, "loss": 4.8818, "step": 622 }, { "epoch": 0.2243024302430243, "grad_norm": 1.7584513425827026, "learning_rate": 0.0001789921945959958, "loss": 5.1626, "step": 623 }, { "epoch": 0.22466246624662467, "grad_norm": 1.7982007265090942, "learning_rate": 0.00017892149870688578, "loss": 5.0671, "step": 624 }, { "epoch": 0.22502250225022502, "grad_norm": 2.3741369247436523, "learning_rate": 0.00017885069807483926, "loss": 5.3132, "step": 625 }, { "epoch": 0.22538253825382537, "grad_norm": 2.7315993309020996, "learning_rate": 0.00017877979279382135, "loss": 5.0911, "step": 626 }, { "epoch": 0.22574257425742575, "grad_norm": 1.9711253643035889, "learning_rate": 0.00017870878295793598, "loss": 5.0541, "step": 627 }, { "epoch": 0.2261026102610261, "grad_norm": 2.0043630599975586, "learning_rate": 0.00017863766866142594, "loss": 5.3504, "step": 628 }, { "epoch": 0.22646264626462645, "grad_norm": 1.8370227813720703, "learning_rate": 0.00017856644999867264, "loss": 4.9745, "step": 629 }, { "epoch": 0.22682268226822683, "grad_norm": 1.3397353887557983, "learning_rate": 0.00017849512706419592, "loss": 5.0795, "step": 630 }, { "epoch": 0.22718271827182718, "grad_norm": 1.9813218116760254, "learning_rate": 0.0001784236999526541, "loss": 4.5778, "step": 631 }, { "epoch": 0.22754275427542756, "grad_norm": 1.3915972709655762, "learning_rate": 0.00017835216875884368, "loss": 5.3249, "step": 632 }, { "epoch": 0.2279027902790279, "grad_norm": 1.8962548971176147, "learning_rate": 0.0001782805335776994, "loss": 5.2326, "step": 633 }, { "epoch": 0.22826282628262826, "grad_norm": 1.5071747303009033, "learning_rate": 0.00017820879450429394, "loss": 4.7139, "step": 634 }, { "epoch": 0.22862286228622863, "grad_norm": 1.6194506883621216, "learning_rate": 0.0001781369516338378, "loss": 5.1211, "step": 635 }, { "epoch": 0.22898289828982898, "grad_norm": 1.541662335395813, "learning_rate": 0.0001780650050616794, "loss": 5.1637, "step": 636 }, { "epoch": 0.22934293429342933, "grad_norm": 1.373365044593811, "learning_rate": 0.00017799295488330467, "loss": 5.0808, "step": 637 }, { "epoch": 0.2297029702970297, "grad_norm": 1.1762840747833252, "learning_rate": 0.0001779208011943371, "loss": 4.7893, "step": 638 }, { "epoch": 0.23006300630063006, "grad_norm": 1.5785561800003052, "learning_rate": 0.00017784854409053747, "loss": 4.8314, "step": 639 }, { "epoch": 0.2304230423042304, "grad_norm": 2.131624221801758, "learning_rate": 0.00017777618366780393, "loss": 4.9747, "step": 640 }, { "epoch": 0.2307830783078308, "grad_norm": 1.3572007417678833, "learning_rate": 0.00017770372002217172, "loss": 4.7746, "step": 641 }, { "epoch": 0.23114311431143114, "grad_norm": 0.9324185252189636, "learning_rate": 0.00017763115324981294, "loss": 4.6222, "step": 642 }, { "epoch": 0.2315031503150315, "grad_norm": 1.7739285230636597, "learning_rate": 0.0001775584834470368, "loss": 4.5461, "step": 643 }, { "epoch": 0.23186318631863187, "grad_norm": 1.4608396291732788, "learning_rate": 0.000177485710710289, "loss": 4.9555, "step": 644 }, { "epoch": 0.23222322232223222, "grad_norm": 1.2847424745559692, "learning_rate": 0.00017741283513615205, "loss": 4.668, "step": 645 }, { "epoch": 0.23258325832583257, "grad_norm": 1.3863086700439453, "learning_rate": 0.00017733985682134482, "loss": 5.1688, "step": 646 }, { "epoch": 0.23294329432943295, "grad_norm": 1.4129760265350342, "learning_rate": 0.00017726677586272263, "loss": 5.1561, "step": 647 }, { "epoch": 0.2333033303330333, "grad_norm": 1.450738549232483, "learning_rate": 0.00017719359235727694, "loss": 5.4269, "step": 648 }, { "epoch": 0.23366336633663368, "grad_norm": 1.901845097541809, "learning_rate": 0.00017712030640213534, "loss": 5.0974, "step": 649 }, { "epoch": 0.23402340234023403, "grad_norm": 2.5835745334625244, "learning_rate": 0.00017704691809456143, "loss": 5.4495, "step": 650 }, { "epoch": 0.23438343834383438, "grad_norm": 2.3962457180023193, "learning_rate": 0.00017697342753195456, "loss": 4.7914, "step": 651 }, { "epoch": 0.23474347434743476, "grad_norm": 1.527147889137268, "learning_rate": 0.00017689983481184989, "loss": 4.9679, "step": 652 }, { "epoch": 0.2351035103510351, "grad_norm": 1.4330006837844849, "learning_rate": 0.00017682614003191807, "loss": 5.0339, "step": 653 }, { "epoch": 0.23546354635463546, "grad_norm": 1.0066897869110107, "learning_rate": 0.0001767523432899653, "loss": 4.8585, "step": 654 }, { "epoch": 0.23582358235823583, "grad_norm": 1.3479336500167847, "learning_rate": 0.00017667844468393295, "loss": 4.662, "step": 655 }, { "epoch": 0.23618361836183618, "grad_norm": 0.9287430644035339, "learning_rate": 0.0001766044443118978, "loss": 4.8556, "step": 656 }, { "epoch": 0.23654365436543653, "grad_norm": 1.4379856586456299, "learning_rate": 0.00017653034227207152, "loss": 5.1246, "step": 657 }, { "epoch": 0.2369036903690369, "grad_norm": 1.7954436540603638, "learning_rate": 0.00017645613866280077, "loss": 5.1441, "step": 658 }, { "epoch": 0.23726372637263726, "grad_norm": 1.1439120769500732, "learning_rate": 0.00017638183358256696, "loss": 4.7969, "step": 659 }, { "epoch": 0.2376237623762376, "grad_norm": 1.266098976135254, "learning_rate": 0.00017630742712998628, "loss": 4.9832, "step": 660 }, { "epoch": 0.237983798379838, "grad_norm": 1.0109524726867676, "learning_rate": 0.00017623291940380937, "loss": 4.731, "step": 661 }, { "epoch": 0.23834383438343834, "grad_norm": 1.1992263793945312, "learning_rate": 0.0001761583105029213, "loss": 4.6046, "step": 662 }, { "epoch": 0.2387038703870387, "grad_norm": 1.3985931873321533, "learning_rate": 0.00017608360052634138, "loss": 5.0826, "step": 663 }, { "epoch": 0.23906390639063907, "grad_norm": 1.4095903635025024, "learning_rate": 0.00017600878957322314, "loss": 4.6519, "step": 664 }, { "epoch": 0.23942394239423942, "grad_norm": 1.2141473293304443, "learning_rate": 0.00017593387774285412, "loss": 4.9543, "step": 665 }, { "epoch": 0.2397839783978398, "grad_norm": 1.5373363494873047, "learning_rate": 0.00017585886513465566, "loss": 4.8106, "step": 666 }, { "epoch": 0.24014401440144015, "grad_norm": 1.7050813436508179, "learning_rate": 0.0001757837518481829, "loss": 4.651, "step": 667 }, { "epoch": 0.2405040504050405, "grad_norm": 1.3369519710540771, "learning_rate": 0.0001757085379831246, "loss": 4.9524, "step": 668 }, { "epoch": 0.24086408640864088, "grad_norm": 1.4655152559280396, "learning_rate": 0.00017563322363930306, "loss": 4.8456, "step": 669 }, { "epoch": 0.24122412241224123, "grad_norm": 2.142806053161621, "learning_rate": 0.00017555780891667384, "loss": 5.0776, "step": 670 }, { "epoch": 0.24158415841584158, "grad_norm": 1.640889286994934, "learning_rate": 0.00017548229391532572, "loss": 4.7559, "step": 671 }, { "epoch": 0.24194419441944195, "grad_norm": 1.3450456857681274, "learning_rate": 0.00017540667873548063, "loss": 5.0178, "step": 672 }, { "epoch": 0.2423042304230423, "grad_norm": 1.837351679801941, "learning_rate": 0.00017533096347749344, "loss": 5.2248, "step": 673 }, { "epoch": 0.24266426642664266, "grad_norm": 1.7267247438430786, "learning_rate": 0.00017525514824185185, "loss": 5.3538, "step": 674 }, { "epoch": 0.24302430243024303, "grad_norm": 2.6373047828674316, "learning_rate": 0.0001751792331291762, "loss": 5.3673, "step": 675 }, { "epoch": 0.24338433843384338, "grad_norm": 2.385141134262085, "learning_rate": 0.00017510321824021943, "loss": 4.7227, "step": 676 }, { "epoch": 0.24374437443744373, "grad_norm": 2.1644632816314697, "learning_rate": 0.00017502710367586687, "loss": 5.0503, "step": 677 }, { "epoch": 0.2441044104410441, "grad_norm": 1.4525551795959473, "learning_rate": 0.0001749508895371362, "loss": 4.7614, "step": 678 }, { "epoch": 0.24446444644464446, "grad_norm": 1.5059491395950317, "learning_rate": 0.00017487457592517714, "loss": 4.8818, "step": 679 }, { "epoch": 0.2448244824482448, "grad_norm": 1.5318140983581543, "learning_rate": 0.00017479816294127152, "loss": 4.9906, "step": 680 }, { "epoch": 0.2451845184518452, "grad_norm": 1.6971796751022339, "learning_rate": 0.00017472165068683305, "loss": 4.9896, "step": 681 }, { "epoch": 0.24554455445544554, "grad_norm": 1.204179286956787, "learning_rate": 0.0001746450392634071, "loss": 4.6281, "step": 682 }, { "epoch": 0.2459045904590459, "grad_norm": 1.596869707107544, "learning_rate": 0.00017456832877267084, "loss": 4.9251, "step": 683 }, { "epoch": 0.24626462646264627, "grad_norm": 1.7464261054992676, "learning_rate": 0.00017449151931643272, "loss": 4.8314, "step": 684 }, { "epoch": 0.24662466246624662, "grad_norm": 1.2955352067947388, "learning_rate": 0.00017441461099663262, "loss": 4.6449, "step": 685 }, { "epoch": 0.246984698469847, "grad_norm": 1.4609013795852661, "learning_rate": 0.00017433760391534167, "loss": 5.248, "step": 686 }, { "epoch": 0.24734473447344735, "grad_norm": 1.3379247188568115, "learning_rate": 0.00017426049817476197, "loss": 4.6666, "step": 687 }, { "epoch": 0.2477047704770477, "grad_norm": 1.2864155769348145, "learning_rate": 0.00017418329387722668, "loss": 4.7158, "step": 688 }, { "epoch": 0.24806480648064808, "grad_norm": 1.673769474029541, "learning_rate": 0.0001741059911251997, "loss": 5.1281, "step": 689 }, { "epoch": 0.24842484248424843, "grad_norm": 1.1956952810287476, "learning_rate": 0.00017402859002127555, "loss": 4.7296, "step": 690 }, { "epoch": 0.24878487848784878, "grad_norm": 1.7122550010681152, "learning_rate": 0.0001739510906681794, "loss": 4.7988, "step": 691 }, { "epoch": 0.24914491449144915, "grad_norm": 1.0974791049957275, "learning_rate": 0.00017387349316876666, "loss": 4.6796, "step": 692 }, { "epoch": 0.2495049504950495, "grad_norm": 1.4044756889343262, "learning_rate": 0.00017379579762602317, "loss": 4.7857, "step": 693 }, { "epoch": 0.24986498649864985, "grad_norm": 1.515895128250122, "learning_rate": 0.00017371800414306478, "loss": 4.7323, "step": 694 }, { "epoch": 0.2502250225022502, "grad_norm": 1.1949536800384521, "learning_rate": 0.0001736401128231373, "loss": 4.9984, "step": 695 }, { "epoch": 0.2502250225022502, "eval_loss": 4.921685218811035, "eval_runtime": 101.2807, "eval_samples_per_second": 46.188, "eval_steps_per_second": 11.552, "step": 695 }, { "epoch": 0.2505850585058506, "grad_norm": 2.016028881072998, "learning_rate": 0.00017356212376961648, "loss": 4.8223, "step": 696 }, { "epoch": 0.25094509450945096, "grad_norm": 1.5955978631973267, "learning_rate": 0.00017348403708600772, "loss": 4.6324, "step": 697 }, { "epoch": 0.2513051305130513, "grad_norm": 1.7373415231704712, "learning_rate": 0.00017340585287594604, "loss": 5.3296, "step": 698 }, { "epoch": 0.25166516651665166, "grad_norm": 2.196326494216919, "learning_rate": 0.0001733275712431958, "loss": 5.2923, "step": 699 }, { "epoch": 0.252025202520252, "grad_norm": 3.086874008178711, "learning_rate": 0.00017324919229165075, "loss": 5.1968, "step": 700 }, { "epoch": 0.25238523852385236, "grad_norm": 2.4407310485839844, "learning_rate": 0.0001731707161253338, "loss": 4.9754, "step": 701 }, { "epoch": 0.25274527452745277, "grad_norm": 1.2670294046401978, "learning_rate": 0.00017309214284839678, "loss": 4.8269, "step": 702 }, { "epoch": 0.2531053105310531, "grad_norm": 1.6742947101593018, "learning_rate": 0.00017301347256512054, "loss": 5.2089, "step": 703 }, { "epoch": 0.25346534653465347, "grad_norm": 1.399596095085144, "learning_rate": 0.00017293470537991463, "loss": 5.1412, "step": 704 }, { "epoch": 0.2538253825382538, "grad_norm": 1.1809393167495728, "learning_rate": 0.0001728558413973171, "loss": 4.9513, "step": 705 }, { "epoch": 0.25418541854185417, "grad_norm": 0.9787651896476746, "learning_rate": 0.00017277688072199457, "loss": 4.53, "step": 706 }, { "epoch": 0.2545454545454545, "grad_norm": 1.3147783279418945, "learning_rate": 0.00017269782345874203, "loss": 4.7199, "step": 707 }, { "epoch": 0.2549054905490549, "grad_norm": 1.2243294715881348, "learning_rate": 0.00017261866971248258, "loss": 4.7546, "step": 708 }, { "epoch": 0.2552655265526553, "grad_norm": 1.3117494583129883, "learning_rate": 0.00017253941958826732, "loss": 4.8072, "step": 709 }, { "epoch": 0.2556255625562556, "grad_norm": 1.1374012231826782, "learning_rate": 0.00017246007319127545, "loss": 4.7096, "step": 710 }, { "epoch": 0.255985598559856, "grad_norm": 1.3385273218154907, "learning_rate": 0.00017238063062681374, "loss": 4.6932, "step": 711 }, { "epoch": 0.2563456345634563, "grad_norm": 1.1105351448059082, "learning_rate": 0.00017230109200031668, "loss": 4.9573, "step": 712 }, { "epoch": 0.25670567056705673, "grad_norm": 1.1182695627212524, "learning_rate": 0.00017222145741734626, "loss": 4.7149, "step": 713 }, { "epoch": 0.2570657065706571, "grad_norm": 1.5708491802215576, "learning_rate": 0.00017214172698359182, "loss": 4.8978, "step": 714 }, { "epoch": 0.25742574257425743, "grad_norm": 1.0699411630630493, "learning_rate": 0.00017206190080486987, "loss": 4.8549, "step": 715 }, { "epoch": 0.2577857785778578, "grad_norm": 1.2931327819824219, "learning_rate": 0.00017198197898712404, "loss": 4.5914, "step": 716 }, { "epoch": 0.25814581458145813, "grad_norm": 1.082270860671997, "learning_rate": 0.00017190196163642483, "loss": 4.6384, "step": 717 }, { "epoch": 0.2585058505850585, "grad_norm": 1.0534776449203491, "learning_rate": 0.00017182184885896964, "loss": 4.4978, "step": 718 }, { "epoch": 0.2588658865886589, "grad_norm": 1.1717240810394287, "learning_rate": 0.0001717416407610824, "loss": 4.8859, "step": 719 }, { "epoch": 0.25922592259225924, "grad_norm": 1.2578966617584229, "learning_rate": 0.00017166133744921357, "loss": 4.8346, "step": 720 }, { "epoch": 0.2595859585958596, "grad_norm": 1.708533763885498, "learning_rate": 0.00017158093902994005, "loss": 4.9671, "step": 721 }, { "epoch": 0.25994599459945994, "grad_norm": 1.2007981538772583, "learning_rate": 0.00017150044560996488, "loss": 4.7716, "step": 722 }, { "epoch": 0.2603060306030603, "grad_norm": 1.6198557615280151, "learning_rate": 0.00017141985729611725, "loss": 5.2346, "step": 723 }, { "epoch": 0.26066606660666064, "grad_norm": 1.9489868879318237, "learning_rate": 0.00017133917419535221, "loss": 5.2922, "step": 724 }, { "epoch": 0.26102610261026105, "grad_norm": 1.8972506523132324, "learning_rate": 0.00017125839641475072, "loss": 5.1553, "step": 725 }, { "epoch": 0.2613861386138614, "grad_norm": 4.419572353363037, "learning_rate": 0.00017117752406151926, "loss": 4.7237, "step": 726 }, { "epoch": 0.26174617461746175, "grad_norm": 1.8346415758132935, "learning_rate": 0.00017109655724298995, "loss": 5.3116, "step": 727 }, { "epoch": 0.2621062106210621, "grad_norm": 1.0029700994491577, "learning_rate": 0.00017101549606662024, "loss": 4.604, "step": 728 }, { "epoch": 0.26246624662466245, "grad_norm": 1.7620614767074585, "learning_rate": 0.00017093434063999278, "loss": 4.8614, "step": 729 }, { "epoch": 0.26282628262826285, "grad_norm": 1.2524189949035645, "learning_rate": 0.0001708530910708153, "loss": 5.058, "step": 730 }, { "epoch": 0.2631863186318632, "grad_norm": 1.5832024812698364, "learning_rate": 0.00017077174746692056, "loss": 4.8144, "step": 731 }, { "epoch": 0.26354635463546355, "grad_norm": 1.6468006372451782, "learning_rate": 0.00017069030993626603, "loss": 4.7682, "step": 732 }, { "epoch": 0.2639063906390639, "grad_norm": 1.5815296173095703, "learning_rate": 0.00017060877858693385, "loss": 4.626, "step": 733 }, { "epoch": 0.26426642664266425, "grad_norm": 1.1093206405639648, "learning_rate": 0.00017052715352713075, "loss": 4.6702, "step": 734 }, { "epoch": 0.2646264626462646, "grad_norm": 1.2010303735733032, "learning_rate": 0.00017044543486518772, "loss": 4.6888, "step": 735 }, { "epoch": 0.264986498649865, "grad_norm": 1.3065500259399414, "learning_rate": 0.00017036362270956009, "loss": 4.6952, "step": 736 }, { "epoch": 0.26534653465346536, "grad_norm": 1.1059433221817017, "learning_rate": 0.00017028171716882714, "loss": 4.4088, "step": 737 }, { "epoch": 0.2657065706570657, "grad_norm": 1.786853313446045, "learning_rate": 0.00017019971835169223, "loss": 5.2083, "step": 738 }, { "epoch": 0.26606660666066606, "grad_norm": 1.19621741771698, "learning_rate": 0.00017011762636698244, "loss": 4.6439, "step": 739 }, { "epoch": 0.2664266426642664, "grad_norm": 1.2290087938308716, "learning_rate": 0.00017003544132364846, "loss": 4.9241, "step": 740 }, { "epoch": 0.26678667866786676, "grad_norm": 1.106048822402954, "learning_rate": 0.00016995316333076458, "loss": 4.8128, "step": 741 }, { "epoch": 0.26714671467146717, "grad_norm": 1.1408966779708862, "learning_rate": 0.00016987079249752843, "loss": 4.8207, "step": 742 }, { "epoch": 0.2675067506750675, "grad_norm": 1.3608462810516357, "learning_rate": 0.00016978832893326074, "loss": 5.0293, "step": 743 }, { "epoch": 0.26786678667866787, "grad_norm": 1.0211683511734009, "learning_rate": 0.00016970577274740545, "loss": 4.5325, "step": 744 }, { "epoch": 0.2682268226822682, "grad_norm": 1.0703456401824951, "learning_rate": 0.0001696231240495294, "loss": 5.0604, "step": 745 }, { "epoch": 0.26858685868586857, "grad_norm": 1.1928843259811401, "learning_rate": 0.00016954038294932216, "loss": 4.884, "step": 746 }, { "epoch": 0.268946894689469, "grad_norm": 1.1553142070770264, "learning_rate": 0.00016945754955659595, "loss": 5.2998, "step": 747 }, { "epoch": 0.2693069306930693, "grad_norm": 1.1428641080856323, "learning_rate": 0.0001693746239812855, "loss": 5.1812, "step": 748 }, { "epoch": 0.2696669666966697, "grad_norm": 1.5842117071151733, "learning_rate": 0.0001692916063334479, "loss": 5.0772, "step": 749 }, { "epoch": 0.27002700270027, "grad_norm": 1.990872859954834, "learning_rate": 0.00016920849672326236, "loss": 5.282, "step": 750 }, { "epoch": 0.2703870387038704, "grad_norm": 1.2507834434509277, "learning_rate": 0.00016912529526103023, "loss": 4.6507, "step": 751 }, { "epoch": 0.2707470747074707, "grad_norm": 1.4511178731918335, "learning_rate": 0.0001690420020571747, "loss": 4.7657, "step": 752 }, { "epoch": 0.27110711071107113, "grad_norm": 1.1944736242294312, "learning_rate": 0.00016895861722224074, "loss": 5.1535, "step": 753 }, { "epoch": 0.2714671467146715, "grad_norm": 1.556862235069275, "learning_rate": 0.00016887514086689494, "loss": 4.8301, "step": 754 }, { "epoch": 0.27182718271827183, "grad_norm": 1.0710150003433228, "learning_rate": 0.00016879157310192535, "loss": 5.0124, "step": 755 }, { "epoch": 0.2721872187218722, "grad_norm": 1.3693212270736694, "learning_rate": 0.00016870791403824132, "loss": 5.1666, "step": 756 }, { "epoch": 0.27254725472547253, "grad_norm": 0.9505925178527832, "learning_rate": 0.0001686241637868734, "loss": 4.7864, "step": 757 }, { "epoch": 0.2729072907290729, "grad_norm": 1.162520170211792, "learning_rate": 0.00016854032245897308, "loss": 5.0125, "step": 758 }, { "epoch": 0.2732673267326733, "grad_norm": 1.1077611446380615, "learning_rate": 0.0001684563901658129, "loss": 5.0365, "step": 759 }, { "epoch": 0.27362736273627364, "grad_norm": 0.9608715176582336, "learning_rate": 0.0001683723670187859, "loss": 4.6992, "step": 760 }, { "epoch": 0.273987398739874, "grad_norm": 2.9034841060638428, "learning_rate": 0.00016828825312940592, "loss": 4.5776, "step": 761 }, { "epoch": 0.27434743474347434, "grad_norm": 1.195059061050415, "learning_rate": 0.0001682040486093071, "loss": 4.707, "step": 762 }, { "epoch": 0.2747074707470747, "grad_norm": 1.4137169122695923, "learning_rate": 0.00016811975357024382, "loss": 4.9037, "step": 763 }, { "epoch": 0.2750675067506751, "grad_norm": 1.4018524885177612, "learning_rate": 0.00016803536812409075, "loss": 5.0312, "step": 764 }, { "epoch": 0.27542754275427545, "grad_norm": 1.1424809694290161, "learning_rate": 0.00016795089238284242, "loss": 4.7786, "step": 765 }, { "epoch": 0.2757875787578758, "grad_norm": 1.3780423402786255, "learning_rate": 0.00016786632645861323, "loss": 5.0719, "step": 766 }, { "epoch": 0.27614761476147615, "grad_norm": 1.1080266237258911, "learning_rate": 0.00016778167046363734, "loss": 4.8753, "step": 767 }, { "epoch": 0.2765076507650765, "grad_norm": 1.6309232711791992, "learning_rate": 0.0001676969245102683, "loss": 4.6502, "step": 768 }, { "epoch": 0.27686768676867685, "grad_norm": 0.9395397305488586, "learning_rate": 0.0001676120887109792, "loss": 4.5501, "step": 769 }, { "epoch": 0.27722772277227725, "grad_norm": 0.9613596796989441, "learning_rate": 0.00016752716317836229, "loss": 4.6644, "step": 770 }, { "epoch": 0.2775877587758776, "grad_norm": 1.1425570249557495, "learning_rate": 0.00016744214802512893, "loss": 4.5632, "step": 771 }, { "epoch": 0.27794779477947795, "grad_norm": 1.5352091789245605, "learning_rate": 0.00016735704336410943, "loss": 4.9709, "step": 772 }, { "epoch": 0.2783078307830783, "grad_norm": 1.2851957082748413, "learning_rate": 0.00016727184930825288, "loss": 5.1564, "step": 773 }, { "epoch": 0.27866786678667865, "grad_norm": 1.5510812997817993, "learning_rate": 0.00016718656597062705, "loss": 5.3022, "step": 774 }, { "epoch": 0.279027902790279, "grad_norm": 3.7985401153564453, "learning_rate": 0.00016710119346441814, "loss": 5.1354, "step": 775 }, { "epoch": 0.2793879387938794, "grad_norm": 1.5523865222930908, "learning_rate": 0.00016701573190293077, "loss": 4.8188, "step": 776 }, { "epoch": 0.27974797479747976, "grad_norm": 0.9541698694229126, "learning_rate": 0.00016693018139958763, "loss": 4.5135, "step": 777 }, { "epoch": 0.2801080108010801, "grad_norm": 0.9594448208808899, "learning_rate": 0.0001668445420679296, "loss": 4.8982, "step": 778 }, { "epoch": 0.28046804680468046, "grad_norm": 1.0501006841659546, "learning_rate": 0.00016675881402161536, "loss": 4.9397, "step": 779 }, { "epoch": 0.2808280828082808, "grad_norm": 1.0564275979995728, "learning_rate": 0.0001666729973744214, "loss": 4.5046, "step": 780 }, { "epoch": 0.2811881188118812, "grad_norm": 1.144883394241333, "learning_rate": 0.00016658709224024162, "loss": 4.5505, "step": 781 }, { "epoch": 0.28154815481548157, "grad_norm": 1.21260666847229, "learning_rate": 0.00016650109873308765, "loss": 4.9434, "step": 782 }, { "epoch": 0.2819081908190819, "grad_norm": 1.0558801889419556, "learning_rate": 0.00016641501696708813, "loss": 4.5204, "step": 783 }, { "epoch": 0.28226822682268227, "grad_norm": 0.9824090003967285, "learning_rate": 0.00016632884705648898, "loss": 5.0061, "step": 784 }, { "epoch": 0.2826282628262826, "grad_norm": 1.2358840703964233, "learning_rate": 0.0001662425891156531, "loss": 5.1995, "step": 785 }, { "epoch": 0.28298829882988297, "grad_norm": 1.0543807744979858, "learning_rate": 0.0001661562432590602, "loss": 5.0176, "step": 786 }, { "epoch": 0.2833483348334834, "grad_norm": 1.2233498096466064, "learning_rate": 0.00016606980960130665, "loss": 4.9524, "step": 787 }, { "epoch": 0.2837083708370837, "grad_norm": 1.098244547843933, "learning_rate": 0.00016598328825710533, "loss": 4.6454, "step": 788 }, { "epoch": 0.2840684068406841, "grad_norm": 1.366742491722107, "learning_rate": 0.00016589667934128558, "loss": 4.9312, "step": 789 }, { "epoch": 0.2844284428442844, "grad_norm": 1.4314054250717163, "learning_rate": 0.00016580998296879292, "loss": 4.5107, "step": 790 }, { "epoch": 0.2847884788478848, "grad_norm": 1.1748394966125488, "learning_rate": 0.00016572319925468892, "loss": 4.8257, "step": 791 }, { "epoch": 0.2851485148514851, "grad_norm": 1.6585294008255005, "learning_rate": 0.00016563632831415102, "loss": 5.0808, "step": 792 }, { "epoch": 0.28550855085508553, "grad_norm": 1.0720629692077637, "learning_rate": 0.00016554937026247253, "loss": 4.8906, "step": 793 }, { "epoch": 0.2858685868586859, "grad_norm": 1.641931176185608, "learning_rate": 0.0001654623252150624, "loss": 5.2314, "step": 794 }, { "epoch": 0.28622862286228623, "grad_norm": 1.1715306043624878, "learning_rate": 0.00016537519328744486, "loss": 4.7414, "step": 795 }, { "epoch": 0.2865886588658866, "grad_norm": 1.3205702304840088, "learning_rate": 0.00016528797459525963, "loss": 5.0718, "step": 796 }, { "epoch": 0.28694869486948693, "grad_norm": 1.5365204811096191, "learning_rate": 0.00016520066925426144, "loss": 4.8465, "step": 797 }, { "epoch": 0.2873087308730873, "grad_norm": 1.1206798553466797, "learning_rate": 0.00016511327738032015, "loss": 4.8585, "step": 798 }, { "epoch": 0.2876687668766877, "grad_norm": 1.5003643035888672, "learning_rate": 0.00016502579908942035, "loss": 5.1255, "step": 799 }, { "epoch": 0.28802880288028804, "grad_norm": 1.6085450649261475, "learning_rate": 0.00016493823449766136, "loss": 5.6082, "step": 800 }, { "epoch": 0.2883888388838884, "grad_norm": 3.5468766689300537, "learning_rate": 0.00016485058372125712, "loss": 4.9319, "step": 801 }, { "epoch": 0.28874887488748874, "grad_norm": 1.450865626335144, "learning_rate": 0.0001647628468765358, "loss": 5.1702, "step": 802 }, { "epoch": 0.2891089108910891, "grad_norm": 1.1530933380126953, "learning_rate": 0.00016467502407993992, "loss": 4.7842, "step": 803 }, { "epoch": 0.2894689468946895, "grad_norm": 1.0461550951004028, "learning_rate": 0.00016458711544802603, "loss": 4.964, "step": 804 }, { "epoch": 0.28982898289828984, "grad_norm": 1.2056033611297607, "learning_rate": 0.00016449912109746457, "loss": 4.7158, "step": 805 }, { "epoch": 0.2901890189018902, "grad_norm": 1.1307734251022339, "learning_rate": 0.0001644110411450398, "loss": 4.891, "step": 806 }, { "epoch": 0.29054905490549054, "grad_norm": 0.9744582772254944, "learning_rate": 0.00016432287570764952, "loss": 4.8994, "step": 807 }, { "epoch": 0.2909090909090909, "grad_norm": 1.094619631767273, "learning_rate": 0.00016423462490230509, "loss": 4.7232, "step": 808 }, { "epoch": 0.29126912691269125, "grad_norm": 1.2090365886688232, "learning_rate": 0.00016414628884613107, "loss": 4.6844, "step": 809 }, { "epoch": 0.29162916291629165, "grad_norm": 1.1496669054031372, "learning_rate": 0.00016405786765636514, "loss": 4.6257, "step": 810 }, { "epoch": 0.291989198919892, "grad_norm": 1.021824836730957, "learning_rate": 0.00016396936145035812, "loss": 4.8279, "step": 811 }, { "epoch": 0.29234923492349235, "grad_norm": 0.9202598333358765, "learning_rate": 0.00016388077034557355, "loss": 4.6532, "step": 812 }, { "epoch": 0.2927092709270927, "grad_norm": 1.1226942539215088, "learning_rate": 0.00016379209445958762, "loss": 4.8354, "step": 813 }, { "epoch": 0.29306930693069305, "grad_norm": 1.0273377895355225, "learning_rate": 0.00016370333391008913, "loss": 4.3541, "step": 814 }, { "epoch": 0.2934293429342934, "grad_norm": 1.2255477905273438, "learning_rate": 0.00016361448881487914, "loss": 4.7901, "step": 815 }, { "epoch": 0.2937893789378938, "grad_norm": 0.9498298764228821, "learning_rate": 0.00016352555929187096, "loss": 4.6441, "step": 816 }, { "epoch": 0.29414941494149416, "grad_norm": 1.0662606954574585, "learning_rate": 0.00016343654545909007, "loss": 4.9122, "step": 817 }, { "epoch": 0.2945094509450945, "grad_norm": 1.0344738960266113, "learning_rate": 0.00016334744743467364, "loss": 4.6176, "step": 818 }, { "epoch": 0.29486948694869486, "grad_norm": 0.7476336359977722, "learning_rate": 0.00016325826533687072, "loss": 4.4825, "step": 819 }, { "epoch": 0.2952295229522952, "grad_norm": 1.0336343050003052, "learning_rate": 0.00016316899928404187, "loss": 5.072, "step": 820 }, { "epoch": 0.2955895589558956, "grad_norm": 1.3752336502075195, "learning_rate": 0.00016307964939465914, "loss": 4.9852, "step": 821 }, { "epoch": 0.29594959495949597, "grad_norm": 1.2596033811569214, "learning_rate": 0.00016299021578730579, "loss": 4.9698, "step": 822 }, { "epoch": 0.2963096309630963, "grad_norm": 1.857324481010437, "learning_rate": 0.0001629006985806761, "loss": 5.2779, "step": 823 }, { "epoch": 0.29666966696669667, "grad_norm": 1.6580755710601807, "learning_rate": 0.0001628110978935756, "loss": 5.1542, "step": 824 }, { "epoch": 0.297029702970297, "grad_norm": 2.069047212600708, "learning_rate": 0.00016272141384492025, "loss": 5.4598, "step": 825 }, { "epoch": 0.29738973897389737, "grad_norm": 1.1856191158294678, "learning_rate": 0.00016263164655373692, "loss": 4.5994, "step": 826 }, { "epoch": 0.29774977497749777, "grad_norm": 1.4068639278411865, "learning_rate": 0.00016254179613916278, "loss": 4.6774, "step": 827 }, { "epoch": 0.2981098109810981, "grad_norm": 1.251482367515564, "learning_rate": 0.00016245186272044544, "loss": 5.0974, "step": 828 }, { "epoch": 0.2984698469846985, "grad_norm": 1.6903501749038696, "learning_rate": 0.0001623618464169426, "loss": 4.8803, "step": 829 }, { "epoch": 0.2988298829882988, "grad_norm": 0.92371666431427, "learning_rate": 0.000162271747348122, "loss": 4.851, "step": 830 }, { "epoch": 0.2991899189918992, "grad_norm": 1.2435139417648315, "learning_rate": 0.0001621815656335612, "loss": 4.5808, "step": 831 }, { "epoch": 0.2995499549954995, "grad_norm": 1.1530983448028564, "learning_rate": 0.00016209130139294744, "loss": 4.5771, "step": 832 }, { "epoch": 0.29990999099909993, "grad_norm": 1.008490800857544, "learning_rate": 0.00016200095474607753, "loss": 5.1045, "step": 833 }, { "epoch": 0.3002700270027003, "grad_norm": 1.6871010065078735, "learning_rate": 0.0001619105258128576, "loss": 4.7034, "step": 834 }, { "epoch": 0.30063006300630063, "grad_norm": 1.0384951829910278, "learning_rate": 0.00016182001471330302, "loss": 4.8448, "step": 835 }, { "epoch": 0.300990099009901, "grad_norm": 1.4010770320892334, "learning_rate": 0.0001617294215675382, "loss": 4.8944, "step": 836 }, { "epoch": 0.30135013501350133, "grad_norm": 1.1556458473205566, "learning_rate": 0.00016163874649579647, "loss": 4.5784, "step": 837 }, { "epoch": 0.30171017101710174, "grad_norm": 0.8926945924758911, "learning_rate": 0.00016154798961841977, "loss": 4.5535, "step": 838 }, { "epoch": 0.3020702070207021, "grad_norm": 1.1621904373168945, "learning_rate": 0.0001614571510558588, "loss": 4.8086, "step": 839 }, { "epoch": 0.30243024302430244, "grad_norm": 1.0140894651412964, "learning_rate": 0.00016136623092867248, "loss": 4.6833, "step": 840 }, { "epoch": 0.3027902790279028, "grad_norm": 0.8983398079872131, "learning_rate": 0.00016127522935752814, "loss": 4.7961, "step": 841 }, { "epoch": 0.30315031503150314, "grad_norm": 0.8260481357574463, "learning_rate": 0.0001611841464632011, "loss": 4.5209, "step": 842 }, { "epoch": 0.3035103510351035, "grad_norm": 0.8811922073364258, "learning_rate": 0.0001610929823665747, "loss": 5.0509, "step": 843 }, { "epoch": 0.3038703870387039, "grad_norm": 1.1481789350509644, "learning_rate": 0.00016100173718863986, "loss": 4.8124, "step": 844 }, { "epoch": 0.30423042304230424, "grad_norm": 0.8910513520240784, "learning_rate": 0.0001609104110504954, "loss": 4.7773, "step": 845 }, { "epoch": 0.3045904590459046, "grad_norm": 0.9736277461051941, "learning_rate": 0.00016081900407334732, "loss": 4.6872, "step": 846 }, { "epoch": 0.30495049504950494, "grad_norm": 1.2259780168533325, "learning_rate": 0.00016072751637850904, "loss": 5.3895, "step": 847 }, { "epoch": 0.3053105310531053, "grad_norm": 1.333270788192749, "learning_rate": 0.00016063594808740113, "loss": 5.3089, "step": 848 }, { "epoch": 0.30567056705670564, "grad_norm": 1.8068937063217163, "learning_rate": 0.00016054429932155104, "loss": 5.1097, "step": 849 }, { "epoch": 0.30603060306030605, "grad_norm": 2.003225326538086, "learning_rate": 0.00016045257020259304, "loss": 5.2578, "step": 850 }, { "epoch": 0.3063906390639064, "grad_norm": 2.1421890258789062, "learning_rate": 0.00016036076085226814, "loss": 4.633, "step": 851 }, { "epoch": 0.30675067506750675, "grad_norm": 1.4106087684631348, "learning_rate": 0.00016026887139242372, "loss": 5.0115, "step": 852 }, { "epoch": 0.3071107110711071, "grad_norm": 1.3179970979690552, "learning_rate": 0.00016017690194501351, "loss": 5.0523, "step": 853 }, { "epoch": 0.30747074707470745, "grad_norm": 1.1221106052398682, "learning_rate": 0.00016008485263209742, "loss": 4.469, "step": 854 }, { "epoch": 0.30783078307830786, "grad_norm": 0.877373456954956, "learning_rate": 0.00015999272357584133, "loss": 4.6435, "step": 855 }, { "epoch": 0.3081908190819082, "grad_norm": 1.2408608198165894, "learning_rate": 0.000159900514898517, "loss": 4.8264, "step": 856 }, { "epoch": 0.30855085508550856, "grad_norm": 1.0822373628616333, "learning_rate": 0.0001598082267225018, "loss": 5.0694, "step": 857 }, { "epoch": 0.3089108910891089, "grad_norm": 1.031474232673645, "learning_rate": 0.00015971585917027862, "loss": 4.6595, "step": 858 }, { "epoch": 0.30927092709270926, "grad_norm": 1.0592454671859741, "learning_rate": 0.00015962341236443574, "loss": 4.5791, "step": 859 }, { "epoch": 0.3096309630963096, "grad_norm": 0.7305690050125122, "learning_rate": 0.0001595308864276666, "loss": 4.9633, "step": 860 }, { "epoch": 0.30999099909991, "grad_norm": 1.120477318763733, "learning_rate": 0.00015943828148276966, "loss": 4.7703, "step": 861 }, { "epoch": 0.31035103510351036, "grad_norm": 1.1676793098449707, "learning_rate": 0.0001593455976526482, "loss": 4.6116, "step": 862 }, { "epoch": 0.3107110711071107, "grad_norm": 1.009584665298462, "learning_rate": 0.0001592528350603103, "loss": 5.0741, "step": 863 }, { "epoch": 0.31107110711071106, "grad_norm": 1.012658953666687, "learning_rate": 0.0001591599938288684, "loss": 4.7777, "step": 864 }, { "epoch": 0.3114311431143114, "grad_norm": 0.9617002010345459, "learning_rate": 0.00015906707408153947, "loss": 4.5044, "step": 865 }, { "epoch": 0.31179117911791177, "grad_norm": 1.048292875289917, "learning_rate": 0.00015897407594164467, "loss": 4.495, "step": 866 }, { "epoch": 0.31215121512151217, "grad_norm": 0.9742674827575684, "learning_rate": 0.00015888099953260905, "loss": 4.7548, "step": 867 }, { "epoch": 0.3125112511251125, "grad_norm": 1.1329879760742188, "learning_rate": 0.00015878784497796176, "loss": 4.8652, "step": 868 }, { "epoch": 0.31287128712871287, "grad_norm": 0.9601360559463501, "learning_rate": 0.0001586946124013354, "loss": 4.6138, "step": 869 }, { "epoch": 0.3132313231323132, "grad_norm": 1.0140951871871948, "learning_rate": 0.00015860130192646646, "loss": 4.6407, "step": 870 }, { "epoch": 0.31359135913591357, "grad_norm": 1.9553427696228027, "learning_rate": 0.00015850791367719443, "loss": 5.0412, "step": 871 }, { "epoch": 0.313951395139514, "grad_norm": 0.8475087285041809, "learning_rate": 0.0001584144477774623, "loss": 4.6872, "step": 872 }, { "epoch": 0.31431143114311433, "grad_norm": 1.4645193815231323, "learning_rate": 0.00015832090435131604, "loss": 5.0024, "step": 873 }, { "epoch": 0.3146714671467147, "grad_norm": 1.4982762336730957, "learning_rate": 0.00015822728352290447, "loss": 5.1353, "step": 874 }, { "epoch": 0.31503150315031503, "grad_norm": 1.3741843700408936, "learning_rate": 0.00015813358541647915, "loss": 5.2968, "step": 875 }, { "epoch": 0.3153915391539154, "grad_norm": 3.021010398864746, "learning_rate": 0.0001580398101563943, "loss": 5.2408, "step": 876 }, { "epoch": 0.31575157515751573, "grad_norm": 1.1927891969680786, "learning_rate": 0.00015794595786710632, "loss": 5.0367, "step": 877 }, { "epoch": 0.31611161116111614, "grad_norm": 0.9293290376663208, "learning_rate": 0.00015785202867317407, "loss": 4.8202, "step": 878 }, { "epoch": 0.3164716471647165, "grad_norm": 0.9863614439964294, "learning_rate": 0.00015775802269925836, "loss": 4.5804, "step": 879 }, { "epoch": 0.31683168316831684, "grad_norm": 1.4362967014312744, "learning_rate": 0.0001576639400701219, "loss": 4.6306, "step": 880 }, { "epoch": 0.3171917191719172, "grad_norm": 1.408553123474121, "learning_rate": 0.0001575697809106292, "loss": 5.0018, "step": 881 }, { "epoch": 0.31755175517551754, "grad_norm": 0.8808412551879883, "learning_rate": 0.00015747554534574626, "loss": 4.8715, "step": 882 }, { "epoch": 0.3179117911791179, "grad_norm": 1.1640198230743408, "learning_rate": 0.0001573812335005405, "loss": 4.8324, "step": 883 }, { "epoch": 0.3182718271827183, "grad_norm": 1.0969043970108032, "learning_rate": 0.00015728684550018064, "loss": 4.739, "step": 884 }, { "epoch": 0.31863186318631864, "grad_norm": 1.165368676185608, "learning_rate": 0.00015719238146993646, "loss": 4.7556, "step": 885 }, { "epoch": 0.318991899189919, "grad_norm": 1.1786396503448486, "learning_rate": 0.00015709784153517851, "loss": 4.7313, "step": 886 }, { "epoch": 0.31935193519351934, "grad_norm": 1.2101906538009644, "learning_rate": 0.00015700322582137827, "loss": 4.8052, "step": 887 }, { "epoch": 0.3197119711971197, "grad_norm": 1.2660850286483765, "learning_rate": 0.0001569085344541077, "loss": 5.0038, "step": 888 }, { "epoch": 0.3200720072007201, "grad_norm": 1.249326229095459, "learning_rate": 0.00015681376755903912, "loss": 5.0246, "step": 889 }, { "epoch": 0.32043204320432045, "grad_norm": 1.0174130201339722, "learning_rate": 0.00015671892526194516, "loss": 4.93, "step": 890 }, { "epoch": 0.3207920792079208, "grad_norm": 0.8214113712310791, "learning_rate": 0.00015662400768869854, "loss": 4.8746, "step": 891 }, { "epoch": 0.32115211521152115, "grad_norm": 0.9143441319465637, "learning_rate": 0.0001565290149652718, "loss": 4.8004, "step": 892 }, { "epoch": 0.3215121512151215, "grad_norm": 1.020521879196167, "learning_rate": 0.0001564339472177373, "loss": 4.832, "step": 893 }, { "epoch": 0.32187218721872185, "grad_norm": 0.85971999168396, "learning_rate": 0.00015633880457226692, "loss": 4.8473, "step": 894 }, { "epoch": 0.32223222322232226, "grad_norm": 1.136918306350708, "learning_rate": 0.00015624358715513192, "loss": 4.6338, "step": 895 }, { "epoch": 0.3225922592259226, "grad_norm": 1.1943031549453735, "learning_rate": 0.0001561482950927029, "loss": 5.1376, "step": 896 }, { "epoch": 0.32295229522952296, "grad_norm": 1.0146737098693848, "learning_rate": 0.00015605292851144942, "loss": 4.9343, "step": 897 }, { "epoch": 0.3233123312331233, "grad_norm": 1.2824499607086182, "learning_rate": 0.00015595748753793998, "loss": 4.9354, "step": 898 }, { "epoch": 0.32367236723672366, "grad_norm": 1.8544282913208008, "learning_rate": 0.00015586197229884184, "loss": 5.2082, "step": 899 }, { "epoch": 0.324032403240324, "grad_norm": 2.0411593914031982, "learning_rate": 0.00015576638292092077, "loss": 5.3725, "step": 900 }, { "epoch": 0.3243924392439244, "grad_norm": 1.9044406414031982, "learning_rate": 0.00015567071953104096, "loss": 5.0433, "step": 901 }, { "epoch": 0.32475247524752476, "grad_norm": 1.3686928749084473, "learning_rate": 0.00015557498225616487, "loss": 5.0233, "step": 902 }, { "epoch": 0.3251125112511251, "grad_norm": 1.0368106365203857, "learning_rate": 0.0001554791712233529, "loss": 4.6988, "step": 903 }, { "epoch": 0.32547254725472546, "grad_norm": 0.9846864938735962, "learning_rate": 0.00015538328655976353, "loss": 4.6639, "step": 904 }, { "epoch": 0.3258325832583258, "grad_norm": 1.0573269128799438, "learning_rate": 0.00015528732839265272, "loss": 4.8586, "step": 905 }, { "epoch": 0.3261926192619262, "grad_norm": 0.842383623123169, "learning_rate": 0.0001551912968493742, "loss": 4.8161, "step": 906 }, { "epoch": 0.32655265526552657, "grad_norm": 1.0880528688430786, "learning_rate": 0.00015509519205737896, "loss": 4.9867, "step": 907 }, { "epoch": 0.3269126912691269, "grad_norm": 1.0716383457183838, "learning_rate": 0.0001549990141442153, "loss": 5.1915, "step": 908 }, { "epoch": 0.32727272727272727, "grad_norm": 1.0352778434753418, "learning_rate": 0.00015490276323752838, "loss": 4.745, "step": 909 }, { "epoch": 0.3276327632763276, "grad_norm": 1.0129772424697876, "learning_rate": 0.00015480643946506043, "loss": 4.4961, "step": 910 }, { "epoch": 0.32799279927992797, "grad_norm": 1.0241246223449707, "learning_rate": 0.00015471004295465035, "loss": 4.7148, "step": 911 }, { "epoch": 0.3283528352835284, "grad_norm": 1.0528348684310913, "learning_rate": 0.0001546135738342335, "loss": 4.8904, "step": 912 }, { "epoch": 0.3287128712871287, "grad_norm": 0.90036940574646, "learning_rate": 0.00015451703223184166, "loss": 4.6939, "step": 913 }, { "epoch": 0.3290729072907291, "grad_norm": 1.3356916904449463, "learning_rate": 0.00015442041827560274, "loss": 4.7737, "step": 914 }, { "epoch": 0.32943294329432943, "grad_norm": 1.0879907608032227, "learning_rate": 0.0001543237320937408, "loss": 4.7154, "step": 915 }, { "epoch": 0.3297929792979298, "grad_norm": 1.069511890411377, "learning_rate": 0.00015422697381457567, "loss": 4.7343, "step": 916 }, { "epoch": 0.33015301530153013, "grad_norm": 1.1529669761657715, "learning_rate": 0.00015413014356652286, "loss": 4.976, "step": 917 }, { "epoch": 0.33051305130513053, "grad_norm": 1.2753455638885498, "learning_rate": 0.00015403324147809344, "loss": 4.8575, "step": 918 }, { "epoch": 0.3308730873087309, "grad_norm": 0.9767019748687744, "learning_rate": 0.0001539362676778938, "loss": 4.8543, "step": 919 }, { "epoch": 0.33123312331233123, "grad_norm": 0.9553641080856323, "learning_rate": 0.00015383922229462549, "loss": 4.977, "step": 920 }, { "epoch": 0.3315931593159316, "grad_norm": 0.8495627641677856, "learning_rate": 0.0001537421054570851, "loss": 4.7638, "step": 921 }, { "epoch": 0.33195319531953194, "grad_norm": 0.8193251490592957, "learning_rate": 0.000153644917294164, "loss": 4.7902, "step": 922 }, { "epoch": 0.33231323132313234, "grad_norm": 1.0532013177871704, "learning_rate": 0.00015354765793484834, "loss": 5.1835, "step": 923 }, { "epoch": 0.3326732673267327, "grad_norm": 1.482528805732727, "learning_rate": 0.00015345032750821856, "loss": 5.5671, "step": 924 }, { "epoch": 0.33303330333033304, "grad_norm": 2.2630598545074463, "learning_rate": 0.00015335292614344963, "loss": 5.4195, "step": 925 }, { "epoch": 0.3333933393339334, "grad_norm": 1.2598108053207397, "learning_rate": 0.0001532554539698105, "loss": 4.7357, "step": 926 }, { "epoch": 0.33375337533753374, "grad_norm": 1.5963959693908691, "learning_rate": 0.00015315791111666425, "loss": 4.826, "step": 927 }, { "epoch": 0.3341134113411341, "grad_norm": 1.3899768590927124, "learning_rate": 0.0001530602977134676, "loss": 4.6758, "step": 928 }, { "epoch": 0.3344734473447345, "grad_norm": 1.2996914386749268, "learning_rate": 0.00015296261388977108, "loss": 5.0804, "step": 929 }, { "epoch": 0.33483348334833485, "grad_norm": 0.9530441164970398, "learning_rate": 0.00015286485977521845, "loss": 4.7637, "step": 930 }, { "epoch": 0.3351935193519352, "grad_norm": 0.8917075395584106, "learning_rate": 0.000152767035499547, "loss": 4.7331, "step": 931 }, { "epoch": 0.33555355535553555, "grad_norm": 1.269867181777954, "learning_rate": 0.000152669141192587, "loss": 5.0558, "step": 932 }, { "epoch": 0.3359135913591359, "grad_norm": 1.514622449874878, "learning_rate": 0.00015257117698426172, "loss": 4.9906, "step": 933 }, { "epoch": 0.33627362736273625, "grad_norm": 1.0862271785736084, "learning_rate": 0.00015247314300458712, "loss": 4.7081, "step": 934 }, { "epoch": 0.33663366336633666, "grad_norm": 0.8043900728225708, "learning_rate": 0.00015237503938367186, "loss": 4.9713, "step": 935 }, { "epoch": 0.336993699369937, "grad_norm": 0.8863739371299744, "learning_rate": 0.00015227686625171697, "loss": 5.0078, "step": 936 }, { "epoch": 0.33735373537353736, "grad_norm": 0.8409878611564636, "learning_rate": 0.00015217862373901575, "loss": 4.7633, "step": 937 }, { "epoch": 0.3377137713771377, "grad_norm": 0.9702298045158386, "learning_rate": 0.00015208031197595356, "loss": 4.404, "step": 938 }, { "epoch": 0.33807380738073806, "grad_norm": 0.8834205269813538, "learning_rate": 0.0001519819310930077, "loss": 4.7584, "step": 939 }, { "epoch": 0.3384338433843384, "grad_norm": 1.1648602485656738, "learning_rate": 0.00015188348122074715, "loss": 4.8814, "step": 940 }, { "epoch": 0.3387938793879388, "grad_norm": 0.9649667143821716, "learning_rate": 0.00015178496248983254, "loss": 4.6399, "step": 941 }, { "epoch": 0.33915391539153916, "grad_norm": 0.9292764067649841, "learning_rate": 0.00015168637503101584, "loss": 4.7679, "step": 942 }, { "epoch": 0.3395139513951395, "grad_norm": 0.8649988174438477, "learning_rate": 0.0001515877189751402, "loss": 4.7873, "step": 943 }, { "epoch": 0.33987398739873986, "grad_norm": 0.9205020666122437, "learning_rate": 0.00015148899445313981, "loss": 4.8269, "step": 944 }, { "epoch": 0.3402340234023402, "grad_norm": 1.2201025485992432, "learning_rate": 0.00015139020159603983, "loss": 4.7058, "step": 945 }, { "epoch": 0.3405940594059406, "grad_norm": 1.0716661214828491, "learning_rate": 0.00015129134053495604, "loss": 4.7139, "step": 946 }, { "epoch": 0.34095409540954097, "grad_norm": 0.9085667729377747, "learning_rate": 0.00015119241140109467, "loss": 4.5195, "step": 947 }, { "epoch": 0.3413141314131413, "grad_norm": 0.9523339867591858, "learning_rate": 0.0001510934143257524, "loss": 5.1915, "step": 948 }, { "epoch": 0.34167416741674167, "grad_norm": 1.3508998155593872, "learning_rate": 0.00015099434944031606, "loss": 5.2323, "step": 949 }, { "epoch": 0.342034203420342, "grad_norm": 1.9917466640472412, "learning_rate": 0.00015089521687626243, "loss": 5.4336, "step": 950 }, { "epoch": 0.34239423942394237, "grad_norm": 1.3084412813186646, "learning_rate": 0.0001507960167651582, "loss": 4.7862, "step": 951 }, { "epoch": 0.3427542754275428, "grad_norm": 1.4469012022018433, "learning_rate": 0.0001506967492386596, "loss": 4.9395, "step": 952 }, { "epoch": 0.3431143114311431, "grad_norm": 0.9551678895950317, "learning_rate": 0.0001505974144285124, "loss": 5.1265, "step": 953 }, { "epoch": 0.3434743474347435, "grad_norm": 1.559654951095581, "learning_rate": 0.00015049801246655163, "loss": 4.7992, "step": 954 }, { "epoch": 0.3438343834383438, "grad_norm": 1.190437912940979, "learning_rate": 0.0001503985434847015, "loss": 5.1382, "step": 955 }, { "epoch": 0.3441944194419442, "grad_norm": 0.8244277238845825, "learning_rate": 0.00015029900761497506, "loss": 4.5185, "step": 956 }, { "epoch": 0.3445544554455445, "grad_norm": 0.788070797920227, "learning_rate": 0.00015019940498947428, "loss": 4.7384, "step": 957 }, { "epoch": 0.34491449144914493, "grad_norm": 0.7468547224998474, "learning_rate": 0.00015009973574038962, "loss": 4.8971, "step": 958 }, { "epoch": 0.3452745274527453, "grad_norm": 0.8603270649909973, "learning_rate": 0.00015000000000000001, "loss": 4.3912, "step": 959 }, { "epoch": 0.34563456345634563, "grad_norm": 0.9192284345626831, "learning_rate": 0.00014990019790067256, "loss": 4.482, "step": 960 }, { "epoch": 0.345994599459946, "grad_norm": 0.8669421672821045, "learning_rate": 0.00014980032957486255, "loss": 4.5312, "step": 961 }, { "epoch": 0.34635463546354633, "grad_norm": 0.9489680528640747, "learning_rate": 0.00014970039515511304, "loss": 4.6693, "step": 962 }, { "epoch": 0.34671467146714674, "grad_norm": 1.0888569355010986, "learning_rate": 0.00014960039477405487, "loss": 4.869, "step": 963 }, { "epoch": 0.3470747074707471, "grad_norm": 1.0825499296188354, "learning_rate": 0.0001495003285644065, "loss": 4.5917, "step": 964 }, { "epoch": 0.34743474347434744, "grad_norm": 1.1033159494400024, "learning_rate": 0.0001494001966589736, "loss": 5.0973, "step": 965 }, { "epoch": 0.3477947794779478, "grad_norm": 0.7403422594070435, "learning_rate": 0.00014929999919064917, "loss": 4.7091, "step": 966 }, { "epoch": 0.34815481548154814, "grad_norm": 0.9281265735626221, "learning_rate": 0.00014919973629241314, "loss": 4.5066, "step": 967 }, { "epoch": 0.3485148514851485, "grad_norm": 1.0097898244857788, "learning_rate": 0.00014909940809733222, "loss": 4.8756, "step": 968 }, { "epoch": 0.3488748874887489, "grad_norm": 1.0524588823318481, "learning_rate": 0.00014899901473855998, "loss": 4.715, "step": 969 }, { "epoch": 0.34923492349234925, "grad_norm": 1.0118778944015503, "learning_rate": 0.00014889855634933627, "loss": 4.843, "step": 970 }, { "epoch": 0.3495949594959496, "grad_norm": 1.1899731159210205, "learning_rate": 0.00014879803306298736, "loss": 4.8947, "step": 971 }, { "epoch": 0.34995499549954995, "grad_norm": 1.4655492305755615, "learning_rate": 0.00014869744501292561, "loss": 5.0872, "step": 972 }, { "epoch": 0.3503150315031503, "grad_norm": 1.59488844871521, "learning_rate": 0.0001485967923326494, "loss": 5.13, "step": 973 }, { "epoch": 0.35067506750675065, "grad_norm": 2.68699049949646, "learning_rate": 0.00014849607515574276, "loss": 5.0265, "step": 974 }, { "epoch": 0.35103510351035105, "grad_norm": 1.9067751169204712, "learning_rate": 0.00014839529361587538, "loss": 5.4959, "step": 975 }, { "epoch": 0.3513951395139514, "grad_norm": 1.8809281587600708, "learning_rate": 0.00014829444784680244, "loss": 5.3514, "step": 976 }, { "epoch": 0.35175517551755175, "grad_norm": 1.4820325374603271, "learning_rate": 0.00014819353798236427, "loss": 4.6617, "step": 977 }, { "epoch": 0.3521152115211521, "grad_norm": 1.1088730096817017, "learning_rate": 0.00014809256415648626, "loss": 4.8845, "step": 978 }, { "epoch": 0.35247524752475246, "grad_norm": 1.4352535009384155, "learning_rate": 0.00014799152650317877, "loss": 4.7856, "step": 979 }, { "epoch": 0.35283528352835286, "grad_norm": 0.9059150218963623, "learning_rate": 0.00014789042515653687, "loss": 4.8338, "step": 980 }, { "epoch": 0.3531953195319532, "grad_norm": 1.1553621292114258, "learning_rate": 0.00014778926025074002, "loss": 4.8173, "step": 981 }, { "epoch": 0.35355535553555356, "grad_norm": 1.1156034469604492, "learning_rate": 0.00014768803192005223, "loss": 4.6934, "step": 982 }, { "epoch": 0.3539153915391539, "grad_norm": 0.906928300857544, "learning_rate": 0.00014758674029882152, "loss": 4.7319, "step": 983 }, { "epoch": 0.35427542754275426, "grad_norm": 1.2109665870666504, "learning_rate": 0.00014748538552148002, "loss": 4.7891, "step": 984 }, { "epoch": 0.3546354635463546, "grad_norm": 1.0026674270629883, "learning_rate": 0.0001473839677225436, "loss": 4.6982, "step": 985 }, { "epoch": 0.354995499549955, "grad_norm": 0.9090620279312134, "learning_rate": 0.00014728248703661182, "loss": 4.5659, "step": 986 }, { "epoch": 0.35535553555355537, "grad_norm": 1.0185548067092896, "learning_rate": 0.00014718094359836772, "loss": 4.7228, "step": 987 }, { "epoch": 0.3557155715571557, "grad_norm": 0.7778179049491882, "learning_rate": 0.00014707933754257754, "loss": 4.4779, "step": 988 }, { "epoch": 0.35607560756075607, "grad_norm": 1.0444481372833252, "learning_rate": 0.00014697766900409074, "loss": 4.9622, "step": 989 }, { "epoch": 0.3564356435643564, "grad_norm": 1.5330283641815186, "learning_rate": 0.00014687593811783963, "loss": 4.8482, "step": 990 }, { "epoch": 0.35679567956795677, "grad_norm": 0.9238508343696594, "learning_rate": 0.00014677414501883926, "loss": 4.9441, "step": 991 }, { "epoch": 0.3571557155715572, "grad_norm": 1.4587763547897339, "learning_rate": 0.0001466722898421873, "loss": 4.8419, "step": 992 }, { "epoch": 0.3575157515751575, "grad_norm": 1.1518075466156006, "learning_rate": 0.00014657037272306368, "loss": 4.6916, "step": 993 }, { "epoch": 0.3578757875787579, "grad_norm": 0.799138069152832, "learning_rate": 0.00014646839379673076, "loss": 4.3815, "step": 994 }, { "epoch": 0.3582358235823582, "grad_norm": 1.213332176208496, "learning_rate": 0.00014636635319853275, "loss": 5.1047, "step": 995 }, { "epoch": 0.3585958595859586, "grad_norm": 1.3635562658309937, "learning_rate": 0.00014626425106389573, "loss": 5.2238, "step": 996 }, { "epoch": 0.358955895589559, "grad_norm": 0.9962284564971924, "learning_rate": 0.00014616208752832758, "loss": 5.0498, "step": 997 }, { "epoch": 0.35931593159315933, "grad_norm": 1.2514888048171997, "learning_rate": 0.00014605986272741748, "loss": 4.936, "step": 998 }, { "epoch": 0.3596759675967597, "grad_norm": 1.1103378534317017, "learning_rate": 0.00014595757679683607, "loss": 5.0447, "step": 999 }, { "epoch": 0.36003600360036003, "grad_norm": 2.1691062450408936, "learning_rate": 0.00014585522987233503, "loss": 5.491, "step": 1000 }, { "epoch": 0.3603960396039604, "grad_norm": 1.8741892576217651, "learning_rate": 0.00014575282208974702, "loss": 5.2204, "step": 1001 }, { "epoch": 0.36075607560756073, "grad_norm": 1.0809324979782104, "learning_rate": 0.0001456503535849855, "loss": 4.9838, "step": 1002 }, { "epoch": 0.36111611161116114, "grad_norm": 1.172060489654541, "learning_rate": 0.00014554782449404448, "loss": 4.7522, "step": 1003 }, { "epoch": 0.3614761476147615, "grad_norm": 0.8441461324691772, "learning_rate": 0.00014544523495299842, "loss": 4.9235, "step": 1004 }, { "epoch": 0.36183618361836184, "grad_norm": 1.1785805225372314, "learning_rate": 0.00014534258509800197, "loss": 4.7191, "step": 1005 }, { "epoch": 0.3621962196219622, "grad_norm": 2.189944267272949, "learning_rate": 0.00014523987506528978, "loss": 4.9356, "step": 1006 }, { "epoch": 0.36255625562556254, "grad_norm": 0.9100619554519653, "learning_rate": 0.00014513710499117647, "loss": 4.6031, "step": 1007 }, { "epoch": 0.3629162916291629, "grad_norm": 1.0797786712646484, "learning_rate": 0.0001450342750120563, "loss": 4.6115, "step": 1008 }, { "epoch": 0.3632763276327633, "grad_norm": 0.8818701505661011, "learning_rate": 0.00014493138526440303, "loss": 4.711, "step": 1009 }, { "epoch": 0.36363636363636365, "grad_norm": 1.0068962574005127, "learning_rate": 0.00014482843588476974, "loss": 4.9463, "step": 1010 }, { "epoch": 0.363996399639964, "grad_norm": 1.7812259197235107, "learning_rate": 0.00014472542700978867, "loss": 4.7244, "step": 1011 }, { "epoch": 0.36435643564356435, "grad_norm": 0.9703940153121948, "learning_rate": 0.00014462235877617098, "loss": 4.7569, "step": 1012 }, { "epoch": 0.3647164716471647, "grad_norm": 0.9530327320098877, "learning_rate": 0.0001445192313207067, "loss": 5.1042, "step": 1013 }, { "epoch": 0.3650765076507651, "grad_norm": 0.8761149048805237, "learning_rate": 0.00014441604478026437, "loss": 4.8209, "step": 1014 }, { "epoch": 0.36543654365436545, "grad_norm": 1.0026590824127197, "learning_rate": 0.00014431279929179097, "loss": 4.8765, "step": 1015 }, { "epoch": 0.3657965796579658, "grad_norm": 0.7755431532859802, "learning_rate": 0.00014420949499231172, "loss": 4.5195, "step": 1016 }, { "epoch": 0.36615661566156615, "grad_norm": 0.9436198472976685, "learning_rate": 0.00014410613201892985, "loss": 4.6982, "step": 1017 }, { "epoch": 0.3665166516651665, "grad_norm": 0.8173322081565857, "learning_rate": 0.00014400271050882653, "loss": 5.1097, "step": 1018 }, { "epoch": 0.36687668766876685, "grad_norm": 0.9280872344970703, "learning_rate": 0.00014389923059926062, "loss": 4.757, "step": 1019 }, { "epoch": 0.36723672367236726, "grad_norm": 0.9003520011901855, "learning_rate": 0.00014379569242756846, "loss": 4.9784, "step": 1020 }, { "epoch": 0.3675967596759676, "grad_norm": 1.3225197792053223, "learning_rate": 0.0001436920961311637, "loss": 5.1636, "step": 1021 }, { "epoch": 0.36795679567956796, "grad_norm": 0.940623939037323, "learning_rate": 0.00014358844184753712, "loss": 5.1865, "step": 1022 }, { "epoch": 0.3683168316831683, "grad_norm": 1.1208678483963013, "learning_rate": 0.0001434847297142565, "loss": 5.2052, "step": 1023 }, { "epoch": 0.36867686768676866, "grad_norm": 1.6092015504837036, "learning_rate": 0.00014338095986896637, "loss": 5.2666, "step": 1024 }, { "epoch": 0.369036903690369, "grad_norm": 1.844506025314331, "learning_rate": 0.0001432771324493879, "loss": 5.4748, "step": 1025 }, { "epoch": 0.3693969396939694, "grad_norm": 2.317615032196045, "learning_rate": 0.00014317324759331856, "loss": 4.6974, "step": 1026 }, { "epoch": 0.36975697569756977, "grad_norm": 0.9359394907951355, "learning_rate": 0.00014306930543863219, "loss": 4.7296, "step": 1027 }, { "epoch": 0.3701170117011701, "grad_norm": 0.8531212210655212, "learning_rate": 0.00014296530612327863, "loss": 4.6999, "step": 1028 }, { "epoch": 0.37047704770477047, "grad_norm": 0.7763692736625671, "learning_rate": 0.0001428612497852835, "loss": 5.0536, "step": 1029 }, { "epoch": 0.3708370837083708, "grad_norm": 0.7523413896560669, "learning_rate": 0.0001427571365627482, "loss": 4.5295, "step": 1030 }, { "epoch": 0.3711971197119712, "grad_norm": 1.2539907693862915, "learning_rate": 0.00014265296659384956, "loss": 4.887, "step": 1031 }, { "epoch": 0.3715571557155716, "grad_norm": 1.1317229270935059, "learning_rate": 0.00014254874001683976, "loss": 4.5146, "step": 1032 }, { "epoch": 0.3719171917191719, "grad_norm": 0.9427993893623352, "learning_rate": 0.0001424444569700461, "loss": 4.5106, "step": 1033 }, { "epoch": 0.3722772277227723, "grad_norm": 0.813029944896698, "learning_rate": 0.00014234011759187083, "loss": 4.8805, "step": 1034 }, { "epoch": 0.3726372637263726, "grad_norm": 0.7851507663726807, "learning_rate": 0.00014223572202079094, "loss": 5.1088, "step": 1035 }, { "epoch": 0.372997299729973, "grad_norm": 0.8256126046180725, "learning_rate": 0.00014213127039535803, "loss": 4.6876, "step": 1036 }, { "epoch": 0.3733573357335734, "grad_norm": 0.9763374924659729, "learning_rate": 0.00014202676285419812, "loss": 4.8456, "step": 1037 }, { "epoch": 0.37371737173717373, "grad_norm": 0.9133673310279846, "learning_rate": 0.0001419221995360113, "loss": 4.525, "step": 1038 }, { "epoch": 0.3740774077407741, "grad_norm": 0.7200229167938232, "learning_rate": 0.00014181758057957186, "loss": 4.6137, "step": 1039 }, { "epoch": 0.37443744374437443, "grad_norm": 0.5833539962768555, "learning_rate": 0.0001417129061237278, "loss": 4.6604, "step": 1040 }, { "epoch": 0.3747974797479748, "grad_norm": 0.9346949458122253, "learning_rate": 0.0001416081763074009, "loss": 4.8527, "step": 1041 }, { "epoch": 0.37515751575157513, "grad_norm": 0.7368828058242798, "learning_rate": 0.00014150339126958633, "loss": 4.932, "step": 1042 }, { "epoch": 0.37551755175517554, "grad_norm": 0.7255602478981018, "learning_rate": 0.00014139855114935252, "loss": 4.6075, "step": 1043 }, { "epoch": 0.3758775877587759, "grad_norm": 0.7426056861877441, "learning_rate": 0.00014129365608584108, "loss": 4.5646, "step": 1044 }, { "epoch": 0.37623762376237624, "grad_norm": 1.131618618965149, "learning_rate": 0.00014118870621826656, "loss": 4.9526, "step": 1045 }, { "epoch": 0.3765976597659766, "grad_norm": 0.8817772269248962, "learning_rate": 0.0001410837016859161, "loss": 4.5403, "step": 1046 }, { "epoch": 0.37695769576957694, "grad_norm": 0.8336977958679199, "learning_rate": 0.00014097864262814955, "loss": 5.1945, "step": 1047 }, { "epoch": 0.37731773177317735, "grad_norm": 1.1306034326553345, "learning_rate": 0.00014087352918439904, "loss": 5.2583, "step": 1048 }, { "epoch": 0.3776777677767777, "grad_norm": 1.4777275323867798, "learning_rate": 0.00014076836149416887, "loss": 5.2032, "step": 1049 }, { "epoch": 0.37803780378037805, "grad_norm": 1.7121351957321167, "learning_rate": 0.00014066313969703545, "loss": 5.3917, "step": 1050 }, { "epoch": 0.3783978397839784, "grad_norm": 1.3249011039733887, "learning_rate": 0.00014055786393264683, "loss": 4.5258, "step": 1051 }, { "epoch": 0.37875787578757875, "grad_norm": 1.185707688331604, "learning_rate": 0.0001404525343407228, "loss": 4.7997, "step": 1052 }, { "epoch": 0.3791179117911791, "grad_norm": 0.7875931262969971, "learning_rate": 0.00014034715106105456, "loss": 4.5115, "step": 1053 }, { "epoch": 0.3794779477947795, "grad_norm": 0.6152934432029724, "learning_rate": 0.00014024171423350455, "loss": 4.5271, "step": 1054 }, { "epoch": 0.37983798379837985, "grad_norm": 0.7910044193267822, "learning_rate": 0.00014013622399800627, "loss": 4.591, "step": 1055 }, { "epoch": 0.3801980198019802, "grad_norm": 1.1457709074020386, "learning_rate": 0.00014003068049456418, "loss": 4.5818, "step": 1056 }, { "epoch": 0.38055805580558055, "grad_norm": 0.8700783252716064, "learning_rate": 0.0001399250838632533, "loss": 4.5598, "step": 1057 }, { "epoch": 0.3809180918091809, "grad_norm": 0.9683236479759216, "learning_rate": 0.00013981943424421932, "loss": 4.9761, "step": 1058 }, { "epoch": 0.38127812781278125, "grad_norm": 0.8842438459396362, "learning_rate": 0.00013971373177767805, "loss": 4.878, "step": 1059 }, { "epoch": 0.38163816381638166, "grad_norm": 0.8185229301452637, "learning_rate": 0.0001396079766039157, "loss": 4.7094, "step": 1060 }, { "epoch": 0.381998199819982, "grad_norm": 1.1154496669769287, "learning_rate": 0.0001395021688632882, "loss": 4.9167, "step": 1061 }, { "epoch": 0.38235823582358236, "grad_norm": 1.122309923171997, "learning_rate": 0.00013939630869622133, "loss": 4.8653, "step": 1062 }, { "epoch": 0.3827182718271827, "grad_norm": 0.7007623910903931, "learning_rate": 0.00013929039624321053, "loss": 4.3717, "step": 1063 }, { "epoch": 0.38307830783078306, "grad_norm": 1.0007191896438599, "learning_rate": 0.00013918443164482046, "loss": 4.7034, "step": 1064 }, { "epoch": 0.38343834383438347, "grad_norm": 0.8400418758392334, "learning_rate": 0.00013907841504168516, "loss": 5.0231, "step": 1065 }, { "epoch": 0.3837983798379838, "grad_norm": 0.8117850422859192, "learning_rate": 0.00013897234657450757, "loss": 4.716, "step": 1066 }, { "epoch": 0.38415841584158417, "grad_norm": 0.9142310619354248, "learning_rate": 0.00013886622638405952, "loss": 4.3933, "step": 1067 }, { "epoch": 0.3845184518451845, "grad_norm": 0.8659790754318237, "learning_rate": 0.0001387600546111815, "loss": 5.0209, "step": 1068 }, { "epoch": 0.38487848784878487, "grad_norm": 1.464654803276062, "learning_rate": 0.0001386538313967824, "loss": 4.8429, "step": 1069 }, { "epoch": 0.3852385238523852, "grad_norm": 1.0455034971237183, "learning_rate": 0.0001385475568818394, "loss": 4.9994, "step": 1070 }, { "epoch": 0.3855985598559856, "grad_norm": 1.0117931365966797, "learning_rate": 0.00013844123120739782, "loss": 4.7524, "step": 1071 }, { "epoch": 0.385958595859586, "grad_norm": 1.0140798091888428, "learning_rate": 0.0001383348545145708, "loss": 4.5978, "step": 1072 }, { "epoch": 0.3863186318631863, "grad_norm": 0.9929814338684082, "learning_rate": 0.00013822842694453924, "loss": 5.0697, "step": 1073 }, { "epoch": 0.3866786678667867, "grad_norm": 1.8501864671707153, "learning_rate": 0.00013812194863855156, "loss": 5.4553, "step": 1074 }, { "epoch": 0.387038703870387, "grad_norm": 1.3297158479690552, "learning_rate": 0.0001380154197379235, "loss": 5.1434, "step": 1075 }, { "epoch": 0.3873987398739874, "grad_norm": 1.3064671754837036, "learning_rate": 0.00013790884038403795, "loss": 4.7108, "step": 1076 }, { "epoch": 0.3877587758775878, "grad_norm": 1.8386310338974, "learning_rate": 0.00013780221071834476, "loss": 4.6172, "step": 1077 }, { "epoch": 0.38811881188118813, "grad_norm": 1.4296783208847046, "learning_rate": 0.00013769553088236055, "loss": 5.0008, "step": 1078 }, { "epoch": 0.3884788478847885, "grad_norm": 1.3294901847839355, "learning_rate": 0.0001375888010176686, "loss": 4.8293, "step": 1079 }, { "epoch": 0.38883888388838883, "grad_norm": 0.9052990674972534, "learning_rate": 0.0001374820212659184, "loss": 5.0959, "step": 1080 }, { "epoch": 0.3891989198919892, "grad_norm": 0.6910704374313354, "learning_rate": 0.00013737519176882588, "loss": 4.8354, "step": 1081 }, { "epoch": 0.38955895589558953, "grad_norm": 0.973875880241394, "learning_rate": 0.00013726831266817278, "loss": 4.6088, "step": 1082 }, { "epoch": 0.38991899189918994, "grad_norm": 0.7647384405136108, "learning_rate": 0.00013716138410580685, "loss": 4.7574, "step": 1083 }, { "epoch": 0.3902790279027903, "grad_norm": 1.0366477966308594, "learning_rate": 0.00013705440622364137, "loss": 4.5836, "step": 1084 }, { "epoch": 0.39063906390639064, "grad_norm": 0.7150110006332397, "learning_rate": 0.00013694737916365517, "loss": 4.5989, "step": 1085 }, { "epoch": 0.390999099909991, "grad_norm": 0.8569515943527222, "learning_rate": 0.0001368403030678922, "loss": 4.7995, "step": 1086 }, { "epoch": 0.39135913591359134, "grad_norm": 0.8392965793609619, "learning_rate": 0.0001367331780784616, "loss": 4.9211, "step": 1087 }, { "epoch": 0.39171917191719174, "grad_norm": 0.764937698841095, "learning_rate": 0.00013662600433753745, "loss": 4.9077, "step": 1088 }, { "epoch": 0.3920792079207921, "grad_norm": 0.971203088760376, "learning_rate": 0.00013651878198735838, "loss": 4.7849, "step": 1089 }, { "epoch": 0.39243924392439244, "grad_norm": 0.8801988363265991, "learning_rate": 0.00013641151117022767, "loss": 4.685, "step": 1090 }, { "epoch": 0.3927992799279928, "grad_norm": 0.7537325620651245, "learning_rate": 0.00013630419202851284, "loss": 4.7856, "step": 1091 }, { "epoch": 0.39315931593159315, "grad_norm": 1.0417050123214722, "learning_rate": 0.00013619682470464558, "loss": 4.3694, "step": 1092 }, { "epoch": 0.3935193519351935, "grad_norm": 0.7336771488189697, "learning_rate": 0.00013608940934112156, "loss": 4.6453, "step": 1093 }, { "epoch": 0.3938793879387939, "grad_norm": 0.8440720438957214, "learning_rate": 0.0001359819460805001, "loss": 5.1781, "step": 1094 }, { "epoch": 0.39423942394239425, "grad_norm": 1.0878543853759766, "learning_rate": 0.00013587443506540422, "loss": 5.1328, "step": 1095 }, { "epoch": 0.3945994599459946, "grad_norm": 0.8983904719352722, "learning_rate": 0.0001357668764385202, "loss": 5.043, "step": 1096 }, { "epoch": 0.39495949594959495, "grad_norm": 1.2309449911117554, "learning_rate": 0.0001356592703425976, "loss": 4.8985, "step": 1097 }, { "epoch": 0.3953195319531953, "grad_norm": 0.9788826107978821, "learning_rate": 0.00013555161692044892, "loss": 5.1936, "step": 1098 }, { "epoch": 0.39567956795679565, "grad_norm": 1.42829430103302, "learning_rate": 0.00013544391631494952, "loss": 5.2761, "step": 1099 }, { "epoch": 0.39603960396039606, "grad_norm": 1.6005115509033203, "learning_rate": 0.00013533616866903735, "loss": 5.116, "step": 1100 }, { "epoch": 0.3963996399639964, "grad_norm": 3.0721354484558105, "learning_rate": 0.00013522837412571282, "loss": 4.9747, "step": 1101 }, { "epoch": 0.39675967596759676, "grad_norm": 0.9887019395828247, "learning_rate": 0.0001351205328280385, "loss": 4.4736, "step": 1102 }, { "epoch": 0.3971197119711971, "grad_norm": 0.8533967137336731, "learning_rate": 0.00013501264491913906, "loss": 4.7911, "step": 1103 }, { "epoch": 0.39747974797479746, "grad_norm": 1.4522795677185059, "learning_rate": 0.00013490471054220112, "loss": 5.2936, "step": 1104 }, { "epoch": 0.39783978397839787, "grad_norm": 1.0104986429214478, "learning_rate": 0.00013479672984047288, "loss": 4.7986, "step": 1105 }, { "epoch": 0.3981998199819982, "grad_norm": 0.674165666103363, "learning_rate": 0.00013468870295726398, "loss": 4.7862, "step": 1106 }, { "epoch": 0.39855985598559857, "grad_norm": 0.7466803193092346, "learning_rate": 0.00013458063003594543, "loss": 4.5683, "step": 1107 }, { "epoch": 0.3989198919891989, "grad_norm": 1.233678936958313, "learning_rate": 0.00013447251121994933, "loss": 4.8117, "step": 1108 }, { "epoch": 0.39927992799279927, "grad_norm": 0.7311433553695679, "learning_rate": 0.00013436434665276865, "loss": 4.8338, "step": 1109 }, { "epoch": 0.3996399639963996, "grad_norm": 1.2107664346694946, "learning_rate": 0.00013425613647795713, "loss": 4.747, "step": 1110 }, { "epoch": 0.4, "grad_norm": 0.9774342775344849, "learning_rate": 0.000134147880839129, "loss": 4.6647, "step": 1111 }, { "epoch": 0.4003600360036004, "grad_norm": 0.8316183090209961, "learning_rate": 0.00013403957987995882, "loss": 4.6156, "step": 1112 }, { "epoch": 0.4007200720072007, "grad_norm": 0.8299689888954163, "learning_rate": 0.00013393123374418137, "loss": 4.7576, "step": 1113 }, { "epoch": 0.4010801080108011, "grad_norm": 0.8191565275192261, "learning_rate": 0.00013382284257559132, "loss": 4.6748, "step": 1114 }, { "epoch": 0.4014401440144014, "grad_norm": 1.1108641624450684, "learning_rate": 0.00013371440651804313, "loss": 4.6654, "step": 1115 }, { "epoch": 0.4018001800180018, "grad_norm": 1.5050597190856934, "learning_rate": 0.00013360592571545082, "loss": 4.7271, "step": 1116 }, { "epoch": 0.4021602160216022, "grad_norm": 0.6661246418952942, "learning_rate": 0.00013349740031178784, "loss": 4.7434, "step": 1117 }, { "epoch": 0.40252025202520253, "grad_norm": 0.9325123429298401, "learning_rate": 0.00013338883045108674, "loss": 4.5033, "step": 1118 }, { "epoch": 0.4028802880288029, "grad_norm": 0.7152042984962463, "learning_rate": 0.00013328021627743915, "loss": 4.98, "step": 1119 }, { "epoch": 0.40324032403240323, "grad_norm": 1.1741433143615723, "learning_rate": 0.00013317155793499556, "loss": 4.5421, "step": 1120 }, { "epoch": 0.4036003600360036, "grad_norm": 1.2132052183151245, "learning_rate": 0.00013306285556796495, "loss": 5.1257, "step": 1121 }, { "epoch": 0.403960396039604, "grad_norm": 0.9931376576423645, "learning_rate": 0.00013295410932061478, "loss": 4.9036, "step": 1122 }, { "epoch": 0.40432043204320434, "grad_norm": 1.2949656248092651, "learning_rate": 0.00013284531933727083, "loss": 5.2814, "step": 1123 }, { "epoch": 0.4046804680468047, "grad_norm": 1.2032676935195923, "learning_rate": 0.0001327364857623168, "loss": 4.9101, "step": 1124 }, { "epoch": 0.40504050405040504, "grad_norm": 2.935535192489624, "learning_rate": 0.00013262760874019428, "loss": 5.5174, "step": 1125 }, { "epoch": 0.4054005400540054, "grad_norm": 1.6732901334762573, "learning_rate": 0.00013251868841540257, "loss": 4.8242, "step": 1126 }, { "epoch": 0.40576057605760574, "grad_norm": 1.4497289657592773, "learning_rate": 0.00013240972493249847, "loss": 5.3078, "step": 1127 }, { "epoch": 0.40612061206120614, "grad_norm": 1.0038177967071533, "learning_rate": 0.00013230071843609587, "loss": 4.8709, "step": 1128 }, { "epoch": 0.4064806480648065, "grad_norm": 0.8603922724723816, "learning_rate": 0.000132191669070866, "loss": 4.7909, "step": 1129 }, { "epoch": 0.40684068406840684, "grad_norm": 0.7687175273895264, "learning_rate": 0.00013208257698153677, "loss": 4.9778, "step": 1130 }, { "epoch": 0.4072007200720072, "grad_norm": 0.8526626825332642, "learning_rate": 0.00013197344231289296, "loss": 4.5872, "step": 1131 }, { "epoch": 0.40756075607560754, "grad_norm": 0.7719563841819763, "learning_rate": 0.0001318642652097757, "loss": 4.4888, "step": 1132 }, { "epoch": 0.4079207920792079, "grad_norm": 0.7941271662712097, "learning_rate": 0.0001317550458170826, "loss": 4.526, "step": 1133 }, { "epoch": 0.4082808280828083, "grad_norm": 0.9268568158149719, "learning_rate": 0.00013164578427976727, "loss": 4.7826, "step": 1134 }, { "epoch": 0.40864086408640865, "grad_norm": 1.0768158435821533, "learning_rate": 0.00013153648074283936, "loss": 4.8116, "step": 1135 }, { "epoch": 0.409000900090009, "grad_norm": 0.8673043847084045, "learning_rate": 0.00013142713535136414, "loss": 4.982, "step": 1136 }, { "epoch": 0.40936093609360935, "grad_norm": 0.9443647265434265, "learning_rate": 0.00013131774825046245, "loss": 4.7423, "step": 1137 }, { "epoch": 0.4097209720972097, "grad_norm": 0.9891415238380432, "learning_rate": 0.00013120831958531067, "loss": 4.8075, "step": 1138 }, { "epoch": 0.4100810081008101, "grad_norm": 0.9973192811012268, "learning_rate": 0.00013109884950114007, "loss": 4.7516, "step": 1139 }, { "epoch": 0.41044104410441046, "grad_norm": 0.8082119822502136, "learning_rate": 0.00013098933814323707, "loss": 4.4898, "step": 1140 }, { "epoch": 0.4108010801080108, "grad_norm": 0.7986486554145813, "learning_rate": 0.0001308797856569428, "loss": 4.9171, "step": 1141 }, { "epoch": 0.41116111611161116, "grad_norm": 1.0446362495422363, "learning_rate": 0.00013077019218765305, "loss": 4.9862, "step": 1142 }, { "epoch": 0.4115211521152115, "grad_norm": 0.7441645860671997, "learning_rate": 0.00013066055788081788, "loss": 4.6925, "step": 1143 }, { "epoch": 0.41188118811881186, "grad_norm": 0.8899047374725342, "learning_rate": 0.00013055088288194163, "loss": 4.7605, "step": 1144 }, { "epoch": 0.41224122412241226, "grad_norm": 0.8489072918891907, "learning_rate": 0.0001304411673365826, "loss": 4.8283, "step": 1145 }, { "epoch": 0.4126012601260126, "grad_norm": 1.014509916305542, "learning_rate": 0.000130331411390353, "loss": 4.5665, "step": 1146 }, { "epoch": 0.41296129612961296, "grad_norm": 0.9186869263648987, "learning_rate": 0.00013022161518891855, "loss": 4.9906, "step": 1147 }, { "epoch": 0.4133213321332133, "grad_norm": 1.4205145835876465, "learning_rate": 0.00013011177887799845, "loss": 5.2449, "step": 1148 }, { "epoch": 0.41368136813681367, "grad_norm": 0.9844337105751038, "learning_rate": 0.0001300019026033651, "loss": 5.0538, "step": 1149 }, { "epoch": 0.414041404140414, "grad_norm": 1.7266241312026978, "learning_rate": 0.00012989198651084397, "loss": 5.46, "step": 1150 }, { "epoch": 0.4144014401440144, "grad_norm": 1.170470952987671, "learning_rate": 0.00012978203074631334, "loss": 4.8089, "step": 1151 }, { "epoch": 0.41476147614761477, "grad_norm": 1.0281877517700195, "learning_rate": 0.00012967203545570418, "loss": 4.6209, "step": 1152 }, { "epoch": 0.4151215121512151, "grad_norm": 0.9903915524482727, "learning_rate": 0.00012956200078499994, "loss": 4.3591, "step": 1153 }, { "epoch": 0.41548154815481547, "grad_norm": 1.1112825870513916, "learning_rate": 0.00012945192688023624, "loss": 4.6581, "step": 1154 }, { "epoch": 0.4158415841584158, "grad_norm": 0.7413083910942078, "learning_rate": 0.00012934181388750087, "loss": 4.762, "step": 1155 }, { "epoch": 0.41620162016201623, "grad_norm": 0.9765453338623047, "learning_rate": 0.0001292316619529334, "loss": 4.7615, "step": 1156 }, { "epoch": 0.4165616561656166, "grad_norm": 0.8726073503494263, "learning_rate": 0.00012912147122272523, "loss": 4.9829, "step": 1157 }, { "epoch": 0.41692169216921693, "grad_norm": 0.7000720500946045, "learning_rate": 0.00012901124184311905, "loss": 4.7909, "step": 1158 }, { "epoch": 0.4172817281728173, "grad_norm": 1.2081289291381836, "learning_rate": 0.00012890097396040903, "loss": 4.838, "step": 1159 }, { "epoch": 0.41764176417641763, "grad_norm": 0.8126158714294434, "learning_rate": 0.0001287906677209403, "loss": 4.5821, "step": 1160 }, { "epoch": 0.418001800180018, "grad_norm": 0.9014402031898499, "learning_rate": 0.00012868032327110904, "loss": 5.1858, "step": 1161 }, { "epoch": 0.4183618361836184, "grad_norm": 0.9191497564315796, "learning_rate": 0.00012856994075736197, "loss": 4.8331, "step": 1162 }, { "epoch": 0.41872187218721874, "grad_norm": 0.7086000442504883, "learning_rate": 0.0001284595203261965, "loss": 4.9313, "step": 1163 }, { "epoch": 0.4190819081908191, "grad_norm": 0.7402483224868774, "learning_rate": 0.0001283490621241602, "loss": 5.0152, "step": 1164 }, { "epoch": 0.41944194419441944, "grad_norm": 0.8372672200202942, "learning_rate": 0.00012823856629785093, "loss": 4.4173, "step": 1165 }, { "epoch": 0.4198019801980198, "grad_norm": 0.7594465017318726, "learning_rate": 0.00012812803299391628, "loss": 4.5408, "step": 1166 }, { "epoch": 0.42016201620162014, "grad_norm": 1.4983152151107788, "learning_rate": 0.00012801746235905384, "loss": 5.0135, "step": 1167 }, { "epoch": 0.42052205220522054, "grad_norm": 1.4116381406784058, "learning_rate": 0.00012790685454001054, "loss": 4.7556, "step": 1168 }, { "epoch": 0.4208820882088209, "grad_norm": 1.290519118309021, "learning_rate": 0.00012779620968358273, "loss": 4.5214, "step": 1169 }, { "epoch": 0.42124212421242124, "grad_norm": 0.8605509400367737, "learning_rate": 0.00012768552793661594, "loss": 4.6911, "step": 1170 }, { "epoch": 0.4216021602160216, "grad_norm": 0.9390202164649963, "learning_rate": 0.00012757480944600462, "loss": 5.0831, "step": 1171 }, { "epoch": 0.42196219621962194, "grad_norm": 1.106753945350647, "learning_rate": 0.00012746405435869198, "loss": 5.0177, "step": 1172 }, { "epoch": 0.42232223222322235, "grad_norm": 1.1474226713180542, "learning_rate": 0.00012735326282166984, "loss": 4.9379, "step": 1173 }, { "epoch": 0.4226822682268227, "grad_norm": 1.3341996669769287, "learning_rate": 0.00012724243498197837, "loss": 5.1112, "step": 1174 }, { "epoch": 0.42304230423042305, "grad_norm": 1.7512990236282349, "learning_rate": 0.0001271315709867059, "loss": 5.0614, "step": 1175 }, { "epoch": 0.4234023402340234, "grad_norm": 1.0912712812423706, "learning_rate": 0.0001270206709829888, "loss": 4.9184, "step": 1176 }, { "epoch": 0.42376237623762375, "grad_norm": 1.343735933303833, "learning_rate": 0.0001269097351180112, "loss": 4.8776, "step": 1177 }, { "epoch": 0.4241224122412241, "grad_norm": 1.0407140254974365, "learning_rate": 0.00012679876353900482, "loss": 4.752, "step": 1178 }, { "epoch": 0.4244824482448245, "grad_norm": 1.1934579610824585, "learning_rate": 0.00012668775639324874, "loss": 4.6586, "step": 1179 }, { "epoch": 0.42484248424842486, "grad_norm": 0.8980515599250793, "learning_rate": 0.00012657671382806937, "loss": 4.6048, "step": 1180 }, { "epoch": 0.4252025202520252, "grad_norm": 0.772833526134491, "learning_rate": 0.00012646563599083996, "loss": 4.8211, "step": 1181 }, { "epoch": 0.42556255625562556, "grad_norm": 0.8841773867607117, "learning_rate": 0.0001263545230289807, "loss": 4.4507, "step": 1182 }, { "epoch": 0.4259225922592259, "grad_norm": 0.7956357002258301, "learning_rate": 0.00012624337508995834, "loss": 5.0147, "step": 1183 }, { "epoch": 0.42628262826282626, "grad_norm": 0.8929400444030762, "learning_rate": 0.00012613219232128608, "loss": 4.9053, "step": 1184 }, { "epoch": 0.42664266426642666, "grad_norm": 0.7843483090400696, "learning_rate": 0.0001260209748705233, "loss": 5.0519, "step": 1185 }, { "epoch": 0.427002700270027, "grad_norm": 0.7303058505058289, "learning_rate": 0.00012590972288527546, "loss": 4.5666, "step": 1186 }, { "epoch": 0.42736273627362736, "grad_norm": 0.9950706362724304, "learning_rate": 0.0001257984365131938, "loss": 4.8163, "step": 1187 }, { "epoch": 0.4277227722772277, "grad_norm": 0.9004842042922974, "learning_rate": 0.0001256871159019753, "loss": 4.8224, "step": 1188 }, { "epoch": 0.42808280828082806, "grad_norm": 0.7937396168708801, "learning_rate": 0.00012557576119936225, "loss": 4.5765, "step": 1189 }, { "epoch": 0.42844284428442847, "grad_norm": 0.8856514096260071, "learning_rate": 0.00012546437255314222, "loss": 4.6348, "step": 1190 }, { "epoch": 0.4288028802880288, "grad_norm": 1.235495686531067, "learning_rate": 0.00012535295011114795, "loss": 4.8799, "step": 1191 }, { "epoch": 0.42916291629162917, "grad_norm": 1.0895787477493286, "learning_rate": 0.00012524149402125685, "loss": 5.2455, "step": 1192 }, { "epoch": 0.4295229522952295, "grad_norm": 0.842725396156311, "learning_rate": 0.00012513000443139112, "loss": 4.758, "step": 1193 }, { "epoch": 0.42988298829882987, "grad_norm": 0.8005720973014832, "learning_rate": 0.00012501848148951735, "loss": 4.7166, "step": 1194 }, { "epoch": 0.4302430243024302, "grad_norm": 0.9036151170730591, "learning_rate": 0.00012490692534364642, "loss": 4.3832, "step": 1195 }, { "epoch": 0.4306030603060306, "grad_norm": 1.1666127443313599, "learning_rate": 0.00012479533614183334, "loss": 4.6348, "step": 1196 }, { "epoch": 0.430963096309631, "grad_norm": 1.1041556596755981, "learning_rate": 0.00012468371403217684, "loss": 4.8944, "step": 1197 }, { "epoch": 0.4313231323132313, "grad_norm": 0.9621699452400208, "learning_rate": 0.00012457205916281943, "loss": 4.7718, "step": 1198 }, { "epoch": 0.4316831683168317, "grad_norm": 1.1753240823745728, "learning_rate": 0.00012446037168194714, "loss": 4.9495, "step": 1199 }, { "epoch": 0.43204320432043203, "grad_norm": 1.4692611694335938, "learning_rate": 0.00012434865173778915, "loss": 5.8088, "step": 1200 }, { "epoch": 0.4324032403240324, "grad_norm": 3.4769439697265625, "learning_rate": 0.00012423689947861787, "loss": 5.2299, "step": 1201 }, { "epoch": 0.4327632763276328, "grad_norm": 0.7332233190536499, "learning_rate": 0.00012412511505274844, "loss": 4.4305, "step": 1202 }, { "epoch": 0.43312331233123313, "grad_norm": 0.791152834892273, "learning_rate": 0.00012401329860853885, "loss": 5.0611, "step": 1203 }, { "epoch": 0.4334833483348335, "grad_norm": 1.2914446592330933, "learning_rate": 0.00012390145029438947, "loss": 4.9828, "step": 1204 }, { "epoch": 0.43384338433843384, "grad_norm": 0.8650038838386536, "learning_rate": 0.000123789570258743, "loss": 5.2274, "step": 1205 }, { "epoch": 0.4342034203420342, "grad_norm": 0.7898814678192139, "learning_rate": 0.00012367765865008428, "loss": 4.5792, "step": 1206 }, { "epoch": 0.4345634563456346, "grad_norm": 1.1311291456222534, "learning_rate": 0.00012356571561693996, "loss": 4.8372, "step": 1207 }, { "epoch": 0.43492349234923494, "grad_norm": 0.7760782241821289, "learning_rate": 0.00012345374130787854, "loss": 4.7993, "step": 1208 }, { "epoch": 0.4352835283528353, "grad_norm": 1.017967700958252, "learning_rate": 0.0001233417358715099, "loss": 4.6437, "step": 1209 }, { "epoch": 0.43564356435643564, "grad_norm": 1.2804354429244995, "learning_rate": 0.00012322969945648523, "loss": 4.2972, "step": 1210 }, { "epoch": 0.436003600360036, "grad_norm": 0.8337898254394531, "learning_rate": 0.000123117632211497, "loss": 4.6893, "step": 1211 }, { "epoch": 0.43636363636363634, "grad_norm": 0.867520809173584, "learning_rate": 0.00012300553428527832, "loss": 4.9483, "step": 1212 }, { "epoch": 0.43672367236723675, "grad_norm": 0.8490375280380249, "learning_rate": 0.0001228934058266033, "loss": 4.6251, "step": 1213 }, { "epoch": 0.4370837083708371, "grad_norm": 0.8703073263168335, "learning_rate": 0.0001227812469842864, "loss": 4.5385, "step": 1214 }, { "epoch": 0.43744374437443745, "grad_norm": 0.9746583104133606, "learning_rate": 0.0001226690579071825, "loss": 4.8996, "step": 1215 }, { "epoch": 0.4378037803780378, "grad_norm": 0.7318705916404724, "learning_rate": 0.00012255683874418645, "loss": 4.4165, "step": 1216 }, { "epoch": 0.43816381638163815, "grad_norm": 0.8295899629592896, "learning_rate": 0.00012244458964423327, "loss": 4.5525, "step": 1217 }, { "epoch": 0.4385238523852385, "grad_norm": 0.9655759930610657, "learning_rate": 0.00012233231075629747, "loss": 4.5796, "step": 1218 }, { "epoch": 0.4388838883888389, "grad_norm": 0.7518540620803833, "learning_rate": 0.0001222200022293933, "loss": 4.6897, "step": 1219 }, { "epoch": 0.43924392439243926, "grad_norm": 0.6703758239746094, "learning_rate": 0.0001221076642125742, "loss": 4.4407, "step": 1220 }, { "epoch": 0.4396039603960396, "grad_norm": 0.6767472624778748, "learning_rate": 0.00012199529685493278, "loss": 4.6747, "step": 1221 }, { "epoch": 0.43996399639963996, "grad_norm": 0.7684047818183899, "learning_rate": 0.00012188290030560063, "loss": 4.6929, "step": 1222 }, { "epoch": 0.4403240324032403, "grad_norm": 1.4957590103149414, "learning_rate": 0.00012177047471374807, "loss": 5.2717, "step": 1223 }, { "epoch": 0.44068406840684066, "grad_norm": 1.0840327739715576, "learning_rate": 0.00012165802022858399, "loss": 5.199, "step": 1224 }, { "epoch": 0.44104410441044106, "grad_norm": 1.6059932708740234, "learning_rate": 0.00012154553699935553, "loss": 5.493, "step": 1225 }, { "epoch": 0.4414041404140414, "grad_norm": 1.5921814441680908, "learning_rate": 0.0001214330251753481, "loss": 4.8692, "step": 1226 }, { "epoch": 0.44176417641764176, "grad_norm": 0.9325987696647644, "learning_rate": 0.00012132048490588492, "loss": 4.8154, "step": 1227 }, { "epoch": 0.4421242124212421, "grad_norm": 1.2291182279586792, "learning_rate": 0.00012120791634032715, "loss": 4.5689, "step": 1228 }, { "epoch": 0.44248424842484246, "grad_norm": 0.8440655469894409, "learning_rate": 0.00012109531962807332, "loss": 4.8022, "step": 1229 }, { "epoch": 0.44284428442844287, "grad_norm": 0.8819119334220886, "learning_rate": 0.00012098269491855942, "loss": 4.745, "step": 1230 }, { "epoch": 0.4432043204320432, "grad_norm": 0.7879696488380432, "learning_rate": 0.00012087004236125858, "loss": 4.9081, "step": 1231 }, { "epoch": 0.44356435643564357, "grad_norm": 0.8012781143188477, "learning_rate": 0.0001207573621056809, "loss": 5.1384, "step": 1232 }, { "epoch": 0.4439243924392439, "grad_norm": 4.395026683807373, "learning_rate": 0.00012064465430137315, "loss": 5.099, "step": 1233 }, { "epoch": 0.44428442844284427, "grad_norm": 0.737079381942749, "learning_rate": 0.00012053191909791883, "loss": 4.5297, "step": 1234 }, { "epoch": 0.4446444644464446, "grad_norm": 0.7090547680854797, "learning_rate": 0.00012041915664493761, "loss": 4.3716, "step": 1235 }, { "epoch": 0.445004500450045, "grad_norm": 0.8456137776374817, "learning_rate": 0.00012030636709208551, "loss": 4.5888, "step": 1236 }, { "epoch": 0.4453645364536454, "grad_norm": 0.9556293487548828, "learning_rate": 0.00012019355058905435, "loss": 4.6386, "step": 1237 }, { "epoch": 0.4457245724572457, "grad_norm": 1.3399239778518677, "learning_rate": 0.00012008070728557186, "loss": 4.9974, "step": 1238 }, { "epoch": 0.4460846084608461, "grad_norm": 0.9864078760147095, "learning_rate": 0.00011996783733140122, "loss": 4.5971, "step": 1239 }, { "epoch": 0.4464446444644464, "grad_norm": 0.9206048846244812, "learning_rate": 0.0001198549408763411, "loss": 4.6504, "step": 1240 }, { "epoch": 0.4468046804680468, "grad_norm": 0.7920734882354736, "learning_rate": 0.00011974201807022525, "loss": 4.5222, "step": 1241 }, { "epoch": 0.4471647164716472, "grad_norm": 0.9126737713813782, "learning_rate": 0.00011962906906292238, "loss": 4.4975, "step": 1242 }, { "epoch": 0.44752475247524753, "grad_norm": 0.8361069560050964, "learning_rate": 0.00011951609400433605, "loss": 4.8476, "step": 1243 }, { "epoch": 0.4478847884788479, "grad_norm": 1.2121682167053223, "learning_rate": 0.00011940309304440433, "loss": 4.9648, "step": 1244 }, { "epoch": 0.44824482448244823, "grad_norm": 0.7456281781196594, "learning_rate": 0.00011929006633309974, "loss": 4.6785, "step": 1245 }, { "epoch": 0.4486048604860486, "grad_norm": 0.775759220123291, "learning_rate": 0.00011917701402042889, "loss": 4.8784, "step": 1246 }, { "epoch": 0.448964896489649, "grad_norm": 1.0275262594223022, "learning_rate": 0.00011906393625643244, "loss": 5.1059, "step": 1247 }, { "epoch": 0.44932493249324934, "grad_norm": 0.8145440816879272, "learning_rate": 0.00011895083319118477, "loss": 5.0963, "step": 1248 }, { "epoch": 0.4496849684968497, "grad_norm": 1.1181066036224365, "learning_rate": 0.00011883770497479387, "loss": 5.0131, "step": 1249 }, { "epoch": 0.45004500450045004, "grad_norm": 1.8296819925308228, "learning_rate": 0.00011872455175740112, "loss": 5.2702, "step": 1250 }, { "epoch": 0.4504050405040504, "grad_norm": 1.730488896369934, "learning_rate": 0.00011861137368918105, "loss": 5.1305, "step": 1251 }, { "epoch": 0.45076507650765074, "grad_norm": 1.0825215578079224, "learning_rate": 0.00011849817092034118, "loss": 4.9019, "step": 1252 }, { "epoch": 0.45112511251125115, "grad_norm": 0.8123356103897095, "learning_rate": 0.00011838494360112185, "loss": 4.4342, "step": 1253 }, { "epoch": 0.4514851485148515, "grad_norm": 0.9870261549949646, "learning_rate": 0.00011827169188179592, "loss": 4.9825, "step": 1254 }, { "epoch": 0.45184518451845185, "grad_norm": 0.8250070810317993, "learning_rate": 0.00011815841591266872, "loss": 4.5846, "step": 1255 }, { "epoch": 0.4522052205220522, "grad_norm": 1.0997467041015625, "learning_rate": 0.00011804511584407763, "loss": 4.5639, "step": 1256 }, { "epoch": 0.45256525652565255, "grad_norm": 0.7792536020278931, "learning_rate": 0.00011793179182639218, "loss": 4.6994, "step": 1257 }, { "epoch": 0.4529252925292529, "grad_norm": 0.6587983965873718, "learning_rate": 0.00011781844401001352, "loss": 4.8202, "step": 1258 }, { "epoch": 0.4532853285328533, "grad_norm": 0.9794634580612183, "learning_rate": 0.00011770507254537453, "loss": 4.4016, "step": 1259 }, { "epoch": 0.45364536453645365, "grad_norm": 0.8039897084236145, "learning_rate": 0.00011759167758293935, "loss": 4.7848, "step": 1260 }, { "epoch": 0.454005400540054, "grad_norm": 1.3053600788116455, "learning_rate": 0.0001174782592732034, "loss": 4.9723, "step": 1261 }, { "epoch": 0.45436543654365436, "grad_norm": 1.1314433813095093, "learning_rate": 0.00011736481776669306, "loss": 4.9508, "step": 1262 }, { "epoch": 0.4547254725472547, "grad_norm": 0.8784576058387756, "learning_rate": 0.00011725135321396543, "loss": 4.7798, "step": 1263 }, { "epoch": 0.4550855085508551, "grad_norm": 0.7723816633224487, "learning_rate": 0.00011713786576560835, "loss": 4.8276, "step": 1264 }, { "epoch": 0.45544554455445546, "grad_norm": 0.8358681797981262, "learning_rate": 0.00011702435557223987, "loss": 4.6874, "step": 1265 }, { "epoch": 0.4558055805580558, "grad_norm": 0.9585774540901184, "learning_rate": 0.00011691082278450836, "loss": 4.7501, "step": 1266 }, { "epoch": 0.45616561656165616, "grad_norm": 0.7977786660194397, "learning_rate": 0.00011679726755309205, "loss": 4.6517, "step": 1267 }, { "epoch": 0.4565256525652565, "grad_norm": 0.8501828908920288, "learning_rate": 0.00011668369002869912, "loss": 4.9298, "step": 1268 }, { "epoch": 0.45688568856885686, "grad_norm": 1.0356849431991577, "learning_rate": 0.00011657009036206718, "loss": 4.7837, "step": 1269 }, { "epoch": 0.45724572457245727, "grad_norm": 0.8533749580383301, "learning_rate": 0.00011645646870396333, "loss": 4.7115, "step": 1270 }, { "epoch": 0.4576057605760576, "grad_norm": 0.8388182520866394, "learning_rate": 0.00011634282520518383, "loss": 5.0224, "step": 1271 }, { "epoch": 0.45796579657965797, "grad_norm": 1.1325950622558594, "learning_rate": 0.00011622916001655388, "loss": 4.6831, "step": 1272 }, { "epoch": 0.4583258325832583, "grad_norm": 0.7390809059143066, "learning_rate": 0.00011611547328892754, "loss": 5.0284, "step": 1273 }, { "epoch": 0.45868586858685867, "grad_norm": 1.6919934749603271, "learning_rate": 0.00011600176517318741, "loss": 5.3664, "step": 1274 }, { "epoch": 0.459045904590459, "grad_norm": 1.8952467441558838, "learning_rate": 0.00011588803582024446, "loss": 5.2845, "step": 1275 }, { "epoch": 0.4594059405940594, "grad_norm": 1.0147459506988525, "learning_rate": 0.0001157742853810379, "loss": 4.4858, "step": 1276 }, { "epoch": 0.4597659765976598, "grad_norm": 0.9189479947090149, "learning_rate": 0.00011566051400653486, "loss": 4.9424, "step": 1277 }, { "epoch": 0.4601260126012601, "grad_norm": 1.2866673469543457, "learning_rate": 0.00011554672184773032, "loss": 4.8392, "step": 1278 }, { "epoch": 0.4604860486048605, "grad_norm": 0.6796092987060547, "learning_rate": 0.00011543290905564683, "loss": 4.7882, "step": 1279 }, { "epoch": 0.4608460846084608, "grad_norm": 0.7936111092567444, "learning_rate": 0.00011531907578133429, "loss": 5.0133, "step": 1280 }, { "epoch": 0.46120612061206123, "grad_norm": 0.8667351007461548, "learning_rate": 0.00011520522217586984, "loss": 4.7428, "step": 1281 }, { "epoch": 0.4615661566156616, "grad_norm": 0.796633243560791, "learning_rate": 0.00011509134839035748, "loss": 4.8215, "step": 1282 }, { "epoch": 0.46192619261926193, "grad_norm": 0.7455129027366638, "learning_rate": 0.00011497745457592816, "loss": 4.8385, "step": 1283 }, { "epoch": 0.4622862286228623, "grad_norm": 0.6883043646812439, "learning_rate": 0.0001148635408837393, "loss": 4.9312, "step": 1284 }, { "epoch": 0.46264626462646263, "grad_norm": 1.5165472030639648, "learning_rate": 0.00011474960746497472, "loss": 4.987, "step": 1285 }, { "epoch": 0.463006300630063, "grad_norm": 0.7015155553817749, "learning_rate": 0.00011463565447084445, "loss": 4.4301, "step": 1286 }, { "epoch": 0.4633663366336634, "grad_norm": 0.9830206632614136, "learning_rate": 0.0001145216820525845, "loss": 4.8288, "step": 1287 }, { "epoch": 0.46372637263726374, "grad_norm": 1.205315113067627, "learning_rate": 0.00011440769036145662, "loss": 4.9204, "step": 1288 }, { "epoch": 0.4640864086408641, "grad_norm": 0.6896906495094299, "learning_rate": 0.00011429367954874819, "loss": 4.5911, "step": 1289 }, { "epoch": 0.46444644464446444, "grad_norm": 0.6286702752113342, "learning_rate": 0.00011417964976577187, "loss": 4.7917, "step": 1290 }, { "epoch": 0.4648064806480648, "grad_norm": 0.7007503509521484, "learning_rate": 0.00011406560116386562, "loss": 4.8353, "step": 1291 }, { "epoch": 0.46516651665166514, "grad_norm": 0.785198986530304, "learning_rate": 0.00011395153389439233, "loss": 4.988, "step": 1292 }, { "epoch": 0.46552655265526555, "grad_norm": 0.643828272819519, "learning_rate": 0.0001138374481087396, "loss": 4.4371, "step": 1293 }, { "epoch": 0.4658865886588659, "grad_norm": 0.8021451234817505, "learning_rate": 0.00011372334395831972, "loss": 4.6181, "step": 1294 }, { "epoch": 0.46624662466246625, "grad_norm": 1.27736234664917, "learning_rate": 0.00011360922159456928, "loss": 4.8397, "step": 1295 }, { "epoch": 0.4666066606660666, "grad_norm": 0.6858236193656921, "learning_rate": 0.00011349508116894903, "loss": 4.9309, "step": 1296 }, { "epoch": 0.46696669666966695, "grad_norm": 0.9422666430473328, "learning_rate": 0.00011338092283294377, "loss": 4.8297, "step": 1297 }, { "epoch": 0.46732673267326735, "grad_norm": 0.8717568516731262, "learning_rate": 0.00011326674673806195, "loss": 5.2597, "step": 1298 }, { "epoch": 0.4676867686768677, "grad_norm": 1.1170207262039185, "learning_rate": 0.00011315255303583572, "loss": 5.1702, "step": 1299 }, { "epoch": 0.46804680468046805, "grad_norm": 1.6666972637176514, "learning_rate": 0.0001130383418778205, "loss": 5.3505, "step": 1300 }, { "epoch": 0.4684068406840684, "grad_norm": 4.088332653045654, "learning_rate": 0.0001129241134155949, "loss": 4.9151, "step": 1301 }, { "epoch": 0.46876687668766875, "grad_norm": 0.8962947130203247, "learning_rate": 0.00011280986780076057, "loss": 4.916, "step": 1302 }, { "epoch": 0.4691269126912691, "grad_norm": 0.748858630657196, "learning_rate": 0.0001126956051849418, "loss": 4.5858, "step": 1303 }, { "epoch": 0.4694869486948695, "grad_norm": 0.9193429350852966, "learning_rate": 0.00011258132571978555, "loss": 4.6093, "step": 1304 }, { "epoch": 0.46984698469846986, "grad_norm": 0.9823437929153442, "learning_rate": 0.00011246702955696106, "loss": 4.849, "step": 1305 }, { "epoch": 0.4702070207020702, "grad_norm": 0.6782870292663574, "learning_rate": 0.0001123527168481598, "loss": 4.9946, "step": 1306 }, { "epoch": 0.47056705670567056, "grad_norm": 0.6508191823959351, "learning_rate": 0.00011223838774509514, "loss": 4.7743, "step": 1307 }, { "epoch": 0.4709270927092709, "grad_norm": 0.682228684425354, "learning_rate": 0.00011212404239950224, "loss": 4.7985, "step": 1308 }, { "epoch": 0.47128712871287126, "grad_norm": 1.045074224472046, "learning_rate": 0.00011200968096313787, "loss": 4.6407, "step": 1309 }, { "epoch": 0.47164716471647167, "grad_norm": 1.3870502710342407, "learning_rate": 0.00011189530358778005, "loss": 4.9701, "step": 1310 }, { "epoch": 0.472007200720072, "grad_norm": 0.7295395731925964, "learning_rate": 0.00011178091042522795, "loss": 4.372, "step": 1311 }, { "epoch": 0.47236723672367237, "grad_norm": 1.3397456407546997, "learning_rate": 0.00011166650162730188, "loss": 4.9535, "step": 1312 }, { "epoch": 0.4727272727272727, "grad_norm": 0.8281415700912476, "learning_rate": 0.00011155207734584263, "loss": 4.5739, "step": 1313 }, { "epoch": 0.47308730873087307, "grad_norm": 0.7725363969802856, "learning_rate": 0.00011143763773271178, "loss": 5.0986, "step": 1314 }, { "epoch": 0.4734473447344735, "grad_norm": 0.7773278951644897, "learning_rate": 0.00011132318293979109, "loss": 4.8597, "step": 1315 }, { "epoch": 0.4738073807380738, "grad_norm": 0.8446284532546997, "learning_rate": 0.00011120871311898254, "loss": 4.6486, "step": 1316 }, { "epoch": 0.4741674167416742, "grad_norm": 0.7795506119728088, "learning_rate": 0.00011109422842220805, "loss": 4.6422, "step": 1317 }, { "epoch": 0.4745274527452745, "grad_norm": 0.6184239387512207, "learning_rate": 0.0001109797290014093, "loss": 4.4654, "step": 1318 }, { "epoch": 0.4748874887488749, "grad_norm": 0.8903937339782715, "learning_rate": 0.00011086521500854745, "loss": 4.5689, "step": 1319 }, { "epoch": 0.4752475247524752, "grad_norm": 0.7741526365280151, "learning_rate": 0.00011075068659560308, "loss": 4.6233, "step": 1320 }, { "epoch": 0.47560756075607563, "grad_norm": 0.9469612836837769, "learning_rate": 0.00011063614391457582, "loss": 4.9393, "step": 1321 }, { "epoch": 0.475967596759676, "grad_norm": 1.173879623413086, "learning_rate": 0.00011052158711748434, "loss": 5.1219, "step": 1322 }, { "epoch": 0.47632763276327633, "grad_norm": 1.115415334701538, "learning_rate": 0.00011040701635636592, "loss": 4.6119, "step": 1323 }, { "epoch": 0.4766876687668767, "grad_norm": 1.5755536556243896, "learning_rate": 0.00011029243178327649, "loss": 5.3156, "step": 1324 }, { "epoch": 0.47704770477047703, "grad_norm": 1.1577731370925903, "learning_rate": 0.00011017783355029026, "loss": 5.1467, "step": 1325 }, { "epoch": 0.4774077407740774, "grad_norm": 1.519116759300232, "learning_rate": 0.00011006322180949953, "loss": 4.6894, "step": 1326 }, { "epoch": 0.4777677767776778, "grad_norm": 0.9527605772018433, "learning_rate": 0.00010994859671301462, "loss": 5.0449, "step": 1327 }, { "epoch": 0.47812781278127814, "grad_norm": 0.8860084414482117, "learning_rate": 0.00010983395841296348, "loss": 4.6926, "step": 1328 }, { "epoch": 0.4784878487848785, "grad_norm": 0.8542980551719666, "learning_rate": 0.00010971930706149167, "loss": 4.5156, "step": 1329 }, { "epoch": 0.47884788478847884, "grad_norm": 0.9299918413162231, "learning_rate": 0.00010960464281076197, "loss": 4.6254, "step": 1330 }, { "epoch": 0.4792079207920792, "grad_norm": 0.7832848429679871, "learning_rate": 0.00010948996581295436, "loss": 4.4019, "step": 1331 }, { "epoch": 0.4795679567956796, "grad_norm": 0.7857620120048523, "learning_rate": 0.00010937527622026575, "loss": 4.7156, "step": 1332 }, { "epoch": 0.47992799279927995, "grad_norm": 0.8863069415092468, "learning_rate": 0.00010926057418490971, "loss": 4.5967, "step": 1333 }, { "epoch": 0.4802880288028803, "grad_norm": 0.9793080687522888, "learning_rate": 0.00010914585985911632, "loss": 4.4778, "step": 1334 }, { "epoch": 0.48064806480648065, "grad_norm": 1.0404165983200073, "learning_rate": 0.00010903113339513205, "loss": 4.9781, "step": 1335 }, { "epoch": 0.481008100810081, "grad_norm": 1.001639723777771, "learning_rate": 0.00010891639494521935, "loss": 4.8293, "step": 1336 }, { "epoch": 0.48136813681368135, "grad_norm": 0.8920490145683289, "learning_rate": 0.00010880164466165674, "loss": 4.3999, "step": 1337 }, { "epoch": 0.48172817281728175, "grad_norm": 1.003096342086792, "learning_rate": 0.00010868688269673828, "loss": 5.01, "step": 1338 }, { "epoch": 0.4820882088208821, "grad_norm": 0.7430381774902344, "learning_rate": 0.00010857210920277366, "loss": 4.8974, "step": 1339 }, { "epoch": 0.48244824482448245, "grad_norm": 0.8446447849273682, "learning_rate": 0.00010845732433208779, "loss": 4.6448, "step": 1340 }, { "epoch": 0.4828082808280828, "grad_norm": 0.7498891353607178, "learning_rate": 0.00010834252823702076, "loss": 4.761, "step": 1341 }, { "epoch": 0.48316831683168315, "grad_norm": 0.6509808301925659, "learning_rate": 0.00010822772106992747, "loss": 4.7459, "step": 1342 }, { "epoch": 0.4835283528352835, "grad_norm": 0.9102056622505188, "learning_rate": 0.00010811290298317755, "loss": 4.9137, "step": 1343 }, { "epoch": 0.4838883888388839, "grad_norm": 1.0613189935684204, "learning_rate": 0.00010799807412915517, "loss": 4.3103, "step": 1344 }, { "epoch": 0.48424842484248426, "grad_norm": 1.0716426372528076, "learning_rate": 0.0001078832346602587, "loss": 5.0106, "step": 1345 }, { "epoch": 0.4846084608460846, "grad_norm": 0.8888649344444275, "learning_rate": 0.00010776838472890065, "loss": 4.9421, "step": 1346 }, { "epoch": 0.48496849684968496, "grad_norm": 0.7990186214447021, "learning_rate": 0.0001076535244875074, "loss": 5.0825, "step": 1347 }, { "epoch": 0.4853285328532853, "grad_norm": 1.1681544780731201, "learning_rate": 0.00010753865408851906, "loss": 4.9825, "step": 1348 }, { "epoch": 0.4856885688568857, "grad_norm": 1.141554355621338, "learning_rate": 0.00010742377368438914, "loss": 5.1929, "step": 1349 }, { "epoch": 0.48604860486048607, "grad_norm": 1.1723651885986328, "learning_rate": 0.00010730888342758454, "loss": 5.4049, "step": 1350 }, { "epoch": 0.4864086408640864, "grad_norm": 2.490889310836792, "learning_rate": 0.0001071939834705851, "loss": 4.6995, "step": 1351 }, { "epoch": 0.48676867686768677, "grad_norm": 0.8721453547477722, "learning_rate": 0.00010707907396588361, "loss": 4.7901, "step": 1352 }, { "epoch": 0.4871287128712871, "grad_norm": 1.2546683549880981, "learning_rate": 0.00010696415506598557, "loss": 4.5257, "step": 1353 }, { "epoch": 0.48748874887488747, "grad_norm": 0.9638644456863403, "learning_rate": 0.00010684922692340884, "loss": 4.6166, "step": 1354 }, { "epoch": 0.4878487848784879, "grad_norm": 0.8636690974235535, "learning_rate": 0.00010673428969068364, "loss": 4.9165, "step": 1355 }, { "epoch": 0.4882088208820882, "grad_norm": 1.7687629461288452, "learning_rate": 0.00010661934352035224, "loss": 4.5002, "step": 1356 }, { "epoch": 0.4885688568856886, "grad_norm": 0.9787132740020752, "learning_rate": 0.00010650438856496872, "loss": 4.5295, "step": 1357 }, { "epoch": 0.4889288928892889, "grad_norm": 0.8555500507354736, "learning_rate": 0.0001063894249770989, "loss": 4.7362, "step": 1358 }, { "epoch": 0.4892889288928893, "grad_norm": 1.4172734022140503, "learning_rate": 0.00010627445290931997, "loss": 4.7544, "step": 1359 }, { "epoch": 0.4896489648964896, "grad_norm": 1.0591694116592407, "learning_rate": 0.00010615947251422044, "loss": 4.5676, "step": 1360 }, { "epoch": 0.49000900090009003, "grad_norm": 1.2712838649749756, "learning_rate": 0.00010604448394439983, "loss": 5.1231, "step": 1361 }, { "epoch": 0.4903690369036904, "grad_norm": 1.0828771591186523, "learning_rate": 0.00010592948735246854, "loss": 4.7859, "step": 1362 }, { "epoch": 0.49072907290729073, "grad_norm": 1.1068415641784668, "learning_rate": 0.00010581448289104758, "loss": 5.0285, "step": 1363 }, { "epoch": 0.4910891089108911, "grad_norm": 0.7831496000289917, "learning_rate": 0.00010569947071276847, "loss": 4.5127, "step": 1364 }, { "epoch": 0.49144914491449143, "grad_norm": 0.9309321641921997, "learning_rate": 0.00010558445097027292, "loss": 4.8782, "step": 1365 }, { "epoch": 0.4918091809180918, "grad_norm": 0.8637677431106567, "learning_rate": 0.00010546942381621265, "loss": 4.4378, "step": 1366 }, { "epoch": 0.4921692169216922, "grad_norm": 0.7803555727005005, "learning_rate": 0.0001053543894032493, "loss": 4.681, "step": 1367 }, { "epoch": 0.49252925292529254, "grad_norm": 0.8252502083778381, "learning_rate": 0.00010523934788405407, "loss": 5.1739, "step": 1368 }, { "epoch": 0.4928892889288929, "grad_norm": 0.750989556312561, "learning_rate": 0.00010512429941130766, "loss": 4.6177, "step": 1369 }, { "epoch": 0.49324932493249324, "grad_norm": 0.8373405933380127, "learning_rate": 0.00010500924413769988, "loss": 4.8114, "step": 1370 }, { "epoch": 0.4936093609360936, "grad_norm": 0.764385998249054, "learning_rate": 0.00010489418221592973, "loss": 4.6901, "step": 1371 }, { "epoch": 0.493969396939694, "grad_norm": 1.1150025129318237, "learning_rate": 0.00010477911379870488, "loss": 5.0304, "step": 1372 }, { "epoch": 0.49432943294329434, "grad_norm": 1.9383710622787476, "learning_rate": 0.00010466403903874176, "loss": 5.4131, "step": 1373 }, { "epoch": 0.4946894689468947, "grad_norm": 1.0594481229782104, "learning_rate": 0.0001045489580887651, "loss": 4.9395, "step": 1374 }, { "epoch": 0.49504950495049505, "grad_norm": 1.1613411903381348, "learning_rate": 0.00010443387110150791, "loss": 5.1006, "step": 1375 }, { "epoch": 0.4954095409540954, "grad_norm": 2.1980786323547363, "learning_rate": 0.00010431877822971117, "loss": 5.0446, "step": 1376 }, { "epoch": 0.49576957695769575, "grad_norm": 0.950690746307373, "learning_rate": 0.00010420367962612372, "loss": 4.9041, "step": 1377 }, { "epoch": 0.49612961296129615, "grad_norm": 0.8650350570678711, "learning_rate": 0.00010408857544350194, "loss": 4.6469, "step": 1378 }, { "epoch": 0.4964896489648965, "grad_norm": 0.6349000930786133, "learning_rate": 0.00010397346583460971, "loss": 4.6103, "step": 1379 }, { "epoch": 0.49684968496849685, "grad_norm": 0.8492519855499268, "learning_rate": 0.00010385835095221803, "loss": 4.9419, "step": 1380 }, { "epoch": 0.4972097209720972, "grad_norm": 0.7640931606292725, "learning_rate": 0.00010374323094910496, "loss": 4.7148, "step": 1381 }, { "epoch": 0.49756975697569755, "grad_norm": 0.705634593963623, "learning_rate": 0.00010362810597805526, "loss": 4.7858, "step": 1382 }, { "epoch": 0.4979297929792979, "grad_norm": 0.640390932559967, "learning_rate": 0.0001035129761918604, "loss": 4.6866, "step": 1383 }, { "epoch": 0.4982898289828983, "grad_norm": 0.6646218299865723, "learning_rate": 0.0001033978417433182, "loss": 4.7531, "step": 1384 }, { "epoch": 0.49864986498649866, "grad_norm": 0.7338967323303223, "learning_rate": 0.00010328270278523256, "loss": 4.8981, "step": 1385 }, { "epoch": 0.499009900990099, "grad_norm": 0.8542543053627014, "learning_rate": 0.00010316755947041352, "loss": 4.7243, "step": 1386 }, { "epoch": 0.49936993699369936, "grad_norm": 0.9543262124061584, "learning_rate": 0.00010305241195167687, "loss": 4.5981, "step": 1387 }, { "epoch": 0.4997299729972997, "grad_norm": 1.9068870544433594, "learning_rate": 0.00010293726038184393, "loss": 5.2105, "step": 1388 }, { "epoch": 0.5000900090009001, "grad_norm": 0.8715320229530334, "learning_rate": 0.00010282210491374138, "loss": 4.6966, "step": 1389 }, { "epoch": 0.5004500450045004, "grad_norm": 0.6652578115463257, "learning_rate": 0.00010270694570020116, "loss": 4.7134, "step": 1390 }, { "epoch": 0.5004500450045004, "eval_loss": 4.819091796875, "eval_runtime": 100.2944, "eval_samples_per_second": 46.643, "eval_steps_per_second": 11.666, "step": 1390 }, { "epoch": 0.5008100810081008, "grad_norm": 0.826438844203949, "learning_rate": 0.00010259178289406011, "loss": 4.8383, "step": 1391 }, { "epoch": 0.5011701170117012, "grad_norm": 0.7669975757598877, "learning_rate": 0.00010247661664815986, "loss": 4.9241, "step": 1392 }, { "epoch": 0.5015301530153016, "grad_norm": 0.7355509996414185, "learning_rate": 0.00010236144711534661, "loss": 4.4544, "step": 1393 }, { "epoch": 0.5018901890189019, "grad_norm": 0.6872754693031311, "learning_rate": 0.0001022462744484709, "loss": 4.9361, "step": 1394 }, { "epoch": 0.5022502250225023, "grad_norm": 0.6951656937599182, "learning_rate": 0.00010213109880038747, "loss": 4.7553, "step": 1395 }, { "epoch": 0.5026102610261026, "grad_norm": 0.6541697382926941, "learning_rate": 0.00010201592032395499, "loss": 4.6221, "step": 1396 }, { "epoch": 0.502970297029703, "grad_norm": 0.729430079460144, "learning_rate": 0.00010190073917203589, "loss": 4.9455, "step": 1397 }, { "epoch": 0.5033303330333033, "grad_norm": 1.0041310787200928, "learning_rate": 0.00010178555549749613, "loss": 5.1839, "step": 1398 }, { "epoch": 0.5036903690369037, "grad_norm": 0.9455398321151733, "learning_rate": 0.00010167036945320503, "loss": 5.0098, "step": 1399 }, { "epoch": 0.504050405040504, "grad_norm": 1.5032222270965576, "learning_rate": 0.0001015551811920351, "loss": 5.4927, "step": 1400 }, { "epoch": 0.5044104410441044, "grad_norm": 0.9934758543968201, "learning_rate": 0.00010143999086686171, "loss": 5.1418, "step": 1401 }, { "epoch": 0.5047704770477047, "grad_norm": 1.0731993913650513, "learning_rate": 0.00010132479863056303, "loss": 5.1302, "step": 1402 }, { "epoch": 0.5051305130513052, "grad_norm": 0.7515692710876465, "learning_rate": 0.00010120960463601976, "loss": 4.5247, "step": 1403 }, { "epoch": 0.5054905490549055, "grad_norm": 1.0562608242034912, "learning_rate": 0.00010109440903611493, "loss": 4.9218, "step": 1404 }, { "epoch": 0.5058505850585059, "grad_norm": 0.7232738137245178, "learning_rate": 0.00010097921198373368, "loss": 5.0449, "step": 1405 }, { "epoch": 0.5062106210621062, "grad_norm": 1.5582194328308105, "learning_rate": 0.00010086401363176305, "loss": 4.9883, "step": 1406 }, { "epoch": 0.5065706570657066, "grad_norm": 0.5701386332511902, "learning_rate": 0.00010074881413309193, "loss": 4.7997, "step": 1407 }, { "epoch": 0.5069306930693069, "grad_norm": 0.8754087686538696, "learning_rate": 0.00010063361364061057, "loss": 4.5842, "step": 1408 }, { "epoch": 0.5072907290729073, "grad_norm": 0.9331681132316589, "learning_rate": 0.00010051841230721065, "loss": 4.5656, "step": 1409 }, { "epoch": 0.5076507650765076, "grad_norm": 1.0775038003921509, "learning_rate": 0.0001004032102857849, "loss": 4.5366, "step": 1410 }, { "epoch": 0.508010801080108, "grad_norm": 1.0820711851119995, "learning_rate": 0.00010028800772922706, "loss": 4.859, "step": 1411 }, { "epoch": 0.5083708370837083, "grad_norm": 0.8344745635986328, "learning_rate": 0.00010017280479043147, "loss": 4.7325, "step": 1412 }, { "epoch": 0.5087308730873087, "grad_norm": 0.9460582137107849, "learning_rate": 0.00010005760162229305, "loss": 4.9831, "step": 1413 }, { "epoch": 0.509090909090909, "grad_norm": 0.8958419561386108, "learning_rate": 9.994239837770699e-05, "loss": 5.0868, "step": 1414 }, { "epoch": 0.5094509450945095, "grad_norm": 0.792186975479126, "learning_rate": 9.982719520956855e-05, "loss": 4.7051, "step": 1415 }, { "epoch": 0.5098109810981098, "grad_norm": 0.7827296257019043, "learning_rate": 9.971199227077295e-05, "loss": 4.8934, "step": 1416 }, { "epoch": 0.5101710171017102, "grad_norm": 0.8142650723457336, "learning_rate": 9.959678971421508e-05, "loss": 4.4786, "step": 1417 }, { "epoch": 0.5105310531053106, "grad_norm": 0.9982710480690002, "learning_rate": 9.948158769278939e-05, "loss": 4.8992, "step": 1418 }, { "epoch": 0.5108910891089109, "grad_norm": 0.8093355298042297, "learning_rate": 9.936638635938945e-05, "loss": 4.9529, "step": 1419 }, { "epoch": 0.5112511251125113, "grad_norm": 0.6960091590881348, "learning_rate": 9.925118586690809e-05, "loss": 4.6388, "step": 1420 }, { "epoch": 0.5116111611161116, "grad_norm": 0.8927443623542786, "learning_rate": 9.913598636823693e-05, "loss": 4.7726, "step": 1421 }, { "epoch": 0.511971197119712, "grad_norm": 1.0828852653503418, "learning_rate": 9.902078801626636e-05, "loss": 5.1102, "step": 1422 }, { "epoch": 0.5123312331233123, "grad_norm": 0.8618959784507751, "learning_rate": 9.890559096388509e-05, "loss": 4.7913, "step": 1423 }, { "epoch": 0.5126912691269127, "grad_norm": 1.5530176162719727, "learning_rate": 9.879039536398024e-05, "loss": 4.9746, "step": 1424 }, { "epoch": 0.513051305130513, "grad_norm": 1.6995723247528076, "learning_rate": 9.867520136943698e-05, "loss": 4.9776, "step": 1425 }, { "epoch": 0.5134113411341135, "grad_norm": 1.1592552661895752, "learning_rate": 9.856000913313832e-05, "loss": 4.7069, "step": 1426 }, { "epoch": 0.5137713771377138, "grad_norm": 0.8983449339866638, "learning_rate": 9.844481880796491e-05, "loss": 4.7614, "step": 1427 }, { "epoch": 0.5141314131413142, "grad_norm": 0.6931654810905457, "learning_rate": 9.832963054679497e-05, "loss": 4.7884, "step": 1428 }, { "epoch": 0.5144914491449145, "grad_norm": 1.0931594371795654, "learning_rate": 9.821444450250392e-05, "loss": 4.7423, "step": 1429 }, { "epoch": 0.5148514851485149, "grad_norm": 0.8028708696365356, "learning_rate": 9.809926082796415e-05, "loss": 4.932, "step": 1430 }, { "epoch": 0.5152115211521152, "grad_norm": 0.5453850030899048, "learning_rate": 9.798407967604502e-05, "loss": 4.886, "step": 1431 }, { "epoch": 0.5155715571557156, "grad_norm": 0.8439428806304932, "learning_rate": 9.786890119961253e-05, "loss": 4.9247, "step": 1432 }, { "epoch": 0.5159315931593159, "grad_norm": 0.7233529686927795, "learning_rate": 9.775372555152912e-05, "loss": 4.9305, "step": 1433 }, { "epoch": 0.5162916291629163, "grad_norm": 0.6636310815811157, "learning_rate": 9.763855288465341e-05, "loss": 4.5924, "step": 1434 }, { "epoch": 0.5166516651665166, "grad_norm": 0.6490011215209961, "learning_rate": 9.752338335184015e-05, "loss": 4.6092, "step": 1435 }, { "epoch": 0.517011701170117, "grad_norm": 0.6125934720039368, "learning_rate": 9.740821710593989e-05, "loss": 4.725, "step": 1436 }, { "epoch": 0.5173717371737173, "grad_norm": 0.8968194127082825, "learning_rate": 9.729305429979887e-05, "loss": 4.8702, "step": 1437 }, { "epoch": 0.5177317731773178, "grad_norm": 0.9133161902427673, "learning_rate": 9.717789508625865e-05, "loss": 4.7776, "step": 1438 }, { "epoch": 0.5180918091809181, "grad_norm": 0.7110667824745178, "learning_rate": 9.70627396181561e-05, "loss": 4.5551, "step": 1439 }, { "epoch": 0.5184518451845185, "grad_norm": 0.7550270557403564, "learning_rate": 9.694758804832314e-05, "loss": 4.9471, "step": 1440 }, { "epoch": 0.5188118811881188, "grad_norm": 0.8355233669281006, "learning_rate": 9.68324405295865e-05, "loss": 4.6401, "step": 1441 }, { "epoch": 0.5191719171917192, "grad_norm": 1.0176023244857788, "learning_rate": 9.671729721476746e-05, "loss": 4.7754, "step": 1442 }, { "epoch": 0.5195319531953195, "grad_norm": 0.7926576733589172, "learning_rate": 9.660215825668184e-05, "loss": 4.7933, "step": 1443 }, { "epoch": 0.5198919891989199, "grad_norm": 0.8102412819862366, "learning_rate": 9.648702380813958e-05, "loss": 4.9955, "step": 1444 }, { "epoch": 0.5202520252025202, "grad_norm": 0.7579959034919739, "learning_rate": 9.637189402194476e-05, "loss": 4.5224, "step": 1445 }, { "epoch": 0.5206120612061206, "grad_norm": 1.1166945695877075, "learning_rate": 9.625676905089506e-05, "loss": 5.0929, "step": 1446 }, { "epoch": 0.5209720972097209, "grad_norm": 0.7166795134544373, "learning_rate": 9.614164904778196e-05, "loss": 4.9213, "step": 1447 }, { "epoch": 0.5213321332133213, "grad_norm": 1.4086633920669556, "learning_rate": 9.602653416539031e-05, "loss": 4.9692, "step": 1448 }, { "epoch": 0.5216921692169217, "grad_norm": 0.8931282162666321, "learning_rate": 9.591142455649808e-05, "loss": 4.9206, "step": 1449 }, { "epoch": 0.5220522052205221, "grad_norm": 1.0117639303207397, "learning_rate": 9.579632037387632e-05, "loss": 5.0037, "step": 1450 }, { "epoch": 0.5224122412241224, "grad_norm": 1.4104924201965332, "learning_rate": 9.568122177028884e-05, "loss": 4.6522, "step": 1451 }, { "epoch": 0.5227722772277228, "grad_norm": 0.8898065090179443, "learning_rate": 9.556612889849214e-05, "loss": 4.4271, "step": 1452 }, { "epoch": 0.5231323132313231, "grad_norm": 1.3225266933441162, "learning_rate": 9.545104191123493e-05, "loss": 5.0652, "step": 1453 }, { "epoch": 0.5234923492349235, "grad_norm": 0.5962294340133667, "learning_rate": 9.533596096125825e-05, "loss": 4.9742, "step": 1454 }, { "epoch": 0.5238523852385238, "grad_norm": 0.6631790399551392, "learning_rate": 9.522088620129511e-05, "loss": 4.4972, "step": 1455 }, { "epoch": 0.5242124212421242, "grad_norm": 0.7971886992454529, "learning_rate": 9.510581778407031e-05, "loss": 4.8759, "step": 1456 }, { "epoch": 0.5245724572457245, "grad_norm": 0.8555065989494324, "learning_rate": 9.499075586230013e-05, "loss": 4.6214, "step": 1457 }, { "epoch": 0.5249324932493249, "grad_norm": 0.7049931883811951, "learning_rate": 9.487570058869237e-05, "loss": 4.3151, "step": 1458 }, { "epoch": 0.5252925292529252, "grad_norm": 0.6889054775238037, "learning_rate": 9.476065211594593e-05, "loss": 4.5404, "step": 1459 }, { "epoch": 0.5256525652565257, "grad_norm": 0.7169436812400818, "learning_rate": 9.464561059675073e-05, "loss": 4.4303, "step": 1460 }, { "epoch": 0.5260126012601261, "grad_norm": 0.6740374565124512, "learning_rate": 9.453057618378737e-05, "loss": 4.5571, "step": 1461 }, { "epoch": 0.5263726372637264, "grad_norm": 1.2154463529586792, "learning_rate": 9.44155490297271e-05, "loss": 4.6135, "step": 1462 }, { "epoch": 0.5267326732673268, "grad_norm": 0.7495080828666687, "learning_rate": 9.430052928723153e-05, "loss": 4.4763, "step": 1463 }, { "epoch": 0.5270927092709271, "grad_norm": 1.2248061895370483, "learning_rate": 9.418551710895243e-05, "loss": 4.655, "step": 1464 }, { "epoch": 0.5274527452745275, "grad_norm": 0.7806352376937866, "learning_rate": 9.407051264753147e-05, "loss": 4.8002, "step": 1465 }, { "epoch": 0.5278127812781278, "grad_norm": 0.720514714717865, "learning_rate": 9.395551605560018e-05, "loss": 4.7797, "step": 1466 }, { "epoch": 0.5281728172817282, "grad_norm": 0.8030894994735718, "learning_rate": 9.38405274857796e-05, "loss": 4.4738, "step": 1467 }, { "epoch": 0.5285328532853285, "grad_norm": 0.9285864233970642, "learning_rate": 9.372554709068005e-05, "loss": 4.8816, "step": 1468 }, { "epoch": 0.5288928892889289, "grad_norm": 0.7847463488578796, "learning_rate": 9.361057502290113e-05, "loss": 4.8936, "step": 1469 }, { "epoch": 0.5292529252925292, "grad_norm": 0.866493821144104, "learning_rate": 9.349561143503128e-05, "loss": 4.6361, "step": 1470 }, { "epoch": 0.5296129612961296, "grad_norm": 0.8414556384086609, "learning_rate": 9.338065647964779e-05, "loss": 4.3976, "step": 1471 }, { "epoch": 0.52997299729973, "grad_norm": 0.7641831040382385, "learning_rate": 9.326571030931637e-05, "loss": 5.205, "step": 1472 }, { "epoch": 0.5303330333033304, "grad_norm": 1.0077167749404907, "learning_rate": 9.315077307659117e-05, "loss": 5.0721, "step": 1473 }, { "epoch": 0.5306930693069307, "grad_norm": 0.9120255708694458, "learning_rate": 9.303584493401444e-05, "loss": 4.7782, "step": 1474 }, { "epoch": 0.5310531053105311, "grad_norm": 1.3254908323287964, "learning_rate": 9.292092603411641e-05, "loss": 5.464, "step": 1475 }, { "epoch": 0.5314131413141314, "grad_norm": 1.8736522197723389, "learning_rate": 9.280601652941494e-05, "loss": 5.1757, "step": 1476 }, { "epoch": 0.5317731773177318, "grad_norm": 0.9066369533538818, "learning_rate": 9.269111657241548e-05, "loss": 4.8926, "step": 1477 }, { "epoch": 0.5321332133213321, "grad_norm": 1.7387737035751343, "learning_rate": 9.257622631561085e-05, "loss": 4.7481, "step": 1478 }, { "epoch": 0.5324932493249325, "grad_norm": 0.750296950340271, "learning_rate": 9.246134591148099e-05, "loss": 4.2386, "step": 1479 }, { "epoch": 0.5328532853285328, "grad_norm": 1.1397596597671509, "learning_rate": 9.234647551249261e-05, "loss": 4.9964, "step": 1480 }, { "epoch": 0.5332133213321332, "grad_norm": 0.8775277137756348, "learning_rate": 9.223161527109937e-05, "loss": 4.7732, "step": 1481 }, { "epoch": 0.5335733573357335, "grad_norm": 0.9205058217048645, "learning_rate": 9.211676533974131e-05, "loss": 4.5175, "step": 1482 }, { "epoch": 0.533933393339334, "grad_norm": 1.167930245399475, "learning_rate": 9.200192587084488e-05, "loss": 5.0832, "step": 1483 }, { "epoch": 0.5342934293429343, "grad_norm": 0.6535282135009766, "learning_rate": 9.188709701682247e-05, "loss": 4.5957, "step": 1484 }, { "epoch": 0.5346534653465347, "grad_norm": 0.9349364042282104, "learning_rate": 9.177227893007254e-05, "loss": 4.6587, "step": 1485 }, { "epoch": 0.535013501350135, "grad_norm": 0.8866311311721802, "learning_rate": 9.165747176297929e-05, "loss": 4.697, "step": 1486 }, { "epoch": 0.5353735373537354, "grad_norm": 1.0513415336608887, "learning_rate": 9.154267566791223e-05, "loss": 4.7072, "step": 1487 }, { "epoch": 0.5357335733573357, "grad_norm": 0.7274541258811951, "learning_rate": 9.142789079722638e-05, "loss": 4.3303, "step": 1488 }, { "epoch": 0.5360936093609361, "grad_norm": 0.845374345779419, "learning_rate": 9.131311730326172e-05, "loss": 4.6276, "step": 1489 }, { "epoch": 0.5364536453645364, "grad_norm": 0.7885565161705017, "learning_rate": 9.119835533834331e-05, "loss": 4.8416, "step": 1490 }, { "epoch": 0.5368136813681368, "grad_norm": 0.8569223284721375, "learning_rate": 9.108360505478066e-05, "loss": 4.6285, "step": 1491 }, { "epoch": 0.5371737173717371, "grad_norm": 0.762844443321228, "learning_rate": 9.096886660486797e-05, "loss": 4.8427, "step": 1492 }, { "epoch": 0.5375337533753375, "grad_norm": 0.8421863913536072, "learning_rate": 9.085414014088369e-05, "loss": 4.905, "step": 1493 }, { "epoch": 0.537893789378938, "grad_norm": 0.7791987061500549, "learning_rate": 9.073942581509034e-05, "loss": 4.7764, "step": 1494 }, { "epoch": 0.5382538253825383, "grad_norm": 0.9516592621803284, "learning_rate": 9.062472377973427e-05, "loss": 4.7255, "step": 1495 }, { "epoch": 0.5386138613861386, "grad_norm": 0.7316043376922607, "learning_rate": 9.051003418704565e-05, "loss": 4.7619, "step": 1496 }, { "epoch": 0.538973897389739, "grad_norm": 0.9737268686294556, "learning_rate": 9.039535718923804e-05, "loss": 4.8169, "step": 1497 }, { "epoch": 0.5393339333933393, "grad_norm": 1.1187301874160767, "learning_rate": 9.028069293850838e-05, "loss": 4.9653, "step": 1498 }, { "epoch": 0.5396939693969397, "grad_norm": 1.5251964330673218, "learning_rate": 9.016604158703654e-05, "loss": 5.3705, "step": 1499 }, { "epoch": 0.54005400540054, "grad_norm": 1.3677904605865479, "learning_rate": 9.005140328698539e-05, "loss": 5.3299, "step": 1500 }, { "epoch": 0.5404140414041404, "grad_norm": 2.879427194595337, "learning_rate": 8.993677819050046e-05, "loss": 5.0086, "step": 1501 }, { "epoch": 0.5407740774077407, "grad_norm": 0.67740797996521, "learning_rate": 8.982216644970979e-05, "loss": 4.842, "step": 1502 }, { "epoch": 0.5411341134113411, "grad_norm": 0.6215454339981079, "learning_rate": 8.970756821672352e-05, "loss": 4.819, "step": 1503 }, { "epoch": 0.5414941494149415, "grad_norm": 0.7233485579490662, "learning_rate": 8.95929836436341e-05, "loss": 5.1138, "step": 1504 }, { "epoch": 0.5418541854185418, "grad_norm": 0.8734248280525208, "learning_rate": 8.947841288251568e-05, "loss": 4.5471, "step": 1505 }, { "epoch": 0.5422142214221423, "grad_norm": 0.7609754800796509, "learning_rate": 8.93638560854242e-05, "loss": 4.5867, "step": 1506 }, { "epoch": 0.5425742574257426, "grad_norm": 0.591051459312439, "learning_rate": 8.924931340439694e-05, "loss": 4.7519, "step": 1507 }, { "epoch": 0.542934293429343, "grad_norm": 0.7521675229072571, "learning_rate": 8.913478499145254e-05, "loss": 4.9931, "step": 1508 }, { "epoch": 0.5432943294329433, "grad_norm": 0.7421920895576477, "learning_rate": 8.902027099859074e-05, "loss": 4.9076, "step": 1509 }, { "epoch": 0.5436543654365437, "grad_norm": 0.732514500617981, "learning_rate": 8.890577157779198e-05, "loss": 4.9947, "step": 1510 }, { "epoch": 0.544014401440144, "grad_norm": 0.872582733631134, "learning_rate": 8.879128688101749e-05, "loss": 4.4497, "step": 1511 }, { "epoch": 0.5443744374437444, "grad_norm": 0.6524242162704468, "learning_rate": 8.867681706020894e-05, "loss": 4.8583, "step": 1512 }, { "epoch": 0.5447344734473447, "grad_norm": 0.7136997580528259, "learning_rate": 8.856236226728825e-05, "loss": 4.7893, "step": 1513 }, { "epoch": 0.5450945094509451, "grad_norm": 0.6949340105056763, "learning_rate": 8.844792265415738e-05, "loss": 4.5592, "step": 1514 }, { "epoch": 0.5454545454545454, "grad_norm": 0.6542133092880249, "learning_rate": 8.833349837269814e-05, "loss": 4.6849, "step": 1515 }, { "epoch": 0.5458145814581458, "grad_norm": 0.9554135203361511, "learning_rate": 8.821908957477203e-05, "loss": 4.9527, "step": 1516 }, { "epoch": 0.5461746174617462, "grad_norm": 0.7612982988357544, "learning_rate": 8.810469641222001e-05, "loss": 5.1376, "step": 1517 }, { "epoch": 0.5465346534653466, "grad_norm": 0.6914458274841309, "learning_rate": 8.799031903686217e-05, "loss": 4.7025, "step": 1518 }, { "epoch": 0.5468946894689469, "grad_norm": 0.5740631818771362, "learning_rate": 8.787595760049777e-05, "loss": 4.4563, "step": 1519 }, { "epoch": 0.5472547254725473, "grad_norm": 0.7158836126327515, "learning_rate": 8.776161225490489e-05, "loss": 4.5597, "step": 1520 }, { "epoch": 0.5476147614761476, "grad_norm": 0.6493854522705078, "learning_rate": 8.764728315184024e-05, "loss": 4.9063, "step": 1521 }, { "epoch": 0.547974797479748, "grad_norm": 0.7967309951782227, "learning_rate": 8.753297044303896e-05, "loss": 4.7724, "step": 1522 }, { "epoch": 0.5483348334833483, "grad_norm": 0.8738707900047302, "learning_rate": 8.741867428021446e-05, "loss": 5.0162, "step": 1523 }, { "epoch": 0.5486948694869487, "grad_norm": 1.0331279039382935, "learning_rate": 8.73043948150582e-05, "loss": 4.8083, "step": 1524 }, { "epoch": 0.549054905490549, "grad_norm": 1.4867066144943237, "learning_rate": 8.719013219923947e-05, "loss": 5.3772, "step": 1525 }, { "epoch": 0.5494149414941494, "grad_norm": 0.9216363430023193, "learning_rate": 8.707588658440511e-05, "loss": 4.8583, "step": 1526 }, { "epoch": 0.5497749774977497, "grad_norm": 0.6520406007766724, "learning_rate": 8.696165812217953e-05, "loss": 4.735, "step": 1527 }, { "epoch": 0.5501350135013502, "grad_norm": 0.742540717124939, "learning_rate": 8.684744696416432e-05, "loss": 5.052, "step": 1528 }, { "epoch": 0.5504950495049505, "grad_norm": 0.7773281335830688, "learning_rate": 8.673325326193806e-05, "loss": 4.7007, "step": 1529 }, { "epoch": 0.5508550855085509, "grad_norm": 0.7746535539627075, "learning_rate": 8.661907716705627e-05, "loss": 4.6244, "step": 1530 }, { "epoch": 0.5512151215121512, "grad_norm": 0.7446948885917664, "learning_rate": 8.650491883105097e-05, "loss": 4.863, "step": 1531 }, { "epoch": 0.5515751575157516, "grad_norm": 0.714407205581665, "learning_rate": 8.639077840543077e-05, "loss": 4.8291, "step": 1532 }, { "epoch": 0.5519351935193519, "grad_norm": 0.9697020053863525, "learning_rate": 8.627665604168032e-05, "loss": 4.6392, "step": 1533 }, { "epoch": 0.5522952295229523, "grad_norm": 0.7405415773391724, "learning_rate": 8.616255189126043e-05, "loss": 4.5928, "step": 1534 }, { "epoch": 0.5526552655265526, "grad_norm": 0.6478596329689026, "learning_rate": 8.604846610560771e-05, "loss": 4.7173, "step": 1535 }, { "epoch": 0.553015301530153, "grad_norm": 0.6404251456260681, "learning_rate": 8.593439883613441e-05, "loss": 4.453, "step": 1536 }, { "epoch": 0.5533753375337533, "grad_norm": 0.6942550539970398, "learning_rate": 8.582035023422815e-05, "loss": 4.7839, "step": 1537 }, { "epoch": 0.5537353735373537, "grad_norm": 0.7156291604042053, "learning_rate": 8.570632045125185e-05, "loss": 4.9326, "step": 1538 }, { "epoch": 0.554095409540954, "grad_norm": 0.7970767617225647, "learning_rate": 8.559230963854338e-05, "loss": 4.7284, "step": 1539 }, { "epoch": 0.5544554455445545, "grad_norm": 0.7158586382865906, "learning_rate": 8.547831794741552e-05, "loss": 4.6065, "step": 1540 }, { "epoch": 0.5548154815481549, "grad_norm": 0.5735970735549927, "learning_rate": 8.536434552915556e-05, "loss": 4.7163, "step": 1541 }, { "epoch": 0.5551755175517552, "grad_norm": 0.5961145758628845, "learning_rate": 8.525039253502529e-05, "loss": 4.4807, "step": 1542 }, { "epoch": 0.5555355535553556, "grad_norm": 1.0740234851837158, "learning_rate": 8.513645911626071e-05, "loss": 4.8928, "step": 1543 }, { "epoch": 0.5558955895589559, "grad_norm": 0.8574578762054443, "learning_rate": 8.502254542407186e-05, "loss": 4.8314, "step": 1544 }, { "epoch": 0.5562556255625563, "grad_norm": 0.6801769733428955, "learning_rate": 8.490865160964253e-05, "loss": 4.8751, "step": 1545 }, { "epoch": 0.5566156615661566, "grad_norm": 1.083175778388977, "learning_rate": 8.47947778241302e-05, "loss": 4.8205, "step": 1546 }, { "epoch": 0.556975697569757, "grad_norm": 0.8133716583251953, "learning_rate": 8.468092421866573e-05, "loss": 4.8413, "step": 1547 }, { "epoch": 0.5573357335733573, "grad_norm": 0.8513250350952148, "learning_rate": 8.45670909443532e-05, "loss": 5.0853, "step": 1548 }, { "epoch": 0.5576957695769577, "grad_norm": 1.1919034719467163, "learning_rate": 8.445327815226969e-05, "loss": 5.0187, "step": 1549 }, { "epoch": 0.558055805580558, "grad_norm": 1.377685308456421, "learning_rate": 8.433948599346516e-05, "loss": 5.1966, "step": 1550 }, { "epoch": 0.5584158415841585, "grad_norm": 0.9856991767883301, "learning_rate": 8.422571461896215e-05, "loss": 4.9112, "step": 1551 }, { "epoch": 0.5587758775877588, "grad_norm": 1.103893756866455, "learning_rate": 8.411196417975558e-05, "loss": 4.6929, "step": 1552 }, { "epoch": 0.5591359135913592, "grad_norm": 0.7098972201347351, "learning_rate": 8.399823482681262e-05, "loss": 4.8045, "step": 1553 }, { "epoch": 0.5594959495949595, "grad_norm": 0.8016402721405029, "learning_rate": 8.388452671107246e-05, "loss": 4.9784, "step": 1554 }, { "epoch": 0.5598559855985599, "grad_norm": 0.8243830800056458, "learning_rate": 8.377083998344615e-05, "loss": 4.7894, "step": 1555 }, { "epoch": 0.5602160216021602, "grad_norm": 0.708741307258606, "learning_rate": 8.36571747948162e-05, "loss": 4.8073, "step": 1556 }, { "epoch": 0.5605760576057606, "grad_norm": 1.3110893964767456, "learning_rate": 8.354353129603668e-05, "loss": 4.9727, "step": 1557 }, { "epoch": 0.5609360936093609, "grad_norm": 0.8679048418998718, "learning_rate": 8.342990963793283e-05, "loss": 4.5692, "step": 1558 }, { "epoch": 0.5612961296129613, "grad_norm": 0.9281949400901794, "learning_rate": 8.33163099713009e-05, "loss": 5.0668, "step": 1559 }, { "epoch": 0.5616561656165616, "grad_norm": 0.9578452110290527, "learning_rate": 8.320273244690796e-05, "loss": 5.0065, "step": 1560 }, { "epoch": 0.562016201620162, "grad_norm": 0.8283352851867676, "learning_rate": 8.308917721549167e-05, "loss": 4.8786, "step": 1561 }, { "epoch": 0.5623762376237624, "grad_norm": 0.9560914635658264, "learning_rate": 8.297564442776014e-05, "loss": 4.6688, "step": 1562 }, { "epoch": 0.5627362736273628, "grad_norm": 0.966285765171051, "learning_rate": 8.286213423439169e-05, "loss": 4.6505, "step": 1563 }, { "epoch": 0.5630963096309631, "grad_norm": 0.6457225680351257, "learning_rate": 8.274864678603458e-05, "loss": 4.7324, "step": 1564 }, { "epoch": 0.5634563456345635, "grad_norm": 0.9832311272621155, "learning_rate": 8.263518223330697e-05, "loss": 4.2546, "step": 1565 }, { "epoch": 0.5638163816381638, "grad_norm": 0.5678200125694275, "learning_rate": 8.252174072679661e-05, "loss": 4.8263, "step": 1566 }, { "epoch": 0.5641764176417642, "grad_norm": 0.6344163417816162, "learning_rate": 8.240832241706068e-05, "loss": 4.6787, "step": 1567 }, { "epoch": 0.5645364536453645, "grad_norm": 0.5460944771766663, "learning_rate": 8.22949274546255e-05, "loss": 4.6593, "step": 1568 }, { "epoch": 0.5648964896489649, "grad_norm": 0.9370971322059631, "learning_rate": 8.218155598998648e-05, "loss": 5.0046, "step": 1569 }, { "epoch": 0.5652565256525652, "grad_norm": 0.7465395331382751, "learning_rate": 8.206820817360787e-05, "loss": 4.6039, "step": 1570 }, { "epoch": 0.5656165616561656, "grad_norm": 0.6622176170349121, "learning_rate": 8.195488415592238e-05, "loss": 4.6292, "step": 1571 }, { "epoch": 0.5659765976597659, "grad_norm": 0.7051721811294556, "learning_rate": 8.184158408733131e-05, "loss": 4.7648, "step": 1572 }, { "epoch": 0.5663366336633663, "grad_norm": 0.7271416783332825, "learning_rate": 8.172830811820407e-05, "loss": 5.0451, "step": 1573 }, { "epoch": 0.5666966696669667, "grad_norm": 0.9978146553039551, "learning_rate": 8.161505639887817e-05, "loss": 5.4136, "step": 1574 }, { "epoch": 0.5670567056705671, "grad_norm": 1.500651240348816, "learning_rate": 8.150182907965883e-05, "loss": 5.3961, "step": 1575 }, { "epoch": 0.5674167416741674, "grad_norm": 1.3992029428482056, "learning_rate": 8.138862631081896e-05, "loss": 4.9493, "step": 1576 }, { "epoch": 0.5677767776777678, "grad_norm": 1.5307037830352783, "learning_rate": 8.127544824259889e-05, "loss": 5.277, "step": 1577 }, { "epoch": 0.5681368136813681, "grad_norm": 0.7250295281410217, "learning_rate": 8.116229502520618e-05, "loss": 4.5189, "step": 1578 }, { "epoch": 0.5684968496849685, "grad_norm": 0.6795329451560974, "learning_rate": 8.104916680881527e-05, "loss": 4.862, "step": 1579 }, { "epoch": 0.5688568856885688, "grad_norm": 0.5743387341499329, "learning_rate": 8.093606374356759e-05, "loss": 4.8536, "step": 1580 }, { "epoch": 0.5692169216921692, "grad_norm": 0.700360119342804, "learning_rate": 8.082298597957112e-05, "loss": 4.7792, "step": 1581 }, { "epoch": 0.5695769576957695, "grad_norm": 0.6575736999511719, "learning_rate": 8.070993366690029e-05, "loss": 5.0423, "step": 1582 }, { "epoch": 0.5699369936993699, "grad_norm": 0.7689267992973328, "learning_rate": 8.059690695559568e-05, "loss": 4.5819, "step": 1583 }, { "epoch": 0.5702970297029702, "grad_norm": 0.7079759836196899, "learning_rate": 8.048390599566397e-05, "loss": 4.6153, "step": 1584 }, { "epoch": 0.5706570657065707, "grad_norm": 0.5983802080154419, "learning_rate": 8.037093093707763e-05, "loss": 4.7194, "step": 1585 }, { "epoch": 0.5710171017101711, "grad_norm": 0.8288666009902954, "learning_rate": 8.025798192977481e-05, "loss": 4.4559, "step": 1586 }, { "epoch": 0.5713771377137714, "grad_norm": 0.82045978307724, "learning_rate": 8.014505912365893e-05, "loss": 4.321, "step": 1587 }, { "epoch": 0.5717371737173718, "grad_norm": 0.5901921391487122, "learning_rate": 8.003216266859877e-05, "loss": 4.4723, "step": 1588 }, { "epoch": 0.5720972097209721, "grad_norm": 0.5934436917304993, "learning_rate": 7.991929271442817e-05, "loss": 4.722, "step": 1589 }, { "epoch": 0.5724572457245725, "grad_norm": 0.6561322212219238, "learning_rate": 7.980644941094566e-05, "loss": 5.1444, "step": 1590 }, { "epoch": 0.5728172817281728, "grad_norm": 0.532434344291687, "learning_rate": 7.969363290791451e-05, "loss": 4.544, "step": 1591 }, { "epoch": 0.5731773177317732, "grad_norm": 0.5906174778938293, "learning_rate": 7.958084335506239e-05, "loss": 4.6771, "step": 1592 }, { "epoch": 0.5735373537353735, "grad_norm": 0.8805077075958252, "learning_rate": 7.946808090208122e-05, "loss": 4.8108, "step": 1593 }, { "epoch": 0.5738973897389739, "grad_norm": 0.6874720454216003, "learning_rate": 7.935534569862686e-05, "loss": 4.5281, "step": 1594 }, { "epoch": 0.5742574257425742, "grad_norm": 1.0771909952163696, "learning_rate": 7.924263789431912e-05, "loss": 4.9165, "step": 1595 }, { "epoch": 0.5746174617461746, "grad_norm": 1.0937650203704834, "learning_rate": 7.912995763874143e-05, "loss": 5.0992, "step": 1596 }, { "epoch": 0.574977497749775, "grad_norm": 1.5542136430740356, "learning_rate": 7.90173050814406e-05, "loss": 5.1426, "step": 1597 }, { "epoch": 0.5753375337533754, "grad_norm": 1.1191812753677368, "learning_rate": 7.89046803719267e-05, "loss": 5.2826, "step": 1598 }, { "epoch": 0.5756975697569757, "grad_norm": 0.9704378843307495, "learning_rate": 7.879208365967287e-05, "loss": 5.1034, "step": 1599 }, { "epoch": 0.5760576057605761, "grad_norm": 1.3196125030517578, "learning_rate": 7.867951509411506e-05, "loss": 5.1528, "step": 1600 }, { "epoch": 0.5764176417641764, "grad_norm": 2.333880662918091, "learning_rate": 7.856697482465196e-05, "loss": 4.9852, "step": 1601 }, { "epoch": 0.5767776777677768, "grad_norm": 1.0826363563537598, "learning_rate": 7.84544630006445e-05, "loss": 4.8263, "step": 1602 }, { "epoch": 0.5771377137713771, "grad_norm": 0.980603814125061, "learning_rate": 7.834197977141603e-05, "loss": 4.4921, "step": 1603 }, { "epoch": 0.5774977497749775, "grad_norm": 0.6565625071525574, "learning_rate": 7.822952528625191e-05, "loss": 4.7848, "step": 1604 }, { "epoch": 0.5778577857785778, "grad_norm": 0.9467577934265137, "learning_rate": 7.811709969439938e-05, "loss": 4.6023, "step": 1605 }, { "epoch": 0.5782178217821782, "grad_norm": 0.7946081161499023, "learning_rate": 7.800470314506724e-05, "loss": 4.543, "step": 1606 }, { "epoch": 0.5785778577857785, "grad_norm": 0.9473694562911987, "learning_rate": 7.789233578742582e-05, "loss": 5.1386, "step": 1607 }, { "epoch": 0.578937893789379, "grad_norm": 0.7135196924209595, "learning_rate": 7.77799977706067e-05, "loss": 4.8152, "step": 1608 }, { "epoch": 0.5792979297929793, "grad_norm": 0.7521832585334778, "learning_rate": 7.766768924370254e-05, "loss": 4.65, "step": 1609 }, { "epoch": 0.5796579657965797, "grad_norm": 0.7366710305213928, "learning_rate": 7.755541035576677e-05, "loss": 4.448, "step": 1610 }, { "epoch": 0.58001800180018, "grad_norm": 0.9552801847457886, "learning_rate": 7.744316125581355e-05, "loss": 4.5018, "step": 1611 }, { "epoch": 0.5803780378037804, "grad_norm": 0.6024777293205261, "learning_rate": 7.733094209281756e-05, "loss": 4.7512, "step": 1612 }, { "epoch": 0.5807380738073807, "grad_norm": 1.195746898651123, "learning_rate": 7.721875301571359e-05, "loss": 4.8037, "step": 1613 }, { "epoch": 0.5810981098109811, "grad_norm": 0.8305825591087341, "learning_rate": 7.71065941733967e-05, "loss": 4.6396, "step": 1614 }, { "epoch": 0.5814581458145814, "grad_norm": 0.937282145023346, "learning_rate": 7.699446571472166e-05, "loss": 4.4877, "step": 1615 }, { "epoch": 0.5818181818181818, "grad_norm": 0.7361468076705933, "learning_rate": 7.688236778850306e-05, "loss": 4.604, "step": 1616 }, { "epoch": 0.5821782178217821, "grad_norm": 0.985714852809906, "learning_rate": 7.677030054351477e-05, "loss": 4.7311, "step": 1617 }, { "epoch": 0.5825382538253825, "grad_norm": 0.647030234336853, "learning_rate": 7.665826412849013e-05, "loss": 4.9563, "step": 1618 }, { "epoch": 0.582898289828983, "grad_norm": 0.583246648311615, "learning_rate": 7.654625869212146e-05, "loss": 4.4818, "step": 1619 }, { "epoch": 0.5832583258325833, "grad_norm": 0.9955174326896667, "learning_rate": 7.643428438306004e-05, "loss": 4.7674, "step": 1620 }, { "epoch": 0.5836183618361837, "grad_norm": 0.9936240911483765, "learning_rate": 7.632234134991575e-05, "loss": 4.5255, "step": 1621 }, { "epoch": 0.583978397839784, "grad_norm": 0.9111624360084534, "learning_rate": 7.6210429741257e-05, "loss": 4.7815, "step": 1622 }, { "epoch": 0.5843384338433844, "grad_norm": 0.8694493174552917, "learning_rate": 7.609854970561053e-05, "loss": 5.22, "step": 1623 }, { "epoch": 0.5846984698469847, "grad_norm": 1.2981653213500977, "learning_rate": 7.598670139146117e-05, "loss": 5.1569, "step": 1624 }, { "epoch": 0.585058505850585, "grad_norm": 1.4491846561431885, "learning_rate": 7.587488494725157e-05, "loss": 5.644, "step": 1625 }, { "epoch": 0.5854185418541854, "grad_norm": 4.114619255065918, "learning_rate": 7.576310052138215e-05, "loss": 4.7732, "step": 1626 }, { "epoch": 0.5857785778577858, "grad_norm": 0.9385104179382324, "learning_rate": 7.565134826221083e-05, "loss": 4.8205, "step": 1627 }, { "epoch": 0.5861386138613861, "grad_norm": 0.8031629323959351, "learning_rate": 7.55396283180529e-05, "loss": 4.65, "step": 1628 }, { "epoch": 0.5864986498649865, "grad_norm": 0.8165589570999146, "learning_rate": 7.542794083718059e-05, "loss": 4.8113, "step": 1629 }, { "epoch": 0.5868586858685868, "grad_norm": 0.7751879096031189, "learning_rate": 7.531628596782316e-05, "loss": 4.7953, "step": 1630 }, { "epoch": 0.5872187218721873, "grad_norm": 0.8618746995925903, "learning_rate": 7.520466385816671e-05, "loss": 4.6135, "step": 1631 }, { "epoch": 0.5875787578757876, "grad_norm": 1.1003646850585938, "learning_rate": 7.509307465635358e-05, "loss": 4.7031, "step": 1632 }, { "epoch": 0.587938793879388, "grad_norm": 0.7130169868469238, "learning_rate": 7.498151851048267e-05, "loss": 4.364, "step": 1633 }, { "epoch": 0.5882988298829883, "grad_norm": 0.816373884677887, "learning_rate": 7.48699955686089e-05, "loss": 4.8291, "step": 1634 }, { "epoch": 0.5886588658865887, "grad_norm": 0.7508324384689331, "learning_rate": 7.475850597874319e-05, "loss": 4.8374, "step": 1635 }, { "epoch": 0.589018901890189, "grad_norm": 0.6991592645645142, "learning_rate": 7.464704988885209e-05, "loss": 4.7721, "step": 1636 }, { "epoch": 0.5893789378937894, "grad_norm": 0.6070843935012817, "learning_rate": 7.453562744685778e-05, "loss": 5.1812, "step": 1637 }, { "epoch": 0.5897389738973897, "grad_norm": 0.580551266670227, "learning_rate": 7.442423880063778e-05, "loss": 4.7607, "step": 1638 }, { "epoch": 0.5900990099009901, "grad_norm": 0.8013322353363037, "learning_rate": 7.431288409802473e-05, "loss": 4.6397, "step": 1639 }, { "epoch": 0.5904590459045904, "grad_norm": 0.8236899971961975, "learning_rate": 7.42015634868062e-05, "loss": 5.0199, "step": 1640 }, { "epoch": 0.5908190819081908, "grad_norm": 0.6588436365127563, "learning_rate": 7.409027711472456e-05, "loss": 4.4477, "step": 1641 }, { "epoch": 0.5911791179117912, "grad_norm": 0.9058972001075745, "learning_rate": 7.39790251294767e-05, "loss": 4.6782, "step": 1642 }, { "epoch": 0.5915391539153916, "grad_norm": 0.7739425897598267, "learning_rate": 7.386780767871397e-05, "loss": 4.6644, "step": 1643 }, { "epoch": 0.5918991899189919, "grad_norm": 0.9859951138496399, "learning_rate": 7.37566249100417e-05, "loss": 4.9345, "step": 1644 }, { "epoch": 0.5922592259225923, "grad_norm": 0.94615638256073, "learning_rate": 7.364547697101933e-05, "loss": 4.9104, "step": 1645 }, { "epoch": 0.5926192619261926, "grad_norm": 0.6184026598930359, "learning_rate": 7.353436400916004e-05, "loss": 4.7239, "step": 1646 }, { "epoch": 0.592979297929793, "grad_norm": 0.6131138205528259, "learning_rate": 7.342328617193067e-05, "loss": 4.9109, "step": 1647 }, { "epoch": 0.5933393339333933, "grad_norm": 0.8020132780075073, "learning_rate": 7.331224360675126e-05, "loss": 4.7352, "step": 1648 }, { "epoch": 0.5936993699369937, "grad_norm": 1.0741486549377441, "learning_rate": 7.320123646099519e-05, "loss": 5.2285, "step": 1649 }, { "epoch": 0.594059405940594, "grad_norm": 1.2344449758529663, "learning_rate": 7.309026488198884e-05, "loss": 5.3315, "step": 1650 }, { "epoch": 0.5944194419441944, "grad_norm": 0.947981059551239, "learning_rate": 7.297932901701123e-05, "loss": 4.8494, "step": 1651 }, { "epoch": 0.5947794779477947, "grad_norm": 0.7184794545173645, "learning_rate": 7.286842901329412e-05, "loss": 4.4837, "step": 1652 }, { "epoch": 0.5951395139513952, "grad_norm": 0.659092903137207, "learning_rate": 7.275756501802166e-05, "loss": 4.4865, "step": 1653 }, { "epoch": 0.5954995499549955, "grad_norm": 0.8012544512748718, "learning_rate": 7.264673717833019e-05, "loss": 4.5965, "step": 1654 }, { "epoch": 0.5958595859585959, "grad_norm": 1.0120086669921875, "learning_rate": 7.253594564130804e-05, "loss": 4.7858, "step": 1655 }, { "epoch": 0.5962196219621962, "grad_norm": 1.0920915603637695, "learning_rate": 7.242519055399539e-05, "loss": 4.7804, "step": 1656 }, { "epoch": 0.5965796579657966, "grad_norm": 0.6081851720809937, "learning_rate": 7.231447206338407e-05, "loss": 4.8375, "step": 1657 }, { "epoch": 0.596939693969397, "grad_norm": 1.0984938144683838, "learning_rate": 7.22037903164173e-05, "loss": 4.7434, "step": 1658 }, { "epoch": 0.5972997299729973, "grad_norm": 1.1671141386032104, "learning_rate": 7.209314545998949e-05, "loss": 4.6966, "step": 1659 }, { "epoch": 0.5976597659765976, "grad_norm": 1.2776623964309692, "learning_rate": 7.198253764094618e-05, "loss": 4.8766, "step": 1660 }, { "epoch": 0.598019801980198, "grad_norm": 1.1133800745010376, "learning_rate": 7.187196700608373e-05, "loss": 4.9712, "step": 1661 }, { "epoch": 0.5983798379837983, "grad_norm": 0.8766571283340454, "learning_rate": 7.176143370214914e-05, "loss": 4.7079, "step": 1662 }, { "epoch": 0.5987398739873987, "grad_norm": 0.8257899880409241, "learning_rate": 7.165093787583984e-05, "loss": 4.3869, "step": 1663 }, { "epoch": 0.599099909990999, "grad_norm": 0.6749482750892639, "learning_rate": 7.154047967380354e-05, "loss": 4.6119, "step": 1664 }, { "epoch": 0.5994599459945995, "grad_norm": 0.6785940527915955, "learning_rate": 7.143005924263803e-05, "loss": 4.5932, "step": 1665 }, { "epoch": 0.5998199819981999, "grad_norm": 0.6335827708244324, "learning_rate": 7.131967672889101e-05, "loss": 4.6642, "step": 1666 }, { "epoch": 0.6001800180018002, "grad_norm": 0.8998749256134033, "learning_rate": 7.12093322790597e-05, "loss": 4.8338, "step": 1667 }, { "epoch": 0.6005400540054006, "grad_norm": 0.7676742672920227, "learning_rate": 7.1099026039591e-05, "loss": 4.4145, "step": 1668 }, { "epoch": 0.6009000900090009, "grad_norm": 0.6727948188781738, "learning_rate": 7.098875815688095e-05, "loss": 4.8703, "step": 1669 }, { "epoch": 0.6012601260126013, "grad_norm": 1.0915569067001343, "learning_rate": 7.087852877727481e-05, "loss": 4.9705, "step": 1670 }, { "epoch": 0.6016201620162016, "grad_norm": 0.6964028477668762, "learning_rate": 7.07683380470666e-05, "loss": 5.1096, "step": 1671 }, { "epoch": 0.601980198019802, "grad_norm": 0.8635994791984558, "learning_rate": 7.065818611249915e-05, "loss": 5.1809, "step": 1672 }, { "epoch": 0.6023402340234023, "grad_norm": 0.8733118772506714, "learning_rate": 7.054807311976379e-05, "loss": 4.9687, "step": 1673 }, { "epoch": 0.6027002700270027, "grad_norm": 0.8445477485656738, "learning_rate": 7.043799921500009e-05, "loss": 5.0272, "step": 1674 }, { "epoch": 0.603060306030603, "grad_norm": 1.4227135181427002, "learning_rate": 7.032796454429583e-05, "loss": 5.6162, "step": 1675 }, { "epoch": 0.6034203420342035, "grad_norm": 0.8639402985572815, "learning_rate": 7.021796925368667e-05, "loss": 4.684, "step": 1676 }, { "epoch": 0.6037803780378038, "grad_norm": 1.1845555305480957, "learning_rate": 7.010801348915608e-05, "loss": 4.9312, "step": 1677 }, { "epoch": 0.6041404140414042, "grad_norm": 0.5735663771629333, "learning_rate": 6.999809739663492e-05, "loss": 4.8477, "step": 1678 }, { "epoch": 0.6045004500450045, "grad_norm": 0.7418989539146423, "learning_rate": 6.988822112200156e-05, "loss": 4.9024, "step": 1679 }, { "epoch": 0.6048604860486049, "grad_norm": 0.7708451151847839, "learning_rate": 6.977838481108145e-05, "loss": 4.442, "step": 1680 }, { "epoch": 0.6052205220522052, "grad_norm": 1.4791598320007324, "learning_rate": 6.966858860964702e-05, "loss": 4.7687, "step": 1681 }, { "epoch": 0.6055805580558056, "grad_norm": 0.7193477749824524, "learning_rate": 6.955883266341741e-05, "loss": 4.7705, "step": 1682 }, { "epoch": 0.6059405940594059, "grad_norm": 0.904472291469574, "learning_rate": 6.944911711805842e-05, "loss": 4.6842, "step": 1683 }, { "epoch": 0.6063006300630063, "grad_norm": 0.6349477171897888, "learning_rate": 6.933944211918215e-05, "loss": 4.6385, "step": 1684 }, { "epoch": 0.6066606660666066, "grad_norm": 0.7457664012908936, "learning_rate": 6.922980781234699e-05, "loss": 4.3806, "step": 1685 }, { "epoch": 0.607020702070207, "grad_norm": 0.7954607605934143, "learning_rate": 6.91202143430572e-05, "loss": 5.1228, "step": 1686 }, { "epoch": 0.6073807380738074, "grad_norm": 0.6613931059837341, "learning_rate": 6.901066185676295e-05, "loss": 4.3413, "step": 1687 }, { "epoch": 0.6077407740774078, "grad_norm": 0.6740626692771912, "learning_rate": 6.890115049885994e-05, "loss": 4.8427, "step": 1688 }, { "epoch": 0.6081008100810081, "grad_norm": 0.6859798431396484, "learning_rate": 6.879168041468938e-05, "loss": 4.6523, "step": 1689 }, { "epoch": 0.6084608460846085, "grad_norm": 0.7126948833465576, "learning_rate": 6.868225174953755e-05, "loss": 4.987, "step": 1690 }, { "epoch": 0.6088208820882088, "grad_norm": 0.6304237842559814, "learning_rate": 6.85728646486359e-05, "loss": 4.5052, "step": 1691 }, { "epoch": 0.6091809180918092, "grad_norm": 0.5101225972175598, "learning_rate": 6.846351925716068e-05, "loss": 4.449, "step": 1692 }, { "epoch": 0.6095409540954095, "grad_norm": 0.8675354719161987, "learning_rate": 6.835421572023272e-05, "loss": 4.7941, "step": 1693 }, { "epoch": 0.6099009900990099, "grad_norm": 0.8150596022605896, "learning_rate": 6.82449541829174e-05, "loss": 4.8855, "step": 1694 }, { "epoch": 0.6102610261026102, "grad_norm": 0.7487970590591431, "learning_rate": 6.81357347902243e-05, "loss": 4.5067, "step": 1695 }, { "epoch": 0.6106210621062106, "grad_norm": 0.7239964008331299, "learning_rate": 6.80265576871071e-05, "loss": 4.8702, "step": 1696 }, { "epoch": 0.6109810981098109, "grad_norm": 0.668526291847229, "learning_rate": 6.791742301846326e-05, "loss": 4.9668, "step": 1697 }, { "epoch": 0.6113411341134113, "grad_norm": 0.673700749874115, "learning_rate": 6.780833092913403e-05, "loss": 4.8733, "step": 1698 }, { "epoch": 0.6117011701170117, "grad_norm": 0.7996618747711182, "learning_rate": 6.769928156390414e-05, "loss": 4.7596, "step": 1699 }, { "epoch": 0.6120612061206121, "grad_norm": 1.36536705493927, "learning_rate": 6.759027506750158e-05, "loss": 5.1771, "step": 1700 }, { "epoch": 0.6124212421242125, "grad_norm": 3.8609843254089355, "learning_rate": 6.748131158459742e-05, "loss": 5.4209, "step": 1701 }, { "epoch": 0.6127812781278128, "grad_norm": 0.9584519863128662, "learning_rate": 6.737239125980573e-05, "loss": 4.6022, "step": 1702 }, { "epoch": 0.6131413141314132, "grad_norm": 0.7238506078720093, "learning_rate": 6.726351423768322e-05, "loss": 4.4749, "step": 1703 }, { "epoch": 0.6135013501350135, "grad_norm": 0.8569992780685425, "learning_rate": 6.715468066272921e-05, "loss": 5.2025, "step": 1704 }, { "epoch": 0.6138613861386139, "grad_norm": 0.7682144641876221, "learning_rate": 6.704589067938523e-05, "loss": 4.8465, "step": 1705 }, { "epoch": 0.6142214221422142, "grad_norm": 0.8438863158226013, "learning_rate": 6.693714443203507e-05, "loss": 4.9573, "step": 1706 }, { "epoch": 0.6145814581458146, "grad_norm": 0.8496419191360474, "learning_rate": 6.682844206500445e-05, "loss": 4.9977, "step": 1707 }, { "epoch": 0.6149414941494149, "grad_norm": 0.6941019296646118, "learning_rate": 6.671978372256084e-05, "loss": 4.9231, "step": 1708 }, { "epoch": 0.6153015301530153, "grad_norm": 0.7767177224159241, "learning_rate": 6.661116954891328e-05, "loss": 4.8246, "step": 1709 }, { "epoch": 0.6156615661566157, "grad_norm": 0.7980932593345642, "learning_rate": 6.650259968821218e-05, "loss": 4.8588, "step": 1710 }, { "epoch": 0.6160216021602161, "grad_norm": 0.8906815052032471, "learning_rate": 6.639407428454922e-05, "loss": 4.4652, "step": 1711 }, { "epoch": 0.6163816381638164, "grad_norm": 0.9898139834403992, "learning_rate": 6.62855934819569e-05, "loss": 4.6276, "step": 1712 }, { "epoch": 0.6167416741674168, "grad_norm": 0.5662094354629517, "learning_rate": 6.617715742440869e-05, "loss": 4.6352, "step": 1713 }, { "epoch": 0.6171017101710171, "grad_norm": 0.7726882696151733, "learning_rate": 6.606876625581863e-05, "loss": 4.6636, "step": 1714 }, { "epoch": 0.6174617461746175, "grad_norm": 0.7297523617744446, "learning_rate": 6.59604201200412e-05, "loss": 4.796, "step": 1715 }, { "epoch": 0.6178217821782178, "grad_norm": 0.7021007537841797, "learning_rate": 6.585211916087102e-05, "loss": 4.703, "step": 1716 }, { "epoch": 0.6181818181818182, "grad_norm": 0.7362711429595947, "learning_rate": 6.574386352204289e-05, "loss": 5.1348, "step": 1717 }, { "epoch": 0.6185418541854185, "grad_norm": 1.0990628004074097, "learning_rate": 6.563565334723134e-05, "loss": 4.7556, "step": 1718 }, { "epoch": 0.6189018901890189, "grad_norm": 0.5887585282325745, "learning_rate": 6.55274887800507e-05, "loss": 4.4678, "step": 1719 }, { "epoch": 0.6192619261926192, "grad_norm": 0.7051016688346863, "learning_rate": 6.54193699640546e-05, "loss": 4.8287, "step": 1720 }, { "epoch": 0.6196219621962196, "grad_norm": 0.8038293123245239, "learning_rate": 6.531129704273604e-05, "loss": 4.905, "step": 1721 }, { "epoch": 0.61998199819982, "grad_norm": 0.9665653705596924, "learning_rate": 6.520327015952713e-05, "loss": 4.7933, "step": 1722 }, { "epoch": 0.6203420342034204, "grad_norm": 0.7566668391227722, "learning_rate": 6.509528945779888e-05, "loss": 5.2126, "step": 1723 }, { "epoch": 0.6207020702070207, "grad_norm": 0.8725939989089966, "learning_rate": 6.498735508086093e-05, "loss": 5.1499, "step": 1724 }, { "epoch": 0.6210621062106211, "grad_norm": 1.4660851955413818, "learning_rate": 6.487946717196153e-05, "loss": 5.2557, "step": 1725 }, { "epoch": 0.6214221422142214, "grad_norm": 1.0590465068817139, "learning_rate": 6.47716258742872e-05, "loss": 5.2878, "step": 1726 }, { "epoch": 0.6217821782178218, "grad_norm": 0.7507508397102356, "learning_rate": 6.466383133096267e-05, "loss": 4.7792, "step": 1727 }, { "epoch": 0.6221422142214221, "grad_norm": 0.6140052080154419, "learning_rate": 6.45560836850505e-05, "loss": 5.0165, "step": 1728 }, { "epoch": 0.6225022502250225, "grad_norm": 0.637725293636322, "learning_rate": 6.44483830795511e-05, "loss": 4.6269, "step": 1729 }, { "epoch": 0.6228622862286228, "grad_norm": 0.7622739672660828, "learning_rate": 6.434072965740242e-05, "loss": 4.6183, "step": 1730 }, { "epoch": 0.6232223222322232, "grad_norm": 0.5821534395217896, "learning_rate": 6.423312356147983e-05, "loss": 4.6595, "step": 1731 }, { "epoch": 0.6235823582358235, "grad_norm": 0.9150854349136353, "learning_rate": 6.412556493459581e-05, "loss": 4.6189, "step": 1732 }, { "epoch": 0.623942394239424, "grad_norm": 0.6262230277061462, "learning_rate": 6.40180539194999e-05, "loss": 4.8841, "step": 1733 }, { "epoch": 0.6243024302430243, "grad_norm": 1.0352506637573242, "learning_rate": 6.391059065887847e-05, "loss": 4.7395, "step": 1734 }, { "epoch": 0.6246624662466247, "grad_norm": 0.7014068961143494, "learning_rate": 6.380317529535442e-05, "loss": 5.0256, "step": 1735 }, { "epoch": 0.625022502250225, "grad_norm": 0.5638285875320435, "learning_rate": 6.369580797148718e-05, "loss": 4.885, "step": 1736 }, { "epoch": 0.6253825382538254, "grad_norm": 0.6988465189933777, "learning_rate": 6.358848882977233e-05, "loss": 4.406, "step": 1737 }, { "epoch": 0.6257425742574257, "grad_norm": 0.6802326440811157, "learning_rate": 6.348121801264163e-05, "loss": 4.6854, "step": 1738 }, { "epoch": 0.6261026102610261, "grad_norm": 0.7166337370872498, "learning_rate": 6.337399566246257e-05, "loss": 4.9086, "step": 1739 }, { "epoch": 0.6264626462646264, "grad_norm": 0.7824912071228027, "learning_rate": 6.326682192153838e-05, "loss": 4.6632, "step": 1740 }, { "epoch": 0.6268226822682268, "grad_norm": 0.656356155872345, "learning_rate": 6.315969693210782e-05, "loss": 4.837, "step": 1741 }, { "epoch": 0.6271827182718271, "grad_norm": 0.653224766254425, "learning_rate": 6.305262083634488e-05, "loss": 4.8749, "step": 1742 }, { "epoch": 0.6275427542754275, "grad_norm": 0.6347287893295288, "learning_rate": 6.294559377635864e-05, "loss": 4.9606, "step": 1743 }, { "epoch": 0.627902790279028, "grad_norm": 0.6182569265365601, "learning_rate": 6.283861589419316e-05, "loss": 4.609, "step": 1744 }, { "epoch": 0.6282628262826283, "grad_norm": 0.6387762427330017, "learning_rate": 6.273168733182722e-05, "loss": 4.8894, "step": 1745 }, { "epoch": 0.6286228622862287, "grad_norm": 0.7164087295532227, "learning_rate": 6.262480823117416e-05, "loss": 4.8205, "step": 1746 }, { "epoch": 0.628982898289829, "grad_norm": 1.3452858924865723, "learning_rate": 6.251797873408161e-05, "loss": 5.041, "step": 1747 }, { "epoch": 0.6293429342934294, "grad_norm": 0.9320101737976074, "learning_rate": 6.241119898233144e-05, "loss": 5.0222, "step": 1748 }, { "epoch": 0.6297029702970297, "grad_norm": 0.9436710476875305, "learning_rate": 6.230446911763943e-05, "loss": 5.1603, "step": 1749 }, { "epoch": 0.6300630063006301, "grad_norm": 1.33075749874115, "learning_rate": 6.219778928165527e-05, "loss": 5.3501, "step": 1750 }, { "epoch": 0.6304230423042304, "grad_norm": 1.5732166767120361, "learning_rate": 6.209115961596208e-05, "loss": 4.5678, "step": 1751 }, { "epoch": 0.6307830783078308, "grad_norm": 0.752070963382721, "learning_rate": 6.198458026207652e-05, "loss": 4.7007, "step": 1752 }, { "epoch": 0.6311431143114311, "grad_norm": 1.0328824520111084, "learning_rate": 6.187805136144847e-05, "loss": 4.5845, "step": 1753 }, { "epoch": 0.6315031503150315, "grad_norm": 0.7405611276626587, "learning_rate": 6.177157305546078e-05, "loss": 4.8958, "step": 1754 }, { "epoch": 0.6318631863186318, "grad_norm": 1.0916255712509155, "learning_rate": 6.16651454854292e-05, "loss": 4.9579, "step": 1755 }, { "epoch": 0.6322232223222323, "grad_norm": 0.7531924843788147, "learning_rate": 6.15587687926022e-05, "loss": 4.9045, "step": 1756 }, { "epoch": 0.6325832583258326, "grad_norm": 0.6694497466087341, "learning_rate": 6.145244311816063e-05, "loss": 4.8199, "step": 1757 }, { "epoch": 0.632943294329433, "grad_norm": 0.674801766872406, "learning_rate": 6.134616860321764e-05, "loss": 5.045, "step": 1758 }, { "epoch": 0.6333033303330333, "grad_norm": 1.0546095371246338, "learning_rate": 6.123994538881851e-05, "loss": 5.2049, "step": 1759 }, { "epoch": 0.6336633663366337, "grad_norm": 0.6533979177474976, "learning_rate": 6.113377361594049e-05, "loss": 5.0091, "step": 1760 }, { "epoch": 0.634023402340234, "grad_norm": 0.8369541764259338, "learning_rate": 6.102765342549246e-05, "loss": 4.6803, "step": 1761 }, { "epoch": 0.6343834383438344, "grad_norm": 1.1984913349151611, "learning_rate": 6.092158495831486e-05, "loss": 4.8795, "step": 1762 }, { "epoch": 0.6347434743474347, "grad_norm": 0.5820183753967285, "learning_rate": 6.0815568355179556e-05, "loss": 4.8205, "step": 1763 }, { "epoch": 0.6351035103510351, "grad_norm": 0.6554083228111267, "learning_rate": 6.070960375678949e-05, "loss": 4.7503, "step": 1764 }, { "epoch": 0.6354635463546354, "grad_norm": 0.9061247706413269, "learning_rate": 6.0603691303778696e-05, "loss": 4.8138, "step": 1765 }, { "epoch": 0.6358235823582358, "grad_norm": 0.766362190246582, "learning_rate": 6.0497831136711836e-05, "loss": 4.4586, "step": 1766 }, { "epoch": 0.6361836183618362, "grad_norm": 0.7624132037162781, "learning_rate": 6.039202339608432e-05, "loss": 4.7515, "step": 1767 }, { "epoch": 0.6365436543654366, "grad_norm": 0.6750558614730835, "learning_rate": 6.028626822232193e-05, "loss": 5.0298, "step": 1768 }, { "epoch": 0.6369036903690369, "grad_norm": 0.8097175359725952, "learning_rate": 6.018056575578075e-05, "loss": 4.8065, "step": 1769 }, { "epoch": 0.6372637263726373, "grad_norm": 0.76420658826828, "learning_rate": 6.007491613674669e-05, "loss": 4.6139, "step": 1770 }, { "epoch": 0.6376237623762376, "grad_norm": 0.7700027823448181, "learning_rate": 5.996931950543583e-05, "loss": 5.0582, "step": 1771 }, { "epoch": 0.637983798379838, "grad_norm": 0.6350474953651428, "learning_rate": 5.986377600199371e-05, "loss": 4.8447, "step": 1772 }, { "epoch": 0.6383438343834383, "grad_norm": 0.7874675989151001, "learning_rate": 5.9758285766495495e-05, "loss": 5.1058, "step": 1773 }, { "epoch": 0.6387038703870387, "grad_norm": 1.3597450256347656, "learning_rate": 5.965284893894547e-05, "loss": 5.2674, "step": 1774 }, { "epoch": 0.639063906390639, "grad_norm": 0.9899166822433472, "learning_rate": 5.9547465659277215e-05, "loss": 5.1647, "step": 1775 }, { "epoch": 0.6394239423942394, "grad_norm": 2.18182635307312, "learning_rate": 5.944213606735322e-05, "loss": 4.7493, "step": 1776 }, { "epoch": 0.6397839783978397, "grad_norm": 0.7582593560218811, "learning_rate": 5.933686030296459e-05, "loss": 4.7778, "step": 1777 }, { "epoch": 0.6401440144014402, "grad_norm": 0.7539111375808716, "learning_rate": 5.923163850583113e-05, "loss": 4.5992, "step": 1778 }, { "epoch": 0.6405040504050405, "grad_norm": 0.8477987051010132, "learning_rate": 5.9126470815600966e-05, "loss": 4.7375, "step": 1779 }, { "epoch": 0.6408640864086409, "grad_norm": 0.6299402713775635, "learning_rate": 5.9021357371850486e-05, "loss": 4.652, "step": 1780 }, { "epoch": 0.6412241224122412, "grad_norm": 0.7032667994499207, "learning_rate": 5.8916298314083915e-05, "loss": 4.3023, "step": 1781 }, { "epoch": 0.6415841584158416, "grad_norm": 0.8376038074493408, "learning_rate": 5.881129378173347e-05, "loss": 4.6135, "step": 1782 }, { "epoch": 0.641944194419442, "grad_norm": 0.9109644889831543, "learning_rate": 5.8706343914158914e-05, "loss": 4.4688, "step": 1783 }, { "epoch": 0.6423042304230423, "grad_norm": 1.2081356048583984, "learning_rate": 5.860144885064751e-05, "loss": 4.6571, "step": 1784 }, { "epoch": 0.6426642664266426, "grad_norm": 0.7176885008811951, "learning_rate": 5.8496608730413716e-05, "loss": 4.8487, "step": 1785 }, { "epoch": 0.643024302430243, "grad_norm": 0.5789865851402283, "learning_rate": 5.8391823692599124e-05, "loss": 4.8274, "step": 1786 }, { "epoch": 0.6433843384338434, "grad_norm": 0.6851209998130798, "learning_rate": 5.828709387627218e-05, "loss": 4.4276, "step": 1787 }, { "epoch": 0.6437443744374437, "grad_norm": 0.7992741465568542, "learning_rate": 5.818241942042819e-05, "loss": 4.236, "step": 1788 }, { "epoch": 0.644104410441044, "grad_norm": 0.6117070317268372, "learning_rate": 5.807780046398873e-05, "loss": 4.7178, "step": 1789 }, { "epoch": 0.6444644464446445, "grad_norm": 0.5066046118736267, "learning_rate": 5.797323714580192e-05, "loss": 4.608, "step": 1790 }, { "epoch": 0.6448244824482449, "grad_norm": 0.729286789894104, "learning_rate": 5.786872960464196e-05, "loss": 5.1699, "step": 1791 }, { "epoch": 0.6451845184518452, "grad_norm": 0.6828146576881409, "learning_rate": 5.7764277979209094e-05, "loss": 4.761, "step": 1792 }, { "epoch": 0.6455445544554456, "grad_norm": 0.5605363249778748, "learning_rate": 5.765988240812921e-05, "loss": 4.7329, "step": 1793 }, { "epoch": 0.6459045904590459, "grad_norm": 0.7322149872779846, "learning_rate": 5.755554302995393e-05, "loss": 4.9112, "step": 1794 }, { "epoch": 0.6462646264626463, "grad_norm": 0.6481497287750244, "learning_rate": 5.74512599831603e-05, "loss": 4.6671, "step": 1795 }, { "epoch": 0.6466246624662466, "grad_norm": 0.6252749562263489, "learning_rate": 5.73470334061505e-05, "loss": 4.9359, "step": 1796 }, { "epoch": 0.646984698469847, "grad_norm": 0.8327919840812683, "learning_rate": 5.724286343725185e-05, "loss": 5.0185, "step": 1797 }, { "epoch": 0.6473447344734473, "grad_norm": 0.8310493230819702, "learning_rate": 5.713875021471653e-05, "loss": 5.057, "step": 1798 }, { "epoch": 0.6477047704770477, "grad_norm": 0.8026371002197266, "learning_rate": 5.7034693876721376e-05, "loss": 4.8322, "step": 1799 }, { "epoch": 0.648064806480648, "grad_norm": 1.5379436016082764, "learning_rate": 5.693069456136779e-05, "loss": 5.3851, "step": 1800 }, { "epoch": 0.6484248424842485, "grad_norm": 0.8481616973876953, "learning_rate": 5.682675240668143e-05, "loss": 4.6216, "step": 1801 }, { "epoch": 0.6487848784878488, "grad_norm": 0.8129981756210327, "learning_rate": 5.6722867550612116e-05, "loss": 4.4348, "step": 1802 }, { "epoch": 0.6491449144914492, "grad_norm": 0.8237068057060242, "learning_rate": 5.661904013103365e-05, "loss": 4.833, "step": 1803 }, { "epoch": 0.6495049504950495, "grad_norm": 0.6432878375053406, "learning_rate": 5.6515270285743524e-05, "loss": 4.6828, "step": 1804 }, { "epoch": 0.6498649864986499, "grad_norm": 0.9345207214355469, "learning_rate": 5.6411558152462894e-05, "loss": 4.5397, "step": 1805 }, { "epoch": 0.6502250225022502, "grad_norm": 0.5214217901229858, "learning_rate": 5.630790386883631e-05, "loss": 4.6287, "step": 1806 }, { "epoch": 0.6505850585058506, "grad_norm": 0.7941548824310303, "learning_rate": 5.620430757243156e-05, "loss": 4.7249, "step": 1807 }, { "epoch": 0.6509450945094509, "grad_norm": 0.6697788238525391, "learning_rate": 5.6100769400739383e-05, "loss": 4.6915, "step": 1808 }, { "epoch": 0.6513051305130513, "grad_norm": 0.8533397912979126, "learning_rate": 5.599728949117348e-05, "loss": 4.9724, "step": 1809 }, { "epoch": 0.6516651665166516, "grad_norm": 0.6576551795005798, "learning_rate": 5.589386798107018e-05, "loss": 4.6834, "step": 1810 }, { "epoch": 0.652025202520252, "grad_norm": 0.9863411784172058, "learning_rate": 5.579050500768836e-05, "loss": 5.2171, "step": 1811 }, { "epoch": 0.6523852385238524, "grad_norm": 0.6458398699760437, "learning_rate": 5.5687200708209076e-05, "loss": 4.849, "step": 1812 }, { "epoch": 0.6527452745274528, "grad_norm": 0.6335414052009583, "learning_rate": 5.558395521973565e-05, "loss": 4.4948, "step": 1813 }, { "epoch": 0.6531053105310531, "grad_norm": 0.664715051651001, "learning_rate": 5.54807686792933e-05, "loss": 4.5827, "step": 1814 }, { "epoch": 0.6534653465346535, "grad_norm": 0.6272945404052734, "learning_rate": 5.5377641223829e-05, "loss": 4.8311, "step": 1815 }, { "epoch": 0.6538253825382538, "grad_norm": 0.5388202667236328, "learning_rate": 5.527457299021133e-05, "loss": 4.7276, "step": 1816 }, { "epoch": 0.6541854185418542, "grad_norm": 0.6526055335998535, "learning_rate": 5.5171564115230254e-05, "loss": 4.8818, "step": 1817 }, { "epoch": 0.6545454545454545, "grad_norm": 0.6699817776679993, "learning_rate": 5.5068614735597e-05, "loss": 4.5449, "step": 1818 }, { "epoch": 0.6549054905490549, "grad_norm": 0.7195981740951538, "learning_rate": 5.496572498794372e-05, "loss": 5.0371, "step": 1819 }, { "epoch": 0.6552655265526552, "grad_norm": 0.7665285468101501, "learning_rate": 5.486289500882355e-05, "loss": 4.5342, "step": 1820 }, { "epoch": 0.6556255625562556, "grad_norm": 0.7282291054725647, "learning_rate": 5.476012493471023e-05, "loss": 4.7795, "step": 1821 }, { "epoch": 0.6559855985598559, "grad_norm": 0.7076379656791687, "learning_rate": 5.4657414901998095e-05, "loss": 4.5155, "step": 1822 }, { "epoch": 0.6563456345634563, "grad_norm": 0.9629925489425659, "learning_rate": 5.4554765047001613e-05, "loss": 4.7799, "step": 1823 }, { "epoch": 0.6567056705670568, "grad_norm": 1.2366801500320435, "learning_rate": 5.445217550595552e-05, "loss": 5.1011, "step": 1824 }, { "epoch": 0.6570657065706571, "grad_norm": 1.4854378700256348, "learning_rate": 5.43496464150145e-05, "loss": 4.9074, "step": 1825 }, { "epoch": 0.6574257425742575, "grad_norm": 0.7619197368621826, "learning_rate": 5.424717791025302e-05, "loss": 4.3182, "step": 1826 }, { "epoch": 0.6577857785778578, "grad_norm": 0.718901515007019, "learning_rate": 5.4144770127665024e-05, "loss": 4.8806, "step": 1827 }, { "epoch": 0.6581458145814582, "grad_norm": 0.6655319333076477, "learning_rate": 5.4042423203163975e-05, "loss": 4.6217, "step": 1828 }, { "epoch": 0.6585058505850585, "grad_norm": 0.6734433770179749, "learning_rate": 5.394013727258254e-05, "loss": 4.512, "step": 1829 }, { "epoch": 0.6588658865886589, "grad_norm": 0.77203768491745, "learning_rate": 5.3837912471672446e-05, "loss": 4.7527, "step": 1830 }, { "epoch": 0.6592259225922592, "grad_norm": 0.665772020816803, "learning_rate": 5.3735748936104255e-05, "loss": 4.6671, "step": 1831 }, { "epoch": 0.6595859585958596, "grad_norm": 0.8873734474182129, "learning_rate": 5.363364680146725e-05, "loss": 5.2537, "step": 1832 }, { "epoch": 0.6599459945994599, "grad_norm": 0.8361346125602722, "learning_rate": 5.3531606203269236e-05, "loss": 4.6509, "step": 1833 }, { "epoch": 0.6603060306030603, "grad_norm": 0.8583419919013977, "learning_rate": 5.342962727693633e-05, "loss": 4.8991, "step": 1834 }, { "epoch": 0.6606660666066607, "grad_norm": 0.6351554989814758, "learning_rate": 5.332771015781275e-05, "loss": 4.6789, "step": 1835 }, { "epoch": 0.6610261026102611, "grad_norm": 0.8681540489196777, "learning_rate": 5.322585498116075e-05, "loss": 4.6612, "step": 1836 }, { "epoch": 0.6613861386138614, "grad_norm": 0.7458162307739258, "learning_rate": 5.31240618821604e-05, "loss": 4.5165, "step": 1837 }, { "epoch": 0.6617461746174618, "grad_norm": 1.2762198448181152, "learning_rate": 5.302233099590928e-05, "loss": 4.3505, "step": 1838 }, { "epoch": 0.6621062106210621, "grad_norm": 0.8469787836074829, "learning_rate": 5.292066245742246e-05, "loss": 5.2084, "step": 1839 }, { "epoch": 0.6624662466246625, "grad_norm": 1.1678879261016846, "learning_rate": 5.2819056401632304e-05, "loss": 4.6556, "step": 1840 }, { "epoch": 0.6628262826282628, "grad_norm": 0.635159432888031, "learning_rate": 5.271751296338823e-05, "loss": 4.7362, "step": 1841 }, { "epoch": 0.6631863186318632, "grad_norm": 0.7722536325454712, "learning_rate": 5.2616032277456463e-05, "loss": 5.0082, "step": 1842 }, { "epoch": 0.6635463546354635, "grad_norm": 0.5722522139549255, "learning_rate": 5.251461447852003e-05, "loss": 4.8685, "step": 1843 }, { "epoch": 0.6639063906390639, "grad_norm": 0.4967202842235565, "learning_rate": 5.2413259701178505e-05, "loss": 4.7663, "step": 1844 }, { "epoch": 0.6642664266426642, "grad_norm": 0.6773428320884705, "learning_rate": 5.231196807994779e-05, "loss": 4.8131, "step": 1845 }, { "epoch": 0.6646264626462647, "grad_norm": 0.681438684463501, "learning_rate": 5.221073974925997e-05, "loss": 4.9336, "step": 1846 }, { "epoch": 0.664986498649865, "grad_norm": 0.8762783408164978, "learning_rate": 5.210957484346314e-05, "loss": 5.4102, "step": 1847 }, { "epoch": 0.6653465346534654, "grad_norm": 0.971596360206604, "learning_rate": 5.200847349682121e-05, "loss": 5.5351, "step": 1848 }, { "epoch": 0.6657065706570657, "grad_norm": 0.8573640584945679, "learning_rate": 5.190743584351376e-05, "loss": 5.221, "step": 1849 }, { "epoch": 0.6660666066606661, "grad_norm": 1.9084817171096802, "learning_rate": 5.180646201763577e-05, "loss": 5.6038, "step": 1850 }, { "epoch": 0.6664266426642664, "grad_norm": 1.0819993019104004, "learning_rate": 5.170555215319757e-05, "loss": 4.7475, "step": 1851 }, { "epoch": 0.6667866786678668, "grad_norm": 1.0352802276611328, "learning_rate": 5.160470638412461e-05, "loss": 4.815, "step": 1852 }, { "epoch": 0.6671467146714671, "grad_norm": 0.9234697222709656, "learning_rate": 5.150392484425728e-05, "loss": 5.1675, "step": 1853 }, { "epoch": 0.6675067506750675, "grad_norm": 0.7216193079948425, "learning_rate": 5.140320766735063e-05, "loss": 4.7963, "step": 1854 }, { "epoch": 0.6678667866786678, "grad_norm": 0.7376987934112549, "learning_rate": 5.130255498707438e-05, "loss": 4.5616, "step": 1855 }, { "epoch": 0.6682268226822682, "grad_norm": 0.8142603039741516, "learning_rate": 5.120196693701267e-05, "loss": 4.4483, "step": 1856 }, { "epoch": 0.6685868586858685, "grad_norm": 0.558108389377594, "learning_rate": 5.1101443650663764e-05, "loss": 4.9678, "step": 1857 }, { "epoch": 0.668946894689469, "grad_norm": 0.8197848200798035, "learning_rate": 5.100098526144006e-05, "loss": 4.7343, "step": 1858 }, { "epoch": 0.6693069306930693, "grad_norm": 0.75601726770401, "learning_rate": 5.090059190266779e-05, "loss": 5.1422, "step": 1859 }, { "epoch": 0.6696669666966697, "grad_norm": 0.6538819074630737, "learning_rate": 5.0800263707586903e-05, "loss": 4.7189, "step": 1860 }, { "epoch": 0.67002700270027, "grad_norm": 1.0711504220962524, "learning_rate": 5.0700000809350836e-05, "loss": 4.703, "step": 1861 }, { "epoch": 0.6703870387038704, "grad_norm": 0.7144097089767456, "learning_rate": 5.059980334102637e-05, "loss": 4.3519, "step": 1862 }, { "epoch": 0.6707470747074707, "grad_norm": 0.801908016204834, "learning_rate": 5.049967143559349e-05, "loss": 4.7171, "step": 1863 }, { "epoch": 0.6711071107110711, "grad_norm": 0.9164612293243408, "learning_rate": 5.0399605225945135e-05, "loss": 4.7499, "step": 1864 }, { "epoch": 0.6714671467146714, "grad_norm": 0.6613209843635559, "learning_rate": 5.0299604844886985e-05, "loss": 4.6498, "step": 1865 }, { "epoch": 0.6718271827182718, "grad_norm": 0.8215205669403076, "learning_rate": 5.019967042513748e-05, "loss": 4.9471, "step": 1866 }, { "epoch": 0.6721872187218721, "grad_norm": 1.1121480464935303, "learning_rate": 5.009980209932743e-05, "loss": 5.0059, "step": 1867 }, { "epoch": 0.6725472547254725, "grad_norm": 0.516362190246582, "learning_rate": 5.000000000000002e-05, "loss": 4.5532, "step": 1868 }, { "epoch": 0.672907290729073, "grad_norm": 0.5757459998130798, "learning_rate": 4.990026425961038e-05, "loss": 4.6692, "step": 1869 }, { "epoch": 0.6732673267326733, "grad_norm": 0.6386864185333252, "learning_rate": 4.980059501052572e-05, "loss": 4.709, "step": 1870 }, { "epoch": 0.6736273627362737, "grad_norm": 1.0361477136611938, "learning_rate": 4.9700992385024934e-05, "loss": 5.1841, "step": 1871 }, { "epoch": 0.673987398739874, "grad_norm": 0.7029489874839783, "learning_rate": 4.960145651529856e-05, "loss": 4.6923, "step": 1872 }, { "epoch": 0.6743474347434744, "grad_norm": 0.9367014765739441, "learning_rate": 4.9501987533448413e-05, "loss": 5.4315, "step": 1873 }, { "epoch": 0.6747074707470747, "grad_norm": 1.1029951572418213, "learning_rate": 4.940258557148765e-05, "loss": 5.5143, "step": 1874 }, { "epoch": 0.6750675067506751, "grad_norm": 1.3592629432678223, "learning_rate": 4.930325076134042e-05, "loss": 5.2936, "step": 1875 }, { "epoch": 0.6754275427542754, "grad_norm": 2.5242362022399902, "learning_rate": 4.920398323484182e-05, "loss": 4.9574, "step": 1876 }, { "epoch": 0.6757875787578758, "grad_norm": 0.8447120189666748, "learning_rate": 4.9104783123737566e-05, "loss": 4.8343, "step": 1877 }, { "epoch": 0.6761476147614761, "grad_norm": 0.9122353196144104, "learning_rate": 4.9005650559683946e-05, "loss": 4.7467, "step": 1878 }, { "epoch": 0.6765076507650765, "grad_norm": 0.7419308423995972, "learning_rate": 4.890658567424763e-05, "loss": 4.8193, "step": 1879 }, { "epoch": 0.6768676867686768, "grad_norm": 0.9262195825576782, "learning_rate": 4.880758859890536e-05, "loss": 4.8589, "step": 1880 }, { "epoch": 0.6772277227722773, "grad_norm": 0.7551014423370361, "learning_rate": 4.8708659465043996e-05, "loss": 4.9234, "step": 1881 }, { "epoch": 0.6775877587758776, "grad_norm": 0.949427604675293, "learning_rate": 4.860979840396016e-05, "loss": 5.0352, "step": 1882 }, { "epoch": 0.677947794779478, "grad_norm": 0.545581042766571, "learning_rate": 4.851100554686021e-05, "loss": 4.7575, "step": 1883 }, { "epoch": 0.6783078307830783, "grad_norm": 1.049039363861084, "learning_rate": 4.841228102485984e-05, "loss": 4.9452, "step": 1884 }, { "epoch": 0.6786678667866787, "grad_norm": 0.7244091033935547, "learning_rate": 4.831362496898418e-05, "loss": 4.1711, "step": 1885 }, { "epoch": 0.679027902790279, "grad_norm": 0.6629701852798462, "learning_rate": 4.821503751016746e-05, "loss": 4.4553, "step": 1886 }, { "epoch": 0.6793879387938794, "grad_norm": 0.8092941045761108, "learning_rate": 4.8116518779252885e-05, "loss": 4.4547, "step": 1887 }, { "epoch": 0.6797479747974797, "grad_norm": 0.938777506351471, "learning_rate": 4.8018068906992356e-05, "loss": 4.8169, "step": 1888 }, { "epoch": 0.6801080108010801, "grad_norm": 0.8320968747138977, "learning_rate": 4.791968802404648e-05, "loss": 5.0336, "step": 1889 }, { "epoch": 0.6804680468046804, "grad_norm": 0.7015447020530701, "learning_rate": 4.7821376260984285e-05, "loss": 4.5182, "step": 1890 }, { "epoch": 0.6808280828082808, "grad_norm": 0.839224100112915, "learning_rate": 4.772313374828304e-05, "loss": 5.0579, "step": 1891 }, { "epoch": 0.6811881188118812, "grad_norm": 0.7794513702392578, "learning_rate": 4.762496061632814e-05, "loss": 4.5722, "step": 1892 }, { "epoch": 0.6815481548154816, "grad_norm": 0.8089141845703125, "learning_rate": 4.752685699541287e-05, "loss": 4.6715, "step": 1893 }, { "epoch": 0.6819081908190819, "grad_norm": 0.7713329195976257, "learning_rate": 4.742882301573828e-05, "loss": 4.6361, "step": 1894 }, { "epoch": 0.6822682268226823, "grad_norm": 0.5469356775283813, "learning_rate": 4.733085880741301e-05, "loss": 4.6219, "step": 1895 }, { "epoch": 0.6826282628262826, "grad_norm": 0.5594090223312378, "learning_rate": 4.7232964500453006e-05, "loss": 4.8541, "step": 1896 }, { "epoch": 0.682988298829883, "grad_norm": 1.0760741233825684, "learning_rate": 4.713514022478155e-05, "loss": 5.1101, "step": 1897 }, { "epoch": 0.6833483348334833, "grad_norm": 0.7999005317687988, "learning_rate": 4.7037386110228985e-05, "loss": 4.8297, "step": 1898 }, { "epoch": 0.6837083708370837, "grad_norm": 0.8169113397598267, "learning_rate": 4.6939702286532414e-05, "loss": 4.9598, "step": 1899 }, { "epoch": 0.684068406840684, "grad_norm": 1.252626895904541, "learning_rate": 4.684208888333577e-05, "loss": 5.188, "step": 1900 }, { "epoch": 0.6844284428442844, "grad_norm": 0.5477898716926575, "learning_rate": 4.6744546030189486e-05, "loss": 4.418, "step": 1901 }, { "epoch": 0.6847884788478847, "grad_norm": 0.5308620929718018, "learning_rate": 4.6647073856550415e-05, "loss": 4.9039, "step": 1902 }, { "epoch": 0.6851485148514852, "grad_norm": 0.49975287914276123, "learning_rate": 4.654967249178147e-05, "loss": 4.9186, "step": 1903 }, { "epoch": 0.6855085508550856, "grad_norm": 0.8217042684555054, "learning_rate": 4.645234206515171e-05, "loss": 4.643, "step": 1904 }, { "epoch": 0.6858685868586859, "grad_norm": 0.5679884552955627, "learning_rate": 4.635508270583601e-05, "loss": 4.3299, "step": 1905 }, { "epoch": 0.6862286228622863, "grad_norm": 0.6399744749069214, "learning_rate": 4.625789454291493e-05, "loss": 4.7144, "step": 1906 }, { "epoch": 0.6865886588658866, "grad_norm": 0.6763333678245544, "learning_rate": 4.6160777705374524e-05, "loss": 4.8988, "step": 1907 }, { "epoch": 0.686948694869487, "grad_norm": 0.6468039155006409, "learning_rate": 4.606373232210621e-05, "loss": 4.7513, "step": 1908 }, { "epoch": 0.6873087308730873, "grad_norm": 0.5941967964172363, "learning_rate": 4.596675852190656e-05, "loss": 4.522, "step": 1909 }, { "epoch": 0.6876687668766877, "grad_norm": 0.6434370279312134, "learning_rate": 4.586985643347717e-05, "loss": 4.5992, "step": 1910 }, { "epoch": 0.688028802880288, "grad_norm": 0.684079110622406, "learning_rate": 4.577302618542435e-05, "loss": 4.824, "step": 1911 }, { "epoch": 0.6883888388838884, "grad_norm": 0.5801157355308533, "learning_rate": 4.567626790625921e-05, "loss": 4.8966, "step": 1912 }, { "epoch": 0.6887488748874887, "grad_norm": 0.5670872330665588, "learning_rate": 4.5579581724397255e-05, "loss": 4.2661, "step": 1913 }, { "epoch": 0.689108910891089, "grad_norm": 0.521619439125061, "learning_rate": 4.548296776815839e-05, "loss": 4.4828, "step": 1914 }, { "epoch": 0.6894689468946895, "grad_norm": 0.6507243514060974, "learning_rate": 4.538642616576652e-05, "loss": 4.6493, "step": 1915 }, { "epoch": 0.6898289828982899, "grad_norm": 0.5946521759033203, "learning_rate": 4.5289957045349653e-05, "loss": 4.7168, "step": 1916 }, { "epoch": 0.6901890189018902, "grad_norm": 0.574393630027771, "learning_rate": 4.519356053493958e-05, "loss": 4.7364, "step": 1917 }, { "epoch": 0.6905490549054906, "grad_norm": 0.6550558805465698, "learning_rate": 4.5097236762471653e-05, "loss": 4.6582, "step": 1918 }, { "epoch": 0.6909090909090909, "grad_norm": 0.9266576766967773, "learning_rate": 4.5000985855784746e-05, "loss": 4.7063, "step": 1919 }, { "epoch": 0.6912691269126913, "grad_norm": 0.5542899966239929, "learning_rate": 4.490480794262104e-05, "loss": 4.8331, "step": 1920 }, { "epoch": 0.6916291629162916, "grad_norm": 0.8518589735031128, "learning_rate": 4.480870315062583e-05, "loss": 4.9934, "step": 1921 }, { "epoch": 0.691989198919892, "grad_norm": 0.9087819457054138, "learning_rate": 4.471267160734731e-05, "loss": 5.1262, "step": 1922 }, { "epoch": 0.6923492349234923, "grad_norm": 0.9683169722557068, "learning_rate": 4.4616713440236516e-05, "loss": 5.2386, "step": 1923 }, { "epoch": 0.6927092709270927, "grad_norm": 1.0893546342849731, "learning_rate": 4.4520828776647104e-05, "loss": 4.8366, "step": 1924 }, { "epoch": 0.693069306930693, "grad_norm": 1.2286421060562134, "learning_rate": 4.442501774383515e-05, "loss": 5.369, "step": 1925 }, { "epoch": 0.6934293429342935, "grad_norm": 0.5617548227310181, "learning_rate": 4.432928046895905e-05, "loss": 4.6432, "step": 1926 }, { "epoch": 0.6937893789378938, "grad_norm": 0.7067145109176636, "learning_rate": 4.4233617079079236e-05, "loss": 4.5953, "step": 1927 }, { "epoch": 0.6941494149414942, "grad_norm": 0.659268856048584, "learning_rate": 4.413802770115816e-05, "loss": 5.1397, "step": 1928 }, { "epoch": 0.6945094509450945, "grad_norm": 0.7078251242637634, "learning_rate": 4.404251246206005e-05, "loss": 4.7738, "step": 1929 }, { "epoch": 0.6948694869486949, "grad_norm": 0.5560519695281982, "learning_rate": 4.3947071488550605e-05, "loss": 4.49, "step": 1930 }, { "epoch": 0.6952295229522952, "grad_norm": 0.5539645552635193, "learning_rate": 4.385170490729712e-05, "loss": 4.4477, "step": 1931 }, { "epoch": 0.6955895589558956, "grad_norm": 0.7647258639335632, "learning_rate": 4.375641284486808e-05, "loss": 4.5653, "step": 1932 }, { "epoch": 0.6959495949594959, "grad_norm": 0.5726279020309448, "learning_rate": 4.366119542773314e-05, "loss": 4.6699, "step": 1933 }, { "epoch": 0.6963096309630963, "grad_norm": 0.9219176769256592, "learning_rate": 4.3566052782262735e-05, "loss": 4.6111, "step": 1934 }, { "epoch": 0.6966696669666966, "grad_norm": 0.8426400423049927, "learning_rate": 4.347098503472822e-05, "loss": 4.8859, "step": 1935 }, { "epoch": 0.697029702970297, "grad_norm": 0.7717829942703247, "learning_rate": 4.337599231130147e-05, "loss": 4.3902, "step": 1936 }, { "epoch": 0.6973897389738974, "grad_norm": 0.7202708125114441, "learning_rate": 4.328107473805487e-05, "loss": 4.8488, "step": 1937 }, { "epoch": 0.6977497749774978, "grad_norm": 0.8424999117851257, "learning_rate": 4.318623244096092e-05, "loss": 4.6686, "step": 1938 }, { "epoch": 0.6981098109810981, "grad_norm": 0.6789209842681885, "learning_rate": 4.309146554589234e-05, "loss": 4.8707, "step": 1939 }, { "epoch": 0.6984698469846985, "grad_norm": 0.8634017705917358, "learning_rate": 4.2996774178621736e-05, "loss": 4.8603, "step": 1940 }, { "epoch": 0.6988298829882988, "grad_norm": 0.7735937833786011, "learning_rate": 4.2902158464821496e-05, "loss": 4.7517, "step": 1941 }, { "epoch": 0.6991899189918992, "grad_norm": 0.6638203263282776, "learning_rate": 4.2807618530063565e-05, "loss": 4.5559, "step": 1942 }, { "epoch": 0.6995499549954995, "grad_norm": 0.7306031584739685, "learning_rate": 4.271315449981934e-05, "loss": 4.3512, "step": 1943 }, { "epoch": 0.6999099909990999, "grad_norm": 0.8207063674926758, "learning_rate": 4.2618766499459516e-05, "loss": 5.0055, "step": 1944 }, { "epoch": 0.7002700270027002, "grad_norm": 0.6139596700668335, "learning_rate": 4.2524454654253775e-05, "loss": 4.6997, "step": 1945 }, { "epoch": 0.7006300630063006, "grad_norm": 0.9638845324516296, "learning_rate": 4.2430219089370823e-05, "loss": 5.1796, "step": 1946 }, { "epoch": 0.700990099009901, "grad_norm": 1.1057206392288208, "learning_rate": 4.23360599298781e-05, "loss": 5.3861, "step": 1947 }, { "epoch": 0.7013501350135013, "grad_norm": 0.869283139705658, "learning_rate": 4.224197730074169e-05, "loss": 5.035, "step": 1948 }, { "epoch": 0.7017101710171018, "grad_norm": 0.7809675931930542, "learning_rate": 4.2147971326825966e-05, "loss": 5.0235, "step": 1949 }, { "epoch": 0.7020702070207021, "grad_norm": 1.561911940574646, "learning_rate": 4.20540421328937e-05, "loss": 5.2386, "step": 1950 }, { "epoch": 0.7024302430243025, "grad_norm": 0.843798816204071, "learning_rate": 4.1960189843605745e-05, "loss": 4.9271, "step": 1951 }, { "epoch": 0.7027902790279028, "grad_norm": 0.7116716504096985, "learning_rate": 4.1866414583520877e-05, "loss": 4.9151, "step": 1952 }, { "epoch": 0.7031503150315032, "grad_norm": 0.8110635280609131, "learning_rate": 4.177271647709556e-05, "loss": 4.8799, "step": 1953 }, { "epoch": 0.7035103510351035, "grad_norm": 0.8753562569618225, "learning_rate": 4.1679095648683986e-05, "loss": 4.6706, "step": 1954 }, { "epoch": 0.7038703870387039, "grad_norm": 0.9446102380752563, "learning_rate": 4.158555222253771e-05, "loss": 4.7998, "step": 1955 }, { "epoch": 0.7042304230423042, "grad_norm": 1.3820642232894897, "learning_rate": 4.149208632280559e-05, "loss": 4.7055, "step": 1956 }, { "epoch": 0.7045904590459046, "grad_norm": 1.0470118522644043, "learning_rate": 4.139869807353357e-05, "loss": 4.7841, "step": 1957 }, { "epoch": 0.7049504950495049, "grad_norm": 0.8367030024528503, "learning_rate": 4.130538759866457e-05, "loss": 5.2838, "step": 1958 }, { "epoch": 0.7053105310531053, "grad_norm": 0.6585936546325684, "learning_rate": 4.121215502203829e-05, "loss": 4.8096, "step": 1959 }, { "epoch": 0.7056705670567057, "grad_norm": 0.9417300224304199, "learning_rate": 4.1119000467390955e-05, "loss": 4.6286, "step": 1960 }, { "epoch": 0.7060306030603061, "grad_norm": 0.7592486143112183, "learning_rate": 4.102592405835536e-05, "loss": 5.1779, "step": 1961 }, { "epoch": 0.7063906390639064, "grad_norm": 0.6947286128997803, "learning_rate": 4.0932925918460516e-05, "loss": 4.6453, "step": 1962 }, { "epoch": 0.7067506750675068, "grad_norm": 0.5848962664604187, "learning_rate": 4.084000617113164e-05, "loss": 4.7674, "step": 1963 }, { "epoch": 0.7071107110711071, "grad_norm": 1.2515913248062134, "learning_rate": 4.074716493968975e-05, "loss": 4.6358, "step": 1964 }, { "epoch": 0.7074707470747075, "grad_norm": 0.7720615863800049, "learning_rate": 4.0654402347351814e-05, "loss": 4.4811, "step": 1965 }, { "epoch": 0.7078307830783078, "grad_norm": 0.7216893434524536, "learning_rate": 4.056171851723035e-05, "loss": 4.5832, "step": 1966 }, { "epoch": 0.7081908190819082, "grad_norm": 0.5522660613059998, "learning_rate": 4.046911357233343e-05, "loss": 4.75, "step": 1967 }, { "epoch": 0.7085508550855085, "grad_norm": 0.7386857271194458, "learning_rate": 4.037658763556428e-05, "loss": 4.4266, "step": 1968 }, { "epoch": 0.7089108910891089, "grad_norm": 0.8335764408111572, "learning_rate": 4.028414082972141e-05, "loss": 4.3608, "step": 1969 }, { "epoch": 0.7092709270927092, "grad_norm": 0.7491186261177063, "learning_rate": 4.019177327749822e-05, "loss": 4.8352, "step": 1970 }, { "epoch": 0.7096309630963097, "grad_norm": 0.9205876588821411, "learning_rate": 4.0099485101483014e-05, "loss": 5.0586, "step": 1971 }, { "epoch": 0.70999099909991, "grad_norm": 0.6963374018669128, "learning_rate": 4.000727642415867e-05, "loss": 4.499, "step": 1972 }, { "epoch": 0.7103510351035104, "grad_norm": 0.9542635083198547, "learning_rate": 3.991514736790258e-05, "loss": 5.1105, "step": 1973 }, { "epoch": 0.7107110711071107, "grad_norm": 1.1971359252929688, "learning_rate": 3.982309805498649e-05, "loss": 4.9849, "step": 1974 }, { "epoch": 0.7110711071107111, "grad_norm": 1.055363416671753, "learning_rate": 3.9731128607576306e-05, "loss": 5.2035, "step": 1975 }, { "epoch": 0.7114311431143114, "grad_norm": 2.2710304260253906, "learning_rate": 3.963923914773187e-05, "loss": 4.7704, "step": 1976 }, { "epoch": 0.7117911791179118, "grad_norm": 0.739315927028656, "learning_rate": 3.954742979740695e-05, "loss": 5.2393, "step": 1977 }, { "epoch": 0.7121512151215121, "grad_norm": 0.7492758631706238, "learning_rate": 3.945570067844901e-05, "loss": 4.7168, "step": 1978 }, { "epoch": 0.7125112511251125, "grad_norm": 0.4400799870491028, "learning_rate": 3.936405191259891e-05, "loss": 4.3876, "step": 1979 }, { "epoch": 0.7128712871287128, "grad_norm": 0.9049892425537109, "learning_rate": 3.927248362149097e-05, "loss": 4.5856, "step": 1980 }, { "epoch": 0.7132313231323132, "grad_norm": 0.6218405961990356, "learning_rate": 3.9180995926652705e-05, "loss": 4.8043, "step": 1981 }, { "epoch": 0.7135913591359135, "grad_norm": 0.6553037166595459, "learning_rate": 3.9089588949504655e-05, "loss": 4.805, "step": 1982 }, { "epoch": 0.713951395139514, "grad_norm": 0.45045459270477295, "learning_rate": 3.899826281136015e-05, "loss": 4.6216, "step": 1983 }, { "epoch": 0.7143114311431144, "grad_norm": 0.6989582180976868, "learning_rate": 3.890701763342536e-05, "loss": 4.9492, "step": 1984 }, { "epoch": 0.7146714671467147, "grad_norm": 0.7872725129127502, "learning_rate": 3.8815853536798904e-05, "loss": 4.9512, "step": 1985 }, { "epoch": 0.715031503150315, "grad_norm": 0.6595510244369507, "learning_rate": 3.8724770642471865e-05, "loss": 4.4377, "step": 1986 }, { "epoch": 0.7153915391539154, "grad_norm": 0.47658970952033997, "learning_rate": 3.863376907132752e-05, "loss": 4.6671, "step": 1987 }, { "epoch": 0.7157515751575158, "grad_norm": 0.42084112763404846, "learning_rate": 3.854284894414122e-05, "loss": 4.4798, "step": 1988 }, { "epoch": 0.7161116111611161, "grad_norm": 0.5337504148483276, "learning_rate": 3.8452010381580216e-05, "loss": 4.3987, "step": 1989 }, { "epoch": 0.7164716471647165, "grad_norm": 0.6366197466850281, "learning_rate": 3.836125350420358e-05, "loss": 5.2636, "step": 1990 }, { "epoch": 0.7168316831683168, "grad_norm": 0.8405591249465942, "learning_rate": 3.82705784324618e-05, "loss": 4.8786, "step": 1991 }, { "epoch": 0.7171917191719172, "grad_norm": 0.6100167632102966, "learning_rate": 3.8179985286696986e-05, "loss": 4.5409, "step": 1992 }, { "epoch": 0.7175517551755175, "grad_norm": 0.699909508228302, "learning_rate": 3.8089474187142406e-05, "loss": 4.8712, "step": 1993 }, { "epoch": 0.717911791179118, "grad_norm": 0.6978292465209961, "learning_rate": 3.79990452539225e-05, "loss": 4.9707, "step": 1994 }, { "epoch": 0.7182718271827183, "grad_norm": 0.8318164348602295, "learning_rate": 3.790869860705258e-05, "loss": 4.8339, "step": 1995 }, { "epoch": 0.7186318631863187, "grad_norm": 0.639952540397644, "learning_rate": 3.781843436643882e-05, "loss": 4.9529, "step": 1996 }, { "epoch": 0.718991899189919, "grad_norm": 0.8525128364562988, "learning_rate": 3.772825265187802e-05, "loss": 5.0666, "step": 1997 }, { "epoch": 0.7193519351935194, "grad_norm": 1.0021530389785767, "learning_rate": 3.763815358305743e-05, "loss": 5.1745, "step": 1998 }, { "epoch": 0.7197119711971197, "grad_norm": 1.0609560012817383, "learning_rate": 3.7548137279554586e-05, "loss": 5.1994, "step": 1999 }, { "epoch": 0.7200720072007201, "grad_norm": 1.7202980518341064, "learning_rate": 3.7458203860837234e-05, "loss": 5.4059, "step": 2000 }, { "epoch": 0.7204320432043204, "grad_norm": 1.2057273387908936, "learning_rate": 3.736835344626311e-05, "loss": 4.8807, "step": 2001 }, { "epoch": 0.7207920792079208, "grad_norm": 0.9880334734916687, "learning_rate": 3.727858615507974e-05, "loss": 5.044, "step": 2002 }, { "epoch": 0.7211521152115211, "grad_norm": 0.6543089747428894, "learning_rate": 3.7188902106424416e-05, "loss": 4.7401, "step": 2003 }, { "epoch": 0.7215121512151215, "grad_norm": 0.5077190399169922, "learning_rate": 3.709930141932386e-05, "loss": 5.0552, "step": 2004 }, { "epoch": 0.7218721872187218, "grad_norm": 0.6145045757293701, "learning_rate": 3.7009784212694265e-05, "loss": 5.1865, "step": 2005 }, { "epoch": 0.7222322232223223, "grad_norm": 0.49541333317756653, "learning_rate": 3.692035060534088e-05, "loss": 4.8607, "step": 2006 }, { "epoch": 0.7225922592259226, "grad_norm": 0.8183977007865906, "learning_rate": 3.683100071595813e-05, "loss": 4.4959, "step": 2007 }, { "epoch": 0.722952295229523, "grad_norm": 0.7221876382827759, "learning_rate": 3.674173466312928e-05, "loss": 4.3129, "step": 2008 }, { "epoch": 0.7233123312331233, "grad_norm": 0.9419858455657959, "learning_rate": 3.665255256532638e-05, "loss": 5.1136, "step": 2009 }, { "epoch": 0.7236723672367237, "grad_norm": 0.500443696975708, "learning_rate": 3.656345454090996e-05, "loss": 4.9675, "step": 2010 }, { "epoch": 0.724032403240324, "grad_norm": 0.5056512355804443, "learning_rate": 3.6474440708129045e-05, "loss": 4.9805, "step": 2011 }, { "epoch": 0.7243924392439244, "grad_norm": 0.734207808971405, "learning_rate": 3.638551118512089e-05, "loss": 4.5258, "step": 2012 }, { "epoch": 0.7247524752475247, "grad_norm": 0.6353861689567566, "learning_rate": 3.6296666089910936e-05, "loss": 5.0694, "step": 2013 }, { "epoch": 0.7251125112511251, "grad_norm": 0.6294865012168884, "learning_rate": 3.620790554041241e-05, "loss": 4.6572, "step": 2014 }, { "epoch": 0.7254725472547254, "grad_norm": 0.6333225965499878, "learning_rate": 3.611922965442648e-05, "loss": 4.6558, "step": 2015 }, { "epoch": 0.7258325832583258, "grad_norm": 0.6373658180236816, "learning_rate": 3.603063854964188e-05, "loss": 5.0726, "step": 2016 }, { "epoch": 0.7261926192619262, "grad_norm": 0.8722042441368103, "learning_rate": 3.594213234363486e-05, "loss": 4.6815, "step": 2017 }, { "epoch": 0.7265526552655266, "grad_norm": 0.7341203689575195, "learning_rate": 3.5853711153868965e-05, "loss": 4.957, "step": 2018 }, { "epoch": 0.7269126912691269, "grad_norm": 0.5686054825782776, "learning_rate": 3.5765375097694916e-05, "loss": 4.7206, "step": 2019 }, { "epoch": 0.7272727272727273, "grad_norm": 1.069074273109436, "learning_rate": 3.56771242923505e-05, "loss": 5.3025, "step": 2020 }, { "epoch": 0.7276327632763276, "grad_norm": 0.6375458836555481, "learning_rate": 3.558895885496023e-05, "loss": 4.6998, "step": 2021 }, { "epoch": 0.727992799279928, "grad_norm": 0.6588398218154907, "learning_rate": 3.550087890253544e-05, "loss": 4.9146, "step": 2022 }, { "epoch": 0.7283528352835283, "grad_norm": 1.0640164613723755, "learning_rate": 3.541288455197398e-05, "loss": 4.9861, "step": 2023 }, { "epoch": 0.7287128712871287, "grad_norm": 1.0872198343276978, "learning_rate": 3.53249759200601e-05, "loss": 5.1508, "step": 2024 }, { "epoch": 0.729072907290729, "grad_norm": 1.2077916860580444, "learning_rate": 3.523715312346421e-05, "loss": 5.3081, "step": 2025 }, { "epoch": 0.7294329432943294, "grad_norm": 1.2681756019592285, "learning_rate": 3.51494162787429e-05, "loss": 4.9204, "step": 2026 }, { "epoch": 0.7297929792979297, "grad_norm": 0.5897660255432129, "learning_rate": 3.506176550233863e-05, "loss": 4.4497, "step": 2027 }, { "epoch": 0.7301530153015302, "grad_norm": 0.7174355983734131, "learning_rate": 3.497420091057969e-05, "loss": 4.9662, "step": 2028 }, { "epoch": 0.7305130513051306, "grad_norm": 0.6896499991416931, "learning_rate": 3.488672261967989e-05, "loss": 4.8752, "step": 2029 }, { "epoch": 0.7308730873087309, "grad_norm": 0.9643046259880066, "learning_rate": 3.479933074573858e-05, "loss": 4.723, "step": 2030 }, { "epoch": 0.7312331233123313, "grad_norm": 0.6577152609825134, "learning_rate": 3.47120254047404e-05, "loss": 4.6554, "step": 2031 }, { "epoch": 0.7315931593159316, "grad_norm": 0.6549072265625, "learning_rate": 3.462480671255515e-05, "loss": 4.7597, "step": 2032 }, { "epoch": 0.731953195319532, "grad_norm": 0.595219075679779, "learning_rate": 3.4537674784937614e-05, "loss": 4.6868, "step": 2033 }, { "epoch": 0.7323132313231323, "grad_norm": 0.6932517290115356, "learning_rate": 3.445062973752745e-05, "loss": 4.557, "step": 2034 }, { "epoch": 0.7326732673267327, "grad_norm": 0.6161606907844543, "learning_rate": 3.4363671685848986e-05, "loss": 4.7073, "step": 2035 }, { "epoch": 0.733033303330333, "grad_norm": 0.6454643607139587, "learning_rate": 3.427680074531113e-05, "loss": 4.5563, "step": 2036 }, { "epoch": 0.7333933393339334, "grad_norm": 0.7846769094467163, "learning_rate": 3.419001703120709e-05, "loss": 4.4964, "step": 2037 }, { "epoch": 0.7337533753375337, "grad_norm": 0.9348975419998169, "learning_rate": 3.410332065871441e-05, "loss": 4.4099, "step": 2038 }, { "epoch": 0.7341134113411341, "grad_norm": 0.5604249835014343, "learning_rate": 3.401671174289469e-05, "loss": 4.6552, "step": 2039 }, { "epoch": 0.7344734473447345, "grad_norm": 0.593227744102478, "learning_rate": 3.393019039869338e-05, "loss": 4.441, "step": 2040 }, { "epoch": 0.7348334833483349, "grad_norm": 1.0157716274261475, "learning_rate": 3.3843756740939817e-05, "loss": 4.5417, "step": 2041 }, { "epoch": 0.7351935193519352, "grad_norm": 0.9067453145980835, "learning_rate": 3.3757410884346894e-05, "loss": 5.033, "step": 2042 }, { "epoch": 0.7355535553555356, "grad_norm": 0.6749830842018127, "learning_rate": 3.367115294351104e-05, "loss": 4.8072, "step": 2043 }, { "epoch": 0.7359135913591359, "grad_norm": 0.4967857897281647, "learning_rate": 3.358498303291191e-05, "loss": 4.5516, "step": 2044 }, { "epoch": 0.7362736273627363, "grad_norm": 0.5756816864013672, "learning_rate": 3.3498901266912396e-05, "loss": 4.6619, "step": 2045 }, { "epoch": 0.7366336633663366, "grad_norm": 0.7196652889251709, "learning_rate": 3.3412907759758385e-05, "loss": 4.9634, "step": 2046 }, { "epoch": 0.736993699369937, "grad_norm": 0.5623925924301147, "learning_rate": 3.332700262557864e-05, "loss": 4.8888, "step": 2047 }, { "epoch": 0.7373537353735373, "grad_norm": 1.13788640499115, "learning_rate": 3.324118597838464e-05, "loss": 5.2636, "step": 2048 }, { "epoch": 0.7377137713771377, "grad_norm": 1.3406879901885986, "learning_rate": 3.31554579320704e-05, "loss": 5.2915, "step": 2049 }, { "epoch": 0.738073807380738, "grad_norm": 1.0955404043197632, "learning_rate": 3.3069818600412375e-05, "loss": 5.0604, "step": 2050 }, { "epoch": 0.7384338433843385, "grad_norm": 2.2472808361053467, "learning_rate": 3.298426809706928e-05, "loss": 4.6431, "step": 2051 }, { "epoch": 0.7387938793879388, "grad_norm": 0.5622848272323608, "learning_rate": 3.289880653558188e-05, "loss": 4.9647, "step": 2052 }, { "epoch": 0.7391539153915392, "grad_norm": 0.537049412727356, "learning_rate": 3.281343402937297e-05, "loss": 4.8475, "step": 2053 }, { "epoch": 0.7395139513951395, "grad_norm": 0.7046445608139038, "learning_rate": 3.2728150691747115e-05, "loss": 4.8618, "step": 2054 }, { "epoch": 0.7398739873987399, "grad_norm": 0.5483196973800659, "learning_rate": 3.264295663589061e-05, "loss": 4.7506, "step": 2055 }, { "epoch": 0.7402340234023402, "grad_norm": 0.7228554487228394, "learning_rate": 3.25578519748711e-05, "loss": 4.9671, "step": 2056 }, { "epoch": 0.7405940594059406, "grad_norm": 0.7139778733253479, "learning_rate": 3.2472836821637744e-05, "loss": 4.837, "step": 2057 }, { "epoch": 0.7409540954095409, "grad_norm": 0.6157667636871338, "learning_rate": 3.238791128902082e-05, "loss": 4.6113, "step": 2058 }, { "epoch": 0.7413141314131413, "grad_norm": 1.1201602220535278, "learning_rate": 3.230307548973174e-05, "loss": 4.7593, "step": 2059 }, { "epoch": 0.7416741674167416, "grad_norm": 0.6364259123802185, "learning_rate": 3.2218329536362704e-05, "loss": 4.4534, "step": 2060 }, { "epoch": 0.742034203420342, "grad_norm": 0.8327221870422363, "learning_rate": 3.213367354138678e-05, "loss": 5.1408, "step": 2061 }, { "epoch": 0.7423942394239424, "grad_norm": 0.624847948551178, "learning_rate": 3.204910761715763e-05, "loss": 4.5746, "step": 2062 }, { "epoch": 0.7427542754275428, "grad_norm": 0.7800330519676208, "learning_rate": 3.196463187590929e-05, "loss": 4.8576, "step": 2063 }, { "epoch": 0.7431143114311431, "grad_norm": 1.7853704690933228, "learning_rate": 3.18802464297562e-05, "loss": 4.8703, "step": 2064 }, { "epoch": 0.7434743474347435, "grad_norm": 0.6822658777236938, "learning_rate": 3.17959513906929e-05, "loss": 4.7471, "step": 2065 }, { "epoch": 0.7438343834383438, "grad_norm": 0.9525924921035767, "learning_rate": 3.1711746870594086e-05, "loss": 4.9575, "step": 2066 }, { "epoch": 0.7441944194419442, "grad_norm": 1.0060420036315918, "learning_rate": 3.162763298121408e-05, "loss": 4.2924, "step": 2067 }, { "epoch": 0.7445544554455445, "grad_norm": 0.8173012733459473, "learning_rate": 3.1543609834187115e-05, "loss": 5.1242, "step": 2068 }, { "epoch": 0.7449144914491449, "grad_norm": 0.6984362602233887, "learning_rate": 3.145967754102691e-05, "loss": 4.5351, "step": 2069 }, { "epoch": 0.7452745274527453, "grad_norm": 0.6665016412734985, "learning_rate": 3.137583621312665e-05, "loss": 4.6194, "step": 2070 }, { "epoch": 0.7456345634563456, "grad_norm": 1.0984699726104736, "learning_rate": 3.129208596175872e-05, "loss": 4.9804, "step": 2071 }, { "epoch": 0.745994599459946, "grad_norm": 0.7321772575378418, "learning_rate": 3.120842689807468e-05, "loss": 4.9711, "step": 2072 }, { "epoch": 0.7463546354635463, "grad_norm": 1.0770906209945679, "learning_rate": 3.112485913310508e-05, "loss": 5.2855, "step": 2073 }, { "epoch": 0.7467146714671468, "grad_norm": 0.8592466711997986, "learning_rate": 3.10413827777593e-05, "loss": 4.9253, "step": 2074 }, { "epoch": 0.7470747074707471, "grad_norm": 1.0927594900131226, "learning_rate": 3.0957997942825336e-05, "loss": 5.1627, "step": 2075 }, { "epoch": 0.7474347434743475, "grad_norm": 1.0298612117767334, "learning_rate": 3.0874704738969794e-05, "loss": 5.1717, "step": 2076 }, { "epoch": 0.7477947794779478, "grad_norm": 0.8015543222427368, "learning_rate": 3.079150327673766e-05, "loss": 4.6964, "step": 2077 }, { "epoch": 0.7481548154815482, "grad_norm": 0.7126684188842773, "learning_rate": 3.070839366655215e-05, "loss": 4.9846, "step": 2078 }, { "epoch": 0.7485148514851485, "grad_norm": 1.5721231698989868, "learning_rate": 3.062537601871452e-05, "loss": 4.8304, "step": 2079 }, { "epoch": 0.7488748874887489, "grad_norm": 0.6689034104347229, "learning_rate": 3.054245044340408e-05, "loss": 4.6386, "step": 2080 }, { "epoch": 0.7492349234923492, "grad_norm": 1.0003162622451782, "learning_rate": 3.0459617050677868e-05, "loss": 4.9661, "step": 2081 }, { "epoch": 0.7495949594959496, "grad_norm": 0.7442778944969177, "learning_rate": 3.0376875950470617e-05, "loss": 4.4624, "step": 2082 }, { "epoch": 0.7499549954995499, "grad_norm": 1.011825442314148, "learning_rate": 3.0294227252594555e-05, "loss": 4.8084, "step": 2083 }, { "epoch": 0.7503150315031503, "grad_norm": 0.5705128908157349, "learning_rate": 3.021167106673928e-05, "loss": 4.7329, "step": 2084 }, { "epoch": 0.7506750675067507, "grad_norm": 0.4860425889492035, "learning_rate": 3.0129207502471625e-05, "loss": 4.6015, "step": 2085 }, { "epoch": 0.7506750675067507, "eval_loss": 4.810147762298584, "eval_runtime": 101.7868, "eval_samples_per_second": 45.959, "eval_steps_per_second": 11.495, "step": 2085 }, { "epoch": 0.7510351035103511, "grad_norm": 0.671138346195221, "learning_rate": 3.0046836669235433e-05, "loss": 4.6252, "step": 2086 }, { "epoch": 0.7513951395139514, "grad_norm": 0.9334102869033813, "learning_rate": 2.996455867635155e-05, "loss": 4.5999, "step": 2087 }, { "epoch": 0.7517551755175518, "grad_norm": 0.8894442319869995, "learning_rate": 2.988237363301758e-05, "loss": 5.0383, "step": 2088 }, { "epoch": 0.7521152115211521, "grad_norm": 0.6898563504219055, "learning_rate": 2.9800281648307794e-05, "loss": 4.8665, "step": 2089 }, { "epoch": 0.7524752475247525, "grad_norm": 0.6982426047325134, "learning_rate": 2.9718282831172883e-05, "loss": 4.8052, "step": 2090 }, { "epoch": 0.7528352835283528, "grad_norm": 0.6563032865524292, "learning_rate": 2.9636377290439944e-05, "loss": 4.4563, "step": 2091 }, { "epoch": 0.7531953195319532, "grad_norm": 0.747215986251831, "learning_rate": 2.9554565134812294e-05, "loss": 5.0272, "step": 2092 }, { "epoch": 0.7535553555355535, "grad_norm": 0.842694878578186, "learning_rate": 2.9472846472869298e-05, "loss": 5.1278, "step": 2093 }, { "epoch": 0.7539153915391539, "grad_norm": 1.041561484336853, "learning_rate": 2.9391221413066182e-05, "loss": 5.2336, "step": 2094 }, { "epoch": 0.7542754275427542, "grad_norm": 0.7632551193237305, "learning_rate": 2.930969006373402e-05, "loss": 4.7186, "step": 2095 }, { "epoch": 0.7546354635463547, "grad_norm": 1.0100700855255127, "learning_rate": 2.922825253307947e-05, "loss": 5.0076, "step": 2096 }, { "epoch": 0.754995499549955, "grad_norm": 0.5503267049789429, "learning_rate": 2.9146908929184713e-05, "loss": 4.6652, "step": 2097 }, { "epoch": 0.7553555355535554, "grad_norm": 0.7919767498970032, "learning_rate": 2.9065659360007247e-05, "loss": 5.4393, "step": 2098 }, { "epoch": 0.7557155715571557, "grad_norm": 1.0583096742630005, "learning_rate": 2.898450393337977e-05, "loss": 5.2044, "step": 2099 }, { "epoch": 0.7560756075607561, "grad_norm": 1.1439197063446045, "learning_rate": 2.8903442757010035e-05, "loss": 5.2308, "step": 2100 }, { "epoch": 0.7564356435643564, "grad_norm": 1.3537611961364746, "learning_rate": 2.8822475938480764e-05, "loss": 4.7218, "step": 2101 }, { "epoch": 0.7567956795679568, "grad_norm": 0.6820851564407349, "learning_rate": 2.874160358524931e-05, "loss": 4.7861, "step": 2102 }, { "epoch": 0.7571557155715571, "grad_norm": 1.5382546186447144, "learning_rate": 2.8660825804647795e-05, "loss": 4.6253, "step": 2103 }, { "epoch": 0.7575157515751575, "grad_norm": 0.9953200221061707, "learning_rate": 2.8580142703882796e-05, "loss": 4.5608, "step": 2104 }, { "epoch": 0.7578757875787578, "grad_norm": 0.7481911778450012, "learning_rate": 2.8499554390035143e-05, "loss": 4.9828, "step": 2105 }, { "epoch": 0.7582358235823582, "grad_norm": 0.8973466753959656, "learning_rate": 2.8419060970059974e-05, "loss": 4.4942, "step": 2106 }, { "epoch": 0.7585958595859585, "grad_norm": 0.6188588738441467, "learning_rate": 2.8338662550786443e-05, "loss": 4.917, "step": 2107 }, { "epoch": 0.758955895589559, "grad_norm": 0.8116443157196045, "learning_rate": 2.8258359238917665e-05, "loss": 4.8898, "step": 2108 }, { "epoch": 0.7593159315931594, "grad_norm": 0.996688187122345, "learning_rate": 2.8178151141030406e-05, "loss": 4.6581, "step": 2109 }, { "epoch": 0.7596759675967597, "grad_norm": 1.0279045104980469, "learning_rate": 2.8098038363575186e-05, "loss": 4.8797, "step": 2110 }, { "epoch": 0.76003600360036, "grad_norm": 0.7636517286300659, "learning_rate": 2.8018021012875994e-05, "loss": 4.4845, "step": 2111 }, { "epoch": 0.7603960396039604, "grad_norm": 0.8215141892433167, "learning_rate": 2.7938099195130153e-05, "loss": 4.7628, "step": 2112 }, { "epoch": 0.7607560756075608, "grad_norm": 1.067277431488037, "learning_rate": 2.7858273016408197e-05, "loss": 5.2078, "step": 2113 }, { "epoch": 0.7611161116111611, "grad_norm": 0.7776371240615845, "learning_rate": 2.7778542582653744e-05, "loss": 5.1417, "step": 2114 }, { "epoch": 0.7614761476147615, "grad_norm": 0.8139586448669434, "learning_rate": 2.769890799968332e-05, "loss": 4.8309, "step": 2115 }, { "epoch": 0.7618361836183618, "grad_norm": 0.7961241006851196, "learning_rate": 2.7619369373186288e-05, "loss": 4.9364, "step": 2116 }, { "epoch": 0.7621962196219622, "grad_norm": 0.6579238176345825, "learning_rate": 2.753992680872457e-05, "loss": 4.8429, "step": 2117 }, { "epoch": 0.7625562556255625, "grad_norm": 0.6686379909515381, "learning_rate": 2.746058041173266e-05, "loss": 4.8558, "step": 2118 }, { "epoch": 0.762916291629163, "grad_norm": 0.6807066202163696, "learning_rate": 2.7381330287517426e-05, "loss": 4.4459, "step": 2119 }, { "epoch": 0.7632763276327633, "grad_norm": 0.6293361783027649, "learning_rate": 2.7302176541257986e-05, "loss": 4.898, "step": 2120 }, { "epoch": 0.7636363636363637, "grad_norm": 0.8723804950714111, "learning_rate": 2.7223119278005438e-05, "loss": 4.6484, "step": 2121 }, { "epoch": 0.763996399639964, "grad_norm": 0.848070502281189, "learning_rate": 2.7144158602682924e-05, "loss": 5.2841, "step": 2122 }, { "epoch": 0.7643564356435644, "grad_norm": 0.9075490832328796, "learning_rate": 2.7065294620085424e-05, "loss": 5.2137, "step": 2123 }, { "epoch": 0.7647164716471647, "grad_norm": 1.0051218271255493, "learning_rate": 2.6986527434879472e-05, "loss": 5.1463, "step": 2124 }, { "epoch": 0.7650765076507651, "grad_norm": 1.058005928993225, "learning_rate": 2.6907857151603234e-05, "loss": 5.2255, "step": 2125 }, { "epoch": 0.7654365436543654, "grad_norm": 1.0294654369354248, "learning_rate": 2.6829283874666233e-05, "loss": 4.6688, "step": 2126 }, { "epoch": 0.7657965796579658, "grad_norm": 0.8858208656311035, "learning_rate": 2.6750807708349267e-05, "loss": 4.6658, "step": 2127 }, { "epoch": 0.7661566156615661, "grad_norm": 0.8013286590576172, "learning_rate": 2.6672428756804225e-05, "loss": 4.849, "step": 2128 }, { "epoch": 0.7665166516651665, "grad_norm": 0.7389269471168518, "learning_rate": 2.659414712405398e-05, "loss": 4.4728, "step": 2129 }, { "epoch": 0.7668766876687669, "grad_norm": 0.6583431363105774, "learning_rate": 2.6515962913992275e-05, "loss": 4.9133, "step": 2130 }, { "epoch": 0.7672367236723673, "grad_norm": 0.6830927133560181, "learning_rate": 2.643787623038354e-05, "loss": 4.7863, "step": 2131 }, { "epoch": 0.7675967596759676, "grad_norm": 0.6234518885612488, "learning_rate": 2.6359887176862718e-05, "loss": 4.6174, "step": 2132 }, { "epoch": 0.767956795679568, "grad_norm": 0.7106865048408508, "learning_rate": 2.6281995856935237e-05, "loss": 4.7574, "step": 2133 }, { "epoch": 0.7683168316831683, "grad_norm": 0.9051821231842041, "learning_rate": 2.6204202373976818e-05, "loss": 4.8563, "step": 2134 }, { "epoch": 0.7686768676867687, "grad_norm": 0.6196146011352539, "learning_rate": 2.6126506831233344e-05, "loss": 4.5112, "step": 2135 }, { "epoch": 0.769036903690369, "grad_norm": 0.6337679624557495, "learning_rate": 2.6048909331820636e-05, "loss": 4.6024, "step": 2136 }, { "epoch": 0.7693969396939694, "grad_norm": 0.6263675689697266, "learning_rate": 2.5971409978724458e-05, "loss": 4.6405, "step": 2137 }, { "epoch": 0.7697569756975697, "grad_norm": 1.0419749021530151, "learning_rate": 2.5894008874800325e-05, "loss": 4.7905, "step": 2138 }, { "epoch": 0.7701170117011701, "grad_norm": 0.8129563927650452, "learning_rate": 2.581670612277335e-05, "loss": 4.4431, "step": 2139 }, { "epoch": 0.7704770477047704, "grad_norm": 0.7132530212402344, "learning_rate": 2.5739501825238053e-05, "loss": 4.7698, "step": 2140 }, { "epoch": 0.7708370837083708, "grad_norm": 0.6291266083717346, "learning_rate": 2.566239608465838e-05, "loss": 4.5729, "step": 2141 }, { "epoch": 0.7711971197119712, "grad_norm": 0.5569373965263367, "learning_rate": 2.558538900336741e-05, "loss": 4.9298, "step": 2142 }, { "epoch": 0.7715571557155716, "grad_norm": 0.72600257396698, "learning_rate": 2.5508480683567315e-05, "loss": 4.8837, "step": 2143 }, { "epoch": 0.771917191719172, "grad_norm": 0.7014186978340149, "learning_rate": 2.543167122732918e-05, "loss": 4.6547, "step": 2144 }, { "epoch": 0.7722772277227723, "grad_norm": 0.8487231731414795, "learning_rate": 2.5354960736592883e-05, "loss": 4.6923, "step": 2145 }, { "epoch": 0.7726372637263726, "grad_norm": 0.6868178844451904, "learning_rate": 2.5278349313166992e-05, "loss": 4.6032, "step": 2146 }, { "epoch": 0.772997299729973, "grad_norm": 0.739762544631958, "learning_rate": 2.5201837058728505e-05, "loss": 4.8253, "step": 2147 }, { "epoch": 0.7733573357335733, "grad_norm": 1.2171458005905151, "learning_rate": 2.512542407482289e-05, "loss": 5.0895, "step": 2148 }, { "epoch": 0.7737173717371737, "grad_norm": 1.3037205934524536, "learning_rate": 2.504911046286382e-05, "loss": 4.9572, "step": 2149 }, { "epoch": 0.774077407740774, "grad_norm": 1.3282265663146973, "learning_rate": 2.4972896324133144e-05, "loss": 5.4213, "step": 2150 }, { "epoch": 0.7744374437443744, "grad_norm": 1.392838716506958, "learning_rate": 2.4896781759780585e-05, "loss": 4.7398, "step": 2151 }, { "epoch": 0.7747974797479747, "grad_norm": 0.6054560542106628, "learning_rate": 2.4820766870823807e-05, "loss": 4.7054, "step": 2152 }, { "epoch": 0.7751575157515752, "grad_norm": 0.6699235439300537, "learning_rate": 2.4744851758148156e-05, "loss": 4.6727, "step": 2153 }, { "epoch": 0.7755175517551756, "grad_norm": 0.5790958404541016, "learning_rate": 2.4669036522506584e-05, "loss": 4.8101, "step": 2154 }, { "epoch": 0.7758775877587759, "grad_norm": 0.8083108067512512, "learning_rate": 2.45933212645194e-05, "loss": 4.5601, "step": 2155 }, { "epoch": 0.7762376237623763, "grad_norm": 1.0053972005844116, "learning_rate": 2.451770608467432e-05, "loss": 4.6501, "step": 2156 }, { "epoch": 0.7765976597659766, "grad_norm": 0.6478819847106934, "learning_rate": 2.4442191083326195e-05, "loss": 4.6527, "step": 2157 }, { "epoch": 0.776957695769577, "grad_norm": 1.433661937713623, "learning_rate": 2.4366776360696942e-05, "loss": 5.0813, "step": 2158 }, { "epoch": 0.7773177317731773, "grad_norm": 1.1521214246749878, "learning_rate": 2.429146201687538e-05, "loss": 4.5685, "step": 2159 }, { "epoch": 0.7776777677767777, "grad_norm": 0.6991515159606934, "learning_rate": 2.42162481518171e-05, "loss": 4.7082, "step": 2160 }, { "epoch": 0.778037803780378, "grad_norm": 0.8507077693939209, "learning_rate": 2.414113486534434e-05, "loss": 4.6635, "step": 2161 }, { "epoch": 0.7783978397839784, "grad_norm": 0.6432574987411499, "learning_rate": 2.4066122257145894e-05, "loss": 4.5532, "step": 2162 }, { "epoch": 0.7787578757875787, "grad_norm": 0.6845300793647766, "learning_rate": 2.3991210426776855e-05, "loss": 4.6837, "step": 2163 }, { "epoch": 0.7791179117911791, "grad_norm": 0.743084728717804, "learning_rate": 2.3916399473658623e-05, "loss": 4.7629, "step": 2164 }, { "epoch": 0.7794779477947795, "grad_norm": 0.7912582159042358, "learning_rate": 2.3841689497078746e-05, "loss": 4.8711, "step": 2165 }, { "epoch": 0.7798379837983799, "grad_norm": 0.6151897311210632, "learning_rate": 2.376708059619065e-05, "loss": 4.5838, "step": 2166 }, { "epoch": 0.7801980198019802, "grad_norm": 0.8373451828956604, "learning_rate": 2.3692572870013718e-05, "loss": 4.9176, "step": 2167 }, { "epoch": 0.7805580558055806, "grad_norm": 0.5105034708976746, "learning_rate": 2.361816641743303e-05, "loss": 4.5368, "step": 2168 }, { "epoch": 0.7809180918091809, "grad_norm": 0.6912795305252075, "learning_rate": 2.354386133719927e-05, "loss": 4.3363, "step": 2169 }, { "epoch": 0.7812781278127813, "grad_norm": 0.8666832447052002, "learning_rate": 2.3469657727928506e-05, "loss": 4.5923, "step": 2170 }, { "epoch": 0.7816381638163816, "grad_norm": 0.8544076085090637, "learning_rate": 2.339555568810221e-05, "loss": 4.9953, "step": 2171 }, { "epoch": 0.781998199819982, "grad_norm": 1.0247130393981934, "learning_rate": 2.3321555316067045e-05, "loss": 5.4613, "step": 2172 }, { "epoch": 0.7823582358235823, "grad_norm": 1.0253169536590576, "learning_rate": 2.3247656710034737e-05, "loss": 5.1228, "step": 2173 }, { "epoch": 0.7827182718271827, "grad_norm": 1.0105392932891846, "learning_rate": 2.3173859968081944e-05, "loss": 4.7797, "step": 2174 }, { "epoch": 0.783078307830783, "grad_norm": 0.8575915098190308, "learning_rate": 2.3100165188150125e-05, "loss": 5.0751, "step": 2175 }, { "epoch": 0.7834383438343835, "grad_norm": 1.8292618989944458, "learning_rate": 2.3026572468045437e-05, "loss": 4.8729, "step": 2176 }, { "epoch": 0.7837983798379838, "grad_norm": 0.7877120971679688, "learning_rate": 2.295308190543859e-05, "loss": 4.3804, "step": 2177 }, { "epoch": 0.7841584158415842, "grad_norm": 2.0647926330566406, "learning_rate": 2.287969359786466e-05, "loss": 4.7416, "step": 2178 }, { "epoch": 0.7845184518451845, "grad_norm": 1.065865159034729, "learning_rate": 2.280640764272306e-05, "loss": 4.7786, "step": 2179 }, { "epoch": 0.7848784878487849, "grad_norm": 0.7052769064903259, "learning_rate": 2.2733224137277366e-05, "loss": 4.9935, "step": 2180 }, { "epoch": 0.7852385238523852, "grad_norm": 0.8767135739326477, "learning_rate": 2.266014317865519e-05, "loss": 4.7496, "step": 2181 }, { "epoch": 0.7855985598559856, "grad_norm": 0.5927574634552002, "learning_rate": 2.2587164863847975e-05, "loss": 5.0468, "step": 2182 }, { "epoch": 0.7859585958595859, "grad_norm": 1.1241964101791382, "learning_rate": 2.251428928971102e-05, "loss": 4.837, "step": 2183 }, { "epoch": 0.7863186318631863, "grad_norm": 0.5535566806793213, "learning_rate": 2.244151655296327e-05, "loss": 4.5108, "step": 2184 }, { "epoch": 0.7866786678667866, "grad_norm": 0.8191620111465454, "learning_rate": 2.236884675018709e-05, "loss": 4.9146, "step": 2185 }, { "epoch": 0.787038703870387, "grad_norm": 0.7227510809898376, "learning_rate": 2.2296279977828337e-05, "loss": 4.4775, "step": 2186 }, { "epoch": 0.7873987398739875, "grad_norm": 0.5830511450767517, "learning_rate": 2.222381633219608e-05, "loss": 4.604, "step": 2187 }, { "epoch": 0.7877587758775878, "grad_norm": 0.6540189981460571, "learning_rate": 2.2151455909462538e-05, "loss": 4.5976, "step": 2188 }, { "epoch": 0.7881188118811882, "grad_norm": 0.6274272203445435, "learning_rate": 2.2079198805662914e-05, "loss": 4.5981, "step": 2189 }, { "epoch": 0.7884788478847885, "grad_norm": 0.9411934018135071, "learning_rate": 2.2007045116695313e-05, "loss": 4.6338, "step": 2190 }, { "epoch": 0.7888388838883889, "grad_norm": 0.640446662902832, "learning_rate": 2.1934994938320584e-05, "loss": 4.5651, "step": 2191 }, { "epoch": 0.7891989198919892, "grad_norm": 0.6163681149482727, "learning_rate": 2.1863048366162208e-05, "loss": 4.4887, "step": 2192 }, { "epoch": 0.7895589558955896, "grad_norm": 0.6504824757575989, "learning_rate": 2.179120549570609e-05, "loss": 4.84, "step": 2193 }, { "epoch": 0.7899189918991899, "grad_norm": 0.6089352965354919, "learning_rate": 2.1719466422300607e-05, "loss": 4.8128, "step": 2194 }, { "epoch": 0.7902790279027903, "grad_norm": 0.8295852541923523, "learning_rate": 2.1647831241156302e-05, "loss": 5.124, "step": 2195 }, { "epoch": 0.7906390639063906, "grad_norm": 0.6523507237434387, "learning_rate": 2.1576300047345932e-05, "loss": 4.6459, "step": 2196 }, { "epoch": 0.790999099909991, "grad_norm": 0.6584704518318176, "learning_rate": 2.15048729358041e-05, "loss": 4.6353, "step": 2197 }, { "epoch": 0.7913591359135913, "grad_norm": 0.9848087430000305, "learning_rate": 2.1433550001327373e-05, "loss": 5.323, "step": 2198 }, { "epoch": 0.7917191719171918, "grad_norm": 0.8073936104774475, "learning_rate": 2.136233133857405e-05, "loss": 5.1241, "step": 2199 }, { "epoch": 0.7920792079207921, "grad_norm": 1.2630630731582642, "learning_rate": 2.129121704206405e-05, "loss": 5.2807, "step": 2200 }, { "epoch": 0.7924392439243925, "grad_norm": 1.061435580253601, "learning_rate": 2.1220207206178688e-05, "loss": 4.6137, "step": 2201 }, { "epoch": 0.7927992799279928, "grad_norm": 0.7881227731704712, "learning_rate": 2.114930192516076e-05, "loss": 4.5962, "step": 2202 }, { "epoch": 0.7931593159315932, "grad_norm": 0.7256401777267456, "learning_rate": 2.107850129311426e-05, "loss": 4.8192, "step": 2203 }, { "epoch": 0.7935193519351935, "grad_norm": 0.9993703365325928, "learning_rate": 2.1007805404004242e-05, "loss": 5.1882, "step": 2204 }, { "epoch": 0.7938793879387939, "grad_norm": 0.714908242225647, "learning_rate": 2.09372143516568e-05, "loss": 4.6632, "step": 2205 }, { "epoch": 0.7942394239423942, "grad_norm": 0.9289857745170593, "learning_rate": 2.0866728229758857e-05, "loss": 4.8051, "step": 2206 }, { "epoch": 0.7945994599459946, "grad_norm": 1.0063886642456055, "learning_rate": 2.0796347131858186e-05, "loss": 4.8422, "step": 2207 }, { "epoch": 0.7949594959495949, "grad_norm": 0.7214351296424866, "learning_rate": 2.072607115136298e-05, "loss": 4.639, "step": 2208 }, { "epoch": 0.7953195319531953, "grad_norm": 0.7175842523574829, "learning_rate": 2.065590038154209e-05, "loss": 5.0236, "step": 2209 }, { "epoch": 0.7956795679567957, "grad_norm": 0.5457351207733154, "learning_rate": 2.058583491552465e-05, "loss": 4.5711, "step": 2210 }, { "epoch": 0.7960396039603961, "grad_norm": 0.6841213703155518, "learning_rate": 2.0515874846300077e-05, "loss": 4.6929, "step": 2211 }, { "epoch": 0.7963996399639964, "grad_norm": 0.8968706727027893, "learning_rate": 2.044602026671786e-05, "loss": 4.7611, "step": 2212 }, { "epoch": 0.7967596759675968, "grad_norm": 0.6250977516174316, "learning_rate": 2.0376271269487514e-05, "loss": 4.3924, "step": 2213 }, { "epoch": 0.7971197119711971, "grad_norm": 0.6848214864730835, "learning_rate": 2.0306627947178446e-05, "loss": 4.5518, "step": 2214 }, { "epoch": 0.7974797479747975, "grad_norm": 0.5987040996551514, "learning_rate": 2.0237090392219805e-05, "loss": 4.9023, "step": 2215 }, { "epoch": 0.7978397839783978, "grad_norm": 0.6589849591255188, "learning_rate": 2.0167658696900317e-05, "loss": 4.42, "step": 2216 }, { "epoch": 0.7981998199819982, "grad_norm": 0.7822523713111877, "learning_rate": 2.0098332953368272e-05, "loss": 4.8455, "step": 2217 }, { "epoch": 0.7985598559855985, "grad_norm": 0.8310371041297913, "learning_rate": 2.0029113253631314e-05, "loss": 5.1098, "step": 2218 }, { "epoch": 0.7989198919891989, "grad_norm": 0.8362820148468018, "learning_rate": 1.995999968955641e-05, "loss": 4.5659, "step": 2219 }, { "epoch": 0.7992799279927992, "grad_norm": 0.7935851812362671, "learning_rate": 1.9890992352869543e-05, "loss": 4.5946, "step": 2220 }, { "epoch": 0.7996399639963997, "grad_norm": 0.6581932902336121, "learning_rate": 1.9822091335155812e-05, "loss": 5.0322, "step": 2221 }, { "epoch": 0.8, "grad_norm": 0.6121277213096619, "learning_rate": 1.9753296727859195e-05, "loss": 4.5539, "step": 2222 }, { "epoch": 0.8003600360036004, "grad_norm": 0.7371995449066162, "learning_rate": 1.9684608622282417e-05, "loss": 5.2824, "step": 2223 }, { "epoch": 0.8007200720072007, "grad_norm": 0.825187087059021, "learning_rate": 1.9616027109586887e-05, "loss": 5.0987, "step": 2224 }, { "epoch": 0.8010801080108011, "grad_norm": 1.2908596992492676, "learning_rate": 1.9547552280792524e-05, "loss": 5.2177, "step": 2225 }, { "epoch": 0.8014401440144014, "grad_norm": 2.2421438694000244, "learning_rate": 1.947918422677769e-05, "loss": 4.7303, "step": 2226 }, { "epoch": 0.8018001800180018, "grad_norm": 0.5044226050376892, "learning_rate": 1.941092303827896e-05, "loss": 4.6811, "step": 2227 }, { "epoch": 0.8021602160216021, "grad_norm": 0.5311564207077026, "learning_rate": 1.9342768805891178e-05, "loss": 4.8866, "step": 2228 }, { "epoch": 0.8025202520252025, "grad_norm": 0.8058616518974304, "learning_rate": 1.927472162006717e-05, "loss": 4.5184, "step": 2229 }, { "epoch": 0.8028802880288028, "grad_norm": 0.690186619758606, "learning_rate": 1.920678157111776e-05, "loss": 4.657, "step": 2230 }, { "epoch": 0.8032403240324032, "grad_norm": 0.7219494581222534, "learning_rate": 1.9138948749211472e-05, "loss": 4.7445, "step": 2231 }, { "epoch": 0.8036003600360035, "grad_norm": 0.6288260221481323, "learning_rate": 1.9071223244374614e-05, "loss": 4.9032, "step": 2232 }, { "epoch": 0.803960396039604, "grad_norm": 0.7226089239120483, "learning_rate": 1.9003605146491054e-05, "loss": 5.1746, "step": 2233 }, { "epoch": 0.8043204320432044, "grad_norm": 0.8098820447921753, "learning_rate": 1.8936094545302095e-05, "loss": 4.8424, "step": 2234 }, { "epoch": 0.8046804680468047, "grad_norm": 0.7526129484176636, "learning_rate": 1.8868691530406336e-05, "loss": 4.5838, "step": 2235 }, { "epoch": 0.8050405040504051, "grad_norm": 0.8739807605743408, "learning_rate": 1.8801396191259645e-05, "loss": 4.7443, "step": 2236 }, { "epoch": 0.8054005400540054, "grad_norm": 0.6892088651657104, "learning_rate": 1.8734208617174988e-05, "loss": 4.9657, "step": 2237 }, { "epoch": 0.8057605760576058, "grad_norm": 0.5015419125556946, "learning_rate": 1.866712889732225e-05, "loss": 4.7528, "step": 2238 }, { "epoch": 0.8061206120612061, "grad_norm": 0.5005907416343689, "learning_rate": 1.8600157120728244e-05, "loss": 4.7449, "step": 2239 }, { "epoch": 0.8064806480648065, "grad_norm": 0.526184618473053, "learning_rate": 1.8533293376276472e-05, "loss": 4.6524, "step": 2240 }, { "epoch": 0.8068406840684068, "grad_norm": 0.6554297804832458, "learning_rate": 1.8466537752707068e-05, "loss": 4.8402, "step": 2241 }, { "epoch": 0.8072007200720072, "grad_norm": 0.6219531893730164, "learning_rate": 1.839989033861673e-05, "loss": 4.6053, "step": 2242 }, { "epoch": 0.8075607560756075, "grad_norm": 0.6740393042564392, "learning_rate": 1.8333351222458407e-05, "loss": 4.385, "step": 2243 }, { "epoch": 0.807920792079208, "grad_norm": 0.7463712096214294, "learning_rate": 1.826692049254145e-05, "loss": 4.8666, "step": 2244 }, { "epoch": 0.8082808280828083, "grad_norm": 0.6941218376159668, "learning_rate": 1.820059823703133e-05, "loss": 4.6104, "step": 2245 }, { "epoch": 0.8086408640864087, "grad_norm": 0.8766574859619141, "learning_rate": 1.8134384543949478e-05, "loss": 4.6754, "step": 2246 }, { "epoch": 0.809000900090009, "grad_norm": 0.7981788516044617, "learning_rate": 1.8068279501173335e-05, "loss": 4.8805, "step": 2247 }, { "epoch": 0.8093609360936094, "grad_norm": 1.02590811252594, "learning_rate": 1.8002283196436097e-05, "loss": 4.9571, "step": 2248 }, { "epoch": 0.8097209720972097, "grad_norm": 1.0470219850540161, "learning_rate": 1.7936395717326704e-05, "loss": 5.117, "step": 2249 }, { "epoch": 0.8100810081008101, "grad_norm": 1.3908179998397827, "learning_rate": 1.787061715128956e-05, "loss": 5.2106, "step": 2250 }, { "epoch": 0.8104410441044104, "grad_norm": 0.970249354839325, "learning_rate": 1.7804947585624588e-05, "loss": 5.0495, "step": 2251 }, { "epoch": 0.8108010801080108, "grad_norm": 0.7076825499534607, "learning_rate": 1.773938710748706e-05, "loss": 4.6217, "step": 2252 }, { "epoch": 0.8111611161116111, "grad_norm": 0.657702624797821, "learning_rate": 1.7673935803887453e-05, "loss": 4.4113, "step": 2253 }, { "epoch": 0.8115211521152115, "grad_norm": 0.6246639490127563, "learning_rate": 1.760859376169133e-05, "loss": 4.5696, "step": 2254 }, { "epoch": 0.8118811881188119, "grad_norm": 0.5377760529518127, "learning_rate": 1.754336106761927e-05, "loss": 4.5045, "step": 2255 }, { "epoch": 0.8122412241224123, "grad_norm": 0.5879418849945068, "learning_rate": 1.7478237808246722e-05, "loss": 5.1566, "step": 2256 }, { "epoch": 0.8126012601260126, "grad_norm": 0.6851582527160645, "learning_rate": 1.741322407000391e-05, "loss": 4.776, "step": 2257 }, { "epoch": 0.812961296129613, "grad_norm": 1.0152539014816284, "learning_rate": 1.7348319939175637e-05, "loss": 4.4992, "step": 2258 }, { "epoch": 0.8133213321332133, "grad_norm": 0.6916372179985046, "learning_rate": 1.7283525501901323e-05, "loss": 4.1276, "step": 2259 }, { "epoch": 0.8136813681368137, "grad_norm": 0.5402399897575378, "learning_rate": 1.7218840844174754e-05, "loss": 4.7419, "step": 2260 }, { "epoch": 0.814041404140414, "grad_norm": 0.48339545726776123, "learning_rate": 1.715426605184407e-05, "loss": 4.3284, "step": 2261 }, { "epoch": 0.8144014401440144, "grad_norm": 0.7083166241645813, "learning_rate": 1.70898012106115e-05, "loss": 4.5239, "step": 2262 }, { "epoch": 0.8147614761476147, "grad_norm": 0.5646843314170837, "learning_rate": 1.7025446406033453e-05, "loss": 4.5845, "step": 2263 }, { "epoch": 0.8151215121512151, "grad_norm": 0.7311326265335083, "learning_rate": 1.696120172352025e-05, "loss": 4.5815, "step": 2264 }, { "epoch": 0.8154815481548154, "grad_norm": 0.743579626083374, "learning_rate": 1.6897067248336095e-05, "loss": 5.0977, "step": 2265 }, { "epoch": 0.8158415841584158, "grad_norm": 0.6290472149848938, "learning_rate": 1.683304306559884e-05, "loss": 4.7559, "step": 2266 }, { "epoch": 0.8162016201620163, "grad_norm": 0.5165727734565735, "learning_rate": 1.676912926028007e-05, "loss": 4.8021, "step": 2267 }, { "epoch": 0.8165616561656166, "grad_norm": 0.5954656600952148, "learning_rate": 1.6705325917204805e-05, "loss": 4.7522, "step": 2268 }, { "epoch": 0.816921692169217, "grad_norm": 0.6288970708847046, "learning_rate": 1.66416331210515e-05, "loss": 4.7196, "step": 2269 }, { "epoch": 0.8172817281728173, "grad_norm": 0.6287972331047058, "learning_rate": 1.6578050956351886e-05, "loss": 5.1404, "step": 2270 }, { "epoch": 0.8176417641764177, "grad_norm": 0.8024221658706665, "learning_rate": 1.6514579507490848e-05, "loss": 5.0936, "step": 2271 }, { "epoch": 0.818001800180018, "grad_norm": 0.5822760462760925, "learning_rate": 1.6451218858706374e-05, "loss": 4.9177, "step": 2272 }, { "epoch": 0.8183618361836184, "grad_norm": 0.8307278752326965, "learning_rate": 1.6387969094089316e-05, "loss": 5.2499, "step": 2273 }, { "epoch": 0.8187218721872187, "grad_norm": 0.9024346470832825, "learning_rate": 1.632483029758345e-05, "loss": 5.276, "step": 2274 }, { "epoch": 0.819081908190819, "grad_norm": 1.2056093215942383, "learning_rate": 1.626180255298525e-05, "loss": 5.3459, "step": 2275 }, { "epoch": 0.8194419441944194, "grad_norm": 1.9106166362762451, "learning_rate": 1.619888594394382e-05, "loss": 4.8601, "step": 2276 }, { "epoch": 0.8198019801980198, "grad_norm": 0.6660627126693726, "learning_rate": 1.6136080553960687e-05, "loss": 4.7553, "step": 2277 }, { "epoch": 0.8201620162016202, "grad_norm": 0.4384493827819824, "learning_rate": 1.6073386466389872e-05, "loss": 4.5587, "step": 2278 }, { "epoch": 0.8205220522052206, "grad_norm": 0.5030012726783752, "learning_rate": 1.601080376443763e-05, "loss": 4.5943, "step": 2279 }, { "epoch": 0.8208820882088209, "grad_norm": 0.5574566721916199, "learning_rate": 1.5948332531162413e-05, "loss": 4.5229, "step": 2280 }, { "epoch": 0.8212421242124213, "grad_norm": 0.519743025302887, "learning_rate": 1.5885972849474672e-05, "loss": 4.5851, "step": 2281 }, { "epoch": 0.8216021602160216, "grad_norm": 0.5831529498100281, "learning_rate": 1.5823724802136865e-05, "loss": 4.7909, "step": 2282 }, { "epoch": 0.821962196219622, "grad_norm": 0.8960453271865845, "learning_rate": 1.576158847176329e-05, "loss": 4.8842, "step": 2283 }, { "epoch": 0.8223222322232223, "grad_norm": 0.6710848808288574, "learning_rate": 1.5699563940819962e-05, "loss": 4.8542, "step": 2284 }, { "epoch": 0.8226822682268227, "grad_norm": 0.5260566473007202, "learning_rate": 1.5637651291624523e-05, "loss": 4.6999, "step": 2285 }, { "epoch": 0.823042304230423, "grad_norm": 0.6472052335739136, "learning_rate": 1.557585060634612e-05, "loss": 4.5737, "step": 2286 }, { "epoch": 0.8234023402340234, "grad_norm": 0.757953941822052, "learning_rate": 1.5514161967005337e-05, "loss": 4.4606, "step": 2287 }, { "epoch": 0.8237623762376237, "grad_norm": 0.5936845541000366, "learning_rate": 1.5452585455473977e-05, "loss": 4.5589, "step": 2288 }, { "epoch": 0.8241224122412241, "grad_norm": 0.6369442343711853, "learning_rate": 1.539112115347511e-05, "loss": 4.963, "step": 2289 }, { "epoch": 0.8244824482448245, "grad_norm": 0.8031920194625854, "learning_rate": 1.5329769142582827e-05, "loss": 4.9157, "step": 2290 }, { "epoch": 0.8248424842484249, "grad_norm": 0.756523847579956, "learning_rate": 1.526852950422226e-05, "loss": 4.6052, "step": 2291 }, { "epoch": 0.8252025202520252, "grad_norm": 0.6446996331214905, "learning_rate": 1.5207402319669306e-05, "loss": 4.7325, "step": 2292 }, { "epoch": 0.8255625562556256, "grad_norm": 0.6594321727752686, "learning_rate": 1.5146387670050687e-05, "loss": 4.7268, "step": 2293 }, { "epoch": 0.8259225922592259, "grad_norm": 0.6736615896224976, "learning_rate": 1.5085485636343755e-05, "loss": 5.0275, "step": 2294 }, { "epoch": 0.8262826282628263, "grad_norm": 0.5317438244819641, "learning_rate": 1.5024696299376407e-05, "loss": 4.7624, "step": 2295 }, { "epoch": 0.8266426642664266, "grad_norm": 0.6013367772102356, "learning_rate": 1.4964019739826907e-05, "loss": 4.5998, "step": 2296 }, { "epoch": 0.827002700270027, "grad_norm": 0.5864752531051636, "learning_rate": 1.4903456038223939e-05, "loss": 4.7793, "step": 2297 }, { "epoch": 0.8273627362736273, "grad_norm": 1.0099995136260986, "learning_rate": 1.4843005274946365e-05, "loss": 5.441, "step": 2298 }, { "epoch": 0.8277227722772277, "grad_norm": 1.1725364923477173, "learning_rate": 1.4782667530223126e-05, "loss": 4.894, "step": 2299 }, { "epoch": 0.828082808280828, "grad_norm": 1.4512721300125122, "learning_rate": 1.4722442884133214e-05, "loss": 5.316, "step": 2300 }, { "epoch": 0.8284428442844285, "grad_norm": 0.6787356734275818, "learning_rate": 1.4662331416605501e-05, "loss": 4.4722, "step": 2301 }, { "epoch": 0.8288028802880288, "grad_norm": 1.0046409368515015, "learning_rate": 1.4602333207418651e-05, "loss": 4.455, "step": 2302 }, { "epoch": 0.8291629162916292, "grad_norm": 0.7777742743492126, "learning_rate": 1.454244833620102e-05, "loss": 4.277, "step": 2303 }, { "epoch": 0.8295229522952295, "grad_norm": 1.2001330852508545, "learning_rate": 1.4482676882430502e-05, "loss": 4.9538, "step": 2304 }, { "epoch": 0.8298829882988299, "grad_norm": 0.6005185842514038, "learning_rate": 1.4423018925434528e-05, "loss": 4.5779, "step": 2305 }, { "epoch": 0.8302430243024302, "grad_norm": 0.7251618504524231, "learning_rate": 1.4363474544389877e-05, "loss": 4.6006, "step": 2306 }, { "epoch": 0.8306030603060306, "grad_norm": 0.9123652577400208, "learning_rate": 1.4304043818322565e-05, "loss": 4.927, "step": 2307 }, { "epoch": 0.8309630963096309, "grad_norm": 0.9391204118728638, "learning_rate": 1.424472682610779e-05, "loss": 5.0999, "step": 2308 }, { "epoch": 0.8313231323132313, "grad_norm": 0.6396461725234985, "learning_rate": 1.4185523646469822e-05, "loss": 4.552, "step": 2309 }, { "epoch": 0.8316831683168316, "grad_norm": 0.5968081951141357, "learning_rate": 1.4126434357981877e-05, "loss": 4.6309, "step": 2310 }, { "epoch": 0.832043204320432, "grad_norm": 0.8504002690315247, "learning_rate": 1.4067459039065956e-05, "loss": 4.995, "step": 2311 }, { "epoch": 0.8324032403240325, "grad_norm": 0.9265114068984985, "learning_rate": 1.4008597767992871e-05, "loss": 4.8508, "step": 2312 }, { "epoch": 0.8327632763276328, "grad_norm": 0.5217415690422058, "learning_rate": 1.3949850622882054e-05, "loss": 4.8427, "step": 2313 }, { "epoch": 0.8331233123312332, "grad_norm": 0.8731891512870789, "learning_rate": 1.3891217681701474e-05, "loss": 4.713, "step": 2314 }, { "epoch": 0.8334833483348335, "grad_norm": 0.602576732635498, "learning_rate": 1.3832699022267515e-05, "loss": 4.5357, "step": 2315 }, { "epoch": 0.8338433843384339, "grad_norm": 0.6332751512527466, "learning_rate": 1.3774294722244907e-05, "loss": 4.6608, "step": 2316 }, { "epoch": 0.8342034203420342, "grad_norm": 0.7543874979019165, "learning_rate": 1.3716004859146592e-05, "loss": 4.6207, "step": 2317 }, { "epoch": 0.8345634563456346, "grad_norm": 0.888930082321167, "learning_rate": 1.3657829510333654e-05, "loss": 4.6348, "step": 2318 }, { "epoch": 0.8349234923492349, "grad_norm": 0.8453302979469299, "learning_rate": 1.3599768753015152e-05, "loss": 4.7868, "step": 2319 }, { "epoch": 0.8352835283528353, "grad_norm": 0.5163218379020691, "learning_rate": 1.3541822664248094e-05, "loss": 4.4345, "step": 2320 }, { "epoch": 0.8356435643564356, "grad_norm": 0.49913036823272705, "learning_rate": 1.3483991320937306e-05, "loss": 4.961, "step": 2321 }, { "epoch": 0.836003600360036, "grad_norm": 1.1673535108566284, "learning_rate": 1.3426274799835337e-05, "loss": 4.8401, "step": 2322 }, { "epoch": 0.8363636363636363, "grad_norm": 0.8345517516136169, "learning_rate": 1.336867317754229e-05, "loss": 5.0522, "step": 2323 }, { "epoch": 0.8367236723672368, "grad_norm": 0.9774869680404663, "learning_rate": 1.3311186530505838e-05, "loss": 5.1665, "step": 2324 }, { "epoch": 0.8370837083708371, "grad_norm": 1.130599856376648, "learning_rate": 1.3253814935021026e-05, "loss": 5.1565, "step": 2325 }, { "epoch": 0.8374437443744375, "grad_norm": 0.9888586401939392, "learning_rate": 1.3196558467230247e-05, "loss": 4.9575, "step": 2326 }, { "epoch": 0.8378037803780378, "grad_norm": 0.6156508326530457, "learning_rate": 1.3139417203123027e-05, "loss": 4.9267, "step": 2327 }, { "epoch": 0.8381638163816382, "grad_norm": 0.6930103302001953, "learning_rate": 1.3082391218536061e-05, "loss": 4.7754, "step": 2328 }, { "epoch": 0.8385238523852385, "grad_norm": 0.7321805357933044, "learning_rate": 1.3025480589153005e-05, "loss": 4.8377, "step": 2329 }, { "epoch": 0.8388838883888389, "grad_norm": 0.7025576233863831, "learning_rate": 1.2968685390504465e-05, "loss": 4.8009, "step": 2330 }, { "epoch": 0.8392439243924392, "grad_norm": 0.5497130155563354, "learning_rate": 1.29120056979678e-05, "loss": 4.4953, "step": 2331 }, { "epoch": 0.8396039603960396, "grad_norm": 0.5606801509857178, "learning_rate": 1.2855441586767113e-05, "loss": 4.4783, "step": 2332 }, { "epoch": 0.8399639963996399, "grad_norm": 0.5393441915512085, "learning_rate": 1.2798993131973091e-05, "loss": 4.4954, "step": 2333 }, { "epoch": 0.8403240324032403, "grad_norm": 0.601349949836731, "learning_rate": 1.2742660408502904e-05, "loss": 4.6084, "step": 2334 }, { "epoch": 0.8406840684068407, "grad_norm": 0.5767045617103577, "learning_rate": 1.2686443491120149e-05, "loss": 4.7144, "step": 2335 }, { "epoch": 0.8410441044104411, "grad_norm": 0.7713471055030823, "learning_rate": 1.263034245443473e-05, "loss": 4.673, "step": 2336 }, { "epoch": 0.8414041404140414, "grad_norm": 0.771685779094696, "learning_rate": 1.2574357372902767e-05, "loss": 4.9869, "step": 2337 }, { "epoch": 0.8417641764176418, "grad_norm": 0.5168091654777527, "learning_rate": 1.2518488320826449e-05, "loss": 4.5899, "step": 2338 }, { "epoch": 0.8421242124212421, "grad_norm": 1.225448727607727, "learning_rate": 1.2462735372353996e-05, "loss": 4.7007, "step": 2339 }, { "epoch": 0.8424842484248425, "grad_norm": 0.5362923741340637, "learning_rate": 1.2407098601479539e-05, "loss": 4.696, "step": 2340 }, { "epoch": 0.8428442844284428, "grad_norm": 0.6797324419021606, "learning_rate": 1.2351578082043047e-05, "loss": 4.791, "step": 2341 }, { "epoch": 0.8432043204320432, "grad_norm": 1.1674048900604248, "learning_rate": 1.2296173887730123e-05, "loss": 4.9394, "step": 2342 }, { "epoch": 0.8435643564356435, "grad_norm": 0.6934359073638916, "learning_rate": 1.2240886092072068e-05, "loss": 4.5077, "step": 2343 }, { "epoch": 0.8439243924392439, "grad_norm": 0.5877205729484558, "learning_rate": 1.2185714768445667e-05, "loss": 4.5705, "step": 2344 }, { "epoch": 0.8442844284428442, "grad_norm": 0.6093941926956177, "learning_rate": 1.2130659990073146e-05, "loss": 4.5415, "step": 2345 }, { "epoch": 0.8446444644464447, "grad_norm": 0.7287856936454773, "learning_rate": 1.2075721830021969e-05, "loss": 5.0374, "step": 2346 }, { "epoch": 0.845004500450045, "grad_norm": 0.8647234439849854, "learning_rate": 1.2020900361204968e-05, "loss": 4.6786, "step": 2347 }, { "epoch": 0.8453645364536454, "grad_norm": 0.7940008044242859, "learning_rate": 1.1966195656380031e-05, "loss": 5.0893, "step": 2348 }, { "epoch": 0.8457245724572457, "grad_norm": 1.0051583051681519, "learning_rate": 1.1911607788150036e-05, "loss": 5.203, "step": 2349 }, { "epoch": 0.8460846084608461, "grad_norm": 0.9990129470825195, "learning_rate": 1.1857136828962855e-05, "loss": 5.0418, "step": 2350 }, { "epoch": 0.8464446444644464, "grad_norm": 1.3434467315673828, "learning_rate": 1.1802782851111205e-05, "loss": 5.1628, "step": 2351 }, { "epoch": 0.8468046804680468, "grad_norm": 0.7505450248718262, "learning_rate": 1.1748545926732535e-05, "loss": 4.6661, "step": 2352 }, { "epoch": 0.8471647164716472, "grad_norm": 1.0241285562515259, "learning_rate": 1.169442612780891e-05, "loss": 4.7007, "step": 2353 }, { "epoch": 0.8475247524752475, "grad_norm": 1.4250359535217285, "learning_rate": 1.1640423526166988e-05, "loss": 4.5734, "step": 2354 }, { "epoch": 0.8478847884788479, "grad_norm": 0.8431483507156372, "learning_rate": 1.158653819347788e-05, "loss": 4.6603, "step": 2355 }, { "epoch": 0.8482448244824482, "grad_norm": 0.6706793904304504, "learning_rate": 1.1532770201257082e-05, "loss": 4.7574, "step": 2356 }, { "epoch": 0.8486048604860486, "grad_norm": 0.6527566909790039, "learning_rate": 1.1479119620864276e-05, "loss": 4.6939, "step": 2357 }, { "epoch": 0.848964896489649, "grad_norm": 0.846868634223938, "learning_rate": 1.1425586523503395e-05, "loss": 4.7602, "step": 2358 }, { "epoch": 0.8493249324932494, "grad_norm": 0.8098002672195435, "learning_rate": 1.1372170980222441e-05, "loss": 4.9113, "step": 2359 }, { "epoch": 0.8496849684968497, "grad_norm": 0.6968252062797546, "learning_rate": 1.1318873061913405e-05, "loss": 4.8477, "step": 2360 }, { "epoch": 0.8500450045004501, "grad_norm": 0.6603767275810242, "learning_rate": 1.1265692839312092e-05, "loss": 4.7987, "step": 2361 }, { "epoch": 0.8504050405040504, "grad_norm": 0.9465769529342651, "learning_rate": 1.1212630382998213e-05, "loss": 4.5938, "step": 2362 }, { "epoch": 0.8507650765076508, "grad_norm": 0.6345024704933167, "learning_rate": 1.1159685763395111e-05, "loss": 4.7153, "step": 2363 }, { "epoch": 0.8511251125112511, "grad_norm": 0.7378025054931641, "learning_rate": 1.1106859050769769e-05, "loss": 4.5385, "step": 2364 }, { "epoch": 0.8514851485148515, "grad_norm": 0.9905508160591125, "learning_rate": 1.1054150315232681e-05, "loss": 4.7941, "step": 2365 }, { "epoch": 0.8518451845184518, "grad_norm": 0.8407430648803711, "learning_rate": 1.1001559626737756e-05, "loss": 4.7788, "step": 2366 }, { "epoch": 0.8522052205220522, "grad_norm": 0.8498520255088806, "learning_rate": 1.0949087055082252e-05, "loss": 4.1181, "step": 2367 }, { "epoch": 0.8525652565256525, "grad_norm": 0.9800708889961243, "learning_rate": 1.089673266990663e-05, "loss": 4.8319, "step": 2368 }, { "epoch": 0.852925292529253, "grad_norm": 0.7625902891159058, "learning_rate": 1.0844496540694515e-05, "loss": 4.7761, "step": 2369 }, { "epoch": 0.8532853285328533, "grad_norm": 0.638638436794281, "learning_rate": 1.0792378736772612e-05, "loss": 4.7487, "step": 2370 }, { "epoch": 0.8536453645364537, "grad_norm": 0.6259344220161438, "learning_rate": 1.0740379327310569e-05, "loss": 4.9022, "step": 2371 }, { "epoch": 0.854005400540054, "grad_norm": 0.9138006567955017, "learning_rate": 1.0688498381320855e-05, "loss": 5.0246, "step": 2372 }, { "epoch": 0.8543654365436544, "grad_norm": 0.9508568048477173, "learning_rate": 1.0636735967658784e-05, "loss": 4.9478, "step": 2373 }, { "epoch": 0.8547254725472547, "grad_norm": 1.1043336391448975, "learning_rate": 1.0585092155022336e-05, "loss": 4.8974, "step": 2374 }, { "epoch": 0.8550855085508551, "grad_norm": 1.3299425840377808, "learning_rate": 1.0533567011952094e-05, "loss": 5.1492, "step": 2375 }, { "epoch": 0.8554455445544554, "grad_norm": 0.6157066822052002, "learning_rate": 1.0482160606831093e-05, "loss": 4.7135, "step": 2376 }, { "epoch": 0.8558055805580558, "grad_norm": 1.1526126861572266, "learning_rate": 1.0430873007884857e-05, "loss": 4.8683, "step": 2377 }, { "epoch": 0.8561656165616561, "grad_norm": 0.5212879180908203, "learning_rate": 1.0379704283181179e-05, "loss": 4.5955, "step": 2378 }, { "epoch": 0.8565256525652565, "grad_norm": 0.8130112290382385, "learning_rate": 1.0328654500630108e-05, "loss": 4.7918, "step": 2379 }, { "epoch": 0.8568856885688569, "grad_norm": 0.7200890183448792, "learning_rate": 1.0277723727983845e-05, "loss": 4.8406, "step": 2380 }, { "epoch": 0.8572457245724573, "grad_norm": 0.6026584506034851, "learning_rate": 1.0226912032836611e-05, "loss": 4.7515, "step": 2381 }, { "epoch": 0.8576057605760576, "grad_norm": 0.9684290885925293, "learning_rate": 1.0176219482624616e-05, "loss": 4.8093, "step": 2382 }, { "epoch": 0.857965796579658, "grad_norm": 0.6427994966506958, "learning_rate": 1.0125646144625955e-05, "loss": 4.5308, "step": 2383 }, { "epoch": 0.8583258325832583, "grad_norm": 1.033554196357727, "learning_rate": 1.007519208596045e-05, "loss": 4.8341, "step": 2384 }, { "epoch": 0.8586858685868587, "grad_norm": 0.6669801473617554, "learning_rate": 1.002485737358968e-05, "loss": 4.8964, "step": 2385 }, { "epoch": 0.859045904590459, "grad_norm": 0.6307418942451477, "learning_rate": 9.974642074316798e-06, "loss": 4.8266, "step": 2386 }, { "epoch": 0.8594059405940594, "grad_norm": 0.6424444913864136, "learning_rate": 9.924546254786493e-06, "loss": 4.7471, "step": 2387 }, { "epoch": 0.8597659765976597, "grad_norm": 0.8725467920303345, "learning_rate": 9.874569981484861e-06, "loss": 4.5142, "step": 2388 }, { "epoch": 0.8601260126012601, "grad_norm": 1.1564704179763794, "learning_rate": 9.824713320739342e-06, "loss": 4.7016, "step": 2389 }, { "epoch": 0.8604860486048604, "grad_norm": 0.7655138969421387, "learning_rate": 9.774976338718677e-06, "loss": 4.3319, "step": 2390 }, { "epoch": 0.8608460846084608, "grad_norm": 0.7302666306495667, "learning_rate": 9.725359101432674e-06, "loss": 4.6624, "step": 2391 }, { "epoch": 0.8612061206120613, "grad_norm": 0.7123817801475525, "learning_rate": 9.675861674732312e-06, "loss": 4.5181, "step": 2392 }, { "epoch": 0.8615661566156616, "grad_norm": 0.5299736261367798, "learning_rate": 9.62648412430951e-06, "loss": 4.6567, "step": 2393 }, { "epoch": 0.861926192619262, "grad_norm": 0.711216390132904, "learning_rate": 9.577226515697124e-06, "loss": 4.6318, "step": 2394 }, { "epoch": 0.8622862286228623, "grad_norm": 0.7324408888816833, "learning_rate": 9.528088914268784e-06, "loss": 4.6413, "step": 2395 }, { "epoch": 0.8626462646264627, "grad_norm": 0.6073545217514038, "learning_rate": 9.479071385238892e-06, "loss": 4.6813, "step": 2396 }, { "epoch": 0.863006300630063, "grad_norm": 0.9429351687431335, "learning_rate": 9.430173993662451e-06, "loss": 4.7784, "step": 2397 }, { "epoch": 0.8633663366336634, "grad_norm": 0.8551303148269653, "learning_rate": 9.381396804435061e-06, "loss": 5.4424, "step": 2398 }, { "epoch": 0.8637263726372637, "grad_norm": 0.9081370234489441, "learning_rate": 9.332739882292752e-06, "loss": 5.1161, "step": 2399 }, { "epoch": 0.8640864086408641, "grad_norm": 0.9314940571784973, "learning_rate": 9.284203291811954e-06, "loss": 5.0829, "step": 2400 }, { "epoch": 0.8644464446444644, "grad_norm": 0.747048556804657, "learning_rate": 9.23578709740942e-06, "loss": 4.671, "step": 2401 }, { "epoch": 0.8648064806480648, "grad_norm": 0.8901441097259521, "learning_rate": 9.187491363342093e-06, "loss": 4.7503, "step": 2402 }, { "epoch": 0.8651665166516652, "grad_norm": 0.8733905553817749, "learning_rate": 9.139316153707023e-06, "loss": 4.1668, "step": 2403 }, { "epoch": 0.8655265526552656, "grad_norm": 0.8293418288230896, "learning_rate": 9.091261532441342e-06, "loss": 4.9468, "step": 2404 }, { "epoch": 0.8658865886588659, "grad_norm": 0.6938745379447937, "learning_rate": 9.043327563322112e-06, "loss": 4.8042, "step": 2405 }, { "epoch": 0.8662466246624663, "grad_norm": 0.7350160479545593, "learning_rate": 8.995514309966302e-06, "loss": 5.0458, "step": 2406 }, { "epoch": 0.8666066606660666, "grad_norm": 0.5033836960792542, "learning_rate": 8.947821835830616e-06, "loss": 4.7879, "step": 2407 }, { "epoch": 0.866966696669667, "grad_norm": 0.9561224579811096, "learning_rate": 8.900250204211514e-06, "loss": 4.8389, "step": 2408 }, { "epoch": 0.8673267326732673, "grad_norm": 0.8856688141822815, "learning_rate": 8.852799478245032e-06, "loss": 4.5677, "step": 2409 }, { "epoch": 0.8676867686768677, "grad_norm": 0.48359963297843933, "learning_rate": 8.80546972090679e-06, "loss": 4.6154, "step": 2410 }, { "epoch": 0.868046804680468, "grad_norm": 0.6133762001991272, "learning_rate": 8.758260995011825e-06, "loss": 4.818, "step": 2411 }, { "epoch": 0.8684068406840684, "grad_norm": 0.7412658333778381, "learning_rate": 8.711173363214553e-06, "loss": 4.5966, "step": 2412 }, { "epoch": 0.8687668766876687, "grad_norm": 0.681463897228241, "learning_rate": 8.664206888008697e-06, "loss": 4.5624, "step": 2413 }, { "epoch": 0.8691269126912692, "grad_norm": 0.7318177223205566, "learning_rate": 8.617361631727138e-06, "loss": 4.5771, "step": 2414 }, { "epoch": 0.8694869486948695, "grad_norm": 0.8274372220039368, "learning_rate": 8.570637656541914e-06, "loss": 4.9709, "step": 2415 }, { "epoch": 0.8698469846984699, "grad_norm": 0.6884218454360962, "learning_rate": 8.524035024464105e-06, "loss": 4.9416, "step": 2416 }, { "epoch": 0.8702070207020702, "grad_norm": 0.7390003800392151, "learning_rate": 8.47755379734373e-06, "loss": 4.7099, "step": 2417 }, { "epoch": 0.8705670567056706, "grad_norm": 1.129050374031067, "learning_rate": 8.431194036869672e-06, "loss": 4.8879, "step": 2418 }, { "epoch": 0.8709270927092709, "grad_norm": 0.7855664491653442, "learning_rate": 8.384955804569627e-06, "loss": 4.8775, "step": 2419 }, { "epoch": 0.8712871287128713, "grad_norm": 0.5347578525543213, "learning_rate": 8.338839161809997e-06, "loss": 4.6191, "step": 2420 }, { "epoch": 0.8716471647164716, "grad_norm": 0.754165768623352, "learning_rate": 8.292844169795833e-06, "loss": 4.5964, "step": 2421 }, { "epoch": 0.872007200720072, "grad_norm": 0.7422668933868408, "learning_rate": 8.24697088957066e-06, "loss": 4.804, "step": 2422 }, { "epoch": 0.8723672367236723, "grad_norm": 0.8895533680915833, "learning_rate": 8.201219382016556e-06, "loss": 5.1019, "step": 2423 }, { "epoch": 0.8727272727272727, "grad_norm": 0.7388155460357666, "learning_rate": 8.15558970785395e-06, "loss": 4.7046, "step": 2424 }, { "epoch": 0.873087308730873, "grad_norm": 1.3550125360488892, "learning_rate": 8.110081927641566e-06, "loss": 4.9702, "step": 2425 }, { "epoch": 0.8734473447344735, "grad_norm": 2.8201375007629395, "learning_rate": 8.064696101776358e-06, "loss": 5.3071, "step": 2426 }, { "epoch": 0.8738073807380738, "grad_norm": 1.0882468223571777, "learning_rate": 8.019432290493457e-06, "loss": 4.851, "step": 2427 }, { "epoch": 0.8741674167416742, "grad_norm": 0.9948346018791199, "learning_rate": 7.974290553866005e-06, "loss": 5.0427, "step": 2428 }, { "epoch": 0.8745274527452745, "grad_norm": 0.6691415309906006, "learning_rate": 7.929270951805178e-06, "loss": 4.9892, "step": 2429 }, { "epoch": 0.8748874887488749, "grad_norm": 0.775093138217926, "learning_rate": 7.884373544060009e-06, "loss": 4.3908, "step": 2430 }, { "epoch": 0.8752475247524752, "grad_norm": 0.6868644952774048, "learning_rate": 7.839598390217396e-06, "loss": 4.7946, "step": 2431 }, { "epoch": 0.8756075607560756, "grad_norm": 0.6689639091491699, "learning_rate": 7.794945549701993e-06, "loss": 4.8674, "step": 2432 }, { "epoch": 0.875967596759676, "grad_norm": 0.9124707579612732, "learning_rate": 7.750415081776063e-06, "loss": 4.9911, "step": 2433 }, { "epoch": 0.8763276327632763, "grad_norm": 0.7038251161575317, "learning_rate": 7.70600704553951e-06, "loss": 4.8584, "step": 2434 }, { "epoch": 0.8766876687668766, "grad_norm": 0.7156389951705933, "learning_rate": 7.661721499929753e-06, "loss": 4.3274, "step": 2435 }, { "epoch": 0.877047704770477, "grad_norm": 0.8068670034408569, "learning_rate": 7.6175585037216226e-06, "loss": 4.5658, "step": 2436 }, { "epoch": 0.8774077407740775, "grad_norm": 0.7935437560081482, "learning_rate": 7.573518115527289e-06, "loss": 4.9122, "step": 2437 }, { "epoch": 0.8777677767776778, "grad_norm": 0.9261611700057983, "learning_rate": 7.529600393796232e-06, "loss": 4.8509, "step": 2438 }, { "epoch": 0.8781278127812782, "grad_norm": 0.8355916738510132, "learning_rate": 7.485805396815126e-06, "loss": 4.3652, "step": 2439 }, { "epoch": 0.8784878487848785, "grad_norm": 0.43560856580734253, "learning_rate": 7.442133182707745e-06, "loss": 4.5542, "step": 2440 }, { "epoch": 0.8788478847884789, "grad_norm": 0.7524927258491516, "learning_rate": 7.3985838094349444e-06, "loss": 4.7226, "step": 2441 }, { "epoch": 0.8792079207920792, "grad_norm": 0.7564715147018433, "learning_rate": 7.355157334794516e-06, "loss": 4.7208, "step": 2442 }, { "epoch": 0.8795679567956796, "grad_norm": 0.9967451691627502, "learning_rate": 7.3118538164211545e-06, "loss": 5.2971, "step": 2443 }, { "epoch": 0.8799279927992799, "grad_norm": 0.8346577286720276, "learning_rate": 7.2686733117863784e-06, "loss": 4.7256, "step": 2444 }, { "epoch": 0.8802880288028803, "grad_norm": 0.638346254825592, "learning_rate": 7.225615878198422e-06, "loss": 4.8184, "step": 2445 }, { "epoch": 0.8806480648064806, "grad_norm": 0.5529339909553528, "learning_rate": 7.1826815728021965e-06, "loss": 4.5564, "step": 2446 }, { "epoch": 0.881008100810081, "grad_norm": 0.5807334184646606, "learning_rate": 7.1398704525792e-06, "loss": 4.8166, "step": 2447 }, { "epoch": 0.8813681368136813, "grad_norm": 0.7800282835960388, "learning_rate": 7.097182574347472e-06, "loss": 5.1405, "step": 2448 }, { "epoch": 0.8817281728172818, "grad_norm": 0.6698582768440247, "learning_rate": 7.054617994761414e-06, "loss": 4.907, "step": 2449 }, { "epoch": 0.8820882088208821, "grad_norm": 1.5296711921691895, "learning_rate": 7.012176770311862e-06, "loss": 5.3242, "step": 2450 }, { "epoch": 0.8824482448244825, "grad_norm": 1.2979846000671387, "learning_rate": 6.969858957325904e-06, "loss": 4.922, "step": 2451 }, { "epoch": 0.8828082808280828, "grad_norm": 0.6412333250045776, "learning_rate": 6.927664611966811e-06, "loss": 5.0412, "step": 2452 }, { "epoch": 0.8831683168316832, "grad_norm": 0.619648814201355, "learning_rate": 6.8855937902340576e-06, "loss": 4.507, "step": 2453 }, { "epoch": 0.8835283528352835, "grad_norm": 0.5163532495498657, "learning_rate": 6.843646547963123e-06, "loss": 4.9747, "step": 2454 }, { "epoch": 0.8838883888388839, "grad_norm": 0.7194183468818665, "learning_rate": 6.801822940825509e-06, "loss": 4.5837, "step": 2455 }, { "epoch": 0.8842484248424842, "grad_norm": 0.8414213061332703, "learning_rate": 6.760123024328624e-06, "loss": 4.6327, "step": 2456 }, { "epoch": 0.8846084608460846, "grad_norm": 0.684772253036499, "learning_rate": 6.718546853815688e-06, "loss": 4.9001, "step": 2457 }, { "epoch": 0.8849684968496849, "grad_norm": 0.47863101959228516, "learning_rate": 6.67709448446574e-06, "loss": 4.8486, "step": 2458 }, { "epoch": 0.8853285328532853, "grad_norm": 0.6075344681739807, "learning_rate": 6.635765971293484e-06, "loss": 4.9541, "step": 2459 }, { "epoch": 0.8856885688568857, "grad_norm": 0.5354955196380615, "learning_rate": 6.594561369149199e-06, "loss": 4.6317, "step": 2460 }, { "epoch": 0.8860486048604861, "grad_norm": 0.7306193113327026, "learning_rate": 6.553480732718808e-06, "loss": 4.524, "step": 2461 }, { "epoch": 0.8864086408640864, "grad_norm": 0.6062951683998108, "learning_rate": 6.512524116523633e-06, "loss": 4.5702, "step": 2462 }, { "epoch": 0.8867686768676868, "grad_norm": 0.7549055814743042, "learning_rate": 6.4716915749204465e-06, "loss": 4.6434, "step": 2463 }, { "epoch": 0.8871287128712871, "grad_norm": 0.83303302526474, "learning_rate": 6.4309831621013005e-06, "loss": 4.7192, "step": 2464 }, { "epoch": 0.8874887488748875, "grad_norm": 0.6464311480522156, "learning_rate": 6.390398932093555e-06, "loss": 4.7064, "step": 2465 }, { "epoch": 0.8878487848784878, "grad_norm": 1.1855717897415161, "learning_rate": 6.3499389387597254e-06, "loss": 4.9074, "step": 2466 }, { "epoch": 0.8882088208820882, "grad_norm": 0.8404142260551453, "learning_rate": 6.30960323579749e-06, "loss": 4.8212, "step": 2467 }, { "epoch": 0.8885688568856885, "grad_norm": 0.5642232894897461, "learning_rate": 6.269391876739495e-06, "loss": 4.8076, "step": 2468 }, { "epoch": 0.8889288928892889, "grad_norm": 0.8032687306404114, "learning_rate": 6.229304914953405e-06, "loss": 5.019, "step": 2469 }, { "epoch": 0.8892889288928892, "grad_norm": 0.8474968671798706, "learning_rate": 6.189342403641807e-06, "loss": 5.0512, "step": 2470 }, { "epoch": 0.8896489648964897, "grad_norm": 0.6336872577667236, "learning_rate": 6.149504395842087e-06, "loss": 4.6737, "step": 2471 }, { "epoch": 0.89000900090009, "grad_norm": 0.704339325428009, "learning_rate": 6.109790944426397e-06, "loss": 4.5293, "step": 2472 }, { "epoch": 0.8903690369036904, "grad_norm": 0.8684128522872925, "learning_rate": 6.070202102101597e-06, "loss": 4.7989, "step": 2473 }, { "epoch": 0.8907290729072908, "grad_norm": 1.042490839958191, "learning_rate": 6.030737921409169e-06, "loss": 4.9338, "step": 2474 }, { "epoch": 0.8910891089108911, "grad_norm": 1.1774296760559082, "learning_rate": 5.9913984547250945e-06, "loss": 5.2439, "step": 2475 }, { "epoch": 0.8914491449144915, "grad_norm": 1.316601037979126, "learning_rate": 5.95218375425991e-06, "loss": 5.2065, "step": 2476 }, { "epoch": 0.8918091809180918, "grad_norm": 0.9096778631210327, "learning_rate": 5.913093872058528e-06, "loss": 4.6322, "step": 2477 }, { "epoch": 0.8921692169216922, "grad_norm": 0.6777582764625549, "learning_rate": 5.874128860000216e-06, "loss": 4.4147, "step": 2478 }, { "epoch": 0.8925292529252925, "grad_norm": 0.5754499435424805, "learning_rate": 5.835288769798486e-06, "loss": 4.833, "step": 2479 }, { "epoch": 0.8928892889288929, "grad_norm": 1.0769809484481812, "learning_rate": 5.7965736530010916e-06, "loss": 4.4729, "step": 2480 }, { "epoch": 0.8932493249324932, "grad_norm": 0.6358700394630432, "learning_rate": 5.757983560989921e-06, "loss": 4.9246, "step": 2481 }, { "epoch": 0.8936093609360936, "grad_norm": 1.1338918209075928, "learning_rate": 5.719518544980929e-06, "loss": 4.7706, "step": 2482 }, { "epoch": 0.893969396939694, "grad_norm": 0.4643517732620239, "learning_rate": 5.681178656024055e-06, "loss": 4.7676, "step": 2483 }, { "epoch": 0.8943294329432944, "grad_norm": 1.0510177612304688, "learning_rate": 5.642963945003188e-06, "loss": 4.6983, "step": 2484 }, { "epoch": 0.8946894689468947, "grad_norm": 0.8604787588119507, "learning_rate": 5.604874462636078e-06, "loss": 4.4971, "step": 2485 }, { "epoch": 0.8950495049504951, "grad_norm": 0.8846144080162048, "learning_rate": 5.566910259474289e-06, "loss": 4.4979, "step": 2486 }, { "epoch": 0.8954095409540954, "grad_norm": 0.7206079959869385, "learning_rate": 5.529071385903084e-06, "loss": 4.8595, "step": 2487 }, { "epoch": 0.8957695769576958, "grad_norm": 0.8029129505157471, "learning_rate": 5.491357892141425e-06, "loss": 4.871, "step": 2488 }, { "epoch": 0.8961296129612961, "grad_norm": 0.5468530654907227, "learning_rate": 5.453769828241872e-06, "loss": 4.4908, "step": 2489 }, { "epoch": 0.8964896489648965, "grad_norm": 1.086614727973938, "learning_rate": 5.416307244090502e-06, "loss": 4.8457, "step": 2490 }, { "epoch": 0.8968496849684968, "grad_norm": 1.064418077468872, "learning_rate": 5.378970189406829e-06, "loss": 4.6813, "step": 2491 }, { "epoch": 0.8972097209720972, "grad_norm": 0.5295194387435913, "learning_rate": 5.341758713743828e-06, "loss": 4.557, "step": 2492 }, { "epoch": 0.8975697569756975, "grad_norm": 0.8219357132911682, "learning_rate": 5.304672866487792e-06, "loss": 4.8301, "step": 2493 }, { "epoch": 0.897929792979298, "grad_norm": 0.9336304664611816, "learning_rate": 5.267712696858229e-06, "loss": 4.7836, "step": 2494 }, { "epoch": 0.8982898289828983, "grad_norm": 0.8698000907897949, "learning_rate": 5.230878253907912e-06, "loss": 4.6849, "step": 2495 }, { "epoch": 0.8986498649864987, "grad_norm": 0.6905087232589722, "learning_rate": 5.194169586522734e-06, "loss": 4.7967, "step": 2496 }, { "epoch": 0.899009900990099, "grad_norm": 0.754138171672821, "learning_rate": 5.157586743421672e-06, "loss": 4.9295, "step": 2497 }, { "epoch": 0.8993699369936994, "grad_norm": 1.8262755870819092, "learning_rate": 5.121129773156663e-06, "loss": 5.313, "step": 2498 }, { "epoch": 0.8997299729972997, "grad_norm": 0.7745803594589233, "learning_rate": 5.0847987241126385e-06, "loss": 5.1595, "step": 2499 }, { "epoch": 0.9000900090009001, "grad_norm": 1.3876433372497559, "learning_rate": 5.0485936445074046e-06, "loss": 5.2019, "step": 2500 }, { "epoch": 0.9004500450045004, "grad_norm": 1.133023738861084, "learning_rate": 5.012514582391592e-06, "loss": 4.5723, "step": 2501 }, { "epoch": 0.9008100810081008, "grad_norm": 0.6465590000152588, "learning_rate": 4.976561585648509e-06, "loss": 4.7929, "step": 2502 }, { "epoch": 0.9011701170117011, "grad_norm": 1.2847857475280762, "learning_rate": 4.9407347019942544e-06, "loss": 4.8718, "step": 2503 }, { "epoch": 0.9015301530153015, "grad_norm": 0.569114089012146, "learning_rate": 4.905033978977491e-06, "loss": 4.4804, "step": 2504 }, { "epoch": 0.9018901890189019, "grad_norm": 0.9793164134025574, "learning_rate": 4.869459463979465e-06, "loss": 4.986, "step": 2505 }, { "epoch": 0.9022502250225023, "grad_norm": 0.5514426231384277, "learning_rate": 4.8340112042139065e-06, "loss": 4.9524, "step": 2506 }, { "epoch": 0.9026102610261026, "grad_norm": 0.8211607336997986, "learning_rate": 4.798689246727006e-06, "loss": 4.8468, "step": 2507 }, { "epoch": 0.902970297029703, "grad_norm": 0.5269903540611267, "learning_rate": 4.7634936383973095e-06, "loss": 4.8626, "step": 2508 }, { "epoch": 0.9033303330333033, "grad_norm": 0.6444000005722046, "learning_rate": 4.728424425935707e-06, "loss": 4.551, "step": 2509 }, { "epoch": 0.9036903690369037, "grad_norm": 1.075435757637024, "learning_rate": 4.693481655885257e-06, "loss": 4.8247, "step": 2510 }, { "epoch": 0.904050405040504, "grad_norm": 1.0397629737854004, "learning_rate": 4.658665374621307e-06, "loss": 4.6963, "step": 2511 }, { "epoch": 0.9044104410441044, "grad_norm": 0.6805405616760254, "learning_rate": 4.623975628351273e-06, "loss": 4.4516, "step": 2512 }, { "epoch": 0.9047704770477047, "grad_norm": 0.7398169040679932, "learning_rate": 4.58941246311464e-06, "loss": 4.7951, "step": 2513 }, { "epoch": 0.9051305130513051, "grad_norm": 0.6716864109039307, "learning_rate": 4.554975924782912e-06, "loss": 4.7471, "step": 2514 }, { "epoch": 0.9054905490549054, "grad_norm": 0.6767914295196533, "learning_rate": 4.520666059059531e-06, "loss": 4.5634, "step": 2515 }, { "epoch": 0.9058505850585058, "grad_norm": 0.7175542712211609, "learning_rate": 4.486482911479839e-06, "loss": 4.719, "step": 2516 }, { "epoch": 0.9062106210621063, "grad_norm": 0.9069615602493286, "learning_rate": 4.452426527410947e-06, "loss": 5.5713, "step": 2517 }, { "epoch": 0.9065706570657066, "grad_norm": 0.6263923048973083, "learning_rate": 4.418496952051798e-06, "loss": 4.7829, "step": 2518 }, { "epoch": 0.906930693069307, "grad_norm": 0.7558562159538269, "learning_rate": 4.384694230432984e-06, "loss": 4.9266, "step": 2519 }, { "epoch": 0.9072907290729073, "grad_norm": 0.6696991324424744, "learning_rate": 4.351018407416763e-06, "loss": 4.3571, "step": 2520 }, { "epoch": 0.9076507650765077, "grad_norm": 0.6993823051452637, "learning_rate": 4.317469527696983e-06, "loss": 5.2419, "step": 2521 }, { "epoch": 0.908010801080108, "grad_norm": 0.6072081923484802, "learning_rate": 4.2840476357989825e-06, "loss": 4.9883, "step": 2522 }, { "epoch": 0.9083708370837084, "grad_norm": 0.8503673672676086, "learning_rate": 4.250752776079614e-06, "loss": 5.0176, "step": 2523 }, { "epoch": 0.9087308730873087, "grad_norm": 0.9142279624938965, "learning_rate": 4.217584992727108e-06, "loss": 5.2182, "step": 2524 }, { "epoch": 0.9090909090909091, "grad_norm": 1.77701735496521, "learning_rate": 4.184544329761009e-06, "loss": 5.2836, "step": 2525 }, { "epoch": 0.9094509450945094, "grad_norm": 2.4094419479370117, "learning_rate": 4.151630831032205e-06, "loss": 4.853, "step": 2526 }, { "epoch": 0.9098109810981098, "grad_norm": 0.5602378249168396, "learning_rate": 4.118844540222788e-06, "loss": 5.1699, "step": 2527 }, { "epoch": 0.9101710171017102, "grad_norm": 0.7844763994216919, "learning_rate": 4.0861855008460405e-06, "loss": 4.8816, "step": 2528 }, { "epoch": 0.9105310531053106, "grad_norm": 0.5660812258720398, "learning_rate": 4.0536537562463225e-06, "loss": 4.8106, "step": 2529 }, { "epoch": 0.9108910891089109, "grad_norm": 0.5048322081565857, "learning_rate": 4.021249349599077e-06, "loss": 4.2835, "step": 2530 }, { "epoch": 0.9112511251125113, "grad_norm": 0.7268801927566528, "learning_rate": 3.988972323910778e-06, "loss": 4.59, "step": 2531 }, { "epoch": 0.9116111611161116, "grad_norm": 0.7188135981559753, "learning_rate": 3.95682272201876e-06, "loss": 4.6502, "step": 2532 }, { "epoch": 0.911971197119712, "grad_norm": 0.6325691342353821, "learning_rate": 3.924800586591326e-06, "loss": 4.7787, "step": 2533 }, { "epoch": 0.9123312331233123, "grad_norm": 0.5503108501434326, "learning_rate": 3.892905960127546e-06, "loss": 4.6064, "step": 2534 }, { "epoch": 0.9126912691269127, "grad_norm": 0.8401983380317688, "learning_rate": 3.861138884957316e-06, "loss": 4.3366, "step": 2535 }, { "epoch": 0.913051305130513, "grad_norm": 0.521528422832489, "learning_rate": 3.829499403241221e-06, "loss": 4.7727, "step": 2536 }, { "epoch": 0.9134113411341134, "grad_norm": 0.5778352618217468, "learning_rate": 3.797987556970495e-06, "loss": 4.9353, "step": 2537 }, { "epoch": 0.9137713771377137, "grad_norm": 0.5739848613739014, "learning_rate": 3.7666033879670048e-06, "loss": 4.973, "step": 2538 }, { "epoch": 0.9141314131413142, "grad_norm": 0.5352855324745178, "learning_rate": 3.735346937883144e-06, "loss": 4.511, "step": 2539 }, { "epoch": 0.9144914491449145, "grad_norm": 0.7256152033805847, "learning_rate": 3.7042182482018075e-06, "loss": 4.5012, "step": 2540 }, { "epoch": 0.9148514851485149, "grad_norm": 0.8612756133079529, "learning_rate": 3.6732173602363363e-06, "loss": 4.6615, "step": 2541 }, { "epoch": 0.9152115211521152, "grad_norm": 0.6105715036392212, "learning_rate": 3.6423443151304526e-06, "loss": 4.451, "step": 2542 }, { "epoch": 0.9155715571557156, "grad_norm": 0.6826533079147339, "learning_rate": 3.611599153858214e-06, "loss": 4.6159, "step": 2543 }, { "epoch": 0.9159315931593159, "grad_norm": 0.6430963277816772, "learning_rate": 3.580981917223913e-06, "loss": 4.7071, "step": 2544 }, { "epoch": 0.9162916291629163, "grad_norm": 0.6871779561042786, "learning_rate": 3.5504926458621246e-06, "loss": 4.741, "step": 2545 }, { "epoch": 0.9166516651665166, "grad_norm": 0.7385034561157227, "learning_rate": 3.5201313802375456e-06, "loss": 4.6154, "step": 2546 }, { "epoch": 0.917011701170117, "grad_norm": 0.9472239017486572, "learning_rate": 3.4898981606450333e-06, "loss": 5.0247, "step": 2547 }, { "epoch": 0.9173717371737173, "grad_norm": 0.7648311853408813, "learning_rate": 3.4597930272094235e-06, "loss": 5.1778, "step": 2548 }, { "epoch": 0.9177317731773177, "grad_norm": 1.013818383216858, "learning_rate": 3.4298160198856568e-06, "loss": 4.9648, "step": 2549 }, { "epoch": 0.918091809180918, "grad_norm": 1.1295243501663208, "learning_rate": 3.3999671784585517e-06, "loss": 5.2376, "step": 2550 }, { "epoch": 0.9184518451845185, "grad_norm": 3.160092830657959, "learning_rate": 3.370246542542865e-06, "loss": 4.6393, "step": 2551 }, { "epoch": 0.9188118811881189, "grad_norm": 0.7330535054206848, "learning_rate": 3.3406541515832003e-06, "loss": 4.5272, "step": 2552 }, { "epoch": 0.9191719171917192, "grad_norm": 0.9496917724609375, "learning_rate": 3.311190044853951e-06, "loss": 5.049, "step": 2553 }, { "epoch": 0.9195319531953196, "grad_norm": 0.7707210779190063, "learning_rate": 3.2818542614592497e-06, "loss": 4.7573, "step": 2554 }, { "epoch": 0.9198919891989199, "grad_norm": 0.673030436038971, "learning_rate": 3.252646840332918e-06, "loss": 4.6971, "step": 2555 }, { "epoch": 0.9202520252025203, "grad_norm": 0.6012186408042908, "learning_rate": 3.2235678202384267e-06, "loss": 4.5512, "step": 2556 }, { "epoch": 0.9206120612061206, "grad_norm": 1.0911415815353394, "learning_rate": 3.1946172397688267e-06, "loss": 4.3919, "step": 2557 }, { "epoch": 0.920972097209721, "grad_norm": 0.7308849692344666, "learning_rate": 3.1657951373467497e-06, "loss": 4.8589, "step": 2558 }, { "epoch": 0.9213321332133213, "grad_norm": 0.676698625087738, "learning_rate": 3.1371015512242306e-06, "loss": 4.8728, "step": 2559 }, { "epoch": 0.9216921692169217, "grad_norm": 1.072251796722412, "learning_rate": 3.1085365194828075e-06, "loss": 5.1019, "step": 2560 }, { "epoch": 0.922052205220522, "grad_norm": 0.6631917953491211, "learning_rate": 3.0801000800333877e-06, "loss": 4.6797, "step": 2561 }, { "epoch": 0.9224122412241225, "grad_norm": 0.5610212683677673, "learning_rate": 3.051792270616216e-06, "loss": 4.6816, "step": 2562 }, { "epoch": 0.9227722772277228, "grad_norm": 0.5413954257965088, "learning_rate": 3.023613128800795e-06, "loss": 4.6725, "step": 2563 }, { "epoch": 0.9231323132313232, "grad_norm": 0.7505655288696289, "learning_rate": 2.995562691985898e-06, "loss": 4.7661, "step": 2564 }, { "epoch": 0.9234923492349235, "grad_norm": 0.810063362121582, "learning_rate": 2.9676409973994566e-06, "loss": 4.773, "step": 2565 }, { "epoch": 0.9238523852385239, "grad_norm": 1.0670089721679688, "learning_rate": 2.939848082098562e-06, "loss": 5.1925, "step": 2566 }, { "epoch": 0.9242124212421242, "grad_norm": 0.6282406449317932, "learning_rate": 2.912183982969385e-06, "loss": 4.9323, "step": 2567 }, { "epoch": 0.9245724572457246, "grad_norm": 0.9556277394294739, "learning_rate": 2.8846487367271135e-06, "loss": 4.8946, "step": 2568 }, { "epoch": 0.9249324932493249, "grad_norm": 0.5398349761962891, "learning_rate": 2.8572423799159586e-06, "loss": 4.5384, "step": 2569 }, { "epoch": 0.9252925292529253, "grad_norm": 0.6668215990066528, "learning_rate": 2.8299649489090475e-06, "loss": 4.6729, "step": 2570 }, { "epoch": 0.9256525652565256, "grad_norm": 0.6637030839920044, "learning_rate": 2.802816479908399e-06, "loss": 4.8677, "step": 2571 }, { "epoch": 0.926012601260126, "grad_norm": 0.5411921143531799, "learning_rate": 2.7757970089449024e-06, "loss": 4.6855, "step": 2572 }, { "epoch": 0.9263726372637263, "grad_norm": 0.5961670875549316, "learning_rate": 2.748906571878207e-06, "loss": 4.913, "step": 2573 }, { "epoch": 0.9267326732673268, "grad_norm": 0.8857282400131226, "learning_rate": 2.722145204396742e-06, "loss": 5.2247, "step": 2574 }, { "epoch": 0.9270927092709271, "grad_norm": 1.259244680404663, "learning_rate": 2.6955129420176196e-06, "loss": 5.4074, "step": 2575 }, { "epoch": 0.9274527452745275, "grad_norm": 0.9549702405929565, "learning_rate": 2.6690098200866098e-06, "loss": 4.5297, "step": 2576 }, { "epoch": 0.9278127812781278, "grad_norm": 0.7229859232902527, "learning_rate": 2.6426358737781098e-06, "loss": 4.9248, "step": 2577 }, { "epoch": 0.9281728172817282, "grad_norm": 0.722059428691864, "learning_rate": 2.6163911380950425e-06, "loss": 4.8236, "step": 2578 }, { "epoch": 0.9285328532853285, "grad_norm": 0.6506609916687012, "learning_rate": 2.590275647868867e-06, "loss": 4.4557, "step": 2579 }, { "epoch": 0.9288928892889289, "grad_norm": 0.7389491200447083, "learning_rate": 2.564289437759515e-06, "loss": 4.7994, "step": 2580 }, { "epoch": 0.9292529252925292, "grad_norm": 0.5561854839324951, "learning_rate": 2.53843254225532e-06, "loss": 4.2714, "step": 2581 }, { "epoch": 0.9296129612961296, "grad_norm": 0.7266308069229126, "learning_rate": 2.5127049956730207e-06, "loss": 4.3677, "step": 2582 }, { "epoch": 0.9299729972997299, "grad_norm": 0.7052050828933716, "learning_rate": 2.4871068321576596e-06, "loss": 4.5698, "step": 2583 }, { "epoch": 0.9303330333033303, "grad_norm": 0.7006728649139404, "learning_rate": 2.4616380856825716e-06, "loss": 4.7132, "step": 2584 }, { "epoch": 0.9306930693069307, "grad_norm": 0.6953990459442139, "learning_rate": 2.436298790049363e-06, "loss": 4.6789, "step": 2585 }, { "epoch": 0.9310531053105311, "grad_norm": 0.6005169749259949, "learning_rate": 2.4110889788877656e-06, "loss": 4.4247, "step": 2586 }, { "epoch": 0.9314131413141314, "grad_norm": 0.5607119798660278, "learning_rate": 2.3860086856557383e-06, "loss": 4.5032, "step": 2587 }, { "epoch": 0.9317731773177318, "grad_norm": 0.8107718825340271, "learning_rate": 2.3610579436393e-06, "loss": 4.9354, "step": 2588 }, { "epoch": 0.9321332133213321, "grad_norm": 0.5992099642753601, "learning_rate": 2.33623678595255e-06, "loss": 4.5928, "step": 2589 }, { "epoch": 0.9324932493249325, "grad_norm": 0.5759278535842896, "learning_rate": 2.311545245537594e-06, "loss": 4.8887, "step": 2590 }, { "epoch": 0.9328532853285328, "grad_norm": 0.7263954281806946, "learning_rate": 2.286983355164529e-06, "loss": 4.8844, "step": 2591 }, { "epoch": 0.9332133213321332, "grad_norm": 0.6867969632148743, "learning_rate": 2.2625511474313685e-06, "loss": 4.5716, "step": 2592 }, { "epoch": 0.9335733573357335, "grad_norm": 0.6144258379936218, "learning_rate": 2.23824865476403e-06, "loss": 4.7591, "step": 2593 }, { "epoch": 0.9339333933393339, "grad_norm": 0.6983000040054321, "learning_rate": 2.2140759094162467e-06, "loss": 4.5595, "step": 2594 }, { "epoch": 0.9342934293429342, "grad_norm": 0.7773663997650146, "learning_rate": 2.1900329434695887e-06, "loss": 4.6514, "step": 2595 }, { "epoch": 0.9346534653465347, "grad_norm": 0.7720439434051514, "learning_rate": 2.166119788833354e-06, "loss": 4.9227, "step": 2596 }, { "epoch": 0.9350135013501351, "grad_norm": 0.7285527586936951, "learning_rate": 2.1423364772445887e-06, "loss": 4.8956, "step": 2597 }, { "epoch": 0.9353735373537354, "grad_norm": 1.0870779752731323, "learning_rate": 2.118683040267999e-06, "loss": 5.0033, "step": 2598 }, { "epoch": 0.9357335733573358, "grad_norm": 0.965726912021637, "learning_rate": 2.095159509295919e-06, "loss": 5.2102, "step": 2599 }, { "epoch": 0.9360936093609361, "grad_norm": 1.4614653587341309, "learning_rate": 2.0717659155482738e-06, "loss": 5.5101, "step": 2600 }, { "epoch": 0.9364536453645365, "grad_norm": 0.5772082209587097, "learning_rate": 2.0485022900725513e-06, "loss": 4.786, "step": 2601 }, { "epoch": 0.9368136813681368, "grad_norm": 0.7124701142311096, "learning_rate": 2.025368663743743e-06, "loss": 4.4701, "step": 2602 }, { "epoch": 0.9371737173717372, "grad_norm": 0.7923992872238159, "learning_rate": 2.002365067264289e-06, "loss": 5.1817, "step": 2603 }, { "epoch": 0.9375337533753375, "grad_norm": 0.6645485758781433, "learning_rate": 1.9794915311641018e-06, "loss": 4.5608, "step": 2604 }, { "epoch": 0.9378937893789379, "grad_norm": 1.0128847360610962, "learning_rate": 1.9567480858004306e-06, "loss": 4.549, "step": 2605 }, { "epoch": 0.9382538253825382, "grad_norm": 0.6514415144920349, "learning_rate": 1.9341347613579087e-06, "loss": 5.0031, "step": 2606 }, { "epoch": 0.9386138613861386, "grad_norm": 0.9877171516418457, "learning_rate": 1.91165158784844e-06, "loss": 5.0519, "step": 2607 }, { "epoch": 0.938973897389739, "grad_norm": 0.6819136738777161, "learning_rate": 1.889298595111233e-06, "loss": 4.6016, "step": 2608 }, { "epoch": 0.9393339333933394, "grad_norm": 0.9286605715751648, "learning_rate": 1.8670758128126909e-06, "loss": 4.8841, "step": 2609 }, { "epoch": 0.9396939693969397, "grad_norm": 0.5582537651062012, "learning_rate": 1.844983270446432e-06, "loss": 4.7466, "step": 2610 }, { "epoch": 0.9400540054005401, "grad_norm": 0.9149574041366577, "learning_rate": 1.8230209973331914e-06, "loss": 4.5378, "step": 2611 }, { "epoch": 0.9404140414041404, "grad_norm": 0.5852335691452026, "learning_rate": 1.8011890226208527e-06, "loss": 4.906, "step": 2612 }, { "epoch": 0.9407740774077408, "grad_norm": 0.7403162717819214, "learning_rate": 1.7794873752843277e-06, "loss": 4.6849, "step": 2613 }, { "epoch": 0.9411341134113411, "grad_norm": 0.7608280777931213, "learning_rate": 1.7579160841256104e-06, "loss": 4.6213, "step": 2614 }, { "epoch": 0.9414941494149415, "grad_norm": 0.7691354751586914, "learning_rate": 1.7364751777736332e-06, "loss": 4.6725, "step": 2615 }, { "epoch": 0.9418541854185418, "grad_norm": 0.7262532114982605, "learning_rate": 1.7151646846843227e-06, "loss": 4.9798, "step": 2616 }, { "epoch": 0.9422142214221422, "grad_norm": 0.5878902077674866, "learning_rate": 1.6939846331405108e-06, "loss": 4.707, "step": 2617 }, { "epoch": 0.9425742574257425, "grad_norm": 0.637844443321228, "learning_rate": 1.6729350512519005e-06, "loss": 4.9285, "step": 2618 }, { "epoch": 0.942934293429343, "grad_norm": 0.6767174005508423, "learning_rate": 1.6520159669550783e-06, "loss": 4.8655, "step": 2619 }, { "epoch": 0.9432943294329433, "grad_norm": 0.6745604872703552, "learning_rate": 1.6312274080133804e-06, "loss": 4.9893, "step": 2620 }, { "epoch": 0.9436543654365437, "grad_norm": 0.6165941953659058, "learning_rate": 1.6105694020169593e-06, "loss": 4.676, "step": 2621 }, { "epoch": 0.944014401440144, "grad_norm": 1.2223420143127441, "learning_rate": 1.5900419763826614e-06, "loss": 4.7306, "step": 2622 }, { "epoch": 0.9443744374437444, "grad_norm": 0.6925899386405945, "learning_rate": 1.5696451583540827e-06, "loss": 5.2183, "step": 2623 }, { "epoch": 0.9447344734473447, "grad_norm": 0.6742193102836609, "learning_rate": 1.5493789750014031e-06, "loss": 4.9525, "step": 2624 }, { "epoch": 0.9450945094509451, "grad_norm": 1.818490982055664, "learning_rate": 1.5292434532215072e-06, "loss": 5.372, "step": 2625 }, { "epoch": 0.9454545454545454, "grad_norm": 3.1944949626922607, "learning_rate": 1.5092386197378183e-06, "loss": 4.7497, "step": 2626 }, { "epoch": 0.9458145814581458, "grad_norm": 0.5739015340805054, "learning_rate": 1.489364501100332e-06, "loss": 4.6389, "step": 2627 }, { "epoch": 0.9461746174617461, "grad_norm": 0.8385379910469055, "learning_rate": 1.4696211236855272e-06, "loss": 4.8675, "step": 2628 }, { "epoch": 0.9465346534653465, "grad_norm": 0.7263203859329224, "learning_rate": 1.4500085136964326e-06, "loss": 4.5676, "step": 2629 }, { "epoch": 0.946894689468947, "grad_norm": 0.9063706398010254, "learning_rate": 1.430526697162482e-06, "loss": 4.4987, "step": 2630 }, { "epoch": 0.9472547254725473, "grad_norm": 0.9547297954559326, "learning_rate": 1.4111756999395154e-06, "loss": 4.7062, "step": 2631 }, { "epoch": 0.9476147614761476, "grad_norm": 0.7871283292770386, "learning_rate": 1.3919555477097668e-06, "loss": 4.6941, "step": 2632 }, { "epoch": 0.947974797479748, "grad_norm": 0.8934873342514038, "learning_rate": 1.3728662659818204e-06, "loss": 4.5796, "step": 2633 }, { "epoch": 0.9483348334833483, "grad_norm": 0.5550655126571655, "learning_rate": 1.3539078800905659e-06, "loss": 4.6309, "step": 2634 }, { "epoch": 0.9486948694869487, "grad_norm": 0.8748136758804321, "learning_rate": 1.3350804151971653e-06, "loss": 4.7027, "step": 2635 }, { "epoch": 0.949054905490549, "grad_norm": 0.6412554383277893, "learning_rate": 1.3163838962890195e-06, "loss": 4.7536, "step": 2636 }, { "epoch": 0.9494149414941494, "grad_norm": 0.5575628280639648, "learning_rate": 1.2978183481797801e-06, "loss": 4.9255, "step": 2637 }, { "epoch": 0.9497749774977498, "grad_norm": 0.6832976341247559, "learning_rate": 1.2793837955092258e-06, "loss": 4.8386, "step": 2638 }, { "epoch": 0.9501350135013501, "grad_norm": 0.7369568347930908, "learning_rate": 1.261080262743297e-06, "loss": 4.8507, "step": 2639 }, { "epoch": 0.9504950495049505, "grad_norm": 0.6423168778419495, "learning_rate": 1.2429077741740736e-06, "loss": 4.5613, "step": 2640 }, { "epoch": 0.9508550855085508, "grad_norm": 0.6358969807624817, "learning_rate": 1.2248663539196848e-06, "loss": 4.5165, "step": 2641 }, { "epoch": 0.9512151215121513, "grad_norm": 0.726514995098114, "learning_rate": 1.2069560259243328e-06, "loss": 5.0404, "step": 2642 }, { "epoch": 0.9515751575157516, "grad_norm": 0.6469401121139526, "learning_rate": 1.1891768139582037e-06, "loss": 4.3748, "step": 2643 }, { "epoch": 0.951935193519352, "grad_norm": 0.7196247577667236, "learning_rate": 1.1715287416175113e-06, "loss": 4.7141, "step": 2644 }, { "epoch": 0.9522952295229523, "grad_norm": 0.6038268804550171, "learning_rate": 1.1540118323243865e-06, "loss": 4.5587, "step": 2645 }, { "epoch": 0.9526552655265527, "grad_norm": 0.6786099076271057, "learning_rate": 1.1366261093268992e-06, "loss": 5.1968, "step": 2646 }, { "epoch": 0.953015301530153, "grad_norm": 0.5878410339355469, "learning_rate": 1.1193715956990258e-06, "loss": 4.7435, "step": 2647 }, { "epoch": 0.9533753375337534, "grad_norm": 0.8352496027946472, "learning_rate": 1.1022483143405705e-06, "loss": 4.9383, "step": 2648 }, { "epoch": 0.9537353735373537, "grad_norm": 0.7436163425445557, "learning_rate": 1.08525628797721e-06, "loss": 4.9565, "step": 2649 }, { "epoch": 0.9540954095409541, "grad_norm": 0.9482848644256592, "learning_rate": 1.068395539160394e-06, "loss": 4.9916, "step": 2650 }, { "epoch": 0.9544554455445544, "grad_norm": 1.374624252319336, "learning_rate": 1.0516660902673448e-06, "loss": 4.6837, "step": 2651 }, { "epoch": 0.9548154815481548, "grad_norm": 0.5657400488853455, "learning_rate": 1.035067963501024e-06, "loss": 4.3842, "step": 2652 }, { "epoch": 0.9551755175517552, "grad_norm": 0.640466570854187, "learning_rate": 1.018601180890133e-06, "loss": 4.8339, "step": 2653 }, { "epoch": 0.9555355535553556, "grad_norm": 0.7675738334655762, "learning_rate": 1.0022657642890231e-06, "loss": 4.5358, "step": 2654 }, { "epoch": 0.9558955895589559, "grad_norm": 0.6688444018363953, "learning_rate": 9.86061735377708e-07, "loss": 4.8008, "step": 2655 }, { "epoch": 0.9562556255625563, "grad_norm": 0.7679070830345154, "learning_rate": 9.699891156618402e-07, "loss": 4.8804, "step": 2656 }, { "epoch": 0.9566156615661566, "grad_norm": 0.7303805351257324, "learning_rate": 9.540479264726676e-07, "loss": 4.6286, "step": 2657 }, { "epoch": 0.956975697569757, "grad_norm": 0.7357656955718994, "learning_rate": 9.382381889669667e-07, "loss": 4.7679, "step": 2658 }, { "epoch": 0.9573357335733573, "grad_norm": 0.8778092265129089, "learning_rate": 9.225599241271199e-07, "loss": 4.8732, "step": 2659 }, { "epoch": 0.9576957695769577, "grad_norm": 0.4600130021572113, "learning_rate": 9.070131527609604e-07, "loss": 4.8048, "step": 2660 }, { "epoch": 0.958055805580558, "grad_norm": 0.6551803946495056, "learning_rate": 8.9159789550185e-07, "loss": 4.506, "step": 2661 }, { "epoch": 0.9584158415841584, "grad_norm": 0.7119221687316895, "learning_rate": 8.763141728085789e-07, "loss": 4.5518, "step": 2662 }, { "epoch": 0.9587758775877587, "grad_norm": 0.571420431137085, "learning_rate": 8.611620049653879e-07, "loss": 4.8277, "step": 2663 }, { "epoch": 0.9591359135913592, "grad_norm": 0.9699096083641052, "learning_rate": 8.461414120819133e-07, "loss": 4.9504, "step": 2664 }, { "epoch": 0.9594959495949595, "grad_norm": 1.1072616577148438, "learning_rate": 8.312524140931644e-07, "loss": 4.5514, "step": 2665 }, { "epoch": 0.9598559855985599, "grad_norm": 0.5243213176727295, "learning_rate": 8.16495030759501e-07, "loss": 5.0715, "step": 2666 }, { "epoch": 0.9602160216021602, "grad_norm": 1.0006332397460938, "learning_rate": 8.018692816666118e-07, "loss": 5.0062, "step": 2667 }, { "epoch": 0.9605760576057606, "grad_norm": 0.7771649956703186, "learning_rate": 7.873751862254696e-07, "loss": 4.8082, "step": 2668 }, { "epoch": 0.9609360936093609, "grad_norm": 0.6831532120704651, "learning_rate": 7.730127636723539e-07, "loss": 4.8668, "step": 2669 }, { "epoch": 0.9612961296129613, "grad_norm": 0.9201724529266357, "learning_rate": 7.587820330687389e-07, "loss": 4.5572, "step": 2670 }, { "epoch": 0.9616561656165616, "grad_norm": 0.730021595954895, "learning_rate": 7.446830133013616e-07, "loss": 4.5861, "step": 2671 }, { "epoch": 0.962016201620162, "grad_norm": 0.7570610642433167, "learning_rate": 7.307157230821426e-07, "loss": 5.0164, "step": 2672 }, { "epoch": 0.9623762376237623, "grad_norm": 0.8982148170471191, "learning_rate": 7.168801809481763e-07, "loss": 5.0613, "step": 2673 }, { "epoch": 0.9627362736273627, "grad_norm": 0.7683990001678467, "learning_rate": 7.031764052616852e-07, "loss": 4.9976, "step": 2674 }, { "epoch": 0.963096309630963, "grad_norm": 1.3601149320602417, "learning_rate": 6.896044142100433e-07, "loss": 5.3859, "step": 2675 }, { "epoch": 0.9634563456345635, "grad_norm": 0.7018367648124695, "learning_rate": 6.761642258056978e-07, "loss": 4.7254, "step": 2676 }, { "epoch": 0.9638163816381639, "grad_norm": 0.8308577537536621, "learning_rate": 6.628558578862021e-07, "loss": 5.1009, "step": 2677 }, { "epoch": 0.9641764176417642, "grad_norm": 0.6796321272850037, "learning_rate": 6.496793281141056e-07, "loss": 4.7258, "step": 2678 }, { "epoch": 0.9645364536453646, "grad_norm": 0.8252047300338745, "learning_rate": 6.366346539770529e-07, "loss": 4.5518, "step": 2679 }, { "epoch": 0.9648964896489649, "grad_norm": 0.7564746737480164, "learning_rate": 6.237218527876399e-07, "loss": 4.4544, "step": 2680 }, { "epoch": 0.9652565256525653, "grad_norm": 0.49899938702583313, "learning_rate": 6.109409416834688e-07, "loss": 4.7035, "step": 2681 }, { "epoch": 0.9656165616561656, "grad_norm": 0.8740639686584473, "learning_rate": 5.982919376270823e-07, "loss": 4.5884, "step": 2682 }, { "epoch": 0.965976597659766, "grad_norm": 1.1928685903549194, "learning_rate": 5.857748574059851e-07, "loss": 4.7686, "step": 2683 }, { "epoch": 0.9663366336633663, "grad_norm": 0.8761278986930847, "learning_rate": 5.733897176325665e-07, "loss": 4.9492, "step": 2684 }, { "epoch": 0.9666966696669667, "grad_norm": 0.6431811451911926, "learning_rate": 5.611365347441334e-07, "loss": 4.8721, "step": 2685 }, { "epoch": 0.967056705670567, "grad_norm": 0.7401660084724426, "learning_rate": 5.49015325002833e-07, "loss": 4.8349, "step": 2686 }, { "epoch": 0.9674167416741675, "grad_norm": 0.7588707804679871, "learning_rate": 5.370261044956971e-07, "loss": 4.6324, "step": 2687 }, { "epoch": 0.9677767776777678, "grad_norm": 0.5506744980812073, "learning_rate": 5.25168889134553e-07, "loss": 4.6068, "step": 2688 }, { "epoch": 0.9681368136813682, "grad_norm": 0.6155896782875061, "learning_rate": 5.134436946560572e-07, "loss": 4.7839, "step": 2689 }, { "epoch": 0.9684968496849685, "grad_norm": 0.7731359601020813, "learning_rate": 5.018505366216175e-07, "loss": 4.8457, "step": 2690 }, { "epoch": 0.9688568856885689, "grad_norm": 0.6838685870170593, "learning_rate": 4.903894304174372e-07, "loss": 4.9202, "step": 2691 }, { "epoch": 0.9692169216921692, "grad_norm": 0.7036203742027283, "learning_rate": 4.790603912544489e-07, "loss": 4.609, "step": 2692 }, { "epoch": 0.9695769576957696, "grad_norm": 0.6218310594558716, "learning_rate": 4.678634341683252e-07, "loss": 4.7149, "step": 2693 }, { "epoch": 0.9699369936993699, "grad_norm": 0.6616173982620239, "learning_rate": 4.567985740194236e-07, "loss": 4.5665, "step": 2694 }, { "epoch": 0.9702970297029703, "grad_norm": 0.677174985408783, "learning_rate": 4.458658254927972e-07, "loss": 5.1044, "step": 2695 }, { "epoch": 0.9706570657065706, "grad_norm": 0.7272735238075256, "learning_rate": 4.3506520309813947e-07, "loss": 5.0046, "step": 2696 }, { "epoch": 0.971017101710171, "grad_norm": 0.9171267151832581, "learning_rate": 4.2439672116982855e-07, "loss": 5.1378, "step": 2697 }, { "epoch": 0.9713771377137714, "grad_norm": 0.9265881180763245, "learning_rate": 4.138603938668273e-07, "loss": 5.1032, "step": 2698 }, { "epoch": 0.9717371737173718, "grad_norm": 0.8648788332939148, "learning_rate": 4.034562351727389e-07, "loss": 5.3116, "step": 2699 }, { "epoch": 0.9720972097209721, "grad_norm": 1.2427067756652832, "learning_rate": 3.9318425889574017e-07, "loss": 5.5403, "step": 2700 }, { "epoch": 0.9724572457245725, "grad_norm": 1.1116799116134644, "learning_rate": 3.8304447866857053e-07, "loss": 5.1365, "step": 2701 }, { "epoch": 0.9728172817281728, "grad_norm": 0.5051364898681641, "learning_rate": 3.73036907948543e-07, "loss": 4.3293, "step": 2702 }, { "epoch": 0.9731773177317732, "grad_norm": 1.0161361694335938, "learning_rate": 3.631615600174887e-07, "loss": 4.4407, "step": 2703 }, { "epoch": 0.9735373537353735, "grad_norm": 0.8723888993263245, "learning_rate": 3.5341844798174594e-07, "loss": 5.0663, "step": 2704 }, { "epoch": 0.9738973897389739, "grad_norm": 0.6905868649482727, "learning_rate": 3.4380758477219333e-07, "loss": 4.9586, "step": 2705 }, { "epoch": 0.9742574257425742, "grad_norm": 0.6786131262779236, "learning_rate": 3.343289831441387e-07, "loss": 4.4407, "step": 2706 }, { "epoch": 0.9746174617461746, "grad_norm": 0.7154151201248169, "learning_rate": 3.2498265567739717e-07, "loss": 4.6941, "step": 2707 }, { "epoch": 0.9749774977497749, "grad_norm": 0.7994034290313721, "learning_rate": 3.1576861477621287e-07, "loss": 4.857, "step": 2708 }, { "epoch": 0.9753375337533753, "grad_norm": 0.6819135546684265, "learning_rate": 3.0668687266925956e-07, "loss": 5.0646, "step": 2709 }, { "epoch": 0.9756975697569757, "grad_norm": 0.613059401512146, "learning_rate": 2.977374414096401e-07, "loss": 4.9047, "step": 2710 }, { "epoch": 0.9760576057605761, "grad_norm": 0.9619209170341492, "learning_rate": 2.889203328748424e-07, "loss": 4.5585, "step": 2711 }, { "epoch": 0.9764176417641764, "grad_norm": 0.5810762047767639, "learning_rate": 2.8023555876673937e-07, "loss": 4.8418, "step": 2712 }, { "epoch": 0.9767776777677768, "grad_norm": 0.6123738884925842, "learning_rate": 2.7168313061159964e-07, "loss": 4.5524, "step": 2713 }, { "epoch": 0.9771377137713771, "grad_norm": 0.6715987324714661, "learning_rate": 2.6326305976001055e-07, "loss": 4.9585, "step": 2714 }, { "epoch": 0.9774977497749775, "grad_norm": 0.6126198768615723, "learning_rate": 2.549753573869107e-07, "loss": 4.6261, "step": 2715 }, { "epoch": 0.9778577857785778, "grad_norm": 0.6247376203536987, "learning_rate": 2.468200344915572e-07, "loss": 4.7465, "step": 2716 }, { "epoch": 0.9782178217821782, "grad_norm": 0.6758084297180176, "learning_rate": 2.3879710189753656e-07, "loss": 4.6146, "step": 2717 }, { "epoch": 0.9785778577857785, "grad_norm": 0.8351864218711853, "learning_rate": 2.3090657025270912e-07, "loss": 4.9151, "step": 2718 }, { "epoch": 0.9789378937893789, "grad_norm": 1.0861241817474365, "learning_rate": 2.2314845002922025e-07, "loss": 4.75, "step": 2719 }, { "epoch": 0.9792979297929792, "grad_norm": 0.8706967830657959, "learning_rate": 2.15522751523467e-07, "loss": 5.033, "step": 2720 }, { "epoch": 0.9796579657965797, "grad_norm": 0.8538120985031128, "learning_rate": 2.080294848561426e-07, "loss": 4.5049, "step": 2721 }, { "epoch": 0.9800180018001801, "grad_norm": 0.567323625087738, "learning_rate": 2.0066865997212525e-07, "loss": 4.7168, "step": 2722 }, { "epoch": 0.9803780378037804, "grad_norm": 0.8506109118461609, "learning_rate": 1.9344028664056713e-07, "loss": 5.0249, "step": 2723 }, { "epoch": 0.9807380738073808, "grad_norm": 1.3589588403701782, "learning_rate": 1.8634437445479435e-07, "loss": 5.1037, "step": 2724 }, { "epoch": 0.9810981098109811, "grad_norm": 1.3143948316574097, "learning_rate": 1.7938093283236258e-07, "loss": 5.555, "step": 2725 }, { "epoch": 0.9814581458145815, "grad_norm": 1.7907097339630127, "learning_rate": 1.7254997101500137e-07, "loss": 4.8558, "step": 2726 }, { "epoch": 0.9818181818181818, "grad_norm": 0.6720486283302307, "learning_rate": 1.6585149806860324e-07, "loss": 4.4563, "step": 2727 }, { "epoch": 0.9821782178217822, "grad_norm": 0.6979610919952393, "learning_rate": 1.5928552288326793e-07, "loss": 4.5348, "step": 2728 }, { "epoch": 0.9825382538253825, "grad_norm": 0.8894920349121094, "learning_rate": 1.5285205417319149e-07, "loss": 4.7034, "step": 2729 }, { "epoch": 0.9828982898289829, "grad_norm": 0.6074763536453247, "learning_rate": 1.4655110047675503e-07, "loss": 4.5014, "step": 2730 }, { "epoch": 0.9832583258325832, "grad_norm": 0.7769091129302979, "learning_rate": 1.403826701564359e-07, "loss": 4.7347, "step": 2731 }, { "epoch": 0.9836183618361836, "grad_norm": 0.7279649376869202, "learning_rate": 1.3434677139885222e-07, "loss": 4.5844, "step": 2732 }, { "epoch": 0.983978397839784, "grad_norm": 0.8169944882392883, "learning_rate": 1.2844341221471824e-07, "loss": 4.939, "step": 2733 }, { "epoch": 0.9843384338433844, "grad_norm": 0.7737520337104797, "learning_rate": 1.2267260043885564e-07, "loss": 4.3103, "step": 2734 }, { "epoch": 0.9846984698469847, "grad_norm": 0.7852435111999512, "learning_rate": 1.170343437301491e-07, "loss": 4.6278, "step": 2735 }, { "epoch": 0.9850585058505851, "grad_norm": 0.5969253182411194, "learning_rate": 1.1152864957157949e-07, "loss": 4.5914, "step": 2736 }, { "epoch": 0.9854185418541854, "grad_norm": 0.8359677791595459, "learning_rate": 1.0615552527017958e-07, "loss": 4.5289, "step": 2737 }, { "epoch": 0.9857785778577858, "grad_norm": 0.8683612942695618, "learning_rate": 1.0091497795706728e-07, "loss": 4.5384, "step": 2738 }, { "epoch": 0.9861386138613861, "grad_norm": 0.6547925472259521, "learning_rate": 9.580701458736796e-08, "loss": 4.7082, "step": 2739 }, { "epoch": 0.9864986498649865, "grad_norm": 0.5458620190620422, "learning_rate": 9.083164194025883e-08, "loss": 4.6053, "step": 2740 }, { "epoch": 0.9868586858685868, "grad_norm": 0.6568934917449951, "learning_rate": 8.598886661895788e-08, "loss": 4.5799, "step": 2741 }, { "epoch": 0.9872187218721872, "grad_norm": 0.661688506603241, "learning_rate": 8.127869505069053e-08, "loss": 4.3995, "step": 2742 }, { "epoch": 0.9875787578757875, "grad_norm": 0.7148826122283936, "learning_rate": 7.670113348670071e-08, "loss": 4.607, "step": 2743 }, { "epoch": 0.987938793879388, "grad_norm": 0.5700961947441101, "learning_rate": 7.225618800222877e-08, "loss": 4.7385, "step": 2744 }, { "epoch": 0.9882988298829883, "grad_norm": 0.6849833130836487, "learning_rate": 6.794386449651135e-08, "loss": 4.9545, "step": 2745 }, { "epoch": 0.9886588658865887, "grad_norm": 0.6745620965957642, "learning_rate": 6.376416869277036e-08, "loss": 4.8148, "step": 2746 }, { "epoch": 0.989018901890189, "grad_norm": 0.9582657814025879, "learning_rate": 5.971710613821291e-08, "loss": 5.3346, "step": 2747 }, { "epoch": 0.9893789378937894, "grad_norm": 0.8358666300773621, "learning_rate": 5.5802682204009194e-08, "loss": 5.1755, "step": 2748 }, { "epoch": 0.9897389738973897, "grad_norm": 0.9131016135215759, "learning_rate": 5.2020902085303525e-08, "loss": 5.2653, "step": 2749 }, { "epoch": 0.9900990099009901, "grad_norm": 1.4401273727416992, "learning_rate": 4.837177080119215e-08, "loss": 5.5157, "step": 2750 }, { "epoch": 0.9904590459045904, "grad_norm": 0.676723301410675, "learning_rate": 4.485529319473436e-08, "loss": 4.8633, "step": 2751 }, { "epoch": 0.9908190819081908, "grad_norm": 0.7600442171096802, "learning_rate": 4.147147393290807e-08, "loss": 4.7974, "step": 2752 }, { "epoch": 0.9911791179117911, "grad_norm": 1.2827167510986328, "learning_rate": 3.8220317506654226e-08, "loss": 4.8168, "step": 2753 }, { "epoch": 0.9915391539153915, "grad_norm": 0.9814483523368835, "learning_rate": 3.510182823083241e-08, "loss": 4.736, "step": 2754 }, { "epoch": 0.991899189918992, "grad_norm": 0.9813688397407532, "learning_rate": 3.2116010244254144e-08, "loss": 4.7668, "step": 2755 }, { "epoch": 0.9922592259225923, "grad_norm": 0.7599554657936096, "learning_rate": 2.9262867509605163e-08, "loss": 4.7808, "step": 2756 }, { "epoch": 0.9926192619261927, "grad_norm": 0.5308656692504883, "learning_rate": 2.6542403813545334e-08, "loss": 4.571, "step": 2757 }, { "epoch": 0.992979297929793, "grad_norm": 0.5827195048332214, "learning_rate": 2.3954622766597657e-08, "loss": 4.7896, "step": 2758 }, { "epoch": 0.9933393339333934, "grad_norm": 0.9100288152694702, "learning_rate": 2.1499527803214846e-08, "loss": 4.8101, "step": 2759 }, { "epoch": 0.9936993699369937, "grad_norm": 1.0028470754623413, "learning_rate": 1.9177122181757156e-08, "loss": 4.6816, "step": 2760 }, { "epoch": 0.994059405940594, "grad_norm": 0.7796440720558167, "learning_rate": 1.698740898444795e-08, "loss": 4.6857, "step": 2761 }, { "epoch": 0.9944194419441944, "grad_norm": 0.6619350910186768, "learning_rate": 1.4930391117451426e-08, "loss": 4.8632, "step": 2762 }, { "epoch": 0.9947794779477948, "grad_norm": 0.6579605937004089, "learning_rate": 1.3006071310783797e-08, "loss": 4.8485, "step": 2763 }, { "epoch": 0.9951395139513951, "grad_norm": 0.7904688119888306, "learning_rate": 1.1214452118368802e-08, "loss": 4.6414, "step": 2764 }, { "epoch": 0.9954995499549955, "grad_norm": 0.6213950514793396, "learning_rate": 9.555535917993297e-09, "loss": 4.56, "step": 2765 }, { "epoch": 0.9958595859585958, "grad_norm": 0.5204569697380066, "learning_rate": 8.029324911351666e-09, "loss": 4.42, "step": 2766 }, { "epoch": 0.9962196219621963, "grad_norm": 0.5820871591567993, "learning_rate": 6.635821124001406e-09, "loss": 4.7025, "step": 2767 }, { "epoch": 0.9965796579657966, "grad_norm": 0.5928208231925964, "learning_rate": 5.375026405352035e-09, "loss": 5.1903, "step": 2768 }, { "epoch": 0.996939693969397, "grad_norm": 0.5016415119171143, "learning_rate": 4.246942428709488e-09, "loss": 4.7524, "step": 2769 }, { "epoch": 0.9972997299729973, "grad_norm": 0.650364339351654, "learning_rate": 3.2515706912539245e-09, "loss": 4.3618, "step": 2770 }, { "epoch": 0.9976597659765977, "grad_norm": 1.0009171962738037, "learning_rate": 2.388912514017516e-09, "loss": 4.9389, "step": 2771 }, { "epoch": 0.998019801980198, "grad_norm": 1.0807517766952515, "learning_rate": 1.6589690418955528e-09, "loss": 5.1394, "step": 2772 }, { "epoch": 0.9983798379837984, "grad_norm": 0.9495770335197449, "learning_rate": 1.0617412436464413e-09, "loss": 5.1927, "step": 2773 }, { "epoch": 0.9987398739873987, "grad_norm": 0.8993694186210632, "learning_rate": 5.972299119250125e-10, "loss": 5.2012, "step": 2774 }, { "epoch": 0.9990999099909991, "grad_norm": 1.2877267599105835, "learning_rate": 2.6543566319370275e-10, "loss": 5.2814, "step": 2775 }, { "epoch": 0.9994599459945994, "grad_norm": 0.5570570826530457, "learning_rate": 6.63589378113727e-11, "loss": 4.7664, "step": 2776 }, { "epoch": 0.9998199819981998, "grad_norm": 0.6813939213752747, "learning_rate": 0.0, "loss": 4.551, "step": 2777 } ], "logging_steps": 1, "max_steps": 2777, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 695, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 3015768690130944.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }