{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 2.0,
  "eval_steps": 37,
  "global_step": 290,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.006896551724137931,
      "grad_norm": 2.651252269744873,
      "learning_rate": 7.5e-07,
      "loss": 4.7891,
      "step": 1
    },
    {
      "epoch": 0.006896551724137931,
      "eval_loss": 3.3850624561309814,
      "eval_runtime": 4.5511,
      "eval_samples_per_second": 1.758,
      "eval_steps_per_second": 1.758,
      "step": 1
    },
    {
      "epoch": 0.013793103448275862,
      "grad_norm": 2.064390182495117,
      "learning_rate": 1.5e-06,
      "loss": 2.9792,
      "step": 2
    },
    {
      "epoch": 0.020689655172413793,
      "grad_norm": 1.254639983177185,
      "learning_rate": 2.25e-06,
      "loss": 3.0642,
      "step": 3
    },
    {
      "epoch": 0.027586206896551724,
      "grad_norm": 1.5396596193313599,
      "learning_rate": 3e-06,
      "loss": 3.249,
      "step": 4
    },
    {
      "epoch": 0.034482758620689655,
      "grad_norm": 2.1187007427215576,
      "learning_rate": 3.75e-06,
      "loss": 3.4551,
      "step": 5
    },
    {
      "epoch": 0.041379310344827586,
      "grad_norm": 3.609483003616333,
      "learning_rate": 4.5e-06,
      "loss": 6.0963,
      "step": 6
    },
    {
      "epoch": 0.04827586206896552,
      "grad_norm": 2.3606605529785156,
      "learning_rate": 5.25e-06,
      "loss": 4.5106,
      "step": 7
    },
    {
      "epoch": 0.05517241379310345,
      "grad_norm": 1.5973587036132812,
      "learning_rate": 6e-06,
      "loss": 3.3045,
      "step": 8
    },
    {
      "epoch": 0.06206896551724138,
      "grad_norm": 1.4481266736984253,
      "learning_rate": 6.750000000000001e-06,
      "loss": 3.5009,
      "step": 9
    },
    {
      "epoch": 0.06896551724137931,
      "grad_norm": 2.194455862045288,
      "learning_rate": 7.5e-06,
      "loss": 5.0099,
      "step": 10
    },
    {
      "epoch": 0.07586206896551724,
      "grad_norm": 2.1214380264282227,
      "learning_rate": 8.25e-06,
      "loss": 3.7116,
      "step": 11
    },
    {
      "epoch": 0.08275862068965517,
      "grad_norm": 1.501364827156067,
      "learning_rate": 9e-06,
      "loss": 3.3266,
      "step": 12
    },
    {
      "epoch": 0.0896551724137931,
      "grad_norm": 2.078202962875366,
      "learning_rate": 9.75e-06,
      "loss": 3.4969,
      "step": 13
    },
    {
      "epoch": 0.09655172413793103,
      "grad_norm": 1.9757059812545776,
      "learning_rate": 1.05e-05,
      "loss": 3.3945,
      "step": 14
    },
    {
      "epoch": 0.10344827586206896,
      "grad_norm": 2.730952739715576,
      "learning_rate": 1.125e-05,
      "loss": 3.5526,
      "step": 15
    },
    {
      "epoch": 0.1103448275862069,
      "grad_norm": 3.307832717895508,
      "learning_rate": 1.2e-05,
      "loss": 4.7907,
      "step": 16
    },
    {
      "epoch": 0.11724137931034483,
      "grad_norm": 2.1818623542785645,
      "learning_rate": 1.275e-05,
      "loss": 3.9304,
      "step": 17
    },
    {
      "epoch": 0.12413793103448276,
      "grad_norm": 3.2467010021209717,
      "learning_rate": 1.3500000000000001e-05,
      "loss": 4.9399,
      "step": 18
    },
    {
      "epoch": 0.1310344827586207,
      "grad_norm": 1.985250473022461,
      "learning_rate": 1.4249999999999999e-05,
      "loss": 2.9249,
      "step": 19
    },
    {
      "epoch": 0.13793103448275862,
      "grad_norm": 2.669128894805908,
      "learning_rate": 1.5e-05,
      "loss": 3.33,
      "step": 20
    },
    {
      "epoch": 0.14482758620689656,
      "grad_norm": 2.3027143478393555,
      "learning_rate": 1.575e-05,
      "loss": 2.6643,
      "step": 21
    },
    {
      "epoch": 0.15172413793103448,
      "grad_norm": 3.2221450805664062,
      "learning_rate": 1.65e-05,
      "loss": 3.6538,
      "step": 22
    },
    {
      "epoch": 0.15862068965517243,
      "grad_norm": 4.213558197021484,
      "learning_rate": 1.725e-05,
      "loss": 3.5537,
      "step": 23
    },
    {
      "epoch": 0.16551724137931034,
      "grad_norm": 1.8173574209213257,
      "learning_rate": 1.8e-05,
      "loss": 2.9893,
      "step": 24
    },
    {
      "epoch": 0.1724137931034483,
      "grad_norm": 2.863757610321045,
      "learning_rate": 1.8750000000000002e-05,
      "loss": 2.487,
      "step": 25
    },
    {
      "epoch": 0.1793103448275862,
      "grad_norm": 2.650667905807495,
      "learning_rate": 1.95e-05,
      "loss": 2.4155,
      "step": 26
    },
    {
      "epoch": 0.18620689655172415,
      "grad_norm": 2.5865488052368164,
      "learning_rate": 2.025e-05,
      "loss": 2.6802,
      "step": 27
    },
    {
      "epoch": 0.19310344827586207,
      "grad_norm": 2.5942306518554688,
      "learning_rate": 2.1e-05,
      "loss": 2.6337,
      "step": 28
    },
    {
      "epoch": 0.2,
      "grad_norm": 4.72637414932251,
      "learning_rate": 2.175e-05,
      "loss": 4.4921,
      "step": 29
    },
    {
      "epoch": 0.20689655172413793,
      "grad_norm": 3.370011806488037,
      "learning_rate": 2.25e-05,
      "loss": 2.8945,
      "step": 30
    },
    {
      "epoch": 0.21379310344827587,
      "grad_norm": 2.9828832149505615,
      "learning_rate": 2.3250000000000003e-05,
      "loss": 3.031,
      "step": 31
    },
    {
      "epoch": 0.2206896551724138,
      "grad_norm": 2.9294815063476562,
      "learning_rate": 2.4e-05,
      "loss": 2.7564,
      "step": 32
    },
    {
      "epoch": 0.22758620689655173,
      "grad_norm": 2.9029836654663086,
      "learning_rate": 2.475e-05,
      "loss": 2.635,
      "step": 33
    },
    {
      "epoch": 0.23448275862068965,
      "grad_norm": 3.5925707817077637,
      "learning_rate": 2.55e-05,
      "loss": 2.9502,
      "step": 34
    },
    {
      "epoch": 0.2413793103448276,
      "grad_norm": 4.757285118103027,
      "learning_rate": 2.625e-05,
      "loss": 2.0215,
      "step": 35
    },
    {
      "epoch": 0.2482758620689655,
      "grad_norm": 12.417799949645996,
      "learning_rate": 2.7000000000000002e-05,
      "loss": 4.3027,
      "step": 36
    },
    {
      "epoch": 0.25517241379310346,
      "grad_norm": 4.293732166290283,
      "learning_rate": 2.7750000000000004e-05,
      "loss": 1.8095,
      "step": 37
    },
    {
      "epoch": 0.25517241379310346,
      "eval_loss": 2.7702932357788086,
      "eval_runtime": 4.4922,
      "eval_samples_per_second": 1.781,
      "eval_steps_per_second": 1.781,
      "step": 37
    },
    {
      "epoch": 0.2620689655172414,
      "grad_norm": 4.490789890289307,
      "learning_rate": 2.8499999999999998e-05,
      "loss": 2.2294,
      "step": 38
    },
    {
      "epoch": 0.2689655172413793,
      "grad_norm": 6.464705467224121,
      "learning_rate": 2.925e-05,
      "loss": 1.8365,
      "step": 39
    },
    {
      "epoch": 0.27586206896551724,
      "grad_norm": 4.536788463592529,
      "learning_rate": 3e-05,
      "loss": 2.3919,
      "step": 40
    },
    {
      "epoch": 0.2827586206896552,
      "grad_norm": 4.616312503814697,
      "learning_rate": 2.9999842246463293e-05,
      "loss": 2.7667,
      "step": 41
    },
    {
      "epoch": 0.2896551724137931,
      "grad_norm": 9.268322944641113,
      "learning_rate": 2.9999368989171332e-05,
      "loss": 1.6036,
      "step": 42
    },
    {
      "epoch": 0.296551724137931,
      "grad_norm": 9.249737739562988,
      "learning_rate": 2.9998580238078518e-05,
      "loss": 3.5214,
      "step": 43
    },
    {
      "epoch": 0.30344827586206896,
      "grad_norm": 5.715229034423828,
      "learning_rate": 2.9997476009775285e-05,
      "loss": 2.8424,
      "step": 44
    },
    {
      "epoch": 0.3103448275862069,
      "grad_norm": 11.085472106933594,
      "learning_rate": 2.999605632748776e-05,
      "loss": 3.2402,
      "step": 45
    },
    {
      "epoch": 0.31724137931034485,
      "grad_norm": 9.64968204498291,
      "learning_rate": 2.999432122107726e-05,
      "loss": 2.5156,
      "step": 46
    },
    {
      "epoch": 0.32413793103448274,
      "grad_norm": 5.387156963348389,
      "learning_rate": 2.9992270727039674e-05,
      "loss": 2.5335,
      "step": 47
    },
    {
      "epoch": 0.3310344827586207,
      "grad_norm": 8.309078216552734,
      "learning_rate": 2.9989904888504697e-05,
      "loss": 3.9152,
      "step": 48
    },
    {
      "epoch": 0.33793103448275863,
      "grad_norm": 8.13180923461914,
      "learning_rate": 2.998722375523491e-05,
      "loss": 2.6522,
      "step": 49
    },
    {
      "epoch": 0.3448275862068966,
      "grad_norm": 5.079596042633057,
      "learning_rate": 2.9984227383624753e-05,
      "loss": 2.1673,
      "step": 50
    },
    {
      "epoch": 0.35172413793103446,
      "grad_norm": 4.810355186462402,
      "learning_rate": 2.9980915836699322e-05,
      "loss": 2.4192,
      "step": 51
    },
    {
      "epoch": 0.3586206896551724,
      "grad_norm": 5.691514015197754,
      "learning_rate": 2.9977289184113038e-05,
      "loss": 2.1087,
      "step": 52
    },
    {
      "epoch": 0.36551724137931035,
      "grad_norm": 6.280345439910889,
      "learning_rate": 2.997334750214822e-05,
      "loss": 2.1365,
      "step": 53
    },
    {
      "epoch": 0.3724137931034483,
      "grad_norm": 5.447160243988037,
      "learning_rate": 2.9969090873713425e-05,
      "loss": 2.3364,
      "step": 54
    },
    {
      "epoch": 0.3793103448275862,
      "grad_norm": 9.810458183288574,
      "learning_rate": 2.9964519388341753e-05,
      "loss": 2.6459,
      "step": 55
    },
    {
      "epoch": 0.38620689655172413,
      "grad_norm": 8.42009449005127,
      "learning_rate": 2.9959633142188928e-05,
      "loss": 2.449,
      "step": 56
    },
    {
      "epoch": 0.3931034482758621,
      "grad_norm": 4.881720066070557,
      "learning_rate": 2.99544322380313e-05,
      "loss": 2.1418,
      "step": 57
    },
    {
      "epoch": 0.4,
      "grad_norm": 5.82330322265625,
      "learning_rate": 2.994891678526368e-05,
      "loss": 1.7012,
      "step": 58
    },
    {
      "epoch": 0.4068965517241379,
      "grad_norm": 7.9598917961120605,
      "learning_rate": 2.994308689989702e-05,
      "loss": 1.935,
      "step": 59
    },
    {
      "epoch": 0.41379310344827586,
      "grad_norm": 4.582525253295898,
      "learning_rate": 2.9936942704555988e-05,
      "loss": 2.3673,
      "step": 60
    },
    {
      "epoch": 0.4206896551724138,
      "grad_norm": 5.968487739562988,
      "learning_rate": 2.9930484328476392e-05,
      "loss": 1.3351,
      "step": 61
    },
    {
      "epoch": 0.42758620689655175,
      "grad_norm": 6.217311859130859,
      "learning_rate": 2.992371190750246e-05,
      "loss": 2.8033,
      "step": 62
    },
    {
      "epoch": 0.43448275862068964,
      "grad_norm": 15.824493408203125,
      "learning_rate": 2.9916625584083965e-05,
      "loss": 3.5775,
      "step": 63
    },
    {
      "epoch": 0.4413793103448276,
      "grad_norm": 6.833436965942383,
      "learning_rate": 2.990922550727326e-05,
      "loss": 3.1572,
      "step": 64
    },
    {
      "epoch": 0.4482758620689655,
      "grad_norm": 7.139071464538574,
      "learning_rate": 2.9901511832722107e-05,
      "loss": 2.5818,
      "step": 65
    },
    {
      "epoch": 0.45517241379310347,
      "grad_norm": 5.222311973571777,
      "learning_rate": 2.989348472267844e-05,
      "loss": 1.6886,
      "step": 66
    },
    {
      "epoch": 0.46206896551724136,
      "grad_norm": 5.626909255981445,
      "learning_rate": 2.988514434598292e-05,
      "loss": 2.3763,
      "step": 67
    },
    {
      "epoch": 0.4689655172413793,
      "grad_norm": 4.76120138168335,
      "learning_rate": 2.9876490878065402e-05,
      "loss": 2.3714,
      "step": 68
    },
    {
      "epoch": 0.47586206896551725,
      "grad_norm": 4.187699317932129,
      "learning_rate": 2.9867524500941255e-05,
      "loss": 1.5454,
      "step": 69
    },
    {
      "epoch": 0.4827586206896552,
      "grad_norm": 6.578094959259033,
      "learning_rate": 2.9858245403207488e-05,
      "loss": 2.9233,
      "step": 70
    },
    {
      "epoch": 0.4896551724137931,
      "grad_norm": 21.87733268737793,
      "learning_rate": 2.9848653780038844e-05,
      "loss": 3.232,
      "step": 71
    },
    {
      "epoch": 0.496551724137931,
      "grad_norm": 8.493062973022461,
      "learning_rate": 2.9838749833183647e-05,
      "loss": 2.1025,
      "step": 72
    },
    {
      "epoch": 0.503448275862069,
      "grad_norm": 9.408147811889648,
      "learning_rate": 2.9828533770959584e-05,
      "loss": 1.8301,
      "step": 73
    },
    {
      "epoch": 0.5103448275862069,
      "grad_norm": 5.230013847351074,
      "learning_rate": 2.9818005808249323e-05,
      "loss": 1.6428,
      "step": 74
    },
    {
      "epoch": 0.5103448275862069,
      "eval_loss": 2.1590590476989746,
      "eval_runtime": 4.4994,
      "eval_samples_per_second": 1.778,
      "eval_steps_per_second": 1.778,
      "step": 74
    },
    {
      "epoch": 0.5172413793103449,
      "grad_norm": 5.788967132568359,
      "learning_rate": 2.9807166166495966e-05,
      "loss": 0.9333,
      "step": 75
    },
    {
      "epoch": 0.5241379310344828,
      "grad_norm": 5.982523441314697,
      "learning_rate": 2.979601507369843e-05,
      "loss": 1.8066,
      "step": 76
    },
    {
      "epoch": 0.5310344827586206,
      "grad_norm": 4.951727867126465,
      "learning_rate": 2.978455276440662e-05,
      "loss": 1.3418,
      "step": 77
    },
    {
      "epoch": 0.5379310344827586,
      "grad_norm": 5.705328941345215,
      "learning_rate": 2.977277947971652e-05,
      "loss": 1.1168,
      "step": 78
    },
    {
      "epoch": 0.5448275862068965,
      "grad_norm": 10.673117637634277,
      "learning_rate": 2.9760695467265096e-05,
      "loss": 1.9509,
      "step": 79
    },
    {
      "epoch": 0.5517241379310345,
      "grad_norm": 7.005712985992432,
      "learning_rate": 2.9748300981225112e-05,
      "loss": 2.8095,
      "step": 80
    },
    {
      "epoch": 0.5586206896551724,
      "grad_norm": 10.768919944763184,
      "learning_rate": 2.9735596282299767e-05,
      "loss": 1.1537,
      "step": 81
    },
    {
      "epoch": 0.5655172413793104,
      "grad_norm": 5.258718967437744,
      "learning_rate": 2.9722581637717225e-05,
      "loss": 2.0656,
      "step": 82
    },
    {
      "epoch": 0.5724137931034483,
      "grad_norm": 6.1790971755981445,
      "learning_rate": 2.9709257321224973e-05,
      "loss": 2.4952,
      "step": 83
    },
    {
      "epoch": 0.5793103448275863,
      "grad_norm": 5.152263164520264,
      "learning_rate": 2.9695623613084094e-05,
      "loss": 1.631,
      "step": 84
    },
    {
      "epoch": 0.5862068965517241,
      "grad_norm": 9.117701530456543,
      "learning_rate": 2.9681680800063333e-05,
      "loss": 1.5723,
      "step": 85
    },
    {
      "epoch": 0.593103448275862,
      "grad_norm": 15.927285194396973,
      "learning_rate": 2.966742917543311e-05,
      "loss": 4.0567,
      "step": 86
    },
    {
      "epoch": 0.6,
      "grad_norm": 7.300537109375,
      "learning_rate": 2.9652869038959308e-05,
      "loss": 2.0857,
      "step": 87
    },
    {
      "epoch": 0.6068965517241379,
      "grad_norm": 5.633571147918701,
      "learning_rate": 2.9638000696897004e-05,
      "loss": 1.5046,
      "step": 88
    },
    {
      "epoch": 0.6137931034482759,
      "grad_norm": 5.7234625816345215,
      "learning_rate": 2.9622824461984e-05,
      "loss": 2.1678,
      "step": 89
    },
    {
      "epoch": 0.6206896551724138,
      "grad_norm": 5.1384124755859375,
      "learning_rate": 2.9607340653434263e-05,
      "loss": 1.2331,
      "step": 90
    },
    {
      "epoch": 0.6275862068965518,
      "grad_norm": 9.393917083740234,
      "learning_rate": 2.9591549596931196e-05,
      "loss": 2.3239,
      "step": 91
    },
    {
      "epoch": 0.6344827586206897,
      "grad_norm": 29.67850112915039,
      "learning_rate": 2.957545162462081e-05,
      "loss": 2.7711,
      "step": 92
    },
    {
      "epoch": 0.6413793103448275,
      "grad_norm": 11.757678985595703,
      "learning_rate": 2.95590470751047e-05,
      "loss": 0.862,
      "step": 93
    },
    {
      "epoch": 0.6482758620689655,
      "grad_norm": 5.055336952209473,
      "learning_rate": 2.954233629343297e-05,
      "loss": 0.7167,
      "step": 94
    },
    {
      "epoch": 0.6551724137931034,
      "grad_norm": 6.484320640563965,
      "learning_rate": 2.9525319631096936e-05,
      "loss": 1.518,
      "step": 95
    },
    {
      "epoch": 0.6620689655172414,
      "grad_norm": 21.282947540283203,
      "learning_rate": 2.950799744602176e-05,
      "loss": 1.6637,
      "step": 96
    },
    {
      "epoch": 0.6689655172413793,
      "grad_norm": 4.83950662612915,
      "learning_rate": 2.94903701025589e-05,
      "loss": 1.8401,
      "step": 97
    },
    {
      "epoch": 0.6758620689655173,
      "grad_norm": 8.747105598449707,
      "learning_rate": 2.9472437971478456e-05,
      "loss": 2.6096,
      "step": 98
    },
    {
      "epoch": 0.6827586206896552,
      "grad_norm": 5.483854293823242,
      "learning_rate": 2.9454201429961377e-05,
      "loss": 1.1973,
      "step": 99
    },
    {
      "epoch": 0.6896551724137931,
      "grad_norm": 8.147703170776367,
      "learning_rate": 2.9435660861591523e-05,
      "loss": 2.2331,
      "step": 100
    },
    {
      "epoch": 0.696551724137931,
      "grad_norm": 10.85132884979248,
      "learning_rate": 2.9416816656347585e-05,
      "loss": 1.3165,
      "step": 101
    },
    {
      "epoch": 0.7034482758620689,
      "grad_norm": 9.249916076660156,
      "learning_rate": 2.9397669210594905e-05,
      "loss": 2.4115,
      "step": 102
    },
    {
      "epoch": 0.7103448275862069,
      "grad_norm": 8.229886054992676,
      "learning_rate": 2.9378218927077116e-05,
      "loss": 1.3861,
      "step": 103
    },
    {
      "epoch": 0.7172413793103448,
      "grad_norm": 5.1249613761901855,
      "learning_rate": 2.9358466214907692e-05,
      "loss": 1.3119,
      "step": 104
    },
    {
      "epoch": 0.7241379310344828,
      "grad_norm": 7.522292137145996,
      "learning_rate": 2.9338411489561327e-05,
      "loss": 2.3285,
      "step": 105
    },
    {
      "epoch": 0.7310344827586207,
      "grad_norm": 8.781840324401855,
      "learning_rate": 2.931805517286519e-05,
      "loss": 2.3725,
      "step": 106
    },
    {
      "epoch": 0.7379310344827587,
      "grad_norm": 8.974847793579102,
      "learning_rate": 2.929739769299009e-05,
      "loss": 2.7679,
      "step": 107
    },
    {
      "epoch": 0.7448275862068966,
      "grad_norm": 13.757139205932617,
      "learning_rate": 2.927643948444142e-05,
      "loss": 2.8373,
      "step": 108
    },
    {
      "epoch": 0.7517241379310344,
      "grad_norm": 6.372509956359863,
      "learning_rate": 2.925518098805005e-05,
      "loss": 0.3335,
      "step": 109
    },
    {
      "epoch": 0.7586206896551724,
      "grad_norm": 18.51079750061035,
      "learning_rate": 2.9233622650963046e-05,
      "loss": 3.7183,
      "step": 110
    },
    {
      "epoch": 0.7655172413793103,
      "grad_norm": 7.130232810974121,
      "learning_rate": 2.921176492663426e-05,
      "loss": 2.7397,
      "step": 111
    },
    {
      "epoch": 0.7655172413793103,
      "eval_loss": 2.076895236968994,
      "eval_runtime": 4.5121,
      "eval_samples_per_second": 1.773,
      "eval_steps_per_second": 1.773,
      "step": 111
    },
    {
      "epoch": 0.7724137931034483,
      "grad_norm": 5.484634876251221,
      "learning_rate": 2.9189608274814813e-05,
      "loss": 1.0504,
      "step": 112
    },
    {
      "epoch": 0.7793103448275862,
      "grad_norm": 7.2361979484558105,
      "learning_rate": 2.916715316154339e-05,
      "loss": 1.2353,
      "step": 113
    },
    {
      "epoch": 0.7862068965517242,
      "grad_norm": 5.681639194488525,
      "learning_rate": 2.9144400059136457e-05,
      "loss": 1.5471,
      "step": 114
    },
    {
      "epoch": 0.7931034482758621,
      "grad_norm": 8.158857345581055,
      "learning_rate": 2.9121349446178338e-05,
      "loss": 2.4476,
      "step": 115
    },
    {
      "epoch": 0.8,
      "grad_norm": 6.511702537536621,
      "learning_rate": 2.909800180751112e-05,
      "loss": 1.1023,
      "step": 116
    },
    {
      "epoch": 0.8068965517241379,
      "grad_norm": 9.999719619750977,
      "learning_rate": 2.907435763422449e-05,
      "loss": 0.5992,
      "step": 117
    },
    {
      "epoch": 0.8137931034482758,
      "grad_norm": 7.8701171875,
      "learning_rate": 2.9050417423645374e-05,
      "loss": 1.6095,
      "step": 118
    },
    {
      "epoch": 0.8206896551724138,
      "grad_norm": 6.189757823944092,
      "learning_rate": 2.9026181679327483e-05,
      "loss": 1.5167,
      "step": 119
    },
    {
      "epoch": 0.8275862068965517,
      "grad_norm": 17.398225784301758,
      "learning_rate": 2.9001650911040744e-05,
      "loss": 1.3394,
      "step": 120
    },
    {
      "epoch": 0.8344827586206897,
      "grad_norm": 13.492450714111328,
      "learning_rate": 2.897682563476054e-05,
      "loss": 5.1631,
      "step": 121
    },
    {
      "epoch": 0.8413793103448276,
      "grad_norm": 6.701801776885986,
      "learning_rate": 2.8951706372656898e-05,
      "loss": 1.4963,
      "step": 122
    },
    {
      "epoch": 0.8482758620689655,
      "grad_norm": 11.134561538696289,
      "learning_rate": 2.8926293653083475e-05,
      "loss": 2.0888,
      "step": 123
    },
    {
      "epoch": 0.8551724137931035,
      "grad_norm": 5.3243608474731445,
      "learning_rate": 2.890058801056645e-05,
      "loss": 1.9287,
      "step": 124
    },
    {
      "epoch": 0.8620689655172413,
      "grad_norm": 5.231749057769775,
      "learning_rate": 2.8874589985793298e-05,
      "loss": 1.543,
      "step": 125
    },
    {
      "epoch": 0.8689655172413793,
      "grad_norm": 29.016874313354492,
      "learning_rate": 2.88483001256014e-05,
      "loss": 3.8339,
      "step": 126
    },
    {
      "epoch": 0.8758620689655172,
      "grad_norm": 5.226491451263428,
      "learning_rate": 2.8821718982966544e-05,
      "loss": 1.189,
      "step": 127
    },
    {
      "epoch": 0.8827586206896552,
      "grad_norm": 5.349326133728027,
      "learning_rate": 2.87948471169913e-05,
      "loss": 0.4606,
      "step": 128
    },
    {
      "epoch": 0.8896551724137931,
      "grad_norm": 8.602606773376465,
      "learning_rate": 2.8767685092893244e-05,
      "loss": 3.0269,
      "step": 129
    },
    {
      "epoch": 0.896551724137931,
      "grad_norm": 8.23015308380127,
      "learning_rate": 2.874023348199311e-05,
      "loss": 1.2428,
      "step": 130
    },
    {
      "epoch": 0.903448275862069,
      "grad_norm": 5.327670097351074,
      "learning_rate": 2.8712492861702712e-05,
      "loss": 0.9737,
      "step": 131
    },
    {
      "epoch": 0.9103448275862069,
      "grad_norm": 4.516807556152344,
      "learning_rate": 2.868446381551285e-05,
      "loss": 1.2708,
      "step": 132
    },
    {
      "epoch": 0.9172413793103448,
      "grad_norm": 7.308916091918945,
      "learning_rate": 2.865614693298101e-05,
      "loss": 2.5184,
      "step": 133
    },
    {
      "epoch": 0.9241379310344827,
      "grad_norm": 7.566427230834961,
      "learning_rate": 2.8627542809718972e-05,
      "loss": 1.4915,
      "step": 134
    },
    {
      "epoch": 0.9310344827586207,
      "grad_norm": 6.399407863616943,
      "learning_rate": 2.8598652047380292e-05,
      "loss": 1.9207,
      "step": 135
    },
    {
      "epoch": 0.9379310344827586,
      "grad_norm": 5.2886786460876465,
      "learning_rate": 2.8569475253647624e-05,
      "loss": 1.0044,
      "step": 136
    },
    {
      "epoch": 0.9448275862068966,
      "grad_norm": 4.2925310134887695,
      "learning_rate": 2.854001304221995e-05,
      "loss": 1.6515,
      "step": 137
    },
    {
      "epoch": 0.9517241379310345,
      "grad_norm": 3.5389490127563477,
      "learning_rate": 2.8510266032799688e-05,
      "loss": 0.624,
      "step": 138
    },
    {
      "epoch": 0.9586206896551724,
      "grad_norm": 5.259653568267822,
      "learning_rate": 2.8480234851079622e-05,
      "loss": 1.5153,
      "step": 139
    },
    {
      "epoch": 0.9655172413793104,
      "grad_norm": 7.633677959442139,
      "learning_rate": 2.8449920128729772e-05,
      "loss": 1.6288,
      "step": 140
    },
    {
      "epoch": 0.9724137931034482,
      "grad_norm": 7.951615333557129,
      "learning_rate": 2.841932250338409e-05,
      "loss": 2.4153,
      "step": 141
    },
    {
      "epoch": 0.9793103448275862,
      "grad_norm": 4.297111511230469,
      "learning_rate": 2.8388442618627063e-05,
      "loss": 0.4714,
      "step": 142
    },
    {
      "epoch": 0.9862068965517241,
      "grad_norm": 5.56075382232666,
      "learning_rate": 2.8357281123980153e-05,
      "loss": 1.3144,
      "step": 143
    },
    {
      "epoch": 0.993103448275862,
      "grad_norm": 7.056431293487549,
      "learning_rate": 2.8325838674888168e-05,
      "loss": 2.9863,
      "step": 144
    },
    {
      "epoch": 1.0,
      "grad_norm": 5.265207290649414,
      "learning_rate": 2.829411593270545e-05,
      "loss": 1.8668,
      "step": 145
    },
    {
      "epoch": 1.006896551724138,
      "grad_norm": 6.738914966583252,
      "learning_rate": 2.826211356468196e-05,
      "loss": 1.6213,
      "step": 146
    },
    {
      "epoch": 1.013793103448276,
      "grad_norm": 19.16160774230957,
      "learning_rate": 2.822983224394926e-05,
      "loss": 1.2061,
      "step": 147
    },
    {
      "epoch": 1.0206896551724138,
      "grad_norm": 6.643246173858643,
      "learning_rate": 2.8197272649506363e-05,
      "loss": 1.936,
      "step": 148
    },
    {
      "epoch": 1.0206896551724138,
      "eval_loss": 1.9735311269760132,
      "eval_runtime": 4.4975,
      "eval_samples_per_second": 1.779,
      "eval_steps_per_second": 1.779,
      "step": 148
    },
    {
      "epoch": 1.0275862068965518,
      "grad_norm": 5.578238010406494,
      "learning_rate": 2.8164435466205423e-05,
      "loss": 1.7389,
      "step": 149
    },
    {
      "epoch": 1.0344827586206897,
      "grad_norm": 10.583780288696289,
      "learning_rate": 2.8131321384737344e-05,
      "loss": 0.4309,
      "step": 150
    },
    {
      "epoch": 1.0413793103448277,
      "grad_norm": 7.088948726654053,
      "learning_rate": 2.809793110161725e-05,
      "loss": 2.5514,
      "step": 151
    },
    {
      "epoch": 1.0482758620689656,
      "grad_norm": 5.819264888763428,
      "learning_rate": 2.8064265319169854e-05,
      "loss": 1.3072,
      "step": 152
    },
    {
      "epoch": 1.0551724137931036,
      "grad_norm": 6.2393364906311035,
      "learning_rate": 2.803032474551465e-05,
      "loss": 1.2324,
      "step": 153
    },
    {
      "epoch": 1.0620689655172413,
      "grad_norm": 4.304983139038086,
      "learning_rate": 2.799611009455104e-05,
      "loss": 0.1901,
      "step": 154
    },
    {
      "epoch": 1.0689655172413792,
      "grad_norm": 5.9356865882873535,
      "learning_rate": 2.7961622085943317e-05,
      "loss": 1.7746,
      "step": 155
    },
    {
      "epoch": 1.0758620689655172,
      "grad_norm": 5.441093444824219,
      "learning_rate": 2.792686144510553e-05,
      "loss": 1.9543,
      "step": 156
    },
    {
      "epoch": 1.0827586206896551,
      "grad_norm": 6.648105144500732,
      "learning_rate": 2.789182890318621e-05,
      "loss": 2.7205,
      "step": 157
    },
    {
      "epoch": 1.089655172413793,
      "grad_norm": 44.886863708496094,
      "learning_rate": 2.785652519705301e-05,
      "loss": 3.465,
      "step": 158
    },
    {
      "epoch": 1.096551724137931,
      "grad_norm": 5.482683181762695,
      "learning_rate": 2.78209510692772e-05,
      "loss": 1.4479,
      "step": 159
    },
    {
      "epoch": 1.103448275862069,
      "grad_norm": 7.18867826461792,
      "learning_rate": 2.778510726811804e-05,
      "loss": 2.3233,
      "step": 160
    },
    {
      "epoch": 1.110344827586207,
      "grad_norm": 10.170475006103516,
      "learning_rate": 2.7748994547507052e-05,
      "loss": 1.4287,
      "step": 161
    },
    {
      "epoch": 1.1172413793103448,
      "grad_norm": 16.519548416137695,
      "learning_rate": 2.7712613667032156e-05,
      "loss": 2.0423,
      "step": 162
    },
    {
      "epoch": 1.1241379310344828,
      "grad_norm": 6.548756122589111,
      "learning_rate": 2.7675965391921692e-05,
      "loss": 0.8162,
      "step": 163
    },
    {
      "epoch": 1.1310344827586207,
      "grad_norm": 10.879814147949219,
      "learning_rate": 2.763905049302833e-05,
      "loss": 2.0672,
      "step": 164
    },
    {
      "epoch": 1.1379310344827587,
      "grad_norm": 6.071365833282471,
      "learning_rate": 2.7601869746812855e-05,
      "loss": 1.5196,
      "step": 165
    },
    {
      "epoch": 1.1448275862068966,
      "grad_norm": 6.293059349060059,
      "learning_rate": 2.7564423935327817e-05,
      "loss": 0.7617,
      "step": 166
    },
    {
      "epoch": 1.1517241379310346,
      "grad_norm": 8.047264099121094,
      "learning_rate": 2.7526713846201118e-05,
      "loss": 0.4147,
      "step": 167
    },
    {
      "epoch": 1.1586206896551725,
      "grad_norm": 3.7810680866241455,
      "learning_rate": 2.7488740272619413e-05,
      "loss": 1.0664,
      "step": 168
    },
    {
      "epoch": 1.1655172413793102,
      "grad_norm": 6.20160436630249,
      "learning_rate": 2.7450504013311443e-05,
      "loss": 1.9879,
      "step": 169
    },
    {
      "epoch": 1.1724137931034484,
      "grad_norm": 6.690729141235352,
      "learning_rate": 2.7412005872531222e-05,
      "loss": 1.4968,
      "step": 170
    },
    {
      "epoch": 1.1793103448275861,
      "grad_norm": 9.275552749633789,
      "learning_rate": 2.737324666004113e-05,
      "loss": 4.3534,
      "step": 171
    },
    {
      "epoch": 1.186206896551724,
      "grad_norm": 5.3003106117248535,
      "learning_rate": 2.7334227191094885e-05,
      "loss": 0.9354,
      "step": 172
    },
    {
      "epoch": 1.193103448275862,
      "grad_norm": 6.302150726318359,
      "learning_rate": 2.729494828642038e-05,
      "loss": 1.2665,
      "step": 173
    },
    {
      "epoch": 1.2,
      "grad_norm": 7.4209303855896,
      "learning_rate": 2.7255410772202435e-05,
      "loss": 1.5308,
      "step": 174
    },
    {
      "epoch": 1.206896551724138,
      "grad_norm": 8.516342163085938,
      "learning_rate": 2.7215615480065415e-05,
      "loss": 1.1144,
      "step": 175
    },
    {
      "epoch": 1.2137931034482758,
      "grad_norm": 4.906225204467773,
      "learning_rate": 2.7175563247055723e-05,
      "loss": 2.0357,
      "step": 176
    },
    {
      "epoch": 1.2206896551724138,
      "grad_norm": 5.213315010070801,
      "learning_rate": 2.7135254915624213e-05,
      "loss": 1.6,
      "step": 177
    },
    {
      "epoch": 1.2275862068965517,
      "grad_norm": 6.571521282196045,
      "learning_rate": 2.709469133360847e-05,
      "loss": 1.4624,
      "step": 178
    },
    {
      "epoch": 1.2344827586206897,
      "grad_norm": 3.46243953704834,
      "learning_rate": 2.7053873354214957e-05,
      "loss": 0.3171,
      "step": 179
    },
    {
      "epoch": 1.2413793103448276,
      "grad_norm": 5.865048885345459,
      "learning_rate": 2.7012801836001098e-05,
      "loss": 1.9204,
      "step": 180
    },
    {
      "epoch": 1.2482758620689656,
      "grad_norm": 6.989517688751221,
      "learning_rate": 2.6971477642857185e-05,
      "loss": 1.518,
      "step": 181
    },
    {
      "epoch": 1.2551724137931035,
      "grad_norm": 4.894804000854492,
      "learning_rate": 2.6929901643988237e-05,
      "loss": 0.8945,
      "step": 182
    },
    {
      "epoch": 1.2620689655172415,
      "grad_norm": 4.024252414703369,
      "learning_rate": 2.6888074713895705e-05,
      "loss": 0.9237,
      "step": 183
    },
    {
      "epoch": 1.2689655172413792,
      "grad_norm": 6.457338333129883,
      "learning_rate": 2.6845997732359074e-05,
      "loss": 2.3232,
      "step": 184
    },
    {
      "epoch": 1.2758620689655173,
      "grad_norm": 7.112627029418945,
      "learning_rate": 2.680367158441736e-05,
      "loss": 0.2386,
      "step": 185
    },
    {
      "epoch": 1.2758620689655173,
      "eval_loss": 1.9213309288024902,
      "eval_runtime": 4.497,
      "eval_samples_per_second": 1.779,
      "eval_steps_per_second": 1.779,
      "step": 185
    },
    {
      "epoch": 1.282758620689655,
      "grad_norm": 15.904428482055664,
      "learning_rate": 2.676109716035051e-05,
      "loss": 1.914,
      "step": 186
    },
    {
      "epoch": 1.2896551724137932,
      "grad_norm": 6.568241596221924,
      "learning_rate": 2.6718275355660643e-05,
      "loss": 0.952,
      "step": 187
    },
    {
      "epoch": 1.296551724137931,
      "grad_norm": 10.139225959777832,
      "learning_rate": 2.667520707105325e-05,
      "loss": 2.1507,
      "step": 188
    },
    {
      "epoch": 1.303448275862069,
      "grad_norm": 6.300340175628662,
      "learning_rate": 2.6631893212418227e-05,
      "loss": 1.8095,
      "step": 189
    },
    {
      "epoch": 1.3103448275862069,
      "grad_norm": 4.754003047943115,
      "learning_rate": 2.6588334690810826e-05,
      "loss": 1.2223,
      "step": 190
    },
    {
      "epoch": 1.3172413793103448,
      "grad_norm": 5.433348655700684,
      "learning_rate": 2.654453242243249e-05,
      "loss": 0.9192,
      "step": 191
    },
    {
      "epoch": 1.3241379310344827,
      "grad_norm": 14.02868366241455,
      "learning_rate": 2.6500487328611584e-05,
      "loss": 1.642,
      "step": 192
    },
    {
      "epoch": 1.3310344827586207,
      "grad_norm": 4.654847621917725,
      "learning_rate": 2.645620033578402e-05,
      "loss": 1.6479,
      "step": 193
    },
    {
      "epoch": 1.3379310344827586,
      "grad_norm": 6.280045509338379,
      "learning_rate": 2.6411672375473768e-05,
      "loss": 1.9089,
      "step": 194
    },
    {
      "epoch": 1.3448275862068966,
      "grad_norm": 6.455976486206055,
      "learning_rate": 2.6366904384273252e-05,
      "loss": 0.8846,
      "step": 195
    },
    {
      "epoch": 1.3517241379310345,
      "grad_norm": 13.289530754089355,
      "learning_rate": 2.6321897303823665e-05,
      "loss": 1.32,
      "step": 196
    },
    {
      "epoch": 1.3586206896551725,
      "grad_norm": 16.279457092285156,
      "learning_rate": 2.6276652080795157e-05,
      "loss": 0.7443,
      "step": 197
    },
    {
      "epoch": 1.3655172413793104,
      "grad_norm": 5.546213626861572,
      "learning_rate": 2.6231169666866928e-05,
      "loss": 1.777,
      "step": 198
    },
    {
      "epoch": 1.3724137931034484,
      "grad_norm": 6.5098748207092285,
      "learning_rate": 2.6185451018707188e-05,
      "loss": 0.1505,
      "step": 199
    },
    {
      "epoch": 1.3793103448275863,
      "grad_norm": 7.808483123779297,
      "learning_rate": 2.613949709795307e-05,
      "loss": 2.0242,
      "step": 200
    },
    {
      "epoch": 1.386206896551724,
      "grad_norm": 19.0113582611084,
      "learning_rate": 2.6093308871190376e-05,
      "loss": 0.5734,
      "step": 201
    },
    {
      "epoch": 1.3931034482758622,
      "grad_norm": 5.4320268630981445,
      "learning_rate": 2.6046887309933252e-05,
      "loss": 1.9298,
      "step": 202
    },
    {
      "epoch": 1.4,
      "grad_norm": 5.639513969421387,
      "learning_rate": 2.6000233390603764e-05,
      "loss": 2.1008,
      "step": 203
    },
    {
      "epoch": 1.4068965517241379,
      "grad_norm": 6.506295680999756,
      "learning_rate": 2.595334809451135e-05,
      "loss": 2.0629,
      "step": 204
    },
    {
      "epoch": 1.4137931034482758,
      "grad_norm": 6.257269382476807,
      "learning_rate": 2.590623240783217e-05,
      "loss": 0.9089,
      "step": 205
    },
    {
      "epoch": 1.4206896551724137,
      "grad_norm": 7.4428534507751465,
      "learning_rate": 2.5858887321588403e-05,
      "loss": 1.6967,
      "step": 206
    },
    {
      "epoch": 1.4275862068965517,
      "grad_norm": 7.234190464019775,
      "learning_rate": 2.5811313831627343e-05,
      "loss": 1.1008,
      "step": 207
    },
    {
      "epoch": 1.4344827586206896,
      "grad_norm": 4.542428493499756,
      "learning_rate": 2.5763512938600496e-05,
      "loss": 1.0296,
      "step": 208
    },
    {
      "epoch": 1.4413793103448276,
      "grad_norm": 4.9460978507995605,
      "learning_rate": 2.5715485647942526e-05,
      "loss": 1.2251,
      "step": 209
    },
    {
      "epoch": 1.4482758620689655,
      "grad_norm": 8.297338485717773,
      "learning_rate": 2.566723296985009e-05,
      "loss": 1.244,
      "step": 210
    },
    {
      "epoch": 1.4551724137931035,
      "grad_norm": 5.6764326095581055,
      "learning_rate": 2.561875591926061e-05,
      "loss": 2.075,
      "step": 211
    },
    {
      "epoch": 1.4620689655172414,
      "grad_norm": 7.905850410461426,
      "learning_rate": 2.55700555158309e-05,
      "loss": 1.1372,
      "step": 212
    },
    {
      "epoch": 1.4689655172413794,
      "grad_norm": 7.2166523933410645,
      "learning_rate": 2.552113278391575e-05,
      "loss": 0.8753,
      "step": 213
    },
    {
      "epoch": 1.4758620689655173,
      "grad_norm": 16.44755744934082,
      "learning_rate": 2.5471988752546358e-05,
      "loss": 1.3347,
      "step": 214
    },
    {
      "epoch": 1.4827586206896552,
      "grad_norm": 6.070692539215088,
      "learning_rate": 2.542262445540869e-05,
      "loss": 1.7499,
      "step": 215
    },
    {
      "epoch": 1.489655172413793,
      "grad_norm": 7.000082492828369,
      "learning_rate": 2.5373040930821747e-05,
      "loss": 2.1478,
      "step": 216
    },
    {
      "epoch": 1.4965517241379311,
      "grad_norm": 7.349568843841553,
      "learning_rate": 2.5323239221715704e-05,
      "loss": 1.4035,
      "step": 217
    },
    {
      "epoch": 1.5034482758620689,
      "grad_norm": 6.242449760437012,
      "learning_rate": 2.5273220375609993e-05,
      "loss": 1.4238,
      "step": 218
    },
    {
      "epoch": 1.510344827586207,
      "grad_norm": 5.620000839233398,
      "learning_rate": 2.5222985444591268e-05,
      "loss": 1.3079,
      "step": 219
    },
    {
      "epoch": 1.5172413793103448,
      "grad_norm": 4.874967098236084,
      "learning_rate": 2.5172535485291263e-05,
      "loss": 0.5889,
      "step": 220
    },
    {
      "epoch": 1.524137931034483,
      "grad_norm": 13.453874588012695,
      "learning_rate": 2.5121871558864588e-05,
      "loss": 3.2711,
      "step": 221
    },
    {
      "epoch": 1.5310344827586206,
      "grad_norm": 24.48921775817871,
      "learning_rate": 2.5070994730966375e-05,
      "loss": 2.8233,
      "step": 222
    },
    {
      "epoch": 1.5310344827586206,
      "eval_loss": 1.8946478366851807,
      "eval_runtime": 4.54,
      "eval_samples_per_second": 1.762,
      "eval_steps_per_second": 1.762,
      "step": 222
    },
    {
      "epoch": 1.5379310344827586,
      "grad_norm": 4.243319034576416,
      "learning_rate": 2.5019906071729905e-05,
      "loss": 0.4053,
      "step": 223
    },
    {
      "epoch": 1.5448275862068965,
      "grad_norm": 9.057891845703125,
      "learning_rate": 2.496860665574406e-05,
      "loss": 2.5334,
      "step": 224
    },
    {
      "epoch": 1.5517241379310345,
      "grad_norm": 8.277237892150879,
      "learning_rate": 2.4917097562030756e-05,
      "loss": 2.1032,
      "step": 225
    },
    {
      "epoch": 1.5586206896551724,
      "grad_norm": 4.527651786804199,
      "learning_rate": 2.4865379874022212e-05,
      "loss": 0.6529,
      "step": 226
    },
    {
      "epoch": 1.5655172413793104,
      "grad_norm": 4.352250099182129,
      "learning_rate": 2.4813454679538192e-05,
      "loss": 0.2521,
      "step": 227
    },
    {
      "epoch": 1.5724137931034483,
      "grad_norm": 5.923490524291992,
      "learning_rate": 2.4761323070763103e-05,
      "loss": 1.8706,
      "step": 228
    },
    {
      "epoch": 1.5793103448275863,
      "grad_norm": 6.146652698516846,
      "learning_rate": 2.4708986144223038e-05,
      "loss": 1.4523,
      "step": 229
    },
    {
      "epoch": 1.5862068965517242,
      "grad_norm": 8.59493350982666,
      "learning_rate": 2.4656445000762695e-05,
      "loss": 2.0485,
      "step": 230
    },
    {
      "epoch": 1.593103448275862,
      "grad_norm": 14.89201545715332,
      "learning_rate": 2.4603700745522238e-05,
      "loss": 1.6968,
      "step": 231
    },
    {
      "epoch": 1.6,
      "grad_norm": 7.599427700042725,
      "learning_rate": 2.455075448791403e-05,
      "loss": 0.5584,
      "step": 232
    },
    {
      "epoch": 1.6068965517241378,
      "grad_norm": 6.137027263641357,
      "learning_rate": 2.4497607341599338e-05,
      "loss": 1.2578,
      "step": 233
    },
    {
      "epoch": 1.613793103448276,
      "grad_norm": 13.683157920837402,
      "learning_rate": 2.444426042446486e-05,
      "loss": 0.3959,
      "step": 234
    },
    {
      "epoch": 1.6206896551724137,
      "grad_norm": 6.342459201812744,
      "learning_rate": 2.439071485859924e-05,
      "loss": 1.5216,
      "step": 235
    },
    {
      "epoch": 1.6275862068965519,
      "grad_norm": 4.803065776824951,
      "learning_rate": 2.433697177026947e-05,
      "loss": 1.2224,
      "step": 236
    },
    {
      "epoch": 1.6344827586206896,
      "grad_norm": 19.011320114135742,
      "learning_rate": 2.4283032289897184e-05,
      "loss": 2.417,
      "step": 237
    },
    {
      "epoch": 1.6413793103448275,
      "grad_norm": 13.680153846740723,
      "learning_rate": 2.4228897552034885e-05,
      "loss": 1.0688,
      "step": 238
    },
    {
      "epoch": 1.6482758620689655,
      "grad_norm": 5.316374778747559,
      "learning_rate": 2.417456869534209e-05,
      "loss": 1.804,
      "step": 239
    },
    {
      "epoch": 1.6551724137931034,
      "grad_norm": 4.965254783630371,
      "learning_rate": 2.4120046862561367e-05,
      "loss": 1.0666,
      "step": 240
    },
    {
      "epoch": 1.6620689655172414,
      "grad_norm": 5.786849498748779,
      "learning_rate": 2.406533320049431e-05,
      "loss": 1.4944,
      "step": 241
    },
    {
      "epoch": 1.6689655172413793,
      "grad_norm": 6.54045295715332,
      "learning_rate": 2.4010428859977416e-05,
      "loss": 0.9506,
      "step": 242
    },
    {
      "epoch": 1.6758620689655173,
      "grad_norm": 8.71989631652832,
      "learning_rate": 2.3955334995857866e-05,
      "loss": 1.8664,
      "step": 243
    },
    {
      "epoch": 1.6827586206896552,
      "grad_norm": 7.771244525909424,
      "learning_rate": 2.3900052766969252e-05,
      "loss": 2.7829,
      "step": 244
    },
    {
      "epoch": 1.6896551724137931,
      "grad_norm": 13.747180938720703,
      "learning_rate": 2.3844583336107192e-05,
      "loss": 0.6658,
      "step": 245
    },
    {
      "epoch": 1.6965517241379309,
      "grad_norm": 8.790626525878906,
      "learning_rate": 2.378892787000487e-05,
      "loss": 1.6198,
      "step": 246
    },
    {
      "epoch": 1.703448275862069,
      "grad_norm": 6.457479000091553,
      "learning_rate": 2.37330875393085e-05,
      "loss": 2.0938,
      "step": 247
    },
    {
      "epoch": 1.7103448275862068,
      "grad_norm": 3.5511231422424316,
      "learning_rate": 2.3677063518552706e-05,
      "loss": 1.3691,
      "step": 248
    },
    {
      "epoch": 1.717241379310345,
      "grad_norm": 4.779417037963867,
      "learning_rate": 2.3620856986135807e-05,
      "loss": 1.4682,
      "step": 249
    },
    {
      "epoch": 1.7241379310344827,
      "grad_norm": 10.626018524169922,
      "learning_rate": 2.356446912429504e-05,
      "loss": 1.7604,
      "step": 250
    },
    {
      "epoch": 1.7310344827586208,
      "grad_norm": 9.10898208618164,
      "learning_rate": 2.3507901119081694e-05,
      "loss": 0.4235,
      "step": 251
    },
    {
      "epoch": 1.7379310344827585,
      "grad_norm": 6.013878345489502,
      "learning_rate": 2.3451154160336145e-05,
      "loss": 1.2361,
      "step": 252
    },
    {
      "epoch": 1.7448275862068967,
      "grad_norm": 6.5278401374816895,
      "learning_rate": 2.3394229441662863e-05,
      "loss": 2.053,
      "step": 253
    },
    {
      "epoch": 1.7517241379310344,
      "grad_norm": 6.2794623374938965,
      "learning_rate": 2.3337128160405262e-05,
      "loss": 0.772,
      "step": 254
    },
    {
      "epoch": 1.7586206896551724,
      "grad_norm": 7.884958744049072,
      "learning_rate": 2.3279851517620567e-05,
      "loss": 1.3663,
      "step": 255
    },
    {
      "epoch": 1.7655172413793103,
      "grad_norm": 6.92486047744751,
      "learning_rate": 2.322240071805449e-05,
      "loss": 1.3793,
      "step": 256
    },
    {
      "epoch": 1.7724137931034483,
      "grad_norm": 6.688532829284668,
      "learning_rate": 2.3164776970115952e-05,
      "loss": 2.1226,
      "step": 257
    },
    {
      "epoch": 1.7793103448275862,
      "grad_norm": 5.486979007720947,
      "learning_rate": 2.310698148585162e-05,
      "loss": 1.8741,
      "step": 258
    },
    {
      "epoch": 1.7862068965517242,
      "grad_norm": 4.790956497192383,
      "learning_rate": 2.3049015480920432e-05,
      "loss": 1.1631,
      "step": 259
    },
    {
      "epoch": 1.7862068965517242,
      "eval_loss": 1.9128004312515259,
      "eval_runtime": 4.4978,
      "eval_samples_per_second": 1.779,
      "eval_steps_per_second": 1.779,
      "step": 259
    },
    {
      "epoch": 1.793103448275862,
      "grad_norm": 7.004488945007324,
      "learning_rate": 2.299088017456803e-05,
      "loss": 1.5968,
      "step": 260
    },
    {
      "epoch": 1.8,
      "grad_norm": 9.21987533569336,
      "learning_rate": 2.29325767896011e-05,
      "loss": 1.7426,
      "step": 261
    },
    {
      "epoch": 1.806896551724138,
      "grad_norm": 7.149972915649414,
      "learning_rate": 2.2874106552361672e-05,
      "loss": 1.2684,
      "step": 262
    },
    {
      "epoch": 1.8137931034482757,
      "grad_norm": 4.868863105773926,
      "learning_rate": 2.2815470692701305e-05,
      "loss": 0.6387,
      "step": 263
    },
    {
      "epoch": 1.8206896551724139,
      "grad_norm": 7.658527851104736,
      "learning_rate": 2.2756670443955236e-05,
      "loss": 1.2035,
      "step": 264
    },
    {
      "epoch": 1.8275862068965516,
      "grad_norm": 4.457070827484131,
      "learning_rate": 2.2697707042916415e-05,
      "loss": 1.492,
      "step": 265
    },
    {
      "epoch": 1.8344827586206898,
      "grad_norm": 3.999518394470215,
      "learning_rate": 2.2638581729809522e-05,
      "loss": 0.3712,
      "step": 266
    },
    {
      "epoch": 1.8413793103448275,
      "grad_norm": 5.254199504852295,
      "learning_rate": 2.2579295748264856e-05,
      "loss": 1.5827,
      "step": 267
    },
    {
      "epoch": 1.8482758620689657,
      "grad_norm": 11.072293281555176,
      "learning_rate": 2.2519850345292192e-05,
      "loss": 1.5692,
      "step": 268
    },
    {
      "epoch": 1.8551724137931034,
      "grad_norm": 9.097825050354004,
      "learning_rate": 2.2460246771254525e-05,
      "loss": 2.0791,
      "step": 269
    },
    {
      "epoch": 1.8620689655172413,
      "grad_norm": 15.463027954101562,
      "learning_rate": 2.2400486279841812e-05,
      "loss": 2.0508,
      "step": 270
    },
    {
      "epoch": 1.8689655172413793,
      "grad_norm": 8.5389404296875,
      "learning_rate": 2.2340570128044567e-05,
      "loss": 0.9089,
      "step": 271
    },
    {
      "epoch": 1.8758620689655172,
      "grad_norm": 7.47584867477417,
      "learning_rate": 2.228049957612744e-05,
      "loss": 1.6641,
      "step": 272
    },
    {
      "epoch": 1.8827586206896552,
      "grad_norm": 5.4179253578186035,
      "learning_rate": 2.2220275887602688e-05,
      "loss": 1.9974,
      "step": 273
    },
    {
      "epoch": 1.889655172413793,
      "grad_norm": 20.747957229614258,
      "learning_rate": 2.2159900329203642e-05,
      "loss": 3.0228,
      "step": 274
    },
    {
      "epoch": 1.896551724137931,
      "grad_norm": 8.873815536499023,
      "learning_rate": 2.2099374170858004e-05,
      "loss": 1.7806,
      "step": 275
    },
    {
      "epoch": 1.903448275862069,
      "grad_norm": 5.834484100341797,
      "learning_rate": 2.2038698685661188e-05,
      "loss": 1.2638,
      "step": 276
    },
    {
      "epoch": 1.910344827586207,
      "grad_norm": 5.626514911651611,
      "learning_rate": 2.197787514984951e-05,
      "loss": 1.2453,
      "step": 277
    },
    {
      "epoch": 1.9172413793103447,
      "grad_norm": 5.62680196762085,
      "learning_rate": 2.1916904842773355e-05,
      "loss": 1.6145,
      "step": 278
    },
    {
      "epoch": 1.9241379310344828,
      "grad_norm": 10.727760314941406,
      "learning_rate": 2.1855789046870265e-05,
      "loss": 0.7303,
      "step": 279
    },
    {
      "epoch": 1.9310344827586206,
      "grad_norm": 4.86237907409668,
      "learning_rate": 2.1794529047637962e-05,
      "loss": 0.8277,
      "step": 280
    },
    {
      "epoch": 1.9379310344827587,
      "grad_norm": 5.81272554397583,
      "learning_rate": 2.1733126133607333e-05,
      "loss": 1.3026,
      "step": 281
    },
    {
      "epoch": 1.9448275862068964,
      "grad_norm": 7.906366348266602,
      "learning_rate": 2.1671581596315277e-05,
      "loss": 1.6814,
      "step": 282
    },
    {
      "epoch": 1.9517241379310346,
      "grad_norm": 4.704112529754639,
      "learning_rate": 2.160989673027759e-05,
      "loss": 0.6912,
      "step": 283
    },
    {
      "epoch": 1.9586206896551723,
      "grad_norm": 4.953239440917969,
      "learning_rate": 2.154807283296171e-05,
      "loss": 0.1418,
      "step": 284
    },
    {
      "epoch": 1.9655172413793105,
      "grad_norm": 5.499814033508301,
      "learning_rate": 2.1486111204759425e-05,
      "loss": 1.0075,
      "step": 285
    },
    {
      "epoch": 1.9724137931034482,
      "grad_norm": 5.502691268920898,
      "learning_rate": 2.1424013148959535e-05,
      "loss": 1.6252,
      "step": 286
    },
    {
      "epoch": 1.9793103448275862,
      "grad_norm": 9.957716941833496,
      "learning_rate": 2.1361779971720438e-05,
      "loss": 1.0677,
      "step": 287
    },
    {
      "epoch": 1.986206896551724,
      "grad_norm": 4.3258280754089355,
      "learning_rate": 2.129941298204263e-05,
      "loss": 1.0192,
      "step": 288
    },
    {
      "epoch": 1.993103448275862,
      "grad_norm": 7.173874855041504,
      "learning_rate": 2.1236913491741212e-05,
      "loss": 0.1295,
      "step": 289
    },
    {
      "epoch": 2.0,
      "grad_norm": 5.360743045806885,
      "learning_rate": 2.117428281541827e-05,
      "loss": 0.8945,
      "step": 290
    }
  ],
  "logging_steps": 1,
  "max_steps": 725,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 5,
  "save_steps": 145,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 2.4562668925845504e+16,
  "train_batch_size": 1,
  "trial_name": null,
  "trial_params": null
}