{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 0.009663691408816602,
  "eval_steps": 500,
  "global_step": 239,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 4.043385526701507e-05,
      "grad_norm": 45.212158203125,
      "learning_rate": 1.0000000000000002e-06,
      "loss": 8.4776,
      "step": 1
    },
    {
      "epoch": 8.086771053403014e-05,
      "grad_norm": 42.83491897583008,
      "learning_rate": 2.0000000000000003e-06,
      "loss": 7.4022,
      "step": 2
    },
    {
      "epoch": 0.00012130156580104521,
      "grad_norm": 47.69302749633789,
      "learning_rate": 3e-06,
      "loss": 8.5953,
      "step": 3
    },
    {
      "epoch": 0.00016173542106806028,
      "grad_norm": 47.8973503112793,
      "learning_rate": 4.000000000000001e-06,
      "loss": 7.7783,
      "step": 4
    },
    {
      "epoch": 0.00020216927633507535,
      "grad_norm": 43.4521598815918,
      "learning_rate": 5e-06,
      "loss": 7.9174,
      "step": 5
    },
    {
      "epoch": 0.00024260313160209043,
      "grad_norm": 46.271541595458984,
      "learning_rate": 6e-06,
      "loss": 8.6758,
      "step": 6
    },
    {
      "epoch": 0.0002830369868691055,
      "grad_norm": 45.16845703125,
      "learning_rate": 7.000000000000001e-06,
      "loss": 7.8099,
      "step": 7
    },
    {
      "epoch": 0.00032347084213612057,
      "grad_norm": 43.50260543823242,
      "learning_rate": 8.000000000000001e-06,
      "loss": 7.1801,
      "step": 8
    },
    {
      "epoch": 0.00036390469740313564,
      "grad_norm": 41.92473602294922,
      "learning_rate": 9e-06,
      "loss": 7.1574,
      "step": 9
    },
    {
      "epoch": 0.0004043385526701507,
      "grad_norm": 43.03952407836914,
      "learning_rate": 1e-05,
      "loss": 7.1354,
      "step": 10
    },
    {
      "epoch": 0.0004447724079371658,
      "grad_norm": 41.086021423339844,
      "learning_rate": 1.1000000000000001e-05,
      "loss": 7.3757,
      "step": 11
    },
    {
      "epoch": 0.00048520626320418085,
      "grad_norm": 43.03165817260742,
      "learning_rate": 1.2e-05,
      "loss": 7.2673,
      "step": 12
    },
    {
      "epoch": 0.0005256401184711959,
      "grad_norm": 41.32279586791992,
      "learning_rate": 1.3000000000000001e-05,
      "loss": 7.0322,
      "step": 13
    },
    {
      "epoch": 0.000566073973738211,
      "grad_norm": 40.678192138671875,
      "learning_rate": 1.4000000000000001e-05,
      "loss": 6.9458,
      "step": 14
    },
    {
      "epoch": 0.0006065078290052261,
      "grad_norm": 42.551509857177734,
      "learning_rate": 1.5e-05,
      "loss": 7.4505,
      "step": 15
    },
    {
      "epoch": 0.0006469416842722411,
      "grad_norm": 39.79718017578125,
      "learning_rate": 1.6000000000000003e-05,
      "loss": 6.314,
      "step": 16
    },
    {
      "epoch": 0.0006873755395392562,
      "grad_norm": 39.78065490722656,
      "learning_rate": 1.7000000000000003e-05,
      "loss": 6.0012,
      "step": 17
    },
    {
      "epoch": 0.0007278093948062713,
      "grad_norm": 34.97587966918945,
      "learning_rate": 1.8e-05,
      "loss": 5.7898,
      "step": 18
    },
    {
      "epoch": 0.0007682432500732863,
      "grad_norm": 34.85056686401367,
      "learning_rate": 1.9e-05,
      "loss": 5.3237,
      "step": 19
    },
    {
      "epoch": 0.0008086771053403014,
      "grad_norm": 31.200321197509766,
      "learning_rate": 2e-05,
      "loss": 4.8291,
      "step": 20
    },
    {
      "epoch": 0.0008491109606073165,
      "grad_norm": 34.75196075439453,
      "learning_rate": 2.1e-05,
      "loss": 5.7917,
      "step": 21
    },
    {
      "epoch": 0.0008895448158743316,
      "grad_norm": 25.683300018310547,
      "learning_rate": 2.2000000000000003e-05,
      "loss": 3.6869,
      "step": 22
    },
    {
      "epoch": 0.0009299786711413466,
      "grad_norm": 23.978288650512695,
      "learning_rate": 2.3000000000000003e-05,
      "loss": 3.6214,
      "step": 23
    },
    {
      "epoch": 0.0009704125264083617,
      "grad_norm": 24.98045539855957,
      "learning_rate": 2.4e-05,
      "loss": 3.8051,
      "step": 24
    },
    {
      "epoch": 0.0010108463816753768,
      "grad_norm": 25.768600463867188,
      "learning_rate": 2.5e-05,
      "loss": 3.8138,
      "step": 25
    },
    {
      "epoch": 0.0010512802369423918,
      "grad_norm": 28.268779754638672,
      "learning_rate": 2.6000000000000002e-05,
      "loss": 3.6824,
      "step": 26
    },
    {
      "epoch": 0.001091714092209407,
      "grad_norm": 24.55759620666504,
      "learning_rate": 2.7000000000000002e-05,
      "loss": 2.6137,
      "step": 27
    },
    {
      "epoch": 0.001132147947476422,
      "grad_norm": 32.37775421142578,
      "learning_rate": 2.8000000000000003e-05,
      "loss": 3.0377,
      "step": 28
    },
    {
      "epoch": 0.001172581802743437,
      "grad_norm": 24.953506469726562,
      "learning_rate": 2.9e-05,
      "loss": 2.3354,
      "step": 29
    },
    {
      "epoch": 0.0012130156580104521,
      "grad_norm": 22.368303298950195,
      "learning_rate": 3e-05,
      "loss": 1.6599,
      "step": 30
    },
    {
      "epoch": 0.0012534495132774672,
      "grad_norm": 26.778047561645508,
      "learning_rate": 3.1e-05,
      "loss": 1.6387,
      "step": 31
    },
    {
      "epoch": 0.0012938833685444823,
      "grad_norm": 13.506389617919922,
      "learning_rate": 3.2000000000000005e-05,
      "loss": 0.9049,
      "step": 32
    },
    {
      "epoch": 0.0013343172238114973,
      "grad_norm": 18.525230407714844,
      "learning_rate": 3.3e-05,
      "loss": 0.8052,
      "step": 33
    },
    {
      "epoch": 0.0013747510790785124,
      "grad_norm": 25.12320899963379,
      "learning_rate": 3.4000000000000007e-05,
      "loss": 0.8044,
      "step": 34
    },
    {
      "epoch": 0.0014151849343455275,
      "grad_norm": 16.060945510864258,
      "learning_rate": 3.5e-05,
      "loss": 0.6799,
      "step": 35
    },
    {
      "epoch": 0.0014556187896125426,
      "grad_norm": 18.008813858032227,
      "learning_rate": 3.6e-05,
      "loss": 0.4561,
      "step": 36
    },
    {
      "epoch": 0.0014960526448795576,
      "grad_norm": 21.687620162963867,
      "learning_rate": 3.7e-05,
      "loss": 0.613,
      "step": 37
    },
    {
      "epoch": 0.0015364865001465727,
      "grad_norm": 14.03872013092041,
      "learning_rate": 3.8e-05,
      "loss": 0.5134,
      "step": 38
    },
    {
      "epoch": 0.0015769203554135878,
      "grad_norm": 8.90583610534668,
      "learning_rate": 3.9000000000000006e-05,
      "loss": 0.2898,
      "step": 39
    },
    {
      "epoch": 0.0016173542106806028,
      "grad_norm": 15.97493839263916,
      "learning_rate": 4e-05,
      "loss": 0.4115,
      "step": 40
    },
    {
      "epoch": 0.001657788065947618,
      "grad_norm": 6.131041526794434,
      "learning_rate": 4.1e-05,
      "loss": 0.3075,
      "step": 41
    },
    {
      "epoch": 0.001698221921214633,
      "grad_norm": 21.00753402709961,
      "learning_rate": 4.2e-05,
      "loss": 0.5586,
      "step": 42
    },
    {
      "epoch": 0.001738655776481648,
      "grad_norm": 23.8162899017334,
      "learning_rate": 4.3e-05,
      "loss": 0.6964,
      "step": 43
    },
    {
      "epoch": 0.0017790896317486631,
      "grad_norm": 22.47564125061035,
      "learning_rate": 4.4000000000000006e-05,
      "loss": 0.543,
      "step": 44
    },
    {
      "epoch": 0.0018195234870156782,
      "grad_norm": 20.058208465576172,
      "learning_rate": 4.5e-05,
      "loss": 0.5529,
      "step": 45
    },
    {
      "epoch": 0.0018599573422826933,
      "grad_norm": 9.034168243408203,
      "learning_rate": 4.600000000000001e-05,
      "loss": 0.4264,
      "step": 46
    },
    {
      "epoch": 0.0019003911975497083,
      "grad_norm": 13.754554748535156,
      "learning_rate": 4.7e-05,
      "loss": 0.4332,
      "step": 47
    },
    {
      "epoch": 0.0019408250528167234,
      "grad_norm": 16.2254638671875,
      "learning_rate": 4.8e-05,
      "loss": 0.5085,
      "step": 48
    },
    {
      "epoch": 0.0019812589080837385,
      "grad_norm": 13.377281188964844,
      "learning_rate": 4.9e-05,
      "loss": 0.4047,
      "step": 49
    },
    {
      "epoch": 0.0020216927633507535,
      "grad_norm": 16.529783248901367,
      "learning_rate": 5e-05,
      "loss": 0.602,
      "step": 50
    },
    {
      "epoch": 0.0020621266186177686,
      "grad_norm": 16.30471420288086,
      "learning_rate": 5.1000000000000006e-05,
      "loss": 0.4734,
      "step": 51
    },
    {
      "epoch": 0.0021025604738847837,
      "grad_norm": 9.81867790222168,
      "learning_rate": 5.2000000000000004e-05,
      "loss": 0.4394,
      "step": 52
    },
    {
      "epoch": 0.0021429943291517988,
      "grad_norm": 8.821556091308594,
      "learning_rate": 5.300000000000001e-05,
      "loss": 0.3627,
      "step": 53
    },
    {
      "epoch": 0.002183428184418814,
      "grad_norm": 7.72442626953125,
      "learning_rate": 5.4000000000000005e-05,
      "loss": 0.3547,
      "step": 54
    },
    {
      "epoch": 0.002223862039685829,
      "grad_norm": 9.638863563537598,
      "learning_rate": 5.500000000000001e-05,
      "loss": 0.3275,
      "step": 55
    },
    {
      "epoch": 0.002264295894952844,
      "grad_norm": 6.1317458152771,
      "learning_rate": 5.6000000000000006e-05,
      "loss": 0.2867,
      "step": 56
    },
    {
      "epoch": 0.002304729750219859,
      "grad_norm": 11.842965126037598,
      "learning_rate": 5.6999999999999996e-05,
      "loss": 0.3486,
      "step": 57
    },
    {
      "epoch": 0.002345163605486874,
      "grad_norm": 3.987241506576538,
      "learning_rate": 5.8e-05,
      "loss": 0.2699,
      "step": 58
    },
    {
      "epoch": 0.002385597460753889,
      "grad_norm": 6.591022968292236,
      "learning_rate": 5.9e-05,
      "loss": 0.3184,
      "step": 59
    },
    {
      "epoch": 0.0024260313160209043,
      "grad_norm": 7.872280120849609,
      "learning_rate": 6e-05,
      "loss": 0.3346,
      "step": 60
    },
    {
      "epoch": 0.0024664651712879193,
      "grad_norm": 2.6104869842529297,
      "learning_rate": 6.1e-05,
      "loss": 0.3243,
      "step": 61
    },
    {
      "epoch": 0.0025068990265549344,
      "grad_norm": 3.023655652999878,
      "learning_rate": 6.2e-05,
      "loss": 0.3306,
      "step": 62
    },
    {
      "epoch": 0.0025473328818219495,
      "grad_norm": 6.13469123840332,
      "learning_rate": 6.3e-05,
      "loss": 0.344,
      "step": 63
    },
    {
      "epoch": 0.0025877667370889645,
      "grad_norm": 6.2675957679748535,
      "learning_rate": 6.400000000000001e-05,
      "loss": 0.3637,
      "step": 64
    },
    {
      "epoch": 0.0026282005923559796,
      "grad_norm": 15.284539222717285,
      "learning_rate": 6.500000000000001e-05,
      "loss": 0.4253,
      "step": 65
    },
    {
      "epoch": 0.0026686344476229947,
      "grad_norm": 13.781516075134277,
      "learning_rate": 6.6e-05,
      "loss": 0.3658,
      "step": 66
    },
    {
      "epoch": 0.0027090683028900098,
      "grad_norm": 3.6815264225006104,
      "learning_rate": 6.7e-05,
      "loss": 0.3152,
      "step": 67
    },
    {
      "epoch": 0.002749502158157025,
      "grad_norm": 5.936532497406006,
      "learning_rate": 6.800000000000001e-05,
      "loss": 0.312,
      "step": 68
    },
    {
      "epoch": 0.00278993601342404,
      "grad_norm": 5.848452568054199,
      "learning_rate": 6.9e-05,
      "loss": 0.2422,
      "step": 69
    },
    {
      "epoch": 0.002830369868691055,
      "grad_norm": 19.137374877929688,
      "learning_rate": 7e-05,
      "loss": 0.4338,
      "step": 70
    },
    {
      "epoch": 0.00287080372395807,
      "grad_norm": 10.636536598205566,
      "learning_rate": 7.1e-05,
      "loss": 0.3493,
      "step": 71
    },
    {
      "epoch": 0.002911237579225085,
      "grad_norm": 4.964332580566406,
      "learning_rate": 7.2e-05,
      "loss": 0.2712,
      "step": 72
    },
    {
      "epoch": 0.0029516714344921,
      "grad_norm": 8.327373504638672,
      "learning_rate": 7.3e-05,
      "loss": 0.2796,
      "step": 73
    },
    {
      "epoch": 0.0029921052897591153,
      "grad_norm": 8.643411636352539,
      "learning_rate": 7.4e-05,
      "loss": 0.3479,
      "step": 74
    },
    {
      "epoch": 0.0030325391450261303,
      "grad_norm": 9.094339370727539,
      "learning_rate": 7.500000000000001e-05,
      "loss": 0.3282,
      "step": 75
    },
    {
      "epoch": 0.0030729730002931454,
      "grad_norm": 16.117694854736328,
      "learning_rate": 7.6e-05,
      "loss": 0.4187,
      "step": 76
    },
    {
      "epoch": 0.0031134068555601605,
      "grad_norm": 21.272748947143555,
      "learning_rate": 7.7e-05,
      "loss": 0.5197,
      "step": 77
    },
    {
      "epoch": 0.0031538407108271755,
      "grad_norm": 5.69344425201416,
      "learning_rate": 7.800000000000001e-05,
      "loss": 0.2646,
      "step": 78
    },
    {
      "epoch": 0.0031942745660941906,
      "grad_norm": 7.0776824951171875,
      "learning_rate": 7.900000000000001e-05,
      "loss": 0.282,
      "step": 79
    },
    {
      "epoch": 0.0032347084213612057,
      "grad_norm": 5.962209701538086,
      "learning_rate": 8e-05,
      "loss": 0.3024,
      "step": 80
    },
    {
      "epoch": 0.0032751422766282207,
      "grad_norm": 6.475072860717773,
      "learning_rate": 8.1e-05,
      "loss": 0.3527,
      "step": 81
    },
    {
      "epoch": 0.003315576131895236,
      "grad_norm": 10.585362434387207,
      "learning_rate": 8.2e-05,
      "loss": 0.3635,
      "step": 82
    },
    {
      "epoch": 0.003356009987162251,
      "grad_norm": 13.020295143127441,
      "learning_rate": 8.3e-05,
      "loss": 0.296,
      "step": 83
    },
    {
      "epoch": 0.003396443842429266,
      "grad_norm": 12.620179176330566,
      "learning_rate": 8.4e-05,
      "loss": 0.3679,
      "step": 84
    },
    {
      "epoch": 0.003436877697696281,
      "grad_norm": 14.295244216918945,
      "learning_rate": 8.5e-05,
      "loss": 0.3293,
      "step": 85
    },
    {
      "epoch": 0.003477311552963296,
      "grad_norm": 11.91524887084961,
      "learning_rate": 8.6e-05,
      "loss": 0.3314,
      "step": 86
    },
    {
      "epoch": 0.003517745408230311,
      "grad_norm": 4.24912166595459,
      "learning_rate": 8.7e-05,
      "loss": 0.3097,
      "step": 87
    },
    {
      "epoch": 0.0035581792634973262,
      "grad_norm": 2.8676578998565674,
      "learning_rate": 8.800000000000001e-05,
      "loss": 0.2086,
      "step": 88
    },
    {
      "epoch": 0.0035986131187643413,
      "grad_norm": 13.0736665725708,
      "learning_rate": 8.900000000000001e-05,
      "loss": 0.3789,
      "step": 89
    },
    {
      "epoch": 0.0036390469740313564,
      "grad_norm": 15.212523460388184,
      "learning_rate": 9e-05,
      "loss": 0.3977,
      "step": 90
    },
    {
      "epoch": 0.0036794808292983715,
      "grad_norm": 4.857946395874023,
      "learning_rate": 9.1e-05,
      "loss": 0.2782,
      "step": 91
    },
    {
      "epoch": 0.0037199146845653865,
      "grad_norm": 13.703444480895996,
      "learning_rate": 9.200000000000001e-05,
      "loss": 0.4441,
      "step": 92
    },
    {
      "epoch": 0.0037603485398324016,
      "grad_norm": 7.481781482696533,
      "learning_rate": 9.300000000000001e-05,
      "loss": 0.4544,
      "step": 93
    },
    {
      "epoch": 0.0038007823950994167,
      "grad_norm": 10.419188499450684,
      "learning_rate": 9.4e-05,
      "loss": 0.2466,
      "step": 94
    },
    {
      "epoch": 0.0038412162503664317,
      "grad_norm": 6.384120941162109,
      "learning_rate": 9.5e-05,
      "loss": 0.3243,
      "step": 95
    },
    {
      "epoch": 0.003881650105633447,
      "grad_norm": 5.624557971954346,
      "learning_rate": 9.6e-05,
      "loss": 0.2636,
      "step": 96
    },
    {
      "epoch": 0.003922083960900462,
      "grad_norm": 6.27712869644165,
      "learning_rate": 9.7e-05,
      "loss": 0.3291,
      "step": 97
    },
    {
      "epoch": 0.003962517816167477,
      "grad_norm": 8.306694030761719,
      "learning_rate": 9.8e-05,
      "loss": 0.3152,
      "step": 98
    },
    {
      "epoch": 0.0040029516714344925,
      "grad_norm": 10.752472877502441,
      "learning_rate": 9.900000000000001e-05,
      "loss": 0.3415,
      "step": 99
    },
    {
      "epoch": 0.004043385526701507,
      "grad_norm": 10.531058311462402,
      "learning_rate": 0.0001,
      "loss": 0.4284,
      "step": 100
    },
    {
      "epoch": 0.004083819381968523,
      "grad_norm": 16.106300354003906,
      "learning_rate": 9.99999995932986e-05,
      "loss": 0.6921,
      "step": 101
    },
    {
      "epoch": 0.004124253237235537,
      "grad_norm": 11.43233585357666,
      "learning_rate": 9.999999837319442e-05,
      "loss": 0.3084,
      "step": 102
    },
    {
      "epoch": 0.004164687092502553,
      "grad_norm": 5.925229072570801,
      "learning_rate": 9.999999633968746e-05,
      "loss": 0.2721,
      "step": 103
    },
    {
      "epoch": 0.004205120947769567,
      "grad_norm": 13.037198066711426,
      "learning_rate": 9.999999349277778e-05,
      "loss": 0.3153,
      "step": 104
    },
    {
      "epoch": 0.004245554803036583,
      "grad_norm": 11.120277404785156,
      "learning_rate": 9.999998983246538e-05,
      "loss": 0.3173,
      "step": 105
    },
    {
      "epoch": 0.0042859886583035975,
      "grad_norm": 17.156024932861328,
      "learning_rate": 9.999998535875038e-05,
      "loss": 0.582,
      "step": 106
    },
    {
      "epoch": 0.004326422513570613,
      "grad_norm": 1.7565780878067017,
      "learning_rate": 9.999998007163281e-05,
      "loss": 0.1944,
      "step": 107
    },
    {
      "epoch": 0.004366856368837628,
      "grad_norm": 4.962274074554443,
      "learning_rate": 9.999997397111278e-05,
      "loss": 0.2287,
      "step": 108
    },
    {
      "epoch": 0.004407290224104643,
      "grad_norm": 5.148132801055908,
      "learning_rate": 9.999996705719036e-05,
      "loss": 0.2062,
      "step": 109
    },
    {
      "epoch": 0.004447724079371658,
      "grad_norm": 7.428779602050781,
      "learning_rate": 9.999995932986568e-05,
      "loss": 0.2594,
      "step": 110
    },
    {
      "epoch": 0.004488157934638673,
      "grad_norm": 7.802266597747803,
      "learning_rate": 9.999995078913888e-05,
      "loss": 0.2838,
      "step": 111
    },
    {
      "epoch": 0.004528591789905688,
      "grad_norm": 9.690343856811523,
      "learning_rate": 9.999994143501008e-05,
      "loss": 0.2114,
      "step": 112
    },
    {
      "epoch": 0.0045690256451727035,
      "grad_norm": 8.961145401000977,
      "learning_rate": 9.999993126747943e-05,
      "loss": 0.1836,
      "step": 113
    },
    {
      "epoch": 0.004609459500439718,
      "grad_norm": 2.7630367279052734,
      "learning_rate": 9.999992028654711e-05,
      "loss": 0.1005,
      "step": 114
    },
    {
      "epoch": 0.004649893355706734,
      "grad_norm": 20.46099090576172,
      "learning_rate": 9.999990849221329e-05,
      "loss": 0.4513,
      "step": 115
    },
    {
      "epoch": 0.004690327210973748,
      "grad_norm": 11.76425838470459,
      "learning_rate": 9.999989588447816e-05,
      "loss": 0.348,
      "step": 116
    },
    {
      "epoch": 0.004730761066240764,
      "grad_norm": 13.530948638916016,
      "learning_rate": 9.999988246334193e-05,
      "loss": 0.2852,
      "step": 117
    },
    {
      "epoch": 0.004771194921507778,
      "grad_norm": 4.60286808013916,
      "learning_rate": 9.999986822880483e-05,
      "loss": 0.1511,
      "step": 118
    },
    {
      "epoch": 0.004811628776774794,
      "grad_norm": 8.1397705078125,
      "learning_rate": 9.999985318086706e-05,
      "loss": 0.3053,
      "step": 119
    },
    {
      "epoch": 0.0048520626320418085,
      "grad_norm": 9.727378845214844,
      "learning_rate": 9.999983731952889e-05,
      "loss": 0.3261,
      "step": 120
    },
    {
      "epoch": 0.004892496487308824,
      "grad_norm": 6.355672359466553,
      "learning_rate": 9.999982064479057e-05,
      "loss": 0.1726,
      "step": 121
    },
    {
      "epoch": 0.004932930342575839,
      "grad_norm": 8.556386947631836,
      "learning_rate": 9.999980315665237e-05,
      "loss": 0.21,
      "step": 122
    },
    {
      "epoch": 0.004973364197842854,
      "grad_norm": 9.588984489440918,
      "learning_rate": 9.999978485511459e-05,
      "loss": 0.3401,
      "step": 123
    },
    {
      "epoch": 0.005013798053109869,
      "grad_norm": 6.82341194152832,
      "learning_rate": 9.999976574017749e-05,
      "loss": 0.2865,
      "step": 124
    },
    {
      "epoch": 0.005054231908376884,
      "grad_norm": 13.025908470153809,
      "learning_rate": 9.999974581184142e-05,
      "loss": 0.3679,
      "step": 125
    },
    {
      "epoch": 0.005094665763643899,
      "grad_norm": 7.907348155975342,
      "learning_rate": 9.999972507010669e-05,
      "loss": 0.2739,
      "step": 126
    },
    {
      "epoch": 0.0051350996189109144,
      "grad_norm": 5.3686676025390625,
      "learning_rate": 9.999970351497363e-05,
      "loss": 0.1398,
      "step": 127
    },
    {
      "epoch": 0.005175533474177929,
      "grad_norm": 6.662126064300537,
      "learning_rate": 9.99996811464426e-05,
      "loss": 0.2436,
      "step": 128
    },
    {
      "epoch": 0.005215967329444945,
      "grad_norm": 7.145336627960205,
      "learning_rate": 9.999965796451397e-05,
      "loss": 0.174,
      "step": 129
    },
    {
      "epoch": 0.005256401184711959,
      "grad_norm": 7.168648719787598,
      "learning_rate": 9.99996339691881e-05,
      "loss": 0.2184,
      "step": 130
    },
    {
      "epoch": 0.005296835039978975,
      "grad_norm": 10.925111770629883,
      "learning_rate": 9.99996091604654e-05,
      "loss": 0.3104,
      "step": 131
    },
    {
      "epoch": 0.005337268895245989,
      "grad_norm": 7.4000396728515625,
      "learning_rate": 9.999958353834624e-05,
      "loss": 0.1764,
      "step": 132
    },
    {
      "epoch": 0.005377702750513005,
      "grad_norm": 14.293291091918945,
      "learning_rate": 9.999955710283109e-05,
      "loss": 0.3948,
      "step": 133
    },
    {
      "epoch": 0.0054181366057800195,
      "grad_norm": 17.54743003845215,
      "learning_rate": 9.999952985392033e-05,
      "loss": 0.4679,
      "step": 134
    },
    {
      "epoch": 0.005458570461047035,
      "grad_norm": 10.179829597473145,
      "learning_rate": 9.999950179161442e-05,
      "loss": 0.247,
      "step": 135
    },
    {
      "epoch": 0.00549900431631405,
      "grad_norm": 8.208870887756348,
      "learning_rate": 9.999947291591383e-05,
      "loss": 0.3418,
      "step": 136
    },
    {
      "epoch": 0.005539438171581065,
      "grad_norm": 7.8983917236328125,
      "learning_rate": 9.9999443226819e-05,
      "loss": 0.3445,
      "step": 137
    },
    {
      "epoch": 0.00557987202684808,
      "grad_norm": 14.267950057983398,
      "learning_rate": 9.999941272433046e-05,
      "loss": 0.3628,
      "step": 138
    },
    {
      "epoch": 0.005620305882115095,
      "grad_norm": 11.430856704711914,
      "learning_rate": 9.999938140844866e-05,
      "loss": 0.278,
      "step": 139
    },
    {
      "epoch": 0.00566073973738211,
      "grad_norm": 8.389185905456543,
      "learning_rate": 9.999934927917414e-05,
      "loss": 0.3661,
      "step": 140
    },
    {
      "epoch": 0.0057011735926491254,
      "grad_norm": 8.984382629394531,
      "learning_rate": 9.999931633650739e-05,
      "loss": 0.3152,
      "step": 141
    },
    {
      "epoch": 0.00574160744791614,
      "grad_norm": 6.51492166519165,
      "learning_rate": 9.999928258044899e-05,
      "loss": 0.2281,
      "step": 142
    },
    {
      "epoch": 0.005782041303183156,
      "grad_norm": 7.01376485824585,
      "learning_rate": 9.999924801099946e-05,
      "loss": 0.3193,
      "step": 143
    },
    {
      "epoch": 0.00582247515845017,
      "grad_norm": 7.640584468841553,
      "learning_rate": 9.999921262815936e-05,
      "loss": 0.2331,
      "step": 144
    },
    {
      "epoch": 0.005862909013717186,
      "grad_norm": 4.045457363128662,
      "learning_rate": 9.999917643192928e-05,
      "loss": 0.1184,
      "step": 145
    },
    {
      "epoch": 0.0059033428689842,
      "grad_norm": 13.00910758972168,
      "learning_rate": 9.999913942230979e-05,
      "loss": 0.3623,
      "step": 146
    },
    {
      "epoch": 0.005943776724251216,
      "grad_norm": 7.396110534667969,
      "learning_rate": 9.999910159930151e-05,
      "loss": 0.2281,
      "step": 147
    },
    {
      "epoch": 0.0059842105795182305,
      "grad_norm": 3.814600944519043,
      "learning_rate": 9.999906296290506e-05,
      "loss": 0.1162,
      "step": 148
    },
    {
      "epoch": 0.006024644434785246,
      "grad_norm": 10.155074119567871,
      "learning_rate": 9.999902351312105e-05,
      "loss": 0.2232,
      "step": 149
    },
    {
      "epoch": 0.006065078290052261,
      "grad_norm": 7.059305667877197,
      "learning_rate": 9.999898324995013e-05,
      "loss": 0.1808,
      "step": 150
    },
    {
      "epoch": 0.006105512145319276,
      "grad_norm": 12.093023300170898,
      "learning_rate": 9.999894217339296e-05,
      "loss": 0.3253,
      "step": 151
    },
    {
      "epoch": 0.006145946000586291,
      "grad_norm": 13.166694641113281,
      "learning_rate": 9.999890028345019e-05,
      "loss": 0.3509,
      "step": 152
    },
    {
      "epoch": 0.006186379855853306,
      "grad_norm": 5.3184733390808105,
      "learning_rate": 9.999885758012253e-05,
      "loss": 0.1875,
      "step": 153
    },
    {
      "epoch": 0.006226813711120321,
      "grad_norm": 7.991506099700928,
      "learning_rate": 9.999881406341065e-05,
      "loss": 0.186,
      "step": 154
    },
    {
      "epoch": 0.006267247566387336,
      "grad_norm": 10.271965026855469,
      "learning_rate": 9.999876973331528e-05,
      "loss": 0.3393,
      "step": 155
    },
    {
      "epoch": 0.006307681421654351,
      "grad_norm": 8.440420150756836,
      "learning_rate": 9.99987245898371e-05,
      "loss": 0.3662,
      "step": 156
    },
    {
      "epoch": 0.006348115276921367,
      "grad_norm": 9.392427444458008,
      "learning_rate": 9.99986786329769e-05,
      "loss": 0.2827,
      "step": 157
    },
    {
      "epoch": 0.006388549132188381,
      "grad_norm": 11.753120422363281,
      "learning_rate": 9.999863186273539e-05,
      "loss": 0.3695,
      "step": 158
    },
    {
      "epoch": 0.006428982987455397,
      "grad_norm": 7.335087776184082,
      "learning_rate": 9.999858427911335e-05,
      "loss": 0.2826,
      "step": 159
    },
    {
      "epoch": 0.006469416842722411,
      "grad_norm": 8.617646217346191,
      "learning_rate": 9.999853588211154e-05,
      "loss": 0.2268,
      "step": 160
    },
    {
      "epoch": 0.006509850697989427,
      "grad_norm": 6.348892688751221,
      "learning_rate": 9.999848667173075e-05,
      "loss": 0.2811,
      "step": 161
    },
    {
      "epoch": 0.0065502845532564415,
      "grad_norm": 9.774231910705566,
      "learning_rate": 9.999843664797178e-05,
      "loss": 0.4024,
      "step": 162
    },
    {
      "epoch": 0.006590718408523457,
      "grad_norm": 11.390604019165039,
      "learning_rate": 9.999838581083546e-05,
      "loss": 0.2387,
      "step": 163
    },
    {
      "epoch": 0.006631152263790472,
      "grad_norm": 4.1529388427734375,
      "learning_rate": 9.99983341603226e-05,
      "loss": 0.2513,
      "step": 164
    },
    {
      "epoch": 0.006671586119057487,
      "grad_norm": 9.902484893798828,
      "learning_rate": 9.999828169643404e-05,
      "loss": 0.2148,
      "step": 165
    },
    {
      "epoch": 0.006712019974324502,
      "grad_norm": 13.555265426635742,
      "learning_rate": 9.999822841917064e-05,
      "loss": 0.3568,
      "step": 166
    },
    {
      "epoch": 0.006752453829591517,
      "grad_norm": 10.916128158569336,
      "learning_rate": 9.999817432853326e-05,
      "loss": 0.2892,
      "step": 167
    },
    {
      "epoch": 0.006792887684858532,
      "grad_norm": 5.216245651245117,
      "learning_rate": 9.999811942452279e-05,
      "loss": 0.2052,
      "step": 168
    },
    {
      "epoch": 0.006833321540125547,
      "grad_norm": 11.540072441101074,
      "learning_rate": 9.999806370714011e-05,
      "loss": 0.3585,
      "step": 169
    },
    {
      "epoch": 0.006873755395392562,
      "grad_norm": 5.5519304275512695,
      "learning_rate": 9.999800717638614e-05,
      "loss": 0.2453,
      "step": 170
    },
    {
      "epoch": 0.006914189250659578,
      "grad_norm": 7.624457359313965,
      "learning_rate": 9.999794983226179e-05,
      "loss": 0.2453,
      "step": 171
    },
    {
      "epoch": 0.006954623105926592,
      "grad_norm": 3.1242964267730713,
      "learning_rate": 9.999789167476801e-05,
      "loss": 0.2162,
      "step": 172
    },
    {
      "epoch": 0.006995056961193608,
      "grad_norm": 5.320684432983398,
      "learning_rate": 9.999783270390572e-05,
      "loss": 0.2053,
      "step": 173
    },
    {
      "epoch": 0.007035490816460622,
      "grad_norm": 9.082324028015137,
      "learning_rate": 9.999777291967589e-05,
      "loss": 0.3074,
      "step": 174
    },
    {
      "epoch": 0.007075924671727638,
      "grad_norm": 9.537432670593262,
      "learning_rate": 9.999771232207951e-05,
      "loss": 0.2791,
      "step": 175
    },
    {
      "epoch": 0.0071163585269946525,
      "grad_norm": 9.438758850097656,
      "learning_rate": 9.999765091111754e-05,
      "loss": 0.2213,
      "step": 176
    },
    {
      "epoch": 0.007156792382261668,
      "grad_norm": 6.272062301635742,
      "learning_rate": 9.999758868679099e-05,
      "loss": 0.2219,
      "step": 177
    },
    {
      "epoch": 0.007197226237528683,
      "grad_norm": 3.2677524089813232,
      "learning_rate": 9.999752564910086e-05,
      "loss": 0.2241,
      "step": 178
    },
    {
      "epoch": 0.007237660092795698,
      "grad_norm": 3.407979726791382,
      "learning_rate": 9.99974617980482e-05,
      "loss": 0.1709,
      "step": 179
    },
    {
      "epoch": 0.007278093948062713,
      "grad_norm": 7.650908946990967,
      "learning_rate": 9.999739713363404e-05,
      "loss": 0.189,
      "step": 180
    },
    {
      "epoch": 0.007318527803329728,
      "grad_norm": 5.595089912414551,
      "learning_rate": 9.999733165585943e-05,
      "loss": 0.1611,
      "step": 181
    },
    {
      "epoch": 0.007358961658596743,
      "grad_norm": 5.560061931610107,
      "learning_rate": 9.999726536472542e-05,
      "loss": 0.1824,
      "step": 182
    },
    {
      "epoch": 0.007399395513863758,
      "grad_norm": 13.8944091796875,
      "learning_rate": 9.99971982602331e-05,
      "loss": 0.3737,
      "step": 183
    },
    {
      "epoch": 0.007439829369130773,
      "grad_norm": 5.863430976867676,
      "learning_rate": 9.999713034238359e-05,
      "loss": 0.2017,
      "step": 184
    },
    {
      "epoch": 0.0074802632243977886,
      "grad_norm": 4.334754467010498,
      "learning_rate": 9.999706161117795e-05,
      "loss": 0.0885,
      "step": 185
    },
    {
      "epoch": 0.007520697079664803,
      "grad_norm": 5.766237735748291,
      "learning_rate": 9.99969920666173e-05,
      "loss": 0.254,
      "step": 186
    },
    {
      "epoch": 0.007561130934931819,
      "grad_norm": 4.142415523529053,
      "learning_rate": 9.99969217087028e-05,
      "loss": 0.1412,
      "step": 187
    },
    {
      "epoch": 0.007601564790198833,
      "grad_norm": 7.84074068069458,
      "learning_rate": 9.999685053743559e-05,
      "loss": 0.1959,
      "step": 188
    },
    {
      "epoch": 0.007641998645465849,
      "grad_norm": 8.681429862976074,
      "learning_rate": 9.999677855281682e-05,
      "loss": 0.1584,
      "step": 189
    },
    {
      "epoch": 0.0076824325007328635,
      "grad_norm": 9.750258445739746,
      "learning_rate": 9.999670575484765e-05,
      "loss": 0.2074,
      "step": 190
    },
    {
      "epoch": 0.007722866355999879,
      "grad_norm": 7.412321090698242,
      "learning_rate": 9.999663214352929e-05,
      "loss": 0.1696,
      "step": 191
    },
    {
      "epoch": 0.007763300211266894,
      "grad_norm": 9.03699016571045,
      "learning_rate": 9.999655771886291e-05,
      "loss": 0.1942,
      "step": 192
    },
    {
      "epoch": 0.007803734066533909,
      "grad_norm": 7.925232887268066,
      "learning_rate": 9.999648248084974e-05,
      "loss": 0.1793,
      "step": 193
    },
    {
      "epoch": 0.007844167921800925,
      "grad_norm": 7.363532066345215,
      "learning_rate": 9.9996406429491e-05,
      "loss": 0.1718,
      "step": 194
    },
    {
      "epoch": 0.00788460177706794,
      "grad_norm": 9.17047119140625,
      "learning_rate": 9.999632956478793e-05,
      "loss": 0.404,
      "step": 195
    },
    {
      "epoch": 0.007925035632334954,
      "grad_norm": 8.83364486694336,
      "learning_rate": 9.999625188674175e-05,
      "loss": 0.2276,
      "step": 196
    },
    {
      "epoch": 0.007965469487601969,
      "grad_norm": 9.548094749450684,
      "learning_rate": 9.999617339535378e-05,
      "loss": 0.1875,
      "step": 197
    },
    {
      "epoch": 0.008005903342868985,
      "grad_norm": 6.08480167388916,
      "learning_rate": 9.999609409062525e-05,
      "loss": 0.145,
      "step": 198
    },
    {
      "epoch": 0.008046337198136,
      "grad_norm": 15.142061233520508,
      "learning_rate": 9.999601397255747e-05,
      "loss": 0.4395,
      "step": 199
    },
    {
      "epoch": 0.008086771053403014,
      "grad_norm": 15.888526916503906,
      "learning_rate": 9.999593304115174e-05,
      "loss": 0.5784,
      "step": 200
    },
    {
      "epoch": 0.008127204908670029,
      "grad_norm": 6.388537883758545,
      "learning_rate": 9.999585129640936e-05,
      "loss": 0.2958,
      "step": 201
    },
    {
      "epoch": 0.008167638763937045,
      "grad_norm": 5.720010757446289,
      "learning_rate": 9.999576873833169e-05,
      "loss": 0.1192,
      "step": 202
    },
    {
      "epoch": 0.00820807261920406,
      "grad_norm": 7.905060291290283,
      "learning_rate": 9.999568536692006e-05,
      "loss": 0.3184,
      "step": 203
    },
    {
      "epoch": 0.008248506474471074,
      "grad_norm": 3.085916519165039,
      "learning_rate": 9.999560118217583e-05,
      "loss": 0.1954,
      "step": 204
    },
    {
      "epoch": 0.008288940329738089,
      "grad_norm": 8.547829627990723,
      "learning_rate": 9.999551618410034e-05,
      "loss": 0.2605,
      "step": 205
    },
    {
      "epoch": 0.008329374185005105,
      "grad_norm": 7.506508827209473,
      "learning_rate": 9.999543037269504e-05,
      "loss": 0.3028,
      "step": 206
    },
    {
      "epoch": 0.00836980804027212,
      "grad_norm": 4.869304656982422,
      "learning_rate": 9.999534374796124e-05,
      "loss": 0.2271,
      "step": 207
    },
    {
      "epoch": 0.008410241895539135,
      "grad_norm": 7.360259056091309,
      "learning_rate": 9.999525630990041e-05,
      "loss": 0.2761,
      "step": 208
    },
    {
      "epoch": 0.00845067575080615,
      "grad_norm": 6.078726768493652,
      "learning_rate": 9.999516805851397e-05,
      "loss": 0.2623,
      "step": 209
    },
    {
      "epoch": 0.008491109606073166,
      "grad_norm": 2.553845167160034,
      "learning_rate": 9.999507899380331e-05,
      "loss": 0.1659,
      "step": 210
    },
    {
      "epoch": 0.00853154346134018,
      "grad_norm": 3.2435362339019775,
      "learning_rate": 9.999498911576993e-05,
      "loss": 0.1498,
      "step": 211
    },
    {
      "epoch": 0.008571977316607195,
      "grad_norm": 6.379277229309082,
      "learning_rate": 9.999489842441527e-05,
      "loss": 0.2309,
      "step": 212
    },
    {
      "epoch": 0.00861241117187421,
      "grad_norm": 4.347065448760986,
      "learning_rate": 9.99948069197408e-05,
      "loss": 0.2128,
      "step": 213
    },
    {
      "epoch": 0.008652845027141226,
      "grad_norm": 9.05762767791748,
      "learning_rate": 9.999471460174803e-05,
      "loss": 0.1779,
      "step": 214
    },
    {
      "epoch": 0.00869327888240824,
      "grad_norm": 4.830628871917725,
      "learning_rate": 9.999462147043843e-05,
      "loss": 0.1574,
      "step": 215
    },
    {
      "epoch": 0.008733712737675255,
      "grad_norm": 3.8125157356262207,
      "learning_rate": 9.999452752581355e-05,
      "loss": 0.1504,
      "step": 216
    },
    {
      "epoch": 0.00877414659294227,
      "grad_norm": 2.8179168701171875,
      "learning_rate": 9.999443276787489e-05,
      "loss": 0.1777,
      "step": 217
    },
    {
      "epoch": 0.008814580448209286,
      "grad_norm": 5.864883899688721,
      "learning_rate": 9.9994337196624e-05,
      "loss": 0.3495,
      "step": 218
    },
    {
      "epoch": 0.008855014303476301,
      "grad_norm": 6.230106353759766,
      "learning_rate": 9.999424081206245e-05,
      "loss": 0.1318,
      "step": 219
    },
    {
      "epoch": 0.008895448158743316,
      "grad_norm": 6.827365875244141,
      "learning_rate": 9.999414361419178e-05,
      "loss": 0.1966,
      "step": 220
    },
    {
      "epoch": 0.00893588201401033,
      "grad_norm": 12.011444091796875,
      "learning_rate": 9.99940456030136e-05,
      "loss": 0.2844,
      "step": 221
    },
    {
      "epoch": 0.008976315869277347,
      "grad_norm": 7.509864330291748,
      "learning_rate": 9.999394677852948e-05,
      "loss": 0.3334,
      "step": 222
    },
    {
      "epoch": 0.009016749724544361,
      "grad_norm": 10.87213134765625,
      "learning_rate": 9.999384714074105e-05,
      "loss": 0.2559,
      "step": 223
    },
    {
      "epoch": 0.009057183579811376,
      "grad_norm": 7.5143327713012695,
      "learning_rate": 9.99937466896499e-05,
      "loss": 0.1708,
      "step": 224
    },
    {
      "epoch": 0.00909761743507839,
      "grad_norm": 17.510313034057617,
      "learning_rate": 9.99936454252577e-05,
      "loss": 0.4556,
      "step": 225
    },
    {
      "epoch": 0.009138051290345407,
      "grad_norm": 6.468897342681885,
      "learning_rate": 9.999354334756608e-05,
      "loss": 0.2092,
      "step": 226
    },
    {
      "epoch": 0.009178485145612422,
      "grad_norm": 6.171923637390137,
      "learning_rate": 9.99934404565767e-05,
      "loss": 0.2199,
      "step": 227
    },
    {
      "epoch": 0.009218919000879436,
      "grad_norm": 8.079005241394043,
      "learning_rate": 9.999333675229123e-05,
      "loss": 0.2471,
      "step": 228
    },
    {
      "epoch": 0.00925935285614645,
      "grad_norm": 11.559858322143555,
      "learning_rate": 9.999323223471136e-05,
      "loss": 0.3881,
      "step": 229
    },
    {
      "epoch": 0.009299786711413467,
      "grad_norm": 16.177331924438477,
      "learning_rate": 9.999312690383881e-05,
      "loss": 0.4198,
      "step": 230
    },
    {
      "epoch": 0.009340220566680482,
      "grad_norm": 16.129283905029297,
      "learning_rate": 9.999302075967526e-05,
      "loss": 0.3592,
      "step": 231
    },
    {
      "epoch": 0.009380654421947496,
      "grad_norm": 13.0033597946167,
      "learning_rate": 9.999291380222246e-05,
      "loss": 0.3194,
      "step": 232
    },
    {
      "epoch": 0.009421088277214511,
      "grad_norm": 5.525213718414307,
      "learning_rate": 9.999280603148215e-05,
      "loss": 0.1632,
      "step": 233
    },
    {
      "epoch": 0.009461522132481527,
      "grad_norm": 7.511592388153076,
      "learning_rate": 9.999269744745606e-05,
      "loss": 0.2767,
      "step": 234
    },
    {
      "epoch": 0.009501955987748542,
      "grad_norm": 4.852881908416748,
      "learning_rate": 9.999258805014599e-05,
      "loss": 0.1132,
      "step": 235
    },
    {
      "epoch": 0.009542389843015557,
      "grad_norm": 5.765609264373779,
      "learning_rate": 9.999247783955369e-05,
      "loss": 0.1631,
      "step": 236
    },
    {
      "epoch": 0.009582823698282571,
      "grad_norm": 9.90166187286377,
      "learning_rate": 9.999236681568097e-05,
      "loss": 0.2498,
      "step": 237
    },
    {
      "epoch": 0.009623257553549588,
      "grad_norm": 7.612752914428711,
      "learning_rate": 9.999225497852962e-05,
      "loss": 0.2766,
      "step": 238
    },
    {
      "epoch": 0.009663691408816602,
      "grad_norm": 11.317419052124023,
      "learning_rate": 9.99921423281015e-05,
      "loss": 0.2912,
      "step": 239
    }
  ],
  "logging_steps": 1,
  "max_steps": 24731,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 1,
  "save_steps": 239,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 1.4782353377329152e+16,
  "train_batch_size": 4,
  "trial_name": null,
  "trial_params": null
}