|
{ |
|
"best_metric": 0.5081329345703125, |
|
"best_model_checkpoint": "finetuned-cards-blackjack/checkpoint-2800", |
|
"epoch": 7.0, |
|
"eval_steps": 100, |
|
"global_step": 2891, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 3.301379680633545, |
|
"learning_rate": 0.00019930819785541338, |
|
"loss": 1.3967, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 3.8909592628479004, |
|
"learning_rate": 0.00019861639571082672, |
|
"loss": 1.3469, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 3.4435207843780518, |
|
"learning_rate": 0.00019792459356624006, |
|
"loss": 1.321, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 4.026681900024414, |
|
"learning_rate": 0.0001972327914216534, |
|
"loss": 1.0812, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"grad_norm": 5.660863399505615, |
|
"learning_rate": 0.00019654098927706677, |
|
"loss": 1.2255, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"grad_norm": 4.201864719390869, |
|
"learning_rate": 0.00019584918713248014, |
|
"loss": 1.2845, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"grad_norm": 3.0525405406951904, |
|
"learning_rate": 0.00019515738498789345, |
|
"loss": 1.3223, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"grad_norm": 4.4655351638793945, |
|
"learning_rate": 0.00019446558284330682, |
|
"loss": 1.444, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.22, |
|
"grad_norm": 5.199573993682861, |
|
"learning_rate": 0.00019377378069872016, |
|
"loss": 1.4201, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"grad_norm": 4.03210973739624, |
|
"learning_rate": 0.00019308197855413353, |
|
"loss": 1.3563, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"eval_accuracy": 0.6749571183533448, |
|
"eval_loss": 1.1494646072387695, |
|
"eval_runtime": 6.2584, |
|
"eval_samples_per_second": 186.311, |
|
"eval_steps_per_second": 23.329, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.27, |
|
"grad_norm": 4.278244495391846, |
|
"learning_rate": 0.00019239017640954688, |
|
"loss": 1.3488, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.29, |
|
"grad_norm": 3.920788049697876, |
|
"learning_rate": 0.00019169837426496022, |
|
"loss": 1.451, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.31, |
|
"grad_norm": 3.261601686477661, |
|
"learning_rate": 0.00019100657212037359, |
|
"loss": 1.2141, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.34, |
|
"grad_norm": 5.404760837554932, |
|
"learning_rate": 0.00019031476997578695, |
|
"loss": 1.5746, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.36, |
|
"grad_norm": 2.8406295776367188, |
|
"learning_rate": 0.0001896229678312003, |
|
"loss": 1.2777, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.39, |
|
"grad_norm": 3.7745213508605957, |
|
"learning_rate": 0.00018893116568661364, |
|
"loss": 1.2075, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.41, |
|
"grad_norm": 3.8692853450775146, |
|
"learning_rate": 0.00018823936354202698, |
|
"loss": 1.2082, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.44, |
|
"grad_norm": 3.2764315605163574, |
|
"learning_rate": 0.00018754756139744035, |
|
"loss": 1.1009, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.46, |
|
"grad_norm": 2.6145033836364746, |
|
"learning_rate": 0.0001868557592528537, |
|
"loss": 1.2236, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.48, |
|
"grad_norm": 4.715363502502441, |
|
"learning_rate": 0.00018616395710826703, |
|
"loss": 1.3393, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.48, |
|
"eval_accuracy": 0.7204116638078902, |
|
"eval_loss": 1.0388233661651611, |
|
"eval_runtime": 6.0054, |
|
"eval_samples_per_second": 194.159, |
|
"eval_steps_per_second": 24.311, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.51, |
|
"grad_norm": 3.9448986053466797, |
|
"learning_rate": 0.0001854721549636804, |
|
"loss": 1.1597, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.53, |
|
"grad_norm": 3.357956886291504, |
|
"learning_rate": 0.00018478035281909374, |
|
"loss": 1.1734, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.56, |
|
"grad_norm": 5.5605244636535645, |
|
"learning_rate": 0.0001840885506745071, |
|
"loss": 1.165, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.58, |
|
"grad_norm": 4.48176908493042, |
|
"learning_rate": 0.00018339674852992045, |
|
"loss": 1.4025, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.61, |
|
"grad_norm": 3.6814768314361572, |
|
"learning_rate": 0.0001827049463853338, |
|
"loss": 1.198, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.63, |
|
"grad_norm": 4.112949848175049, |
|
"learning_rate": 0.00018201314424074716, |
|
"loss": 1.1062, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.65, |
|
"grad_norm": 5.755402565002441, |
|
"learning_rate": 0.0001813213420961605, |
|
"loss": 1.144, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.68, |
|
"grad_norm": 3.6004717350006104, |
|
"learning_rate": 0.00018062953995157384, |
|
"loss": 1.2527, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.7, |
|
"grad_norm": 2.4746742248535156, |
|
"learning_rate": 0.0001799377378069872, |
|
"loss": 1.1316, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.73, |
|
"grad_norm": 2.231992483139038, |
|
"learning_rate": 0.00017924593566240055, |
|
"loss": 1.2033, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.73, |
|
"eval_accuracy": 0.7547169811320755, |
|
"eval_loss": 0.9323562979698181, |
|
"eval_runtime": 7.9011, |
|
"eval_samples_per_second": 147.574, |
|
"eval_steps_per_second": 18.478, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.75, |
|
"grad_norm": 4.073417663574219, |
|
"learning_rate": 0.00017855413351781392, |
|
"loss": 1.3702, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.77, |
|
"grad_norm": 5.040902137756348, |
|
"learning_rate": 0.00017786233137322726, |
|
"loss": 1.1423, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.8, |
|
"grad_norm": 3.7068464756011963, |
|
"learning_rate": 0.0001771705292286406, |
|
"loss": 1.0609, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.82, |
|
"grad_norm": 4.864231586456299, |
|
"learning_rate": 0.00017647872708405397, |
|
"loss": 1.201, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.85, |
|
"grad_norm": 6.007138252258301, |
|
"learning_rate": 0.00017578692493946732, |
|
"loss": 1.0687, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.87, |
|
"grad_norm": 4.0837225914001465, |
|
"learning_rate": 0.00017509512279488069, |
|
"loss": 1.1311, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.9, |
|
"grad_norm": 4.566812992095947, |
|
"learning_rate": 0.00017440332065029403, |
|
"loss": 1.3071, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 0.92, |
|
"grad_norm": 3.3199901580810547, |
|
"learning_rate": 0.00017371151850570737, |
|
"loss": 1.0246, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.94, |
|
"grad_norm": 2.4883534908294678, |
|
"learning_rate": 0.00017301971636112074, |
|
"loss": 1.0215, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 0.97, |
|
"grad_norm": 5.232284069061279, |
|
"learning_rate": 0.00017232791421653408, |
|
"loss": 0.9672, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.97, |
|
"eval_accuracy": 0.7658662092624356, |
|
"eval_loss": 0.8557726144790649, |
|
"eval_runtime": 6.2462, |
|
"eval_samples_per_second": 186.674, |
|
"eval_steps_per_second": 23.374, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.99, |
|
"grad_norm": 3.8225362300872803, |
|
"learning_rate": 0.00017163611207194742, |
|
"loss": 1.0908, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 1.02, |
|
"grad_norm": 4.098091125488281, |
|
"learning_rate": 0.0001709443099273608, |
|
"loss": 0.8621, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 1.04, |
|
"grad_norm": 4.027368068695068, |
|
"learning_rate": 0.00017025250778277413, |
|
"loss": 0.9868, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 1.07, |
|
"grad_norm": 4.375247478485107, |
|
"learning_rate": 0.0001695607056381875, |
|
"loss": 1.0179, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 1.09, |
|
"grad_norm": 8.204839706420898, |
|
"learning_rate": 0.00016886890349360084, |
|
"loss": 0.902, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 1.11, |
|
"grad_norm": 3.1056785583496094, |
|
"learning_rate": 0.00016817710134901418, |
|
"loss": 0.9873, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 1.14, |
|
"grad_norm": 6.811554908752441, |
|
"learning_rate": 0.00016748529920442755, |
|
"loss": 0.9035, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 1.16, |
|
"grad_norm": 4.715181350708008, |
|
"learning_rate": 0.0001667934970598409, |
|
"loss": 1.0024, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 1.19, |
|
"grad_norm": 5.355204105377197, |
|
"learning_rate": 0.00016610169491525423, |
|
"loss": 1.0384, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 1.21, |
|
"grad_norm": 8.26843547821045, |
|
"learning_rate": 0.0001654098927706676, |
|
"loss": 0.8674, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 1.21, |
|
"eval_accuracy": 0.7615780445969125, |
|
"eval_loss": 0.8456417322158813, |
|
"eval_runtime": 5.9836, |
|
"eval_samples_per_second": 194.865, |
|
"eval_steps_per_second": 24.4, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 1.23, |
|
"grad_norm": 3.612718343734741, |
|
"learning_rate": 0.00016471809062608094, |
|
"loss": 1.035, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 1.26, |
|
"grad_norm": 3.2531259059906006, |
|
"learning_rate": 0.0001640262884814943, |
|
"loss": 0.9591, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 1.28, |
|
"grad_norm": 2.1132917404174805, |
|
"learning_rate": 0.00016333448633690765, |
|
"loss": 0.7013, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 1.31, |
|
"grad_norm": 5.840766906738281, |
|
"learning_rate": 0.000162642684192321, |
|
"loss": 1.1066, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 1.33, |
|
"grad_norm": 2.8128092288970947, |
|
"learning_rate": 0.00016195088204773436, |
|
"loss": 0.8851, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 1.36, |
|
"grad_norm": 5.935888290405273, |
|
"learning_rate": 0.0001612590799031477, |
|
"loss": 0.9738, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 1.38, |
|
"grad_norm": 4.2558488845825195, |
|
"learning_rate": 0.00016056727775856107, |
|
"loss": 1.1094, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 1.4, |
|
"grad_norm": 3.7361583709716797, |
|
"learning_rate": 0.0001598754756139744, |
|
"loss": 1.0376, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 1.43, |
|
"grad_norm": 3.6672043800354004, |
|
"learning_rate": 0.00015918367346938776, |
|
"loss": 0.9765, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 1.45, |
|
"grad_norm": 2.8976941108703613, |
|
"learning_rate": 0.00015849187132480113, |
|
"loss": 0.8277, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 1.45, |
|
"eval_accuracy": 0.7958833619210978, |
|
"eval_loss": 0.7562589049339294, |
|
"eval_runtime": 6.7504, |
|
"eval_samples_per_second": 172.731, |
|
"eval_steps_per_second": 21.628, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 1.48, |
|
"grad_norm": 4.665554046630859, |
|
"learning_rate": 0.00015780006918021447, |
|
"loss": 0.8139, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 1.5, |
|
"grad_norm": 4.166018486022949, |
|
"learning_rate": 0.0001571082670356278, |
|
"loss": 1.1314, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 1.53, |
|
"grad_norm": 3.610258102416992, |
|
"learning_rate": 0.00015641646489104115, |
|
"loss": 0.9497, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 1.55, |
|
"grad_norm": 4.610332489013672, |
|
"learning_rate": 0.00015572466274645452, |
|
"loss": 1.0767, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 1.57, |
|
"grad_norm": 3.796252965927124, |
|
"learning_rate": 0.0001550328606018679, |
|
"loss": 0.8486, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 1.6, |
|
"grad_norm": 3.9809694290161133, |
|
"learning_rate": 0.00015434105845728123, |
|
"loss": 0.9211, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 1.62, |
|
"grad_norm": 2.5232605934143066, |
|
"learning_rate": 0.00015364925631269457, |
|
"loss": 0.8843, |
|
"step": 670 |
|
}, |
|
{ |
|
"epoch": 1.65, |
|
"grad_norm": 4.975670337677002, |
|
"learning_rate": 0.00015295745416810794, |
|
"loss": 0.9494, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 1.67, |
|
"grad_norm": 5.420626163482666, |
|
"learning_rate": 0.00015226565202352128, |
|
"loss": 0.9786, |
|
"step": 690 |
|
}, |
|
{ |
|
"epoch": 1.69, |
|
"grad_norm": 3.364365339279175, |
|
"learning_rate": 0.00015157384987893465, |
|
"loss": 0.8703, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 1.69, |
|
"eval_accuracy": 0.7538593481989708, |
|
"eval_loss": 0.8465284109115601, |
|
"eval_runtime": 6.2814, |
|
"eval_samples_per_second": 185.628, |
|
"eval_steps_per_second": 23.243, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 1.72, |
|
"grad_norm": 3.51340913772583, |
|
"learning_rate": 0.00015088204773434796, |
|
"loss": 0.9032, |
|
"step": 710 |
|
}, |
|
{ |
|
"epoch": 1.74, |
|
"grad_norm": 3.7203245162963867, |
|
"learning_rate": 0.00015019024558976133, |
|
"loss": 0.7729, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 1.77, |
|
"grad_norm": 3.596214771270752, |
|
"learning_rate": 0.0001494984434451747, |
|
"loss": 0.8151, |
|
"step": 730 |
|
}, |
|
{ |
|
"epoch": 1.79, |
|
"grad_norm": 3.2724595069885254, |
|
"learning_rate": 0.00014880664130058804, |
|
"loss": 0.8064, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 1.82, |
|
"grad_norm": 3.5748846530914307, |
|
"learning_rate": 0.00014811483915600139, |
|
"loss": 0.8419, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 1.84, |
|
"grad_norm": 5.998478412628174, |
|
"learning_rate": 0.00014742303701141473, |
|
"loss": 0.8494, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 1.86, |
|
"grad_norm": 3.545043706893921, |
|
"learning_rate": 0.0001467312348668281, |
|
"loss": 0.7695, |
|
"step": 770 |
|
}, |
|
{ |
|
"epoch": 1.89, |
|
"grad_norm": 3.9944069385528564, |
|
"learning_rate": 0.00014603943272224146, |
|
"loss": 0.9405, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 1.91, |
|
"grad_norm": 5.435621738433838, |
|
"learning_rate": 0.00014534763057765478, |
|
"loss": 0.8863, |
|
"step": 790 |
|
}, |
|
{ |
|
"epoch": 1.94, |
|
"grad_norm": 7.365724086761475, |
|
"learning_rate": 0.00014465582843306815, |
|
"loss": 0.893, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 1.94, |
|
"eval_accuracy": 0.8001715265866209, |
|
"eval_loss": 0.688121497631073, |
|
"eval_runtime": 6.0254, |
|
"eval_samples_per_second": 193.514, |
|
"eval_steps_per_second": 24.231, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 1.96, |
|
"grad_norm": 6.192987442016602, |
|
"learning_rate": 0.0001439640262884815, |
|
"loss": 0.8819, |
|
"step": 810 |
|
}, |
|
{ |
|
"epoch": 1.99, |
|
"grad_norm": 3.021066188812256, |
|
"learning_rate": 0.00014327222414389486, |
|
"loss": 0.6413, |
|
"step": 820 |
|
}, |
|
{ |
|
"epoch": 2.01, |
|
"grad_norm": 4.522083759307861, |
|
"learning_rate": 0.0001425804219993082, |
|
"loss": 0.9185, |
|
"step": 830 |
|
}, |
|
{ |
|
"epoch": 2.03, |
|
"grad_norm": 3.089639186859131, |
|
"learning_rate": 0.00014188861985472154, |
|
"loss": 0.7384, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 2.06, |
|
"grad_norm": 4.491950988769531, |
|
"learning_rate": 0.0001411968177101349, |
|
"loss": 0.765, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 2.08, |
|
"grad_norm": 3.618821144104004, |
|
"learning_rate": 0.00014050501556554828, |
|
"loss": 0.8237, |
|
"step": 860 |
|
}, |
|
{ |
|
"epoch": 2.11, |
|
"grad_norm": 4.773688793182373, |
|
"learning_rate": 0.00013981321342096162, |
|
"loss": 0.7171, |
|
"step": 870 |
|
}, |
|
{ |
|
"epoch": 2.13, |
|
"grad_norm": 1.607408881187439, |
|
"learning_rate": 0.00013912141127637496, |
|
"loss": 0.6438, |
|
"step": 880 |
|
}, |
|
{ |
|
"epoch": 2.15, |
|
"grad_norm": 4.511462211608887, |
|
"learning_rate": 0.0001384296091317883, |
|
"loss": 0.7983, |
|
"step": 890 |
|
}, |
|
{ |
|
"epoch": 2.18, |
|
"grad_norm": 4.259463787078857, |
|
"learning_rate": 0.00013773780698720167, |
|
"loss": 0.9454, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 2.18, |
|
"eval_accuracy": 0.8027444253859348, |
|
"eval_loss": 0.7210972905158997, |
|
"eval_runtime": 5.9658, |
|
"eval_samples_per_second": 195.449, |
|
"eval_steps_per_second": 24.473, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 2.2, |
|
"grad_norm": 3.810264825820923, |
|
"learning_rate": 0.00013704600484261504, |
|
"loss": 0.7729, |
|
"step": 910 |
|
}, |
|
{ |
|
"epoch": 2.23, |
|
"grad_norm": 5.475677967071533, |
|
"learning_rate": 0.00013635420269802835, |
|
"loss": 0.7531, |
|
"step": 920 |
|
}, |
|
{ |
|
"epoch": 2.25, |
|
"grad_norm": 2.9276745319366455, |
|
"learning_rate": 0.00013566240055344172, |
|
"loss": 0.725, |
|
"step": 930 |
|
}, |
|
{ |
|
"epoch": 2.28, |
|
"grad_norm": 4.840962886810303, |
|
"learning_rate": 0.00013497059840885506, |
|
"loss": 0.6938, |
|
"step": 940 |
|
}, |
|
{ |
|
"epoch": 2.3, |
|
"grad_norm": 5.3595194816589355, |
|
"learning_rate": 0.00013427879626426843, |
|
"loss": 0.6863, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 2.32, |
|
"grad_norm": 7.755936145782471, |
|
"learning_rate": 0.00013358699411968177, |
|
"loss": 0.7146, |
|
"step": 960 |
|
}, |
|
{ |
|
"epoch": 2.35, |
|
"grad_norm": 3.4426372051239014, |
|
"learning_rate": 0.00013289519197509512, |
|
"loss": 0.7144, |
|
"step": 970 |
|
}, |
|
{ |
|
"epoch": 2.37, |
|
"grad_norm": 4.554823398590088, |
|
"learning_rate": 0.00013220338983050849, |
|
"loss": 0.6512, |
|
"step": 980 |
|
}, |
|
{ |
|
"epoch": 2.4, |
|
"grad_norm": 2.8689632415771484, |
|
"learning_rate": 0.00013151158768592183, |
|
"loss": 0.7966, |
|
"step": 990 |
|
}, |
|
{ |
|
"epoch": 2.42, |
|
"grad_norm": 3.4381957054138184, |
|
"learning_rate": 0.0001308197855413352, |
|
"loss": 0.8109, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 2.42, |
|
"eval_accuracy": 0.8284734133790738, |
|
"eval_loss": 0.6368530988693237, |
|
"eval_runtime": 6.2777, |
|
"eval_samples_per_second": 185.738, |
|
"eval_steps_per_second": 23.257, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 2.45, |
|
"grad_norm": 2.984152317047119, |
|
"learning_rate": 0.00013012798339674854, |
|
"loss": 0.6477, |
|
"step": 1010 |
|
}, |
|
{ |
|
"epoch": 2.47, |
|
"grad_norm": 5.486266613006592, |
|
"learning_rate": 0.00012943618125216188, |
|
"loss": 0.6834, |
|
"step": 1020 |
|
}, |
|
{ |
|
"epoch": 2.49, |
|
"grad_norm": 2.2987334728240967, |
|
"learning_rate": 0.00012874437910757525, |
|
"loss": 0.6415, |
|
"step": 1030 |
|
}, |
|
{ |
|
"epoch": 2.52, |
|
"grad_norm": 7.007256507873535, |
|
"learning_rate": 0.0001280525769629886, |
|
"loss": 0.6464, |
|
"step": 1040 |
|
}, |
|
{ |
|
"epoch": 2.54, |
|
"grad_norm": 1.817421555519104, |
|
"learning_rate": 0.00012736077481840193, |
|
"loss": 0.774, |
|
"step": 1050 |
|
}, |
|
{ |
|
"epoch": 2.57, |
|
"grad_norm": 4.492701530456543, |
|
"learning_rate": 0.0001266689726738153, |
|
"loss": 0.9331, |
|
"step": 1060 |
|
}, |
|
{ |
|
"epoch": 2.59, |
|
"grad_norm": 3.884744644165039, |
|
"learning_rate": 0.00012597717052922864, |
|
"loss": 0.6107, |
|
"step": 1070 |
|
}, |
|
{ |
|
"epoch": 2.62, |
|
"grad_norm": 4.274733066558838, |
|
"learning_rate": 0.000125285368384642, |
|
"loss": 0.6268, |
|
"step": 1080 |
|
}, |
|
{ |
|
"epoch": 2.64, |
|
"grad_norm": 4.432763576507568, |
|
"learning_rate": 0.00012459356624005535, |
|
"loss": 0.6326, |
|
"step": 1090 |
|
}, |
|
{ |
|
"epoch": 2.66, |
|
"grad_norm": 4.16074275970459, |
|
"learning_rate": 0.0001239017640954687, |
|
"loss": 0.8762, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 2.66, |
|
"eval_accuracy": 0.839622641509434, |
|
"eval_loss": 0.6335619688034058, |
|
"eval_runtime": 6.1128, |
|
"eval_samples_per_second": 190.748, |
|
"eval_steps_per_second": 23.884, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 2.69, |
|
"grad_norm": 4.018909931182861, |
|
"learning_rate": 0.00012320996195088206, |
|
"loss": 0.8039, |
|
"step": 1110 |
|
}, |
|
{ |
|
"epoch": 2.71, |
|
"grad_norm": 8.111551284790039, |
|
"learning_rate": 0.0001225181598062954, |
|
"loss": 0.6436, |
|
"step": 1120 |
|
}, |
|
{ |
|
"epoch": 2.74, |
|
"grad_norm": 4.22373628616333, |
|
"learning_rate": 0.00012182635766170876, |
|
"loss": 0.6228, |
|
"step": 1130 |
|
}, |
|
{ |
|
"epoch": 2.76, |
|
"grad_norm": 4.817978858947754, |
|
"learning_rate": 0.00012113455551712211, |
|
"loss": 0.7047, |
|
"step": 1140 |
|
}, |
|
{ |
|
"epoch": 2.78, |
|
"grad_norm": 5.471624851226807, |
|
"learning_rate": 0.00012044275337253545, |
|
"loss": 0.8293, |
|
"step": 1150 |
|
}, |
|
{ |
|
"epoch": 2.81, |
|
"grad_norm": 3.491068124771118, |
|
"learning_rate": 0.00011975095122794881, |
|
"loss": 0.7128, |
|
"step": 1160 |
|
}, |
|
{ |
|
"epoch": 2.83, |
|
"grad_norm": 4.463800430297852, |
|
"learning_rate": 0.00011905914908336215, |
|
"loss": 0.7569, |
|
"step": 1170 |
|
}, |
|
{ |
|
"epoch": 2.86, |
|
"grad_norm": 2.7582342624664307, |
|
"learning_rate": 0.00011836734693877552, |
|
"loss": 0.6774, |
|
"step": 1180 |
|
}, |
|
{ |
|
"epoch": 2.88, |
|
"grad_norm": 4.606247901916504, |
|
"learning_rate": 0.00011767554479418887, |
|
"loss": 0.7384, |
|
"step": 1190 |
|
}, |
|
{ |
|
"epoch": 2.91, |
|
"grad_norm": 4.3657660484313965, |
|
"learning_rate": 0.00011698374264960222, |
|
"loss": 0.8034, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 2.91, |
|
"eval_accuracy": 0.8164665523156089, |
|
"eval_loss": 0.657957911491394, |
|
"eval_runtime": 6.0796, |
|
"eval_samples_per_second": 191.79, |
|
"eval_steps_per_second": 24.015, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 2.93, |
|
"grad_norm": 3.4329655170440674, |
|
"learning_rate": 0.00011629194050501557, |
|
"loss": 0.733, |
|
"step": 1210 |
|
}, |
|
{ |
|
"epoch": 2.95, |
|
"grad_norm": 4.565850257873535, |
|
"learning_rate": 0.00011560013836042894, |
|
"loss": 0.7215, |
|
"step": 1220 |
|
}, |
|
{ |
|
"epoch": 2.98, |
|
"grad_norm": 3.225835084915161, |
|
"learning_rate": 0.00011490833621584227, |
|
"loss": 0.6895, |
|
"step": 1230 |
|
}, |
|
{ |
|
"epoch": 3.0, |
|
"grad_norm": 5.188159942626953, |
|
"learning_rate": 0.00011421653407125564, |
|
"loss": 0.6821, |
|
"step": 1240 |
|
}, |
|
{ |
|
"epoch": 3.03, |
|
"grad_norm": 1.3620127439498901, |
|
"learning_rate": 0.00011352473192666896, |
|
"loss": 0.5604, |
|
"step": 1250 |
|
}, |
|
{ |
|
"epoch": 3.05, |
|
"grad_norm": 3.446960687637329, |
|
"learning_rate": 0.00011283292978208233, |
|
"loss": 0.5554, |
|
"step": 1260 |
|
}, |
|
{ |
|
"epoch": 3.08, |
|
"grad_norm": 5.016846656799316, |
|
"learning_rate": 0.00011214112763749569, |
|
"loss": 0.7156, |
|
"step": 1270 |
|
}, |
|
{ |
|
"epoch": 3.1, |
|
"grad_norm": 5.600184440612793, |
|
"learning_rate": 0.00011144932549290903, |
|
"loss": 0.745, |
|
"step": 1280 |
|
}, |
|
{ |
|
"epoch": 3.12, |
|
"grad_norm": 3.7640082836151123, |
|
"learning_rate": 0.00011075752334832239, |
|
"loss": 0.6316, |
|
"step": 1290 |
|
}, |
|
{ |
|
"epoch": 3.15, |
|
"grad_norm": 4.837277889251709, |
|
"learning_rate": 0.00011006572120373573, |
|
"loss": 0.5833, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 3.15, |
|
"eval_accuracy": 0.8439108061749572, |
|
"eval_loss": 0.5827564597129822, |
|
"eval_runtime": 6.3009, |
|
"eval_samples_per_second": 185.052, |
|
"eval_steps_per_second": 23.171, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 3.17, |
|
"grad_norm": 5.6047797203063965, |
|
"learning_rate": 0.00010937391905914908, |
|
"loss": 0.7184, |
|
"step": 1310 |
|
}, |
|
{ |
|
"epoch": 3.2, |
|
"grad_norm": 4.177833080291748, |
|
"learning_rate": 0.0001087512971290211, |
|
"loss": 0.7094, |
|
"step": 1320 |
|
}, |
|
{ |
|
"epoch": 3.22, |
|
"grad_norm": 2.0531811714172363, |
|
"learning_rate": 0.00010805949498443447, |
|
"loss": 0.6679, |
|
"step": 1330 |
|
}, |
|
{ |
|
"epoch": 3.24, |
|
"grad_norm": 2.919313430786133, |
|
"learning_rate": 0.0001073676928398478, |
|
"loss": 0.4368, |
|
"step": 1340 |
|
}, |
|
{ |
|
"epoch": 3.27, |
|
"grad_norm": 5.47185754776001, |
|
"learning_rate": 0.00010667589069526116, |
|
"loss": 0.5666, |
|
"step": 1350 |
|
}, |
|
{ |
|
"epoch": 3.29, |
|
"grad_norm": 5.082462310791016, |
|
"learning_rate": 0.00010598408855067452, |
|
"loss": 0.7758, |
|
"step": 1360 |
|
}, |
|
{ |
|
"epoch": 3.32, |
|
"grad_norm": 3.3282408714294434, |
|
"learning_rate": 0.00010529228640608786, |
|
"loss": 0.6055, |
|
"step": 1370 |
|
}, |
|
{ |
|
"epoch": 3.34, |
|
"grad_norm": 5.19661808013916, |
|
"learning_rate": 0.00010460048426150121, |
|
"loss": 0.6678, |
|
"step": 1380 |
|
}, |
|
{ |
|
"epoch": 3.37, |
|
"grad_norm": 2.5423412322998047, |
|
"learning_rate": 0.00010390868211691456, |
|
"loss": 0.5333, |
|
"step": 1390 |
|
}, |
|
{ |
|
"epoch": 3.39, |
|
"grad_norm": 9.068185806274414, |
|
"learning_rate": 0.00010321687997232792, |
|
"loss": 0.8811, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 3.39, |
|
"eval_accuracy": 0.8259005145797599, |
|
"eval_loss": 0.6564387679100037, |
|
"eval_runtime": 5.9782, |
|
"eval_samples_per_second": 195.041, |
|
"eval_steps_per_second": 24.422, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 3.41, |
|
"grad_norm": 5.794499397277832, |
|
"learning_rate": 0.00010252507782774128, |
|
"loss": 0.8535, |
|
"step": 1410 |
|
}, |
|
{ |
|
"epoch": 3.44, |
|
"grad_norm": 4.731594562530518, |
|
"learning_rate": 0.00010183327568315462, |
|
"loss": 0.6556, |
|
"step": 1420 |
|
}, |
|
{ |
|
"epoch": 3.46, |
|
"grad_norm": 3.823868751525879, |
|
"learning_rate": 0.00010114147353856798, |
|
"loss": 0.687, |
|
"step": 1430 |
|
}, |
|
{ |
|
"epoch": 3.49, |
|
"grad_norm": 7.72351598739624, |
|
"learning_rate": 0.00010044967139398133, |
|
"loss": 0.6479, |
|
"step": 1440 |
|
}, |
|
{ |
|
"epoch": 3.51, |
|
"grad_norm": 5.026217937469482, |
|
"learning_rate": 9.975786924939467e-05, |
|
"loss": 0.64, |
|
"step": 1450 |
|
}, |
|
{ |
|
"epoch": 3.54, |
|
"grad_norm": 2.873476028442383, |
|
"learning_rate": 9.906606710480803e-05, |
|
"loss": 0.6692, |
|
"step": 1460 |
|
}, |
|
{ |
|
"epoch": 3.56, |
|
"grad_norm": 3.9772098064422607, |
|
"learning_rate": 9.837426496022138e-05, |
|
"loss": 0.8156, |
|
"step": 1470 |
|
}, |
|
{ |
|
"epoch": 3.58, |
|
"grad_norm": 5.044854164123535, |
|
"learning_rate": 9.768246281563474e-05, |
|
"loss": 0.7261, |
|
"step": 1480 |
|
}, |
|
{ |
|
"epoch": 3.61, |
|
"grad_norm": 2.8663127422332764, |
|
"learning_rate": 9.699066067104808e-05, |
|
"loss": 0.7608, |
|
"step": 1490 |
|
}, |
|
{ |
|
"epoch": 3.63, |
|
"grad_norm": 7.623239040374756, |
|
"learning_rate": 9.629885852646143e-05, |
|
"loss": 0.5639, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 3.63, |
|
"eval_accuracy": 0.8439108061749572, |
|
"eval_loss": 0.5736597180366516, |
|
"eval_runtime": 6.1394, |
|
"eval_samples_per_second": 189.92, |
|
"eval_steps_per_second": 23.781, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 3.66, |
|
"grad_norm": 4.9297966957092285, |
|
"learning_rate": 9.560705638187479e-05, |
|
"loss": 0.6672, |
|
"step": 1510 |
|
}, |
|
{ |
|
"epoch": 3.68, |
|
"grad_norm": 2.0875468254089355, |
|
"learning_rate": 9.491525423728815e-05, |
|
"loss": 0.5335, |
|
"step": 1520 |
|
}, |
|
{ |
|
"epoch": 3.7, |
|
"grad_norm": 10.350793838500977, |
|
"learning_rate": 9.422345209270149e-05, |
|
"loss": 0.7163, |
|
"step": 1530 |
|
}, |
|
{ |
|
"epoch": 3.73, |
|
"grad_norm": 4.162230014801025, |
|
"learning_rate": 9.353164994811484e-05, |
|
"loss": 0.7528, |
|
"step": 1540 |
|
}, |
|
{ |
|
"epoch": 3.75, |
|
"grad_norm": 5.249913215637207, |
|
"learning_rate": 9.28398478035282e-05, |
|
"loss": 0.7946, |
|
"step": 1550 |
|
}, |
|
{ |
|
"epoch": 3.78, |
|
"grad_norm": 3.5188651084899902, |
|
"learning_rate": 9.214804565894155e-05, |
|
"loss": 0.7108, |
|
"step": 1560 |
|
}, |
|
{ |
|
"epoch": 3.8, |
|
"grad_norm": 5.497143268585205, |
|
"learning_rate": 9.14562435143549e-05, |
|
"loss": 0.8685, |
|
"step": 1570 |
|
}, |
|
{ |
|
"epoch": 3.83, |
|
"grad_norm": 4.383511066436768, |
|
"learning_rate": 9.076444136976825e-05, |
|
"loss": 0.7093, |
|
"step": 1580 |
|
}, |
|
{ |
|
"epoch": 3.85, |
|
"grad_norm": 4.953444957733154, |
|
"learning_rate": 9.00726392251816e-05, |
|
"loss": 0.7253, |
|
"step": 1590 |
|
}, |
|
{ |
|
"epoch": 3.87, |
|
"grad_norm": 3.61757230758667, |
|
"learning_rate": 8.938083708059496e-05, |
|
"loss": 0.639, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 3.87, |
|
"eval_accuracy": 0.8379073756432247, |
|
"eval_loss": 0.560886561870575, |
|
"eval_runtime": 5.9489, |
|
"eval_samples_per_second": 196.002, |
|
"eval_steps_per_second": 24.542, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 3.9, |
|
"grad_norm": 5.650317668914795, |
|
"learning_rate": 8.868903493600831e-05, |
|
"loss": 0.8287, |
|
"step": 1610 |
|
}, |
|
{ |
|
"epoch": 3.92, |
|
"grad_norm": 6.012249946594238, |
|
"learning_rate": 8.799723279142166e-05, |
|
"loss": 0.6976, |
|
"step": 1620 |
|
}, |
|
{ |
|
"epoch": 3.95, |
|
"grad_norm": 5.186240196228027, |
|
"learning_rate": 8.730543064683501e-05, |
|
"loss": 0.7452, |
|
"step": 1630 |
|
}, |
|
{ |
|
"epoch": 3.97, |
|
"grad_norm": 5.2669572830200195, |
|
"learning_rate": 8.661362850224835e-05, |
|
"loss": 0.8586, |
|
"step": 1640 |
|
}, |
|
{ |
|
"epoch": 4.0, |
|
"grad_norm": 2.4105117321014404, |
|
"learning_rate": 8.592182635766172e-05, |
|
"loss": 0.6194, |
|
"step": 1650 |
|
}, |
|
{ |
|
"epoch": 4.02, |
|
"grad_norm": 1.2886375188827515, |
|
"learning_rate": 8.523002421307506e-05, |
|
"loss": 0.4969, |
|
"step": 1660 |
|
}, |
|
{ |
|
"epoch": 4.04, |
|
"grad_norm": 1.915207862854004, |
|
"learning_rate": 8.453822206848842e-05, |
|
"loss": 0.571, |
|
"step": 1670 |
|
}, |
|
{ |
|
"epoch": 4.07, |
|
"grad_norm": 3.7422375679016113, |
|
"learning_rate": 8.384641992390176e-05, |
|
"loss": 0.6791, |
|
"step": 1680 |
|
}, |
|
{ |
|
"epoch": 4.09, |
|
"grad_norm": 5.4421467781066895, |
|
"learning_rate": 8.315461777931513e-05, |
|
"loss": 0.6829, |
|
"step": 1690 |
|
}, |
|
{ |
|
"epoch": 4.12, |
|
"grad_norm": 1.9872852563858032, |
|
"learning_rate": 8.246281563472847e-05, |
|
"loss": 0.6455, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 4.12, |
|
"eval_accuracy": 0.8370497427101201, |
|
"eval_loss": 0.5820054411888123, |
|
"eval_runtime": 6.2231, |
|
"eval_samples_per_second": 187.366, |
|
"eval_steps_per_second": 23.461, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 4.14, |
|
"grad_norm": 3.1257195472717285, |
|
"learning_rate": 8.177101349014182e-05, |
|
"loss": 0.6619, |
|
"step": 1710 |
|
}, |
|
{ |
|
"epoch": 4.16, |
|
"grad_norm": 3.6743292808532715, |
|
"learning_rate": 8.107921134555517e-05, |
|
"loss": 0.8357, |
|
"step": 1720 |
|
}, |
|
{ |
|
"epoch": 4.19, |
|
"grad_norm": 3.7856836318969727, |
|
"learning_rate": 8.038740920096852e-05, |
|
"loss": 0.6017, |
|
"step": 1730 |
|
}, |
|
{ |
|
"epoch": 4.21, |
|
"grad_norm": 4.6526970863342285, |
|
"learning_rate": 7.969560705638188e-05, |
|
"loss": 0.6805, |
|
"step": 1740 |
|
}, |
|
{ |
|
"epoch": 4.24, |
|
"grad_norm": 3.4002761840820312, |
|
"learning_rate": 7.900380491179523e-05, |
|
"loss": 0.5558, |
|
"step": 1750 |
|
}, |
|
{ |
|
"epoch": 4.26, |
|
"grad_norm": 3.9795327186584473, |
|
"learning_rate": 7.831200276720859e-05, |
|
"loss": 0.6921, |
|
"step": 1760 |
|
}, |
|
{ |
|
"epoch": 4.29, |
|
"grad_norm": 3.5085155963897705, |
|
"learning_rate": 7.762020062262193e-05, |
|
"loss": 0.5476, |
|
"step": 1770 |
|
}, |
|
{ |
|
"epoch": 4.31, |
|
"grad_norm": 5.0314412117004395, |
|
"learning_rate": 7.69283984780353e-05, |
|
"loss": 0.8566, |
|
"step": 1780 |
|
}, |
|
{ |
|
"epoch": 4.33, |
|
"grad_norm": 2.536855697631836, |
|
"learning_rate": 7.623659633344864e-05, |
|
"loss": 0.5743, |
|
"step": 1790 |
|
}, |
|
{ |
|
"epoch": 4.36, |
|
"grad_norm": 4.995050430297852, |
|
"learning_rate": 7.5544794188862e-05, |
|
"loss": 0.5402, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 4.36, |
|
"eval_accuracy": 0.8344768439108061, |
|
"eval_loss": 0.5796906352043152, |
|
"eval_runtime": 6.1279, |
|
"eval_samples_per_second": 190.278, |
|
"eval_steps_per_second": 23.826, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 4.38, |
|
"grad_norm": 4.143467903137207, |
|
"learning_rate": 7.485299204427533e-05, |
|
"loss": 0.6715, |
|
"step": 1810 |
|
}, |
|
{ |
|
"epoch": 4.41, |
|
"grad_norm": 1.8152028322219849, |
|
"learning_rate": 7.416118989968869e-05, |
|
"loss": 0.6965, |
|
"step": 1820 |
|
}, |
|
{ |
|
"epoch": 4.43, |
|
"grad_norm": 3.699620485305786, |
|
"learning_rate": 7.346938775510205e-05, |
|
"loss": 0.5758, |
|
"step": 1830 |
|
}, |
|
{ |
|
"epoch": 4.46, |
|
"grad_norm": 2.2266180515289307, |
|
"learning_rate": 7.27775856105154e-05, |
|
"loss": 0.6802, |
|
"step": 1840 |
|
}, |
|
{ |
|
"epoch": 4.48, |
|
"grad_norm": 4.586669445037842, |
|
"learning_rate": 7.208578346592874e-05, |
|
"loss": 0.5885, |
|
"step": 1850 |
|
}, |
|
{ |
|
"epoch": 4.5, |
|
"grad_norm": 4.72069787979126, |
|
"learning_rate": 7.13939813213421e-05, |
|
"loss": 0.6404, |
|
"step": 1860 |
|
}, |
|
{ |
|
"epoch": 4.53, |
|
"grad_norm": 5.436990261077881, |
|
"learning_rate": 7.070217917675545e-05, |
|
"loss": 0.7781, |
|
"step": 1870 |
|
}, |
|
{ |
|
"epoch": 4.55, |
|
"grad_norm": 4.715204238891602, |
|
"learning_rate": 7.001037703216881e-05, |
|
"loss": 0.7109, |
|
"step": 1880 |
|
}, |
|
{ |
|
"epoch": 4.58, |
|
"grad_norm": 3.261801242828369, |
|
"learning_rate": 6.931857488758215e-05, |
|
"loss": 0.5707, |
|
"step": 1890 |
|
}, |
|
{ |
|
"epoch": 4.6, |
|
"grad_norm": 3.516954183578491, |
|
"learning_rate": 6.86267727429955e-05, |
|
"loss": 0.5311, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 4.6, |
|
"eval_accuracy": 0.8456260720411664, |
|
"eval_loss": 0.55106520652771, |
|
"eval_runtime": 6.3501, |
|
"eval_samples_per_second": 183.618, |
|
"eval_steps_per_second": 22.992, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 4.62, |
|
"grad_norm": 4.697694778442383, |
|
"learning_rate": 6.793497059840886e-05, |
|
"loss": 0.6169, |
|
"step": 1910 |
|
}, |
|
{ |
|
"epoch": 4.65, |
|
"grad_norm": 4.627555847167969, |
|
"learning_rate": 6.724316845382221e-05, |
|
"loss": 0.649, |
|
"step": 1920 |
|
}, |
|
{ |
|
"epoch": 4.67, |
|
"grad_norm": 3.16441011428833, |
|
"learning_rate": 6.662054652369423e-05, |
|
"loss": 0.6538, |
|
"step": 1930 |
|
}, |
|
{ |
|
"epoch": 4.7, |
|
"grad_norm": 3.6413562297821045, |
|
"learning_rate": 6.592874437910757e-05, |
|
"loss": 0.5963, |
|
"step": 1940 |
|
}, |
|
{ |
|
"epoch": 4.72, |
|
"grad_norm": 4.7628960609436035, |
|
"learning_rate": 6.523694223452093e-05, |
|
"loss": 0.5106, |
|
"step": 1950 |
|
}, |
|
{ |
|
"epoch": 4.75, |
|
"grad_norm": 3.3812217712402344, |
|
"learning_rate": 6.454514008993428e-05, |
|
"loss": 0.6908, |
|
"step": 1960 |
|
}, |
|
{ |
|
"epoch": 4.77, |
|
"grad_norm": 3.9284725189208984, |
|
"learning_rate": 6.385333794534764e-05, |
|
"loss": 0.4863, |
|
"step": 1970 |
|
}, |
|
{ |
|
"epoch": 4.79, |
|
"grad_norm": 3.633194923400879, |
|
"learning_rate": 6.316153580076098e-05, |
|
"loss": 0.4383, |
|
"step": 1980 |
|
}, |
|
{ |
|
"epoch": 4.82, |
|
"grad_norm": 6.324495792388916, |
|
"learning_rate": 6.246973365617433e-05, |
|
"loss": 0.8217, |
|
"step": 1990 |
|
}, |
|
{ |
|
"epoch": 4.84, |
|
"grad_norm": 5.055554389953613, |
|
"learning_rate": 6.177793151158769e-05, |
|
"loss": 0.5734, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 4.84, |
|
"eval_accuracy": 0.8507718696397941, |
|
"eval_loss": 0.5443547964096069, |
|
"eval_runtime": 5.9845, |
|
"eval_samples_per_second": 194.837, |
|
"eval_steps_per_second": 24.396, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 4.87, |
|
"grad_norm": 3.0936367511749268, |
|
"learning_rate": 6.108612936700104e-05, |
|
"loss": 0.6563, |
|
"step": 2010 |
|
}, |
|
{ |
|
"epoch": 4.89, |
|
"grad_norm": 3.478715181350708, |
|
"learning_rate": 6.039432722241439e-05, |
|
"loss": 0.595, |
|
"step": 2020 |
|
}, |
|
{ |
|
"epoch": 4.92, |
|
"grad_norm": 3.4817001819610596, |
|
"learning_rate": 5.970252507782774e-05, |
|
"loss": 0.3782, |
|
"step": 2030 |
|
}, |
|
{ |
|
"epoch": 4.94, |
|
"grad_norm": 6.603343963623047, |
|
"learning_rate": 5.901072293324109e-05, |
|
"loss": 0.5735, |
|
"step": 2040 |
|
}, |
|
{ |
|
"epoch": 4.96, |
|
"grad_norm": 1.6107616424560547, |
|
"learning_rate": 5.831892078865445e-05, |
|
"loss": 0.6045, |
|
"step": 2050 |
|
}, |
|
{ |
|
"epoch": 4.99, |
|
"grad_norm": 4.367840766906738, |
|
"learning_rate": 5.76271186440678e-05, |
|
"loss": 0.5533, |
|
"step": 2060 |
|
}, |
|
{ |
|
"epoch": 5.01, |
|
"grad_norm": 6.6618266105651855, |
|
"learning_rate": 5.6935316499481154e-05, |
|
"loss": 0.4114, |
|
"step": 2070 |
|
}, |
|
{ |
|
"epoch": 5.04, |
|
"grad_norm": 7.852776050567627, |
|
"learning_rate": 5.62435143548945e-05, |
|
"loss": 0.642, |
|
"step": 2080 |
|
}, |
|
{ |
|
"epoch": 5.06, |
|
"grad_norm": 5.769771099090576, |
|
"learning_rate": 5.555171221030785e-05, |
|
"loss": 0.5934, |
|
"step": 2090 |
|
}, |
|
{ |
|
"epoch": 5.08, |
|
"grad_norm": 2.4635396003723145, |
|
"learning_rate": 5.485991006572121e-05, |
|
"loss": 0.5206, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 5.08, |
|
"eval_accuracy": 0.8636363636363636, |
|
"eval_loss": 0.5326434969902039, |
|
"eval_runtime": 6.1606, |
|
"eval_samples_per_second": 189.267, |
|
"eval_steps_per_second": 23.699, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 5.11, |
|
"grad_norm": 4.699713706970215, |
|
"learning_rate": 5.416810792113456e-05, |
|
"loss": 0.6311, |
|
"step": 2110 |
|
}, |
|
{ |
|
"epoch": 5.13, |
|
"grad_norm": 3.2119288444519043, |
|
"learning_rate": 5.347630577654791e-05, |
|
"loss": 0.4781, |
|
"step": 2120 |
|
}, |
|
{ |
|
"epoch": 5.16, |
|
"grad_norm": 5.083879470825195, |
|
"learning_rate": 5.278450363196126e-05, |
|
"loss": 0.5123, |
|
"step": 2130 |
|
}, |
|
{ |
|
"epoch": 5.18, |
|
"grad_norm": 3.2444283962249756, |
|
"learning_rate": 5.209270148737462e-05, |
|
"loss": 0.39, |
|
"step": 2140 |
|
}, |
|
{ |
|
"epoch": 5.21, |
|
"grad_norm": 2.9540326595306396, |
|
"learning_rate": 5.140089934278797e-05, |
|
"loss": 0.5081, |
|
"step": 2150 |
|
}, |
|
{ |
|
"epoch": 5.23, |
|
"grad_norm": 4.055675029754639, |
|
"learning_rate": 5.0709097198201316e-05, |
|
"loss": 0.5884, |
|
"step": 2160 |
|
}, |
|
{ |
|
"epoch": 5.25, |
|
"grad_norm": 2.7214150428771973, |
|
"learning_rate": 5.0017295053614664e-05, |
|
"loss": 0.5241, |
|
"step": 2170 |
|
}, |
|
{ |
|
"epoch": 5.28, |
|
"grad_norm": 1.249835729598999, |
|
"learning_rate": 4.932549290902802e-05, |
|
"loss": 0.3994, |
|
"step": 2180 |
|
}, |
|
{ |
|
"epoch": 5.3, |
|
"grad_norm": 5.9494829177856445, |
|
"learning_rate": 4.863369076444137e-05, |
|
"loss": 0.5295, |
|
"step": 2190 |
|
}, |
|
{ |
|
"epoch": 5.33, |
|
"grad_norm": 3.318251371383667, |
|
"learning_rate": 4.794188861985472e-05, |
|
"loss": 0.6272, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 5.33, |
|
"eval_accuracy": 0.8524871355060034, |
|
"eval_loss": 0.5477628707885742, |
|
"eval_runtime": 6.3218, |
|
"eval_samples_per_second": 184.442, |
|
"eval_steps_per_second": 23.095, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 5.35, |
|
"grad_norm": 4.580955505371094, |
|
"learning_rate": 4.725008647526807e-05, |
|
"loss": 0.5204, |
|
"step": 2210 |
|
}, |
|
{ |
|
"epoch": 5.38, |
|
"grad_norm": 1.7544004917144775, |
|
"learning_rate": 4.6558284330681426e-05, |
|
"loss": 0.5388, |
|
"step": 2220 |
|
}, |
|
{ |
|
"epoch": 5.4, |
|
"grad_norm": 4.454672336578369, |
|
"learning_rate": 4.586648218609478e-05, |
|
"loss": 0.4773, |
|
"step": 2230 |
|
}, |
|
{ |
|
"epoch": 5.42, |
|
"grad_norm": 7.039458274841309, |
|
"learning_rate": 4.517468004150813e-05, |
|
"loss": 0.5821, |
|
"step": 2240 |
|
}, |
|
{ |
|
"epoch": 5.45, |
|
"grad_norm": 4.6715006828308105, |
|
"learning_rate": 4.4482877896921485e-05, |
|
"loss": 0.5881, |
|
"step": 2250 |
|
}, |
|
{ |
|
"epoch": 5.47, |
|
"grad_norm": 3.3161721229553223, |
|
"learning_rate": 4.379107575233484e-05, |
|
"loss": 0.5587, |
|
"step": 2260 |
|
}, |
|
{ |
|
"epoch": 5.5, |
|
"grad_norm": 5.245398044586182, |
|
"learning_rate": 4.309927360774819e-05, |
|
"loss": 0.4973, |
|
"step": 2270 |
|
}, |
|
{ |
|
"epoch": 5.52, |
|
"grad_norm": 3.1016721725463867, |
|
"learning_rate": 4.2407471463161536e-05, |
|
"loss": 0.4004, |
|
"step": 2280 |
|
}, |
|
{ |
|
"epoch": 5.54, |
|
"grad_norm": 4.883015155792236, |
|
"learning_rate": 4.171566931857489e-05, |
|
"loss": 0.4267, |
|
"step": 2290 |
|
}, |
|
{ |
|
"epoch": 5.57, |
|
"grad_norm": 4.550380229949951, |
|
"learning_rate": 4.102386717398824e-05, |
|
"loss": 0.5124, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 5.57, |
|
"eval_accuracy": 0.8687821612349914, |
|
"eval_loss": 0.5295912623405457, |
|
"eval_runtime": 6.0432, |
|
"eval_samples_per_second": 192.944, |
|
"eval_steps_per_second": 24.159, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 5.59, |
|
"grad_norm": 3.2469303607940674, |
|
"learning_rate": 4.0332065029401595e-05, |
|
"loss": 0.4733, |
|
"step": 2310 |
|
}, |
|
{ |
|
"epoch": 5.62, |
|
"grad_norm": 5.656401634216309, |
|
"learning_rate": 3.964026288481494e-05, |
|
"loss": 0.5978, |
|
"step": 2320 |
|
}, |
|
{ |
|
"epoch": 5.64, |
|
"grad_norm": 2.7476541996002197, |
|
"learning_rate": 3.89484607402283e-05, |
|
"loss": 0.3513, |
|
"step": 2330 |
|
}, |
|
{ |
|
"epoch": 5.67, |
|
"grad_norm": 4.047815322875977, |
|
"learning_rate": 3.825665859564165e-05, |
|
"loss": 0.5287, |
|
"step": 2340 |
|
}, |
|
{ |
|
"epoch": 5.69, |
|
"grad_norm": 3.4885923862457275, |
|
"learning_rate": 3.7564856451055e-05, |
|
"loss": 0.4946, |
|
"step": 2350 |
|
}, |
|
{ |
|
"epoch": 5.71, |
|
"grad_norm": 7.513520240783691, |
|
"learning_rate": 3.687305430646835e-05, |
|
"loss": 0.6233, |
|
"step": 2360 |
|
}, |
|
{ |
|
"epoch": 5.74, |
|
"grad_norm": 2.3985989093780518, |
|
"learning_rate": 3.61812521618817e-05, |
|
"loss": 0.5149, |
|
"step": 2370 |
|
}, |
|
{ |
|
"epoch": 5.76, |
|
"grad_norm": 5.046018123626709, |
|
"learning_rate": 3.5489450017295054e-05, |
|
"loss": 0.4948, |
|
"step": 2380 |
|
}, |
|
{ |
|
"epoch": 5.79, |
|
"grad_norm": 2.6082875728607178, |
|
"learning_rate": 3.479764787270841e-05, |
|
"loss": 0.6084, |
|
"step": 2390 |
|
}, |
|
{ |
|
"epoch": 5.81, |
|
"grad_norm": 2.541283369064331, |
|
"learning_rate": 3.410584572812176e-05, |
|
"loss": 0.5659, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 5.81, |
|
"eval_accuracy": 0.8704974271012007, |
|
"eval_loss": 0.5180826783180237, |
|
"eval_runtime": 6.1391, |
|
"eval_samples_per_second": 189.929, |
|
"eval_steps_per_second": 23.782, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 5.84, |
|
"grad_norm": 2.853994846343994, |
|
"learning_rate": 3.341404358353511e-05, |
|
"loss": 0.6081, |
|
"step": 2410 |
|
}, |
|
{ |
|
"epoch": 5.86, |
|
"grad_norm": 4.628828525543213, |
|
"learning_rate": 3.272224143894847e-05, |
|
"loss": 0.4588, |
|
"step": 2420 |
|
}, |
|
{ |
|
"epoch": 5.88, |
|
"grad_norm": 3.1006319522857666, |
|
"learning_rate": 3.2030439294361816e-05, |
|
"loss": 0.4341, |
|
"step": 2430 |
|
}, |
|
{ |
|
"epoch": 5.91, |
|
"grad_norm": 2.395719528198242, |
|
"learning_rate": 3.133863714977517e-05, |
|
"loss": 0.442, |
|
"step": 2440 |
|
}, |
|
{ |
|
"epoch": 5.93, |
|
"grad_norm": 3.238839864730835, |
|
"learning_rate": 3.064683500518852e-05, |
|
"loss": 0.4359, |
|
"step": 2450 |
|
}, |
|
{ |
|
"epoch": 5.96, |
|
"grad_norm": 5.706843852996826, |
|
"learning_rate": 2.9955032860601867e-05, |
|
"loss": 0.5139, |
|
"step": 2460 |
|
}, |
|
{ |
|
"epoch": 5.98, |
|
"grad_norm": 6.059083461761475, |
|
"learning_rate": 2.9263230716015223e-05, |
|
"loss": 0.4459, |
|
"step": 2470 |
|
}, |
|
{ |
|
"epoch": 6.0, |
|
"grad_norm": 4.164783954620361, |
|
"learning_rate": 2.857142857142857e-05, |
|
"loss": 0.5037, |
|
"step": 2480 |
|
}, |
|
{ |
|
"epoch": 6.03, |
|
"grad_norm": 3.230203151702881, |
|
"learning_rate": 2.7879626426841926e-05, |
|
"loss": 0.4225, |
|
"step": 2490 |
|
}, |
|
{ |
|
"epoch": 6.05, |
|
"grad_norm": 5.467190742492676, |
|
"learning_rate": 2.7187824282255274e-05, |
|
"loss": 0.4212, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 6.05, |
|
"eval_accuracy": 0.8610634648370498, |
|
"eval_loss": 0.5200443267822266, |
|
"eval_runtime": 6.2608, |
|
"eval_samples_per_second": 186.239, |
|
"eval_steps_per_second": 23.32, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 6.08, |
|
"grad_norm": 3.7668442726135254, |
|
"learning_rate": 2.649602213766863e-05, |
|
"loss": 0.4042, |
|
"step": 2510 |
|
}, |
|
{ |
|
"epoch": 6.1, |
|
"grad_norm": 3.094477415084839, |
|
"learning_rate": 2.580421999308198e-05, |
|
"loss": 0.4338, |
|
"step": 2520 |
|
}, |
|
{ |
|
"epoch": 6.13, |
|
"grad_norm": 5.538024425506592, |
|
"learning_rate": 2.5112417848495333e-05, |
|
"loss": 0.3269, |
|
"step": 2530 |
|
}, |
|
{ |
|
"epoch": 6.15, |
|
"grad_norm": 5.658746719360352, |
|
"learning_rate": 2.4420615703908685e-05, |
|
"loss": 0.4719, |
|
"step": 2540 |
|
}, |
|
{ |
|
"epoch": 6.17, |
|
"grad_norm": 1.6886987686157227, |
|
"learning_rate": 2.3728813559322036e-05, |
|
"loss": 0.395, |
|
"step": 2550 |
|
}, |
|
{ |
|
"epoch": 6.2, |
|
"grad_norm": 3.538180112838745, |
|
"learning_rate": 2.3037011414735388e-05, |
|
"loss": 0.2877, |
|
"step": 2560 |
|
}, |
|
{ |
|
"epoch": 6.22, |
|
"grad_norm": 2.9912898540496826, |
|
"learning_rate": 2.234520927014874e-05, |
|
"loss": 0.4797, |
|
"step": 2570 |
|
}, |
|
{ |
|
"epoch": 6.25, |
|
"grad_norm": 2.68037748336792, |
|
"learning_rate": 2.1653407125562088e-05, |
|
"loss": 0.5114, |
|
"step": 2580 |
|
}, |
|
{ |
|
"epoch": 6.27, |
|
"grad_norm": 5.079796314239502, |
|
"learning_rate": 2.096160498097544e-05, |
|
"loss": 0.3604, |
|
"step": 2590 |
|
}, |
|
{ |
|
"epoch": 6.3, |
|
"grad_norm": 3.052543878555298, |
|
"learning_rate": 2.026980283638879e-05, |
|
"loss": 0.4338, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 6.3, |
|
"eval_accuracy": 0.8730703259005146, |
|
"eval_loss": 0.5135151743888855, |
|
"eval_runtime": 5.9846, |
|
"eval_samples_per_second": 194.834, |
|
"eval_steps_per_second": 24.396, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 6.32, |
|
"grad_norm": 5.780861854553223, |
|
"learning_rate": 1.9578000691802147e-05, |
|
"loss": 0.3725, |
|
"step": 2610 |
|
}, |
|
{ |
|
"epoch": 6.34, |
|
"grad_norm": 4.87053108215332, |
|
"learning_rate": 1.88861985472155e-05, |
|
"loss": 0.2491, |
|
"step": 2620 |
|
}, |
|
{ |
|
"epoch": 6.37, |
|
"grad_norm": 2.2995293140411377, |
|
"learning_rate": 1.819439640262885e-05, |
|
"loss": 0.2911, |
|
"step": 2630 |
|
}, |
|
{ |
|
"epoch": 6.39, |
|
"grad_norm": 1.6383118629455566, |
|
"learning_rate": 1.7502594258042202e-05, |
|
"loss": 0.2562, |
|
"step": 2640 |
|
}, |
|
{ |
|
"epoch": 6.42, |
|
"grad_norm": 4.9596991539001465, |
|
"learning_rate": 1.6810792113455554e-05, |
|
"loss": 0.5795, |
|
"step": 2650 |
|
}, |
|
{ |
|
"epoch": 6.44, |
|
"grad_norm": 2.922712802886963, |
|
"learning_rate": 1.6118989968868905e-05, |
|
"loss": 0.421, |
|
"step": 2660 |
|
}, |
|
{ |
|
"epoch": 6.46, |
|
"grad_norm": 2.0401623249053955, |
|
"learning_rate": 1.5427187824282254e-05, |
|
"loss": 0.4283, |
|
"step": 2670 |
|
}, |
|
{ |
|
"epoch": 6.49, |
|
"grad_norm": 0.9165148735046387, |
|
"learning_rate": 1.4735385679695607e-05, |
|
"loss": 0.4512, |
|
"step": 2680 |
|
}, |
|
{ |
|
"epoch": 6.51, |
|
"grad_norm": 4.587483882904053, |
|
"learning_rate": 1.4043583535108959e-05, |
|
"loss": 0.4664, |
|
"step": 2690 |
|
}, |
|
{ |
|
"epoch": 6.54, |
|
"grad_norm": 4.216481685638428, |
|
"learning_rate": 1.335178139052231e-05, |
|
"loss": 0.3407, |
|
"step": 2700 |
|
}, |
|
{ |
|
"epoch": 6.54, |
|
"eval_accuracy": 0.87221269296741, |
|
"eval_loss": 0.5147121548652649, |
|
"eval_runtime": 6.1635, |
|
"eval_samples_per_second": 189.179, |
|
"eval_steps_per_second": 23.688, |
|
"step": 2700 |
|
}, |
|
{ |
|
"epoch": 6.56, |
|
"grad_norm": 1.7551047801971436, |
|
"learning_rate": 1.2659979245935664e-05, |
|
"loss": 0.4725, |
|
"step": 2710 |
|
}, |
|
{ |
|
"epoch": 6.59, |
|
"grad_norm": 4.851523399353027, |
|
"learning_rate": 1.1968177101349016e-05, |
|
"loss": 0.4639, |
|
"step": 2720 |
|
}, |
|
{ |
|
"epoch": 6.61, |
|
"grad_norm": 6.040704727172852, |
|
"learning_rate": 1.1276374956762366e-05, |
|
"loss": 0.3146, |
|
"step": 2730 |
|
}, |
|
{ |
|
"epoch": 6.63, |
|
"grad_norm": 1.6925532817840576, |
|
"learning_rate": 1.0584572812175717e-05, |
|
"loss": 0.3665, |
|
"step": 2740 |
|
}, |
|
{ |
|
"epoch": 6.66, |
|
"grad_norm": 2.9491493701934814, |
|
"learning_rate": 9.89277066758907e-06, |
|
"loss": 0.467, |
|
"step": 2750 |
|
}, |
|
{ |
|
"epoch": 6.68, |
|
"grad_norm": 2.1744699478149414, |
|
"learning_rate": 9.200968523002422e-06, |
|
"loss": 0.3542, |
|
"step": 2760 |
|
}, |
|
{ |
|
"epoch": 6.71, |
|
"grad_norm": 3.170931577682495, |
|
"learning_rate": 8.509166378415774e-06, |
|
"loss": 0.5874, |
|
"step": 2770 |
|
}, |
|
{ |
|
"epoch": 6.73, |
|
"grad_norm": 3.2446773052215576, |
|
"learning_rate": 7.817364233829124e-06, |
|
"loss": 0.3705, |
|
"step": 2780 |
|
}, |
|
{ |
|
"epoch": 6.76, |
|
"grad_norm": 3.8055498600006104, |
|
"learning_rate": 7.125562089242477e-06, |
|
"loss": 0.3164, |
|
"step": 2790 |
|
}, |
|
{ |
|
"epoch": 6.78, |
|
"grad_norm": 2.3979437351226807, |
|
"learning_rate": 6.4337599446558285e-06, |
|
"loss": 0.4043, |
|
"step": 2800 |
|
}, |
|
{ |
|
"epoch": 6.78, |
|
"eval_accuracy": 0.869639794168096, |
|
"eval_loss": 0.5081329345703125, |
|
"eval_runtime": 6.6143, |
|
"eval_samples_per_second": 176.285, |
|
"eval_steps_per_second": 22.073, |
|
"step": 2800 |
|
}, |
|
{ |
|
"epoch": 6.8, |
|
"grad_norm": 1.7395985126495361, |
|
"learning_rate": 5.74195780006918e-06, |
|
"loss": 0.3624, |
|
"step": 2810 |
|
}, |
|
{ |
|
"epoch": 6.83, |
|
"grad_norm": 2.924905300140381, |
|
"learning_rate": 5.050155655482532e-06, |
|
"loss": 0.4046, |
|
"step": 2820 |
|
}, |
|
{ |
|
"epoch": 6.85, |
|
"grad_norm": 11.709400177001953, |
|
"learning_rate": 4.358353510895884e-06, |
|
"loss": 0.4807, |
|
"step": 2830 |
|
}, |
|
{ |
|
"epoch": 6.88, |
|
"grad_norm": 6.416582107543945, |
|
"learning_rate": 3.666551366309236e-06, |
|
"loss": 0.4782, |
|
"step": 2840 |
|
}, |
|
{ |
|
"epoch": 6.9, |
|
"grad_norm": 6.1391448974609375, |
|
"learning_rate": 2.9747492217225875e-06, |
|
"loss": 0.4852, |
|
"step": 2850 |
|
}, |
|
{ |
|
"epoch": 6.92, |
|
"grad_norm": 3.525520086288452, |
|
"learning_rate": 2.2829470771359392e-06, |
|
"loss": 0.4282, |
|
"step": 2860 |
|
}, |
|
{ |
|
"epoch": 6.95, |
|
"grad_norm": 1.4197200536727905, |
|
"learning_rate": 1.591144932549291e-06, |
|
"loss": 0.4337, |
|
"step": 2870 |
|
}, |
|
{ |
|
"epoch": 6.97, |
|
"grad_norm": 4.016748905181885, |
|
"learning_rate": 8.993427879626428e-07, |
|
"loss": 0.3915, |
|
"step": 2880 |
|
}, |
|
{ |
|
"epoch": 7.0, |
|
"grad_norm": 2.1515309810638428, |
|
"learning_rate": 2.0754064337599448e-07, |
|
"loss": 0.4095, |
|
"step": 2890 |
|
}, |
|
{ |
|
"epoch": 7.0, |
|
"step": 2891, |
|
"total_flos": 3.5833623598425784e+18, |
|
"train_loss": 0.7298227465279536, |
|
"train_runtime": 1041.6701, |
|
"train_samples_per_second": 44.372, |
|
"train_steps_per_second": 2.775 |
|
} |
|
], |
|
"logging_steps": 10, |
|
"max_steps": 2891, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 7, |
|
"save_steps": 100, |
|
"total_flos": 3.5833623598425784e+18, |
|
"train_batch_size": 16, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|