Llama-3-LongStory-LORA / trainer_state.json
Azazelle's picture
Duplicate from Blackroot/Llama-3-LongStory-LORA
8f52c40 verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 8.492822966507177,
"eval_steps": 500,
"global_step": 10650,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.011961722488038277,
"grad_norm": 1.9270328283309937,
"learning_rate": 4.998999599839936e-05,
"loss": 2.3527,
"step": 15
},
{
"epoch": 0.023923444976076555,
"grad_norm": 1.8812594413757324,
"learning_rate": 4.995998399359744e-05,
"loss": 2.3692,
"step": 30
},
{
"epoch": 0.03588516746411483,
"grad_norm": 1.3909887075424194,
"learning_rate": 4.9929971988795524e-05,
"loss": 2.3111,
"step": 45
},
{
"epoch": 0.04784688995215311,
"grad_norm": 2.809542417526245,
"learning_rate": 4.98999599839936e-05,
"loss": 2.2284,
"step": 60
},
{
"epoch": 0.05980861244019139,
"grad_norm": 3.4008336067199707,
"learning_rate": 4.986994797919168e-05,
"loss": 2.2569,
"step": 75
},
{
"epoch": 0.07177033492822966,
"grad_norm": 1.2219867706298828,
"learning_rate": 4.983993597438976e-05,
"loss": 2.1762,
"step": 90
},
{
"epoch": 0.08373205741626795,
"grad_norm": 1.3036127090454102,
"learning_rate": 4.9809923969587836e-05,
"loss": 2.1956,
"step": 105
},
{
"epoch": 0.09569377990430622,
"grad_norm": 1.468847393989563,
"learning_rate": 4.977991196478592e-05,
"loss": 2.2729,
"step": 120
},
{
"epoch": 0.1076555023923445,
"grad_norm": 1.2088780403137207,
"learning_rate": 4.9749899959984e-05,
"loss": 2.2164,
"step": 135
},
{
"epoch": 0.11961722488038277,
"grad_norm": 1.197135090827942,
"learning_rate": 4.9719887955182076e-05,
"loss": 2.1056,
"step": 150
},
{
"epoch": 0.13157894736842105,
"grad_norm": 1.309010624885559,
"learning_rate": 4.9689875950380154e-05,
"loss": 2.171,
"step": 165
},
{
"epoch": 0.14354066985645933,
"grad_norm": 1.3516101837158203,
"learning_rate": 4.965986394557823e-05,
"loss": 2.1898,
"step": 180
},
{
"epoch": 0.15550239234449761,
"grad_norm": 1.186513900756836,
"learning_rate": 4.962985194077631e-05,
"loss": 2.1427,
"step": 195
},
{
"epoch": 0.1674641148325359,
"grad_norm": 1.159972906112671,
"learning_rate": 4.959983993597439e-05,
"loss": 2.2603,
"step": 210
},
{
"epoch": 0.17942583732057416,
"grad_norm": 1.188928484916687,
"learning_rate": 4.956982793117247e-05,
"loss": 2.2281,
"step": 225
},
{
"epoch": 0.19138755980861244,
"grad_norm": 2.18959903717041,
"learning_rate": 4.953981592637055e-05,
"loss": 2.2187,
"step": 240
},
{
"epoch": 0.20334928229665072,
"grad_norm": 1.267388939857483,
"learning_rate": 4.9509803921568634e-05,
"loss": 2.1898,
"step": 255
},
{
"epoch": 0.215311004784689,
"grad_norm": 1.5959223508834839,
"learning_rate": 4.947979191676671e-05,
"loss": 2.1488,
"step": 270
},
{
"epoch": 0.22727272727272727,
"grad_norm": 1.081666350364685,
"learning_rate": 4.944977991196479e-05,
"loss": 2.2176,
"step": 285
},
{
"epoch": 0.23923444976076555,
"grad_norm": 1.1691621541976929,
"learning_rate": 4.941976790716287e-05,
"loss": 2.1297,
"step": 300
},
{
"epoch": 0.2511961722488038,
"grad_norm": 1.4069727659225464,
"learning_rate": 4.9389755902360946e-05,
"loss": 2.2035,
"step": 315
},
{
"epoch": 0.2631578947368421,
"grad_norm": 1.135937213897705,
"learning_rate": 4.9359743897559024e-05,
"loss": 2.1925,
"step": 330
},
{
"epoch": 0.2751196172248804,
"grad_norm": 1.0926917791366577,
"learning_rate": 4.93297318927571e-05,
"loss": 2.1679,
"step": 345
},
{
"epoch": 0.28708133971291866,
"grad_norm": 1.0808637142181396,
"learning_rate": 4.9299719887955186e-05,
"loss": 2.2122,
"step": 360
},
{
"epoch": 0.29904306220095694,
"grad_norm": 1.2694952487945557,
"learning_rate": 4.9269707883153264e-05,
"loss": 2.1643,
"step": 375
},
{
"epoch": 0.31100478468899523,
"grad_norm": 1.1682099103927612,
"learning_rate": 4.923969587835134e-05,
"loss": 2.2263,
"step": 390
},
{
"epoch": 0.3229665071770335,
"grad_norm": 1.1954610347747803,
"learning_rate": 4.920968387354942e-05,
"loss": 2.1555,
"step": 405
},
{
"epoch": 0.3349282296650718,
"grad_norm": 1.0608245134353638,
"learning_rate": 4.9179671868747504e-05,
"loss": 2.1918,
"step": 420
},
{
"epoch": 0.34688995215311,
"grad_norm": 1.2034133672714233,
"learning_rate": 4.914965986394558e-05,
"loss": 2.1101,
"step": 435
},
{
"epoch": 0.3588516746411483,
"grad_norm": 1.0936003923416138,
"learning_rate": 4.911964785914366e-05,
"loss": 2.137,
"step": 450
},
{
"epoch": 0.3708133971291866,
"grad_norm": 1.188496708869934,
"learning_rate": 4.908963585434174e-05,
"loss": 2.1864,
"step": 465
},
{
"epoch": 0.3827751196172249,
"grad_norm": 1.350693941116333,
"learning_rate": 4.905962384953982e-05,
"loss": 2.1491,
"step": 480
},
{
"epoch": 0.39473684210526316,
"grad_norm": 1.2483429908752441,
"learning_rate": 4.90296118447379e-05,
"loss": 2.1868,
"step": 495
},
{
"epoch": 0.40669856459330145,
"grad_norm": 1.1137944459915161,
"learning_rate": 4.899959983993598e-05,
"loss": 2.191,
"step": 510
},
{
"epoch": 0.41866028708133973,
"grad_norm": 1.3261072635650635,
"learning_rate": 4.8969587835134056e-05,
"loss": 2.1336,
"step": 525
},
{
"epoch": 0.430622009569378,
"grad_norm": 1.6815850734710693,
"learning_rate": 4.8939575830332134e-05,
"loss": 2.1524,
"step": 540
},
{
"epoch": 0.44258373205741625,
"grad_norm": 1.080824851989746,
"learning_rate": 4.890956382553021e-05,
"loss": 2.2056,
"step": 555
},
{
"epoch": 0.45454545454545453,
"grad_norm": 1.2140378952026367,
"learning_rate": 4.887955182072829e-05,
"loss": 2.2114,
"step": 570
},
{
"epoch": 0.4665071770334928,
"grad_norm": 1.1290125846862793,
"learning_rate": 4.884953981592637e-05,
"loss": 2.101,
"step": 585
},
{
"epoch": 0.4784688995215311,
"grad_norm": 1.171129822731018,
"learning_rate": 4.881952781112445e-05,
"loss": 2.2186,
"step": 600
},
{
"epoch": 0.4904306220095694,
"grad_norm": 1.99854576587677,
"learning_rate": 4.878951580632253e-05,
"loss": 2.154,
"step": 615
},
{
"epoch": 0.5023923444976076,
"grad_norm": 1.1021254062652588,
"learning_rate": 4.8759503801520615e-05,
"loss": 2.1066,
"step": 630
},
{
"epoch": 0.5143540669856459,
"grad_norm": 1.022976040840149,
"learning_rate": 4.872949179671869e-05,
"loss": 2.1642,
"step": 645
},
{
"epoch": 0.5263157894736842,
"grad_norm": 1.110926866531372,
"learning_rate": 4.869947979191677e-05,
"loss": 2.1592,
"step": 660
},
{
"epoch": 0.5382775119617225,
"grad_norm": 1.096807599067688,
"learning_rate": 4.866946778711485e-05,
"loss": 2.2171,
"step": 675
},
{
"epoch": 0.5502392344497608,
"grad_norm": 1.2465318441390991,
"learning_rate": 4.8639455782312926e-05,
"loss": 2.1794,
"step": 690
},
{
"epoch": 0.562200956937799,
"grad_norm": 1.6367931365966797,
"learning_rate": 4.8609443777511004e-05,
"loss": 2.1405,
"step": 705
},
{
"epoch": 0.5741626794258373,
"grad_norm": 1.3877207040786743,
"learning_rate": 4.857943177270909e-05,
"loss": 2.1781,
"step": 720
},
{
"epoch": 0.5861244019138756,
"grad_norm": 1.1698716878890991,
"learning_rate": 4.8549419767907166e-05,
"loss": 2.2076,
"step": 735
},
{
"epoch": 0.5980861244019139,
"grad_norm": 1.1922690868377686,
"learning_rate": 4.8519407763105244e-05,
"loss": 2.1515,
"step": 750
},
{
"epoch": 0.6100478468899522,
"grad_norm": 1.1112874746322632,
"learning_rate": 4.848939575830332e-05,
"loss": 2.0535,
"step": 765
},
{
"epoch": 0.6220095693779905,
"grad_norm": 1.3220607042312622,
"learning_rate": 4.84593837535014e-05,
"loss": 2.1978,
"step": 780
},
{
"epoch": 0.6339712918660287,
"grad_norm": 1.2560738325119019,
"learning_rate": 4.8429371748699484e-05,
"loss": 2.245,
"step": 795
},
{
"epoch": 0.645933014354067,
"grad_norm": 1.1312100887298584,
"learning_rate": 4.839935974389756e-05,
"loss": 2.1252,
"step": 810
},
{
"epoch": 0.6578947368421053,
"grad_norm": 1.2060538530349731,
"learning_rate": 4.836934773909564e-05,
"loss": 2.1268,
"step": 825
},
{
"epoch": 0.6698564593301436,
"grad_norm": 2.0435290336608887,
"learning_rate": 4.8339335734293725e-05,
"loss": 2.2091,
"step": 840
},
{
"epoch": 0.6818181818181818,
"grad_norm": 2.7680532932281494,
"learning_rate": 4.83093237294918e-05,
"loss": 2.0631,
"step": 855
},
{
"epoch": 0.69377990430622,
"grad_norm": 1.1256909370422363,
"learning_rate": 4.827931172468988e-05,
"loss": 2.1396,
"step": 870
},
{
"epoch": 0.7057416267942583,
"grad_norm": 1.1224644184112549,
"learning_rate": 4.824929971988796e-05,
"loss": 2.107,
"step": 885
},
{
"epoch": 0.7177033492822966,
"grad_norm": 1.2712397575378418,
"learning_rate": 4.8219287715086036e-05,
"loss": 2.1332,
"step": 900
},
{
"epoch": 0.7296650717703349,
"grad_norm": 1.2399568557739258,
"learning_rate": 4.8189275710284114e-05,
"loss": 2.1198,
"step": 915
},
{
"epoch": 0.7416267942583732,
"grad_norm": 1.0852080583572388,
"learning_rate": 4.815926370548219e-05,
"loss": 2.1436,
"step": 930
},
{
"epoch": 0.7535885167464115,
"grad_norm": 1.3282052278518677,
"learning_rate": 4.812925170068027e-05,
"loss": 2.1763,
"step": 945
},
{
"epoch": 0.7655502392344498,
"grad_norm": 1.8598517179489136,
"learning_rate": 4.809923969587835e-05,
"loss": 2.1188,
"step": 960
},
{
"epoch": 0.777511961722488,
"grad_norm": 1.1602433919906616,
"learning_rate": 4.806922769107643e-05,
"loss": 2.2234,
"step": 975
},
{
"epoch": 0.7894736842105263,
"grad_norm": 1.3578499555587769,
"learning_rate": 4.803921568627452e-05,
"loss": 2.1404,
"step": 990
},
{
"epoch": 0.8014354066985646,
"grad_norm": 1.4764407873153687,
"learning_rate": 4.8009203681472595e-05,
"loss": 2.1582,
"step": 1005
},
{
"epoch": 0.8133971291866029,
"grad_norm": 1.083958387374878,
"learning_rate": 4.797919167667067e-05,
"loss": 2.1156,
"step": 1020
},
{
"epoch": 0.8253588516746412,
"grad_norm": 1.2568596601486206,
"learning_rate": 4.794917967186875e-05,
"loss": 2.1341,
"step": 1035
},
{
"epoch": 0.8373205741626795,
"grad_norm": 1.1657259464263916,
"learning_rate": 4.791916766706683e-05,
"loss": 2.1245,
"step": 1050
},
{
"epoch": 0.8492822966507177,
"grad_norm": 2.355947256088257,
"learning_rate": 4.7889155662264906e-05,
"loss": 2.1975,
"step": 1065
},
{
"epoch": 0.861244019138756,
"grad_norm": 2.6566946506500244,
"learning_rate": 4.7859143657462984e-05,
"loss": 2.1263,
"step": 1080
},
{
"epoch": 0.8732057416267942,
"grad_norm": 1.2993121147155762,
"learning_rate": 4.782913165266107e-05,
"loss": 2.1481,
"step": 1095
},
{
"epoch": 0.8851674641148325,
"grad_norm": 1.129744291305542,
"learning_rate": 4.7799119647859146e-05,
"loss": 2.1574,
"step": 1110
},
{
"epoch": 0.8971291866028708,
"grad_norm": 1.1695717573165894,
"learning_rate": 4.7769107643057224e-05,
"loss": 2.0916,
"step": 1125
},
{
"epoch": 0.9090909090909091,
"grad_norm": 1.159279465675354,
"learning_rate": 4.77390956382553e-05,
"loss": 2.1265,
"step": 1140
},
{
"epoch": 0.9210526315789473,
"grad_norm": 1.2150417566299438,
"learning_rate": 4.770908363345338e-05,
"loss": 2.1351,
"step": 1155
},
{
"epoch": 0.9330143540669856,
"grad_norm": 1.2673773765563965,
"learning_rate": 4.7679071628651465e-05,
"loss": 2.2444,
"step": 1170
},
{
"epoch": 0.9449760765550239,
"grad_norm": 1.1746214628219604,
"learning_rate": 4.764905962384954e-05,
"loss": 2.1371,
"step": 1185
},
{
"epoch": 0.9569377990430622,
"grad_norm": 1.3716073036193848,
"learning_rate": 4.761904761904762e-05,
"loss": 2.1414,
"step": 1200
},
{
"epoch": 0.9688995215311005,
"grad_norm": 1.1066573858261108,
"learning_rate": 4.7589035614245705e-05,
"loss": 2.0949,
"step": 1215
},
{
"epoch": 0.9808612440191388,
"grad_norm": 1.1547194719314575,
"learning_rate": 4.755902360944378e-05,
"loss": 2.1023,
"step": 1230
},
{
"epoch": 0.992822966507177,
"grad_norm": 1.5456453561782837,
"learning_rate": 4.752901160464186e-05,
"loss": 2.1542,
"step": 1245
},
{
"epoch": 1.0047846889952152,
"grad_norm": 1.7362697124481201,
"learning_rate": 4.749899959983994e-05,
"loss": 2.0444,
"step": 1260
},
{
"epoch": 1.0167464114832536,
"grad_norm": 5.408290386199951,
"learning_rate": 4.7468987595038016e-05,
"loss": 1.8079,
"step": 1275
},
{
"epoch": 1.0287081339712918,
"grad_norm": 3.33227276802063,
"learning_rate": 4.7438975590236094e-05,
"loss": 1.9851,
"step": 1290
},
{
"epoch": 1.0406698564593302,
"grad_norm": 1.4184224605560303,
"learning_rate": 4.740896358543417e-05,
"loss": 1.8732,
"step": 1305
},
{
"epoch": 1.0526315789473684,
"grad_norm": 1.5775929689407349,
"learning_rate": 4.737895158063225e-05,
"loss": 1.9714,
"step": 1320
},
{
"epoch": 1.0645933014354068,
"grad_norm": 1.4744929075241089,
"learning_rate": 4.7348939575830335e-05,
"loss": 1.8901,
"step": 1335
},
{
"epoch": 1.076555023923445,
"grad_norm": 1.5280168056488037,
"learning_rate": 4.731892757102841e-05,
"loss": 1.9348,
"step": 1350
},
{
"epoch": 1.0885167464114833,
"grad_norm": 1.2531495094299316,
"learning_rate": 4.72889155662265e-05,
"loss": 1.83,
"step": 1365
},
{
"epoch": 1.1004784688995215,
"grad_norm": 1.3821693658828735,
"learning_rate": 4.7258903561424575e-05,
"loss": 1.7183,
"step": 1380
},
{
"epoch": 1.11244019138756,
"grad_norm": 1.3789594173431396,
"learning_rate": 4.722889155662265e-05,
"loss": 1.8931,
"step": 1395
},
{
"epoch": 1.124401913875598,
"grad_norm": 1.2702490091323853,
"learning_rate": 4.719887955182073e-05,
"loss": 1.7617,
"step": 1410
},
{
"epoch": 1.1363636363636362,
"grad_norm": 1.4505800008773804,
"learning_rate": 4.716886754701881e-05,
"loss": 1.9103,
"step": 1425
},
{
"epoch": 1.1483253588516746,
"grad_norm": 1.612985610961914,
"learning_rate": 4.7138855542216886e-05,
"loss": 1.9471,
"step": 1440
},
{
"epoch": 1.160287081339713,
"grad_norm": 1.2852972745895386,
"learning_rate": 4.710884353741497e-05,
"loss": 1.9249,
"step": 1455
},
{
"epoch": 1.1722488038277512,
"grad_norm": 1.385501503944397,
"learning_rate": 4.707883153261305e-05,
"loss": 1.8883,
"step": 1470
},
{
"epoch": 1.1842105263157894,
"grad_norm": 1.4401298761367798,
"learning_rate": 4.704881952781113e-05,
"loss": 1.94,
"step": 1485
},
{
"epoch": 1.1961722488038278,
"grad_norm": 3.9501471519470215,
"learning_rate": 4.7018807523009204e-05,
"loss": 1.893,
"step": 1500
},
{
"epoch": 1.208133971291866,
"grad_norm": 1.3335622549057007,
"learning_rate": 4.698879551820728e-05,
"loss": 1.7215,
"step": 1515
},
{
"epoch": 1.2200956937799043,
"grad_norm": 1.6928309202194214,
"learning_rate": 4.695878351340536e-05,
"loss": 1.8889,
"step": 1530
},
{
"epoch": 1.2320574162679425,
"grad_norm": 1.2327487468719482,
"learning_rate": 4.6928771508603445e-05,
"loss": 1.8503,
"step": 1545
},
{
"epoch": 1.244019138755981,
"grad_norm": 1.3527581691741943,
"learning_rate": 4.689875950380152e-05,
"loss": 1.7963,
"step": 1560
},
{
"epoch": 1.255980861244019,
"grad_norm": 1.4024996757507324,
"learning_rate": 4.686874749899961e-05,
"loss": 1.8679,
"step": 1575
},
{
"epoch": 1.2679425837320575,
"grad_norm": 1.6798954010009766,
"learning_rate": 4.6838735494197685e-05,
"loss": 1.8944,
"step": 1590
},
{
"epoch": 1.2799043062200957,
"grad_norm": 1.4541043043136597,
"learning_rate": 4.680872348939576e-05,
"loss": 1.9555,
"step": 1605
},
{
"epoch": 1.291866028708134,
"grad_norm": 1.503612756729126,
"learning_rate": 4.677871148459384e-05,
"loss": 1.8223,
"step": 1620
},
{
"epoch": 1.3038277511961722,
"grad_norm": 1.4559051990509033,
"learning_rate": 4.674869947979192e-05,
"loss": 1.8442,
"step": 1635
},
{
"epoch": 1.3157894736842106,
"grad_norm": 1.3559598922729492,
"learning_rate": 4.6718687474989997e-05,
"loss": 1.933,
"step": 1650
},
{
"epoch": 1.3277511961722488,
"grad_norm": 1.3937571048736572,
"learning_rate": 4.6688675470188074e-05,
"loss": 1.864,
"step": 1665
},
{
"epoch": 1.339712918660287,
"grad_norm": 1.356520175933838,
"learning_rate": 4.665866346538615e-05,
"loss": 1.856,
"step": 1680
},
{
"epoch": 1.3516746411483254,
"grad_norm": 1.6281076669692993,
"learning_rate": 4.662865146058424e-05,
"loss": 1.8623,
"step": 1695
},
{
"epoch": 1.3636363636363638,
"grad_norm": 1.390368103981018,
"learning_rate": 4.6598639455782315e-05,
"loss": 1.8775,
"step": 1710
},
{
"epoch": 1.375598086124402,
"grad_norm": 1.575172781944275,
"learning_rate": 4.656862745098039e-05,
"loss": 1.9558,
"step": 1725
},
{
"epoch": 1.38755980861244,
"grad_norm": 1.6121597290039062,
"learning_rate": 4.653861544617848e-05,
"loss": 1.8698,
"step": 1740
},
{
"epoch": 1.3995215311004785,
"grad_norm": 1.4013128280639648,
"learning_rate": 4.6508603441376555e-05,
"loss": 1.8567,
"step": 1755
},
{
"epoch": 1.4114832535885167,
"grad_norm": 1.636841893196106,
"learning_rate": 4.647859143657463e-05,
"loss": 1.8708,
"step": 1770
},
{
"epoch": 1.423444976076555,
"grad_norm": 1.6554105281829834,
"learning_rate": 4.644857943177271e-05,
"loss": 1.9281,
"step": 1785
},
{
"epoch": 1.4354066985645932,
"grad_norm": 1.7569769620895386,
"learning_rate": 4.641856742697079e-05,
"loss": 1.8563,
"step": 1800
},
{
"epoch": 1.4473684210526316,
"grad_norm": 1.5896693468093872,
"learning_rate": 4.638855542216887e-05,
"loss": 1.8764,
"step": 1815
},
{
"epoch": 1.4593301435406698,
"grad_norm": 1.3887263536453247,
"learning_rate": 4.635854341736695e-05,
"loss": 1.8871,
"step": 1830
},
{
"epoch": 1.4712918660287082,
"grad_norm": 1.6596853733062744,
"learning_rate": 4.632853141256503e-05,
"loss": 1.9176,
"step": 1845
},
{
"epoch": 1.4832535885167464,
"grad_norm": 1.6174405813217163,
"learning_rate": 4.629851940776311e-05,
"loss": 1.8109,
"step": 1860
},
{
"epoch": 1.4952153110047846,
"grad_norm": 1.3717613220214844,
"learning_rate": 4.6268507402961185e-05,
"loss": 1.867,
"step": 1875
},
{
"epoch": 1.507177033492823,
"grad_norm": 1.4477450847625732,
"learning_rate": 4.623849539815926e-05,
"loss": 1.929,
"step": 1890
},
{
"epoch": 1.5191387559808613,
"grad_norm": 1.4237533807754517,
"learning_rate": 4.620848339335734e-05,
"loss": 1.8444,
"step": 1905
},
{
"epoch": 1.5311004784688995,
"grad_norm": 1.41818106174469,
"learning_rate": 4.6178471388555425e-05,
"loss": 1.8505,
"step": 1920
},
{
"epoch": 1.5430622009569377,
"grad_norm": 1.5824397802352905,
"learning_rate": 4.61484593837535e-05,
"loss": 1.773,
"step": 1935
},
{
"epoch": 1.555023923444976,
"grad_norm": 1.6391881704330444,
"learning_rate": 4.611844737895159e-05,
"loss": 1.9057,
"step": 1950
},
{
"epoch": 1.5669856459330145,
"grad_norm": 1.5484305620193481,
"learning_rate": 4.6088435374149665e-05,
"loss": 1.9141,
"step": 1965
},
{
"epoch": 1.5789473684210527,
"grad_norm": 1.4594415426254272,
"learning_rate": 4.605842336934774e-05,
"loss": 1.8732,
"step": 1980
},
{
"epoch": 1.5909090909090908,
"grad_norm": 1.3924568891525269,
"learning_rate": 4.602841136454582e-05,
"loss": 1.9441,
"step": 1995
},
{
"epoch": 1.6028708133971292,
"grad_norm": 1.523986577987671,
"learning_rate": 4.59983993597439e-05,
"loss": 1.9101,
"step": 2010
},
{
"epoch": 1.6148325358851676,
"grad_norm": 1.369285225868225,
"learning_rate": 4.596838735494198e-05,
"loss": 1.8829,
"step": 2025
},
{
"epoch": 1.6267942583732058,
"grad_norm": 1.4909306764602661,
"learning_rate": 4.5938375350140055e-05,
"loss": 1.9204,
"step": 2040
},
{
"epoch": 1.638755980861244,
"grad_norm": 1.5464478731155396,
"learning_rate": 4.590836334533814e-05,
"loss": 1.8064,
"step": 2055
},
{
"epoch": 1.6507177033492821,
"grad_norm": 1.5255078077316284,
"learning_rate": 4.587835134053622e-05,
"loss": 1.9518,
"step": 2070
},
{
"epoch": 1.6626794258373205,
"grad_norm": 1.3710672855377197,
"learning_rate": 4.5848339335734295e-05,
"loss": 1.8957,
"step": 2085
},
{
"epoch": 1.674641148325359,
"grad_norm": 1.4883019924163818,
"learning_rate": 4.581832733093237e-05,
"loss": 1.8884,
"step": 2100
},
{
"epoch": 1.686602870813397,
"grad_norm": 1.383284091949463,
"learning_rate": 4.578831532613046e-05,
"loss": 1.8924,
"step": 2115
},
{
"epoch": 1.6985645933014353,
"grad_norm": 1.5126210451126099,
"learning_rate": 4.5758303321328535e-05,
"loss": 1.9423,
"step": 2130
},
{
"epoch": 1.7105263157894737,
"grad_norm": 1.4830104112625122,
"learning_rate": 4.572829131652661e-05,
"loss": 1.9377,
"step": 2145
},
{
"epoch": 1.722488038277512,
"grad_norm": 1.578748106956482,
"learning_rate": 4.569827931172469e-05,
"loss": 1.8532,
"step": 2160
},
{
"epoch": 1.7344497607655502,
"grad_norm": 3.1164207458496094,
"learning_rate": 4.5668267306922776e-05,
"loss": 1.9072,
"step": 2175
},
{
"epoch": 1.7464114832535884,
"grad_norm": 1.5984658002853394,
"learning_rate": 4.5638255302120853e-05,
"loss": 1.9674,
"step": 2190
},
{
"epoch": 1.7583732057416268,
"grad_norm": 1.5007200241088867,
"learning_rate": 4.560824329731893e-05,
"loss": 1.93,
"step": 2205
},
{
"epoch": 1.7703349282296652,
"grad_norm": 2.623798131942749,
"learning_rate": 4.557823129251701e-05,
"loss": 1.9068,
"step": 2220
},
{
"epoch": 1.7822966507177034,
"grad_norm": 2.1396572589874268,
"learning_rate": 4.554821928771509e-05,
"loss": 1.886,
"step": 2235
},
{
"epoch": 1.7942583732057416,
"grad_norm": 1.5055629014968872,
"learning_rate": 4.5518207282913165e-05,
"loss": 1.8678,
"step": 2250
},
{
"epoch": 1.80622009569378,
"grad_norm": 1.4418485164642334,
"learning_rate": 4.548819527811124e-05,
"loss": 1.984,
"step": 2265
},
{
"epoch": 1.8181818181818183,
"grad_norm": 1.5159984827041626,
"learning_rate": 4.545818327330932e-05,
"loss": 1.9688,
"step": 2280
},
{
"epoch": 1.8301435406698565,
"grad_norm": 1.299607753753662,
"learning_rate": 4.5428171268507405e-05,
"loss": 1.9347,
"step": 2295
},
{
"epoch": 1.8421052631578947,
"grad_norm": 1.4144442081451416,
"learning_rate": 4.539815926370549e-05,
"loss": 1.8877,
"step": 2310
},
{
"epoch": 1.8540669856459329,
"grad_norm": 1.5180310010910034,
"learning_rate": 4.536814725890357e-05,
"loss": 1.9392,
"step": 2325
},
{
"epoch": 1.8660287081339713,
"grad_norm": 1.475977897644043,
"learning_rate": 4.5338135254101645e-05,
"loss": 1.8535,
"step": 2340
},
{
"epoch": 1.8779904306220097,
"grad_norm": 1.4614003896713257,
"learning_rate": 4.530812324929972e-05,
"loss": 1.9246,
"step": 2355
},
{
"epoch": 1.8899521531100478,
"grad_norm": 1.4736562967300415,
"learning_rate": 4.52781112444978e-05,
"loss": 1.9095,
"step": 2370
},
{
"epoch": 1.901913875598086,
"grad_norm": 1.3201289176940918,
"learning_rate": 4.524809923969588e-05,
"loss": 1.8479,
"step": 2385
},
{
"epoch": 1.9138755980861244,
"grad_norm": 1.4976378679275513,
"learning_rate": 4.521808723489396e-05,
"loss": 1.8262,
"step": 2400
},
{
"epoch": 1.9258373205741628,
"grad_norm": 1.5323299169540405,
"learning_rate": 4.5188075230092035e-05,
"loss": 1.8989,
"step": 2415
},
{
"epoch": 1.937799043062201,
"grad_norm": 2.050426483154297,
"learning_rate": 4.515806322529012e-05,
"loss": 1.8958,
"step": 2430
},
{
"epoch": 1.9497607655502391,
"grad_norm": 1.822324514389038,
"learning_rate": 4.51280512204882e-05,
"loss": 1.99,
"step": 2445
},
{
"epoch": 1.9617224880382775,
"grad_norm": 1.5009537935256958,
"learning_rate": 4.5098039215686275e-05,
"loss": 1.8561,
"step": 2460
},
{
"epoch": 1.973684210526316,
"grad_norm": 1.3751215934753418,
"learning_rate": 4.506802721088435e-05,
"loss": 1.9033,
"step": 2475
},
{
"epoch": 1.985645933014354,
"grad_norm": 1.6106884479522705,
"learning_rate": 4.503801520608244e-05,
"loss": 1.9555,
"step": 2490
},
{
"epoch": 1.9976076555023923,
"grad_norm": 1.5378204584121704,
"learning_rate": 4.5008003201280515e-05,
"loss": 2.0009,
"step": 2505
},
{
"epoch": 2.0095693779904304,
"grad_norm": 2.0536139011383057,
"learning_rate": 4.497799119647859e-05,
"loss": 1.7212,
"step": 2520
},
{
"epoch": 2.021531100478469,
"grad_norm": 1.7498282194137573,
"learning_rate": 4.494797919167667e-05,
"loss": 1.5574,
"step": 2535
},
{
"epoch": 2.0334928229665072,
"grad_norm": 1.7728687524795532,
"learning_rate": 4.4917967186874756e-05,
"loss": 1.4411,
"step": 2550
},
{
"epoch": 2.0454545454545454,
"grad_norm": 1.8067642450332642,
"learning_rate": 4.4887955182072834e-05,
"loss": 1.5242,
"step": 2565
},
{
"epoch": 2.0574162679425836,
"grad_norm": 1.924641489982605,
"learning_rate": 4.485794317727091e-05,
"loss": 1.5415,
"step": 2580
},
{
"epoch": 2.069377990430622,
"grad_norm": 1.9768836498260498,
"learning_rate": 4.482793117246899e-05,
"loss": 1.6774,
"step": 2595
},
{
"epoch": 2.0813397129186604,
"grad_norm": 1.943829894065857,
"learning_rate": 4.479791916766707e-05,
"loss": 1.6263,
"step": 2610
},
{
"epoch": 2.0933014354066986,
"grad_norm": 2.1001622676849365,
"learning_rate": 4.4767907162865145e-05,
"loss": 1.6304,
"step": 2625
},
{
"epoch": 2.1052631578947367,
"grad_norm": 2.0388505458831787,
"learning_rate": 4.473789515806322e-05,
"loss": 1.4718,
"step": 2640
},
{
"epoch": 2.117224880382775,
"grad_norm": 1.884468913078308,
"learning_rate": 4.47078831532613e-05,
"loss": 1.5752,
"step": 2655
},
{
"epoch": 2.1291866028708135,
"grad_norm": 1.9775267839431763,
"learning_rate": 4.4677871148459385e-05,
"loss": 1.478,
"step": 2670
},
{
"epoch": 2.1411483253588517,
"grad_norm": 1.8365753889083862,
"learning_rate": 4.464785914365747e-05,
"loss": 1.5408,
"step": 2685
},
{
"epoch": 2.15311004784689,
"grad_norm": 1.8778951168060303,
"learning_rate": 4.461784713885555e-05,
"loss": 1.6373,
"step": 2700
},
{
"epoch": 2.165071770334928,
"grad_norm": 1.9629762172698975,
"learning_rate": 4.4587835134053626e-05,
"loss": 1.5741,
"step": 2715
},
{
"epoch": 2.1770334928229667,
"grad_norm": 2.0409107208251953,
"learning_rate": 4.4557823129251704e-05,
"loss": 1.6216,
"step": 2730
},
{
"epoch": 2.188995215311005,
"grad_norm": 2.1008028984069824,
"learning_rate": 4.452781112444978e-05,
"loss": 1.5515,
"step": 2745
},
{
"epoch": 2.200956937799043,
"grad_norm": 2.2391457557678223,
"learning_rate": 4.449779911964786e-05,
"loss": 1.6279,
"step": 2760
},
{
"epoch": 2.212918660287081,
"grad_norm": 2.294734239578247,
"learning_rate": 4.446778711484594e-05,
"loss": 1.5232,
"step": 2775
},
{
"epoch": 2.22488038277512,
"grad_norm": 1.6631484031677246,
"learning_rate": 4.443777511004402e-05,
"loss": 1.5113,
"step": 2790
},
{
"epoch": 2.236842105263158,
"grad_norm": 1.9847686290740967,
"learning_rate": 4.44077631052421e-05,
"loss": 1.5006,
"step": 2805
},
{
"epoch": 2.248803827751196,
"grad_norm": 1.8953202962875366,
"learning_rate": 4.437775110044018e-05,
"loss": 1.5853,
"step": 2820
},
{
"epoch": 2.2607655502392343,
"grad_norm": 1.9015896320343018,
"learning_rate": 4.4347739095638255e-05,
"loss": 1.6078,
"step": 2835
},
{
"epoch": 2.2727272727272725,
"grad_norm": 1.900415301322937,
"learning_rate": 4.431772709083633e-05,
"loss": 1.5399,
"step": 2850
},
{
"epoch": 2.284688995215311,
"grad_norm": 1.9138609170913696,
"learning_rate": 4.428771508603442e-05,
"loss": 1.589,
"step": 2865
},
{
"epoch": 2.2966507177033493,
"grad_norm": 1.7661852836608887,
"learning_rate": 4.4257703081232496e-05,
"loss": 1.6258,
"step": 2880
},
{
"epoch": 2.3086124401913874,
"grad_norm": 1.9043537378311157,
"learning_rate": 4.4227691076430573e-05,
"loss": 1.6243,
"step": 2895
},
{
"epoch": 2.320574162679426,
"grad_norm": 1.8166050910949707,
"learning_rate": 4.419767907162866e-05,
"loss": 1.5999,
"step": 2910
},
{
"epoch": 2.3325358851674642,
"grad_norm": 1.7325972318649292,
"learning_rate": 4.4167667066826736e-05,
"loss": 1.586,
"step": 2925
},
{
"epoch": 2.3444976076555024,
"grad_norm": 1.8609052896499634,
"learning_rate": 4.4137655062024814e-05,
"loss": 1.5466,
"step": 2940
},
{
"epoch": 2.3564593301435406,
"grad_norm": 3.3115549087524414,
"learning_rate": 4.410764305722289e-05,
"loss": 1.5816,
"step": 2955
},
{
"epoch": 2.3684210526315788,
"grad_norm": 2.2015438079833984,
"learning_rate": 4.407763105242097e-05,
"loss": 1.5162,
"step": 2970
},
{
"epoch": 2.3803827751196174,
"grad_norm": 1.7339051961898804,
"learning_rate": 4.404761904761905e-05,
"loss": 1.5764,
"step": 2985
},
{
"epoch": 2.3923444976076556,
"grad_norm": 2.817207098007202,
"learning_rate": 4.4017607042817125e-05,
"loss": 1.5633,
"step": 3000
},
{
"epoch": 2.4043062200956937,
"grad_norm": 2.063880681991577,
"learning_rate": 4.39875950380152e-05,
"loss": 1.604,
"step": 3015
},
{
"epoch": 2.416267942583732,
"grad_norm": 1.8153194189071655,
"learning_rate": 4.395758303321329e-05,
"loss": 1.6417,
"step": 3030
},
{
"epoch": 2.4282296650717705,
"grad_norm": 3.646466016769409,
"learning_rate": 4.3927571028411365e-05,
"loss": 1.6325,
"step": 3045
},
{
"epoch": 2.4401913875598087,
"grad_norm": 1.9638229608535767,
"learning_rate": 4.389755902360945e-05,
"loss": 1.6393,
"step": 3060
},
{
"epoch": 2.452153110047847,
"grad_norm": 2.549917697906494,
"learning_rate": 4.386754701880753e-05,
"loss": 1.6231,
"step": 3075
},
{
"epoch": 2.464114832535885,
"grad_norm": 1.8698160648345947,
"learning_rate": 4.3837535014005606e-05,
"loss": 1.4995,
"step": 3090
},
{
"epoch": 2.4760765550239237,
"grad_norm": 1.8844027519226074,
"learning_rate": 4.3807523009203684e-05,
"loss": 1.6133,
"step": 3105
},
{
"epoch": 2.488038277511962,
"grad_norm": 2.275132417678833,
"learning_rate": 4.377751100440176e-05,
"loss": 1.6124,
"step": 3120
},
{
"epoch": 2.5,
"grad_norm": 1.729272723197937,
"learning_rate": 4.374749899959984e-05,
"loss": 1.6766,
"step": 3135
},
{
"epoch": 2.511961722488038,
"grad_norm": 1.9503229856491089,
"learning_rate": 4.3717486994797924e-05,
"loss": 1.6937,
"step": 3150
},
{
"epoch": 2.5239234449760763,
"grad_norm": 1.8774380683898926,
"learning_rate": 4.3687474989996e-05,
"loss": 1.6159,
"step": 3165
},
{
"epoch": 2.535885167464115,
"grad_norm": 2.066387176513672,
"learning_rate": 4.365746298519408e-05,
"loss": 1.6234,
"step": 3180
},
{
"epoch": 2.547846889952153,
"grad_norm": 2.7428183555603027,
"learning_rate": 4.362745098039216e-05,
"loss": 1.5469,
"step": 3195
},
{
"epoch": 2.5598086124401913,
"grad_norm": 1.9833886623382568,
"learning_rate": 4.3597438975590235e-05,
"loss": 1.5982,
"step": 3210
},
{
"epoch": 2.57177033492823,
"grad_norm": 1.7080726623535156,
"learning_rate": 4.356742697078831e-05,
"loss": 1.5975,
"step": 3225
},
{
"epoch": 2.583732057416268,
"grad_norm": 1.9213649034500122,
"learning_rate": 4.35374149659864e-05,
"loss": 1.5921,
"step": 3240
},
{
"epoch": 2.5956937799043063,
"grad_norm": 2.0085928440093994,
"learning_rate": 4.3507402961184476e-05,
"loss": 1.5904,
"step": 3255
},
{
"epoch": 2.6076555023923444,
"grad_norm": 1.903548002243042,
"learning_rate": 4.347739095638256e-05,
"loss": 1.5794,
"step": 3270
},
{
"epoch": 2.6196172248803826,
"grad_norm": 1.8258320093154907,
"learning_rate": 4.344737895158064e-05,
"loss": 1.6408,
"step": 3285
},
{
"epoch": 2.6315789473684212,
"grad_norm": 2.0597989559173584,
"learning_rate": 4.3417366946778716e-05,
"loss": 1.5868,
"step": 3300
},
{
"epoch": 2.6435406698564594,
"grad_norm": 2.0705902576446533,
"learning_rate": 4.3387354941976794e-05,
"loss": 1.6906,
"step": 3315
},
{
"epoch": 2.6555023923444976,
"grad_norm": 1.9880789518356323,
"learning_rate": 4.335734293717487e-05,
"loss": 1.5963,
"step": 3330
},
{
"epoch": 2.6674641148325358,
"grad_norm": 2.0182063579559326,
"learning_rate": 4.332733093237295e-05,
"loss": 1.6478,
"step": 3345
},
{
"epoch": 2.679425837320574,
"grad_norm": 1.9995989799499512,
"learning_rate": 4.329731892757103e-05,
"loss": 1.653,
"step": 3360
},
{
"epoch": 2.6913875598086126,
"grad_norm": 2.738987922668457,
"learning_rate": 4.3267306922769105e-05,
"loss": 1.6505,
"step": 3375
},
{
"epoch": 2.7033492822966507,
"grad_norm": 2.058044672012329,
"learning_rate": 4.323729491796719e-05,
"loss": 1.5528,
"step": 3390
},
{
"epoch": 2.715311004784689,
"grad_norm": 2.0416853427886963,
"learning_rate": 4.320728291316527e-05,
"loss": 1.5553,
"step": 3405
},
{
"epoch": 2.7272727272727275,
"grad_norm": 1.9002925157546997,
"learning_rate": 4.3177270908363346e-05,
"loss": 1.5736,
"step": 3420
},
{
"epoch": 2.7392344497607657,
"grad_norm": 1.8847737312316895,
"learning_rate": 4.314725890356143e-05,
"loss": 1.6232,
"step": 3435
},
{
"epoch": 2.751196172248804,
"grad_norm": 1.9627894163131714,
"learning_rate": 4.311724689875951e-05,
"loss": 1.6496,
"step": 3450
},
{
"epoch": 2.763157894736842,
"grad_norm": 1.823258638381958,
"learning_rate": 4.3087234893957586e-05,
"loss": 1.584,
"step": 3465
},
{
"epoch": 2.77511961722488,
"grad_norm": 3.361528158187866,
"learning_rate": 4.3057222889155664e-05,
"loss": 1.6163,
"step": 3480
},
{
"epoch": 2.787081339712919,
"grad_norm": 2.01798677444458,
"learning_rate": 4.302721088435374e-05,
"loss": 1.4596,
"step": 3495
},
{
"epoch": 2.799043062200957,
"grad_norm": 1.9381790161132812,
"learning_rate": 4.2997198879551826e-05,
"loss": 1.6621,
"step": 3510
},
{
"epoch": 2.811004784688995,
"grad_norm": 2.0217368602752686,
"learning_rate": 4.2967186874749904e-05,
"loss": 1.6089,
"step": 3525
},
{
"epoch": 2.8229665071770333,
"grad_norm": 1.7677721977233887,
"learning_rate": 4.293717486994798e-05,
"loss": 1.6052,
"step": 3540
},
{
"epoch": 2.8349282296650715,
"grad_norm": 1.9464062452316284,
"learning_rate": 4.290716286514606e-05,
"loss": 1.6751,
"step": 3555
},
{
"epoch": 2.84688995215311,
"grad_norm": 1.9557422399520874,
"learning_rate": 4.287715086034414e-05,
"loss": 1.5964,
"step": 3570
},
{
"epoch": 2.8588516746411483,
"grad_norm": 3.1278235912323,
"learning_rate": 4.2847138855542216e-05,
"loss": 1.6272,
"step": 3585
},
{
"epoch": 2.8708133971291865,
"grad_norm": 1.8671112060546875,
"learning_rate": 4.2817126850740293e-05,
"loss": 1.6573,
"step": 3600
},
{
"epoch": 2.882775119617225,
"grad_norm": 1.9375852346420288,
"learning_rate": 4.278711484593838e-05,
"loss": 1.6407,
"step": 3615
},
{
"epoch": 2.8947368421052633,
"grad_norm": 1.907958984375,
"learning_rate": 4.275710284113646e-05,
"loss": 1.6272,
"step": 3630
},
{
"epoch": 2.9066985645933014,
"grad_norm": 2.1269607543945312,
"learning_rate": 4.272709083633454e-05,
"loss": 1.5664,
"step": 3645
},
{
"epoch": 2.9186602870813396,
"grad_norm": 1.766072392463684,
"learning_rate": 4.269707883153262e-05,
"loss": 1.6766,
"step": 3660
},
{
"epoch": 2.930622009569378,
"grad_norm": 2.157346248626709,
"learning_rate": 4.2667066826730696e-05,
"loss": 1.6374,
"step": 3675
},
{
"epoch": 2.9425837320574164,
"grad_norm": 3.1585512161254883,
"learning_rate": 4.2637054821928774e-05,
"loss": 1.6082,
"step": 3690
},
{
"epoch": 2.9545454545454546,
"grad_norm": 2.0836970806121826,
"learning_rate": 4.260704281712685e-05,
"loss": 1.6703,
"step": 3705
},
{
"epoch": 2.9665071770334928,
"grad_norm": 1.729893445968628,
"learning_rate": 4.257703081232493e-05,
"loss": 1.6557,
"step": 3720
},
{
"epoch": 2.9784688995215314,
"grad_norm": 3.384397268295288,
"learning_rate": 4.254701880752301e-05,
"loss": 1.643,
"step": 3735
},
{
"epoch": 2.990430622009569,
"grad_norm": 1.8642953634262085,
"learning_rate": 4.2517006802721085e-05,
"loss": 1.6524,
"step": 3750
},
{
"epoch": 3.0023923444976077,
"grad_norm": 1.9247709512710571,
"learning_rate": 4.248699479791917e-05,
"loss": 1.484,
"step": 3765
},
{
"epoch": 3.014354066985646,
"grad_norm": 2.0377817153930664,
"learning_rate": 4.245698279311725e-05,
"loss": 1.2241,
"step": 3780
},
{
"epoch": 3.026315789473684,
"grad_norm": 2.2331552505493164,
"learning_rate": 4.2426970788315326e-05,
"loss": 1.1948,
"step": 3795
},
{
"epoch": 3.0382775119617227,
"grad_norm": 2.3499271869659424,
"learning_rate": 4.239695878351341e-05,
"loss": 1.2828,
"step": 3810
},
{
"epoch": 3.050239234449761,
"grad_norm": 2.445600748062134,
"learning_rate": 4.236694677871149e-05,
"loss": 1.1715,
"step": 3825
},
{
"epoch": 3.062200956937799,
"grad_norm": 2.801543951034546,
"learning_rate": 4.2336934773909566e-05,
"loss": 1.2167,
"step": 3840
},
{
"epoch": 3.074162679425837,
"grad_norm": 2.515307664871216,
"learning_rate": 4.2306922769107644e-05,
"loss": 1.1451,
"step": 3855
},
{
"epoch": 3.0861244019138754,
"grad_norm": 2.6123640537261963,
"learning_rate": 4.227691076430572e-05,
"loss": 1.256,
"step": 3870
},
{
"epoch": 3.098086124401914,
"grad_norm": 2.602388381958008,
"learning_rate": 4.2246898759503806e-05,
"loss": 1.1867,
"step": 3885
},
{
"epoch": 3.110047846889952,
"grad_norm": 2.552335739135742,
"learning_rate": 4.2216886754701884e-05,
"loss": 1.1845,
"step": 3900
},
{
"epoch": 3.1220095693779903,
"grad_norm": 2.6270079612731934,
"learning_rate": 4.218687474989996e-05,
"loss": 1.2479,
"step": 3915
},
{
"epoch": 3.1339712918660285,
"grad_norm": 2.490518808364868,
"learning_rate": 4.215686274509804e-05,
"loss": 1.2386,
"step": 3930
},
{
"epoch": 3.145933014354067,
"grad_norm": 2.348869800567627,
"learning_rate": 4.212685074029612e-05,
"loss": 1.2285,
"step": 3945
},
{
"epoch": 3.1578947368421053,
"grad_norm": 2.3546955585479736,
"learning_rate": 4.2096838735494196e-05,
"loss": 1.206,
"step": 3960
},
{
"epoch": 3.1698564593301435,
"grad_norm": 2.4429666996002197,
"learning_rate": 4.2066826730692274e-05,
"loss": 1.335,
"step": 3975
},
{
"epoch": 3.1818181818181817,
"grad_norm": 2.397874355316162,
"learning_rate": 4.203681472589036e-05,
"loss": 1.2252,
"step": 3990
},
{
"epoch": 3.1937799043062203,
"grad_norm": 2.526556968688965,
"learning_rate": 4.200680272108844e-05,
"loss": 1.2811,
"step": 4005
},
{
"epoch": 3.2057416267942584,
"grad_norm": 2.7083089351654053,
"learning_rate": 4.197679071628652e-05,
"loss": 1.3154,
"step": 4020
},
{
"epoch": 3.2177033492822966,
"grad_norm": 2.426650285720825,
"learning_rate": 4.19467787114846e-05,
"loss": 1.2251,
"step": 4035
},
{
"epoch": 3.229665071770335,
"grad_norm": 3.1592352390289307,
"learning_rate": 4.1916766706682676e-05,
"loss": 1.232,
"step": 4050
},
{
"epoch": 3.2416267942583734,
"grad_norm": 2.4699387550354004,
"learning_rate": 4.1886754701880754e-05,
"loss": 1.3075,
"step": 4065
},
{
"epoch": 3.2535885167464116,
"grad_norm": 2.410412311553955,
"learning_rate": 4.185674269707883e-05,
"loss": 1.2583,
"step": 4080
},
{
"epoch": 3.2655502392344498,
"grad_norm": 2.3662848472595215,
"learning_rate": 4.182673069227691e-05,
"loss": 1.2718,
"step": 4095
},
{
"epoch": 3.277511961722488,
"grad_norm": 2.241677761077881,
"learning_rate": 4.179671868747499e-05,
"loss": 1.2293,
"step": 4110
},
{
"epoch": 3.2894736842105265,
"grad_norm": 2.289928674697876,
"learning_rate": 4.176670668267307e-05,
"loss": 1.2369,
"step": 4125
},
{
"epoch": 3.3014354066985647,
"grad_norm": 2.9561991691589355,
"learning_rate": 4.173669467787115e-05,
"loss": 1.1936,
"step": 4140
},
{
"epoch": 3.313397129186603,
"grad_norm": 2.6181890964508057,
"learning_rate": 4.170668267306923e-05,
"loss": 1.2791,
"step": 4155
},
{
"epoch": 3.325358851674641,
"grad_norm": 2.208653688430786,
"learning_rate": 4.1676670668267306e-05,
"loss": 1.3175,
"step": 4170
},
{
"epoch": 3.3373205741626792,
"grad_norm": 2.460291624069214,
"learning_rate": 4.164665866346539e-05,
"loss": 1.255,
"step": 4185
},
{
"epoch": 3.349282296650718,
"grad_norm": 2.2541019916534424,
"learning_rate": 4.161664665866347e-05,
"loss": 1.2815,
"step": 4200
},
{
"epoch": 3.361244019138756,
"grad_norm": 2.543994903564453,
"learning_rate": 4.1586634653861546e-05,
"loss": 1.2888,
"step": 4215
},
{
"epoch": 3.373205741626794,
"grad_norm": 2.7568411827087402,
"learning_rate": 4.1556622649059624e-05,
"loss": 1.2894,
"step": 4230
},
{
"epoch": 3.3851674641148324,
"grad_norm": 2.5805466175079346,
"learning_rate": 4.152661064425771e-05,
"loss": 1.3434,
"step": 4245
},
{
"epoch": 3.397129186602871,
"grad_norm": 2.409097194671631,
"learning_rate": 4.149659863945579e-05,
"loss": 1.2903,
"step": 4260
},
{
"epoch": 3.409090909090909,
"grad_norm": 4.126059532165527,
"learning_rate": 4.1466586634653865e-05,
"loss": 1.2764,
"step": 4275
},
{
"epoch": 3.4210526315789473,
"grad_norm": 3.106367826461792,
"learning_rate": 4.143657462985194e-05,
"loss": 1.3184,
"step": 4290
},
{
"epoch": 3.4330143540669855,
"grad_norm": 2.195138454437256,
"learning_rate": 4.140656262505002e-05,
"loss": 1.2636,
"step": 4305
},
{
"epoch": 3.444976076555024,
"grad_norm": 2.7023708820343018,
"learning_rate": 4.13765506202481e-05,
"loss": 1.3316,
"step": 4320
},
{
"epoch": 3.4569377990430623,
"grad_norm": 2.262626886367798,
"learning_rate": 4.1346538615446176e-05,
"loss": 1.2847,
"step": 4335
},
{
"epoch": 3.4688995215311005,
"grad_norm": 2.5416321754455566,
"learning_rate": 4.131652661064426e-05,
"loss": 1.3254,
"step": 4350
},
{
"epoch": 3.4808612440191387,
"grad_norm": 2.868903875350952,
"learning_rate": 4.128651460584234e-05,
"loss": 1.2778,
"step": 4365
},
{
"epoch": 3.492822966507177,
"grad_norm": 2.347463607788086,
"learning_rate": 4.125650260104042e-05,
"loss": 1.34,
"step": 4380
},
{
"epoch": 3.5047846889952154,
"grad_norm": 2.644416332244873,
"learning_rate": 4.12264905962385e-05,
"loss": 1.2862,
"step": 4395
},
{
"epoch": 3.5167464114832536,
"grad_norm": 2.8803160190582275,
"learning_rate": 4.119647859143658e-05,
"loss": 1.3538,
"step": 4410
},
{
"epoch": 3.528708133971292,
"grad_norm": 2.643848180770874,
"learning_rate": 4.1166466586634657e-05,
"loss": 1.3566,
"step": 4425
},
{
"epoch": 3.5406698564593304,
"grad_norm": 2.555978298187256,
"learning_rate": 4.1136454581832734e-05,
"loss": 1.284,
"step": 4440
},
{
"epoch": 3.5526315789473686,
"grad_norm": 2.4635751247406006,
"learning_rate": 4.110644257703081e-05,
"loss": 1.3229,
"step": 4455
},
{
"epoch": 3.5645933014354068,
"grad_norm": 2.804314374923706,
"learning_rate": 4.107643057222889e-05,
"loss": 1.2931,
"step": 4470
},
{
"epoch": 3.576555023923445,
"grad_norm": 2.5955514907836914,
"learning_rate": 4.1046418567426975e-05,
"loss": 1.3153,
"step": 4485
},
{
"epoch": 3.588516746411483,
"grad_norm": 2.4464356899261475,
"learning_rate": 4.101640656262505e-05,
"loss": 1.2963,
"step": 4500
},
{
"epoch": 3.6004784688995217,
"grad_norm": 2.8158469200134277,
"learning_rate": 4.098639455782313e-05,
"loss": 1.333,
"step": 4515
},
{
"epoch": 3.61244019138756,
"grad_norm": 2.324192523956299,
"learning_rate": 4.095638255302121e-05,
"loss": 1.3438,
"step": 4530
},
{
"epoch": 3.624401913875598,
"grad_norm": 2.5822291374206543,
"learning_rate": 4.0926370548219286e-05,
"loss": 1.381,
"step": 4545
},
{
"epoch": 3.6363636363636362,
"grad_norm": 2.3783419132232666,
"learning_rate": 4.089635854341737e-05,
"loss": 1.321,
"step": 4560
},
{
"epoch": 3.6483253588516744,
"grad_norm": 2.453040361404419,
"learning_rate": 4.086634653861545e-05,
"loss": 1.35,
"step": 4575
},
{
"epoch": 3.660287081339713,
"grad_norm": 2.694587230682373,
"learning_rate": 4.0836334533813526e-05,
"loss": 1.3342,
"step": 4590
},
{
"epoch": 3.672248803827751,
"grad_norm": 2.4545223712921143,
"learning_rate": 4.080632252901161e-05,
"loss": 1.4238,
"step": 4605
},
{
"epoch": 3.6842105263157894,
"grad_norm": 2.5401089191436768,
"learning_rate": 4.077631052420969e-05,
"loss": 1.3699,
"step": 4620
},
{
"epoch": 3.696172248803828,
"grad_norm": 2.4257302284240723,
"learning_rate": 4.074629851940777e-05,
"loss": 1.3569,
"step": 4635
},
{
"epoch": 3.708133971291866,
"grad_norm": 2.7543747425079346,
"learning_rate": 4.0716286514605845e-05,
"loss": 1.2967,
"step": 4650
},
{
"epoch": 3.7200956937799043,
"grad_norm": 2.4614686965942383,
"learning_rate": 4.068627450980392e-05,
"loss": 1.2982,
"step": 4665
},
{
"epoch": 3.7320574162679425,
"grad_norm": 3.7613461017608643,
"learning_rate": 4.0656262505002e-05,
"loss": 1.3812,
"step": 4680
},
{
"epoch": 3.7440191387559807,
"grad_norm": 2.60383939743042,
"learning_rate": 4.062625050020008e-05,
"loss": 1.3526,
"step": 4695
},
{
"epoch": 3.7559808612440193,
"grad_norm": 2.3789987564086914,
"learning_rate": 4.0596238495398156e-05,
"loss": 1.3502,
"step": 4710
},
{
"epoch": 3.7679425837320575,
"grad_norm": 2.6684768199920654,
"learning_rate": 4.056622649059624e-05,
"loss": 1.4723,
"step": 4725
},
{
"epoch": 3.7799043062200957,
"grad_norm": 2.480144500732422,
"learning_rate": 4.053621448579432e-05,
"loss": 1.3716,
"step": 4740
},
{
"epoch": 3.791866028708134,
"grad_norm": 2.429513454437256,
"learning_rate": 4.05062024809924e-05,
"loss": 1.2895,
"step": 4755
},
{
"epoch": 3.803827751196172,
"grad_norm": 2.4947898387908936,
"learning_rate": 4.047619047619048e-05,
"loss": 1.4147,
"step": 4770
},
{
"epoch": 3.8157894736842106,
"grad_norm": 2.351773500442505,
"learning_rate": 4.044617847138856e-05,
"loss": 1.3712,
"step": 4785
},
{
"epoch": 3.827751196172249,
"grad_norm": 2.4937288761138916,
"learning_rate": 4.041616646658664e-05,
"loss": 1.3342,
"step": 4800
},
{
"epoch": 3.839712918660287,
"grad_norm": 3.4912281036376953,
"learning_rate": 4.0386154461784715e-05,
"loss": 1.3403,
"step": 4815
},
{
"epoch": 3.8516746411483256,
"grad_norm": 2.2786455154418945,
"learning_rate": 4.035614245698279e-05,
"loss": 1.335,
"step": 4830
},
{
"epoch": 3.8636363636363638,
"grad_norm": 2.7752015590667725,
"learning_rate": 4.032613045218088e-05,
"loss": 1.3739,
"step": 4845
},
{
"epoch": 3.875598086124402,
"grad_norm": 2.510052442550659,
"learning_rate": 4.0296118447378955e-05,
"loss": 1.3793,
"step": 4860
},
{
"epoch": 3.88755980861244,
"grad_norm": 4.657649517059326,
"learning_rate": 4.026610644257703e-05,
"loss": 1.3914,
"step": 4875
},
{
"epoch": 3.8995215311004783,
"grad_norm": 2.437033176422119,
"learning_rate": 4.023609443777511e-05,
"loss": 1.3793,
"step": 4890
},
{
"epoch": 3.911483253588517,
"grad_norm": 2.7319986820220947,
"learning_rate": 4.020608243297319e-05,
"loss": 1.437,
"step": 4905
},
{
"epoch": 3.923444976076555,
"grad_norm": 2.553680896759033,
"learning_rate": 4.0176070428171266e-05,
"loss": 1.3613,
"step": 4920
},
{
"epoch": 3.9354066985645932,
"grad_norm": 2.379471778869629,
"learning_rate": 4.014605842336935e-05,
"loss": 1.3638,
"step": 4935
},
{
"epoch": 3.9473684210526314,
"grad_norm": 2.8651113510131836,
"learning_rate": 4.011604641856743e-05,
"loss": 1.3265,
"step": 4950
},
{
"epoch": 3.9593301435406696,
"grad_norm": 2.366116762161255,
"learning_rate": 4.0086034413765513e-05,
"loss": 1.2701,
"step": 4965
},
{
"epoch": 3.971291866028708,
"grad_norm": 2.60257625579834,
"learning_rate": 4.005602240896359e-05,
"loss": 1.305,
"step": 4980
},
{
"epoch": 3.9832535885167464,
"grad_norm": 2.544235944747925,
"learning_rate": 4.002601040416167e-05,
"loss": 1.3632,
"step": 4995
},
{
"epoch": 3.9952153110047846,
"grad_norm": 2.541198253631592,
"learning_rate": 3.999599839935975e-05,
"loss": 1.4154,
"step": 5010
},
{
"epoch": 4.007177033492823,
"grad_norm": 3.7236313819885254,
"learning_rate": 3.9965986394557825e-05,
"loss": 1.1803,
"step": 5025
},
{
"epoch": 4.019138755980861,
"grad_norm": 3.206791877746582,
"learning_rate": 3.99359743897559e-05,
"loss": 0.9466,
"step": 5040
},
{
"epoch": 4.0311004784688995,
"grad_norm": 2.9792520999908447,
"learning_rate": 3.990596238495398e-05,
"loss": 0.8937,
"step": 5055
},
{
"epoch": 4.043062200956938,
"grad_norm": 3.3796586990356445,
"learning_rate": 3.987595038015206e-05,
"loss": 0.9352,
"step": 5070
},
{
"epoch": 4.055023923444976,
"grad_norm": 2.383775472640991,
"learning_rate": 3.984593837535014e-05,
"loss": 0.8506,
"step": 5085
},
{
"epoch": 4.0669856459330145,
"grad_norm": 2.6192071437835693,
"learning_rate": 3.981592637054822e-05,
"loss": 0.8886,
"step": 5100
},
{
"epoch": 4.078947368421052,
"grad_norm": 3.329030990600586,
"learning_rate": 3.97859143657463e-05,
"loss": 0.9639,
"step": 5115
},
{
"epoch": 4.090909090909091,
"grad_norm": 3.970484733581543,
"learning_rate": 3.975590236094438e-05,
"loss": 0.9112,
"step": 5130
},
{
"epoch": 4.1028708133971294,
"grad_norm": 3.082409381866455,
"learning_rate": 3.972589035614246e-05,
"loss": 0.8825,
"step": 5145
},
{
"epoch": 4.114832535885167,
"grad_norm": 2.9433696269989014,
"learning_rate": 3.969587835134054e-05,
"loss": 0.9384,
"step": 5160
},
{
"epoch": 4.126794258373206,
"grad_norm": 3.1707279682159424,
"learning_rate": 3.966586634653862e-05,
"loss": 0.9025,
"step": 5175
},
{
"epoch": 4.138755980861244,
"grad_norm": 3.336472988128662,
"learning_rate": 3.9635854341736695e-05,
"loss": 0.9228,
"step": 5190
},
{
"epoch": 4.150717703349282,
"grad_norm": 3.4995670318603516,
"learning_rate": 3.960584233693477e-05,
"loss": 0.9847,
"step": 5205
},
{
"epoch": 4.162679425837321,
"grad_norm": 3.3354713916778564,
"learning_rate": 3.957583033213286e-05,
"loss": 0.9717,
"step": 5220
},
{
"epoch": 4.1746411483253585,
"grad_norm": 3.2553207874298096,
"learning_rate": 3.9545818327330935e-05,
"loss": 0.9973,
"step": 5235
},
{
"epoch": 4.186602870813397,
"grad_norm": 3.007181406021118,
"learning_rate": 3.951580632252901e-05,
"loss": 0.919,
"step": 5250
},
{
"epoch": 4.198564593301436,
"grad_norm": 2.7252211570739746,
"learning_rate": 3.948579431772709e-05,
"loss": 0.8914,
"step": 5265
},
{
"epoch": 4.2105263157894735,
"grad_norm": 3.078258514404297,
"learning_rate": 3.945578231292517e-05,
"loss": 1.0353,
"step": 5280
},
{
"epoch": 4.222488038277512,
"grad_norm": 3.0154271125793457,
"learning_rate": 3.942577030812325e-05,
"loss": 0.9594,
"step": 5295
},
{
"epoch": 4.23444976076555,
"grad_norm": 3.7115094661712646,
"learning_rate": 3.939575830332133e-05,
"loss": 0.9248,
"step": 5310
},
{
"epoch": 4.246411483253588,
"grad_norm": 3.135359048843384,
"learning_rate": 3.936574629851941e-05,
"loss": 0.9918,
"step": 5325
},
{
"epoch": 4.258373205741627,
"grad_norm": 2.8541269302368164,
"learning_rate": 3.9335734293717494e-05,
"loss": 0.974,
"step": 5340
},
{
"epoch": 4.270334928229665,
"grad_norm": 3.1880204677581787,
"learning_rate": 3.930572228891557e-05,
"loss": 1.0267,
"step": 5355
},
{
"epoch": 4.282296650717703,
"grad_norm": 4.082556247711182,
"learning_rate": 3.927571028411365e-05,
"loss": 0.9764,
"step": 5370
},
{
"epoch": 4.294258373205742,
"grad_norm": 3.121758460998535,
"learning_rate": 3.924569827931173e-05,
"loss": 1.0353,
"step": 5385
},
{
"epoch": 4.30622009569378,
"grad_norm": 3.3821141719818115,
"learning_rate": 3.9215686274509805e-05,
"loss": 1.0219,
"step": 5400
},
{
"epoch": 4.318181818181818,
"grad_norm": 3.336914300918579,
"learning_rate": 3.918567426970788e-05,
"loss": 1.0427,
"step": 5415
},
{
"epoch": 4.330143540669856,
"grad_norm": 3.1878132820129395,
"learning_rate": 3.915566226490596e-05,
"loss": 1.0125,
"step": 5430
},
{
"epoch": 4.342105263157895,
"grad_norm": 3.5293705463409424,
"learning_rate": 3.912565026010404e-05,
"loss": 0.9655,
"step": 5445
},
{
"epoch": 4.354066985645933,
"grad_norm": 2.9817090034484863,
"learning_rate": 3.909563825530212e-05,
"loss": 0.9854,
"step": 5460
},
{
"epoch": 4.366028708133971,
"grad_norm": 3.0998663902282715,
"learning_rate": 3.90656262505002e-05,
"loss": 0.951,
"step": 5475
},
{
"epoch": 4.37799043062201,
"grad_norm": 3.541856050491333,
"learning_rate": 3.903561424569828e-05,
"loss": 1.0302,
"step": 5490
},
{
"epoch": 4.389952153110048,
"grad_norm": 3.180595636367798,
"learning_rate": 3.9005602240896364e-05,
"loss": 0.9434,
"step": 5505
},
{
"epoch": 4.401913875598086,
"grad_norm": 3.341787099838257,
"learning_rate": 3.897559023609444e-05,
"loss": 1.0062,
"step": 5520
},
{
"epoch": 4.413875598086125,
"grad_norm": 3.4445912837982178,
"learning_rate": 3.894557823129252e-05,
"loss": 0.9558,
"step": 5535
},
{
"epoch": 4.425837320574162,
"grad_norm": 2.839120388031006,
"learning_rate": 3.89155662264906e-05,
"loss": 1.0152,
"step": 5550
},
{
"epoch": 4.437799043062201,
"grad_norm": 3.482067108154297,
"learning_rate": 3.8885554221688675e-05,
"loss": 1.0234,
"step": 5565
},
{
"epoch": 4.44976076555024,
"grad_norm": 2.869065761566162,
"learning_rate": 3.885554221688676e-05,
"loss": 1.0045,
"step": 5580
},
{
"epoch": 4.461722488038277,
"grad_norm": 3.366964101791382,
"learning_rate": 3.882553021208484e-05,
"loss": 1.0086,
"step": 5595
},
{
"epoch": 4.473684210526316,
"grad_norm": 3.8538451194763184,
"learning_rate": 3.8795518207282915e-05,
"loss": 1.0727,
"step": 5610
},
{
"epoch": 4.485645933014354,
"grad_norm": 3.1612632274627686,
"learning_rate": 3.876550620248099e-05,
"loss": 1.1,
"step": 5625
},
{
"epoch": 4.497607655502392,
"grad_norm": 3.4518115520477295,
"learning_rate": 3.873549419767907e-05,
"loss": 0.9788,
"step": 5640
},
{
"epoch": 4.509569377990431,
"grad_norm": 2.8597676753997803,
"learning_rate": 3.870548219287715e-05,
"loss": 1.0111,
"step": 5655
},
{
"epoch": 4.521531100478469,
"grad_norm": 3.2637124061584473,
"learning_rate": 3.8675470188075233e-05,
"loss": 0.9647,
"step": 5670
},
{
"epoch": 4.533492822966507,
"grad_norm": 3.176473379135132,
"learning_rate": 3.864545818327331e-05,
"loss": 1.0303,
"step": 5685
},
{
"epoch": 4.545454545454545,
"grad_norm": 3.1555211544036865,
"learning_rate": 3.8615446178471396e-05,
"loss": 0.9983,
"step": 5700
},
{
"epoch": 4.557416267942584,
"grad_norm": 3.690917730331421,
"learning_rate": 3.8585434173669474e-05,
"loss": 1.0843,
"step": 5715
},
{
"epoch": 4.569377990430622,
"grad_norm": 3.4356346130371094,
"learning_rate": 3.855542216886755e-05,
"loss": 1.0957,
"step": 5730
},
{
"epoch": 4.58133971291866,
"grad_norm": 3.0207927227020264,
"learning_rate": 3.852541016406563e-05,
"loss": 0.9877,
"step": 5745
},
{
"epoch": 4.5933014354066986,
"grad_norm": 3.256007194519043,
"learning_rate": 3.849539815926371e-05,
"loss": 0.9934,
"step": 5760
},
{
"epoch": 4.605263157894737,
"grad_norm": 4.417782783508301,
"learning_rate": 3.8465386154461785e-05,
"loss": 1.0612,
"step": 5775
},
{
"epoch": 4.617224880382775,
"grad_norm": 2.802917242050171,
"learning_rate": 3.843537414965986e-05,
"loss": 1.0714,
"step": 5790
},
{
"epoch": 4.6291866028708135,
"grad_norm": 2.9113950729370117,
"learning_rate": 3.840536214485794e-05,
"loss": 1.0637,
"step": 5805
},
{
"epoch": 4.641148325358852,
"grad_norm": 3.0320019721984863,
"learning_rate": 3.8375350140056026e-05,
"loss": 1.0407,
"step": 5820
},
{
"epoch": 4.65311004784689,
"grad_norm": 2.9705982208251953,
"learning_rate": 3.83453381352541e-05,
"loss": 1.118,
"step": 5835
},
{
"epoch": 4.6650717703349285,
"grad_norm": 3.1082069873809814,
"learning_rate": 3.831532613045218e-05,
"loss": 1.1102,
"step": 5850
},
{
"epoch": 4.677033492822966,
"grad_norm": 3.2098066806793213,
"learning_rate": 3.828531412565026e-05,
"loss": 1.1063,
"step": 5865
},
{
"epoch": 4.688995215311005,
"grad_norm": 3.18621826171875,
"learning_rate": 3.8255302120848344e-05,
"loss": 1.0772,
"step": 5880
},
{
"epoch": 4.7009569377990434,
"grad_norm": 3.3197460174560547,
"learning_rate": 3.822529011604642e-05,
"loss": 1.0054,
"step": 5895
},
{
"epoch": 4.712918660287081,
"grad_norm": 2.8657805919647217,
"learning_rate": 3.81952781112445e-05,
"loss": 1.073,
"step": 5910
},
{
"epoch": 4.72488038277512,
"grad_norm": 2.897557497024536,
"learning_rate": 3.816526610644258e-05,
"loss": 1.0991,
"step": 5925
},
{
"epoch": 4.7368421052631575,
"grad_norm": 2.881815195083618,
"learning_rate": 3.813525410164066e-05,
"loss": 1.1037,
"step": 5940
},
{
"epoch": 4.748803827751196,
"grad_norm": 3.131378412246704,
"learning_rate": 3.810524209683874e-05,
"loss": 1.149,
"step": 5955
},
{
"epoch": 4.760765550239235,
"grad_norm": 3.3418426513671875,
"learning_rate": 3.807523009203682e-05,
"loss": 1.0799,
"step": 5970
},
{
"epoch": 4.7727272727272725,
"grad_norm": 2.759793519973755,
"learning_rate": 3.8045218087234895e-05,
"loss": 1.1026,
"step": 5985
},
{
"epoch": 4.784688995215311,
"grad_norm": 3.082688808441162,
"learning_rate": 3.801520608243297e-05,
"loss": 1.0911,
"step": 6000
},
{
"epoch": 4.796650717703349,
"grad_norm": 3.788597583770752,
"learning_rate": 3.798519407763105e-05,
"loss": 1.1133,
"step": 6015
},
{
"epoch": 4.8086124401913874,
"grad_norm": 3.0609753131866455,
"learning_rate": 3.795518207282913e-05,
"loss": 1.0023,
"step": 6030
},
{
"epoch": 4.820574162679426,
"grad_norm": 3.5260090827941895,
"learning_rate": 3.7925170068027214e-05,
"loss": 1.105,
"step": 6045
},
{
"epoch": 4.832535885167464,
"grad_norm": 3.1473610401153564,
"learning_rate": 3.789515806322529e-05,
"loss": 1.1896,
"step": 6060
},
{
"epoch": 4.844497607655502,
"grad_norm": 3.2314066886901855,
"learning_rate": 3.7865146058423376e-05,
"loss": 1.1403,
"step": 6075
},
{
"epoch": 4.856459330143541,
"grad_norm": 3.1266963481903076,
"learning_rate": 3.7835134053621454e-05,
"loss": 1.123,
"step": 6090
},
{
"epoch": 4.868421052631579,
"grad_norm": 3.1995601654052734,
"learning_rate": 3.780512204881953e-05,
"loss": 1.1833,
"step": 6105
},
{
"epoch": 4.880382775119617,
"grad_norm": 3.251296043395996,
"learning_rate": 3.777511004401761e-05,
"loss": 1.1502,
"step": 6120
},
{
"epoch": 4.892344497607656,
"grad_norm": 3.1420419216156006,
"learning_rate": 3.774509803921569e-05,
"loss": 1.1207,
"step": 6135
},
{
"epoch": 4.904306220095694,
"grad_norm": 2.992222785949707,
"learning_rate": 3.7715086034413765e-05,
"loss": 1.1347,
"step": 6150
},
{
"epoch": 4.916267942583732,
"grad_norm": 3.03808856010437,
"learning_rate": 3.768507402961184e-05,
"loss": 1.131,
"step": 6165
},
{
"epoch": 4.92822966507177,
"grad_norm": 3.9193668365478516,
"learning_rate": 3.765506202480993e-05,
"loss": 1.0749,
"step": 6180
},
{
"epoch": 4.940191387559809,
"grad_norm": 3.3145644664764404,
"learning_rate": 3.7625050020008006e-05,
"loss": 1.0406,
"step": 6195
},
{
"epoch": 4.952153110047847,
"grad_norm": 3.134812116622925,
"learning_rate": 3.7595038015206084e-05,
"loss": 1.1243,
"step": 6210
},
{
"epoch": 4.964114832535885,
"grad_norm": 3.403087854385376,
"learning_rate": 3.756502601040416e-05,
"loss": 1.0429,
"step": 6225
},
{
"epoch": 4.976076555023924,
"grad_norm": 3.0964858531951904,
"learning_rate": 3.753501400560224e-05,
"loss": 1.112,
"step": 6240
},
{
"epoch": 4.988038277511961,
"grad_norm": 4.416729927062988,
"learning_rate": 3.7505002000800324e-05,
"loss": 1.1144,
"step": 6255
},
{
"epoch": 5.0,
"grad_norm": 4.442926406860352,
"learning_rate": 3.74749899959984e-05,
"loss": 1.0938,
"step": 6270
},
{
"epoch": 5.011961722488039,
"grad_norm": 3.0728983879089355,
"learning_rate": 3.744497799119648e-05,
"loss": 0.6816,
"step": 6285
},
{
"epoch": 5.023923444976076,
"grad_norm": 3.4252402782440186,
"learning_rate": 3.7414965986394564e-05,
"loss": 0.7263,
"step": 6300
},
{
"epoch": 5.035885167464115,
"grad_norm": 4.501566410064697,
"learning_rate": 3.738495398159264e-05,
"loss": 0.6744,
"step": 6315
},
{
"epoch": 5.047846889952153,
"grad_norm": 3.8966481685638428,
"learning_rate": 3.735494197679072e-05,
"loss": 0.645,
"step": 6330
},
{
"epoch": 5.059808612440191,
"grad_norm": 3.794740915298462,
"learning_rate": 3.73249299719888e-05,
"loss": 0.6894,
"step": 6345
},
{
"epoch": 5.07177033492823,
"grad_norm": 3.1294026374816895,
"learning_rate": 3.7294917967186876e-05,
"loss": 0.6101,
"step": 6360
},
{
"epoch": 5.083732057416268,
"grad_norm": 3.1900405883789062,
"learning_rate": 3.7264905962384953e-05,
"loss": 0.6731,
"step": 6375
},
{
"epoch": 5.095693779904306,
"grad_norm": 3.9348907470703125,
"learning_rate": 3.723489395758303e-05,
"loss": 0.7257,
"step": 6390
},
{
"epoch": 5.107655502392345,
"grad_norm": 3.5655553340911865,
"learning_rate": 3.720488195278111e-05,
"loss": 0.6219,
"step": 6405
},
{
"epoch": 5.119617224880383,
"grad_norm": 3.678565740585327,
"learning_rate": 3.7174869947979194e-05,
"loss": 0.6896,
"step": 6420
},
{
"epoch": 5.131578947368421,
"grad_norm": 3.041287422180176,
"learning_rate": 3.714485794317727e-05,
"loss": 0.7084,
"step": 6435
},
{
"epoch": 5.143540669856459,
"grad_norm": 3.382601737976074,
"learning_rate": 3.7114845938375356e-05,
"loss": 0.6298,
"step": 6450
},
{
"epoch": 5.155502392344498,
"grad_norm": 3.4510035514831543,
"learning_rate": 3.7084833933573434e-05,
"loss": 0.6882,
"step": 6465
},
{
"epoch": 5.167464114832536,
"grad_norm": 4.204371929168701,
"learning_rate": 3.705482192877151e-05,
"loss": 0.7478,
"step": 6480
},
{
"epoch": 5.179425837320574,
"grad_norm": 3.669754981994629,
"learning_rate": 3.702480992396959e-05,
"loss": 0.7159,
"step": 6495
},
{
"epoch": 5.1913875598086126,
"grad_norm": 3.454606056213379,
"learning_rate": 3.699479791916767e-05,
"loss": 0.7049,
"step": 6510
},
{
"epoch": 5.203349282296651,
"grad_norm": 3.548112154006958,
"learning_rate": 3.6964785914365746e-05,
"loss": 0.7279,
"step": 6525
},
{
"epoch": 5.215311004784689,
"grad_norm": 4.184609413146973,
"learning_rate": 3.693477390956382e-05,
"loss": 0.7747,
"step": 6540
},
{
"epoch": 5.2272727272727275,
"grad_norm": 3.418808937072754,
"learning_rate": 3.690476190476191e-05,
"loss": 0.7833,
"step": 6555
},
{
"epoch": 5.239234449760765,
"grad_norm": 3.444638729095459,
"learning_rate": 3.6874749899959986e-05,
"loss": 0.81,
"step": 6570
},
{
"epoch": 5.251196172248804,
"grad_norm": 3.960958242416382,
"learning_rate": 3.6844737895158064e-05,
"loss": 0.6915,
"step": 6585
},
{
"epoch": 5.2631578947368425,
"grad_norm": 3.772879123687744,
"learning_rate": 3.681472589035614e-05,
"loss": 0.7157,
"step": 6600
},
{
"epoch": 5.27511961722488,
"grad_norm": 4.02428674697876,
"learning_rate": 3.6784713885554226e-05,
"loss": 0.7383,
"step": 6615
},
{
"epoch": 5.287081339712919,
"grad_norm": 3.4093050956726074,
"learning_rate": 3.6754701880752304e-05,
"loss": 0.7163,
"step": 6630
},
{
"epoch": 5.2990430622009566,
"grad_norm": 3.6924562454223633,
"learning_rate": 3.672468987595038e-05,
"loss": 0.7022,
"step": 6645
},
{
"epoch": 5.311004784688995,
"grad_norm": 3.356632947921753,
"learning_rate": 3.669467787114846e-05,
"loss": 0.737,
"step": 6660
},
{
"epoch": 5.322966507177034,
"grad_norm": 3.501210927963257,
"learning_rate": 3.6664665866346544e-05,
"loss": 0.7474,
"step": 6675
},
{
"epoch": 5.3349282296650715,
"grad_norm": 3.852551221847534,
"learning_rate": 3.663465386154462e-05,
"loss": 0.779,
"step": 6690
},
{
"epoch": 5.34688995215311,
"grad_norm": 3.4461312294006348,
"learning_rate": 3.66046418567427e-05,
"loss": 0.6816,
"step": 6705
},
{
"epoch": 5.358851674641148,
"grad_norm": 2.9088375568389893,
"learning_rate": 3.657462985194078e-05,
"loss": 0.7619,
"step": 6720
},
{
"epoch": 5.3708133971291865,
"grad_norm": 3.4227547645568848,
"learning_rate": 3.6544617847138856e-05,
"loss": 0.7646,
"step": 6735
},
{
"epoch": 5.382775119617225,
"grad_norm": 4.553009986877441,
"learning_rate": 3.6514605842336934e-05,
"loss": 0.7907,
"step": 6750
},
{
"epoch": 5.394736842105263,
"grad_norm": 3.965406656265259,
"learning_rate": 3.648459383753501e-05,
"loss": 0.7901,
"step": 6765
},
{
"epoch": 5.4066985645933014,
"grad_norm": 3.7064077854156494,
"learning_rate": 3.645458183273309e-05,
"loss": 0.758,
"step": 6780
},
{
"epoch": 5.41866028708134,
"grad_norm": 3.4479455947875977,
"learning_rate": 3.6424569827931174e-05,
"loss": 0.7439,
"step": 6795
},
{
"epoch": 5.430622009569378,
"grad_norm": 3.9599294662475586,
"learning_rate": 3.639455782312925e-05,
"loss": 0.8257,
"step": 6810
},
{
"epoch": 5.442583732057416,
"grad_norm": 3.7063801288604736,
"learning_rate": 3.6364545818327336e-05,
"loss": 0.7717,
"step": 6825
},
{
"epoch": 5.454545454545454,
"grad_norm": 4.6955060958862305,
"learning_rate": 3.6334533813525414e-05,
"loss": 0.7575,
"step": 6840
},
{
"epoch": 5.466507177033493,
"grad_norm": 3.915292501449585,
"learning_rate": 3.630452180872349e-05,
"loss": 0.7989,
"step": 6855
},
{
"epoch": 5.478468899521531,
"grad_norm": 3.974541664123535,
"learning_rate": 3.627450980392157e-05,
"loss": 0.8685,
"step": 6870
},
{
"epoch": 5.490430622009569,
"grad_norm": 3.9493520259857178,
"learning_rate": 3.624449779911965e-05,
"loss": 0.8111,
"step": 6885
},
{
"epoch": 5.502392344497608,
"grad_norm": 3.7138257026672363,
"learning_rate": 3.6214485794317726e-05,
"loss": 0.8086,
"step": 6900
},
{
"epoch": 5.514354066985646,
"grad_norm": 3.838562250137329,
"learning_rate": 3.618447378951581e-05,
"loss": 0.8,
"step": 6915
},
{
"epoch": 5.526315789473684,
"grad_norm": 3.5369865894317627,
"learning_rate": 3.615446178471389e-05,
"loss": 0.7449,
"step": 6930
},
{
"epoch": 5.538277511961723,
"grad_norm": 3.607936382293701,
"learning_rate": 3.6124449779911966e-05,
"loss": 0.7974,
"step": 6945
},
{
"epoch": 5.55023923444976,
"grad_norm": 4.021537780761719,
"learning_rate": 3.6094437775110044e-05,
"loss": 0.6972,
"step": 6960
},
{
"epoch": 5.562200956937799,
"grad_norm": 4.086754322052002,
"learning_rate": 3.606442577030812e-05,
"loss": 0.8349,
"step": 6975
},
{
"epoch": 5.574162679425838,
"grad_norm": 3.385819673538208,
"learning_rate": 3.6034413765506206e-05,
"loss": 0.8016,
"step": 6990
},
{
"epoch": 5.586124401913875,
"grad_norm": 3.3851637840270996,
"learning_rate": 3.6004401760704284e-05,
"loss": 0.8013,
"step": 7005
},
{
"epoch": 5.598086124401914,
"grad_norm": 3.6127657890319824,
"learning_rate": 3.597438975590236e-05,
"loss": 0.889,
"step": 7020
},
{
"epoch": 5.610047846889952,
"grad_norm": 3.7455716133117676,
"learning_rate": 3.594437775110045e-05,
"loss": 0.8244,
"step": 7035
},
{
"epoch": 5.62200956937799,
"grad_norm": 3.5797011852264404,
"learning_rate": 3.5914365746298525e-05,
"loss": 0.8794,
"step": 7050
},
{
"epoch": 5.633971291866029,
"grad_norm": 3.6951963901519775,
"learning_rate": 3.58843537414966e-05,
"loss": 0.8377,
"step": 7065
},
{
"epoch": 5.645933014354067,
"grad_norm": 4.805546283721924,
"learning_rate": 3.585434173669468e-05,
"loss": 0.7658,
"step": 7080
},
{
"epoch": 5.657894736842105,
"grad_norm": 3.3476104736328125,
"learning_rate": 3.582432973189276e-05,
"loss": 0.8535,
"step": 7095
},
{
"epoch": 5.669856459330144,
"grad_norm": 3.7429189682006836,
"learning_rate": 3.5794317727090836e-05,
"loss": 0.7698,
"step": 7110
},
{
"epoch": 5.681818181818182,
"grad_norm": 3.6189913749694824,
"learning_rate": 3.5764305722288914e-05,
"loss": 0.8843,
"step": 7125
},
{
"epoch": 5.69377990430622,
"grad_norm": 3.614164113998413,
"learning_rate": 3.573429371748699e-05,
"loss": 0.7855,
"step": 7140
},
{
"epoch": 5.705741626794258,
"grad_norm": 3.9962081909179688,
"learning_rate": 3.5704281712685076e-05,
"loss": 0.8501,
"step": 7155
},
{
"epoch": 5.717703349282297,
"grad_norm": 3.6668338775634766,
"learning_rate": 3.5674269707883154e-05,
"loss": 0.7866,
"step": 7170
},
{
"epoch": 5.729665071770335,
"grad_norm": 3.9314942359924316,
"learning_rate": 3.564425770308123e-05,
"loss": 0.8003,
"step": 7185
},
{
"epoch": 5.741626794258373,
"grad_norm": 4.32262659072876,
"learning_rate": 3.5614245698279317e-05,
"loss": 0.8137,
"step": 7200
},
{
"epoch": 5.753588516746412,
"grad_norm": 5.040790557861328,
"learning_rate": 3.5584233693477394e-05,
"loss": 0.8354,
"step": 7215
},
{
"epoch": 5.76555023923445,
"grad_norm": 3.7755401134490967,
"learning_rate": 3.555422168867547e-05,
"loss": 0.8574,
"step": 7230
},
{
"epoch": 5.777511961722488,
"grad_norm": 3.8143343925476074,
"learning_rate": 3.552420968387355e-05,
"loss": 0.8091,
"step": 7245
},
{
"epoch": 5.7894736842105265,
"grad_norm": 3.4861605167388916,
"learning_rate": 3.549419767907163e-05,
"loss": 0.8304,
"step": 7260
},
{
"epoch": 5.801435406698564,
"grad_norm": 3.5389742851257324,
"learning_rate": 3.546418567426971e-05,
"loss": 0.8676,
"step": 7275
},
{
"epoch": 5.813397129186603,
"grad_norm": 3.465071439743042,
"learning_rate": 3.543417366946779e-05,
"loss": 0.8296,
"step": 7290
},
{
"epoch": 5.8253588516746415,
"grad_norm": 3.9034931659698486,
"learning_rate": 3.540416166466587e-05,
"loss": 0.8398,
"step": 7305
},
{
"epoch": 5.837320574162679,
"grad_norm": 3.817934989929199,
"learning_rate": 3.5374149659863946e-05,
"loss": 0.8602,
"step": 7320
},
{
"epoch": 5.849282296650718,
"grad_norm": 4.706762790679932,
"learning_rate": 3.5344137655062024e-05,
"loss": 0.8684,
"step": 7335
},
{
"epoch": 5.861244019138756,
"grad_norm": 3.3008809089660645,
"learning_rate": 3.53141256502601e-05,
"loss": 0.8182,
"step": 7350
},
{
"epoch": 5.873205741626794,
"grad_norm": 3.5898377895355225,
"learning_rate": 3.5284113645458186e-05,
"loss": 0.8512,
"step": 7365
},
{
"epoch": 5.885167464114833,
"grad_norm": 3.8670029640197754,
"learning_rate": 3.5254101640656264e-05,
"loss": 0.8412,
"step": 7380
},
{
"epoch": 5.8971291866028706,
"grad_norm": 3.6071064472198486,
"learning_rate": 3.522408963585435e-05,
"loss": 0.8578,
"step": 7395
},
{
"epoch": 5.909090909090909,
"grad_norm": 4.674183368682861,
"learning_rate": 3.519407763105243e-05,
"loss": 0.8554,
"step": 7410
},
{
"epoch": 5.921052631578947,
"grad_norm": 3.45503306388855,
"learning_rate": 3.5164065626250505e-05,
"loss": 0.9224,
"step": 7425
},
{
"epoch": 5.9330143540669855,
"grad_norm": 3.4863317012786865,
"learning_rate": 3.513405362144858e-05,
"loss": 0.8177,
"step": 7440
},
{
"epoch": 5.944976076555024,
"grad_norm": 3.9804773330688477,
"learning_rate": 3.510404161664666e-05,
"loss": 0.8379,
"step": 7455
},
{
"epoch": 5.956937799043062,
"grad_norm": 3.6782078742980957,
"learning_rate": 3.507402961184474e-05,
"loss": 0.8634,
"step": 7470
},
{
"epoch": 5.9688995215311005,
"grad_norm": 3.7234580516815186,
"learning_rate": 3.5044017607042816e-05,
"loss": 0.9142,
"step": 7485
},
{
"epoch": 5.980861244019139,
"grad_norm": 3.6034648418426514,
"learning_rate": 3.5014005602240894e-05,
"loss": 0.8777,
"step": 7500
},
{
"epoch": 5.992822966507177,
"grad_norm": 3.407047748565674,
"learning_rate": 3.498399359743898e-05,
"loss": 0.8191,
"step": 7515
},
{
"epoch": 6.0047846889952154,
"grad_norm": 4.2239508628845215,
"learning_rate": 3.4953981592637056e-05,
"loss": 0.7896,
"step": 7530
},
{
"epoch": 6.016746411483253,
"grad_norm": 2.516592502593994,
"learning_rate": 3.4923969587835134e-05,
"loss": 0.5012,
"step": 7545
},
{
"epoch": 6.028708133971292,
"grad_norm": 3.366042375564575,
"learning_rate": 3.489395758303321e-05,
"loss": 0.4626,
"step": 7560
},
{
"epoch": 6.04066985645933,
"grad_norm": 4.176771640777588,
"learning_rate": 3.48639455782313e-05,
"loss": 0.4813,
"step": 7575
},
{
"epoch": 6.052631578947368,
"grad_norm": 3.807236671447754,
"learning_rate": 3.4833933573429375e-05,
"loss": 0.4928,
"step": 7590
},
{
"epoch": 6.064593301435407,
"grad_norm": 3.5176925659179688,
"learning_rate": 3.480392156862745e-05,
"loss": 0.4474,
"step": 7605
},
{
"epoch": 6.076555023923445,
"grad_norm": 3.860903739929199,
"learning_rate": 3.477390956382553e-05,
"loss": 0.5181,
"step": 7620
},
{
"epoch": 6.088516746411483,
"grad_norm": 3.883094072341919,
"learning_rate": 3.4743897559023615e-05,
"loss": 0.497,
"step": 7635
},
{
"epoch": 6.100478468899522,
"grad_norm": 3.299124240875244,
"learning_rate": 3.471388555422169e-05,
"loss": 0.5023,
"step": 7650
},
{
"epoch": 6.1124401913875595,
"grad_norm": 3.780906915664673,
"learning_rate": 3.468387354941977e-05,
"loss": 0.4938,
"step": 7665
},
{
"epoch": 6.124401913875598,
"grad_norm": 3.906473159790039,
"learning_rate": 3.465386154461785e-05,
"loss": 0.52,
"step": 7680
},
{
"epoch": 6.136363636363637,
"grad_norm": 3.7031853199005127,
"learning_rate": 3.4623849539815926e-05,
"loss": 0.4922,
"step": 7695
},
{
"epoch": 6.148325358851674,
"grad_norm": 4.119719505310059,
"learning_rate": 3.4593837535014004e-05,
"loss": 0.4726,
"step": 7710
},
{
"epoch": 6.160287081339713,
"grad_norm": 3.637122869491577,
"learning_rate": 3.456382553021208e-05,
"loss": 0.4522,
"step": 7725
},
{
"epoch": 6.172248803827751,
"grad_norm": 3.6455516815185547,
"learning_rate": 3.453381352541017e-05,
"loss": 0.497,
"step": 7740
},
{
"epoch": 6.184210526315789,
"grad_norm": 3.90136981010437,
"learning_rate": 3.4503801520608245e-05,
"loss": 0.5286,
"step": 7755
},
{
"epoch": 6.196172248803828,
"grad_norm": 3.776540994644165,
"learning_rate": 3.447378951580633e-05,
"loss": 0.5407,
"step": 7770
},
{
"epoch": 6.208133971291866,
"grad_norm": 4.160264015197754,
"learning_rate": 3.444377751100441e-05,
"loss": 0.4874,
"step": 7785
},
{
"epoch": 6.220095693779904,
"grad_norm": 3.5366413593292236,
"learning_rate": 3.4413765506202485e-05,
"loss": 0.4708,
"step": 7800
},
{
"epoch": 6.232057416267943,
"grad_norm": 3.604766368865967,
"learning_rate": 3.438375350140056e-05,
"loss": 0.5326,
"step": 7815
},
{
"epoch": 6.244019138755981,
"grad_norm": 3.5916519165039062,
"learning_rate": 3.435374149659864e-05,
"loss": 0.5411,
"step": 7830
},
{
"epoch": 6.255980861244019,
"grad_norm": 3.626094102859497,
"learning_rate": 3.432372949179672e-05,
"loss": 0.5142,
"step": 7845
},
{
"epoch": 6.267942583732057,
"grad_norm": 4.346883296966553,
"learning_rate": 3.4293717486994796e-05,
"loss": 0.5135,
"step": 7860
},
{
"epoch": 6.279904306220096,
"grad_norm": 4.123327732086182,
"learning_rate": 3.426370548219288e-05,
"loss": 0.5403,
"step": 7875
},
{
"epoch": 6.291866028708134,
"grad_norm": 4.1574482917785645,
"learning_rate": 3.423369347739096e-05,
"loss": 0.5373,
"step": 7890
},
{
"epoch": 6.303827751196172,
"grad_norm": 3.9462273120880127,
"learning_rate": 3.4203681472589037e-05,
"loss": 0.5223,
"step": 7905
},
{
"epoch": 6.315789473684211,
"grad_norm": 4.356924533843994,
"learning_rate": 3.4173669467787114e-05,
"loss": 0.5857,
"step": 7920
},
{
"epoch": 6.327751196172249,
"grad_norm": 3.8217930793762207,
"learning_rate": 3.41436574629852e-05,
"loss": 0.5272,
"step": 7935
},
{
"epoch": 6.339712918660287,
"grad_norm": 3.689328908920288,
"learning_rate": 3.411364545818328e-05,
"loss": 0.5162,
"step": 7950
},
{
"epoch": 6.351674641148326,
"grad_norm": 3.6850223541259766,
"learning_rate": 3.4083633453381355e-05,
"loss": 0.582,
"step": 7965
},
{
"epoch": 6.363636363636363,
"grad_norm": 4.063047885894775,
"learning_rate": 3.405362144857943e-05,
"loss": 0.5642,
"step": 7980
},
{
"epoch": 6.375598086124402,
"grad_norm": 3.6065573692321777,
"learning_rate": 3.402360944377751e-05,
"loss": 0.5225,
"step": 7995
},
{
"epoch": 6.3875598086124405,
"grad_norm": 4.188450336456299,
"learning_rate": 3.3993597438975595e-05,
"loss": 0.5911,
"step": 8010
},
{
"epoch": 6.399521531100478,
"grad_norm": 3.9791886806488037,
"learning_rate": 3.396358543417367e-05,
"loss": 0.5178,
"step": 8025
},
{
"epoch": 6.411483253588517,
"grad_norm": 4.381253719329834,
"learning_rate": 3.393357342937175e-05,
"loss": 0.5344,
"step": 8040
},
{
"epoch": 6.423444976076555,
"grad_norm": 3.810927152633667,
"learning_rate": 3.390356142456983e-05,
"loss": 0.4915,
"step": 8055
},
{
"epoch": 6.435406698564593,
"grad_norm": 4.254152774810791,
"learning_rate": 3.3873549419767907e-05,
"loss": 0.601,
"step": 8070
},
{
"epoch": 6.447368421052632,
"grad_norm": 4.086537837982178,
"learning_rate": 3.3843537414965984e-05,
"loss": 0.5944,
"step": 8085
},
{
"epoch": 6.45933014354067,
"grad_norm": 4.881983280181885,
"learning_rate": 3.381352541016406e-05,
"loss": 0.5789,
"step": 8100
},
{
"epoch": 6.471291866028708,
"grad_norm": 4.15606689453125,
"learning_rate": 3.378351340536215e-05,
"loss": 0.5397,
"step": 8115
},
{
"epoch": 6.483253588516747,
"grad_norm": 3.6769986152648926,
"learning_rate": 3.3753501400560225e-05,
"loss": 0.5449,
"step": 8130
},
{
"epoch": 6.4952153110047846,
"grad_norm": 3.846041440963745,
"learning_rate": 3.372348939575831e-05,
"loss": 0.5555,
"step": 8145
},
{
"epoch": 6.507177033492823,
"grad_norm": 4.353069305419922,
"learning_rate": 3.369347739095639e-05,
"loss": 0.608,
"step": 8160
},
{
"epoch": 6.519138755980861,
"grad_norm": 4.087284564971924,
"learning_rate": 3.3663465386154465e-05,
"loss": 0.5741,
"step": 8175
},
{
"epoch": 6.5311004784688995,
"grad_norm": 4.356995582580566,
"learning_rate": 3.363345338135254e-05,
"loss": 0.6432,
"step": 8190
},
{
"epoch": 6.543062200956938,
"grad_norm": 3.855937957763672,
"learning_rate": 3.360344137655062e-05,
"loss": 0.5783,
"step": 8205
},
{
"epoch": 6.555023923444976,
"grad_norm": 3.820133686065674,
"learning_rate": 3.35734293717487e-05,
"loss": 0.5814,
"step": 8220
},
{
"epoch": 6.5669856459330145,
"grad_norm": 4.873568058013916,
"learning_rate": 3.3543417366946776e-05,
"loss": 0.6264,
"step": 8235
},
{
"epoch": 6.578947368421053,
"grad_norm": 3.8670310974121094,
"learning_rate": 3.351340536214486e-05,
"loss": 0.6271,
"step": 8250
},
{
"epoch": 6.590909090909091,
"grad_norm": 4.838265895843506,
"learning_rate": 3.348339335734294e-05,
"loss": 0.6346,
"step": 8265
},
{
"epoch": 6.6028708133971294,
"grad_norm": 4.0044403076171875,
"learning_rate": 3.345338135254102e-05,
"loss": 0.5724,
"step": 8280
},
{
"epoch": 6.614832535885167,
"grad_norm": 3.866497039794922,
"learning_rate": 3.3423369347739095e-05,
"loss": 0.6172,
"step": 8295
},
{
"epoch": 6.626794258373206,
"grad_norm": 4.213998317718506,
"learning_rate": 3.339335734293718e-05,
"loss": 0.6246,
"step": 8310
},
{
"epoch": 6.638755980861244,
"grad_norm": 4.162674427032471,
"learning_rate": 3.336334533813526e-05,
"loss": 0.6301,
"step": 8325
},
{
"epoch": 6.650717703349282,
"grad_norm": 4.032559394836426,
"learning_rate": 3.3333333333333335e-05,
"loss": 0.6557,
"step": 8340
},
{
"epoch": 6.662679425837321,
"grad_norm": 4.416426658630371,
"learning_rate": 3.330332132853141e-05,
"loss": 0.6592,
"step": 8355
},
{
"epoch": 6.6746411483253585,
"grad_norm": 4.758429527282715,
"learning_rate": 3.32733093237295e-05,
"loss": 0.6753,
"step": 8370
},
{
"epoch": 6.686602870813397,
"grad_norm": 4.513240337371826,
"learning_rate": 3.3243297318927575e-05,
"loss": 0.5713,
"step": 8385
},
{
"epoch": 6.698564593301436,
"grad_norm": 4.007817268371582,
"learning_rate": 3.321328531412565e-05,
"loss": 0.596,
"step": 8400
},
{
"epoch": 6.7105263157894735,
"grad_norm": 4.17065954208374,
"learning_rate": 3.318327330932373e-05,
"loss": 0.5975,
"step": 8415
},
{
"epoch": 6.722488038277512,
"grad_norm": 3.68249773979187,
"learning_rate": 3.315326130452181e-05,
"loss": 0.563,
"step": 8430
},
{
"epoch": 6.73444976076555,
"grad_norm": 4.292535781860352,
"learning_rate": 3.312324929971989e-05,
"loss": 0.6413,
"step": 8445
},
{
"epoch": 6.746411483253588,
"grad_norm": 4.380221843719482,
"learning_rate": 3.3093237294917965e-05,
"loss": 0.5637,
"step": 8460
},
{
"epoch": 6.758373205741627,
"grad_norm": 3.799266815185547,
"learning_rate": 3.306322529011604e-05,
"loss": 0.6653,
"step": 8475
},
{
"epoch": 6.770334928229665,
"grad_norm": 4.119513034820557,
"learning_rate": 3.303321328531413e-05,
"loss": 0.6436,
"step": 8490
},
{
"epoch": 6.782296650717703,
"grad_norm": 4.17624044418335,
"learning_rate": 3.3003201280512205e-05,
"loss": 0.6778,
"step": 8505
},
{
"epoch": 6.794258373205742,
"grad_norm": 4.3085761070251465,
"learning_rate": 3.297318927571029e-05,
"loss": 0.6298,
"step": 8520
},
{
"epoch": 6.80622009569378,
"grad_norm": 3.8202457427978516,
"learning_rate": 3.294317727090837e-05,
"loss": 0.5924,
"step": 8535
},
{
"epoch": 6.818181818181818,
"grad_norm": 4.103767395019531,
"learning_rate": 3.2913165266106445e-05,
"loss": 0.5925,
"step": 8550
},
{
"epoch": 6.830143540669856,
"grad_norm": 4.139376640319824,
"learning_rate": 3.288315326130452e-05,
"loss": 0.6656,
"step": 8565
},
{
"epoch": 6.842105263157895,
"grad_norm": 4.039120674133301,
"learning_rate": 3.28531412565026e-05,
"loss": 0.6807,
"step": 8580
},
{
"epoch": 6.854066985645933,
"grad_norm": 4.153085708618164,
"learning_rate": 3.282312925170068e-05,
"loss": 0.6194,
"step": 8595
},
{
"epoch": 6.866028708133971,
"grad_norm": 4.125678539276123,
"learning_rate": 3.279311724689876e-05,
"loss": 0.6333,
"step": 8610
},
{
"epoch": 6.87799043062201,
"grad_norm": 4.25078821182251,
"learning_rate": 3.276310524209684e-05,
"loss": 0.6895,
"step": 8625
},
{
"epoch": 6.889952153110048,
"grad_norm": 3.782094955444336,
"learning_rate": 3.273309323729492e-05,
"loss": 0.6789,
"step": 8640
},
{
"epoch": 6.901913875598086,
"grad_norm": 4.739928245544434,
"learning_rate": 3.2703081232493e-05,
"loss": 0.6427,
"step": 8655
},
{
"epoch": 6.913875598086125,
"grad_norm": 4.479592800140381,
"learning_rate": 3.2673069227691075e-05,
"loss": 0.6309,
"step": 8670
},
{
"epoch": 6.925837320574162,
"grad_norm": 4.018124580383301,
"learning_rate": 3.264305722288916e-05,
"loss": 0.6828,
"step": 8685
},
{
"epoch": 6.937799043062201,
"grad_norm": 3.8505430221557617,
"learning_rate": 3.261304521808724e-05,
"loss": 0.6361,
"step": 8700
},
{
"epoch": 6.94976076555024,
"grad_norm": 3.596605062484741,
"learning_rate": 3.2583033213285315e-05,
"loss": 0.6297,
"step": 8715
},
{
"epoch": 6.961722488038277,
"grad_norm": 4.318160533905029,
"learning_rate": 3.25530212084834e-05,
"loss": 0.6403,
"step": 8730
},
{
"epoch": 6.973684210526316,
"grad_norm": 4.0697431564331055,
"learning_rate": 3.252300920368148e-05,
"loss": 0.6154,
"step": 8745
},
{
"epoch": 6.985645933014354,
"grad_norm": 4.358625411987305,
"learning_rate": 3.2492997198879555e-05,
"loss": 0.6782,
"step": 8760
},
{
"epoch": 6.997607655502392,
"grad_norm": 4.264054298400879,
"learning_rate": 3.246298519407763e-05,
"loss": 0.6498,
"step": 8775
},
{
"epoch": 7.009569377990431,
"grad_norm": 3.4622254371643066,
"learning_rate": 3.243297318927571e-05,
"loss": 0.4423,
"step": 8790
},
{
"epoch": 7.021531100478469,
"grad_norm": 4.359318733215332,
"learning_rate": 3.240296118447379e-05,
"loss": 0.2942,
"step": 8805
},
{
"epoch": 7.033492822966507,
"grad_norm": 4.986384391784668,
"learning_rate": 3.237294917967187e-05,
"loss": 0.3187,
"step": 8820
},
{
"epoch": 7.045454545454546,
"grad_norm": 4.1987104415893555,
"learning_rate": 3.2342937174869945e-05,
"loss": 0.3306,
"step": 8835
},
{
"epoch": 7.057416267942584,
"grad_norm": 4.6675028800964355,
"learning_rate": 3.231292517006803e-05,
"loss": 0.349,
"step": 8850
},
{
"epoch": 7.069377990430622,
"grad_norm": 4.357269763946533,
"learning_rate": 3.228291316526611e-05,
"loss": 0.3153,
"step": 8865
},
{
"epoch": 7.08133971291866,
"grad_norm": 3.168750762939453,
"learning_rate": 3.225290116046419e-05,
"loss": 0.3223,
"step": 8880
},
{
"epoch": 7.0933014354066986,
"grad_norm": 4.13469934463501,
"learning_rate": 3.222288915566227e-05,
"loss": 0.3289,
"step": 8895
},
{
"epoch": 7.105263157894737,
"grad_norm": 3.306483507156372,
"learning_rate": 3.219287715086035e-05,
"loss": 0.3357,
"step": 8910
},
{
"epoch": 7.117224880382775,
"grad_norm": 3.830190896987915,
"learning_rate": 3.2162865146058425e-05,
"loss": 0.3425,
"step": 8925
},
{
"epoch": 7.1291866028708135,
"grad_norm": 3.848161220550537,
"learning_rate": 3.21328531412565e-05,
"loss": 0.3412,
"step": 8940
},
{
"epoch": 7.141148325358851,
"grad_norm": 4.058409214019775,
"learning_rate": 3.210284113645458e-05,
"loss": 0.3333,
"step": 8955
},
{
"epoch": 7.15311004784689,
"grad_norm": 3.780856132507324,
"learning_rate": 3.2072829131652666e-05,
"loss": 0.3205,
"step": 8970
},
{
"epoch": 7.1650717703349285,
"grad_norm": 3.9334750175476074,
"learning_rate": 3.2042817126850744e-05,
"loss": 0.3546,
"step": 8985
},
{
"epoch": 7.177033492822966,
"grad_norm": 4.092038631439209,
"learning_rate": 3.201280512204882e-05,
"loss": 0.3295,
"step": 9000
},
{
"epoch": 7.188995215311005,
"grad_norm": 4.35646390914917,
"learning_rate": 3.19827931172469e-05,
"loss": 0.3593,
"step": 9015
},
{
"epoch": 7.2009569377990434,
"grad_norm": 3.8881773948669434,
"learning_rate": 3.195278111244498e-05,
"loss": 0.3541,
"step": 9030
},
{
"epoch": 7.212918660287081,
"grad_norm": 4.399089336395264,
"learning_rate": 3.1922769107643055e-05,
"loss": 0.3271,
"step": 9045
},
{
"epoch": 7.22488038277512,
"grad_norm": 4.376395225524902,
"learning_rate": 3.189275710284114e-05,
"loss": 0.4131,
"step": 9060
},
{
"epoch": 7.2368421052631575,
"grad_norm": 4.1286468505859375,
"learning_rate": 3.186274509803922e-05,
"loss": 0.3824,
"step": 9075
},
{
"epoch": 7.248803827751196,
"grad_norm": 4.728172302246094,
"learning_rate": 3.18327330932373e-05,
"loss": 0.3706,
"step": 9090
},
{
"epoch": 7.260765550239235,
"grad_norm": 3.76225209236145,
"learning_rate": 3.180272108843538e-05,
"loss": 0.3568,
"step": 9105
},
{
"epoch": 7.2727272727272725,
"grad_norm": 3.939035415649414,
"learning_rate": 3.177270908363346e-05,
"loss": 0.4092,
"step": 9120
},
{
"epoch": 7.284688995215311,
"grad_norm": 4.537744045257568,
"learning_rate": 3.1742697078831536e-05,
"loss": 0.3606,
"step": 9135
},
{
"epoch": 7.296650717703349,
"grad_norm": 4.309103965759277,
"learning_rate": 3.1712685074029613e-05,
"loss": 0.406,
"step": 9150
},
{
"epoch": 7.3086124401913874,
"grad_norm": 4.298764228820801,
"learning_rate": 3.168267306922769e-05,
"loss": 0.373,
"step": 9165
},
{
"epoch": 7.320574162679426,
"grad_norm": 4.205005645751953,
"learning_rate": 3.165266106442577e-05,
"loss": 0.3567,
"step": 9180
},
{
"epoch": 7.332535885167464,
"grad_norm": 4.051873207092285,
"learning_rate": 3.162264905962385e-05,
"loss": 0.377,
"step": 9195
},
{
"epoch": 7.344497607655502,
"grad_norm": 4.320316314697266,
"learning_rate": 3.159263705482193e-05,
"loss": 0.4071,
"step": 9210
},
{
"epoch": 7.356459330143541,
"grad_norm": 4.617473125457764,
"learning_rate": 3.156262505002001e-05,
"loss": 0.4048,
"step": 9225
},
{
"epoch": 7.368421052631579,
"grad_norm": 4.013522148132324,
"learning_rate": 3.153261304521809e-05,
"loss": 0.3792,
"step": 9240
},
{
"epoch": 7.380382775119617,
"grad_norm": 4.339334487915039,
"learning_rate": 3.150260104041617e-05,
"loss": 0.4172,
"step": 9255
},
{
"epoch": 7.392344497607655,
"grad_norm": 4.555285453796387,
"learning_rate": 3.147258903561425e-05,
"loss": 0.397,
"step": 9270
},
{
"epoch": 7.404306220095694,
"grad_norm": 3.832693576812744,
"learning_rate": 3.144257703081233e-05,
"loss": 0.3784,
"step": 9285
},
{
"epoch": 7.416267942583732,
"grad_norm": 4.14719295501709,
"learning_rate": 3.1412565026010406e-05,
"loss": 0.3979,
"step": 9300
},
{
"epoch": 7.42822966507177,
"grad_norm": 3.914750337600708,
"learning_rate": 3.138255302120848e-05,
"loss": 0.3848,
"step": 9315
},
{
"epoch": 7.440191387559809,
"grad_norm": 4.9536967277526855,
"learning_rate": 3.135254101640656e-05,
"loss": 0.4144,
"step": 9330
},
{
"epoch": 7.452153110047847,
"grad_norm": 4.35673713684082,
"learning_rate": 3.1322529011604646e-05,
"loss": 0.4446,
"step": 9345
},
{
"epoch": 7.464114832535885,
"grad_norm": 4.106342315673828,
"learning_rate": 3.1292517006802724e-05,
"loss": 0.4056,
"step": 9360
},
{
"epoch": 7.476076555023924,
"grad_norm": 4.211533546447754,
"learning_rate": 3.12625050020008e-05,
"loss": 0.4072,
"step": 9375
},
{
"epoch": 7.488038277511961,
"grad_norm": 3.965963840484619,
"learning_rate": 3.123249299719888e-05,
"loss": 0.4329,
"step": 9390
},
{
"epoch": 7.5,
"grad_norm": 4.13434362411499,
"learning_rate": 3.120248099239696e-05,
"loss": 0.4161,
"step": 9405
},
{
"epoch": 7.511961722488039,
"grad_norm": 6.448205947875977,
"learning_rate": 3.1172468987595035e-05,
"loss": 0.3927,
"step": 9420
},
{
"epoch": 7.523923444976076,
"grad_norm": 4.125397682189941,
"learning_rate": 3.114245698279312e-05,
"loss": 0.4021,
"step": 9435
},
{
"epoch": 7.535885167464115,
"grad_norm": 4.477077007293701,
"learning_rate": 3.11124449779912e-05,
"loss": 0.4195,
"step": 9450
},
{
"epoch": 7.547846889952153,
"grad_norm": 3.9981772899627686,
"learning_rate": 3.108243297318928e-05,
"loss": 0.4473,
"step": 9465
},
{
"epoch": 7.559808612440191,
"grad_norm": 4.3731689453125,
"learning_rate": 3.105242096838736e-05,
"loss": 0.4264,
"step": 9480
},
{
"epoch": 7.57177033492823,
"grad_norm": 4.046823501586914,
"learning_rate": 3.102240896358544e-05,
"loss": 0.4151,
"step": 9495
},
{
"epoch": 7.583732057416268,
"grad_norm": 4.526839733123779,
"learning_rate": 3.0992396958783516e-05,
"loss": 0.4426,
"step": 9510
},
{
"epoch": 7.595693779904306,
"grad_norm": 4.215605735778809,
"learning_rate": 3.0962384953981594e-05,
"loss": 0.4376,
"step": 9525
},
{
"epoch": 7.607655502392344,
"grad_norm": 4.018391132354736,
"learning_rate": 3.093237294917967e-05,
"loss": 0.4385,
"step": 9540
},
{
"epoch": 7.619617224880383,
"grad_norm": 5.19038200378418,
"learning_rate": 3.090236094437775e-05,
"loss": 0.4379,
"step": 9555
},
{
"epoch": 7.631578947368421,
"grad_norm": 4.6209611892700195,
"learning_rate": 3.087234893957583e-05,
"loss": 0.4445,
"step": 9570
},
{
"epoch": 7.643540669856459,
"grad_norm": 4.700253486633301,
"learning_rate": 3.084233693477391e-05,
"loss": 0.4309,
"step": 9585
},
{
"epoch": 7.655502392344498,
"grad_norm": 4.6337761878967285,
"learning_rate": 3.081232492997199e-05,
"loss": 0.4256,
"step": 9600
},
{
"epoch": 7.667464114832536,
"grad_norm": 4.5144734382629395,
"learning_rate": 3.078231292517007e-05,
"loss": 0.4685,
"step": 9615
},
{
"epoch": 7.679425837320574,
"grad_norm": 4.41657829284668,
"learning_rate": 3.075230092036815e-05,
"loss": 0.4455,
"step": 9630
},
{
"epoch": 7.6913875598086126,
"grad_norm": 4.547213554382324,
"learning_rate": 3.072228891556623e-05,
"loss": 0.4935,
"step": 9645
},
{
"epoch": 7.703349282296651,
"grad_norm": 4.367729187011719,
"learning_rate": 3.069227691076431e-05,
"loss": 0.4636,
"step": 9660
},
{
"epoch": 7.715311004784689,
"grad_norm": 4.459219932556152,
"learning_rate": 3.0662264905962386e-05,
"loss": 0.4668,
"step": 9675
},
{
"epoch": 7.7272727272727275,
"grad_norm": 4.355218887329102,
"learning_rate": 3.0632252901160464e-05,
"loss": 0.4296,
"step": 9690
},
{
"epoch": 7.739234449760765,
"grad_norm": 3.960000514984131,
"learning_rate": 3.060224089635855e-05,
"loss": 0.4429,
"step": 9705
},
{
"epoch": 7.751196172248804,
"grad_norm": 4.526662349700928,
"learning_rate": 3.0572228891556626e-05,
"loss": 0.4751,
"step": 9720
},
{
"epoch": 7.7631578947368425,
"grad_norm": 4.3358259201049805,
"learning_rate": 3.0542216886754704e-05,
"loss": 0.4885,
"step": 9735
},
{
"epoch": 7.77511961722488,
"grad_norm": 4.190465927124023,
"learning_rate": 3.0512204881952782e-05,
"loss": 0.4633,
"step": 9750
},
{
"epoch": 7.787081339712919,
"grad_norm": 4.320166110992432,
"learning_rate": 3.0482192877150863e-05,
"loss": 0.4926,
"step": 9765
},
{
"epoch": 7.7990430622009566,
"grad_norm": 3.990604877471924,
"learning_rate": 3.045218087234894e-05,
"loss": 0.4516,
"step": 9780
},
{
"epoch": 7.811004784688995,
"grad_norm": 5.037746906280518,
"learning_rate": 3.042216886754702e-05,
"loss": 0.4121,
"step": 9795
},
{
"epoch": 7.822966507177034,
"grad_norm": 5.006950855255127,
"learning_rate": 3.0392156862745097e-05,
"loss": 0.4643,
"step": 9810
},
{
"epoch": 7.8349282296650715,
"grad_norm": 4.678879261016846,
"learning_rate": 3.036214485794318e-05,
"loss": 0.4733,
"step": 9825
},
{
"epoch": 7.84688995215311,
"grad_norm": 4.293395042419434,
"learning_rate": 3.033213285314126e-05,
"loss": 0.4866,
"step": 9840
},
{
"epoch": 7.858851674641148,
"grad_norm": 4.712632656097412,
"learning_rate": 3.0302120848339337e-05,
"loss": 0.4878,
"step": 9855
},
{
"epoch": 7.8708133971291865,
"grad_norm": 4.51541805267334,
"learning_rate": 3.0272108843537418e-05,
"loss": 0.4721,
"step": 9870
},
{
"epoch": 7.882775119617225,
"grad_norm": 4.705857753753662,
"learning_rate": 3.0242096838735496e-05,
"loss": 0.4849,
"step": 9885
},
{
"epoch": 7.894736842105263,
"grad_norm": 4.610105037689209,
"learning_rate": 3.0212084833933574e-05,
"loss": 0.4974,
"step": 9900
},
{
"epoch": 7.9066985645933014,
"grad_norm": 4.228977680206299,
"learning_rate": 3.018207282913165e-05,
"loss": 0.468,
"step": 9915
},
{
"epoch": 7.91866028708134,
"grad_norm": 4.514330863952637,
"learning_rate": 3.015206082432973e-05,
"loss": 0.4857,
"step": 9930
},
{
"epoch": 7.930622009569378,
"grad_norm": 4.639202117919922,
"learning_rate": 3.0122048819527814e-05,
"loss": 0.3874,
"step": 9945
},
{
"epoch": 7.942583732057416,
"grad_norm": 4.870967864990234,
"learning_rate": 3.0092036814725892e-05,
"loss": 0.4849,
"step": 9960
},
{
"epoch": 7.954545454545455,
"grad_norm": 4.402018070220947,
"learning_rate": 3.0062024809923973e-05,
"loss": 0.492,
"step": 9975
},
{
"epoch": 7.966507177033493,
"grad_norm": 4.405611991882324,
"learning_rate": 3.003201280512205e-05,
"loss": 0.4874,
"step": 9990
},
{
"epoch": 7.978468899521531,
"grad_norm": 4.78075647354126,
"learning_rate": 3.000200080032013e-05,
"loss": 0.5089,
"step": 10005
},
{
"epoch": 7.990430622009569,
"grad_norm": 4.583403587341309,
"learning_rate": 2.9971988795518207e-05,
"loss": 0.4791,
"step": 10020
},
{
"epoch": 8.002392344497608,
"grad_norm": 3.6340909004211426,
"learning_rate": 2.9941976790716285e-05,
"loss": 0.4022,
"step": 10035
},
{
"epoch": 8.014354066985646,
"grad_norm": 3.58935809135437,
"learning_rate": 2.9911964785914366e-05,
"loss": 0.2033,
"step": 10050
},
{
"epoch": 8.026315789473685,
"grad_norm": 4.309442520141602,
"learning_rate": 2.988195278111245e-05,
"loss": 0.209,
"step": 10065
},
{
"epoch": 8.038277511961722,
"grad_norm": 3.540694236755371,
"learning_rate": 2.985194077631053e-05,
"loss": 0.2269,
"step": 10080
},
{
"epoch": 8.05023923444976,
"grad_norm": 4.051588535308838,
"learning_rate": 2.9821928771508606e-05,
"loss": 0.225,
"step": 10095
},
{
"epoch": 8.062200956937799,
"grad_norm": 3.8642947673797607,
"learning_rate": 2.9791916766706684e-05,
"loss": 0.2408,
"step": 10110
},
{
"epoch": 8.074162679425838,
"grad_norm": 4.4070539474487305,
"learning_rate": 2.9761904761904762e-05,
"loss": 0.2131,
"step": 10125
},
{
"epoch": 8.086124401913876,
"grad_norm": 3.5634195804595947,
"learning_rate": 2.9731892757102843e-05,
"loss": 0.2253,
"step": 10140
},
{
"epoch": 8.098086124401913,
"grad_norm": 4.4950737953186035,
"learning_rate": 2.970188075230092e-05,
"loss": 0.2438,
"step": 10155
},
{
"epoch": 8.110047846889952,
"grad_norm": 4.489715576171875,
"learning_rate": 2.9671868747499e-05,
"loss": 0.2151,
"step": 10170
},
{
"epoch": 8.12200956937799,
"grad_norm": 4.503179550170898,
"learning_rate": 2.9641856742697083e-05,
"loss": 0.2375,
"step": 10185
},
{
"epoch": 8.133971291866029,
"grad_norm": 4.019615173339844,
"learning_rate": 2.961184473789516e-05,
"loss": 0.253,
"step": 10200
},
{
"epoch": 8.145933014354068,
"grad_norm": 3.398512601852417,
"learning_rate": 2.958183273309324e-05,
"loss": 0.2437,
"step": 10215
},
{
"epoch": 8.157894736842104,
"grad_norm": 2.8724753856658936,
"learning_rate": 2.9551820728291317e-05,
"loss": 0.2236,
"step": 10230
},
{
"epoch": 8.169856459330143,
"grad_norm": 3.7883143424987793,
"learning_rate": 2.9521808723489398e-05,
"loss": 0.2164,
"step": 10245
},
{
"epoch": 8.181818181818182,
"grad_norm": 4.483898639678955,
"learning_rate": 2.9491796718687476e-05,
"loss": 0.231,
"step": 10260
},
{
"epoch": 8.19377990430622,
"grad_norm": 4.909805774688721,
"learning_rate": 2.9461784713885554e-05,
"loss": 0.2511,
"step": 10275
},
{
"epoch": 8.205741626794259,
"grad_norm": 4.415759563446045,
"learning_rate": 2.9431772709083632e-05,
"loss": 0.2259,
"step": 10290
},
{
"epoch": 8.217703349282298,
"grad_norm": 3.9223194122314453,
"learning_rate": 2.9401760704281716e-05,
"loss": 0.2479,
"step": 10305
},
{
"epoch": 8.229665071770334,
"grad_norm": 3.4528160095214844,
"learning_rate": 2.9371748699479794e-05,
"loss": 0.2275,
"step": 10320
},
{
"epoch": 8.241626794258373,
"grad_norm": 4.239967346191406,
"learning_rate": 2.9341736694677872e-05,
"loss": 0.2316,
"step": 10335
},
{
"epoch": 8.253588516746412,
"grad_norm": 4.16427755355835,
"learning_rate": 2.9311724689875953e-05,
"loss": 0.2818,
"step": 10350
},
{
"epoch": 8.26555023923445,
"grad_norm": 4.7562994956970215,
"learning_rate": 2.928171268507403e-05,
"loss": 0.2658,
"step": 10365
},
{
"epoch": 8.277511961722489,
"grad_norm": 4.450767517089844,
"learning_rate": 2.925170068027211e-05,
"loss": 0.2792,
"step": 10380
},
{
"epoch": 8.289473684210526,
"grad_norm": 4.766055583953857,
"learning_rate": 2.9221688675470187e-05,
"loss": 0.2926,
"step": 10395
},
{
"epoch": 8.301435406698564,
"grad_norm": 4.053709030151367,
"learning_rate": 2.9191676670668268e-05,
"loss": 0.2418,
"step": 10410
},
{
"epoch": 8.313397129186603,
"grad_norm": 4.844228267669678,
"learning_rate": 2.916166466586635e-05,
"loss": 0.2426,
"step": 10425
},
{
"epoch": 8.325358851674642,
"grad_norm": 3.6860673427581787,
"learning_rate": 2.913165266106443e-05,
"loss": 0.2542,
"step": 10440
},
{
"epoch": 8.33732057416268,
"grad_norm": 3.938351631164551,
"learning_rate": 2.910164065626251e-05,
"loss": 0.2769,
"step": 10455
},
{
"epoch": 8.349282296650717,
"grad_norm": 4.569359302520752,
"learning_rate": 2.9071628651460586e-05,
"loss": 0.2456,
"step": 10470
},
{
"epoch": 8.361244019138756,
"grad_norm": 3.8243377208709717,
"learning_rate": 2.9041616646658664e-05,
"loss": 0.2666,
"step": 10485
},
{
"epoch": 8.373205741626794,
"grad_norm": 4.553408145904541,
"learning_rate": 2.9011604641856742e-05,
"loss": 0.2891,
"step": 10500
},
{
"epoch": 8.385167464114833,
"grad_norm": 4.640753746032715,
"learning_rate": 2.8981592637054823e-05,
"loss": 0.2912,
"step": 10515
},
{
"epoch": 8.397129186602871,
"grad_norm": 4.968740940093994,
"learning_rate": 2.89515806322529e-05,
"loss": 0.2761,
"step": 10530
},
{
"epoch": 8.409090909090908,
"grad_norm": 4.833539962768555,
"learning_rate": 2.8921568627450986e-05,
"loss": 0.2915,
"step": 10545
},
{
"epoch": 8.421052631578947,
"grad_norm": 4.913358211517334,
"learning_rate": 2.8891556622649064e-05,
"loss": 0.2703,
"step": 10560
},
{
"epoch": 8.433014354066986,
"grad_norm": 3.7276763916015625,
"learning_rate": 2.886154461784714e-05,
"loss": 0.2705,
"step": 10575
},
{
"epoch": 8.444976076555024,
"grad_norm": 4.225296974182129,
"learning_rate": 2.883153261304522e-05,
"loss": 0.2944,
"step": 10590
},
{
"epoch": 8.456937799043063,
"grad_norm": 4.071160793304443,
"learning_rate": 2.8801520608243297e-05,
"loss": 0.3017,
"step": 10605
},
{
"epoch": 8.4688995215311,
"grad_norm": 4.818964958190918,
"learning_rate": 2.877150860344138e-05,
"loss": 0.3057,
"step": 10620
},
{
"epoch": 8.480861244019138,
"grad_norm": 4.391495704650879,
"learning_rate": 2.8741496598639456e-05,
"loss": 0.2854,
"step": 10635
},
{
"epoch": 8.492822966507177,
"grad_norm": 4.263548374176025,
"learning_rate": 2.8711484593837534e-05,
"loss": 0.2604,
"step": 10650
}
],
"logging_steps": 15,
"max_steps": 25000,
"num_input_tokens_seen": 0,
"num_train_epochs": 20,
"save_steps": 15,
"total_flos": 7.844568831858917e+18,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}