diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,12715 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.9999447544334568, + "eval_steps": 500, + "global_step": 9050, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0001104911330865698, + "grad_norm": 2.765625, + "learning_rate": 2.2099447513812156e-07, + "loss": 2.1213, + "step": 1 + }, + { + "epoch": 0.000552455665432849, + "grad_norm": 1.7265625, + "learning_rate": 1.1049723756906078e-06, + "loss": 2.3029, + "step": 5 + }, + { + "epoch": 0.001104911330865698, + "grad_norm": 1.9453125, + "learning_rate": 2.2099447513812157e-06, + "loss": 2.2939, + "step": 10 + }, + { + "epoch": 0.001657366996298547, + "grad_norm": 1.828125, + "learning_rate": 3.314917127071823e-06, + "loss": 2.2427, + "step": 15 + }, + { + "epoch": 0.002209822661731396, + "grad_norm": 2.859375, + "learning_rate": 4.419889502762431e-06, + "loss": 2.0646, + "step": 20 + }, + { + "epoch": 0.002762278327164245, + "grad_norm": 3.65625, + "learning_rate": 5.524861878453038e-06, + "loss": 2.1394, + "step": 25 + }, + { + "epoch": 0.003314733992597094, + "grad_norm": 4.375, + "learning_rate": 6.629834254143646e-06, + "loss": 2.137, + "step": 30 + }, + { + "epoch": 0.003867189658029943, + "grad_norm": 3.890625, + "learning_rate": 7.734806629834254e-06, + "loss": 2.0171, + "step": 35 + }, + { + "epoch": 0.004419645323462792, + "grad_norm": 2.421875, + "learning_rate": 8.839779005524863e-06, + "loss": 1.9479, + "step": 40 + }, + { + "epoch": 0.004972100988895641, + "grad_norm": 1.34375, + "learning_rate": 9.94475138121547e-06, + "loss": 1.847, + "step": 45 + }, + { + "epoch": 0.00552455665432849, + "grad_norm": 0.97265625, + "learning_rate": 1.1049723756906077e-05, + "loss": 1.8616, + "step": 50 + }, + { + "epoch": 0.0060770123197613395, + "grad_norm": 0.91796875, + "learning_rate": 1.2154696132596685e-05, + "loss": 1.761, + "step": 55 + }, + { + "epoch": 0.006629467985194188, + "grad_norm": 0.81640625, + "learning_rate": 1.3259668508287292e-05, + "loss": 1.7377, + "step": 60 + }, + { + "epoch": 0.007181923650627037, + "grad_norm": 0.71875, + "learning_rate": 1.4364640883977901e-05, + "loss": 1.6858, + "step": 65 + }, + { + "epoch": 0.007734379316059886, + "grad_norm": 0.73828125, + "learning_rate": 1.5469613259668508e-05, + "loss": 1.6879, + "step": 70 + }, + { + "epoch": 0.008286834981492736, + "grad_norm": 0.55859375, + "learning_rate": 1.6574585635359117e-05, + "loss": 1.6719, + "step": 75 + }, + { + "epoch": 0.008839290646925584, + "grad_norm": 0.53125, + "learning_rate": 1.7679558011049725e-05, + "loss": 1.6673, + "step": 80 + }, + { + "epoch": 0.009391746312358434, + "grad_norm": 0.3984375, + "learning_rate": 1.878453038674033e-05, + "loss": 1.5939, + "step": 85 + }, + { + "epoch": 0.009944201977791282, + "grad_norm": 0.55859375, + "learning_rate": 1.988950276243094e-05, + "loss": 1.4704, + "step": 90 + }, + { + "epoch": 0.010496657643224131, + "grad_norm": 1.203125, + "learning_rate": 2.0994475138121548e-05, + "loss": 1.6011, + "step": 95 + }, + { + "epoch": 0.01104911330865698, + "grad_norm": 0.462890625, + "learning_rate": 2.2099447513812153e-05, + "loss": 1.5358, + "step": 100 + }, + { + "epoch": 0.01160156897408983, + "grad_norm": 0.466796875, + "learning_rate": 2.3204419889502762e-05, + "loss": 1.5575, + "step": 105 + }, + { + "epoch": 0.012154024639522679, + "grad_norm": 0.3984375, + "learning_rate": 2.430939226519337e-05, + "loss": 1.5238, + "step": 110 + }, + { + "epoch": 0.012706480304955527, + "grad_norm": 0.365234375, + "learning_rate": 2.541436464088398e-05, + "loss": 1.5489, + "step": 115 + }, + { + "epoch": 0.013258935970388377, + "grad_norm": 0.4140625, + "learning_rate": 2.6519337016574585e-05, + "loss": 1.5478, + "step": 120 + }, + { + "epoch": 0.013811391635821225, + "grad_norm": 0.373046875, + "learning_rate": 2.7624309392265197e-05, + "loss": 1.4958, + "step": 125 + }, + { + "epoch": 0.014363847301254075, + "grad_norm": 0.37109375, + "learning_rate": 2.8729281767955802e-05, + "loss": 1.5456, + "step": 130 + }, + { + "epoch": 0.014916302966686923, + "grad_norm": 0.3984375, + "learning_rate": 2.983425414364641e-05, + "loss": 1.48, + "step": 135 + }, + { + "epoch": 0.015468758632119772, + "grad_norm": 0.384765625, + "learning_rate": 3.0939226519337016e-05, + "loss": 1.5851, + "step": 140 + }, + { + "epoch": 0.01602121429755262, + "grad_norm": 0.39453125, + "learning_rate": 3.2044198895027625e-05, + "loss": 1.5424, + "step": 145 + }, + { + "epoch": 0.016573669962985472, + "grad_norm": 0.361328125, + "learning_rate": 3.3149171270718233e-05, + "loss": 1.5361, + "step": 150 + }, + { + "epoch": 0.01712612562841832, + "grad_norm": 0.375, + "learning_rate": 3.425414364640884e-05, + "loss": 1.5748, + "step": 155 + }, + { + "epoch": 0.017678581293851168, + "grad_norm": 0.392578125, + "learning_rate": 3.535911602209945e-05, + "loss": 1.4981, + "step": 160 + }, + { + "epoch": 0.018231036959284016, + "grad_norm": 0.388671875, + "learning_rate": 3.646408839779006e-05, + "loss": 1.4862, + "step": 165 + }, + { + "epoch": 0.018783492624716867, + "grad_norm": 0.384765625, + "learning_rate": 3.756906077348066e-05, + "loss": 1.4597, + "step": 170 + }, + { + "epoch": 0.019335948290149715, + "grad_norm": 0.45703125, + "learning_rate": 3.867403314917128e-05, + "loss": 1.5168, + "step": 175 + }, + { + "epoch": 0.019888403955582563, + "grad_norm": 0.416015625, + "learning_rate": 3.977900552486188e-05, + "loss": 1.4759, + "step": 180 + }, + { + "epoch": 0.020440859621015415, + "grad_norm": 0.421875, + "learning_rate": 4.088397790055249e-05, + "loss": 1.511, + "step": 185 + }, + { + "epoch": 0.020993315286448263, + "grad_norm": 0.400390625, + "learning_rate": 4.1988950276243096e-05, + "loss": 1.4943, + "step": 190 + }, + { + "epoch": 0.02154577095188111, + "grad_norm": 0.400390625, + "learning_rate": 4.3093922651933705e-05, + "loss": 1.472, + "step": 195 + }, + { + "epoch": 0.02209822661731396, + "grad_norm": 0.4453125, + "learning_rate": 4.419889502762431e-05, + "loss": 1.4631, + "step": 200 + }, + { + "epoch": 0.02265068228274681, + "grad_norm": 0.4375, + "learning_rate": 4.530386740331492e-05, + "loss": 1.5233, + "step": 205 + }, + { + "epoch": 0.02320313794817966, + "grad_norm": 0.482421875, + "learning_rate": 4.6408839779005524e-05, + "loss": 1.5071, + "step": 210 + }, + { + "epoch": 0.023755593613612506, + "grad_norm": 0.44921875, + "learning_rate": 4.751381215469613e-05, + "loss": 1.4828, + "step": 215 + }, + { + "epoch": 0.024308049279045358, + "grad_norm": 0.4609375, + "learning_rate": 4.861878453038674e-05, + "loss": 1.4937, + "step": 220 + }, + { + "epoch": 0.024860504944478206, + "grad_norm": 0.458984375, + "learning_rate": 4.972375690607735e-05, + "loss": 1.6039, + "step": 225 + }, + { + "epoch": 0.025412960609911054, + "grad_norm": 0.421875, + "learning_rate": 5.082872928176796e-05, + "loss": 1.4386, + "step": 230 + }, + { + "epoch": 0.025965416275343902, + "grad_norm": 0.451171875, + "learning_rate": 5.193370165745857e-05, + "loss": 1.5466, + "step": 235 + }, + { + "epoch": 0.026517871940776754, + "grad_norm": 0.54296875, + "learning_rate": 5.303867403314917e-05, + "loss": 1.4743, + "step": 240 + }, + { + "epoch": 0.0270703276062096, + "grad_norm": 0.455078125, + "learning_rate": 5.414364640883978e-05, + "loss": 1.4524, + "step": 245 + }, + { + "epoch": 0.02762278327164245, + "grad_norm": 0.427734375, + "learning_rate": 5.5248618784530394e-05, + "loss": 1.4057, + "step": 250 + }, + { + "epoch": 0.0281752389370753, + "grad_norm": 0.453125, + "learning_rate": 5.6353591160220996e-05, + "loss": 1.4469, + "step": 255 + }, + { + "epoch": 0.02872769460250815, + "grad_norm": 0.4921875, + "learning_rate": 5.7458563535911604e-05, + "loss": 1.4177, + "step": 260 + }, + { + "epoch": 0.029280150267940997, + "grad_norm": 0.5078125, + "learning_rate": 5.8563535911602206e-05, + "loss": 1.4479, + "step": 265 + }, + { + "epoch": 0.029832605933373845, + "grad_norm": 0.47265625, + "learning_rate": 5.966850828729282e-05, + "loss": 1.427, + "step": 270 + }, + { + "epoch": 0.030385061598806697, + "grad_norm": 0.470703125, + "learning_rate": 6.077348066298343e-05, + "loss": 1.4548, + "step": 275 + }, + { + "epoch": 0.030937517264239545, + "grad_norm": 0.486328125, + "learning_rate": 6.187845303867403e-05, + "loss": 1.5069, + "step": 280 + }, + { + "epoch": 0.03148997292967239, + "grad_norm": 0.46484375, + "learning_rate": 6.298342541436464e-05, + "loss": 1.4171, + "step": 285 + }, + { + "epoch": 0.03204242859510524, + "grad_norm": 0.462890625, + "learning_rate": 6.408839779005525e-05, + "loss": 1.4631, + "step": 290 + }, + { + "epoch": 0.03259488426053809, + "grad_norm": 0.466796875, + "learning_rate": 6.519337016574586e-05, + "loss": 1.4806, + "step": 295 + }, + { + "epoch": 0.033147339925970944, + "grad_norm": 0.478515625, + "learning_rate": 6.629834254143647e-05, + "loss": 1.4121, + "step": 300 + }, + { + "epoch": 0.03369979559140379, + "grad_norm": 0.490234375, + "learning_rate": 6.740331491712708e-05, + "loss": 1.4418, + "step": 305 + }, + { + "epoch": 0.03425225125683664, + "grad_norm": 0.5, + "learning_rate": 6.850828729281768e-05, + "loss": 1.4879, + "step": 310 + }, + { + "epoch": 0.03480470692226949, + "grad_norm": 0.4765625, + "learning_rate": 6.961325966850829e-05, + "loss": 1.4641, + "step": 315 + }, + { + "epoch": 0.035357162587702336, + "grad_norm": 0.4765625, + "learning_rate": 7.07182320441989e-05, + "loss": 1.495, + "step": 320 + }, + { + "epoch": 0.035909618253135184, + "grad_norm": 0.474609375, + "learning_rate": 7.182320441988951e-05, + "loss": 1.4523, + "step": 325 + }, + { + "epoch": 0.03646207391856803, + "grad_norm": 0.46875, + "learning_rate": 7.292817679558012e-05, + "loss": 1.4367, + "step": 330 + }, + { + "epoch": 0.03701452958400089, + "grad_norm": 0.451171875, + "learning_rate": 7.403314917127073e-05, + "loss": 1.47, + "step": 335 + }, + { + "epoch": 0.037566985249433735, + "grad_norm": 0.486328125, + "learning_rate": 7.513812154696132e-05, + "loss": 1.5092, + "step": 340 + }, + { + "epoch": 0.03811944091486658, + "grad_norm": 0.63671875, + "learning_rate": 7.624309392265195e-05, + "loss": 1.4718, + "step": 345 + }, + { + "epoch": 0.03867189658029943, + "grad_norm": 0.423828125, + "learning_rate": 7.734806629834255e-05, + "loss": 1.4461, + "step": 350 + }, + { + "epoch": 0.03922435224573228, + "grad_norm": 0.458984375, + "learning_rate": 7.845303867403315e-05, + "loss": 1.4724, + "step": 355 + }, + { + "epoch": 0.03977680791116513, + "grad_norm": 0.47265625, + "learning_rate": 7.955801104972376e-05, + "loss": 1.4095, + "step": 360 + }, + { + "epoch": 0.040329263576597975, + "grad_norm": 0.494140625, + "learning_rate": 8.066298342541438e-05, + "loss": 1.4511, + "step": 365 + }, + { + "epoch": 0.04088171924203083, + "grad_norm": 0.44140625, + "learning_rate": 8.176795580110498e-05, + "loss": 1.49, + "step": 370 + }, + { + "epoch": 0.04143417490746368, + "grad_norm": 0.458984375, + "learning_rate": 8.287292817679558e-05, + "loss": 1.4709, + "step": 375 + }, + { + "epoch": 0.041986630572896526, + "grad_norm": 0.447265625, + "learning_rate": 8.397790055248619e-05, + "loss": 1.46, + "step": 380 + }, + { + "epoch": 0.042539086238329374, + "grad_norm": 0.435546875, + "learning_rate": 8.50828729281768e-05, + "loss": 1.5146, + "step": 385 + }, + { + "epoch": 0.04309154190376222, + "grad_norm": 0.4765625, + "learning_rate": 8.618784530386741e-05, + "loss": 1.4947, + "step": 390 + }, + { + "epoch": 0.04364399756919507, + "grad_norm": 0.447265625, + "learning_rate": 8.729281767955802e-05, + "loss": 1.4608, + "step": 395 + }, + { + "epoch": 0.04419645323462792, + "grad_norm": 0.443359375, + "learning_rate": 8.839779005524861e-05, + "loss": 1.3874, + "step": 400 + }, + { + "epoch": 0.04474890890006077, + "grad_norm": 0.447265625, + "learning_rate": 8.950276243093924e-05, + "loss": 1.4436, + "step": 405 + }, + { + "epoch": 0.04530136456549362, + "grad_norm": 0.458984375, + "learning_rate": 9.060773480662984e-05, + "loss": 1.3886, + "step": 410 + }, + { + "epoch": 0.04585382023092647, + "grad_norm": 0.4375, + "learning_rate": 9.171270718232044e-05, + "loss": 1.4823, + "step": 415 + }, + { + "epoch": 0.04640627589635932, + "grad_norm": 0.431640625, + "learning_rate": 9.281767955801105e-05, + "loss": 1.4335, + "step": 420 + }, + { + "epoch": 0.046958731561792165, + "grad_norm": 0.4453125, + "learning_rate": 9.392265193370167e-05, + "loss": 1.4415, + "step": 425 + }, + { + "epoch": 0.04751118722722501, + "grad_norm": 0.4765625, + "learning_rate": 9.502762430939227e-05, + "loss": 1.4289, + "step": 430 + }, + { + "epoch": 0.04806364289265786, + "grad_norm": 0.4609375, + "learning_rate": 9.613259668508287e-05, + "loss": 1.4191, + "step": 435 + }, + { + "epoch": 0.048616098558090716, + "grad_norm": 0.451171875, + "learning_rate": 9.723756906077348e-05, + "loss": 1.3955, + "step": 440 + }, + { + "epoch": 0.049168554223523564, + "grad_norm": 0.427734375, + "learning_rate": 9.834254143646409e-05, + "loss": 1.4465, + "step": 445 + }, + { + "epoch": 0.04972100988895641, + "grad_norm": 0.5, + "learning_rate": 9.94475138121547e-05, + "loss": 1.4598, + "step": 450 + }, + { + "epoch": 0.05027346555438926, + "grad_norm": 0.42578125, + "learning_rate": 0.00010055248618784532, + "loss": 1.4326, + "step": 455 + }, + { + "epoch": 0.05082592121982211, + "grad_norm": 0.439453125, + "learning_rate": 0.00010165745856353592, + "loss": 1.3476, + "step": 460 + }, + { + "epoch": 0.051378376885254956, + "grad_norm": 0.416015625, + "learning_rate": 0.00010276243093922653, + "loss": 1.4352, + "step": 465 + }, + { + "epoch": 0.051930832550687804, + "grad_norm": 0.419921875, + "learning_rate": 0.00010386740331491714, + "loss": 1.4684, + "step": 470 + }, + { + "epoch": 0.05248328821612066, + "grad_norm": 0.453125, + "learning_rate": 0.00010497237569060774, + "loss": 1.441, + "step": 475 + }, + { + "epoch": 0.05303574388155351, + "grad_norm": 0.453125, + "learning_rate": 0.00010607734806629834, + "loss": 1.4258, + "step": 480 + }, + { + "epoch": 0.053588199546986355, + "grad_norm": 0.4453125, + "learning_rate": 0.00010718232044198895, + "loss": 1.419, + "step": 485 + }, + { + "epoch": 0.0541406552124192, + "grad_norm": 0.43359375, + "learning_rate": 0.00010828729281767956, + "loss": 1.3802, + "step": 490 + }, + { + "epoch": 0.05469311087785205, + "grad_norm": 0.4296875, + "learning_rate": 0.00010939226519337018, + "loss": 1.3694, + "step": 495 + }, + { + "epoch": 0.0552455665432849, + "grad_norm": 0.416015625, + "learning_rate": 0.00011049723756906079, + "loss": 1.4704, + "step": 500 + }, + { + "epoch": 0.05579802220871775, + "grad_norm": 0.423828125, + "learning_rate": 0.0001116022099447514, + "loss": 1.3262, + "step": 505 + }, + { + "epoch": 0.0563504778741506, + "grad_norm": 0.439453125, + "learning_rate": 0.00011270718232044199, + "loss": 1.4695, + "step": 510 + }, + { + "epoch": 0.05690293353958345, + "grad_norm": 0.451171875, + "learning_rate": 0.0001138121546961326, + "loss": 1.4465, + "step": 515 + }, + { + "epoch": 0.0574553892050163, + "grad_norm": 0.419921875, + "learning_rate": 0.00011491712707182321, + "loss": 1.4442, + "step": 520 + }, + { + "epoch": 0.058007844870449146, + "grad_norm": 0.40625, + "learning_rate": 0.0001160220994475138, + "loss": 1.4143, + "step": 525 + }, + { + "epoch": 0.058560300535881994, + "grad_norm": 0.39453125, + "learning_rate": 0.00011712707182320441, + "loss": 1.4964, + "step": 530 + }, + { + "epoch": 0.05911275620131484, + "grad_norm": 0.404296875, + "learning_rate": 0.00011823204419889505, + "loss": 1.4598, + "step": 535 + }, + { + "epoch": 0.05966521186674769, + "grad_norm": 0.400390625, + "learning_rate": 0.00011933701657458564, + "loss": 1.348, + "step": 540 + }, + { + "epoch": 0.060217667532180545, + "grad_norm": 0.388671875, + "learning_rate": 0.00012044198895027625, + "loss": 1.3783, + "step": 545 + }, + { + "epoch": 0.06077012319761339, + "grad_norm": 0.40234375, + "learning_rate": 0.00012154696132596686, + "loss": 1.3765, + "step": 550 + }, + { + "epoch": 0.06132257886304624, + "grad_norm": 0.412109375, + "learning_rate": 0.00012265193370165746, + "loss": 1.4892, + "step": 555 + }, + { + "epoch": 0.06187503452847909, + "grad_norm": 1.4765625, + "learning_rate": 0.00012375690607734806, + "loss": 1.4185, + "step": 560 + }, + { + "epoch": 0.06242749019391194, + "grad_norm": 0.44140625, + "learning_rate": 0.00012486187845303867, + "loss": 1.3789, + "step": 565 + }, + { + "epoch": 0.06297994585934479, + "grad_norm": 0.40625, + "learning_rate": 0.00012596685082872928, + "loss": 1.4294, + "step": 570 + }, + { + "epoch": 0.06353240152477764, + "grad_norm": 0.40625, + "learning_rate": 0.00012707182320441992, + "loss": 1.3654, + "step": 575 + }, + { + "epoch": 0.06408485719021048, + "grad_norm": 0.41015625, + "learning_rate": 0.0001281767955801105, + "loss": 1.3842, + "step": 580 + }, + { + "epoch": 0.06463731285564334, + "grad_norm": 0.41015625, + "learning_rate": 0.0001292817679558011, + "loss": 1.422, + "step": 585 + }, + { + "epoch": 0.06518976852107618, + "grad_norm": 0.404296875, + "learning_rate": 0.00013038674033149172, + "loss": 1.4607, + "step": 590 + }, + { + "epoch": 0.06574222418650903, + "grad_norm": 0.388671875, + "learning_rate": 0.00013149171270718233, + "loss": 1.4292, + "step": 595 + }, + { + "epoch": 0.06629467985194189, + "grad_norm": 0.4140625, + "learning_rate": 0.00013259668508287293, + "loss": 1.4353, + "step": 600 + }, + { + "epoch": 0.06684713551737473, + "grad_norm": 0.392578125, + "learning_rate": 0.00013370165745856354, + "loss": 1.4552, + "step": 605 + }, + { + "epoch": 0.06739959118280758, + "grad_norm": 0.416015625, + "learning_rate": 0.00013480662983425415, + "loss": 1.4496, + "step": 610 + }, + { + "epoch": 0.06795204684824042, + "grad_norm": 0.38671875, + "learning_rate": 0.00013591160220994476, + "loss": 1.4456, + "step": 615 + }, + { + "epoch": 0.06850450251367328, + "grad_norm": 0.375, + "learning_rate": 0.00013701657458563537, + "loss": 1.4358, + "step": 620 + }, + { + "epoch": 0.06905695817910612, + "grad_norm": 0.416015625, + "learning_rate": 0.00013812154696132598, + "loss": 1.473, + "step": 625 + }, + { + "epoch": 0.06960941384453898, + "grad_norm": 0.361328125, + "learning_rate": 0.00013922651933701659, + "loss": 1.3869, + "step": 630 + }, + { + "epoch": 0.07016186950997183, + "grad_norm": 0.453125, + "learning_rate": 0.0001403314917127072, + "loss": 1.3762, + "step": 635 + }, + { + "epoch": 0.07071432517540467, + "grad_norm": 0.412109375, + "learning_rate": 0.0001414364640883978, + "loss": 1.3875, + "step": 640 + }, + { + "epoch": 0.07126678084083753, + "grad_norm": 0.384765625, + "learning_rate": 0.00014254143646408839, + "loss": 1.3839, + "step": 645 + }, + { + "epoch": 0.07181923650627037, + "grad_norm": 0.408203125, + "learning_rate": 0.00014364640883977902, + "loss": 1.4395, + "step": 650 + }, + { + "epoch": 0.07237169217170322, + "grad_norm": 0.39453125, + "learning_rate": 0.00014475138121546963, + "loss": 1.4761, + "step": 655 + }, + { + "epoch": 0.07292414783713606, + "grad_norm": 0.373046875, + "learning_rate": 0.00014585635359116024, + "loss": 1.4714, + "step": 660 + }, + { + "epoch": 0.07347660350256892, + "grad_norm": 0.375, + "learning_rate": 0.00014696132596685085, + "loss": 1.403, + "step": 665 + }, + { + "epoch": 0.07402905916800177, + "grad_norm": 0.7734375, + "learning_rate": 0.00014806629834254146, + "loss": 1.4563, + "step": 670 + }, + { + "epoch": 0.07458151483343461, + "grad_norm": 0.41015625, + "learning_rate": 0.00014917127071823204, + "loss": 1.4246, + "step": 675 + }, + { + "epoch": 0.07513397049886747, + "grad_norm": 0.3828125, + "learning_rate": 0.00015027624309392265, + "loss": 1.4217, + "step": 680 + }, + { + "epoch": 0.07568642616430031, + "grad_norm": 0.39453125, + "learning_rate": 0.00015138121546961325, + "loss": 1.4074, + "step": 685 + }, + { + "epoch": 0.07623888182973317, + "grad_norm": 0.369140625, + "learning_rate": 0.0001524861878453039, + "loss": 1.4298, + "step": 690 + }, + { + "epoch": 0.076791337495166, + "grad_norm": 0.37109375, + "learning_rate": 0.0001535911602209945, + "loss": 1.4207, + "step": 695 + }, + { + "epoch": 0.07734379316059886, + "grad_norm": 0.36328125, + "learning_rate": 0.0001546961325966851, + "loss": 1.4175, + "step": 700 + }, + { + "epoch": 0.07789624882603172, + "grad_norm": 0.36328125, + "learning_rate": 0.0001558011049723757, + "loss": 1.4153, + "step": 705 + }, + { + "epoch": 0.07844870449146456, + "grad_norm": 0.384765625, + "learning_rate": 0.0001569060773480663, + "loss": 1.2493, + "step": 710 + }, + { + "epoch": 0.07900116015689741, + "grad_norm": 0.369140625, + "learning_rate": 0.0001580110497237569, + "loss": 1.2664, + "step": 715 + }, + { + "epoch": 0.07955361582233025, + "grad_norm": 0.380859375, + "learning_rate": 0.00015911602209944752, + "loss": 1.3678, + "step": 720 + }, + { + "epoch": 0.08010607148776311, + "grad_norm": 0.380859375, + "learning_rate": 0.00016022099447513812, + "loss": 1.395, + "step": 725 + }, + { + "epoch": 0.08065852715319595, + "grad_norm": 0.361328125, + "learning_rate": 0.00016132596685082876, + "loss": 1.4848, + "step": 730 + }, + { + "epoch": 0.0812109828186288, + "grad_norm": 0.392578125, + "learning_rate": 0.00016243093922651934, + "loss": 1.303, + "step": 735 + }, + { + "epoch": 0.08176343848406166, + "grad_norm": 0.380859375, + "learning_rate": 0.00016353591160220995, + "loss": 1.3469, + "step": 740 + }, + { + "epoch": 0.0823158941494945, + "grad_norm": 0.37890625, + "learning_rate": 0.00016464088397790056, + "loss": 1.407, + "step": 745 + }, + { + "epoch": 0.08286834981492736, + "grad_norm": 0.396484375, + "learning_rate": 0.00016574585635359117, + "loss": 1.4167, + "step": 750 + }, + { + "epoch": 0.0834208054803602, + "grad_norm": 0.375, + "learning_rate": 0.00016685082872928178, + "loss": 1.3591, + "step": 755 + }, + { + "epoch": 0.08397326114579305, + "grad_norm": 0.37109375, + "learning_rate": 0.00016795580110497238, + "loss": 1.4427, + "step": 760 + }, + { + "epoch": 0.08452571681122589, + "grad_norm": 0.373046875, + "learning_rate": 0.000169060773480663, + "loss": 1.407, + "step": 765 + }, + { + "epoch": 0.08507817247665875, + "grad_norm": 0.392578125, + "learning_rate": 0.0001701657458563536, + "loss": 1.4429, + "step": 770 + }, + { + "epoch": 0.0856306281420916, + "grad_norm": 0.3828125, + "learning_rate": 0.0001712707182320442, + "loss": 1.3577, + "step": 775 + }, + { + "epoch": 0.08618308380752444, + "grad_norm": 0.353515625, + "learning_rate": 0.00017237569060773482, + "loss": 1.3686, + "step": 780 + }, + { + "epoch": 0.0867355394729573, + "grad_norm": 0.369140625, + "learning_rate": 0.00017348066298342543, + "loss": 1.3666, + "step": 785 + }, + { + "epoch": 0.08728799513839014, + "grad_norm": 0.361328125, + "learning_rate": 0.00017458563535911604, + "loss": 1.4079, + "step": 790 + }, + { + "epoch": 0.087840450803823, + "grad_norm": 0.353515625, + "learning_rate": 0.00017569060773480665, + "loss": 1.4109, + "step": 795 + }, + { + "epoch": 0.08839290646925584, + "grad_norm": 0.384765625, + "learning_rate": 0.00017679558011049723, + "loss": 1.3654, + "step": 800 + }, + { + "epoch": 0.08894536213468869, + "grad_norm": 0.34765625, + "learning_rate": 0.00017790055248618784, + "loss": 1.4313, + "step": 805 + }, + { + "epoch": 0.08949781780012155, + "grad_norm": 0.384765625, + "learning_rate": 0.00017900552486187847, + "loss": 1.3809, + "step": 810 + }, + { + "epoch": 0.09005027346555439, + "grad_norm": 0.376953125, + "learning_rate": 0.00018011049723756908, + "loss": 1.4881, + "step": 815 + }, + { + "epoch": 0.09060272913098724, + "grad_norm": 0.3671875, + "learning_rate": 0.0001812154696132597, + "loss": 1.4934, + "step": 820 + }, + { + "epoch": 0.09115518479642008, + "grad_norm": 0.36328125, + "learning_rate": 0.0001823204419889503, + "loss": 1.4283, + "step": 825 + }, + { + "epoch": 0.09170764046185294, + "grad_norm": 0.375, + "learning_rate": 0.00018342541436464088, + "loss": 1.3682, + "step": 830 + }, + { + "epoch": 0.09226009612728578, + "grad_norm": 0.369140625, + "learning_rate": 0.0001845303867403315, + "loss": 1.3781, + "step": 835 + }, + { + "epoch": 0.09281255179271863, + "grad_norm": 0.39453125, + "learning_rate": 0.0001856353591160221, + "loss": 1.4479, + "step": 840 + }, + { + "epoch": 0.09336500745815149, + "grad_norm": 0.41796875, + "learning_rate": 0.0001867403314917127, + "loss": 1.4122, + "step": 845 + }, + { + "epoch": 0.09391746312358433, + "grad_norm": 0.37109375, + "learning_rate": 0.00018784530386740334, + "loss": 1.3943, + "step": 850 + }, + { + "epoch": 0.09446991878901718, + "grad_norm": 0.40625, + "learning_rate": 0.00018895027624309395, + "loss": 1.3659, + "step": 855 + }, + { + "epoch": 0.09502237445445003, + "grad_norm": 0.359375, + "learning_rate": 0.00019005524861878453, + "loss": 1.3628, + "step": 860 + }, + { + "epoch": 0.09557483011988288, + "grad_norm": 0.357421875, + "learning_rate": 0.00019116022099447514, + "loss": 1.3492, + "step": 865 + }, + { + "epoch": 0.09612728578531572, + "grad_norm": 0.359375, + "learning_rate": 0.00019226519337016575, + "loss": 1.4476, + "step": 870 + }, + { + "epoch": 0.09667974145074858, + "grad_norm": 0.37890625, + "learning_rate": 0.00019337016574585636, + "loss": 1.3911, + "step": 875 + }, + { + "epoch": 0.09723219711618143, + "grad_norm": 0.376953125, + "learning_rate": 0.00019447513812154697, + "loss": 1.3648, + "step": 880 + }, + { + "epoch": 0.09778465278161427, + "grad_norm": 0.361328125, + "learning_rate": 0.00019558011049723757, + "loss": 1.3301, + "step": 885 + }, + { + "epoch": 0.09833710844704713, + "grad_norm": 0.376953125, + "learning_rate": 0.00019668508287292818, + "loss": 1.4445, + "step": 890 + }, + { + "epoch": 0.09888956411247997, + "grad_norm": 0.390625, + "learning_rate": 0.0001977900552486188, + "loss": 1.4455, + "step": 895 + }, + { + "epoch": 0.09944201977791282, + "grad_norm": 0.37109375, + "learning_rate": 0.0001988950276243094, + "loss": 1.3361, + "step": 900 + }, + { + "epoch": 0.09999447544334567, + "grad_norm": 0.359375, + "learning_rate": 0.0002, + "loss": 1.4043, + "step": 905 + }, + { + "epoch": 0.10054693110877852, + "grad_norm": 0.369140625, + "learning_rate": 0.00019999981403661345, + "loss": 1.375, + "step": 910 + }, + { + "epoch": 0.10109938677421137, + "grad_norm": 0.359375, + "learning_rate": 0.00019999925614714537, + "loss": 1.3785, + "step": 915 + }, + { + "epoch": 0.10165184243964422, + "grad_norm": 0.349609375, + "learning_rate": 0.00019999832633367076, + "loss": 1.2666, + "step": 920 + }, + { + "epoch": 0.10220429810507707, + "grad_norm": 0.361328125, + "learning_rate": 0.0001999970245996478, + "loss": 1.4409, + "step": 925 + }, + { + "epoch": 0.10275675377050991, + "grad_norm": 0.3515625, + "learning_rate": 0.00019999535094991798, + "loss": 1.3618, + "step": 930 + }, + { + "epoch": 0.10330920943594277, + "grad_norm": 0.36328125, + "learning_rate": 0.00019999330539070617, + "loss": 1.4307, + "step": 935 + }, + { + "epoch": 0.10386166510137561, + "grad_norm": 0.3515625, + "learning_rate": 0.00019999088792962017, + "loss": 1.2785, + "step": 940 + }, + { + "epoch": 0.10441412076680846, + "grad_norm": 0.37109375, + "learning_rate": 0.00019998809857565131, + "loss": 1.3679, + "step": 945 + }, + { + "epoch": 0.10496657643224132, + "grad_norm": 0.38671875, + "learning_rate": 0.00019998493733917384, + "loss": 1.3698, + "step": 950 + }, + { + "epoch": 0.10551903209767416, + "grad_norm": 0.3515625, + "learning_rate": 0.00019998140423194534, + "loss": 1.3474, + "step": 955 + }, + { + "epoch": 0.10607148776310701, + "grad_norm": 0.373046875, + "learning_rate": 0.0001999774992671063, + "loss": 1.4296, + "step": 960 + }, + { + "epoch": 0.10662394342853986, + "grad_norm": 0.37109375, + "learning_rate": 0.0001999732224591804, + "loss": 1.3559, + "step": 965 + }, + { + "epoch": 0.10717639909397271, + "grad_norm": 0.412109375, + "learning_rate": 0.0001999685738240742, + "loss": 1.41, + "step": 970 + }, + { + "epoch": 0.10772885475940555, + "grad_norm": 0.388671875, + "learning_rate": 0.00019996355337907718, + "loss": 1.3789, + "step": 975 + }, + { + "epoch": 0.1082813104248384, + "grad_norm": 0.365234375, + "learning_rate": 0.00019995816114286178, + "loss": 1.4477, + "step": 980 + }, + { + "epoch": 0.10883376609027126, + "grad_norm": 0.349609375, + "learning_rate": 0.00019995239713548318, + "loss": 1.4113, + "step": 985 + }, + { + "epoch": 0.1093862217557041, + "grad_norm": 0.38671875, + "learning_rate": 0.00019994626137837917, + "loss": 1.3787, + "step": 990 + }, + { + "epoch": 0.10993867742113696, + "grad_norm": 0.390625, + "learning_rate": 0.00019993975389437038, + "loss": 1.3096, + "step": 995 + }, + { + "epoch": 0.1104911330865698, + "grad_norm": 0.33984375, + "learning_rate": 0.00019993287470765984, + "loss": 1.359, + "step": 1000 + }, + { + "epoch": 0.11104358875200265, + "grad_norm": 0.353515625, + "learning_rate": 0.00019992562384383309, + "loss": 1.4228, + "step": 1005 + }, + { + "epoch": 0.1115960444174355, + "grad_norm": 0.3984375, + "learning_rate": 0.00019991800132985804, + "loss": 1.4003, + "step": 1010 + }, + { + "epoch": 0.11214850008286835, + "grad_norm": 0.337890625, + "learning_rate": 0.00019991000719408485, + "loss": 1.3181, + "step": 1015 + }, + { + "epoch": 0.1127009557483012, + "grad_norm": 0.359375, + "learning_rate": 0.00019990164146624584, + "loss": 1.3981, + "step": 1020 + }, + { + "epoch": 0.11325341141373405, + "grad_norm": 0.40625, + "learning_rate": 0.00019989290417745542, + "loss": 1.3001, + "step": 1025 + }, + { + "epoch": 0.1138058670791669, + "grad_norm": 0.37890625, + "learning_rate": 0.00019988379536020986, + "loss": 1.4415, + "step": 1030 + }, + { + "epoch": 0.11435832274459974, + "grad_norm": 0.34765625, + "learning_rate": 0.00019987431504838737, + "loss": 1.3646, + "step": 1035 + }, + { + "epoch": 0.1149107784100326, + "grad_norm": 0.380859375, + "learning_rate": 0.0001998644632772477, + "loss": 1.3522, + "step": 1040 + }, + { + "epoch": 0.11546323407546544, + "grad_norm": 0.3828125, + "learning_rate": 0.00019985424008343226, + "loss": 1.3817, + "step": 1045 + }, + { + "epoch": 0.11601568974089829, + "grad_norm": 0.3671875, + "learning_rate": 0.00019984364550496383, + "loss": 1.3372, + "step": 1050 + }, + { + "epoch": 0.11656814540633115, + "grad_norm": 0.37109375, + "learning_rate": 0.00019983267958124644, + "loss": 1.3866, + "step": 1055 + }, + { + "epoch": 0.11712060107176399, + "grad_norm": 0.359375, + "learning_rate": 0.0001998213423530654, + "loss": 1.4264, + "step": 1060 + }, + { + "epoch": 0.11767305673719684, + "grad_norm": 0.3671875, + "learning_rate": 0.00019980963386258683, + "loss": 1.4479, + "step": 1065 + }, + { + "epoch": 0.11822551240262968, + "grad_norm": 0.359375, + "learning_rate": 0.0001997975541533577, + "loss": 1.4479, + "step": 1070 + }, + { + "epoch": 0.11877796806806254, + "grad_norm": 0.3671875, + "learning_rate": 0.00019978510327030579, + "loss": 1.3959, + "step": 1075 + }, + { + "epoch": 0.11933042373349538, + "grad_norm": 0.35546875, + "learning_rate": 0.00019977228125973916, + "loss": 1.3914, + "step": 1080 + }, + { + "epoch": 0.11988287939892824, + "grad_norm": 0.37109375, + "learning_rate": 0.0001997590881693464, + "loss": 1.3624, + "step": 1085 + }, + { + "epoch": 0.12043533506436109, + "grad_norm": 0.349609375, + "learning_rate": 0.00019974552404819607, + "loss": 1.3704, + "step": 1090 + }, + { + "epoch": 0.12098779072979393, + "grad_norm": 0.345703125, + "learning_rate": 0.0001997315889467368, + "loss": 1.2807, + "step": 1095 + }, + { + "epoch": 0.12154024639522679, + "grad_norm": 0.35546875, + "learning_rate": 0.0001997172829167969, + "loss": 1.419, + "step": 1100 + }, + { + "epoch": 0.12209270206065963, + "grad_norm": 0.34375, + "learning_rate": 0.00019970260601158443, + "loss": 1.377, + "step": 1105 + }, + { + "epoch": 0.12264515772609248, + "grad_norm": 0.46484375, + "learning_rate": 0.00019968755828568668, + "loss": 1.3901, + "step": 1110 + }, + { + "epoch": 0.12319761339152532, + "grad_norm": 0.37890625, + "learning_rate": 0.00019967213979507018, + "loss": 1.3007, + "step": 1115 + }, + { + "epoch": 0.12375006905695818, + "grad_norm": 0.376953125, + "learning_rate": 0.0001996563505970804, + "loss": 1.335, + "step": 1120 + }, + { + "epoch": 0.12430252472239103, + "grad_norm": 0.384765625, + "learning_rate": 0.00019964019075044163, + "loss": 1.2959, + "step": 1125 + }, + { + "epoch": 0.12485498038782387, + "grad_norm": 0.3515625, + "learning_rate": 0.00019962366031525664, + "loss": 1.3547, + "step": 1130 + }, + { + "epoch": 0.12540743605325672, + "grad_norm": 0.36328125, + "learning_rate": 0.0001996067593530065, + "loss": 1.356, + "step": 1135 + }, + { + "epoch": 0.12595989171868957, + "grad_norm": 0.392578125, + "learning_rate": 0.00019958948792655055, + "loss": 1.3994, + "step": 1140 + }, + { + "epoch": 0.12651234738412243, + "grad_norm": 0.34765625, + "learning_rate": 0.0001995718461001257, + "loss": 1.3695, + "step": 1145 + }, + { + "epoch": 0.12706480304955528, + "grad_norm": 0.388671875, + "learning_rate": 0.00019955383393934674, + "loss": 1.3279, + "step": 1150 + }, + { + "epoch": 0.12761725871498814, + "grad_norm": 0.357421875, + "learning_rate": 0.00019953545151120565, + "loss": 1.3996, + "step": 1155 + }, + { + "epoch": 0.12816971438042096, + "grad_norm": 0.390625, + "learning_rate": 0.0001995166988840716, + "loss": 1.3476, + "step": 1160 + }, + { + "epoch": 0.12872217004585382, + "grad_norm": 0.392578125, + "learning_rate": 0.00019949757612769067, + "loss": 1.3534, + "step": 1165 + }, + { + "epoch": 0.12927462571128667, + "grad_norm": 0.349609375, + "learning_rate": 0.0001994780833131855, + "loss": 1.4534, + "step": 1170 + }, + { + "epoch": 0.12982708137671953, + "grad_norm": 0.388671875, + "learning_rate": 0.00019945822051305507, + "loss": 1.3668, + "step": 1175 + }, + { + "epoch": 0.13037953704215235, + "grad_norm": 0.369140625, + "learning_rate": 0.00019943798780117447, + "loss": 1.4138, + "step": 1180 + }, + { + "epoch": 0.1309319927075852, + "grad_norm": 0.349609375, + "learning_rate": 0.00019941738525279453, + "loss": 1.3172, + "step": 1185 + }, + { + "epoch": 0.13148444837301806, + "grad_norm": 0.37109375, + "learning_rate": 0.00019939641294454172, + "loss": 1.394, + "step": 1190 + }, + { + "epoch": 0.13203690403845092, + "grad_norm": 0.37109375, + "learning_rate": 0.0001993750709544176, + "loss": 1.3651, + "step": 1195 + }, + { + "epoch": 0.13258935970388377, + "grad_norm": 0.37109375, + "learning_rate": 0.00019935335936179874, + "loss": 1.4192, + "step": 1200 + }, + { + "epoch": 0.1331418153693166, + "grad_norm": 0.3671875, + "learning_rate": 0.00019933127824743645, + "loss": 1.3419, + "step": 1205 + }, + { + "epoch": 0.13369427103474946, + "grad_norm": 0.3671875, + "learning_rate": 0.00019930882769345624, + "loss": 1.3274, + "step": 1210 + }, + { + "epoch": 0.1342467267001823, + "grad_norm": 0.37109375, + "learning_rate": 0.00019928600778335774, + "loss": 1.4188, + "step": 1215 + }, + { + "epoch": 0.13479918236561517, + "grad_norm": 0.359375, + "learning_rate": 0.0001992628186020143, + "loss": 1.4108, + "step": 1220 + }, + { + "epoch": 0.13535163803104802, + "grad_norm": 0.36328125, + "learning_rate": 0.0001992392602356727, + "loss": 1.4137, + "step": 1225 + }, + { + "epoch": 0.13590409369648085, + "grad_norm": 0.369140625, + "learning_rate": 0.00019921533277195283, + "loss": 1.3725, + "step": 1230 + }, + { + "epoch": 0.1364565493619137, + "grad_norm": 0.37890625, + "learning_rate": 0.00019919103629984728, + "loss": 1.3933, + "step": 1235 + }, + { + "epoch": 0.13700900502734656, + "grad_norm": 0.3515625, + "learning_rate": 0.0001991663709097212, + "loss": 1.347, + "step": 1240 + }, + { + "epoch": 0.1375614606927794, + "grad_norm": 0.359375, + "learning_rate": 0.00019914133669331175, + "loss": 1.3592, + "step": 1245 + }, + { + "epoch": 0.13811391635821224, + "grad_norm": 0.365234375, + "learning_rate": 0.00019911593374372788, + "loss": 1.3287, + "step": 1250 + }, + { + "epoch": 0.1386663720236451, + "grad_norm": 0.376953125, + "learning_rate": 0.00019909016215544998, + "loss": 1.418, + "step": 1255 + }, + { + "epoch": 0.13921882768907795, + "grad_norm": 0.3671875, + "learning_rate": 0.00019906402202432945, + "loss": 1.3322, + "step": 1260 + }, + { + "epoch": 0.1397712833545108, + "grad_norm": 0.376953125, + "learning_rate": 0.00019903751344758848, + "loss": 1.3598, + "step": 1265 + }, + { + "epoch": 0.14032373901994366, + "grad_norm": 0.36328125, + "learning_rate": 0.00019901063652381953, + "loss": 1.3468, + "step": 1270 + }, + { + "epoch": 0.1408761946853765, + "grad_norm": 0.85546875, + "learning_rate": 0.00019898339135298508, + "loss": 1.2972, + "step": 1275 + }, + { + "epoch": 0.14142865035080934, + "grad_norm": 0.384765625, + "learning_rate": 0.00019895577803641726, + "loss": 1.3682, + "step": 1280 + }, + { + "epoch": 0.1419811060162422, + "grad_norm": 0.392578125, + "learning_rate": 0.00019892779667681732, + "loss": 1.3667, + "step": 1285 + }, + { + "epoch": 0.14253356168167505, + "grad_norm": 0.376953125, + "learning_rate": 0.00019889944737825545, + "loss": 1.4319, + "step": 1290 + }, + { + "epoch": 0.1430860173471079, + "grad_norm": 0.353515625, + "learning_rate": 0.0001988707302461703, + "loss": 1.3237, + "step": 1295 + }, + { + "epoch": 0.14363847301254073, + "grad_norm": 0.36328125, + "learning_rate": 0.00019884164538736858, + "loss": 1.3517, + "step": 1300 + }, + { + "epoch": 0.1441909286779736, + "grad_norm": 0.37890625, + "learning_rate": 0.00019881219291002463, + "loss": 1.3761, + "step": 1305 + }, + { + "epoch": 0.14474338434340644, + "grad_norm": 0.369140625, + "learning_rate": 0.00019878237292368013, + "loss": 1.421, + "step": 1310 + }, + { + "epoch": 0.1452958400088393, + "grad_norm": 0.37890625, + "learning_rate": 0.00019875218553924357, + "loss": 1.3595, + "step": 1315 + }, + { + "epoch": 0.14584829567427213, + "grad_norm": 0.38671875, + "learning_rate": 0.00019872163086898993, + "loss": 1.404, + "step": 1320 + }, + { + "epoch": 0.14640075133970498, + "grad_norm": 0.375, + "learning_rate": 0.00019869070902656018, + "loss": 1.3475, + "step": 1325 + }, + { + "epoch": 0.14695320700513784, + "grad_norm": 0.369140625, + "learning_rate": 0.00019865942012696098, + "loss": 1.4619, + "step": 1330 + }, + { + "epoch": 0.1475056626705707, + "grad_norm": 0.37109375, + "learning_rate": 0.0001986277642865641, + "loss": 1.3436, + "step": 1335 + }, + { + "epoch": 0.14805811833600355, + "grad_norm": 0.38671875, + "learning_rate": 0.00019859574162310608, + "loss": 1.3013, + "step": 1340 + }, + { + "epoch": 0.14861057400143637, + "grad_norm": 0.3828125, + "learning_rate": 0.0001985633522556878, + "loss": 1.3682, + "step": 1345 + }, + { + "epoch": 0.14916302966686923, + "grad_norm": 0.35546875, + "learning_rate": 0.00019853059630477396, + "loss": 1.3098, + "step": 1350 + }, + { + "epoch": 0.14971548533230208, + "grad_norm": 0.361328125, + "learning_rate": 0.00019849747389219272, + "loss": 1.3603, + "step": 1355 + }, + { + "epoch": 0.15026794099773494, + "grad_norm": 0.388671875, + "learning_rate": 0.0001984639851411352, + "loss": 1.3294, + "step": 1360 + }, + { + "epoch": 0.1508203966631678, + "grad_norm": 0.55078125, + "learning_rate": 0.00019843013017615505, + "loss": 1.3643, + "step": 1365 + }, + { + "epoch": 0.15137285232860062, + "grad_norm": 0.3828125, + "learning_rate": 0.00019839590912316791, + "loss": 1.3405, + "step": 1370 + }, + { + "epoch": 0.15192530799403348, + "grad_norm": 0.3828125, + "learning_rate": 0.0001983613221094511, + "loss": 1.3499, + "step": 1375 + }, + { + "epoch": 0.15247776365946633, + "grad_norm": 0.349609375, + "learning_rate": 0.0001983263692636429, + "loss": 1.3834, + "step": 1380 + }, + { + "epoch": 0.15303021932489919, + "grad_norm": 0.359375, + "learning_rate": 0.0001982910507157424, + "loss": 1.3726, + "step": 1385 + }, + { + "epoch": 0.153582674990332, + "grad_norm": 0.369140625, + "learning_rate": 0.00019825536659710867, + "loss": 1.3209, + "step": 1390 + }, + { + "epoch": 0.15413513065576487, + "grad_norm": 0.390625, + "learning_rate": 0.00019821931704046047, + "loss": 1.331, + "step": 1395 + }, + { + "epoch": 0.15468758632119772, + "grad_norm": 0.40625, + "learning_rate": 0.00019818290217987587, + "loss": 1.3082, + "step": 1400 + }, + { + "epoch": 0.15524004198663058, + "grad_norm": 0.369140625, + "learning_rate": 0.0001981461221507914, + "loss": 1.335, + "step": 1405 + }, + { + "epoch": 0.15579249765206343, + "grad_norm": 0.380859375, + "learning_rate": 0.0001981089770900018, + "loss": 1.3439, + "step": 1410 + }, + { + "epoch": 0.15634495331749626, + "grad_norm": 0.3828125, + "learning_rate": 0.00019807146713565955, + "loss": 1.3955, + "step": 1415 + }, + { + "epoch": 0.15689740898292912, + "grad_norm": 0.361328125, + "learning_rate": 0.00019803359242727425, + "loss": 1.3849, + "step": 1420 + }, + { + "epoch": 0.15744986464836197, + "grad_norm": 0.384765625, + "learning_rate": 0.00019799535310571203, + "loss": 1.351, + "step": 1425 + }, + { + "epoch": 0.15800232031379483, + "grad_norm": 0.365234375, + "learning_rate": 0.00019795674931319515, + "loss": 1.3958, + "step": 1430 + }, + { + "epoch": 0.15855477597922768, + "grad_norm": 0.375, + "learning_rate": 0.0001979177811933015, + "loss": 1.2827, + "step": 1435 + }, + { + "epoch": 0.1591072316446605, + "grad_norm": 0.353515625, + "learning_rate": 0.0001978784488909639, + "loss": 1.3038, + "step": 1440 + }, + { + "epoch": 0.15965968731009336, + "grad_norm": 0.37890625, + "learning_rate": 0.00019783875255246973, + "loss": 1.3968, + "step": 1445 + }, + { + "epoch": 0.16021214297552622, + "grad_norm": 11.0, + "learning_rate": 0.00019779869232546034, + "loss": 1.387, + "step": 1450 + }, + { + "epoch": 0.16076459864095907, + "grad_norm": 0.376953125, + "learning_rate": 0.0001977582683589304, + "loss": 1.427, + "step": 1455 + }, + { + "epoch": 0.1613170543063919, + "grad_norm": 0.3984375, + "learning_rate": 0.00019771748080322745, + "loss": 1.2821, + "step": 1460 + }, + { + "epoch": 0.16186950997182475, + "grad_norm": 0.388671875, + "learning_rate": 0.00019767632981005138, + "loss": 1.3751, + "step": 1465 + }, + { + "epoch": 0.1624219656372576, + "grad_norm": 0.373046875, + "learning_rate": 0.0001976348155324537, + "loss": 1.3822, + "step": 1470 + }, + { + "epoch": 0.16297442130269046, + "grad_norm": 0.369140625, + "learning_rate": 0.00019759293812483713, + "loss": 1.4484, + "step": 1475 + }, + { + "epoch": 0.16352687696812332, + "grad_norm": 0.39453125, + "learning_rate": 0.000197550697742955, + "loss": 1.3884, + "step": 1480 + }, + { + "epoch": 0.16407933263355615, + "grad_norm": 0.369140625, + "learning_rate": 0.0001975080945439106, + "loss": 1.3392, + "step": 1485 + }, + { + "epoch": 0.164631788298989, + "grad_norm": 0.359375, + "learning_rate": 0.00019746512868615656, + "loss": 1.3039, + "step": 1490 + }, + { + "epoch": 0.16518424396442186, + "grad_norm": 0.376953125, + "learning_rate": 0.0001974218003294945, + "loss": 1.3382, + "step": 1495 + }, + { + "epoch": 0.1657366996298547, + "grad_norm": 0.375, + "learning_rate": 0.0001973781096350741, + "loss": 1.2985, + "step": 1500 + }, + { + "epoch": 0.16628915529528757, + "grad_norm": 0.349609375, + "learning_rate": 0.0001973340567653928, + "loss": 1.321, + "step": 1505 + }, + { + "epoch": 0.1668416109607204, + "grad_norm": 0.3828125, + "learning_rate": 0.00019728964188429503, + "loss": 1.3347, + "step": 1510 + }, + { + "epoch": 0.16739406662615325, + "grad_norm": 0.376953125, + "learning_rate": 0.00019724486515697155, + "loss": 1.3552, + "step": 1515 + }, + { + "epoch": 0.1679465222915861, + "grad_norm": 0.384765625, + "learning_rate": 0.00019719972674995905, + "loss": 1.3776, + "step": 1520 + }, + { + "epoch": 0.16849897795701896, + "grad_norm": 0.396484375, + "learning_rate": 0.00019715422683113938, + "loss": 1.4107, + "step": 1525 + }, + { + "epoch": 0.16905143362245179, + "grad_norm": 0.359375, + "learning_rate": 0.00019710836556973885, + "loss": 1.3361, + "step": 1530 + }, + { + "epoch": 0.16960388928788464, + "grad_norm": 0.38671875, + "learning_rate": 0.00019706214313632784, + "loss": 1.3714, + "step": 1535 + }, + { + "epoch": 0.1701563449533175, + "grad_norm": 0.373046875, + "learning_rate": 0.00019701555970281988, + "loss": 1.2905, + "step": 1540 + }, + { + "epoch": 0.17070880061875035, + "grad_norm": 0.3828125, + "learning_rate": 0.0001969686154424713, + "loss": 1.3327, + "step": 1545 + }, + { + "epoch": 0.1712612562841832, + "grad_norm": 0.365234375, + "learning_rate": 0.00019692131052988034, + "loss": 1.3297, + "step": 1550 + }, + { + "epoch": 0.17181371194961603, + "grad_norm": 0.392578125, + "learning_rate": 0.00019687364514098664, + "loss": 1.3781, + "step": 1555 + }, + { + "epoch": 0.1723661676150489, + "grad_norm": 0.359375, + "learning_rate": 0.00019682561945307052, + "loss": 1.2793, + "step": 1560 + }, + { + "epoch": 0.17291862328048174, + "grad_norm": 0.376953125, + "learning_rate": 0.00019677723364475237, + "loss": 1.3794, + "step": 1565 + }, + { + "epoch": 0.1734710789459146, + "grad_norm": 0.37890625, + "learning_rate": 0.00019672848789599204, + "loss": 1.3247, + "step": 1570 + }, + { + "epoch": 0.17402353461134745, + "grad_norm": 0.384765625, + "learning_rate": 0.00019667938238808797, + "loss": 1.393, + "step": 1575 + }, + { + "epoch": 0.17457599027678028, + "grad_norm": 0.369140625, + "learning_rate": 0.00019662991730367663, + "loss": 1.3854, + "step": 1580 + }, + { + "epoch": 0.17512844594221313, + "grad_norm": 0.353515625, + "learning_rate": 0.00019658009282673202, + "loss": 1.3966, + "step": 1585 + }, + { + "epoch": 0.175680901607646, + "grad_norm": 0.40234375, + "learning_rate": 0.00019652990914256467, + "loss": 1.3691, + "step": 1590 + }, + { + "epoch": 0.17623335727307884, + "grad_norm": 0.3828125, + "learning_rate": 0.00019647936643782109, + "loss": 1.4073, + "step": 1595 + }, + { + "epoch": 0.17678581293851167, + "grad_norm": 0.38671875, + "learning_rate": 0.0001964284649004832, + "loss": 1.373, + "step": 1600 + }, + { + "epoch": 0.17733826860394453, + "grad_norm": 0.359375, + "learning_rate": 0.00019637720471986735, + "loss": 1.3112, + "step": 1605 + }, + { + "epoch": 0.17789072426937738, + "grad_norm": 0.3984375, + "learning_rate": 0.00019632558608662402, + "loss": 1.2798, + "step": 1610 + }, + { + "epoch": 0.17844317993481024, + "grad_norm": 0.35546875, + "learning_rate": 0.0001962736091927366, + "loss": 1.258, + "step": 1615 + }, + { + "epoch": 0.1789956356002431, + "grad_norm": 0.37109375, + "learning_rate": 0.00019622127423152112, + "loss": 1.308, + "step": 1620 + }, + { + "epoch": 0.17954809126567592, + "grad_norm": 0.361328125, + "learning_rate": 0.00019616858139762534, + "loss": 1.3554, + "step": 1625 + }, + { + "epoch": 0.18010054693110877, + "grad_norm": 0.39453125, + "learning_rate": 0.00019611553088702798, + "loss": 1.3128, + "step": 1630 + }, + { + "epoch": 0.18065300259654163, + "grad_norm": 0.369140625, + "learning_rate": 0.0001960621228970381, + "loss": 1.3385, + "step": 1635 + }, + { + "epoch": 0.18120545826197448, + "grad_norm": 0.38671875, + "learning_rate": 0.0001960083576262943, + "loss": 1.396, + "step": 1640 + }, + { + "epoch": 0.18175791392740734, + "grad_norm": 0.3671875, + "learning_rate": 0.00019595423527476405, + "loss": 1.3687, + "step": 1645 + }, + { + "epoch": 0.18231036959284017, + "grad_norm": 0.365234375, + "learning_rate": 0.00019589975604374286, + "loss": 1.4066, + "step": 1650 + }, + { + "epoch": 0.18286282525827302, + "grad_norm": 0.35546875, + "learning_rate": 0.00019584492013585355, + "loss": 1.3031, + "step": 1655 + }, + { + "epoch": 0.18341528092370588, + "grad_norm": 0.3984375, + "learning_rate": 0.00019578972775504555, + "loss": 1.3094, + "step": 1660 + }, + { + "epoch": 0.18396773658913873, + "grad_norm": 0.35546875, + "learning_rate": 0.00019573417910659412, + "loss": 1.3321, + "step": 1665 + }, + { + "epoch": 0.18452019225457156, + "grad_norm": 0.404296875, + "learning_rate": 0.00019567827439709954, + "loss": 1.3338, + "step": 1670 + }, + { + "epoch": 0.1850726479200044, + "grad_norm": 0.390625, + "learning_rate": 0.00019562201383448638, + "loss": 1.2639, + "step": 1675 + }, + { + "epoch": 0.18562510358543727, + "grad_norm": 0.376953125, + "learning_rate": 0.00019556539762800276, + "loss": 1.335, + "step": 1680 + }, + { + "epoch": 0.18617755925087012, + "grad_norm": 0.400390625, + "learning_rate": 0.00019550842598821952, + "loss": 1.3488, + "step": 1685 + }, + { + "epoch": 0.18673001491630298, + "grad_norm": 0.365234375, + "learning_rate": 0.0001954510991270294, + "loss": 1.3334, + "step": 1690 + }, + { + "epoch": 0.1872824705817358, + "grad_norm": 0.375, + "learning_rate": 0.00019539341725764638, + "loss": 1.3627, + "step": 1695 + }, + { + "epoch": 0.18783492624716866, + "grad_norm": 0.408203125, + "learning_rate": 0.00019533538059460475, + "loss": 1.3849, + "step": 1700 + }, + { + "epoch": 0.18838738191260151, + "grad_norm": 0.373046875, + "learning_rate": 0.0001952769893537584, + "loss": 1.352, + "step": 1705 + }, + { + "epoch": 0.18893983757803437, + "grad_norm": 0.369140625, + "learning_rate": 0.00019521824375228004, + "loss": 1.2987, + "step": 1710 + }, + { + "epoch": 0.18949229324346722, + "grad_norm": 0.365234375, + "learning_rate": 0.0001951591440086602, + "loss": 1.3289, + "step": 1715 + }, + { + "epoch": 0.19004474890890005, + "grad_norm": 0.365234375, + "learning_rate": 0.00019509969034270673, + "loss": 1.2721, + "step": 1720 + }, + { + "epoch": 0.1905972045743329, + "grad_norm": 0.400390625, + "learning_rate": 0.0001950398829755437, + "loss": 1.3934, + "step": 1725 + }, + { + "epoch": 0.19114966023976576, + "grad_norm": 0.39453125, + "learning_rate": 0.0001949797221296107, + "loss": 1.329, + "step": 1730 + }, + { + "epoch": 0.19170211590519862, + "grad_norm": 0.373046875, + "learning_rate": 0.00019491920802866205, + "loss": 1.4029, + "step": 1735 + }, + { + "epoch": 0.19225457157063144, + "grad_norm": 0.359375, + "learning_rate": 0.00019485834089776586, + "loss": 1.3859, + "step": 1740 + }, + { + "epoch": 0.1928070272360643, + "grad_norm": 0.36328125, + "learning_rate": 0.00019479712096330336, + "loss": 1.3165, + "step": 1745 + }, + { + "epoch": 0.19335948290149715, + "grad_norm": 0.37109375, + "learning_rate": 0.0001947355484529678, + "loss": 1.2785, + "step": 1750 + }, + { + "epoch": 0.19391193856693, + "grad_norm": 0.373046875, + "learning_rate": 0.00019467362359576386, + "loss": 1.2629, + "step": 1755 + }, + { + "epoch": 0.19446439423236286, + "grad_norm": 0.3671875, + "learning_rate": 0.00019461134662200668, + "loss": 1.308, + "step": 1760 + }, + { + "epoch": 0.1950168498977957, + "grad_norm": 0.392578125, + "learning_rate": 0.00019454871776332095, + "loss": 1.3369, + "step": 1765 + }, + { + "epoch": 0.19556930556322855, + "grad_norm": 0.369140625, + "learning_rate": 0.00019448573725264022, + "loss": 1.2883, + "step": 1770 + }, + { + "epoch": 0.1961217612286614, + "grad_norm": 0.376953125, + "learning_rate": 0.00019442240532420584, + "loss": 1.409, + "step": 1775 + }, + { + "epoch": 0.19667421689409426, + "grad_norm": 0.37890625, + "learning_rate": 0.0001943587222135662, + "loss": 1.2601, + "step": 1780 + }, + { + "epoch": 0.1972266725595271, + "grad_norm": 0.384765625, + "learning_rate": 0.00019429468815757587, + "loss": 1.2943, + "step": 1785 + }, + { + "epoch": 0.19777912822495994, + "grad_norm": 0.3828125, + "learning_rate": 0.00019423030339439464, + "loss": 1.2805, + "step": 1790 + }, + { + "epoch": 0.1983315838903928, + "grad_norm": 0.375, + "learning_rate": 0.00019416556816348663, + "loss": 1.3609, + "step": 1795 + }, + { + "epoch": 0.19888403955582565, + "grad_norm": 0.37890625, + "learning_rate": 0.00019410048270561956, + "loss": 1.3424, + "step": 1800 + }, + { + "epoch": 0.1994364952212585, + "grad_norm": 0.37890625, + "learning_rate": 0.0001940350472628637, + "loss": 1.3322, + "step": 1805 + }, + { + "epoch": 0.19998895088669133, + "grad_norm": 0.361328125, + "learning_rate": 0.00019396926207859084, + "loss": 1.3729, + "step": 1810 + }, + { + "epoch": 0.20054140655212419, + "grad_norm": 0.341796875, + "learning_rate": 0.00019390312739747385, + "loss": 1.3087, + "step": 1815 + }, + { + "epoch": 0.20109386221755704, + "grad_norm": 0.384765625, + "learning_rate": 0.0001938366434654852, + "loss": 1.2203, + "step": 1820 + }, + { + "epoch": 0.2016463178829899, + "grad_norm": 0.392578125, + "learning_rate": 0.00019376981052989653, + "loss": 1.294, + "step": 1825 + }, + { + "epoch": 0.20219877354842275, + "grad_norm": 0.35546875, + "learning_rate": 0.00019370262883927733, + "loss": 1.314, + "step": 1830 + }, + { + "epoch": 0.20275122921385558, + "grad_norm": 0.404296875, + "learning_rate": 0.00019363509864349436, + "loss": 1.3179, + "step": 1835 + }, + { + "epoch": 0.20330368487928843, + "grad_norm": 0.396484375, + "learning_rate": 0.0001935672201937105, + "loss": 1.3179, + "step": 1840 + }, + { + "epoch": 0.2038561405447213, + "grad_norm": 0.3671875, + "learning_rate": 0.00019349899374238383, + "loss": 1.2649, + "step": 1845 + }, + { + "epoch": 0.20440859621015414, + "grad_norm": 0.37890625, + "learning_rate": 0.0001934304195432668, + "loss": 1.326, + "step": 1850 + }, + { + "epoch": 0.204961051875587, + "grad_norm": 0.380859375, + "learning_rate": 0.00019336149785140525, + "loss": 1.3869, + "step": 1855 + }, + { + "epoch": 0.20551350754101982, + "grad_norm": 0.3515625, + "learning_rate": 0.00019329222892313736, + "loss": 1.3094, + "step": 1860 + }, + { + "epoch": 0.20606596320645268, + "grad_norm": 0.35546875, + "learning_rate": 0.00019322261301609286, + "loss": 1.3309, + "step": 1865 + }, + { + "epoch": 0.20661841887188553, + "grad_norm": 0.384765625, + "learning_rate": 0.00019315265038919192, + "loss": 1.3306, + "step": 1870 + }, + { + "epoch": 0.2071708745373184, + "grad_norm": 0.453125, + "learning_rate": 0.00019308234130264431, + "loss": 1.3068, + "step": 1875 + }, + { + "epoch": 0.20772333020275122, + "grad_norm": 0.375, + "learning_rate": 0.0001930116860179483, + "loss": 1.332, + "step": 1880 + }, + { + "epoch": 0.20827578586818407, + "grad_norm": 0.390625, + "learning_rate": 0.00019294068479788984, + "loss": 1.3728, + "step": 1885 + }, + { + "epoch": 0.20882824153361693, + "grad_norm": 0.373046875, + "learning_rate": 0.00019286933790654148, + "loss": 1.3674, + "step": 1890 + }, + { + "epoch": 0.20938069719904978, + "grad_norm": 0.40234375, + "learning_rate": 0.00019279764560926142, + "loss": 1.3572, + "step": 1895 + }, + { + "epoch": 0.20993315286448264, + "grad_norm": 0.412109375, + "learning_rate": 0.00019272560817269247, + "loss": 1.4198, + "step": 1900 + }, + { + "epoch": 0.21048560852991546, + "grad_norm": 0.373046875, + "learning_rate": 0.00019265322586476118, + "loss": 1.4785, + "step": 1905 + }, + { + "epoch": 0.21103806419534832, + "grad_norm": 0.388671875, + "learning_rate": 0.0001925804989546767, + "loss": 1.3762, + "step": 1910 + }, + { + "epoch": 0.21159051986078117, + "grad_norm": 0.388671875, + "learning_rate": 0.0001925074277129299, + "loss": 1.3075, + "step": 1915 + }, + { + "epoch": 0.21214297552621403, + "grad_norm": 0.365234375, + "learning_rate": 0.0001924340124112923, + "loss": 1.3402, + "step": 1920 + }, + { + "epoch": 0.21269543119164688, + "grad_norm": 0.40234375, + "learning_rate": 0.00019236025332281507, + "loss": 1.3579, + "step": 1925 + }, + { + "epoch": 0.2132478868570797, + "grad_norm": 0.3671875, + "learning_rate": 0.00019228615072182796, + "loss": 1.3567, + "step": 1930 + }, + { + "epoch": 0.21380034252251257, + "grad_norm": 0.37890625, + "learning_rate": 0.00019221170488393843, + "loss": 1.3228, + "step": 1935 + }, + { + "epoch": 0.21435279818794542, + "grad_norm": 0.37890625, + "learning_rate": 0.00019213691608603047, + "loss": 1.3398, + "step": 1940 + }, + { + "epoch": 0.21490525385337828, + "grad_norm": 0.365234375, + "learning_rate": 0.0001920617846062636, + "loss": 1.2582, + "step": 1945 + }, + { + "epoch": 0.2154577095188111, + "grad_norm": 0.37890625, + "learning_rate": 0.000191986310724072, + "loss": 1.3183, + "step": 1950 + }, + { + "epoch": 0.21601016518424396, + "grad_norm": 0.37890625, + "learning_rate": 0.00019191049472016313, + "loss": 1.2936, + "step": 1955 + }, + { + "epoch": 0.2165626208496768, + "grad_norm": 0.390625, + "learning_rate": 0.0001918343368765171, + "loss": 1.3027, + "step": 1960 + }, + { + "epoch": 0.21711507651510967, + "grad_norm": 0.384765625, + "learning_rate": 0.0001917578374763853, + "loss": 1.285, + "step": 1965 + }, + { + "epoch": 0.21766753218054252, + "grad_norm": 0.38671875, + "learning_rate": 0.00019168099680428943, + "loss": 1.3343, + "step": 1970 + }, + { + "epoch": 0.21821998784597535, + "grad_norm": 0.376953125, + "learning_rate": 0.0001916038151460206, + "loss": 1.3197, + "step": 1975 + }, + { + "epoch": 0.2187724435114082, + "grad_norm": 0.56640625, + "learning_rate": 0.000191526292788638, + "loss": 1.3124, + "step": 1980 + }, + { + "epoch": 0.21932489917684106, + "grad_norm": 0.3671875, + "learning_rate": 0.00019144843002046806, + "loss": 1.2425, + "step": 1985 + }, + { + "epoch": 0.21987735484227391, + "grad_norm": 0.41796875, + "learning_rate": 0.00019137022713110324, + "loss": 1.2814, + "step": 1990 + }, + { + "epoch": 0.22042981050770677, + "grad_norm": 0.375, + "learning_rate": 0.00019129168441140104, + "loss": 1.3209, + "step": 1995 + }, + { + "epoch": 0.2209822661731396, + "grad_norm": 0.376953125, + "learning_rate": 0.00019121280215348286, + "loss": 1.2622, + "step": 2000 + }, + { + "epoch": 0.22153472183857245, + "grad_norm": 0.376953125, + "learning_rate": 0.00019113358065073297, + "loss": 1.248, + "step": 2005 + }, + { + "epoch": 0.2220871775040053, + "grad_norm": 0.361328125, + "learning_rate": 0.00019105402019779728, + "loss": 1.3935, + "step": 2010 + }, + { + "epoch": 0.22263963316943816, + "grad_norm": 0.392578125, + "learning_rate": 0.00019097412109058247, + "loss": 1.3131, + "step": 2015 + }, + { + "epoch": 0.223192088834871, + "grad_norm": 0.404296875, + "learning_rate": 0.00019089388362625466, + "loss": 1.2621, + "step": 2020 + }, + { + "epoch": 0.22374454450030384, + "grad_norm": 0.37109375, + "learning_rate": 0.00019081330810323852, + "loss": 1.3199, + "step": 2025 + }, + { + "epoch": 0.2242970001657367, + "grad_norm": 0.37109375, + "learning_rate": 0.000190732394821216, + "loss": 1.3163, + "step": 2030 + }, + { + "epoch": 0.22484945583116955, + "grad_norm": 0.412109375, + "learning_rate": 0.00019065114408112517, + "loss": 1.3475, + "step": 2035 + }, + { + "epoch": 0.2254019114966024, + "grad_norm": 0.423828125, + "learning_rate": 0.00019056955618515934, + "loss": 1.2857, + "step": 2040 + }, + { + "epoch": 0.22595436716203524, + "grad_norm": 0.3828125, + "learning_rate": 0.00019048763143676578, + "loss": 1.3265, + "step": 2045 + }, + { + "epoch": 0.2265068228274681, + "grad_norm": 0.376953125, + "learning_rate": 0.0001904053701406445, + "loss": 1.3118, + "step": 2050 + }, + { + "epoch": 0.22705927849290095, + "grad_norm": 0.421875, + "learning_rate": 0.0001903227726027473, + "loss": 1.2948, + "step": 2055 + }, + { + "epoch": 0.2276117341583338, + "grad_norm": 0.404296875, + "learning_rate": 0.00019023983913027655, + "loss": 1.3374, + "step": 2060 + }, + { + "epoch": 0.22816418982376666, + "grad_norm": 0.375, + "learning_rate": 0.00019015657003168405, + "loss": 1.4124, + "step": 2065 + }, + { + "epoch": 0.22871664548919948, + "grad_norm": 0.3828125, + "learning_rate": 0.00019007296561666985, + "loss": 1.2983, + "step": 2070 + }, + { + "epoch": 0.22926910115463234, + "grad_norm": 0.369140625, + "learning_rate": 0.00018998902619618116, + "loss": 1.3912, + "step": 2075 + }, + { + "epoch": 0.2298215568200652, + "grad_norm": 0.375, + "learning_rate": 0.00018990475208241115, + "loss": 1.2852, + "step": 2080 + }, + { + "epoch": 0.23037401248549805, + "grad_norm": 0.392578125, + "learning_rate": 0.0001898201435887978, + "loss": 1.3782, + "step": 2085 + }, + { + "epoch": 0.23092646815093087, + "grad_norm": 0.3671875, + "learning_rate": 0.00018973520103002277, + "loss": 1.3141, + "step": 2090 + }, + { + "epoch": 0.23147892381636373, + "grad_norm": 0.37109375, + "learning_rate": 0.0001896499247220102, + "loss": 1.2974, + "step": 2095 + }, + { + "epoch": 0.23203137948179658, + "grad_norm": 0.375, + "learning_rate": 0.00018956431498192547, + "loss": 1.3403, + "step": 2100 + }, + { + "epoch": 0.23258383514722944, + "grad_norm": 0.388671875, + "learning_rate": 0.00018947837212817415, + "loss": 1.3873, + "step": 2105 + }, + { + "epoch": 0.2331362908126623, + "grad_norm": 0.365234375, + "learning_rate": 0.00018939209648040071, + "loss": 1.402, + "step": 2110 + }, + { + "epoch": 0.23368874647809512, + "grad_norm": 0.400390625, + "learning_rate": 0.00018930548835948736, + "loss": 1.3259, + "step": 2115 + }, + { + "epoch": 0.23424120214352798, + "grad_norm": 0.38671875, + "learning_rate": 0.00018921854808755294, + "loss": 1.3384, + "step": 2120 + }, + { + "epoch": 0.23479365780896083, + "grad_norm": 0.3828125, + "learning_rate": 0.00018913127598795156, + "loss": 1.2929, + "step": 2125 + }, + { + "epoch": 0.2353461134743937, + "grad_norm": 0.37109375, + "learning_rate": 0.00018904367238527155, + "loss": 1.4016, + "step": 2130 + }, + { + "epoch": 0.23589856913982654, + "grad_norm": 0.3828125, + "learning_rate": 0.00018895573760533413, + "loss": 1.3384, + "step": 2135 + }, + { + "epoch": 0.23645102480525937, + "grad_norm": 0.37890625, + "learning_rate": 0.00018886747197519233, + "loss": 1.3469, + "step": 2140 + }, + { + "epoch": 0.23700348047069222, + "grad_norm": 0.404296875, + "learning_rate": 0.0001887788758231296, + "loss": 1.3132, + "step": 2145 + }, + { + "epoch": 0.23755593613612508, + "grad_norm": 0.392578125, + "learning_rate": 0.00018868994947865883, + "loss": 1.4067, + "step": 2150 + }, + { + "epoch": 0.23810839180155793, + "grad_norm": 0.384765625, + "learning_rate": 0.00018860069327252086, + "loss": 1.3245, + "step": 2155 + }, + { + "epoch": 0.23866084746699076, + "grad_norm": 0.3671875, + "learning_rate": 0.0001885111075366834, + "loss": 1.3196, + "step": 2160 + }, + { + "epoch": 0.23921330313242362, + "grad_norm": 0.359375, + "learning_rate": 0.00018842119260433982, + "loss": 1.329, + "step": 2165 + }, + { + "epoch": 0.23976575879785647, + "grad_norm": 0.376953125, + "learning_rate": 0.0001883309488099078, + "loss": 1.333, + "step": 2170 + }, + { + "epoch": 0.24031821446328933, + "grad_norm": 0.421875, + "learning_rate": 0.00018824037648902819, + "loss": 1.3282, + "step": 2175 + }, + { + "epoch": 0.24087067012872218, + "grad_norm": 0.373046875, + "learning_rate": 0.00018814947597856367, + "loss": 1.2719, + "step": 2180 + }, + { + "epoch": 0.241423125794155, + "grad_norm": 0.380859375, + "learning_rate": 0.00018805824761659764, + "loss": 1.3457, + "step": 2185 + }, + { + "epoch": 0.24197558145958786, + "grad_norm": 0.376953125, + "learning_rate": 0.00018796669174243273, + "loss": 1.3, + "step": 2190 + }, + { + "epoch": 0.24252803712502072, + "grad_norm": 0.396484375, + "learning_rate": 0.0001878748086965898, + "loss": 1.2966, + "step": 2195 + }, + { + "epoch": 0.24308049279045357, + "grad_norm": 0.375, + "learning_rate": 0.0001877825988208065, + "loss": 1.2928, + "step": 2200 + }, + { + "epoch": 0.24363294845588643, + "grad_norm": 0.380859375, + "learning_rate": 0.00018769006245803596, + "loss": 1.2629, + "step": 2205 + }, + { + "epoch": 0.24418540412131925, + "grad_norm": 0.375, + "learning_rate": 0.0001875971999524458, + "loss": 1.2908, + "step": 2210 + }, + { + "epoch": 0.2447378597867521, + "grad_norm": 0.369140625, + "learning_rate": 0.0001875040116494165, + "loss": 1.2242, + "step": 2215 + }, + { + "epoch": 0.24529031545218496, + "grad_norm": 0.3984375, + "learning_rate": 0.00018741049789554028, + "loss": 1.3844, + "step": 2220 + }, + { + "epoch": 0.24584277111761782, + "grad_norm": 0.390625, + "learning_rate": 0.00018731665903861985, + "loss": 1.2795, + "step": 2225 + }, + { + "epoch": 0.24639522678305065, + "grad_norm": 0.416015625, + "learning_rate": 0.00018722249542766703, + "loss": 1.242, + "step": 2230 + }, + { + "epoch": 0.2469476824484835, + "grad_norm": 0.375, + "learning_rate": 0.00018712800741290154, + "loss": 1.2709, + "step": 2235 + }, + { + "epoch": 0.24750013811391636, + "grad_norm": 0.36328125, + "learning_rate": 0.00018703319534574956, + "loss": 1.2267, + "step": 2240 + }, + { + "epoch": 0.2480525937793492, + "grad_norm": 0.48046875, + "learning_rate": 0.00018693805957884258, + "loss": 1.2533, + "step": 2245 + }, + { + "epoch": 0.24860504944478207, + "grad_norm": 0.38671875, + "learning_rate": 0.00018684260046601594, + "loss": 1.3512, + "step": 2250 + }, + { + "epoch": 0.2491575051102149, + "grad_norm": 0.3671875, + "learning_rate": 0.0001867468183623077, + "loss": 1.265, + "step": 2255 + }, + { + "epoch": 0.24970996077564775, + "grad_norm": 0.369140625, + "learning_rate": 0.0001866507136239571, + "loss": 1.2883, + "step": 2260 + }, + { + "epoch": 0.2502624164410806, + "grad_norm": 0.37109375, + "learning_rate": 0.00018655428660840345, + "loss": 1.2778, + "step": 2265 + }, + { + "epoch": 0.25081487210651343, + "grad_norm": 0.41015625, + "learning_rate": 0.00018645753767428458, + "loss": 1.3121, + "step": 2270 + }, + { + "epoch": 0.2513673277719463, + "grad_norm": 0.380859375, + "learning_rate": 0.0001863604671814357, + "loss": 1.3, + "step": 2275 + }, + { + "epoch": 0.25191978343737914, + "grad_norm": 0.41015625, + "learning_rate": 0.00018626307549088792, + "loss": 1.2828, + "step": 2280 + }, + { + "epoch": 0.252472239102812, + "grad_norm": 0.3828125, + "learning_rate": 0.0001861653629648671, + "loss": 1.3681, + "step": 2285 + }, + { + "epoch": 0.25302469476824485, + "grad_norm": 0.37890625, + "learning_rate": 0.00018606732996679224, + "loss": 1.3393, + "step": 2290 + }, + { + "epoch": 0.2535771504336777, + "grad_norm": 0.369140625, + "learning_rate": 0.00018596897686127428, + "loss": 1.3363, + "step": 2295 + }, + { + "epoch": 0.25412960609911056, + "grad_norm": 0.375, + "learning_rate": 0.0001858703040141148, + "loss": 1.3374, + "step": 2300 + }, + { + "epoch": 0.2546820617645434, + "grad_norm": 0.37109375, + "learning_rate": 0.00018577131179230448, + "loss": 1.2833, + "step": 2305 + }, + { + "epoch": 0.25523451742997627, + "grad_norm": 0.40625, + "learning_rate": 0.00018567200056402195, + "loss": 1.3367, + "step": 2310 + }, + { + "epoch": 0.2557869730954091, + "grad_norm": 0.38671875, + "learning_rate": 0.00018557237069863222, + "loss": 1.277, + "step": 2315 + }, + { + "epoch": 0.2563394287608419, + "grad_norm": 0.375, + "learning_rate": 0.00018547242256668548, + "loss": 1.3652, + "step": 2320 + }, + { + "epoch": 0.2568918844262748, + "grad_norm": 0.419921875, + "learning_rate": 0.00018537215653991552, + "loss": 1.368, + "step": 2325 + }, + { + "epoch": 0.25744434009170764, + "grad_norm": 0.361328125, + "learning_rate": 0.0001852715729912386, + "loss": 1.3541, + "step": 2330 + }, + { + "epoch": 0.25799679575714046, + "grad_norm": 0.38671875, + "learning_rate": 0.00018517067229475184, + "loss": 1.3887, + "step": 2335 + }, + { + "epoch": 0.25854925142257335, + "grad_norm": 0.39453125, + "learning_rate": 0.00018506945482573195, + "loss": 1.3053, + "step": 2340 + }, + { + "epoch": 0.2591017070880062, + "grad_norm": 0.37890625, + "learning_rate": 0.0001849679209606338, + "loss": 1.3297, + "step": 2345 + }, + { + "epoch": 0.25965416275343906, + "grad_norm": 0.390625, + "learning_rate": 0.000184866071077089, + "loss": 1.3666, + "step": 2350 + }, + { + "epoch": 0.2602066184188719, + "grad_norm": 0.376953125, + "learning_rate": 0.00018476390555390457, + "loss": 1.292, + "step": 2355 + }, + { + "epoch": 0.2607590740843047, + "grad_norm": 0.375, + "learning_rate": 0.0001846614247710614, + "loss": 1.3541, + "step": 2360 + }, + { + "epoch": 0.2613115297497376, + "grad_norm": 0.361328125, + "learning_rate": 0.000184558629109713, + "loss": 1.3721, + "step": 2365 + }, + { + "epoch": 0.2618639854151704, + "grad_norm": 0.3984375, + "learning_rate": 0.00018445551895218394, + "loss": 1.2935, + "step": 2370 + }, + { + "epoch": 0.2624164410806033, + "grad_norm": 0.392578125, + "learning_rate": 0.00018435209468196847, + "loss": 1.3716, + "step": 2375 + }, + { + "epoch": 0.26296889674603613, + "grad_norm": 0.37890625, + "learning_rate": 0.00018424835668372919, + "loss": 1.3419, + "step": 2380 + }, + { + "epoch": 0.26352135241146896, + "grad_norm": 0.37109375, + "learning_rate": 0.0001841443053432955, + "loss": 1.2971, + "step": 2385 + }, + { + "epoch": 0.26407380807690184, + "grad_norm": 0.380859375, + "learning_rate": 0.00018403994104766212, + "loss": 1.3557, + "step": 2390 + }, + { + "epoch": 0.26462626374233467, + "grad_norm": 0.419921875, + "learning_rate": 0.00018393526418498786, + "loss": 1.296, + "step": 2395 + }, + { + "epoch": 0.26517871940776755, + "grad_norm": 0.427734375, + "learning_rate": 0.00018383027514459402, + "loss": 1.2831, + "step": 2400 + }, + { + "epoch": 0.2657311750732004, + "grad_norm": 0.408203125, + "learning_rate": 0.00018372497431696288, + "loss": 1.316, + "step": 2405 + }, + { + "epoch": 0.2662836307386332, + "grad_norm": 0.84375, + "learning_rate": 0.00018361936209373644, + "loss": 1.2525, + "step": 2410 + }, + { + "epoch": 0.2668360864040661, + "grad_norm": 0.392578125, + "learning_rate": 0.00018351343886771488, + "loss": 1.3607, + "step": 2415 + }, + { + "epoch": 0.2673885420694989, + "grad_norm": 0.40625, + "learning_rate": 0.00018340720503285497, + "loss": 1.3376, + "step": 2420 + }, + { + "epoch": 0.2679409977349318, + "grad_norm": 0.384765625, + "learning_rate": 0.00018330066098426882, + "loss": 1.3497, + "step": 2425 + }, + { + "epoch": 0.2684934534003646, + "grad_norm": 0.392578125, + "learning_rate": 0.00018319380711822225, + "loss": 1.3144, + "step": 2430 + }, + { + "epoch": 0.26904590906579745, + "grad_norm": 0.388671875, + "learning_rate": 0.00018308664383213344, + "loss": 1.4368, + "step": 2435 + }, + { + "epoch": 0.26959836473123033, + "grad_norm": 0.400390625, + "learning_rate": 0.00018297917152457126, + "loss": 1.3755, + "step": 2440 + }, + { + "epoch": 0.27015082039666316, + "grad_norm": 0.376953125, + "learning_rate": 0.00018287139059525412, + "loss": 1.2929, + "step": 2445 + }, + { + "epoch": 0.27070327606209604, + "grad_norm": 0.38671875, + "learning_rate": 0.00018276330144504803, + "loss": 1.3217, + "step": 2450 + }, + { + "epoch": 0.27125573172752887, + "grad_norm": 0.3828125, + "learning_rate": 0.0001826549044759655, + "loss": 1.3161, + "step": 2455 + }, + { + "epoch": 0.2718081873929617, + "grad_norm": 0.38671875, + "learning_rate": 0.00018254620009116397, + "loss": 1.2695, + "step": 2460 + }, + { + "epoch": 0.2723606430583946, + "grad_norm": 0.384765625, + "learning_rate": 0.00018243718869494408, + "loss": 1.3496, + "step": 2465 + }, + { + "epoch": 0.2729130987238274, + "grad_norm": 0.65625, + "learning_rate": 0.0001823278706927484, + "loss": 1.3295, + "step": 2470 + }, + { + "epoch": 0.27346555438926023, + "grad_norm": 0.45703125, + "learning_rate": 0.00018221824649115984, + "loss": 1.3566, + "step": 2475 + }, + { + "epoch": 0.2740180100546931, + "grad_norm": 0.375, + "learning_rate": 0.00018210831649790018, + "loss": 1.2789, + "step": 2480 + }, + { + "epoch": 0.27457046572012594, + "grad_norm": 0.373046875, + "learning_rate": 0.00018199808112182847, + "loss": 1.2588, + "step": 2485 + }, + { + "epoch": 0.2751229213855588, + "grad_norm": 0.408203125, + "learning_rate": 0.00018188754077293963, + "loss": 1.2579, + "step": 2490 + }, + { + "epoch": 0.27567537705099165, + "grad_norm": 0.3828125, + "learning_rate": 0.00018177669586236277, + "loss": 1.2771, + "step": 2495 + }, + { + "epoch": 0.2762278327164245, + "grad_norm": 0.404296875, + "learning_rate": 0.0001816655468023598, + "loss": 1.3111, + "step": 2500 + }, + { + "epoch": 0.27678028838185736, + "grad_norm": 0.3984375, + "learning_rate": 0.00018155409400632386, + "loss": 1.3309, + "step": 2505 + }, + { + "epoch": 0.2773327440472902, + "grad_norm": 0.396484375, + "learning_rate": 0.0001814423378887777, + "loss": 1.2619, + "step": 2510 + }, + { + "epoch": 0.2778851997127231, + "grad_norm": 0.380859375, + "learning_rate": 0.00018133027886537225, + "loss": 1.2475, + "step": 2515 + }, + { + "epoch": 0.2784376553781559, + "grad_norm": 0.37890625, + "learning_rate": 0.00018121791735288504, + "loss": 1.2622, + "step": 2520 + }, + { + "epoch": 0.27899011104358873, + "grad_norm": 0.37109375, + "learning_rate": 0.00018110525376921862, + "loss": 1.2824, + "step": 2525 + }, + { + "epoch": 0.2795425667090216, + "grad_norm": 0.392578125, + "learning_rate": 0.00018099228853339901, + "loss": 1.2897, + "step": 2530 + }, + { + "epoch": 0.28009502237445444, + "grad_norm": 0.376953125, + "learning_rate": 0.00018087902206557411, + "loss": 1.4139, + "step": 2535 + }, + { + "epoch": 0.2806474780398873, + "grad_norm": 0.388671875, + "learning_rate": 0.00018076545478701235, + "loss": 1.2527, + "step": 2540 + }, + { + "epoch": 0.28119993370532015, + "grad_norm": 0.388671875, + "learning_rate": 0.00018065158712010076, + "loss": 1.2459, + "step": 2545 + }, + { + "epoch": 0.281752389370753, + "grad_norm": 0.388671875, + "learning_rate": 0.0001805374194883437, + "loss": 1.4089, + "step": 2550 + }, + { + "epoch": 0.28230484503618586, + "grad_norm": 0.400390625, + "learning_rate": 0.00018042295231636115, + "loss": 1.4094, + "step": 2555 + }, + { + "epoch": 0.2828573007016187, + "grad_norm": 0.376953125, + "learning_rate": 0.0001803081860298872, + "loss": 1.3252, + "step": 2560 + }, + { + "epoch": 0.28340975636705157, + "grad_norm": 0.375, + "learning_rate": 0.0001801931210557684, + "loss": 1.3043, + "step": 2565 + }, + { + "epoch": 0.2839622120324844, + "grad_norm": 0.388671875, + "learning_rate": 0.00018007775782196214, + "loss": 1.2738, + "step": 2570 + }, + { + "epoch": 0.2845146676979172, + "grad_norm": 0.37109375, + "learning_rate": 0.00017996209675753523, + "loss": 1.3325, + "step": 2575 + }, + { + "epoch": 0.2850671233633501, + "grad_norm": 0.3828125, + "learning_rate": 0.0001798461382926621, + "loss": 1.2705, + "step": 2580 + }, + { + "epoch": 0.28561957902878293, + "grad_norm": 0.388671875, + "learning_rate": 0.00017972988285862337, + "loss": 1.3578, + "step": 2585 + }, + { + "epoch": 0.2861720346942158, + "grad_norm": 0.37890625, + "learning_rate": 0.00017961333088780404, + "loss": 1.2751, + "step": 2590 + }, + { + "epoch": 0.28672449035964864, + "grad_norm": 0.396484375, + "learning_rate": 0.00017949648281369217, + "loss": 1.3509, + "step": 2595 + }, + { + "epoch": 0.28727694602508147, + "grad_norm": 0.376953125, + "learning_rate": 0.00017937933907087703, + "loss": 1.2251, + "step": 2600 + }, + { + "epoch": 0.28782940169051435, + "grad_norm": 0.388671875, + "learning_rate": 0.00017926190009504752, + "loss": 1.3057, + "step": 2605 + }, + { + "epoch": 0.2883818573559472, + "grad_norm": 0.375, + "learning_rate": 0.00017914416632299065, + "loss": 1.2594, + "step": 2610 + }, + { + "epoch": 0.28893431302138, + "grad_norm": 0.380859375, + "learning_rate": 0.00017902613819258985, + "loss": 1.2807, + "step": 2615 + }, + { + "epoch": 0.2894867686868129, + "grad_norm": 0.388671875, + "learning_rate": 0.0001789078161428233, + "loss": 1.2231, + "step": 2620 + }, + { + "epoch": 0.2900392243522457, + "grad_norm": 0.37890625, + "learning_rate": 0.00017878920061376247, + "loss": 1.2458, + "step": 2625 + }, + { + "epoch": 0.2905916800176786, + "grad_norm": 0.373046875, + "learning_rate": 0.0001786702920465702, + "loss": 1.2969, + "step": 2630 + }, + { + "epoch": 0.2911441356831114, + "grad_norm": 0.392578125, + "learning_rate": 0.00017855109088349926, + "loss": 1.2793, + "step": 2635 + }, + { + "epoch": 0.29169659134854425, + "grad_norm": 0.41015625, + "learning_rate": 0.00017843159756789076, + "loss": 1.3146, + "step": 2640 + }, + { + "epoch": 0.29224904701397714, + "grad_norm": 0.376953125, + "learning_rate": 0.00017831181254417228, + "loss": 1.3319, + "step": 2645 + }, + { + "epoch": 0.29280150267940996, + "grad_norm": 0.376953125, + "learning_rate": 0.00017819173625785643, + "loss": 1.3275, + "step": 2650 + }, + { + "epoch": 0.29335395834484285, + "grad_norm": 0.39453125, + "learning_rate": 0.00017807136915553903, + "loss": 1.3764, + "step": 2655 + }, + { + "epoch": 0.2939064140102757, + "grad_norm": 0.38671875, + "learning_rate": 0.0001779507116848976, + "loss": 1.3263, + "step": 2660 + }, + { + "epoch": 0.2944588696757085, + "grad_norm": 0.357421875, + "learning_rate": 0.00017782976429468956, + "loss": 1.2854, + "step": 2665 + }, + { + "epoch": 0.2950113253411414, + "grad_norm": 0.400390625, + "learning_rate": 0.00017770852743475066, + "loss": 1.2836, + "step": 2670 + }, + { + "epoch": 0.2955637810065742, + "grad_norm": 0.375, + "learning_rate": 0.00017758700155599317, + "loss": 1.3247, + "step": 2675 + }, + { + "epoch": 0.2961162366720071, + "grad_norm": 0.384765625, + "learning_rate": 0.00017746518711040442, + "loss": 1.293, + "step": 2680 + }, + { + "epoch": 0.2966686923374399, + "grad_norm": 0.384765625, + "learning_rate": 0.00017734308455104496, + "loss": 1.2325, + "step": 2685 + }, + { + "epoch": 0.29722114800287275, + "grad_norm": 0.376953125, + "learning_rate": 0.00017722069433204687, + "loss": 1.3441, + "step": 2690 + }, + { + "epoch": 0.29777360366830563, + "grad_norm": 0.388671875, + "learning_rate": 0.00017709801690861214, + "loss": 1.3705, + "step": 2695 + }, + { + "epoch": 0.29832605933373846, + "grad_norm": 0.36328125, + "learning_rate": 0.00017697505273701097, + "loss": 1.2979, + "step": 2700 + }, + { + "epoch": 0.29887851499917134, + "grad_norm": 0.3984375, + "learning_rate": 0.00017685180227458003, + "loss": 1.3489, + "step": 2705 + }, + { + "epoch": 0.29943097066460417, + "grad_norm": 0.396484375, + "learning_rate": 0.0001767282659797208, + "loss": 1.2235, + "step": 2710 + }, + { + "epoch": 0.299983426330037, + "grad_norm": 0.41015625, + "learning_rate": 0.0001766044443118978, + "loss": 1.3033, + "step": 2715 + }, + { + "epoch": 0.3005358819954699, + "grad_norm": 0.392578125, + "learning_rate": 0.000176480337731637, + "loss": 1.1975, + "step": 2720 + }, + { + "epoch": 0.3010883376609027, + "grad_norm": 0.36328125, + "learning_rate": 0.000176355946700524, + "loss": 1.2594, + "step": 2725 + }, + { + "epoch": 0.3016407933263356, + "grad_norm": 0.37890625, + "learning_rate": 0.00017623127168120233, + "loss": 1.3387, + "step": 2730 + }, + { + "epoch": 0.3021932489917684, + "grad_norm": 0.404296875, + "learning_rate": 0.00017610631313737173, + "loss": 1.3067, + "step": 2735 + }, + { + "epoch": 0.30274570465720124, + "grad_norm": 0.376953125, + "learning_rate": 0.00017598107153378657, + "loss": 1.3715, + "step": 2740 + }, + { + "epoch": 0.3032981603226341, + "grad_norm": 0.40234375, + "learning_rate": 0.00017585554733625384, + "loss": 1.3346, + "step": 2745 + }, + { + "epoch": 0.30385061598806695, + "grad_norm": 0.384765625, + "learning_rate": 0.00017572974101163165, + "loss": 1.2732, + "step": 2750 + }, + { + "epoch": 0.3044030716534998, + "grad_norm": 0.412109375, + "learning_rate": 0.00017560365302782738, + "loss": 1.3209, + "step": 2755 + }, + { + "epoch": 0.30495552731893266, + "grad_norm": 0.484375, + "learning_rate": 0.00017547728385379605, + "loss": 1.2767, + "step": 2760 + }, + { + "epoch": 0.3055079829843655, + "grad_norm": 0.400390625, + "learning_rate": 0.0001753506339595384, + "loss": 1.3191, + "step": 2765 + }, + { + "epoch": 0.30606043864979837, + "grad_norm": 0.3828125, + "learning_rate": 0.00017522370381609935, + "loss": 1.2933, + "step": 2770 + }, + { + "epoch": 0.3066128943152312, + "grad_norm": 0.384765625, + "learning_rate": 0.00017509649389556607, + "loss": 1.3383, + "step": 2775 + }, + { + "epoch": 0.307165349980664, + "grad_norm": 0.41015625, + "learning_rate": 0.00017496900467106627, + "loss": 1.2884, + "step": 2780 + }, + { + "epoch": 0.3077178056460969, + "grad_norm": 0.396484375, + "learning_rate": 0.00017484123661676656, + "loss": 1.2966, + "step": 2785 + }, + { + "epoch": 0.30827026131152974, + "grad_norm": 0.37109375, + "learning_rate": 0.0001747131902078705, + "loss": 1.2811, + "step": 2790 + }, + { + "epoch": 0.3088227169769626, + "grad_norm": 0.38671875, + "learning_rate": 0.00017458486592061704, + "loss": 1.2886, + "step": 2795 + }, + { + "epoch": 0.30937517264239545, + "grad_norm": 0.392578125, + "learning_rate": 0.00017445626423227844, + "loss": 1.2527, + "step": 2800 + }, + { + "epoch": 0.3099276283078283, + "grad_norm": 0.396484375, + "learning_rate": 0.0001743273856211589, + "loss": 1.2855, + "step": 2805 + }, + { + "epoch": 0.31048008397326116, + "grad_norm": 0.400390625, + "learning_rate": 0.00017419823056659243, + "loss": 1.3601, + "step": 2810 + }, + { + "epoch": 0.311032539638694, + "grad_norm": 0.38671875, + "learning_rate": 0.00017406879954894134, + "loss": 1.2841, + "step": 2815 + }, + { + "epoch": 0.31158499530412687, + "grad_norm": 0.384765625, + "learning_rate": 0.00017393909304959414, + "loss": 1.3243, + "step": 2820 + }, + { + "epoch": 0.3121374509695597, + "grad_norm": 0.40234375, + "learning_rate": 0.00017380911155096408, + "loss": 1.3383, + "step": 2825 + }, + { + "epoch": 0.3126899066349925, + "grad_norm": 0.38671875, + "learning_rate": 0.00017367885553648717, + "loss": 1.3337, + "step": 2830 + }, + { + "epoch": 0.3132423623004254, + "grad_norm": 0.37890625, + "learning_rate": 0.00017354832549062034, + "loss": 1.2695, + "step": 2835 + }, + { + "epoch": 0.31379481796585823, + "grad_norm": 0.37890625, + "learning_rate": 0.00017341752189883983, + "loss": 1.3181, + "step": 2840 + }, + { + "epoch": 0.3143472736312911, + "grad_norm": 0.3515625, + "learning_rate": 0.0001732864452476392, + "loss": 1.3033, + "step": 2845 + }, + { + "epoch": 0.31489972929672394, + "grad_norm": 0.412109375, + "learning_rate": 0.0001731550960245276, + "loss": 1.3264, + "step": 2850 + }, + { + "epoch": 0.31545218496215677, + "grad_norm": 0.376953125, + "learning_rate": 0.00017302347471802798, + "loss": 1.317, + "step": 2855 + }, + { + "epoch": 0.31600464062758965, + "grad_norm": 0.38671875, + "learning_rate": 0.00017289158181767517, + "loss": 1.2907, + "step": 2860 + }, + { + "epoch": 0.3165570962930225, + "grad_norm": 0.3984375, + "learning_rate": 0.00017275941781401427, + "loss": 1.3268, + "step": 2865 + }, + { + "epoch": 0.31710955195845536, + "grad_norm": 0.455078125, + "learning_rate": 0.00017262698319859846, + "loss": 1.4696, + "step": 2870 + }, + { + "epoch": 0.3176620076238882, + "grad_norm": 0.404296875, + "learning_rate": 0.00017249427846398766, + "loss": 1.2852, + "step": 2875 + }, + { + "epoch": 0.318214463289321, + "grad_norm": 0.369140625, + "learning_rate": 0.00017236130410374625, + "loss": 1.3287, + "step": 2880 + }, + { + "epoch": 0.3187669189547539, + "grad_norm": 0.38671875, + "learning_rate": 0.0001722280606124415, + "loss": 1.2764, + "step": 2885 + }, + { + "epoch": 0.3193193746201867, + "grad_norm": 0.384765625, + "learning_rate": 0.00017209454848564156, + "loss": 1.2118, + "step": 2890 + }, + { + "epoch": 0.31987183028561955, + "grad_norm": 0.384765625, + "learning_rate": 0.00017196076821991384, + "loss": 1.3399, + "step": 2895 + }, + { + "epoch": 0.32042428595105243, + "grad_norm": 0.384765625, + "learning_rate": 0.00017182672031282296, + "loss": 1.2984, + "step": 2900 + }, + { + "epoch": 0.32097674161648526, + "grad_norm": 0.396484375, + "learning_rate": 0.00017169240526292896, + "loss": 1.435, + "step": 2905 + }, + { + "epoch": 0.32152919728191814, + "grad_norm": 0.3671875, + "learning_rate": 0.0001715578235697855, + "loss": 1.2973, + "step": 2910 + }, + { + "epoch": 0.32208165294735097, + "grad_norm": 0.421875, + "learning_rate": 0.0001714229757339379, + "loss": 1.3144, + "step": 2915 + }, + { + "epoch": 0.3226341086127838, + "grad_norm": 0.384765625, + "learning_rate": 0.00017128786225692136, + "loss": 1.3097, + "step": 2920 + }, + { + "epoch": 0.3231865642782167, + "grad_norm": 0.416015625, + "learning_rate": 0.00017115248364125906, + "loss": 1.2551, + "step": 2925 + }, + { + "epoch": 0.3237390199436495, + "grad_norm": 0.380859375, + "learning_rate": 0.00017101684039046036, + "loss": 1.3567, + "step": 2930 + }, + { + "epoch": 0.3242914756090824, + "grad_norm": 0.41796875, + "learning_rate": 0.00017088093300901883, + "loss": 1.394, + "step": 2935 + }, + { + "epoch": 0.3248439312745152, + "grad_norm": 0.404296875, + "learning_rate": 0.00017074476200241035, + "loss": 1.2932, + "step": 2940 + }, + { + "epoch": 0.32539638693994805, + "grad_norm": 0.4453125, + "learning_rate": 0.0001706083278770914, + "loss": 1.315, + "step": 2945 + }, + { + "epoch": 0.32594884260538093, + "grad_norm": 0.3828125, + "learning_rate": 0.00017047163114049702, + "loss": 1.3137, + "step": 2950 + }, + { + "epoch": 0.32650129827081376, + "grad_norm": 0.376953125, + "learning_rate": 0.00017033467230103894, + "loss": 1.2952, + "step": 2955 + }, + { + "epoch": 0.32705375393624664, + "grad_norm": 0.3984375, + "learning_rate": 0.00017019745186810378, + "loss": 1.3501, + "step": 2960 + }, + { + "epoch": 0.32760620960167947, + "grad_norm": 0.3828125, + "learning_rate": 0.0001700599703520511, + "loss": 1.3543, + "step": 2965 + }, + { + "epoch": 0.3281586652671123, + "grad_norm": 0.375, + "learning_rate": 0.00016992222826421133, + "loss": 1.2998, + "step": 2970 + }, + { + "epoch": 0.3287111209325452, + "grad_norm": 0.392578125, + "learning_rate": 0.0001697842261168843, + "loss": 1.3317, + "step": 2975 + }, + { + "epoch": 0.329263576597978, + "grad_norm": 0.384765625, + "learning_rate": 0.00016964596442333696, + "loss": 1.218, + "step": 2980 + }, + { + "epoch": 0.3298160322634109, + "grad_norm": 0.408203125, + "learning_rate": 0.00016950744369780148, + "loss": 1.2898, + "step": 2985 + }, + { + "epoch": 0.3303684879288437, + "grad_norm": 0.384765625, + "learning_rate": 0.00016936866445547353, + "loss": 1.2866, + "step": 2990 + }, + { + "epoch": 0.33092094359427654, + "grad_norm": 0.412109375, + "learning_rate": 0.00016922962721251038, + "loss": 1.3173, + "step": 2995 + }, + { + "epoch": 0.3314733992597094, + "grad_norm": 0.423828125, + "learning_rate": 0.0001690903324860286, + "loss": 1.2364, + "step": 3000 + }, + { + "epoch": 0.33202585492514225, + "grad_norm": 0.384765625, + "learning_rate": 0.0001689507807941027, + "loss": 1.3763, + "step": 3005 + }, + { + "epoch": 0.33257831059057513, + "grad_norm": 0.40234375, + "learning_rate": 0.0001688109726557627, + "loss": 1.2882, + "step": 3010 + }, + { + "epoch": 0.33313076625600796, + "grad_norm": 0.37890625, + "learning_rate": 0.00016867090859099256, + "loss": 1.2957, + "step": 3015 + }, + { + "epoch": 0.3336832219214408, + "grad_norm": 0.3828125, + "learning_rate": 0.00016853058912072802, + "loss": 1.2837, + "step": 3020 + }, + { + "epoch": 0.33423567758687367, + "grad_norm": 0.40625, + "learning_rate": 0.0001683900147668547, + "loss": 1.3043, + "step": 3025 + }, + { + "epoch": 0.3347881332523065, + "grad_norm": 0.39453125, + "learning_rate": 0.0001682491860522063, + "loss": 1.2826, + "step": 3030 + }, + { + "epoch": 0.3353405889177393, + "grad_norm": 0.396484375, + "learning_rate": 0.0001681081035005626, + "loss": 1.339, + "step": 3035 + }, + { + "epoch": 0.3358930445831722, + "grad_norm": 0.3984375, + "learning_rate": 0.00016796676763664725, + "loss": 1.3558, + "step": 3040 + }, + { + "epoch": 0.33644550024860503, + "grad_norm": 0.388671875, + "learning_rate": 0.00016782517898612619, + "loss": 1.2467, + "step": 3045 + }, + { + "epoch": 0.3369979559140379, + "grad_norm": 0.39453125, + "learning_rate": 0.00016768333807560558, + "loss": 1.2127, + "step": 3050 + }, + { + "epoch": 0.33755041157947074, + "grad_norm": 0.396484375, + "learning_rate": 0.00016754124543262973, + "loss": 1.3179, + "step": 3055 + }, + { + "epoch": 0.33810286724490357, + "grad_norm": 0.37890625, + "learning_rate": 0.00016739890158567916, + "loss": 1.2387, + "step": 3060 + }, + { + "epoch": 0.33865532291033645, + "grad_norm": 0.396484375, + "learning_rate": 0.0001672563070641688, + "loss": 1.3292, + "step": 3065 + }, + { + "epoch": 0.3392077785757693, + "grad_norm": 0.375, + "learning_rate": 0.00016711346239844588, + "loss": 1.3224, + "step": 3070 + }, + { + "epoch": 0.33976023424120216, + "grad_norm": 0.392578125, + "learning_rate": 0.00016697036811978786, + "loss": 1.2961, + "step": 3075 + }, + { + "epoch": 0.340312689906635, + "grad_norm": 0.373046875, + "learning_rate": 0.00016682702476040077, + "loss": 1.3446, + "step": 3080 + }, + { + "epoch": 0.3408651455720678, + "grad_norm": 0.392578125, + "learning_rate": 0.00016668343285341686, + "loss": 1.33, + "step": 3085 + }, + { + "epoch": 0.3414176012375007, + "grad_norm": 0.388671875, + "learning_rate": 0.00016653959293289297, + "loss": 1.3379, + "step": 3090 + }, + { + "epoch": 0.34197005690293353, + "grad_norm": 0.392578125, + "learning_rate": 0.00016639550553380818, + "loss": 1.3434, + "step": 3095 + }, + { + "epoch": 0.3425225125683664, + "grad_norm": 0.388671875, + "learning_rate": 0.00016625117119206214, + "loss": 1.3041, + "step": 3100 + }, + { + "epoch": 0.34307496823379924, + "grad_norm": 0.390625, + "learning_rate": 0.00016610659044447298, + "loss": 1.2399, + "step": 3105 + }, + { + "epoch": 0.34362742389923207, + "grad_norm": 0.40234375, + "learning_rate": 0.00016596176382877506, + "loss": 1.313, + "step": 3110 + }, + { + "epoch": 0.34417987956466495, + "grad_norm": 0.37890625, + "learning_rate": 0.00016581669188361748, + "loss": 1.2899, + "step": 3115 + }, + { + "epoch": 0.3447323352300978, + "grad_norm": 0.388671875, + "learning_rate": 0.00016567137514856154, + "loss": 1.3798, + "step": 3120 + }, + { + "epoch": 0.34528479089553066, + "grad_norm": 0.4140625, + "learning_rate": 0.00016552581416407917, + "loss": 1.3817, + "step": 3125 + }, + { + "epoch": 0.3458372465609635, + "grad_norm": 0.353515625, + "learning_rate": 0.00016538000947155062, + "loss": 1.2274, + "step": 3130 + }, + { + "epoch": 0.3463897022263963, + "grad_norm": 0.361328125, + "learning_rate": 0.0001652339616132625, + "loss": 1.266, + "step": 3135 + }, + { + "epoch": 0.3469421578918292, + "grad_norm": 0.3828125, + "learning_rate": 0.00016508767113240598, + "loss": 1.3188, + "step": 3140 + }, + { + "epoch": 0.347494613557262, + "grad_norm": 0.390625, + "learning_rate": 0.00016494113857307453, + "loss": 1.2402, + "step": 3145 + }, + { + "epoch": 0.3480470692226949, + "grad_norm": 0.421875, + "learning_rate": 0.00016479436448026195, + "loss": 1.291, + "step": 3150 + }, + { + "epoch": 0.34859952488812773, + "grad_norm": 0.384765625, + "learning_rate": 0.00016464734939986036, + "loss": 1.2659, + "step": 3155 + }, + { + "epoch": 0.34915198055356056, + "grad_norm": 0.3984375, + "learning_rate": 0.00016450009387865822, + "loss": 1.3496, + "step": 3160 + }, + { + "epoch": 0.34970443621899344, + "grad_norm": 0.375, + "learning_rate": 0.00016435259846433824, + "loss": 1.2812, + "step": 3165 + }, + { + "epoch": 0.35025689188442627, + "grad_norm": 0.451171875, + "learning_rate": 0.00016420486370547537, + "loss": 1.2947, + "step": 3170 + }, + { + "epoch": 0.3508093475498591, + "grad_norm": 0.380859375, + "learning_rate": 0.00016405689015153472, + "loss": 1.2675, + "step": 3175 + }, + { + "epoch": 0.351361803215292, + "grad_norm": 0.39453125, + "learning_rate": 0.00016390867835286953, + "loss": 1.3378, + "step": 3180 + }, + { + "epoch": 0.3519142588807248, + "grad_norm": 0.37890625, + "learning_rate": 0.0001637602288607192, + "loss": 1.2887, + "step": 3185 + }, + { + "epoch": 0.3524667145461577, + "grad_norm": 0.392578125, + "learning_rate": 0.00016361154222720715, + "loss": 1.2812, + "step": 3190 + }, + { + "epoch": 0.3530191702115905, + "grad_norm": 0.384765625, + "learning_rate": 0.00016346261900533867, + "loss": 1.3614, + "step": 3195 + }, + { + "epoch": 0.35357162587702334, + "grad_norm": 0.388671875, + "learning_rate": 0.00016331345974899923, + "loss": 1.2825, + "step": 3200 + }, + { + "epoch": 0.3541240815424562, + "grad_norm": 0.37890625, + "learning_rate": 0.00016316406501295198, + "loss": 1.2426, + "step": 3205 + }, + { + "epoch": 0.35467653720788905, + "grad_norm": 0.365234375, + "learning_rate": 0.0001630144353528359, + "loss": 1.2618, + "step": 3210 + }, + { + "epoch": 0.35522899287332194, + "grad_norm": 0.41015625, + "learning_rate": 0.00016286457132516383, + "loss": 1.3534, + "step": 3215 + }, + { + "epoch": 0.35578144853875476, + "grad_norm": 0.384765625, + "learning_rate": 0.0001627144734873202, + "loss": 1.2713, + "step": 3220 + }, + { + "epoch": 0.3563339042041876, + "grad_norm": 0.400390625, + "learning_rate": 0.00016256414239755902, + "loss": 1.2645, + "step": 3225 + }, + { + "epoch": 0.3568863598696205, + "grad_norm": 0.396484375, + "learning_rate": 0.00016241357861500184, + "loss": 1.2915, + "step": 3230 + }, + { + "epoch": 0.3574388155350533, + "grad_norm": 0.400390625, + "learning_rate": 0.00016226278269963578, + "loss": 1.3063, + "step": 3235 + }, + { + "epoch": 0.3579912712004862, + "grad_norm": 0.416015625, + "learning_rate": 0.00016211175521231108, + "loss": 1.2619, + "step": 3240 + }, + { + "epoch": 0.358543726865919, + "grad_norm": 0.4140625, + "learning_rate": 0.00016196049671473954, + "loss": 1.2738, + "step": 3245 + }, + { + "epoch": 0.35909618253135184, + "grad_norm": 0.40234375, + "learning_rate": 0.0001618090077694919, + "loss": 1.2864, + "step": 3250 + }, + { + "epoch": 0.3596486381967847, + "grad_norm": 0.396484375, + "learning_rate": 0.00016165728893999617, + "loss": 1.3055, + "step": 3255 + }, + { + "epoch": 0.36020109386221755, + "grad_norm": 0.4140625, + "learning_rate": 0.00016150534079053527, + "loss": 1.2808, + "step": 3260 + }, + { + "epoch": 0.36075354952765043, + "grad_norm": 0.380859375, + "learning_rate": 0.00016135316388624505, + "loss": 1.3164, + "step": 3265 + }, + { + "epoch": 0.36130600519308326, + "grad_norm": 0.380859375, + "learning_rate": 0.0001612007587931122, + "loss": 1.3134, + "step": 3270 + }, + { + "epoch": 0.3618584608585161, + "grad_norm": 0.384765625, + "learning_rate": 0.00016104812607797202, + "loss": 1.2804, + "step": 3275 + }, + { + "epoch": 0.36241091652394897, + "grad_norm": 0.3984375, + "learning_rate": 0.00016089526630850643, + "loss": 1.2536, + "step": 3280 + }, + { + "epoch": 0.3629633721893818, + "grad_norm": 0.40234375, + "learning_rate": 0.0001607421800532419, + "loss": 1.2638, + "step": 3285 + }, + { + "epoch": 0.3635158278548147, + "grad_norm": 0.40234375, + "learning_rate": 0.00016058886788154712, + "loss": 1.2628, + "step": 3290 + }, + { + "epoch": 0.3640682835202475, + "grad_norm": 0.37109375, + "learning_rate": 0.00016043533036363115, + "loss": 1.2148, + "step": 3295 + }, + { + "epoch": 0.36462073918568033, + "grad_norm": 0.376953125, + "learning_rate": 0.00016028156807054112, + "loss": 1.3718, + "step": 3300 + }, + { + "epoch": 0.3651731948511132, + "grad_norm": 0.380859375, + "learning_rate": 0.0001601275815741602, + "loss": 1.2852, + "step": 3305 + }, + { + "epoch": 0.36572565051654604, + "grad_norm": 0.380859375, + "learning_rate": 0.00015997337144720532, + "loss": 1.3223, + "step": 3310 + }, + { + "epoch": 0.36627810618197887, + "grad_norm": 0.39453125, + "learning_rate": 0.00015981893826322527, + "loss": 1.2156, + "step": 3315 + }, + { + "epoch": 0.36683056184741175, + "grad_norm": 0.40234375, + "learning_rate": 0.00015966428259659845, + "loss": 1.3567, + "step": 3320 + }, + { + "epoch": 0.3673830175128446, + "grad_norm": 0.396484375, + "learning_rate": 0.00015950940502253063, + "loss": 1.2521, + "step": 3325 + }, + { + "epoch": 0.36793547317827746, + "grad_norm": 0.40234375, + "learning_rate": 0.00015935430611705296, + "loss": 1.2567, + "step": 3330 + }, + { + "epoch": 0.3684879288437103, + "grad_norm": 0.3984375, + "learning_rate": 0.0001591989864570199, + "loss": 1.3041, + "step": 3335 + }, + { + "epoch": 0.3690403845091431, + "grad_norm": 0.408203125, + "learning_rate": 0.00015904344662010672, + "loss": 1.3083, + "step": 3340 + }, + { + "epoch": 0.369592840174576, + "grad_norm": 0.390625, + "learning_rate": 0.00015888768718480778, + "loss": 1.319, + "step": 3345 + }, + { + "epoch": 0.3701452958400088, + "grad_norm": 0.37890625, + "learning_rate": 0.00015873170873043413, + "loss": 1.2914, + "step": 3350 + }, + { + "epoch": 0.3706977515054417, + "grad_norm": 0.404296875, + "learning_rate": 0.00015857551183711137, + "loss": 1.3141, + "step": 3355 + }, + { + "epoch": 0.37125020717087454, + "grad_norm": 0.39453125, + "learning_rate": 0.0001584190970857776, + "loss": 1.3307, + "step": 3360 + }, + { + "epoch": 0.37180266283630736, + "grad_norm": 0.41015625, + "learning_rate": 0.00015826246505818112, + "loss": 1.3437, + "step": 3365 + }, + { + "epoch": 0.37235511850174025, + "grad_norm": 0.39453125, + "learning_rate": 0.00015810561633687842, + "loss": 1.282, + "step": 3370 + }, + { + "epoch": 0.3729075741671731, + "grad_norm": 0.40625, + "learning_rate": 0.00015794855150523182, + "loss": 1.2792, + "step": 3375 + }, + { + "epoch": 0.37346002983260596, + "grad_norm": 0.44140625, + "learning_rate": 0.00015779127114740757, + "loss": 1.301, + "step": 3380 + }, + { + "epoch": 0.3740124854980388, + "grad_norm": 0.3671875, + "learning_rate": 0.00015763377584837335, + "loss": 1.3113, + "step": 3385 + }, + { + "epoch": 0.3745649411634716, + "grad_norm": 0.3984375, + "learning_rate": 0.0001574760661938964, + "loss": 1.2551, + "step": 3390 + }, + { + "epoch": 0.3751173968289045, + "grad_norm": 0.384765625, + "learning_rate": 0.00015731814277054112, + "loss": 1.2969, + "step": 3395 + }, + { + "epoch": 0.3756698524943373, + "grad_norm": 0.392578125, + "learning_rate": 0.00015716000616566698, + "loss": 1.3498, + "step": 3400 + }, + { + "epoch": 0.3762223081597702, + "grad_norm": 0.404296875, + "learning_rate": 0.0001570016569674264, + "loss": 1.3202, + "step": 3405 + }, + { + "epoch": 0.37677476382520303, + "grad_norm": 0.384765625, + "learning_rate": 0.00015684309576476246, + "loss": 1.3306, + "step": 3410 + }, + { + "epoch": 0.37732721949063586, + "grad_norm": 0.38671875, + "learning_rate": 0.00015668432314740663, + "loss": 1.2447, + "step": 3415 + }, + { + "epoch": 0.37787967515606874, + "grad_norm": 0.3828125, + "learning_rate": 0.00015652533970587687, + "loss": 1.3604, + "step": 3420 + }, + { + "epoch": 0.37843213082150157, + "grad_norm": 0.4140625, + "learning_rate": 0.00015636614603147512, + "loss": 1.2636, + "step": 3425 + }, + { + "epoch": 0.37898458648693445, + "grad_norm": 0.39453125, + "learning_rate": 0.0001562067427162853, + "loss": 1.2856, + "step": 3430 + }, + { + "epoch": 0.3795370421523673, + "grad_norm": 0.357421875, + "learning_rate": 0.00015604713035317097, + "loss": 1.2489, + "step": 3435 + }, + { + "epoch": 0.3800894978178001, + "grad_norm": 0.4140625, + "learning_rate": 0.00015588730953577335, + "loss": 1.3487, + "step": 3440 + }, + { + "epoch": 0.380641953483233, + "grad_norm": 0.400390625, + "learning_rate": 0.0001557272808585087, + "loss": 1.3533, + "step": 3445 + }, + { + "epoch": 0.3811944091486658, + "grad_norm": 0.408203125, + "learning_rate": 0.00015556704491656665, + "loss": 1.3122, + "step": 3450 + }, + { + "epoch": 0.38174686481409864, + "grad_norm": 0.390625, + "learning_rate": 0.00015540660230590748, + "loss": 1.2188, + "step": 3455 + }, + { + "epoch": 0.3822993204795315, + "grad_norm": 0.39453125, + "learning_rate": 0.00015524595362326025, + "loss": 1.2876, + "step": 3460 + }, + { + "epoch": 0.38285177614496435, + "grad_norm": 0.388671875, + "learning_rate": 0.00015508509946612044, + "loss": 1.2888, + "step": 3465 + }, + { + "epoch": 0.38340423181039723, + "grad_norm": 0.375, + "learning_rate": 0.0001549240404327477, + "loss": 1.2401, + "step": 3470 + }, + { + "epoch": 0.38395668747583006, + "grad_norm": 0.388671875, + "learning_rate": 0.00015476277712216365, + "loss": 1.3932, + "step": 3475 + }, + { + "epoch": 0.3845091431412629, + "grad_norm": 0.39453125, + "learning_rate": 0.00015460131013414979, + "loss": 1.3198, + "step": 3480 + }, + { + "epoch": 0.38506159880669577, + "grad_norm": 0.4296875, + "learning_rate": 0.00015443964006924509, + "loss": 1.2714, + "step": 3485 + }, + { + "epoch": 0.3856140544721286, + "grad_norm": 0.39453125, + "learning_rate": 0.00015427776752874371, + "loss": 1.3115, + "step": 3490 + }, + { + "epoch": 0.3861665101375615, + "grad_norm": 0.375, + "learning_rate": 0.00015411569311469308, + "loss": 1.317, + "step": 3495 + }, + { + "epoch": 0.3867189658029943, + "grad_norm": 0.392578125, + "learning_rate": 0.00015395341742989124, + "loss": 1.2624, + "step": 3500 + }, + { + "epoch": 0.38727142146842713, + "grad_norm": 0.37890625, + "learning_rate": 0.00015379094107788497, + "loss": 1.3209, + "step": 3505 + }, + { + "epoch": 0.38782387713386, + "grad_norm": 0.38671875, + "learning_rate": 0.00015362826466296732, + "loss": 1.2344, + "step": 3510 + }, + { + "epoch": 0.38837633279929284, + "grad_norm": 0.39453125, + "learning_rate": 0.0001534653887901754, + "loss": 1.3466, + "step": 3515 + }, + { + "epoch": 0.3889287884647257, + "grad_norm": 0.375, + "learning_rate": 0.0001533023140652882, + "loss": 1.3633, + "step": 3520 + }, + { + "epoch": 0.38948124413015855, + "grad_norm": 0.412109375, + "learning_rate": 0.00015313904109482432, + "loss": 1.2775, + "step": 3525 + }, + { + "epoch": 0.3900336997955914, + "grad_norm": 0.38671875, + "learning_rate": 0.0001529755704860396, + "loss": 1.2686, + "step": 3530 + }, + { + "epoch": 0.39058615546102426, + "grad_norm": 0.388671875, + "learning_rate": 0.000152811902846925, + "loss": 1.3165, + "step": 3535 + }, + { + "epoch": 0.3911386111264571, + "grad_norm": 0.3984375, + "learning_rate": 0.0001526480387862043, + "loss": 1.2956, + "step": 3540 + }, + { + "epoch": 0.39169106679189, + "grad_norm": 0.412109375, + "learning_rate": 0.00015248397891333185, + "loss": 1.3655, + "step": 3545 + }, + { + "epoch": 0.3922435224573228, + "grad_norm": 0.3828125, + "learning_rate": 0.00015231972383849017, + "loss": 1.2996, + "step": 3550 + }, + { + "epoch": 0.39279597812275563, + "grad_norm": 0.4140625, + "learning_rate": 0.00015215527417258794, + "loss": 1.2721, + "step": 3555 + }, + { + "epoch": 0.3933484337881885, + "grad_norm": 0.390625, + "learning_rate": 0.00015199063052725745, + "loss": 1.2413, + "step": 3560 + }, + { + "epoch": 0.39390088945362134, + "grad_norm": 0.369140625, + "learning_rate": 0.00015182579351485248, + "loss": 1.2568, + "step": 3565 + }, + { + "epoch": 0.3944533451190542, + "grad_norm": 0.375, + "learning_rate": 0.00015166076374844605, + "loss": 1.2275, + "step": 3570 + }, + { + "epoch": 0.39500580078448705, + "grad_norm": 0.3984375, + "learning_rate": 0.00015149554184182802, + "loss": 1.3508, + "step": 3575 + }, + { + "epoch": 0.3955582564499199, + "grad_norm": 0.388671875, + "learning_rate": 0.00015133012840950292, + "loss": 1.27, + "step": 3580 + }, + { + "epoch": 0.39611071211535276, + "grad_norm": 0.375, + "learning_rate": 0.00015116452406668758, + "loss": 1.2897, + "step": 3585 + }, + { + "epoch": 0.3966631677807856, + "grad_norm": 0.376953125, + "learning_rate": 0.00015099872942930887, + "loss": 1.2945, + "step": 3590 + }, + { + "epoch": 0.3972156234462184, + "grad_norm": 0.416015625, + "learning_rate": 0.00015083274511400142, + "loss": 1.2058, + "step": 3595 + }, + { + "epoch": 0.3977680791116513, + "grad_norm": 0.41015625, + "learning_rate": 0.0001506665717381054, + "loss": 1.2996, + "step": 3600 + }, + { + "epoch": 0.3983205347770841, + "grad_norm": 0.40625, + "learning_rate": 0.00015050020991966406, + "loss": 1.2427, + "step": 3605 + }, + { + "epoch": 0.398872990442517, + "grad_norm": 0.3984375, + "learning_rate": 0.00015033366027742155, + "loss": 1.2581, + "step": 3610 + }, + { + "epoch": 0.39942544610794983, + "grad_norm": 0.423828125, + "learning_rate": 0.00015016692343082052, + "loss": 1.2623, + "step": 3615 + }, + { + "epoch": 0.39997790177338266, + "grad_norm": 0.41796875, + "learning_rate": 0.00015000000000000001, + "loss": 1.2438, + "step": 3620 + }, + { + "epoch": 0.40053035743881554, + "grad_norm": 0.421875, + "learning_rate": 0.00014983289060579294, + "loss": 1.3467, + "step": 3625 + }, + { + "epoch": 0.40108281310424837, + "grad_norm": 0.396484375, + "learning_rate": 0.00014966559586972387, + "loss": 1.3014, + "step": 3630 + }, + { + "epoch": 0.40163526876968125, + "grad_norm": 0.38671875, + "learning_rate": 0.0001494981164140067, + "loss": 1.3843, + "step": 3635 + }, + { + "epoch": 0.4021877244351141, + "grad_norm": 0.423828125, + "learning_rate": 0.0001493304528615424, + "loss": 1.3016, + "step": 3640 + }, + { + "epoch": 0.4027401801005469, + "grad_norm": 0.392578125, + "learning_rate": 0.00014916260583591658, + "loss": 1.2421, + "step": 3645 + }, + { + "epoch": 0.4032926357659798, + "grad_norm": 0.375, + "learning_rate": 0.00014899457596139729, + "loss": 1.3111, + "step": 3650 + }, + { + "epoch": 0.4038450914314126, + "grad_norm": 0.380859375, + "learning_rate": 0.0001488263638629326, + "loss": 1.3026, + "step": 3655 + }, + { + "epoch": 0.4043975470968455, + "grad_norm": 0.40625, + "learning_rate": 0.00014865797016614838, + "loss": 1.3204, + "step": 3660 + }, + { + "epoch": 0.4049500027622783, + "grad_norm": 0.412109375, + "learning_rate": 0.0001484893954973458, + "loss": 1.3437, + "step": 3665 + }, + { + "epoch": 0.40550245842771115, + "grad_norm": 0.39453125, + "learning_rate": 0.00014832064048349926, + "loss": 1.3389, + "step": 3670 + }, + { + "epoch": 0.40605491409314404, + "grad_norm": 0.380859375, + "learning_rate": 0.00014815170575225382, + "loss": 1.2688, + "step": 3675 + }, + { + "epoch": 0.40660736975857686, + "grad_norm": 0.408203125, + "learning_rate": 0.00014798259193192297, + "loss": 1.2756, + "step": 3680 + }, + { + "epoch": 0.40715982542400975, + "grad_norm": 0.396484375, + "learning_rate": 0.00014781329965148624, + "loss": 1.267, + "step": 3685 + }, + { + "epoch": 0.4077122810894426, + "grad_norm": 0.41796875, + "learning_rate": 0.000147643829540587, + "loss": 1.2677, + "step": 3690 + }, + { + "epoch": 0.4082647367548754, + "grad_norm": 0.392578125, + "learning_rate": 0.00014747418222952995, + "loss": 1.2098, + "step": 3695 + }, + { + "epoch": 0.4088171924203083, + "grad_norm": 0.390625, + "learning_rate": 0.00014730435834927884, + "loss": 1.2054, + "step": 3700 + }, + { + "epoch": 0.4093696480857411, + "grad_norm": 0.396484375, + "learning_rate": 0.0001471343585314542, + "loss": 1.2668, + "step": 3705 + }, + { + "epoch": 0.409922103751174, + "grad_norm": 0.390625, + "learning_rate": 0.0001469641834083308, + "loss": 1.2696, + "step": 3710 + }, + { + "epoch": 0.4104745594166068, + "grad_norm": 0.404296875, + "learning_rate": 0.00014679383361283554, + "loss": 1.2778, + "step": 3715 + }, + { + "epoch": 0.41102701508203965, + "grad_norm": 0.37890625, + "learning_rate": 0.00014662330977854488, + "loss": 1.3343, + "step": 3720 + }, + { + "epoch": 0.41157947074747253, + "grad_norm": 0.373046875, + "learning_rate": 0.00014645261253968262, + "loss": 1.2976, + "step": 3725 + }, + { + "epoch": 0.41213192641290536, + "grad_norm": 0.42578125, + "learning_rate": 0.00014628174253111752, + "loss": 1.2054, + "step": 3730 + }, + { + "epoch": 0.41268438207833824, + "grad_norm": 0.392578125, + "learning_rate": 0.00014611070038836083, + "loss": 1.2902, + "step": 3735 + }, + { + "epoch": 0.41323683774377107, + "grad_norm": 0.40234375, + "learning_rate": 0.00014593948674756417, + "loss": 1.2411, + "step": 3740 + }, + { + "epoch": 0.4137892934092039, + "grad_norm": 0.392578125, + "learning_rate": 0.00014576810224551683, + "loss": 1.2742, + "step": 3745 + }, + { + "epoch": 0.4143417490746368, + "grad_norm": 0.376953125, + "learning_rate": 0.00014559654751964364, + "loss": 1.3241, + "step": 3750 + }, + { + "epoch": 0.4148942047400696, + "grad_norm": 0.400390625, + "learning_rate": 0.00014542482320800264, + "loss": 1.3757, + "step": 3755 + }, + { + "epoch": 0.41544666040550243, + "grad_norm": 0.40234375, + "learning_rate": 0.00014525292994928247, + "loss": 1.3469, + "step": 3760 + }, + { + "epoch": 0.4159991160709353, + "grad_norm": 0.392578125, + "learning_rate": 0.00014508086838280017, + "loss": 1.354, + "step": 3765 + }, + { + "epoch": 0.41655157173636814, + "grad_norm": 0.392578125, + "learning_rate": 0.0001449086391484988, + "loss": 1.3268, + "step": 3770 + }, + { + "epoch": 0.417104027401801, + "grad_norm": 0.39453125, + "learning_rate": 0.00014473624288694498, + "loss": 1.3431, + "step": 3775 + }, + { + "epoch": 0.41765648306723385, + "grad_norm": 0.40234375, + "learning_rate": 0.00014456368023932657, + "loss": 1.2649, + "step": 3780 + }, + { + "epoch": 0.4182089387326667, + "grad_norm": 0.427734375, + "learning_rate": 0.00014439095184745024, + "loss": 1.2472, + "step": 3785 + }, + { + "epoch": 0.41876139439809956, + "grad_norm": 0.373046875, + "learning_rate": 0.00014421805835373915, + "loss": 1.3079, + "step": 3790 + }, + { + "epoch": 0.4193138500635324, + "grad_norm": 0.392578125, + "learning_rate": 0.0001440450004012305, + "loss": 1.3912, + "step": 3795 + }, + { + "epoch": 0.4198663057289653, + "grad_norm": 0.39453125, + "learning_rate": 0.00014387177863357307, + "loss": 1.2799, + "step": 3800 + }, + { + "epoch": 0.4204187613943981, + "grad_norm": 0.404296875, + "learning_rate": 0.00014369839369502506, + "loss": 1.2929, + "step": 3805 + }, + { + "epoch": 0.4209712170598309, + "grad_norm": 0.380859375, + "learning_rate": 0.00014352484623045148, + "loss": 1.3207, + "step": 3810 + }, + { + "epoch": 0.4215236727252638, + "grad_norm": 0.38671875, + "learning_rate": 0.00014335113688532182, + "loss": 1.3164, + "step": 3815 + }, + { + "epoch": 0.42207612839069664, + "grad_norm": 0.373046875, + "learning_rate": 0.0001431772663057076, + "loss": 1.2804, + "step": 3820 + }, + { + "epoch": 0.4226285840561295, + "grad_norm": 0.408203125, + "learning_rate": 0.00014300323513828008, + "loss": 1.3599, + "step": 3825 + }, + { + "epoch": 0.42318103972156235, + "grad_norm": 0.375, + "learning_rate": 0.00014282904403030772, + "loss": 1.2706, + "step": 3830 + }, + { + "epoch": 0.4237334953869952, + "grad_norm": 0.37109375, + "learning_rate": 0.000142654693629654, + "loss": 1.3047, + "step": 3835 + }, + { + "epoch": 0.42428595105242806, + "grad_norm": 0.392578125, + "learning_rate": 0.00014248018458477463, + "loss": 1.2786, + "step": 3840 + }, + { + "epoch": 0.4248384067178609, + "grad_norm": 0.39453125, + "learning_rate": 0.00014230551754471554, + "loss": 1.3008, + "step": 3845 + }, + { + "epoch": 0.42539086238329377, + "grad_norm": 0.392578125, + "learning_rate": 0.00014213069315911013, + "loss": 1.3216, + "step": 3850 + }, + { + "epoch": 0.4259433180487266, + "grad_norm": 0.388671875, + "learning_rate": 0.0001419557120781772, + "loss": 1.278, + "step": 3855 + }, + { + "epoch": 0.4264957737141594, + "grad_norm": 0.373046875, + "learning_rate": 0.00014178057495271815, + "loss": 1.2674, + "step": 3860 + }, + { + "epoch": 0.4270482293795923, + "grad_norm": 0.39453125, + "learning_rate": 0.00014160528243411494, + "loss": 1.3296, + "step": 3865 + }, + { + "epoch": 0.42760068504502513, + "grad_norm": 0.419921875, + "learning_rate": 0.00014142983517432723, + "loss": 1.2329, + "step": 3870 + }, + { + "epoch": 0.428153140710458, + "grad_norm": 0.38671875, + "learning_rate": 0.00014125423382589048, + "loss": 1.3433, + "step": 3875 + }, + { + "epoch": 0.42870559637589084, + "grad_norm": 0.462890625, + "learning_rate": 0.0001410784790419131, + "loss": 1.2509, + "step": 3880 + }, + { + "epoch": 0.42925805204132367, + "grad_norm": 0.390625, + "learning_rate": 0.00014090257147607413, + "loss": 1.2614, + "step": 3885 + }, + { + "epoch": 0.42981050770675655, + "grad_norm": 0.384765625, + "learning_rate": 0.00014072651178262096, + "loss": 1.3318, + "step": 3890 + }, + { + "epoch": 0.4303629633721894, + "grad_norm": 0.396484375, + "learning_rate": 0.00014055030061636668, + "loss": 1.2845, + "step": 3895 + }, + { + "epoch": 0.4309154190376222, + "grad_norm": 0.404296875, + "learning_rate": 0.00014037393863268783, + "loss": 1.2372, + "step": 3900 + }, + { + "epoch": 0.4314678747030551, + "grad_norm": 0.423828125, + "learning_rate": 0.00014019742648752184, + "loss": 1.2611, + "step": 3905 + }, + { + "epoch": 0.4320203303684879, + "grad_norm": 0.44921875, + "learning_rate": 0.0001400207648373646, + "loss": 1.3005, + "step": 3910 + }, + { + "epoch": 0.4325727860339208, + "grad_norm": 0.392578125, + "learning_rate": 0.00013984395433926816, + "loss": 1.3007, + "step": 3915 + }, + { + "epoch": 0.4331252416993536, + "grad_norm": 0.408203125, + "learning_rate": 0.00013966699565083802, + "loss": 1.2965, + "step": 3920 + }, + { + "epoch": 0.43367769736478645, + "grad_norm": 0.388671875, + "learning_rate": 0.00013948988943023096, + "loss": 1.3147, + "step": 3925 + }, + { + "epoch": 0.43423015303021933, + "grad_norm": 0.39453125, + "learning_rate": 0.00013931263633615241, + "loss": 1.3699, + "step": 3930 + }, + { + "epoch": 0.43478260869565216, + "grad_norm": 0.40234375, + "learning_rate": 0.0001391352370278541, + "loss": 1.3404, + "step": 3935 + }, + { + "epoch": 0.43533506436108504, + "grad_norm": 0.41796875, + "learning_rate": 0.00013895769216513157, + "loss": 1.3351, + "step": 3940 + }, + { + "epoch": 0.43588752002651787, + "grad_norm": 0.41015625, + "learning_rate": 0.00013878000240832167, + "loss": 1.2524, + "step": 3945 + }, + { + "epoch": 0.4364399756919507, + "grad_norm": 0.396484375, + "learning_rate": 0.00013860216841830018, + "loss": 1.2973, + "step": 3950 + }, + { + "epoch": 0.4369924313573836, + "grad_norm": 0.38671875, + "learning_rate": 0.00013842419085647933, + "loss": 1.3037, + "step": 3955 + }, + { + "epoch": 0.4375448870228164, + "grad_norm": 0.392578125, + "learning_rate": 0.00013824607038480532, + "loss": 1.2613, + "step": 3960 + }, + { + "epoch": 0.4380973426882493, + "grad_norm": 0.384765625, + "learning_rate": 0.00013806780766575588, + "loss": 1.2013, + "step": 3965 + }, + { + "epoch": 0.4386497983536821, + "grad_norm": 0.41015625, + "learning_rate": 0.0001378894033623378, + "loss": 1.3508, + "step": 3970 + }, + { + "epoch": 0.43920225401911495, + "grad_norm": 0.41796875, + "learning_rate": 0.00013771085813808442, + "loss": 1.3022, + "step": 3975 + }, + { + "epoch": 0.43975470968454783, + "grad_norm": 0.404296875, + "learning_rate": 0.00013753217265705323, + "loss": 1.3253, + "step": 3980 + }, + { + "epoch": 0.44030716534998066, + "grad_norm": 0.375, + "learning_rate": 0.0001373533475838234, + "loss": 1.2449, + "step": 3985 + }, + { + "epoch": 0.44085962101541354, + "grad_norm": 0.373046875, + "learning_rate": 0.0001371743835834932, + "loss": 1.2731, + "step": 3990 + }, + { + "epoch": 0.44141207668084637, + "grad_norm": 0.375, + "learning_rate": 0.00013699528132167776, + "loss": 1.3044, + "step": 3995 + }, + { + "epoch": 0.4419645323462792, + "grad_norm": 0.40625, + "learning_rate": 0.00013681604146450625, + "loss": 1.2405, + "step": 4000 + }, + { + "epoch": 0.4425169880117121, + "grad_norm": 0.388671875, + "learning_rate": 0.00013663666467861972, + "loss": 1.3248, + "step": 4005 + }, + { + "epoch": 0.4430694436771449, + "grad_norm": 0.384765625, + "learning_rate": 0.00013645715163116846, + "loss": 1.2828, + "step": 4010 + }, + { + "epoch": 0.4436218993425778, + "grad_norm": 0.392578125, + "learning_rate": 0.0001362775029898096, + "loss": 1.3126, + "step": 4015 + }, + { + "epoch": 0.4441743550080106, + "grad_norm": 0.404296875, + "learning_rate": 0.00013609771942270444, + "loss": 1.2572, + "step": 4020 + }, + { + "epoch": 0.44472681067344344, + "grad_norm": 0.404296875, + "learning_rate": 0.0001359178015985163, + "loss": 1.3412, + "step": 4025 + }, + { + "epoch": 0.4452792663388763, + "grad_norm": 0.39453125, + "learning_rate": 0.00013573775018640766, + "loss": 1.2685, + "step": 4030 + }, + { + "epoch": 0.44583172200430915, + "grad_norm": 0.404296875, + "learning_rate": 0.00013555756585603793, + "loss": 1.3055, + "step": 4035 + }, + { + "epoch": 0.446384177669742, + "grad_norm": 0.3984375, + "learning_rate": 0.00013537724927756094, + "loss": 1.2442, + "step": 4040 + }, + { + "epoch": 0.44693663333517486, + "grad_norm": 0.408203125, + "learning_rate": 0.0001351968011216223, + "loss": 1.3003, + "step": 4045 + }, + { + "epoch": 0.4474890890006077, + "grad_norm": 0.39453125, + "learning_rate": 0.00013501622205935697, + "loss": 1.3651, + "step": 4050 + }, + { + "epoch": 0.44804154466604057, + "grad_norm": 0.3984375, + "learning_rate": 0.0001348355127623869, + "loss": 1.2699, + "step": 4055 + }, + { + "epoch": 0.4485940003314734, + "grad_norm": 0.392578125, + "learning_rate": 0.00013465467390281826, + "loss": 1.2554, + "step": 4060 + }, + { + "epoch": 0.4491464559969062, + "grad_norm": 0.435546875, + "learning_rate": 0.00013447370615323923, + "loss": 1.2467, + "step": 4065 + }, + { + "epoch": 0.4496989116623391, + "grad_norm": 0.412109375, + "learning_rate": 0.00013429261018671734, + "loss": 1.298, + "step": 4070 + }, + { + "epoch": 0.45025136732777193, + "grad_norm": 0.39453125, + "learning_rate": 0.00013411138667679696, + "loss": 1.1604, + "step": 4075 + }, + { + "epoch": 0.4508038229932048, + "grad_norm": 0.390625, + "learning_rate": 0.00013393003629749684, + "loss": 1.2926, + "step": 4080 + }, + { + "epoch": 0.45135627865863764, + "grad_norm": 0.37890625, + "learning_rate": 0.00013374855972330757, + "loss": 1.2703, + "step": 4085 + }, + { + "epoch": 0.45190873432407047, + "grad_norm": 0.390625, + "learning_rate": 0.00013356695762918914, + "loss": 1.3145, + "step": 4090 + }, + { + "epoch": 0.45246118998950335, + "grad_norm": 0.404296875, + "learning_rate": 0.0001333852306905684, + "loss": 1.3115, + "step": 4095 + }, + { + "epoch": 0.4530136456549362, + "grad_norm": 0.39453125, + "learning_rate": 0.0001332033795833364, + "loss": 1.3531, + "step": 4100 + }, + { + "epoch": 0.45356610132036906, + "grad_norm": 0.380859375, + "learning_rate": 0.00013302140498384617, + "loss": 1.3463, + "step": 4105 + }, + { + "epoch": 0.4541185569858019, + "grad_norm": 0.40625, + "learning_rate": 0.0001328393075689099, + "loss": 1.2554, + "step": 4110 + }, + { + "epoch": 0.4546710126512347, + "grad_norm": 0.396484375, + "learning_rate": 0.0001326570880157967, + "loss": 1.2158, + "step": 4115 + }, + { + "epoch": 0.4552234683166676, + "grad_norm": 0.412109375, + "learning_rate": 0.0001324747470022298, + "loss": 1.3134, + "step": 4120 + }, + { + "epoch": 0.45577592398210043, + "grad_norm": 0.404296875, + "learning_rate": 0.00013229228520638436, + "loss": 1.3496, + "step": 4125 + }, + { + "epoch": 0.4563283796475333, + "grad_norm": 0.40625, + "learning_rate": 0.00013210970330688454, + "loss": 1.3995, + "step": 4130 + }, + { + "epoch": 0.45688083531296614, + "grad_norm": 0.421875, + "learning_rate": 0.0001319270019828013, + "loss": 1.3097, + "step": 4135 + }, + { + "epoch": 0.45743329097839897, + "grad_norm": 0.412109375, + "learning_rate": 0.00013174418191364988, + "loss": 1.2007, + "step": 4140 + }, + { + "epoch": 0.45798574664383185, + "grad_norm": 0.392578125, + "learning_rate": 0.00013156124377938699, + "loss": 1.297, + "step": 4145 + }, + { + "epoch": 0.4585382023092647, + "grad_norm": 0.404296875, + "learning_rate": 0.00013137818826040854, + "loss": 1.3482, + "step": 4150 + }, + { + "epoch": 0.45909065797469756, + "grad_norm": 0.408203125, + "learning_rate": 0.00013119501603754704, + "loss": 1.3098, + "step": 4155 + }, + { + "epoch": 0.4596431136401304, + "grad_norm": 0.392578125, + "learning_rate": 0.000131011727792069, + "loss": 1.2378, + "step": 4160 + }, + { + "epoch": 0.4601955693055632, + "grad_norm": 0.3828125, + "learning_rate": 0.00013082832420567247, + "loss": 1.2998, + "step": 4165 + }, + { + "epoch": 0.4607480249709961, + "grad_norm": 0.3828125, + "learning_rate": 0.00013064480596048454, + "loss": 1.271, + "step": 4170 + }, + { + "epoch": 0.4613004806364289, + "grad_norm": 0.419921875, + "learning_rate": 0.00013046117373905866, + "loss": 1.282, + "step": 4175 + }, + { + "epoch": 0.46185293630186175, + "grad_norm": 0.408203125, + "learning_rate": 0.0001302774282243722, + "loss": 1.2486, + "step": 4180 + }, + { + "epoch": 0.46240539196729463, + "grad_norm": 0.388671875, + "learning_rate": 0.00013009357009982397, + "loss": 1.2647, + "step": 4185 + }, + { + "epoch": 0.46295784763272746, + "grad_norm": 0.41796875, + "learning_rate": 0.00012990960004923154, + "loss": 1.2852, + "step": 4190 + }, + { + "epoch": 0.46351030329816034, + "grad_norm": 0.373046875, + "learning_rate": 0.0001297255187568288, + "loss": 1.2556, + "step": 4195 + }, + { + "epoch": 0.46406275896359317, + "grad_norm": 0.392578125, + "learning_rate": 0.00012954132690726336, + "loss": 1.3166, + "step": 4200 + }, + { + "epoch": 0.464615214629026, + "grad_norm": 0.390625, + "learning_rate": 0.00012935702518559398, + "loss": 1.2838, + "step": 4205 + }, + { + "epoch": 0.4651676702944589, + "grad_norm": 0.419921875, + "learning_rate": 0.00012917261427728815, + "loss": 1.2858, + "step": 4210 + }, + { + "epoch": 0.4657201259598917, + "grad_norm": 0.3984375, + "learning_rate": 0.0001289880948682194, + "loss": 1.2866, + "step": 4215 + }, + { + "epoch": 0.4662725816253246, + "grad_norm": 0.40234375, + "learning_rate": 0.0001288034676446648, + "loss": 1.2379, + "step": 4220 + }, + { + "epoch": 0.4668250372907574, + "grad_norm": 0.3984375, + "learning_rate": 0.00012861873329330248, + "loss": 1.2144, + "step": 4225 + }, + { + "epoch": 0.46737749295619024, + "grad_norm": 0.40625, + "learning_rate": 0.00012843389250120885, + "loss": 1.3324, + "step": 4230 + }, + { + "epoch": 0.4679299486216231, + "grad_norm": 0.380859375, + "learning_rate": 0.00012824894595585637, + "loss": 1.2816, + "step": 4235 + }, + { + "epoch": 0.46848240428705595, + "grad_norm": 0.41796875, + "learning_rate": 0.00012806389434511076, + "loss": 1.2862, + "step": 4240 + }, + { + "epoch": 0.46903485995248884, + "grad_norm": 0.380859375, + "learning_rate": 0.0001278787383572285, + "loss": 1.2216, + "step": 4245 + }, + { + "epoch": 0.46958731561792166, + "grad_norm": 0.396484375, + "learning_rate": 0.00012769347868085427, + "loss": 1.2242, + "step": 4250 + }, + { + "epoch": 0.4701397712833545, + "grad_norm": 0.40234375, + "learning_rate": 0.00012750811600501842, + "loss": 1.3282, + "step": 4255 + }, + { + "epoch": 0.4706922269487874, + "grad_norm": 0.376953125, + "learning_rate": 0.00012732265101913435, + "loss": 1.2393, + "step": 4260 + }, + { + "epoch": 0.4712446826142202, + "grad_norm": 0.396484375, + "learning_rate": 0.000127137084412996, + "loss": 1.2319, + "step": 4265 + }, + { + "epoch": 0.4717971382796531, + "grad_norm": 0.396484375, + "learning_rate": 0.00012695141687677527, + "loss": 1.2498, + "step": 4270 + }, + { + "epoch": 0.4723495939450859, + "grad_norm": 0.41796875, + "learning_rate": 0.00012676564910101947, + "loss": 1.3246, + "step": 4275 + }, + { + "epoch": 0.47290204961051874, + "grad_norm": 0.396484375, + "learning_rate": 0.0001265797817766486, + "loss": 1.3203, + "step": 4280 + }, + { + "epoch": 0.4734545052759516, + "grad_norm": 0.4140625, + "learning_rate": 0.0001263938155949531, + "loss": 1.2485, + "step": 4285 + }, + { + "epoch": 0.47400696094138445, + "grad_norm": 0.396484375, + "learning_rate": 0.00012620775124759092, + "loss": 1.2922, + "step": 4290 + }, + { + "epoch": 0.47455941660681733, + "grad_norm": 0.431640625, + "learning_rate": 0.0001260215894265852, + "loss": 1.3289, + "step": 4295 + }, + { + "epoch": 0.47511187227225016, + "grad_norm": 0.3984375, + "learning_rate": 0.0001258353308243217, + "loss": 1.2854, + "step": 4300 + }, + { + "epoch": 0.475664327937683, + "grad_norm": 0.39453125, + "learning_rate": 0.00012564897613354586, + "loss": 1.2822, + "step": 4305 + }, + { + "epoch": 0.47621678360311587, + "grad_norm": 0.408203125, + "learning_rate": 0.00012546252604736074, + "loss": 1.2535, + "step": 4310 + }, + { + "epoch": 0.4767692392685487, + "grad_norm": 0.3828125, + "learning_rate": 0.00012527598125922413, + "loss": 1.2759, + "step": 4315 + }, + { + "epoch": 0.4773216949339815, + "grad_norm": 0.4140625, + "learning_rate": 0.00012508934246294604, + "loss": 1.3582, + "step": 4320 + }, + { + "epoch": 0.4778741505994144, + "grad_norm": 0.3828125, + "learning_rate": 0.00012490261035268612, + "loss": 1.2938, + "step": 4325 + }, + { + "epoch": 0.47842660626484723, + "grad_norm": 0.396484375, + "learning_rate": 0.00012471578562295115, + "loss": 1.2608, + "step": 4330 + }, + { + "epoch": 0.4789790619302801, + "grad_norm": 0.375, + "learning_rate": 0.0001245288689685922, + "loss": 1.3062, + "step": 4335 + }, + { + "epoch": 0.47953151759571294, + "grad_norm": 0.392578125, + "learning_rate": 0.0001243418610848024, + "loss": 1.228, + "step": 4340 + }, + { + "epoch": 0.48008397326114577, + "grad_norm": 0.390625, + "learning_rate": 0.00012415476266711413, + "loss": 1.3653, + "step": 4345 + }, + { + "epoch": 0.48063642892657865, + "grad_norm": 0.40625, + "learning_rate": 0.00012396757441139654, + "loss": 1.2368, + "step": 4350 + }, + { + "epoch": 0.4811888845920115, + "grad_norm": 0.404296875, + "learning_rate": 0.00012378029701385287, + "loss": 1.2846, + "step": 4355 + }, + { + "epoch": 0.48174134025744436, + "grad_norm": 0.376953125, + "learning_rate": 0.00012359293117101782, + "loss": 1.2454, + "step": 4360 + }, + { + "epoch": 0.4822937959228772, + "grad_norm": 0.388671875, + "learning_rate": 0.0001234054775797552, + "loss": 1.2893, + "step": 4365 + }, + { + "epoch": 0.48284625158831, + "grad_norm": 0.39453125, + "learning_rate": 0.00012321793693725509, + "loss": 1.2866, + "step": 4370 + }, + { + "epoch": 0.4833987072537429, + "grad_norm": 0.40625, + "learning_rate": 0.00012303030994103133, + "loss": 1.3267, + "step": 4375 + }, + { + "epoch": 0.4839511629191757, + "grad_norm": 0.388671875, + "learning_rate": 0.00012284259728891897, + "loss": 1.3041, + "step": 4380 + }, + { + "epoch": 0.4845036185846086, + "grad_norm": 0.5, + "learning_rate": 0.00012265479967907159, + "loss": 1.2848, + "step": 4385 + }, + { + "epoch": 0.48505607425004144, + "grad_norm": 0.384765625, + "learning_rate": 0.00012246691780995881, + "loss": 1.2127, + "step": 4390 + }, + { + "epoch": 0.48560852991547426, + "grad_norm": 0.380859375, + "learning_rate": 0.0001222789523803636, + "loss": 1.2426, + "step": 4395 + }, + { + "epoch": 0.48616098558090715, + "grad_norm": 0.380859375, + "learning_rate": 0.00012209090408937971, + "loss": 1.2206, + "step": 4400 + }, + { + "epoch": 0.48671344124634, + "grad_norm": 0.408203125, + "learning_rate": 0.00012190277363640907, + "loss": 1.3072, + "step": 4405 + }, + { + "epoch": 0.48726589691177286, + "grad_norm": 0.412109375, + "learning_rate": 0.00012171456172115923, + "loss": 1.3464, + "step": 4410 + }, + { + "epoch": 0.4878183525772057, + "grad_norm": 0.40234375, + "learning_rate": 0.00012152626904364067, + "loss": 1.2361, + "step": 4415 + }, + { + "epoch": 0.4883708082426385, + "grad_norm": 0.40625, + "learning_rate": 0.00012133789630416425, + "loss": 1.3115, + "step": 4420 + }, + { + "epoch": 0.4889232639080714, + "grad_norm": 0.38671875, + "learning_rate": 0.00012114944420333869, + "loss": 1.2941, + "step": 4425 + }, + { + "epoch": 0.4894757195735042, + "grad_norm": 0.37890625, + "learning_rate": 0.00012096091344206777, + "loss": 1.2627, + "step": 4430 + }, + { + "epoch": 0.4900281752389371, + "grad_norm": 0.416015625, + "learning_rate": 0.00012077230472154786, + "loss": 1.3617, + "step": 4435 + }, + { + "epoch": 0.49058063090436993, + "grad_norm": 0.408203125, + "learning_rate": 0.00012058361874326526, + "loss": 1.2621, + "step": 4440 + }, + { + "epoch": 0.49113308656980276, + "grad_norm": 0.41015625, + "learning_rate": 0.00012039485620899369, + "loss": 1.2315, + "step": 4445 + }, + { + "epoch": 0.49168554223523564, + "grad_norm": 0.37890625, + "learning_rate": 0.00012020601782079155, + "loss": 1.2041, + "step": 4450 + }, + { + "epoch": 0.49223799790066847, + "grad_norm": 0.392578125, + "learning_rate": 0.00012001710428099935, + "loss": 1.2634, + "step": 4455 + }, + { + "epoch": 0.4927904535661013, + "grad_norm": 0.40234375, + "learning_rate": 0.00011982811629223709, + "loss": 1.3279, + "step": 4460 + }, + { + "epoch": 0.4933429092315342, + "grad_norm": 0.384765625, + "learning_rate": 0.00011963905455740177, + "loss": 1.2847, + "step": 4465 + }, + { + "epoch": 0.493895364896967, + "grad_norm": 0.390625, + "learning_rate": 0.00011944991977966452, + "loss": 1.2713, + "step": 4470 + }, + { + "epoch": 0.4944478205623999, + "grad_norm": 0.38671875, + "learning_rate": 0.00011926071266246826, + "loss": 1.2448, + "step": 4475 + }, + { + "epoch": 0.4950002762278327, + "grad_norm": 0.38671875, + "learning_rate": 0.00011907143390952493, + "loss": 1.2263, + "step": 4480 + }, + { + "epoch": 0.49555273189326554, + "grad_norm": 0.3984375, + "learning_rate": 0.00011888208422481287, + "loss": 1.3221, + "step": 4485 + }, + { + "epoch": 0.4961051875586984, + "grad_norm": 0.390625, + "learning_rate": 0.00011869266431257422, + "loss": 1.2537, + "step": 4490 + }, + { + "epoch": 0.49665764322413125, + "grad_norm": 0.4140625, + "learning_rate": 0.00011850317487731239, + "loss": 1.2193, + "step": 4495 + }, + { + "epoch": 0.49721009888956413, + "grad_norm": 0.392578125, + "learning_rate": 0.00011831361662378932, + "loss": 1.3306, + "step": 4500 + }, + { + "epoch": 0.49776255455499696, + "grad_norm": 0.421875, + "learning_rate": 0.0001181239902570229, + "loss": 1.3171, + "step": 4505 + }, + { + "epoch": 0.4983150102204298, + "grad_norm": 0.396484375, + "learning_rate": 0.00011793429648228436, + "loss": 1.2719, + "step": 4510 + }, + { + "epoch": 0.49886746588586267, + "grad_norm": 0.40234375, + "learning_rate": 0.00011774453600509559, + "loss": 1.2798, + "step": 4515 + }, + { + "epoch": 0.4994199215512955, + "grad_norm": 0.37890625, + "learning_rate": 0.00011755470953122667, + "loss": 1.2583, + "step": 4520 + }, + { + "epoch": 0.4999723772167284, + "grad_norm": 0.435546875, + "learning_rate": 0.00011736481776669306, + "loss": 1.372, + "step": 4525 + }, + { + "epoch": 0.5005248328821612, + "grad_norm": 0.41796875, + "learning_rate": 0.00011717486141775305, + "loss": 1.2945, + "step": 4530 + }, + { + "epoch": 0.501077288547594, + "grad_norm": 0.41796875, + "learning_rate": 0.00011698484119090518, + "loss": 1.2232, + "step": 4535 + }, + { + "epoch": 0.5016297442130269, + "grad_norm": 0.404296875, + "learning_rate": 0.00011679475779288555, + "loss": 1.2779, + "step": 4540 + }, + { + "epoch": 0.5021821998784598, + "grad_norm": 0.396484375, + "learning_rate": 0.00011660461193066521, + "loss": 1.2376, + "step": 4545 + }, + { + "epoch": 0.5027346555438926, + "grad_norm": 0.4140625, + "learning_rate": 0.0001164144043114475, + "loss": 1.328, + "step": 4550 + }, + { + "epoch": 0.5032871112093255, + "grad_norm": 0.384765625, + "learning_rate": 0.00011622413564266555, + "loss": 1.2981, + "step": 4555 + }, + { + "epoch": 0.5038395668747583, + "grad_norm": 0.40625, + "learning_rate": 0.0001160338066319794, + "loss": 1.2478, + "step": 4560 + }, + { + "epoch": 0.5043920225401911, + "grad_norm": 0.400390625, + "learning_rate": 0.00011584341798727365, + "loss": 1.3444, + "step": 4565 + }, + { + "epoch": 0.504944478205624, + "grad_norm": 0.396484375, + "learning_rate": 0.0001156529704166546, + "loss": 1.3496, + "step": 4570 + }, + { + "epoch": 0.5054969338710569, + "grad_norm": 0.396484375, + "learning_rate": 0.00011546246462844779, + "loss": 1.2273, + "step": 4575 + }, + { + "epoch": 0.5060493895364897, + "grad_norm": 0.43359375, + "learning_rate": 0.00011527190133119526, + "loss": 1.2409, + "step": 4580 + }, + { + "epoch": 0.5066018452019225, + "grad_norm": 0.412109375, + "learning_rate": 0.0001150812812336529, + "loss": 1.2615, + "step": 4585 + }, + { + "epoch": 0.5071543008673554, + "grad_norm": 0.392578125, + "learning_rate": 0.00011489060504478788, + "loss": 1.2579, + "step": 4590 + }, + { + "epoch": 0.5077067565327883, + "grad_norm": 0.404296875, + "learning_rate": 0.00011469987347377602, + "loss": 1.343, + "step": 4595 + }, + { + "epoch": 0.5082592121982211, + "grad_norm": 0.390625, + "learning_rate": 0.00011450908722999909, + "loss": 1.1766, + "step": 4600 + }, + { + "epoch": 0.508811667863654, + "grad_norm": 0.416015625, + "learning_rate": 0.00011431824702304221, + "loss": 1.2905, + "step": 4605 + }, + { + "epoch": 0.5093641235290868, + "grad_norm": 0.38671875, + "learning_rate": 0.00011412735356269124, + "loss": 1.2963, + "step": 4610 + }, + { + "epoch": 0.5099165791945196, + "grad_norm": 0.41015625, + "learning_rate": 0.00011393640755893002, + "loss": 1.3127, + "step": 4615 + }, + { + "epoch": 0.5104690348599525, + "grad_norm": 0.40234375, + "learning_rate": 0.00011374540972193786, + "loss": 1.2661, + "step": 4620 + }, + { + "epoch": 0.5110214905253854, + "grad_norm": 0.396484375, + "learning_rate": 0.00011355436076208687, + "loss": 1.2579, + "step": 4625 + }, + { + "epoch": 0.5115739461908182, + "grad_norm": 0.365234375, + "learning_rate": 0.00011336326138993927, + "loss": 1.2682, + "step": 4630 + }, + { + "epoch": 0.512126401856251, + "grad_norm": 0.3984375, + "learning_rate": 0.00011317211231624483, + "loss": 1.2175, + "step": 4635 + }, + { + "epoch": 0.5126788575216839, + "grad_norm": 0.380859375, + "learning_rate": 0.00011298091425193806, + "loss": 1.2072, + "step": 4640 + }, + { + "epoch": 0.5132313131871168, + "grad_norm": 0.39453125, + "learning_rate": 0.00011278966790813582, + "loss": 1.2484, + "step": 4645 + }, + { + "epoch": 0.5137837688525496, + "grad_norm": 0.390625, + "learning_rate": 0.0001125983739961344, + "loss": 1.333, + "step": 4650 + }, + { + "epoch": 0.5143362245179824, + "grad_norm": 0.400390625, + "learning_rate": 0.00011240703322740711, + "loss": 1.3642, + "step": 4655 + }, + { + "epoch": 0.5148886801834153, + "grad_norm": 0.37890625, + "learning_rate": 0.00011221564631360152, + "loss": 1.3279, + "step": 4660 + }, + { + "epoch": 0.5154411358488481, + "grad_norm": 0.37890625, + "learning_rate": 0.00011202421396653677, + "loss": 1.2654, + "step": 4665 + }, + { + "epoch": 0.5159935915142809, + "grad_norm": 0.40625, + "learning_rate": 0.000111832736898201, + "loss": 1.2242, + "step": 4670 + }, + { + "epoch": 0.5165460471797139, + "grad_norm": 0.41015625, + "learning_rate": 0.00011164121582074873, + "loss": 1.3846, + "step": 4675 + }, + { + "epoch": 0.5170985028451467, + "grad_norm": 0.384765625, + "learning_rate": 0.00011144965144649809, + "loss": 1.302, + "step": 4680 + }, + { + "epoch": 0.5176509585105795, + "grad_norm": 0.419921875, + "learning_rate": 0.00011125804448792831, + "loss": 1.2522, + "step": 4685 + }, + { + "epoch": 0.5182034141760123, + "grad_norm": 0.408203125, + "learning_rate": 0.00011106639565767692, + "loss": 1.2961, + "step": 4690 + }, + { + "epoch": 0.5187558698414452, + "grad_norm": 0.396484375, + "learning_rate": 0.00011087470566853726, + "loss": 1.2395, + "step": 4695 + }, + { + "epoch": 0.5193083255068781, + "grad_norm": 0.39453125, + "learning_rate": 0.00011068297523345573, + "loss": 1.3415, + "step": 4700 + }, + { + "epoch": 0.5198607811723109, + "grad_norm": 0.3984375, + "learning_rate": 0.00011049120506552913, + "loss": 1.2696, + "step": 4705 + }, + { + "epoch": 0.5204132368377438, + "grad_norm": 0.38671875, + "learning_rate": 0.00011029939587800206, + "loss": 1.2383, + "step": 4710 + }, + { + "epoch": 0.5209656925031766, + "grad_norm": 0.404296875, + "learning_rate": 0.00011010754838426428, + "loss": 1.404, + "step": 4715 + }, + { + "epoch": 0.5215181481686094, + "grad_norm": 0.38671875, + "learning_rate": 0.0001099156632978479, + "loss": 1.3234, + "step": 4720 + }, + { + "epoch": 0.5220706038340424, + "grad_norm": 0.408203125, + "learning_rate": 0.00010972374133242502, + "loss": 1.2487, + "step": 4725 + }, + { + "epoch": 0.5226230594994752, + "grad_norm": 0.384765625, + "learning_rate": 0.00010953178320180475, + "loss": 1.2092, + "step": 4730 + }, + { + "epoch": 0.523175515164908, + "grad_norm": 0.40625, + "learning_rate": 0.00010933978961993083, + "loss": 1.3322, + "step": 4735 + }, + { + "epoch": 0.5237279708303408, + "grad_norm": 0.388671875, + "learning_rate": 0.00010914776130087873, + "loss": 1.2356, + "step": 4740 + }, + { + "epoch": 0.5242804264957737, + "grad_norm": 0.400390625, + "learning_rate": 0.0001089556989588532, + "loss": 1.2565, + "step": 4745 + }, + { + "epoch": 0.5248328821612066, + "grad_norm": 0.4296875, + "learning_rate": 0.00010876360330818553, + "loss": 1.2775, + "step": 4750 + }, + { + "epoch": 0.5253853378266394, + "grad_norm": 0.423828125, + "learning_rate": 0.00010857147506333088, + "loss": 1.2035, + "step": 4755 + }, + { + "epoch": 0.5259377934920723, + "grad_norm": 0.4140625, + "learning_rate": 0.00010837931493886562, + "loss": 1.3017, + "step": 4760 + }, + { + "epoch": 0.5264902491575051, + "grad_norm": 0.39453125, + "learning_rate": 0.0001081871236494847, + "loss": 1.2652, + "step": 4765 + }, + { + "epoch": 0.5270427048229379, + "grad_norm": 0.388671875, + "learning_rate": 0.00010799490190999892, + "loss": 1.2914, + "step": 4770 + }, + { + "epoch": 0.5275951604883709, + "grad_norm": 0.3984375, + "learning_rate": 0.0001078026504353325, + "loss": 1.2566, + "step": 4775 + }, + { + "epoch": 0.5281476161538037, + "grad_norm": 0.423828125, + "learning_rate": 0.00010761036994052008, + "loss": 1.3195, + "step": 4780 + }, + { + "epoch": 0.5287000718192365, + "grad_norm": 0.41015625, + "learning_rate": 0.00010741806114070434, + "loss": 1.3088, + "step": 4785 + }, + { + "epoch": 0.5292525274846693, + "grad_norm": 0.431640625, + "learning_rate": 0.00010722572475113315, + "loss": 1.2676, + "step": 4790 + }, + { + "epoch": 0.5298049831501022, + "grad_norm": 0.375, + "learning_rate": 0.00010703336148715705, + "loss": 1.3551, + "step": 4795 + }, + { + "epoch": 0.5303574388155351, + "grad_norm": 0.4296875, + "learning_rate": 0.00010684097206422654, + "loss": 1.2925, + "step": 4800 + }, + { + "epoch": 0.5309098944809679, + "grad_norm": 0.40234375, + "learning_rate": 0.00010664855719788935, + "loss": 1.281, + "step": 4805 + }, + { + "epoch": 0.5314623501464008, + "grad_norm": 0.400390625, + "learning_rate": 0.00010645611760378795, + "loss": 1.3131, + "step": 4810 + }, + { + "epoch": 0.5320148058118336, + "grad_norm": 0.412109375, + "learning_rate": 0.00010626365399765667, + "loss": 1.3253, + "step": 4815 + }, + { + "epoch": 0.5325672614772664, + "grad_norm": 0.37890625, + "learning_rate": 0.00010607116709531918, + "loss": 1.2705, + "step": 4820 + }, + { + "epoch": 0.5331197171426993, + "grad_norm": 0.392578125, + "learning_rate": 0.00010587865761268583, + "loss": 1.3432, + "step": 4825 + }, + { + "epoch": 0.5336721728081322, + "grad_norm": 0.416015625, + "learning_rate": 0.00010568612626575092, + "loss": 1.2824, + "step": 4830 + }, + { + "epoch": 0.534224628473565, + "grad_norm": 0.423828125, + "learning_rate": 0.00010549357377059006, + "loss": 1.2724, + "step": 4835 + }, + { + "epoch": 0.5347770841389978, + "grad_norm": 0.390625, + "learning_rate": 0.00010530100084335758, + "loss": 1.2218, + "step": 4840 + }, + { + "epoch": 0.5353295398044307, + "grad_norm": 0.400390625, + "learning_rate": 0.0001051084082002837, + "loss": 1.4052, + "step": 4845 + }, + { + "epoch": 0.5358819954698636, + "grad_norm": 0.404296875, + "learning_rate": 0.00010491579655767203, + "loss": 1.3135, + "step": 4850 + }, + { + "epoch": 0.5364344511352964, + "grad_norm": 0.423828125, + "learning_rate": 0.00010472316663189683, + "loss": 1.2908, + "step": 4855 + }, + { + "epoch": 0.5369869068007292, + "grad_norm": 0.42578125, + "learning_rate": 0.00010453051913940042, + "loss": 1.2162, + "step": 4860 + }, + { + "epoch": 0.5375393624661621, + "grad_norm": 0.4140625, + "learning_rate": 0.00010433785479669038, + "loss": 1.3188, + "step": 4865 + }, + { + "epoch": 0.5380918181315949, + "grad_norm": 0.3828125, + "learning_rate": 0.00010414517432033695, + "loss": 1.251, + "step": 4870 + }, + { + "epoch": 0.5386442737970278, + "grad_norm": 0.44140625, + "learning_rate": 0.0001039524784269704, + "loss": 1.2296, + "step": 4875 + }, + { + "epoch": 0.5391967294624607, + "grad_norm": 0.41796875, + "learning_rate": 0.00010375976783327841, + "loss": 1.2543, + "step": 4880 + }, + { + "epoch": 0.5397491851278935, + "grad_norm": 0.40234375, + "learning_rate": 0.00010356704325600324, + "loss": 1.2968, + "step": 4885 + }, + { + "epoch": 0.5403016407933263, + "grad_norm": 0.40625, + "learning_rate": 0.00010337430541193918, + "loss": 1.2856, + "step": 4890 + }, + { + "epoch": 0.5408540964587591, + "grad_norm": 0.388671875, + "learning_rate": 0.00010318155501792988, + "loss": 1.2595, + "step": 4895 + }, + { + "epoch": 0.5414065521241921, + "grad_norm": 0.419921875, + "learning_rate": 0.00010298879279086568, + "loss": 1.2911, + "step": 4900 + }, + { + "epoch": 0.5419590077896249, + "grad_norm": 0.392578125, + "learning_rate": 0.00010279601944768089, + "loss": 1.2429, + "step": 4905 + }, + { + "epoch": 0.5425114634550577, + "grad_norm": 0.41796875, + "learning_rate": 0.0001026032357053512, + "loss": 1.3281, + "step": 4910 + }, + { + "epoch": 0.5430639191204906, + "grad_norm": 0.412109375, + "learning_rate": 0.00010241044228089096, + "loss": 1.2073, + "step": 4915 + }, + { + "epoch": 0.5436163747859234, + "grad_norm": 0.40234375, + "learning_rate": 0.00010221763989135052, + "loss": 1.3138, + "step": 4920 + }, + { + "epoch": 0.5441688304513563, + "grad_norm": 0.388671875, + "learning_rate": 0.00010202482925381358, + "loss": 1.2066, + "step": 4925 + }, + { + "epoch": 0.5447212861167892, + "grad_norm": 0.396484375, + "learning_rate": 0.00010183201108539453, + "loss": 1.2961, + "step": 4930 + }, + { + "epoch": 0.545273741782222, + "grad_norm": 0.46875, + "learning_rate": 0.00010163918610323579, + "loss": 1.3148, + "step": 4935 + }, + { + "epoch": 0.5458261974476548, + "grad_norm": 0.396484375, + "learning_rate": 0.00010144635502450508, + "loss": 1.2551, + "step": 4940 + }, + { + "epoch": 0.5463786531130876, + "grad_norm": 0.40234375, + "learning_rate": 0.00010125351856639278, + "loss": 1.2512, + "step": 4945 + }, + { + "epoch": 0.5469311087785205, + "grad_norm": 0.40625, + "learning_rate": 0.00010106067744610933, + "loss": 1.3768, + "step": 4950 + }, + { + "epoch": 0.5474835644439534, + "grad_norm": 0.404296875, + "learning_rate": 0.00010086783238088244, + "loss": 1.307, + "step": 4955 + }, + { + "epoch": 0.5480360201093862, + "grad_norm": 0.404296875, + "learning_rate": 0.00010067498408795462, + "loss": 1.3127, + "step": 4960 + }, + { + "epoch": 0.5485884757748191, + "grad_norm": 0.40234375, + "learning_rate": 0.00010048213328458028, + "loss": 1.2862, + "step": 4965 + }, + { + "epoch": 0.5491409314402519, + "grad_norm": 0.40625, + "learning_rate": 0.00010028928068802315, + "loss": 1.171, + "step": 4970 + }, + { + "epoch": 0.5496933871056847, + "grad_norm": 0.41796875, + "learning_rate": 0.00010009642701555368, + "loss": 1.2546, + "step": 4975 + }, + { + "epoch": 0.5502458427711177, + "grad_norm": 0.3828125, + "learning_rate": 9.990357298444632e-05, + "loss": 1.1777, + "step": 4980 + }, + { + "epoch": 0.5507982984365505, + "grad_norm": 0.390625, + "learning_rate": 9.971071931197685e-05, + "loss": 1.2886, + "step": 4985 + }, + { + "epoch": 0.5513507541019833, + "grad_norm": 0.40234375, + "learning_rate": 9.951786671541973e-05, + "loss": 1.2069, + "step": 4990 + }, + { + "epoch": 0.5519032097674161, + "grad_norm": 0.42578125, + "learning_rate": 9.932501591204536e-05, + "loss": 1.3094, + "step": 4995 + }, + { + "epoch": 0.552455665432849, + "grad_norm": 0.40234375, + "learning_rate": 9.913216761911755e-05, + "loss": 1.3084, + "step": 5000 + }, + { + "epoch": 0.5530081210982819, + "grad_norm": 0.396484375, + "learning_rate": 9.893932255389068e-05, + "loss": 1.2617, + "step": 5005 + }, + { + "epoch": 0.5535605767637147, + "grad_norm": 0.40234375, + "learning_rate": 9.874648143360723e-05, + "loss": 1.2746, + "step": 5010 + }, + { + "epoch": 0.5541130324291476, + "grad_norm": 0.416015625, + "learning_rate": 9.855364497549496e-05, + "loss": 1.2916, + "step": 5015 + }, + { + "epoch": 0.5546654880945804, + "grad_norm": 0.412109375, + "learning_rate": 9.836081389676422e-05, + "loss": 1.3503, + "step": 5020 + }, + { + "epoch": 0.5552179437600132, + "grad_norm": 0.390625, + "learning_rate": 9.816798891460546e-05, + "loss": 1.2901, + "step": 5025 + }, + { + "epoch": 0.5557703994254461, + "grad_norm": 0.376953125, + "learning_rate": 9.797517074618642e-05, + "loss": 1.245, + "step": 5030 + }, + { + "epoch": 0.556322855090879, + "grad_norm": 0.400390625, + "learning_rate": 9.778236010864949e-05, + "loss": 1.2161, + "step": 5035 + }, + { + "epoch": 0.5568753107563118, + "grad_norm": 0.412109375, + "learning_rate": 9.758955771910906e-05, + "loss": 1.2126, + "step": 5040 + }, + { + "epoch": 0.5574277664217446, + "grad_norm": 0.404296875, + "learning_rate": 9.739676429464881e-05, + "loss": 1.2392, + "step": 5045 + }, + { + "epoch": 0.5579802220871775, + "grad_norm": 0.39453125, + "learning_rate": 9.720398055231911e-05, + "loss": 1.2809, + "step": 5050 + }, + { + "epoch": 0.5585326777526104, + "grad_norm": 0.42578125, + "learning_rate": 9.701120720913433e-05, + "loss": 1.2809, + "step": 5055 + }, + { + "epoch": 0.5590851334180432, + "grad_norm": 0.3984375, + "learning_rate": 9.68184449820701e-05, + "loss": 1.2276, + "step": 5060 + }, + { + "epoch": 0.559637589083476, + "grad_norm": 0.404296875, + "learning_rate": 9.662569458806085e-05, + "loss": 1.1839, + "step": 5065 + }, + { + "epoch": 0.5601900447489089, + "grad_norm": 0.412109375, + "learning_rate": 9.64329567439968e-05, + "loss": 1.294, + "step": 5070 + }, + { + "epoch": 0.5607425004143417, + "grad_norm": 0.384765625, + "learning_rate": 9.624023216672161e-05, + "loss": 1.2283, + "step": 5075 + }, + { + "epoch": 0.5612949560797746, + "grad_norm": 0.423828125, + "learning_rate": 9.604752157302961e-05, + "loss": 1.3297, + "step": 5080 + }, + { + "epoch": 0.5618474117452075, + "grad_norm": 0.423828125, + "learning_rate": 9.585482567966309e-05, + "loss": 1.2957, + "step": 5085 + }, + { + "epoch": 0.5623998674106403, + "grad_norm": 0.41015625, + "learning_rate": 9.566214520330966e-05, + "loss": 1.2493, + "step": 5090 + }, + { + "epoch": 0.5629523230760731, + "grad_norm": 0.412109375, + "learning_rate": 9.54694808605996e-05, + "loss": 1.2427, + "step": 5095 + }, + { + "epoch": 0.563504778741506, + "grad_norm": 0.392578125, + "learning_rate": 9.527683336810318e-05, + "loss": 1.1977, + "step": 5100 + }, + { + "epoch": 0.5640572344069389, + "grad_norm": 0.412109375, + "learning_rate": 9.5084203442328e-05, + "loss": 1.3098, + "step": 5105 + }, + { + "epoch": 0.5646096900723717, + "grad_norm": 0.396484375, + "learning_rate": 9.489159179971632e-05, + "loss": 1.2023, + "step": 5110 + }, + { + "epoch": 0.5651621457378045, + "grad_norm": 0.3828125, + "learning_rate": 9.469899915664244e-05, + "loss": 1.2855, + "step": 5115 + }, + { + "epoch": 0.5657146014032374, + "grad_norm": 0.37890625, + "learning_rate": 9.450642622940995e-05, + "loss": 1.2757, + "step": 5120 + }, + { + "epoch": 0.5662670570686702, + "grad_norm": 0.392578125, + "learning_rate": 9.43138737342491e-05, + "loss": 1.2366, + "step": 5125 + }, + { + "epoch": 0.5668195127341031, + "grad_norm": 0.41796875, + "learning_rate": 9.412134238731418e-05, + "loss": 1.4092, + "step": 5130 + }, + { + "epoch": 0.567371968399536, + "grad_norm": 0.396484375, + "learning_rate": 9.392883290468083e-05, + "loss": 1.3325, + "step": 5135 + }, + { + "epoch": 0.5679244240649688, + "grad_norm": 0.40234375, + "learning_rate": 9.373634600234334e-05, + "loss": 1.3355, + "step": 5140 + }, + { + "epoch": 0.5684768797304016, + "grad_norm": 0.41796875, + "learning_rate": 9.354388239621208e-05, + "loss": 1.2945, + "step": 5145 + }, + { + "epoch": 0.5690293353958344, + "grad_norm": 0.419921875, + "learning_rate": 9.335144280211066e-05, + "loss": 1.2847, + "step": 5150 + }, + { + "epoch": 0.5695817910612674, + "grad_norm": 0.408203125, + "learning_rate": 9.31590279357735e-05, + "loss": 1.2663, + "step": 5155 + }, + { + "epoch": 0.5701342467267002, + "grad_norm": 0.408203125, + "learning_rate": 9.296663851284297e-05, + "loss": 1.3607, + "step": 5160 + }, + { + "epoch": 0.570686702392133, + "grad_norm": 0.3984375, + "learning_rate": 9.277427524886689e-05, + "loss": 1.3177, + "step": 5165 + }, + { + "epoch": 0.5712391580575659, + "grad_norm": 0.40625, + "learning_rate": 9.258193885929569e-05, + "loss": 1.2937, + "step": 5170 + }, + { + "epoch": 0.5717916137229987, + "grad_norm": 0.4375, + "learning_rate": 9.238963005947993e-05, + "loss": 1.2867, + "step": 5175 + }, + { + "epoch": 0.5723440693884316, + "grad_norm": 0.400390625, + "learning_rate": 9.219734956466752e-05, + "loss": 1.2528, + "step": 5180 + }, + { + "epoch": 0.5728965250538645, + "grad_norm": 0.388671875, + "learning_rate": 9.200509809000108e-05, + "loss": 1.3146, + "step": 5185 + }, + { + "epoch": 0.5734489807192973, + "grad_norm": 0.373046875, + "learning_rate": 9.181287635051534e-05, + "loss": 1.2696, + "step": 5190 + }, + { + "epoch": 0.5740014363847301, + "grad_norm": 0.388671875, + "learning_rate": 9.16206850611344e-05, + "loss": 1.3678, + "step": 5195 + }, + { + "epoch": 0.5745538920501629, + "grad_norm": 0.416015625, + "learning_rate": 9.142852493666914e-05, + "loss": 1.3089, + "step": 5200 + }, + { + "epoch": 0.5751063477155959, + "grad_norm": 0.423828125, + "learning_rate": 9.123639669181448e-05, + "loss": 1.2477, + "step": 5205 + }, + { + "epoch": 0.5756588033810287, + "grad_norm": 0.41015625, + "learning_rate": 9.104430104114681e-05, + "loss": 1.2444, + "step": 5210 + }, + { + "epoch": 0.5762112590464615, + "grad_norm": 0.39453125, + "learning_rate": 9.085223869912129e-05, + "loss": 1.2129, + "step": 5215 + }, + { + "epoch": 0.5767637147118944, + "grad_norm": 0.431640625, + "learning_rate": 9.066021038006919e-05, + "loss": 1.3529, + "step": 5220 + }, + { + "epoch": 0.5773161703773272, + "grad_norm": 0.42578125, + "learning_rate": 9.046821679819527e-05, + "loss": 1.2836, + "step": 5225 + }, + { + "epoch": 0.57786862604276, + "grad_norm": 0.40625, + "learning_rate": 9.0276258667575e-05, + "loss": 1.2341, + "step": 5230 + }, + { + "epoch": 0.578421081708193, + "grad_norm": 0.392578125, + "learning_rate": 9.008433670215212e-05, + "loss": 1.2246, + "step": 5235 + }, + { + "epoch": 0.5789735373736258, + "grad_norm": 0.421875, + "learning_rate": 8.989245161573576e-05, + "loss": 1.2864, + "step": 5240 + }, + { + "epoch": 0.5795259930390586, + "grad_norm": 0.390625, + "learning_rate": 8.970060412199795e-05, + "loss": 1.2539, + "step": 5245 + }, + { + "epoch": 0.5800784487044914, + "grad_norm": 0.40625, + "learning_rate": 8.950879493447091e-05, + "loss": 1.3407, + "step": 5250 + }, + { + "epoch": 0.5806309043699243, + "grad_norm": 0.3984375, + "learning_rate": 8.931702476654431e-05, + "loss": 1.2563, + "step": 5255 + }, + { + "epoch": 0.5811833600353572, + "grad_norm": 0.396484375, + "learning_rate": 8.912529433146278e-05, + "loss": 1.2357, + "step": 5260 + }, + { + "epoch": 0.58173581570079, + "grad_norm": 0.421875, + "learning_rate": 8.893360434232312e-05, + "loss": 1.2888, + "step": 5265 + }, + { + "epoch": 0.5822882713662229, + "grad_norm": 0.396484375, + "learning_rate": 8.874195551207174e-05, + "loss": 1.2673, + "step": 5270 + }, + { + "epoch": 0.5828407270316557, + "grad_norm": 0.388671875, + "learning_rate": 8.855034855350194e-05, + "loss": 1.3072, + "step": 5275 + }, + { + "epoch": 0.5833931826970885, + "grad_norm": 0.404296875, + "learning_rate": 8.835878417925132e-05, + "loss": 1.2163, + "step": 5280 + }, + { + "epoch": 0.5839456383625214, + "grad_norm": 0.392578125, + "learning_rate": 8.816726310179904e-05, + "loss": 1.296, + "step": 5285 + }, + { + "epoch": 0.5844980940279543, + "grad_norm": 0.392578125, + "learning_rate": 8.797578603346328e-05, + "loss": 1.2156, + "step": 5290 + }, + { + "epoch": 0.5850505496933871, + "grad_norm": 0.3984375, + "learning_rate": 8.778435368639851e-05, + "loss": 1.1998, + "step": 5295 + }, + { + "epoch": 0.5856030053588199, + "grad_norm": 0.4140625, + "learning_rate": 8.759296677259291e-05, + "loss": 1.3334, + "step": 5300 + }, + { + "epoch": 0.5861554610242528, + "grad_norm": 0.4140625, + "learning_rate": 8.740162600386565e-05, + "loss": 1.2662, + "step": 5305 + }, + { + "epoch": 0.5867079166896857, + "grad_norm": 0.388671875, + "learning_rate": 8.721033209186425e-05, + "loss": 1.3158, + "step": 5310 + }, + { + "epoch": 0.5872603723551185, + "grad_norm": 0.40625, + "learning_rate": 8.701908574806197e-05, + "loss": 1.2605, + "step": 5315 + }, + { + "epoch": 0.5878128280205513, + "grad_norm": 0.412109375, + "learning_rate": 8.682788768375521e-05, + "loss": 1.2702, + "step": 5320 + }, + { + "epoch": 0.5883652836859842, + "grad_norm": 0.376953125, + "learning_rate": 8.663673861006074e-05, + "loss": 1.2213, + "step": 5325 + }, + { + "epoch": 0.588917739351417, + "grad_norm": 0.38671875, + "learning_rate": 8.644563923791318e-05, + "loss": 1.1698, + "step": 5330 + }, + { + "epoch": 0.5894701950168499, + "grad_norm": 0.41796875, + "learning_rate": 8.625459027806214e-05, + "loss": 1.2026, + "step": 5335 + }, + { + "epoch": 0.5900226506822828, + "grad_norm": 0.40625, + "learning_rate": 8.606359244106999e-05, + "loss": 1.3359, + "step": 5340 + }, + { + "epoch": 0.5905751063477156, + "grad_norm": 0.408203125, + "learning_rate": 8.587264643730877e-05, + "loss": 1.3318, + "step": 5345 + }, + { + "epoch": 0.5911275620131484, + "grad_norm": 0.4140625, + "learning_rate": 8.568175297695777e-05, + "loss": 1.2001, + "step": 5350 + }, + { + "epoch": 0.5916800176785812, + "grad_norm": 0.427734375, + "learning_rate": 8.549091277000091e-05, + "loss": 1.2584, + "step": 5355 + }, + { + "epoch": 0.5922324733440142, + "grad_norm": 0.404296875, + "learning_rate": 8.530012652622397e-05, + "loss": 1.2887, + "step": 5360 + }, + { + "epoch": 0.592784929009447, + "grad_norm": 0.408203125, + "learning_rate": 8.510939495521213e-05, + "loss": 1.2336, + "step": 5365 + }, + { + "epoch": 0.5933373846748798, + "grad_norm": 0.3984375, + "learning_rate": 8.491871876634712e-05, + "loss": 1.2963, + "step": 5370 + }, + { + "epoch": 0.5938898403403127, + "grad_norm": 0.4140625, + "learning_rate": 8.472809866880475e-05, + "loss": 1.2591, + "step": 5375 + }, + { + "epoch": 0.5944422960057455, + "grad_norm": 0.408203125, + "learning_rate": 8.45375353715522e-05, + "loss": 1.2437, + "step": 5380 + }, + { + "epoch": 0.5949947516711784, + "grad_norm": 0.423828125, + "learning_rate": 8.434702958334539e-05, + "loss": 1.2978, + "step": 5385 + }, + { + "epoch": 0.5955472073366113, + "grad_norm": 0.41015625, + "learning_rate": 8.415658201272636e-05, + "loss": 1.2626, + "step": 5390 + }, + { + "epoch": 0.5960996630020441, + "grad_norm": 0.408203125, + "learning_rate": 8.39661933680206e-05, + "loss": 1.2776, + "step": 5395 + }, + { + "epoch": 0.5966521186674769, + "grad_norm": 0.412109375, + "learning_rate": 8.377586435733446e-05, + "loss": 1.2243, + "step": 5400 + }, + { + "epoch": 0.5972045743329097, + "grad_norm": 0.388671875, + "learning_rate": 8.358559568855249e-05, + "loss": 1.3193, + "step": 5405 + }, + { + "epoch": 0.5977570299983427, + "grad_norm": 0.416015625, + "learning_rate": 8.33953880693348e-05, + "loss": 1.3303, + "step": 5410 + }, + { + "epoch": 0.5983094856637755, + "grad_norm": 0.388671875, + "learning_rate": 8.320524220711446e-05, + "loss": 1.2865, + "step": 5415 + }, + { + "epoch": 0.5988619413292083, + "grad_norm": 0.40234375, + "learning_rate": 8.301515880909481e-05, + "loss": 1.2883, + "step": 5420 + }, + { + "epoch": 0.5994143969946412, + "grad_norm": 0.376953125, + "learning_rate": 8.282513858224698e-05, + "loss": 1.2033, + "step": 5425 + }, + { + "epoch": 0.599966852660074, + "grad_norm": 0.41796875, + "learning_rate": 8.263518223330697e-05, + "loss": 1.3027, + "step": 5430 + }, + { + "epoch": 0.6005193083255069, + "grad_norm": 0.4375, + "learning_rate": 8.244529046877336e-05, + "loss": 1.2636, + "step": 5435 + }, + { + "epoch": 0.6010717639909398, + "grad_norm": 0.4375, + "learning_rate": 8.225546399490442e-05, + "loss": 1.3465, + "step": 5440 + }, + { + "epoch": 0.6016242196563726, + "grad_norm": 0.3984375, + "learning_rate": 8.206570351771568e-05, + "loss": 1.2668, + "step": 5445 + }, + { + "epoch": 0.6021766753218054, + "grad_norm": 0.431640625, + "learning_rate": 8.187600974297714e-05, + "loss": 1.2356, + "step": 5450 + }, + { + "epoch": 0.6027291309872382, + "grad_norm": 0.42578125, + "learning_rate": 8.16863833762107e-05, + "loss": 1.3551, + "step": 5455 + }, + { + "epoch": 0.6032815866526712, + "grad_norm": 0.390625, + "learning_rate": 8.149682512268763e-05, + "loss": 1.2005, + "step": 5460 + }, + { + "epoch": 0.603834042318104, + "grad_norm": 0.39453125, + "learning_rate": 8.130733568742579e-05, + "loss": 1.2591, + "step": 5465 + }, + { + "epoch": 0.6043864979835368, + "grad_norm": 0.4296875, + "learning_rate": 8.111791577518716e-05, + "loss": 1.2687, + "step": 5470 + }, + { + "epoch": 0.6049389536489697, + "grad_norm": 0.41015625, + "learning_rate": 8.092856609047508e-05, + "loss": 1.215, + "step": 5475 + }, + { + "epoch": 0.6054914093144025, + "grad_norm": 0.400390625, + "learning_rate": 8.073928733753175e-05, + "loss": 1.313, + "step": 5480 + }, + { + "epoch": 0.6060438649798354, + "grad_norm": 0.41015625, + "learning_rate": 8.055008022033551e-05, + "loss": 1.2682, + "step": 5485 + }, + { + "epoch": 0.6065963206452682, + "grad_norm": 0.400390625, + "learning_rate": 8.036094544259827e-05, + "loss": 1.3116, + "step": 5490 + }, + { + "epoch": 0.6071487763107011, + "grad_norm": 0.4453125, + "learning_rate": 8.017188370776292e-05, + "loss": 1.2642, + "step": 5495 + }, + { + "epoch": 0.6077012319761339, + "grad_norm": 0.40625, + "learning_rate": 7.998289571900067e-05, + "loss": 1.2091, + "step": 5500 + }, + { + "epoch": 0.6082536876415667, + "grad_norm": 0.44140625, + "learning_rate": 7.979398217920849e-05, + "loss": 1.3081, + "step": 5505 + }, + { + "epoch": 0.6088061433069996, + "grad_norm": 0.443359375, + "learning_rate": 7.960514379100632e-05, + "loss": 1.2977, + "step": 5510 + }, + { + "epoch": 0.6093585989724325, + "grad_norm": 0.400390625, + "learning_rate": 7.941638125673475e-05, + "loss": 1.2382, + "step": 5515 + }, + { + "epoch": 0.6099110546378653, + "grad_norm": 0.3828125, + "learning_rate": 7.922769527845217e-05, + "loss": 1.3764, + "step": 5520 + }, + { + "epoch": 0.6104635103032982, + "grad_norm": 0.38671875, + "learning_rate": 7.903908655793224e-05, + "loss": 1.2624, + "step": 5525 + }, + { + "epoch": 0.611015965968731, + "grad_norm": 0.38671875, + "learning_rate": 7.885055579666133e-05, + "loss": 1.2123, + "step": 5530 + }, + { + "epoch": 0.6115684216341638, + "grad_norm": 0.408203125, + "learning_rate": 7.866210369583576e-05, + "loss": 1.2908, + "step": 5535 + }, + { + "epoch": 0.6121208772995967, + "grad_norm": 0.404296875, + "learning_rate": 7.847373095635937e-05, + "loss": 1.2451, + "step": 5540 + }, + { + "epoch": 0.6126733329650296, + "grad_norm": 0.3984375, + "learning_rate": 7.82854382788408e-05, + "loss": 1.2879, + "step": 5545 + }, + { + "epoch": 0.6132257886304624, + "grad_norm": 0.423828125, + "learning_rate": 7.809722636359095e-05, + "loss": 1.3246, + "step": 5550 + }, + { + "epoch": 0.6137782442958952, + "grad_norm": 0.388671875, + "learning_rate": 7.790909591062032e-05, + "loss": 1.2816, + "step": 5555 + }, + { + "epoch": 0.614330699961328, + "grad_norm": 0.40234375, + "learning_rate": 7.772104761963645e-05, + "loss": 1.2925, + "step": 5560 + }, + { + "epoch": 0.614883155626761, + "grad_norm": 0.400390625, + "learning_rate": 7.753308219004122e-05, + "loss": 1.3182, + "step": 5565 + }, + { + "epoch": 0.6154356112921938, + "grad_norm": 0.400390625, + "learning_rate": 7.734520032092845e-05, + "loss": 1.3615, + "step": 5570 + }, + { + "epoch": 0.6159880669576266, + "grad_norm": 0.39453125, + "learning_rate": 7.715740271108107e-05, + "loss": 1.2766, + "step": 5575 + }, + { + "epoch": 0.6165405226230595, + "grad_norm": 0.404296875, + "learning_rate": 7.69696900589687e-05, + "loss": 1.2598, + "step": 5580 + }, + { + "epoch": 0.6170929782884923, + "grad_norm": 0.38671875, + "learning_rate": 7.678206306274495e-05, + "loss": 1.3825, + "step": 5585 + }, + { + "epoch": 0.6176454339539252, + "grad_norm": 0.40625, + "learning_rate": 7.659452242024482e-05, + "loss": 1.2055, + "step": 5590 + }, + { + "epoch": 0.6181978896193581, + "grad_norm": 0.423828125, + "learning_rate": 7.64070688289822e-05, + "loss": 1.217, + "step": 5595 + }, + { + "epoch": 0.6187503452847909, + "grad_norm": 0.3984375, + "learning_rate": 7.621970298614717e-05, + "loss": 1.363, + "step": 5600 + }, + { + "epoch": 0.6193028009502237, + "grad_norm": 0.40234375, + "learning_rate": 7.603242558860347e-05, + "loss": 1.2693, + "step": 5605 + }, + { + "epoch": 0.6198552566156565, + "grad_norm": 0.38671875, + "learning_rate": 7.584523733288589e-05, + "loss": 1.3201, + "step": 5610 + }, + { + "epoch": 0.6204077122810895, + "grad_norm": 0.43359375, + "learning_rate": 7.565813891519765e-05, + "loss": 1.3271, + "step": 5615 + }, + { + "epoch": 0.6209601679465223, + "grad_norm": 0.42578125, + "learning_rate": 7.547113103140786e-05, + "loss": 1.3233, + "step": 5620 + }, + { + "epoch": 0.6215126236119551, + "grad_norm": 0.40234375, + "learning_rate": 7.528421437704892e-05, + "loss": 1.2494, + "step": 5625 + }, + { + "epoch": 0.622065079277388, + "grad_norm": 0.396484375, + "learning_rate": 7.509738964731389e-05, + "loss": 1.2124, + "step": 5630 + }, + { + "epoch": 0.6226175349428208, + "grad_norm": 0.408203125, + "learning_rate": 7.491065753705399e-05, + "loss": 1.2642, + "step": 5635 + }, + { + "epoch": 0.6231699906082537, + "grad_norm": 0.408203125, + "learning_rate": 7.472401874077592e-05, + "loss": 1.3271, + "step": 5640 + }, + { + "epoch": 0.6237224462736866, + "grad_norm": 0.41015625, + "learning_rate": 7.453747395263931e-05, + "loss": 1.2386, + "step": 5645 + }, + { + "epoch": 0.6242749019391194, + "grad_norm": 0.39453125, + "learning_rate": 7.43510238664542e-05, + "loss": 1.2424, + "step": 5650 + }, + { + "epoch": 0.6248273576045522, + "grad_norm": 0.40625, + "learning_rate": 7.416466917567837e-05, + "loss": 1.3081, + "step": 5655 + }, + { + "epoch": 0.625379813269985, + "grad_norm": 0.421875, + "learning_rate": 7.397841057341479e-05, + "loss": 1.2392, + "step": 5660 + }, + { + "epoch": 0.625932268935418, + "grad_norm": 0.41015625, + "learning_rate": 7.379224875240911e-05, + "loss": 1.3145, + "step": 5665 + }, + { + "epoch": 0.6264847246008508, + "grad_norm": 0.42578125, + "learning_rate": 7.360618440504694e-05, + "loss": 1.2563, + "step": 5670 + }, + { + "epoch": 0.6270371802662836, + "grad_norm": 0.412109375, + "learning_rate": 7.342021822335143e-05, + "loss": 1.2278, + "step": 5675 + }, + { + "epoch": 0.6275896359317165, + "grad_norm": 0.392578125, + "learning_rate": 7.323435089898059e-05, + "loss": 1.3552, + "step": 5680 + }, + { + "epoch": 0.6281420915971493, + "grad_norm": 0.40234375, + "learning_rate": 7.304858312322475e-05, + "loss": 1.205, + "step": 5685 + }, + { + "epoch": 0.6286945472625822, + "grad_norm": 0.42578125, + "learning_rate": 7.2862915587004e-05, + "loss": 1.1788, + "step": 5690 + }, + { + "epoch": 0.629247002928015, + "grad_norm": 0.396484375, + "learning_rate": 7.267734898086564e-05, + "loss": 1.3006, + "step": 5695 + }, + { + "epoch": 0.6297994585934479, + "grad_norm": 0.408203125, + "learning_rate": 7.249188399498158e-05, + "loss": 1.269, + "step": 5700 + }, + { + "epoch": 0.6303519142588807, + "grad_norm": 0.41796875, + "learning_rate": 7.230652131914574e-05, + "loss": 1.2548, + "step": 5705 + }, + { + "epoch": 0.6309043699243135, + "grad_norm": 0.39453125, + "learning_rate": 7.21212616427715e-05, + "loss": 1.3417, + "step": 5710 + }, + { + "epoch": 0.6314568255897465, + "grad_norm": 0.419921875, + "learning_rate": 7.193610565488924e-05, + "loss": 1.314, + "step": 5715 + }, + { + "epoch": 0.6320092812551793, + "grad_norm": 0.412109375, + "learning_rate": 7.175105404414362e-05, + "loss": 1.2951, + "step": 5720 + }, + { + "epoch": 0.6325617369206121, + "grad_norm": 0.4140625, + "learning_rate": 7.156610749879116e-05, + "loss": 1.2535, + "step": 5725 + }, + { + "epoch": 0.633114192586045, + "grad_norm": 0.380859375, + "learning_rate": 7.138126670669755e-05, + "loss": 1.2447, + "step": 5730 + }, + { + "epoch": 0.6336666482514778, + "grad_norm": 0.412109375, + "learning_rate": 7.11965323553352e-05, + "loss": 1.2694, + "step": 5735 + }, + { + "epoch": 0.6342191039169107, + "grad_norm": 0.41015625, + "learning_rate": 7.10119051317806e-05, + "loss": 1.1998, + "step": 5740 + }, + { + "epoch": 0.6347715595823435, + "grad_norm": 0.404296875, + "learning_rate": 7.082738572271185e-05, + "loss": 1.1994, + "step": 5745 + }, + { + "epoch": 0.6353240152477764, + "grad_norm": 0.4140625, + "learning_rate": 7.0642974814406e-05, + "loss": 1.2666, + "step": 5750 + }, + { + "epoch": 0.6358764709132092, + "grad_norm": 0.416015625, + "learning_rate": 7.045867309273664e-05, + "loss": 1.297, + "step": 5755 + }, + { + "epoch": 0.636428926578642, + "grad_norm": 0.40234375, + "learning_rate": 7.027448124317119e-05, + "loss": 1.2803, + "step": 5760 + }, + { + "epoch": 0.636981382244075, + "grad_norm": 0.40625, + "learning_rate": 7.009039995076844e-05, + "loss": 1.3067, + "step": 5765 + }, + { + "epoch": 0.6375338379095078, + "grad_norm": 0.37890625, + "learning_rate": 6.990642990017602e-05, + "loss": 1.2694, + "step": 5770 + }, + { + "epoch": 0.6380862935749406, + "grad_norm": 0.384765625, + "learning_rate": 6.97225717756278e-05, + "loss": 1.3048, + "step": 5775 + }, + { + "epoch": 0.6386387492403734, + "grad_norm": 0.388671875, + "learning_rate": 6.953882626094136e-05, + "loss": 1.2704, + "step": 5780 + }, + { + "epoch": 0.6391912049058063, + "grad_norm": 0.376953125, + "learning_rate": 6.93551940395155e-05, + "loss": 1.3077, + "step": 5785 + }, + { + "epoch": 0.6397436605712391, + "grad_norm": 0.3984375, + "learning_rate": 6.917167579432753e-05, + "loss": 1.2559, + "step": 5790 + }, + { + "epoch": 0.640296116236672, + "grad_norm": 0.392578125, + "learning_rate": 6.898827220793103e-05, + "loss": 1.3388, + "step": 5795 + }, + { + "epoch": 0.6408485719021049, + "grad_norm": 0.390625, + "learning_rate": 6.880498396245298e-05, + "loss": 1.2705, + "step": 5800 + }, + { + "epoch": 0.6414010275675377, + "grad_norm": 0.408203125, + "learning_rate": 6.862181173959146e-05, + "loss": 1.2947, + "step": 5805 + }, + { + "epoch": 0.6419534832329705, + "grad_norm": 0.4375, + "learning_rate": 6.843875622061304e-05, + "loss": 1.2814, + "step": 5810 + }, + { + "epoch": 0.6425059388984034, + "grad_norm": 0.40234375, + "learning_rate": 6.825581808635016e-05, + "loss": 1.2504, + "step": 5815 + }, + { + "epoch": 0.6430583945638363, + "grad_norm": 0.404296875, + "learning_rate": 6.80729980171987e-05, + "loss": 1.2417, + "step": 5820 + }, + { + "epoch": 0.6436108502292691, + "grad_norm": 0.42578125, + "learning_rate": 6.789029669311551e-05, + "loss": 1.3356, + "step": 5825 + }, + { + "epoch": 0.6441633058947019, + "grad_norm": 0.412109375, + "learning_rate": 6.770771479361568e-05, + "loss": 1.2996, + "step": 5830 + }, + { + "epoch": 0.6447157615601348, + "grad_norm": 0.408203125, + "learning_rate": 6.752525299777021e-05, + "loss": 1.2249, + "step": 5835 + }, + { + "epoch": 0.6452682172255676, + "grad_norm": 0.416015625, + "learning_rate": 6.734291198420333e-05, + "loss": 1.3227, + "step": 5840 + }, + { + "epoch": 0.6458206728910005, + "grad_norm": 0.376953125, + "learning_rate": 6.716069243109011e-05, + "loss": 1.1827, + "step": 5845 + }, + { + "epoch": 0.6463731285564334, + "grad_norm": 0.40234375, + "learning_rate": 6.697859501615387e-05, + "loss": 1.3241, + "step": 5850 + }, + { + "epoch": 0.6469255842218662, + "grad_norm": 0.412109375, + "learning_rate": 6.679662041666362e-05, + "loss": 1.2363, + "step": 5855 + }, + { + "epoch": 0.647478039887299, + "grad_norm": 0.400390625, + "learning_rate": 6.661476930943163e-05, + "loss": 1.2711, + "step": 5860 + }, + { + "epoch": 0.6480304955527318, + "grad_norm": 0.40625, + "learning_rate": 6.643304237081087e-05, + "loss": 1.2842, + "step": 5865 + }, + { + "epoch": 0.6485829512181648, + "grad_norm": 0.44140625, + "learning_rate": 6.625144027669245e-05, + "loss": 1.249, + "step": 5870 + }, + { + "epoch": 0.6491354068835976, + "grad_norm": 0.421875, + "learning_rate": 6.60699637025032e-05, + "loss": 1.2837, + "step": 5875 + }, + { + "epoch": 0.6496878625490304, + "grad_norm": 0.41796875, + "learning_rate": 6.588861332320306e-05, + "loss": 1.3581, + "step": 5880 + }, + { + "epoch": 0.6502403182144633, + "grad_norm": 0.396484375, + "learning_rate": 6.570738981328266e-05, + "loss": 1.2256, + "step": 5885 + }, + { + "epoch": 0.6507927738798961, + "grad_norm": 0.3828125, + "learning_rate": 6.552629384676079e-05, + "loss": 1.2203, + "step": 5890 + }, + { + "epoch": 0.651345229545329, + "grad_norm": 0.388671875, + "learning_rate": 6.534532609718177e-05, + "loss": 1.309, + "step": 5895 + }, + { + "epoch": 0.6518976852107619, + "grad_norm": 0.404296875, + "learning_rate": 6.516448723761315e-05, + "loss": 1.2835, + "step": 5900 + }, + { + "epoch": 0.6524501408761947, + "grad_norm": 0.400390625, + "learning_rate": 6.498377794064303e-05, + "loss": 1.3349, + "step": 5905 + }, + { + "epoch": 0.6530025965416275, + "grad_norm": 0.419921875, + "learning_rate": 6.480319887837771e-05, + "loss": 1.2314, + "step": 5910 + }, + { + "epoch": 0.6535550522070603, + "grad_norm": 0.458984375, + "learning_rate": 6.462275072243908e-05, + "loss": 1.3439, + "step": 5915 + }, + { + "epoch": 0.6541075078724933, + "grad_norm": 0.412109375, + "learning_rate": 6.444243414396208e-05, + "loss": 1.2487, + "step": 5920 + }, + { + "epoch": 0.6546599635379261, + "grad_norm": 0.40234375, + "learning_rate": 6.426224981359238e-05, + "loss": 1.2491, + "step": 5925 + }, + { + "epoch": 0.6552124192033589, + "grad_norm": 0.38671875, + "learning_rate": 6.408219840148375e-05, + "loss": 1.2978, + "step": 5930 + }, + { + "epoch": 0.6557648748687918, + "grad_norm": 1.2890625, + "learning_rate": 6.390228057729557e-05, + "loss": 1.2105, + "step": 5935 + }, + { + "epoch": 0.6563173305342246, + "grad_norm": 0.384765625, + "learning_rate": 6.372249701019045e-05, + "loss": 1.2386, + "step": 5940 + }, + { + "epoch": 0.6568697861996575, + "grad_norm": 0.416015625, + "learning_rate": 6.354284836883156e-05, + "loss": 1.2054, + "step": 5945 + }, + { + "epoch": 0.6574222418650904, + "grad_norm": 0.416015625, + "learning_rate": 6.336333532138032e-05, + "loss": 1.1582, + "step": 5950 + }, + { + "epoch": 0.6579746975305232, + "grad_norm": 0.421875, + "learning_rate": 6.31839585354938e-05, + "loss": 1.3335, + "step": 5955 + }, + { + "epoch": 0.658527153195956, + "grad_norm": 0.40234375, + "learning_rate": 6.300471867832229e-05, + "loss": 1.2848, + "step": 5960 + }, + { + "epoch": 0.6590796088613888, + "grad_norm": 0.4375, + "learning_rate": 6.282561641650682e-05, + "loss": 1.3053, + "step": 5965 + }, + { + "epoch": 0.6596320645268218, + "grad_norm": 0.3828125, + "learning_rate": 6.264665241617666e-05, + "loss": 1.2261, + "step": 5970 + }, + { + "epoch": 0.6601845201922546, + "grad_norm": 0.458984375, + "learning_rate": 6.246782734294683e-05, + "loss": 1.3282, + "step": 5975 + }, + { + "epoch": 0.6607369758576874, + "grad_norm": 0.384765625, + "learning_rate": 6.228914186191563e-05, + "loss": 1.2368, + "step": 5980 + }, + { + "epoch": 0.6612894315231203, + "grad_norm": 0.4296875, + "learning_rate": 6.211059663766224e-05, + "loss": 1.3354, + "step": 5985 + }, + { + "epoch": 0.6618418871885531, + "grad_norm": 0.4140625, + "learning_rate": 6.193219233424414e-05, + "loss": 1.2843, + "step": 5990 + }, + { + "epoch": 0.662394342853986, + "grad_norm": 0.423828125, + "learning_rate": 6.175392961519471e-05, + "loss": 1.289, + "step": 5995 + }, + { + "epoch": 0.6629467985194188, + "grad_norm": 0.40625, + "learning_rate": 6.15758091435207e-05, + "loss": 1.266, + "step": 6000 + }, + { + "epoch": 0.6634992541848517, + "grad_norm": 0.392578125, + "learning_rate": 6.139783158169984e-05, + "loss": 1.3227, + "step": 6005 + }, + { + "epoch": 0.6640517098502845, + "grad_norm": 0.384765625, + "learning_rate": 6.121999759167837e-05, + "loss": 1.2908, + "step": 6010 + }, + { + "epoch": 0.6646041655157173, + "grad_norm": 0.39453125, + "learning_rate": 6.104230783486847e-05, + "loss": 1.1879, + "step": 6015 + }, + { + "epoch": 0.6651566211811503, + "grad_norm": 0.400390625, + "learning_rate": 6.086476297214594e-05, + "loss": 1.2822, + "step": 6020 + }, + { + "epoch": 0.6657090768465831, + "grad_norm": 0.4140625, + "learning_rate": 6.068736366384764e-05, + "loss": 1.2844, + "step": 6025 + }, + { + "epoch": 0.6662615325120159, + "grad_norm": 0.384765625, + "learning_rate": 6.0510110569769095e-05, + "loss": 1.1896, + "step": 6030 + }, + { + "epoch": 0.6668139881774487, + "grad_norm": 0.42578125, + "learning_rate": 6.033300434916203e-05, + "loss": 1.3519, + "step": 6035 + }, + { + "epoch": 0.6673664438428816, + "grad_norm": 0.412109375, + "learning_rate": 6.0156045660731873e-05, + "loss": 1.2742, + "step": 6040 + }, + { + "epoch": 0.6679188995083145, + "grad_norm": 0.384765625, + "learning_rate": 5.9979235162635394e-05, + "loss": 1.1947, + "step": 6045 + }, + { + "epoch": 0.6684713551737473, + "grad_norm": 0.4140625, + "learning_rate": 5.980257351247818e-05, + "loss": 1.2286, + "step": 6050 + }, + { + "epoch": 0.6690238108391802, + "grad_norm": 0.439453125, + "learning_rate": 5.9626061367312166e-05, + "loss": 1.2746, + "step": 6055 + }, + { + "epoch": 0.669576266504613, + "grad_norm": 0.392578125, + "learning_rate": 5.9449699383633316e-05, + "loss": 1.2367, + "step": 6060 + }, + { + "epoch": 0.6701287221700458, + "grad_norm": 0.416015625, + "learning_rate": 5.927348821737906e-05, + "loss": 1.3109, + "step": 6065 + }, + { + "epoch": 0.6706811778354786, + "grad_norm": 0.404296875, + "learning_rate": 5.909742852392587e-05, + "loss": 1.2512, + "step": 6070 + }, + { + "epoch": 0.6712336335009116, + "grad_norm": 0.404296875, + "learning_rate": 5.8921520958086905e-05, + "loss": 1.2713, + "step": 6075 + }, + { + "epoch": 0.6717860891663444, + "grad_norm": 0.41796875, + "learning_rate": 5.8745766174109495e-05, + "loss": 1.341, + "step": 6080 + }, + { + "epoch": 0.6723385448317772, + "grad_norm": 0.396484375, + "learning_rate": 5.857016482567275e-05, + "loss": 1.3013, + "step": 6085 + }, + { + "epoch": 0.6728910004972101, + "grad_norm": 0.423828125, + "learning_rate": 5.8394717565885106e-05, + "loss": 1.3698, + "step": 6090 + }, + { + "epoch": 0.6734434561626429, + "grad_norm": 0.412109375, + "learning_rate": 5.821942504728183e-05, + "loss": 1.2323, + "step": 6095 + }, + { + "epoch": 0.6739959118280758, + "grad_norm": 0.412109375, + "learning_rate": 5.804428792182279e-05, + "loss": 1.2904, + "step": 6100 + }, + { + "epoch": 0.6745483674935087, + "grad_norm": 0.412109375, + "learning_rate": 5.786930684088988e-05, + "loss": 1.3443, + "step": 6105 + }, + { + "epoch": 0.6751008231589415, + "grad_norm": 0.404296875, + "learning_rate": 5.7694482455284504e-05, + "loss": 1.2083, + "step": 6110 + }, + { + "epoch": 0.6756532788243743, + "grad_norm": 0.404296875, + "learning_rate": 5.751981541522539e-05, + "loss": 1.1749, + "step": 6115 + }, + { + "epoch": 0.6762057344898071, + "grad_norm": 0.39453125, + "learning_rate": 5.734530637034603e-05, + "loss": 1.3391, + "step": 6120 + }, + { + "epoch": 0.6767581901552401, + "grad_norm": 0.39453125, + "learning_rate": 5.7170955969692265e-05, + "loss": 1.2439, + "step": 6125 + }, + { + "epoch": 0.6773106458206729, + "grad_norm": 0.400390625, + "learning_rate": 5.699676486171994e-05, + "loss": 1.1964, + "step": 6130 + }, + { + "epoch": 0.6778631014861057, + "grad_norm": 0.412109375, + "learning_rate": 5.6822733694292427e-05, + "loss": 1.2591, + "step": 6135 + }, + { + "epoch": 0.6784155571515386, + "grad_norm": 0.39453125, + "learning_rate": 5.664886311467821e-05, + "loss": 1.2042, + "step": 6140 + }, + { + "epoch": 0.6789680128169714, + "grad_norm": 0.4375, + "learning_rate": 5.647515376954852e-05, + "loss": 1.2, + "step": 6145 + }, + { + "epoch": 0.6795204684824043, + "grad_norm": 0.416015625, + "learning_rate": 5.630160630497493e-05, + "loss": 1.2134, + "step": 6150 + }, + { + "epoch": 0.6800729241478372, + "grad_norm": 0.40625, + "learning_rate": 5.612822136642697e-05, + "loss": 1.2775, + "step": 6155 + }, + { + "epoch": 0.68062537981327, + "grad_norm": 0.3984375, + "learning_rate": 5.5954999598769575e-05, + "loss": 1.2292, + "step": 6160 + }, + { + "epoch": 0.6811778354787028, + "grad_norm": 0.41015625, + "learning_rate": 5.578194164626089e-05, + "loss": 1.2832, + "step": 6165 + }, + { + "epoch": 0.6817302911441356, + "grad_norm": 0.39453125, + "learning_rate": 5.5609048152549794e-05, + "loss": 1.2754, + "step": 6170 + }, + { + "epoch": 0.6822827468095686, + "grad_norm": 0.39453125, + "learning_rate": 5.543631976067345e-05, + "loss": 1.2711, + "step": 6175 + }, + { + "epoch": 0.6828352024750014, + "grad_norm": 0.396484375, + "learning_rate": 5.526375711305504e-05, + "loss": 1.292, + "step": 6180 + }, + { + "epoch": 0.6833876581404342, + "grad_norm": 0.408203125, + "learning_rate": 5.509136085150122e-05, + "loss": 1.2672, + "step": 6185 + }, + { + "epoch": 0.6839401138058671, + "grad_norm": 0.41796875, + "learning_rate": 5.491913161719984e-05, + "loss": 1.2646, + "step": 6190 + }, + { + "epoch": 0.6844925694712999, + "grad_norm": 0.39453125, + "learning_rate": 5.4747070050717556e-05, + "loss": 1.2929, + "step": 6195 + }, + { + "epoch": 0.6850450251367328, + "grad_norm": 0.419921875, + "learning_rate": 5.457517679199736e-05, + "loss": 1.3068, + "step": 6200 + }, + { + "epoch": 0.6855974808021656, + "grad_norm": 0.400390625, + "learning_rate": 5.4403452480356346e-05, + "loss": 1.2772, + "step": 6205 + }, + { + "epoch": 0.6861499364675985, + "grad_norm": 0.41015625, + "learning_rate": 5.423189775448323e-05, + "loss": 1.3216, + "step": 6210 + }, + { + "epoch": 0.6867023921330313, + "grad_norm": 0.39453125, + "learning_rate": 5.406051325243586e-05, + "loss": 1.1877, + "step": 6215 + }, + { + "epoch": 0.6872548477984641, + "grad_norm": 0.427734375, + "learning_rate": 5.3889299611639174e-05, + "loss": 1.2715, + "step": 6220 + }, + { + "epoch": 0.6878073034638971, + "grad_norm": 0.416015625, + "learning_rate": 5.371825746888251e-05, + "loss": 1.3154, + "step": 6225 + }, + { + "epoch": 0.6883597591293299, + "grad_norm": 0.404296875, + "learning_rate": 5.35473874603174e-05, + "loss": 1.2977, + "step": 6230 + }, + { + "epoch": 0.6889122147947627, + "grad_norm": 0.404296875, + "learning_rate": 5.337669022145515e-05, + "loss": 1.2979, + "step": 6235 + }, + { + "epoch": 0.6894646704601956, + "grad_norm": 0.421875, + "learning_rate": 5.320616638716448e-05, + "loss": 1.3065, + "step": 6240 + }, + { + "epoch": 0.6900171261256284, + "grad_norm": 0.431640625, + "learning_rate": 5.3035816591669205e-05, + "loss": 1.3274, + "step": 6245 + }, + { + "epoch": 0.6905695817910613, + "grad_norm": 0.392578125, + "learning_rate": 5.286564146854581e-05, + "loss": 1.2623, + "step": 6250 + }, + { + "epoch": 0.6911220374564941, + "grad_norm": 0.408203125, + "learning_rate": 5.269564165072115e-05, + "loss": 1.2494, + "step": 6255 + }, + { + "epoch": 0.691674493121927, + "grad_norm": 0.392578125, + "learning_rate": 5.2525817770470084e-05, + "loss": 1.2287, + "step": 6260 + }, + { + "epoch": 0.6922269487873598, + "grad_norm": 0.3984375, + "learning_rate": 5.2356170459413035e-05, + "loss": 1.2245, + "step": 6265 + }, + { + "epoch": 0.6927794044527926, + "grad_norm": 0.408203125, + "learning_rate": 5.2186700348513786e-05, + "loss": 1.3329, + "step": 6270 + }, + { + "epoch": 0.6933318601182256, + "grad_norm": 0.408203125, + "learning_rate": 5.2017408068077064e-05, + "loss": 1.2252, + "step": 6275 + }, + { + "epoch": 0.6938843157836584, + "grad_norm": 0.380859375, + "learning_rate": 5.18482942477462e-05, + "loss": 1.172, + "step": 6280 + }, + { + "epoch": 0.6944367714490912, + "grad_norm": 0.4140625, + "learning_rate": 5.1679359516500735e-05, + "loss": 1.2943, + "step": 6285 + }, + { + "epoch": 0.694989227114524, + "grad_norm": 0.41015625, + "learning_rate": 5.151060450265419e-05, + "loss": 1.3527, + "step": 6290 + }, + { + "epoch": 0.6955416827799569, + "grad_norm": 0.40625, + "learning_rate": 5.1342029833851634e-05, + "loss": 1.2435, + "step": 6295 + }, + { + "epoch": 0.6960941384453898, + "grad_norm": 0.392578125, + "learning_rate": 5.1173636137067406e-05, + "loss": 1.2147, + "step": 6300 + }, + { + "epoch": 0.6966465941108226, + "grad_norm": 0.41015625, + "learning_rate": 5.1005424038602724e-05, + "loss": 1.2178, + "step": 6305 + }, + { + "epoch": 0.6971990497762555, + "grad_norm": 0.41015625, + "learning_rate": 5.083739416408343e-05, + "loss": 1.2232, + "step": 6310 + }, + { + "epoch": 0.6977515054416883, + "grad_norm": 0.408203125, + "learning_rate": 5.066954713845766e-05, + "loss": 1.2486, + "step": 6315 + }, + { + "epoch": 0.6983039611071211, + "grad_norm": 0.41015625, + "learning_rate": 5.050188358599335e-05, + "loss": 1.2738, + "step": 6320 + }, + { + "epoch": 0.6988564167725541, + "grad_norm": 0.3984375, + "learning_rate": 5.033440413027619e-05, + "loss": 1.2159, + "step": 6325 + }, + { + "epoch": 0.6994088724379869, + "grad_norm": 0.392578125, + "learning_rate": 5.01671093942071e-05, + "loss": 1.1586, + "step": 6330 + }, + { + "epoch": 0.6999613281034197, + "grad_norm": 0.41015625, + "learning_rate": 5.000000000000002e-05, + "loss": 1.2886, + "step": 6335 + }, + { + "epoch": 0.7005137837688525, + "grad_norm": 0.40625, + "learning_rate": 4.9833076569179506e-05, + "loss": 1.2738, + "step": 6340 + }, + { + "epoch": 0.7010662394342854, + "grad_norm": 0.392578125, + "learning_rate": 4.9666339722578494e-05, + "loss": 1.2559, + "step": 6345 + }, + { + "epoch": 0.7016186950997182, + "grad_norm": 0.39453125, + "learning_rate": 4.949979008033596e-05, + "loss": 1.2716, + "step": 6350 + }, + { + "epoch": 0.7021711507651511, + "grad_norm": 0.38671875, + "learning_rate": 4.93334282618946e-05, + "loss": 1.2993, + "step": 6355 + }, + { + "epoch": 0.702723606430584, + "grad_norm": 0.3984375, + "learning_rate": 4.9167254885998584e-05, + "loss": 1.2113, + "step": 6360 + }, + { + "epoch": 0.7032760620960168, + "grad_norm": 0.412109375, + "learning_rate": 4.900127057069116e-05, + "loss": 1.3193, + "step": 6365 + }, + { + "epoch": 0.7038285177614496, + "grad_norm": 0.4140625, + "learning_rate": 4.883547593331248e-05, + "loss": 1.2175, + "step": 6370 + }, + { + "epoch": 0.7043809734268824, + "grad_norm": 0.40625, + "learning_rate": 4.866987159049713e-05, + "loss": 1.2761, + "step": 6375 + }, + { + "epoch": 0.7049334290923154, + "grad_norm": 0.39453125, + "learning_rate": 4.850445815817202e-05, + "loss": 1.2263, + "step": 6380 + }, + { + "epoch": 0.7054858847577482, + "grad_norm": 0.455078125, + "learning_rate": 4.833923625155399e-05, + "loss": 1.3204, + "step": 6385 + }, + { + "epoch": 0.706038340423181, + "grad_norm": 0.392578125, + "learning_rate": 4.817420648514755e-05, + "loss": 1.234, + "step": 6390 + }, + { + "epoch": 0.7065907960886139, + "grad_norm": 0.390625, + "learning_rate": 4.800936947274255e-05, + "loss": 1.2804, + "step": 6395 + }, + { + "epoch": 0.7071432517540467, + "grad_norm": 0.423828125, + "learning_rate": 4.7844725827412054e-05, + "loss": 1.3185, + "step": 6400 + }, + { + "epoch": 0.7076957074194796, + "grad_norm": 0.4609375, + "learning_rate": 4.7680276161509795e-05, + "loss": 1.1644, + "step": 6405 + }, + { + "epoch": 0.7082481630849125, + "grad_norm": 0.40234375, + "learning_rate": 4.751602108666818e-05, + "loss": 1.2911, + "step": 6410 + }, + { + "epoch": 0.7088006187503453, + "grad_norm": 0.408203125, + "learning_rate": 4.735196121379571e-05, + "loss": 1.2112, + "step": 6415 + }, + { + "epoch": 0.7093530744157781, + "grad_norm": 0.390625, + "learning_rate": 4.7188097153075017e-05, + "loss": 1.2008, + "step": 6420 + }, + { + "epoch": 0.7099055300812109, + "grad_norm": 0.40625, + "learning_rate": 4.7024429513960425e-05, + "loss": 1.2918, + "step": 6425 + }, + { + "epoch": 0.7104579857466439, + "grad_norm": 0.419921875, + "learning_rate": 4.686095890517569e-05, + "loss": 1.2994, + "step": 6430 + }, + { + "epoch": 0.7110104414120767, + "grad_norm": 0.3671875, + "learning_rate": 4.6697685934711785e-05, + "loss": 1.1452, + "step": 6435 + }, + { + "epoch": 0.7115628970775095, + "grad_norm": 0.408203125, + "learning_rate": 4.65346112098246e-05, + "loss": 1.2731, + "step": 6440 + }, + { + "epoch": 0.7121153527429424, + "grad_norm": 0.4765625, + "learning_rate": 4.637173533703267e-05, + "loss": 1.2484, + "step": 6445 + }, + { + "epoch": 0.7126678084083752, + "grad_norm": 0.3984375, + "learning_rate": 4.6209058922115015e-05, + "loss": 1.3127, + "step": 6450 + }, + { + "epoch": 0.7132202640738081, + "grad_norm": 0.427734375, + "learning_rate": 4.6046582570108744e-05, + "loss": 1.2954, + "step": 6455 + }, + { + "epoch": 0.713772719739241, + "grad_norm": 0.388671875, + "learning_rate": 4.588430688530696e-05, + "loss": 1.2205, + "step": 6460 + }, + { + "epoch": 0.7143251754046738, + "grad_norm": 0.412109375, + "learning_rate": 4.5722232471256296e-05, + "loss": 1.2443, + "step": 6465 + }, + { + "epoch": 0.7148776310701066, + "grad_norm": 0.412109375, + "learning_rate": 4.556035993075495e-05, + "loss": 1.3188, + "step": 6470 + }, + { + "epoch": 0.7154300867355394, + "grad_norm": 0.42578125, + "learning_rate": 4.539868986585022e-05, + "loss": 1.3274, + "step": 6475 + }, + { + "epoch": 0.7159825424009724, + "grad_norm": 0.412109375, + "learning_rate": 4.523722287783636e-05, + "loss": 1.2691, + "step": 6480 + }, + { + "epoch": 0.7165349980664052, + "grad_norm": 0.41015625, + "learning_rate": 4.5075959567252335e-05, + "loss": 1.3686, + "step": 6485 + }, + { + "epoch": 0.717087453731838, + "grad_norm": 0.3984375, + "learning_rate": 4.491490053387958e-05, + "loss": 1.2057, + "step": 6490 + }, + { + "epoch": 0.7176399093972708, + "grad_norm": 0.408203125, + "learning_rate": 4.475404637673974e-05, + "loss": 1.2386, + "step": 6495 + }, + { + "epoch": 0.7181923650627037, + "grad_norm": 0.416015625, + "learning_rate": 4.459339769409252e-05, + "loss": 1.241, + "step": 6500 + }, + { + "epoch": 0.7187448207281366, + "grad_norm": 0.40234375, + "learning_rate": 4.443295508343336e-05, + "loss": 1.2884, + "step": 6505 + }, + { + "epoch": 0.7192972763935694, + "grad_norm": 0.408203125, + "learning_rate": 4.427271914149128e-05, + "loss": 1.3309, + "step": 6510 + }, + { + "epoch": 0.7198497320590023, + "grad_norm": 0.412109375, + "learning_rate": 4.41126904642267e-05, + "loss": 1.2217, + "step": 6515 + }, + { + "epoch": 0.7204021877244351, + "grad_norm": 0.42578125, + "learning_rate": 4.395286964682903e-05, + "loss": 1.2936, + "step": 6520 + }, + { + "epoch": 0.7209546433898679, + "grad_norm": 0.404296875, + "learning_rate": 4.379325728371473e-05, + "loss": 1.2449, + "step": 6525 + }, + { + "epoch": 0.7215070990553009, + "grad_norm": 0.421875, + "learning_rate": 4.363385396852491e-05, + "loss": 1.2352, + "step": 6530 + }, + { + "epoch": 0.7220595547207337, + "grad_norm": 0.404296875, + "learning_rate": 4.347466029412316e-05, + "loss": 1.3162, + "step": 6535 + }, + { + "epoch": 0.7226120103861665, + "grad_norm": 0.408203125, + "learning_rate": 4.331567685259338e-05, + "loss": 1.2229, + "step": 6540 + }, + { + "epoch": 0.7231644660515993, + "grad_norm": 0.408203125, + "learning_rate": 4.3156904235237574e-05, + "loss": 1.288, + "step": 6545 + }, + { + "epoch": 0.7237169217170322, + "grad_norm": 0.4140625, + "learning_rate": 4.2998343032573596e-05, + "loss": 1.3225, + "step": 6550 + }, + { + "epoch": 0.7242693773824651, + "grad_norm": 0.40625, + "learning_rate": 4.2839993834333014e-05, + "loss": 1.3295, + "step": 6555 + }, + { + "epoch": 0.7248218330478979, + "grad_norm": 0.416015625, + "learning_rate": 4.2681857229458885e-05, + "loss": 1.2258, + "step": 6560 + }, + { + "epoch": 0.7253742887133308, + "grad_norm": 0.4453125, + "learning_rate": 4.25239338061036e-05, + "loss": 1.3371, + "step": 6565 + }, + { + "epoch": 0.7259267443787636, + "grad_norm": 0.390625, + "learning_rate": 4.236622415162668e-05, + "loss": 1.3193, + "step": 6570 + }, + { + "epoch": 0.7264792000441964, + "grad_norm": 0.400390625, + "learning_rate": 4.220872885259247e-05, + "loss": 1.2166, + "step": 6575 + }, + { + "epoch": 0.7270316557096294, + "grad_norm": 0.40625, + "learning_rate": 4.20514484947682e-05, + "loss": 1.204, + "step": 6580 + }, + { + "epoch": 0.7275841113750622, + "grad_norm": 0.419921875, + "learning_rate": 4.189438366312162e-05, + "loss": 1.3055, + "step": 6585 + }, + { + "epoch": 0.728136567040495, + "grad_norm": 0.39453125, + "learning_rate": 4.17375349418189e-05, + "loss": 1.2411, + "step": 6590 + }, + { + "epoch": 0.7286890227059278, + "grad_norm": 0.412109375, + "learning_rate": 4.158090291422243e-05, + "loss": 1.2973, + "step": 6595 + }, + { + "epoch": 0.7292414783713607, + "grad_norm": 0.404296875, + "learning_rate": 4.142448816288864e-05, + "loss": 1.1524, + "step": 6600 + }, + { + "epoch": 0.7297939340367936, + "grad_norm": 0.400390625, + "learning_rate": 4.1268291269565885e-05, + "loss": 1.2506, + "step": 6605 + }, + { + "epoch": 0.7303463897022264, + "grad_norm": 0.3828125, + "learning_rate": 4.111231281519222e-05, + "loss": 1.2727, + "step": 6610 + }, + { + "epoch": 0.7308988453676593, + "grad_norm": 0.376953125, + "learning_rate": 4.095655337989329e-05, + "loss": 1.2792, + "step": 6615 + }, + { + "epoch": 0.7314513010330921, + "grad_norm": 0.41796875, + "learning_rate": 4.080101354298016e-05, + "loss": 1.2532, + "step": 6620 + }, + { + "epoch": 0.7320037566985249, + "grad_norm": 0.3984375, + "learning_rate": 4.0645693882947046e-05, + "loss": 1.2731, + "step": 6625 + }, + { + "epoch": 0.7325562123639577, + "grad_norm": 0.412109375, + "learning_rate": 4.0490594977469406e-05, + "loss": 1.226, + "step": 6630 + }, + { + "epoch": 0.7331086680293907, + "grad_norm": 0.421875, + "learning_rate": 4.0335717403401576e-05, + "loss": 1.3171, + "step": 6635 + }, + { + "epoch": 0.7336611236948235, + "grad_norm": 0.421875, + "learning_rate": 4.018106173677473e-05, + "loss": 1.2098, + "step": 6640 + }, + { + "epoch": 0.7342135793602563, + "grad_norm": 0.396484375, + "learning_rate": 4.00266285527947e-05, + "loss": 1.2317, + "step": 6645 + }, + { + "epoch": 0.7347660350256892, + "grad_norm": 0.421875, + "learning_rate": 3.987241842583983e-05, + "loss": 1.2911, + "step": 6650 + }, + { + "epoch": 0.735318490691122, + "grad_norm": 0.396484375, + "learning_rate": 3.971843192945889e-05, + "loss": 1.2999, + "step": 6655 + }, + { + "epoch": 0.7358709463565549, + "grad_norm": 0.408203125, + "learning_rate": 3.9564669636368866e-05, + "loss": 1.3028, + "step": 6660 + }, + { + "epoch": 0.7364234020219877, + "grad_norm": 0.423828125, + "learning_rate": 3.9411132118452896e-05, + "loss": 1.2787, + "step": 6665 + }, + { + "epoch": 0.7369758576874206, + "grad_norm": 0.41796875, + "learning_rate": 3.9257819946758135e-05, + "loss": 1.2632, + "step": 6670 + }, + { + "epoch": 0.7375283133528534, + "grad_norm": 0.40234375, + "learning_rate": 3.910473369149361e-05, + "loss": 1.257, + "step": 6675 + }, + { + "epoch": 0.7380807690182862, + "grad_norm": 0.404296875, + "learning_rate": 3.895187392202804e-05, + "loss": 1.318, + "step": 6680 + }, + { + "epoch": 0.7386332246837192, + "grad_norm": 0.416015625, + "learning_rate": 3.8799241206887836e-05, + "loss": 1.2326, + "step": 6685 + }, + { + "epoch": 0.739185680349152, + "grad_norm": 0.40625, + "learning_rate": 3.864683611375497e-05, + "loss": 1.2301, + "step": 6690 + }, + { + "epoch": 0.7397381360145848, + "grad_norm": 0.38671875, + "learning_rate": 3.849465920946475e-05, + "loss": 1.2722, + "step": 6695 + }, + { + "epoch": 0.7402905916800177, + "grad_norm": 0.396484375, + "learning_rate": 3.834271106000385e-05, + "loss": 1.2496, + "step": 6700 + }, + { + "epoch": 0.7408430473454505, + "grad_norm": 0.44140625, + "learning_rate": 3.819099223050813e-05, + "loss": 1.2122, + "step": 6705 + }, + { + "epoch": 0.7413955030108834, + "grad_norm": 0.40625, + "learning_rate": 3.8039503285260506e-05, + "loss": 1.2146, + "step": 6710 + }, + { + "epoch": 0.7419479586763162, + "grad_norm": 0.408203125, + "learning_rate": 3.788824478768893e-05, + "loss": 1.228, + "step": 6715 + }, + { + "epoch": 0.7425004143417491, + "grad_norm": 0.3828125, + "learning_rate": 3.773721730036426e-05, + "loss": 1.2635, + "step": 6720 + }, + { + "epoch": 0.7430528700071819, + "grad_norm": 0.400390625, + "learning_rate": 3.758642138499819e-05, + "loss": 1.2543, + "step": 6725 + }, + { + "epoch": 0.7436053256726147, + "grad_norm": 0.37109375, + "learning_rate": 3.743585760244104e-05, + "loss": 1.2001, + "step": 6730 + }, + { + "epoch": 0.7441577813380477, + "grad_norm": 0.388671875, + "learning_rate": 3.728552651267985e-05, + "loss": 1.2411, + "step": 6735 + }, + { + "epoch": 0.7447102370034805, + "grad_norm": 0.408203125, + "learning_rate": 3.7135428674836184e-05, + "loss": 1.2956, + "step": 6740 + }, + { + "epoch": 0.7452626926689133, + "grad_norm": 0.388671875, + "learning_rate": 3.698556464716411e-05, + "loss": 1.2976, + "step": 6745 + }, + { + "epoch": 0.7458151483343461, + "grad_norm": 0.40234375, + "learning_rate": 3.683593498704801e-05, + "loss": 1.2107, + "step": 6750 + }, + { + "epoch": 0.746367603999779, + "grad_norm": 0.421875, + "learning_rate": 3.6686540251000756e-05, + "loss": 1.3122, + "step": 6755 + }, + { + "epoch": 0.7469200596652119, + "grad_norm": 0.39453125, + "learning_rate": 3.6537380994661295e-05, + "loss": 1.2732, + "step": 6760 + }, + { + "epoch": 0.7474725153306447, + "grad_norm": 0.41015625, + "learning_rate": 3.638845777279286e-05, + "loss": 1.2454, + "step": 6765 + }, + { + "epoch": 0.7480249709960776, + "grad_norm": 0.41796875, + "learning_rate": 3.623977113928081e-05, + "loss": 1.2426, + "step": 6770 + }, + { + "epoch": 0.7485774266615104, + "grad_norm": 0.3984375, + "learning_rate": 3.6091321647130484e-05, + "loss": 1.2851, + "step": 6775 + }, + { + "epoch": 0.7491298823269432, + "grad_norm": 0.412109375, + "learning_rate": 3.59431098484653e-05, + "loss": 1.216, + "step": 6780 + }, + { + "epoch": 0.7496823379923762, + "grad_norm": 0.392578125, + "learning_rate": 3.579513629452464e-05, + "loss": 1.2255, + "step": 6785 + }, + { + "epoch": 0.750234793657809, + "grad_norm": 0.40234375, + "learning_rate": 3.564740153566176e-05, + "loss": 1.2575, + "step": 6790 + }, + { + "epoch": 0.7507872493232418, + "grad_norm": 0.41015625, + "learning_rate": 3.5499906121341785e-05, + "loss": 1.2037, + "step": 6795 + }, + { + "epoch": 0.7513397049886746, + "grad_norm": 0.419921875, + "learning_rate": 3.535265060013965e-05, + "loss": 1.2699, + "step": 6800 + }, + { + "epoch": 0.7518921606541075, + "grad_norm": 0.40625, + "learning_rate": 3.520563551973806e-05, + "loss": 1.2491, + "step": 6805 + }, + { + "epoch": 0.7524446163195404, + "grad_norm": 0.458984375, + "learning_rate": 3.5058861426925447e-05, + "loss": 1.2921, + "step": 6810 + }, + { + "epoch": 0.7529970719849732, + "grad_norm": 0.431640625, + "learning_rate": 3.491232886759398e-05, + "loss": 1.2566, + "step": 6815 + }, + { + "epoch": 0.7535495276504061, + "grad_norm": 0.38671875, + "learning_rate": 3.4766038386737506e-05, + "loss": 1.303, + "step": 6820 + }, + { + "epoch": 0.7541019833158389, + "grad_norm": 0.3984375, + "learning_rate": 3.461999052844942e-05, + "loss": 1.3304, + "step": 6825 + }, + { + "epoch": 0.7546544389812717, + "grad_norm": 0.40625, + "learning_rate": 3.447418583592084e-05, + "loss": 1.2985, + "step": 6830 + }, + { + "epoch": 0.7552068946467047, + "grad_norm": 0.40234375, + "learning_rate": 3.432862485143846e-05, + "loss": 1.2463, + "step": 6835 + }, + { + "epoch": 0.7557593503121375, + "grad_norm": 0.400390625, + "learning_rate": 3.418330811638255e-05, + "loss": 1.311, + "step": 6840 + }, + { + "epoch": 0.7563118059775703, + "grad_norm": 0.41796875, + "learning_rate": 3.4038236171224946e-05, + "loss": 1.2148, + "step": 6845 + }, + { + "epoch": 0.7568642616430031, + "grad_norm": 0.40625, + "learning_rate": 3.389340955552707e-05, + "loss": 1.2363, + "step": 6850 + }, + { + "epoch": 0.757416717308436, + "grad_norm": 0.421875, + "learning_rate": 3.374882880793785e-05, + "loss": 1.2539, + "step": 6855 + }, + { + "epoch": 0.7579691729738689, + "grad_norm": 0.3984375, + "learning_rate": 3.360449446619183e-05, + "loss": 1.2374, + "step": 6860 + }, + { + "epoch": 0.7585216286393017, + "grad_norm": 0.390625, + "learning_rate": 3.346040706710705e-05, + "loss": 1.2242, + "step": 6865 + }, + { + "epoch": 0.7590740843047346, + "grad_norm": 0.404296875, + "learning_rate": 3.331656714658313e-05, + "loss": 1.283, + "step": 6870 + }, + { + "epoch": 0.7596265399701674, + "grad_norm": 0.4140625, + "learning_rate": 3.317297523959927e-05, + "loss": 1.3228, + "step": 6875 + }, + { + "epoch": 0.7601789956356002, + "grad_norm": 0.412109375, + "learning_rate": 3.302963188021216e-05, + "loss": 1.2234, + "step": 6880 + }, + { + "epoch": 0.7607314513010331, + "grad_norm": 0.423828125, + "learning_rate": 3.2886537601554165e-05, + "loss": 1.2781, + "step": 6885 + }, + { + "epoch": 0.761283906966466, + "grad_norm": 0.41796875, + "learning_rate": 3.274369293583121e-05, + "loss": 1.2353, + "step": 6890 + }, + { + "epoch": 0.7618363626318988, + "grad_norm": 0.3984375, + "learning_rate": 3.260109841432085e-05, + "loss": 1.238, + "step": 6895 + }, + { + "epoch": 0.7623888182973316, + "grad_norm": 0.3984375, + "learning_rate": 3.24587545673703e-05, + "loss": 1.2044, + "step": 6900 + }, + { + "epoch": 0.7629412739627645, + "grad_norm": 0.396484375, + "learning_rate": 3.231666192439442e-05, + "loss": 1.2474, + "step": 6905 + }, + { + "epoch": 0.7634937296281973, + "grad_norm": 0.412109375, + "learning_rate": 3.217482101387381e-05, + "loss": 1.3326, + "step": 6910 + }, + { + "epoch": 0.7640461852936302, + "grad_norm": 0.462890625, + "learning_rate": 3.203323236335277e-05, + "loss": 1.2619, + "step": 6915 + }, + { + "epoch": 0.764598640959063, + "grad_norm": 0.400390625, + "learning_rate": 3.189189649943743e-05, + "loss": 1.3013, + "step": 6920 + }, + { + "epoch": 0.7651510966244959, + "grad_norm": 0.41015625, + "learning_rate": 3.17508139477937e-05, + "loss": 1.2109, + "step": 6925 + }, + { + "epoch": 0.7657035522899287, + "grad_norm": 0.419921875, + "learning_rate": 3.1609985233145334e-05, + "loss": 1.2275, + "step": 6930 + }, + { + "epoch": 0.7662560079553615, + "grad_norm": 0.412109375, + "learning_rate": 3.146941087927203e-05, + "loss": 1.3199, + "step": 6935 + }, + { + "epoch": 0.7668084636207945, + "grad_norm": 0.390625, + "learning_rate": 3.132909140900746e-05, + "loss": 1.2437, + "step": 6940 + }, + { + "epoch": 0.7673609192862273, + "grad_norm": 0.404296875, + "learning_rate": 3.118902734423731e-05, + "loss": 1.2415, + "step": 6945 + }, + { + "epoch": 0.7679133749516601, + "grad_norm": 0.4140625, + "learning_rate": 3.104921920589733e-05, + "loss": 1.2964, + "step": 6950 + }, + { + "epoch": 0.768465830617093, + "grad_norm": 0.390625, + "learning_rate": 3.0909667513971396e-05, + "loss": 1.2566, + "step": 6955 + }, + { + "epoch": 0.7690182862825258, + "grad_norm": 0.41796875, + "learning_rate": 3.077037278748965e-05, + "loss": 1.3033, + "step": 6960 + }, + { + "epoch": 0.7695707419479587, + "grad_norm": 0.41015625, + "learning_rate": 3.063133554452645e-05, + "loss": 1.212, + "step": 6965 + }, + { + "epoch": 0.7701231976133915, + "grad_norm": 0.41796875, + "learning_rate": 3.0492556302198526e-05, + "loss": 1.2162, + "step": 6970 + }, + { + "epoch": 0.7706756532788244, + "grad_norm": 0.423828125, + "learning_rate": 3.0354035576663043e-05, + "loss": 1.3013, + "step": 6975 + }, + { + "epoch": 0.7712281089442572, + "grad_norm": 0.400390625, + "learning_rate": 3.0215773883115706e-05, + "loss": 1.2962, + "step": 6980 + }, + { + "epoch": 0.77178056460969, + "grad_norm": 0.41796875, + "learning_rate": 3.0077771735788684e-05, + "loss": 1.2916, + "step": 6985 + }, + { + "epoch": 0.772333020275123, + "grad_norm": 0.3984375, + "learning_rate": 2.9940029647948963e-05, + "loss": 1.2644, + "step": 6990 + }, + { + "epoch": 0.7728854759405558, + "grad_norm": 0.419921875, + "learning_rate": 2.9802548131896236e-05, + "loss": 1.3068, + "step": 6995 + }, + { + "epoch": 0.7734379316059886, + "grad_norm": 0.400390625, + "learning_rate": 2.9665327698961077e-05, + "loss": 1.3421, + "step": 7000 + }, + { + "epoch": 0.7739903872714214, + "grad_norm": 0.404296875, + "learning_rate": 2.9528368859502996e-05, + "loss": 1.2997, + "step": 7005 + }, + { + "epoch": 0.7745428429368543, + "grad_norm": 0.38671875, + "learning_rate": 2.93916721229086e-05, + "loss": 1.1679, + "step": 7010 + }, + { + "epoch": 0.7750952986022872, + "grad_norm": 0.404296875, + "learning_rate": 2.9255237997589657e-05, + "loss": 1.2539, + "step": 7015 + }, + { + "epoch": 0.77564775426772, + "grad_norm": 0.416015625, + "learning_rate": 2.9119066990981193e-05, + "loss": 1.1873, + "step": 7020 + }, + { + "epoch": 0.7762002099331529, + "grad_norm": 0.431640625, + "learning_rate": 2.8983159609539635e-05, + "loss": 1.2057, + "step": 7025 + }, + { + "epoch": 0.7767526655985857, + "grad_norm": 0.4296875, + "learning_rate": 2.884751635874098e-05, + "loss": 1.2382, + "step": 7030 + }, + { + "epoch": 0.7773051212640185, + "grad_norm": 0.408203125, + "learning_rate": 2.8712137743078695e-05, + "loss": 1.1869, + "step": 7035 + }, + { + "epoch": 0.7778575769294515, + "grad_norm": 0.42578125, + "learning_rate": 2.8577024266062146e-05, + "loss": 1.2406, + "step": 7040 + }, + { + "epoch": 0.7784100325948843, + "grad_norm": 0.396484375, + "learning_rate": 2.844217643021454e-05, + "loss": 1.2458, + "step": 7045 + }, + { + "epoch": 0.7789624882603171, + "grad_norm": 0.421875, + "learning_rate": 2.8307594737071052e-05, + "loss": 1.2471, + "step": 7050 + }, + { + "epoch": 0.7795149439257499, + "grad_norm": 0.421875, + "learning_rate": 2.8173279687177057e-05, + "loss": 1.2669, + "step": 7055 + }, + { + "epoch": 0.7800673995911828, + "grad_norm": 0.416015625, + "learning_rate": 2.8039231780086183e-05, + "loss": 1.2655, + "step": 7060 + }, + { + "epoch": 0.7806198552566157, + "grad_norm": 0.388671875, + "learning_rate": 2.7905451514358472e-05, + "loss": 1.2019, + "step": 7065 + }, + { + "epoch": 0.7811723109220485, + "grad_norm": 0.412109375, + "learning_rate": 2.7771939387558554e-05, + "loss": 1.2506, + "step": 7070 + }, + { + "epoch": 0.7817247665874814, + "grad_norm": 0.40625, + "learning_rate": 2.7638695896253774e-05, + "loss": 1.2125, + "step": 7075 + }, + { + "epoch": 0.7822772222529142, + "grad_norm": 0.458984375, + "learning_rate": 2.7505721536012353e-05, + "loss": 1.2211, + "step": 7080 + }, + { + "epoch": 0.782829677918347, + "grad_norm": 0.39453125, + "learning_rate": 2.7373016801401576e-05, + "loss": 1.2609, + "step": 7085 + }, + { + "epoch": 0.78338213358378, + "grad_norm": 0.392578125, + "learning_rate": 2.7240582185985798e-05, + "loss": 1.3198, + "step": 7090 + }, + { + "epoch": 0.7839345892492128, + "grad_norm": 0.400390625, + "learning_rate": 2.7108418182324857e-05, + "loss": 1.2867, + "step": 7095 + }, + { + "epoch": 0.7844870449146456, + "grad_norm": 0.404296875, + "learning_rate": 2.6976525281972078e-05, + "loss": 1.2847, + "step": 7100 + }, + { + "epoch": 0.7850395005800784, + "grad_norm": 0.41015625, + "learning_rate": 2.68449039754724e-05, + "loss": 1.3226, + "step": 7105 + }, + { + "epoch": 0.7855919562455113, + "grad_norm": 0.38671875, + "learning_rate": 2.6713554752360802e-05, + "loss": 1.2922, + "step": 7110 + }, + { + "epoch": 0.7861444119109442, + "grad_norm": 0.396484375, + "learning_rate": 2.6582478101160167e-05, + "loss": 1.2294, + "step": 7115 + }, + { + "epoch": 0.786696867576377, + "grad_norm": 0.41015625, + "learning_rate": 2.6451674509379643e-05, + "loss": 1.2535, + "step": 7120 + }, + { + "epoch": 0.7872493232418099, + "grad_norm": 0.42578125, + "learning_rate": 2.632114446351286e-05, + "loss": 1.3219, + "step": 7125 + }, + { + "epoch": 0.7878017789072427, + "grad_norm": 0.423828125, + "learning_rate": 2.619088844903592e-05, + "loss": 1.2972, + "step": 7130 + }, + { + "epoch": 0.7883542345726755, + "grad_norm": 0.419921875, + "learning_rate": 2.606090695040586e-05, + "loss": 1.3204, + "step": 7135 + }, + { + "epoch": 0.7889066902381084, + "grad_norm": 0.419921875, + "learning_rate": 2.5931200451058678e-05, + "loss": 1.2019, + "step": 7140 + }, + { + "epoch": 0.7894591459035413, + "grad_norm": 0.41015625, + "learning_rate": 2.5801769433407565e-05, + "loss": 1.2714, + "step": 7145 + }, + { + "epoch": 0.7900116015689741, + "grad_norm": 0.39453125, + "learning_rate": 2.567261437884112e-05, + "loss": 1.1561, + "step": 7150 + }, + { + "epoch": 0.7905640572344069, + "grad_norm": 0.41015625, + "learning_rate": 2.5543735767721576e-05, + "loss": 1.2677, + "step": 7155 + }, + { + "epoch": 0.7911165128998398, + "grad_norm": 0.40234375, + "learning_rate": 2.5415134079383006e-05, + "loss": 1.2254, + "step": 7160 + }, + { + "epoch": 0.7916689685652727, + "grad_norm": 0.427734375, + "learning_rate": 2.5286809792129496e-05, + "loss": 1.2255, + "step": 7165 + }, + { + "epoch": 0.7922214242307055, + "grad_norm": 0.396484375, + "learning_rate": 2.5158763383233443e-05, + "loss": 1.2183, + "step": 7170 + }, + { + "epoch": 0.7927738798961383, + "grad_norm": 0.408203125, + "learning_rate": 2.5030995328933726e-05, + "loss": 1.2753, + "step": 7175 + }, + { + "epoch": 0.7933263355615712, + "grad_norm": 0.4296875, + "learning_rate": 2.490350610443396e-05, + "loss": 1.2647, + "step": 7180 + }, + { + "epoch": 0.793878791227004, + "grad_norm": 0.4296875, + "learning_rate": 2.477629618390066e-05, + "loss": 1.2645, + "step": 7185 + }, + { + "epoch": 0.7944312468924368, + "grad_norm": 0.4140625, + "learning_rate": 2.4649366040461597e-05, + "loss": 1.2241, + "step": 7190 + }, + { + "epoch": 0.7949837025578698, + "grad_norm": 0.40234375, + "learning_rate": 2.4522716146203974e-05, + "loss": 1.2617, + "step": 7195 + }, + { + "epoch": 0.7955361582233026, + "grad_norm": 0.40625, + "learning_rate": 2.4396346972172634e-05, + "loss": 1.1895, + "step": 7200 + }, + { + "epoch": 0.7960886138887354, + "grad_norm": 0.41015625, + "learning_rate": 2.4270258988368376e-05, + "loss": 1.2278, + "step": 7205 + }, + { + "epoch": 0.7966410695541682, + "grad_norm": 0.431640625, + "learning_rate": 2.4144452663746176e-05, + "loss": 1.2745, + "step": 7210 + }, + { + "epoch": 0.7971935252196011, + "grad_norm": 0.384765625, + "learning_rate": 2.401892846621344e-05, + "loss": 1.2702, + "step": 7215 + }, + { + "epoch": 0.797745980885034, + "grad_norm": 0.416015625, + "learning_rate": 2.3893686862628263e-05, + "loss": 1.1883, + "step": 7220 + }, + { + "epoch": 0.7982984365504668, + "grad_norm": 0.4140625, + "learning_rate": 2.3768728318797684e-05, + "loss": 1.3128, + "step": 7225 + }, + { + "epoch": 0.7988508922158997, + "grad_norm": 0.419921875, + "learning_rate": 2.364405329947603e-05, + "loss": 1.2534, + "step": 7230 + }, + { + "epoch": 0.7994033478813325, + "grad_norm": 0.41015625, + "learning_rate": 2.3519662268363006e-05, + "loss": 1.2878, + "step": 7235 + }, + { + "epoch": 0.7999558035467653, + "grad_norm": 0.41796875, + "learning_rate": 2.339555568810221e-05, + "loss": 1.176, + "step": 7240 + }, + { + "epoch": 0.8005082592121983, + "grad_norm": 0.408203125, + "learning_rate": 2.3271734020279225e-05, + "loss": 1.3506, + "step": 7245 + }, + { + "epoch": 0.8010607148776311, + "grad_norm": 0.4140625, + "learning_rate": 2.3148197725419983e-05, + "loss": 1.355, + "step": 7250 + }, + { + "epoch": 0.8016131705430639, + "grad_norm": 0.396484375, + "learning_rate": 2.3024947262989038e-05, + "loss": 1.2662, + "step": 7255 + }, + { + "epoch": 0.8021656262084967, + "grad_norm": 0.40234375, + "learning_rate": 2.2901983091387867e-05, + "loss": 1.2308, + "step": 7260 + }, + { + "epoch": 0.8027180818739296, + "grad_norm": 0.4453125, + "learning_rate": 2.2779305667953154e-05, + "loss": 1.263, + "step": 7265 + }, + { + "epoch": 0.8032705375393625, + "grad_norm": 0.38671875, + "learning_rate": 2.2656915448955053e-05, + "loss": 1.3141, + "step": 7270 + }, + { + "epoch": 0.8038229932047953, + "grad_norm": 0.40625, + "learning_rate": 2.253481288959558e-05, + "loss": 1.2824, + "step": 7275 + }, + { + "epoch": 0.8043754488702282, + "grad_norm": 0.39453125, + "learning_rate": 2.241299844400684e-05, + "loss": 1.2874, + "step": 7280 + }, + { + "epoch": 0.804927904535661, + "grad_norm": 0.416015625, + "learning_rate": 2.2291472565249384e-05, + "loss": 1.0976, + "step": 7285 + }, + { + "epoch": 0.8054803602010938, + "grad_norm": 0.419921875, + "learning_rate": 2.217023570531045e-05, + "loss": 1.2376, + "step": 7290 + }, + { + "epoch": 0.8060328158665268, + "grad_norm": 0.396484375, + "learning_rate": 2.2049288315102412e-05, + "loss": 1.2104, + "step": 7295 + }, + { + "epoch": 0.8065852715319596, + "grad_norm": 0.3828125, + "learning_rate": 2.1928630844460973e-05, + "loss": 1.2477, + "step": 7300 + }, + { + "epoch": 0.8071377271973924, + "grad_norm": 0.3828125, + "learning_rate": 2.1808263742143585e-05, + "loss": 1.2546, + "step": 7305 + }, + { + "epoch": 0.8076901828628252, + "grad_norm": 0.3984375, + "learning_rate": 2.1688187455827736e-05, + "loss": 1.2315, + "step": 7310 + }, + { + "epoch": 0.8082426385282581, + "grad_norm": 0.447265625, + "learning_rate": 2.1568402432109257e-05, + "loss": 1.3435, + "step": 7315 + }, + { + "epoch": 0.808795094193691, + "grad_norm": 0.41796875, + "learning_rate": 2.1448909116500747e-05, + "loss": 1.2878, + "step": 7320 + }, + { + "epoch": 0.8093475498591238, + "grad_norm": 0.40234375, + "learning_rate": 2.1329707953429822e-05, + "loss": 1.2644, + "step": 7325 + }, + { + "epoch": 0.8099000055245567, + "grad_norm": 0.400390625, + "learning_rate": 2.1210799386237535e-05, + "loss": 1.3027, + "step": 7330 + }, + { + "epoch": 0.8104524611899895, + "grad_norm": 0.427734375, + "learning_rate": 2.1092183857176683e-05, + "loss": 1.3232, + "step": 7335 + }, + { + "epoch": 0.8110049168554223, + "grad_norm": 0.412109375, + "learning_rate": 2.097386180741019e-05, + "loss": 1.3211, + "step": 7340 + }, + { + "epoch": 0.8115573725208552, + "grad_norm": 0.443359375, + "learning_rate": 2.0855833677009384e-05, + "loss": 1.2968, + "step": 7345 + }, + { + "epoch": 0.8121098281862881, + "grad_norm": 0.40234375, + "learning_rate": 2.0738099904952512e-05, + "loss": 1.2396, + "step": 7350 + }, + { + "epoch": 0.8126622838517209, + "grad_norm": 0.404296875, + "learning_rate": 2.0620660929123004e-05, + "loss": 1.334, + "step": 7355 + }, + { + "epoch": 0.8132147395171537, + "grad_norm": 0.38671875, + "learning_rate": 2.0503517186307842e-05, + "loss": 1.2061, + "step": 7360 + }, + { + "epoch": 0.8137671951825866, + "grad_norm": 0.3828125, + "learning_rate": 2.0386669112195976e-05, + "loss": 1.1548, + "step": 7365 + }, + { + "epoch": 0.8143196508480195, + "grad_norm": 0.4140625, + "learning_rate": 2.0270117141376664e-05, + "loss": 1.2279, + "step": 7370 + }, + { + "epoch": 0.8148721065134523, + "grad_norm": 0.408203125, + "learning_rate": 2.0153861707337906e-05, + "loss": 1.2821, + "step": 7375 + }, + { + "epoch": 0.8154245621788851, + "grad_norm": 0.40625, + "learning_rate": 2.0037903242464785e-05, + "loss": 1.2913, + "step": 7380 + }, + { + "epoch": 0.815977017844318, + "grad_norm": 0.40234375, + "learning_rate": 1.9922242178037864e-05, + "loss": 1.1988, + "step": 7385 + }, + { + "epoch": 0.8165294735097508, + "grad_norm": 0.3828125, + "learning_rate": 1.9806878944231643e-05, + "loss": 1.3184, + "step": 7390 + }, + { + "epoch": 0.8170819291751837, + "grad_norm": 0.412109375, + "learning_rate": 1.9691813970112827e-05, + "loss": 1.2636, + "step": 7395 + }, + { + "epoch": 0.8176343848406166, + "grad_norm": 0.4375, + "learning_rate": 1.9577047683638873e-05, + "loss": 1.265, + "step": 7400 + }, + { + "epoch": 0.8181868405060494, + "grad_norm": 0.4375, + "learning_rate": 1.9462580511656338e-05, + "loss": 1.3265, + "step": 7405 + }, + { + "epoch": 0.8187392961714822, + "grad_norm": 0.3828125, + "learning_rate": 1.934841287989928e-05, + "loss": 1.2263, + "step": 7410 + }, + { + "epoch": 0.819291751836915, + "grad_norm": 0.42578125, + "learning_rate": 1.9234545212987688e-05, + "loss": 1.2812, + "step": 7415 + }, + { + "epoch": 0.819844207502348, + "grad_norm": 0.400390625, + "learning_rate": 1.91209779344259e-05, + "loss": 1.2624, + "step": 7420 + }, + { + "epoch": 0.8203966631677808, + "grad_norm": 0.361328125, + "learning_rate": 1.900771146660103e-05, + "loss": 1.2781, + "step": 7425 + }, + { + "epoch": 0.8209491188332136, + "grad_norm": 0.41015625, + "learning_rate": 1.88947462307814e-05, + "loss": 1.3008, + "step": 7430 + }, + { + "epoch": 0.8215015744986465, + "grad_norm": 0.38671875, + "learning_rate": 1.8782082647114962e-05, + "loss": 1.2242, + "step": 7435 + }, + { + "epoch": 0.8220540301640793, + "grad_norm": 0.40625, + "learning_rate": 1.8669721134627748e-05, + "loss": 1.2445, + "step": 7440 + }, + { + "epoch": 0.8226064858295122, + "grad_norm": 0.40625, + "learning_rate": 1.855766211122234e-05, + "loss": 1.194, + "step": 7445 + }, + { + "epoch": 0.8231589414949451, + "grad_norm": 0.43359375, + "learning_rate": 1.8445905993676183e-05, + "loss": 1.3307, + "step": 7450 + }, + { + "epoch": 0.8237113971603779, + "grad_norm": 0.41796875, + "learning_rate": 1.8334453197640224e-05, + "loss": 1.2829, + "step": 7455 + }, + { + "epoch": 0.8242638528258107, + "grad_norm": 0.41796875, + "learning_rate": 1.8223304137637243e-05, + "loss": 1.2545, + "step": 7460 + }, + { + "epoch": 0.8248163084912435, + "grad_norm": 0.40234375, + "learning_rate": 1.8112459227060386e-05, + "loss": 1.2735, + "step": 7465 + }, + { + "epoch": 0.8253687641566765, + "grad_norm": 0.390625, + "learning_rate": 1.8001918878171532e-05, + "loss": 1.2054, + "step": 7470 + }, + { + "epoch": 0.8259212198221093, + "grad_norm": 0.3828125, + "learning_rate": 1.789168350209983e-05, + "loss": 1.2966, + "step": 7475 + }, + { + "epoch": 0.8264736754875421, + "grad_norm": 0.412109375, + "learning_rate": 1.778175350884016e-05, + "loss": 1.321, + "step": 7480 + }, + { + "epoch": 0.827026131152975, + "grad_norm": 0.4140625, + "learning_rate": 1.767212930725163e-05, + "loss": 1.249, + "step": 7485 + }, + { + "epoch": 0.8275785868184078, + "grad_norm": 0.390625, + "learning_rate": 1.756281130505595e-05, + "loss": 1.2322, + "step": 7490 + }, + { + "epoch": 0.8281310424838406, + "grad_norm": 0.447265625, + "learning_rate": 1.745379990883603e-05, + "loss": 1.2847, + "step": 7495 + }, + { + "epoch": 0.8286834981492736, + "grad_norm": 0.412109375, + "learning_rate": 1.7345095524034484e-05, + "loss": 1.2884, + "step": 7500 + }, + { + "epoch": 0.8292359538147064, + "grad_norm": 0.3671875, + "learning_rate": 1.723669855495199e-05, + "loss": 1.2099, + "step": 7505 + }, + { + "epoch": 0.8297884094801392, + "grad_norm": 0.390625, + "learning_rate": 1.712860940474591e-05, + "loss": 1.2257, + "step": 7510 + }, + { + "epoch": 0.830340865145572, + "grad_norm": 0.396484375, + "learning_rate": 1.702082847542873e-05, + "loss": 1.2523, + "step": 7515 + }, + { + "epoch": 0.8308933208110049, + "grad_norm": 0.400390625, + "learning_rate": 1.6913356167866578e-05, + "loss": 1.2422, + "step": 7520 + }, + { + "epoch": 0.8314457764764378, + "grad_norm": 0.41796875, + "learning_rate": 1.680619288177775e-05, + "loss": 1.2598, + "step": 7525 + }, + { + "epoch": 0.8319982321418706, + "grad_norm": 0.39453125, + "learning_rate": 1.6699339015731185e-05, + "loss": 1.2946, + "step": 7530 + }, + { + "epoch": 0.8325506878073035, + "grad_norm": 0.390625, + "learning_rate": 1.659279496714503e-05, + "loss": 1.2523, + "step": 7535 + }, + { + "epoch": 0.8331031434727363, + "grad_norm": 0.42578125, + "learning_rate": 1.648656113228515e-05, + "loss": 1.3067, + "step": 7540 + }, + { + "epoch": 0.8336555991381691, + "grad_norm": 0.40234375, + "learning_rate": 1.6380637906263574e-05, + "loss": 1.2586, + "step": 7545 + }, + { + "epoch": 0.834208054803602, + "grad_norm": 0.443359375, + "learning_rate": 1.6275025683037148e-05, + "loss": 1.2271, + "step": 7550 + }, + { + "epoch": 0.8347605104690349, + "grad_norm": 0.412109375, + "learning_rate": 1.616972485540601e-05, + "loss": 1.3128, + "step": 7555 + }, + { + "epoch": 0.8353129661344677, + "grad_norm": 0.439453125, + "learning_rate": 1.6064735815012145e-05, + "loss": 1.3026, + "step": 7560 + }, + { + "epoch": 0.8358654217999005, + "grad_norm": 0.408203125, + "learning_rate": 1.5960058952337887e-05, + "loss": 1.2414, + "step": 7565 + }, + { + "epoch": 0.8364178774653334, + "grad_norm": 0.400390625, + "learning_rate": 1.585569465670451e-05, + "loss": 1.324, + "step": 7570 + }, + { + "epoch": 0.8369703331307663, + "grad_norm": 0.39453125, + "learning_rate": 1.575164331627079e-05, + "loss": 1.2407, + "step": 7575 + }, + { + "epoch": 0.8375227887961991, + "grad_norm": 0.400390625, + "learning_rate": 1.5647905318031507e-05, + "loss": 1.2253, + "step": 7580 + }, + { + "epoch": 0.838075244461632, + "grad_norm": 0.404296875, + "learning_rate": 1.554448104781606e-05, + "loss": 1.3098, + "step": 7585 + }, + { + "epoch": 0.8386277001270648, + "grad_norm": 0.419921875, + "learning_rate": 1.5441370890287022e-05, + "loss": 1.223, + "step": 7590 + }, + { + "epoch": 0.8391801557924976, + "grad_norm": 0.412109375, + "learning_rate": 1.5338575228938614e-05, + "loss": 1.2348, + "step": 7595 + }, + { + "epoch": 0.8397326114579305, + "grad_norm": 0.427734375, + "learning_rate": 1.523609444609545e-05, + "loss": 1.3541, + "step": 7600 + }, + { + "epoch": 0.8402850671233634, + "grad_norm": 0.376953125, + "learning_rate": 1.5133928922911012e-05, + "loss": 1.1748, + "step": 7605 + }, + { + "epoch": 0.8408375227887962, + "grad_norm": 0.439453125, + "learning_rate": 1.5032079039366209e-05, + "loss": 1.3035, + "step": 7610 + }, + { + "epoch": 0.841389978454229, + "grad_norm": 0.41015625, + "learning_rate": 1.4930545174268062e-05, + "loss": 1.3263, + "step": 7615 + }, + { + "epoch": 0.8419424341196619, + "grad_norm": 0.392578125, + "learning_rate": 1.4829327705248164e-05, + "loss": 1.2054, + "step": 7620 + }, + { + "epoch": 0.8424948897850948, + "grad_norm": 0.42578125, + "learning_rate": 1.4728427008761402e-05, + "loss": 1.3129, + "step": 7625 + }, + { + "epoch": 0.8430473454505276, + "grad_norm": 0.416015625, + "learning_rate": 1.4627843460084478e-05, + "loss": 1.345, + "step": 7630 + }, + { + "epoch": 0.8435998011159604, + "grad_norm": 0.39453125, + "learning_rate": 1.4527577433314532e-05, + "loss": 1.2276, + "step": 7635 + }, + { + "epoch": 0.8441522567813933, + "grad_norm": 0.392578125, + "learning_rate": 1.4427629301367773e-05, + "loss": 1.296, + "step": 7640 + }, + { + "epoch": 0.8447047124468261, + "grad_norm": 0.388671875, + "learning_rate": 1.4327999435978068e-05, + "loss": 1.2482, + "step": 7645 + }, + { + "epoch": 0.845257168112259, + "grad_norm": 0.40625, + "learning_rate": 1.422868820769554e-05, + "loss": 1.2892, + "step": 7650 + }, + { + "epoch": 0.8458096237776919, + "grad_norm": 0.400390625, + "learning_rate": 1.4129695985885228e-05, + "loss": 1.2172, + "step": 7655 + }, + { + "epoch": 0.8463620794431247, + "grad_norm": 0.404296875, + "learning_rate": 1.403102313872573e-05, + "loss": 1.2775, + "step": 7660 + }, + { + "epoch": 0.8469145351085575, + "grad_norm": 0.412109375, + "learning_rate": 1.3932670033207784e-05, + "loss": 1.2685, + "step": 7665 + }, + { + "epoch": 0.8474669907739903, + "grad_norm": 0.41796875, + "learning_rate": 1.3834637035132903e-05, + "loss": 1.3008, + "step": 7670 + }, + { + "epoch": 0.8480194464394233, + "grad_norm": 0.40234375, + "learning_rate": 1.373692450911207e-05, + "loss": 1.2244, + "step": 7675 + }, + { + "epoch": 0.8485719021048561, + "grad_norm": 0.40234375, + "learning_rate": 1.3639532818564327e-05, + "loss": 1.1604, + "step": 7680 + }, + { + "epoch": 0.8491243577702889, + "grad_norm": 0.419921875, + "learning_rate": 1.3542462325715443e-05, + "loss": 1.2167, + "step": 7685 + }, + { + "epoch": 0.8496768134357218, + "grad_norm": 0.419921875, + "learning_rate": 1.344571339159657e-05, + "loss": 1.2944, + "step": 7690 + }, + { + "epoch": 0.8502292691011546, + "grad_norm": 0.41015625, + "learning_rate": 1.3349286376042914e-05, + "loss": 1.2022, + "step": 7695 + }, + { + "epoch": 0.8507817247665875, + "grad_norm": 0.390625, + "learning_rate": 1.3253181637692324e-05, + "loss": 1.2281, + "step": 7700 + }, + { + "epoch": 0.8513341804320204, + "grad_norm": 0.419921875, + "learning_rate": 1.3157399533984082e-05, + "loss": 1.2718, + "step": 7705 + }, + { + "epoch": 0.8518866360974532, + "grad_norm": 0.408203125, + "learning_rate": 1.3061940421157459e-05, + "loss": 1.1907, + "step": 7710 + }, + { + "epoch": 0.852439091762886, + "grad_norm": 0.427734375, + "learning_rate": 1.2966804654250465e-05, + "loss": 1.1876, + "step": 7715 + }, + { + "epoch": 0.8529915474283188, + "grad_norm": 0.3984375, + "learning_rate": 1.287199258709848e-05, + "loss": 1.2103, + "step": 7720 + }, + { + "epoch": 0.8535440030937518, + "grad_norm": 0.427734375, + "learning_rate": 1.2777504572332976e-05, + "loss": 1.2198, + "step": 7725 + }, + { + "epoch": 0.8540964587591846, + "grad_norm": 0.400390625, + "learning_rate": 1.2683340961380163e-05, + "loss": 1.2929, + "step": 7730 + }, + { + "epoch": 0.8546489144246174, + "grad_norm": 0.384765625, + "learning_rate": 1.2589502104459738e-05, + "loss": 1.2406, + "step": 7735 + }, + { + "epoch": 0.8552013700900503, + "grad_norm": 0.390625, + "learning_rate": 1.249598835058352e-05, + "loss": 1.2474, + "step": 7740 + }, + { + "epoch": 0.8557538257554831, + "grad_norm": 0.423828125, + "learning_rate": 1.2402800047554208e-05, + "loss": 1.3239, + "step": 7745 + }, + { + "epoch": 0.856306281420916, + "grad_norm": 0.421875, + "learning_rate": 1.2309937541964057e-05, + "loss": 1.2417, + "step": 7750 + }, + { + "epoch": 0.8568587370863489, + "grad_norm": 0.41015625, + "learning_rate": 1.2217401179193556e-05, + "loss": 1.2904, + "step": 7755 + }, + { + "epoch": 0.8574111927517817, + "grad_norm": 0.419921875, + "learning_rate": 1.2125191303410221e-05, + "loss": 1.2472, + "step": 7760 + }, + { + "epoch": 0.8579636484172145, + "grad_norm": 0.404296875, + "learning_rate": 1.2033308257567289e-05, + "loss": 1.2792, + "step": 7765 + }, + { + "epoch": 0.8585161040826473, + "grad_norm": 0.38671875, + "learning_rate": 1.1941752383402394e-05, + "loss": 1.2666, + "step": 7770 + }, + { + "epoch": 0.8590685597480802, + "grad_norm": 0.37890625, + "learning_rate": 1.1850524021436337e-05, + "loss": 1.2452, + "step": 7775 + }, + { + "epoch": 0.8596210154135131, + "grad_norm": 0.400390625, + "learning_rate": 1.175962351097184e-05, + "loss": 1.2691, + "step": 7780 + }, + { + "epoch": 0.8601734710789459, + "grad_norm": 0.400390625, + "learning_rate": 1.166905119009223e-05, + "loss": 1.2344, + "step": 7785 + }, + { + "epoch": 0.8607259267443788, + "grad_norm": 0.41796875, + "learning_rate": 1.1578807395660207e-05, + "loss": 1.3321, + "step": 7790 + }, + { + "epoch": 0.8612783824098116, + "grad_norm": 0.39453125, + "learning_rate": 1.1488892463316615e-05, + "loss": 1.2145, + "step": 7795 + }, + { + "epoch": 0.8618308380752444, + "grad_norm": 0.435546875, + "learning_rate": 1.1399306727479164e-05, + "loss": 1.205, + "step": 7800 + }, + { + "epoch": 0.8623832937406773, + "grad_norm": 0.408203125, + "learning_rate": 1.1310050521341198e-05, + "loss": 1.3174, + "step": 7805 + }, + { + "epoch": 0.8629357494061102, + "grad_norm": 0.390625, + "learning_rate": 1.1221124176870412e-05, + "loss": 1.2388, + "step": 7810 + }, + { + "epoch": 0.863488205071543, + "grad_norm": 0.40234375, + "learning_rate": 1.1132528024807686e-05, + "loss": 1.2616, + "step": 7815 + }, + { + "epoch": 0.8640406607369758, + "grad_norm": 0.392578125, + "learning_rate": 1.1044262394665872e-05, + "loss": 1.3381, + "step": 7820 + }, + { + "epoch": 0.8645931164024087, + "grad_norm": 0.40234375, + "learning_rate": 1.0956327614728457e-05, + "loss": 1.2389, + "step": 7825 + }, + { + "epoch": 0.8651455720678416, + "grad_norm": 0.412109375, + "learning_rate": 1.0868724012048438e-05, + "loss": 1.2653, + "step": 7830 + }, + { + "epoch": 0.8656980277332744, + "grad_norm": 0.427734375, + "learning_rate": 1.078145191244706e-05, + "loss": 1.2068, + "step": 7835 + }, + { + "epoch": 0.8662504833987072, + "grad_norm": 0.400390625, + "learning_rate": 1.069451164051264e-05, + "loss": 1.2396, + "step": 7840 + }, + { + "epoch": 0.8668029390641401, + "grad_norm": 0.38671875, + "learning_rate": 1.0607903519599328e-05, + "loss": 1.2372, + "step": 7845 + }, + { + "epoch": 0.8673553947295729, + "grad_norm": 0.390625, + "learning_rate": 1.052162787182588e-05, + "loss": 1.2051, + "step": 7850 + }, + { + "epoch": 0.8679078503950058, + "grad_norm": 0.40625, + "learning_rate": 1.0435685018074548e-05, + "loss": 1.3016, + "step": 7855 + }, + { + "epoch": 0.8684603060604387, + "grad_norm": 0.42578125, + "learning_rate": 1.0350075277989812e-05, + "loss": 1.2742, + "step": 7860 + }, + { + "epoch": 0.8690127617258715, + "grad_norm": 0.396484375, + "learning_rate": 1.026479896997723e-05, + "loss": 1.2405, + "step": 7865 + }, + { + "epoch": 0.8695652173913043, + "grad_norm": 0.4296875, + "learning_rate": 1.0179856411202204e-05, + "loss": 1.3052, + "step": 7870 + }, + { + "epoch": 0.8701176730567372, + "grad_norm": 0.390625, + "learning_rate": 1.0095247917588869e-05, + "loss": 1.3322, + "step": 7875 + }, + { + "epoch": 0.8706701287221701, + "grad_norm": 0.3984375, + "learning_rate": 1.0010973803818857e-05, + "loss": 1.215, + "step": 7880 + }, + { + "epoch": 0.8712225843876029, + "grad_norm": 0.38671875, + "learning_rate": 9.927034383330159e-06, + "loss": 1.1596, + "step": 7885 + }, + { + "epoch": 0.8717750400530357, + "grad_norm": 0.392578125, + "learning_rate": 9.843429968315943e-06, + "loss": 1.2819, + "step": 7890 + }, + { + "epoch": 0.8723274957184686, + "grad_norm": 0.435546875, + "learning_rate": 9.760160869723456e-06, + "loss": 1.3613, + "step": 7895 + }, + { + "epoch": 0.8728799513839014, + "grad_norm": 0.400390625, + "learning_rate": 9.677227397252708e-06, + "loss": 1.293, + "step": 7900 + }, + { + "epoch": 0.8734324070493343, + "grad_norm": 0.404296875, + "learning_rate": 9.594629859355519e-06, + "loss": 1.2734, + "step": 7905 + }, + { + "epoch": 0.8739848627147672, + "grad_norm": 0.404296875, + "learning_rate": 9.512368563234241e-06, + "loss": 1.2817, + "step": 7910 + }, + { + "epoch": 0.8745373183802, + "grad_norm": 0.3984375, + "learning_rate": 9.430443814840662e-06, + "loss": 1.2749, + "step": 7915 + }, + { + "epoch": 0.8750897740456328, + "grad_norm": 0.384765625, + "learning_rate": 9.348855918874844e-06, + "loss": 1.2575, + "step": 7920 + }, + { + "epoch": 0.8756422297110656, + "grad_norm": 0.39453125, + "learning_rate": 9.267605178784033e-06, + "loss": 1.2978, + "step": 7925 + }, + { + "epoch": 0.8761946853764986, + "grad_norm": 0.41015625, + "learning_rate": 9.186691896761479e-06, + "loss": 1.1692, + "step": 7930 + }, + { + "epoch": 0.8767471410419314, + "grad_norm": 0.3828125, + "learning_rate": 9.106116373745332e-06, + "loss": 1.3137, + "step": 7935 + }, + { + "epoch": 0.8772995967073642, + "grad_norm": 0.42578125, + "learning_rate": 9.025878909417552e-06, + "loss": 1.2491, + "step": 7940 + }, + { + "epoch": 0.8778520523727971, + "grad_norm": 0.40625, + "learning_rate": 8.94597980220273e-06, + "loss": 1.2099, + "step": 7945 + }, + { + "epoch": 0.8784045080382299, + "grad_norm": 0.396484375, + "learning_rate": 8.866419349267064e-06, + "loss": 1.2653, + "step": 7950 + }, + { + "epoch": 0.8789569637036628, + "grad_norm": 0.4140625, + "learning_rate": 8.787197846517148e-06, + "loss": 1.255, + "step": 7955 + }, + { + "epoch": 0.8795094193690957, + "grad_norm": 0.388671875, + "learning_rate": 8.70831558859897e-06, + "loss": 1.2871, + "step": 7960 + }, + { + "epoch": 0.8800618750345285, + "grad_norm": 0.4296875, + "learning_rate": 8.629772868896779e-06, + "loss": 1.2383, + "step": 7965 + }, + { + "epoch": 0.8806143306999613, + "grad_norm": 0.41015625, + "learning_rate": 8.55156997953197e-06, + "loss": 1.2273, + "step": 7970 + }, + { + "epoch": 0.8811667863653941, + "grad_norm": 0.39453125, + "learning_rate": 8.473707211362026e-06, + "loss": 1.2977, + "step": 7975 + }, + { + "epoch": 0.8817192420308271, + "grad_norm": 0.4296875, + "learning_rate": 8.396184853979416e-06, + "loss": 1.2394, + "step": 7980 + }, + { + "epoch": 0.8822716976962599, + "grad_norm": 0.396484375, + "learning_rate": 8.319003195710574e-06, + "loss": 1.3483, + "step": 7985 + }, + { + "epoch": 0.8828241533616927, + "grad_norm": 0.40234375, + "learning_rate": 8.242162523614716e-06, + "loss": 1.2118, + "step": 7990 + }, + { + "epoch": 0.8833766090271256, + "grad_norm": 0.38671875, + "learning_rate": 8.165663123482903e-06, + "loss": 1.2282, + "step": 7995 + }, + { + "epoch": 0.8839290646925584, + "grad_norm": 0.39453125, + "learning_rate": 8.089505279836873e-06, + "loss": 1.2785, + "step": 8000 + }, + { + "epoch": 0.8844815203579913, + "grad_norm": 0.412109375, + "learning_rate": 8.013689275928037e-06, + "loss": 1.24, + "step": 8005 + }, + { + "epoch": 0.8850339760234242, + "grad_norm": 0.423828125, + "learning_rate": 7.938215393736414e-06, + "loss": 1.2231, + "step": 8010 + }, + { + "epoch": 0.885586431688857, + "grad_norm": 0.421875, + "learning_rate": 7.86308391396956e-06, + "loss": 1.2741, + "step": 8015 + }, + { + "epoch": 0.8861388873542898, + "grad_norm": 0.3828125, + "learning_rate": 7.788295116061584e-06, + "loss": 1.1967, + "step": 8020 + }, + { + "epoch": 0.8866913430197226, + "grad_norm": 0.416015625, + "learning_rate": 7.713849278172047e-06, + "loss": 1.2404, + "step": 8025 + }, + { + "epoch": 0.8872437986851556, + "grad_norm": 0.419921875, + "learning_rate": 7.639746677184945e-06, + "loss": 1.317, + "step": 8030 + }, + { + "epoch": 0.8877962543505884, + "grad_norm": 0.40625, + "learning_rate": 7.5659875887076905e-06, + "loss": 1.2615, + "step": 8035 + }, + { + "epoch": 0.8883487100160212, + "grad_norm": 0.435546875, + "learning_rate": 7.492572287070088e-06, + "loss": 1.2847, + "step": 8040 + }, + { + "epoch": 0.888901165681454, + "grad_norm": 0.40234375, + "learning_rate": 7.419501045323296e-06, + "loss": 1.3259, + "step": 8045 + }, + { + "epoch": 0.8894536213468869, + "grad_norm": 0.408203125, + "learning_rate": 7.346774135238832e-06, + "loss": 1.2395, + "step": 8050 + }, + { + "epoch": 0.8900060770123197, + "grad_norm": 0.388671875, + "learning_rate": 7.274391827307547e-06, + "loss": 1.284, + "step": 8055 + }, + { + "epoch": 0.8905585326777526, + "grad_norm": 0.359375, + "learning_rate": 7.202354390738608e-06, + "loss": 1.2127, + "step": 8060 + }, + { + "epoch": 0.8911109883431855, + "grad_norm": 0.408203125, + "learning_rate": 7.130662093458529e-06, + "loss": 1.3602, + "step": 8065 + }, + { + "epoch": 0.8916634440086183, + "grad_norm": 0.3828125, + "learning_rate": 7.059315202110173e-06, + "loss": 1.3355, + "step": 8070 + }, + { + "epoch": 0.8922158996740511, + "grad_norm": 0.412109375, + "learning_rate": 6.98831398205172e-06, + "loss": 1.3255, + "step": 8075 + }, + { + "epoch": 0.892768355339484, + "grad_norm": 0.423828125, + "learning_rate": 6.917658697355722e-06, + "loss": 1.2978, + "step": 8080 + }, + { + "epoch": 0.8933208110049169, + "grad_norm": 0.396484375, + "learning_rate": 6.8473496108080845e-06, + "loss": 1.1923, + "step": 8085 + }, + { + "epoch": 0.8938732666703497, + "grad_norm": 0.39453125, + "learning_rate": 6.777386983907152e-06, + "loss": 1.2953, + "step": 8090 + }, + { + "epoch": 0.8944257223357825, + "grad_norm": 0.41796875, + "learning_rate": 6.7077710768626455e-06, + "loss": 1.3616, + "step": 8095 + }, + { + "epoch": 0.8949781780012154, + "grad_norm": 0.404296875, + "learning_rate": 6.638502148594772e-06, + "loss": 1.2919, + "step": 8100 + }, + { + "epoch": 0.8955306336666482, + "grad_norm": 0.380859375, + "learning_rate": 6.5695804567332044e-06, + "loss": 1.2795, + "step": 8105 + }, + { + "epoch": 0.8960830893320811, + "grad_norm": 0.41796875, + "learning_rate": 6.501006257616205e-06, + "loss": 1.2792, + "step": 8110 + }, + { + "epoch": 0.896635544997514, + "grad_norm": 0.439453125, + "learning_rate": 6.432779806289535e-06, + "loss": 1.3561, + "step": 8115 + }, + { + "epoch": 0.8971880006629468, + "grad_norm": 0.412109375, + "learning_rate": 6.364901356505648e-06, + "loss": 1.2155, + "step": 8120 + }, + { + "epoch": 0.8977404563283796, + "grad_norm": 0.416015625, + "learning_rate": 6.297371160722676e-06, + "loss": 1.2476, + "step": 8125 + }, + { + "epoch": 0.8982929119938124, + "grad_norm": 0.404296875, + "learning_rate": 6.230189470103498e-06, + "loss": 1.2504, + "step": 8130 + }, + { + "epoch": 0.8988453676592454, + "grad_norm": 0.40234375, + "learning_rate": 6.163356534514808e-06, + "loss": 1.2738, + "step": 8135 + }, + { + "epoch": 0.8993978233246782, + "grad_norm": 0.412109375, + "learning_rate": 6.096872602526182e-06, + "loss": 1.3324, + "step": 8140 + }, + { + "epoch": 0.899950278990111, + "grad_norm": 0.404296875, + "learning_rate": 6.030737921409169e-06, + "loss": 1.2297, + "step": 8145 + }, + { + "epoch": 0.9005027346555439, + "grad_norm": 0.404296875, + "learning_rate": 5.964952737136353e-06, + "loss": 1.2203, + "step": 8150 + }, + { + "epoch": 0.9010551903209767, + "grad_norm": 0.419921875, + "learning_rate": 5.899517294380441e-06, + "loss": 1.2453, + "step": 8155 + }, + { + "epoch": 0.9016076459864096, + "grad_norm": 0.412109375, + "learning_rate": 5.834431836513388e-06, + "loss": 1.3345, + "step": 8160 + }, + { + "epoch": 0.9021601016518425, + "grad_norm": 0.431640625, + "learning_rate": 5.769696605605379e-06, + "loss": 1.2634, + "step": 8165 + }, + { + "epoch": 0.9027125573172753, + "grad_norm": 0.396484375, + "learning_rate": 5.705311842424133e-06, + "loss": 1.2929, + "step": 8170 + }, + { + "epoch": 0.9032650129827081, + "grad_norm": 0.408203125, + "learning_rate": 5.641277786433796e-06, + "loss": 1.2304, + "step": 8175 + }, + { + "epoch": 0.9038174686481409, + "grad_norm": 0.4140625, + "learning_rate": 5.577594675794162e-06, + "loss": 1.3047, + "step": 8180 + }, + { + "epoch": 0.9043699243135739, + "grad_norm": 0.3984375, + "learning_rate": 5.514262747359778e-06, + "loss": 1.2051, + "step": 8185 + }, + { + "epoch": 0.9049223799790067, + "grad_norm": 0.412109375, + "learning_rate": 5.451282236679045e-06, + "loss": 1.2079, + "step": 8190 + }, + { + "epoch": 0.9054748356444395, + "grad_norm": 0.45703125, + "learning_rate": 5.388653377993324e-06, + "loss": 1.2679, + "step": 8195 + }, + { + "epoch": 0.9060272913098724, + "grad_norm": 0.412109375, + "learning_rate": 5.326376404236133e-06, + "loss": 1.3061, + "step": 8200 + }, + { + "epoch": 0.9065797469753052, + "grad_norm": 0.400390625, + "learning_rate": 5.264451547032212e-06, + "loss": 1.1406, + "step": 8205 + }, + { + "epoch": 0.9071322026407381, + "grad_norm": 0.41015625, + "learning_rate": 5.202879036696662e-06, + "loss": 1.2934, + "step": 8210 + }, + { + "epoch": 0.907684658306171, + "grad_norm": 0.427734375, + "learning_rate": 5.141659102234131e-06, + "loss": 1.2391, + "step": 8215 + }, + { + "epoch": 0.9082371139716038, + "grad_norm": 0.40234375, + "learning_rate": 5.080791971337972e-06, + "loss": 1.2339, + "step": 8220 + }, + { + "epoch": 0.9087895696370366, + "grad_norm": 0.404296875, + "learning_rate": 5.020277870389312e-06, + "loss": 1.2473, + "step": 8225 + }, + { + "epoch": 0.9093420253024694, + "grad_norm": 0.427734375, + "learning_rate": 4.960117024456323e-06, + "loss": 1.2827, + "step": 8230 + }, + { + "epoch": 0.9098944809679024, + "grad_norm": 0.40234375, + "learning_rate": 4.9003096572932785e-06, + "loss": 1.2659, + "step": 8235 + }, + { + "epoch": 0.9104469366333352, + "grad_norm": 0.40234375, + "learning_rate": 4.840855991339799e-06, + "loss": 1.155, + "step": 8240 + }, + { + "epoch": 0.910999392298768, + "grad_norm": 0.396484375, + "learning_rate": 4.781756247719982e-06, + "loss": 1.3278, + "step": 8245 + }, + { + "epoch": 0.9115518479642009, + "grad_norm": 0.375, + "learning_rate": 4.7230106462415876e-06, + "loss": 1.3132, + "step": 8250 + }, + { + "epoch": 0.9121043036296337, + "grad_norm": 0.4140625, + "learning_rate": 4.6646194053952656e-06, + "loss": 1.3053, + "step": 8255 + }, + { + "epoch": 0.9126567592950666, + "grad_norm": 0.408203125, + "learning_rate": 4.6065827423536375e-06, + "loss": 1.3756, + "step": 8260 + }, + { + "epoch": 0.9132092149604994, + "grad_norm": 0.400390625, + "learning_rate": 4.548900872970607e-06, + "loss": 1.299, + "step": 8265 + }, + { + "epoch": 0.9137616706259323, + "grad_norm": 0.43359375, + "learning_rate": 4.491574011780497e-06, + "loss": 1.3044, + "step": 8270 + }, + { + "epoch": 0.9143141262913651, + "grad_norm": 0.4375, + "learning_rate": 4.434602371997243e-06, + "loss": 1.2712, + "step": 8275 + }, + { + "epoch": 0.9148665819567979, + "grad_norm": 0.40625, + "learning_rate": 4.3779861655136255e-06, + "loss": 1.2805, + "step": 8280 + }, + { + "epoch": 0.9154190376222309, + "grad_norm": 0.40625, + "learning_rate": 4.321725602900473e-06, + "loss": 1.3191, + "step": 8285 + }, + { + "epoch": 0.9159714932876637, + "grad_norm": 0.408203125, + "learning_rate": 4.265820893405892e-06, + "loss": 1.2231, + "step": 8290 + }, + { + "epoch": 0.9165239489530965, + "grad_norm": 0.39453125, + "learning_rate": 4.210272244954449e-06, + "loss": 1.3134, + "step": 8295 + }, + { + "epoch": 0.9170764046185294, + "grad_norm": 0.392578125, + "learning_rate": 4.1550798641464605e-06, + "loss": 1.2432, + "step": 8300 + }, + { + "epoch": 0.9176288602839622, + "grad_norm": 0.421875, + "learning_rate": 4.100243956257144e-06, + "loss": 1.2339, + "step": 8305 + }, + { + "epoch": 0.9181813159493951, + "grad_norm": 0.4140625, + "learning_rate": 4.045764725235956e-06, + "loss": 1.2767, + "step": 8310 + }, + { + "epoch": 0.9187337716148279, + "grad_norm": 0.38671875, + "learning_rate": 3.991642373705695e-06, + "loss": 1.2005, + "step": 8315 + }, + { + "epoch": 0.9192862272802608, + "grad_norm": 0.404296875, + "learning_rate": 3.937877102961918e-06, + "loss": 1.2877, + "step": 8320 + }, + { + "epoch": 0.9198386829456936, + "grad_norm": 0.39453125, + "learning_rate": 3.884469112972033e-06, + "loss": 1.3861, + "step": 8325 + }, + { + "epoch": 0.9203911386111264, + "grad_norm": 0.41015625, + "learning_rate": 3.83141860237467e-06, + "loss": 1.3506, + "step": 8330 + }, + { + "epoch": 0.9209435942765593, + "grad_norm": 0.390625, + "learning_rate": 3.7787257684788745e-06, + "loss": 1.1882, + "step": 8335 + }, + { + "epoch": 0.9214960499419922, + "grad_norm": 0.41796875, + "learning_rate": 3.7263908072634025e-06, + "loss": 1.2829, + "step": 8340 + }, + { + "epoch": 0.922048505607425, + "grad_norm": 0.384765625, + "learning_rate": 3.6744139133759957e-06, + "loss": 1.2689, + "step": 8345 + }, + { + "epoch": 0.9226009612728578, + "grad_norm": 0.41796875, + "learning_rate": 3.6227952801326404e-06, + "loss": 1.3579, + "step": 8350 + }, + { + "epoch": 0.9231534169382907, + "grad_norm": 0.421875, + "learning_rate": 3.571535099516832e-06, + "loss": 1.2262, + "step": 8355 + }, + { + "epoch": 0.9237058726037235, + "grad_norm": 0.408203125, + "learning_rate": 3.520633562178932e-06, + "loss": 1.2218, + "step": 8360 + }, + { + "epoch": 0.9242583282691564, + "grad_norm": 0.3984375, + "learning_rate": 3.470090857435371e-06, + "loss": 1.2624, + "step": 8365 + }, + { + "epoch": 0.9248107839345893, + "grad_norm": 0.45703125, + "learning_rate": 3.419907173268e-06, + "loss": 1.2848, + "step": 8370 + }, + { + "epoch": 0.9253632396000221, + "grad_norm": 0.39453125, + "learning_rate": 3.3700826963233735e-06, + "loss": 1.2628, + "step": 8375 + }, + { + "epoch": 0.9259156952654549, + "grad_norm": 0.408203125, + "learning_rate": 3.320617611912069e-06, + "loss": 1.3411, + "step": 8380 + }, + { + "epoch": 0.9264681509308877, + "grad_norm": 0.400390625, + "learning_rate": 3.271512104007979e-06, + "loss": 1.2467, + "step": 8385 + }, + { + "epoch": 0.9270206065963207, + "grad_norm": 0.40234375, + "learning_rate": 3.2227663552476194e-06, + "loss": 1.2752, + "step": 8390 + }, + { + "epoch": 0.9275730622617535, + "grad_norm": 0.390625, + "learning_rate": 3.174380546929501e-06, + "loss": 1.2332, + "step": 8395 + }, + { + "epoch": 0.9281255179271863, + "grad_norm": 0.38671875, + "learning_rate": 3.1263548590133917e-06, + "loss": 1.2656, + "step": 8400 + }, + { + "epoch": 0.9286779735926192, + "grad_norm": 0.412109375, + "learning_rate": 3.0786894701196777e-06, + "loss": 1.2307, + "step": 8405 + }, + { + "epoch": 0.929230429258052, + "grad_norm": 0.451171875, + "learning_rate": 3.031384557528716e-06, + "loss": 1.2646, + "step": 8410 + }, + { + "epoch": 0.9297828849234849, + "grad_norm": 0.392578125, + "learning_rate": 2.9844402971801242e-06, + "loss": 1.2306, + "step": 8415 + }, + { + "epoch": 0.9303353405889178, + "grad_norm": 0.369140625, + "learning_rate": 2.9378568636721835e-06, + "loss": 1.1948, + "step": 8420 + }, + { + "epoch": 0.9308877962543506, + "grad_norm": 0.408203125, + "learning_rate": 2.8916344302611586e-06, + "loss": 1.2318, + "step": 8425 + }, + { + "epoch": 0.9314402519197834, + "grad_norm": 0.3828125, + "learning_rate": 2.845773168860644e-06, + "loss": 1.2282, + "step": 8430 + }, + { + "epoch": 0.9319927075852162, + "grad_norm": 0.408203125, + "learning_rate": 2.800273250040952e-06, + "loss": 1.2427, + "step": 8435 + }, + { + "epoch": 0.9325451632506492, + "grad_norm": 0.40234375, + "learning_rate": 2.755134843028462e-06, + "loss": 1.3636, + "step": 8440 + }, + { + "epoch": 0.933097618916082, + "grad_norm": 0.408203125, + "learning_rate": 2.710358115705003e-06, + "loss": 1.2182, + "step": 8445 + }, + { + "epoch": 0.9336500745815148, + "grad_norm": 0.439453125, + "learning_rate": 2.6659432346072156e-06, + "loss": 1.3193, + "step": 8450 + }, + { + "epoch": 0.9342025302469477, + "grad_norm": 0.404296875, + "learning_rate": 2.6218903649259163e-06, + "loss": 1.2126, + "step": 8455 + }, + { + "epoch": 0.9347549859123805, + "grad_norm": 0.408203125, + "learning_rate": 2.578199670505532e-06, + "loss": 1.3129, + "step": 8460 + }, + { + "epoch": 0.9353074415778134, + "grad_norm": 0.400390625, + "learning_rate": 2.5348713138434564e-06, + "loss": 1.2968, + "step": 8465 + }, + { + "epoch": 0.9358598972432463, + "grad_norm": 0.39453125, + "learning_rate": 2.4919054560894383e-06, + "loss": 1.2434, + "step": 8470 + }, + { + "epoch": 0.9364123529086791, + "grad_norm": 0.423828125, + "learning_rate": 2.4493022570450164e-06, + "loss": 1.3078, + "step": 8475 + }, + { + "epoch": 0.9369648085741119, + "grad_norm": 0.39453125, + "learning_rate": 2.4070618751628748e-06, + "loss": 1.3117, + "step": 8480 + }, + { + "epoch": 0.9375172642395447, + "grad_norm": 0.3828125, + "learning_rate": 2.365184467546333e-06, + "loss": 1.1773, + "step": 8485 + }, + { + "epoch": 0.9380697199049777, + "grad_norm": 0.40625, + "learning_rate": 2.3236701899486566e-06, + "loss": 1.3479, + "step": 8490 + }, + { + "epoch": 0.9386221755704105, + "grad_norm": 0.390625, + "learning_rate": 2.28251919677257e-06, + "loss": 1.1983, + "step": 8495 + }, + { + "epoch": 0.9391746312358433, + "grad_norm": 0.40625, + "learning_rate": 2.2417316410696333e-06, + "loss": 1.3284, + "step": 8500 + }, + { + "epoch": 0.9397270869012762, + "grad_norm": 0.404296875, + "learning_rate": 2.2013076745396765e-06, + "loss": 1.2226, + "step": 8505 + }, + { + "epoch": 0.940279542566709, + "grad_norm": 0.4140625, + "learning_rate": 2.161247447530268e-06, + "loss": 1.2644, + "step": 8510 + }, + { + "epoch": 0.9408319982321419, + "grad_norm": 0.4140625, + "learning_rate": 2.121551109036124e-06, + "loss": 1.2459, + "step": 8515 + }, + { + "epoch": 0.9413844538975747, + "grad_norm": 2.328125, + "learning_rate": 2.0822188066985214e-06, + "loss": 1.2049, + "step": 8520 + }, + { + "epoch": 0.9419369095630076, + "grad_norm": 0.3984375, + "learning_rate": 2.043250686804865e-06, + "loss": 1.3138, + "step": 8525 + }, + { + "epoch": 0.9424893652284404, + "grad_norm": 0.388671875, + "learning_rate": 2.004646894287987e-06, + "loss": 1.3041, + "step": 8530 + }, + { + "epoch": 0.9430418208938732, + "grad_norm": 0.416015625, + "learning_rate": 1.9664075727257593e-06, + "loss": 1.2856, + "step": 8535 + }, + { + "epoch": 0.9435942765593062, + "grad_norm": 0.3984375, + "learning_rate": 1.928532864340438e-06, + "loss": 1.2399, + "step": 8540 + }, + { + "epoch": 0.944146732224739, + "grad_norm": 0.396484375, + "learning_rate": 1.891022909998208e-06, + "loss": 1.2144, + "step": 8545 + }, + { + "epoch": 0.9446991878901718, + "grad_norm": 0.4296875, + "learning_rate": 1.8538778492086407e-06, + "loss": 1.239, + "step": 8550 + }, + { + "epoch": 0.9452516435556046, + "grad_norm": 0.384765625, + "learning_rate": 1.8170978201241474e-06, + "loss": 1.214, + "step": 8555 + }, + { + "epoch": 0.9458040992210375, + "grad_norm": 0.40234375, + "learning_rate": 1.7806829595395147e-06, + "loss": 1.2959, + "step": 8560 + }, + { + "epoch": 0.9463565548864704, + "grad_norm": 0.3984375, + "learning_rate": 1.7446334028913491e-06, + "loss": 1.2792, + "step": 8565 + }, + { + "epoch": 0.9469090105519032, + "grad_norm": 0.390625, + "learning_rate": 1.7089492842576106e-06, + "loss": 1.2391, + "step": 8570 + }, + { + "epoch": 0.9474614662173361, + "grad_norm": 0.4140625, + "learning_rate": 1.6736307363570903e-06, + "loss": 1.1879, + "step": 8575 + }, + { + "epoch": 0.9480139218827689, + "grad_norm": 0.392578125, + "learning_rate": 1.638677890548912e-06, + "loss": 1.2647, + "step": 8580 + }, + { + "epoch": 0.9485663775482017, + "grad_norm": 0.3984375, + "learning_rate": 1.6040908768320872e-06, + "loss": 1.2889, + "step": 8585 + }, + { + "epoch": 0.9491188332136347, + "grad_norm": 0.416015625, + "learning_rate": 1.5698698238449716e-06, + "loss": 1.2407, + "step": 8590 + }, + { + "epoch": 0.9496712888790675, + "grad_norm": 0.41015625, + "learning_rate": 1.5360148588648093e-06, + "loss": 1.1342, + "step": 8595 + }, + { + "epoch": 0.9502237445445003, + "grad_norm": 0.421875, + "learning_rate": 1.5025261078073005e-06, + "loss": 1.3172, + "step": 8600 + }, + { + "epoch": 0.9507762002099331, + "grad_norm": 0.388671875, + "learning_rate": 1.469403695226057e-06, + "loss": 1.2656, + "step": 8605 + }, + { + "epoch": 0.951328655875366, + "grad_norm": 0.3984375, + "learning_rate": 1.436647744312214e-06, + "loss": 1.2591, + "step": 8610 + }, + { + "epoch": 0.9518811115407988, + "grad_norm": 0.439453125, + "learning_rate": 1.4042583768939298e-06, + "loss": 1.211, + "step": 8615 + }, + { + "epoch": 0.9524335672062317, + "grad_norm": 0.41796875, + "learning_rate": 1.3722357134359099e-06, + "loss": 1.2279, + "step": 8620 + }, + { + "epoch": 0.9529860228716646, + "grad_norm": 0.40625, + "learning_rate": 1.3405798730390273e-06, + "loss": 1.3243, + "step": 8625 + }, + { + "epoch": 0.9535384785370974, + "grad_norm": 0.388671875, + "learning_rate": 1.3092909734398251e-06, + "loss": 1.3247, + "step": 8630 + }, + { + "epoch": 0.9540909342025302, + "grad_norm": 0.41796875, + "learning_rate": 1.278369131010093e-06, + "loss": 1.2227, + "step": 8635 + }, + { + "epoch": 0.954643389867963, + "grad_norm": 0.408203125, + "learning_rate": 1.2478144607564469e-06, + "loss": 1.2665, + "step": 8640 + }, + { + "epoch": 0.955195845533396, + "grad_norm": 0.400390625, + "learning_rate": 1.2176270763198828e-06, + "loss": 1.2682, + "step": 8645 + }, + { + "epoch": 0.9557483011988288, + "grad_norm": 0.3984375, + "learning_rate": 1.187807089975379e-06, + "loss": 1.1739, + "step": 8650 + }, + { + "epoch": 0.9563007568642616, + "grad_norm": 0.435546875, + "learning_rate": 1.1583546126314293e-06, + "loss": 1.2203, + "step": 8655 + }, + { + "epoch": 0.9568532125296945, + "grad_norm": 0.42578125, + "learning_rate": 1.1292697538296982e-06, + "loss": 1.2483, + "step": 8660 + }, + { + "epoch": 0.9574056681951273, + "grad_norm": 0.41796875, + "learning_rate": 1.100552621744555e-06, + "loss": 1.219, + "step": 8665 + }, + { + "epoch": 0.9579581238605602, + "grad_norm": 0.3984375, + "learning_rate": 1.0722033231826967e-06, + "loss": 1.2911, + "step": 8670 + }, + { + "epoch": 0.9585105795259931, + "grad_norm": 0.4375, + "learning_rate": 1.0442219635827587e-06, + "loss": 1.2531, + "step": 8675 + }, + { + "epoch": 0.9590630351914259, + "grad_norm": 0.419921875, + "learning_rate": 1.016608647014916e-06, + "loss": 1.2883, + "step": 8680 + }, + { + "epoch": 0.9596154908568587, + "grad_norm": 0.390625, + "learning_rate": 9.893634761804827e-07, + "loss": 1.3147, + "step": 8685 + }, + { + "epoch": 0.9601679465222915, + "grad_norm": 0.419921875, + "learning_rate": 9.624865524115346e-07, + "loss": 1.256, + "step": 8690 + }, + { + "epoch": 0.9607204021877245, + "grad_norm": 0.40625, + "learning_rate": 9.359779756705544e-07, + "loss": 1.4092, + "step": 8695 + }, + { + "epoch": 0.9612728578531573, + "grad_norm": 0.42578125, + "learning_rate": 9.098378445500322e-07, + "loss": 1.2386, + "step": 8700 + }, + { + "epoch": 0.9618253135185901, + "grad_norm": 0.41015625, + "learning_rate": 8.840662562721314e-07, + "loss": 1.3856, + "step": 8705 + }, + { + "epoch": 0.962377769184023, + "grad_norm": 0.39453125, + "learning_rate": 8.586633066882565e-07, + "loss": 1.2555, + "step": 8710 + }, + { + "epoch": 0.9629302248494558, + "grad_norm": 0.41015625, + "learning_rate": 8.336290902788091e-07, + "loss": 1.2677, + "step": 8715 + }, + { + "epoch": 0.9634826805148887, + "grad_norm": 0.423828125, + "learning_rate": 8.089637001527317e-07, + "loss": 1.2327, + "step": 8720 + }, + { + "epoch": 0.9640351361803216, + "grad_norm": 0.388671875, + "learning_rate": 7.84667228047209e-07, + "loss": 1.2806, + "step": 8725 + }, + { + "epoch": 0.9645875918457544, + "grad_norm": 0.396484375, + "learning_rate": 7.607397643273229e-07, + "loss": 1.2801, + "step": 8730 + }, + { + "epoch": 0.9651400475111872, + "grad_norm": 0.427734375, + "learning_rate": 7.371813979857312e-07, + "loss": 1.3325, + "step": 8735 + }, + { + "epoch": 0.96569250317662, + "grad_norm": 0.408203125, + "learning_rate": 7.139922166422896e-07, + "loss": 1.2013, + "step": 8740 + }, + { + "epoch": 0.966244958842053, + "grad_norm": 0.408203125, + "learning_rate": 6.911723065437858e-07, + "loss": 1.2617, + "step": 8745 + }, + { + "epoch": 0.9667974145074858, + "grad_norm": 0.392578125, + "learning_rate": 6.687217525635614e-07, + "loss": 1.2695, + "step": 8750 + }, + { + "epoch": 0.9673498701729186, + "grad_norm": 0.412109375, + "learning_rate": 6.466406382012457e-07, + "loss": 1.1961, + "step": 8755 + }, + { + "epoch": 0.9679023258383515, + "grad_norm": 0.421875, + "learning_rate": 6.249290455824231e-07, + "loss": 1.2049, + "step": 8760 + }, + { + "epoch": 0.9684547815037843, + "grad_norm": 0.40625, + "learning_rate": 6.03587055458299e-07, + "loss": 1.3271, + "step": 8765 + }, + { + "epoch": 0.9690072371692172, + "grad_norm": 0.3984375, + "learning_rate": 5.826147472054677e-07, + "loss": 1.2651, + "step": 8770 + }, + { + "epoch": 0.96955969283465, + "grad_norm": 0.416015625, + "learning_rate": 5.620121988255567e-07, + "loss": 1.2567, + "step": 8775 + }, + { + "epoch": 0.9701121485000829, + "grad_norm": 0.400390625, + "learning_rate": 5.417794869449377e-07, + "loss": 1.3054, + "step": 8780 + }, + { + "epoch": 0.9706646041655157, + "grad_norm": 0.396484375, + "learning_rate": 5.219166868145164e-07, + "loss": 1.2261, + "step": 8785 + }, + { + "epoch": 0.9712170598309485, + "grad_norm": 0.404296875, + "learning_rate": 5.024238723093322e-07, + "loss": 1.2853, + "step": 8790 + }, + { + "epoch": 0.9717695154963815, + "grad_norm": 0.427734375, + "learning_rate": 4.833011159284029e-07, + "loss": 1.1949, + "step": 8795 + }, + { + "epoch": 0.9723219711618143, + "grad_norm": 0.41796875, + "learning_rate": 4.645484887943696e-07, + "loss": 1.2838, + "step": 8800 + }, + { + "epoch": 0.9728744268272471, + "grad_norm": 0.431640625, + "learning_rate": 4.461660606532747e-07, + "loss": 1.2869, + "step": 8805 + }, + { + "epoch": 0.97342688249268, + "grad_norm": 0.357421875, + "learning_rate": 4.281538998742951e-07, + "loss": 1.2944, + "step": 8810 + }, + { + "epoch": 0.9739793381581128, + "grad_norm": 0.41015625, + "learning_rate": 4.1051207344946495e-07, + "loss": 1.2441, + "step": 8815 + }, + { + "epoch": 0.9745317938235457, + "grad_norm": 0.421875, + "learning_rate": 3.9324064699348686e-07, + "loss": 1.2831, + "step": 8820 + }, + { + "epoch": 0.9750842494889785, + "grad_norm": 0.43359375, + "learning_rate": 3.763396847433875e-07, + "loss": 1.2034, + "step": 8825 + }, + { + "epoch": 0.9756367051544114, + "grad_norm": 0.408203125, + "learning_rate": 3.59809249558396e-07, + "loss": 1.3094, + "step": 8830 + }, + { + "epoch": 0.9761891608198442, + "grad_norm": 0.39453125, + "learning_rate": 3.436494029196102e-07, + "loss": 1.222, + "step": 8835 + }, + { + "epoch": 0.976741616485277, + "grad_norm": 0.439453125, + "learning_rate": 3.278602049298418e-07, + "loss": 1.3205, + "step": 8840 + }, + { + "epoch": 0.97729407215071, + "grad_norm": 0.392578125, + "learning_rate": 3.1244171431332737e-07, + "loss": 1.2384, + "step": 8845 + }, + { + "epoch": 0.9778465278161428, + "grad_norm": 0.40625, + "learning_rate": 2.9739398841557295e-07, + "loss": 1.3438, + "step": 8850 + }, + { + "epoch": 0.9783989834815756, + "grad_norm": 0.396484375, + "learning_rate": 2.8271708320309895e-07, + "loss": 1.2542, + "step": 8855 + }, + { + "epoch": 0.9789514391470084, + "grad_norm": 0.412109375, + "learning_rate": 2.6841105326324e-07, + "loss": 1.2843, + "step": 8860 + }, + { + "epoch": 0.9795038948124413, + "grad_norm": 0.416015625, + "learning_rate": 2.544759518039674e-07, + "loss": 1.2062, + "step": 8865 + }, + { + "epoch": 0.9800563504778742, + "grad_norm": 0.39453125, + "learning_rate": 2.409118306536229e-07, + "loss": 1.3051, + "step": 8870 + }, + { + "epoch": 0.980608806143307, + "grad_norm": 0.408203125, + "learning_rate": 2.277187402608405e-07, + "loss": 1.3078, + "step": 8875 + }, + { + "epoch": 0.9811612618087399, + "grad_norm": 0.400390625, + "learning_rate": 2.1489672969423614e-07, + "loss": 1.2903, + "step": 8880 + }, + { + "epoch": 0.9817137174741727, + "grad_norm": 0.435546875, + "learning_rate": 2.0244584664229628e-07, + "loss": 1.2979, + "step": 8885 + }, + { + "epoch": 0.9822661731396055, + "grad_norm": 0.392578125, + "learning_rate": 1.9036613741320043e-07, + "loss": 1.3551, + "step": 8890 + }, + { + "epoch": 0.9828186288050383, + "grad_norm": 0.412109375, + "learning_rate": 1.7865764693461017e-07, + "loss": 1.1832, + "step": 8895 + }, + { + "epoch": 0.9833710844704713, + "grad_norm": 0.3984375, + "learning_rate": 1.6732041875354709e-07, + "loss": 1.2429, + "step": 8900 + }, + { + "epoch": 0.9839235401359041, + "grad_norm": 0.40625, + "learning_rate": 1.563544950361928e-07, + "loss": 1.254, + "step": 8905 + }, + { + "epoch": 0.9844759958013369, + "grad_norm": 0.412109375, + "learning_rate": 1.457599165677559e-07, + "loss": 1.1677, + "step": 8910 + }, + { + "epoch": 0.9850284514667698, + "grad_norm": 0.369140625, + "learning_rate": 1.3553672275230523e-07, + "loss": 1.1959, + "step": 8915 + }, + { + "epoch": 0.9855809071322026, + "grad_norm": 0.408203125, + "learning_rate": 1.2568495161264793e-07, + "loss": 1.2862, + "step": 8920 + }, + { + "epoch": 0.9861333627976355, + "grad_norm": 0.40234375, + "learning_rate": 1.1620463979014062e-07, + "loss": 1.2759, + "step": 8925 + }, + { + "epoch": 0.9866858184630684, + "grad_norm": 0.431640625, + "learning_rate": 1.070958225446006e-07, + "loss": 1.3177, + "step": 8930 + }, + { + "epoch": 0.9872382741285012, + "grad_norm": 0.390625, + "learning_rate": 9.835853375418368e-08, + "loss": 1.1833, + "step": 8935 + }, + { + "epoch": 0.987790729793934, + "grad_norm": 0.3984375, + "learning_rate": 8.999280591518444e-08, + "loss": 1.2862, + "step": 8940 + }, + { + "epoch": 0.9883431854593668, + "grad_norm": 0.37109375, + "learning_rate": 8.199867014198059e-08, + "loss": 1.3404, + "step": 8945 + }, + { + "epoch": 0.9888956411247998, + "grad_norm": 0.3984375, + "learning_rate": 7.437615616692206e-08, + "loss": 1.2988, + "step": 8950 + }, + { + "epoch": 0.9894480967902326, + "grad_norm": 0.400390625, + "learning_rate": 6.712529234016441e-08, + "loss": 1.2459, + "step": 8955 + }, + { + "epoch": 0.9900005524556654, + "grad_norm": 0.38671875, + "learning_rate": 6.024610562962441e-08, + "loss": 1.2718, + "step": 8960 + }, + { + "epoch": 0.9905530081210983, + "grad_norm": 0.40625, + "learning_rate": 5.3738621620824657e-08, + "loss": 1.2696, + "step": 8965 + }, + { + "epoch": 0.9911054637865311, + "grad_norm": 0.41015625, + "learning_rate": 4.7602864516849144e-08, + "loss": 1.2373, + "step": 8970 + }, + { + "epoch": 0.991657919451964, + "grad_norm": 0.392578125, + "learning_rate": 4.183885713822111e-08, + "loss": 1.2656, + "step": 8975 + }, + { + "epoch": 0.9922103751173968, + "grad_norm": 0.419921875, + "learning_rate": 3.6446620922825356e-08, + "loss": 1.2292, + "step": 8980 + }, + { + "epoch": 0.9927628307828297, + "grad_norm": 0.4453125, + "learning_rate": 3.142617592583052e-08, + "loss": 1.2738, + "step": 8985 + }, + { + "epoch": 0.9933152864482625, + "grad_norm": 0.40234375, + "learning_rate": 2.677754081961137e-08, + "loss": 1.2733, + "step": 8990 + }, + { + "epoch": 0.9938677421136953, + "grad_norm": 0.416015625, + "learning_rate": 2.2500732893704358e-08, + "loss": 1.3335, + "step": 8995 + }, + { + "epoch": 0.9944201977791283, + "grad_norm": 0.41015625, + "learning_rate": 1.8595768054674444e-08, + "loss": 1.2119, + "step": 9000 + }, + { + "epoch": 0.9949726534445611, + "grad_norm": 0.396484375, + "learning_rate": 1.506266082615948e-08, + "loss": 1.2262, + "step": 9005 + }, + { + "epoch": 0.9955251091099939, + "grad_norm": 0.421875, + "learning_rate": 1.1901424348714774e-08, + "loss": 1.2112, + "step": 9010 + }, + { + "epoch": 0.9960775647754267, + "grad_norm": 0.392578125, + "learning_rate": 9.112070379835302e-09, + "loss": 1.1947, + "step": 9015 + }, + { + "epoch": 0.9966300204408596, + "grad_norm": 0.416015625, + "learning_rate": 6.6946092938668934e-09, + "loss": 1.3042, + "step": 9020 + }, + { + "epoch": 0.9971824761062925, + "grad_norm": 0.435546875, + "learning_rate": 4.649050082006223e-09, + "loss": 1.2832, + "step": 9025 + }, + { + "epoch": 0.9977349317717253, + "grad_norm": 0.4140625, + "learning_rate": 2.975400352211999e-09, + "loss": 1.2572, + "step": 9030 + }, + { + "epoch": 0.9982873874371582, + "grad_norm": 0.396484375, + "learning_rate": 1.6736663292604704e-09, + "loss": 1.2085, + "step": 9035 + }, + { + "epoch": 0.998839843102591, + "grad_norm": 0.404296875, + "learning_rate": 7.438528546344082e-10, + "loss": 1.2474, + "step": 9040 + }, + { + "epoch": 0.9993922987680238, + "grad_norm": 0.376953125, + "learning_rate": 1.8596338656751145e-10, + "loss": 1.2125, + "step": 9045 + }, + { + "epoch": 0.9999447544334568, + "grad_norm": 0.4453125, + "learning_rate": 0.0, + "loss": 1.2875, + "step": 9050 + }, + { + "epoch": 0.9999447544334568, + "eval_loss": 1.266111135482788, + "eval_runtime": 1890.6789, + "eval_samples_per_second": 3.812, + "eval_steps_per_second": 0.477, + "step": 9050 + }, + { + "epoch": 0.9999447544334568, + "step": 9050, + "total_flos": 6.363322205866033e+18, + "train_loss": 0.5663559655985121, + "train_runtime": 33167.9656, + "train_samples_per_second": 2.183, + "train_steps_per_second": 0.273 + } + ], + "logging_steps": 5, + "max_steps": 9050, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 100, + "total_flos": 6.363322205866033e+18, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +}