{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9987565282268093, "eval_steps": 500, "global_step": 502, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.001989554837105198, "grad_norm": 2.019071375807948, "learning_rate": 5.88235294117647e-06, "loss": 1.4931, "step": 1 }, { "epoch": 0.009947774185525988, "grad_norm": 1.5423217337624162, "learning_rate": 2.941176470588235e-05, "loss": 1.4424, "step": 5 }, { "epoch": 0.019895548371051976, "grad_norm": 0.8308012142463063, "learning_rate": 5.88235294117647e-05, "loss": 1.2728, "step": 10 }, { "epoch": 0.029843322556577966, "grad_norm": 0.5145126949446309, "learning_rate": 8.823529411764705e-05, "loss": 1.1614, "step": 15 }, { "epoch": 0.03979109674210395, "grad_norm": 0.36488357130003074, "learning_rate": 0.0001176470588235294, "loss": 1.1138, "step": 20 }, { "epoch": 0.04973887092762994, "grad_norm": 0.45152250150726514, "learning_rate": 0.00014705882352941175, "loss": 1.07, "step": 25 }, { "epoch": 0.05968664511315593, "grad_norm": 0.3640886200970852, "learning_rate": 0.0001764705882352941, "loss": 1.0509, "step": 30 }, { "epoch": 0.06963441929868192, "grad_norm": 1.7718021462555353, "learning_rate": 0.00020588235294117645, "loss": 1.0168, "step": 35 }, { "epoch": 0.0795821934842079, "grad_norm": 0.3768307338988579, "learning_rate": 0.0002352941176470588, "loss": 1.0086, "step": 40 }, { "epoch": 0.0895299676697339, "grad_norm": 0.33093766782578965, "learning_rate": 0.00026470588235294115, "loss": 1.0008, "step": 45 }, { "epoch": 0.09947774185525989, "grad_norm": 0.37314906796958397, "learning_rate": 0.0002941176470588235, "loss": 0.9975, "step": 50 }, { "epoch": 0.10942551604078588, "grad_norm": 0.39939561360809905, "learning_rate": 0.00029994177629874796, "loss": 0.9884, "step": 55 }, { "epoch": 0.11937329022631187, "grad_norm": 0.3919443406321306, "learning_rate": 0.00029970531997706437, "loss": 0.9843, "step": 60 }, { "epoch": 0.12932106441183785, "grad_norm": 0.3915947062044229, "learning_rate": 0.00029928727864250395, "loss": 0.9913, "step": 65 }, { "epoch": 0.13926883859736383, "grad_norm": 0.3534055778790222, "learning_rate": 0.00029868815935814996, "loss": 0.9893, "step": 70 }, { "epoch": 0.14921661278288983, "grad_norm": 0.36735315351449394, "learning_rate": 0.0002979086888255182, "loss": 0.9775, "step": 75 }, { "epoch": 0.1591643869684158, "grad_norm": 0.3088341190569696, "learning_rate": 0.00029694981250310496, "loss": 0.981, "step": 80 }, { "epoch": 0.16911216115394181, "grad_norm": 0.325839880040256, "learning_rate": 0.0002958126934595933, "loss": 0.9659, "step": 85 }, { "epoch": 0.1790599353394678, "grad_norm": 1.4788476638739554, "learning_rate": 0.0002944987109631094, "loss": 0.9681, "step": 90 }, { "epoch": 0.1890077095249938, "grad_norm": 0.3505592753718136, "learning_rate": 0.00029300945880823956, "loss": 0.9653, "step": 95 }, { "epoch": 0.19895548371051977, "grad_norm": 0.28451648461527196, "learning_rate": 0.0002913467433828382, "loss": 0.9511, "step": 100 }, { "epoch": 0.20890325789604575, "grad_norm": 0.3138636289880952, "learning_rate": 0.00028951258147696967, "loss": 0.9572, "step": 105 }, { "epoch": 0.21885103208157175, "grad_norm": 0.2732249403521582, "learning_rate": 0.00028750919783664407, "loss": 0.9617, "step": 110 }, { "epoch": 0.22879880626709773, "grad_norm": 0.311679721602736, "learning_rate": 0.000285339022465312, "loss": 0.9484, "step": 115 }, { "epoch": 0.23874658045262373, "grad_norm": 0.2676711284506792, "learning_rate": 0.00028300468767639305, "loss": 0.9397, "step": 120 }, { "epoch": 0.2486943546381497, "grad_norm": 0.25074023315013144, "learning_rate": 0.00028050902490041194, "loss": 0.9457, "step": 125 }, { "epoch": 0.2586421288236757, "grad_norm": 0.27129765694428115, "learning_rate": 0.00027785506125061604, "loss": 0.9268, "step": 130 }, { "epoch": 0.2685899030092017, "grad_norm": 2.105526076305897, "learning_rate": 0.00027504601585123963, "loss": 0.9459, "step": 135 }, { "epoch": 0.27853767719472766, "grad_norm": 0.27202583577769746, "learning_rate": 0.00027208529593286804, "loss": 0.9395, "step": 140 }, { "epoch": 0.28848545138025367, "grad_norm": 0.26240068306380243, "learning_rate": 0.00026897649269963866, "loss": 0.9166, "step": 145 }, { "epoch": 0.29843322556577967, "grad_norm": 0.2631437322326041, "learning_rate": 0.00026572337697329144, "loss": 0.92, "step": 150 }, { "epoch": 0.3083809997513056, "grad_norm": 0.2780992526939389, "learning_rate": 0.00026232989461935164, "loss": 0.929, "step": 155 }, { "epoch": 0.3183287739368316, "grad_norm": 0.2644307921832001, "learning_rate": 0.000258800161760994, "loss": 0.9119, "step": 160 }, { "epoch": 0.3282765481223576, "grad_norm": 0.27009906970951136, "learning_rate": 0.0002551384597863925, "loss": 0.9141, "step": 165 }, { "epoch": 0.33822432230788363, "grad_norm": 0.24907417369935828, "learning_rate": 0.0002513492301556124, "loss": 0.9045, "step": 170 }, { "epoch": 0.3481720964934096, "grad_norm": 0.4637150153647203, "learning_rate": 0.0002474370690133423, "loss": 0.9185, "step": 175 }, { "epoch": 0.3581198706789356, "grad_norm": 0.28404376319877433, "learning_rate": 0.00024340672161400278, "loss": 0.9224, "step": 180 }, { "epoch": 0.3680676448644616, "grad_norm": 0.2633604383062445, "learning_rate": 0.00023926307656599145, "loss": 0.9049, "step": 185 }, { "epoch": 0.3780154190499876, "grad_norm": 0.3089691505145177, "learning_rate": 0.00023501115990204728, "loss": 0.906, "step": 190 }, { "epoch": 0.38796319323551354, "grad_norm": 0.2712113108379062, "learning_rate": 0.00023065612898292607, "loss": 0.9033, "step": 195 }, { "epoch": 0.39791096742103954, "grad_norm": 0.2759532883918752, "learning_rate": 0.00022620326624178135, "loss": 0.9047, "step": 200 }, { "epoch": 0.40785874160656554, "grad_norm": 0.25413106263769636, "learning_rate": 0.0002216579727768394, "loss": 0.8884, "step": 205 }, { "epoch": 0.4178065157920915, "grad_norm": 0.2679789855086274, "learning_rate": 0.00021702576180013906, "loss": 0.892, "step": 210 }, { "epoch": 0.4277542899776175, "grad_norm": 0.2531713028754476, "learning_rate": 0.00021231225195028297, "loss": 0.8907, "step": 215 }, { "epoch": 0.4377020641631435, "grad_norm": 0.24842966918028864, "learning_rate": 0.00020752316047731214, "loss": 0.882, "step": 220 }, { "epoch": 0.4476498383486695, "grad_norm": 0.23591143252036872, "learning_rate": 0.00020266429630796956, "loss": 0.8846, "step": 225 }, { "epoch": 0.45759761253419545, "grad_norm": 0.23767648270009806, "learning_rate": 0.00019774155299976477, "loss": 0.8793, "step": 230 }, { "epoch": 0.46754538671972146, "grad_norm": 0.2271591056583853, "learning_rate": 0.00019276090159238524, "loss": 0.8741, "step": 235 }, { "epoch": 0.47749316090524746, "grad_norm": 0.22901636532179012, "learning_rate": 0.000187728383365126, "loss": 0.8837, "step": 240 }, { "epoch": 0.48744093509077346, "grad_norm": 0.22668623781094616, "learning_rate": 0.0001826501025091223, "loss": 0.8735, "step": 245 }, { "epoch": 0.4973887092762994, "grad_norm": 0.23947671322760095, "learning_rate": 0.00017753221872327318, "loss": 0.8692, "step": 250 }, { "epoch": 0.5073364834618255, "grad_norm": 0.26156533719751, "learning_rate": 0.00017238093974283674, "loss": 0.8625, "step": 255 }, { "epoch": 0.5172842576473514, "grad_norm": 0.25671509792902836, "learning_rate": 0.00016720251380976007, "loss": 0.8604, "step": 260 }, { "epoch": 0.5272320318328774, "grad_norm": 0.24704965941321674, "learning_rate": 0.00016200322209387663, "loss": 0.8626, "step": 265 }, { "epoch": 0.5371798060184034, "grad_norm": 0.2514007967545614, "learning_rate": 0.00015678937107416343, "loss": 0.8528, "step": 270 }, { "epoch": 0.5471275802039294, "grad_norm": 0.24028475499604857, "learning_rate": 0.00015156728488929967, "loss": 0.8574, "step": 275 }, { "epoch": 0.5570753543894553, "grad_norm": 0.23213673166180135, "learning_rate": 0.0001463432976668051, "loss": 0.86, "step": 280 }, { "epoch": 0.5670231285749814, "grad_norm": 0.23102662796096035, "learning_rate": 0.00014112374584006253, "loss": 0.8617, "step": 285 }, { "epoch": 0.5769709027605073, "grad_norm": 0.23635078987821154, "learning_rate": 0.00013591496046254278, "loss": 0.8468, "step": 290 }, { "epoch": 0.5869186769460333, "grad_norm": 0.2473298815208931, "learning_rate": 0.00013072325952855624, "loss": 0.8465, "step": 295 }, { "epoch": 0.5968664511315593, "grad_norm": 0.22199589321301555, "learning_rate": 0.00012555494030984393, "loss": 0.8474, "step": 300 }, { "epoch": 0.6068142253170853, "grad_norm": 0.224149793992071, "learning_rate": 0.00012041627171730368, "loss": 0.8523, "step": 305 }, { "epoch": 0.6167619995026112, "grad_norm": 0.2216854964579631, "learning_rate": 0.00011531348669711734, "loss": 0.8296, "step": 310 }, { "epoch": 0.6267097736881373, "grad_norm": 0.25823608221836053, "learning_rate": 0.00011025277467050076, "loss": 0.8275, "step": 315 }, { "epoch": 0.6366575478736632, "grad_norm": 0.25511921593962283, "learning_rate": 0.00010524027402624775, "loss": 0.8379, "step": 320 }, { "epoch": 0.6466053220591893, "grad_norm": 0.2169176240841302, "learning_rate": 0.00010028206467517357, "loss": 0.842, "step": 325 }, { "epoch": 0.6565530962447153, "grad_norm": 0.23684621611339568, "learning_rate": 9.538416067548939e-05, "loss": 0.8363, "step": 330 }, { "epoch": 0.6665008704302412, "grad_norm": 0.21588755800082085, "learning_rate": 9.055250293805247e-05, "loss": 0.8257, "step": 335 }, { "epoch": 0.6764486446157673, "grad_norm": 0.22987884680681675, "learning_rate": 8.579295202034084e-05, "loss": 0.8434, "step": 340 }, { "epoch": 0.6863964188012932, "grad_norm": 0.23044188787166803, "learning_rate": 8.111128101789177e-05, "loss": 0.8368, "step": 345 }, { "epoch": 0.6963441929868192, "grad_norm": 0.20871504598447846, "learning_rate": 7.651316856182797e-05, "loss": 0.8235, "step": 350 }, { "epoch": 0.7062919671723452, "grad_norm": 0.21764586591268964, "learning_rate": 7.200419193096416e-05, "loss": 0.8366, "step": 355 }, { "epoch": 0.7162397413578712, "grad_norm": 0.20093905811705248, "learning_rate": 6.758982028684842e-05, "loss": 0.8212, "step": 360 }, { "epoch": 0.7261875155433971, "grad_norm": 0.20201151370456955, "learning_rate": 6.327540803994507e-05, "loss": 0.8132, "step": 365 }, { "epoch": 0.7361352897289232, "grad_norm": 0.20205806825962228, "learning_rate": 5.9066188355004337e-05, "loss": 0.8115, "step": 370 }, { "epoch": 0.7460830639144491, "grad_norm": 0.4492212407917222, "learning_rate": 5.4967266803496726e-05, "loss": 0.8178, "step": 375 }, { "epoch": 0.7560308380999752, "grad_norm": 0.20165879598033634, "learning_rate": 5.0983615170812656e-05, "loss": 0.8202, "step": 380 }, { "epoch": 0.7659786122855011, "grad_norm": 0.20731013148125454, "learning_rate": 4.7120065425736744e-05, "loss": 0.8224, "step": 385 }, { "epoch": 0.7759263864710271, "grad_norm": 0.19813282037840257, "learning_rate": 4.3381303859513076e-05, "loss": 0.8031, "step": 390 }, { "epoch": 0.7858741606565531, "grad_norm": 0.19914236210141723, "learning_rate": 3.977186540161016e-05, "loss": 0.8146, "step": 395 }, { "epoch": 0.7958219348420791, "grad_norm": 0.20390757189890973, "learning_rate": 3.629612811907965e-05, "loss": 0.8132, "step": 400 }, { "epoch": 0.805769709027605, "grad_norm": 0.19393901917008016, "learning_rate": 3.295830790618167e-05, "loss": 0.8142, "step": 405 }, { "epoch": 0.8157174832131311, "grad_norm": 0.1907323400363912, "learning_rate": 2.976245337071748e-05, "loss": 0.8129, "step": 410 }, { "epoch": 0.825665257398657, "grad_norm": 0.199622342411806, "learning_rate": 2.671244092327191e-05, "loss": 0.7951, "step": 415 }, { "epoch": 0.835613031584183, "grad_norm": 0.20412611674653627, "learning_rate": 2.38119700753228e-05, "loss": 0.8143, "step": 420 }, { "epoch": 0.845560805769709, "grad_norm": 0.19936335181674378, "learning_rate": 2.106455895191985e-05, "loss": 0.802, "step": 425 }, { "epoch": 0.855508579955235, "grad_norm": 0.19669632646100263, "learning_rate": 1.847354002437588e-05, "loss": 0.7948, "step": 430 }, { "epoch": 0.865456354140761, "grad_norm": 0.19474265071015343, "learning_rate": 1.6042056068147402e-05, "loss": 0.8078, "step": 435 }, { "epoch": 0.875404128326287, "grad_norm": 0.19134910562287546, "learning_rate": 1.3773056350806022e-05, "loss": 0.8067, "step": 440 }, { "epoch": 0.885351902511813, "grad_norm": 0.18803806717884775, "learning_rate": 1.1669293054725392e-05, "loss": 0.7952, "step": 445 }, { "epoch": 0.895299676697339, "grad_norm": 0.18932335388504165, "learning_rate": 9.7333179388228e-06, "loss": 0.8102, "step": 450 }, { "epoch": 0.905247450882865, "grad_norm": 0.19773216218571474, "learning_rate": 7.967479243403913e-06, "loss": 0.8015, "step": 455 }, { "epoch": 0.9151952250683909, "grad_norm": 0.18837812375636145, "learning_rate": 6.373918841865727e-06, "loss": 0.7997, "step": 460 }, { "epoch": 0.925142999253917, "grad_norm": 0.18803298571004357, "learning_rate": 4.954569642711964e-06, "loss": 0.8068, "step": 465 }, { "epoch": 0.9350907734394429, "grad_norm": 0.1810703612957102, "learning_rate": 3.711153245032361e-06, "loss": 0.7992, "step": 470 }, { "epoch": 0.9450385476249689, "grad_norm": 0.18601132727527311, "learning_rate": 2.645177850289787e-06, "loss": 0.8039, "step": 475 }, { "epoch": 0.9549863218104949, "grad_norm": 0.18789475936900635, "learning_rate": 1.7579364329477375e-06, "loss": 0.8024, "step": 480 }, { "epoch": 0.9649340959960209, "grad_norm": 0.7153792725698854, "learning_rate": 1.0505051721574398e-06, "loss": 0.8047, "step": 485 }, { "epoch": 0.9748818701815469, "grad_norm": 0.19016384162520752, "learning_rate": 5.23742146406858e-07, "loss": 0.8004, "step": 490 }, { "epoch": 0.9848296443670729, "grad_norm": 0.19392554346569527, "learning_rate": 1.7828629271456894e-07, "loss": 0.7991, "step": 495 }, { "epoch": 0.9947774185525988, "grad_norm": 0.19065289345162353, "learning_rate": 1.4556631631429393e-08, "loss": 0.8032, "step": 500 }, { "epoch": 0.9987565282268093, "eval_loss": 1.1664291620254517, "eval_runtime": 1405.6452, "eval_samples_per_second": 16.582, "eval_steps_per_second": 1.037, "step": 502 }, { "epoch": 0.9987565282268093, "step": 502, "total_flos": 234414617395200.0, "train_loss": 0.8937448220423968, "train_runtime": 13768.4166, "train_samples_per_second": 4.672, "train_steps_per_second": 0.036 } ], "logging_steps": 5, "max_steps": 502, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": false, "should_training_stop": false }, "attributes": {} } }, "total_flos": 234414617395200.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }