|
{ |
|
"best_metric": 1.0332649946212769, |
|
"best_model_checkpoint": "/scratch/czm5kz/finetuned_Falcon-7B_16_1_0.0003_sequential/checkpoint-2700", |
|
"epoch": 0.9618810117563235, |
|
"eval_steps": 20, |
|
"global_step": 2700, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.0, |
|
"grad_norm": 0.6305112838745117, |
|
"learning_rate": 0.0002994656216601353, |
|
"loss": 2.8293, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.0, |
|
"grad_norm": 0.9544750452041626, |
|
"learning_rate": 0.0002989312433202707, |
|
"loss": 2.3887, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 1.2381925582885742, |
|
"learning_rate": 0.0002983968649804061, |
|
"loss": 2.1336, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 1.6555545330047607, |
|
"learning_rate": 0.0002978624866405415, |
|
"loss": 2.0826, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"eval_loss": 1.952557921409607, |
|
"eval_runtime": 244.0037, |
|
"eval_samples_per_second": 46.02, |
|
"eval_steps_per_second": 5.754, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 0.8518421053886414, |
|
"learning_rate": 0.0002973281083006768, |
|
"loss": 1.9436, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 1.0245165824890137, |
|
"learning_rate": 0.00029679372996081223, |
|
"loss": 1.8312, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 1.3011702299118042, |
|
"learning_rate": 0.0002962593516209476, |
|
"loss": 1.823, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 0.7834419012069702, |
|
"learning_rate": 0.000295724973281083, |
|
"loss": 1.7408, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"eval_loss": 1.688796877861023, |
|
"eval_runtime": 244.0831, |
|
"eval_samples_per_second": 46.005, |
|
"eval_steps_per_second": 5.752, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 1.2179532051086426, |
|
"learning_rate": 0.0002951905949412184, |
|
"loss": 1.7326, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 0.8537722229957581, |
|
"learning_rate": 0.00029465621660135373, |
|
"loss": 1.6131, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 0.7369029521942139, |
|
"learning_rate": 0.00029412183826148914, |
|
"loss": 1.6402, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 0.8261582255363464, |
|
"learning_rate": 0.0002935874599216245, |
|
"loss": 1.6312, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"eval_loss": 1.6201761960983276, |
|
"eval_runtime": 244.0446, |
|
"eval_samples_per_second": 46.012, |
|
"eval_steps_per_second": 5.753, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 0.6707925200462341, |
|
"learning_rate": 0.0002930530815817599, |
|
"loss": 1.688, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 0.5533925890922546, |
|
"learning_rate": 0.00029251870324189524, |
|
"loss": 1.6054, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 1.1391127109527588, |
|
"learning_rate": 0.0002919843249020306, |
|
"loss": 1.71, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 0.9591465592384338, |
|
"learning_rate": 0.000291449946562166, |
|
"loss": 1.5608, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"eval_loss": 1.5900055170059204, |
|
"eval_runtime": 242.9934, |
|
"eval_samples_per_second": 46.211, |
|
"eval_steps_per_second": 5.778, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 0.8105848431587219, |
|
"learning_rate": 0.00029091556822230134, |
|
"loss": 1.5863, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 1.0604382753372192, |
|
"learning_rate": 0.00029038118988243674, |
|
"loss": 1.5693, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 0.8133612871170044, |
|
"learning_rate": 0.0002898468115425721, |
|
"loss": 1.5386, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 0.8607632517814636, |
|
"learning_rate": 0.0002893124332027075, |
|
"loss": 1.5285, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"eval_loss": 1.5691229104995728, |
|
"eval_runtime": 242.5788, |
|
"eval_samples_per_second": 46.29, |
|
"eval_steps_per_second": 5.788, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 0.9120557904243469, |
|
"learning_rate": 0.00028877805486284284, |
|
"loss": 1.5746, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 1.2741694450378418, |
|
"learning_rate": 0.00028824367652297825, |
|
"loss": 1.6623, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 0.8563589453697205, |
|
"learning_rate": 0.0002877092981831136, |
|
"loss": 1.5452, |
|
"step": 115 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 0.9986954927444458, |
|
"learning_rate": 0.000287174919843249, |
|
"loss": 1.5181, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"eval_loss": 1.5524612665176392, |
|
"eval_runtime": 243.3678, |
|
"eval_samples_per_second": 46.14, |
|
"eval_steps_per_second": 5.769, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 0.7078434228897095, |
|
"learning_rate": 0.0002866405415033844, |
|
"loss": 1.5195, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 0.9647915363311768, |
|
"learning_rate": 0.00028610616316351975, |
|
"loss": 1.6456, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 1.033664345741272, |
|
"learning_rate": 0.00028557178482365515, |
|
"loss": 1.5991, |
|
"step": 135 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 1.3080883026123047, |
|
"learning_rate": 0.0002850374064837905, |
|
"loss": 1.5599, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"eval_loss": 1.5477995872497559, |
|
"eval_runtime": 243.1511, |
|
"eval_samples_per_second": 46.181, |
|
"eval_steps_per_second": 5.774, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 0.6780478954315186, |
|
"learning_rate": 0.00028450302814392585, |
|
"loss": 1.5578, |
|
"step": 145 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 0.7684407234191895, |
|
"learning_rate": 0.00028396864980406126, |
|
"loss": 1.6286, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 0.960607647895813, |
|
"learning_rate": 0.0002834342714641966, |
|
"loss": 1.5832, |
|
"step": 155 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 0.9325504899024963, |
|
"learning_rate": 0.000282899893124332, |
|
"loss": 1.4888, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"eval_loss": 1.5451940298080444, |
|
"eval_runtime": 243.2895, |
|
"eval_samples_per_second": 46.155, |
|
"eval_steps_per_second": 5.771, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 1.2480655908584595, |
|
"learning_rate": 0.00028236551478446736, |
|
"loss": 1.5704, |
|
"step": 165 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 1.0189040899276733, |
|
"learning_rate": 0.00028183113644460276, |
|
"loss": 1.5648, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 0.8874185085296631, |
|
"learning_rate": 0.0002812967581047381, |
|
"loss": 1.5737, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 1.0464727878570557, |
|
"learning_rate": 0.0002807623797648735, |
|
"loss": 1.4978, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"eval_loss": 1.5298898220062256, |
|
"eval_runtime": 242.8219, |
|
"eval_samples_per_second": 46.244, |
|
"eval_steps_per_second": 5.782, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 0.9442905187606812, |
|
"learning_rate": 0.00028022800142500886, |
|
"loss": 1.4986, |
|
"step": 185 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 0.7404810786247253, |
|
"learning_rate": 0.00027969362308514426, |
|
"loss": 1.4716, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 1.00387704372406, |
|
"learning_rate": 0.00027915924474527967, |
|
"loss": 1.5135, |
|
"step": 195 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 1.0239578485488892, |
|
"learning_rate": 0.000278624866405415, |
|
"loss": 1.4792, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"eval_loss": 1.5180584192276, |
|
"eval_runtime": 243.1799, |
|
"eval_samples_per_second": 46.176, |
|
"eval_steps_per_second": 5.774, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 0.8795331120491028, |
|
"learning_rate": 0.0002780904880655504, |
|
"loss": 1.5942, |
|
"step": 205 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 1.1808823347091675, |
|
"learning_rate": 0.00027755610972568577, |
|
"loss": 1.5523, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 1.10402512550354, |
|
"learning_rate": 0.0002770217313858211, |
|
"loss": 1.5887, |
|
"step": 215 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 1.1447386741638184, |
|
"learning_rate": 0.0002764873530459565, |
|
"loss": 1.5465, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"eval_loss": 1.514195203781128, |
|
"eval_runtime": 243.5401, |
|
"eval_samples_per_second": 46.107, |
|
"eval_steps_per_second": 5.765, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 0.8197668790817261, |
|
"learning_rate": 0.00027595297470609187, |
|
"loss": 1.5674, |
|
"step": 225 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 0.9403067231178284, |
|
"learning_rate": 0.0002754185963662273, |
|
"loss": 1.4565, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 0.8626269698143005, |
|
"learning_rate": 0.0002748842180263626, |
|
"loss": 1.5636, |
|
"step": 235 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 0.9701679348945618, |
|
"learning_rate": 0.000274349839686498, |
|
"loss": 1.4903, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"eval_loss": 1.5165505409240723, |
|
"eval_runtime": 242.914, |
|
"eval_samples_per_second": 46.226, |
|
"eval_steps_per_second": 5.78, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 0.8026458621025085, |
|
"learning_rate": 0.0002738154613466334, |
|
"loss": 1.5408, |
|
"step": 245 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 0.783799409866333, |
|
"learning_rate": 0.0002732810830067688, |
|
"loss": 1.5652, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 0.8295706510543823, |
|
"learning_rate": 0.0002727467046669041, |
|
"loss": 1.4772, |
|
"step": 255 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 0.9329605102539062, |
|
"learning_rate": 0.00027221232632703953, |
|
"loss": 1.5244, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"eval_loss": 1.505288004875183, |
|
"eval_runtime": 243.1666, |
|
"eval_samples_per_second": 46.178, |
|
"eval_steps_per_second": 5.774, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 1.1129286289215088, |
|
"learning_rate": 0.0002716779479871749, |
|
"loss": 1.4654, |
|
"step": 265 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 1.0153439044952393, |
|
"learning_rate": 0.0002711435696473103, |
|
"loss": 1.4938, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 0.7375654578208923, |
|
"learning_rate": 0.0002706091913074457, |
|
"loss": 1.454, |
|
"step": 275 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 1.2654181718826294, |
|
"learning_rate": 0.00027007481296758103, |
|
"loss": 1.5073, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"eval_loss": 1.4969700574874878, |
|
"eval_runtime": 243.386, |
|
"eval_samples_per_second": 46.137, |
|
"eval_steps_per_second": 5.769, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 0.8651628494262695, |
|
"learning_rate": 0.0002695404346277164, |
|
"loss": 1.5153, |
|
"step": 285 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 0.7497414946556091, |
|
"learning_rate": 0.0002690060562878518, |
|
"loss": 1.4811, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"grad_norm": 0.9008344411849976, |
|
"learning_rate": 0.00026847167794798714, |
|
"loss": 1.5365, |
|
"step": 295 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"grad_norm": 1.1004663705825806, |
|
"learning_rate": 0.00026793729960812254, |
|
"loss": 1.4646, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"eval_loss": 1.4890170097351074, |
|
"eval_runtime": 242.8989, |
|
"eval_samples_per_second": 46.229, |
|
"eval_steps_per_second": 5.78, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"grad_norm": 0.9690203070640564, |
|
"learning_rate": 0.0002674029212682579, |
|
"loss": 1.5305, |
|
"step": 305 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"grad_norm": 1.3153674602508545, |
|
"learning_rate": 0.0002668685429283933, |
|
"loss": 1.5001, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"grad_norm": 1.1792179346084595, |
|
"learning_rate": 0.00026633416458852864, |
|
"loss": 1.4683, |
|
"step": 315 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"grad_norm": 0.7645769715309143, |
|
"learning_rate": 0.000265799786248664, |
|
"loss": 1.4512, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"eval_loss": 1.4869476556777954, |
|
"eval_runtime": 243.0462, |
|
"eval_samples_per_second": 46.201, |
|
"eval_steps_per_second": 5.777, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"grad_norm": 0.8460853695869446, |
|
"learning_rate": 0.0002652654079087994, |
|
"loss": 1.4417, |
|
"step": 325 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"grad_norm": 0.8589680194854736, |
|
"learning_rate": 0.0002647310295689348, |
|
"loss": 1.5643, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"grad_norm": 0.7434174418449402, |
|
"learning_rate": 0.00026419665122907014, |
|
"loss": 1.4943, |
|
"step": 335 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"grad_norm": 0.9790274500846863, |
|
"learning_rate": 0.00026366227288920555, |
|
"loss": 1.4968, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"eval_loss": 1.4747017621994019, |
|
"eval_runtime": 243.372, |
|
"eval_samples_per_second": 46.139, |
|
"eval_steps_per_second": 5.769, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"grad_norm": 1.0329363346099854, |
|
"learning_rate": 0.0002631278945493409, |
|
"loss": 1.4943, |
|
"step": 345 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"grad_norm": 1.045272707939148, |
|
"learning_rate": 0.0002625935162094763, |
|
"loss": 1.4858, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"grad_norm": 0.9788330793380737, |
|
"learning_rate": 0.00026205913786961165, |
|
"loss": 1.5004, |
|
"step": 355 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"grad_norm": 1.041139006614685, |
|
"learning_rate": 0.00026152475952974705, |
|
"loss": 1.5084, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"eval_loss": 1.4680562019348145, |
|
"eval_runtime": 242.2635, |
|
"eval_samples_per_second": 46.35, |
|
"eval_steps_per_second": 5.795, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"grad_norm": 0.8507019281387329, |
|
"learning_rate": 0.0002609903811898824, |
|
"loss": 1.5108, |
|
"step": 365 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"grad_norm": 0.8787792921066284, |
|
"learning_rate": 0.0002604560028500178, |
|
"loss": 1.426, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"grad_norm": 0.9176210761070251, |
|
"learning_rate": 0.00025992162451015315, |
|
"loss": 1.4643, |
|
"step": 375 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"grad_norm": 1.035382866859436, |
|
"learning_rate": 0.00025938724617028856, |
|
"loss": 1.5125, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"eval_loss": 1.4658011198043823, |
|
"eval_runtime": 243.1807, |
|
"eval_samples_per_second": 46.176, |
|
"eval_steps_per_second": 5.773, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"grad_norm": 0.9265539646148682, |
|
"learning_rate": 0.0002588528678304239, |
|
"loss": 1.5172, |
|
"step": 385 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"grad_norm": 0.895794153213501, |
|
"learning_rate": 0.00025831848949055925, |
|
"loss": 1.4075, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"grad_norm": 0.9875014424324036, |
|
"learning_rate": 0.00025778411115069466, |
|
"loss": 1.5134, |
|
"step": 395 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"grad_norm": 1.0643417835235596, |
|
"learning_rate": 0.00025724973281083006, |
|
"loss": 1.5189, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"eval_loss": 1.4540351629257202, |
|
"eval_runtime": 243.2953, |
|
"eval_samples_per_second": 46.154, |
|
"eval_steps_per_second": 5.771, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"grad_norm": 1.2020907402038574, |
|
"learning_rate": 0.0002567153544709654, |
|
"loss": 1.5471, |
|
"step": 405 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"grad_norm": 1.2714240550994873, |
|
"learning_rate": 0.0002561809761311008, |
|
"loss": 1.4448, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"grad_norm": 0.9557583928108215, |
|
"learning_rate": 0.00025564659779123616, |
|
"loss": 1.444, |
|
"step": 415 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"grad_norm": 0.8486252427101135, |
|
"learning_rate": 0.00025511221945137156, |
|
"loss": 1.5165, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"eval_loss": 1.4487340450286865, |
|
"eval_runtime": 243.0706, |
|
"eval_samples_per_second": 46.196, |
|
"eval_steps_per_second": 5.776, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"grad_norm": 0.738248884677887, |
|
"learning_rate": 0.0002545778411115069, |
|
"loss": 1.4421, |
|
"step": 425 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"grad_norm": 0.8879930973052979, |
|
"learning_rate": 0.0002540434627716423, |
|
"loss": 1.4019, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"grad_norm": 0.8274502754211426, |
|
"learning_rate": 0.00025350908443177767, |
|
"loss": 1.3916, |
|
"step": 435 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"grad_norm": 1.0490825176239014, |
|
"learning_rate": 0.00025297470609191307, |
|
"loss": 1.4196, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"eval_loss": 1.4473365545272827, |
|
"eval_runtime": 242.1251, |
|
"eval_samples_per_second": 46.377, |
|
"eval_steps_per_second": 5.799, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"grad_norm": 1.233782410621643, |
|
"learning_rate": 0.0002524403277520484, |
|
"loss": 1.4154, |
|
"step": 445 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"grad_norm": 1.2139190435409546, |
|
"learning_rate": 0.0002519059494121838, |
|
"loss": 1.4619, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"grad_norm": 0.8714913725852966, |
|
"learning_rate": 0.00025137157107231917, |
|
"loss": 1.4809, |
|
"step": 455 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"grad_norm": 0.795221745967865, |
|
"learning_rate": 0.0002508371927324545, |
|
"loss": 1.4804, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"eval_loss": 1.4365161657333374, |
|
"eval_runtime": 242.2216, |
|
"eval_samples_per_second": 46.358, |
|
"eval_steps_per_second": 5.796, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"grad_norm": 0.8788973093032837, |
|
"learning_rate": 0.0002503028143925899, |
|
"loss": 1.5129, |
|
"step": 465 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"grad_norm": 1.1877810955047607, |
|
"learning_rate": 0.00024976843605272527, |
|
"loss": 1.4867, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"grad_norm": 1.0848227739334106, |
|
"learning_rate": 0.0002492340577128607, |
|
"loss": 1.3657, |
|
"step": 475 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"grad_norm": 1.0620073080062866, |
|
"learning_rate": 0.0002486996793729961, |
|
"loss": 1.4301, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"eval_loss": 1.4357354640960693, |
|
"eval_runtime": 242.0528, |
|
"eval_samples_per_second": 46.391, |
|
"eval_steps_per_second": 5.8, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"grad_norm": 1.137902855873108, |
|
"learning_rate": 0.0002481653010331314, |
|
"loss": 1.3764, |
|
"step": 485 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"grad_norm": 1.0430387258529663, |
|
"learning_rate": 0.00024763092269326683, |
|
"loss": 1.458, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"grad_norm": 0.9362395405769348, |
|
"learning_rate": 0.0002470965443534022, |
|
"loss": 1.5025, |
|
"step": 495 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"grad_norm": 1.0825713872909546, |
|
"learning_rate": 0.0002465621660135376, |
|
"loss": 1.4216, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"eval_loss": 1.4344379901885986, |
|
"eval_runtime": 242.4123, |
|
"eval_samples_per_second": 46.322, |
|
"eval_steps_per_second": 5.792, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"grad_norm": 1.114099383354187, |
|
"learning_rate": 0.00024602778767367293, |
|
"loss": 1.4232, |
|
"step": 505 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"grad_norm": 0.7564448714256287, |
|
"learning_rate": 0.00024549340933380833, |
|
"loss": 1.3576, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"grad_norm": 0.9348393678665161, |
|
"learning_rate": 0.0002449590309939437, |
|
"loss": 1.4352, |
|
"step": 515 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"grad_norm": 1.2036116123199463, |
|
"learning_rate": 0.0002444246526540791, |
|
"loss": 1.4637, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"eval_loss": 1.419852614402771, |
|
"eval_runtime": 243.2752, |
|
"eval_samples_per_second": 46.158, |
|
"eval_steps_per_second": 5.771, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"grad_norm": 1.1303725242614746, |
|
"learning_rate": 0.00024389027431421443, |
|
"loss": 1.413, |
|
"step": 525 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"grad_norm": 0.7789029479026794, |
|
"learning_rate": 0.00024335589597434984, |
|
"loss": 1.4399, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"grad_norm": 0.9571925401687622, |
|
"learning_rate": 0.0002428215176344852, |
|
"loss": 1.3619, |
|
"step": 535 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"grad_norm": 1.0115916728973389, |
|
"learning_rate": 0.00024228713929462056, |
|
"loss": 1.4548, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"eval_loss": 1.4131032228469849, |
|
"eval_runtime": 242.8013, |
|
"eval_samples_per_second": 46.248, |
|
"eval_steps_per_second": 5.783, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"grad_norm": 1.2657145261764526, |
|
"learning_rate": 0.00024175276095475597, |
|
"loss": 1.4381, |
|
"step": 545 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"grad_norm": 0.9139773845672607, |
|
"learning_rate": 0.00024121838261489132, |
|
"loss": 1.3604, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"grad_norm": 0.8233699202537537, |
|
"learning_rate": 0.00024068400427502672, |
|
"loss": 1.3726, |
|
"step": 555 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"grad_norm": 1.3115651607513428, |
|
"learning_rate": 0.00024014962593516207, |
|
"loss": 1.477, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"eval_loss": 1.4067269563674927, |
|
"eval_runtime": 242.5538, |
|
"eval_samples_per_second": 46.295, |
|
"eval_steps_per_second": 5.788, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"grad_norm": 1.2231495380401611, |
|
"learning_rate": 0.00023961524759529747, |
|
"loss": 1.3265, |
|
"step": 565 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"grad_norm": 1.1421154737472534, |
|
"learning_rate": 0.00023908086925543282, |
|
"loss": 1.4385, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"grad_norm": 1.3289763927459717, |
|
"learning_rate": 0.0002385464909155682, |
|
"loss": 1.404, |
|
"step": 575 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"grad_norm": 0.7699000835418701, |
|
"learning_rate": 0.00023801211257570357, |
|
"loss": 1.415, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"eval_loss": 1.3919585943222046, |
|
"eval_runtime": 242.7734, |
|
"eval_samples_per_second": 46.253, |
|
"eval_steps_per_second": 5.783, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"grad_norm": 0.7574182152748108, |
|
"learning_rate": 0.00023747773423583895, |
|
"loss": 1.3588, |
|
"step": 585 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"grad_norm": 0.871526837348938, |
|
"learning_rate": 0.00023694335589597435, |
|
"loss": 1.4096, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"grad_norm": 0.8445005416870117, |
|
"learning_rate": 0.0002364089775561097, |
|
"loss": 1.3575, |
|
"step": 595 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"grad_norm": 1.124621033668518, |
|
"learning_rate": 0.0002358745992162451, |
|
"loss": 1.4043, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"eval_loss": 1.391379475593567, |
|
"eval_runtime": 242.7975, |
|
"eval_samples_per_second": 46.248, |
|
"eval_steps_per_second": 5.783, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.22, |
|
"grad_norm": 0.824134349822998, |
|
"learning_rate": 0.00023534022087638045, |
|
"loss": 1.4347, |
|
"step": 605 |
|
}, |
|
{ |
|
"epoch": 0.22, |
|
"grad_norm": 0.7855484485626221, |
|
"learning_rate": 0.00023480584253651583, |
|
"loss": 1.4023, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 0.22, |
|
"grad_norm": 0.8013989329338074, |
|
"learning_rate": 0.0002342714641966512, |
|
"loss": 1.3739, |
|
"step": 615 |
|
}, |
|
{ |
|
"epoch": 0.22, |
|
"grad_norm": 0.8415730595588684, |
|
"learning_rate": 0.00023373708585678658, |
|
"loss": 1.3765, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 0.22, |
|
"eval_loss": 1.3790498971939087, |
|
"eval_runtime": 242.5435, |
|
"eval_samples_per_second": 46.297, |
|
"eval_steps_per_second": 5.789, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 0.22, |
|
"grad_norm": 1.1093789339065552, |
|
"learning_rate": 0.00023320270751692198, |
|
"loss": 1.4332, |
|
"step": 625 |
|
}, |
|
{ |
|
"epoch": 0.22, |
|
"grad_norm": 1.311341404914856, |
|
"learning_rate": 0.00023266832917705733, |
|
"loss": 1.4063, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 0.23, |
|
"grad_norm": 1.1992008686065674, |
|
"learning_rate": 0.00023213395083719274, |
|
"loss": 1.3182, |
|
"step": 635 |
|
}, |
|
{ |
|
"epoch": 0.23, |
|
"grad_norm": 0.9611808061599731, |
|
"learning_rate": 0.00023159957249732808, |
|
"loss": 1.4105, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 0.23, |
|
"eval_loss": 1.3601434230804443, |
|
"eval_runtime": 242.3398, |
|
"eval_samples_per_second": 46.336, |
|
"eval_steps_per_second": 5.794, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 0.23, |
|
"grad_norm": 1.0147098302841187, |
|
"learning_rate": 0.00023106519415746346, |
|
"loss": 1.3047, |
|
"step": 645 |
|
}, |
|
{ |
|
"epoch": 0.23, |
|
"grad_norm": 1.0039702653884888, |
|
"learning_rate": 0.00023053081581759884, |
|
"loss": 1.3057, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 0.23, |
|
"grad_norm": 1.056921362876892, |
|
"learning_rate": 0.0002299964374777342, |
|
"loss": 1.3699, |
|
"step": 655 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"grad_norm": 0.8924540281295776, |
|
"learning_rate": 0.00022946205913786962, |
|
"loss": 1.3247, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"eval_loss": 1.3447396755218506, |
|
"eval_runtime": 242.972, |
|
"eval_samples_per_second": 46.215, |
|
"eval_steps_per_second": 5.778, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"grad_norm": 1.3511056900024414, |
|
"learning_rate": 0.00022892768079800496, |
|
"loss": 1.3216, |
|
"step": 665 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"grad_norm": 0.9699820280075073, |
|
"learning_rate": 0.00022839330245814037, |
|
"loss": 1.3659, |
|
"step": 670 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"grad_norm": 0.8704125285148621, |
|
"learning_rate": 0.00022785892411827572, |
|
"loss": 1.3111, |
|
"step": 675 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"grad_norm": 1.1639297008514404, |
|
"learning_rate": 0.0002273245457784111, |
|
"loss": 1.359, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"eval_loss": 1.3231163024902344, |
|
"eval_runtime": 242.9134, |
|
"eval_samples_per_second": 46.226, |
|
"eval_steps_per_second": 5.78, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"grad_norm": 1.2916362285614014, |
|
"learning_rate": 0.00022679016743854647, |
|
"loss": 1.295, |
|
"step": 685 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"grad_norm": 0.8053829669952393, |
|
"learning_rate": 0.00022625578909868185, |
|
"loss": 1.334, |
|
"step": 690 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"grad_norm": 1.3208507299423218, |
|
"learning_rate": 0.00022572141075881722, |
|
"loss": 1.3426, |
|
"step": 695 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"grad_norm": 0.9985295534133911, |
|
"learning_rate": 0.0002251870324189526, |
|
"loss": 1.2714, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"eval_loss": 1.3060131072998047, |
|
"eval_runtime": 243.2736, |
|
"eval_samples_per_second": 46.158, |
|
"eval_steps_per_second": 5.771, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"grad_norm": 0.9396908283233643, |
|
"learning_rate": 0.000224652654079088, |
|
"loss": 1.3108, |
|
"step": 705 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"grad_norm": 1.4417626857757568, |
|
"learning_rate": 0.00022411827573922335, |
|
"loss": 1.3042, |
|
"step": 710 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"grad_norm": 0.9226711392402649, |
|
"learning_rate": 0.00022358389739935873, |
|
"loss": 1.2282, |
|
"step": 715 |
|
}, |
|
{ |
|
"epoch": 0.26, |
|
"grad_norm": 1.0715467929840088, |
|
"learning_rate": 0.0002230495190594941, |
|
"loss": 1.319, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 0.26, |
|
"eval_loss": 1.2914878129959106, |
|
"eval_runtime": 243.1709, |
|
"eval_samples_per_second": 46.177, |
|
"eval_steps_per_second": 5.774, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 0.26, |
|
"grad_norm": 1.2655442953109741, |
|
"learning_rate": 0.00022251514071962948, |
|
"loss": 1.2446, |
|
"step": 725 |
|
}, |
|
{ |
|
"epoch": 0.26, |
|
"grad_norm": 1.3114930391311646, |
|
"learning_rate": 0.00022198076237976485, |
|
"loss": 1.2685, |
|
"step": 730 |
|
}, |
|
{ |
|
"epoch": 0.26, |
|
"grad_norm": 1.3273890018463135, |
|
"learning_rate": 0.00022144638403990023, |
|
"loss": 1.2024, |
|
"step": 735 |
|
}, |
|
{ |
|
"epoch": 0.26, |
|
"grad_norm": 1.3590787649154663, |
|
"learning_rate": 0.00022091200570003563, |
|
"loss": 1.2551, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 0.26, |
|
"eval_loss": 1.2825496196746826, |
|
"eval_runtime": 242.3707, |
|
"eval_samples_per_second": 46.33, |
|
"eval_steps_per_second": 5.793, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 0.27, |
|
"grad_norm": 0.8397738337516785, |
|
"learning_rate": 0.00022037762736017098, |
|
"loss": 1.244, |
|
"step": 745 |
|
}, |
|
{ |
|
"epoch": 0.27, |
|
"grad_norm": 1.1953057050704956, |
|
"learning_rate": 0.00021984324902030636, |
|
"loss": 1.2534, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 0.27, |
|
"grad_norm": 1.427286982536316, |
|
"learning_rate": 0.00021930887068044173, |
|
"loss": 1.2897, |
|
"step": 755 |
|
}, |
|
{ |
|
"epoch": 0.27, |
|
"grad_norm": 2.1270551681518555, |
|
"learning_rate": 0.0002187744923405771, |
|
"loss": 1.25, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 0.27, |
|
"eval_loss": 1.2916063070297241, |
|
"eval_runtime": 243.3228, |
|
"eval_samples_per_second": 46.149, |
|
"eval_steps_per_second": 5.77, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 0.27, |
|
"grad_norm": 1.1659835577011108, |
|
"learning_rate": 0.00021824011400071249, |
|
"loss": 1.3947, |
|
"step": 765 |
|
}, |
|
{ |
|
"epoch": 0.27, |
|
"grad_norm": 1.1815775632858276, |
|
"learning_rate": 0.00021770573566084786, |
|
"loss": 1.3375, |
|
"step": 770 |
|
}, |
|
{ |
|
"epoch": 0.28, |
|
"grad_norm": 0.9640941023826599, |
|
"learning_rate": 0.00021717135732098327, |
|
"loss": 1.2115, |
|
"step": 775 |
|
}, |
|
{ |
|
"epoch": 0.28, |
|
"grad_norm": 1.144339919090271, |
|
"learning_rate": 0.00021663697898111861, |
|
"loss": 1.2535, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 0.28, |
|
"eval_loss": 1.2552716732025146, |
|
"eval_runtime": 243.3981, |
|
"eval_samples_per_second": 46.134, |
|
"eval_steps_per_second": 5.768, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 0.28, |
|
"grad_norm": 1.4256541728973389, |
|
"learning_rate": 0.00021610260064125396, |
|
"loss": 1.2659, |
|
"step": 785 |
|
}, |
|
{ |
|
"epoch": 0.28, |
|
"grad_norm": 1.6958651542663574, |
|
"learning_rate": 0.00021556822230138937, |
|
"loss": 1.2669, |
|
"step": 790 |
|
}, |
|
{ |
|
"epoch": 0.28, |
|
"grad_norm": 1.2060660123825073, |
|
"learning_rate": 0.00021503384396152474, |
|
"loss": 1.2205, |
|
"step": 795 |
|
}, |
|
{ |
|
"epoch": 0.29, |
|
"grad_norm": 1.2157624959945679, |
|
"learning_rate": 0.00021449946562166012, |
|
"loss": 1.3206, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.29, |
|
"eval_loss": 1.2229853868484497, |
|
"eval_runtime": 242.4855, |
|
"eval_samples_per_second": 46.308, |
|
"eval_steps_per_second": 5.79, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.29, |
|
"grad_norm": 1.1371393203735352, |
|
"learning_rate": 0.0002139650872817955, |
|
"loss": 1.1855, |
|
"step": 805 |
|
}, |
|
{ |
|
"epoch": 0.29, |
|
"grad_norm": 0.8342422246932983, |
|
"learning_rate": 0.00021343070894193087, |
|
"loss": 1.2631, |
|
"step": 810 |
|
}, |
|
{ |
|
"epoch": 0.29, |
|
"grad_norm": 1.139548420906067, |
|
"learning_rate": 0.00021289633060206625, |
|
"loss": 1.2119, |
|
"step": 815 |
|
}, |
|
{ |
|
"epoch": 0.29, |
|
"grad_norm": 1.027574896812439, |
|
"learning_rate": 0.0002123619522622016, |
|
"loss": 1.2822, |
|
"step": 820 |
|
}, |
|
{ |
|
"epoch": 0.29, |
|
"eval_loss": 1.2131222486495972, |
|
"eval_runtime": 242.8594, |
|
"eval_samples_per_second": 46.237, |
|
"eval_steps_per_second": 5.781, |
|
"step": 820 |
|
}, |
|
{ |
|
"epoch": 0.29, |
|
"grad_norm": 1.2451738119125366, |
|
"learning_rate": 0.000211827573922337, |
|
"loss": 1.1576, |
|
"step": 825 |
|
}, |
|
{ |
|
"epoch": 0.3, |
|
"grad_norm": 1.1453763246536255, |
|
"learning_rate": 0.00021129319558247238, |
|
"loss": 1.2733, |
|
"step": 830 |
|
}, |
|
{ |
|
"epoch": 0.3, |
|
"grad_norm": 1.153079867362976, |
|
"learning_rate": 0.00021075881724260775, |
|
"loss": 1.1779, |
|
"step": 835 |
|
}, |
|
{ |
|
"epoch": 0.3, |
|
"grad_norm": 1.0220108032226562, |
|
"learning_rate": 0.00021022443890274313, |
|
"loss": 1.294, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 0.3, |
|
"eval_loss": 1.2016112804412842, |
|
"eval_runtime": 242.5462, |
|
"eval_samples_per_second": 46.296, |
|
"eval_steps_per_second": 5.789, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 0.3, |
|
"grad_norm": 1.6792449951171875, |
|
"learning_rate": 0.0002096900605628785, |
|
"loss": 1.1431, |
|
"step": 845 |
|
}, |
|
{ |
|
"epoch": 0.3, |
|
"grad_norm": 1.8450753688812256, |
|
"learning_rate": 0.00020915568222301388, |
|
"loss": 1.1796, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 0.3, |
|
"grad_norm": 0.9148487448692322, |
|
"learning_rate": 0.00020862130388314923, |
|
"loss": 1.122, |
|
"step": 855 |
|
}, |
|
{ |
|
"epoch": 0.31, |
|
"grad_norm": 1.242084264755249, |
|
"learning_rate": 0.00020808692554328463, |
|
"loss": 1.1864, |
|
"step": 860 |
|
}, |
|
{ |
|
"epoch": 0.31, |
|
"eval_loss": 1.1892478466033936, |
|
"eval_runtime": 242.6928, |
|
"eval_samples_per_second": 46.268, |
|
"eval_steps_per_second": 5.785, |
|
"step": 860 |
|
}, |
|
{ |
|
"epoch": 0.31, |
|
"grad_norm": 1.928069829940796, |
|
"learning_rate": 0.00020755254720342, |
|
"loss": 1.2334, |
|
"step": 865 |
|
}, |
|
{ |
|
"epoch": 0.31, |
|
"grad_norm": 1.2880061864852905, |
|
"learning_rate": 0.00020701816886355538, |
|
"loss": 1.3125, |
|
"step": 870 |
|
}, |
|
{ |
|
"epoch": 0.31, |
|
"grad_norm": 1.4170634746551514, |
|
"learning_rate": 0.00020648379052369076, |
|
"loss": 1.2393, |
|
"step": 875 |
|
}, |
|
{ |
|
"epoch": 0.31, |
|
"grad_norm": 0.9899844527244568, |
|
"learning_rate": 0.00020594941218382614, |
|
"loss": 1.202, |
|
"step": 880 |
|
}, |
|
{ |
|
"epoch": 0.31, |
|
"eval_loss": 1.1809042692184448, |
|
"eval_runtime": 242.6549, |
|
"eval_samples_per_second": 46.276, |
|
"eval_steps_per_second": 5.786, |
|
"step": 880 |
|
}, |
|
{ |
|
"epoch": 0.32, |
|
"grad_norm": 1.3030658960342407, |
|
"learning_rate": 0.0002054150338439615, |
|
"loss": 1.2495, |
|
"step": 885 |
|
}, |
|
{ |
|
"epoch": 0.32, |
|
"grad_norm": 0.9489758610725403, |
|
"learning_rate": 0.00020488065550409686, |
|
"loss": 1.1947, |
|
"step": 890 |
|
}, |
|
{ |
|
"epoch": 0.32, |
|
"grad_norm": 1.2204822301864624, |
|
"learning_rate": 0.00020434627716423226, |
|
"loss": 1.1442, |
|
"step": 895 |
|
}, |
|
{ |
|
"epoch": 0.32, |
|
"grad_norm": 1.2140864133834839, |
|
"learning_rate": 0.0002038118988243676, |
|
"loss": 1.2175, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 0.32, |
|
"eval_loss": 1.1709703207015991, |
|
"eval_runtime": 243.5845, |
|
"eval_samples_per_second": 46.099, |
|
"eval_steps_per_second": 5.764, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 0.32, |
|
"grad_norm": 1.048947811126709, |
|
"learning_rate": 0.00020327752048450302, |
|
"loss": 1.1554, |
|
"step": 905 |
|
}, |
|
{ |
|
"epoch": 0.32, |
|
"grad_norm": 0.9663936495780945, |
|
"learning_rate": 0.0002027431421446384, |
|
"loss": 1.1373, |
|
"step": 910 |
|
}, |
|
{ |
|
"epoch": 0.33, |
|
"grad_norm": 1.4801690578460693, |
|
"learning_rate": 0.00020220876380477377, |
|
"loss": 1.2032, |
|
"step": 915 |
|
}, |
|
{ |
|
"epoch": 0.33, |
|
"grad_norm": 1.2649509906768799, |
|
"learning_rate": 0.00020167438546490914, |
|
"loss": 1.2164, |
|
"step": 920 |
|
}, |
|
{ |
|
"epoch": 0.33, |
|
"eval_loss": 1.1549891233444214, |
|
"eval_runtime": 243.0477, |
|
"eval_samples_per_second": 46.201, |
|
"eval_steps_per_second": 5.777, |
|
"step": 920 |
|
}, |
|
{ |
|
"epoch": 0.33, |
|
"grad_norm": 1.5645591020584106, |
|
"learning_rate": 0.0002011400071250445, |
|
"loss": 1.2082, |
|
"step": 925 |
|
}, |
|
{ |
|
"epoch": 0.33, |
|
"grad_norm": 1.1867625713348389, |
|
"learning_rate": 0.0002006056287851799, |
|
"loss": 1.2105, |
|
"step": 930 |
|
}, |
|
{ |
|
"epoch": 0.33, |
|
"grad_norm": 0.8491153120994568, |
|
"learning_rate": 0.00020007125044531525, |
|
"loss": 1.1146, |
|
"step": 935 |
|
}, |
|
{ |
|
"epoch": 0.33, |
|
"grad_norm": 1.154305338859558, |
|
"learning_rate": 0.00019953687210545065, |
|
"loss": 1.1803, |
|
"step": 940 |
|
}, |
|
{ |
|
"epoch": 0.33, |
|
"eval_loss": 1.1484609842300415, |
|
"eval_runtime": 242.7646, |
|
"eval_samples_per_second": 46.255, |
|
"eval_steps_per_second": 5.783, |
|
"step": 940 |
|
}, |
|
{ |
|
"epoch": 0.34, |
|
"grad_norm": 1.7545225620269775, |
|
"learning_rate": 0.00019900249376558603, |
|
"loss": 1.16, |
|
"step": 945 |
|
}, |
|
{ |
|
"epoch": 0.34, |
|
"grad_norm": 1.1874881982803345, |
|
"learning_rate": 0.0001984681154257214, |
|
"loss": 1.2113, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 0.34, |
|
"grad_norm": 1.3878768682479858, |
|
"learning_rate": 0.00019793373708585678, |
|
"loss": 1.1738, |
|
"step": 955 |
|
}, |
|
{ |
|
"epoch": 0.34, |
|
"grad_norm": 3.4664456844329834, |
|
"learning_rate": 0.00019739935874599213, |
|
"loss": 1.2146, |
|
"step": 960 |
|
}, |
|
{ |
|
"epoch": 0.34, |
|
"eval_loss": 1.1772421598434448, |
|
"eval_runtime": 242.552, |
|
"eval_samples_per_second": 46.295, |
|
"eval_steps_per_second": 5.788, |
|
"step": 960 |
|
}, |
|
{ |
|
"epoch": 0.34, |
|
"grad_norm": 1.8794177770614624, |
|
"learning_rate": 0.00019686498040612753, |
|
"loss": 1.1482, |
|
"step": 965 |
|
}, |
|
{ |
|
"epoch": 0.35, |
|
"grad_norm": 1.044394850730896, |
|
"learning_rate": 0.00019633060206626288, |
|
"loss": 1.1143, |
|
"step": 970 |
|
}, |
|
{ |
|
"epoch": 0.35, |
|
"grad_norm": 1.1520146131515503, |
|
"learning_rate": 0.00019579622372639828, |
|
"loss": 1.2389, |
|
"step": 975 |
|
}, |
|
{ |
|
"epoch": 0.35, |
|
"grad_norm": 1.041759967803955, |
|
"learning_rate": 0.00019526184538653366, |
|
"loss": 1.1671, |
|
"step": 980 |
|
}, |
|
{ |
|
"epoch": 0.35, |
|
"eval_loss": 1.1441816091537476, |
|
"eval_runtime": 243.7305, |
|
"eval_samples_per_second": 46.071, |
|
"eval_steps_per_second": 5.76, |
|
"step": 980 |
|
}, |
|
{ |
|
"epoch": 0.35, |
|
"grad_norm": 1.5767799615859985, |
|
"learning_rate": 0.00019472746704666903, |
|
"loss": 1.0781, |
|
"step": 985 |
|
}, |
|
{ |
|
"epoch": 0.35, |
|
"grad_norm": 1.2156161069869995, |
|
"learning_rate": 0.0001941930887068044, |
|
"loss": 1.1804, |
|
"step": 990 |
|
}, |
|
{ |
|
"epoch": 0.35, |
|
"grad_norm": 1.0692851543426514, |
|
"learning_rate": 0.00019365871036693976, |
|
"loss": 1.1296, |
|
"step": 995 |
|
}, |
|
{ |
|
"epoch": 0.36, |
|
"grad_norm": 1.0492011308670044, |
|
"learning_rate": 0.00019312433202707516, |
|
"loss": 1.195, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.36, |
|
"eval_loss": 1.1373920440673828, |
|
"eval_runtime": 243.5127, |
|
"eval_samples_per_second": 46.113, |
|
"eval_steps_per_second": 5.766, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.36, |
|
"grad_norm": 0.9718597531318665, |
|
"learning_rate": 0.0001925899536872105, |
|
"loss": 1.1621, |
|
"step": 1005 |
|
}, |
|
{ |
|
"epoch": 0.36, |
|
"grad_norm": 1.5123802423477173, |
|
"learning_rate": 0.00019205557534734591, |
|
"loss": 1.1181, |
|
"step": 1010 |
|
}, |
|
{ |
|
"epoch": 0.36, |
|
"grad_norm": 1.209719181060791, |
|
"learning_rate": 0.00019152119700748126, |
|
"loss": 1.1386, |
|
"step": 1015 |
|
}, |
|
{ |
|
"epoch": 0.36, |
|
"grad_norm": 2.3843159675598145, |
|
"learning_rate": 0.00019098681866761667, |
|
"loss": 1.1055, |
|
"step": 1020 |
|
}, |
|
{ |
|
"epoch": 0.36, |
|
"eval_loss": 1.1293574571609497, |
|
"eval_runtime": 243.5039, |
|
"eval_samples_per_second": 46.114, |
|
"eval_steps_per_second": 5.766, |
|
"step": 1020 |
|
}, |
|
{ |
|
"epoch": 0.37, |
|
"grad_norm": 1.0101613998413086, |
|
"learning_rate": 0.00019045244032775204, |
|
"loss": 1.1154, |
|
"step": 1025 |
|
}, |
|
{ |
|
"epoch": 0.37, |
|
"grad_norm": 1.160309076309204, |
|
"learning_rate": 0.0001899180619878874, |
|
"loss": 1.122, |
|
"step": 1030 |
|
}, |
|
{ |
|
"epoch": 0.37, |
|
"grad_norm": 1.289068579673767, |
|
"learning_rate": 0.0001893836836480228, |
|
"loss": 1.1491, |
|
"step": 1035 |
|
}, |
|
{ |
|
"epoch": 0.37, |
|
"grad_norm": 2.579099178314209, |
|
"learning_rate": 0.00018884930530815814, |
|
"loss": 1.1484, |
|
"step": 1040 |
|
}, |
|
{ |
|
"epoch": 0.37, |
|
"eval_loss": 1.1274968385696411, |
|
"eval_runtime": 243.2415, |
|
"eval_samples_per_second": 46.164, |
|
"eval_steps_per_second": 5.772, |
|
"step": 1040 |
|
}, |
|
{ |
|
"epoch": 0.37, |
|
"grad_norm": 0.8187231421470642, |
|
"learning_rate": 0.00018831492696829355, |
|
"loss": 1.1322, |
|
"step": 1045 |
|
}, |
|
{ |
|
"epoch": 0.37, |
|
"grad_norm": 0.8516179919242859, |
|
"learning_rate": 0.0001877805486284289, |
|
"loss": 1.0975, |
|
"step": 1050 |
|
}, |
|
{ |
|
"epoch": 0.38, |
|
"grad_norm": 0.9972283244132996, |
|
"learning_rate": 0.0001872461702885643, |
|
"loss": 1.1063, |
|
"step": 1055 |
|
}, |
|
{ |
|
"epoch": 0.38, |
|
"grad_norm": 0.9460062384605408, |
|
"learning_rate": 0.00018671179194869967, |
|
"loss": 1.1337, |
|
"step": 1060 |
|
}, |
|
{ |
|
"epoch": 0.38, |
|
"eval_loss": 1.124145269393921, |
|
"eval_runtime": 243.6433, |
|
"eval_samples_per_second": 46.088, |
|
"eval_steps_per_second": 5.763, |
|
"step": 1060 |
|
}, |
|
{ |
|
"epoch": 0.38, |
|
"grad_norm": 1.1518056392669678, |
|
"learning_rate": 0.00018617741360883502, |
|
"loss": 1.139, |
|
"step": 1065 |
|
}, |
|
{ |
|
"epoch": 0.38, |
|
"grad_norm": 1.3771902322769165, |
|
"learning_rate": 0.00018564303526897043, |
|
"loss": 1.1773, |
|
"step": 1070 |
|
}, |
|
{ |
|
"epoch": 0.38, |
|
"grad_norm": 1.3464629650115967, |
|
"learning_rate": 0.00018510865692910578, |
|
"loss": 1.1176, |
|
"step": 1075 |
|
}, |
|
{ |
|
"epoch": 0.38, |
|
"grad_norm": 1.1559851169586182, |
|
"learning_rate": 0.00018457427858924118, |
|
"loss": 1.1147, |
|
"step": 1080 |
|
}, |
|
{ |
|
"epoch": 0.38, |
|
"eval_loss": 1.1198575496673584, |
|
"eval_runtime": 242.729, |
|
"eval_samples_per_second": 46.261, |
|
"eval_steps_per_second": 5.784, |
|
"step": 1080 |
|
}, |
|
{ |
|
"epoch": 0.39, |
|
"grad_norm": 0.8448060750961304, |
|
"learning_rate": 0.00018403990024937653, |
|
"loss": 1.1604, |
|
"step": 1085 |
|
}, |
|
{ |
|
"epoch": 0.39, |
|
"grad_norm": 1.1841185092926025, |
|
"learning_rate": 0.00018350552190951193, |
|
"loss": 1.1384, |
|
"step": 1090 |
|
}, |
|
{ |
|
"epoch": 0.39, |
|
"grad_norm": 0.97804856300354, |
|
"learning_rate": 0.0001829711435696473, |
|
"loss": 1.0934, |
|
"step": 1095 |
|
}, |
|
{ |
|
"epoch": 0.39, |
|
"grad_norm": 0.7992210388183594, |
|
"learning_rate": 0.00018243676522978266, |
|
"loss": 1.1227, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 0.39, |
|
"eval_loss": 1.115310549736023, |
|
"eval_runtime": 242.2075, |
|
"eval_samples_per_second": 46.361, |
|
"eval_steps_per_second": 5.797, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 0.39, |
|
"grad_norm": 0.9271190166473389, |
|
"learning_rate": 0.00018190238688991806, |
|
"loss": 1.1979, |
|
"step": 1105 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"grad_norm": 1.1744805574417114, |
|
"learning_rate": 0.0001813680085500534, |
|
"loss": 1.0411, |
|
"step": 1110 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"grad_norm": 1.5105717182159424, |
|
"learning_rate": 0.0001808336302101888, |
|
"loss": 1.1139, |
|
"step": 1115 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"grad_norm": 1.0240634679794312, |
|
"learning_rate": 0.00018029925187032416, |
|
"loss": 1.145, |
|
"step": 1120 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"eval_loss": 1.116929054260254, |
|
"eval_runtime": 243.2586, |
|
"eval_samples_per_second": 46.161, |
|
"eval_steps_per_second": 5.772, |
|
"step": 1120 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"grad_norm": 1.387168049812317, |
|
"learning_rate": 0.00017976487353045956, |
|
"loss": 1.0915, |
|
"step": 1125 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"grad_norm": 0.8388767838478088, |
|
"learning_rate": 0.0001792304951905949, |
|
"loss": 1.1492, |
|
"step": 1130 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"grad_norm": 1.1189345121383667, |
|
"learning_rate": 0.0001786961168507303, |
|
"loss": 1.0952, |
|
"step": 1135 |
|
}, |
|
{ |
|
"epoch": 0.41, |
|
"grad_norm": 0.9314697980880737, |
|
"learning_rate": 0.0001781617385108657, |
|
"loss": 1.1508, |
|
"step": 1140 |
|
}, |
|
{ |
|
"epoch": 0.41, |
|
"eval_loss": 1.1075862646102905, |
|
"eval_runtime": 242.7453, |
|
"eval_samples_per_second": 46.258, |
|
"eval_steps_per_second": 5.784, |
|
"step": 1140 |
|
}, |
|
{ |
|
"epoch": 0.41, |
|
"grad_norm": 1.495107650756836, |
|
"learning_rate": 0.00017762736017100104, |
|
"loss": 1.1027, |
|
"step": 1145 |
|
}, |
|
{ |
|
"epoch": 0.41, |
|
"grad_norm": 1.4054560661315918, |
|
"learning_rate": 0.00017709298183113644, |
|
"loss": 1.0901, |
|
"step": 1150 |
|
}, |
|
{ |
|
"epoch": 0.41, |
|
"grad_norm": 1.1209503412246704, |
|
"learning_rate": 0.0001765586034912718, |
|
"loss": 1.1063, |
|
"step": 1155 |
|
}, |
|
{ |
|
"epoch": 0.41, |
|
"grad_norm": 1.1039865016937256, |
|
"learning_rate": 0.0001760242251514072, |
|
"loss": 1.1729, |
|
"step": 1160 |
|
}, |
|
{ |
|
"epoch": 0.41, |
|
"eval_loss": 1.108239769935608, |
|
"eval_runtime": 242.5373, |
|
"eval_samples_per_second": 46.298, |
|
"eval_steps_per_second": 5.789, |
|
"step": 1160 |
|
}, |
|
{ |
|
"epoch": 0.42, |
|
"grad_norm": 1.1784499883651733, |
|
"learning_rate": 0.00017548984681154255, |
|
"loss": 1.1479, |
|
"step": 1165 |
|
}, |
|
{ |
|
"epoch": 0.42, |
|
"grad_norm": 1.4968849420547485, |
|
"learning_rate": 0.00017495546847167792, |
|
"loss": 1.1732, |
|
"step": 1170 |
|
}, |
|
{ |
|
"epoch": 0.42, |
|
"grad_norm": 0.8712586760520935, |
|
"learning_rate": 0.00017442109013181332, |
|
"loss": 1.1105, |
|
"step": 1175 |
|
}, |
|
{ |
|
"epoch": 0.42, |
|
"grad_norm": 0.8051401972770691, |
|
"learning_rate": 0.00017388671179194867, |
|
"loss": 1.1253, |
|
"step": 1180 |
|
}, |
|
{ |
|
"epoch": 0.42, |
|
"eval_loss": 1.10274338722229, |
|
"eval_runtime": 243.3942, |
|
"eval_samples_per_second": 46.135, |
|
"eval_steps_per_second": 5.768, |
|
"step": 1180 |
|
}, |
|
{ |
|
"epoch": 0.42, |
|
"grad_norm": 0.945894181728363, |
|
"learning_rate": 0.00017335233345208408, |
|
"loss": 1.145, |
|
"step": 1185 |
|
}, |
|
{ |
|
"epoch": 0.42, |
|
"grad_norm": 1.0374635457992554, |
|
"learning_rate": 0.00017281795511221943, |
|
"loss": 1.1411, |
|
"step": 1190 |
|
}, |
|
{ |
|
"epoch": 0.43, |
|
"grad_norm": 1.0259712934494019, |
|
"learning_rate": 0.00017228357677235483, |
|
"loss": 1.0862, |
|
"step": 1195 |
|
}, |
|
{ |
|
"epoch": 0.43, |
|
"grad_norm": 1.2664566040039062, |
|
"learning_rate": 0.00017174919843249018, |
|
"loss": 1.1152, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 0.43, |
|
"eval_loss": 1.099318265914917, |
|
"eval_runtime": 243.0373, |
|
"eval_samples_per_second": 46.203, |
|
"eval_steps_per_second": 5.777, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 0.43, |
|
"grad_norm": 1.3761073350906372, |
|
"learning_rate": 0.00017121482009262555, |
|
"loss": 1.1706, |
|
"step": 1205 |
|
}, |
|
{ |
|
"epoch": 0.43, |
|
"grad_norm": 1.5430552959442139, |
|
"learning_rate": 0.00017068044175276096, |
|
"loss": 1.0759, |
|
"step": 1210 |
|
}, |
|
{ |
|
"epoch": 0.43, |
|
"grad_norm": 1.2533998489379883, |
|
"learning_rate": 0.0001701460634128963, |
|
"loss": 1.1051, |
|
"step": 1215 |
|
}, |
|
{ |
|
"epoch": 0.43, |
|
"grad_norm": 0.9215791821479797, |
|
"learning_rate": 0.0001696116850730317, |
|
"loss": 1.1606, |
|
"step": 1220 |
|
}, |
|
{ |
|
"epoch": 0.43, |
|
"eval_loss": 1.0982745885849, |
|
"eval_runtime": 242.3387, |
|
"eval_samples_per_second": 46.336, |
|
"eval_steps_per_second": 5.794, |
|
"step": 1220 |
|
}, |
|
{ |
|
"epoch": 0.44, |
|
"grad_norm": 1.1964141130447388, |
|
"learning_rate": 0.00016907730673316706, |
|
"loss": 1.1286, |
|
"step": 1225 |
|
}, |
|
{ |
|
"epoch": 0.44, |
|
"grad_norm": 2.441573143005371, |
|
"learning_rate": 0.00016854292839330246, |
|
"loss": 1.0948, |
|
"step": 1230 |
|
}, |
|
{ |
|
"epoch": 0.44, |
|
"grad_norm": 1.9406346082687378, |
|
"learning_rate": 0.0001680085500534378, |
|
"loss": 1.0833, |
|
"step": 1235 |
|
}, |
|
{ |
|
"epoch": 0.44, |
|
"grad_norm": 1.2231512069702148, |
|
"learning_rate": 0.0001674741717135732, |
|
"loss": 1.1399, |
|
"step": 1240 |
|
}, |
|
{ |
|
"epoch": 0.44, |
|
"eval_loss": 1.097598671913147, |
|
"eval_runtime": 243.0716, |
|
"eval_samples_per_second": 46.196, |
|
"eval_steps_per_second": 5.776, |
|
"step": 1240 |
|
}, |
|
{ |
|
"epoch": 0.44, |
|
"grad_norm": 0.9176976680755615, |
|
"learning_rate": 0.00016693979337370856, |
|
"loss": 1.0967, |
|
"step": 1245 |
|
}, |
|
{ |
|
"epoch": 0.45, |
|
"grad_norm": 1.0640438795089722, |
|
"learning_rate": 0.00016640541503384394, |
|
"loss": 1.0571, |
|
"step": 1250 |
|
}, |
|
{ |
|
"epoch": 0.45, |
|
"grad_norm": 0.9471141695976257, |
|
"learning_rate": 0.00016587103669397934, |
|
"loss": 1.0619, |
|
"step": 1255 |
|
}, |
|
{ |
|
"epoch": 0.45, |
|
"grad_norm": 0.984430730342865, |
|
"learning_rate": 0.0001653366583541147, |
|
"loss": 1.0867, |
|
"step": 1260 |
|
}, |
|
{ |
|
"epoch": 0.45, |
|
"eval_loss": 1.0952093601226807, |
|
"eval_runtime": 243.3103, |
|
"eval_samples_per_second": 46.151, |
|
"eval_steps_per_second": 5.77, |
|
"step": 1260 |
|
}, |
|
{ |
|
"epoch": 0.45, |
|
"grad_norm": 1.0785785913467407, |
|
"learning_rate": 0.0001648022800142501, |
|
"loss": 1.1129, |
|
"step": 1265 |
|
}, |
|
{ |
|
"epoch": 0.45, |
|
"grad_norm": 1.0736885070800781, |
|
"learning_rate": 0.00016426790167438544, |
|
"loss": 1.165, |
|
"step": 1270 |
|
}, |
|
{ |
|
"epoch": 0.45, |
|
"grad_norm": 2.7780587673187256, |
|
"learning_rate": 0.00016373352333452082, |
|
"loss": 1.1582, |
|
"step": 1275 |
|
}, |
|
{ |
|
"epoch": 0.46, |
|
"grad_norm": 1.2111455202102661, |
|
"learning_rate": 0.0001631991449946562, |
|
"loss": 1.1036, |
|
"step": 1280 |
|
}, |
|
{ |
|
"epoch": 0.46, |
|
"eval_loss": 1.097571849822998, |
|
"eval_runtime": 242.4227, |
|
"eval_samples_per_second": 46.32, |
|
"eval_steps_per_second": 5.792, |
|
"step": 1280 |
|
}, |
|
{ |
|
"epoch": 0.46, |
|
"grad_norm": 1.2604445219039917, |
|
"learning_rate": 0.00016266476665479157, |
|
"loss": 1.1973, |
|
"step": 1285 |
|
}, |
|
{ |
|
"epoch": 0.46, |
|
"grad_norm": 0.853370189666748, |
|
"learning_rate": 0.00016213038831492697, |
|
"loss": 1.0787, |
|
"step": 1290 |
|
}, |
|
{ |
|
"epoch": 0.46, |
|
"grad_norm": 0.808193027973175, |
|
"learning_rate": 0.00016159600997506232, |
|
"loss": 1.07, |
|
"step": 1295 |
|
}, |
|
{ |
|
"epoch": 0.46, |
|
"grad_norm": 1.1741416454315186, |
|
"learning_rate": 0.00016106163163519773, |
|
"loss": 1.127, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 0.46, |
|
"eval_loss": 1.0869979858398438, |
|
"eval_runtime": 243.2654, |
|
"eval_samples_per_second": 46.159, |
|
"eval_steps_per_second": 5.771, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 0.46, |
|
"grad_norm": 1.1077781915664673, |
|
"learning_rate": 0.00016052725329533308, |
|
"loss": 1.0511, |
|
"step": 1305 |
|
}, |
|
{ |
|
"epoch": 0.47, |
|
"grad_norm": 0.9855788350105286, |
|
"learning_rate": 0.00015999287495546845, |
|
"loss": 1.0973, |
|
"step": 1310 |
|
}, |
|
{ |
|
"epoch": 0.47, |
|
"grad_norm": 1.3673230409622192, |
|
"learning_rate": 0.00015945849661560383, |
|
"loss": 1.1164, |
|
"step": 1315 |
|
}, |
|
{ |
|
"epoch": 0.47, |
|
"grad_norm": 1.2939677238464355, |
|
"learning_rate": 0.0001589241182757392, |
|
"loss": 1.0769, |
|
"step": 1320 |
|
}, |
|
{ |
|
"epoch": 0.47, |
|
"eval_loss": 1.0876641273498535, |
|
"eval_runtime": 242.1557, |
|
"eval_samples_per_second": 46.371, |
|
"eval_steps_per_second": 5.798, |
|
"step": 1320 |
|
}, |
|
{ |
|
"epoch": 0.47, |
|
"grad_norm": 1.06653892993927, |
|
"learning_rate": 0.0001583897399358746, |
|
"loss": 1.0127, |
|
"step": 1325 |
|
}, |
|
{ |
|
"epoch": 0.47, |
|
"grad_norm": 0.9674182534217834, |
|
"learning_rate": 0.00015785536159600996, |
|
"loss": 1.0367, |
|
"step": 1330 |
|
}, |
|
{ |
|
"epoch": 0.48, |
|
"grad_norm": 0.662380576133728, |
|
"learning_rate": 0.00015732098325614536, |
|
"loss": 1.092, |
|
"step": 1335 |
|
}, |
|
{ |
|
"epoch": 0.48, |
|
"grad_norm": 1.4071582555770874, |
|
"learning_rate": 0.0001567866049162807, |
|
"loss": 1.1572, |
|
"step": 1340 |
|
}, |
|
{ |
|
"epoch": 0.48, |
|
"eval_loss": 1.0884110927581787, |
|
"eval_runtime": 242.9901, |
|
"eval_samples_per_second": 46.212, |
|
"eval_steps_per_second": 5.778, |
|
"step": 1340 |
|
}, |
|
{ |
|
"epoch": 0.48, |
|
"grad_norm": 0.9712650775909424, |
|
"learning_rate": 0.00015625222657641608, |
|
"loss": 1.0277, |
|
"step": 1345 |
|
}, |
|
{ |
|
"epoch": 0.48, |
|
"grad_norm": 1.070185899734497, |
|
"learning_rate": 0.00015571784823655146, |
|
"loss": 1.1811, |
|
"step": 1350 |
|
}, |
|
{ |
|
"epoch": 0.48, |
|
"grad_norm": 0.8417230248451233, |
|
"learning_rate": 0.00015518346989668684, |
|
"loss": 1.0426, |
|
"step": 1355 |
|
}, |
|
{ |
|
"epoch": 0.48, |
|
"grad_norm": 0.8787222504615784, |
|
"learning_rate": 0.0001546490915568222, |
|
"loss": 0.9953, |
|
"step": 1360 |
|
}, |
|
{ |
|
"epoch": 0.48, |
|
"eval_loss": 1.082081913948059, |
|
"eval_runtime": 243.0414, |
|
"eval_samples_per_second": 46.202, |
|
"eval_steps_per_second": 5.777, |
|
"step": 1360 |
|
}, |
|
{ |
|
"epoch": 0.49, |
|
"grad_norm": 0.9979585409164429, |
|
"learning_rate": 0.0001541147132169576, |
|
"loss": 1.0731, |
|
"step": 1365 |
|
}, |
|
{ |
|
"epoch": 0.49, |
|
"grad_norm": 1.108880639076233, |
|
"learning_rate": 0.000153580334877093, |
|
"loss": 1.0171, |
|
"step": 1370 |
|
}, |
|
{ |
|
"epoch": 0.49, |
|
"grad_norm": 1.2933177947998047, |
|
"learning_rate": 0.00015304595653722834, |
|
"loss": 1.0382, |
|
"step": 1375 |
|
}, |
|
{ |
|
"epoch": 0.49, |
|
"grad_norm": 1.237389087677002, |
|
"learning_rate": 0.00015251157819736372, |
|
"loss": 1.1542, |
|
"step": 1380 |
|
}, |
|
{ |
|
"epoch": 0.49, |
|
"eval_loss": 1.0847558975219727, |
|
"eval_runtime": 243.7745, |
|
"eval_samples_per_second": 46.063, |
|
"eval_steps_per_second": 5.759, |
|
"step": 1380 |
|
}, |
|
{ |
|
"epoch": 0.49, |
|
"grad_norm": 1.2602064609527588, |
|
"learning_rate": 0.0001519771998574991, |
|
"loss": 1.0919, |
|
"step": 1385 |
|
}, |
|
{ |
|
"epoch": 0.5, |
|
"grad_norm": 1.1393736600875854, |
|
"learning_rate": 0.00015144282151763447, |
|
"loss": 1.0688, |
|
"step": 1390 |
|
}, |
|
{ |
|
"epoch": 0.5, |
|
"grad_norm": 1.037117838859558, |
|
"learning_rate": 0.00015090844317776984, |
|
"loss": 1.062, |
|
"step": 1395 |
|
}, |
|
{ |
|
"epoch": 0.5, |
|
"grad_norm": 1.169497013092041, |
|
"learning_rate": 0.00015037406483790522, |
|
"loss": 1.1268, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 0.5, |
|
"eval_loss": 1.0819194316864014, |
|
"eval_runtime": 242.9727, |
|
"eval_samples_per_second": 46.215, |
|
"eval_steps_per_second": 5.778, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 0.5, |
|
"grad_norm": 0.9044406414031982, |
|
"learning_rate": 0.0001498396864980406, |
|
"loss": 1.0518, |
|
"step": 1405 |
|
}, |
|
{ |
|
"epoch": 0.5, |
|
"grad_norm": 0.8581820726394653, |
|
"learning_rate": 0.00014930530815817597, |
|
"loss": 1.0841, |
|
"step": 1410 |
|
}, |
|
{ |
|
"epoch": 0.5, |
|
"grad_norm": 0.9462387561798096, |
|
"learning_rate": 0.00014877092981831135, |
|
"loss": 1.0453, |
|
"step": 1415 |
|
}, |
|
{ |
|
"epoch": 0.51, |
|
"grad_norm": 0.9421359896659851, |
|
"learning_rate": 0.00014823655147844673, |
|
"loss": 1.0587, |
|
"step": 1420 |
|
}, |
|
{ |
|
"epoch": 0.51, |
|
"eval_loss": 1.0787664651870728, |
|
"eval_runtime": 242.7641, |
|
"eval_samples_per_second": 46.255, |
|
"eval_steps_per_second": 5.783, |
|
"step": 1420 |
|
}, |
|
{ |
|
"epoch": 0.51, |
|
"grad_norm": 1.2465983629226685, |
|
"learning_rate": 0.0001477021731385821, |
|
"loss": 1.0594, |
|
"step": 1425 |
|
}, |
|
{ |
|
"epoch": 0.51, |
|
"grad_norm": 1.5653964281082153, |
|
"learning_rate": 0.00014716779479871748, |
|
"loss": 1.1475, |
|
"step": 1430 |
|
}, |
|
{ |
|
"epoch": 0.51, |
|
"grad_norm": 1.5499756336212158, |
|
"learning_rate": 0.00014663341645885285, |
|
"loss": 1.0816, |
|
"step": 1435 |
|
}, |
|
{ |
|
"epoch": 0.51, |
|
"grad_norm": 1.3674159049987793, |
|
"learning_rate": 0.00014609903811898823, |
|
"loss": 1.1724, |
|
"step": 1440 |
|
}, |
|
{ |
|
"epoch": 0.51, |
|
"eval_loss": 1.0885018110275269, |
|
"eval_runtime": 241.7321, |
|
"eval_samples_per_second": 46.452, |
|
"eval_steps_per_second": 5.808, |
|
"step": 1440 |
|
}, |
|
{ |
|
"epoch": 0.51, |
|
"grad_norm": 0.9569036960601807, |
|
"learning_rate": 0.0001455646597791236, |
|
"loss": 1.1324, |
|
"step": 1445 |
|
}, |
|
{ |
|
"epoch": 0.52, |
|
"grad_norm": 0.7200804352760315, |
|
"learning_rate": 0.00014503028143925898, |
|
"loss": 1.1145, |
|
"step": 1450 |
|
}, |
|
{ |
|
"epoch": 0.52, |
|
"grad_norm": 1.5475003719329834, |
|
"learning_rate": 0.00014449590309939436, |
|
"loss": 1.1219, |
|
"step": 1455 |
|
}, |
|
{ |
|
"epoch": 0.52, |
|
"grad_norm": 0.9555292725563049, |
|
"learning_rate": 0.00014396152475952973, |
|
"loss": 1.0941, |
|
"step": 1460 |
|
}, |
|
{ |
|
"epoch": 0.52, |
|
"eval_loss": 1.0766382217407227, |
|
"eval_runtime": 242.1732, |
|
"eval_samples_per_second": 46.368, |
|
"eval_steps_per_second": 5.798, |
|
"step": 1460 |
|
}, |
|
{ |
|
"epoch": 0.52, |
|
"grad_norm": 0.847709596157074, |
|
"learning_rate": 0.0001434271464196651, |
|
"loss": 1.079, |
|
"step": 1465 |
|
}, |
|
{ |
|
"epoch": 0.52, |
|
"grad_norm": 0.9792425632476807, |
|
"learning_rate": 0.00014289276807980049, |
|
"loss": 1.051, |
|
"step": 1470 |
|
}, |
|
{ |
|
"epoch": 0.53, |
|
"grad_norm": 1.1272369623184204, |
|
"learning_rate": 0.00014235838973993586, |
|
"loss": 1.089, |
|
"step": 1475 |
|
}, |
|
{ |
|
"epoch": 0.53, |
|
"grad_norm": 1.2628141641616821, |
|
"learning_rate": 0.00014182401140007124, |
|
"loss": 1.0375, |
|
"step": 1480 |
|
}, |
|
{ |
|
"epoch": 0.53, |
|
"eval_loss": 1.078391432762146, |
|
"eval_runtime": 243.4192, |
|
"eval_samples_per_second": 46.13, |
|
"eval_steps_per_second": 5.768, |
|
"step": 1480 |
|
}, |
|
{ |
|
"epoch": 0.53, |
|
"grad_norm": 1.011176586151123, |
|
"learning_rate": 0.00014128963306020661, |
|
"loss": 1.0432, |
|
"step": 1485 |
|
}, |
|
{ |
|
"epoch": 0.53, |
|
"grad_norm": 0.7509900331497192, |
|
"learning_rate": 0.000140755254720342, |
|
"loss": 1.026, |
|
"step": 1490 |
|
}, |
|
{ |
|
"epoch": 0.53, |
|
"grad_norm": 0.8755093812942505, |
|
"learning_rate": 0.00014022087638047737, |
|
"loss": 1.0355, |
|
"step": 1495 |
|
}, |
|
{ |
|
"epoch": 0.53, |
|
"grad_norm": 1.0947227478027344, |
|
"learning_rate": 0.00013968649804061274, |
|
"loss": 1.1522, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 0.53, |
|
"eval_loss": 1.0746785402297974, |
|
"eval_runtime": 243.2633, |
|
"eval_samples_per_second": 46.16, |
|
"eval_steps_per_second": 5.772, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 0.54, |
|
"grad_norm": 1.576208472251892, |
|
"learning_rate": 0.00013915211970074812, |
|
"loss": 1.0697, |
|
"step": 1505 |
|
}, |
|
{ |
|
"epoch": 0.54, |
|
"grad_norm": 1.0187541246414185, |
|
"learning_rate": 0.0001386177413608835, |
|
"loss": 1.0736, |
|
"step": 1510 |
|
}, |
|
{ |
|
"epoch": 0.54, |
|
"grad_norm": 0.9235592484474182, |
|
"learning_rate": 0.00013808336302101887, |
|
"loss": 1.066, |
|
"step": 1515 |
|
}, |
|
{ |
|
"epoch": 0.54, |
|
"grad_norm": 1.0646727085113525, |
|
"learning_rate": 0.00013754898468115425, |
|
"loss": 1.0208, |
|
"step": 1520 |
|
}, |
|
{ |
|
"epoch": 0.54, |
|
"eval_loss": 1.0796867609024048, |
|
"eval_runtime": 242.7054, |
|
"eval_samples_per_second": 46.266, |
|
"eval_steps_per_second": 5.785, |
|
"step": 1520 |
|
}, |
|
{ |
|
"epoch": 0.54, |
|
"grad_norm": 0.865246057510376, |
|
"learning_rate": 0.00013701460634128962, |
|
"loss": 1.0993, |
|
"step": 1525 |
|
}, |
|
{ |
|
"epoch": 0.55, |
|
"grad_norm": 1.3428738117218018, |
|
"learning_rate": 0.000136480228001425, |
|
"loss": 1.1218, |
|
"step": 1530 |
|
}, |
|
{ |
|
"epoch": 0.55, |
|
"grad_norm": 1.060223937034607, |
|
"learning_rate": 0.00013594584966156037, |
|
"loss": 1.0701, |
|
"step": 1535 |
|
}, |
|
{ |
|
"epoch": 0.55, |
|
"grad_norm": 1.3685177564620972, |
|
"learning_rate": 0.00013541147132169575, |
|
"loss": 1.1178, |
|
"step": 1540 |
|
}, |
|
{ |
|
"epoch": 0.55, |
|
"eval_loss": 1.0757360458374023, |
|
"eval_runtime": 242.3944, |
|
"eval_samples_per_second": 46.325, |
|
"eval_steps_per_second": 5.792, |
|
"step": 1540 |
|
}, |
|
{ |
|
"epoch": 0.55, |
|
"grad_norm": 1.018256425857544, |
|
"learning_rate": 0.00013487709298183113, |
|
"loss": 1.0114, |
|
"step": 1545 |
|
}, |
|
{ |
|
"epoch": 0.55, |
|
"grad_norm": 0.9656644463539124, |
|
"learning_rate": 0.0001343427146419665, |
|
"loss": 1.0382, |
|
"step": 1550 |
|
}, |
|
{ |
|
"epoch": 0.55, |
|
"grad_norm": 0.9408764243125916, |
|
"learning_rate": 0.00013380833630210188, |
|
"loss": 1.0636, |
|
"step": 1555 |
|
}, |
|
{ |
|
"epoch": 0.56, |
|
"grad_norm": 1.3006953001022339, |
|
"learning_rate": 0.00013327395796223726, |
|
"loss": 1.0406, |
|
"step": 1560 |
|
}, |
|
{ |
|
"epoch": 0.56, |
|
"eval_loss": 1.068661093711853, |
|
"eval_runtime": 243.1912, |
|
"eval_samples_per_second": 46.174, |
|
"eval_steps_per_second": 5.773, |
|
"step": 1560 |
|
}, |
|
{ |
|
"epoch": 0.56, |
|
"grad_norm": 0.797767698764801, |
|
"learning_rate": 0.00013273957962237263, |
|
"loss": 1.0338, |
|
"step": 1565 |
|
}, |
|
{ |
|
"epoch": 0.56, |
|
"grad_norm": 0.7967373728752136, |
|
"learning_rate": 0.000132205201282508, |
|
"loss": 1.0541, |
|
"step": 1570 |
|
}, |
|
{ |
|
"epoch": 0.56, |
|
"grad_norm": 1.1280144453048706, |
|
"learning_rate": 0.00013167082294264338, |
|
"loss": 1.0926, |
|
"step": 1575 |
|
}, |
|
{ |
|
"epoch": 0.56, |
|
"grad_norm": 0.9334732294082642, |
|
"learning_rate": 0.00013113644460277876, |
|
"loss": 1.073, |
|
"step": 1580 |
|
}, |
|
{ |
|
"epoch": 0.56, |
|
"eval_loss": 1.0698789358139038, |
|
"eval_runtime": 243.8062, |
|
"eval_samples_per_second": 46.057, |
|
"eval_steps_per_second": 5.759, |
|
"step": 1580 |
|
}, |
|
{ |
|
"epoch": 0.56, |
|
"grad_norm": 0.763647198677063, |
|
"learning_rate": 0.00013060206626291414, |
|
"loss": 1.0405, |
|
"step": 1585 |
|
}, |
|
{ |
|
"epoch": 0.57, |
|
"grad_norm": 1.2175519466400146, |
|
"learning_rate": 0.0001300676879230495, |
|
"loss": 1.0477, |
|
"step": 1590 |
|
}, |
|
{ |
|
"epoch": 0.57, |
|
"grad_norm": 0.8948994278907776, |
|
"learning_rate": 0.0001295333095831849, |
|
"loss": 1.0485, |
|
"step": 1595 |
|
}, |
|
{ |
|
"epoch": 0.57, |
|
"grad_norm": 1.2777072191238403, |
|
"learning_rate": 0.00012899893124332026, |
|
"loss": 1.1138, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 0.57, |
|
"eval_loss": 1.0665650367736816, |
|
"eval_runtime": 243.1663, |
|
"eval_samples_per_second": 46.178, |
|
"eval_steps_per_second": 5.774, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 0.57, |
|
"grad_norm": 1.6561847925186157, |
|
"learning_rate": 0.00012846455290345564, |
|
"loss": 1.038, |
|
"step": 1605 |
|
}, |
|
{ |
|
"epoch": 0.57, |
|
"grad_norm": 1.475900650024414, |
|
"learning_rate": 0.00012793017456359102, |
|
"loss": 1.0814, |
|
"step": 1610 |
|
}, |
|
{ |
|
"epoch": 0.58, |
|
"grad_norm": 2.2426915168762207, |
|
"learning_rate": 0.0001273957962237264, |
|
"loss": 1.1084, |
|
"step": 1615 |
|
}, |
|
{ |
|
"epoch": 0.58, |
|
"grad_norm": 0.9208836555480957, |
|
"learning_rate": 0.00012686141788386177, |
|
"loss": 1.0727, |
|
"step": 1620 |
|
}, |
|
{ |
|
"epoch": 0.58, |
|
"eval_loss": 1.0669995546340942, |
|
"eval_runtime": 242.7171, |
|
"eval_samples_per_second": 46.264, |
|
"eval_steps_per_second": 5.785, |
|
"step": 1620 |
|
}, |
|
{ |
|
"epoch": 0.58, |
|
"grad_norm": 1.1481331586837769, |
|
"learning_rate": 0.00012632703954399714, |
|
"loss": 1.0752, |
|
"step": 1625 |
|
}, |
|
{ |
|
"epoch": 0.58, |
|
"grad_norm": 0.9362524747848511, |
|
"learning_rate": 0.00012579266120413252, |
|
"loss": 1.0576, |
|
"step": 1630 |
|
}, |
|
{ |
|
"epoch": 0.58, |
|
"grad_norm": 1.0292400121688843, |
|
"learning_rate": 0.0001252582828642679, |
|
"loss": 1.1245, |
|
"step": 1635 |
|
}, |
|
{ |
|
"epoch": 0.58, |
|
"grad_norm": 0.7851674556732178, |
|
"learning_rate": 0.00012472390452440327, |
|
"loss": 1.0422, |
|
"step": 1640 |
|
}, |
|
{ |
|
"epoch": 0.58, |
|
"eval_loss": 1.065503716468811, |
|
"eval_runtime": 242.4737, |
|
"eval_samples_per_second": 46.31, |
|
"eval_steps_per_second": 5.79, |
|
"step": 1640 |
|
}, |
|
{ |
|
"epoch": 0.59, |
|
"grad_norm": 1.1515681743621826, |
|
"learning_rate": 0.00012418952618453862, |
|
"loss": 1.1243, |
|
"step": 1645 |
|
}, |
|
{ |
|
"epoch": 0.59, |
|
"grad_norm": 0.7999959588050842, |
|
"learning_rate": 0.00012365514784467402, |
|
"loss": 1.0751, |
|
"step": 1650 |
|
}, |
|
{ |
|
"epoch": 0.59, |
|
"grad_norm": 1.5812344551086426, |
|
"learning_rate": 0.0001231207695048094, |
|
"loss": 1.064, |
|
"step": 1655 |
|
}, |
|
{ |
|
"epoch": 0.59, |
|
"grad_norm": 1.4247961044311523, |
|
"learning_rate": 0.00012258639116494478, |
|
"loss": 1.0476, |
|
"step": 1660 |
|
}, |
|
{ |
|
"epoch": 0.59, |
|
"eval_loss": 1.065142035484314, |
|
"eval_runtime": 243.1043, |
|
"eval_samples_per_second": 46.19, |
|
"eval_steps_per_second": 5.775, |
|
"step": 1660 |
|
}, |
|
{ |
|
"epoch": 0.59, |
|
"grad_norm": 0.6534472703933716, |
|
"learning_rate": 0.00012205201282508015, |
|
"loss": 1.0618, |
|
"step": 1665 |
|
}, |
|
{ |
|
"epoch": 0.59, |
|
"grad_norm": 0.7015573382377625, |
|
"learning_rate": 0.00012151763448521553, |
|
"loss": 1.0167, |
|
"step": 1670 |
|
}, |
|
{ |
|
"epoch": 0.6, |
|
"grad_norm": 1.0347164869308472, |
|
"learning_rate": 0.0001209832561453509, |
|
"loss": 1.1369, |
|
"step": 1675 |
|
}, |
|
{ |
|
"epoch": 0.6, |
|
"grad_norm": 0.8698253035545349, |
|
"learning_rate": 0.00012044887780548627, |
|
"loss": 1.0642, |
|
"step": 1680 |
|
}, |
|
{ |
|
"epoch": 0.6, |
|
"eval_loss": 1.0631712675094604, |
|
"eval_runtime": 242.718, |
|
"eval_samples_per_second": 46.264, |
|
"eval_steps_per_second": 5.784, |
|
"step": 1680 |
|
}, |
|
{ |
|
"epoch": 0.6, |
|
"grad_norm": 1.0342012643814087, |
|
"learning_rate": 0.00011991449946562164, |
|
"loss": 1.1106, |
|
"step": 1685 |
|
}, |
|
{ |
|
"epoch": 0.6, |
|
"grad_norm": 0.7427274584770203, |
|
"learning_rate": 0.00011938012112575702, |
|
"loss": 1.0906, |
|
"step": 1690 |
|
}, |
|
{ |
|
"epoch": 0.6, |
|
"grad_norm": 0.8352236151695251, |
|
"learning_rate": 0.0001188457427858924, |
|
"loss": 1.0482, |
|
"step": 1695 |
|
}, |
|
{ |
|
"epoch": 0.61, |
|
"grad_norm": 0.9208966493606567, |
|
"learning_rate": 0.00011831136444602779, |
|
"loss": 1.0269, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 0.61, |
|
"eval_loss": 1.061272144317627, |
|
"eval_runtime": 243.2083, |
|
"eval_samples_per_second": 46.17, |
|
"eval_steps_per_second": 5.773, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 0.61, |
|
"grad_norm": 0.7243204116821289, |
|
"learning_rate": 0.00011777698610616316, |
|
"loss": 1.0752, |
|
"step": 1705 |
|
}, |
|
{ |
|
"epoch": 0.61, |
|
"grad_norm": 1.1476917266845703, |
|
"learning_rate": 0.00011724260776629854, |
|
"loss": 1.0978, |
|
"step": 1710 |
|
}, |
|
{ |
|
"epoch": 0.61, |
|
"grad_norm": 0.6801986694335938, |
|
"learning_rate": 0.0001167082294264339, |
|
"loss": 1.0752, |
|
"step": 1715 |
|
}, |
|
{ |
|
"epoch": 0.61, |
|
"grad_norm": 0.7917805314064026, |
|
"learning_rate": 0.00011617385108656928, |
|
"loss": 1.0756, |
|
"step": 1720 |
|
}, |
|
{ |
|
"epoch": 0.61, |
|
"eval_loss": 1.0615525245666504, |
|
"eval_runtime": 242.3231, |
|
"eval_samples_per_second": 46.339, |
|
"eval_steps_per_second": 5.794, |
|
"step": 1720 |
|
}, |
|
{ |
|
"epoch": 0.61, |
|
"grad_norm": 0.9897934794425964, |
|
"learning_rate": 0.00011563947274670465, |
|
"loss": 1.0492, |
|
"step": 1725 |
|
}, |
|
{ |
|
"epoch": 0.62, |
|
"grad_norm": 0.9467993378639221, |
|
"learning_rate": 0.00011510509440684003, |
|
"loss": 1.023, |
|
"step": 1730 |
|
}, |
|
{ |
|
"epoch": 0.62, |
|
"grad_norm": 0.7573145627975464, |
|
"learning_rate": 0.0001145707160669754, |
|
"loss": 1.0759, |
|
"step": 1735 |
|
}, |
|
{ |
|
"epoch": 0.62, |
|
"grad_norm": 0.9526081085205078, |
|
"learning_rate": 0.0001140363377271108, |
|
"loss": 1.1442, |
|
"step": 1740 |
|
}, |
|
{ |
|
"epoch": 0.62, |
|
"eval_loss": 1.0587379932403564, |
|
"eval_runtime": 241.9784, |
|
"eval_samples_per_second": 46.405, |
|
"eval_steps_per_second": 5.802, |
|
"step": 1740 |
|
}, |
|
{ |
|
"epoch": 0.62, |
|
"grad_norm": 0.9118272662162781, |
|
"learning_rate": 0.00011350195938724617, |
|
"loss": 1.1047, |
|
"step": 1745 |
|
}, |
|
{ |
|
"epoch": 0.62, |
|
"grad_norm": 0.951062798500061, |
|
"learning_rate": 0.00011296758104738153, |
|
"loss": 1.0981, |
|
"step": 1750 |
|
}, |
|
{ |
|
"epoch": 0.63, |
|
"grad_norm": 0.9109734296798706, |
|
"learning_rate": 0.00011243320270751691, |
|
"loss": 1.0994, |
|
"step": 1755 |
|
}, |
|
{ |
|
"epoch": 0.63, |
|
"grad_norm": 0.8266547918319702, |
|
"learning_rate": 0.00011189882436765228, |
|
"loss": 1.0417, |
|
"step": 1760 |
|
}, |
|
{ |
|
"epoch": 0.63, |
|
"eval_loss": 1.0577157735824585, |
|
"eval_runtime": 243.6894, |
|
"eval_samples_per_second": 46.079, |
|
"eval_steps_per_second": 5.761, |
|
"step": 1760 |
|
}, |
|
{ |
|
"epoch": 0.63, |
|
"grad_norm": 1.2194539308547974, |
|
"learning_rate": 0.00011136444602778766, |
|
"loss": 1.1562, |
|
"step": 1765 |
|
}, |
|
{ |
|
"epoch": 0.63, |
|
"grad_norm": 0.9784891605377197, |
|
"learning_rate": 0.00011083006768792304, |
|
"loss": 1.08, |
|
"step": 1770 |
|
}, |
|
{ |
|
"epoch": 0.63, |
|
"grad_norm": 0.8719102740287781, |
|
"learning_rate": 0.00011029568934805841, |
|
"loss": 1.0187, |
|
"step": 1775 |
|
}, |
|
{ |
|
"epoch": 0.63, |
|
"grad_norm": 0.679275631904602, |
|
"learning_rate": 0.0001097613110081938, |
|
"loss": 1.0336, |
|
"step": 1780 |
|
}, |
|
{ |
|
"epoch": 0.63, |
|
"eval_loss": 1.0579383373260498, |
|
"eval_runtime": 242.7508, |
|
"eval_samples_per_second": 46.257, |
|
"eval_steps_per_second": 5.784, |
|
"step": 1780 |
|
}, |
|
{ |
|
"epoch": 0.64, |
|
"grad_norm": 0.8664696216583252, |
|
"learning_rate": 0.00010922693266832918, |
|
"loss": 1.0849, |
|
"step": 1785 |
|
}, |
|
{ |
|
"epoch": 0.64, |
|
"grad_norm": 0.8156464099884033, |
|
"learning_rate": 0.00010869255432846454, |
|
"loss": 1.0582, |
|
"step": 1790 |
|
}, |
|
{ |
|
"epoch": 0.64, |
|
"grad_norm": 0.7599331140518188, |
|
"learning_rate": 0.00010815817598859992, |
|
"loss": 1.0747, |
|
"step": 1795 |
|
}, |
|
{ |
|
"epoch": 0.64, |
|
"grad_norm": 1.429771065711975, |
|
"learning_rate": 0.0001076237976487353, |
|
"loss": 1.0762, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 0.64, |
|
"eval_loss": 1.0569549798965454, |
|
"eval_runtime": 241.936, |
|
"eval_samples_per_second": 46.413, |
|
"eval_steps_per_second": 5.803, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 0.64, |
|
"grad_norm": 1.29656183719635, |
|
"learning_rate": 0.00010708941930887067, |
|
"loss": 1.1023, |
|
"step": 1805 |
|
}, |
|
{ |
|
"epoch": 0.64, |
|
"grad_norm": 1.0881860256195068, |
|
"learning_rate": 0.00010655504096900605, |
|
"loss": 1.081, |
|
"step": 1810 |
|
}, |
|
{ |
|
"epoch": 0.65, |
|
"grad_norm": 1.3329012393951416, |
|
"learning_rate": 0.00010602066262914144, |
|
"loss": 1.1507, |
|
"step": 1815 |
|
}, |
|
{ |
|
"epoch": 0.65, |
|
"grad_norm": 1.3197088241577148, |
|
"learning_rate": 0.00010548628428927681, |
|
"loss": 1.1399, |
|
"step": 1820 |
|
}, |
|
{ |
|
"epoch": 0.65, |
|
"eval_loss": 1.059924840927124, |
|
"eval_runtime": 243.3976, |
|
"eval_samples_per_second": 46.134, |
|
"eval_steps_per_second": 5.768, |
|
"step": 1820 |
|
}, |
|
{ |
|
"epoch": 0.65, |
|
"grad_norm": 0.7774195075035095, |
|
"learning_rate": 0.00010495190594941217, |
|
"loss": 0.9988, |
|
"step": 1825 |
|
}, |
|
{ |
|
"epoch": 0.65, |
|
"grad_norm": 0.8935036063194275, |
|
"learning_rate": 0.00010441752760954755, |
|
"loss": 1.0987, |
|
"step": 1830 |
|
}, |
|
{ |
|
"epoch": 0.65, |
|
"grad_norm": 1.0009044408798218, |
|
"learning_rate": 0.00010388314926968293, |
|
"loss": 0.9937, |
|
"step": 1835 |
|
}, |
|
{ |
|
"epoch": 0.66, |
|
"grad_norm": 1.0484709739685059, |
|
"learning_rate": 0.0001033487709298183, |
|
"loss": 1.0625, |
|
"step": 1840 |
|
}, |
|
{ |
|
"epoch": 0.66, |
|
"eval_loss": 1.0552020072937012, |
|
"eval_runtime": 243.1169, |
|
"eval_samples_per_second": 46.188, |
|
"eval_steps_per_second": 5.775, |
|
"step": 1840 |
|
}, |
|
{ |
|
"epoch": 0.66, |
|
"grad_norm": 0.8426492214202881, |
|
"learning_rate": 0.00010281439258995368, |
|
"loss": 1.1666, |
|
"step": 1845 |
|
}, |
|
{ |
|
"epoch": 0.66, |
|
"grad_norm": 0.8092815279960632, |
|
"learning_rate": 0.00010228001425008905, |
|
"loss": 1.033, |
|
"step": 1850 |
|
}, |
|
{ |
|
"epoch": 0.66, |
|
"grad_norm": 0.9522695541381836, |
|
"learning_rate": 0.00010174563591022444, |
|
"loss": 1.0193, |
|
"step": 1855 |
|
}, |
|
{ |
|
"epoch": 0.66, |
|
"grad_norm": 0.704237699508667, |
|
"learning_rate": 0.00010121125757035979, |
|
"loss": 1.0285, |
|
"step": 1860 |
|
}, |
|
{ |
|
"epoch": 0.66, |
|
"eval_loss": 1.0537731647491455, |
|
"eval_runtime": 241.9108, |
|
"eval_samples_per_second": 46.418, |
|
"eval_steps_per_second": 5.804, |
|
"step": 1860 |
|
}, |
|
{ |
|
"epoch": 0.66, |
|
"grad_norm": 1.1165724992752075, |
|
"learning_rate": 0.00010067687923049518, |
|
"loss": 1.087, |
|
"step": 1865 |
|
}, |
|
{ |
|
"epoch": 0.67, |
|
"grad_norm": 0.6024936437606812, |
|
"learning_rate": 0.00010014250089063056, |
|
"loss": 1.0875, |
|
"step": 1870 |
|
}, |
|
{ |
|
"epoch": 0.67, |
|
"grad_norm": 1.0185704231262207, |
|
"learning_rate": 9.960812255076593e-05, |
|
"loss": 1.0953, |
|
"step": 1875 |
|
}, |
|
{ |
|
"epoch": 0.67, |
|
"grad_norm": 0.7029600739479065, |
|
"learning_rate": 9.907374421090131e-05, |
|
"loss": 1.0705, |
|
"step": 1880 |
|
}, |
|
{ |
|
"epoch": 0.67, |
|
"eval_loss": 1.0541706085205078, |
|
"eval_runtime": 243.1191, |
|
"eval_samples_per_second": 46.187, |
|
"eval_steps_per_second": 5.775, |
|
"step": 1880 |
|
}, |
|
{ |
|
"epoch": 0.67, |
|
"grad_norm": 0.8626222610473633, |
|
"learning_rate": 9.853936587103669e-05, |
|
"loss": 1.0528, |
|
"step": 1885 |
|
}, |
|
{ |
|
"epoch": 0.67, |
|
"grad_norm": 0.8519126176834106, |
|
"learning_rate": 9.800498753117206e-05, |
|
"loss": 1.0469, |
|
"step": 1890 |
|
}, |
|
{ |
|
"epoch": 0.68, |
|
"grad_norm": 0.7846513390541077, |
|
"learning_rate": 9.747060919130743e-05, |
|
"loss": 1.1003, |
|
"step": 1895 |
|
}, |
|
{ |
|
"epoch": 0.68, |
|
"grad_norm": 0.7816912531852722, |
|
"learning_rate": 9.693623085144281e-05, |
|
"loss": 1.0338, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 0.68, |
|
"eval_loss": 1.0513091087341309, |
|
"eval_runtime": 241.7979, |
|
"eval_samples_per_second": 46.44, |
|
"eval_steps_per_second": 5.807, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 0.68, |
|
"grad_norm": 0.9167221188545227, |
|
"learning_rate": 9.640185251157819e-05, |
|
"loss": 1.0941, |
|
"step": 1905 |
|
}, |
|
{ |
|
"epoch": 0.68, |
|
"grad_norm": 0.6667194366455078, |
|
"learning_rate": 9.586747417171357e-05, |
|
"loss": 1.0701, |
|
"step": 1910 |
|
}, |
|
{ |
|
"epoch": 0.68, |
|
"grad_norm": 0.9745165705680847, |
|
"learning_rate": 9.533309583184894e-05, |
|
"loss": 1.0187, |
|
"step": 1915 |
|
}, |
|
{ |
|
"epoch": 0.68, |
|
"grad_norm": 0.6843913197517395, |
|
"learning_rate": 9.479871749198432e-05, |
|
"loss": 1.0812, |
|
"step": 1920 |
|
}, |
|
{ |
|
"epoch": 0.68, |
|
"eval_loss": 1.0521106719970703, |
|
"eval_runtime": 243.4663, |
|
"eval_samples_per_second": 46.121, |
|
"eval_steps_per_second": 5.767, |
|
"step": 1920 |
|
}, |
|
{ |
|
"epoch": 0.69, |
|
"grad_norm": 1.2488641738891602, |
|
"learning_rate": 9.42643391521197e-05, |
|
"loss": 1.0166, |
|
"step": 1925 |
|
}, |
|
{ |
|
"epoch": 0.69, |
|
"grad_norm": 0.964064359664917, |
|
"learning_rate": 9.372996081225506e-05, |
|
"loss": 1.0181, |
|
"step": 1930 |
|
}, |
|
{ |
|
"epoch": 0.69, |
|
"grad_norm": 0.8608153462409973, |
|
"learning_rate": 9.319558247239043e-05, |
|
"loss": 1.0447, |
|
"step": 1935 |
|
}, |
|
{ |
|
"epoch": 0.69, |
|
"grad_norm": 0.841242790222168, |
|
"learning_rate": 9.266120413252582e-05, |
|
"loss": 1.0161, |
|
"step": 1940 |
|
}, |
|
{ |
|
"epoch": 0.69, |
|
"eval_loss": 1.0501881837844849, |
|
"eval_runtime": 242.961, |
|
"eval_samples_per_second": 46.217, |
|
"eval_steps_per_second": 5.779, |
|
"step": 1940 |
|
}, |
|
{ |
|
"epoch": 0.69, |
|
"grad_norm": 1.064781904220581, |
|
"learning_rate": 9.21268257926612e-05, |
|
"loss": 1.0696, |
|
"step": 1945 |
|
}, |
|
{ |
|
"epoch": 0.69, |
|
"grad_norm": 1.0806126594543457, |
|
"learning_rate": 9.159244745279658e-05, |
|
"loss": 1.0982, |
|
"step": 1950 |
|
}, |
|
{ |
|
"epoch": 0.7, |
|
"grad_norm": 1.3890222311019897, |
|
"learning_rate": 9.105806911293195e-05, |
|
"loss": 1.1408, |
|
"step": 1955 |
|
}, |
|
{ |
|
"epoch": 0.7, |
|
"grad_norm": 0.9842618107795715, |
|
"learning_rate": 9.052369077306733e-05, |
|
"loss": 1.0007, |
|
"step": 1960 |
|
}, |
|
{ |
|
"epoch": 0.7, |
|
"eval_loss": 1.0495084524154663, |
|
"eval_runtime": 242.9243, |
|
"eval_samples_per_second": 46.224, |
|
"eval_steps_per_second": 5.78, |
|
"step": 1960 |
|
}, |
|
{ |
|
"epoch": 0.7, |
|
"grad_norm": 1.2397950887680054, |
|
"learning_rate": 8.998931243320269e-05, |
|
"loss": 1.0121, |
|
"step": 1965 |
|
}, |
|
{ |
|
"epoch": 0.7, |
|
"grad_norm": 0.7010890245437622, |
|
"learning_rate": 8.945493409333807e-05, |
|
"loss": 1.0179, |
|
"step": 1970 |
|
}, |
|
{ |
|
"epoch": 0.7, |
|
"grad_norm": 0.7378676533699036, |
|
"learning_rate": 8.892055575347344e-05, |
|
"loss": 1.0801, |
|
"step": 1975 |
|
}, |
|
{ |
|
"epoch": 0.71, |
|
"grad_norm": 1.0177286863327026, |
|
"learning_rate": 8.838617741360883e-05, |
|
"loss": 1.1177, |
|
"step": 1980 |
|
}, |
|
{ |
|
"epoch": 0.71, |
|
"eval_loss": 1.0490984916687012, |
|
"eval_runtime": 243.5095, |
|
"eval_samples_per_second": 46.113, |
|
"eval_steps_per_second": 5.766, |
|
"step": 1980 |
|
}, |
|
{ |
|
"epoch": 0.71, |
|
"grad_norm": 0.9546262621879578, |
|
"learning_rate": 8.785179907374421e-05, |
|
"loss": 1.062, |
|
"step": 1985 |
|
}, |
|
{ |
|
"epoch": 0.71, |
|
"grad_norm": 0.7818737030029297, |
|
"learning_rate": 8.731742073387958e-05, |
|
"loss": 0.9988, |
|
"step": 1990 |
|
}, |
|
{ |
|
"epoch": 0.71, |
|
"grad_norm": 1.0717649459838867, |
|
"learning_rate": 8.678304239401496e-05, |
|
"loss": 1.0817, |
|
"step": 1995 |
|
}, |
|
{ |
|
"epoch": 0.71, |
|
"grad_norm": 0.8660954236984253, |
|
"learning_rate": 8.624866405415032e-05, |
|
"loss": 1.0646, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 0.71, |
|
"eval_loss": 1.050535798072815, |
|
"eval_runtime": 242.3199, |
|
"eval_samples_per_second": 46.34, |
|
"eval_steps_per_second": 5.794, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 0.71, |
|
"grad_norm": 0.8482767939567566, |
|
"learning_rate": 8.57142857142857e-05, |
|
"loss": 1.0838, |
|
"step": 2005 |
|
}, |
|
{ |
|
"epoch": 0.72, |
|
"grad_norm": 0.7331980466842651, |
|
"learning_rate": 8.517990737442108e-05, |
|
"loss": 1.1195, |
|
"step": 2010 |
|
}, |
|
{ |
|
"epoch": 0.72, |
|
"grad_norm": 0.8289487361907959, |
|
"learning_rate": 8.464552903455646e-05, |
|
"loss": 1.0255, |
|
"step": 2015 |
|
}, |
|
{ |
|
"epoch": 0.72, |
|
"grad_norm": 1.1590027809143066, |
|
"learning_rate": 8.411115069469184e-05, |
|
"loss": 1.0598, |
|
"step": 2020 |
|
}, |
|
{ |
|
"epoch": 0.72, |
|
"eval_loss": 1.0510787963867188, |
|
"eval_runtime": 242.7439, |
|
"eval_samples_per_second": 46.259, |
|
"eval_steps_per_second": 5.784, |
|
"step": 2020 |
|
}, |
|
{ |
|
"epoch": 0.72, |
|
"grad_norm": 0.8864056468009949, |
|
"learning_rate": 8.357677235482722e-05, |
|
"loss": 1.0605, |
|
"step": 2025 |
|
}, |
|
{ |
|
"epoch": 0.72, |
|
"grad_norm": 1.1364105939865112, |
|
"learning_rate": 8.304239401496259e-05, |
|
"loss": 1.0255, |
|
"step": 2030 |
|
}, |
|
{ |
|
"epoch": 0.72, |
|
"grad_norm": 1.044231653213501, |
|
"learning_rate": 8.250801567509796e-05, |
|
"loss": 1.1362, |
|
"step": 2035 |
|
}, |
|
{ |
|
"epoch": 0.73, |
|
"grad_norm": 1.038897156715393, |
|
"learning_rate": 8.197363733523333e-05, |
|
"loss": 1.1022, |
|
"step": 2040 |
|
}, |
|
{ |
|
"epoch": 0.73, |
|
"eval_loss": 1.0472804307937622, |
|
"eval_runtime": 242.5807, |
|
"eval_samples_per_second": 46.29, |
|
"eval_steps_per_second": 5.788, |
|
"step": 2040 |
|
}, |
|
{ |
|
"epoch": 0.73, |
|
"grad_norm": 1.0197745561599731, |
|
"learning_rate": 8.143925899536871e-05, |
|
"loss": 1.0834, |
|
"step": 2045 |
|
}, |
|
{ |
|
"epoch": 0.73, |
|
"grad_norm": 1.3458046913146973, |
|
"learning_rate": 8.090488065550408e-05, |
|
"loss": 1.0143, |
|
"step": 2050 |
|
}, |
|
{ |
|
"epoch": 0.73, |
|
"grad_norm": 0.6543688774108887, |
|
"learning_rate": 8.037050231563947e-05, |
|
"loss": 1.0277, |
|
"step": 2055 |
|
}, |
|
{ |
|
"epoch": 0.73, |
|
"grad_norm": 0.9424459338188171, |
|
"learning_rate": 7.983612397577485e-05, |
|
"loss": 1.0171, |
|
"step": 2060 |
|
}, |
|
{ |
|
"epoch": 0.73, |
|
"eval_loss": 1.0462262630462646, |
|
"eval_runtime": 242.9409, |
|
"eval_samples_per_second": 46.221, |
|
"eval_steps_per_second": 5.779, |
|
"step": 2060 |
|
}, |
|
{ |
|
"epoch": 0.74, |
|
"grad_norm": 0.9467738270759583, |
|
"learning_rate": 7.930174563591023e-05, |
|
"loss": 1.0026, |
|
"step": 2065 |
|
}, |
|
{ |
|
"epoch": 0.74, |
|
"grad_norm": 0.7832115888595581, |
|
"learning_rate": 7.876736729604559e-05, |
|
"loss": 1.0163, |
|
"step": 2070 |
|
}, |
|
{ |
|
"epoch": 0.74, |
|
"grad_norm": 0.8243778347969055, |
|
"learning_rate": 7.823298895618096e-05, |
|
"loss": 1.0625, |
|
"step": 2075 |
|
}, |
|
{ |
|
"epoch": 0.74, |
|
"grad_norm": 0.7256205081939697, |
|
"learning_rate": 7.769861061631634e-05, |
|
"loss": 1.0061, |
|
"step": 2080 |
|
}, |
|
{ |
|
"epoch": 0.74, |
|
"eval_loss": 1.0456407070159912, |
|
"eval_runtime": 242.46, |
|
"eval_samples_per_second": 46.313, |
|
"eval_steps_per_second": 5.791, |
|
"step": 2080 |
|
}, |
|
{ |
|
"epoch": 0.74, |
|
"grad_norm": 0.847520112991333, |
|
"learning_rate": 7.716423227645172e-05, |
|
"loss": 1.0741, |
|
"step": 2085 |
|
}, |
|
{ |
|
"epoch": 0.74, |
|
"grad_norm": 0.7502401471138, |
|
"learning_rate": 7.662985393658709e-05, |
|
"loss": 1.0904, |
|
"step": 2090 |
|
}, |
|
{ |
|
"epoch": 0.75, |
|
"grad_norm": 0.865737795829773, |
|
"learning_rate": 7.609547559672248e-05, |
|
"loss": 1.1138, |
|
"step": 2095 |
|
}, |
|
{ |
|
"epoch": 0.75, |
|
"grad_norm": 1.011161208152771, |
|
"learning_rate": 7.556109725685786e-05, |
|
"loss": 1.137, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 0.75, |
|
"eval_loss": 1.0469642877578735, |
|
"eval_runtime": 243.2507, |
|
"eval_samples_per_second": 46.162, |
|
"eval_steps_per_second": 5.772, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 0.75, |
|
"grad_norm": 1.1329538822174072, |
|
"learning_rate": 7.502671891699322e-05, |
|
"loss": 1.0289, |
|
"step": 2105 |
|
}, |
|
{ |
|
"epoch": 0.75, |
|
"grad_norm": 0.9494741559028625, |
|
"learning_rate": 7.449234057712861e-05, |
|
"loss": 0.9842, |
|
"step": 2110 |
|
}, |
|
{ |
|
"epoch": 0.75, |
|
"grad_norm": 0.8564668893814087, |
|
"learning_rate": 7.395796223726397e-05, |
|
"loss": 1.0844, |
|
"step": 2115 |
|
}, |
|
{ |
|
"epoch": 0.76, |
|
"grad_norm": 0.7696930170059204, |
|
"learning_rate": 7.342358389739935e-05, |
|
"loss": 1.0693, |
|
"step": 2120 |
|
}, |
|
{ |
|
"epoch": 0.76, |
|
"eval_loss": 1.0445717573165894, |
|
"eval_runtime": 243.4168, |
|
"eval_samples_per_second": 46.131, |
|
"eval_steps_per_second": 5.768, |
|
"step": 2120 |
|
}, |
|
{ |
|
"epoch": 0.76, |
|
"grad_norm": 0.7331804633140564, |
|
"learning_rate": 7.288920555753472e-05, |
|
"loss": 0.9968, |
|
"step": 2125 |
|
}, |
|
{ |
|
"epoch": 0.76, |
|
"grad_norm": 0.696283221244812, |
|
"learning_rate": 7.23548272176701e-05, |
|
"loss": 1.0031, |
|
"step": 2130 |
|
}, |
|
{ |
|
"epoch": 0.76, |
|
"grad_norm": 0.9141848683357239, |
|
"learning_rate": 7.182044887780548e-05, |
|
"loss": 1.1274, |
|
"step": 2135 |
|
}, |
|
{ |
|
"epoch": 0.76, |
|
"grad_norm": 0.858689546585083, |
|
"learning_rate": 7.128607053794085e-05, |
|
"loss": 0.9971, |
|
"step": 2140 |
|
}, |
|
{ |
|
"epoch": 0.76, |
|
"eval_loss": 1.044740080833435, |
|
"eval_runtime": 243.7413, |
|
"eval_samples_per_second": 46.069, |
|
"eval_steps_per_second": 5.76, |
|
"step": 2140 |
|
}, |
|
{ |
|
"epoch": 0.76, |
|
"grad_norm": 0.7686411142349243, |
|
"learning_rate": 7.075169219807623e-05, |
|
"loss": 1.035, |
|
"step": 2145 |
|
}, |
|
{ |
|
"epoch": 0.77, |
|
"grad_norm": 0.8768565654754639, |
|
"learning_rate": 7.02173138582116e-05, |
|
"loss": 1.0225, |
|
"step": 2150 |
|
}, |
|
{ |
|
"epoch": 0.77, |
|
"grad_norm": 0.9776067733764648, |
|
"learning_rate": 6.968293551834698e-05, |
|
"loss": 1.0651, |
|
"step": 2155 |
|
}, |
|
{ |
|
"epoch": 0.77, |
|
"grad_norm": 0.8231525421142578, |
|
"learning_rate": 6.914855717848236e-05, |
|
"loss": 1.0761, |
|
"step": 2160 |
|
}, |
|
{ |
|
"epoch": 0.77, |
|
"eval_loss": 1.0438101291656494, |
|
"eval_runtime": 242.7745, |
|
"eval_samples_per_second": 46.253, |
|
"eval_steps_per_second": 5.783, |
|
"step": 2160 |
|
}, |
|
{ |
|
"epoch": 0.77, |
|
"grad_norm": 0.7318928837776184, |
|
"learning_rate": 6.861417883861773e-05, |
|
"loss": 0.9767, |
|
"step": 2165 |
|
}, |
|
{ |
|
"epoch": 0.77, |
|
"grad_norm": 1.4240113496780396, |
|
"learning_rate": 6.807980049875311e-05, |
|
"loss": 1.0337, |
|
"step": 2170 |
|
}, |
|
{ |
|
"epoch": 0.77, |
|
"grad_norm": 0.8493779301643372, |
|
"learning_rate": 6.754542215888849e-05, |
|
"loss": 1.0163, |
|
"step": 2175 |
|
}, |
|
{ |
|
"epoch": 0.78, |
|
"grad_norm": 0.6638810038566589, |
|
"learning_rate": 6.701104381902386e-05, |
|
"loss": 1.1004, |
|
"step": 2180 |
|
}, |
|
{ |
|
"epoch": 0.78, |
|
"eval_loss": 1.0434257984161377, |
|
"eval_runtime": 242.9626, |
|
"eval_samples_per_second": 46.217, |
|
"eval_steps_per_second": 5.779, |
|
"step": 2180 |
|
}, |
|
{ |
|
"epoch": 0.78, |
|
"grad_norm": 1.0353915691375732, |
|
"learning_rate": 6.647666547915924e-05, |
|
"loss": 1.0303, |
|
"step": 2185 |
|
}, |
|
{ |
|
"epoch": 0.78, |
|
"grad_norm": 0.8497571349143982, |
|
"learning_rate": 6.594228713929461e-05, |
|
"loss": 1.0088, |
|
"step": 2190 |
|
}, |
|
{ |
|
"epoch": 0.78, |
|
"grad_norm": 0.8979742527008057, |
|
"learning_rate": 6.540790879942999e-05, |
|
"loss": 1.0702, |
|
"step": 2195 |
|
}, |
|
{ |
|
"epoch": 0.78, |
|
"grad_norm": 0.791603147983551, |
|
"learning_rate": 6.487353045956537e-05, |
|
"loss": 1.0408, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 0.78, |
|
"eval_loss": 1.0422961711883545, |
|
"eval_runtime": 243.1773, |
|
"eval_samples_per_second": 46.176, |
|
"eval_steps_per_second": 5.774, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 0.79, |
|
"grad_norm": 0.6958451271057129, |
|
"learning_rate": 6.433915211970074e-05, |
|
"loss": 1.0654, |
|
"step": 2205 |
|
}, |
|
{ |
|
"epoch": 0.79, |
|
"grad_norm": 0.7097263932228088, |
|
"learning_rate": 6.380477377983612e-05, |
|
"loss": 1.0529, |
|
"step": 2210 |
|
}, |
|
{ |
|
"epoch": 0.79, |
|
"grad_norm": 0.6926958560943604, |
|
"learning_rate": 6.32703954399715e-05, |
|
"loss": 1.0213, |
|
"step": 2215 |
|
}, |
|
{ |
|
"epoch": 0.79, |
|
"grad_norm": 0.8106698989868164, |
|
"learning_rate": 6.273601710010687e-05, |
|
"loss": 1.0025, |
|
"step": 2220 |
|
}, |
|
{ |
|
"epoch": 0.79, |
|
"eval_loss": 1.0433177947998047, |
|
"eval_runtime": 243.1578, |
|
"eval_samples_per_second": 46.18, |
|
"eval_steps_per_second": 5.774, |
|
"step": 2220 |
|
}, |
|
{ |
|
"epoch": 0.79, |
|
"grad_norm": 0.7857884168624878, |
|
"learning_rate": 6.220163876024225e-05, |
|
"loss": 0.9925, |
|
"step": 2225 |
|
}, |
|
{ |
|
"epoch": 0.79, |
|
"grad_norm": 0.9492070078849792, |
|
"learning_rate": 6.166726042037762e-05, |
|
"loss": 1.0084, |
|
"step": 2230 |
|
}, |
|
{ |
|
"epoch": 0.8, |
|
"grad_norm": 0.7410191297531128, |
|
"learning_rate": 6.1132882080513e-05, |
|
"loss": 1.0947, |
|
"step": 2235 |
|
}, |
|
{ |
|
"epoch": 0.8, |
|
"grad_norm": 0.8114225268363953, |
|
"learning_rate": 6.0598503740648375e-05, |
|
"loss": 1.0484, |
|
"step": 2240 |
|
}, |
|
{ |
|
"epoch": 0.8, |
|
"eval_loss": 1.044475793838501, |
|
"eval_runtime": 243.0705, |
|
"eval_samples_per_second": 46.196, |
|
"eval_steps_per_second": 5.776, |
|
"step": 2240 |
|
}, |
|
{ |
|
"epoch": 0.8, |
|
"grad_norm": 0.9174725413322449, |
|
"learning_rate": 6.006412540078375e-05, |
|
"loss": 1.0192, |
|
"step": 2245 |
|
}, |
|
{ |
|
"epoch": 0.8, |
|
"grad_norm": 0.96259605884552, |
|
"learning_rate": 5.952974706091913e-05, |
|
"loss": 1.105, |
|
"step": 2250 |
|
}, |
|
{ |
|
"epoch": 0.8, |
|
"grad_norm": 0.8854503631591797, |
|
"learning_rate": 5.8995368721054496e-05, |
|
"loss": 1.0242, |
|
"step": 2255 |
|
}, |
|
{ |
|
"epoch": 0.81, |
|
"grad_norm": 0.8376172184944153, |
|
"learning_rate": 5.846099038118988e-05, |
|
"loss": 1.0875, |
|
"step": 2260 |
|
}, |
|
{ |
|
"epoch": 0.81, |
|
"eval_loss": 1.040363073348999, |
|
"eval_runtime": 243.0906, |
|
"eval_samples_per_second": 46.193, |
|
"eval_steps_per_second": 5.776, |
|
"step": 2260 |
|
}, |
|
{ |
|
"epoch": 0.81, |
|
"grad_norm": 0.6750116944313049, |
|
"learning_rate": 5.7926612041325255e-05, |
|
"loss": 1.11, |
|
"step": 2265 |
|
}, |
|
{ |
|
"epoch": 0.81, |
|
"grad_norm": 1.060460090637207, |
|
"learning_rate": 5.739223370146063e-05, |
|
"loss": 1.0577, |
|
"step": 2270 |
|
}, |
|
{ |
|
"epoch": 0.81, |
|
"grad_norm": 0.9560518860816956, |
|
"learning_rate": 5.6857855361596e-05, |
|
"loss": 1.0264, |
|
"step": 2275 |
|
}, |
|
{ |
|
"epoch": 0.81, |
|
"grad_norm": 1.3398056030273438, |
|
"learning_rate": 5.632347702173138e-05, |
|
"loss": 1.0567, |
|
"step": 2280 |
|
}, |
|
{ |
|
"epoch": 0.81, |
|
"eval_loss": 1.0413881540298462, |
|
"eval_runtime": 244.0341, |
|
"eval_samples_per_second": 46.014, |
|
"eval_steps_per_second": 5.753, |
|
"step": 2280 |
|
}, |
|
{ |
|
"epoch": 0.81, |
|
"grad_norm": 1.0977717638015747, |
|
"learning_rate": 5.578909868186676e-05, |
|
"loss": 1.0415, |
|
"step": 2285 |
|
}, |
|
{ |
|
"epoch": 0.82, |
|
"grad_norm": 0.8628262281417847, |
|
"learning_rate": 5.525472034200213e-05, |
|
"loss": 1.068, |
|
"step": 2290 |
|
}, |
|
{ |
|
"epoch": 0.82, |
|
"grad_norm": 1.3997876644134521, |
|
"learning_rate": 5.4720342002137505e-05, |
|
"loss": 1.1642, |
|
"step": 2295 |
|
}, |
|
{ |
|
"epoch": 0.82, |
|
"grad_norm": 0.9169967174530029, |
|
"learning_rate": 5.418596366227289e-05, |
|
"loss": 1.0511, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 0.82, |
|
"eval_loss": 1.0395700931549072, |
|
"eval_runtime": 243.4818, |
|
"eval_samples_per_second": 46.118, |
|
"eval_steps_per_second": 5.766, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 0.82, |
|
"grad_norm": 0.7814826369285583, |
|
"learning_rate": 5.3651585322408264e-05, |
|
"loss": 0.9881, |
|
"step": 2305 |
|
}, |
|
{ |
|
"epoch": 0.82, |
|
"grad_norm": 0.747515082359314, |
|
"learning_rate": 5.311720698254363e-05, |
|
"loss": 1.0909, |
|
"step": 2310 |
|
}, |
|
{ |
|
"epoch": 0.82, |
|
"grad_norm": 0.7690112590789795, |
|
"learning_rate": 5.258282864267901e-05, |
|
"loss": 1.0002, |
|
"step": 2315 |
|
}, |
|
{ |
|
"epoch": 0.83, |
|
"grad_norm": 0.9121204614639282, |
|
"learning_rate": 5.204845030281439e-05, |
|
"loss": 1.039, |
|
"step": 2320 |
|
}, |
|
{ |
|
"epoch": 0.83, |
|
"eval_loss": 1.0400930643081665, |
|
"eval_runtime": 242.4036, |
|
"eval_samples_per_second": 46.324, |
|
"eval_steps_per_second": 5.792, |
|
"step": 2320 |
|
}, |
|
{ |
|
"epoch": 0.83, |
|
"grad_norm": 0.6267992854118347, |
|
"learning_rate": 5.151407196294976e-05, |
|
"loss": 1.0034, |
|
"step": 2325 |
|
}, |
|
{ |
|
"epoch": 0.83, |
|
"grad_norm": 0.822167158126831, |
|
"learning_rate": 5.097969362308514e-05, |
|
"loss": 1.0909, |
|
"step": 2330 |
|
}, |
|
{ |
|
"epoch": 0.83, |
|
"grad_norm": 0.8151036500930786, |
|
"learning_rate": 5.044531528322051e-05, |
|
"loss": 1.111, |
|
"step": 2335 |
|
}, |
|
{ |
|
"epoch": 0.83, |
|
"grad_norm": 0.8981462717056274, |
|
"learning_rate": 4.9910936943355896e-05, |
|
"loss": 1.1096, |
|
"step": 2340 |
|
}, |
|
{ |
|
"epoch": 0.83, |
|
"eval_loss": 1.0389798879623413, |
|
"eval_runtime": 242.8318, |
|
"eval_samples_per_second": 46.242, |
|
"eval_steps_per_second": 5.782, |
|
"step": 2340 |
|
}, |
|
{ |
|
"epoch": 0.84, |
|
"grad_norm": 0.8009606003761292, |
|
"learning_rate": 4.9376558603491265e-05, |
|
"loss": 1.0797, |
|
"step": 2345 |
|
}, |
|
{ |
|
"epoch": 0.84, |
|
"grad_norm": 0.7975730299949646, |
|
"learning_rate": 4.884218026362664e-05, |
|
"loss": 1.1148, |
|
"step": 2350 |
|
}, |
|
{ |
|
"epoch": 0.84, |
|
"grad_norm": 0.7241278290748596, |
|
"learning_rate": 4.8307801923762024e-05, |
|
"loss": 1.0344, |
|
"step": 2355 |
|
}, |
|
{ |
|
"epoch": 0.84, |
|
"grad_norm": 1.057424545288086, |
|
"learning_rate": 4.7773423583897394e-05, |
|
"loss": 1.0552, |
|
"step": 2360 |
|
}, |
|
{ |
|
"epoch": 0.84, |
|
"eval_loss": 1.0383328199386597, |
|
"eval_runtime": 242.6213, |
|
"eval_samples_per_second": 46.282, |
|
"eval_steps_per_second": 5.787, |
|
"step": 2360 |
|
}, |
|
{ |
|
"epoch": 0.84, |
|
"grad_norm": 0.554969847202301, |
|
"learning_rate": 4.723904524403277e-05, |
|
"loss": 1.0223, |
|
"step": 2365 |
|
}, |
|
{ |
|
"epoch": 0.84, |
|
"grad_norm": 0.5786603093147278, |
|
"learning_rate": 4.6704666904168146e-05, |
|
"loss": 0.9964, |
|
"step": 2370 |
|
}, |
|
{ |
|
"epoch": 0.85, |
|
"grad_norm": 0.9003456234931946, |
|
"learning_rate": 4.617028856430353e-05, |
|
"loss": 1.1079, |
|
"step": 2375 |
|
}, |
|
{ |
|
"epoch": 0.85, |
|
"grad_norm": 0.770336925983429, |
|
"learning_rate": 4.56359102244389e-05, |
|
"loss": 1.0313, |
|
"step": 2380 |
|
}, |
|
{ |
|
"epoch": 0.85, |
|
"eval_loss": 1.038406252861023, |
|
"eval_runtime": 243.2381, |
|
"eval_samples_per_second": 46.165, |
|
"eval_steps_per_second": 5.772, |
|
"step": 2380 |
|
}, |
|
{ |
|
"epoch": 0.85, |
|
"grad_norm": 0.7360727787017822, |
|
"learning_rate": 4.5101531884574274e-05, |
|
"loss": 1.0323, |
|
"step": 2385 |
|
}, |
|
{ |
|
"epoch": 0.85, |
|
"grad_norm": 0.8868034482002258, |
|
"learning_rate": 4.456715354470965e-05, |
|
"loss": 1.0505, |
|
"step": 2390 |
|
}, |
|
{ |
|
"epoch": 0.85, |
|
"grad_norm": 0.797825276851654, |
|
"learning_rate": 4.403277520484503e-05, |
|
"loss": 1.0687, |
|
"step": 2395 |
|
}, |
|
{ |
|
"epoch": 0.86, |
|
"grad_norm": 0.6810696721076965, |
|
"learning_rate": 4.34983968649804e-05, |
|
"loss": 1.0873, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 0.86, |
|
"eval_loss": 1.039219856262207, |
|
"eval_runtime": 243.163, |
|
"eval_samples_per_second": 46.179, |
|
"eval_steps_per_second": 5.774, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 0.86, |
|
"grad_norm": 0.7306551933288574, |
|
"learning_rate": 4.296401852511578e-05, |
|
"loss": 1.0555, |
|
"step": 2405 |
|
}, |
|
{ |
|
"epoch": 0.86, |
|
"grad_norm": 1.3633157014846802, |
|
"learning_rate": 4.2429640185251154e-05, |
|
"loss": 1.0211, |
|
"step": 2410 |
|
}, |
|
{ |
|
"epoch": 0.86, |
|
"grad_norm": 1.0363646745681763, |
|
"learning_rate": 4.1895261845386524e-05, |
|
"loss": 0.9987, |
|
"step": 2415 |
|
}, |
|
{ |
|
"epoch": 0.86, |
|
"grad_norm": 1.359657645225525, |
|
"learning_rate": 4.1360883505521907e-05, |
|
"loss": 1.0497, |
|
"step": 2420 |
|
}, |
|
{ |
|
"epoch": 0.86, |
|
"eval_loss": 1.0370477437973022, |
|
"eval_runtime": 242.9392, |
|
"eval_samples_per_second": 46.221, |
|
"eval_steps_per_second": 5.779, |
|
"step": 2420 |
|
}, |
|
{ |
|
"epoch": 0.86, |
|
"grad_norm": 1.0343430042266846, |
|
"learning_rate": 4.082650516565728e-05, |
|
"loss": 1.0131, |
|
"step": 2425 |
|
}, |
|
{ |
|
"epoch": 0.87, |
|
"grad_norm": 0.6140369176864624, |
|
"learning_rate": 4.029212682579266e-05, |
|
"loss": 1.0307, |
|
"step": 2430 |
|
}, |
|
{ |
|
"epoch": 0.87, |
|
"grad_norm": 1.0775642395019531, |
|
"learning_rate": 3.9757748485928035e-05, |
|
"loss": 1.1141, |
|
"step": 2435 |
|
}, |
|
{ |
|
"epoch": 0.87, |
|
"grad_norm": 0.9400127530097961, |
|
"learning_rate": 3.922337014606341e-05, |
|
"loss": 1.0086, |
|
"step": 2440 |
|
}, |
|
{ |
|
"epoch": 0.87, |
|
"eval_loss": 1.0370286703109741, |
|
"eval_runtime": 242.1391, |
|
"eval_samples_per_second": 46.374, |
|
"eval_steps_per_second": 5.798, |
|
"step": 2440 |
|
}, |
|
{ |
|
"epoch": 0.87, |
|
"grad_norm": 0.8459872603416443, |
|
"learning_rate": 3.868899180619879e-05, |
|
"loss": 1.0375, |
|
"step": 2445 |
|
}, |
|
{ |
|
"epoch": 0.87, |
|
"grad_norm": 0.6824354529380798, |
|
"learning_rate": 3.8154613466334156e-05, |
|
"loss": 0.9693, |
|
"step": 2450 |
|
}, |
|
{ |
|
"epoch": 0.87, |
|
"grad_norm": 0.6828789710998535, |
|
"learning_rate": 3.762023512646954e-05, |
|
"loss": 1.0693, |
|
"step": 2455 |
|
}, |
|
{ |
|
"epoch": 0.88, |
|
"grad_norm": 1.1990960836410522, |
|
"learning_rate": 3.7085856786604915e-05, |
|
"loss": 1.0551, |
|
"step": 2460 |
|
}, |
|
{ |
|
"epoch": 0.88, |
|
"eval_loss": 1.0373618602752686, |
|
"eval_runtime": 244.1596, |
|
"eval_samples_per_second": 45.99, |
|
"eval_steps_per_second": 5.75, |
|
"step": 2460 |
|
}, |
|
{ |
|
"epoch": 0.88, |
|
"grad_norm": 0.9297399520874023, |
|
"learning_rate": 3.655147844674029e-05, |
|
"loss": 1.0265, |
|
"step": 2465 |
|
}, |
|
{ |
|
"epoch": 0.88, |
|
"grad_norm": 0.5350477695465088, |
|
"learning_rate": 3.601710010687567e-05, |
|
"loss": 1.0933, |
|
"step": 2470 |
|
}, |
|
{ |
|
"epoch": 0.88, |
|
"grad_norm": 0.795394778251648, |
|
"learning_rate": 3.5482721767011043e-05, |
|
"loss": 1.0517, |
|
"step": 2475 |
|
}, |
|
{ |
|
"epoch": 0.88, |
|
"grad_norm": 0.8578258752822876, |
|
"learning_rate": 3.494834342714641e-05, |
|
"loss": 0.9656, |
|
"step": 2480 |
|
}, |
|
{ |
|
"epoch": 0.88, |
|
"eval_loss": 1.0366795063018799, |
|
"eval_runtime": 243.2195, |
|
"eval_samples_per_second": 46.168, |
|
"eval_steps_per_second": 5.773, |
|
"step": 2480 |
|
}, |
|
{ |
|
"epoch": 0.89, |
|
"grad_norm": 0.7419169545173645, |
|
"learning_rate": 3.4413965087281796e-05, |
|
"loss": 1.045, |
|
"step": 2485 |
|
}, |
|
{ |
|
"epoch": 0.89, |
|
"grad_norm": 0.700157880783081, |
|
"learning_rate": 3.3879586747417165e-05, |
|
"loss": 1.005, |
|
"step": 2490 |
|
}, |
|
{ |
|
"epoch": 0.89, |
|
"grad_norm": 1.1453505754470825, |
|
"learning_rate": 3.334520840755255e-05, |
|
"loss": 1.0606, |
|
"step": 2495 |
|
}, |
|
{ |
|
"epoch": 0.89, |
|
"grad_norm": 1.1287420988082886, |
|
"learning_rate": 3.281083006768792e-05, |
|
"loss": 1.0068, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 0.89, |
|
"eval_loss": 1.0359498262405396, |
|
"eval_runtime": 242.9315, |
|
"eval_samples_per_second": 46.223, |
|
"eval_steps_per_second": 5.779, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 0.89, |
|
"grad_norm": 0.7225552797317505, |
|
"learning_rate": 3.22764517278233e-05, |
|
"loss": 1.0658, |
|
"step": 2505 |
|
}, |
|
{ |
|
"epoch": 0.89, |
|
"grad_norm": 0.6859227418899536, |
|
"learning_rate": 3.174207338795867e-05, |
|
"loss": 1.0376, |
|
"step": 2510 |
|
}, |
|
{ |
|
"epoch": 0.9, |
|
"grad_norm": 0.9955888390541077, |
|
"learning_rate": 3.1207695048094045e-05, |
|
"loss": 1.1427, |
|
"step": 2515 |
|
}, |
|
{ |
|
"epoch": 0.9, |
|
"grad_norm": 0.6565488576889038, |
|
"learning_rate": 3.067331670822942e-05, |
|
"loss": 1.0982, |
|
"step": 2520 |
|
}, |
|
{ |
|
"epoch": 0.9, |
|
"eval_loss": 1.0358622074127197, |
|
"eval_runtime": 243.3766, |
|
"eval_samples_per_second": 46.138, |
|
"eval_steps_per_second": 5.769, |
|
"step": 2520 |
|
}, |
|
{ |
|
"epoch": 0.9, |
|
"grad_norm": 0.7512806057929993, |
|
"learning_rate": 3.0138938368364797e-05, |
|
"loss": 0.9968, |
|
"step": 2525 |
|
}, |
|
{ |
|
"epoch": 0.9, |
|
"grad_norm": 0.8107959628105164, |
|
"learning_rate": 2.9604560028500177e-05, |
|
"loss": 1.0999, |
|
"step": 2530 |
|
}, |
|
{ |
|
"epoch": 0.9, |
|
"grad_norm": 0.6686025857925415, |
|
"learning_rate": 2.907018168863555e-05, |
|
"loss": 0.9487, |
|
"step": 2535 |
|
}, |
|
{ |
|
"epoch": 0.9, |
|
"grad_norm": 1.282087802886963, |
|
"learning_rate": 2.853580334877093e-05, |
|
"loss": 1.0322, |
|
"step": 2540 |
|
}, |
|
{ |
|
"epoch": 0.9, |
|
"eval_loss": 1.0352236032485962, |
|
"eval_runtime": 243.0812, |
|
"eval_samples_per_second": 46.194, |
|
"eval_steps_per_second": 5.776, |
|
"step": 2540 |
|
}, |
|
{ |
|
"epoch": 0.91, |
|
"grad_norm": 0.9066522121429443, |
|
"learning_rate": 2.8001425008906302e-05, |
|
"loss": 1.1081, |
|
"step": 2545 |
|
}, |
|
{ |
|
"epoch": 0.91, |
|
"grad_norm": 0.8009347319602966, |
|
"learning_rate": 2.7467046669041678e-05, |
|
"loss": 1.0118, |
|
"step": 2550 |
|
}, |
|
{ |
|
"epoch": 0.91, |
|
"grad_norm": 0.7938979864120483, |
|
"learning_rate": 2.6932668329177054e-05, |
|
"loss": 1.0318, |
|
"step": 2555 |
|
}, |
|
{ |
|
"epoch": 0.91, |
|
"grad_norm": 0.9669837355613708, |
|
"learning_rate": 2.639828998931243e-05, |
|
"loss": 1.0846, |
|
"step": 2560 |
|
}, |
|
{ |
|
"epoch": 0.91, |
|
"eval_loss": 1.034826397895813, |
|
"eval_runtime": 243.4608, |
|
"eval_samples_per_second": 46.122, |
|
"eval_steps_per_second": 5.767, |
|
"step": 2560 |
|
}, |
|
{ |
|
"epoch": 0.91, |
|
"grad_norm": 0.6504008769989014, |
|
"learning_rate": 2.5863911649447806e-05, |
|
"loss": 1.0308, |
|
"step": 2565 |
|
}, |
|
{ |
|
"epoch": 0.92, |
|
"grad_norm": 0.8472669124603271, |
|
"learning_rate": 2.5329533309583182e-05, |
|
"loss": 1.1081, |
|
"step": 2570 |
|
}, |
|
{ |
|
"epoch": 0.92, |
|
"grad_norm": 0.6711983680725098, |
|
"learning_rate": 2.479515496971856e-05, |
|
"loss": 0.9521, |
|
"step": 2575 |
|
}, |
|
{ |
|
"epoch": 0.92, |
|
"grad_norm": 1.2170182466506958, |
|
"learning_rate": 2.4260776629853934e-05, |
|
"loss": 1.0646, |
|
"step": 2580 |
|
}, |
|
{ |
|
"epoch": 0.92, |
|
"eval_loss": 1.034729242324829, |
|
"eval_runtime": 242.7006, |
|
"eval_samples_per_second": 46.267, |
|
"eval_steps_per_second": 5.785, |
|
"step": 2580 |
|
}, |
|
{ |
|
"epoch": 0.92, |
|
"grad_norm": 0.8178135752677917, |
|
"learning_rate": 2.3726398289989314e-05, |
|
"loss": 1.1114, |
|
"step": 2585 |
|
}, |
|
{ |
|
"epoch": 0.92, |
|
"grad_norm": 0.6775932908058167, |
|
"learning_rate": 2.3192019950124686e-05, |
|
"loss": 1.0224, |
|
"step": 2590 |
|
}, |
|
{ |
|
"epoch": 0.92, |
|
"grad_norm": 0.634033203125, |
|
"learning_rate": 2.265764161026006e-05, |
|
"loss": 1.0107, |
|
"step": 2595 |
|
}, |
|
{ |
|
"epoch": 0.93, |
|
"grad_norm": 1.1500298976898193, |
|
"learning_rate": 2.212326327039544e-05, |
|
"loss": 1.0405, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 0.93, |
|
"eval_loss": 1.0347503423690796, |
|
"eval_runtime": 243.9831, |
|
"eval_samples_per_second": 46.024, |
|
"eval_steps_per_second": 5.754, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 0.93, |
|
"grad_norm": 0.7258419990539551, |
|
"learning_rate": 2.158888493053081e-05, |
|
"loss": 1.107, |
|
"step": 2605 |
|
}, |
|
{ |
|
"epoch": 0.93, |
|
"grad_norm": 0.6233177185058594, |
|
"learning_rate": 2.105450659066619e-05, |
|
"loss": 1.0495, |
|
"step": 2610 |
|
}, |
|
{ |
|
"epoch": 0.93, |
|
"grad_norm": 0.6832902431488037, |
|
"learning_rate": 2.0520128250801567e-05, |
|
"loss": 1.0314, |
|
"step": 2615 |
|
}, |
|
{ |
|
"epoch": 0.93, |
|
"grad_norm": 0.795818030834198, |
|
"learning_rate": 1.9985749910936943e-05, |
|
"loss": 1.0524, |
|
"step": 2620 |
|
}, |
|
{ |
|
"epoch": 0.93, |
|
"eval_loss": 1.035040259361267, |
|
"eval_runtime": 243.1568, |
|
"eval_samples_per_second": 46.18, |
|
"eval_steps_per_second": 5.774, |
|
"step": 2620 |
|
}, |
|
{ |
|
"epoch": 0.94, |
|
"grad_norm": 0.9443835020065308, |
|
"learning_rate": 1.945137157107232e-05, |
|
"loss": 1.0814, |
|
"step": 2625 |
|
}, |
|
{ |
|
"epoch": 0.94, |
|
"grad_norm": 0.6237119436264038, |
|
"learning_rate": 1.891699323120769e-05, |
|
"loss": 1.0426, |
|
"step": 2630 |
|
}, |
|
{ |
|
"epoch": 0.94, |
|
"grad_norm": 0.7754202485084534, |
|
"learning_rate": 1.838261489134307e-05, |
|
"loss": 1.0111, |
|
"step": 2635 |
|
}, |
|
{ |
|
"epoch": 0.94, |
|
"grad_norm": 0.9115328788757324, |
|
"learning_rate": 1.7848236551478447e-05, |
|
"loss": 1.0586, |
|
"step": 2640 |
|
}, |
|
{ |
|
"epoch": 0.94, |
|
"eval_loss": 1.034472107887268, |
|
"eval_runtime": 242.6911, |
|
"eval_samples_per_second": 46.269, |
|
"eval_steps_per_second": 5.785, |
|
"step": 2640 |
|
}, |
|
{ |
|
"epoch": 0.94, |
|
"grad_norm": 0.9519437551498413, |
|
"learning_rate": 1.731385821161382e-05, |
|
"loss": 1.0329, |
|
"step": 2645 |
|
}, |
|
{ |
|
"epoch": 0.94, |
|
"grad_norm": 1.142217755317688, |
|
"learning_rate": 1.6779479871749196e-05, |
|
"loss": 1.0159, |
|
"step": 2650 |
|
}, |
|
{ |
|
"epoch": 0.95, |
|
"grad_norm": 0.872619092464447, |
|
"learning_rate": 1.6245101531884572e-05, |
|
"loss": 1.0312, |
|
"step": 2655 |
|
}, |
|
{ |
|
"epoch": 0.95, |
|
"grad_norm": 0.7944605350494385, |
|
"learning_rate": 1.5710723192019948e-05, |
|
"loss": 1.0029, |
|
"step": 2660 |
|
}, |
|
{ |
|
"epoch": 0.95, |
|
"eval_loss": 1.0337694883346558, |
|
"eval_runtime": 242.9274, |
|
"eval_samples_per_second": 46.224, |
|
"eval_steps_per_second": 5.78, |
|
"step": 2660 |
|
}, |
|
{ |
|
"epoch": 0.95, |
|
"grad_norm": 0.7382352352142334, |
|
"learning_rate": 1.5176344852155324e-05, |
|
"loss": 1.05, |
|
"step": 2665 |
|
}, |
|
{ |
|
"epoch": 0.95, |
|
"grad_norm": 0.706157386302948, |
|
"learning_rate": 1.46419665122907e-05, |
|
"loss": 1.0117, |
|
"step": 2670 |
|
}, |
|
{ |
|
"epoch": 0.95, |
|
"grad_norm": 0.6966803669929504, |
|
"learning_rate": 1.4107588172426076e-05, |
|
"loss": 1.0328, |
|
"step": 2675 |
|
}, |
|
{ |
|
"epoch": 0.95, |
|
"grad_norm": 0.8918120265007019, |
|
"learning_rate": 1.3573209832561454e-05, |
|
"loss": 1.1869, |
|
"step": 2680 |
|
}, |
|
{ |
|
"epoch": 0.95, |
|
"eval_loss": 1.034013271331787, |
|
"eval_runtime": 243.2908, |
|
"eval_samples_per_second": 46.155, |
|
"eval_steps_per_second": 5.771, |
|
"step": 2680 |
|
}, |
|
{ |
|
"epoch": 0.96, |
|
"grad_norm": 0.8896795511245728, |
|
"learning_rate": 1.3038831492696827e-05, |
|
"loss": 1.0083, |
|
"step": 2685 |
|
}, |
|
{ |
|
"epoch": 0.96, |
|
"grad_norm": 0.7305176258087158, |
|
"learning_rate": 1.2504453152832203e-05, |
|
"loss": 1.0409, |
|
"step": 2690 |
|
}, |
|
{ |
|
"epoch": 0.96, |
|
"grad_norm": 0.9530869126319885, |
|
"learning_rate": 1.1970074812967579e-05, |
|
"loss": 1.0329, |
|
"step": 2695 |
|
}, |
|
{ |
|
"epoch": 0.96, |
|
"grad_norm": 0.8421928286552429, |
|
"learning_rate": 1.1435696473102957e-05, |
|
"loss": 1.0572, |
|
"step": 2700 |
|
}, |
|
{ |
|
"epoch": 0.96, |
|
"eval_loss": 1.0332649946212769, |
|
"eval_runtime": 243.0973, |
|
"eval_samples_per_second": 46.191, |
|
"eval_steps_per_second": 5.775, |
|
"step": 2700 |
|
} |
|
], |
|
"logging_steps": 5, |
|
"max_steps": 2807, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 20, |
|
"total_flos": 2.5170359635714867e+17, |
|
"train_batch_size": 4, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|