{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 793, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "learning_rate": 2.5e-06, "loss": 2.731, "step": 1 }, { "epoch": 0.01, "learning_rate": 1.25e-05, "loss": 2.4673, "step": 5 }, { "epoch": 0.01, "learning_rate": 2.5e-05, "loss": 2.6706, "step": 10 }, { "epoch": 0.02, "learning_rate": 3.7500000000000003e-05, "loss": 2.6655, "step": 15 }, { "epoch": 0.03, "learning_rate": 5e-05, "loss": 2.6413, "step": 20 }, { "epoch": 0.03, "learning_rate": 6.25e-05, "loss": 2.5822, "step": 25 }, { "epoch": 0.04, "learning_rate": 7.500000000000001e-05, "loss": 2.5791, "step": 30 }, { "epoch": 0.04, "learning_rate": 8.75e-05, "loss": 2.4541, "step": 35 }, { "epoch": 0.05, "learning_rate": 0.0001, "loss": 2.2348, "step": 40 }, { "epoch": 0.06, "learning_rate": 0.00011250000000000001, "loss": 2.4262, "step": 45 }, { "epoch": 0.06, "learning_rate": 0.000125, "loss": 2.5609, "step": 50 }, { "epoch": 0.07, "learning_rate": 0.0001375, "loss": 2.3085, "step": 55 }, { "epoch": 0.08, "learning_rate": 0.00015000000000000001, "loss": 2.4856, "step": 60 }, { "epoch": 0.08, "learning_rate": 0.00016250000000000002, "loss": 2.5578, "step": 65 }, { "epoch": 0.09, "learning_rate": 0.000175, "loss": 2.5337, "step": 70 }, { "epoch": 0.09, "learning_rate": 0.0001875, "loss": 2.6725, "step": 75 }, { "epoch": 0.1, "learning_rate": 0.0002, "loss": 2.0052, "step": 80 }, { "epoch": 0.11, "learning_rate": 0.0001999757331652354, "loss": 2.8976, "step": 85 }, { "epoch": 0.11, "learning_rate": 0.00019990294443852685, "loss": 2.7538, "step": 90 }, { "epoch": 0.12, "learning_rate": 0.00019978166914691453, "loss": 2.11, "step": 95 }, { "epoch": 0.13, "learning_rate": 0.00019961196614974767, "loss": 2.4711, "step": 100 }, { "epoch": 0.13, "learning_rate": 0.00019939391781011807, "loss": 2.1637, "step": 105 }, { "epoch": 0.14, "learning_rate": 0.00019912762995488633, "loss": 2.6475, "step": 110 }, { "epoch": 0.15, "learning_rate": 0.00019881323182332006, "loss": 2.3961, "step": 115 }, { "epoch": 0.15, "learning_rate": 0.00019845087600436947, "loss": 2.6875, "step": 120 }, { "epoch": 0.16, "learning_rate": 0.00019804073836261025, "loss": 2.6841, "step": 125 }, { "epoch": 0.16, "learning_rate": 0.0001975830179528901, "loss": 2.6747, "step": 130 }, { "epoch": 0.17, "learning_rate": 0.00019707793692372, "loss": 2.6586, "step": 135 }, { "epoch": 0.18, "learning_rate": 0.00019652574040945745, "loss": 2.4275, "step": 140 }, { "epoch": 0.18, "learning_rate": 0.00019592669641133395, "loss": 2.4158, "step": 145 }, { "epoch": 0.19, "learning_rate": 0.00019528109566738382, "loss": 2.2207, "step": 150 }, { "epoch": 0.2, "learning_rate": 0.0001945892515113386, "loss": 2.3564, "step": 155 }, { "epoch": 0.2, "learning_rate": 0.00019385149972055466, "loss": 2.9308, "step": 160 }, { "epoch": 0.21, "learning_rate": 0.0001930681983530481, "loss": 2.7843, "step": 165 }, { "epoch": 0.21, "learning_rate": 0.000192239727573716, "loss": 2.5293, "step": 170 }, { "epoch": 0.22, "learning_rate": 0.0001913664894698286, "loss": 2.374, "step": 175 }, { "epoch": 0.23, "learning_rate": 0.0001904489078558814, "loss": 2.6003, "step": 180 }, { "epoch": 0.23, "learning_rate": 0.0001894874280679026, "loss": 2.3425, "step": 185 }, { "epoch": 0.24, "learning_rate": 0.00018848251674731507, "loss": 2.5159, "step": 190 }, { "epoch": 0.25, "learning_rate": 0.00018743466161445823, "loss": 2.6329, "step": 195 }, { "epoch": 0.25, "learning_rate": 0.00018634437123187937, "loss": 2.3351, "step": 200 }, { "epoch": 0.26, "learning_rate": 0.00018521217475750973, "loss": 2.5704, "step": 205 }, { "epoch": 0.26, "learning_rate": 0.00018403862168784457, "loss": 2.4092, "step": 210 }, { "epoch": 0.27, "learning_rate": 0.00018282428159125248, "loss": 2.4303, "step": 215 }, { "epoch": 0.28, "learning_rate": 0.0001815697438315429, "loss": 2.4577, "step": 220 }, { "epoch": 0.28, "learning_rate": 0.00018027561728192626, "loss": 2.46, "step": 225 }, { "epoch": 0.29, "learning_rate": 0.00017894253002950542, "loss": 2.5202, "step": 230 }, { "epoch": 0.3, "learning_rate": 0.000177571129070442, "loss": 2.6827, "step": 235 }, { "epoch": 0.3, "learning_rate": 0.0001761620799959454, "loss": 2.989, "step": 240 }, { "epoch": 0.31, "learning_rate": 0.000174716066669237, "loss": 2.7209, "step": 245 }, { "epoch": 0.32, "learning_rate": 0.00017323379089364602, "loss": 2.6086, "step": 250 }, { "epoch": 0.32, "learning_rate": 0.000171715972071999, "loss": 2.6182, "step": 255 }, { "epoch": 0.33, "learning_rate": 0.00017016334685746685, "loss": 2.5666, "step": 260 }, { "epoch": 0.33, "learning_rate": 0.0001685766687960402, "loss": 2.4663, "step": 265 }, { "epoch": 0.34, "learning_rate": 0.00016695670796080592, "loss": 2.3282, "step": 270 }, { "epoch": 0.35, "learning_rate": 0.00016530425057820227, "loss": 2.4567, "step": 275 }, { "epoch": 0.35, "learning_rate": 0.00016362009864643443, "loss": 2.2274, "step": 280 }, { "epoch": 0.36, "learning_rate": 0.0001619050695462353, "loss": 2.2665, "step": 285 }, { "epoch": 0.37, "learning_rate": 0.00016015999564416078, "loss": 2.046, "step": 290 }, { "epoch": 0.37, "learning_rate": 0.00015838572388861147, "loss": 2.2182, "step": 295 }, { "epoch": 0.38, "learning_rate": 0.0001565831153987778, "loss": 2.4857, "step": 300 }, { "epoch": 0.38, "learning_rate": 0.0001547530450467071, "loss": 2.3942, "step": 305 }, { "epoch": 0.39, "learning_rate": 0.00015289640103269625, "loss": 2.635, "step": 310 }, { "epoch": 0.4, "learning_rate": 0.0001510140844542153, "loss": 2.4103, "step": 315 }, { "epoch": 0.4, "learning_rate": 0.00014910700886857197, "loss": 2.5356, "step": 320 }, { "epoch": 0.41, "learning_rate": 0.00014717609984952866, "loss": 2.6382, "step": 325 }, { "epoch": 0.42, "learning_rate": 0.00014522229453808758, "loss": 2.1514, "step": 330 }, { "epoch": 0.42, "learning_rate": 0.0001432465411876618, "loss": 2.5063, "step": 335 }, { "epoch": 0.43, "learning_rate": 0.0001412497987038532, "loss": 2.4789, "step": 340 }, { "epoch": 0.44, "learning_rate": 0.00013923303617906015, "loss": 2.3494, "step": 345 }, { "epoch": 0.44, "learning_rate": 0.00013719723242214168, "loss": 2.443, "step": 350 }, { "epoch": 0.45, "learning_rate": 0.0001351433754833654, "loss": 2.382, "step": 355 }, { "epoch": 0.45, "learning_rate": 0.0001330724621748706, "loss": 2.5961, "step": 360 }, { "epoch": 0.46, "learning_rate": 0.00013098549758687865, "loss": 2.8336, "step": 365 }, { "epoch": 0.47, "learning_rate": 0.0001288834945998859, "loss": 2.5518, "step": 370 }, { "epoch": 0.47, "learning_rate": 0.00012676747339307554, "loss": 2.1127, "step": 375 }, { "epoch": 0.48, "learning_rate": 0.00012463846094918723, "loss": 2.6908, "step": 380 }, { "epoch": 0.49, "learning_rate": 0.00012249749055608474, "loss": 2.6419, "step": 385 }, { "epoch": 0.49, "learning_rate": 0.0001203456013052634, "loss": 2.4063, "step": 390 }, { "epoch": 0.5, "learning_rate": 0.00011818383758754082, "loss": 2.2766, "step": 395 }, { "epoch": 0.5, "learning_rate": 0.00011601324858617572, "loss": 2.5315, "step": 400 }, { "epoch": 0.51, "learning_rate": 0.00011383488776766093, "loss": 2.8151, "step": 405 }, { "epoch": 0.52, "learning_rate": 0.00011164981237043728, "loss": 2.6057, "step": 410 }, { "epoch": 0.52, "learning_rate": 0.00010945908289177694, "loss": 2.4863, "step": 415 }, { "epoch": 0.53, "learning_rate": 0.00010726376257308544, "loss": 2.2287, "step": 420 }, { "epoch": 0.54, "learning_rate": 0.00010506491688387127, "loss": 2.3064, "step": 425 }, { "epoch": 0.54, "learning_rate": 0.00010286361300463481, "loss": 2.4449, "step": 430 }, { "epoch": 0.55, "learning_rate": 0.00010066091930892605, "loss": 2.7777, "step": 435 }, { "epoch": 0.55, "learning_rate": 9.845790484482404e-05, "loss": 2.5837, "step": 440 }, { "epoch": 0.56, "learning_rate": 9.625563881608847e-05, "loss": 2.4259, "step": 445 }, { "epoch": 0.57, "learning_rate": 9.405519006323588e-05, "loss": 2.7834, "step": 450 }, { "epoch": 0.57, "learning_rate": 9.185762654479219e-05, "loss": 2.6675, "step": 455 }, { "epoch": 0.58, "learning_rate": 8.966401481897304e-05, "loss": 2.5957, "step": 460 }, { "epoch": 0.59, "learning_rate": 8.747541952604423e-05, "loss": 2.3405, "step": 465 }, { "epoch": 0.59, "learning_rate": 8.52929028716126e-05, "loss": 2.4473, "step": 470 }, { "epoch": 0.6, "learning_rate": 8.311752411109863e-05, "loss": 2.2817, "step": 475 }, { "epoch": 0.61, "learning_rate": 8.095033903564091e-05, "loss": 2.7025, "step": 480 }, { "epoch": 0.61, "learning_rate": 7.879239945968209e-05, "loss": 2.3289, "step": 485 }, { "epoch": 0.62, "learning_rate": 7.664475271048458e-05, "loss": 2.6471, "step": 490 }, { "epoch": 0.62, "learning_rate": 7.450844111982436e-05, "loss": 2.5394, "step": 495 }, { "epoch": 0.63, "learning_rate": 7.238450151810885e-05, "loss": 2.422, "step": 500 }, { "epoch": 0.64, "learning_rate": 7.02739647311654e-05, "loss": 2.4525, "step": 505 }, { "epoch": 0.64, "learning_rate": 6.817785507994349e-05, "loss": 2.5048, "step": 510 }, { "epoch": 0.65, "learning_rate": 6.609718988337423e-05, "loss": 2.5573, "step": 515 }, { "epoch": 0.66, "learning_rate": 6.403297896462807e-05, "loss": 2.6636, "step": 520 }, { "epoch": 0.66, "learning_rate": 6.198622416101077e-05, "loss": 2.4896, "step": 525 }, { "epoch": 0.67, "learning_rate": 5.995791883773474e-05, "loss": 2.5308, "step": 530 }, { "epoch": 0.67, "learning_rate": 5.794904740580269e-05, "loss": 2.4272, "step": 535 }, { "epoch": 0.68, "learning_rate": 5.596058484423656e-05, "loss": 2.4119, "step": 540 }, { "epoch": 0.69, "learning_rate": 5.399349622688479e-05, "loss": 2.7247, "step": 545 }, { "epoch": 0.69, "learning_rate": 5.20487362540362e-05, "loss": 2.4899, "step": 550 }, { "epoch": 0.7, "learning_rate": 5.012724878906926e-05, "loss": 2.4428, "step": 555 }, { "epoch": 0.71, "learning_rate": 4.82299664003601e-05, "loss": 2.613, "step": 560 }, { "epoch": 0.71, "learning_rate": 4.6357809908673435e-05, "loss": 2.5947, "step": 565 }, { "epoch": 0.72, "learning_rate": 4.451168794025391e-05, "loss": 2.3137, "step": 570 }, { "epoch": 0.73, "learning_rate": 4.269249648583692e-05, "loss": 2.3568, "step": 575 }, { "epoch": 0.73, "learning_rate": 4.0901118465790875e-05, "loss": 1.9487, "step": 580 }, { "epoch": 0.74, "learning_rate": 3.9138423301604154e-05, "loss": 2.3949, "step": 585 }, { "epoch": 0.74, "learning_rate": 3.740526649392256e-05, "loss": 2.6208, "step": 590 }, { "epoch": 0.75, "learning_rate": 3.570248920734354e-05, "loss": 2.3749, "step": 595 }, { "epoch": 0.76, "learning_rate": 3.4030917862168165e-05, "loss": 2.5309, "step": 600 }, { "epoch": 0.76, "learning_rate": 3.239136373330903e-05, "loss": 2.6263, "step": 605 }, { "epoch": 0.77, "learning_rate": 3.0784622556548856e-05, "loss": 2.5203, "step": 610 }, { "epoch": 0.78, "learning_rate": 2.921147414234051e-05, "loss": 2.3685, "step": 615 }, { "epoch": 0.78, "learning_rate": 2.767268199733657e-05, "loss": 2.7285, "step": 620 }, { "epoch": 0.79, "learning_rate": 2.6168992953831396e-05, "loss": 2.3638, "step": 625 }, { "epoch": 0.79, "learning_rate": 2.4701136807296177e-05, "loss": 2.1867, "step": 630 }, { "epoch": 0.8, "learning_rate": 2.326982596218219e-05, "loss": 2.6267, "step": 635 }, { "epoch": 0.81, "learning_rate": 2.187575508616493e-05, "loss": 2.6749, "step": 640 }, { "epoch": 0.81, "learning_rate": 2.0519600772996405e-05, "loss": 2.4208, "step": 645 }, { "epoch": 0.82, "learning_rate": 1.920202121412924e-05, "loss": 2.6287, "step": 650 }, { "epoch": 0.83, "learning_rate": 1.7923655879272393e-05, "loss": 2.6315, "step": 655 }, { "epoch": 0.83, "learning_rate": 1.668512520603276e-05, "loss": 2.5708, "step": 660 }, { "epoch": 0.84, "learning_rate": 1.548703029879437e-05, "loss": 2.3316, "step": 665 }, { "epoch": 0.84, "learning_rate": 1.4329952636980115e-05, "loss": 2.1933, "step": 670 }, { "epoch": 0.85, "learning_rate": 1.321445379283861e-05, "loss": 2.1816, "step": 675 }, { "epoch": 0.86, "learning_rate": 1.214107515889248e-05, "loss": 2.474, "step": 680 }, { "epoch": 0.86, "learning_rate": 1.111033768518066e-05, "loss": 2.5638, "step": 685 }, { "epoch": 0.87, "learning_rate": 1.0122741626422438e-05, "loss": 2.5808, "step": 690 }, { "epoch": 0.88, "learning_rate": 9.178766299225205e-06, "loss": 2.1113, "step": 695 }, { "epoch": 0.88, "learning_rate": 8.278869849454718e-06, "loss": 2.6343, "step": 700 }, { "epoch": 0.89, "learning_rate": 7.423489029879982e-06, "loss": 2.3117, "step": 705 }, { "epoch": 0.9, "learning_rate": 6.613038988201237e-06, "loss": 2.7684, "step": 710 }, { "epoch": 0.9, "learning_rate": 5.847913065563382e-06, "loss": 2.5271, "step": 715 }, { "epoch": 0.91, "learning_rate": 5.128482605653173e-06, "loss": 2.2342, "step": 720 }, { "epoch": 0.91, "learning_rate": 4.455096774472567e-06, "loss": 2.2851, "step": 725 }, { "epoch": 0.92, "learning_rate": 3.828082390875487e-06, "loss": 2.4365, "step": 730 }, { "epoch": 0.93, "learning_rate": 3.247743767950795e-06, "loss": 2.6543, "step": 735 }, { "epoch": 0.93, "learning_rate": 2.714362565327866e-06, "loss": 2.6314, "step": 740 }, { "epoch": 0.94, "learning_rate": 2.2281976524769266e-06, "loss": 2.1431, "step": 745 }, { "epoch": 0.95, "learning_rate": 1.7894849830701443e-06, "loss": 2.4283, "step": 750 }, { "epoch": 0.95, "learning_rate": 1.398437480464676e-06, "loss": 2.5455, "step": 755 }, { "epoch": 0.96, "learning_rate": 1.055244934363131e-06, "loss": 2.4203, "step": 760 }, { "epoch": 0.96, "learning_rate": 7.600739087016862e-07, "loss": 2.5425, "step": 765 }, { "epoch": 0.97, "learning_rate": 5.130676608104845e-07, "loss": 2.623, "step": 770 }, { "epoch": 0.98, "learning_rate": 3.143460718855962e-07, "loss": 2.5706, "step": 775 }, { "epoch": 0.98, "learning_rate": 1.6400558880627082e-07, "loss": 2.4692, "step": 780 }, { "epoch": 0.99, "learning_rate": 6.21191773257368e-08, "loss": 2.684, "step": 785 }, { "epoch": 1.0, "learning_rate": 8.736286658228565e-09, "loss": 2.3449, "step": 790 }, { "epoch": 1.0, "eval_loss": 2.400331974029541, "eval_runtime": 240.8241, "eval_samples_per_second": 3.293, "eval_steps_per_second": 3.293, "step": 793 }, { "epoch": 1.0, "step": 793, "total_flos": 5.686041454942618e+16, "train_loss": 0.28998795253181214, "train_runtime": 345.0365, "train_samples_per_second": 2.298, "train_steps_per_second": 2.298 } ], "logging_steps": 5, "max_steps": 793, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "total_flos": 5.686041454942618e+16, "train_batch_size": 1, "trial_name": null, "trial_params": null }