{ "best_metric": null, "best_model_checkpoint": null, "epoch": 7.246376811594203, "eval_steps": 500, "global_step": 1000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.07, "learning_rate": 0.00019800000000000002, "loss": 3.3296, "step": 10 }, { "epoch": 0.14, "learning_rate": 0.000196, "loss": 2.5227, "step": 20 }, { "epoch": 0.22, "learning_rate": 0.000194, "loss": 2.0348, "step": 30 }, { "epoch": 0.29, "learning_rate": 0.000192, "loss": 1.9388, "step": 40 }, { "epoch": 0.36, "learning_rate": 0.00019, "loss": 1.9197, "step": 50 }, { "epoch": 0.43, "learning_rate": 0.000188, "loss": 1.814, "step": 60 }, { "epoch": 0.51, "learning_rate": 0.00018600000000000002, "loss": 1.5649, "step": 70 }, { "epoch": 0.58, "learning_rate": 0.00018400000000000003, "loss": 1.6691, "step": 80 }, { "epoch": 0.65, "learning_rate": 0.000182, "loss": 1.6885, "step": 90 }, { "epoch": 0.72, "learning_rate": 0.00018, "loss": 1.6748, "step": 100 }, { "epoch": 0.8, "learning_rate": 0.00017800000000000002, "loss": 1.6405, "step": 110 }, { "epoch": 0.87, "learning_rate": 0.00017600000000000002, "loss": 1.5524, "step": 120 }, { "epoch": 0.94, "learning_rate": 0.000174, "loss": 1.374, "step": 130 }, { "epoch": 1.01, "learning_rate": 0.000172, "loss": 1.4287, "step": 140 }, { "epoch": 1.09, "learning_rate": 0.00017, "loss": 1.228, "step": 150 }, { "epoch": 1.16, "learning_rate": 0.000168, "loss": 1.3442, "step": 160 }, { "epoch": 1.23, "learning_rate": 0.000166, "loss": 1.3386, "step": 170 }, { "epoch": 1.3, "learning_rate": 0.000164, "loss": 1.2911, "step": 180 }, { "epoch": 1.38, "learning_rate": 0.000162, "loss": 1.2203, "step": 190 }, { "epoch": 1.45, "learning_rate": 0.00016, "loss": 1.1671, "step": 200 }, { "epoch": 1.52, "learning_rate": 0.00015800000000000002, "loss": 1.4117, "step": 210 }, { "epoch": 1.59, "learning_rate": 0.00015600000000000002, "loss": 1.1522, "step": 220 }, { "epoch": 1.67, "learning_rate": 0.000154, "loss": 1.3722, "step": 230 }, { "epoch": 1.74, "learning_rate": 0.000152, "loss": 1.3268, "step": 240 }, { "epoch": 1.81, "learning_rate": 0.00015000000000000001, "loss": 1.1648, "step": 250 }, { "epoch": 1.88, "learning_rate": 0.000148, "loss": 1.2187, "step": 260 }, { "epoch": 1.96, "learning_rate": 0.000146, "loss": 1.2501, "step": 270 }, { "epoch": 2.03, "learning_rate": 0.000144, "loss": 1.0928, "step": 280 }, { "epoch": 2.1, "learning_rate": 0.000142, "loss": 0.8553, "step": 290 }, { "epoch": 2.17, "learning_rate": 0.00014, "loss": 0.8745, "step": 300 }, { "epoch": 2.25, "learning_rate": 0.000138, "loss": 0.9068, "step": 310 }, { "epoch": 2.32, "learning_rate": 0.00013600000000000003, "loss": 0.9783, "step": 320 }, { "epoch": 2.39, "learning_rate": 0.000134, "loss": 0.9185, "step": 330 }, { "epoch": 2.46, "learning_rate": 0.000132, "loss": 0.929, "step": 340 }, { "epoch": 2.54, "learning_rate": 0.00013000000000000002, "loss": 0.9308, "step": 350 }, { "epoch": 2.61, "learning_rate": 0.00012800000000000002, "loss": 0.929, "step": 360 }, { "epoch": 2.68, "learning_rate": 0.000126, "loss": 1.0682, "step": 370 }, { "epoch": 2.75, "learning_rate": 0.000124, "loss": 0.9615, "step": 380 }, { "epoch": 2.83, "learning_rate": 0.000122, "loss": 0.9984, "step": 390 }, { "epoch": 2.9, "learning_rate": 0.00012, "loss": 0.9162, "step": 400 }, { "epoch": 2.97, "learning_rate": 0.000118, "loss": 0.7737, "step": 410 }, { "epoch": 3.04, "learning_rate": 0.000116, "loss": 0.8563, "step": 420 }, { "epoch": 3.12, "learning_rate": 
0.00011399999999999999, "loss": 0.5843, "step": 430 }, { "epoch": 3.19, "learning_rate": 0.00011200000000000001, "loss": 0.6378, "step": 440 }, { "epoch": 3.26, "learning_rate": 0.00011000000000000002, "loss": 0.5818, "step": 450 }, { "epoch": 3.33, "learning_rate": 0.00010800000000000001, "loss": 0.6841, "step": 460 }, { "epoch": 3.41, "learning_rate": 0.00010600000000000002, "loss": 0.6338, "step": 470 }, { "epoch": 3.48, "learning_rate": 0.00010400000000000001, "loss": 0.6083, "step": 480 }, { "epoch": 3.55, "learning_rate": 0.00010200000000000001, "loss": 0.6408, "step": 490 }, { "epoch": 3.62, "learning_rate": 0.0001, "loss": 0.677, "step": 500 }, { "epoch": 3.7, "learning_rate": 9.8e-05, "loss": 0.6878, "step": 510 }, { "epoch": 3.77, "learning_rate": 9.6e-05, "loss": 0.5942, "step": 520 }, { "epoch": 3.84, "learning_rate": 9.4e-05, "loss": 0.5855, "step": 530 }, { "epoch": 3.91, "learning_rate": 9.200000000000001e-05, "loss": 0.6314, "step": 540 }, { "epoch": 3.99, "learning_rate": 9e-05, "loss": 0.6157, "step": 550 }, { "epoch": 4.06, "learning_rate": 8.800000000000001e-05, "loss": 0.399, "step": 560 }, { "epoch": 4.13, "learning_rate": 8.6e-05, "loss": 0.3511, "step": 570 }, { "epoch": 4.2, "learning_rate": 8.4e-05, "loss": 0.4271, "step": 580 }, { "epoch": 4.28, "learning_rate": 8.2e-05, "loss": 0.4539, "step": 590 }, { "epoch": 4.35, "learning_rate": 8e-05, "loss": 0.3656, "step": 600 }, { "epoch": 4.42, "learning_rate": 7.800000000000001e-05, "loss": 0.4041, "step": 610 }, { "epoch": 4.49, "learning_rate": 7.6e-05, "loss": 0.4036, "step": 620 }, { "epoch": 4.57, "learning_rate": 7.4e-05, "loss": 0.3964, "step": 630 }, { "epoch": 4.64, "learning_rate": 7.2e-05, "loss": 0.4016, "step": 640 }, { "epoch": 4.71, "learning_rate": 7e-05, "loss": 0.3499, "step": 650 }, { "epoch": 4.78, "learning_rate": 6.800000000000001e-05, "loss": 0.4193, "step": 660 }, { "epoch": 4.86, "learning_rate": 6.6e-05, "loss": 0.3946, "step": 670 }, { "epoch": 4.93, "learning_rate": 6.400000000000001e-05, "loss": 0.4058, "step": 680 }, { "epoch": 5.0, "learning_rate": 6.2e-05, "loss": 0.4132, "step": 690 }, { "epoch": 5.07, "learning_rate": 6e-05, "loss": 0.2782, "step": 700 }, { "epoch": 5.14, "learning_rate": 5.8e-05, "loss": 0.2496, "step": 710 }, { "epoch": 5.22, "learning_rate": 5.6000000000000006e-05, "loss": 0.259, "step": 720 }, { "epoch": 5.29, "learning_rate": 5.4000000000000005e-05, "loss": 0.2572, "step": 730 }, { "epoch": 5.36, "learning_rate": 5.2000000000000004e-05, "loss": 0.2885, "step": 740 }, { "epoch": 5.43, "learning_rate": 5e-05, "loss": 0.2383, "step": 750 }, { "epoch": 5.51, "learning_rate": 4.8e-05, "loss": 0.2976, "step": 760 }, { "epoch": 5.58, "learning_rate": 4.600000000000001e-05, "loss": 0.287, "step": 770 }, { "epoch": 5.65, "learning_rate": 4.4000000000000006e-05, "loss": 0.2737, "step": 780 }, { "epoch": 5.72, "learning_rate": 4.2e-05, "loss": 0.2412, "step": 790 }, { "epoch": 5.8, "learning_rate": 4e-05, "loss": 0.2736, "step": 800 }, { "epoch": 5.87, "learning_rate": 3.8e-05, "loss": 0.278, "step": 810 }, { "epoch": 5.94, "learning_rate": 3.6e-05, "loss": 0.2871, "step": 820 }, { "epoch": 6.01, "learning_rate": 3.4000000000000007e-05, "loss": 0.243, "step": 830 }, { "epoch": 6.09, "learning_rate": 3.2000000000000005e-05, "loss": 0.1926, "step": 840 }, { "epoch": 6.16, "learning_rate": 3e-05, "loss": 0.2136, "step": 850 }, { "epoch": 6.23, "learning_rate": 2.8000000000000003e-05, "loss": 0.2247, "step": 860 }, { "epoch": 6.3, "learning_rate": 2.6000000000000002e-05, 
"loss": 0.2212, "step": 870 }, { "epoch": 6.38, "learning_rate": 2.4e-05, "loss": 0.201, "step": 880 }, { "epoch": 6.45, "learning_rate": 2.2000000000000003e-05, "loss": 0.2074, "step": 890 }, { "epoch": 6.52, "learning_rate": 2e-05, "loss": 0.2087, "step": 900 }, { "epoch": 6.59, "learning_rate": 1.8e-05, "loss": 0.1979, "step": 910 }, { "epoch": 6.67, "learning_rate": 1.6000000000000003e-05, "loss": 0.2152, "step": 920 }, { "epoch": 6.74, "learning_rate": 1.4000000000000001e-05, "loss": 0.2207, "step": 930 }, { "epoch": 6.81, "learning_rate": 1.2e-05, "loss": 0.2088, "step": 940 }, { "epoch": 6.88, "learning_rate": 1e-05, "loss": 0.2059, "step": 950 }, { "epoch": 6.96, "learning_rate": 8.000000000000001e-06, "loss": 0.2092, "step": 960 }, { "epoch": 7.03, "learning_rate": 6e-06, "loss": 0.1882, "step": 970 }, { "epoch": 7.1, "learning_rate": 4.000000000000001e-06, "loss": 0.1814, "step": 980 }, { "epoch": 7.17, "learning_rate": 2.0000000000000003e-06, "loss": 0.16, "step": 990 }, { "epoch": 7.25, "learning_rate": 0.0, "loss": 0.1653, "step": 1000 } ], "logging_steps": 10, "max_steps": 1000, "num_train_epochs": 8, "save_steps": 500, "total_flos": 9570470597959680.0, "trial_name": null, "trial_params": null }