{ "best_metric": null, "best_model_checkpoint": null, "epoch": 13.722126929674099, "global_step": 2000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.07, "learning_rate": 0.00995, "loss": 3.247, "step": 10 }, { "epoch": 0.14, "learning_rate": 0.0099, "loss": 2.397, "step": 20 }, { "epoch": 0.21, "learning_rate": 0.00985, "loss": 2.1303, "step": 30 }, { "epoch": 0.27, "learning_rate": 0.0098, "loss": 1.9873, "step": 40 }, { "epoch": 0.34, "learning_rate": 0.00975, "loss": 1.8902, "step": 50 }, { "epoch": 0.41, "learning_rate": 0.0097, "loss": 1.7747, "step": 60 }, { "epoch": 0.48, "learning_rate": 0.00965, "loss": 1.711, "step": 70 }, { "epoch": 0.55, "learning_rate": 0.0096, "loss": 1.6649, "step": 80 }, { "epoch": 0.62, "learning_rate": 0.00955, "loss": 1.6457, "step": 90 }, { "epoch": 0.69, "learning_rate": 0.0095, "loss": 1.5889, "step": 100 }, { "epoch": 0.75, "learning_rate": 0.00945, "loss": 1.602, "step": 110 }, { "epoch": 0.82, "learning_rate": 0.0094, "loss": 1.5384, "step": 120 }, { "epoch": 0.89, "learning_rate": 0.00935, "loss": 1.5033, "step": 130 }, { "epoch": 0.96, "learning_rate": 0.009300000000000001, "loss": 1.5284, "step": 140 }, { "epoch": 1.03, "learning_rate": 0.009250000000000001, "loss": 1.4809, "step": 150 }, { "epoch": 1.1, "learning_rate": 0.0092, "loss": 1.4795, "step": 160 }, { "epoch": 1.17, "learning_rate": 0.00915, "loss": 1.4555, "step": 170 }, { "epoch": 1.23, "learning_rate": 0.0091, "loss": 1.4335, "step": 180 }, { "epoch": 1.3, "learning_rate": 0.00905, "loss": 1.4275, "step": 190 }, { "epoch": 1.37, "learning_rate": 0.009000000000000001, "loss": 1.4365, "step": 200 }, { "epoch": 1.44, "learning_rate": 0.00895, "loss": 1.4345, "step": 210 }, { "epoch": 1.51, "learning_rate": 0.0089, "loss": 1.4284, "step": 220 }, { "epoch": 1.58, "learning_rate": 0.00885, "loss": 1.4069, "step": 230 }, { "epoch": 1.65, "learning_rate": 0.0088, "loss": 1.4395, "step": 240 }, { "epoch": 1.72, "learning_rate": 0.00875, "loss": 1.3858, "step": 250 }, { "epoch": 1.78, "learning_rate": 0.0087, "loss": 1.3875, "step": 260 }, { "epoch": 1.85, "learning_rate": 0.00865, "loss": 1.3721, "step": 270 }, { "epoch": 1.92, "learning_rate": 0.0086, "loss": 1.374, "step": 280 }, { "epoch": 1.99, "learning_rate": 0.00855, "loss": 1.348, "step": 290 }, { "epoch": 2.06, "learning_rate": 0.0085, "loss": 1.358, "step": 300 }, { "epoch": 2.13, "learning_rate": 0.00845, "loss": 1.3352, "step": 310 }, { "epoch": 2.2, "learning_rate": 0.0084, "loss": 1.3385, "step": 320 }, { "epoch": 2.26, "learning_rate": 0.00835, "loss": 1.3546, "step": 330 }, { "epoch": 2.33, "learning_rate": 0.0083, "loss": 1.3472, "step": 340 }, { "epoch": 2.4, "learning_rate": 0.00825, "loss": 1.3179, "step": 350 }, { "epoch": 2.47, "learning_rate": 0.008199999999999999, "loss": 1.2895, "step": 360 }, { "epoch": 2.54, "learning_rate": 0.00815, "loss": 1.303, "step": 370 }, { "epoch": 2.61, "learning_rate": 0.008100000000000001, "loss": 1.3066, "step": 380 }, { "epoch": 2.68, "learning_rate": 0.00805, "loss": 1.3264, "step": 390 }, { "epoch": 2.74, "learning_rate": 0.008, "loss": 1.3202, "step": 400 }, { "epoch": 2.81, "learning_rate": 0.00795, "loss": 1.2923, "step": 410 }, { "epoch": 2.88, "learning_rate": 0.0079, "loss": 1.2711, "step": 420 }, { "epoch": 2.95, "learning_rate": 0.007850000000000001, "loss": 1.313, "step": 430 }, { "epoch": 3.02, "learning_rate": 0.0078000000000000005, "loss": 1.2948, "step": 440 }, { "epoch": 3.09, "learning_rate": 0.007750000000000001, "loss": 1.2596, "step": 450 }, { "epoch": 3.16, "learning_rate": 0.0077, "loss": 1.2505, "step": 460 }, { "epoch": 3.22, "learning_rate": 0.0076500000000000005, "loss": 1.2581, "step": 470 }, { "epoch": 3.29, "learning_rate": 0.0076, "loss": 1.2829, "step": 480 }, { "epoch": 3.36, "learning_rate": 0.00755, "loss": 1.2487, "step": 490 }, { "epoch": 3.43, "learning_rate": 0.0075, "loss": 1.2927, "step": 500 }, { "epoch": 3.5, "learning_rate": 0.00745, "loss": 1.263, "step": 510 }, { "epoch": 3.57, "learning_rate": 0.0074, "loss": 1.2565, "step": 520 }, { "epoch": 3.64, "learning_rate": 0.00735, "loss": 1.2842, "step": 530 }, { "epoch": 3.7, "learning_rate": 0.0073, "loss": 1.2506, "step": 540 }, { "epoch": 3.77, "learning_rate": 0.0072499999999999995, "loss": 1.2436, "step": 550 }, { "epoch": 3.84, "learning_rate": 0.0072, "loss": 1.2752, "step": 560 }, { "epoch": 3.91, "learning_rate": 0.00715, "loss": 1.2366, "step": 570 }, { "epoch": 3.98, "learning_rate": 0.0070999999999999995, "loss": 1.2681, "step": 580 }, { "epoch": 4.05, "learning_rate": 0.00705, "loss": 1.2076, "step": 590 }, { "epoch": 4.12, "learning_rate": 0.006999999999999999, "loss": 1.2311, "step": 600 }, { "epoch": 4.19, "learning_rate": 0.00695, "loss": 1.2203, "step": 610 }, { "epoch": 4.25, "learning_rate": 0.0069, "loss": 1.2084, "step": 620 }, { "epoch": 4.32, "learning_rate": 0.006850000000000001, "loss": 1.2243, "step": 630 }, { "epoch": 4.39, "learning_rate": 0.0068000000000000005, "loss": 1.2382, "step": 640 }, { "epoch": 4.46, "learning_rate": 0.006750000000000001, "loss": 1.206, "step": 650 }, { "epoch": 4.53, "learning_rate": 0.0067, "loss": 1.2301, "step": 660 }, { "epoch": 4.6, "learning_rate": 0.0066500000000000005, "loss": 1.2079, "step": 670 }, { "epoch": 4.67, "learning_rate": 0.006600000000000001, "loss": 1.2072, "step": 680 }, { "epoch": 4.73, "learning_rate": 0.00655, "loss": 1.169, "step": 690 }, { "epoch": 4.8, "learning_rate": 0.006500000000000001, "loss": 1.2034, "step": 700 }, { "epoch": 4.87, "learning_rate": 0.00645, "loss": 1.2168, "step": 710 }, { "epoch": 4.94, "learning_rate": 0.0064, "loss": 1.2131, "step": 720 }, { "epoch": 5.01, "learning_rate": 0.006350000000000001, "loss": 1.2044, "step": 730 }, { "epoch": 5.08, "learning_rate": 0.0063, "loss": 1.1908, "step": 740 }, { "epoch": 5.15, "learning_rate": 0.00625, "loss": 1.1594, "step": 750 }, { "epoch": 5.21, "learning_rate": 0.0062, "loss": 1.1371, "step": 760 }, { "epoch": 5.28, "learning_rate": 0.00615, "loss": 1.1905, "step": 770 }, { "epoch": 5.35, "learning_rate": 0.0061, "loss": 1.1684, "step": 780 }, { "epoch": 5.42, "learning_rate": 0.00605, "loss": 1.1942, "step": 790 }, { "epoch": 5.49, "learning_rate": 0.006, "loss": 1.1848, "step": 800 }, { "epoch": 5.56, "learning_rate": 0.0059499999999999996, "loss": 1.1786, "step": 810 }, { "epoch": 5.63, "learning_rate": 0.0059, "loss": 1.1686, "step": 820 }, { "epoch": 5.69, "learning_rate": 0.00585, "loss": 1.1852, "step": 830 }, { "epoch": 5.76, "learning_rate": 0.0058, "loss": 1.198, "step": 840 }, { "epoch": 5.83, "learning_rate": 0.00575, "loss": 1.1738, "step": 850 }, { "epoch": 5.9, "learning_rate": 0.005699999999999999, "loss": 1.1808, "step": 860 }, { "epoch": 5.97, "learning_rate": 0.00565, "loss": 1.1655, "step": 870 }, { "epoch": 6.04, "learning_rate": 0.005600000000000001, "loss": 1.1467, "step": 880 }, { "epoch": 6.11, "learning_rate": 0.00555, "loss": 1.1559, "step": 890 }, { "epoch": 6.17, "learning_rate": 0.0055000000000000005, "loss": 1.1497, "step": 900 }, { "epoch": 6.24, "learning_rate": 0.005450000000000001, "loss": 1.1473, "step": 910 }, { "epoch": 6.31, "learning_rate": 0.0054, "loss": 1.184, "step": 920 }, { "epoch": 6.38, "learning_rate": 0.005350000000000001, "loss": 1.1053, "step": 930 }, { "epoch": 6.45, "learning_rate": 0.0053, "loss": 1.1721, "step": 940 }, { "epoch": 6.52, "learning_rate": 0.00525, "loss": 1.1418, "step": 950 }, { "epoch": 6.59, "learning_rate": 0.005200000000000001, "loss": 1.1243, "step": 960 }, { "epoch": 6.66, "learning_rate": 0.00515, "loss": 1.1386, "step": 970 }, { "epoch": 6.72, "learning_rate": 0.0051, "loss": 1.1743, "step": 980 }, { "epoch": 6.79, "learning_rate": 0.00505, "loss": 1.1201, "step": 990 }, { "epoch": 6.86, "learning_rate": 0.005, "loss": 1.1224, "step": 1000 }, { "epoch": 6.93, "learning_rate": 0.00495, "loss": 1.1464, "step": 1010 }, { "epoch": 7.0, "learning_rate": 0.0049, "loss": 1.1547, "step": 1020 }, { "epoch": 7.07, "learning_rate": 0.00485, "loss": 1.145, "step": 1030 }, { "epoch": 7.14, "learning_rate": 0.0048, "loss": 1.1261, "step": 1040 }, { "epoch": 7.2, "learning_rate": 0.00475, "loss": 1.1422, "step": 1050 }, { "epoch": 7.27, "learning_rate": 0.0047, "loss": 1.1115, "step": 1060 }, { "epoch": 7.34, "learning_rate": 0.0046500000000000005, "loss": 1.0921, "step": 1070 }, { "epoch": 7.41, "learning_rate": 0.0046, "loss": 1.1041, "step": 1080 }, { "epoch": 7.48, "learning_rate": 0.00455, "loss": 1.1196, "step": 1090 }, { "epoch": 7.55, "learning_rate": 0.0045000000000000005, "loss": 1.133, "step": 1100 }, { "epoch": 7.62, "learning_rate": 0.00445, "loss": 1.1133, "step": 1110 }, { "epoch": 7.68, "learning_rate": 0.0044, "loss": 1.1034, "step": 1120 }, { "epoch": 7.75, "learning_rate": 0.00435, "loss": 1.118, "step": 1130 }, { "epoch": 7.82, "learning_rate": 0.0043, "loss": 1.128, "step": 1140 }, { "epoch": 7.89, "learning_rate": 0.00425, "loss": 1.1316, "step": 1150 }, { "epoch": 7.96, "learning_rate": 0.0042, "loss": 1.129, "step": 1160 }, { "epoch": 8.03, "learning_rate": 0.00415, "loss": 1.0815, "step": 1170 }, { "epoch": 8.1, "learning_rate": 0.0040999999999999995, "loss": 1.091, "step": 1180 }, { "epoch": 8.16, "learning_rate": 0.004050000000000001, "loss": 1.0751, "step": 1190 }, { "epoch": 8.23, "learning_rate": 0.004, "loss": 1.1042, "step": 1200 }, { "epoch": 8.3, "learning_rate": 0.00395, "loss": 1.0957, "step": 1210 }, { "epoch": 8.37, "learning_rate": 0.0039000000000000003, "loss": 1.1186, "step": 1220 }, { "epoch": 8.44, "learning_rate": 0.00385, "loss": 1.101, "step": 1230 }, { "epoch": 8.51, "learning_rate": 0.0038, "loss": 1.0889, "step": 1240 }, { "epoch": 8.58, "learning_rate": 0.00375, "loss": 1.1111, "step": 1250 }, { "epoch": 8.64, "learning_rate": 0.0037, "loss": 1.1158, "step": 1260 }, { "epoch": 8.71, "learning_rate": 0.00365, "loss": 1.0982, "step": 1270 }, { "epoch": 8.78, "learning_rate": 0.0036, "loss": 1.1002, "step": 1280 }, { "epoch": 8.85, "learning_rate": 0.0035499999999999998, "loss": 1.0892, "step": 1290 }, { "epoch": 8.92, "learning_rate": 0.0034999999999999996, "loss": 1.0698, "step": 1300 }, { "epoch": 8.99, "learning_rate": 0.00345, "loss": 1.0703, "step": 1310 }, { "epoch": 9.06, "learning_rate": 0.0034000000000000002, "loss": 1.0602, "step": 1320 }, { "epoch": 9.13, "learning_rate": 0.00335, "loss": 1.0611, "step": 1330 }, { "epoch": 9.19, "learning_rate": 0.0033000000000000004, "loss": 1.0886, "step": 1340 }, { "epoch": 9.26, "learning_rate": 0.0032500000000000003, "loss": 1.1028, "step": 1350 }, { "epoch": 9.33, "learning_rate": 0.0032, "loss": 1.0727, "step": 1360 }, { "epoch": 9.4, "learning_rate": 0.00315, "loss": 1.074, "step": 1370 }, { "epoch": 9.47, "learning_rate": 0.0031, "loss": 1.0605, "step": 1380 }, { "epoch": 9.54, "learning_rate": 0.00305, "loss": 1.0896, "step": 1390 }, { "epoch": 9.61, "learning_rate": 0.003, "loss": 1.0674, "step": 1400 }, { "epoch": 9.67, "learning_rate": 0.00295, "loss": 1.0894, "step": 1410 }, { "epoch": 9.74, "learning_rate": 0.0029, "loss": 1.0782, "step": 1420 }, { "epoch": 9.81, "learning_rate": 0.0028499999999999997, "loss": 1.0905, "step": 1430 }, { "epoch": 9.88, "learning_rate": 0.0028000000000000004, "loss": 1.0713, "step": 1440 }, { "epoch": 9.95, "learning_rate": 0.0027500000000000003, "loss": 1.0694, "step": 1450 }, { "epoch": 10.02, "learning_rate": 0.0027, "loss": 1.0734, "step": 1460 }, { "epoch": 10.09, "learning_rate": 0.00265, "loss": 1.0836, "step": 1470 }, { "epoch": 10.15, "learning_rate": 0.0026000000000000003, "loss": 1.033, "step": 1480 }, { "epoch": 10.22, "learning_rate": 0.00255, "loss": 1.0646, "step": 1490 }, { "epoch": 10.29, "learning_rate": 0.0025, "loss": 1.0718, "step": 1500 }, { "epoch": 10.36, "learning_rate": 0.00245, "loss": 1.0507, "step": 1510 }, { "epoch": 10.43, "learning_rate": 0.0024, "loss": 1.057, "step": 1520 }, { "epoch": 10.5, "learning_rate": 0.00235, "loss": 1.0603, "step": 1530 }, { "epoch": 10.57, "learning_rate": 0.0023, "loss": 1.0669, "step": 1540 }, { "epoch": 10.63, "learning_rate": 0.0022500000000000003, "loss": 1.0766, "step": 1550 }, { "epoch": 10.7, "learning_rate": 0.0022, "loss": 1.0346, "step": 1560 }, { "epoch": 10.77, "learning_rate": 0.00215, "loss": 1.0648, "step": 1570 }, { "epoch": 10.84, "learning_rate": 0.0021, "loss": 1.053, "step": 1580 }, { "epoch": 10.91, "learning_rate": 0.0020499999999999997, "loss": 1.0558, "step": 1590 }, { "epoch": 10.98, "learning_rate": 0.002, "loss": 1.0997, "step": 1600 }, { "epoch": 11.05, "learning_rate": 0.0019500000000000001, "loss": 1.0345, "step": 1610 }, { "epoch": 11.11, "learning_rate": 0.0019, "loss": 1.0371, "step": 1620 }, { "epoch": 11.18, "learning_rate": 0.00185, "loss": 1.0542, "step": 1630 }, { "epoch": 11.25, "learning_rate": 0.0018, "loss": 1.0702, "step": 1640 }, { "epoch": 11.32, "learning_rate": 0.0017499999999999998, "loss": 1.0396, "step": 1650 }, { "epoch": 11.39, "learning_rate": 0.0017000000000000001, "loss": 1.0159, "step": 1660 }, { "epoch": 11.46, "learning_rate": 0.0016500000000000002, "loss": 1.0364, "step": 1670 }, { "epoch": 11.53, "learning_rate": 0.0016, "loss": 1.054, "step": 1680 }, { "epoch": 11.6, "learning_rate": 0.00155, "loss": 1.0384, "step": 1690 }, { "epoch": 11.66, "learning_rate": 0.0015, "loss": 1.0737, "step": 1700 }, { "epoch": 11.73, "learning_rate": 0.00145, "loss": 1.0502, "step": 1710 }, { "epoch": 11.8, "learning_rate": 0.0014000000000000002, "loss": 1.0762, "step": 1720 }, { "epoch": 11.87, "learning_rate": 0.00135, "loss": 1.0386, "step": 1730 }, { "epoch": 11.94, "learning_rate": 0.0013000000000000002, "loss": 1.0463, "step": 1740 }, { "epoch": 12.01, "learning_rate": 0.00125, "loss": 1.0513, "step": 1750 }, { "epoch": 12.08, "learning_rate": 0.0012, "loss": 1.044, "step": 1760 }, { "epoch": 12.14, "learning_rate": 0.00115, "loss": 1.0252, "step": 1770 }, { "epoch": 12.21, "learning_rate": 0.0011, "loss": 1.0311, "step": 1780 }, { "epoch": 12.28, "learning_rate": 0.00105, "loss": 1.015, "step": 1790 }, { "epoch": 12.35, "learning_rate": 0.001, "loss": 1.0654, "step": 1800 }, { "epoch": 12.42, "learning_rate": 0.00095, "loss": 1.0541, "step": 1810 }, { "epoch": 12.49, "learning_rate": 0.0009, "loss": 1.0525, "step": 1820 }, { "epoch": 12.56, "learning_rate": 0.0008500000000000001, "loss": 1.0251, "step": 1830 }, { "epoch": 12.62, "learning_rate": 0.0008, "loss": 1.0657, "step": 1840 }, { "epoch": 12.69, "learning_rate": 0.00075, "loss": 1.041, "step": 1850 }, { "epoch": 12.76, "learning_rate": 0.0007000000000000001, "loss": 1.025, "step": 1860 }, { "epoch": 12.83, "learning_rate": 0.0006500000000000001, "loss": 1.0255, "step": 1870 }, { "epoch": 12.9, "learning_rate": 0.0006, "loss": 1.0641, "step": 1880 }, { "epoch": 12.97, "learning_rate": 0.00055, "loss": 1.0529, "step": 1890 }, { "epoch": 13.04, "learning_rate": 0.0005, "loss": 1.0367, "step": 1900 }, { "epoch": 13.1, "learning_rate": 0.00045, "loss": 1.0461, "step": 1910 }, { "epoch": 13.17, "learning_rate": 0.0004, "loss": 1.0197, "step": 1920 }, { "epoch": 13.24, "learning_rate": 0.00035000000000000005, "loss": 1.0339, "step": 1930 }, { "epoch": 13.31, "learning_rate": 0.0003, "loss": 1.0274, "step": 1940 }, { "epoch": 13.38, "learning_rate": 0.00025, "loss": 1.0222, "step": 1950 }, { "epoch": 13.45, "learning_rate": 0.0002, "loss": 1.0174, "step": 1960 }, { "epoch": 13.52, "learning_rate": 0.00015, "loss": 1.0295, "step": 1970 }, { "epoch": 13.58, "learning_rate": 0.0001, "loss": 1.0226, "step": 1980 }, { "epoch": 13.65, "learning_rate": 5e-05, "loss": 1.0655, "step": 1990 }, { "epoch": 13.72, "learning_rate": 0.0, "loss": 1.0367, "step": 2000 } ], "max_steps": 2000, "num_train_epochs": 14, "total_flos": 3.447484134404915e+18, "trial_name": null, "trial_params": null }