{ "best_metric": 0.8524497747421265, "best_model_checkpoint": "/content/results/checkpoint-2080", "epoch": 0.5196662693682956, "eval_steps": 20, "global_step": 2180, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "learning_rate": 5.9999999999999995e-05, "loss": 1.4915, "step": 20 }, { "epoch": 0.0, "eval_jit_compilation_time": 2.8718, "eval_loss": 1.510138750076294, "eval_runtime": 16.0972, "eval_samples_per_second": 0.311, "eval_steps_per_second": 0.311, "step": 20 }, { "epoch": 0.01, "learning_rate": 0.000117, "loss": 1.2636, "step": 40 }, { "epoch": 0.01, "eval_jit_compilation_time": 2.7525, "eval_loss": 1.3001515865325928, "eval_runtime": 16.0999, "eval_samples_per_second": 0.311, "eval_steps_per_second": 0.311, "step": 40 }, { "epoch": 0.01, "learning_rate": 0.00017699999999999997, "loss": 0.8644, "step": 60 }, { "epoch": 0.01, "eval_jit_compilation_time": 2.7931, "eval_loss": 1.0469141006469727, "eval_runtime": 16.0937, "eval_samples_per_second": 0.311, "eval_steps_per_second": 0.311, "step": 60 }, { "epoch": 0.02, "learning_rate": 0.000237, "loss": 0.5912, "step": 80 }, { "epoch": 0.02, "eval_jit_compilation_time": 3.1505, "eval_loss": 1.0087261199951172, "eval_runtime": 16.0933, "eval_samples_per_second": 0.311, "eval_steps_per_second": 0.311, "step": 80 }, { "epoch": 0.02, "learning_rate": 0.00029699999999999996, "loss": 0.6031, "step": 100 }, { "epoch": 0.02, "eval_jit_compilation_time": 2.8564, "eval_loss": 0.981471061706543, "eval_runtime": 16.0909, "eval_samples_per_second": 0.311, "eval_steps_per_second": 0.311, "step": 100 }, { "epoch": 0.03, "learning_rate": 0.0002986080586080586, "loss": 0.5061, "step": 120 }, { "epoch": 0.03, "eval_jit_compilation_time": 2.7663, "eval_loss": 0.9665440320968628, "eval_runtime": 16.0983, "eval_samples_per_second": 0.311, "eval_steps_per_second": 0.311, "step": 120 }, { "epoch": 0.03, "learning_rate": 0.00029714285714285715, "loss": 0.4463, "step": 140 }, { "epoch": 0.03, "eval_jit_compilation_time": 2.7684, "eval_loss": 0.9606207609176636, "eval_runtime": 16.1051, "eval_samples_per_second": 0.31, "eval_steps_per_second": 0.31, "step": 140 }, { "epoch": 0.04, "learning_rate": 0.00029567765567765567, "loss": 0.5639, "step": 160 }, { "epoch": 0.04, "eval_jit_compilation_time": 3.1193, "eval_loss": 0.957785427570343, "eval_runtime": 16.0918, "eval_samples_per_second": 0.311, "eval_steps_per_second": 0.311, "step": 160 }, { "epoch": 0.04, "learning_rate": 0.0002942124542124542, "loss": 0.3525, "step": 180 }, { "epoch": 0.04, "eval_jit_compilation_time": 2.7423, "eval_loss": 0.9552174806594849, "eval_runtime": 16.0894, "eval_samples_per_second": 0.311, "eval_steps_per_second": 0.311, "step": 180 }, { "epoch": 0.05, "learning_rate": 0.0002927472527472527, "loss": 0.5691, "step": 200 }, { "epoch": 0.05, "eval_jit_compilation_time": 2.7739, "eval_loss": 0.9536301493644714, "eval_runtime": 16.0969, "eval_samples_per_second": 0.311, "eval_steps_per_second": 0.311, "step": 200 }, { "epoch": 0.05, "learning_rate": 0.00029128205128205127, "loss": 0.3336, "step": 220 }, { "epoch": 0.05, "eval_jit_compilation_time": 2.7559, "eval_loss": 0.9527376294136047, "eval_runtime": 16.1008, "eval_samples_per_second": 0.311, "eval_steps_per_second": 0.311, "step": 220 }, { "epoch": 0.06, "learning_rate": 0.0002898168498168498, "loss": 0.47, "step": 240 }, { "epoch": 0.06, "eval_jit_compilation_time": 3.1796, "eval_loss": 0.9467018246650696, "eval_runtime": 16.1033, "eval_samples_per_second": 0.31, "eval_steps_per_second": 0.31, "step": 240 }, { "epoch": 0.06, "learning_rate": 0.00028835164835164836, "loss": 0.6132, "step": 260 }, { "epoch": 0.06, "eval_jit_compilation_time": 2.7884, "eval_loss": 0.9371232986450195, "eval_runtime": 16.098, "eval_samples_per_second": 0.311, "eval_steps_per_second": 0.311, "step": 260 }, { "epoch": 0.07, "learning_rate": 0.0002868864468864469, "loss": 0.4537, "step": 280 }, { "epoch": 0.07, "eval_jit_compilation_time": 2.7823, "eval_loss": 0.9320486187934875, "eval_runtime": 16.1043, "eval_samples_per_second": 0.31, "eval_steps_per_second": 0.31, "step": 280 }, { "epoch": 0.07, "learning_rate": 0.0002854212454212454, "loss": 0.427, "step": 300 }, { "epoch": 0.07, "eval_jit_compilation_time": 3.2166, "eval_loss": 0.9320515394210815, "eval_runtime": 16.1134, "eval_samples_per_second": 0.31, "eval_steps_per_second": 0.31, "step": 300 }, { "epoch": 0.08, "learning_rate": 0.0002839560439560439, "loss": 0.526, "step": 320 }, { "epoch": 0.08, "eval_jit_compilation_time": 2.7763, "eval_loss": 0.9264691472053528, "eval_runtime": 16.0959, "eval_samples_per_second": 0.311, "eval_steps_per_second": 0.311, "step": 320 }, { "epoch": 0.08, "learning_rate": 0.0002824908424908425, "loss": 0.5597, "step": 340 }, { "epoch": 0.08, "eval_jit_compilation_time": 2.8594, "eval_loss": 0.9179269671440125, "eval_runtime": 16.0993, "eval_samples_per_second": 0.311, "eval_steps_per_second": 0.311, "step": 340 }, { "epoch": 0.09, "learning_rate": 0.000281025641025641, "loss": 0.4573, "step": 360 }, { "epoch": 0.09, "eval_jit_compilation_time": 2.8621, "eval_loss": 0.9196782112121582, "eval_runtime": 16.1058, "eval_samples_per_second": 0.31, "eval_steps_per_second": 0.31, "step": 360 }, { "epoch": 0.09, "learning_rate": 0.0002795604395604395, "loss": 0.4637, "step": 380 }, { "epoch": 0.09, "eval_jit_compilation_time": 2.8753, "eval_loss": 0.9178407788276672, "eval_runtime": 16.0965, "eval_samples_per_second": 0.311, "eval_steps_per_second": 0.311, "step": 380 }, { "epoch": 0.1, "learning_rate": 0.0002780952380952381, "loss": 0.4727, "step": 400 }, { "epoch": 0.1, "eval_jit_compilation_time": 3.2977, "eval_loss": 0.9196626543998718, "eval_runtime": 16.1017, "eval_samples_per_second": 0.311, "eval_steps_per_second": 0.311, "step": 400 }, { "epoch": 0.1, "learning_rate": 0.0002766300366300366, "loss": 0.4375, "step": 420 }, { "epoch": 0.1, "eval_jit_compilation_time": 2.897, "eval_loss": 0.918095588684082, "eval_runtime": 16.0943, "eval_samples_per_second": 0.311, "eval_steps_per_second": 0.311, "step": 420 }, { "epoch": 0.1, "learning_rate": 0.0002751648351648351, "loss": 0.4282, "step": 440 }, { "epoch": 0.1, "eval_jit_compilation_time": 2.8358, "eval_loss": 0.9136374592781067, "eval_runtime": 16.1052, "eval_samples_per_second": 0.31, "eval_steps_per_second": 0.31, "step": 440 }, { "epoch": 0.11, "learning_rate": 0.00027369963369963364, "loss": 0.4355, "step": 460 }, { "epoch": 0.11, "eval_jit_compilation_time": 2.878, "eval_loss": 0.9091897010803223, "eval_runtime": 16.101, "eval_samples_per_second": 0.311, "eval_steps_per_second": 0.311, "step": 460 }, { "epoch": 0.11, "learning_rate": 0.0002722344322344322, "loss": 0.3906, "step": 480 }, { "epoch": 0.11, "eval_jit_compilation_time": 3.3423, "eval_loss": 0.9091867208480835, "eval_runtime": 16.0988, "eval_samples_per_second": 0.311, "eval_steps_per_second": 0.311, "step": 480 }, { "epoch": 0.12, "learning_rate": 0.00027076923076923073, "loss": 0.4685, "step": 500 }, { "epoch": 0.12, "eval_jit_compilation_time": 2.8264, "eval_loss": 0.9081636667251587, "eval_runtime": 16.0993, "eval_samples_per_second": 0.311, "eval_steps_per_second": 0.311, "step": 500 }, { "epoch": 0.12, "learning_rate": 0.0002693040293040293, "loss": 0.2393, "step": 520 }, { "epoch": 0.12, "eval_jit_compilation_time": 2.8497, "eval_loss": 0.9099282026290894, "eval_runtime": 16.105, "eval_samples_per_second": 0.31, "eval_steps_per_second": 0.31, "step": 520 }, { "epoch": 0.13, "learning_rate": 0.0002678388278388278, "loss": 0.3558, "step": 540 }, { "epoch": 0.13, "eval_jit_compilation_time": 2.9202, "eval_loss": 0.9030946493148804, "eval_runtime": 16.1008, "eval_samples_per_second": 0.311, "eval_steps_per_second": 0.311, "step": 540 }, { "epoch": 0.13, "learning_rate": 0.00026637362637362634, "loss": 0.3544, "step": 560 }, { "epoch": 0.13, "eval_jit_compilation_time": 3.4423, "eval_loss": 0.9009218215942383, "eval_runtime": 16.111, "eval_samples_per_second": 0.31, "eval_steps_per_second": 0.31, "step": 560 }, { "epoch": 0.14, "learning_rate": 0.00026490842490842486, "loss": 0.4759, "step": 580 }, { "epoch": 0.14, "eval_jit_compilation_time": 2.9149, "eval_loss": 0.9010920524597168, "eval_runtime": 16.1115, "eval_samples_per_second": 0.31, "eval_steps_per_second": 0.31, "step": 580 }, { "epoch": 0.14, "learning_rate": 0.00026344322344322343, "loss": 0.5252, "step": 600 }, { "epoch": 0.14, "eval_jit_compilation_time": 2.8251, "eval_loss": 0.9025772213935852, "eval_runtime": 16.0975, "eval_samples_per_second": 0.311, "eval_steps_per_second": 0.311, "step": 600 }, { "epoch": 0.15, "learning_rate": 0.00026197802197802195, "loss": 0.5811, "step": 620 }, { "epoch": 0.15, "eval_jit_compilation_time": 2.9244, "eval_loss": 0.8996238708496094, "eval_runtime": 16.106, "eval_samples_per_second": 0.31, "eval_steps_per_second": 0.31, "step": 620 }, { "epoch": 0.15, "learning_rate": 0.0002605128205128205, "loss": 0.3146, "step": 640 }, { "epoch": 0.15, "eval_jit_compilation_time": 2.8689, "eval_loss": 0.900309681892395, "eval_runtime": 16.0987, "eval_samples_per_second": 0.311, "eval_steps_per_second": 0.311, "step": 640 }, { "epoch": 0.16, "learning_rate": 0.00025904761904761904, "loss": 0.3857, "step": 660 }, { "epoch": 0.16, "eval_jit_compilation_time": 3.3255, "eval_loss": 0.8993622064590454, "eval_runtime": 16.1145, "eval_samples_per_second": 0.31, "eval_steps_per_second": 0.31, "step": 660 }, { "epoch": 0.16, "learning_rate": 0.00025758241758241755, "loss": 0.5575, "step": 680 }, { "epoch": 0.16, "eval_jit_compilation_time": 2.809, "eval_loss": 0.8985673785209656, "eval_runtime": 16.0961, "eval_samples_per_second": 0.311, "eval_steps_per_second": 0.311, "step": 680 }, { "epoch": 0.17, "learning_rate": 0.00025611721611721607, "loss": 0.3649, "step": 700 }, { "epoch": 0.17, "eval_jit_compilation_time": 2.8592, "eval_loss": 0.8972913026809692, "eval_runtime": 16.1009, "eval_samples_per_second": 0.311, "eval_steps_per_second": 0.311, "step": 700 }, { "epoch": 0.17, "learning_rate": 0.00025465201465201464, "loss": 0.5024, "step": 720 }, { "epoch": 0.17, "eval_jit_compilation_time": 2.8022, "eval_loss": 0.8976465463638306, "eval_runtime": 16.1105, "eval_samples_per_second": 0.31, "eval_steps_per_second": 0.31, "step": 720 }, { "epoch": 0.18, "learning_rate": 0.00025318681318681316, "loss": 0.3982, "step": 740 }, { "epoch": 0.18, "eval_jit_compilation_time": 2.8212, "eval_loss": 0.8923035860061646, "eval_runtime": 16.1032, "eval_samples_per_second": 0.31, "eval_steps_per_second": 0.31, "step": 740 }, { "epoch": 0.18, "learning_rate": 0.00025172161172161173, "loss": 0.3255, "step": 760 }, { "epoch": 0.18, "eval_jit_compilation_time": 3.3632, "eval_loss": 0.892743706703186, "eval_runtime": 16.0973, "eval_samples_per_second": 0.311, "eval_steps_per_second": 0.311, "step": 760 }, { "epoch": 0.19, "learning_rate": 0.00025025641025641025, "loss": 0.4907, "step": 780 }, { "epoch": 0.19, "eval_jit_compilation_time": 2.8228, "eval_loss": 0.8934652209281921, "eval_runtime": 16.1079, "eval_samples_per_second": 0.31, "eval_steps_per_second": 0.31, "step": 780 }, { "epoch": 0.19, "learning_rate": 0.00024879120879120877, "loss": 0.4249, "step": 800 }, { "epoch": 0.19, "eval_jit_compilation_time": 2.8419, "eval_loss": 0.8928836584091187, "eval_runtime": 16.1123, "eval_samples_per_second": 0.31, "eval_steps_per_second": 0.31, "step": 800 }, { "epoch": 0.2, "learning_rate": 0.0002473260073260073, "loss": 0.4295, "step": 820 }, { "epoch": 0.2, "eval_jit_compilation_time": 2.8321, "eval_loss": 0.8941049575805664, "eval_runtime": 16.1016, "eval_samples_per_second": 0.311, "eval_steps_per_second": 0.311, "step": 820 }, { "epoch": 0.2, "learning_rate": 0.00024586080586080585, "loss": 0.4235, "step": 840 }, { "epoch": 0.2, "eval_jit_compilation_time": 3.4624, "eval_loss": 0.8888626098632812, "eval_runtime": 16.1033, "eval_samples_per_second": 0.31, "eval_steps_per_second": 0.31, "step": 840 }, { "epoch": 0.21, "learning_rate": 0.00024439560439560437, "loss": 0.5269, "step": 860 }, { "epoch": 0.21, "eval_jit_compilation_time": 2.817, "eval_loss": 0.8869616389274597, "eval_runtime": 16.1029, "eval_samples_per_second": 0.311, "eval_steps_per_second": 0.311, "step": 860 }, { "epoch": 0.21, "learning_rate": 0.00024293040293040292, "loss": 0.4359, "step": 880 }, { "epoch": 0.21, "eval_jit_compilation_time": 2.8776, "eval_loss": 0.8852108120918274, "eval_runtime": 16.1053, "eval_samples_per_second": 0.31, "eval_steps_per_second": 0.31, "step": 880 }, { "epoch": 0.21, "learning_rate": 0.00024146520146520146, "loss": 0.5236, "step": 900 }, { "epoch": 0.21, "eval_jit_compilation_time": 2.8297, "eval_loss": 0.886044979095459, "eval_runtime": 16.1072, "eval_samples_per_second": 0.31, "eval_steps_per_second": 0.31, "step": 900 }, { "epoch": 0.22, "learning_rate": 0.00023999999999999998, "loss": 0.2865, "step": 920 }, { "epoch": 0.22, "eval_jit_compilation_time": 2.8516, "eval_loss": 0.8847950100898743, "eval_runtime": 16.1039, "eval_samples_per_second": 0.31, "eval_steps_per_second": 0.31, "step": 920 }, { "epoch": 0.22, "learning_rate": 0.0002385347985347985, "loss": 0.444, "step": 940 }, { "epoch": 0.22, "eval_jit_compilation_time": 2.8862, "eval_loss": 0.8814730644226074, "eval_runtime": 16.1042, "eval_samples_per_second": 0.31, "eval_steps_per_second": 0.31, "step": 940 }, { "epoch": 0.23, "learning_rate": 0.00023706959706959704, "loss": 0.4293, "step": 960 }, { "epoch": 0.23, "eval_jit_compilation_time": 3.5717, "eval_loss": 0.8812665939331055, "eval_runtime": 16.0995, "eval_samples_per_second": 0.311, "eval_steps_per_second": 0.311, "step": 960 }, { "epoch": 0.23, "learning_rate": 0.00023560439560439559, "loss": 0.6338, "step": 980 }, { "epoch": 0.23, "eval_jit_compilation_time": 2.8846, "eval_loss": 0.8828846216201782, "eval_runtime": 16.1117, "eval_samples_per_second": 0.31, "eval_steps_per_second": 0.31, "step": 980 }, { "epoch": 0.24, "learning_rate": 0.00023413919413919413, "loss": 0.4094, "step": 1000 }, { "epoch": 0.24, "eval_jit_compilation_time": 2.8559, "eval_loss": 0.8811469078063965, "eval_runtime": 16.1036, "eval_samples_per_second": 0.31, "eval_steps_per_second": 0.31, "step": 1000 }, { "epoch": 0.24, "learning_rate": 0.00023267399267399265, "loss": 0.5024, "step": 1020 }, { "epoch": 0.24, "eval_jit_compilation_time": 2.8475, "eval_loss": 0.878176212310791, "eval_runtime": 16.0953, "eval_samples_per_second": 0.311, "eval_steps_per_second": 0.311, "step": 1020 }, { "epoch": 0.25, "learning_rate": 0.0002312087912087912, "loss": 0.2928, "step": 1040 }, { "epoch": 0.25, "eval_jit_compilation_time": 2.9032, "eval_loss": 0.8761852979660034, "eval_runtime": 16.1082, "eval_samples_per_second": 0.31, "eval_steps_per_second": 0.31, "step": 1040 }, { "epoch": 0.25, "learning_rate": 0.00022974358974358974, "loss": 0.6423, "step": 1060 }, { "epoch": 0.25, "eval_jit_compilation_time": 3.552, "eval_loss": 0.8761229515075684, "eval_runtime": 16.111, "eval_samples_per_second": 0.31, "eval_steps_per_second": 0.31, "step": 1060 }, { "epoch": 0.26, "learning_rate": 0.00022827838827838825, "loss": 0.4124, "step": 1080 }, { "epoch": 0.26, "eval_jit_compilation_time": 2.8397, "eval_loss": 0.876449704170227, "eval_runtime": 16.0991, "eval_samples_per_second": 0.311, "eval_steps_per_second": 0.311, "step": 1080 }, { "epoch": 0.26, "learning_rate": 0.0002268131868131868, "loss": 0.3806, "step": 1100 }, { "epoch": 0.26, "eval_jit_compilation_time": 2.8968, "eval_loss": 0.8773566484451294, "eval_runtime": 16.1014, "eval_samples_per_second": 0.311, "eval_steps_per_second": 0.311, "step": 1100 }, { "epoch": 0.27, "learning_rate": 0.00022534798534798532, "loss": 0.3676, "step": 1120 }, { "epoch": 0.27, "eval_jit_compilation_time": 2.895, "eval_loss": 0.8758118748664856, "eval_runtime": 16.0982, "eval_samples_per_second": 0.311, "eval_steps_per_second": 0.311, "step": 1120 }, { "epoch": 0.27, "learning_rate": 0.00022388278388278386, "loss": 0.6223, "step": 1140 }, { "epoch": 0.27, "eval_jit_compilation_time": 2.8319, "eval_loss": 0.8716633915901184, "eval_runtime": 16.1012, "eval_samples_per_second": 0.311, "eval_steps_per_second": 0.311, "step": 1140 }, { "epoch": 0.28, "learning_rate": 0.0002224175824175824, "loss": 0.3516, "step": 1160 }, { "epoch": 0.28, "eval_jit_compilation_time": 3.5168, "eval_loss": 0.8740739822387695, "eval_runtime": 16.1038, "eval_samples_per_second": 0.31, "eval_steps_per_second": 0.31, "step": 1160 }, { "epoch": 0.28, "learning_rate": 0.00022095238095238095, "loss": 0.5225, "step": 1180 }, { "epoch": 0.28, "eval_jit_compilation_time": 2.9322, "eval_loss": 0.8755660057067871, "eval_runtime": 16.1016, "eval_samples_per_second": 0.311, "eval_steps_per_second": 0.311, "step": 1180 }, { "epoch": 0.29, "learning_rate": 0.00021948717948717947, "loss": 0.3976, "step": 1200 }, { "epoch": 0.29, "eval_jit_compilation_time": 2.8372, "eval_loss": 0.871749758720398, "eval_runtime": 16.0924, "eval_samples_per_second": 0.311, "eval_steps_per_second": 0.311, "step": 1200 }, { "epoch": 0.29, "learning_rate": 0.00021802197802197798, "loss": 0.4755, "step": 1220 }, { "epoch": 0.29, "eval_jit_compilation_time": 2.8375, "eval_loss": 0.8700782060623169, "eval_runtime": 16.1058, "eval_samples_per_second": 0.31, "eval_steps_per_second": 0.31, "step": 1220 }, { "epoch": 0.3, "learning_rate": 0.00021655677655677653, "loss": 0.3923, "step": 1240 }, { "epoch": 0.3, "eval_jit_compilation_time": 2.9236, "eval_loss": 0.8688793182373047, "eval_runtime": 16.1012, "eval_samples_per_second": 0.311, "eval_steps_per_second": 0.311, "step": 1240 }, { "epoch": 0.3, "learning_rate": 0.00021509157509157507, "loss": 0.5693, "step": 1260 }, { "epoch": 0.3, "eval_jit_compilation_time": 2.9155, "eval_loss": 0.8698239326477051, "eval_runtime": 16.109, "eval_samples_per_second": 0.31, "eval_steps_per_second": 0.31, "step": 1260 }, { "epoch": 0.31, "learning_rate": 0.00021362637362637362, "loss": 0.3598, "step": 1280 }, { "epoch": 0.31, "eval_jit_compilation_time": 3.49, "eval_loss": 0.8744434118270874, "eval_runtime": 16.1139, "eval_samples_per_second": 0.31, "eval_steps_per_second": 0.31, "step": 1280 }, { "epoch": 0.31, "learning_rate": 0.00021216117216117216, "loss": 0.3065, "step": 1300 }, { "epoch": 0.31, "eval_jit_compilation_time": 2.8613, "eval_loss": 0.8703974485397339, "eval_runtime": 16.1137, "eval_samples_per_second": 0.31, "eval_steps_per_second": 0.31, "step": 1300 }, { "epoch": 0.31, "learning_rate": 0.00021069597069597065, "loss": 0.5961, "step": 1320 }, { "epoch": 0.31, "eval_jit_compilation_time": 2.957, "eval_loss": 0.8677728772163391, "eval_runtime": 16.0971, "eval_samples_per_second": 0.311, "eval_steps_per_second": 0.311, "step": 1320 }, { "epoch": 0.32, "learning_rate": 0.0002092307692307692, "loss": 0.34, "step": 1340 }, { "epoch": 0.32, "eval_jit_compilation_time": 2.8442, "eval_loss": 0.8699345588684082, "eval_runtime": 16.105, "eval_samples_per_second": 0.31, "eval_steps_per_second": 0.31, "step": 1340 }, { "epoch": 0.32, "learning_rate": 0.00020776556776556774, "loss": 0.5374, "step": 1360 }, { "epoch": 0.32, "eval_jit_compilation_time": 2.9256, "eval_loss": 0.8692423105239868, "eval_runtime": 16.1085, "eval_samples_per_second": 0.31, "eval_steps_per_second": 0.31, "step": 1360 }, { "epoch": 0.33, "learning_rate": 0.0002063003663003663, "loss": 0.4794, "step": 1380 }, { "epoch": 0.33, "eval_jit_compilation_time": 3.6522, "eval_loss": 0.8717620968818665, "eval_runtime": 16.112, "eval_samples_per_second": 0.31, "eval_steps_per_second": 0.31, "step": 1380 }, { "epoch": 0.33, "learning_rate": 0.00020483516483516483, "loss": 0.4093, "step": 1400 }, { "epoch": 0.33, "eval_jit_compilation_time": 2.9303, "eval_loss": 0.8726860284805298, "eval_runtime": 16.114, "eval_samples_per_second": 0.31, "eval_steps_per_second": 0.31, "step": 1400 }, { "epoch": 0.34, "learning_rate": 0.00020336996336996335, "loss": 0.3719, "step": 1420 }, { "epoch": 0.34, "eval_jit_compilation_time": 2.8477, "eval_loss": 0.8732159733772278, "eval_runtime": 16.106, "eval_samples_per_second": 0.31, "eval_steps_per_second": 0.31, "step": 1420 }, { "epoch": 0.34, "learning_rate": 0.0002019047619047619, "loss": 0.5624, "step": 1440 }, { "epoch": 0.34, "eval_jit_compilation_time": 2.8412, "eval_loss": 0.8670560717582703, "eval_runtime": 16.1022, "eval_samples_per_second": 0.311, "eval_steps_per_second": 0.311, "step": 1440 }, { "epoch": 0.35, "learning_rate": 0.0002004395604395604, "loss": 0.3062, "step": 1460 }, { "epoch": 0.35, "eval_jit_compilation_time": 2.8653, "eval_loss": 0.8671110272407532, "eval_runtime": 16.0996, "eval_samples_per_second": 0.311, "eval_steps_per_second": 0.311, "step": 1460 }, { "epoch": 0.35, "learning_rate": 0.00019897435897435896, "loss": 0.2853, "step": 1480 }, { "epoch": 0.35, "eval_jit_compilation_time": 2.8456, "eval_loss": 0.8677889704704285, "eval_runtime": 16.1105, "eval_samples_per_second": 0.31, "eval_steps_per_second": 0.31, "step": 1480 }, { "epoch": 0.36, "learning_rate": 0.0001975091575091575, "loss": 0.245, "step": 1500 }, { "epoch": 0.36, "eval_jit_compilation_time": 3.6346, "eval_loss": 0.8664909601211548, "eval_runtime": 16.1033, "eval_samples_per_second": 0.31, "eval_steps_per_second": 0.31, "step": 1500 }, { "epoch": 0.36, "learning_rate": 0.00019604395604395602, "loss": 0.4323, "step": 1520 }, { "epoch": 0.36, "eval_jit_compilation_time": 2.8753, "eval_loss": 0.8659638166427612, "eval_runtime": 16.1015, "eval_samples_per_second": 0.311, "eval_steps_per_second": 0.311, "step": 1520 }, { "epoch": 0.37, "learning_rate": 0.00019457875457875456, "loss": 0.6195, "step": 1540 }, { "epoch": 0.37, "eval_jit_compilation_time": 2.8473, "eval_loss": 0.8610908389091492, "eval_runtime": 16.1057, "eval_samples_per_second": 0.31, "eval_steps_per_second": 0.31, "step": 1540 }, { "epoch": 0.37, "learning_rate": 0.0001931135531135531, "loss": 0.3475, "step": 1560 }, { "epoch": 0.37, "eval_jit_compilation_time": 2.8942, "eval_loss": 0.8603633046150208, "eval_runtime": 16.1105, "eval_samples_per_second": 0.31, "eval_steps_per_second": 0.31, "step": 1560 }, { "epoch": 0.38, "learning_rate": 0.00019164835164835162, "loss": 0.5757, "step": 1580 }, { "epoch": 0.38, "eval_jit_compilation_time": 2.8844, "eval_loss": 0.860958456993103, "eval_runtime": 16.11, "eval_samples_per_second": 0.31, "eval_steps_per_second": 0.31, "step": 1580 }, { "epoch": 0.38, "learning_rate": 0.00019018315018315017, "loss": 0.2439, "step": 1600 }, { "epoch": 0.38, "eval_jit_compilation_time": 2.9238, "eval_loss": 0.8623489141464233, "eval_runtime": 16.0923, "eval_samples_per_second": 0.311, "eval_steps_per_second": 0.311, "step": 1600 }, { "epoch": 0.39, "learning_rate": 0.00018871794871794869, "loss": 0.2255, "step": 1620 }, { "epoch": 0.39, "eval_jit_compilation_time": 3.8041, "eval_loss": 0.8627634048461914, "eval_runtime": 16.1099, "eval_samples_per_second": 0.31, "eval_steps_per_second": 0.31, "step": 1620 }, { "epoch": 0.39, "learning_rate": 0.00018725274725274723, "loss": 0.3422, "step": 1640 }, { "epoch": 0.39, "eval_jit_compilation_time": 2.8586, "eval_loss": 0.8616544008255005, "eval_runtime": 16.1045, "eval_samples_per_second": 0.31, "eval_steps_per_second": 0.31, "step": 1640 }, { "epoch": 0.4, "learning_rate": 0.00018578754578754578, "loss": 0.5345, "step": 1660 }, { "epoch": 0.4, "eval_jit_compilation_time": 2.8662, "eval_loss": 0.8599859476089478, "eval_runtime": 16.1033, "eval_samples_per_second": 0.31, "eval_steps_per_second": 0.31, "step": 1660 }, { "epoch": 0.4, "learning_rate": 0.00018432234432234432, "loss": 0.5832, "step": 1680 }, { "epoch": 0.4, "eval_jit_compilation_time": 2.8329, "eval_loss": 0.8589729070663452, "eval_runtime": 16.0986, "eval_samples_per_second": 0.311, "eval_steps_per_second": 0.311, "step": 1680 }, { "epoch": 0.41, "learning_rate": 0.00018285714285714286, "loss": 0.4712, "step": 1700 }, { "epoch": 0.41, "eval_jit_compilation_time": 2.9638, "eval_loss": 0.8594633340835571, "eval_runtime": 16.1038, "eval_samples_per_second": 0.31, "eval_steps_per_second": 0.31, "step": 1700 }, { "epoch": 0.41, "learning_rate": 0.00018139194139194135, "loss": 0.4541, "step": 1720 }, { "epoch": 0.41, "eval_jit_compilation_time": 2.9613, "eval_loss": 0.8567690849304199, "eval_runtime": 16.1073, "eval_samples_per_second": 0.31, "eval_steps_per_second": 0.31, "step": 1720 }, { "epoch": 0.41, "learning_rate": 0.0001799267399267399, "loss": 0.3453, "step": 1740 }, { "epoch": 0.41, "eval_jit_compilation_time": 3.6765, "eval_loss": 0.8566139340400696, "eval_runtime": 16.0971, "eval_samples_per_second": 0.311, "eval_steps_per_second": 0.311, "step": 1740 }, { "epoch": 0.42, "learning_rate": 0.00017846153846153844, "loss": 0.5157, "step": 1760 }, { "epoch": 0.42, "eval_jit_compilation_time": 2.8559, "eval_loss": 0.8555769920349121, "eval_runtime": 16.0975, "eval_samples_per_second": 0.311, "eval_steps_per_second": 0.311, "step": 1760 }, { "epoch": 0.42, "learning_rate": 0.000176996336996337, "loss": 0.3657, "step": 1780 }, { "epoch": 0.42, "eval_jit_compilation_time": 2.9078, "eval_loss": 0.8547216653823853, "eval_runtime": 16.1086, "eval_samples_per_second": 0.31, "eval_steps_per_second": 0.31, "step": 1780 }, { "epoch": 0.43, "learning_rate": 0.00017553113553113553, "loss": 0.344, "step": 1800 }, { "epoch": 0.43, "eval_jit_compilation_time": 2.9004, "eval_loss": 0.854417622089386, "eval_runtime": 16.106, "eval_samples_per_second": 0.31, "eval_steps_per_second": 0.31, "step": 1800 }, { "epoch": 0.43, "learning_rate": 0.00017406593406593408, "loss": 0.3656, "step": 1820 }, { "epoch": 0.43, "eval_jit_compilation_time": 2.8703, "eval_loss": 0.8565861582756042, "eval_runtime": 16.0995, "eval_samples_per_second": 0.311, "eval_steps_per_second": 0.311, "step": 1820 }, { "epoch": 0.44, "learning_rate": 0.00017260073260073257, "loss": 0.5061, "step": 1840 }, { "epoch": 0.44, "eval_jit_compilation_time": 2.8473, "eval_loss": 0.855994701385498, "eval_runtime": 16.0977, "eval_samples_per_second": 0.311, "eval_steps_per_second": 0.311, "step": 1840 }, { "epoch": 0.44, "learning_rate": 0.0001711355311355311, "loss": 0.3537, "step": 1860 }, { "epoch": 0.44, "eval_jit_compilation_time": 2.8662, "eval_loss": 0.8566320538520813, "eval_runtime": 16.1019, "eval_samples_per_second": 0.311, "eval_steps_per_second": 0.311, "step": 1860 }, { "epoch": 0.45, "learning_rate": 0.00016967032967032966, "loss": 0.3224, "step": 1880 }, { "epoch": 0.45, "eval_jit_compilation_time": 3.6742, "eval_loss": 0.8573530912399292, "eval_runtime": 16.0971, "eval_samples_per_second": 0.311, "eval_steps_per_second": 0.311, "step": 1880 }, { "epoch": 0.45, "learning_rate": 0.0001682051282051282, "loss": 0.4498, "step": 1900 }, { "epoch": 0.45, "eval_jit_compilation_time": 2.8304, "eval_loss": 0.8554651141166687, "eval_runtime": 16.1094, "eval_samples_per_second": 0.31, "eval_steps_per_second": 0.31, "step": 1900 }, { "epoch": 0.46, "learning_rate": 0.00016673992673992672, "loss": 0.442, "step": 1920 }, { "epoch": 0.46, "eval_jit_compilation_time": 2.8894, "eval_loss": 0.8565356135368347, "eval_runtime": 16.1074, "eval_samples_per_second": 0.31, "eval_steps_per_second": 0.31, "step": 1920 }, { "epoch": 0.46, "learning_rate": 0.00016527472527472526, "loss": 0.4293, "step": 1940 }, { "epoch": 0.46, "eval_jit_compilation_time": 2.8807, "eval_loss": 0.8563782572746277, "eval_runtime": 16.1094, "eval_samples_per_second": 0.31, "eval_steps_per_second": 0.31, "step": 1940 }, { "epoch": 0.47, "learning_rate": 0.00016380952380952378, "loss": 0.4025, "step": 1960 }, { "epoch": 0.47, "eval_jit_compilation_time": 2.8437, "eval_loss": 0.8553330302238464, "eval_runtime": 16.1119, "eval_samples_per_second": 0.31, "eval_steps_per_second": 0.31, "step": 1960 }, { "epoch": 0.47, "learning_rate": 0.00016234432234432233, "loss": 0.301, "step": 1980 }, { "epoch": 0.47, "eval_jit_compilation_time": 2.8788, "eval_loss": 0.8556132316589355, "eval_runtime": 16.0985, "eval_samples_per_second": 0.311, "eval_steps_per_second": 0.311, "step": 1980 }, { "epoch": 0.48, "learning_rate": 0.00016087912087912087, "loss": 0.5224, "step": 2000 }, { "epoch": 0.48, "eval_jit_compilation_time": 3.7117, "eval_loss": 0.8564403653144836, "eval_runtime": 16.1036, "eval_samples_per_second": 0.31, "eval_steps_per_second": 0.31, "step": 2000 }, { "epoch": 0.48, "learning_rate": 0.0001594139194139194, "loss": 0.6616, "step": 2020 }, { "epoch": 0.48, "eval_jit_compilation_time": 2.8513, "eval_loss": 0.8541139364242554, "eval_runtime": 16.1106, "eval_samples_per_second": 0.31, "eval_steps_per_second": 0.31, "step": 2020 }, { "epoch": 0.49, "learning_rate": 0.00015794871794871793, "loss": 0.3086, "step": 2040 }, { "epoch": 0.49, "eval_jit_compilation_time": 2.844, "eval_loss": 0.8550373911857605, "eval_runtime": 16.1059, "eval_samples_per_second": 0.31, "eval_steps_per_second": 0.31, "step": 2040 }, { "epoch": 0.49, "learning_rate": 0.00015648351648351648, "loss": 0.4124, "step": 2060 }, { "epoch": 0.49, "eval_jit_compilation_time": 2.8623, "eval_loss": 0.8542302846908569, "eval_runtime": 16.1038, "eval_samples_per_second": 0.31, "eval_steps_per_second": 0.31, "step": 2060 }, { "epoch": 0.5, "learning_rate": 0.00015501831501831502, "loss": 0.4167, "step": 2080 }, { "epoch": 0.5, "eval_jit_compilation_time": 2.8838, "eval_loss": 0.8524497747421265, "eval_runtime": 16.1019, "eval_samples_per_second": 0.311, "eval_steps_per_second": 0.311, "step": 2080 }, { "epoch": 0.5, "learning_rate": 0.00015355311355311354, "loss": 0.3931, "step": 2100 }, { "epoch": 0.5, "eval_jit_compilation_time": 2.9311, "eval_loss": 0.8538318872451782, "eval_runtime": 16.1114, "eval_samples_per_second": 0.31, "eval_steps_per_second": 0.31, "step": 2100 }, { "epoch": 0.51, "learning_rate": 0.00015208791208791206, "loss": 0.516, "step": 2120 }, { "epoch": 0.51, "eval_jit_compilation_time": 2.9007, "eval_loss": 0.8529971837997437, "eval_runtime": 16.1094, "eval_samples_per_second": 0.31, "eval_steps_per_second": 0.31, "step": 2120 }, { "epoch": 0.51, "learning_rate": 0.0001506227106227106, "loss": 0.2885, "step": 2140 }, { "epoch": 0.51, "eval_jit_compilation_time": 3.7926, "eval_loss": 0.8536807894706726, "eval_runtime": 16.0987, "eval_samples_per_second": 0.311, "eval_steps_per_second": 0.311, "step": 2140 }, { "epoch": 0.51, "learning_rate": 0.00014915750915750915, "loss": 0.3312, "step": 2160 }, { "epoch": 0.51, "eval_jit_compilation_time": 2.845, "eval_loss": 0.8531376123428345, "eval_runtime": 16.0991, "eval_samples_per_second": 0.311, "eval_steps_per_second": 0.311, "step": 2160 }, { "epoch": 0.52, "learning_rate": 0.0001476923076923077, "loss": 0.4314, "step": 2180 }, { "epoch": 0.52, "eval_jit_compilation_time": 2.8318, "eval_loss": 0.8527898788452148, "eval_runtime": 16.1058, "eval_samples_per_second": 0.31, "eval_steps_per_second": 0.31, "step": 2180 } ], "logging_steps": 20, "max_steps": 4195, "num_train_epochs": 1, "save_steps": 20, "total_flos": 8.083361658322944e+16, "trial_name": null, "trial_params": null }