diff --git "a/training.log" "b/training.log" new file mode 100644--- /dev/null +++ "b/training.log" @@ -0,0 +1,2470 @@ +[2024-01-22 00:45:45,205] [INFO] [real_accelerator.py:161:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2024-01-22 00:45:47,429] [WARNING] [runner.py:202:fetch_hostfile] Unable to find hostfile, will proceed with training with local resources only. +[2024-01-22 00:45:47,430] [INFO] [runner.py:571:main] cmd = /hpc2hdd/home/yli258/.conda/envs/graphR/bin/python -u -m deepspeed.launcher.launch --world_info=eyJsb2NhbGhvc3QiOiBbMCwgMSwgMiwgMywgNCwgNSwgNiwgN119 --master_addr=127.0.0.1 --master_port=29501 --enable_each_rank_log=None main.py --data_path local/jsonfile --data_split 10,0,0 --model_name_or_path /hpc2hdd/home/yli258/olddata/yuhanli/LLaMA_v2_ckpts/hf/Llama-2-13b-hf --per_device_train_batch_size 4 --per_device_eval_batch_size 2 --max_seq_len 2048 --learning_rate 2e-5 --weight_decay 0. --num_train_epochs 3 --gradient_accumulation_steps 1 --lr_scheduler_type cosine --num_warmup_steps 100 --seed 1234 --zero_stage 3 --deepspeed --gradient_checkpointing --output_dir /hpc2hdd/home/yli258/jhaidata/Graph-Reasoning-LLM/ckpts/llama_13b_rft_v1_k5/ +[2024-01-22 00:45:49,050] [INFO] [real_accelerator.py:161:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2024-01-22 00:45:50,929] [INFO] [launch.py:138:main] 0 NV_LIBNCCL_DEV_PACKAGE=libnccl-dev=2.15.5-1+cuda11.8 +[2024-01-22 00:45:50,929] [INFO] [launch.py:138:main] 0 NV_LIBNCCL_DEV_PACKAGE_VERSION=2.15.5-1 +[2024-01-22 00:45:50,930] [INFO] [launch.py:138:main] 0 NCCL_VERSION=2.15.5-1 +[2024-01-22 00:45:50,930] [INFO] [launch.py:138:main] 0 NCCL_SOCKET_IFNAME=eth1 +[2024-01-22 00:45:50,930] [INFO] [launch.py:138:main] 0 NV_LIBNCCL_DEV_PACKAGE_NAME=libnccl-dev +[2024-01-22 00:45:50,930] [INFO] [launch.py:138:main] 0 NV_LIBNCCL_PACKAGE=libnccl2=2.15.5-1+cuda11.8 +[2024-01-22 00:45:50,930] [INFO] [launch.py:138:main] 0 NV_LIBNCCL_PACKAGE_NAME=libnccl2 +[2024-01-22 00:45:50,930] [INFO] [launch.py:138:main] 0 NV_LIBNCCL_PACKAGE_VERSION=2.15.5-1 +[2024-01-22 00:45:50,930] [INFO] [launch.py:145:main] WORLD INFO DICT: {'localhost': [0, 1, 2, 3, 4, 5, 6, 7]} +[2024-01-22 00:45:50,930] [INFO] [launch.py:151:main] nnodes=1, num_local_procs=8, node_rank=0 +[2024-01-22 00:45:50,930] [INFO] [launch.py:162:main] global_rank_mapping=defaultdict(, {'localhost': [0, 1, 2, 3, 4, 5, 6, 7]}) +[2024-01-22 00:45:50,930] [INFO] [launch.py:163:main] dist_world_size=8 +[2024-01-22 00:45:50,930] [INFO] [launch.py:165:main] Setting CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 +[2024-01-22 00:45:52,973] [INFO] [real_accelerator.py:161:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2024-01-22 00:45:53,012] [INFO] [real_accelerator.py:161:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2024-01-22 00:45:53,044] [INFO] [real_accelerator.py:161:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2024-01-22 00:45:53,044] [INFO] [real_accelerator.py:161:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2024-01-22 00:45:53,053] [INFO] [real_accelerator.py:161:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2024-01-22 00:45:53,053] [INFO] [real_accelerator.py:161:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2024-01-22 00:45:53,082] [INFO] [real_accelerator.py:161:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2024-01-22 00:45:53,082] [INFO] [real_accelerator.py:161:get_accelerator] Setting ds_accelerator to cuda (auto detect) +/hpc2hdd/home/yli258/.conda/envs/graphR/lib/python3.10/site-packages/transformers/deepspeed.py:23: FutureWarning: transformers.deepspeed module is deprecated and will be removed in a future version. Please import deepspeed modules directly from transformers.integrations + warnings.warn( +/hpc2hdd/home/yli258/.conda/envs/graphR/lib/python3.10/site-packages/transformers/deepspeed.py:23: FutureWarning: transformers.deepspeed module is deprecated and will be removed in a future version. Please import deepspeed modules directly from transformers.integrations + warnings.warn( +[2024-01-22 00:45:55,604] [INFO] [comm.py:637:init_distributed] cdb=None +[2024-01-22 00:45:55,607] [INFO] [comm.py:637:init_distributed] cdb=None +/hpc2hdd/home/yli258/.conda/envs/graphR/lib/python3.10/site-packages/transformers/deepspeed.py:23: FutureWarning: transformers.deepspeed module is deprecated and will be removed in a future version. Please import deepspeed modules directly from transformers.integrations + warnings.warn( +[2024-01-22 00:45:55,726] [INFO] [comm.py:637:init_distributed] cdb=None +/hpc2hdd/home/yli258/.conda/envs/graphR/lib/python3.10/site-packages/transformers/deepspeed.py:23: FutureWarning: transformers.deepspeed module is deprecated and will be removed in a future version. Please import deepspeed modules directly from transformers.integrations + warnings.warn( +/hpc2hdd/home/yli258/.conda/envs/graphR/lib/python3.10/site-packages/transformers/deepspeed.py:23: FutureWarning: transformers.deepspeed module is deprecated and will be removed in a future version. Please import deepspeed modules directly from transformers.integrations + warnings.warn( +[2024-01-22 00:45:55,746] [INFO] [comm.py:637:init_distributed] cdb=None +[2024-01-22 00:45:55,754] [INFO] [comm.py:637:init_distributed] cdb=None +/hpc2hdd/home/yli258/.conda/envs/graphR/lib/python3.10/site-packages/transformers/deepspeed.py:23: FutureWarning: transformers.deepspeed module is deprecated and will be removed in a future version. Please import deepspeed modules directly from transformers.integrations + warnings.warn( +/hpc2hdd/home/yli258/.conda/envs/graphR/lib/python3.10/site-packages/transformers/deepspeed.py:23: FutureWarning: transformers.deepspeed module is deprecated and will be removed in a future version. Please import deepspeed modules directly from transformers.integrations + warnings.warn( +[2024-01-22 00:45:55,774] [INFO] [comm.py:637:init_distributed] cdb=None +[2024-01-22 00:45:55,775] [INFO] [comm.py:637:init_distributed] cdb=None +/hpc2hdd/home/yli258/.conda/envs/graphR/lib/python3.10/site-packages/transformers/deepspeed.py:23: FutureWarning: transformers.deepspeed module is deprecated and will be removed in a future version. Please import deepspeed modules directly from transformers.integrations + warnings.warn( +[2024-01-22 00:45:55,788] [INFO] [comm.py:637:init_distributed] cdb=None +[2024-01-22 00:45:55,788] [INFO] [comm.py:668:init_distributed] Initializing TorchBackend in DeepSpeed with backend nccl +loading from ... /hpc2hdd/home/yli258/olddata/yuhanli/LLaMA_v2_ckpts/hf/Llama-2-13b-hf +loading from ...loading from ...loading from ... /hpc2hdd/home/yli258/olddata/yuhanli/LLaMA_v2_ckpts/hf/Llama-2-13b-hf/hpc2hdd/home/yli258/olddata/yuhanli/LLaMA_v2_ckpts/hf/Llama-2-13b-hf + +/hpc2hdd/home/yli258/olddata/yuhanli/LLaMA_v2_ckpts/hf/Llama-2-13b-hf +loading from ... /hpc2hdd/home/yli258/olddata/yuhanli/LLaMA_v2_ckpts/hf/Llama-2-13b-hf +loading from ...loading from ...loading from ... /hpc2hdd/home/yli258/olddata/yuhanli/LLaMA_v2_ckpts/hf/Llama-2-13b-hf /hpc2hdd/home/yli258/olddata/yuhanli/LLaMA_v2_ckpts/hf/Llama-2-13b-hf +/hpc2hdd/home/yli258/olddata/yuhanli/LLaMA_v2_ckpts/hf/Llama-2-13b-hf + +[2024-01-22 00:45:58,896] [INFO] [partition_parameters.py:348:__exit__] finished initializing model - num_params = 363, num_elems = 13.02B + +Loading checkpoint shards: 0%| | 0/3 [00:00 +[2024-01-22 00:54:53,869] [INFO] [logging.py:96:log_dist] [Rank 0] Creating fp16 ZeRO stage 3 optimizer, MiCS is enabled False, Hierarchical params gather False +[2024-01-22 00:54:53,869] [INFO] [logging.py:96:log_dist] [Rank 0] Creating torch.bfloat16 ZeRO stage 3 optimizer +[2024-01-22 00:54:54,239] [INFO] [utils.py:791:see_memory_usage] Stage 3 initialize beginning +[2024-01-22 00:54:54,240] [INFO] [utils.py:792:see_memory_usage] MA 3.65 GB Max_MA 4.34 GB CA 5.26 GB Max_CA 5 GB +[2024-01-22 00:54:54,240] [INFO] [utils.py:799:see_memory_usage] CPU Virtual Memory: used = 113.46 GB, percent = 5.6% +[2024-01-22 00:54:54,242] [INFO] [stage3.py:127:__init__] Reduce bucket size 500,000,000 +[2024-01-22 00:54:54,242] [INFO] [stage3.py:128:__init__] Prefetch bucket size 30000000 +[2024-01-22 00:54:54,609] [INFO] [utils.py:791:see_memory_usage] DeepSpeedZeRoOffload initialize [begin] +[2024-01-22 00:54:54,609] [INFO] [utils.py:792:see_memory_usage] MA 3.65 GB Max_MA 3.65 GB CA 5.26 GB Max_CA 5 GB +[2024-01-22 00:54:54,610] [INFO] [utils.py:799:see_memory_usage] CPU Virtual Memory: used = 113.46 GB, percent = 5.6% +Parameter Offload: Total persistent parameters: 414720 in 81 params +[2024-01-22 00:54:54,999] [INFO] [utils.py:791:see_memory_usage] DeepSpeedZeRoOffload initialize [end] +[2024-01-22 00:54:54,999] [INFO] [utils.py:792:see_memory_usage] MA 3.12 GB Max_MA 3.69 GB CA 5.26 GB Max_CA 5 GB +[2024-01-22 00:54:55,000] [INFO] [utils.py:799:see_memory_usage] CPU Virtual Memory: used = 113.46 GB, percent = 5.6% +[2024-01-22 00:54:55,365] [INFO] [utils.py:791:see_memory_usage] Before creating fp16 partitions +[2024-01-22 00:54:55,365] [INFO] [utils.py:792:see_memory_usage] MA 3.12 GB Max_MA 3.12 GB CA 5.26 GB Max_CA 5 GB +[2024-01-22 00:54:55,366] [INFO] [utils.py:799:see_memory_usage] CPU Virtual Memory: used = 113.45 GB, percent = 5.6% +[2024-01-22 00:54:57,551] [INFO] [utils.py:791:see_memory_usage] After creating fp16 partitions: 3 +[2024-01-22 00:54:57,552] [INFO] [utils.py:792:see_memory_usage] MA 3.11 GB Max_MA 3.12 GB CA 3.11 GB Max_CA 5 GB +[2024-01-22 00:54:57,552] [INFO] [utils.py:799:see_memory_usage] CPU Virtual Memory: used = 125.63 GB, percent = 6.2% +[2024-01-22 00:54:57,935] [INFO] [utils.py:791:see_memory_usage] Before creating fp32 partitions +[2024-01-22 00:54:57,936] [INFO] [utils.py:792:see_memory_usage] MA 3.11 GB Max_MA 3.11 GB CA 3.11 GB Max_CA 3 GB +[2024-01-22 00:54:57,936] [INFO] [utils.py:799:see_memory_usage] CPU Virtual Memory: used = 113.49 GB, percent = 5.6% +[2024-01-22 00:54:58,311] [INFO] [utils.py:791:see_memory_usage] After creating fp32 partitions +[2024-01-22 00:54:58,312] [INFO] [utils.py:792:see_memory_usage] MA 9.17 GB Max_MA 10.33 GB CA 11.04 GB Max_CA 11 GB +[2024-01-22 00:54:58,312] [INFO] [utils.py:799:see_memory_usage] CPU Virtual Memory: used = 113.49 GB, percent = 5.6% +[2024-01-22 00:54:58,676] [INFO] [utils.py:791:see_memory_usage] Before initializing optimizer states +[2024-01-22 00:54:58,677] [INFO] [utils.py:792:see_memory_usage] MA 9.17 GB Max_MA 9.17 GB CA 11.04 GB Max_CA 11 GB +[2024-01-22 00:54:58,677] [INFO] [utils.py:799:see_memory_usage] CPU Virtual Memory: used = 113.5 GB, percent = 5.6% +[2024-01-22 00:54:59,058] [INFO] [utils.py:791:see_memory_usage] After initializing optimizer states +[2024-01-22 00:54:59,059] [INFO] [utils.py:792:see_memory_usage] MA 21.29 GB Max_MA 25.03 GB CA 26.9 GB Max_CA 27 GB +[2024-01-22 00:54:59,059] [INFO] [utils.py:799:see_memory_usage] CPU Virtual Memory: used = 113.5 GB, percent = 5.6% +[2024-01-22 00:54:59,060] [INFO] [stage3.py:479:_setup_for_real_optimizer] optimizer state initialized +[2024-01-22 00:54:59,635] [INFO] [utils.py:791:see_memory_usage] After initializing ZeRO optimizer +[2024-01-22 00:54:59,636] [INFO] [utils.py:792:see_memory_usage] MA 25.26 GB Max_MA 25.87 GB CA 35.19 GB Max_CA 35 GB +[2024-01-22 00:54:59,636] [INFO] [utils.py:799:see_memory_usage] CPU Virtual Memory: used = 113.79 GB, percent = 5.6% +[2024-01-22 00:54:59,636] [INFO] [logging.py:96:log_dist] [Rank 0] DeepSpeed Final Optimizer = FusedAdam +[2024-01-22 00:54:59,636] [INFO] [logging.py:96:log_dist] [Rank 0] DeepSpeed using client LR scheduler +[2024-01-22 00:54:59,636] [INFO] [logging.py:96:log_dist] [Rank 0] DeepSpeed LR Scheduler = +[2024-01-22 00:54:59,636] [INFO] [logging.py:96:log_dist] [Rank 0] step=0, skipped=0, lr=[0.0, 0.0], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 00:54:59,638] [INFO] [config.py:984:print] DeepSpeedEngine configuration: +[2024-01-22 00:54:59,638] [INFO] [config.py:988:print] activation_checkpointing_config { + "partition_activations": false, + "contiguous_memory_optimization": false, + "cpu_checkpointing": false, + "number_checkpoints": null, + "synchronize_checkpoint_boundary": false, + "profile": false +} +[2024-01-22 00:54:59,638] [INFO] [config.py:988:print] aio_config ................... {'block_size': 1048576, 'queue_depth': 8, 'thread_count': 1, 'single_submit': False, 'overlap_events': True} +[2024-01-22 00:54:59,638] [INFO] [config.py:988:print] amp_enabled .................. False +[2024-01-22 00:54:59,638] [INFO] [config.py:988:print] amp_params ................... False +[2024-01-22 00:54:59,638] [INFO] [config.py:988:print] autotuning_config ............ { + "enabled": false, + "start_step": null, + "end_step": null, + "metric_path": null, + "arg_mappings": null, + "metric": "throughput", + "model_info": null, + "results_dir": "autotuning_results", + "exps_dir": "autotuning_exps", + "overwrite": true, + "fast": true, + "start_profile_step": 3, + "end_profile_step": 5, + "tuner_type": "gridsearch", + "tuner_early_stopping": 5, + "tuner_num_trials": 50, + "model_info_path": null, + "mp_size": 1, + "max_train_batch_size": null, + "min_train_batch_size": 1, + "max_train_micro_batch_size_per_gpu": 1.024000e+03, + "min_train_micro_batch_size_per_gpu": 1, + "num_tuning_micro_batch_sizes": 3 +} +[2024-01-22 00:54:59,638] [INFO] [config.py:988:print] bfloat16_enabled ............. True +[2024-01-22 00:54:59,638] [INFO] [config.py:988:print] checkpoint_parallel_write_pipeline False +[2024-01-22 00:54:59,638] [INFO] [config.py:988:print] checkpoint_tag_validation_enabled True +[2024-01-22 00:54:59,638] [INFO] [config.py:988:print] checkpoint_tag_validation_fail False +[2024-01-22 00:54:59,638] [INFO] [config.py:988:print] comms_config ................. +[2024-01-22 00:54:59,638] [INFO] [config.py:988:print] communication_data_type ...... None +[2024-01-22 00:54:59,639] [INFO] [config.py:988:print] compression_config ........... {'weight_quantization': {'shared_parameters': {'enabled': False, 'quantizer_kernel': False, 'schedule_offset': 0, 'quantize_groups': 1, 'quantize_verbose': False, 'quantization_type': 'symmetric', 'quantize_weight_in_forward': False, 'rounding': 'nearest', 'fp16_mixed_quantize': False, 'quantize_change_ratio': 0.001}, 'different_groups': {}}, 'activation_quantization': {'shared_parameters': {'enabled': False, 'quantization_type': 'symmetric', 'range_calibration': 'dynamic', 'schedule_offset': 1000}, 'different_groups': {}}, 'sparse_pruning': {'shared_parameters': {'enabled': False, 'method': 'l1', 'schedule_offset': 1000}, 'different_groups': {}}, 'row_pruning': {'shared_parameters': {'enabled': False, 'method': 'l1', 'schedule_offset': 1000}, 'different_groups': {}}, 'head_pruning': {'shared_parameters': {'enabled': False, 'method': 'topk', 'schedule_offset': 1000}, 'different_groups': {}}, 'channel_pruning': {'shared_parameters': {'enabled': False, 'method': 'l1', 'schedule_offset': 1000}, 'different_groups': {}}, 'layer_reduction': {'enabled': False}} +[2024-01-22 00:54:59,639] [INFO] [config.py:988:print] curriculum_enabled_legacy .... False +[2024-01-22 00:54:59,639] [INFO] [config.py:988:print] curriculum_params_legacy ..... False +[2024-01-22 00:54:59,639] [INFO] [config.py:988:print] data_efficiency_config ....... {'enabled': False, 'seed': 1234, 'data_sampling': {'enabled': False, 'num_epochs': 1000, 'num_workers': 0, 'curriculum_learning': {'enabled': False}}, 'data_routing': {'enabled': False, 'random_ltd': {'enabled': False, 'layer_token_lr_schedule': {'enabled': False}}}} +[2024-01-22 00:54:59,639] [INFO] [config.py:988:print] data_efficiency_enabled ...... False +[2024-01-22 00:54:59,639] [INFO] [config.py:988:print] dataloader_drop_last ......... False +[2024-01-22 00:54:59,639] [INFO] [config.py:988:print] disable_allgather ............ False +[2024-01-22 00:54:59,639] [INFO] [config.py:988:print] dump_state ................... False +[2024-01-22 00:54:59,639] [INFO] [config.py:988:print] dynamic_loss_scale_args ...... None +[2024-01-22 00:54:59,639] [INFO] [config.py:988:print] eigenvalue_enabled ........... False +[2024-01-22 00:54:59,639] [INFO] [config.py:988:print] eigenvalue_gas_boundary_resolution 1 +[2024-01-22 00:54:59,639] [INFO] [config.py:988:print] eigenvalue_layer_name ........ bert.encoder.layer +[2024-01-22 00:54:59,639] [INFO] [config.py:988:print] eigenvalue_layer_num ......... 0 +[2024-01-22 00:54:59,639] [INFO] [config.py:988:print] eigenvalue_max_iter .......... 100 +[2024-01-22 00:54:59,639] [INFO] [config.py:988:print] eigenvalue_stability ......... 1e-06 +[2024-01-22 00:54:59,639] [INFO] [config.py:988:print] eigenvalue_tol ............... 0.01 +[2024-01-22 00:54:59,639] [INFO] [config.py:988:print] eigenvalue_verbose ........... False +[2024-01-22 00:54:59,639] [INFO] [config.py:988:print] elasticity_enabled ........... False +[2024-01-22 00:54:59,639] [INFO] [config.py:988:print] flops_profiler_config ........ { + "enabled": false, + "recompute_fwd_factor": 0.0, + "profile_step": 1, + "module_depth": -1, + "top_modules": 1, + "detailed": true, + "output_file": null +} +[2024-01-22 00:54:59,639] [INFO] [config.py:988:print] fp16_auto_cast ............... None +[2024-01-22 00:54:59,639] [INFO] [config.py:988:print] fp16_enabled ................. False +[2024-01-22 00:54:59,639] [INFO] [config.py:988:print] fp16_master_weights_and_gradients False +[2024-01-22 00:54:59,639] [INFO] [config.py:988:print] global_rank .................. 0 +[2024-01-22 00:54:59,639] [INFO] [config.py:988:print] grad_accum_dtype ............. None +[2024-01-22 00:54:59,639] [INFO] [config.py:988:print] gradient_accumulation_steps .. 1 +[2024-01-22 00:54:59,639] [INFO] [config.py:988:print] gradient_clipping ............ 1.0 +[2024-01-22 00:54:59,639] [INFO] [config.py:988:print] gradient_predivide_factor .... 1.0 +[2024-01-22 00:54:59,639] [INFO] [config.py:988:print] graph_harvesting ............. False +[2024-01-22 00:54:59,639] [INFO] [config.py:988:print] hybrid_engine ................ enabled=False max_out_tokens=512 inference_tp_size=1 release_inference_cache=False pin_parameters=True tp_gather_partition_size=8 +[2024-01-22 00:54:59,639] [INFO] [config.py:988:print] initial_dynamic_scale ........ 1 +[2024-01-22 00:54:59,639] [INFO] [config.py:988:print] load_universal_checkpoint .... False +[2024-01-22 00:54:59,639] [INFO] [config.py:988:print] loss_scale ................... 1.0 +[2024-01-22 00:54:59,639] [INFO] [config.py:988:print] memory_breakdown ............. False +[2024-01-22 00:54:59,639] [INFO] [config.py:988:print] mics_hierarchial_params_gather False +[2024-01-22 00:54:59,639] [INFO] [config.py:988:print] mics_shard_size .............. -1 +[2024-01-22 00:54:59,639] [INFO] [config.py:988:print] monitor_config ............... tensorboard=TensorBoardConfig(enabled=False, output_path='', job_name='DeepSpeedJobName') wandb=WandbConfig(enabled=False, group=None, team=None, project='deepspeed') csv_monitor=CSVConfig(enabled=False, output_path='', job_name='DeepSpeedJobName') enabled=False +[2024-01-22 00:54:59,639] [INFO] [config.py:988:print] nebula_config ................ { + "enabled": false, + "persistent_storage_path": null, + "persistent_time_interval": 100, + "num_of_version_in_retention": 2, + "enable_nebula_load": true, + "load_path": null +} +[2024-01-22 00:54:59,640] [INFO] [config.py:988:print] optimizer_legacy_fusion ...... False +[2024-01-22 00:54:59,640] [INFO] [config.py:988:print] optimizer_name ............... None +[2024-01-22 00:54:59,640] [INFO] [config.py:988:print] optimizer_params ............. None +[2024-01-22 00:54:59,640] [INFO] [config.py:988:print] pipeline ..................... {'stages': 'auto', 'partition': 'best', 'seed_layers': False, 'activation_checkpoint_interval': 0, 'pipe_partitioned': True, 'grad_partitioned': True} +[2024-01-22 00:54:59,640] [INFO] [config.py:988:print] pld_enabled .................. False +[2024-01-22 00:54:59,640] [INFO] [config.py:988:print] pld_params ................... False +[2024-01-22 00:54:59,640] [INFO] [config.py:988:print] prescale_gradients ........... False +[2024-01-22 00:54:59,640] [INFO] [config.py:988:print] scheduler_name ............... None +[2024-01-22 00:54:59,640] [INFO] [config.py:988:print] scheduler_params ............. None +[2024-01-22 00:54:59,640] [INFO] [config.py:988:print] seq_parallel_communication_data_type torch.float32 +[2024-01-22 00:54:59,640] [INFO] [config.py:988:print] sparse_attention ............. None +[2024-01-22 00:54:59,640] [INFO] [config.py:988:print] sparse_gradients_enabled ..... False +[2024-01-22 00:54:59,640] [INFO] [config.py:988:print] steps_per_print .............. 10 +[2024-01-22 00:54:59,640] [INFO] [config.py:988:print] train_batch_size ............. 32 +[2024-01-22 00:54:59,640] [INFO] [config.py:988:print] train_micro_batch_size_per_gpu 4 +[2024-01-22 00:54:59,640] [INFO] [config.py:988:print] use_data_before_expert_parallel_ False +[2024-01-22 00:54:59,640] [INFO] [config.py:988:print] use_node_local_storage ....... False +[2024-01-22 00:54:59,640] [INFO] [config.py:988:print] wall_clock_breakdown ......... False +[2024-01-22 00:54:59,640] [INFO] [config.py:988:print] weight_quantization_config ... None +[2024-01-22 00:54:59,640] [INFO] [config.py:988:print] world_size ................... 8 +[2024-01-22 00:54:59,640] [INFO] [config.py:988:print] zero_allow_untested_optimizer False +[2024-01-22 00:54:59,640] [INFO] [config.py:988:print] zero_config .................. stage=3 contiguous_gradients=True reduce_scatter=True reduce_bucket_size=500,000,000 use_multi_rank_bucket_allreduce=True allgather_partitions=True allgather_bucket_size=500,000,000 overlap_comm=True load_from_fp32_weights=True elastic_checkpoint=False offload_param=DeepSpeedZeroOffloadParamConfig(device='none', nvme_path=None, buffer_count=5, buffer_size=100,000,000, max_in_cpu=1,000,000,000, pin_memory=False) offload_optimizer=DeepSpeedZeroOffloadOptimizerConfig(device='none', nvme_path=None, buffer_count=4, pin_memory=False, pipeline=False, pipeline_read=False, pipeline_write=False, fast_init=False, ratio=1.0) sub_group_size=1,000,000,000 cpu_offload_param=None cpu_offload_use_pin_memory=None cpu_offload=None prefetch_bucket_size=30000000 param_persistence_threshold=10000 model_persistence_threshold=sys.maxsize max_live_parameters=30000000 max_reuse_distance=1,000,000,000 gather_16bit_weights_on_model_save=False stage3_gather_fp16_weights_on_model_save=False ignore_unused_parameters=True legacy_stage1=False round_robin_gradients=False zero_hpz_partition_size=1 zero_quantized_weights=False zero_quantized_nontrainable_weights=False zero_quantized_gradients=False mics_shard_size=-1 mics_hierarchical_params_gather=False memory_efficient_linear=False pipeline_loading_checkpoint=False override_module_apply=True +[2024-01-22 00:54:59,640] [INFO] [config.py:988:print] zero_enabled ................. True +[2024-01-22 00:54:59,640] [INFO] [config.py:988:print] zero_force_ds_cpu_optimizer .. True +[2024-01-22 00:54:59,640] [INFO] [config.py:988:print] zero_optimization_stage ...... 3 +[2024-01-22 00:54:59,640] [INFO] [config.py:974:print_user_config] json = { + "train_batch_size": 32, + "train_micro_batch_size_per_gpu": 4, + "steps_per_print": 10, + "zero_optimization": { + "stage": 3, + "offload_param": { + "device": "none" + }, + "offload_optimizer": { + "device": "none" + }, + "stage3_param_persistence_threshold": 1.000000e+04, + "stage3_max_live_parameters": 3.000000e+07, + "stage3_prefetch_bucket_size": 3.000000e+07, + "memory_efficient_linear": false + }, + "bf16": { + "enabled": true, + "loss_scale_window": 50, + "min_loss_scale": 1e-10 + }, + "gradient_clipping": 1.0, + "prescale_gradients": false, + "wall_clock_breakdown": false, + "hybrid_engine": { + "enabled": false, + "max_out_tokens": 512, + "inference_tp_size": 1, + "release_inference_cache": false, + "pin_parameters": true, + "tp_gather_partition_size": 8 + } +} +***** Running training ***** +***** Evaluating perplexity, Epoch 0/3 ***** +[2024-01-22 00:54:59,801] [WARNING] [parameter_offload.py:86:_apply_to_tensors_only] A module has unknown inputs or outputs type () and the tensors embedded in it cannot be detected. The ZeRO-3 hooks designed to trigger before or after backward pass of the module relies on knowing the input and output tensors and therefore may not get triggered properly. +ppl: (17.07481575012207, 26042606.0) +Beginning of Epoch 1/3, Total Micro Batches 2329 +Epoch: 0, Total Step: 1, Loss: 13.9678316116333 +[2024-01-22 00:56:08,285] [INFO] [logging.py:96:log_dist] [Rank 0] step=10, skipped=0, lr=[2.0000000000000003e-06, 2.0000000000000003e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 00:56:08,286] [INFO] [timer.py:260:stop] epoch=0/micro_step=10/global_step=10, RunningAvgSamplesPerSec=4.794323256573658, CurrSamplesPerSec=4.784098246553044, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 0, Total Step: 11, Loss: 13.194334030151367 +[2024-01-22 00:57:15,358] [INFO] [logging.py:96:log_dist] [Rank 0] step=20, skipped=0, lr=[4.000000000000001e-06, 4.000000000000001e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 00:57:15,359] [INFO] [timer.py:260:stop] epoch=0/micro_step=20/global_step=20, RunningAvgSamplesPerSec=4.781875432874668, CurrSamplesPerSec=4.748622586022921, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 0, Total Step: 21, Loss: 1.269156575202942 +[2024-01-22 00:58:22,668] [INFO] [logging.py:96:log_dist] [Rank 0] step=30, skipped=0, lr=[6e-06, 6e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 00:58:22,669] [INFO] [timer.py:260:stop] epoch=0/micro_step=30/global_step=30, RunningAvgSamplesPerSec=4.772293298919014, CurrSamplesPerSec=4.755363816645486, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 0, Total Step: 31, Loss: 0.443787544965744 +[2024-01-22 00:59:30,100] [INFO] [logging.py:96:log_dist] [Rank 0] step=40, skipped=0, lr=[8.000000000000001e-06, 8.000000000000001e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 00:59:30,101] [INFO] [timer.py:260:stop] epoch=0/micro_step=40/global_step=40, RunningAvgSamplesPerSec=4.765488427992049, CurrSamplesPerSec=4.7438164285151805, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 0, Total Step: 41, Loss: 0.20617564022541046 +[2024-01-22 01:00:37,527] [INFO] [logging.py:96:log_dist] [Rank 0] step=50, skipped=0, lr=[1e-05, 1e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 01:00:37,528] [INFO] [timer.py:260:stop] epoch=0/micro_step=50/global_step=50, RunningAvgSamplesPerSec=4.761595584434026, CurrSamplesPerSec=4.75693089333357, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 0, Total Step: 51, Loss: 0.1847778707742691 +[2024-01-22 01:01:45,032] [INFO] [logging.py:96:log_dist] [Rank 0] step=60, skipped=0, lr=[1.2e-05, 1.2e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 01:01:45,032] [INFO] [timer.py:260:stop] epoch=0/micro_step=60/global_step=60, RunningAvgSamplesPerSec=4.758107262696554, CurrSamplesPerSec=4.747012120506262, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 0, Total Step: 61, Loss: 0.14981406927108765 +[2024-01-22 01:02:52,518] [INFO] [logging.py:96:log_dist] [Rank 0] step=70, skipped=0, lr=[1.4e-05, 1.4e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 01:02:52,518] [INFO] [timer.py:260:stop] epoch=0/micro_step=70/global_step=70, RunningAvgSamplesPerSec=4.755837233291555, CurrSamplesPerSec=4.755618745987091, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 0, Total Step: 71, Loss: 0.22007307410240173 +[2024-01-22 01:03:59,991] [INFO] [logging.py:96:log_dist] [Rank 0] step=80, skipped=0, lr=[1.6000000000000003e-05, 1.6000000000000003e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 01:03:59,991] [INFO] [timer.py:260:stop] epoch=0/micro_step=80/global_step=80, RunningAvgSamplesPerSec=4.754272429585112, CurrSamplesPerSec=4.742392195406826, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 0, Total Step: 81, Loss: 0.036728113889694214 +[2024-01-22 01:05:07,360] [INFO] [logging.py:96:log_dist] [Rank 0] step=90, skipped=0, lr=[1.8e-05, 1.8e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 01:05:07,360] [INFO] [timer.py:260:stop] epoch=0/micro_step=90/global_step=90, RunningAvgSamplesPerSec=4.753897238407314, CurrSamplesPerSec=4.7400914624994215, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 0, Total Step: 91, Loss: 0.06395600736141205 +[2024-01-22 01:06:14,739] [INFO] [logging.py:96:log_dist] [Rank 0] step=100, skipped=0, lr=[2e-05, 2e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 01:06:14,740] [INFO] [timer.py:260:stop] epoch=0/micro_step=100/global_step=100, RunningAvgSamplesPerSec=4.753524259612998, CurrSamplesPerSec=4.73888479322549, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 0, Total Step: 101, Loss: 0.04671293497085571 +[2024-01-22 01:07:22,040] [INFO] [logging.py:96:log_dist] [Rank 0] step=110, skipped=0, lr=[1.9999895957966793e-05, 1.9999895957966793e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 01:07:22,041] [INFO] [timer.py:260:stop] epoch=0/micro_step=110/global_step=110, RunningAvgSamplesPerSec=4.753734814040484, CurrSamplesPerSec=4.751890707071143, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 0, Total Step: 111, Loss: 0.08704915642738342 +[2024-01-22 01:08:29,517] [INFO] [logging.py:96:log_dist] [Rank 0] step=120, skipped=0, lr=[1.9999583834032114e-05, 1.9999583834032114e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 01:08:29,517] [INFO] [timer.py:260:stop] epoch=0/micro_step=120/global_step=120, RunningAvgSamplesPerSec=4.752854139480708, CurrSamplesPerSec=4.750530391078762, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 0, Total Step: 121, Loss: 0.10781726986169815 +[2024-01-22 01:09:36,896] [INFO] [logging.py:96:log_dist] [Rank 0] step=130, skipped=0, lr=[1.999906363469077e-05, 1.999906363469077e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 01:09:36,897] [INFO] [timer.py:260:stop] epoch=0/micro_step=130/global_step=130, RunningAvgSamplesPerSec=4.752647395871641, CurrSamplesPerSec=4.763771666308721, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 0, Total Step: 131, Loss: 0.030689241364598274 +[2024-01-22 01:10:44,363] [INFO] [logging.py:96:log_dist] [Rank 0] step=140, skipped=0, lr=[1.999833537076728e-05, 1.999833537076728e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 01:10:44,363] [INFO] [timer.py:260:stop] epoch=0/micro_step=140/global_step=140, RunningAvgSamplesPerSec=4.752026250687486, CurrSamplesPerSec=4.756745952187692, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 0, Total Step: 141, Loss: 0.07948678731918335 +[2024-01-22 01:11:51,981] [INFO] [logging.py:96:log_dist] [Rank 0] step=150, skipped=0, lr=[1.999739905741565e-05, 1.999739905741565e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 01:11:51,981] [INFO] [timer.py:260:stop] epoch=0/micro_step=150/global_step=150, RunningAvgSamplesPerSec=4.750765947089275, CurrSamplesPerSec=4.745993904218904, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 0, Total Step: 151, Loss: 0.16311782598495483 +[2024-01-22 01:12:59,407] [INFO] [logging.py:96:log_dist] [Rank 0] step=160, skipped=0, lr=[1.9996254714119076e-05, 1.9996254714119076e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 01:12:59,407] [INFO] [timer.py:260:stop] epoch=0/micro_step=160/global_step=160, RunningAvgSamplesPerSec=4.750524920108736, CurrSamplesPerSec=4.716570433694271, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 0, Total Step: 161, Loss: 0.18346308171749115 +[2024-01-22 01:14:06,913] [INFO] [logging.py:96:log_dist] [Rank 0] step=170, skipped=0, lr=[1.9994902364689513e-05, 1.9994902364689513e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 01:14:06,914] [INFO] [timer.py:260:stop] epoch=0/micro_step=170/global_step=170, RunningAvgSamplesPerSec=4.74997430591651, CurrSamplesPerSec=4.748412587504126, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 0, Total Step: 171, Loss: 0.047864899039268494 +[2024-01-22 01:15:14,414] [INFO] [logging.py:96:log_dist] [Rank 0] step=180, skipped=0, lr=[1.9993342037267202e-05, 1.9993342037267202e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 01:15:14,414] [INFO] [timer.py:260:stop] epoch=0/micro_step=180/global_step=180, RunningAvgSamplesPerSec=4.749509363019531, CurrSamplesPerSec=4.7470425092084705, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 0, Total Step: 181, Loss: 0.038907475769519806 +[2024-01-22 01:16:21,838] [INFO] [logging.py:96:log_dist] [Rank 0] step=190, skipped=0, lr=[1.9991573764320068e-05, 1.9991573764320068e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 01:16:21,839] [INFO] [timer.py:260:stop] epoch=0/micro_step=190/global_step=190, RunningAvgSamplesPerSec=4.749380631441361, CurrSamplesPerSec=4.734334772729037, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 0, Total Step: 191, Loss: 0.1333180069923401 +[2024-01-22 01:17:29,328] [INFO] [logging.py:96:log_dist] [Rank 0] step=200, skipped=0, lr=[1.998959758264306e-05, 1.998959758264306e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 01:17:29,329] [INFO] [timer.py:260:stop] epoch=0/micro_step=200/global_step=200, RunningAvgSamplesPerSec=4.749032174753523, CurrSamplesPerSec=4.7386343315596875, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 0, Total Step: 201, Loss: 0.13933925330638885 +[2024-01-22 01:18:36,744] [INFO] [logging.py:96:log_dist] [Rank 0] step=210, skipped=0, lr=[1.9987413533357358e-05, 1.9987413533357358e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 01:18:36,744] [INFO] [timer.py:260:stop] epoch=0/micro_step=210/global_step=210, RunningAvgSamplesPerSec=4.748968696842663, CurrSamplesPerSec=4.752677684529322, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 0, Total Step: 211, Loss: 0.20111733675003052 +[2024-01-22 01:19:44,156] [INFO] [logging.py:96:log_dist] [Rank 0] step=220, skipped=0, lr=[1.9985021661909556e-05, 1.9985021661909556e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 01:19:44,156] [INFO] [timer.py:260:stop] epoch=0/micro_step=220/global_step=220, RunningAvgSamplesPerSec=4.748921355373508, CurrSamplesPerSec=4.742867459275828, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 0, Total Step: 221, Loss: 0.1336376965045929 +[2024-01-22 01:20:51,625] [INFO] [logging.py:96:log_dist] [Rank 0] step=230, skipped=0, lr=[1.998242201807069e-05, 1.998242201807069e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 01:20:51,626] [INFO] [timer.py:260:stop] epoch=0/micro_step=230/global_step=230, RunningAvgSamplesPerSec=4.748700590098343, CurrSamplesPerSec=4.752132476768965, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 0, Total Step: 231, Loss: 0.06628328561782837 +[2024-01-22 01:21:59,115] [INFO] [logging.py:96:log_dist] [Rank 0] step=240, skipped=0, lr=[1.99796146559352e-05, 1.99796146559352e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 01:21:59,115] [INFO] [timer.py:260:stop] epoch=0/micro_step=240/global_step=240, RunningAvgSamplesPerSec=4.74843889947716, CurrSamplesPerSec=4.742001799532172, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 0, Total Step: 241, Loss: 0.14359793066978455 +[2024-01-22 01:23:06,618] [INFO] [logging.py:96:log_dist] [Rank 0] step=250, skipped=0, lr=[1.997659963391982e-05, 1.997659963391982e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 01:23:06,619] [INFO] [timer.py:260:stop] epoch=0/micro_step=250/global_step=250, RunningAvgSamplesPerSec=4.748160055691726, CurrSamplesPerSec=4.738931308059688, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 0, Total Step: 251, Loss: 0.033903706818819046 +[2024-01-22 01:24:13,947] [INFO] [logging.py:96:log_dist] [Rank 0] step=260, skipped=0, lr=[1.9973377014762352e-05, 1.9973377014762352e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 01:24:13,948] [INFO] [timer.py:260:stop] epoch=0/micro_step=260/global_step=260, RunningAvgSamplesPerSec=4.748377691131433, CurrSamplesPerSec=4.747040158687234, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 0, Total Step: 261, Loss: 0.2909040153026581 +[2024-01-22 01:25:21,384] [INFO] [logging.py:96:log_dist] [Rank 0] step=270, skipped=0, lr=[1.9969946865520372e-05, 1.9969946865520372e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 01:25:21,384] [INFO] [timer.py:260:stop] epoch=0/micro_step=270/global_step=270, RunningAvgSamplesPerSec=4.748295444610142, CurrSamplesPerSec=4.7260354134708376, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 0, Total Step: 271, Loss: 0.24154512584209442 +[2024-01-22 01:26:28,681] [INFO] [logging.py:96:log_dist] [Rank 0] step=280, skipped=0, lr=[1.996630925756982e-05, 1.996630925756982e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 01:26:28,682] [INFO] [timer.py:260:stop] epoch=0/micro_step=280/global_step=280, RunningAvgSamplesPerSec=4.748573879819989, CurrSamplesPerSec=4.752206846508627, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 0, Total Step: 281, Loss: 0.2269619107246399 +[2024-01-22 01:27:36,041] [INFO] [logging.py:96:log_dist] [Rank 0] step=290, skipped=0, lr=[1.9962464266603517e-05, 1.9962464266603517e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 01:27:36,042] [INFO] [timer.py:260:stop] epoch=0/micro_step=290/global_step=290, RunningAvgSamplesPerSec=4.748678479675014, CurrSamplesPerSec=4.750758737896501, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 0, Total Step: 291, Loss: 0.144073486328125 +[2024-01-22 01:28:43,445] [INFO] [logging.py:96:log_dist] [Rank 0] step=300, skipped=0, lr=[1.99584119726296e-05, 1.99584119726296e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 01:28:43,446] [INFO] [timer.py:260:stop] epoch=0/micro_step=300/global_step=300, RunningAvgSamplesPerSec=4.74867345919712, CurrSamplesPerSec=4.753179083179032, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 0, Total Step: 301, Loss: 0.1041024923324585 +[2024-01-22 01:29:50,863] [INFO] [logging.py:96:log_dist] [Rank 0] step=310, skipped=0, lr=[1.995415245996985e-05, 1.995415245996985e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 01:29:50,864] [INFO] [timer.py:260:stop] epoch=0/micro_step=310/global_step=310, RunningAvgSamplesPerSec=4.748636176566975, CurrSamplesPerSec=4.715381175389257, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 0, Total Step: 311, Loss: 0.09015242755413055 +[2024-01-22 01:30:58,324] [INFO] [logging.py:96:log_dist] [Rank 0] step=320, skipped=0, lr=[1.9949685817257935e-05, 1.9949685817257935e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 01:30:58,324] [INFO] [timer.py:260:stop] epoch=0/micro_step=320/global_step=320, RunningAvgSamplesPerSec=4.748505853081816, CurrSamplesPerSec=4.7338949441273135, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 0, Total Step: 321, Loss: 0.05885539576411247 +[2024-01-22 01:32:05,739] [INFO] [logging.py:96:log_dist] [Rank 0] step=330, skipped=0, lr=[1.9945012137437583e-05, 1.9945012137437583e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 01:32:05,739] [INFO] [timer.py:260:stop] epoch=0/micro_step=330/global_step=330, RunningAvgSamplesPerSec=4.74848089644745, CurrSamplesPerSec=4.7538002976930995, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 0, Total Step: 331, Loss: 0.1296696960926056 +[2024-01-22 01:33:13,114] [INFO] [logging.py:96:log_dist] [Rank 0] step=340, skipped=0, lr=[1.9940131517760616e-05, 1.9940131517760616e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 01:33:13,115] [INFO] [timer.py:260:stop] epoch=0/micro_step=340/global_step=340, RunningAvgSamplesPerSec=4.748541968750021, CurrSamplesPerSec=4.73567045554943, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 0, Total Step: 341, Loss: 0.18984800577163696 +[2024-01-22 01:34:20,587] [INFO] [logging.py:96:log_dist] [Rank 0] step=350, skipped=0, lr=[1.9935044059784953e-05, 1.9935044059784953e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 01:34:20,587] [INFO] [timer.py:260:stop] epoch=0/micro_step=350/global_step=350, RunningAvgSamplesPerSec=4.748401357210011, CurrSamplesPerSec=4.748585456933135, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 0, Total Step: 351, Loss: 0.04796263948082924 +[2024-01-22 01:35:27,963] [INFO] [logging.py:96:log_dist] [Rank 0] step=360, skipped=0, lr=[1.992974986937249e-05, 1.992974986937249e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 01:35:27,964] [INFO] [timer.py:260:stop] epoch=0/micro_step=360/global_step=360, RunningAvgSamplesPerSec=4.748458304125143, CurrSamplesPerSec=4.749730341544728, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 0, Total Step: 361, Loss: 0.02675013057887554 +[2024-01-22 01:36:35,212] [INFO] [logging.py:96:log_dist] [Rank 0] step=370, skipped=0, lr=[1.9924249056686893e-05, 1.9924249056686893e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 01:36:35,212] [INFO] [timer.py:260:stop] epoch=0/micro_step=370/global_step=370, RunningAvgSamplesPerSec=4.7487574153190435, CurrSamplesPerSec=4.765364762502797, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 0, Total Step: 371, Loss: 0.17363549768924713 +[2024-01-22 01:37:42,622] [INFO] [logging.py:96:log_dist] [Rank 0] step=380, skipped=0, lr=[1.991854173619131e-05, 1.991854173619131e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 01:37:42,622] [INFO] [timer.py:260:stop] epoch=0/micro_step=380/global_step=380, RunningAvgSamplesPerSec=4.748739145660775, CurrSamplesPerSec=4.749715382059, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 0, Total Step: 381, Loss: 0.04196053743362427 +[2024-01-22 01:38:50,121] [INFO] [logging.py:96:log_dist] [Rank 0] step=390, skipped=0, lr=[1.9912628026645993e-05, 1.9912628026645993e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 01:38:50,121] [INFO] [timer.py:260:stop] epoch=0/micro_step=390/global_step=390, RunningAvgSamplesPerSec=4.74856104414122, CurrSamplesPerSec=4.727242872923242, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 0, Total Step: 391, Loss: 0.1728733330965042 +[2024-01-22 01:39:57,528] [INFO] [logging.py:96:log_dist] [Rank 0] step=400, skipped=0, lr=[1.9906508051105802e-05, 1.9906508051105802e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 01:39:57,529] [INFO] [timer.py:260:stop] epoch=0/micro_step=400/global_step=400, RunningAvgSamplesPerSec=4.748552122379008, CurrSamplesPerSec=4.760061225736869, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 0, Total Step: 401, Loss: 0.06589411944150925 +[2024-01-22 01:41:04,933] [INFO] [logging.py:96:log_dist] [Rank 0] step=410, skipped=0, lr=[1.9900181936917686e-05, 1.9900181936917686e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 01:41:04,934] [INFO] [timer.py:260:stop] epoch=0/micro_step=410/global_step=410, RunningAvgSamplesPerSec=4.7485494443131575, CurrSamplesPerSec=4.723351551613114, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 0, Total Step: 411, Loss: 0.05584992468357086 +[2024-01-22 01:42:12,331] [INFO] [logging.py:96:log_dist] [Rank 0] step=420, skipped=0, lr=[1.9893649815718e-05, 1.9893649815718e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 01:42:12,332] [INFO] [timer.py:260:stop] epoch=0/micro_step=420/global_step=420, RunningAvgSamplesPerSec=4.74855741777724, CurrSamplesPerSec=4.768520947598105, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 0, Total Step: 421, Loss: 0.1342315375804901 +[2024-01-22 01:43:19,720] [INFO] [logging.py:96:log_dist] [Rank 0] step=430, skipped=0, lr=[1.9886911823429776e-05, 1.9886911823429776e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 01:43:19,720] [INFO] [timer.py:260:stop] epoch=0/micro_step=430/global_step=430, RunningAvgSamplesPerSec=4.748581310623176, CurrSamplesPerSec=4.74282773858414, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 0, Total Step: 431, Loss: 0.11732257157564163 +[2024-01-22 01:44:27,061] [INFO] [logging.py:96:log_dist] [Rank 0] step=440, skipped=0, lr=[1.98799681002599e-05, 1.98799681002599e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 01:44:27,061] [INFO] [timer.py:260:stop] epoch=0/micro_step=440/global_step=440, RunningAvgSamplesPerSec=4.748680661658941, CurrSamplesPerSec=4.769953461425325, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 0, Total Step: 441, Loss: 0.06892146915197372 +[2024-01-22 01:45:34,618] [INFO] [logging.py:96:log_dist] [Rank 0] step=450, skipped=0, lr=[1.9872818790696186e-05, 1.9872818790696186e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 01:45:34,618] [INFO] [timer.py:260:stop] epoch=0/micro_step=450/global_step=450, RunningAvgSamplesPerSec=4.748435144579589, CurrSamplesPerSec=4.727382401378326, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 0, Total Step: 451, Loss: 0.11380548775196075 +[2024-01-22 01:46:42,086] [INFO] [logging.py:96:log_dist] [Rank 0] step=460, skipped=0, lr=[1.986546404350437e-05, 1.986546404350437e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 01:46:42,087] [INFO] [timer.py:260:stop] epoch=0/micro_step=460/global_step=460, RunningAvgSamplesPerSec=4.748336714196768, CurrSamplesPerSec=4.736193173894559, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 0, Total Step: 461, Loss: 0.055848781019449234 +[2024-01-22 01:47:49,547] [INFO] [logging.py:96:log_dist] [Rank 0] step=470, skipped=0, lr=[1.9857904011725033e-05, 1.9857904011725033e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 01:47:49,548] [INFO] [timer.py:260:stop] epoch=0/micro_step=470/global_step=470, RunningAvgSamplesPerSec=4.7482550498351515, CurrSamplesPerSec=4.749663276720838, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 0, Total Step: 471, Loss: 0.034475721418857574 +[2024-01-22 01:48:57,007] [INFO] [logging.py:96:log_dist] [Rank 0] step=480, skipped=0, lr=[1.985013885267038e-05, 1.985013885267038e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 01:48:57,008] [INFO] [timer.py:260:stop] epoch=0/micro_step=480/global_step=480, RunningAvgSamplesPerSec=4.748176992329904, CurrSamplesPerSec=4.776074670576221, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 0, Total Step: 481, Loss: 0.2271655946969986 +[2024-01-22 01:50:04,591] [INFO] [logging.py:96:log_dist] [Rank 0] step=490, skipped=0, lr=[1.9842168727921006e-05, 1.9842168727921006e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 01:50:04,592] [INFO] [timer.py:260:stop] epoch=0/micro_step=490/global_step=490, RunningAvgSamplesPerSec=4.7479242908955595, CurrSamplesPerSec=4.746624992418604, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 0, Total Step: 491, Loss: 0.050789546221494675 +[2024-01-22 01:51:12,022] [INFO] [logging.py:96:log_dist] [Rank 0] step=500, skipped=0, lr=[1.98339938033225e-05, 1.98339938033225e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 01:51:12,023] [INFO] [timer.py:260:stop] epoch=0/micro_step=500/global_step=500, RunningAvgSamplesPerSec=4.747898159165991, CurrSamplesPerSec=4.759254081558514, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 0, Total Step: 501, Loss: 0.17733146250247955 +[2024-01-22 01:52:19,351] [INFO] [logging.py:96:log_dist] [Rank 0] step=510, skipped=0, lr=[1.9825614248982025e-05, 1.9825614248982025e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 01:52:19,352] [INFO] [timer.py:260:stop] epoch=0/micro_step=510/global_step=510, RunningAvgSamplesPerSec=4.748013095273396, CurrSamplesPerSec=4.747058962922303, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 0, Total Step: 511, Loss: 0.04953478276729584 +[2024-01-22 01:53:26,789] [INFO] [logging.py:96:log_dist] [Rank 0] step=520, skipped=0, lr=[1.9817030239264753e-05, 1.9817030239264753e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 01:53:26,790] [INFO] [timer.py:260:stop] epoch=0/micro_step=520/global_step=520, RunningAvgSamplesPerSec=4.7479765391015265, CurrSamplesPerSec=4.737959538580871, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 0, Total Step: 521, Loss: 0.03135230764746666 +[2024-01-22 01:54:34,225] [INFO] [logging.py:96:log_dist] [Rank 0] step=530, skipped=0, lr=[1.9808241952790245e-05, 1.9808241952790245e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 01:54:34,226] [INFO] [timer.py:260:stop] epoch=0/micro_step=530/global_step=530, RunningAvgSamplesPerSec=4.747944323655317, CurrSamplesPerSec=4.752047509666502, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 0, Total Step: 531, Loss: 0.19229106605052948 +[2024-01-22 01:55:41,600] [INFO] [logging.py:96:log_dist] [Rank 0] step=540, skipped=0, lr=[1.9799249572428744e-05, 1.9799249572428744e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 01:55:41,600] [INFO] [timer.py:260:stop] epoch=0/micro_step=540/global_step=540, RunningAvgSamplesPerSec=4.7479939199439745, CurrSamplesPerSec=4.751053872524893, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 0, Total Step: 541, Loss: 0.15774601697921753 +[2024-01-22 01:56:49,094] [INFO] [logging.py:96:log_dist] [Rank 0] step=550, skipped=0, lr=[1.9790053285297356e-05, 1.9790053285297356e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 01:56:49,094] [INFO] [timer.py:260:stop] epoch=0/micro_step=550/global_step=550, RunningAvgSamplesPerSec=4.7478880577101625, CurrSamplesPerSec=4.71456774810139, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 0, Total Step: 551, Loss: 0.11499917507171631 +[2024-01-22 01:57:56,532] [INFO] [logging.py:96:log_dist] [Rank 0] step=560, skipped=0, lr=[1.9780653282756162e-05, 1.9780653282756162e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 01:57:56,533] [INFO] [timer.py:260:stop] epoch=0/micro_step=560/global_step=560, RunningAvgSamplesPerSec=4.747855466997648, CurrSamplesPerSec=4.732953945378135, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 0, Total Step: 561, Loss: 0.03715988248586655 +[2024-01-22 01:59:03,884] [INFO] [logging.py:96:log_dist] [Rank 0] step=570, skipped=0, lr=[1.9771049760404236e-05, 1.9771049760404236e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 01:59:03,884] [INFO] [timer.py:260:stop] epoch=0/micro_step=570/global_step=570, RunningAvgSamplesPerSec=4.747932895456062, CurrSamplesPerSec=4.757620039511388, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 0, Total Step: 571, Loss: 0.06784096360206604 +[2024-01-22 02:00:11,288] [INFO] [logging.py:96:log_dist] [Rank 0] step=580, skipped=0, lr=[1.9761242918075584e-05, 1.9761242918075584e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 02:00:11,288] [INFO] [timer.py:260:stop] epoch=0/micro_step=580/global_step=580, RunningAvgSamplesPerSec=4.747941854301967, CurrSamplesPerSec=4.764729527442086, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 0, Total Step: 581, Loss: 0.07552769035100937 +[2024-01-22 02:01:18,713] [INFO] [logging.py:96:log_dist] [Rank 0] step=590, skipped=0, lr=[1.975123295983496e-05, 1.975123295983496e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 02:01:18,713] [INFO] [timer.py:260:stop] epoch=0/micro_step=590/global_step=590, RunningAvgSamplesPerSec=4.747925539015416, CurrSamplesPerSec=4.751842086908988, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 0, Total Step: 591, Loss: 0.2795135974884033 +[2024-01-22 02:02:26,093] [INFO] [logging.py:96:log_dist] [Rank 0] step=600, skipped=0, lr=[1.9741020093973648e-05, 1.9741020093973648e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 02:02:26,094] [INFO] [timer.py:260:stop] epoch=0/micro_step=600/global_step=600, RunningAvgSamplesPerSec=4.747961916978484, CurrSamplesPerSec=4.760809369737908, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 0, Total Step: 601, Loss: 0.09199900925159454 +[2024-01-22 02:03:33,565] [INFO] [logging.py:96:log_dist] [Rank 0] step=610, skipped=0, lr=[1.9730604533005116e-05, 1.9730604533005116e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 02:03:33,565] [INFO] [timer.py:260:stop] epoch=0/micro_step=610/global_step=610, RunningAvgSamplesPerSec=4.747892257778514, CurrSamplesPerSec=4.748770100088163, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 0, Total Step: 611, Loss: 0.115862637758255 +[2024-01-22 02:04:40,940] [INFO] [logging.py:96:log_dist] [Rank 0] step=620, skipped=0, lr=[1.971998649366059e-05, 1.971998649366059e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 02:04:40,941] [INFO] [timer.py:260:stop] epoch=0/micro_step=620/global_step=620, RunningAvgSamplesPerSec=4.747933904080394, CurrSamplesPerSec=4.758960120891405, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 0, Total Step: 621, Loss: 0.1467607319355011 +[2024-01-22 02:05:48,228] [INFO] [logging.py:96:log_dist] [Rank 0] step=630, skipped=0, lr=[1.9709166196884553e-05, 1.9709166196884553e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 02:05:48,229] [INFO] [timer.py:260:stop] epoch=0/micro_step=630/global_step=630, RunningAvgSamplesPerSec=4.748072999015769, CurrSamplesPerSec=4.758246632655724, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 0, Total Step: 631, Loss: 0.031429972499608994 +[2024-01-22 02:06:55,502] [INFO] [logging.py:96:log_dist] [Rank 0] step=640, skipped=0, lr=[1.969814386783014e-05, 1.969814386783014e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 02:06:55,503] [INFO] [timer.py:260:stop] epoch=0/micro_step=640/global_step=640, RunningAvgSamplesPerSec=4.7482229930980315, CurrSamplesPerSec=4.750164880086743, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 0, Total Step: 641, Loss: 0.12642978131771088 +[2024-01-22 02:08:02,971] [INFO] [logging.py:96:log_dist] [Rank 0] step=650, skipped=0, lr=[1.968691973585445e-05, 1.968691973585445e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 02:08:02,971] [INFO] [timer.py:260:stop] epoch=0/micro_step=650/global_step=650, RunningAvgSamplesPerSec=4.74815709983461, CurrSamplesPerSec=4.747756671759867, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 0, Total Step: 651, Loss: 0.1284780651330948 +[2024-01-22 02:09:10,311] [INFO] [logging.py:96:log_dist] [Rank 0] step=660, skipped=0, lr=[1.9675494034513792e-05, 1.9675494034513792e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 02:09:10,311] [INFO] [timer.py:260:stop] epoch=0/micro_step=660/global_step=660, RunningAvgSamplesPerSec=4.74823116271848, CurrSamplesPerSec=4.754413590612556, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 0, Total Step: 661, Loss: 0.07917825877666473 +[2024-01-22 02:10:17,630] [INFO] [logging.py:96:log_dist] [Rank 0] step=670, skipped=0, lr=[1.9663867001558805e-05, 1.9663867001558805e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 02:10:17,630] [INFO] [timer.py:260:stop] epoch=0/micro_step=670/global_step=670, RunningAvgSamplesPerSec=4.748324542286354, CurrSamplesPerSec=4.749423943015007, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 0, Total Step: 671, Loss: 0.10683419555425644 +[2024-01-22 02:11:25,094] [INFO] [logging.py:96:log_dist] [Rank 0] step=680, skipped=0, lr=[1.9652038878929516e-05, 1.9652038878929516e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 02:11:25,094] [INFO] [timer.py:260:stop] epoch=0/micro_step=680/global_step=680, RunningAvgSamplesPerSec=4.748264654637092, CurrSamplesPerSec=4.744320151662456, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 0, Total Step: 681, Loss: 0.04668274149298668 +[2024-01-22 02:12:32,473] [INFO] [logging.py:96:log_dist] [Rank 0] step=690, skipped=0, lr=[1.9640009912750313e-05, 1.9640009912750313e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 02:12:32,473] [INFO] [timer.py:260:stop] epoch=0/micro_step=690/global_step=690, RunningAvgSamplesPerSec=4.748293221881698, CurrSamplesPerSec=4.760111533620621, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 0, Total Step: 691, Loss: 0.05570341646671295 +[2024-01-22 02:13:39,928] [INFO] [logging.py:96:log_dist] [Rank 0] step=700, skipped=0, lr=[1.9627780353324816e-05, 1.9627780353324816e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 02:13:39,928] [INFO] [timer.py:260:stop] epoch=0/micro_step=700/global_step=700, RunningAvgSamplesPerSec=4.748244476919252, CurrSamplesPerSec=4.7613410308821935, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 0, Total Step: 701, Loss: 0.18699362874031067 +[2024-01-22 02:14:47,269] [INFO] [logging.py:96:log_dist] [Rank 0] step=710, skipped=0, lr=[1.9615350455130666e-05, 1.9615350455130666e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 02:14:47,269] [INFO] [timer.py:260:stop] epoch=0/micro_step=710/global_step=710, RunningAvgSamplesPerSec=4.748310443108867, CurrSamplesPerSec=4.754867515783756, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 0, Total Step: 711, Loss: 0.0719674676656723 +[2024-01-22 02:15:54,585] [INFO] [logging.py:96:log_dist] [Rank 0] step=720, skipped=0, lr=[1.9602720476814246e-05, 1.9602720476814246e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 02:15:54,585] [INFO] [timer.py:260:stop] epoch=0/micro_step=720/global_step=720, RunningAvgSamplesPerSec=4.748399013698587, CurrSamplesPerSec=4.750499453286903, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 0, Total Step: 721, Loss: 0.15827733278274536 +[2024-01-22 02:17:02,071] [INFO] [logging.py:96:log_dist] [Rank 0] step=730, skipped=0, lr=[1.958989068118527e-05, 1.958989068118527e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 02:17:02,072] [INFO] [timer.py:260:stop] epoch=0/micro_step=730/global_step=730, RunningAvgSamplesPerSec=4.748320064128485, CurrSamplesPerSec=4.726042069951325, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 0, Total Step: 731, Loss: 0.19582848250865936 +[2024-01-22 02:18:09,422] [INFO] [logging.py:96:log_dist] [Rank 0] step=740, skipped=0, lr=[1.957686133521136e-05, 1.957686133521136e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 02:18:09,423] [INFO] [timer.py:260:stop] epoch=0/micro_step=740/global_step=740, RunningAvgSamplesPerSec=4.7483731385376995, CurrSamplesPerSec=4.765061418747742, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 0, Total Step: 741, Loss: 0.1310730129480362 +[2024-01-22 02:19:16,883] [INFO] [logging.py:96:log_dist] [Rank 0] step=750, skipped=0, lr=[1.9563632710012426e-05, 1.9563632710012426e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 02:19:16,883] [INFO] [timer.py:260:stop] epoch=0/micro_step=750/global_step=750, RunningAvgSamplesPerSec=4.7483207368465585, CurrSamplesPerSec=4.745811490574232, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 0, Total Step: 751, Loss: 0.09426483511924744 +[2024-01-22 02:20:24,229] [INFO] [logging.py:96:log_dist] [Rank 0] step=760, skipped=0, lr=[1.9550205080855097e-05, 1.9550205080855097e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 02:20:24,230] [INFO] [timer.py:260:stop] epoch=0/micro_step=760/global_step=760, RunningAvgSamplesPerSec=4.748375728751562, CurrSamplesPerSec=4.759470778457565, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 0, Total Step: 761, Loss: 0.05870091915130615 +[2024-01-22 02:21:31,629] [INFO] [logging.py:96:log_dist] [Rank 0] step=770, skipped=0, lr=[1.9536578727146928e-05, 1.9536578727146928e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 02:21:31,630] [INFO] [timer.py:260:stop] epoch=0/micro_step=770/global_step=770, RunningAvgSamplesPerSec=4.7483808526560525, CurrSamplesPerSec=4.753572163471533, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 0, Total Step: 771, Loss: 0.08545812964439392 +[2024-01-22 02:22:39,139] [INFO] [logging.py:96:log_dist] [Rank 0] step=780, skipped=0, lr=[1.9522753932430633e-05, 1.9522753932430633e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 02:22:39,139] [INFO] [timer.py:260:stop] epoch=0/micro_step=780/global_step=780, RunningAvgSamplesPerSec=4.748285552707637, CurrSamplesPerSec=4.756429377115149, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 0, Total Step: 781, Loss: 0.1312224268913269 +[2024-01-22 02:23:46,520] [INFO] [logging.py:96:log_dist] [Rank 0] step=790, skipped=0, lr=[1.9508730984378164e-05, 1.9508730984378164e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 02:23:46,520] [INFO] [timer.py:260:stop] epoch=0/micro_step=790/global_step=790, RunningAvgSamplesPerSec=4.748308387164608, CurrSamplesPerSec=4.736958576180685, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 0, Total Step: 791, Loss: 0.15706858038902283 +[2024-01-22 02:24:53,936] [INFO] [logging.py:96:log_dist] [Rank 0] step=800, skipped=0, lr=[1.9494510174784725e-05, 1.9494510174784725e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 02:24:53,937] [INFO] [timer.py:260:stop] epoch=0/micro_step=800/global_step=800, RunningAvgSamplesPerSec=4.748298638818767, CurrSamplesPerSec=4.748881833811302, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 0, Total Step: 801, Loss: 0.1851167380809784 +[2024-01-22 02:26:01,229] [INFO] [logging.py:96:log_dist] [Rank 0] step=810, skipped=0, lr=[1.9480091799562706e-05, 1.9480091799562706e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 02:26:01,229] [INFO] [timer.py:260:stop] epoch=0/micro_step=810/global_step=810, RunningAvgSamplesPerSec=4.748397261874125, CurrSamplesPerSec=4.743661845193407, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 0, Total Step: 811, Loss: 0.20512887835502625 +[2024-01-22 02:27:08,662] [INFO] [logging.py:96:log_dist] [Rank 0] step=820, skipped=0, lr=[1.946547615873552e-05, 1.946547615873552e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 02:27:08,663] [INFO] [timer.py:260:stop] epoch=0/micro_step=820/global_step=820, RunningAvgSamplesPerSec=4.7483722485559525, CurrSamplesPerSec=4.738367000891592, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 0, Total Step: 821, Loss: 0.0896933451294899 +[2024-01-22 02:28:15,986] [INFO] [logging.py:96:log_dist] [Rank 0] step=830, skipped=0, lr=[1.945066355643136e-05, 1.945066355643136e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 02:28:15,987] [INFO] [timer.py:260:stop] epoch=0/micro_step=830/global_step=830, RunningAvgSamplesPerSec=4.748441649785934, CurrSamplesPerSec=4.752073251853949, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 0, Total Step: 831, Loss: 0.03550824150443077 +[2024-01-22 02:29:23,461] [INFO] [logging.py:96:log_dist] [Rank 0] step=840, skipped=0, lr=[1.943565430087689e-05, 1.943565430087689e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 02:29:23,462] [INFO] [timer.py:260:stop] epoch=0/micro_step=840/global_step=840, RunningAvgSamplesPerSec=4.748382013268807, CurrSamplesPerSec=4.7350270755209385, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 0, Total Step: 841, Loss: 0.17985300719738007 +[2024-01-22 02:30:30,860] [INFO] [logging.py:96:log_dist] [Rank 0] step=850, skipped=0, lr=[1.9420448704390792e-05, 1.9420448704390792e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 02:30:30,860] [INFO] [timer.py:260:stop] epoch=0/micro_step=850/global_step=850, RunningAvgSamplesPerSec=4.748386958060487, CurrSamplesPerSec=4.728528911152707, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 0, Total Step: 851, Loss: 0.09111388027667999 +[2024-01-22 02:31:38,212] [INFO] [logging.py:96:log_dist] [Rank 0] step=860, skipped=0, lr=[1.9405047083377305e-05, 1.9405047083377305e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 02:31:38,212] [INFO] [timer.py:260:stop] epoch=0/micro_step=860/global_step=860, RunningAvgSamplesPerSec=4.748430032240107, CurrSamplesPerSec=4.728487264663324, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 0, Total Step: 861, Loss: 0.06262162327766418 +[2024-01-22 02:32:45,572] [INFO] [logging.py:96:log_dist] [Rank 0] step=870, skipped=0, lr=[1.9389449758319624e-05, 1.9389449758319624e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 02:32:45,572] [INFO] [timer.py:260:stop] epoch=0/micro_step=870/global_step=870, RunningAvgSamplesPerSec=4.74846518371225, CurrSamplesPerSec=4.7420671402854895, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 0, Total Step: 871, Loss: 0.23333986103534698 +[2024-01-22 02:33:53,066] [INFO] [logging.py:96:log_dist] [Rank 0] step=880, skipped=0, lr=[1.937365705377323e-05, 1.937365705377323e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 02:33:53,066] [INFO] [timer.py:260:stop] epoch=0/micro_step=880/global_step=880, RunningAvgSamplesPerSec=4.7483924567493165, CurrSamplesPerSec=4.746480968683269, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 0, Total Step: 881, Loss: 0.08031116425991058 +[2024-01-22 02:35:00,405] [INFO] [logging.py:96:log_dist] [Rank 0] step=890, skipped=0, lr=[1.9357669298359137e-05, 1.9357669298359137e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 02:35:00,406] [INFO] [timer.py:260:stop] epoch=0/micro_step=890/global_step=890, RunningAvgSamplesPerSec=4.748444103880982, CurrSamplesPerSec=4.705535425863073, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 0, Total Step: 891, Loss: 0.1671382635831833 +[2024-01-22 02:36:07,806] [INFO] [logging.py:96:log_dist] [Rank 0] step=900, skipped=0, lr=[1.9341486824757068e-05, 1.9341486824757068e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 02:36:07,807] [INFO] [timer.py:260:stop] epoch=0/micro_step=900/global_step=900, RunningAvgSamplesPerSec=4.748446171240021, CurrSamplesPerSec=4.74615854163518, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 0, Total Step: 901, Loss: 0.11018312722444534 +[2024-01-22 02:37:15,302] [INFO] [logging.py:96:log_dist] [Rank 0] step=910, skipped=0, lr=[1.9325109969698507e-05, 1.9325109969698507e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 02:37:15,303] [INFO] [timer.py:260:stop] epoch=0/micro_step=910/global_step=910, RunningAvgSamplesPerSec=4.74837487133247, CurrSamplesPerSec=4.754097831085907, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 0, Total Step: 911, Loss: 0.1308337152004242 +[2024-01-22 02:38:22,625] [INFO] [logging.py:96:log_dist] [Rank 0] step=920, skipped=0, lr=[1.930853907395972e-05, 1.930853907395972e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 02:38:22,626] [INFO] [timer.py:260:stop] epoch=0/micro_step=920/global_step=920, RunningAvgSamplesPerSec=4.748437960926836, CurrSamplesPerSec=4.756557991191678, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 0, Total Step: 921, Loss: 0.1674511879682541 +[2024-01-22 02:39:30,108] [INFO] [logging.py:96:log_dist] [Rank 0] step=930, skipped=0, lr=[1.929177448235464e-05, 1.929177448235464e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 02:39:30,109] [INFO] [timer.py:260:stop] epoch=0/micro_step=930/global_step=930, RunningAvgSamplesPerSec=4.748377671077224, CurrSamplesPerSec=4.749388145875705, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 0, Total Step: 931, Loss: 0.043794792145490646 +[2024-01-22 02:40:37,558] [INFO] [logging.py:96:log_dist] [Rank 0] step=940, skipped=0, lr=[1.927481654372771e-05, 1.927481654372771e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 02:40:37,558] [INFO] [timer.py:260:stop] epoch=0/micro_step=940/global_step=940, RunningAvgSamplesPerSec=4.748343937688607, CurrSamplesPerSec=4.752270786154746, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 0, Total Step: 941, Loss: 0.1234627515077591 +[2024-01-22 02:41:44,969] [INFO] [logging.py:96:log_dist] [Rank 0] step=950, skipped=0, lr=[1.9257665610946604e-05, 1.9257665610946604e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 02:41:44,970] [INFO] [timer.py:260:stop] epoch=0/micro_step=950/global_step=950, RunningAvgSamplesPerSec=4.748339202097665, CurrSamplesPerSec=4.754722822988578, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 0, Total Step: 951, Loss: 0.16021107137203217 +[2024-01-22 02:42:52,412] [INFO] [logging.py:96:log_dist] [Rank 0] step=960, skipped=0, lr=[1.9240322040894916e-05, 1.9240322040894916e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 02:42:52,412] [INFO] [timer.py:260:stop] epoch=0/micro_step=960/global_step=960, RunningAvgSamplesPerSec=4.748311567384765, CurrSamplesPerSec=4.7613783597118, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 0, Total Step: 961, Loss: 0.10376229882240295 +[2024-01-22 02:43:59,849] [INFO] [logging.py:96:log_dist] [Rank 0] step=970, skipped=0, lr=[1.92227861944647e-05, 1.92227861944647e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 02:43:59,849] [INFO] [timer.py:260:stop] epoch=0/micro_step=970/global_step=970, RunningAvgSamplesPerSec=4.7482884300182056, CurrSamplesPerSec=4.75234953524124, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 0, Total Step: 971, Loss: 0.05136357992887497 +[2024-01-22 02:45:07,336] [INFO] [logging.py:96:log_dist] [Rank 0] step=980, skipped=0, lr=[1.920505843654898e-05, 1.920505843654898e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 02:45:07,336] [INFO] [timer.py:260:stop] epoch=0/micro_step=980/global_step=980, RunningAvgSamplesPerSec=4.74823058835305, CurrSamplesPerSec=4.734733594586494, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 0, Total Step: 981, Loss: 0.1390877217054367 +[2024-01-22 02:46:14,799] [INFO] [logging.py:96:log_dist] [Rank 0] step=990, skipped=0, lr=[1.918713913603415e-05, 1.918713913603415e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 02:46:14,800] [INFO] [timer.py:260:stop] epoch=0/micro_step=990/global_step=990, RunningAvgSamplesPerSec=4.748189549289067, CurrSamplesPerSec=4.756844405848133, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 0, Total Step: 991, Loss: 0.14867332577705383 +[2024-01-22 02:47:22,315] [INFO] [logging.py:96:log_dist] [Rank 0] step=1000, skipped=0, lr=[1.9169028665792303e-05, 1.9169028665792303e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 02:47:22,316] [INFO] [timer.py:260:stop] epoch=0/micro_step=1000/global_step=1000, RunningAvgSamplesPerSec=4.748113064292023, CurrSamplesPerSec=4.735532609485289, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 0, Total Step: 1001, Loss: 0.2542916238307953 +[2024-01-22 02:48:29,713] [INFO] [logging.py:96:log_dist] [Rank 0] step=1010, skipped=0, lr=[1.915072740267347e-05, 1.915072740267347e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 02:48:29,714] [INFO] [timer.py:260:stop] epoch=0/micro_step=1010/global_step=1010, RunningAvgSamplesPerSec=4.748120791566963, CurrSamplesPerSec=4.736582613899589, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 0, Total Step: 1011, Loss: 0.04315679892897606 +[2024-01-22 02:49:37,062] [INFO] [logging.py:96:log_dist] [Rank 0] step=1020, skipped=0, lr=[1.913223572749777e-05, 1.913223572749777e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 02:49:37,063] [INFO] [timer.py:260:stop] epoch=0/micro_step=1020/global_step=1020, RunningAvgSamplesPerSec=4.748161693085453, CurrSamplesPerSec=4.776040849896302, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 0, Total Step: 1021, Loss: 0.027347611263394356 +[2024-01-22 02:50:44,468] [INFO] [logging.py:96:log_dist] [Rank 0] step=1030, skipped=0, lr=[1.9113554025047507e-05, 1.9113554025047507e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 02:50:44,468] [INFO] [timer.py:260:stop] epoch=0/micro_step=1030/global_step=1030, RunningAvgSamplesPerSec=4.7481633003953965, CurrSamplesPerSec=4.755676205746607, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 0, Total Step: 1031, Loss: 0.28249624371528625 +[2024-01-22 02:51:51,804] [INFO] [logging.py:96:log_dist] [Rank 0] step=1040, skipped=0, lr=[1.9094682684059135e-05, 1.9094682684059135e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 02:51:51,805] [INFO] [timer.py:260:stop] epoch=0/micro_step=1040/global_step=1040, RunningAvgSamplesPerSec=4.748211395109431, CurrSamplesPerSec=4.759960444349936, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 0, Total Step: 1041, Loss: 0.09312671422958374 +[2024-01-22 02:52:59,248] [INFO] [logging.py:96:log_dist] [Rank 0] step=1050, skipped=0, lr=[1.90756220972152e-05, 1.90756220972152e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 02:52:59,249] [INFO] [timer.py:260:stop] epoch=0/micro_step=1050/global_step=1050, RunningAvgSamplesPerSec=4.748186802109267, CurrSamplesPerSec=4.718008551895563, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 0, Total Step: 1051, Loss: 0.06319618225097656 +[2024-01-22 02:54:06,616] [INFO] [logging.py:96:log_dist] [Rank 0] step=1060, skipped=0, lr=[1.9056372661136137e-05, 1.9056372661136137e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 02:54:06,617] [INFO] [timer.py:260:stop] epoch=0/micro_step=1060/global_step=1060, RunningAvgSamplesPerSec=4.748213338869202, CurrSamplesPerSec=4.7260309203571085, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 0, Total Step: 1061, Loss: 0.07180161029100418 +[2024-01-22 02:55:14,124] [INFO] [logging.py:96:log_dist] [Rank 0] step=1070, skipped=0, lr=[1.903693477637204e-05, 1.903693477637204e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 02:55:14,125] [INFO] [timer.py:260:stop] epoch=0/micro_step=1070/global_step=1070, RunningAvgSamplesPerSec=4.748146512396191, CurrSamplesPerSec=4.730914313133247, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 0, Total Step: 1071, Loss: 0.13549475371837616 +[2024-01-22 02:56:21,583] [INFO] [logging.py:96:log_dist] [Rank 0] step=1080, skipped=0, lr=[1.9017308847394322e-05, 1.9017308847394322e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 02:56:21,584] [INFO] [timer.py:260:stop] epoch=0/micro_step=1080/global_step=1080, RunningAvgSamplesPerSec=4.748112826112923, CurrSamplesPerSec=4.743507439590382, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 0, Total Step: 1081, Loss: 0.25711843371391296 +[2024-01-22 02:57:28,986] [INFO] [logging.py:96:log_dist] [Rank 0] step=1090, skipped=0, lr=[1.8997495282587293e-05, 1.8997495282587293e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 02:57:28,987] [INFO] [timer.py:260:stop] epoch=0/micro_step=1090/global_step=1090, RunningAvgSamplesPerSec=4.7481163879141395, CurrSamplesPerSec=4.75213197200493, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 0, Total Step: 1091, Loss: 0.032875485718250275 +[2024-01-22 02:58:36,333] [INFO] [logging.py:96:log_dist] [Rank 0] step=1100, skipped=0, lr=[1.897749449423967e-05, 1.897749449423967e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 02:58:36,333] [INFO] [timer.py:260:stop] epoch=0/micro_step=1100/global_step=1100, RunningAvgSamplesPerSec=4.748156134965661, CurrSamplesPerSec=4.760501540201896, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 0, Total Step: 1101, Loss: 0.06471862643957138 +[2024-01-22 02:59:43,641] [INFO] [logging.py:96:log_dist] [Rank 0] step=1110, skipped=0, lr=[1.895730689853598e-05, 1.895730689853598e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 02:59:43,641] [INFO] [timer.py:260:stop] epoch=0/micro_step=1110/global_step=1110, RunningAvgSamplesPerSec=4.748219532314845, CurrSamplesPerSec=4.74615132487298, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 0, Total Step: 1111, Loss: 0.0725710466504097 +[2024-01-22 03:00:51,037] [INFO] [logging.py:96:log_dist] [Rank 0] step=1120, skipped=0, lr=[1.8936932915547934e-05, 1.8936932915547934e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 03:00:51,037] [INFO] [timer.py:260:stop] epoch=0/micro_step=1120/global_step=1120, RunningAvgSamplesPerSec=4.748226327524935, CurrSamplesPerSec=4.739781117426375, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 0, Total Step: 1121, Loss: 0.11449851095676422 +[2024-01-22 03:01:58,460] [INFO] [logging.py:96:log_dist] [Rank 0] step=1130, skipped=0, lr=[1.891637296922565e-05, 1.891637296922565e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 03:01:58,460] [INFO] [timer.py:260:stop] epoch=0/micro_step=1130/global_step=1130, RunningAvgSamplesPerSec=4.748216139684488, CurrSamplesPerSec=4.745353588390565, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 0, Total Step: 1131, Loss: 0.08597186952829361 +[2024-01-22 03:03:05,760] [INFO] [logging.py:96:log_dist] [Rank 0] step=1140, skipped=0, lr=[1.8895627487388856e-05, 1.8895627487388856e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 03:03:05,761] [INFO] [timer.py:260:stop] epoch=0/micro_step=1140/global_step=1140, RunningAvgSamplesPerSec=4.748281842139282, CurrSamplesPerSec=4.7700936577649, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 0, Total Step: 1141, Loss: 0.1202729120850563 +[2024-01-22 03:04:13,083] [INFO] [logging.py:96:log_dist] [Rank 0] step=1150, skipped=0, lr=[1.8874696901717967e-05, 1.8874696901717967e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 03:04:13,084] [INFO] [timer.py:260:stop] epoch=0/micro_step=1150/global_step=1150, RunningAvgSamplesPerSec=4.7483326915308615, CurrSamplesPerSec=4.758487362513567, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 0, Total Step: 1151, Loss: 0.04734383895993233 +[2024-01-22 03:05:20,526] [INFO] [logging.py:96:log_dist] [Rank 0] step=1160, skipped=0, lr=[1.8853581647745122e-05, 1.8853581647745122e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 03:05:20,526] [INFO] [timer.py:260:stop] epoch=0/micro_step=1160/global_step=1160, RunningAvgSamplesPerSec=4.748310037506157, CurrSamplesPerSec=4.724326982322896, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 0, Total Step: 1161, Loss: 0.28276824951171875 +[2024-01-22 03:06:27,923] [INFO] [logging.py:96:log_dist] [Rank 0] step=1170, skipped=0, lr=[1.8832282164845117e-05, 1.8832282164845117e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 03:06:27,924] [INFO] [timer.py:260:stop] epoch=0/micro_step=1170/global_step=1170, RunningAvgSamplesPerSec=4.7483148176678505, CurrSamplesPerSec=4.760903600888915, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 0, Total Step: 1171, Loss: 0.017675651237368584 +[2024-01-22 03:07:35,225] [INFO] [logging.py:96:log_dist] [Rank 0] step=1180, skipped=0, lr=[1.8810798896226253e-05, 1.8810798896226253e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 03:07:35,226] [INFO] [timer.py:260:stop] epoch=0/micro_step=1180/global_step=1180, RunningAvgSamplesPerSec=4.748376869067915, CurrSamplesPerSec=4.753934326235995, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 0, Total Step: 1181, Loss: 0.16474437713623047 +[2024-01-22 03:08:42,650] [INFO] [logging.py:96:log_dist] [Rank 0] step=1190, skipped=0, lr=[1.8789132288921116e-05, 1.8789132288921116e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 03:08:42,650] [INFO] [timer.py:260:stop] epoch=0/micro_step=1190/global_step=1190, RunningAvgSamplesPerSec=4.748364782558023, CurrSamplesPerSec=4.75912160913937, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 0, Total Step: 1191, Loss: 0.1801006942987442 +[2024-01-22 03:09:50,132] [INFO] [logging.py:96:log_dist] [Rank 0] step=1200, skipped=0, lr=[1.8767282793777282e-05, 1.8767282793777282e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 03:09:50,133] [INFO] [timer.py:260:stop] epoch=0/micro_step=1200/global_step=1200, RunningAvgSamplesPerSec=4.748319128640349, CurrSamplesPerSec=4.74858293688021, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 0, Total Step: 1201, Loss: 0.1575186401605606 +[2024-01-22 03:10:57,352] [INFO] [logging.py:96:log_dist] [Rank 0] step=1210, skipped=0, lr=[1.8745250865447933e-05, 1.8745250865447933e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 03:10:57,352] [INFO] [timer.py:260:stop] epoch=0/micro_step=1210/global_step=1210, RunningAvgSamplesPerSec=4.748427338667705, CurrSamplesPerSec=4.777478566299333, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 0, Total Step: 1211, Loss: 0.09744629263877869 +[2024-01-22 03:12:04,789] [INFO] [logging.py:96:log_dist] [Rank 0] step=1220, skipped=0, lr=[1.872303696238239e-05, 1.872303696238239e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 03:12:04,790] [INFO] [timer.py:260:stop] epoch=0/micro_step=1220/global_step=1220, RunningAvgSamplesPerSec=4.7484080362935925, CurrSamplesPerSec=4.726599616745236, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 0, Total Step: 1221, Loss: 0.027106812223792076 +[2024-01-22 03:13:12,181] [INFO] [logging.py:96:log_dist] [Rank 0] step=1230, skipped=0, lr=[1.8700641546816584e-05, 1.8700641546816584e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 03:13:12,181] [INFO] [timer.py:260:stop] epoch=0/micro_step=1230/global_step=1230, RunningAvgSamplesPerSec=4.7484150708503945, CurrSamplesPerSec=4.745267521371389, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 0, Total Step: 1231, Loss: 0.1590980887413025 +[2024-01-22 03:14:19,584] [INFO] [logging.py:96:log_dist] [Rank 0] step=1240, skipped=0, lr=[1.8678065084763425e-05, 1.8678065084763425e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 03:14:19,585] [INFO] [timer.py:260:stop] epoch=0/micro_step=1240/global_step=1240, RunningAvgSamplesPerSec=4.748415172781687, CurrSamplesPerSec=4.757141308769784, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 0, Total Step: 1241, Loss: 0.17706091701984406 +[2024-01-22 03:15:26,903] [INFO] [logging.py:96:log_dist] [Rank 0] step=1250, skipped=0, lr=[1.865530804600312e-05, 1.865530804600312e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 03:15:26,904] [INFO] [timer.py:260:stop] epoch=0/micro_step=1250/global_step=1250, RunningAvgSamplesPerSec=4.748463261946418, CurrSamplesPerSec=4.7451430400951224, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 0, Total Step: 1251, Loss: 0.028573883697390556 +[2024-01-22 03:16:34,294] [INFO] [logging.py:96:log_dist] [Rank 0] step=1260, skipped=0, lr=[1.8632370904073385e-05, 1.8632370904073385e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 03:16:34,295] [INFO] [timer.py:260:stop] epoch=0/micro_step=1260/global_step=1260, RunningAvgSamplesPerSec=4.748470820869701, CurrSamplesPerSec=4.753417785428676, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 0, Total Step: 1261, Loss: 0.2186567336320877 +[2024-01-22 03:17:41,673] [INFO] [logging.py:96:log_dist] [Rank 0] step=1270, skipped=0, lr=[1.8609254136259594e-05, 1.8609254136259594e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 03:17:41,674] [INFO] [timer.py:260:stop] epoch=0/micro_step=1270/global_step=1270, RunningAvgSamplesPerSec=4.7484842394285, CurrSamplesPerSec=4.765678636207421, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 0, Total Step: 1271, Loss: 0.09239359200000763 +[2024-01-22 03:18:49,064] [INFO] [logging.py:96:log_dist] [Rank 0] step=1280, skipped=0, lr=[1.8585958223584856e-05, 1.8585958223584856e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 03:18:49,065] [INFO] [timer.py:260:stop] epoch=0/micro_step=1280/global_step=1280, RunningAvgSamplesPerSec=4.748490942450549, CurrSamplesPerSec=4.758236680088534, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 0, Total Step: 1281, Loss: 0.026049096137285233 +[2024-01-22 03:19:56,429] [INFO] [logging.py:96:log_dist] [Rank 0] step=1290, skipped=0, lr=[1.8562483650799988e-05, 1.8562483650799988e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 03:19:56,430] [INFO] [timer.py:260:stop] epoch=0/micro_step=1290/global_step=1290, RunningAvgSamplesPerSec=4.7485114368409285, CurrSamplesPerSec=4.768147581319572, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 0, Total Step: 1291, Loss: 0.06872789561748505 +[2024-01-22 03:21:03,799] [INFO] [logging.py:96:log_dist] [Rank 0] step=1300, skipped=0, lr=[1.853883090637345e-05, 1.853883090637345e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 03:21:03,800] [INFO] [timer.py:260:stop] epoch=0/micro_step=1300/global_step=1300, RunningAvgSamplesPerSec=4.748529134323686, CurrSamplesPerSec=4.772254774593368, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 0, Total Step: 1301, Loss: 0.1325576901435852 +[2024-01-22 03:22:11,222] [INFO] [logging.py:96:log_dist] [Rank 0] step=1310, skipped=0, lr=[1.8515000482481173e-05, 1.8515000482481173e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 03:22:11,223] [INFO] [timer.py:260:stop] epoch=0/micro_step=1310/global_step=1310, RunningAvgSamplesPerSec=4.7485178883295625, CurrSamplesPerSec=4.727818522527123, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 0, Total Step: 1311, Loss: 0.08951763808727264 +[2024-01-22 03:23:18,522] [INFO] [logging.py:96:log_dist] [Rank 0] step=1320, skipped=0, lr=[1.8490992874996298e-05, 1.8490992874996298e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 03:23:18,523] [INFO] [timer.py:260:stop] epoch=0/micro_step=1320/global_step=1320, RunningAvgSamplesPerSec=4.748572183792171, CurrSamplesPerSec=4.755975322228807, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 0, Total Step: 1321, Loss: 0.16072556376457214 +[2024-01-22 03:24:25,833] [INFO] [logging.py:96:log_dist] [Rank 0] step=1330, skipped=0, lr=[1.8466808583478886e-05, 1.8466808583478886e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 03:24:25,834] [INFO] [timer.py:260:stop] epoch=0/micro_step=1330/global_step=1330, RunningAvgSamplesPerSec=4.748620247599804, CurrSamplesPerSec=4.751436845442829, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 0, Total Step: 1331, Loss: 0.14524498581886292 +[2024-01-22 03:25:33,185] [INFO] [logging.py:96:log_dist] [Rank 0] step=1340, skipped=0, lr=[1.844244811116551e-05, 1.844244811116551e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 03:25:33,186] [INFO] [timer.py:260:stop] epoch=0/micro_step=1340/global_step=1340, RunningAvgSamplesPerSec=4.7486461810979, CurrSamplesPerSec=4.766956046372734, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 0, Total Step: 1341, Loss: 0.09969011694192886 +[2024-01-22 03:26:40,465] [INFO] [logging.py:96:log_dist] [Rank 0] step=1350, skipped=0, lr=[1.841791196495879e-05, 1.841791196495879e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 03:26:40,466] [INFO] [timer.py:260:stop] epoch=0/micro_step=1350/global_step=1350, RunningAvgSamplesPerSec=4.748708804703992, CurrSamplesPerSec=4.7520584458558535, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 0, Total Step: 1351, Loss: 0.08495720475912094 +[2024-01-22 03:27:47,838] [INFO] [logging.py:96:log_dist] [Rank 0] step=1360, skipped=0, lr=[1.8393200655416824e-05, 1.8393200655416824e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 03:27:47,839] [INFO] [timer.py:260:stop] epoch=0/micro_step=1360/global_step=1360, RunningAvgSamplesPerSec=4.748722469444673, CurrSamplesPerSec=4.735875652190727, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 0, Total Step: 1361, Loss: 0.04776746407151222 +[2024-01-22 03:28:55,199] [INFO] [logging.py:96:log_dist] [Rank 0] step=1370, skipped=0, lr=[1.8368314696742597e-05, 1.8368314696742597e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 03:28:55,204] [INFO] [timer.py:260:stop] epoch=0/micro_step=1370/global_step=1370, RunningAvgSamplesPerSec=4.7487400394164, CurrSamplesPerSec=4.757002884340672, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 0, Total Step: 1371, Loss: 0.15735392272472382 +[2024-01-22 03:30:02,458] [INFO] [logging.py:96:log_dist] [Rank 0] step=1380, skipped=0, lr=[1.834325460677325e-05, 1.834325460677325e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 03:30:02,458] [INFO] [timer.py:260:stop] epoch=0/micro_step=1380/global_step=1380, RunningAvgSamplesPerSec=4.7488138065311825, CurrSamplesPerSec=4.7621269178334344, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 0, Total Step: 1381, Loss: 0.17660652101039886 +[2024-01-22 03:31:09,853] [INFO] [logging.py:96:log_dist] [Rank 0] step=1390, skipped=0, lr=[1.8318020906969335e-05, 1.8318020906969335e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 03:31:09,854] [INFO] [timer.py:260:stop] epoch=0/micro_step=1390/global_step=1390, RunningAvgSamplesPerSec=4.748814932398106, CurrSamplesPerSec=4.735580395212079, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 0, Total Step: 1391, Loss: 0.14825184643268585 +[2024-01-22 03:32:17,233] [INFO] [logging.py:96:log_dist] [Rank 0] step=1400, skipped=0, lr=[1.8292614122403928e-05, 1.8292614122403928e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 03:32:17,234] [INFO] [timer.py:260:stop] epoch=0/micro_step=1400/global_step=1400, RunningAvgSamplesPerSec=4.7488234878091236, CurrSamplesPerSec=4.731880358468392, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 0, Total Step: 1401, Loss: 0.0928417444229126 +[2024-01-22 03:33:24,593] [INFO] [logging.py:96:log_dist] [Rank 0] step=1410, skipped=0, lr=[1.826703478175174e-05, 1.826703478175174e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 03:33:24,594] [INFO] [timer.py:260:stop] epoch=0/micro_step=1410/global_step=1410, RunningAvgSamplesPerSec=4.748842207069994, CurrSamplesPerSec=4.732677409295161, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 0, Total Step: 1411, Loss: 0.10678210854530334 +[2024-01-22 03:34:32,044] [INFO] [logging.py:96:log_dist] [Rank 0] step=1420, skipped=0, lr=[1.8241283417278094e-05, 1.8241283417278094e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 03:34:32,045] [INFO] [timer.py:260:stop] epoch=0/micro_step=1420/global_step=1420, RunningAvgSamplesPerSec=4.748815430782121, CurrSamplesPerSec=4.715972995450039, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 0, Total Step: 1421, Loss: 0.2152009755373001 +[2024-01-22 03:35:39,402] [INFO] [logging.py:96:log_dist] [Rank 0] step=1430, skipped=0, lr=[1.821536056482785e-05, 1.821536056482785e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 03:35:39,403] [INFO] [timer.py:260:stop] epoch=0/micro_step=1430/global_step=1430, RunningAvgSamplesPerSec=4.748834726914045, CurrSamplesPerSec=4.750009882750063, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 0, Total Step: 1431, Loss: 0.1519220769405365 +[2024-01-22 03:36:46,958] [INFO] [logging.py:96:log_dist] [Rank 0] step=1440, skipped=0, lr=[1.818926676381426e-05, 1.818926676381426e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 03:36:46,958] [INFO] [timer.py:260:stop] epoch=0/micro_step=1440/global_step=1440, RunningAvgSamplesPerSec=4.748757227568116, CurrSamplesPerSec=4.725021683999198, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 0, Total Step: 1441, Loss: 0.044663213193416595 +[2024-01-22 03:37:54,484] [INFO] [logging.py:96:log_dist] [Rank 0] step=1450, skipped=0, lr=[1.8163002557207754e-05, 1.8163002557207754e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 03:37:54,485] [INFO] [timer.py:260:stop] epoch=0/micro_step=1450/global_step=1450, RunningAvgSamplesPerSec=4.748694932493752, CurrSamplesPerSec=4.719977645312327, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 0, Total Step: 1451, Loss: 0.03878221660852432 +[2024-01-22 03:39:01,883] [INFO] [logging.py:96:log_dist] [Rank 0] step=1460, skipped=0, lr=[1.813656849152462e-05, 1.813656849152462e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 03:39:01,884] [INFO] [timer.py:260:stop] epoch=0/micro_step=1460/global_step=1460, RunningAvgSamplesPerSec=4.748695048060696, CurrSamplesPerSec=4.733090806811689, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 0, Total Step: 1461, Loss: 0.2213730365037918 +[2024-01-22 03:40:09,181] [INFO] [logging.py:96:log_dist] [Rank 0] step=1470, skipped=0, lr=[1.8109965116815647e-05, 1.8109965116815647e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 03:40:09,182] [INFO] [timer.py:260:stop] epoch=0/micro_step=1470/global_step=1470, RunningAvgSamplesPerSec=4.748743825880781, CurrSamplesPerSec=4.766863099177612, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 0, Total Step: 1471, Loss: 0.036826733499765396 +[2024-01-22 03:41:16,547] [INFO] [logging.py:96:log_dist] [Rank 0] step=1480, skipped=0, lr=[1.8083192986654668e-05, 1.8083192986654668e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 03:41:16,547] [INFO] [timer.py:260:stop] epoch=0/micro_step=1480/global_step=1480, RunningAvgSamplesPerSec=4.748759852115718, CurrSamplesPerSec=4.742200340474691, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 0, Total Step: 1481, Loss: 0.1885986477136612 +[2024-01-22 03:42:23,907] [INFO] [logging.py:96:log_dist] [Rank 0] step=1490, skipped=0, lr=[1.8056252658127064e-05, 1.8056252658127064e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 03:42:23,907] [INFO] [timer.py:260:stop] epoch=0/micro_step=1490/global_step=1490, RunningAvgSamplesPerSec=4.748777815657129, CurrSamplesPerSec=4.772885737983412, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 0, Total Step: 1491, Loss: 0.07854046672582626 +[2024-01-22 03:43:31,313] [INFO] [logging.py:96:log_dist] [Rank 0] step=1500, skipped=0, lr=[1.8029144691818138e-05, 1.8029144691818138e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 03:43:31,314] [INFO] [timer.py:260:stop] epoch=0/micro_step=1500/global_step=1500, RunningAvgSamplesPerSec=4.748773690466071, CurrSamplesPerSec=4.755514108760045, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 0, Total Step: 1501, Loss: 0.058306097984313965 +[2024-01-22 03:44:38,672] [INFO] [logging.py:96:log_dist] [Rank 0] step=1510, skipped=0, lr=[1.800186965180148e-05, 1.800186965180148e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 03:44:38,672] [INFO] [timer.py:260:stop] epoch=0/micro_step=1510/global_step=1510, RunningAvgSamplesPerSec=4.748792369645197, CurrSamplesPerSec=4.747682777119545, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 0, Total Step: 1511, Loss: 0.08597294986248016 +[2024-01-22 03:45:46,151] [INFO] [logging.py:96:log_dist] [Rank 0] step=1520, skipped=0, lr=[1.797442810562721e-05, 1.797442810562721e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 03:45:46,152] [INFO] [timer.py:260:stop] epoch=0/micro_step=1520/global_step=1520, RunningAvgSamplesPerSec=4.748754503144062, CurrSamplesPerSec=4.694120449910138, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 0, Total Step: 1521, Loss: 0.1018645316362381 +[2024-01-22 03:46:53,582] [INFO] [logging.py:96:log_dist] [Rank 0] step=1530, skipped=0, lr=[1.7946820624310184e-05, 1.7946820624310184e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 03:46:53,583] [INFO] [timer.py:260:stop] epoch=0/micro_step=1530/global_step=1530, RunningAvgSamplesPerSec=4.748739408982514, CurrSamplesPerSec=4.754331573305865, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 0, Total Step: 1531, Loss: 0.14111517369747162 +[2024-01-22 03:48:00,898] [INFO] [logging.py:96:log_dist] [Rank 0] step=1540, skipped=0, lr=[1.79190477823181e-05, 1.79190477823181e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 03:48:00,898] [INFO] [timer.py:260:stop] epoch=0/micro_step=1540/global_step=1540, RunningAvgSamplesPerSec=4.748777399757875, CurrSamplesPerSec=4.762701630924687, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 0, Total Step: 1541, Loss: 0.039234861731529236 +[2024-01-22 03:49:08,186] [INFO] [logging.py:96:log_dist] [Rank 0] step=1550, skipped=0, lr=[1.7891110157559542e-05, 1.7891110157559542e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 03:49:08,187] [INFO] [timer.py:260:stop] epoch=0/micro_step=1550/global_step=1550, RunningAvgSamplesPerSec=4.748827292440239, CurrSamplesPerSec=4.757872512239623, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 0, Total Step: 1551, Loss: 0.028371471911668777 +[2024-01-22 03:50:15,449] [INFO] [logging.py:96:log_dist] [Rank 0] step=1560, skipped=0, lr=[1.7863008331371974e-05, 1.7863008331371974e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 03:50:15,449] [INFO] [timer.py:260:stop] epoch=0/micro_step=1560/global_step=1560, RunningAvgSamplesPerSec=4.748888095269911, CurrSamplesPerSec=4.77203708172371, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 0, Total Step: 1561, Loss: 0.23861978948116302 +[2024-01-22 03:51:22,824] [INFO] [logging.py:96:log_dist] [Rank 0] step=1570, skipped=0, lr=[1.783474288850962e-05, 1.783474288850962e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 03:51:22,824] [INFO] [timer.py:260:stop] epoch=0/micro_step=1570/global_step=1570, RunningAvgSamplesPerSec=4.748897833063759, CurrSamplesPerSec=4.737934785329126, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 0, Total Step: 1571, Loss: 0.20623454451560974 +[2024-01-22 03:52:30,299] [INFO] [logging.py:96:log_dist] [Rank 0] step=1580, skipped=0, lr=[1.7806314417131303e-05, 1.7806314417131303e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 03:52:30,300] [INFO] [timer.py:260:stop] epoch=0/micro_step=1580/global_step=1580, RunningAvgSamplesPerSec=4.748862566111549, CurrSamplesPerSec=4.756412352706243, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 0, Total Step: 1581, Loss: 0.08203073590993881 +[2024-01-22 03:53:37,643] [INFO] [logging.py:96:log_dist] [Rank 0] step=1590, skipped=0, lr=[1.7777723508788226e-05, 1.7777723508788226e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 03:53:37,644] [INFO] [timer.py:260:stop] epoch=0/micro_step=1590/global_step=1590, RunningAvgSamplesPerSec=4.748886033265858, CurrSamplesPerSec=4.766245574892917, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 0, Total Step: 1591, Loss: 0.05756440386176109 +[2024-01-22 03:54:44,906] [INFO] [logging.py:96:log_dist] [Rank 0] step=1600, skipped=0, lr=[1.7748970758411627e-05, 1.7748970758411627e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 03:54:44,906] [INFO] [timer.py:260:stop] epoch=0/micro_step=1600/global_step=1600, RunningAvgSamplesPerSec=4.748945022833066, CurrSamplesPerSec=4.760777791315707, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 0, Total Step: 1601, Loss: 0.045844752341508865 +[2024-01-22 03:55:52,338] [INFO] [logging.py:96:log_dist] [Rank 0] step=1610, skipped=0, lr=[1.7720056764300434e-05, 1.7720056764300434e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 03:55:52,338] [INFO] [timer.py:260:stop] epoch=0/micro_step=1610/global_step=1610, RunningAvgSamplesPerSec=4.748929137070706, CurrSamplesPerSec=4.739744461225977, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 0, Total Step: 1611, Loss: 0.15491454303264618 +[2024-01-22 03:56:59,698] [INFO] [logging.py:96:log_dist] [Rank 0] step=1620, skipped=0, lr=[1.769098212810879e-05, 1.769098212810879e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 03:56:59,699] [INFO] [timer.py:260:stop] epoch=0/micro_step=1620/global_step=1620, RunningAvgSamplesPerSec=4.748944589644048, CurrSamplesPerSec=4.759552466833545, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 0, Total Step: 1621, Loss: 0.16068507730960846 +[2024-01-22 03:58:07,020] [INFO] [logging.py:96:log_dist] [Rank 0] step=1630, skipped=0, lr=[1.766174745483355e-05, 1.766174745483355e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 03:58:07,020] [INFO] [timer.py:260:stop] epoch=0/micro_step=1630/global_step=1630, RunningAvgSamplesPerSec=4.748976607660582, CurrSamplesPerSec=4.754983916495939, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 0, Total Step: 1631, Loss: 0.03136483207345009 +[2024-01-22 03:59:14,286] [INFO] [logging.py:96:log_dist] [Rank 0] step=1640, skipped=0, lr=[1.7632353352801686e-05, 1.7632353352801686e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 03:59:14,287] [INFO] [timer.py:260:stop] epoch=0/micro_step=1640/global_step=1640, RunningAvgSamplesPerSec=4.7490320922467175, CurrSamplesPerSec=4.767750223916125, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 0, Total Step: 1641, Loss: 0.06244714558124542 +[2024-01-22 04:00:21,661] [INFO] [logging.py:96:log_dist] [Rank 0] step=1650, skipped=0, lr=[1.760280043365762e-05, 1.760280043365762e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 04:00:21,662] [INFO] [timer.py:260:stop] epoch=0/micro_step=1650/global_step=1650, RunningAvgSamplesPerSec=4.749040493985739, CurrSamplesPerSec=4.764954504777691, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 0, Total Step: 1651, Loss: 0.03847730532288551 +[2024-01-22 04:01:29,015] [INFO] [logging.py:96:log_dist] [Rank 0] step=1660, skipped=0, lr=[1.7573089312350517e-05, 1.7573089312350517e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 04:01:29,015] [INFO] [timer.py:260:stop] epoch=0/micro_step=1660/global_step=1660, RunningAvgSamplesPerSec=4.7490579283880185, CurrSamplesPerSec=4.7521100989997205, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 0, Total Step: 1661, Loss: 0.05777880176901817 +[2024-01-22 04:02:36,405] [INFO] [logging.py:96:log_dist] [Rank 0] step=1670, skipped=0, lr=[1.7543220607121466e-05, 1.7543220607121466e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 04:02:36,405] [INFO] [timer.py:260:stop] epoch=0/micro_step=1670/global_step=1670, RunningAvgSamplesPerSec=4.749059744770763, CurrSamplesPerSec=4.7310113668508125, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 0, Total Step: 1671, Loss: 0.14692629873752594 +[2024-01-22 04:03:43,764] [INFO] [logging.py:96:log_dist] [Rank 0] step=1680, skipped=0, lr=[1.7513194939490633e-05, 1.7513194939490633e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 04:03:43,765] [INFO] [timer.py:260:stop] epoch=0/micro_step=1680/global_step=1680, RunningAvgSamplesPerSec=4.749074277228147, CurrSamplesPerSec=4.735321929371171, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 0, Total Step: 1681, Loss: 0.08384459465742111 +[2024-01-22 04:04:51,150] [INFO] [logging.py:96:log_dist] [Rank 0] step=1690, skipped=0, lr=[1.748301293424432e-05, 1.748301293424432e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 04:04:51,151] [INFO] [timer.py:260:stop] epoch=0/micro_step=1690/global_step=1690, RunningAvgSamplesPerSec=4.749077519449459, CurrSamplesPerSec=4.75864730023031, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 0, Total Step: 1691, Loss: 0.09616201370954514 +[2024-01-22 04:05:58,466] [INFO] [logging.py:96:log_dist] [Rank 0] step=1700, skipped=0, lr=[1.745267521942197e-05, 1.745267521942197e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 04:05:58,467] [INFO] [timer.py:260:stop] epoch=0/micro_step=1700/global_step=1700, RunningAvgSamplesPerSec=4.749109945273231, CurrSamplesPerSec=4.741572940824401, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 0, Total Step: 1701, Loss: 0.03736364096403122 +[2024-01-22 04:07:05,788] [INFO] [logging.py:96:log_dist] [Rank 0] step=1710, skipped=0, lr=[1.742218242630308e-05, 1.742218242630308e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 04:07:05,788] [INFO] [timer.py:260:stop] epoch=0/micro_step=1710/global_step=1710, RunningAvgSamplesPerSec=4.749139716864079, CurrSamplesPerSec=4.751831151715452, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 0, Total Step: 1711, Loss: 0.058751631528139114 +[2024-01-22 04:08:13,175] [INFO] [logging.py:96:log_dist] [Rank 0] step=1720, skipped=0, lr=[1.7391535189394094e-05, 1.7391535189394094e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 04:08:13,175] [INFO] [timer.py:260:stop] epoch=0/micro_step=1720/global_step=1720, RunningAvgSamplesPerSec=4.7491421859504985, CurrSamplesPerSec=4.753345061188401, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 0, Total Step: 1721, Loss: 0.0534539632499218 +[2024-01-22 04:09:20,607] [INFO] [logging.py:96:log_dist] [Rank 0] step=1730, skipped=0, lr=[1.7360734146415182e-05, 1.7360734146415182e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 04:09:20,607] [INFO] [timer.py:260:stop] epoch=0/micro_step=1730/global_step=1730, RunningAvgSamplesPerSec=4.749126482146122, CurrSamplesPerSec=4.752198097006674, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 0, Total Step: 1731, Loss: 0.11313589662313461 +[2024-01-22 04:10:28,064] [INFO] [logging.py:96:log_dist] [Rank 0] step=1740, skipped=0, lr=[1.7329779938286972e-05, 1.7329779938286972e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 04:10:28,065] [INFO] [timer.py:260:stop] epoch=0/micro_step=1740/global_step=1740, RunningAvgSamplesPerSec=4.749100332483343, CurrSamplesPerSec=4.727157128615155, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 0, Total Step: 1741, Loss: 0.18278264999389648 +[2024-01-22 04:11:35,392] [INFO] [logging.py:96:log_dist] [Rank 0] step=1750, skipped=0, lr=[1.729867320911721e-05, 1.729867320911721e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 04:11:35,392] [INFO] [timer.py:260:stop] epoch=0/micro_step=1750/global_step=1750, RunningAvgSamplesPerSec=4.749126768621114, CurrSamplesPerSec=4.764705508560693, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 0, Total Step: 1751, Loss: 0.20674708485603333 +[2024-01-22 04:12:42,790] [INFO] [logging.py:96:log_dist] [Rank 0] step=1760, skipped=0, lr=[1.7267414606187364e-05, 1.7267414606187364e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 04:12:42,791] [INFO] [timer.py:260:stop] epoch=0/micro_step=1760/global_step=1760, RunningAvgSamplesPerSec=4.749124621970878, CurrSamplesPerSec=4.728156617572667, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 0, Total Step: 1761, Loss: 0.03745114430785179 +[2024-01-22 04:13:50,256] [INFO] [logging.py:96:log_dist] [Rank 0] step=1770, skipped=0, lr=[1.723600477993916e-05, 1.723600477993916e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 04:13:50,257] [INFO] [timer.py:260:stop] epoch=0/micro_step=1770/global_step=1770, RunningAvgSamplesPerSec=4.749095962769763, CurrSamplesPerSec=4.712656446023861, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 0, Total Step: 1771, Loss: 0.08894414454698563 +[2024-01-22 04:14:57,559] [INFO] [logging.py:96:log_dist] [Rank 0] step=1780, skipped=0, lr=[1.7204444383961032e-05, 1.7204444383961032e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 04:14:57,559] [INFO] [timer.py:260:stop] epoch=0/micro_step=1780/global_step=1780, RunningAvgSamplesPerSec=4.749132020201398, CurrSamplesPerSec=4.752138365690637, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 0, Total Step: 1781, Loss: 0.022153053432703018 +[2024-01-22 04:16:04,866] [INFO] [logging.py:96:log_dist] [Rank 0] step=1790, skipped=0, lr=[1.7172734074974534e-05, 1.7172734074974534e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 04:16:04,866] [INFO] [timer.py:260:stop] epoch=0/micro_step=1790/global_step=1790, RunningAvgSamplesPerSec=4.749165944866907, CurrSamplesPerSec=4.754264041672457, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 0, Total Step: 1791, Loss: 0.03452971577644348 +[2024-01-22 04:17:12,338] [INFO] [logging.py:96:log_dist] [Rank 0] step=1800, skipped=0, lr=[1.7140874512820674e-05, 1.7140874512820674e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 04:17:12,338] [INFO] [timer.py:260:stop] epoch=0/micro_step=1800/global_step=1800, RunningAvgSamplesPerSec=4.749134734846167, CurrSamplesPerSec=4.750295509341026, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 0, Total Step: 1801, Loss: 0.022489216178655624 +[2024-01-22 04:18:19,826] [INFO] [logging.py:96:log_dist] [Rank 0] step=1810, skipped=0, lr=[1.7108866360446172e-05, 1.7108866360446172e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 04:18:19,827] [INFO] [timer.py:260:stop] epoch=0/micro_step=1810/global_step=1810, RunningAvgSamplesPerSec=4.749097733942231, CurrSamplesPerSec=4.764212329295918, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 0, Total Step: 1811, Loss: 0.20105984807014465 +[2024-01-22 04:19:27,201] [INFO] [logging.py:96:log_dist] [Rank 0] step=1820, skipped=0, lr=[1.7076710283889678e-05, 1.7076710283889678e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 04:19:27,202] [INFO] [timer.py:260:stop] epoch=0/micro_step=1820/global_step=1820, RunningAvgSamplesPerSec=4.749104914031051, CurrSamplesPerSec=4.740390630764116, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 0, Total Step: 1821, Loss: 0.10563098639249802 +[2024-01-22 04:20:34,542] [INFO] [logging.py:96:log_dist] [Rank 0] step=1830, skipped=0, lr=[1.704440695226791e-05, 1.704440695226791e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 04:20:34,542] [INFO] [timer.py:260:stop] epoch=0/micro_step=1830/global_step=1830, RunningAvgSamplesPerSec=4.749125427545827, CurrSamplesPerSec=4.755074042524728, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 0, Total Step: 1831, Loss: 0.16648419201374054 +[2024-01-22 04:21:41,955] [INFO] [logging.py:96:log_dist] [Rank 0] step=1840, skipped=0, lr=[1.701195703776173e-05, 1.701195703776173e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 04:21:41,955] [INFO] [timer.py:260:stop] epoch=0/micro_step=1840/global_step=1840, RunningAvgSamplesPerSec=4.749117735323717, CurrSamplesPerSec=4.740817768526908, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 0, Total Step: 1841, Loss: 0.11515253037214279 +[2024-01-22 04:22:49,371] [INFO] [logging.py:96:log_dist] [Rank 0] step=1850, skipped=0, lr=[1.6979361215602156e-05, 1.6979361215602156e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 04:22:49,371] [INFO] [timer.py:260:stop] epoch=0/micro_step=1850/global_step=1850, RunningAvgSamplesPerSec=4.7491092304309355, CurrSamplesPerSec=4.751803561604678, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 0, Total Step: 1851, Loss: 0.10942445695400238 +[2024-01-22 04:23:56,730] [INFO] [logging.py:96:log_dist] [Rank 0] step=1860, skipped=0, lr=[1.6946620164056305e-05, 1.6946620164056305e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 04:23:56,731] [INFO] [timer.py:260:stop] epoch=0/micro_step=1860/global_step=1860, RunningAvgSamplesPerSec=4.74912186888886, CurrSamplesPerSec=4.733023042722374, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 0, Total Step: 1861, Loss: 0.05242861807346344 +[2024-01-22 04:25:04,105] [INFO] [logging.py:96:log_dist] [Rank 0] step=1870, skipped=0, lr=[1.6913734564413296e-05, 1.6913734564413296e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 04:25:04,105] [INFO] [timer.py:260:stop] epoch=0/micro_step=1870/global_step=1870, RunningAvgSamplesPerSec=4.749128898219687, CurrSamplesPerSec=4.734340951617229, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 0, Total Step: 1871, Loss: 0.1382388323545456 +[2024-01-22 04:26:11,519] [INFO] [logging.py:96:log_dist] [Rank 0] step=1880, skipped=0, lr=[1.6880705100970057e-05, 1.6880705100970057e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 04:26:11,519] [INFO] [timer.py:260:stop] epoch=0/micro_step=1880/global_step=1880, RunningAvgSamplesPerSec=4.749121116193495, CurrSamplesPerSec=4.729862486311817, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 0, Total Step: 1881, Loss: 0.08719504624605179 +[2024-01-22 04:27:18,904] [INFO] [logging.py:96:log_dist] [Rank 0] step=1890, skipped=0, lr=[1.6847532461017094e-05, 1.6847532461017094e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 04:27:18,904] [INFO] [timer.py:260:stop] epoch=0/micro_step=1890/global_step=1890, RunningAvgSamplesPerSec=4.749124187296982, CurrSamplesPerSec=4.74736656763124, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 0, Total Step: 1891, Loss: 0.0397082082927227 +[2024-01-22 04:28:26,291] [INFO] [logging.py:96:log_dist] [Rank 0] step=1900, skipped=0, lr=[1.681421733482419e-05, 1.681421733482419e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 04:28:26,292] [INFO] [timer.py:260:stop] epoch=0/micro_step=1900/global_step=1900, RunningAvgSamplesPerSec=4.749126293489922, CurrSamplesPerSec=4.757119052410644, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 0, Total Step: 1901, Loss: 0.0881124883890152 +[2024-01-22 04:29:33,818] [INFO] [logging.py:96:log_dist] [Rank 0] step=1910, skipped=0, lr=[1.678076041562604e-05, 1.678076041562604e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 04:29:33,818] [INFO] [timer.py:260:stop] epoch=0/micro_step=1910/global_step=1910, RunningAvgSamplesPerSec=4.7490772169324575, CurrSamplesPerSec=4.709325886236312, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 0, Total Step: 1911, Loss: 0.06078184396028519 +[2024-01-22 04:30:41,161] [INFO] [logging.py:96:log_dist] [Rank 0] step=1920, skipped=0, lr=[1.6747162399607817e-05, 1.6747162399607817e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 04:30:41,162] [INFO] [timer.py:260:stop] epoch=0/micro_step=1920/global_step=1920, RunningAvgSamplesPerSec=4.749095562297998, CurrSamplesPerSec=4.765552404938633, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 0, Total Step: 1921, Loss: 0.258779376745224 +[2024-01-22 04:31:48,568] [INFO] [logging.py:96:log_dist] [Rank 0] step=1930, skipped=0, lr=[1.671342398589071e-05, 1.671342398589071e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 04:31:48,569] [INFO] [timer.py:260:stop] epoch=0/micro_step=1930/global_step=1930, RunningAvgSamplesPerSec=4.749090780797284, CurrSamplesPerSec=4.739902137127457, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 0, Total Step: 1931, Loss: 0.16057130694389343 +[2024-01-22 04:32:55,906] [INFO] [logging.py:96:log_dist] [Rank 0] step=1940, skipped=0, lr=[1.667954587651734e-05, 1.667954587651734e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 04:32:55,906] [INFO] [timer.py:260:stop] epoch=0/micro_step=1940/global_step=1940, RunningAvgSamplesPerSec=4.749111264352012, CurrSamplesPerSec=4.754089748192622, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 0, Total Step: 1941, Loss: 0.10712216794490814 +[2024-01-22 04:34:03,273] [INFO] [logging.py:96:log_dist] [Rank 0] step=1950, skipped=0, lr=[1.664552877643719e-05, 1.664552877643719e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 04:34:03,273] [INFO] [timer.py:260:stop] epoch=0/micro_step=1950/global_step=1950, RunningAvgSamplesPerSec=4.749120683290008, CurrSamplesPerSec=4.755533991163812, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 0, Total Step: 1951, Loss: 0.08145476132631302 +[2024-01-22 04:35:10,697] [INFO] [logging.py:96:log_dist] [Rank 0] step=1960, skipped=0, lr=[1.6611373393491915e-05, 1.6611373393491915e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 04:35:10,697] [INFO] [timer.py:260:stop] epoch=0/micro_step=1960/global_step=1960, RunningAvgSamplesPerSec=4.749109534180691, CurrSamplesPerSec=4.732014655320733, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 0, Total Step: 1961, Loss: 0.0621233731508255 +[2024-01-22 04:36:18,004] [INFO] [logging.py:96:log_dist] [Rank 0] step=1970, skipped=0, lr=[1.6577080438400604e-05, 1.6577080438400604e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 04:36:18,004] [INFO] [timer.py:260:stop] epoch=0/micro_step=1970/global_step=1970, RunningAvgSamplesPerSec=4.74914038308839, CurrSamplesPerSec=4.7594150834452496, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 0, Total Step: 1971, Loss: 0.21577757596969604 +[2024-01-22 04:37:25,302] [INFO] [logging.py:96:log_dist] [Rank 0] step=1980, skipped=0, lr=[1.6542650624745013e-05, 1.6542650624745013e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 04:37:25,303] [INFO] [timer.py:260:stop] epoch=0/micro_step=1980/global_step=1980, RunningAvgSamplesPerSec=4.749173986487604, CurrSamplesPerSec=4.748108710535543, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 0, Total Step: 1981, Loss: 0.026383545249700546 +[2024-01-22 04:38:32,688] [INFO] [logging.py:96:log_dist] [Rank 0] step=1990, skipped=0, lr=[1.650808466895471e-05, 1.650808466895471e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 04:38:32,688] [INFO] [timer.py:260:stop] epoch=0/micro_step=1990/global_step=1990, RunningAvgSamplesPerSec=4.749176412627494, CurrSamplesPerSec=4.739456754850115, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 0, Total Step: 1991, Loss: 0.05928436294198036 +[2024-01-22 04:39:40,110] [INFO] [logging.py:96:log_dist] [Rank 0] step=2000, skipped=0, lr=[1.6473383290292158e-05, 1.6473383290292158e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 04:39:40,110] [INFO] [timer.py:260:stop] epoch=0/micro_step=2000/global_step=2000, RunningAvgSamplesPerSec=4.7491660363239285, CurrSamplesPerSec=4.743542980488978, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 0, Total Step: 2001, Loss: 0.0591588094830513 +[2024-01-22 04:40:47,485] [INFO] [logging.py:96:log_dist] [Rank 0] step=2010, skipped=0, lr=[1.6438547210837753e-05, 1.6438547210837753e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 04:40:47,486] [INFO] [timer.py:260:stop] epoch=0/micro_step=2010/global_step=2010, RunningAvgSamplesPerSec=4.7491719464058315, CurrSamplesPerSec=4.7624653751161405, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 0, Total Step: 2011, Loss: 0.07002607733011246 +[2024-01-22 04:41:54,876] [INFO] [logging.py:96:log_dist] [Rank 0] step=2020, skipped=0, lr=[1.64035771554748e-05, 1.64035771554748e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 04:41:54,876] [INFO] [timer.py:260:stop] epoch=0/micro_step=2020/global_step=2020, RunningAvgSamplesPerSec=4.749172736249261, CurrSamplesPerSec=4.763531754008788, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 0, Total Step: 2021, Loss: 0.1126142293214798 +[2024-01-22 04:43:02,346] [INFO] [logging.py:96:log_dist] [Rank 0] step=2030, skipped=0, lr=[1.6368473851874432e-05, 1.6368473851874432e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 04:43:02,346] [INFO] [timer.py:260:stop] epoch=0/micro_step=2030/global_step=2030, RunningAvgSamplesPerSec=4.749145583351242, CurrSamplesPerSec=4.733497265161731, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 0, Total Step: 2031, Loss: 0.03270665183663368 +[2024-01-22 04:44:09,824] [INFO] [logging.py:96:log_dist] [Rank 0] step=2040, skipped=0, lr=[1.6333238030480473e-05, 1.6333238030480473e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 04:44:09,825] [INFO] [timer.py:260:stop] epoch=0/micro_step=2040/global_step=2040, RunningAvgSamplesPerSec=4.749116179739594, CurrSamplesPerSec=4.741484498365093, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 0, Total Step: 2041, Loss: 0.1796988546848297 +[2024-01-22 04:45:17,235] [INFO] [logging.py:96:log_dist] [Rank 0] step=2050, skipped=0, lr=[1.629787042449421e-05, 1.629787042449421e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 04:45:17,236] [INFO] [timer.py:260:stop] epoch=0/micro_step=2050/global_step=2050, RunningAvgSamplesPerSec=4.749109836568522, CurrSamplesPerSec=4.753287657746163, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 0, Total Step: 2051, Loss: 0.053939707577228546 +[2024-01-22 04:46:24,655] [INFO] [logging.py:96:log_dist] [Rank 0] step=2060, skipped=0, lr=[1.6262371769859182e-05, 1.6262371769859182e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 04:46:24,656] [INFO] [timer.py:260:stop] epoch=0/micro_step=2060/global_step=2060, RunningAvgSamplesPerSec=4.749100443048954, CurrSamplesPerSec=4.7600689913139504, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 0, Total Step: 2061, Loss: 0.04878688603639603 +[2024-01-22 04:47:32,108] [INFO] [logging.py:96:log_dist] [Rank 0] step=2070, skipped=0, lr=[1.6226742805245824e-05, 1.6226742805245824e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 04:47:32,108] [INFO] [timer.py:260:stop] epoch=0/micro_step=2070/global_step=2070, RunningAvgSamplesPerSec=4.749080408260659, CurrSamplesPerSec=4.747937723350439, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 0, Total Step: 2071, Loss: 0.11740563809871674 +[2024-01-22 04:48:39,497] [INFO] [logging.py:96:log_dist] [Rank 0] step=2080, skipped=0, lr=[1.6190984272036118e-05, 1.6190984272036118e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 04:48:39,497] [INFO] [timer.py:260:stop] epoch=0/micro_step=2080/global_step=2080, RunningAvgSamplesPerSec=4.749082044924945, CurrSamplesPerSec=4.751499755110595, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 0, Total Step: 2081, Loss: 0.1531887799501419 +[2024-01-22 04:49:46,951] [INFO] [logging.py:96:log_dist] [Rank 0] step=2090, skipped=0, lr=[1.615509691430817e-05, 1.615509691430817e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 04:49:46,952] [INFO] [timer.py:260:stop] epoch=0/micro_step=2090/global_step=2090, RunningAvgSamplesPerSec=4.7490615841148545, CurrSamplesPerSec=4.7410833661315035, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 0, Total Step: 2091, Loss: 0.16665396094322205 +[2024-01-22 04:50:54,310] [INFO] [logging.py:96:log_dist] [Rank 0] step=2100, skipped=0, lr=[1.6119081478820706e-05, 1.6119081478820706e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 04:50:54,310] [INFO] [timer.py:260:stop] epoch=0/micro_step=2100/global_step=2100, RunningAvgSamplesPerSec=4.749073561212387, CurrSamplesPerSec=4.763218332064611, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 0, Total Step: 2101, Loss: 0.20431390404701233 +[2024-01-22 04:52:01,612] [INFO] [logging.py:96:log_dist] [Rank 0] step=2110, skipped=0, lr=[1.608293871499756e-05, 1.608293871499756e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 04:52:01,613] [INFO] [timer.py:260:stop] epoch=0/micro_step=2110/global_step=2110, RunningAvgSamplesPerSec=4.74910406557793, CurrSamplesPerSec=4.748821513749808, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 0, Total Step: 2111, Loss: 0.07705129683017731 +[2024-01-22 04:53:09,024] [INFO] [logging.py:96:log_dist] [Rank 0] step=2120, skipped=0, lr=[1.604666937491205e-05, 1.604666937491205e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 04:53:09,025] [INFO] [timer.py:260:stop] epoch=0/micro_step=2120/global_step=2120, RunningAvgSamplesPerSec=4.749097773156643, CurrSamplesPerSec=4.768491977432451, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 0, Total Step: 2121, Loss: 0.08502361178398132 +[2024-01-22 04:54:16,355] [INFO] [logging.py:96:log_dist] [Rank 0] step=2130, skipped=0, lr=[1.6010274213271363e-05, 1.6010274213271363e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 04:54:16,355] [INFO] [timer.py:260:stop] epoch=0/micro_step=2130/global_step=2130, RunningAvgSamplesPerSec=4.749118573625111, CurrSamplesPerSec=4.757271984776188, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 0, Total Step: 2131, Loss: 0.06391873955726624 +[2024-01-22 04:55:23,693] [INFO] [logging.py:96:log_dist] [Rank 0] step=2140, skipped=0, lr=[1.5973753987400815e-05, 1.5973753987400815e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 04:55:23,693] [INFO] [timer.py:260:stop] epoch=0/micro_step=2140/global_step=2140, RunningAvgSamplesPerSec=4.749136727276918, CurrSamplesPerSec=4.748632162398185, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 0, Total Step: 2141, Loss: 0.23616105318069458 +[2024-01-22 04:56:31,208] [INFO] [logging.py:96:log_dist] [Rank 0] step=2150, skipped=0, lr=[1.5937109457228122e-05, 1.5937109457228122e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 04:56:31,208] [INFO] [timer.py:260:stop] epoch=0/micro_step=2150/global_step=2150, RunningAvgSamplesPerSec=4.749096736034725, CurrSamplesPerSec=4.760159816683575, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 0, Total Step: 2151, Loss: 0.02967120334506035 +[2024-01-22 04:57:38,590] [INFO] [logging.py:96:log_dist] [Rank 0] step=2160, skipped=0, lr=[1.5900341385267566e-05, 1.5900341385267566e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 04:57:38,590] [INFO] [timer.py:260:stop] epoch=0/micro_step=2160/global_step=2160, RunningAvgSamplesPerSec=4.749100471508766, CurrSamplesPerSec=4.769139739510025, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 0, Total Step: 2161, Loss: 0.07981369644403458 +[2024-01-22 04:58:45,885] [INFO] [logging.py:96:log_dist] [Rank 0] step=2170, skipped=0, lr=[1.586345053660414e-05, 1.586345053660414e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 04:58:45,886] [INFO] [timer.py:260:stop] epoch=0/micro_step=2170/global_step=2170, RunningAvgSamplesPerSec=4.749132454782503, CurrSamplesPerSec=4.771534071683434, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 0, Total Step: 2171, Loss: 0.08959215879440308 +[2024-01-22 04:59:53,225] [INFO] [logging.py:96:log_dist] [Rank 0] step=2180, skipped=0, lr=[1.582643767887762e-05, 1.582643767887762e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 04:59:53,225] [INFO] [timer.py:260:stop] epoch=0/micro_step=2180/global_step=2180, RunningAvgSamplesPerSec=4.749149696007344, CurrSamplesPerSec=4.7553532022028335, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 0, Total Step: 2181, Loss: 0.028753140941262245 +[2024-01-22 05:01:00,637] [INFO] [logging.py:96:log_dist] [Rank 0] step=2190, skipped=0, lr=[1.5789303582266612e-05, 1.5789303582266612e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 05:01:00,638] [INFO] [timer.py:260:stop] epoch=0/micro_step=2190/global_step=2190, RunningAvgSamplesPerSec=4.749143260597586, CurrSamplesPerSec=4.733223169360921, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 0, Total Step: 2191, Loss: 0.04205947741866112 +[2024-01-22 05:02:07,994] [INFO] [logging.py:96:log_dist] [Rank 0] step=2200, skipped=0, lr=[1.5752049019472486e-05, 1.5752049019472486e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 05:02:07,994] [INFO] [timer.py:260:stop] epoch=0/micro_step=2200/global_step=2200, RunningAvgSamplesPerSec=4.749154903918633, CurrSamplesPerSec=4.739534745099679, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 0, Total Step: 2201, Loss: 0.15703639388084412 +[2024-01-22 05:03:15,382] [INFO] [logging.py:96:log_dist] [Rank 0] step=2210, skipped=0, lr=[1.571467476570334e-05, 1.571467476570334e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 05:03:15,383] [INFO] [timer.py:260:stop] epoch=0/micro_step=2210/global_step=2210, RunningAvgSamplesPerSec=4.749156160834175, CurrSamplesPerSec=4.743982088637267, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 0, Total Step: 2211, Loss: 0.12762081623077393 +[2024-01-22 05:04:22,728] [INFO] [logging.py:96:log_dist] [Rank 0] step=2220, skipped=0, lr=[1.5677181598657843e-05, 1.5677181598657843e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 05:04:22,729] [INFO] [timer.py:260:stop] epoch=0/micro_step=2220/global_step=2220, RunningAvgSamplesPerSec=4.749171044468682, CurrSamplesPerSec=4.740916066315313, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 0, Total Step: 2221, Loss: 0.05257009342312813 +[2024-01-22 05:05:30,056] [INFO] [logging.py:96:log_dist] [Rank 0] step=2230, skipped=0, lr=[1.5639570298509067e-05, 1.5639570298509067e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 05:05:30,057] [INFO] [timer.py:260:stop] epoch=0/micro_step=2230/global_step=2230, RunningAvgSamplesPerSec=4.749191207294557, CurrSamplesPerSec=4.76408144083425, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 0, Total Step: 2231, Loss: 0.14350222051143646 +[2024-01-22 05:06:37,546] [INFO] [logging.py:96:log_dist] [Rank 0] step=2240, skipped=0, lr=[1.5601841647888233e-05, 1.5601841647888233e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 05:06:37,547] [INFO] [timer.py:260:stop] epoch=0/micro_step=2240/global_step=2240, RunningAvgSamplesPerSec=4.749160546954131, CurrSamplesPerSec=4.726536199569555, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 0, Total Step: 2241, Loss: 0.07641326636075974 +[2024-01-22 05:07:44,913] [INFO] [logging.py:96:log_dist] [Rank 0] step=2250, skipped=0, lr=[1.5563996431868443e-05, 1.5563996431868443e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 05:07:44,913] [INFO] [timer.py:260:stop] epoch=0/micro_step=2250/global_step=2250, RunningAvgSamplesPerSec=4.749168645114085, CurrSamplesPerSec=4.749824470807797, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 0, Total Step: 2251, Loss: 0.08262906223535538 +[2024-01-22 05:08:52,284] [INFO] [logging.py:96:log_dist] [Rank 0] step=2260, skipped=0, lr=[1.552603543794835e-05, 1.552603543794835e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 05:08:52,285] [INFO] [timer.py:260:stop] epoch=0/micro_step=2260/global_step=2260, RunningAvgSamplesPerSec=4.749175184169939, CurrSamplesPerSec=4.757717180155169, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 0, Total Step: 2261, Loss: 0.15214814245700836 +[2024-01-22 05:09:59,753] [INFO] [logging.py:96:log_dist] [Rank 0] step=2270, skipped=0, lr=[1.5487959456035745e-05, 1.5487959456035745e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 05:09:59,754] [INFO] [timer.py:260:stop] epoch=0/micro_step=2270/global_step=2270, RunningAvgSamplesPerSec=4.749151385963464, CurrSamplesPerSec=4.719190839509459, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 0, Total Step: 2271, Loss: 0.21529079973697662 +[2024-01-22 05:11:07,071] [INFO] [logging.py:96:log_dist] [Rank 0] step=2280, skipped=0, lr=[1.5449769278431145e-05, 1.5449769278431145e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 05:11:07,071] [INFO] [timer.py:260:stop] epoch=0/micro_step=2280/global_step=2280, RunningAvgSamplesPerSec=4.749174609998285, CurrSamplesPerSec=4.764109342810126, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 0, Total Step: 2281, Loss: 0.13264992833137512 +[2024-01-22 05:12:14,413] [INFO] [logging.py:96:log_dist] [Rank 0] step=2290, skipped=0, lr=[1.5411465699811293e-05, 1.5411465699811293e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 05:12:14,414] [INFO] [timer.py:260:stop] epoch=0/micro_step=2290/global_step=2290, RunningAvgSamplesPerSec=4.749189848588941, CurrSamplesPerSec=4.751715746433648, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 0, Total Step: 2291, Loss: 0.12293980270624161 +[2024-01-22 05:13:21,854] [INFO] [logging.py:96:log_dist] [Rank 0] step=2300, skipped=0, lr=[1.5373049517212633e-05, 1.5373049517212633e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 05:13:21,855] [INFO] [timer.py:260:stop] epoch=0/micro_step=2300/global_step=2300, RunningAvgSamplesPerSec=4.749174864475143, CurrSamplesPerSec=4.719794072552853, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 0, Total Step: 2301, Loss: 0.03802747651934624 +[2024-01-22 05:14:29,299] [INFO] [logging.py:96:log_dist] [Rank 0] step=2310, skipped=0, lr=[1.5334521530014713e-05, 1.5334521530014713e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 05:14:29,300] [INFO] [timer.py:260:stop] epoch=0/micro_step=2310/global_step=2310, RunningAvgSamplesPerSec=4.749158777781155, CurrSamplesPerSec=4.7418206977794535, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 0, Total Step: 2311, Loss: 0.07587782293558121 +[2024-01-22 05:15:36,749] [INFO] [logging.py:96:log_dist] [Rank 0] step=2320, skipped=0, lr=[1.529588253992356e-05, 1.529588253992356e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 05:15:36,749] [INFO] [timer.py:260:stop] epoch=0/micro_step=2320/global_step=2320, RunningAvgSamplesPerSec=4.749141454620919, CurrSamplesPerSec=4.757172164521266, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 0, Total Step: 2321, Loss: 0.03802649676799774 +***** Evaluating perplexity, Epoch 1/3 ***** +ppl: 1.0078471899032593 +eval loss: 0.007816458120942116 +Beginning of Epoch 2/3, Total Micro Batches 2329 +Epoch: 1, Total Step: 2330, Loss: 0.05883161351084709 +[2024-01-22 05:16:40,011] [INFO] [logging.py:96:log_dist] [Rank 0] step=2330, skipped=0, lr=[1.5257133350954987e-05, 1.5257133350954987e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 05:16:40,012] [INFO] [timer.py:260:stop] epoch=1/micro_step=1/global_step=2330, RunningAvgSamplesPerSec=4.750566192943663, CurrSamplesPerSec=4.765055497753655, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 1, Total Step: 2340, Loss: 0.02371196821331978 +[2024-01-22 05:17:47,274] [INFO] [logging.py:96:log_dist] [Rank 0] step=2340, skipped=0, lr=[1.5218274769417875e-05, 1.5218274769417875e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 05:17:47,275] [INFO] [timer.py:260:stop] epoch=1/micro_step=11/global_step=2340, RunningAvgSamplesPerSec=4.750599420904488, CurrSamplesPerSec=4.770071280045525, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 1, Total Step: 2350, Loss: 0.1107647642493248 +[2024-01-22 05:18:54,500] [INFO] [logging.py:96:log_dist] [Rank 0] step=2350, skipped=0, lr=[1.5179307603897394e-05, 1.5179307603897394e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 05:18:54,501] [INFO] [timer.py:260:stop] epoch=1/micro_step=21/global_step=2350, RunningAvgSamplesPerSec=4.750643452660038, CurrSamplesPerSec=4.761961508635663, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 1, Total Step: 2360, Loss: 0.042540181428194046 +[2024-01-22 05:20:01,832] [INFO] [logging.py:96:log_dist] [Rank 0] step=2360, skipped=0, lr=[1.5140232665238171e-05, 1.5140232665238171e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 05:20:01,832] [INFO] [timer.py:260:stop] epoch=1/micro_step=31/global_step=2360, RunningAvgSamplesPerSec=4.750655446479668, CurrSamplesPerSec=4.7567665192252, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 1, Total Step: 2370, Loss: 0.13792255520820618 +[2024-01-22 05:21:09,222] [INFO] [logging.py:96:log_dist] [Rank 0] step=2370, skipped=0, lr=[1.5101050766527414e-05, 1.5101050766527414e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 05:21:09,222] [INFO] [timer.py:260:stop] epoch=1/micro_step=41/global_step=2370, RunningAvgSamplesPerSec=4.750649955807692, CurrSamplesPerSec=4.722414739390739, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 1, Total Step: 2380, Loss: 0.14344920217990875 +[2024-01-22 05:22:16,670] [INFO] [logging.py:96:log_dist] [Rank 0] step=2380, skipped=0, lr=[1.5061762723078007e-05, 1.5061762723078007e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 05:22:16,671] [INFO] [timer.py:260:stop] epoch=1/micro_step=51/global_step=2380, RunningAvgSamplesPerSec=4.750627073568524, CurrSamplesPerSec=4.736273563774481, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 1, Total Step: 2390, Loss: 0.1402510702610016 +[2024-01-22 05:23:24,162] [INFO] [logging.py:96:log_dist] [Rank 0] step=2390, skipped=0, lr=[1.5022369352411535e-05, 1.5022369352411535e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 05:23:24,162] [INFO] [timer.py:260:stop] epoch=1/micro_step=61/global_step=2390, RunningAvgSamplesPerSec=4.750591720764389, CurrSamplesPerSec=4.745647548417155, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 1, Total Step: 2400, Loss: 0.19211484491825104 +[2024-01-22 05:24:31,595] [INFO] [logging.py:96:log_dist] [Rank 0] step=2400, skipped=0, lr=[1.498287147424127e-05, 1.498287147424127e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 05:24:31,595] [INFO] [timer.py:260:stop] epoch=1/micro_step=71/global_step=2400, RunningAvgSamplesPerSec=4.750573764559641, CurrSamplesPerSec=4.742520051461207, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 1, Total Step: 2410, Loss: 0.02481241151690483 +[2024-01-22 05:25:39,082] [INFO] [logging.py:96:log_dist] [Rank 0] step=2410, skipped=0, lr=[1.4943269910455127e-05, 1.4943269910455127e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 05:25:39,082] [INFO] [timer.py:260:stop] epoch=1/micro_step=81/global_step=2410, RunningAvgSamplesPerSec=4.7505404360046, CurrSamplesPerSec=4.749113382985325, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 1, Total Step: 2420, Loss: 0.05591238662600517 +[2024-01-22 05:26:46,423] [INFO] [logging.py:96:log_dist] [Rank 0] step=2420, skipped=0, lr=[1.4903565485098547e-05, 1.4903565485098547e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 05:26:46,423] [INFO] [timer.py:260:stop] epoch=1/micro_step=91/global_step=2420, RunningAvgSamplesPerSec=4.750549745344085, CurrSamplesPerSec=4.757376699376696, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 1, Total Step: 2430, Loss: 0.03262871876358986 +[2024-01-22 05:27:53,788] [INFO] [logging.py:96:log_dist] [Rank 0] step=2430, skipped=0, lr=[1.4863759024357358e-05, 1.4863759024357358e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 05:27:53,788] [INFO] [timer.py:260:stop] epoch=1/micro_step=101/global_step=2430, RunningAvgSamplesPerSec=4.750552041443237, CurrSamplesPerSec=4.760690488446174, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 1, Total Step: 2440, Loss: 0.05730247497558594 +[2024-01-22 05:29:01,024] [INFO] [logging.py:96:log_dist] [Rank 0] step=2440, skipped=0, lr=[1.4823851356540584e-05, 1.4823851356540584e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 05:29:01,024] [INFO] [timer.py:260:stop] epoch=1/micro_step=111/global_step=2440, RunningAvgSamplesPerSec=4.750591663981689, CurrSamplesPerSec=4.749374533020689, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 1, Total Step: 2450, Loss: 0.09343577921390533 +[2024-01-22 05:30:08,455] [INFO] [logging.py:96:log_dist] [Rank 0] step=2450, skipped=0, lr=[1.4783843312063204e-05, 1.4783843312063204e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 05:30:08,455] [INFO] [timer.py:260:stop] epoch=1/micro_step=121/global_step=2450, RunningAvgSamplesPerSec=4.750574835531434, CurrSamplesPerSec=4.747327443203979, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 1, Total Step: 2460, Loss: 0.02385859191417694 +[2024-01-22 05:31:15,805] [INFO] [logging.py:96:log_dist] [Rank 0] step=2460, skipped=0, lr=[1.4743735723428873e-05, 1.4743735723428873e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 05:31:15,805] [INFO] [timer.py:260:stop] epoch=1/micro_step=131/global_step=2460, RunningAvgSamplesPerSec=4.750581122331094, CurrSamplesPerSec=4.7631047394723565, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 1, Total Step: 2470, Loss: 0.04971740022301674 +[2024-01-22 05:32:23,221] [INFO] [logging.py:96:log_dist] [Rank 0] step=2470, skipped=0, lr=[1.470352942521261e-05, 1.470352942521261e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 05:32:23,221] [INFO] [timer.py:260:stop] epoch=1/micro_step=141/global_step=2470, RunningAvgSamplesPerSec=4.750568678203278, CurrSamplesPerSec=4.728215247839892, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 1, Total Step: 2480, Loss: 0.14900454878807068 +[2024-01-22 05:33:30,730] [INFO] [logging.py:96:log_dist] [Rank 0] step=2480, skipped=0, lr=[1.4663225254043416e-05, 1.4663225254043416e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 05:33:30,730] [INFO] [timer.py:260:stop] epoch=1/micro_step=151/global_step=2480, RunningAvgSamplesPerSec=4.750529706707258, CurrSamplesPerSec=4.75858791281901, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 1, Total Step: 2490, Loss: 0.17574326694011688 +[2024-01-22 05:34:38,078] [INFO] [logging.py:96:log_dist] [Rank 0] step=2490, skipped=0, lr=[1.462282404858687e-05, 1.462282404858687e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 05:34:38,078] [INFO] [timer.py:260:stop] epoch=1/micro_step=161/global_step=2490, RunningAvgSamplesPerSec=4.750536928322948, CurrSamplesPerSec=4.7529902257503185, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 1, Total Step: 2500, Loss: 0.040888771414756775 +[2024-01-22 05:35:45,522] [INFO] [logging.py:96:log_dist] [Rank 0] step=2500, skipped=0, lr=[1.4582326649527692e-05, 1.4582326649527692e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 05:35:45,523] [INFO] [timer.py:260:stop] epoch=1/micro_step=171/global_step=2500, RunningAvgSamplesPerSec=4.750516725567758, CurrSamplesPerSec=4.7342469340629085, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 1, Total Step: 2510, Loss: 0.03205130621790886 +[2024-01-22 05:36:52,904] [INFO] [logging.py:96:log_dist] [Rank 0] step=2510, skipped=0, lr=[1.4541733899552221e-05, 1.4541733899552221e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 05:36:52,904] [INFO] [timer.py:260:stop] epoch=1/micro_step=181/global_step=2510, RunningAvgSamplesPerSec=4.7505145512587115, CurrSamplesPerSec=4.7636619359790195, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 1, Total Step: 2520, Loss: 0.1001964807510376 +[2024-01-22 05:38:00,197] [INFO] [logging.py:96:log_dist] [Rank 0] step=2520, skipped=0, lr=[1.4501046643330913e-05, 1.4501046643330913e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 05:38:00,198] [INFO] [timer.py:260:stop] epoch=1/micro_step=191/global_step=2520, RunningAvgSamplesPerSec=4.750537011700505, CurrSamplesPerSec=4.756069699180763, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 1, Total Step: 2530, Loss: 0.09946943074464798 +[2024-01-22 05:39:07,577] [INFO] [logging.py:96:log_dist] [Rank 0] step=2530, skipped=0, lr=[1.4460265727500736e-05, 1.4460265727500736e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 05:39:07,577] [INFO] [timer.py:260:stop] epoch=1/micro_step=201/global_step=2530, RunningAvgSamplesPerSec=4.750535297975468, CurrSamplesPerSec=4.753782113498168, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 1, Total Step: 2540, Loss: 0.1821090131998062 +[2024-01-22 05:40:14,874] [INFO] [logging.py:96:log_dist] [Rank 0] step=2540, skipped=0, lr=[1.441939200064757e-05, 1.441939200064757e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 05:40:14,875] [INFO] [timer.py:260:stop] epoch=1/micro_step=211/global_step=2540, RunningAvgSamplesPerSec=4.750556174734808, CurrSamplesPerSec=4.776590197816889, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 1, Total Step: 2550, Loss: 0.05891280993819237 +[2024-01-22 05:41:22,292] [INFO] [logging.py:96:log_dist] [Rank 0] step=2550, skipped=0, lr=[1.4378426313288546e-05, 1.4378426313288546e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 05:41:22,292] [INFO] [timer.py:260:stop] epoch=1/micro_step=221/global_step=2550, RunningAvgSamplesPerSec=4.7505440698194485, CurrSamplesPerSec=4.745086337851341, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 1, Total Step: 2560, Loss: 0.0598636120557785 +[2024-01-22 05:42:29,661] [INFO] [logging.py:96:log_dist] [Rank 0] step=2560, skipped=0, lr=[1.4337369517854344e-05, 1.4337369517854344e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 05:42:29,661] [INFO] [timer.py:260:stop] epoch=1/micro_step=231/global_step=2560, RunningAvgSamplesPerSec=4.750545194655498, CurrSamplesPerSec=4.757633362385802, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 1, Total Step: 2570, Loss: 0.1297481507062912 +[2024-01-22 05:43:37,139] [INFO] [logging.py:96:log_dist] [Rank 0] step=2570, skipped=0, lr=[1.4296222468671458e-05, 1.4296222468671458e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 05:43:37,140] [INFO] [timer.py:260:stop] epoch=1/micro_step=241/global_step=2570, RunningAvgSamplesPerSec=4.750516376990636, CurrSamplesPerSec=4.723919270787963, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 1, Total Step: 2580, Loss: 0.02763376012444496 +[2024-01-22 05:44:44,571] [INFO] [logging.py:96:log_dist] [Rank 0] step=2580, skipped=0, lr=[1.425498602194442e-05, 1.425498602194442e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 05:44:44,572] [INFO] [timer.py:260:stop] epoch=1/micro_step=251/global_step=2580, RunningAvgSamplesPerSec=4.750500282568808, CurrSamplesPerSec=4.747145430742124, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 1, Total Step: 2590, Loss: 0.20636805891990662 +[2024-01-22 05:45:51,834] [INFO] [logging.py:96:log_dist] [Rank 0] step=2590, skipped=0, lr=[1.4213661035737984e-05, 1.4213661035737984e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 05:45:51,834] [INFO] [timer.py:260:stop] epoch=1/micro_step=261/global_step=2590, RunningAvgSamplesPerSec=4.750530594108702, CurrSamplesPerSec=4.77147707622313, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 1, Total Step: 2600, Loss: 0.2417963296175003 +[2024-01-22 05:46:59,211] [INFO] [logging.py:96:log_dist] [Rank 0] step=2600, skipped=0, lr=[1.4172248369959266e-05, 1.4172248369959266e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 05:46:59,212] [INFO] [timer.py:260:stop] epoch=1/micro_step=271/global_step=2600, RunningAvgSamplesPerSec=4.750529456788204, CurrSamplesPerSec=4.769518006852132, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 1, Total Step: 2610, Loss: 0.2156227082014084 +[2024-01-22 05:48:06,477] [INFO] [logging.py:96:log_dist] [Rank 0] step=2610, skipped=0, lr=[1.4130748886339851e-05, 1.4130748886339851e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 05:48:06,478] [INFO] [timer.py:260:stop] epoch=1/micro_step=281/global_step=2610, RunningAvgSamplesPerSec=4.750558365037795, CurrSamplesPerSec=4.753405832912301, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 1, Total Step: 2620, Loss: 0.13724471628665924 +[2024-01-22 05:49:13,775] [INFO] [logging.py:96:log_dist] [Rank 0] step=2620, skipped=0, lr=[1.408916344841788e-05, 1.408916344841788e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 05:49:13,775] [INFO] [timer.py:260:stop] epoch=1/micro_step=291/global_step=2620, RunningAvgSamplesPerSec=4.750578654214435, CurrSamplesPerSec=4.7406372595247905, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 1, Total Step: 2630, Loss: 0.09461900591850281 +[2024-01-22 05:50:21,069] [INFO] [logging.py:96:log_dist] [Rank 0] step=2630, skipped=0, lr=[1.4047492921520046e-05, 1.4047492921520046e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 05:50:21,070] [INFO] [timer.py:260:stop] epoch=1/micro_step=301/global_step=2630, RunningAvgSamplesPerSec=4.750599511517636, CurrSamplesPerSec=4.767874369818752, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 1, Total Step: 2640, Loss: 0.05895576253533363 +[2024-01-22 05:51:28,446] [INFO] [logging.py:96:log_dist] [Rank 0] step=2640, skipped=0, lr=[1.400573817274362e-05, 1.400573817274362e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 05:51:28,447] [INFO] [timer.py:260:stop] epoch=1/micro_step=311/global_step=2640, RunningAvgSamplesPerSec=4.750598234053635, CurrSamplesPerSec=4.746418863242791, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 1, Total Step: 2650, Loss: 0.04432009160518646 +[2024-01-22 05:52:35,867] [INFO] [logging.py:96:log_dist] [Rank 0] step=2650, skipped=0, lr=[1.3963900070938398e-05, 1.3963900070938398e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 05:52:35,868] [INFO] [timer.py:260:stop] epoch=1/micro_step=321/global_step=2650, RunningAvgSamplesPerSec=4.750585241514559, CurrSamplesPerSec=4.752918187833393, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 1, Total Step: 2660, Loss: 0.11739644408226013 +[2024-01-22 05:53:43,255] [INFO] [logging.py:96:log_dist] [Rank 0] step=2660, skipped=0, lr=[1.3921979486688613e-05, 1.3921979486688613e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 05:53:43,256] [INFO] [timer.py:260:stop] epoch=1/micro_step=331/global_step=2660, RunningAvgSamplesPerSec=4.750581159846644, CurrSamplesPerSec=4.737226583950038, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 1, Total Step: 2670, Loss: 0.13375094532966614 +[2024-01-22 05:54:50,539] [INFO] [logging.py:96:log_dist] [Rank 0] step=2670, skipped=0, lr=[1.3879977292294825e-05, 1.3879977292294825e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 05:54:50,540] [INFO] [timer.py:260:stop] epoch=1/micro_step=341/global_step=2670, RunningAvgSamplesPerSec=4.750604528859358, CurrSamplesPerSec=4.7725576772931895, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 1, Total Step: 2680, Loss: 0.03625553473830223 +[2024-01-22 05:55:57,994] [INFO] [logging.py:96:log_dist] [Rank 0] step=2680, skipped=0, lr=[1.3837894361755782e-05, 1.3837894361755782e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 05:55:57,994] [INFO] [timer.py:260:stop] epoch=1/micro_step=351/global_step=2680, RunningAvgSamplesPerSec=4.7505827232704885, CurrSamplesPerSec=4.726962175949582, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 1, Total Step: 2690, Loss: 0.018074439838528633 +[2024-01-22 05:57:05,310] [INFO] [logging.py:96:log_dist] [Rank 0] step=2690, skipped=0, lr=[1.3795731570750208e-05, 1.3795731570750208e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 05:57:05,311] [INFO] [timer.py:260:stop] epoch=1/micro_step=361/global_step=2690, RunningAvgSamplesPerSec=4.750597492652863, CurrSamplesPerSec=4.748686261243515, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 1, Total Step: 2700, Loss: 0.12432271242141724 +[2024-01-22 05:58:12,514] [INFO] [logging.py:96:log_dist] [Rank 0] step=2700, skipped=0, lr=[1.3753489796618608e-05, 1.3753489796618608e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 05:58:12,515] [INFO] [timer.py:260:stop] epoch=1/micro_step=371/global_step=2700, RunningAvgSamplesPerSec=4.7506432213481755, CurrSamplesPerSec=4.767749038379324, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 1, Total Step: 2710, Loss: 0.03336840867996216 +[2024-01-22 05:59:19,887] [INFO] [logging.py:96:log_dist] [Rank 0] step=2710, skipped=0, lr=[1.3711169918344995e-05, 1.3711169918344995e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 05:59:19,888] [INFO] [timer.py:260:stop] epoch=1/micro_step=381/global_step=2710, RunningAvgSamplesPerSec=4.750642831196893, CurrSamplesPerSec=4.760260437682536, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 1, Total Step: 2720, Loss: 0.08031318336725235 +[2024-01-22 06:00:27,320] [INFO] [logging.py:96:log_dist] [Rank 0] step=2720, skipped=0, lr=[1.3668772816538604e-05, 1.3668772816538604e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 06:00:27,321] [INFO] [timer.py:260:stop] epoch=1/micro_step=391/global_step=2720, RunningAvgSamplesPerSec=4.750626766993938, CurrSamplesPerSec=4.754363571507767, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 1, Total Step: 2730, Loss: 0.06371303647756577 +[2024-01-22 06:01:34,701] [INFO] [logging.py:96:log_dist] [Rank 0] step=2730, skipped=0, lr=[1.362629937341557e-05, 1.362629937341557e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 06:01:34,701] [INFO] [timer.py:260:stop] epoch=1/micro_step=401/global_step=2730, RunningAvgSamplesPerSec=4.750624507676058, CurrSamplesPerSec=4.754594476564026, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 1, Total Step: 2740, Loss: 0.048484329134225845 +[2024-01-22 06:02:42,054] [INFO] [logging.py:96:log_dist] [Rank 0] step=2740, skipped=0, lr=[1.3583750472780567e-05, 1.3583750472780567e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 06:02:42,055] [INFO] [timer.py:260:stop] epoch=1/micro_step=411/global_step=2740, RunningAvgSamplesPerSec=4.7506293507245685, CurrSamplesPerSec=4.761961846538552, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 1, Total Step: 2750, Loss: 0.1041703149676323 +[2024-01-22 06:03:49,472] [INFO] [logging.py:96:log_dist] [Rank 0] step=2750, skipped=0, lr=[1.3541127000008427e-05, 1.3541127000008427e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 06:03:49,473] [INFO] [timer.py:260:stop] epoch=1/micro_step=421/global_step=2750, RunningAvgSamplesPerSec=4.750617560097558, CurrSamplesPerSec=4.75359236635652, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 1, Total Step: 2760, Loss: 0.10369225591421127 +[2024-01-22 06:04:56,853] [INFO] [logging.py:96:log_dist] [Rank 0] step=2760, skipped=0, lr=[1.34984298420257e-05, 1.34984298420257e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 06:04:56,854] [INFO] [timer.py:260:stop] epoch=1/micro_step=431/global_step=2760, RunningAvgSamplesPerSec=4.750615223216478, CurrSamplesPerSec=4.729919825409367, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 1, Total Step: 2770, Loss: 0.04748295247554779 +[2024-01-22 06:06:04,128] [INFO] [logging.py:96:log_dist] [Rank 0] step=2770, skipped=0, lr=[1.3455659887292212e-05, 1.3455659887292212e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 06:06:04,128] [INFO] [timer.py:260:stop] epoch=1/micro_step=441/global_step=2770, RunningAvgSamplesPerSec=4.75064001825295, CurrSamplesPerSec=4.737549136314501, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 1, Total Step: 2780, Loss: 0.06050010025501251 +[2024-01-22 06:07:11,593] [INFO] [logging.py:96:log_dist] [Rank 0] step=2780, skipped=0, lr=[1.3412818025782574e-05, 1.3412818025782574e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 06:07:11,594] [INFO] [timer.py:260:stop] epoch=1/micro_step=451/global_step=2780, RunningAvgSamplesPerSec=4.750616266577267, CurrSamplesPerSec=4.751300434746747, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 1, Total Step: 2790, Loss: 0.042863089591264725 +[2024-01-22 06:08:18,991] [INFO] [logging.py:96:log_dist] [Rank 0] step=2790, skipped=0, lr=[1.3369905148967658e-05, 1.3369905148967658e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 06:08:18,991] [INFO] [timer.py:260:stop] epoch=1/micro_step=461/global_step=2790, RunningAvgSamplesPerSec=4.750609797964272, CurrSamplesPerSec=4.74254267415456, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 1, Total Step: 2800, Loss: 0.02812485583126545 +[2024-01-22 06:09:26,415] [INFO] [logging.py:96:log_dist] [Rank 0] step=2800, skipped=0, lr=[1.3326922149796064e-05, 1.3326922149796064e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 06:09:26,416] [INFO] [timer.py:260:stop] epoch=1/micro_step=471/global_step=2800, RunningAvgSamplesPerSec=4.750596611117128, CurrSamplesPerSec=4.750547205265091, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 1, Total Step: 2810, Loss: 0.21685662865638733 +[2024-01-22 06:10:33,825] [INFO] [logging.py:96:log_dist] [Rank 0] step=2810, skipped=0, lr=[1.3283869922675507e-05, 1.3283869922675507e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 06:10:33,826] [INFO] [timer.py:260:stop] epoch=1/micro_step=481/global_step=2810, RunningAvgSamplesPerSec=4.750587036140281, CurrSamplesPerSec=4.72282584733529, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 1, Total Step: 2820, Loss: 0.04834204539656639 +[2024-01-22 06:11:41,316] [INFO] [logging.py:96:log_dist] [Rank 0] step=2820, skipped=0, lr=[1.3240749363454242e-05, 1.3240749363454242e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 06:11:41,316] [INFO] [timer.py:260:stop] epoch=1/micro_step=491/global_step=2820, RunningAvgSamplesPerSec=4.750557594916527, CurrSamplesPerSec=4.740839705158719, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 1, Total Step: 2830, Loss: 0.12824319303035736 +[2024-01-22 06:12:48,625] [INFO] [logging.py:96:log_dist] [Rank 0] step=2830, skipped=0, lr=[1.3197561369402397e-05, 1.3197561369402397e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 06:12:48,626] [INFO] [timer.py:260:stop] epoch=1/micro_step=501/global_step=2830, RunningAvgSamplesPerSec=4.750573145423976, CurrSamplesPerSec=4.76951766787604, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 1, Total Step: 2840, Loss: 0.04162723571062088 +[2024-01-22 06:13:55,933] [INFO] [logging.py:96:log_dist] [Rank 0] step=2840, skipped=0, lr=[1.3154306839193315e-05, 1.3154306839193315e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 06:13:55,934] [INFO] [timer.py:260:stop] epoch=1/micro_step=511/global_step=2840, RunningAvgSamplesPerSec=4.750589244747279, CurrSamplesPerSec=4.756681554909043, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 1, Total Step: 2850, Loss: 0.028961317613720894 +[2024-01-22 06:15:03,317] [INFO] [logging.py:96:log_dist] [Rank 0] step=2850, skipped=0, lr=[1.3110986672884854e-05, 1.3110986672884854e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 06:15:03,317] [INFO] [timer.py:260:stop] epoch=1/micro_step=521/global_step=2850, RunningAvgSamplesPerSec=4.750586419857964, CurrSamplesPerSec=4.7697741170588905, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 1, Total Step: 2860, Loss: 0.14956451952457428 +[2024-01-22 06:16:10,766] [INFO] [logging.py:96:log_dist] [Rank 0] step=2860, skipped=0, lr=[1.306760177190064e-05, 1.306760177190064e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 06:16:10,767] [INFO] [timer.py:260:stop] epoch=1/micro_step=531/global_step=2860, RunningAvgSamplesPerSec=4.7505674130388105, CurrSamplesPerSec=4.767762756769791, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 1, Total Step: 2870, Loss: 0.13183489441871643 +[2024-01-22 06:17:18,167] [INFO] [logging.py:96:log_dist] [Rank 0] step=2870, skipped=0, lr=[1.3024153039011345e-05, 1.3024153039011345e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 06:17:18,168] [INFO] [timer.py:260:stop] epoch=1/micro_step=541/global_step=2870, RunningAvgSamplesPerSec=4.750560423186893, CurrSamplesPerSec=4.746639596711996, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 1, Total Step: 2880, Loss: 0.09314028173685074 +[2024-01-22 06:18:25,625] [INFO] [logging.py:96:log_dist] [Rank 0] step=2880, skipped=0, lr=[1.2980641378315866e-05, 1.2980641378315866e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 06:18:25,625] [INFO] [timer.py:260:stop] epoch=1/micro_step=551/global_step=2880, RunningAvgSamplesPerSec=4.75053960521476, CurrSamplesPerSec=4.750994001795086, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 1, Total Step: 2890, Loss: 0.028248688206076622 +[2024-01-22 06:19:33,007] [INFO] [logging.py:96:log_dist] [Rank 0] step=2890, skipped=0, lr=[1.2937067695222535e-05, 1.2937067695222535e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 06:19:33,007] [INFO] [timer.py:260:stop] epoch=1/micro_step=561/global_step=2890, RunningAvgSamplesPerSec=4.750537581166582, CurrSamplesPerSec=4.747296379203844, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 1, Total Step: 2900, Loss: 0.04733101278543472 +[2024-01-22 06:20:40,336] [INFO] [logging.py:96:log_dist] [Rank 0] step=2900, skipped=0, lr=[1.2893432896430267e-05, 1.2893432896430267e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 06:20:40,336] [INFO] [timer.py:260:stop] epoch=1/micro_step=571/global_step=2900, RunningAvgSamplesPerSec=4.750548398392207, CurrSamplesPerSec=4.754260168345174, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 1, Total Step: 2910, Loss: 0.05448748916387558 +[2024-01-22 06:21:47,748] [INFO] [logging.py:96:log_dist] [Rank 0] step=2910, skipped=0, lr=[1.2849737889909699e-05, 1.2849737889909699e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 06:21:47,749] [INFO] [timer.py:260:stop] epoch=1/micro_step=581/global_step=2910, RunningAvgSamplesPerSec=4.75053894091574, CurrSamplesPerSec=4.7390335435121935, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 1, Total Step: 2920, Loss: 0.22973594069480896 +[2024-01-22 06:22:55,087] [INFO] [logging.py:96:log_dist] [Rank 0] step=2920, skipped=0, lr=[1.2805983584884296e-05, 1.2805983584884296e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 06:22:55,087] [INFO] [timer.py:260:stop] epoch=1/micro_step=591/global_step=2920, RunningAvgSamplesPerSec=4.750547223819537, CurrSamplesPerSec=4.7656881122829615, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 1, Total Step: 2930, Loss: 0.08731473237276077 +[2024-01-22 06:24:02,503] [INFO] [logging.py:96:log_dist] [Rank 0] step=2930, skipped=0, lr=[1.276217089181143e-05, 1.276217089181143e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 06:24:02,503] [INFO] [timer.py:260:stop] epoch=1/micro_step=601/global_step=2930, RunningAvgSamplesPerSec=4.7505368365027865, CurrSamplesPerSec=4.69769160305087, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 1, Total Step: 2940, Loss: 0.07433625310659409 +[2024-01-22 06:25:09,889] [INFO] [logging.py:96:log_dist] [Rank 0] step=2940, skipped=0, lr=[1.2718300722363431e-05, 1.2718300722363431e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 06:25:09,889] [INFO] [timer.py:260:stop] epoch=1/micro_step=611/global_step=2940, RunningAvgSamplesPerSec=4.750533781090718, CurrSamplesPerSec=4.736002822877659, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 1, Total Step: 2950, Loss: 0.1341237723827362 +[2024-01-22 06:26:17,175] [INFO] [logging.py:96:log_dist] [Rank 0] step=2950, skipped=0, lr=[1.2674373989408626e-05, 1.2674373989408626e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 06:26:17,176] [INFO] [timer.py:260:stop] epoch=1/micro_step=621/global_step=2950, RunningAvgSamplesPerSec=4.750554610275066, CurrSamplesPerSec=4.772341144609682, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 1, Total Step: 2960, Loss: 0.023333415389060974 +[2024-01-22 06:27:24,471] [INFO] [logging.py:96:log_dist] [Rank 0] step=2960, skipped=0, lr=[1.2630391606992337e-05, 1.2630391606992337e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 06:27:24,471] [INFO] [timer.py:260:stop] epoch=1/micro_step=631/global_step=2960, RunningAvgSamplesPerSec=4.750573007882977, CurrSamplesPerSec=4.7564949475118175, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 1, Total Step: 2970, Loss: 0.11731809377670288 +[2024-01-22 06:28:31,758] [INFO] [logging.py:96:log_dist] [Rank 0] step=2970, skipped=0, lr=[1.2586354490317862e-05, 1.2586354490317862e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 06:28:31,759] [INFO] [timer.py:260:stop] epoch=1/micro_step=641/global_step=2970, RunningAvgSamplesPerSec=4.750593200377433, CurrSamplesPerSec=4.746338128599862, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 1, Total Step: 2980, Loss: 0.09584458917379379 +[2024-01-22 06:29:39,153] [INFO] [logging.py:96:log_dist] [Rank 0] step=2980, skipped=0, lr=[1.2542263555727435e-05, 1.2542263555727435e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 06:29:39,154] [INFO] [timer.py:260:stop] epoch=1/micro_step=651/global_step=2980, RunningAvgSamplesPerSec=4.7505878481518975, CurrSamplesPerSec=4.758999943516501, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 1, Total Step: 2990, Loss: 0.06320009380578995 +[2024-01-22 06:30:46,454] [INFO] [logging.py:96:log_dist] [Rank 0] step=2990, skipped=0, lr=[1.249811972068315e-05, 1.249811972068315e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 06:30:46,454] [INFO] [timer.py:260:stop] epoch=1/micro_step=661/global_step=2990, RunningAvgSamplesPerSec=4.750604880010254, CurrSamplesPerSec=4.764626518595102, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 1, Total Step: 3000, Loss: 0.09870254993438721 +[2024-01-22 06:31:53,748] [INFO] [logging.py:96:log_dist] [Rank 0] step=3000, skipped=0, lr=[1.2453923903747875e-05, 1.2453923903747875e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 06:31:53,749] [INFO] [timer.py:260:stop] epoch=1/micro_step=671/global_step=3000, RunningAvgSamplesPerSec=4.75062306491683, CurrSamplesPerSec=4.735758514144904, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 1, Total Step: 3010, Loss: 0.03532011806964874 +[2024-01-22 06:33:01,173] [INFO] [logging.py:96:log_dist] [Rank 0] step=3010, skipped=0, lr=[1.2409677024566145e-05, 1.2409677024566145e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 06:33:01,173] [INFO] [timer.py:260:stop] epoch=1/micro_step=681/global_step=3010, RunningAvgSamplesPerSec=4.750610731934076, CurrSamplesPerSec=4.7556940674210235, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 1, Total Step: 3020, Loss: 0.0272184107452631 +[2024-01-22 06:34:08,536] [INFO] [logging.py:96:log_dist] [Rank 0] step=3020, skipped=0, lr=[1.2365380003845012e-05, 1.2365380003845012e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 06:34:08,537] [INFO] [timer.py:260:stop] epoch=1/micro_step=691/global_step=3020, RunningAvgSamplesPerSec=4.750612697878389, CurrSamplesPerSec=4.732787385677466, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 1, Total Step: 3030, Loss: 0.14704132080078125 +[2024-01-22 06:35:15,886] [INFO] [logging.py:96:log_dist] [Rank 0] step=3030, skipped=0, lr=[1.2321033763334896e-05, 1.2321033763334896e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 06:35:15,886] [INFO] [timer.py:260:stop] epoch=1/micro_step=701/global_step=3030, RunningAvgSamplesPerSec=4.750617818317599, CurrSamplesPerSec=4.742903828593883, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 1, Total Step: 3040, Loss: 0.0654720813035965 +[2024-01-22 06:36:23,221] [INFO] [logging.py:96:log_dist] [Rank 0] step=3040, skipped=0, lr=[1.2276639225810402e-05, 1.2276639225810402e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 06:36:23,222] [INFO] [timer.py:260:stop] epoch=1/micro_step=711/global_step=3040, RunningAvgSamplesPerSec=4.750626369968707, CurrSamplesPerSec=4.7414004140864865, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 1, Total Step: 3050, Loss: 0.078664131462574 +[2024-01-22 06:37:30,528] [INFO] [logging.py:96:log_dist] [Rank 0] step=3050, skipped=0, lr=[1.2232197315051123e-05, 1.2232197315051123e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 06:37:30,528] [INFO] [timer.py:260:stop] epoch=1/micro_step=721/global_step=3050, RunningAvgSamplesPerSec=4.75064134362768, CurrSamplesPerSec=4.736222254326101, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 1, Total Step: 3060, Loss: 0.18412095308303833 +[2024-01-22 06:38:37,998] [INFO] [logging.py:96:log_dist] [Rank 0] step=3060, skipped=0, lr=[1.2187708955822405e-05, 1.2187708955822405e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 06:38:37,998] [INFO] [timer.py:260:stop] epoch=1/micro_step=731/global_step=3060, RunningAvgSamplesPerSec=4.750618717032219, CurrSamplesPerSec=4.7461677724095175, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 1, Total Step: 3070, Loss: 0.11269430816173553 +[2024-01-22 06:39:45,378] [INFO] [logging.py:96:log_dist] [Rank 0] step=3070, skipped=0, lr=[1.2143175073856124e-05, 1.2143175073856124e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 06:39:45,378] [INFO] [timer.py:260:stop] epoch=1/micro_step=741/global_step=3070, RunningAvgSamplesPerSec=4.750616868342895, CurrSamplesPerSec=4.765278983191119, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 1, Total Step: 3080, Loss: 0.09143193811178207 +[2024-01-22 06:40:52,843] [INFO] [logging.py:96:log_dist] [Rank 0] step=3080, skipped=0, lr=[1.20985965958314e-05, 1.20985965958314e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 06:40:52,843] [INFO] [timer.py:260:stop] epoch=1/micro_step=751/global_step=3080, RunningAvgSamplesPerSec=4.7505956032540855, CurrSamplesPerSec=4.744812072459902, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 1, Total Step: 3090, Loss: 0.045307062566280365 +[2024-01-22 06:42:00,123] [INFO] [logging.py:96:log_dist] [Rank 0] step=3090, skipped=0, lr=[1.2053974449355333e-05, 1.2053974449355333e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 06:42:00,123] [INFO] [timer.py:260:stop] epoch=1/micro_step=761/global_step=3090, RunningAvgSamplesPerSec=4.750616625755687, CurrSamplesPerSec=4.769428349355063, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 1, Total Step: 3100, Loss: 0.06055546924471855 +[2024-01-22 06:43:07,628] [INFO] [logging.py:96:log_dist] [Rank 0] step=3100, skipped=0, lr=[1.2009309562943692e-05, 1.2009309562943692e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 06:43:07,629] [INFO] [timer.py:260:stop] epoch=1/micro_step=771/global_step=3100, RunningAvgSamplesPerSec=4.750586286984892, CurrSamplesPerSec=4.700919279590663, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 1, Total Step: 3110, Loss: 0.06281973421573639 +[2024-01-22 06:44:15,058] [INFO] [logging.py:96:log_dist] [Rank 0] step=3110, skipped=0, lr=[1.1964602866001596e-05, 1.1964602866001596e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 06:44:15,058] [INFO] [timer.py:260:stop] epoch=1/micro_step=781/global_step=3110, RunningAvgSamplesPerSec=4.75057334490692, CurrSamplesPerSec=4.7672589540971115, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 1, Total Step: 3120, Loss: 0.14632922410964966 +[2024-01-22 06:45:22,471] [INFO] [logging.py:96:log_dist] [Rank 0] step=3120, skipped=0, lr=[1.1919855288804174e-05, 1.1919855288804174e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 06:45:22,471] [INFO] [timer.py:260:stop] epoch=1/micro_step=791/global_step=3120, RunningAvgSamplesPerSec=4.750564121330251, CurrSamplesPerSec=4.7475449025884355, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 1, Total Step: 3130, Loss: 0.1331828236579895 +[2024-01-22 06:46:29,867] [INFO] [logging.py:96:log_dist] [Rank 0] step=3130, skipped=0, lr=[1.187506776247721e-05, 1.187506776247721e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 06:46:29,867] [INFO] [timer.py:260:stop] epoch=1/micro_step=801/global_step=3130, RunningAvgSamplesPerSec=4.7505588038978015, CurrSamplesPerSec=4.759487655991373, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 1, Total Step: 3140, Loss: 0.18538637459278107 +[2024-01-22 06:47:37,192] [INFO] [logging.py:96:log_dist] [Rank 0] step=3140, skipped=0, lr=[1.1830241218977762e-05, 1.1830241218977762e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 06:47:37,192] [INFO] [timer.py:260:stop] epoch=1/micro_step=811/global_step=3140, RunningAvgSamplesPerSec=4.750569395457978, CurrSamplesPerSec=4.7526850894463255, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 1, Total Step: 3150, Loss: 0.0731845572590828 +[2024-01-22 06:48:44,575] [INFO] [logging.py:96:log_dist] [Rank 0] step=3150, skipped=0, lr=[1.178537659107478e-05, 1.178537659107478e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 06:48:44,576] [INFO] [timer.py:260:stop] epoch=1/micro_step=821/global_step=3150, RunningAvgSamplesPerSec=4.750566942750646, CurrSamplesPerSec=4.761668395943367, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 1, Total Step: 3160, Loss: 0.026224959641695023 +[2024-01-22 06:49:51,976] [INFO] [logging.py:96:log_dist] [Rank 0] step=3160, skipped=0, lr=[1.1740474812329682e-05, 1.1740474812329682e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 06:49:51,976] [INFO] [timer.py:260:stop] epoch=1/micro_step=831/global_step=3160, RunningAvgSamplesPerSec=4.750560711647289, CurrSamplesPerSec=4.736757799213265, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 1, Total Step: 3170, Loss: 0.0836123451590538 +[2024-01-22 06:50:59,419] [INFO] [logging.py:96:log_dist] [Rank 0] step=3170, skipped=0, lr=[1.1695536817076936e-05, 1.1695536817076936e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 06:50:59,419] [INFO] [timer.py:260:stop] epoch=1/micro_step=841/global_step=3170, RunningAvgSamplesPerSec=4.75054499139042, CurrSamplesPerSec=4.751920485308804, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 1, Total Step: 3180, Loss: 0.04170098528265953 +[2024-01-22 06:52:06,819] [INFO] [logging.py:96:log_dist] [Rank 0] step=3180, skipped=0, lr=[1.1650563540404625e-05, 1.1650563540404625e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 06:52:06,820] [INFO] [timer.py:260:stop] epoch=1/micro_step=851/global_step=3180, RunningAvgSamplesPerSec=4.750538913496655, CurrSamplesPerSec=4.746396707100602, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 1, Total Step: 3190, Loss: 0.039577849209308624 +[2024-01-22 06:53:14,188] [INFO] [logging.py:96:log_dist] [Rank 0] step=3190, skipped=0, lr=[1.1605555918134978e-05, 1.1605555918134978e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 06:53:14,189] [INFO] [timer.py:260:stop] epoch=1/micro_step=861/global_step=3190, RunningAvgSamplesPerSec=4.750539785911587, CurrSamplesPerSec=4.7331208506248394, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 1, Total Step: 3200, Loss: 0.11221074312925339 +[2024-01-22 06:54:21,584] [INFO] [logging.py:96:log_dist] [Rank 0] step=3200, skipped=0, lr=[1.15605148868049e-05, 1.15605148868049e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 06:54:21,584] [INFO] [timer.py:260:stop] epoch=1/micro_step=871/global_step=3200, RunningAvgSamplesPerSec=4.750534730420839, CurrSamplesPerSec=4.728610873586365, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 1, Total Step: 3210, Loss: 0.06836682558059692 +[2024-01-22 06:55:28,994] [INFO] [logging.py:96:log_dist] [Rank 0] step=3210, skipped=0, lr=[1.151544138364649e-05, 1.151544138364649e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 06:55:28,995] [INFO] [timer.py:260:stop] epoch=1/micro_step=881/global_step=3210, RunningAvgSamplesPerSec=4.750526528497502, CurrSamplesPerSec=4.771114610177306, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 1, Total Step: 3220, Loss: 0.1425667256116867 +[2024-01-22 06:56:36,371] [INFO] [logging.py:96:log_dist] [Rank 0] step=3220, skipped=0, lr=[1.1470336346567523e-05, 1.1470336346567523e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 06:56:36,372] [INFO] [timer.py:260:stop] epoch=1/micro_step=891/global_step=3220, RunningAvgSamplesPerSec=4.750525809207499, CurrSamplesPerSec=4.750043167659296, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 1, Total Step: 3230, Loss: 0.09923180937767029 +[2024-01-22 06:57:43,942] [INFO] [logging.py:96:log_dist] [Rank 0] step=3230, skipped=0, lr=[1.1425200714131957e-05, 1.1425200714131957e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 06:57:43,942] [INFO] [timer.py:260:stop] epoch=1/micro_step=901/global_step=3230, RunningAvgSamplesPerSec=4.750482721788663, CurrSamplesPerSec=4.692798090290446, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 1, Total Step: 3240, Loss: 0.10976522415876389 +[2024-01-22 06:58:51,341] [INFO] [logging.py:96:log_dist] [Rank 0] step=3240, skipped=0, lr=[1.1380035425540383e-05, 1.1380035425540383e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 06:58:51,341] [INFO] [timer.py:260:stop] epoch=1/micro_step=911/global_step=3240, RunningAvgSamplesPerSec=4.750477319526474, CurrSamplesPerSec=4.750771349737446, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 1, Total Step: 3250, Loss: 0.12999014556407928 +[2024-01-22 06:59:58,693] [INFO] [logging.py:96:log_dist] [Rank 0] step=3250, skipped=0, lr=[1.13348414206105e-05, 1.13348414206105e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 06:59:58,694] [INFO] [timer.py:260:stop] epoch=1/micro_step=921/global_step=3250, RunningAvgSamplesPerSec=4.750481851033548, CurrSamplesPerSec=4.744028871272793, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 1, Total Step: 3260, Loss: 0.042844709008932114 +[2024-01-22 07:01:06,136] [INFO] [logging.py:96:log_dist] [Rank 0] step=3260, skipped=0, lr=[1.128961963975753e-05, 1.128961963975753e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 07:01:06,137] [INFO] [timer.py:260:stop] epoch=1/micro_step=931/global_step=3260, RunningAvgSamplesPerSec=4.750466866216636, CurrSamplesPerSec=4.746922971365193, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 1, Total Step: 3270, Loss: 0.04025120288133621 +[2024-01-22 07:02:13,536] [INFO] [logging.py:96:log_dist] [Rank 0] step=3270, skipped=0, lr=[1.1244371023974686e-05, 1.1244371023974686e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 07:02:13,536] [INFO] [timer.py:260:stop] epoch=1/micro_step=941/global_step=3270, RunningAvgSamplesPerSec=4.750461322603284, CurrSamplesPerSec=4.768593967530591, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 1, Total Step: 3280, Loss: 0.076604463160038 +[2024-01-22 07:03:20,959] [INFO] [logging.py:96:log_dist] [Rank 0] step=3280, skipped=0, lr=[1.1199096514813559e-05, 1.1199096514813559e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 07:03:20,959] [INFO] [timer.py:260:stop] epoch=1/micro_step=951/global_step=3280, RunningAvgSamplesPerSec=4.750450755979842, CurrSamplesPerSec=4.731637976116351, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 1, Total Step: 3290, Loss: 0.08409302681684494 +[2024-01-22 07:04:28,306] [INFO] [logging.py:96:log_dist] [Rank 0] step=3290, skipped=0, lr=[1.1153797054364553e-05, 1.1153797054364553e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 07:04:28,307] [INFO] [timer.py:260:stop] epoch=1/micro_step=961/global_step=3290, RunningAvgSamplesPerSec=4.750456446379654, CurrSamplesPerSec=4.759517867075784, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 1, Total Step: 3300, Loss: 0.04462525248527527 +[2024-01-22 07:05:35,769] [INFO] [logging.py:96:log_dist] [Rank 0] step=3300, skipped=0, lr=[1.1108473585237254e-05, 1.1108473585237254e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 07:05:35,770] [INFO] [timer.py:260:stop] epoch=1/micro_step=971/global_step=3300, RunningAvgSamplesPerSec=4.750437529613622, CurrSamplesPerSec=4.745750073864102, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 1, Total Step: 3310, Loss: 0.13382785022258759 +[2024-01-22 07:06:43,213] [INFO] [logging.py:96:log_dist] [Rank 0] step=3310, skipped=0, lr=[1.1063127050540843e-05, 1.1063127050540843e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 07:06:43,214] [INFO] [timer.py:260:stop] epoch=1/micro_step=981/global_step=3310, RunningAvgSamplesPerSec=4.750422636470613, CurrSamplesPerSec=4.754389001964876, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 1, Total Step: 3320, Loss: 0.1106300875544548 +[2024-01-22 07:07:50,708] [INFO] [logging.py:96:log_dist] [Rank 0] step=3320, skipped=0, lr=[1.1017758393864452e-05, 1.1017758393864452e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 07:07:50,709] [INFO] [timer.py:260:stop] epoch=1/micro_step=991/global_step=3320, RunningAvgSamplesPerSec=4.7503970634573065, CurrSamplesPerSec=4.728171941252156, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 1, Total Step: 3330, Loss: 0.21911221742630005 +[2024-01-22 07:08:58,175] [INFO] [logging.py:96:log_dist] [Rank 0] step=3330, skipped=0, lr=[1.0972368559257538e-05, 1.0972368559257538e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 07:08:58,176] [INFO] [timer.py:260:stop] epoch=1/micro_step=1001/global_step=3330, RunningAvgSamplesPerSec=4.75037762570494, CurrSamplesPerSec=4.742068648170283, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 1, Total Step: 3340, Loss: 0.03936199098825455 +[2024-01-22 07:10:05,602] [INFO] [logging.py:96:log_dist] [Rank 0] step=3340, skipped=0, lr=[1.0926958491210238e-05, 1.0926958491210238e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 07:10:05,602] [INFO] [timer.py:260:stop] epoch=1/micro_step=1011/global_step=3340, RunningAvgSamplesPerSec=4.750366812837409, CurrSamplesPerSec=4.716679497011138, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 1, Total Step: 3350, Loss: 0.017917828634381294 +[2024-01-22 07:11:12,942] [INFO] [logging.py:96:log_dist] [Rank 0] step=3350, skipped=0, lr=[1.0881529134633712e-05, 1.0881529134633712e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 07:11:12,942] [INFO] [timer.py:260:stop] epoch=1/micro_step=1021/global_step=3350, RunningAvgSamplesPerSec=4.750374388375268, CurrSamplesPerSec=4.732445457660468, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 1, Total Step: 3360, Loss: 0.15101543068885803 +[2024-01-22 07:12:20,370] [INFO] [logging.py:96:log_dist] [Rank 0] step=3360, skipped=0, lr=[1.0836081434840488e-05, 1.0836081434840488e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 07:12:20,370] [INFO] [timer.py:260:stop] epoch=1/micro_step=1031/global_step=3360, RunningAvgSamplesPerSec=4.750363399822643, CurrSamplesPerSec=4.7510617769193955, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 1, Total Step: 3370, Loss: 0.06617791205644608 +[2024-01-22 07:13:27,702] [INFO] [logging.py:96:log_dist] [Rank 0] step=3370, skipped=0, lr=[1.0790616337524783e-05, 1.0790616337524783e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 07:13:27,702] [INFO] [timer.py:260:stop] epoch=1/micro_step=1041/global_step=3370, RunningAvgSamplesPerSec=4.750372565388871, CurrSamplesPerSec=4.753986862098434, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 1, Total Step: 3380, Loss: 0.04754965007305145 +[2024-01-22 07:14:35,133] [INFO] [logging.py:96:log_dist] [Rank 0] step=3380, skipped=0, lr=[1.0745134788742826e-05, 1.0745134788742826e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 07:14:35,133] [INFO] [timer.py:260:stop] epoch=1/micro_step=1051/global_step=3380, RunningAvgSamplesPerSec=4.750360995488504, CurrSamplesPerSec=4.750349814221086, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 1, Total Step: 3390, Loss: 0.05432935431599617 +[2024-01-22 07:15:42,482] [INFO] [logging.py:96:log_dist] [Rank 0] step=3390, skipped=0, lr=[1.0699637734893183e-05, 1.0699637734893183e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 07:15:42,482] [INFO] [timer.py:260:stop] epoch=1/micro_step=1061/global_step=3390, RunningAvgSamplesPerSec=4.750366562368466, CurrSamplesPerSec=4.75869167304207, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 1, Total Step: 3400, Loss: 0.0675906389951706 +[2024-01-22 07:16:50,049] [INFO] [logging.py:96:log_dist] [Rank 0] step=3400, skipped=0, lr=[1.065412612269705e-05, 1.065412612269705e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 07:16:50,049] [INFO] [timer.py:260:stop] epoch=1/micro_step=1071/global_step=3400, RunningAvgSamplesPerSec=4.750326867069662, CurrSamplesPerSec=4.747557833233389, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 1, Total Step: 3410, Loss: 0.14601808786392212 +[2024-01-22 07:17:57,544] [INFO] [logging.py:96:log_dist] [Rank 0] step=3410, skipped=0, lr=[1.0608600899178563e-05, 1.0608600899178563e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 07:17:57,544] [INFO] [timer.py:260:stop] epoch=1/micro_step=1081/global_step=3410, RunningAvgSamplesPerSec=4.750302260581097, CurrSamplesPerSec=4.723413885995017, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 1, Total Step: 3420, Loss: 0.021071434020996094 +[2024-01-22 07:19:04,924] [INFO] [logging.py:96:log_dist] [Rank 0] step=3420, skipped=0, lr=[1.0563063011645081e-05, 1.0563063011645081e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 07:19:04,925] [INFO] [timer.py:260:stop] epoch=1/micro_step=1091/global_step=3420, RunningAvgSamplesPerSec=4.750301427453837, CurrSamplesPerSec=4.750046529797279, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 1, Total Step: 3430, Loss: 0.04899030551314354 +[2024-01-22 07:20:12,340] [INFO] [logging.py:96:log_dist] [Rank 0] step=3430, skipped=0, lr=[1.0517513407667487e-05, 1.0517513407667487e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 07:20:12,341] [INFO] [timer.py:260:stop] epoch=1/micro_step=1101/global_step=3430, RunningAvgSamplesPerSec=4.750293446872389, CurrSamplesPerSec=4.739003591916229, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 1, Total Step: 3440, Loss: 0.04885591194033623 +[2024-01-22 07:21:19,701] [INFO] [logging.py:96:log_dist] [Rank 0] step=3440, skipped=0, lr=[1.0471953035060468e-05, 1.0471953035060468e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 07:21:19,701] [INFO] [timer.py:260:stop] epoch=1/micro_step=1111/global_step=3440, RunningAvgSamplesPerSec=4.75029664269112, CurrSamplesPerSec=4.746934723414399, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 1, Total Step: 3450, Loss: 0.0975327119231224 +[2024-01-22 07:22:27,109] [INFO] [logging.py:96:log_dist] [Rank 0] step=3450, skipped=0, lr=[1.0426382841862776e-05, 1.0426382841862776e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 07:22:27,110] [INFO] [timer.py:260:stop] epoch=1/micro_step=1121/global_step=3450, RunningAvgSamplesPerSec=4.750290131065183, CurrSamplesPerSec=4.739256770124536, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 1, Total Step: 3460, Loss: 0.0780249759554863 +[2024-01-22 07:23:34,535] [INFO] [logging.py:96:log_dist] [Rank 0] step=3460, skipped=0, lr=[1.0380803776317528e-05, 1.0380803776317528e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 07:23:34,536] [INFO] [timer.py:260:stop] epoch=1/micro_step=1131/global_step=3460, RunningAvgSamplesPerSec=4.750280121999395, CurrSamplesPerSec=4.74438471778899, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 1, Total Step: 3470, Loss: 0.1045868843793869 +[2024-01-22 07:24:41,878] [INFO] [logging.py:96:log_dist] [Rank 0] step=3470, skipped=0, lr=[1.0335216786852448e-05, 1.0335216786852448e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 07:24:41,878] [INFO] [timer.py:260:stop] epoch=1/micro_step=1141/global_step=3470, RunningAvgSamplesPerSec=4.7502869699529, CurrSamplesPerSec=4.739524201203447, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 1, Total Step: 3480, Loss: 0.04139584302902222 +[2024-01-22 07:25:49,167] [INFO] [logging.py:96:log_dist] [Rank 0] step=3480, skipped=0, lr=[1.0289622822060157e-05, 1.0289622822060157e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 07:25:49,167] [INFO] [timer.py:260:stop] epoch=1/micro_step=1151/global_step=3480, RunningAvgSamplesPerSec=4.750304694581267, CurrSamplesPerSec=4.744300698316893, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 1, Total Step: 3490, Loss: 0.17506760358810425 +[2024-01-22 07:26:56,565] [INFO] [logging.py:96:log_dist] [Rank 0] step=3490, skipped=0, lr=[1.024402283067841e-05, 1.024402283067841e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 07:26:56,565] [INFO] [timer.py:260:stop] epoch=1/micro_step=1161/global_step=3490, RunningAvgSamplesPerSec=4.750300345387641, CurrSamplesPerSec=4.744322834895053, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 1, Total Step: 3500, Loss: 0.01562433410435915 +[2024-01-22 07:28:03,988] [INFO] [logging.py:96:log_dist] [Rank 0] step=3500, skipped=0, lr=[1.0198417761570374e-05, 1.0198417761570374e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 07:28:03,989] [INFO] [timer.py:260:stop] epoch=1/micro_step=1171/global_step=3500, RunningAvgSamplesPerSec=4.750290779362248, CurrSamplesPerSec=4.749703784320305, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 1, Total Step: 3510, Loss: 0.08391974866390228 +[2024-01-22 07:29:11,308] [INFO] [logging.py:96:log_dist] [Rank 0] step=3510, skipped=0, lr=[1.015280856370487e-05, 1.015280856370487e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 07:29:11,309] [INFO] [timer.py:260:stop] epoch=1/micro_step=1181/global_step=3510, RunningAvgSamplesPerSec=4.750302108769566, CurrSamplesPerSec=4.740536629212309, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 1, Total Step: 3520, Loss: 0.11040050536394119 +[2024-01-22 07:30:18,720] [INFO] [logging.py:96:log_dist] [Rank 0] step=3520, skipped=0, lr=[1.0107196186136631e-05, 1.0107196186136631e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 07:30:18,720] [INFO] [timer.py:260:stop] epoch=1/micro_step=1191/global_step=3520, RunningAvgSamplesPerSec=4.750295098587424, CurrSamplesPerSec=4.744889903575149, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 1, Total Step: 3530, Loss: 0.0817139595746994 +[2024-01-22 07:31:26,196] [INFO] [logging.py:96:log_dist] [Rank 0] step=3530, skipped=0, lr=[1.0061581577986564e-05, 1.0061581577986564e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 07:31:26,197] [INFO] [timer.py:260:stop] epoch=1/micro_step=1201/global_step=3530, RunningAvgSamplesPerSec=4.750275182310581, CurrSamplesPerSec=4.762213428547991, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 1, Total Step: 3540, Loss: 0.04565306752920151 +[2024-01-22 07:32:33,467] [INFO] [logging.py:96:log_dist] [Rank 0] step=3540, skipped=0, lr=[1.0015965688421979e-05, 1.0015965688421979e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 07:32:33,467] [INFO] [timer.py:260:stop] epoch=1/micro_step=1211/global_step=3540, RunningAvgSamplesPerSec=4.750296501362154, CurrSamplesPerSec=4.764579497974789, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 1, Total Step: 3550, Loss: 0.024680564180016518 +[2024-01-22 07:33:40,889] [INFO] [logging.py:96:log_dist] [Rank 0] step=3550, skipped=0, lr=[9.970349466636857e-06, 9.970349466636857e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 07:33:40,889] [INFO] [timer.py:260:stop] epoch=1/micro_step=1221/global_step=3550, RunningAvgSamplesPerSec=4.750287523197044, CurrSamplesPerSec=4.77011247555507, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 1, Total Step: 3560, Loss: 0.0573020800948143 +[2024-01-22 07:34:48,362] [INFO] [logging.py:96:log_dist] [Rank 0] step=3560, skipped=0, lr=[9.92473386183209e-06, 9.92473386183209e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 07:34:48,363] [INFO] [timer.py:260:stop] epoch=1/micro_step=1231/global_step=3560, RunningAvgSamplesPerSec=4.750268249915481, CurrSamplesPerSec=4.740069197988399, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 1, Total Step: 3570, Loss: 0.08773577213287354 +[2024-01-22 07:35:55,751] [INFO] [logging.py:96:log_dist] [Rank 0] step=3570, skipped=0, lr=[9.879119823195735e-06, 9.879119823195735e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 07:35:55,751] [INFO] [timer.py:260:stop] epoch=1/micro_step=1241/global_step=3570, RunningAvgSamplesPerSec=4.750265965247251, CurrSamplesPerSec=4.751507829199271, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 1, Total Step: 3580, Loss: 0.02633209526538849 +[2024-01-22 07:37:03,123] [INFO] [logging.py:96:log_dist] [Rank 0] step=3580, skipped=0, lr=[9.83350829988325e-06, 9.83350829988325e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 07:37:03,124] [INFO] [timer.py:260:stop] epoch=1/micro_step=1251/global_step=3580, RunningAvgSamplesPerSec=4.750266865924007, CurrSamplesPerSec=4.721385289571943, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 1, Total Step: 3590, Loss: 0.043103791773319244 +[2024-01-22 07:38:10,495] [INFO] [logging.py:96:log_dist] [Rank 0] step=3590, skipped=0, lr=[9.787900240997768e-06, 9.787900240997768e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 07:38:10,495] [INFO] [timer.py:260:stop] epoch=1/micro_step=1261/global_step=3590, RunningAvgSamplesPerSec=4.750267941183242, CurrSamplesPerSec=4.744932342737405, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 1, Total Step: 3600, Loss: 0.08536440134048462 +[2024-01-22 07:39:17,878] [INFO] [logging.py:96:log_dist] [Rank 0] step=3600, skipped=0, lr=[9.742296595570316e-06, 9.742296595570316e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 07:39:17,878] [INFO] [timer.py:260:stop] epoch=1/micro_step=1271/global_step=3600, RunningAvgSamplesPerSec=4.750266526287527, CurrSamplesPerSec=4.745466503659149, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 1, Total Step: 3610, Loss: 0.02414414845407009 +[2024-01-22 07:40:25,251] [INFO] [logging.py:96:log_dist] [Rank 0] step=3610, skipped=0, lr=[9.6966983125401e-06, 9.6966983125401e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 07:40:25,251] [INFO] [timer.py:260:stop] epoch=1/micro_step=1281/global_step=3610, RunningAvgSamplesPerSec=4.750267309768436, CurrSamplesPerSec=4.756562879675173, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 1, Total Step: 3620, Loss: 0.04428018257021904 +[2024-01-22 07:41:32,678] [INFO] [logging.py:96:log_dist] [Rank 0] step=3620, skipped=0, lr=[9.651106340734729e-06, 9.651106340734729e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 07:41:32,679] [INFO] [timer.py:260:stop] epoch=1/micro_step=1291/global_step=3620, RunningAvgSamplesPerSec=4.750257467751216, CurrSamplesPerSec=4.754521547890412, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 1, Total Step: 3630, Loss: 0.07153733819723129 +[2024-01-22 07:42:40,053] [INFO] [logging.py:96:log_dist] [Rank 0] step=3630, skipped=0, lr=[9.605521628850496e-06, 9.605521628850496e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 07:42:40,053] [INFO] [timer.py:260:stop] epoch=1/micro_step=1301/global_step=3630, RunningAvgSamplesPerSec=4.750257860424422, CurrSamplesPerSec=4.756260655433877, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 1, Total Step: 3640, Loss: 0.04011976718902588 +[2024-01-22 07:43:47,502] [INFO] [logging.py:96:log_dist] [Rank 0] step=3640, skipped=0, lr=[9.55994512543262e-06, 9.55994512543262e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 07:43:47,503] [INFO] [timer.py:260:stop] epoch=1/micro_step=1311/global_step=3640, RunningAvgSamplesPerSec=4.750243903030836, CurrSamplesPerSec=4.769086190380084, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 1, Total Step: 3650, Loss: 0.06767147779464722 +[2024-01-22 07:44:54,844] [INFO] [logging.py:96:log_dist] [Rank 0] step=3650, skipped=0, lr=[9.514377778855521e-06, 9.514377778855521e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 07:44:54,844] [INFO] [timer.py:260:stop] epoch=1/micro_step=1321/global_step=3650, RunningAvgSamplesPerSec=4.750250663115373, CurrSamplesPerSec=4.761170778303905, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 1, Total Step: 3660, Loss: 0.14571981132030487 +[2024-01-22 07:46:02,150] [INFO] [logging.py:96:log_dist] [Rank 0] step=3660, skipped=0, lr=[9.468820537303071e-06, 9.468820537303071e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 07:46:02,151] [INFO] [timer.py:260:stop] epoch=1/micro_step=1331/global_step=3660, RunningAvgSamplesPerSec=4.750264214602106, CurrSamplesPerSec=4.7461759962205345, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 1, Total Step: 3670, Loss: 0.09339810907840729 +[2024-01-22 07:47:09,488] [INFO] [logging.py:96:log_dist] [Rank 0] step=3670, skipped=0, lr=[9.42327434874888e-06, 9.42327434874888e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 07:47:09,488] [INFO] [timer.py:260:stop] epoch=1/micro_step=1341/global_step=3670, RunningAvgSamplesPerSec=4.750271793251117, CurrSamplesPerSec=4.771427715183674, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 1, Total Step: 3680, Loss: 0.030793089419603348 +[2024-01-22 07:48:16,809] [INFO] [logging.py:96:log_dist] [Rank 0] step=3680, skipped=0, lr=[9.377740160936564e-06, 9.377740160936564e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 07:48:16,810] [INFO] [timer.py:260:stop] epoch=1/micro_step=1351/global_step=3680, RunningAvgSamplesPerSec=4.750282213224042, CurrSamplesPerSec=4.744823646321886, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 1, Total Step: 3690, Loss: 0.04302429407835007 +[2024-01-22 07:49:24,220] [INFO] [logging.py:96:log_dist] [Rank 0] step=3690, skipped=0, lr=[9.332218921360013e-06, 9.332218921360013e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 07:49:24,220] [INFO] [timer.py:260:stop] epoch=1/micro_step=1361/global_step=3690, RunningAvgSamplesPerSec=4.750275818918075, CurrSamplesPerSec=4.739925069634499, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 1, Total Step: 3700, Loss: 0.09879221022129059 +[2024-01-22 07:50:31,615] [INFO] [logging.py:96:log_dist] [Rank 0] step=3700, skipped=0, lr=[9.2867115772437e-06, 9.2867115772437e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 07:50:31,616] [INFO] [timer.py:260:stop] epoch=1/micro_step=1371/global_step=3700, RunningAvgSamplesPerSec=4.750272323847678, CurrSamplesPerSec=4.75476257473016, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 1, Total Step: 3710, Loss: 0.16724780201911926 +[2024-01-22 07:51:38,953] [INFO] [logging.py:96:log_dist] [Rank 0] step=3710, skipped=0, lr=[9.241219075522934e-06, 9.241219075522934e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 07:51:38,953] [INFO] [timer.py:260:stop] epoch=1/micro_step=1381/global_step=3710, RunningAvgSamplesPerSec=4.75027984428398, CurrSamplesPerSec=4.731357757686225, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 1, Total Step: 3720, Loss: 0.1297459751367569 +[2024-01-22 07:52:46,308] [INFO] [logging.py:96:log_dist] [Rank 0] step=3720, skipped=0, lr=[9.1957423628242e-06, 9.1957423628242e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 07:52:46,308] [INFO] [timer.py:260:stop] epoch=1/micro_step=1391/global_step=3720, RunningAvgSamplesPerSec=4.750283968819762, CurrSamplesPerSec=4.741230579639203, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 1, Total Step: 3730, Loss: 0.06026165187358856 +[2024-01-22 07:53:53,688] [INFO] [logging.py:96:log_dist] [Rank 0] step=3730, skipped=0, lr=[9.150282385445423e-06, 9.150282385445423e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 07:53:53,688] [INFO] [timer.py:260:stop] epoch=1/micro_step=1401/global_step=3730, RunningAvgSamplesPerSec=4.750283336255227, CurrSamplesPerSec=4.767292481186156, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 1, Total Step: 3740, Loss: 0.09422717988491058 +[2024-01-22 07:55:01,067] [INFO] [logging.py:96:log_dist] [Rank 0] step=3740, skipped=0, lr=[9.104840089336305e-06, 9.104840089336305e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 07:55:01,067] [INFO] [timer.py:260:stop] epoch=1/micro_step=1411/global_step=3740, RunningAvgSamplesPerSec=4.750282833235636, CurrSamplesPerSec=4.741445638327836, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 1, Total Step: 3750, Loss: 0.054431043565273285 +[2024-01-22 07:56:08,495] [INFO] [logging.py:96:log_dist] [Rank 0] step=3750, skipped=0, lr=[9.059416420078611e-06, 9.059416420078611e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 07:56:08,495] [INFO] [timer.py:260:stop] epoch=1/micro_step=1421/global_step=3750, RunningAvgSamplesPerSec=4.750273181031221, CurrSamplesPerSec=4.7589445969977575, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 1, Total Step: 3760, Loss: 0.05571820214390755 +[2024-01-22 07:57:15,946] [INFO] [logging.py:96:log_dist] [Rank 0] step=3760, skipped=0, lr=[9.014012322866532e-06, 9.014012322866532e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 07:57:15,947] [INFO] [timer.py:260:stop] epoch=1/micro_step=1431/global_step=3760, RunningAvgSamplesPerSec=4.750259154880012, CurrSamplesPerSec=4.72833800967138, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 1, Total Step: 3770, Loss: 0.031157121062278748 +[2024-01-22 07:58:23,453] [INFO] [logging.py:96:log_dist] [Rank 0] step=3770, skipped=0, lr=[8.968628742486982e-06, 8.968628742486982e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 07:58:23,453] [INFO] [timer.py:260:stop] epoch=1/micro_step=1441/global_step=3770, RunningAvgSamplesPerSec=4.750234838492814, CurrSamplesPerSec=4.757905570105072, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 1, Total Step: 3780, Loss: 0.03139489144086838 +[2024-01-22 07:59:31,019] [INFO] [logging.py:96:log_dist] [Rank 0] step=3780, skipped=0, lr=[8.923266623299958e-06, 8.923266623299958e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 07:59:31,019] [INFO] [timer.py:260:stop] epoch=1/micro_step=1451/global_step=3780, RunningAvgSamplesPerSec=4.750199538676649, CurrSamplesPerSec=4.746182206055901, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 1, Total Step: 3790, Loss: 0.06717578321695328 +[2024-01-22 08:00:38,441] [INFO] [logging.py:96:log_dist] [Rank 0] step=3790, skipped=0, lr=[8.87792690921888e-06, 8.87792690921888e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 08:00:38,441] [INFO] [timer.py:260:stop] epoch=1/micro_step=1461/global_step=3790, RunningAvgSamplesPerSec=4.750191292058684, CurrSamplesPerSec=4.7651328101776045, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 1, Total Step: 3800, Loss: 0.0172119140625 +[2024-01-22 08:01:45,772] [INFO] [logging.py:96:log_dist] [Rank 0] step=3800, skipped=0, lr=[8.832610543690957e-06, 8.832610543690957e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 08:01:45,772] [INFO] [timer.py:260:stop] epoch=1/micro_step=1471/global_step=3800, RunningAvgSamplesPerSec=4.750199963789076, CurrSamplesPerSec=4.7647929587507925, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 1, Total Step: 3810, Loss: 0.100362129509449 +[2024-01-22 08:02:53,259] [INFO] [logging.py:96:log_dist] [Rank 0] step=3810, skipped=0, lr=[8.78731846967755e-06, 8.78731846967755e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 08:02:53,259] [INFO] [timer.py:260:stop] epoch=1/micro_step=1481/global_step=3810, RunningAvgSamplesPerSec=4.750179701924387, CurrSamplesPerSec=4.715592238626357, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 1, Total Step: 3820, Loss: 0.03462900593876839 +[2024-01-22 08:04:00,635] [INFO] [logging.py:96:log_dist] [Rank 0] step=3820, skipped=0, lr=[8.742051629634553e-06, 8.742051629634553e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 08:04:00,635] [INFO] [timer.py:260:stop] epoch=1/micro_step=1491/global_step=3820, RunningAvgSamplesPerSec=4.750180096868436, CurrSamplesPerSec=4.759865744121783, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 1, Total Step: 3830, Loss: 0.037166181951761246 +[2024-01-22 08:05:08,113] [INFO] [logging.py:96:log_dist] [Rank 0] step=3830, skipped=0, lr=[8.696810965492782e-06, 8.696810965492782e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 08:05:08,114] [INFO] [timer.py:260:stop] epoch=1/micro_step=1501/global_step=3830, RunningAvgSamplesPerSec=4.750161558097236, CurrSamplesPerSec=4.732795396296516, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 1, Total Step: 3840, Loss: 0.04762602597475052 +[2024-01-22 08:06:15,453] [INFO] [logging.py:96:log_dist] [Rank 0] step=3840, skipped=0, lr=[8.65159741863837e-06, 8.65159741863837e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 08:06:15,453] [INFO] [timer.py:260:stop] epoch=1/micro_step=1511/global_step=3840, RunningAvgSamplesPerSec=4.750168604034019, CurrSamplesPerSec=4.763820023651975, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 1, Total Step: 3850, Loss: 0.07930321246385574 +[2024-01-22 08:07:22,974] [INFO] [logging.py:96:log_dist] [Rank 0] step=3850, skipped=0, lr=[8.606411929893188e-06, 8.606411929893188e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 08:07:22,975] [INFO] [timer.py:260:stop] epoch=1/micro_step=1521/global_step=3850, RunningAvgSamplesPerSec=4.750142390433896, CurrSamplesPerSec=4.745027623938281, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 1, Total Step: 3860, Loss: 0.045530758798122406 +[2024-01-22 08:08:30,453] [INFO] [logging.py:96:log_dist] [Rank 0] step=3860, skipped=0, lr=[8.561255439495265e-06, 8.561255439495265e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 08:08:30,454] [INFO] [timer.py:260:stop] epoch=1/micro_step=1531/global_step=3860, RunningAvgSamplesPerSec=4.750123951922841, CurrSamplesPerSec=4.731019037933002, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 1, Total Step: 3870, Loss: 0.026902221143245697 +[2024-01-22 08:09:37,797] [INFO] [logging.py:96:log_dist] [Rank 0] step=3870, skipped=0, lr=[8.516128887079204e-06, 8.516128887079204e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 08:09:37,798] [INFO] [timer.py:260:stop] epoch=1/micro_step=1541/global_step=3870, RunningAvgSamplesPerSec=4.750130289683338, CurrSamplesPerSec=4.753706010784387, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 1, Total Step: 3880, Loss: 0.017634285613894463 +[2024-01-22 08:10:45,111] [INFO] [logging.py:96:log_dist] [Rank 0] step=3880, skipped=0, lr=[8.47103321165667e-06, 8.47103321165667e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 08:10:45,111] [INFO] [timer.py:260:stop] epoch=1/micro_step=1551/global_step=3880, RunningAvgSamplesPerSec=4.750142084520195, CurrSamplesPerSec=4.74980261903921, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 1, Total Step: 3890, Loss: 0.17281056940555573 +[2024-01-22 08:11:52,401] [INFO] [logging.py:96:log_dist] [Rank 0] step=3890, skipped=0, lr=[8.425969351596804e-06, 8.425969351596804e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 08:11:52,401] [INFO] [timer.py:260:stop] epoch=1/micro_step=1561/global_step=3890, RunningAvgSamplesPerSec=4.750158104198486, CurrSamplesPerSec=4.7590554600678985, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 1, Total Step: 3900, Loss: 0.14881832897663116 +[2024-01-22 08:12:59,858] [INFO] [logging.py:96:log_dist] [Rank 0] step=3900, skipped=0, lr=[8.380938244606742e-06, 8.380938244606742e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 08:12:59,859] [INFO] [timer.py:260:stop] epoch=1/micro_step=1571/global_step=3900, RunningAvgSamplesPerSec=4.750143773331475, CurrSamplesPerSec=4.727680800130723, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 1, Total Step: 3910, Loss: 0.027484361082315445 +[2024-01-22 08:14:07,360] [INFO] [logging.py:96:log_dist] [Rank 0] step=3910, skipped=0, lr=[8.33594082771206e-06, 8.33594082771206e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 08:14:07,361] [INFO] [timer.py:260:stop] epoch=1/micro_step=1581/global_step=3910, RunningAvgSamplesPerSec=4.750121633311276, CurrSamplesPerSec=4.734742613948121, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 1, Total Step: 3920, Loss: 0.03303825855255127 +[2024-01-22 08:15:14,681] [INFO] [logging.py:96:log_dist] [Rank 0] step=3920, skipped=0, lr=[8.290978037237316e-06, 8.290978037237316e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 08:15:14,682] [INFO] [timer.py:260:stop] epoch=1/micro_step=1591/global_step=3920, RunningAvgSamplesPerSec=4.75013205567538, CurrSamplesPerSec=4.761573120972044, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 1, Total Step: 3930, Loss: 0.0252145454287529 +[2024-01-22 08:16:21,980] [INFO] [logging.py:96:log_dist] [Rank 0] step=3930, skipped=0, lr=[8.246050808786527e-06, 8.246050808786527e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 08:16:21,981] [INFO] [timer.py:260:stop] epoch=1/micro_step=1601/global_step=3930, RunningAvgSamplesPerSec=4.750146407152757, CurrSamplesPerSec=4.748370086063984, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 1, Total Step: 3940, Loss: 0.049459367990493774 +[2024-01-22 08:17:29,502] [INFO] [logging.py:96:log_dist] [Rank 0] step=3940, skipped=0, lr=[8.201160077223737e-06, 8.201160077223737e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 08:17:29,503] [INFO] [timer.py:260:stop] epoch=1/micro_step=1611/global_step=3940, RunningAvgSamplesPerSec=4.75012076382351, CurrSamplesPerSec=4.735470121913076, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 1, Total Step: 3950, Loss: 0.054260462522506714 +[2024-01-22 08:18:36,916] [INFO] [logging.py:96:log_dist] [Rank 0] step=3950, skipped=0, lr=[8.15630677665355e-06, 8.15630677665355e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 08:18:36,916] [INFO] [timer.py:260:stop] epoch=1/micro_step=1621/global_step=3950, RunningAvgSamplesPerSec=4.750114654908586, CurrSamplesPerSec=4.7504100049816085, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 1, Total Step: 3960, Loss: 0.02983631007373333 +[2024-01-22 08:19:44,248] [INFO] [logging.py:96:log_dist] [Rank 0] step=3960, skipped=0, lr=[8.111491840401673e-06, 8.111491840401673e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 08:19:44,248] [INFO] [timer.py:260:stop] epoch=1/micro_step=1631/global_step=3960, RunningAvgSamplesPerSec=4.750122971726511, CurrSamplesPerSec=4.749485622919986, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 1, Total Step: 3970, Loss: 0.03179832175374031 +[2024-01-22 08:20:51,624] [INFO] [logging.py:96:log_dist] [Rank 0] step=3970, skipped=0, lr=[8.06671620099553e-06, 8.06671620099553e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 08:20:51,625] [INFO] [timer.py:260:stop] epoch=1/micro_step=1641/global_step=3970, RunningAvgSamplesPerSec=4.75012335315089, CurrSamplesPerSec=4.725753528733276, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 1, Total Step: 3980, Loss: 0.03224347159266472 +[2024-01-22 08:21:59,013] [INFO] [logging.py:96:log_dist] [Rank 0] step=3980, skipped=0, lr=[8.021980790144828e-06, 8.021980790144828e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 08:21:59,013] [INFO] [timer.py:260:stop] epoch=1/micro_step=1651/global_step=3980, RunningAvgSamplesPerSec=4.750121676640353, CurrSamplesPerSec=4.73131672848762, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 1, Total Step: 3990, Loss: 0.04867633059620857 +[2024-01-22 08:23:06,410] [INFO] [logging.py:96:log_dist] [Rank 0] step=3990, skipped=0, lr=[7.977286538722193e-06, 7.977286538722193e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 08:23:06,411] [INFO] [timer.py:260:stop] epoch=1/micro_step=1661/global_step=3990, RunningAvgSamplesPerSec=4.750118407974398, CurrSamplesPerSec=4.726813516619317, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 1, Total Step: 4000, Loss: 0.07461197674274445 +[2024-01-22 08:24:13,796] [INFO] [logging.py:96:log_dist] [Rank 0] step=4000, skipped=0, lr=[7.932634376743776e-06, 7.932634376743776e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 08:24:13,796] [INFO] [timer.py:260:stop] epoch=1/micro_step=1671/global_step=4000, RunningAvgSamplesPerSec=4.750117153370882, CurrSamplesPerSec=4.76178529881223, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 1, Total Step: 4010, Loss: 0.041548945009708405 +[2024-01-22 08:25:21,253] [INFO] [logging.py:96:log_dist] [Rank 0] step=4010, skipped=0, lr=[7.88802523334993e-06, 7.88802523334993e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 08:25:21,253] [INFO] [timer.py:260:stop] epoch=1/micro_step=1681/global_step=4010, RunningAvgSamplesPerSec=4.750103426903062, CurrSamplesPerSec=4.727275673024776, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 1, Total Step: 4020, Loss: 0.05760756507515907 +[2024-01-22 08:26:28,642] [INFO] [logging.py:96:log_dist] [Rank 0] step=4020, skipped=0, lr=[7.84346003678584e-06, 7.84346003678584e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 08:26:28,643] [INFO] [timer.py:260:stop] epoch=1/micro_step=1691/global_step=4020, RunningAvgSamplesPerSec=4.7501016715057505, CurrSamplesPerSec=4.755101502162063, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 1, Total Step: 4030, Loss: 0.027385979890823364 +[2024-01-22 08:27:35,992] [INFO] [logging.py:96:log_dist] [Rank 0] step=4030, skipped=0, lr=[7.798939714382245e-06, 7.798939714382245e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 08:27:35,993] [INFO] [timer.py:260:stop] epoch=1/micro_step=1701/global_step=4030, RunningAvgSamplesPerSec=4.750106817378401, CurrSamplesPerSec=4.7427347243776765, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 1, Total Step: 4040, Loss: 0.03304236754775047 +[2024-01-22 08:28:43,378] [INFO] [logging.py:96:log_dist] [Rank 0] step=4040, skipped=0, lr=[7.754465192536121e-06, 7.754465192536121e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 08:28:43,379] [INFO] [timer.py:260:stop] epoch=1/micro_step=1711/global_step=4040, RunningAvgSamplesPerSec=4.750105608367287, CurrSamplesPerSec=4.737508166886509, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 1, Total Step: 4050, Loss: 0.02018284797668457 +[2024-01-22 08:29:50,762] [INFO] [logging.py:96:log_dist] [Rank 0] step=4050, skipped=0, lr=[7.710037396691393e-06, 7.710037396691393e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 08:29:50,763] [INFO] [timer.py:260:stop] epoch=1/micro_step=1721/global_step=4050, RunningAvgSamplesPerSec=4.75010461409859, CurrSamplesPerSec=4.7524870158631725, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 1, Total Step: 4060, Loss: 0.06941701471805573 +[2024-01-22 08:30:58,233] [INFO] [logging.py:96:log_dist] [Rank 0] step=4060, skipped=0, lr=[7.665657251319713e-06, 7.665657251319713e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 08:30:58,233] [INFO] [timer.py:260:stop] epoch=1/micro_step=1731/global_step=4060, RunningAvgSamplesPerSec=4.750088834234734, CurrSamplesPerSec=4.7351735790481495, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 1, Total Step: 4070, Loss: 0.01920580491423607 +[2024-01-22 08:32:05,661] [INFO] [logging.py:96:log_dist] [Rank 0] step=4070, skipped=0, lr=[7.621325679901186e-06, 7.621325679901186e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 08:32:05,662] [INFO] [timer.py:260:stop] epoch=1/micro_step=1741/global_step=4070, RunningAvgSamplesPerSec=4.750080322936924, CurrSamplesPerSec=4.774767739298818, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 1, Total Step: 4080, Loss: 0.18751388788223267 +[2024-01-22 08:33:13,014] [INFO] [logging.py:96:log_dist] [Rank 0] step=4080, skipped=0, lr=[7.577043604905184e-06, 7.577043604905184e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 08:33:13,015] [INFO] [timer.py:260:stop] epoch=1/micro_step=1751/global_step=4080, RunningAvgSamplesPerSec=4.750084895195671, CurrSamplesPerSec=4.76121013124057, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 1, Total Step: 4090, Loss: 0.029517848044633865 +[2024-01-22 08:34:20,474] [INFO] [logging.py:96:log_dist] [Rank 0] step=4090, skipped=0, lr=[7.532811947771121e-06, 7.532811947771121e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 08:34:20,474] [INFO] [timer.py:260:stop] epoch=1/micro_step=1761/global_step=4090, RunningAvgSamplesPerSec=4.750071117097466, CurrSamplesPerSec=4.748006755004718, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 1, Total Step: 4100, Loss: 0.055370982736349106 +[2024-01-22 08:35:27,911] [INFO] [logging.py:96:log_dist] [Rank 0] step=4100, skipped=0, lr=[7.4886316288893165e-06, 7.4886316288893165e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 08:35:27,911] [INFO] [timer.py:260:stop] epoch=1/micro_step=1771/global_step=4100, RunningAvgSamplesPerSec=4.750061150968646, CurrSamplesPerSec=4.763672925691798, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 1, Total Step: 4110, Loss: 0.01886894553899765 +[2024-01-22 08:36:35,231] [INFO] [logging.py:96:log_dist] [Rank 0] step=4110, skipped=0, lr=[7.4445035675818e-06, 7.4445035675818e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 08:36:35,231] [INFO] [timer.py:260:stop] epoch=1/micro_step=1781/global_step=4110, RunningAvgSamplesPerSec=4.750071485308833, CurrSamplesPerSec=4.760044512950398, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 1, Total Step: 4120, Loss: 0.018112346529960632 +[2024-01-22 08:37:42,544] [INFO] [logging.py:96:log_dist] [Rank 0] step=4120, skipped=0, lr=[7.4004286820832235e-06, 7.4004286820832235e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 08:37:42,544] [INFO] [timer.py:260:stop] epoch=1/micro_step=1791/global_step=4120, RunningAvgSamplesPerSec=4.750082773056995, CurrSamplesPerSec=4.767323976759986, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 1, Total Step: 4130, Loss: 0.016937898471951485 +[2024-01-22 08:38:50,041] [INFO] [logging.py:96:log_dist] [Rank 0] step=4130, skipped=0, lr=[7.356407889521725e-06, 7.356407889521725e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 08:38:50,042] [INFO] [timer.py:260:stop] epoch=1/micro_step=1801/global_step=4130, RunningAvgSamplesPerSec=4.750062659044426, CurrSamplesPerSec=4.73864102357791, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 1, Total Step: 4140, Loss: 0.09263104200363159 +[2024-01-22 08:39:57,559] [INFO] [logging.py:96:log_dist] [Rank 0] step=4140, skipped=0, lr=[7.312442105899855e-06, 7.312442105899855e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 08:39:57,560] [INFO] [timer.py:260:stop] epoch=1/micro_step=1811/global_step=4140, RunningAvgSamplesPerSec=4.750039126112489, CurrSamplesPerSec=4.724929200549709, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 1, Total Step: 4150, Loss: 0.07531996071338654 +[2024-01-22 08:41:04,888] [INFO] [logging.py:96:log_dist] [Rank 0] step=4150, skipped=0, lr=[7.26853224607552e-06, 7.26853224607552e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 08:41:04,888] [INFO] [timer.py:260:stop] epoch=1/micro_step=1821/global_step=4150, RunningAvgSamplesPerSec=4.750047760285853, CurrSamplesPerSec=4.754706316104759, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 1, Total Step: 4160, Loss: 0.031998977065086365 +[2024-01-22 08:42:12,306] [INFO] [logging.py:96:log_dist] [Rank 0] step=4160, skipped=0, lr=[7.224679223742938e-06, 7.224679223742938e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 08:42:12,306] [INFO] [timer.py:260:stop] epoch=1/micro_step=1831/global_step=4160, RunningAvgSamplesPerSec=4.750041312703988, CurrSamplesPerSec=4.714329951078603, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 1, Total Step: 4170, Loss: 0.10620644688606262 +[2024-01-22 08:43:19,695] [INFO] [logging.py:96:log_dist] [Rank 0] step=4170, skipped=0, lr=[7.180883951413628e-06, 7.180883951413628e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 08:43:19,695] [INFO] [timer.py:260:stop] epoch=1/micro_step=1841/global_step=4170, RunningAvgSamplesPerSec=4.750039777051212, CurrSamplesPerSec=4.738816361190492, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 1, Total Step: 4180, Loss: 0.03943274915218353 +[2024-01-22 08:44:27,116] [INFO] [logging.py:96:log_dist] [Rank 0] step=4180, skipped=0, lr=[7.137147340397428e-06, 7.137147340397428e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 08:44:27,117] [INFO] [timer.py:260:stop] epoch=1/micro_step=1851/global_step=4180, RunningAvgSamplesPerSec=4.750032840823496, CurrSamplesPerSec=4.7411309289955055, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 1, Total Step: 4190, Loss: 0.037147656083106995 +[2024-01-22 08:45:34,433] [INFO] [logging.py:96:log_dist] [Rank 0] step=4190, skipped=0, lr=[7.093470300783525e-06, 7.093470300783525e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 08:45:34,434] [INFO] [timer.py:260:stop] epoch=1/micro_step=1861/global_step=4190, RunningAvgSamplesPerSec=4.7500434926336474, CurrSamplesPerSec=4.771346127526, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 1, Total Step: 4200, Loss: 0.1446041464805603 +[2024-01-22 08:46:41,812] [INFO] [logging.py:96:log_dist] [Rank 0] step=4200, skipped=0, lr=[7.04985374142152e-06, 7.04985374142152e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 08:46:41,812] [INFO] [timer.py:260:stop] epoch=1/micro_step=1871/global_step=4200, RunningAvgSamplesPerSec=4.750043727361235, CurrSamplesPerSec=4.759393312112978, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 1, Total Step: 4210, Loss: 0.05761910229921341 +[2024-01-22 08:47:49,297] [INFO] [logging.py:96:log_dist] [Rank 0] step=4210, skipped=0, lr=[7.006298569902516e-06, 7.006298569902516e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 08:47:49,298] [INFO] [timer.py:260:stop] epoch=1/micro_step=1881/global_step=4210, RunningAvgSamplesPerSec=4.750025939773013, CurrSamplesPerSec=4.740204127480239, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 1, Total Step: 4220, Loss: 0.036561187356710434 +[2024-01-22 08:48:56,669] [INFO] [logging.py:96:log_dist] [Rank 0] step=4220, skipped=0, lr=[6.962805692540233e-06, 6.962805692540233e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 08:48:56,670] [INFO] [timer.py:260:stop] epoch=1/micro_step=1891/global_step=4220, RunningAvgSamplesPerSec=4.750027245952055, CurrSamplesPerSec=4.753308531564741, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 1, Total Step: 4230, Loss: 0.04095073789358139 +[2024-01-22 08:50:04,085] [INFO] [logging.py:96:log_dist] [Rank 0] step=4230, skipped=0, lr=[6.919376014352147e-06, 6.919376014352147e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 08:50:04,085] [INFO] [timer.py:260:stop] epoch=1/micro_step=1901/global_step=4230, RunningAvgSamplesPerSec=4.750021374035284, CurrSamplesPerSec=4.7420418414725045, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 1, Total Step: 4240, Loss: 0.04853092133998871 +[2024-01-22 08:51:11,592] [INFO] [logging.py:96:log_dist] [Rank 0] step=4240, skipped=0, lr=[6.8760104390406705e-06, 6.8760104390406705e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 08:51:11,593] [INFO] [timer.py:260:stop] epoch=1/micro_step=1911/global_step=4240, RunningAvgSamplesPerSec=4.750000119584085, CurrSamplesPerSec=4.7374063316638, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 1, Total Step: 4250, Loss: 0.20478157699108124 +[2024-01-22 08:52:18,947] [INFO] [logging.py:96:log_dist] [Rank 0] step=4250, skipped=0, lr=[6.832709868974318e-06, 6.832709868974318e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 08:52:18,948] [INFO] [timer.py:260:stop] epoch=1/micro_step=1921/global_step=4250, RunningAvgSamplesPerSec=4.750004455333974, CurrSamplesPerSec=4.760952406661033, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 1, Total Step: 4260, Loss: 0.08899834752082825 +[2024-01-22 08:53:26,326] [INFO] [logging.py:96:log_dist] [Rank 0] step=4260, skipped=0, lr=[6.789475205168968e-06, 6.789475205168968e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 08:53:26,326] [INFO] [timer.py:260:stop] epoch=1/micro_step=1931/global_step=4260, RunningAvgSamplesPerSec=4.750004822883346, CurrSamplesPerSec=4.759238386988727, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 1, Total Step: 4270, Loss: 0.07614777237176895 +[2024-01-22 08:54:33,654] [INFO] [logging.py:96:log_dist] [Rank 0] step=4270, skipped=0, lr=[6.746307347269078e-06, 6.746307347269078e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 08:54:33,655] [INFO] [timer.py:260:stop] epoch=1/micro_step=1941/global_step=4270, RunningAvgSamplesPerSec=4.750013504678442, CurrSamplesPerSec=4.752338092904224, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 1, Total Step: 4280, Loss: 0.058679066598415375 +[2024-01-22 08:55:40,982] [INFO] [logging.py:96:log_dist] [Rank 0] step=4280, skipped=0, lr=[6.703207193529e-06, 6.703207193529e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 08:55:40,982] [INFO] [timer.py:260:stop] epoch=1/micro_step=1951/global_step=4280, RunningAvgSamplesPerSec=4.750022249067114, CurrSamplesPerSec=4.759355508102247, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 1, Total Step: 4290, Loss: 0.02914400026202202 +[2024-01-22 08:56:48,449] [INFO] [logging.py:96:log_dist] [Rank 0] step=4290, skipped=0, lr=[6.660175640794247e-06, 6.660175640794247e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 08:56:48,449] [INFO] [timer.py:260:stop] epoch=1/micro_step=1961/global_step=4290, RunningAvgSamplesPerSec=4.750007939984335, CurrSamplesPerSec=4.731329904453507, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 1, Total Step: 4300, Loss: 0.06641086935997009 +[2024-01-22 08:57:55,781] [INFO] [logging.py:96:log_dist] [Rank 0] step=4300, skipped=0, lr=[6.617213584482877e-06, 6.617213584482877e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 08:57:55,781] [INFO] [timer.py:260:stop] epoch=1/micro_step=1971/global_step=4300, RunningAvgSamplesPerSec=4.750015949731746, CurrSamplesPerSec=4.754641637174601, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 1, Total Step: 4310, Loss: 0.02106896974146366 +[2024-01-22 08:59:03,124] [INFO] [logging.py:96:log_dist] [Rank 0] step=4310, skipped=0, lr=[6.574321918566819e-06, 6.574321918566819e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 08:59:03,124] [INFO] [timer.py:260:stop] epoch=1/micro_step=1981/global_step=4310, RunningAvgSamplesPerSec=4.750022012990258, CurrSamplesPerSec=4.742974557620554, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 1, Total Step: 4320, Loss: 0.04238169267773628 +[2024-01-22 09:00:10,571] [INFO] [logging.py:96:log_dist] [Rank 0] step=4320, skipped=0, lr=[6.531501535553303e-06, 6.531501535553303e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 09:00:10,571] [INFO] [timer.py:260:stop] epoch=1/micro_step=1991/global_step=4320, RunningAvgSamplesPerSec=4.75001113512446, CurrSamplesPerSec=4.726972996982603, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 1, Total Step: 4330, Loss: 0.028034383431077003 +[2024-01-22 09:01:17,961] [INFO] [logging.py:96:log_dist] [Rank 0] step=4330, skipped=0, lr=[6.488753326466276e-06, 6.488753326466276e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 09:01:17,962] [INFO] [timer.py:260:stop] epoch=1/micro_step=2001/global_step=4330, RunningAvgSamplesPerSec=4.75000960204521, CurrSamplesPerSec=4.771969215833294, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 1, Total Step: 4340, Loss: 0.02940499596297741 +[2024-01-22 09:02:25,383] [INFO] [logging.py:96:log_dist] [Rank 0] step=4340, skipped=0, lr=[6.446078180827847e-06, 6.446078180827847e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 09:02:25,384] [INFO] [timer.py:260:stop] epoch=1/micro_step=2011/global_step=4340, RunningAvgSamplesPerSec=4.750002947309517, CurrSamplesPerSec=4.736554531962827, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 1, Total Step: 4350, Loss: 0.06428222358226776 +[2024-01-22 09:03:32,707] [INFO] [logging.py:96:log_dist] [Rank 0] step=4350, skipped=0, lr=[6.40347698663981e-06, 6.40347698663981e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 09:03:32,707] [INFO] [timer.py:260:stop] epoch=1/micro_step=2021/global_step=4350, RunningAvgSamplesPerSec=4.750012242360507, CurrSamplesPerSec=4.762238943050329, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 1, Total Step: 4360, Loss: 0.011926885694265366 +[2024-01-22 09:04:40,208] [INFO] [logging.py:96:log_dist] [Rank 0] step=4360, skipped=0, lr=[6.360950630365126e-06, 6.360950630365126e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 09:04:40,209] [INFO] [timer.py:260:stop] epoch=1/micro_step=2031/global_step=4360, RunningAvgSamplesPerSec=4.749992697085756, CurrSamplesPerSec=4.742551890869285, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 1, Total Step: 4370, Loss: 0.09281729906797409 +[2024-01-22 09:05:47,699] [INFO] [logging.py:96:log_dist] [Rank 0] step=4370, skipped=0, lr=[6.318499996909519e-06, 6.318499996909519e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 09:05:47,700] [INFO] [timer.py:260:stop] epoch=1/micro_step=2041/global_step=4370, RunningAvgSamplesPerSec=4.749974942379351, CurrSamplesPerSec=4.732672569784213, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 1, Total Step: 4380, Loss: 0.015789013355970383 +[2024-01-22 09:06:55,117] [INFO] [logging.py:96:log_dist] [Rank 0] step=4380, skipped=0, lr=[6.276125969603024e-06, 6.276125969603024e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 09:06:55,118] [INFO] [timer.py:260:stop] epoch=1/micro_step=2051/global_step=4380, RunningAvgSamplesPerSec=4.74996898811078, CurrSamplesPerSec=4.74672269181574, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 1, Total Step: 4390, Loss: 0.027594611048698425 +[2024-01-22 09:08:02,525] [INFO] [logging.py:96:log_dist] [Rank 0] step=4390, skipped=0, lr=[6.23382943018164e-06, 6.23382943018164e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 09:08:02,526] [INFO] [timer.py:260:stop] epoch=1/micro_step=2061/global_step=4390, RunningAvgSamplesPerSec=4.749964651559337, CurrSamplesPerSec=4.747597633193025, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 1, Total Step: 4400, Loss: 0.0382491871714592 +[2024-01-22 09:09:09,982] [INFO] [logging.py:96:log_dist] [Rank 0] step=4400, skipped=0, lr=[6.191611258768953e-06, 6.191611258768953e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 09:09:09,982] [INFO] [timer.py:260:stop] epoch=1/micro_step=2071/global_step=4400, RunningAvgSamplesPerSec=4.7499524357645635, CurrSamplesPerSec=4.741177487934555, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 1, Total Step: 4410, Loss: 0.032709959894418716 +[2024-01-22 09:10:17,370] [INFO] [logging.py:96:log_dist] [Rank 0] step=4410, skipped=0, lr=[6.149472333857841e-06, 6.149472333857841e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 09:10:17,371] [INFO] [timer.py:260:stop] epoch=1/micro_step=2081/global_step=4410, RunningAvgSamplesPerSec=4.749951335614354, CurrSamplesPerSec=4.762445265725668, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 1, Total Step: 4420, Loss: 0.11348654329776764 +[2024-01-22 09:11:24,924] [INFO] [logging.py:96:log_dist] [Rank 0] step=4420, skipped=0, lr=[6.1074135322921964e-06, 6.1074135322921964e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 09:11:24,924] [INFO] [timer.py:260:stop] epoch=1/micro_step=2091/global_step=4420, RunningAvgSamplesPerSec=4.749923751607769, CurrSamplesPerSec=4.741100113509976, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 1, Total Step: 4430, Loss: 0.06240713968873024 +[2024-01-22 09:12:32,314] [INFO] [logging.py:96:log_dist] [Rank 0] step=4430, skipped=0, lr=[6.0654357292486566e-06, 6.0654357292486566e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 09:12:32,314] [INFO] [timer.py:260:stop] epoch=1/micro_step=2101/global_step=4430, RunningAvgSamplesPerSec=4.74992262866572, CurrSamplesPerSec=4.749569658066245, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 1, Total Step: 4440, Loss: 0.032763510942459106 +[2024-01-22 09:13:39,618] [INFO] [logging.py:96:log_dist] [Rank 0] step=4440, skipped=0, lr=[6.023539798218424e-06, 6.023539798218424e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 09:13:39,618] [INFO] [timer.py:260:stop] epoch=1/micro_step=2111/global_step=4440, RunningAvgSamplesPerSec=4.749934946176906, CurrSamplesPerSec=4.760642194617297, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 1, Total Step: 4450, Loss: 0.05629627779126167 +[2024-01-22 09:14:47,085] [INFO] [logging.py:96:log_dist] [Rank 0] step=4450, skipped=0, lr=[5.981726610989061e-06, 5.981726610989061e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 09:14:47,085] [INFO] [timer.py:260:stop] epoch=1/micro_step=2121/global_step=4450, RunningAvgSamplesPerSec=4.749921352817333, CurrSamplesPerSec=4.743171336200347, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 1, Total Step: 4460, Loss: 0.037237852811813354 +[2024-01-22 09:15:54,473] [INFO] [logging.py:96:log_dist] [Rank 0] step=4460, skipped=0, lr=[5.939997037626379e-06, 5.939997037626379e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 09:15:54,473] [INFO] [timer.py:260:stop] epoch=1/micro_step=2131/global_step=4460, RunningAvgSamplesPerSec=4.749920375882305, CurrSamplesPerSec=4.744926639406621, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 1, Total Step: 4470, Loss: 0.12265822291374207 +[2024-01-22 09:17:01,850] [INFO] [logging.py:96:log_dist] [Rank 0] step=4470, skipped=0, lr=[5.898351946456301e-06, 5.898351946456301e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 09:17:01,851] [INFO] [timer.py:260:stop] epoch=1/micro_step=2141/global_step=4470, RunningAvgSamplesPerSec=4.749921184866931, CurrSamplesPerSec=4.755244870204276, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 1, Total Step: 4480, Loss: 0.023885324597358704 +[2024-01-22 09:18:09,402] [INFO] [logging.py:96:log_dist] [Rank 0] step=4480, skipped=0, lr=[5.856792204046826e-06, 5.856792204046826e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 09:18:09,403] [INFO] [timer.py:260:stop] epoch=1/micro_step=2151/global_step=4480, RunningAvgSamplesPerSec=4.749894372179804, CurrSamplesPerSec=4.7330812930170385, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 1, Total Step: 4490, Loss: 0.03692825511097908 +[2024-01-22 09:19:16,790] [INFO] [logging.py:96:log_dist] [Rank 0] step=4490, skipped=0, lr=[5.815318675189969e-06, 5.815318675189969e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 09:19:16,790] [INFO] [timer.py:260:stop] epoch=1/micro_step=2161/global_step=4490, RunningAvgSamplesPerSec=4.7498935076350834, CurrSamplesPerSec=4.76546780327353, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 1, Total Step: 4500, Loss: 0.06557969003915787 +[2024-01-22 09:20:24,183] [INFO] [logging.py:96:log_dist] [Rank 0] step=4500, skipped=0, lr=[5.7739322228837816e-06, 5.7739322228837816e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 09:20:24,183] [INFO] [timer.py:260:stop] epoch=1/micro_step=2171/global_step=4500, RunningAvgSamplesPerSec=4.749891754881941, CurrSamplesPerSec=4.750561665560531, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 1, Total Step: 4510, Loss: 0.018276508897542953 +[2024-01-22 09:21:31,552] [INFO] [logging.py:96:log_dist] [Rank 0] step=4510, skipped=0, lr=[5.732633708314403e-06, 5.732633708314403e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 09:21:31,553] [INFO] [timer.py:260:stop] epoch=1/micro_step=2181/global_step=4510, RunningAvgSamplesPerSec=4.749893746799782, CurrSamplesPerSec=4.746287943584487, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 1, Total Step: 4520, Loss: 0.03850222006440163 +[2024-01-22 09:22:39,014] [INFO] [logging.py:96:log_dist] [Rank 0] step=4520, skipped=0, lr=[5.691423990838103e-06, 5.691423990838103e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 09:22:39,015] [INFO] [timer.py:260:stop] epoch=1/micro_step=2191/global_step=4520, RunningAvgSamplesPerSec=4.749881277789789, CurrSamplesPerSec=4.751711204357912, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 1, Total Step: 4530, Loss: 0.05202767625451088 +[2024-01-22 09:23:46,478] [INFO] [logging.py:96:log_dist] [Rank 0] step=4530, skipped=0, lr=[5.650303927963459e-06, 5.650303927963459e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 09:23:46,478] [INFO] [timer.py:260:stop] epoch=1/micro_step=2201/global_step=4530, RunningAvgSamplesPerSec=4.749868768030397, CurrSamplesPerSec=4.744818781938232, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 1, Total Step: 4540, Loss: 0.034960489720106125 +[2024-01-22 09:24:53,936] [INFO] [logging.py:96:log_dist] [Rank 0] step=4540, skipped=0, lr=[5.60927437533344e-06, 5.60927437533344e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 09:24:53,937] [INFO] [timer.py:260:stop] epoch=1/micro_step=2211/global_step=4540, RunningAvgSamplesPerSec=4.749856992363414, CurrSamplesPerSec=4.740676106312468, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 1, Total Step: 4550, Loss: 0.030402934178709984 +[2024-01-22 09:26:01,397] [INFO] [logging.py:96:log_dist] [Rank 0] step=4550, skipped=0, lr=[5.568336186707679e-06, 5.568336186707679e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 09:26:01,398] [INFO] [timer.py:260:stop] epoch=1/micro_step=2221/global_step=4550, RunningAvgSamplesPerSec=4.749844932722444, CurrSamplesPerSec=4.750172109059739, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 1, Total Step: 4560, Loss: 0.023456957191228867 +[2024-01-22 09:27:08,839] [INFO] [logging.py:96:log_dist] [Rank 0] step=4560, skipped=0, lr=[5.527490213944637e-06, 5.527490213944637e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 09:27:08,839] [INFO] [timer.py:260:stop] epoch=1/micro_step=2231/global_step=4560, RunningAvgSamplesPerSec=4.749835786831116, CurrSamplesPerSec=4.720247387070953, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 1, Total Step: 4570, Loss: 0.021582724526524544 +[2024-01-22 09:28:16,305] [INFO] [logging.py:96:log_dist] [Rank 0] step=4570, skipped=0, lr=[5.486737306983942e-06, 5.486737306983942e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 09:28:16,305] [INFO] [timer.py:260:stop] epoch=1/micro_step=2241/global_step=4570, RunningAvgSamplesPerSec=4.749822976753319, CurrSamplesPerSec=4.747754488476153, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 1, Total Step: 4580, Loss: 0.05611864849925041 +[2024-01-22 09:29:23,778] [INFO] [logging.py:96:log_dist] [Rank 0] step=4580, skipped=0, lr=[5.446078313828635e-06, 5.446078313828635e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 09:29:23,779] [INFO] [timer.py:260:stop] epoch=1/micro_step=2251/global_step=4580, RunningAvgSamplesPerSec=4.749809074458259, CurrSamplesPerSec=4.735311738342184, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 1, Total Step: 4590, Loss: 0.05998646095395088 +[2024-01-22 09:30:31,238] [INFO] [logging.py:96:log_dist] [Rank 0] step=4590, skipped=0, lr=[5.405514080527594e-06, 5.405514080527594e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 09:30:31,238] [INFO] [timer.py:260:stop] epoch=1/micro_step=2261/global_step=4590, RunningAvgSamplesPerSec=4.749797373604183, CurrSamplesPerSec=4.729891822420602, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 1, Total Step: 4600, Loss: 0.10789136588573456 +[2024-01-22 09:31:38,758] [INFO] [logging.py:96:log_dist] [Rank 0] step=4600, skipped=0, lr=[5.365045451157874e-06, 5.365045451157874e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 09:31:38,758] [INFO] [timer.py:260:stop] epoch=1/micro_step=2271/global_step=4600, RunningAvgSamplesPerSec=4.749776392100917, CurrSamplesPerSec=4.7592669073053075, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 1, Total Step: 4610, Loss: 0.08405714482069016 +[2024-01-22 09:32:46,166] [INFO] [logging.py:96:log_dist] [Rank 0] step=4610, skipped=0, lr=[5.324673267807173e-06, 5.324673267807173e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 09:32:46,167] [INFO] [timer.py:260:stop] epoch=1/micro_step=2281/global_step=4610, RunningAvgSamplesPerSec=4.749772608824065, CurrSamplesPerSec=4.736009841695256, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 1, Total Step: 4620, Loss: 0.09992418438196182 +[2024-01-22 09:33:53,572] [INFO] [logging.py:96:log_dist] [Rank 0] step=4620, skipped=0, lr=[5.284398370556299e-06, 5.284398370556299e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 09:33:53,573] [INFO] [timer.py:260:stop] epoch=1/micro_step=2291/global_step=4620, RunningAvgSamplesPerSec=4.749769226486221, CurrSamplesPerSec=4.734203683902077, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 1, Total Step: 4630, Loss: 0.029842600226402283 +[2024-01-22 09:35:01,031] [INFO] [logging.py:96:log_dist] [Rank 0] step=4630, skipped=0, lr=[5.2442215974616906e-06, 5.2442215974616906e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 09:35:01,032] [INFO] [timer.py:260:stop] epoch=1/micro_step=2301/global_step=4630, RunningAvgSamplesPerSec=4.749757868251454, CurrSamplesPerSec=4.719449704677774, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 1, Total Step: 4640, Loss: 0.029480883851647377 +[2024-01-22 09:36:08,492] [INFO] [logging.py:96:log_dist] [Rank 0] step=4640, skipped=0, lr=[5.2041437845379806e-06, 5.2041437845379806e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 09:36:08,493] [INFO] [timer.py:260:stop] epoch=1/micro_step=2311/global_step=4640, RunningAvgSamplesPerSec=4.749746170256197, CurrSamplesPerSec=4.733092308993289, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 1, Total Step: 4650, Loss: 0.012801233679056168 +[2024-01-22 09:37:15,997] [INFO] [logging.py:96:log_dist] [Rank 0] step=4650, skipped=0, lr=[5.164165765740597e-06, 5.164165765740597e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 09:37:15,998] [INFO] [timer.py:260:stop] epoch=1/micro_step=2321/global_step=4650, RunningAvgSamplesPerSec=4.749727856077609, CurrSamplesPerSec=4.746975520265405, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +***** Evaluating perplexity, Epoch 2/3 ***** +ppl: 1.0081787109375 +eval loss: 0.00814550556242466 +Beginning of Epoch 3/3, Total Micro Batches 2329 +Epoch: 2, Total Step: 4659, Loss: 0.04383140802383423 +[2024-01-22 09:38:19,289] [INFO] [logging.py:96:log_dist] [Rank 0] step=4660, skipped=0, lr=[5.1242883729484134e-06, 5.1242883729484134e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 09:38:19,289] [INFO] [timer.py:260:stop] epoch=2/micro_step=2/global_step=4660, RunningAvgSamplesPerSec=4.7504321553289275, CurrSamplesPerSec=4.75552708285213, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 2, Total Step: 4669, Loss: 0.01794895902276039 +[2024-01-22 09:39:26,600] [INFO] [logging.py:96:log_dist] [Rank 0] step=4670, skipped=0, lr=[5.084512435946433e-06, 5.084512435946433e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 09:39:26,600] [INFO] [timer.py:260:stop] epoch=2/micro_step=12/global_step=4670, RunningAvgSamplesPerSec=4.750441768452078, CurrSamplesPerSec=4.7550861718959805, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 2, Total Step: 4679, Loss: 0.03766123950481415 +[2024-01-22 09:40:33,905] [INFO] [logging.py:96:log_dist] [Rank 0] step=4680, skipped=0, lr=[5.044838782408528e-06, 5.044838782408528e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 09:40:33,906] [INFO] [timer.py:260:stop] epoch=2/micro_step=22/global_step=4680, RunningAvgSamplesPerSec=4.75045213597833, CurrSamplesPerSec=4.75712512230612, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 2, Total Step: 4689, Loss: 0.027211857959628105 +[2024-01-22 09:41:41,322] [INFO] [logging.py:96:log_dist] [Rank 0] step=4690, skipped=0, lr=[5.005268237880213e-06, 5.005268237880213e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 09:41:41,323] [INFO] [timer.py:260:stop] epoch=2/micro_step=32/global_step=4690, RunningAvgSamplesPerSec=4.750445689129741, CurrSamplesPerSec=4.73598510881147, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 2, Total Step: 4699, Loss: 0.039847664535045624 +[2024-01-22 09:42:48,822] [INFO] [logging.py:96:log_dist] [Rank 0] step=4700, skipped=0, lr=[4.965801625761472e-06, 4.965801625761472e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 09:42:48,823] [INFO] [timer.py:260:stop] epoch=2/micro_step=42/global_step=4700, RunningAvgSamplesPerSec=4.7504268167052714, CurrSamplesPerSec=4.725517097634174, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 2, Total Step: 4709, Loss: 0.017651349306106567 +[2024-01-22 09:43:56,315] [INFO] [logging.py:96:log_dist] [Rank 0] step=4710, skipped=0, lr=[4.9264397672896166e-06, 4.9264397672896166e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 09:43:56,315] [INFO] [timer.py:260:stop] epoch=2/micro_step=52/global_step=4710, RunningAvgSamplesPerSec=4.750409213386487, CurrSamplesPerSec=4.733453360958661, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 2, Total Step: 4719, Loss: 0.056400127708911896 +[2024-01-22 09:45:03,918] [INFO] [logging.py:96:log_dist] [Rank 0] step=4720, skipped=0, lr=[4.887183481522206e-06, 4.887183481522206e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 09:45:03,918] [INFO] [timer.py:260:stop] epoch=2/micro_step=62/global_step=4720, RunningAvgSamplesPerSec=4.750375088578413, CurrSamplesPerSec=4.722173824183618, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 2, Total Step: 4729, Loss: 0.04272197186946869 +[2024-01-22 09:46:11,441] [INFO] [logging.py:96:log_dist] [Rank 0] step=4730, skipped=0, lr=[4.8480335853199965e-06, 4.8480335853199965e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 09:46:11,441] [INFO] [timer.py:260:stop] epoch=2/micro_step=72/global_step=4730, RunningAvgSamplesPerSec=4.750353006922011, CurrSamplesPerSec=4.7324941823044595, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 2, Total Step: 4739, Loss: 0.01448905561119318 +[2024-01-22 09:47:18,922] [INFO] [logging.py:96:log_dist] [Rank 0] step=4740, skipped=0, lr=[4.808990893329948e-06, 4.808990893329948e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 09:47:18,922] [INFO] [timer.py:260:stop] epoch=2/micro_step=82/global_step=4740, RunningAvgSamplesPerSec=4.750337342122236, CurrSamplesPerSec=4.750652801076125, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 2, Total Step: 4749, Loss: 0.03968941420316696 +[2024-01-22 09:48:26,360] [INFO] [logging.py:96:log_dist] [Rank 0] step=4750, skipped=0, lr=[4.770056217968273e-06, 4.770056217968273e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 09:48:26,360] [INFO] [timer.py:260:stop] epoch=2/micro_step=92/global_step=4750, RunningAvgSamplesPerSec=4.7503280145932925, CurrSamplesPerSec=4.729041556231236, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 2, Total Step: 4759, Loss: 0.01695208065211773 +[2024-01-22 09:49:33,762] [INFO] [logging.py:96:log_dist] [Rank 0] step=4760, skipped=0, lr=[4.731230369403527e-06, 4.731230369403527e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 09:49:33,763] [INFO] [timer.py:260:stop] epoch=2/micro_step=102/global_step=4760, RunningAvgSamplesPerSec=4.75032405345838, CurrSamplesPerSec=4.753095593552239, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 2, Total Step: 4769, Loss: 0.028164617717266083 +[2024-01-22 09:50:41,163] [INFO] [logging.py:96:log_dist] [Rank 0] step=4770, skipped=0, lr=[4.692514155539758e-06, 4.692514155539758e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 09:50:41,164] [INFO] [timer.py:260:stop] epoch=2/micro_step=112/global_step=4770, RunningAvgSamplesPerSec=4.750320275626202, CurrSamplesPerSec=4.741625706362987, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 2, Total Step: 4779, Loss: 0.057571008801460266 +[2024-01-22 09:51:48,616] [INFO] [logging.py:96:log_dist] [Rank 0] step=4780, skipped=0, lr=[4.653908381999685e-06, 4.653908381999685e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 09:51:48,616] [INFO] [timer.py:260:stop] epoch=2/micro_step=122/global_step=4780, RunningAvgSamplesPerSec=4.75030899471382, CurrSamplesPerSec=4.737683922371689, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 2, Total Step: 4789, Loss: 0.013791180215775967 +[2024-01-22 09:52:56,005] [INFO] [logging.py:96:log_dist] [Rank 0] step=4790, skipped=0, lr=[4.61541385210794e-06, 4.61541385210794e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 09:52:56,005] [INFO] [timer.py:260:stop] epoch=2/micro_step=132/global_step=4790, RunningAvgSamplesPerSec=4.750307089076847, CurrSamplesPerSec=4.751414642428402, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 2, Total Step: 4799, Loss: 0.018102753907442093 +[2024-01-22 09:54:03,600] [INFO] [logging.py:96:log_dist] [Rank 0] step=4800, skipped=0, lr=[4.577031366874365e-06, 4.577031366874365e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 09:54:03,601] [INFO] [timer.py:260:stop] epoch=2/micro_step=142/global_step=4800, RunningAvgSamplesPerSec=4.750274877965121, CurrSamplesPerSec=4.704571861290806, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 2, Total Step: 4809, Loss: 0.04233488813042641 +[2024-01-22 09:55:11,161] [INFO] [logging.py:96:log_dist] [Rank 0] step=4810, skipped=0, lr=[4.538761724977307e-06, 4.538761724977307e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 09:55:11,162] [INFO] [timer.py:260:stop] epoch=2/micro_step=152/global_step=4810, RunningAvgSamplesPerSec=4.750247862708031, CurrSamplesPerSec=4.73574715156147, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 2, Total Step: 4819, Loss: 0.06620138138532639 +[2024-01-22 09:56:18,554] [INFO] [logging.py:96:log_dist] [Rank 0] step=4820, skipped=0, lr=[4.50060572274705e-06, 4.50060572274705e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 09:56:18,555] [INFO] [timer.py:260:stop] epoch=2/micro_step=162/global_step=4820, RunningAvgSamplesPerSec=4.750245456077883, CurrSamplesPerSec=4.754300754392285, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 2, Total Step: 4829, Loss: 0.021840203553438187 +[2024-01-22 09:57:26,085] [INFO] [logging.py:96:log_dist] [Rank 0] step=4830, skipped=0, lr=[4.46256415414919e-06, 4.46256415414919e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 09:57:26,086] [INFO] [timer.py:260:stop] epoch=2/micro_step=172/global_step=4830, RunningAvgSamplesPerSec=4.750222986743038, CurrSamplesPerSec=4.759652555794413, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 2, Total Step: 4839, Loss: 0.027829572558403015 +[2024-01-22 09:58:33,530] [INFO] [logging.py:96:log_dist] [Rank 0] step=4840, skipped=0, lr=[4.424637810768172e-06, 4.424637810768172e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 09:58:33,530] [INFO] [timer.py:260:stop] epoch=2/micro_step=182/global_step=4840, RunningAvgSamplesPerSec=4.750213155305752, CurrSamplesPerSec=4.757636229343216, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 2, Total Step: 4849, Loss: 0.021769477054476738 +[2024-01-22 09:59:40,955] [INFO] [logging.py:96:log_dist] [Rank 0] step=4850, skipped=0, lr=[4.3868274817907545e-06, 4.3868274817907545e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 09:59:40,956] [INFO] [timer.py:260:stop] epoch=2/micro_step=192/global_step=4850, RunningAvgSamplesPerSec=4.750206179442154, CurrSamplesPerSec=4.738355960348659, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 2, Total Step: 4859, Loss: 0.027222730219364166 +[2024-01-22 10:00:48,464] [INFO] [logging.py:96:log_dist] [Rank 0] step=4860, skipped=0, lr=[4.349133953989654e-06, 4.349133953989654e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 10:00:48,464] [INFO] [timer.py:260:stop] epoch=2/micro_step=202/global_step=4860, RunningAvgSamplesPerSec=4.750187252655309, CurrSamplesPerSec=4.747343898893091, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 2, Total Step: 4869, Loss: 0.02743493765592575 +[2024-01-22 10:01:55,826] [INFO] [logging.py:96:log_dist] [Rank 0] step=4870, skipped=0, lr=[4.311558011707109e-06, 4.311558011707109e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 10:01:55,827] [INFO] [timer.py:260:stop] epoch=2/micro_step=212/global_step=4870, RunningAvgSamplesPerSec=4.750189385518521, CurrSamplesPerSec=4.770199107116773, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 2, Total Step: 4879, Loss: 0.019777238368988037 +[2024-01-22 10:03:03,273] [INFO] [logging.py:96:log_dist] [Rank 0] step=4880, skipped=0, lr=[4.274100436838618e-06, 4.274100436838618e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 10:03:03,273] [INFO] [timer.py:260:stop] epoch=2/micro_step=222/global_step=4880, RunningAvgSamplesPerSec=4.750179517820066, CurrSamplesPerSec=4.752542548847082, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 2, Total Step: 4889, Loss: 0.03072638437151909 +[2024-01-22 10:04:10,740] [INFO] [logging.py:96:log_dist] [Rank 0] step=4890, skipped=0, lr=[4.236762008816629e-06, 4.236762008816629e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 10:04:10,740] [INFO] [timer.py:260:stop] epoch=2/micro_step=232/global_step=4890, RunningAvgSamplesPerSec=4.750166653553493, CurrSamplesPerSec=4.743651115271684, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 2, Total Step: 4899, Loss: 0.029104333370923996 +[2024-01-22 10:05:18,337] [INFO] [logging.py:96:log_dist] [Rank 0] step=4900, skipped=0, lr=[4.199543504594332e-06, 4.199543504594332e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 10:05:18,337] [INFO] [timer.py:260:stop] epoch=2/micro_step=242/global_step=4900, RunningAvgSamplesPerSec=4.750135175710009, CurrSamplesPerSec=4.721739242092087, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 2, Total Step: 4909, Loss: 0.02530732750892639 +[2024-01-22 10:06:25,841] [INFO] [logging.py:96:log_dist] [Rank 0] step=4910, skipped=0, lr=[4.1624456986295e-06, 4.1624456986295e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 10:06:25,841] [INFO] [timer.py:260:stop] epoch=2/micro_step=252/global_step=4910, RunningAvgSamplesPerSec=4.750117249444689, CurrSamplesPerSec=4.708216084888237, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 2, Total Step: 4919, Loss: 0.07364533096551895 +[2024-01-22 10:07:33,161] [INFO] [logging.py:96:log_dist] [Rank 0] step=4920, skipped=0, lr=[4.125469362868365e-06, 4.125469362868365e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 10:07:33,162] [INFO] [timer.py:260:stop] epoch=2/micro_step=262/global_step=4920, RunningAvgSamplesPerSec=4.7501256011621535, CurrSamplesPerSec=4.749466967520874, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 2, Total Step: 4929, Loss: 0.08948176354169846 +[2024-01-22 10:08:40,657] [INFO] [logging.py:96:log_dist] [Rank 0] step=4930, skipped=0, lr=[4.0886152667295565e-06, 4.0886152667295565e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 10:08:40,658] [INFO] [timer.py:260:stop] epoch=2/micro_step=272/global_step=4930, RunningAvgSamplesPerSec=4.750108868538797, CurrSamplesPerSec=4.744396457273534, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 2, Total Step: 4939, Loss: 0.038705550134181976 +[2024-01-22 10:09:47,989] [INFO] [logging.py:96:log_dist] [Rank 0] step=4940, skipped=0, lr=[4.051884177088095e-06, 4.051884177088095e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 10:09:47,989] [INFO] [timer.py:260:stop] epoch=2/micro_step=282/global_step=4940, RunningAvgSamplesPerSec=4.750115650090444, CurrSamplesPerSec=4.752129279931884, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 2, Total Step: 4949, Loss: 0.04459773004055023 +[2024-01-22 10:10:55,375] [INFO] [logging.py:96:log_dist] [Rank 0] step=4950, skipped=0, lr=[4.015276858259427e-06, 4.015276858259427e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 10:10:55,376] [INFO] [timer.py:260:stop] epoch=2/micro_step=292/global_step=4950, RunningAvgSamplesPerSec=4.750114479333833, CurrSamplesPerSec=4.746927336405248, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 2, Total Step: 4959, Loss: 0.04273287579417229 +[2024-01-22 10:12:02,736] [INFO] [logging.py:96:log_dist] [Rank 0] step=4960, skipped=0, lr=[3.9787940719835324e-06, 3.9787940719835324e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 10:12:02,737] [INFO] [timer.py:260:stop] epoch=2/micro_step=302/global_step=4960, RunningAvgSamplesPerSec=4.750117124232024, CurrSamplesPerSec=4.7563579091275034, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 2, Total Step: 4969, Loss: 0.03505350276827812 +[2024-01-22 10:13:10,195] [INFO] [logging.py:96:log_dist] [Rank 0] step=4970, skipped=0, lr=[3.942436577409058e-06, 3.942436577409058e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 10:13:10,196] [INFO] [timer.py:260:stop] epoch=2/micro_step=312/global_step=4970, RunningAvgSamplesPerSec=4.750105760580285, CurrSamplesPerSec=4.745785816182371, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 2, Total Step: 4979, Loss: 0.022300349548459053 +[2024-01-22 10:14:17,651] [INFO] [logging.py:96:log_dist] [Rank 0] step=4980, skipped=0, lr=[3.906205131077546e-06, 3.906205131077546e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 10:14:17,652] [INFO] [timer.py:260:stop] epoch=2/micro_step=322/global_step=4980, RunningAvgSamplesPerSec=4.750094771496916, CurrSamplesPerSec=4.760100729153238, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 2, Total Step: 4989, Loss: 0.062212251126766205 +[2024-01-22 10:15:25,104] [INFO] [logging.py:96:log_dist] [Rank 0] step=4990, skipped=0, lr=[3.870100486907651e-06, 3.870100486907651e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 10:15:25,105] [INFO] [timer.py:260:stop] epoch=2/micro_step=332/global_step=4990, RunningAvgSamplesPerSec=4.750084378477288, CurrSamplesPerSec=4.738283194420889, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 2, Total Step: 4999, Loss: 0.024772904813289642 +[2024-01-22 10:16:32,503] [INFO] [logging.py:96:log_dist] [Rank 0] step=5000, skipped=0, lr=[3.834123396179504e-06, 3.834123396179504e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 10:16:32,504] [INFO] [timer.py:260:stop] epoch=2/micro_step=342/global_step=5000, RunningAvgSamplesPerSec=4.750081528738929, CurrSamplesPerSec=4.7220867684586425, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 2, Total Step: 5009, Loss: 0.015571890398859978 +[2024-01-22 10:17:39,974] [INFO] [logging.py:96:log_dist] [Rank 0] step=5010, skipped=0, lr=[3.79827460751903e-06, 3.79827460751903e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 10:17:39,974] [INFO] [timer.py:260:stop] epoch=2/micro_step=352/global_step=5010, RunningAvgSamplesPerSec=4.750068731440459, CurrSamplesPerSec=4.7459991066536205, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 2, Total Step: 5019, Loss: 0.010124078020453453 +[2024-01-22 10:18:47,334] [INFO] [logging.py:96:log_dist] [Rank 0] step=5020, skipped=0, lr=[3.762554866882404e-06, 3.762554866882404e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 10:18:47,335] [INFO] [timer.py:260:stop] epoch=2/micro_step=362/global_step=5020, RunningAvgSamplesPerSec=4.750071368760871, CurrSamplesPerSec=4.752176223392745, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 2, Total Step: 5029, Loss: 0.07045213133096695 +[2024-01-22 10:19:54,598] [INFO] [logging.py:96:log_dist] [Rank 0] step=5030, skipped=0, lr=[3.7269649175405122e-06, 3.7269649175405122e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 10:19:54,599] [INFO] [timer.py:260:stop] epoch=2/micro_step=372/global_step=5030, RunningAvgSamplesPerSec=4.750087576415971, CurrSamplesPerSec=4.741361052948991, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 2, Total Step: 5039, Loss: 0.02506658248603344 +[2024-01-22 10:21:02,030] [INFO] [logging.py:96:log_dist] [Rank 0] step=5040, skipped=0, lr=[3.691505500063496e-06, 3.691505500063496e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 10:21:02,030] [INFO] [timer.py:260:stop] epoch=2/micro_step=382/global_step=5040, RunningAvgSamplesPerSec=4.750080321183334, CurrSamplesPerSec=4.730087350213568, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 2, Total Step: 5049, Loss: 0.021653873845934868 +[2024-01-22 10:22:09,548] [INFO] [logging.py:96:log_dist] [Rank 0] step=5050, skipped=0, lr=[3.6561773523053302e-06, 3.6561773523053302e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 10:22:09,549] [INFO] [timer.py:260:stop] epoch=2/micro_step=392/global_step=5050, RunningAvgSamplesPerSec=4.750060894365627, CurrSamplesPerSec=4.744030045042539, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 2, Total Step: 5059, Loss: 0.03861260041594505 +[2024-01-22 10:23:16,972] [INFO] [logging.py:96:log_dist] [Rank 0] step=5060, skipped=0, lr=[3.6209812093884777e-06, 3.6209812093884777e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 10:23:16,973] [INFO] [timer.py:260:stop] epoch=2/micro_step=402/global_step=5060, RunningAvgSamplesPerSec=4.75005475838086, CurrSamplesPerSec=4.73346154076746, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 2, Total Step: 5069, Loss: 0.03889566287398338 +[2024-01-22 10:24:24,344] [INFO] [logging.py:96:log_dist] [Rank 0] step=5070, skipped=0, lr=[3.585917803688603e-06, 3.585917803688603e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 10:24:24,345] [INFO] [timer.py:260:stop] epoch=2/micro_step=412/global_step=5070, RunningAvgSamplesPerSec=4.750055874416436, CurrSamplesPerSec=4.750008201706415, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 2, Total Step: 5079, Loss: 0.03392473980784416 +[2024-01-22 10:25:31,766] [INFO] [logging.py:96:log_dist] [Rank 0] step=5080, skipped=0, lr=[3.5509878648192964e-06, 3.5509878648192964e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 10:25:31,766] [INFO] [timer.py:260:stop] epoch=2/micro_step=422/global_step=5080, RunningAvgSamplesPerSec=4.750050149828087, CurrSamplesPerSec=4.763663964845255, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 2, Total Step: 5089, Loss: 0.05829259753227234 +[2024-01-22 10:26:39,273] [INFO] [logging.py:96:log_dist] [Rank 0] step=5090, skipped=0, lr=[3.5161921196169434e-06, 3.5161921196169434e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 10:26:39,273] [INFO] [timer.py:260:stop] epoch=2/micro_step=432/global_step=5090, RunningAvgSamplesPerSec=4.750032503310252, CurrSamplesPerSec=4.737976431151152, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 2, Total Step: 5099, Loss: 0.021029209718108177 +[2024-01-22 10:27:46,612] [INFO] [logging.py:96:log_dist] [Rank 0] step=5100, skipped=0, lr=[3.481531292125546e-06, 3.481531292125546e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 10:27:46,613] [INFO] [timer.py:260:stop] epoch=2/micro_step=442/global_step=5100, RunningAvgSamplesPerSec=4.750038166438388, CurrSamplesPerSec=4.733604942415825, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 2, Total Step: 5109, Loss: 0.012836124747991562 +[2024-01-22 10:28:54,212] [INFO] [logging.py:96:log_dist] [Rank 0] step=5110, skipped=0, lr=[3.447006103581709e-06, 3.447006103581709e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 10:28:54,213] [INFO] [timer.py:260:stop] epoch=2/micro_step=452/global_step=5110, RunningAvgSamplesPerSec=4.750007809550828, CurrSamplesPerSec=4.7342278972152165, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 2, Total Step: 5119, Loss: 0.015074226073920727 +[2024-01-22 10:30:01,648] [INFO] [logging.py:96:log_dist] [Rank 0] step=5120, skipped=0, lr=[3.412617272399584e-06, 3.412617272399584e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 10:30:01,648] [INFO] [timer.py:260:stop] epoch=2/micro_step=462/global_step=5120, RunningAvgSamplesPerSec=4.750000182211776, CurrSamplesPerSec=4.7579759039288065, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 2, Total Step: 5129, Loss: 0.012551895342767239 +[2024-01-22 10:31:09,136] [INFO] [logging.py:96:log_dist] [Rank 0] step=5130, skipped=0, lr=[3.3783655141559677e-06, 3.3783655141559677e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 10:31:09,137] [INFO] [timer.py:260:stop] epoch=2/micro_step=472/global_step=5130, RunningAvgSamplesPerSec=4.749985399424095, CurrSamplesPerSec=4.738636841064306, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 2, Total Step: 5139, Loss: 0.0419391393661499 +[2024-01-22 10:32:16,595] [INFO] [logging.py:96:log_dist] [Rank 0] step=5140, skipped=0, lr=[3.3442515415753583e-06, 3.3442515415753583e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 10:32:16,596] [INFO] [timer.py:260:stop] epoch=2/micro_step=482/global_step=5140, RunningAvgSamplesPerSec=4.74997469255147, CurrSamplesPerSec=4.744808214518454, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 2, Total Step: 5149, Loss: 0.02982579544186592 +[2024-01-22 10:33:24,135] [INFO] [logging.py:96:log_dist] [Rank 0] step=5150, skipped=0, lr=[3.3102760645151797e-06, 3.3102760645151797e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 10:33:24,136] [INFO] [timer.py:260:stop] epoch=2/micro_step=492/global_step=5150, RunningAvgSamplesPerSec=4.749952877225503, CurrSamplesPerSec=4.771290154239488, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 2, Total Step: 5159, Loss: 0.014715326949954033 +[2024-01-22 10:34:31,548] [INFO] [logging.py:96:log_dist] [Rank 0] step=5160, skipped=0, lr=[3.2764397899509735e-06, 3.2764397899509735e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 10:34:31,549] [INFO] [timer.py:260:stop] epoch=2/micro_step=502/global_step=5160, RunningAvgSamplesPerSec=4.749948640308143, CurrSamplesPerSec=4.761766208789776, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 2, Total Step: 5169, Loss: 0.015710238367319107 +[2024-01-22 10:35:38,913] [INFO] [logging.py:96:log_dist] [Rank 0] step=5170, skipped=0, lr=[3.242743421961698e-06, 3.242743421961698e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 10:35:38,914] [INFO] [timer.py:260:stop] epoch=2/micro_step=512/global_step=5170, RunningAvgSamplesPerSec=4.749950778483889, CurrSamplesPerSec=4.761357583865628, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 2, Total Step: 5179, Loss: 0.025402814149856567 +[2024-01-22 10:36:46,365] [INFO] [logging.py:96:log_dist] [Rank 0] step=5180, skipped=0, lr=[3.2091876617150806e-06, 3.2091876617150806e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 10:36:46,365] [INFO] [timer.py:260:stop] epoch=2/micro_step=522/global_step=5180, RunningAvgSamplesPerSec=4.749941269021317, CurrSamplesPerSec=4.765534469134725, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 2, Total Step: 5189, Loss: 0.015077459625899792 +[2024-01-22 10:37:53,920] [INFO] [logging.py:96:log_dist] [Rank 0] step=5190, skipped=0, lr=[3.1757732074530267e-06, 3.1757732074530267e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 10:37:53,920] [INFO] [timer.py:260:stop] epoch=2/micro_step=532/global_step=5190, RunningAvgSamplesPerSec=4.749917638729177, CurrSamplesPerSec=4.742801090898053, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 2, Total Step: 5199, Loss: 0.019690489396452904 +[2024-01-22 10:39:01,351] [INFO] [logging.py:96:log_dist] [Rank 0] step=5200, skipped=0, lr=[3.142500754477088e-06, 3.142500754477088e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 10:39:01,351] [INFO] [timer.py:260:stop] epoch=2/micro_step=542/global_step=5200, RunningAvgSamplesPerSec=4.749911021295735, CurrSamplesPerSec=4.764950614011663, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 2, Total Step: 5209, Loss: 0.024626223370432854 +[2024-01-22 10:40:08,914] [INFO] [logging.py:96:log_dist] [Rank 0] step=5210, skipped=0, lr=[3.1093709951339957e-06, 3.1093709951339957e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 10:40:08,914] [INFO] [timer.py:260:stop] epoch=2/micro_step=552/global_step=5210, RunningAvgSamplesPerSec=4.749886492240403, CurrSamplesPerSec=4.748694661795899, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 2, Total Step: 5219, Loss: 0.02114482969045639 +[2024-01-22 10:41:16,432] [INFO] [logging.py:96:log_dist] [Rank 0] step=5220, skipped=0, lr=[3.0763846188012536e-06, 3.0763846188012536e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 10:41:16,432] [INFO] [timer.py:260:stop] epoch=2/micro_step=562/global_step=5220, RunningAvgSamplesPerSec=4.749868193729115, CurrSamplesPerSec=4.733109333784718, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 2, Total Step: 5229, Loss: 0.01894483156502247 +[2024-01-22 10:42:23,785] [INFO] [logging.py:96:log_dist] [Rank 0] step=5230, skipped=0, lr=[3.043542311872796e-06, 3.043542311872796e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 10:42:23,786] [INFO] [timer.py:260:stop] epoch=2/micro_step=572/global_step=5230, RunningAvgSamplesPerSec=4.749872108932559, CurrSamplesPerSec=4.722965281359707, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 2, Total Step: 5239, Loss: 0.013945533894002438 +[2024-01-22 10:43:31,204] [INFO] [logging.py:96:log_dist] [Rank 0] step=5240, skipped=0, lr=[3.0108447577446954e-06, 3.0108447577446954e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 10:43:31,205] [INFO] [timer.py:260:stop] epoch=2/micro_step=582/global_step=5240, RunningAvgSamplesPerSec=4.749867101525455, CurrSamplesPerSec=4.754269599066104, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 2, Total Step: 5249, Loss: 0.030827293172478676 +[2024-01-22 10:44:38,652] [INFO] [logging.py:96:log_dist] [Rank 0] step=5250, skipped=0, lr=[2.9782926368009644e-06, 2.9782926368009644e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 10:44:38,652] [INFO] [timer.py:260:stop] epoch=2/micro_step=592/global_step=5250, RunningAvgSamplesPerSec=4.749858430233753, CurrSamplesPerSec=4.756278858595654, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 2, Total Step: 5259, Loss: 0.0571478046476841 +[2024-01-22 10:45:46,179] [INFO] [logging.py:96:log_dist] [Rank 0] step=5260, skipped=0, lr=[2.9458866263993604e-06, 2.9458866263993604e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 10:45:46,179] [INFO] [timer.py:260:stop] epoch=2/micro_step=602/global_step=5260, RunningAvgSamplesPerSec=4.749839089265385, CurrSamplesPerSec=4.738897509420404, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 2, Total Step: 5269, Loss: 0.016238592565059662 +[2024-01-22 10:46:53,569] [INFO] [logging.py:96:log_dist] [Rank 0] step=5270, skipped=0, lr=[2.9136274008573373e-06, 2.9136274008573373e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 10:46:53,570] [INFO] [timer.py:260:stop] epoch=2/micro_step=612/global_step=5270, RunningAvgSamplesPerSec=4.749838156025378, CurrSamplesPerSec=4.756097170318786, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 2, Total Step: 5279, Loss: 0.028265634551644325 +[2024-01-22 10:48:00,911] [INFO] [logging.py:96:log_dist] [Rank 0] step=5280, skipped=0, lr=[2.8815156314379668e-06, 2.8815156314379668e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 10:48:00,911] [INFO] [timer.py:260:stop] epoch=2/micro_step=622/global_step=5280, RunningAvgSamplesPerSec=4.7498437005559495, CurrSamplesPerSec=4.767499750023488, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 2, Total Step: 5289, Loss: 0.012683607637882233 +[2024-01-22 10:49:08,193] [INFO] [logging.py:96:log_dist] [Rank 0] step=5290, skipped=0, lr=[2.8495519863360166e-06, 2.8495519863360166e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 10:49:08,194] [INFO] [timer.py:260:stop] epoch=2/micro_step=632/global_step=5290, RunningAvgSamplesPerSec=4.749857108304031, CurrSamplesPerSec=4.761173142839669, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 2, Total Step: 5299, Loss: 0.020029893144965172 +[2024-01-22 10:50:15,558] [INFO] [logging.py:96:log_dist] [Rank 0] step=5300, skipped=0, lr=[2.817737130663999e-06, 2.817737130663999e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 10:50:15,559] [INFO] [timer.py:260:stop] epoch=2/micro_step=642/global_step=5300, RunningAvgSamplesPerSec=4.749859573340701, CurrSamplesPerSec=4.737406833304414, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 2, Total Step: 5309, Loss: 0.03577735275030136 +[2024-01-22 10:51:23,030] [INFO] [logging.py:96:log_dist] [Rank 0] step=5310, skipped=0, lr=[2.7860717264383807e-06, 2.7860717264383807e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 10:51:23,031] [INFO] [timer.py:260:stop] epoch=2/micro_step=652/global_step=5310, RunningAvgSamplesPerSec=4.749847736493183, CurrSamplesPerSec=4.746933044546664, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 2, Total Step: 5319, Loss: 0.018763702362775803 +[2024-01-22 10:52:30,382] [INFO] [logging.py:96:log_dist] [Rank 0] step=5320, skipped=0, lr=[2.754556432565758e-06, 2.754556432565758e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 10:52:30,383] [INFO] [timer.py:260:stop] epoch=2/micro_step=662/global_step=5320, RunningAvgSamplesPerSec=4.749851782482396, CurrSamplesPerSec=4.746245648007697, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 2, Total Step: 5329, Loss: 0.02098788134753704 +[2024-01-22 10:53:37,767] [INFO] [logging.py:96:log_dist] [Rank 0] step=5330, skipped=0, lr=[2.723191904829192e-06, 2.723191904829192e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 10:53:37,767] [INFO] [timer.py:260:stop] epoch=2/micro_step=672/global_step=5330, RunningAvgSamplesPerSec=4.749851617203261, CurrSamplesPerSec=4.736296126913147, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 2, Total Step: 5339, Loss: 0.010645180940628052 +[2024-01-22 10:54:45,248] [INFO] [logging.py:96:log_dist] [Rank 0] step=5340, skipped=0, lr=[2.691978795874518e-06, 2.691978795874518e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 10:54:45,248] [INFO] [timer.py:260:stop] epoch=2/micro_step=682/global_step=5340, RunningAvgSamplesPerSec=4.749838662913988, CurrSamplesPerSec=4.740097656423133, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 2, Total Step: 5349, Loss: 0.014903814531862736 +[2024-01-22 10:55:52,678] [INFO] [logging.py:96:log_dist] [Rank 0] step=5350, skipped=0, lr=[2.66091775519681e-06, 2.66091775519681e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 10:55:52,679] [INFO] [timer.py:260:stop] epoch=2/micro_step=692/global_step=5350, RunningAvgSamplesPerSec=4.749832427781793, CurrSamplesPerSec=4.737899161170795, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 2, Total Step: 5359, Loss: 0.06353472173213959 +[2024-01-22 10:57:00,102] [INFO] [logging.py:96:log_dist] [Rank 0] step=5360, skipped=0, lr=[2.6300094291268297e-06, 2.6300094291268297e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 10:57:00,102] [INFO] [timer.py:260:stop] epoch=2/micro_step=702/global_step=5360, RunningAvgSamplesPerSec=4.749827085259542, CurrSamplesPerSec=4.752920039247843, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 2, Total Step: 5369, Loss: 0.03854881972074509 +[2024-01-22 10:58:07,496] [INFO] [logging.py:96:log_dist] [Rank 0] step=5370, skipped=0, lr=[2.599254460817593e-06, 2.599254460817593e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 10:58:07,497] [INFO] [timer.py:260:stop] epoch=2/micro_step=712/global_step=5370, RunningAvgSamplesPerSec=4.749825570227868, CurrSamplesPerSec=4.755402736673956, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 2, Total Step: 5379, Loss: 0.02086297981441021 +[2024-01-22 10:59:14,879] [INFO] [logging.py:96:log_dist] [Rank 0] step=5380, skipped=0, lr=[2.568653490230989e-06, 2.568653490230989e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 10:59:14,880] [INFO] [timer.py:260:stop] epoch=2/micro_step=722/global_step=5380, RunningAvgSamplesPerSec=4.74982557193422, CurrSamplesPerSec=4.755404590024535, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 2, Total Step: 5389, Loss: 0.01957995444536209 +[2024-01-22 11:00:22,354] [INFO] [logging.py:96:log_dist] [Rank 0] step=5390, skipped=0, lr=[2.538207154124456e-06, 2.538207154124456e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 11:00:22,373] [INFO] [timer.py:260:stop] epoch=2/micro_step=732/global_step=5390, RunningAvgSamplesPerSec=4.749811170979623, CurrSamplesPerSec=4.739404037573072, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 2, Total Step: 5399, Loss: 0.04621715843677521 +[2024-01-22 11:01:30,068] [INFO] [logging.py:96:log_dist] [Rank 0] step=5400, skipped=0, lr=[2.507916086037736e-06, 2.507916086037736e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 11:01:30,069] [INFO] [timer.py:260:stop] epoch=2/micro_step=742/global_step=5400, RunningAvgSamplesPerSec=4.749770479206504, CurrSamplesPerSec=4.5465279226138176, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 2, Total Step: 5409, Loss: 0.07139088958501816 +[2024-01-22 11:02:37,510] [INFO] [logging.py:96:log_dist] [Rank 0] step=5410, skipped=0, lr=[2.477780916279693e-06, 2.477780916279693e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 11:02:37,510] [INFO] [timer.py:260:stop] epoch=2/micro_step=752/global_step=5410, RunningAvgSamplesPerSec=4.749762980554803, CurrSamplesPerSec=4.765130949236402, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 2, Total Step: 5419, Loss: 0.0354047529399395 +[2024-01-22 11:03:44,821] [INFO] [logging.py:96:log_dist] [Rank 0] step=5420, skipped=0, lr=[2.4478022719151915e-06, 2.4478022719151915e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 11:03:44,822] [INFO] [timer.py:260:stop] epoch=2/micro_step=762/global_step=5420, RunningAvgSamplesPerSec=4.749772378650179, CurrSamplesPerSec=4.74858948902338, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 2, Total Step: 5429, Loss: 0.02169969491660595 +[2024-01-22 11:04:52,284] [INFO] [logging.py:96:log_dist] [Rank 0] step=5430, skipped=0, lr=[2.417980776752057e-06, 2.417980776752057e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 11:04:52,285] [INFO] [timer.py:260:stop] epoch=2/micro_step=772/global_step=5430, RunningAvgSamplesPerSec=4.749762138274189, CurrSamplesPerSec=4.750557125690855, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 2, Total Step: 5439, Loss: 0.029276812449097633 +[2024-01-22 11:05:59,712] [INFO] [logging.py:96:log_dist] [Rank 0] step=5440, skipped=0, lr=[2.388317051328084e-06, 2.388317051328084e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 11:05:59,712] [INFO] [timer.py:260:stop] epoch=2/micro_step=782/global_step=5440, RunningAvgSamplesPerSec=4.7497564872882005, CurrSamplesPerSec=4.745584625827548, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 2, Total Step: 5449, Loss: 0.07677702605724335 +[2024-01-22 11:07:07,205] [INFO] [logging.py:96:log_dist] [Rank 0] step=5450, skipped=0, lr=[2.3588117128981356e-06, 2.3588117128981356e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 11:07:07,206] [INFO] [timer.py:260:stop] epoch=2/micro_step=792/global_step=5450, RunningAvgSamplesPerSec=4.749742289683742, CurrSamplesPerSec=4.689176375304525, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 2, Total Step: 5459, Loss: 0.025627376511693 +[2024-01-22 11:08:14,500] [INFO] [logging.py:96:log_dist] [Rank 0] step=5460, skipped=0, lr=[2.3294653754212915e-06, 2.3294653754212915e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 11:08:14,500] [INFO] [timer.py:260:stop] epoch=2/micro_step=802/global_step=5460, RunningAvgSamplesPerSec=4.749753888693316, CurrSamplesPerSec=4.742622944563587, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 2, Total Step: 5469, Loss: 0.03530857339501381 +[2024-01-22 11:09:21,776] [INFO] [logging.py:96:log_dist] [Rank 0] step=5470, skipped=0, lr=[2.3002786495480754e-06, 2.3002786495480754e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 11:09:21,777] [INFO] [timer.py:260:stop] epoch=2/micro_step=812/global_step=5470, RunningAvgSamplesPerSec=4.749767760529468, CurrSamplesPerSec=4.752243190938534, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 2, Total Step: 5479, Loss: 0.022180551663041115 +[2024-01-22 11:10:29,190] [INFO] [logging.py:96:log_dist] [Rank 0] step=5480, skipped=0, lr=[2.2712521426077483e-06, 2.2712521426077483e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 11:10:29,191] [INFO] [timer.py:260:stop] epoch=2/micro_step=822/global_step=5480, RunningAvgSamplesPerSec=4.7497638478183815, CurrSamplesPerSec=4.7549483724034305, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 2, Total Step: 5489, Loss: 0.015523741953074932 +[2024-01-22 11:11:36,545] [INFO] [logging.py:96:log_dist] [Rank 0] step=5490, skipped=0, lr=[2.24238645859567e-06, 2.24238645859567e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 11:11:36,546] [INFO] [timer.py:260:stop] epoch=2/micro_step=832/global_step=5490, RunningAvgSamplesPerSec=4.749767571113884, CurrSamplesPerSec=4.739469808833205, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 2, Total Step: 5499, Loss: 0.013436981476843357 +[2024-01-22 11:12:44,010] [INFO] [logging.py:96:log_dist] [Rank 0] step=5500, skipped=0, lr=[2.2136821981607305e-06, 2.2136821981607305e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 11:12:44,011] [INFO] [timer.py:260:stop] epoch=2/micro_step=842/global_step=5500, RunningAvgSamplesPerSec=4.749757209680746, CurrSamplesPerSec=4.750023667352852, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 2, Total Step: 5509, Loss: 0.014779139310121536 +[2024-01-22 11:13:51,473] [INFO] [logging.py:96:log_dist] [Rank 0] step=5510, skipped=0, lr=[2.1851399585928536e-06, 2.1851399585928536e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 11:13:51,474] [INFO] [timer.py:260:stop] epoch=2/micro_step=852/global_step=5510, RunningAvgSamplesPerSec=4.749747132673577, CurrSamplesPerSec=4.739211754969495, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 2, Total Step: 5519, Loss: 0.02345513179898262 +[2024-01-22 11:14:58,915] [INFO] [logging.py:96:log_dist] [Rank 0] step=5520, skipped=0, lr=[2.1567603338105667e-06, 2.1567603338105667e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 11:14:58,915] [INFO] [timer.py:260:stop] epoch=2/micro_step=862/global_step=5520, RunningAvgSamplesPerSec=4.749739775047276, CurrSamplesPerSec=4.731731056009668, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 2, Total Step: 5529, Loss: 0.018308386206626892 +[2024-01-22 11:16:06,251] [INFO] [logging.py:96:log_dist] [Rank 0] step=5530, skipped=0, lr=[2.1285439143486408e-06, 2.1285439143486408e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 11:16:06,251] [INFO] [timer.py:260:stop] epoch=2/micro_step=872/global_step=5530, RunningAvgSamplesPerSec=4.749745984505042, CurrSamplesPerSec=4.733643340140956, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 2, Total Step: 5539, Loss: 0.0491025410592556 +[2024-01-22 11:17:13,570] [INFO] [logging.py:96:log_dist] [Rank 0] step=5540, skipped=0, lr=[2.100491287345813e-06, 2.100491287345813e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 11:17:13,571] [INFO] [timer.py:260:stop] epoch=2/micro_step=882/global_step=5540, RunningAvgSamplesPerSec=4.7497541359596935, CurrSamplesPerSec=4.768359329064641, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 2, Total Step: 5549, Loss: 0.04489925131201744 +[2024-01-22 11:18:20,924] [INFO] [logging.py:96:log_dist] [Rank 0] step=5550, skipped=0, lr=[2.0726030365325434e-06, 2.0726030365325434e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 11:18:20,924] [INFO] [timer.py:260:stop] epoch=2/micro_step=892/global_step=5550, RunningAvgSamplesPerSec=4.749758090941637, CurrSamplesPerSec=4.755966558844744, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 2, Total Step: 5559, Loss: 0.040540922433137894 +[2024-01-22 11:19:28,434] [INFO] [logging.py:96:log_dist] [Rank 0] step=5560, skipped=0, lr=[2.044879742218906e-06, 2.044879742218906e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 11:19:28,434] [INFO] [timer.py:260:stop] epoch=2/micro_step=902/global_step=5560, RunningAvgSamplesPerSec=4.749742116661933, CurrSamplesPerSec=4.751505978884859, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 2, Total Step: 5569, Loss: 0.034597478806972504 +[2024-01-22 11:20:35,794] [INFO] [logging.py:96:log_dist] [Rank 0] step=5570, skipped=0, lr=[2.017321981282471e-06, 2.017321981282471e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 11:20:35,795] [INFO] [timer.py:260:stop] epoch=2/micro_step=912/global_step=5570, RunningAvgSamplesPerSec=4.749745082866869, CurrSamplesPerSec=4.752616426555501, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 2, Total Step: 5579, Loss: 0.02792753279209137 +[2024-01-22 11:21:43,107] [INFO] [logging.py:96:log_dist] [Rank 0] step=5580, skipped=0, lr=[1.9899303271563443e-06, 1.9899303271563443e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 11:21:43,108] [INFO] [timer.py:260:stop] epoch=2/micro_step=922/global_step=5580, RunningAvgSamplesPerSec=4.749756491174611, CurrSamplesPerSec=4.7498833034918215, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 2, Total Step: 5589, Loss: 0.01860707253217697 +[2024-01-22 11:22:50,470] [INFO] [logging.py:96:log_dist] [Rank 0] step=5590, skipped=0, lr=[1.9627053498171946e-06, 1.9627053498171946e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 11:22:50,470] [INFO] [timer.py:260:stop] epoch=2/micro_step=932/global_step=5590, RunningAvgSamplesPerSec=4.7497592934279105, CurrSamplesPerSec=4.756080485356627, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 2, Total Step: 5599, Loss: 0.011437960900366306 +[2024-01-22 11:23:57,777] [INFO] [logging.py:96:log_dist] [Rank 0] step=5600, skipped=0, lr=[1.9356476157734315e-06, 1.9356476157734315e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 11:23:57,778] [INFO] [timer.py:260:stop] epoch=2/micro_step=942/global_step=5600, RunningAvgSamplesPerSec=4.749768915060658, CurrSamplesPerSec=4.760890259692834, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 2, Total Step: 5609, Loss: 0.02275955118238926 +[2024-01-22 11:25:05,198] [INFO] [logging.py:96:log_dist] [Rank 0] step=5610, skipped=0, lr=[1.9087576880533763e-06, 1.9087576880533763e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 11:25:05,199] [INFO] [timer.py:260:stop] epoch=2/micro_step=952/global_step=5610, RunningAvgSamplesPerSec=4.7497642215412, CurrSamplesPerSec=4.751150913401393, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 2, Total Step: 5619, Loss: 0.029363112524151802 +[2024-01-22 11:26:12,496] [INFO] [logging.py:96:log_dist] [Rank 0] step=5620, skipped=0, lr=[1.8820361261935882e-06, 1.8820361261935882e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 11:26:12,496] [INFO] [timer.py:260:stop] epoch=2/micro_step=962/global_step=5620, RunningAvgSamplesPerSec=4.749775034467953, CurrSamplesPerSec=4.762364829861974, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 2, Total Step: 5629, Loss: 0.03192262724041939 +[2024-01-22 11:27:19,956] [INFO] [logging.py:96:log_dist] [Rank 0] step=5630, skipped=0, lr=[1.8554834862271887e-06, 1.8554834862271887e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 11:27:19,957] [INFO] [timer.py:260:stop] epoch=2/micro_step=972/global_step=5630, RunningAvgSamplesPerSec=4.749765398291921, CurrSamplesPerSec=4.746666959020844, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 2, Total Step: 5639, Loss: 0.08093052357435226 +[2024-01-22 11:28:27,388] [INFO] [logging.py:96:log_dist] [Rank 0] step=5640, skipped=0, lr=[1.829100320672309e-06, 1.829100320672309e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 11:28:27,388] [INFO] [timer.py:260:stop] epoch=2/micro_step=982/global_step=5640, RunningAvgSamplesPerSec=4.749759491737996, CurrSamplesPerSec=4.749722609663935, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 2, Total Step: 5649, Loss: 0.046877630054950714 +[2024-01-22 11:29:34,886] [INFO] [logging.py:96:log_dist] [Rank 0] step=5650, skipped=0, lr=[1.802887178520586e-06, 1.802887178520586e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 11:29:34,887] [INFO] [timer.py:260:stop] epoch=2/micro_step=992/global_step=5650, RunningAvgSamplesPerSec=4.7497451744354535, CurrSamplesPerSec=4.719103727810484, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 2, Total Step: 5659, Loss: 0.08165321499109268 +[2024-01-22 11:30:42,292] [INFO] [logging.py:96:log_dist] [Rank 0] step=5660, skipped=0, lr=[1.7768446052257404e-06, 1.7768446052257404e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 11:30:42,292] [INFO] [timer.py:260:stop] epoch=2/micro_step=1002/global_step=5660, RunningAvgSamplesPerSec=4.749742498509746, CurrSamplesPerSec=4.755069662489206, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 2, Total Step: 5669, Loss: 0.015755722299218178 +[2024-01-22 11:31:49,718] [INFO] [logging.py:96:log_dist] [Rank 0] step=5670, skipped=0, lr=[1.7509731426922284e-06, 1.7509731426922284e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 11:31:49,719] [INFO] [timer.py:260:stop] epoch=2/micro_step=1012/global_step=5670, RunningAvgSamplesPerSec=4.749737403541765, CurrSamplesPerSec=4.7434988897367, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 2, Total Step: 5679, Loss: 0.014016703702509403 +[2024-01-22 11:32:57,022] [INFO] [logging.py:96:log_dist] [Rank 0] step=5680, skipped=0, lr=[1.7252733292639623e-06, 1.7252733292639623e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 11:32:57,023] [INFO] [timer.py:260:stop] epoch=2/micro_step=1022/global_step=5680, RunningAvgSamplesPerSec=4.749747366584425, CurrSamplesPerSec=4.7528730810906215, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 2, Total Step: 5689, Loss: 0.061982616782188416 +[2024-01-22 11:34:04,424] [INFO] [logging.py:96:log_dist] [Rank 0] step=5690, skipped=0, lr=[1.6997456997131101e-06, 1.6997456997131101e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 11:34:04,424] [INFO] [timer.py:260:stop] epoch=2/micro_step=1032/global_step=5690, RunningAvgSamplesPerSec=4.749745270834984, CurrSamplesPerSec=4.735780069783899, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 2, Total Step: 5699, Loss: 0.034916672855615616 +[2024-01-22 11:35:11,720] [INFO] [logging.py:96:log_dist] [Rank 0] step=5700, skipped=0, lr=[1.6743907852289686e-06, 1.6743907852289686e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 11:35:11,721] [INFO] [timer.py:260:stop] epoch=2/micro_step=1042/global_step=5700, RunningAvgSamplesPerSec=4.749756167928241, CurrSamplesPerSec=4.767074732850876, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 2, Total Step: 5709, Loss: 0.03610040992498398 +[2024-01-22 11:36:19,118] [INFO] [logging.py:96:log_dist] [Rank 0] step=5710, skipped=0, lr=[1.6492091134069078e-06, 1.6492091134069078e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 11:36:19,118] [INFO] [timer.py:260:stop] epoch=2/micro_step=1052/global_step=5710, RunningAvgSamplesPerSec=4.749754490494737, CurrSamplesPerSec=4.769815985472675, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 2, Total Step: 5719, Loss: 0.017359010875225067 +[2024-01-22 11:37:26,548] [INFO] [logging.py:96:log_dist] [Rank 0] step=5720, skipped=0, lr=[1.624201208237397e-06, 1.624201208237397e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 11:37:26,549] [INFO] [timer.py:260:stop] epoch=2/micro_step=1062/global_step=5720, RunningAvgSamplesPerSec=4.749748776985421, CurrSamplesPerSec=4.74306121193379, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 2, Total Step: 5729, Loss: 0.02073444426059723 +[2024-01-22 11:38:34,059] [INFO] [logging.py:96:log_dist] [Rank 0] step=5730, skipped=0, lr=[1.5993675900950945e-06, 1.5993675900950945e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 11:38:34,060] [INFO] [timer.py:260:stop] epoch=2/micro_step=1072/global_step=5730, RunningAvgSamplesPerSec=4.749733158812794, CurrSamplesPerSec=4.74256814588983, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 2, Total Step: 5739, Loss: 0.016383448615670204 +[2024-01-22 11:39:41,556] [INFO] [logging.py:96:log_dist] [Rank 0] step=5740, skipped=0, lr=[1.5747087757280243e-06, 1.5747087757280243e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 11:39:41,556] [INFO] [timer.py:260:stop] epoch=2/micro_step=1082/global_step=5740, RunningAvgSamplesPerSec=4.7497193474345885, CurrSamplesPerSec=4.740172152179125, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 2, Total Step: 5749, Loss: 0.015516932122409344 +[2024-01-22 11:40:48,896] [INFO] [logging.py:96:log_dist] [Rank 0] step=5750, skipped=0, lr=[1.5502252782468252e-06, 1.5502252782468252e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 11:40:48,897] [INFO] [timer.py:260:stop] epoch=2/micro_step=1092/global_step=5750, RunningAvgSamplesPerSec=4.749724758497783, CurrSamplesPerSec=4.752567623256008, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 2, Total Step: 5759, Loss: 0.041889388114213943 +[2024-01-22 11:41:56,257] [INFO] [logging.py:96:log_dist] [Rank 0] step=5760, skipped=0, lr=[1.525917607114068e-06, 1.525917607114068e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 11:41:56,257] [INFO] [timer.py:260:stop] epoch=2/micro_step=1102/global_step=5760, RunningAvgSamplesPerSec=4.749727865523258, CurrSamplesPerSec=4.759896972835856, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 2, Total Step: 5769, Loss: 0.01950528472661972 +[2024-01-22 11:43:03,619] [INFO] [logging.py:96:log_dist] [Rank 0] step=5770, skipped=0, lr=[1.5017862681336581e-06, 1.5017862681336581e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 11:43:03,619] [INFO] [timer.py:260:stop] epoch=2/micro_step=1112/global_step=5770, RunningAvgSamplesPerSec=4.749730581082835, CurrSamplesPerSec=4.736152562165349, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 2, Total Step: 5779, Loss: 0.05077425017952919 +[2024-01-22 11:44:10,962] [INFO] [logging.py:96:log_dist] [Rank 0] step=5780, skipped=0, lr=[1.4778317634403082e-06, 1.4778317634403082e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 11:44:10,963] [INFO] [timer.py:260:stop] epoch=2/micro_step=1122/global_step=5780, RunningAvgSamplesPerSec=4.7497356168424645, CurrSamplesPerSec=4.747046370784126, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 2, Total Step: 5789, Loss: 0.05706050619482994 +[2024-01-22 11:45:18,377] [INFO] [logging.py:96:log_dist] [Rank 0] step=5790, skipped=0, lr=[1.4540545914890958e-06, 1.4540545914890958e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 11:45:18,378] [INFO] [timer.py:260:stop] epoch=2/micro_step=1132/global_step=5790, RunningAvgSamplesPerSec=4.749731850735013, CurrSamplesPerSec=4.756459043703073, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 2, Total Step: 5799, Loss: 0.03995170816779137 +[2024-01-22 11:46:25,698] [INFO] [logging.py:96:log_dist] [Rank 0] step=5800, skipped=0, lr=[1.4304552470450817e-06, 1.4304552470450817e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 11:46:25,699] [INFO] [timer.py:260:stop] epoch=2/micro_step=1142/global_step=5800, RunningAvgSamplesPerSec=4.749739543718189, CurrSamplesPerSec=4.750100156541489, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 2, Total Step: 5809, Loss: 0.03454599529504776 +[2024-01-22 11:47:33,036] [INFO] [logging.py:96:log_dist] [Rank 0] step=5810, skipped=0, lr=[1.4070342211730215e-06, 1.4070342211730215e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 11:47:33,037] [INFO] [timer.py:260:stop] epoch=2/micro_step=1152/global_step=5810, RunningAvgSamplesPerSec=4.7497451889807385, CurrSamplesPerSec=4.728971242139476, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 2, Total Step: 5819, Loss: 0.06686290353536606 +[2024-01-22 11:48:40,395] [INFO] [logging.py:96:log_dist] [Rank 0] step=5820, skipped=0, lr=[1.3837920012271445e-06, 1.3837920012271445e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 11:48:40,396] [INFO] [timer.py:260:stop] epoch=2/micro_step=1162/global_step=5820, RunningAvgSamplesPerSec=4.749748294744982, CurrSamplesPerSec=4.752902703332651, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 2, Total Step: 5829, Loss: 0.012469653971493244 +[2024-01-22 11:49:47,781] [INFO] [logging.py:96:log_dist] [Rank 0] step=5830, skipped=0, lr=[1.3607290708410204e-06, 1.3607290708410204e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 11:49:47,782] [INFO] [timer.py:260:stop] epoch=2/micro_step=1172/global_step=5830, RunningAvgSamplesPerSec=4.749748065945671, CurrSamplesPerSec=4.748162125559848, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 2, Total Step: 5839, Loss: 0.015529108233749866 +[2024-01-22 11:50:55,114] [INFO] [logging.py:96:log_dist] [Rank 0] step=5840, skipped=0, lr=[1.3378459099174734e-06, 1.3378459099174734e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 11:50:55,114] [INFO] [timer.py:260:stop] epoch=2/micro_step=1182/global_step=5840, RunningAvgSamplesPerSec=4.749754319264826, CurrSamplesPerSec=4.737466863731494, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 2, Total Step: 5849, Loss: 0.02740299142897129 +[2024-01-22 11:52:02,527] [INFO] [logging.py:96:log_dist] [Rank 0] step=5850, skipped=0, lr=[1.3151429946186322e-06, 1.3151429946186322e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 11:52:02,527] [INFO] [timer.py:260:stop] epoch=2/micro_step=1192/global_step=5850, RunningAvgSamplesPerSec=4.749751035302749, CurrSamplesPerSec=4.73933090475295, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 2, Total Step: 5859, Loss: 0.015603732317686081 +[2024-01-22 11:53:09,974] [INFO] [logging.py:96:log_dist] [Rank 0] step=5860, skipped=0, lr=[1.29262079735598e-06, 1.29262079735598e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 11:53:09,975] [INFO] [timer.py:260:stop] epoch=2/micro_step=1202/global_step=5860, RunningAvgSamplesPerSec=4.749743532122372, CurrSamplesPerSec=4.748904685265455, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 2, Total Step: 5869, Loss: 0.01589106023311615 +[2024-01-22 11:54:17,200] [INFO] [logging.py:96:log_dist] [Rank 0] step=5870, skipped=0, lr=[1.2702797867805649e-06, 1.2702797867805649e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 11:54:17,200] [INFO] [timer.py:260:stop] epoch=2/micro_step=1212/global_step=5870, RunningAvgSamplesPerSec=4.749762589036482, CurrSamplesPerSec=4.750695679696177, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 2, Total Step: 5879, Loss: 0.014225756749510765 +[2024-01-22 11:55:24,634] [INFO] [logging.py:96:log_dist] [Rank 0] step=5880, skipped=0, lr=[1.2481204277732107e-06, 1.2481204277732107e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 11:55:24,634] [INFO] [timer.py:260:stop] epoch=2/micro_step=1222/global_step=5880, RunningAvgSamplesPerSec=4.749756618941018, CurrSamplesPerSec=4.719164954554502, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 2, Total Step: 5889, Loss: 0.016019240021705627 +[2024-01-22 11:56:32,015] [INFO] [logging.py:96:log_dist] [Rank 0] step=5890, skipped=0, lr=[1.22614318143488e-06, 1.22614318143488e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 11:56:32,015] [INFO] [timer.py:260:stop] epoch=2/micro_step=1232/global_step=5890, RunningAvgSamplesPerSec=4.749757014796902, CurrSamplesPerSec=4.741191891244783, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 2, Total Step: 5899, Loss: 0.012406694702804089 +[2024-01-22 11:57:39,415] [INFO] [logging.py:96:log_dist] [Rank 0] step=5900, skipped=0, lr=[1.204348505077042e-06, 1.204348505077042e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 11:57:39,416] [INFO] [timer.py:260:stop] epoch=2/micro_step=1242/global_step=5900, RunningAvgSamplesPerSec=4.749755081577915, CurrSamplesPerSec=4.744624885664164, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 2, Total Step: 5909, Loss: 0.02086544968187809 +[2024-01-22 11:58:46,730] [INFO] [logging.py:96:log_dist] [Rank 0] step=5910, skipped=0, lr=[1.182736852212192e-06, 1.182736852212192e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 11:58:46,731] [INFO] [timer.py:260:stop] epoch=2/micro_step=1252/global_step=5910, RunningAvgSamplesPerSec=4.749764161564238, CurrSamplesPerSec=4.760149011997005, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 2, Total Step: 5919, Loss: 0.015017512254416943 +[2024-01-22 11:59:54,051] [INFO] [logging.py:96:log_dist] [Rank 0] step=5920, skipped=0, lr=[1.161308672544389e-06, 1.161308672544389e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 11:59:54,051] [INFO] [timer.py:260:stop] epoch=2/micro_step=1262/global_step=5920, RunningAvgSamplesPerSec=4.749771774201542, CurrSamplesPerSec=4.742459055052395, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 2, Total Step: 5929, Loss: 0.03942824900150299 +[2024-01-22 12:01:01,400] [INFO] [logging.py:96:log_dist] [Rank 0] step=5930, skipped=0, lr=[1.140064411959909e-06, 1.140064411959909e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 12:01:01,401] [INFO] [timer.py:260:stop] epoch=2/micro_step=1272/global_step=5930, RunningAvgSamplesPerSec=4.74977583558505, CurrSamplesPerSec=4.753819492272008, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 2, Total Step: 5939, Loss: 0.0167243629693985 +[2024-01-22 12:02:08,734] [INFO] [logging.py:96:log_dist] [Rank 0] step=5940, skipped=0, lr=[1.119004512517965e-06, 1.119004512517965e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 12:02:08,735] [INFO] [timer.py:260:stop] epoch=2/micro_step=1282/global_step=5940, RunningAvgSamplesPerSec=4.749781686748476, CurrSamplesPerSec=4.752849518207224, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 2, Total Step: 5949, Loss: 0.02517867274582386 +[2024-01-22 12:03:16,123] [INFO] [logging.py:96:log_dist] [Rank 0] step=5950, skipped=0, lr=[1.0981294124415075e-06, 1.0981294124415075e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 12:03:16,123] [INFO] [timer.py:260:stop] epoch=2/micro_step=1292/global_step=5950, RunningAvgSamplesPerSec=4.749781230985982, CurrSamplesPerSec=4.746045593431407, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 2, Total Step: 5959, Loss: 0.014032556675374508 +[2024-01-22 12:04:23,457] [INFO] [logging.py:96:log_dist] [Rank 0] step=5960, skipped=0, lr=[1.0774395461081089e-06, 1.0774395461081089e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 12:04:23,457] [INFO] [timer.py:260:stop] epoch=2/micro_step=1302/global_step=5960, RunningAvgSamplesPerSec=4.7497871294129155, CurrSamplesPerSec=4.738254590495717, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 2, Total Step: 5969, Loss: 0.012863795273005962 +[2024-01-22 12:05:30,817] [INFO] [logging.py:96:log_dist] [Rank 0] step=5970, skipped=0, lr=[1.0569353440409213e-06, 1.0569353440409213e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 12:05:30,818] [INFO] [timer.py:260:stop] epoch=2/micro_step=1312/global_step=5970, RunningAvgSamplesPerSec=4.74978986622096, CurrSamplesPerSec=4.754390686110713, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 2, Total Step: 5979, Loss: 0.011284634470939636 +[2024-01-22 12:06:38,054] [INFO] [logging.py:96:log_dist] [Rank 0] step=5980, skipped=0, lr=[1.0366172328997182e-06, 1.0366172328997182e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 12:06:38,055] [INFO] [timer.py:260:stop] epoch=2/micro_step=1322/global_step=5980, RunningAvgSamplesPerSec=4.749807044741402, CurrSamplesPerSec=4.77408244704643, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 2, Total Step: 5989, Loss: 0.0437389500439167 +[2024-01-22 12:07:45,354] [INFO] [logging.py:96:log_dist] [Rank 0] step=5990, skipped=0, lr=[1.0164856354720187e-06, 1.0164856354720187e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 12:07:45,355] [INFO] [timer.py:260:stop] epoch=2/micro_step=1332/global_step=5990, RunningAvgSamplesPerSec=4.7498168978127095, CurrSamplesPerSec=4.774773854315918, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 2, Total Step: 5999, Loss: 0.023541659116744995 +[2024-01-22 12:08:52,680] [INFO] [logging.py:96:log_dist] [Rank 0] step=6000, skipped=0, lr=[9.96540970664287e-07, 9.96540970664287e-07], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 12:08:52,680] [INFO] [timer.py:260:stop] epoch=2/micro_step=1342/global_step=6000, RunningAvgSamplesPerSec=4.749823590248313, CurrSamplesPerSec=4.772944294686328, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 2, Total Step: 6009, Loss: 0.014436419121921062 +[2024-01-22 12:09:59,964] [INFO] [logging.py:96:log_dist] [Rank 0] step=6010, skipped=0, lr=[9.767836534932241e-07, 9.767836534932241e-07], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 12:09:59,964] [INFO] [timer.py:260:stop] epoch=2/micro_step=1352/global_step=6010, RunningAvgSamplesPerSec=4.749835235641952, CurrSamplesPerSec=4.741478803319798, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 2, Total Step: 6019, Loss: 0.020921040326356888 +[2024-01-22 12:11:07,332] [INFO] [logging.py:96:log_dist] [Rank 0] step=6020, skipped=0, lr=[9.572140950771115e-07, 9.572140950771115e-07], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 12:11:07,333] [INFO] [timer.py:260:stop] epoch=2/micro_step=1362/global_step=6020, RunningAvgSamplesPerSec=4.74983686778308, CurrSamplesPerSec=4.753780261412046, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 2, Total Step: 6029, Loss: 0.05584738031029701 +[2024-01-22 12:12:14,741] [INFO] [logging.py:96:log_dist] [Rank 0] step=6030, skipped=0, lr=[9.378327026272871e-07, 9.378327026272871e-07], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 12:12:14,753] [INFO] [timer.py:260:stop] epoch=2/micro_step=1372/global_step=6030, RunningAvgSamplesPerSec=4.749832607327444, CurrSamplesPerSec=4.738737558043633, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 2, Total Step: 6039, Loss: 0.04254451021552086 +[2024-01-22 12:13:22,081] [INFO] [logging.py:96:log_dist] [Rank 0] step=6040, skipped=0, lr=[9.186398794396389e-07, 9.186398794396389e-07], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 12:13:22,081] [INFO] [timer.py:260:stop] epoch=2/micro_step=1382/global_step=6040, RunningAvgSamplesPerSec=4.749838944075252, CurrSamplesPerSec=4.7405717907327425, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 2, Total Step: 6049, Loss: 0.020823508501052856 +[2024-01-22 12:14:29,426] [INFO] [logging.py:96:log_dist] [Rank 0] step=6050, skipped=0, lr=[8.996360248862434e-07, 8.996360248862434e-07], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 12:14:29,426] [INFO] [timer.py:260:stop] epoch=2/micro_step=1392/global_step=6050, RunningAvgSamplesPerSec=4.749843371982713, CurrSamplesPerSec=4.750370157854974, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 2, Total Step: 6059, Loss: 0.036714132875204086 +[2024-01-22 12:15:36,776] [INFO] [logging.py:96:log_dist] [Rank 0] step=6060, skipped=0, lr=[8.80821534407027e-07, 8.80821534407027e-07], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 12:15:36,776] [INFO] [timer.py:260:stop] epoch=2/micro_step=1402/global_step=6060, RunningAvgSamplesPerSec=4.749847164462711, CurrSamplesPerSec=4.7598070014889275, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 2, Total Step: 6069, Loss: 0.062165047973394394 +[2024-01-22 12:16:44,153] [INFO] [logging.py:96:log_dist] [Rank 0] step=6070, skipped=0, lr=[8.621967995015645e-07, 8.621967995015645e-07], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 12:16:44,153] [INFO] [timer.py:260:stop] epoch=2/micro_step=1412/global_step=6070, RunningAvgSamplesPerSec=4.749847906945026, CurrSamplesPerSec=4.7749308117818625, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 2, Total Step: 6079, Loss: 0.010757423005998135 +[2024-01-22 12:17:51,596] [INFO] [logging.py:96:log_dist] [Rank 0] step=6080, skipped=0, lr=[8.437622077209073e-07, 8.437622077209073e-07], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 12:17:51,597] [INFO] [timer.py:260:stop] epoch=2/micro_step=1422/global_step=6080, RunningAvgSamplesPerSec=4.749840826745059, CurrSamplesPerSec=4.744592012088557, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 2, Total Step: 6089, Loss: 0.014164083637297153 +[2024-01-22 12:18:59,035] [INFO] [logging.py:96:log_dist] [Rank 0] step=6090, skipped=0, lr=[8.255181426595427e-07, 8.255181426595427e-07], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 12:18:59,036] [INFO] [timer.py:260:stop] epoch=2/micro_step=1432/global_step=6090, RunningAvgSamplesPerSec=4.749834344590854, CurrSamplesPerSec=4.732859982903303, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 2, Total Step: 6099, Loss: 0.02133997529745102 +[2024-01-22 12:20:06,446] [INFO] [logging.py:96:log_dist] [Rank 0] step=6100, skipped=0, lr=[8.074649839473925e-07, 8.074649839473925e-07], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 12:20:06,447] [INFO] [timer.py:260:stop] epoch=2/micro_step=1442/global_step=6100, RunningAvgSamplesPerSec=4.749831070509249, CurrSamplesPerSec=4.755448228423826, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 2, Total Step: 6109, Loss: 0.02086484804749489 +[2024-01-22 12:21:13,958] [INFO] [logging.py:96:log_dist] [Rank 0] step=6110, skipped=0, lr=[7.896031072419263e-07, 7.896031072419263e-07], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 12:21:13,958] [INFO] [timer.py:260:stop] epoch=2/micro_step=1452/global_step=6110, RunningAvgSamplesPerSec=4.749816242384106, CurrSamplesPerSec=4.764364534225432, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 2, Total Step: 6119, Loss: 0.012495594099164009 +[2024-01-22 12:22:21,316] [INFO] [logging.py:96:log_dist] [Rank 0] step=6120, skipped=0, lr=[7.719328842203355e-07, 7.719328842203355e-07], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 12:22:21,336] [INFO] [timer.py:260:stop] epoch=2/micro_step=1462/global_step=6120, RunningAvgSamplesPerSec=4.7498169291582055, CurrSamplesPerSec=4.753260387388425, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 2, Total Step: 6129, Loss: 0.011831508949398994 +[2024-01-22 12:23:28,678] [INFO] [logging.py:96:log_dist] [Rank 0] step=6130, skipped=0, lr=[7.54454682571808e-07, 7.54454682571808e-07], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 12:23:28,678] [INFO] [timer.py:260:stop] epoch=2/micro_step=1472/global_step=6130, RunningAvgSamplesPerSec=4.749821571399206, CurrSamplesPerSec=4.741330736943925, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 2, Total Step: 6139, Loss: 0.06492243707180023 +[2024-01-22 12:24:36,110] [INFO] [logging.py:96:log_dist] [Rank 0] step=6140, skipped=0, lr=[7.371688659898712e-07, 7.371688659898712e-07], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 12:24:36,111] [INFO] [timer.py:260:stop] epoch=2/micro_step=1482/global_step=6140, RunningAvgSamplesPerSec=4.749815931248311, CurrSamplesPerSec=4.722931876272574, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 2, Total Step: 6149, Loss: 0.020301587879657745 +[2024-01-22 12:25:43,448] [INFO] [logging.py:96:log_dist] [Rank 0] step=6150, skipped=0, lr=[7.20075794164824e-07, 7.20075794164824e-07], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 12:25:43,448] [INFO] [timer.py:260:stop] epoch=2/micro_step=1492/global_step=6150, RunningAvgSamplesPerSec=4.7498211245137165, CurrSamplesPerSec=4.736811460666171, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 2, Total Step: 6159, Loss: 0.02345452643930912 +[2024-01-22 12:26:50,865] [INFO] [logging.py:96:log_dist] [Rank 0] step=6160, skipped=0, lr=[7.031758227762575e-07, 7.031758227762575e-07], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 12:26:50,865] [INFO] [timer.py:260:stop] epoch=2/micro_step=1502/global_step=6160, RunningAvgSamplesPerSec=4.749817255885909, CurrSamplesPerSec=4.750529718513785, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 2, Total Step: 6169, Loss: 0.023608390241861343 +[2024-01-22 12:27:58,181] [INFO] [logging.py:96:log_dist] [Rank 0] step=6170, skipped=0, lr=[6.864693034856473e-07, 6.864693034856473e-07], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 12:27:58,181] [INFO] [timer.py:260:stop] epoch=2/micro_step=1512/global_step=6170, RunningAvgSamplesPerSec=4.749824898339665, CurrSamplesPerSec=4.75124560355434, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 2, Total Step: 6179, Loss: 0.021856199949979782 +[2024-01-22 12:29:05,726] [INFO] [logging.py:96:log_dist] [Rank 0] step=6180, skipped=0, lr=[6.699565839290412e-07, 6.699565839290412e-07], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 12:29:05,727] [INFO] [timer.py:260:stop] epoch=2/micro_step=1522/global_step=6180, RunningAvgSamplesPerSec=4.749806352577326, CurrSamplesPerSec=4.712100363889702, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 2, Total Step: 6189, Loss: 0.01337597705423832 +[2024-01-22 12:30:13,149] [INFO] [logging.py:96:log_dist] [Rank 0] step=6190, skipped=0, lr=[6.536380077098214e-07, 6.536380077098214e-07], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 12:30:13,150] [INFO] [timer.py:260:stop] epoch=2/micro_step=1532/global_step=6190, RunningAvgSamplesPerSec=4.749801843294798, CurrSamplesPerSec=4.736164428071823, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 2, Total Step: 6199, Loss: 0.011978505179286003 +[2024-01-22 12:31:20,453] [INFO] [logging.py:96:log_dist] [Rank 0] step=6200, skipped=0, lr=[6.375139143915588e-07, 6.375139143915588e-07], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 12:31:20,454] [INFO] [timer.py:260:stop] epoch=2/micro_step=1542/global_step=6200, RunningAvgSamplesPerSec=4.749810826405833, CurrSamplesPerSec=4.737386767762748, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 2, Total Step: 6209, Loss: 0.01099093072116375 +[2024-01-22 12:32:27,758] [INFO] [logging.py:96:log_dist] [Rank 0] step=6210, skipped=0, lr=[6.215846394909442e-07, 6.215846394909442e-07], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 12:32:27,758] [INFO] [timer.py:260:stop] epoch=2/micro_step=1552/global_step=6210, RunningAvgSamplesPerSec=4.749819807253532, CurrSamplesPerSec=4.747694700871777, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 2, Total Step: 6219, Loss: 0.02490321174263954 +[2024-01-22 12:33:35,030] [INFO] [logging.py:96:log_dist] [Rank 0] step=6220, skipped=0, lr=[6.058505144708061e-07, 6.058505144708061e-07], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 12:33:35,031] [INFO] [timer.py:260:stop] epoch=2/micro_step=1562/global_step=6220, RunningAvgSamplesPerSec=4.749832452427667, CurrSamplesPerSec=4.756928027226166, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 2, Total Step: 6229, Loss: 0.01906600408256054 +[2024-01-22 12:34:42,513] [INFO] [logging.py:96:log_dist] [Rank 0] step=6230, skipped=0, lr=[5.903118667332164e-07, 5.903118667332164e-07], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 12:34:42,513] [INFO] [timer.py:260:stop] epoch=2/micro_step=1572/global_step=6230, RunningAvgSamplesPerSec=4.749821273836478, CurrSamplesPerSec=4.736125822312386, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 2, Total Step: 6239, Loss: 0.01162846852093935 +[2024-01-22 12:35:49,951] [INFO] [logging.py:96:log_dist] [Rank 0] step=6240, skipped=0, lr=[5.749690196126767e-07, 5.749690196126767e-07], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 12:35:49,951] [INFO] [timer.py:260:stop] epoch=2/micro_step=1582/global_step=6240, RunningAvgSamplesPerSec=4.749815066282505, CurrSamplesPerSec=4.763473428031595, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 2, Total Step: 6249, Loss: 0.016948996111750603 +[2024-01-22 12:36:57,294] [INFO] [logging.py:96:log_dist] [Rank 0] step=6250, skipped=0, lr=[5.598222923693875e-07, 5.598222923693875e-07], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 12:36:57,295] [INFO] [timer.py:260:stop] epoch=2/micro_step=1592/global_step=6250, RunningAvgSamplesPerSec=4.7498195040755515, CurrSamplesPerSec=4.737713690053467, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 2, Total Step: 6259, Loss: 0.013400072231888771 +[2024-01-22 12:38:04,669] [INFO] [logging.py:96:log_dist] [Rank 0] step=6260, skipped=0, lr=[5.448720001826091e-07, 5.448720001826091e-07], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 12:38:04,669] [INFO] [timer.py:260:stop] epoch=2/micro_step=1602/global_step=6260, RunningAvgSamplesPerSec=4.7498204301466975, CurrSamplesPerSec=4.7136524604755206, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 2, Total Step: 6269, Loss: 0.01449246983975172 +[2024-01-22 12:39:12,116] [INFO] [logging.py:96:log_dist] [Rank 0] step=6270, skipped=0, lr=[5.301184541441007e-07, 5.301184541441007e-07], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 12:39:12,116] [INFO] [timer.py:260:stop] epoch=2/micro_step=1612/global_step=6270, RunningAvgSamplesPerSec=4.749813295685673, CurrSamplesPerSec=4.738136832944242, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 2, Total Step: 6279, Loss: 0.022362535819411278 +[2024-01-22 12:40:19,516] [INFO] [logging.py:96:log_dist] [Rank 0] step=6280, skipped=0, lr=[5.155619612516505e-07, 5.155619612516505e-07], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 12:40:19,517] [INFO] [timer.py:260:stop] epoch=2/micro_step=1622/global_step=6280, RunningAvgSamplesPerSec=4.7498113628015854, CurrSamplesPerSec=4.750778244238806, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 2, Total Step: 6289, Loss: 0.020169373601675034 +[2024-01-22 12:41:26,793] [INFO] [logging.py:96:log_dist] [Rank 0] step=6290, skipped=0, lr=[5.012028244026757e-07, 5.012028244026757e-07], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 12:41:26,793] [INFO] [timer.py:260:stop] epoch=2/micro_step=1632/global_step=6290, RunningAvgSamplesPerSec=4.749823293794733, CurrSamplesPerSec=4.763071271259761, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 2, Total Step: 6299, Loss: 0.011605444364249706 +[2024-01-22 12:42:34,120] [INFO] [logging.py:96:log_dist] [Rank 0] step=6300, skipped=0, lr=[4.870413423879416e-07, 4.870413423879416e-07], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 12:42:34,121] [INFO] [timer.py:260:stop] epoch=2/micro_step=1642/global_step=6300, RunningAvgSamplesPerSec=4.749829505465202, CurrSamplesPerSec=4.7648667103423925, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 2, Total Step: 6309, Loss: 0.018407419323921204 +[2024-01-22 12:43:41,495] [INFO] [logging.py:96:log_dist] [Rank 0] step=6310, skipped=0, lr=[4.7307780988531946e-07, 4.7307780988531946e-07], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 12:43:41,495] [INFO] [timer.py:260:stop] epoch=2/micro_step=1652/global_step=6310, RunningAvgSamplesPerSec=4.749830516826155, CurrSamplesPerSec=4.7560825077700475, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 2, Total Step: 6319, Loss: 0.04301939532160759 +[2024-01-22 12:44:48,857] [INFO] [logging.py:96:log_dist] [Rank 0] step=6320, skipped=0, lr=[4.593125174536761e-07, 4.593125174536761e-07], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 12:44:48,857] [INFO] [timer.py:260:stop] epoch=2/micro_step=1662/global_step=6320, RunningAvgSamplesPerSec=4.749832917450595, CurrSamplesPerSec=4.749862964027654, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 2, Total Step: 6329, Loss: 0.010908924043178558 +[2024-01-22 12:45:56,195] [INFO] [logging.py:96:log_dist] [Rank 0] step=6330, skipped=0, lr=[4.457457515268082e-07, 4.457457515268082e-07], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 12:45:56,195] [INFO] [timer.py:260:stop] epoch=2/micro_step=1672/global_step=6330, RunningAvgSamplesPerSec=4.749837983309595, CurrSamplesPerSec=4.766235588835726, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 2, Total Step: 6339, Loss: 0.01919282227754593 +[2024-01-22 12:47:03,597] [INFO] [logging.py:96:log_dist] [Rank 0] step=6340, skipped=0, lr=[4.323777944075058e-07, 4.323777944075058e-07], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 12:47:03,598] [INFO] [timer.py:260:stop] epoch=2/micro_step=1682/global_step=6340, RunningAvgSamplesPerSec=4.749835812992992, CurrSamplesPerSec=4.752361818987894, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 2, Total Step: 6349, Loss: 0.02479756809771061 +[2024-01-22 12:48:10,991] [INFO] [logging.py:96:log_dist] [Rank 0] step=6350, skipped=0, lr=[4.192089242616482e-07, 4.192089242616482e-07], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 12:48:10,992] [INFO] [timer.py:260:stop] epoch=2/micro_step=1692/global_step=6350, RunningAvgSamplesPerSec=4.749834578179352, CurrSamplesPerSec=4.745601405021626, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 2, Total Step: 6359, Loss: 0.017866525799036026 +[2024-01-22 12:49:18,310] [INFO] [logging.py:96:log_dist] [Rank 0] step=6360, skipped=0, lr=[4.0623941511244713e-07, 4.0623941511244713e-07], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 12:49:18,310] [INFO] [timer.py:260:stop] epoch=2/micro_step=1702/global_step=6360, RunningAvgSamplesPerSec=4.749841686137207, CurrSamplesPerSec=4.759841774251657, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 2, Total Step: 6369, Loss: 0.015286087058484554 +[2024-01-22 12:50:25,681] [INFO] [logging.py:96:log_dist] [Rank 0] step=6370, skipped=0, lr=[3.9346953683471857e-07, 3.9346953683471857e-07], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 12:50:25,682] [INFO] [timer.py:260:stop] epoch=2/micro_step=1712/global_step=6370, RunningAvgSamplesPerSec=4.749842926290729, CurrSamplesPerSec=4.745164345726322, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 2, Total Step: 6379, Loss: 0.012954462319612503 +[2024-01-22 12:51:33,027] [INFO] [logging.py:96:log_dist] [Rank 0] step=6380, skipped=0, lr=[3.8089955514928554e-07, 3.8089955514928554e-07], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 12:51:33,027] [INFO] [timer.py:260:stop] epoch=2/micro_step=1722/global_step=6380, RunningAvgSamplesPerSec=4.749847054922322, CurrSamplesPerSec=4.749586129303461, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 2, Total Step: 6389, Loss: 0.043888434767723083 +[2024-01-22 12:52:40,472] [INFO] [logging.py:96:log_dist] [Rank 0] step=6390, skipped=0, lr=[3.685297316174363e-07, 3.685297316174363e-07], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 12:52:40,473] [INFO] [timer.py:260:stop] epoch=2/micro_step=1732/global_step=6390, RunningAvgSamplesPerSec=4.749840079141555, CurrSamplesPerSec=4.741369092617583, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 2, Total Step: 6399, Loss: 0.010757556185126305 +[2024-01-22 12:53:47,895] [INFO] [logging.py:96:log_dist] [Rank 0] step=6400, skipped=0, lr=[3.5636032363549065e-07, 3.5636032363549065e-07], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 12:53:47,895] [INFO] [timer.py:260:stop] epoch=2/micro_step=1742/global_step=6400, RunningAvgSamplesPerSec=4.749835759907349, CurrSamplesPerSec=4.755459348761733, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 2, Total Step: 6409, Loss: 0.1276363730430603 +[2024-01-22 12:54:55,231] [INFO] [logging.py:96:log_dist] [Rank 0] step=6410, skipped=0, lr=[3.4439158442943655e-07, 3.4439158442943655e-07], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 12:54:55,231] [INFO] [timer.py:260:stop] epoch=2/micro_step=1752/global_step=6410, RunningAvgSamplesPerSec=4.749840895782249, CurrSamplesPerSec=4.750223216845309, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 2, Total Step: 6419, Loss: 0.017784563824534416 +[2024-01-22 12:56:02,714] [INFO] [logging.py:96:log_dist] [Rank 0] step=6420, skipped=0, lr=[3.326237630496687e-07, 3.326237630496687e-07], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 12:56:02,715] [INFO] [timer.py:260:stop] epoch=2/micro_step=1762/global_step=6420, RunningAvgSamplesPerSec=4.749830510709396, CurrSamplesPerSec=4.716325473993112, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 2, Total Step: 6429, Loss: 0.03169148787856102 +[2024-01-22 12:57:10,067] [INFO] [logging.py:96:log_dist] [Rank 0] step=6430, skipped=0, lr=[3.210571043657973e-07, 3.210571043657973e-07], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 12:57:10,068] [INFO] [timer.py:260:stop] epoch=2/micro_step=1772/global_step=6430, RunningAvgSamplesPerSec=4.749833825729483, CurrSamplesPerSec=4.759752480170199, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 2, Total Step: 6439, Loss: 0.017682313919067383 +[2024-01-22 12:58:17,364] [INFO] [logging.py:96:log_dist] [Rank 0] step=6440, skipped=0, lr=[3.096918490615608e-07, 3.096918490615608e-07], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 12:58:17,364] [INFO] [timer.py:260:stop] epoch=2/micro_step=1782/global_step=6440, RunningAvgSamplesPerSec=4.749843314183696, CurrSamplesPerSec=4.760383518480227, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 2, Total Step: 6449, Loss: 0.014054086059331894 +[2024-01-22 12:59:24,688] [INFO] [logging.py:96:log_dist] [Rank 0] step=6450, skipped=0, lr=[2.985282336298134e-07, 2.985282336298134e-07], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 12:59:24,689] [INFO] [timer.py:260:stop] epoch=2/micro_step=1792/global_step=6450, RunningAvgSamplesPerSec=4.749849703979994, CurrSamplesPerSec=4.73099819265901, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 2, Total Step: 6459, Loss: 0.011726326309144497 +[2024-01-22 13:00:32,142] [INFO] [logging.py:96:log_dist] [Rank 0] step=6460, skipped=0, lr=[2.875664903676045e-07, 2.875664903676045e-07], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 13:00:32,143] [INFO] [timer.py:260:stop] epoch=2/micro_step=1802/global_step=6460, RunningAvgSamplesPerSec=4.749841958327974, CurrSamplesPerSec=4.7563341431263355, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 2, Total Step: 6469, Loss: 0.023618336766958237 +[2024-01-22 13:01:39,630] [INFO] [logging.py:96:log_dist] [Rank 0] step=6470, skipped=0, lr=[2.768068473713459e-07, 2.768068473713459e-07], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 13:01:39,630] [INFO] [timer.py:260:stop] epoch=2/micro_step=1812/global_step=6470, RunningAvgSamplesPerSec=4.749830526650275, CurrSamplesPerSec=4.768291737140964, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 2, Total Step: 6479, Loss: 0.04495960846543312 +[2024-01-22 13:02:46,976] [INFO] [logging.py:96:log_dist] [Rank 0] step=6480, skipped=0, lr=[2.662495285320632e-07, 2.662495285320632e-07], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 13:02:46,977] [INFO] [timer.py:260:stop] epoch=2/micro_step=1822/global_step=6480, RunningAvgSamplesPerSec=4.749834461959851, CurrSamplesPerSec=4.746512861297806, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 2, Total Step: 6489, Loss: 0.0123212318867445 +[2024-01-22 13:03:54,344] [INFO] [logging.py:96:log_dist] [Rank 0] step=6490, skipped=0, lr=[2.5589475353073987e-07, 2.5589475353073987e-07], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 13:03:54,345] [INFO] [timer.py:260:stop] epoch=2/micro_step=1832/global_step=6490, RunningAvgSamplesPerSec=4.749836031309972, CurrSamplesPerSec=4.745976786610985, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 2, Total Step: 6499, Loss: 0.0886368677020073 +[2024-01-22 13:05:01,671] [INFO] [logging.py:96:log_dist] [Rank 0] step=6500, skipped=0, lr=[2.4574273783374623e-07, 2.4574273783374623e-07], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 13:05:01,671] [INFO] [timer.py:260:stop] epoch=2/micro_step=1842/global_step=6500, RunningAvgSamplesPerSec=4.749842117899008, CurrSamplesPerSec=4.77001109858428, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 2, Total Step: 6509, Loss: 0.012893804349005222 +[2024-01-22 13:06:09,050] [INFO] [logging.py:96:log_dist] [Rank 0] step=6510, skipped=0, lr=[2.3579369268835196e-07, 2.3579369268835196e-07], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 13:06:09,050] [INFO] [timer.py:260:stop] epoch=2/micro_step=1852/global_step=6510, RunningAvgSamplesPerSec=4.749842585788137, CurrSamplesPerSec=4.766218324903355, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 2, Total Step: 6519, Loss: 0.02299666590988636 +[2024-01-22 13:07:16,373] [INFO] [logging.py:96:log_dist] [Rank 0] step=6520, skipped=0, lr=[2.26047825118334e-07, 2.26047825118334e-07], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 13:07:16,373] [INFO] [timer.py:260:stop] epoch=2/micro_step=1862/global_step=6520, RunningAvgSamplesPerSec=4.749849024669063, CurrSamplesPerSec=4.757551571232638, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 2, Total Step: 6529, Loss: 0.07957718521356583 +[2024-01-22 13:08:23,738] [INFO] [logging.py:96:log_dist] [Rank 0] step=6530, skipped=0, lr=[2.165053379196691e-07, 2.165053379196691e-07], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 13:08:23,739] [INFO] [timer.py:260:stop] epoch=2/micro_step=1872/global_step=6530, RunningAvgSamplesPerSec=4.749850814707701, CurrSamplesPerSec=4.749761941667038, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 2, Total Step: 6539, Loss: 0.030769001692533493 +[2024-01-22 13:09:31,159] [INFO] [logging.py:96:log_dist] [Rank 0] step=6540, skipped=0, lr=[2.0716642965630917e-07, 2.0716642965630917e-07], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 13:09:31,160] [INFO] [timer.py:260:stop] epoch=2/micro_step=1882/global_step=6540, RunningAvgSamplesPerSec=4.749846683876286, CurrSamplesPerSec=4.760740471902288, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 2, Total Step: 6549, Loss: 0.01317563746124506 +[2024-01-22 13:10:38,533] [INFO] [logging.py:96:log_dist] [Rank 0] step=6550, skipped=0, lr=[1.9803129465605808e-07, 1.9803129465605808e-07], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 13:10:38,534] [INFO] [timer.py:260:stop] epoch=2/micro_step=1892/global_step=6550, RunningAvgSamplesPerSec=4.749847636202129, CurrSamplesPerSec=4.734214204138737, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 2, Total Step: 6559, Loss: 0.017031555995345116 +[2024-01-22 13:11:45,905] [INFO] [logging.py:96:log_dist] [Rank 0] step=6560, skipped=0, lr=[1.8910012300651592e-07, 1.8910012300651592e-07], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 13:11:45,906] [INFO] [timer.py:260:stop] epoch=2/micro_step=1902/global_step=6560, RunningAvgSamplesPerSec=4.749848708195059, CurrSamplesPerSec=4.760836895656204, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 2, Total Step: 6569, Loss: 0.029662776738405228 +[2024-01-22 13:12:53,402] [INFO] [logging.py:96:log_dist] [Rank 0] step=6570, skipped=0, lr=[1.8037310055113778e-07, 1.8037310055113778e-07], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 13:12:53,403] [INFO] [timer.py:260:stop] epoch=2/micro_step=1912/global_step=6570, RunningAvgSamplesPerSec=4.7498363918840365, CurrSamplesPerSec=4.745132135712228, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 2, Total Step: 6579, Loss: 0.1764068305492401 +[2024-01-22 13:14:00,701] [INFO] [logging.py:96:log_dist] [Rank 0] step=6580, skipped=0, lr=[1.718504088853512e-07, 1.718504088853512e-07], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 13:14:00,715] [INFO] [timer.py:260:stop] epoch=2/micro_step=1922/global_step=6580, RunningAvgSamplesPerSec=4.749843924964083, CurrSamplesPerSec=4.748465169390087, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 2, Total Step: 6589, Loss: 0.028277579694986343 +[2024-01-22 13:15:08,096] [INFO] [logging.py:96:log_dist] [Rank 0] step=6590, skipped=0, lr=[1.63532225352796e-07, 1.63532225352796e-07], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 13:15:08,096] [INFO] [timer.py:260:stop] epoch=2/micro_step=1932/global_step=6590, RunningAvgSamplesPerSec=4.749844157626207, CurrSamplesPerSec=4.74540123697189, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 2, Total Step: 6599, Loss: 0.05110957846045494 +[2024-01-22 13:16:15,420] [INFO] [logging.py:96:log_dist] [Rank 0] step=6600, skipped=0, lr=[1.5541872304161266e-07, 1.5541872304161266e-07], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 13:16:15,420] [INFO] [timer.py:260:stop] epoch=2/micro_step=1942/global_step=6600, RunningAvgSamplesPerSec=4.749850362588539, CurrSamplesPerSec=4.765545636694882, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 2, Total Step: 6609, Loss: 0.03560924902558327 +[2024-01-22 13:17:22,817] [INFO] [logging.py:96:log_dist] [Rank 0] step=6610, skipped=0, lr=[1.4751007078085854e-07, 1.4751007078085854e-07], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 13:17:22,817] [INFO] [timer.py:260:stop] epoch=2/micro_step=1952/global_step=6610, RunningAvgSamplesPerSec=4.749848932924006, CurrSamplesPerSec=4.740388789098009, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 2, Total Step: 6619, Loss: 0.013863084837794304 +[2024-01-22 13:18:30,188] [INFO] [logging.py:96:log_dist] [Rank 0] step=6620, skipped=0, lr=[1.3980643313698528e-07, 1.3980643313698528e-07], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 13:18:30,189] [INFO] [timer.py:260:stop] epoch=2/micro_step=1962/global_step=6620, RunningAvgSamplesPerSec=4.749850100348437, CurrSamplesPerSec=4.7624025127119305, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 2, Total Step: 6629, Loss: 0.01857743039727211 +[2024-01-22 13:19:37,521] [INFO] [logging.py:96:log_dist] [Rank 0] step=6630, skipped=0, lr=[1.323079704104191e-07, 1.323079704104191e-07], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 13:19:37,522] [INFO] [timer.py:260:stop] epoch=2/micro_step=1972/global_step=6630, RunningAvgSamplesPerSec=4.74985538213186, CurrSamplesPerSec=4.731320564520749, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 2, Total Step: 6639, Loss: 0.017645500600337982 +[2024-01-22 13:20:44,863] [INFO] [logging.py:96:log_dist] [Rank 0] step=6640, skipped=0, lr=[1.250148386322192e-07, 1.250148386322192e-07], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 13:20:44,864] [INFO] [timer.py:260:stop] epoch=2/micro_step=1982/global_step=6640, RunningAvgSamplesPerSec=4.749859754543086, CurrSamplesPerSec=4.733485579552858, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 2, Total Step: 6649, Loss: 0.0321258082985878 +[2024-01-22 13:21:52,275] [INFO] [logging.py:96:log_dist] [Rank 0] step=6650, skipped=0, lr=[1.1792718956083915e-07, 1.1792718956083915e-07], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 13:21:52,276] [INFO] [timer.py:260:stop] epoch=2/micro_step=1992/global_step=6650, RunningAvgSamplesPerSec=4.749856650876547, CurrSamplesPerSec=4.746434305524824, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 2, Total Step: 6659, Loss: 0.019503667950630188 +[2024-01-22 13:22:59,570] [INFO] [logging.py:96:log_dist] [Rank 0] step=6660, skipped=0, lr=[1.1104517067896281e-07, 1.1104517067896281e-07], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 13:22:59,570] [INFO] [timer.py:260:stop] epoch=2/micro_step=2002/global_step=6660, RunningAvgSamplesPerSec=4.749865963364216, CurrSamplesPerSec=4.744796305260517, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 2, Total Step: 6669, Loss: 0.010050250217318535 +[2024-01-22 13:24:06,976] [INFO] [logging.py:96:log_dist] [Rank 0] step=6670, skipped=0, lr=[1.0436892519043673e-07, 1.0436892519043673e-07], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 13:24:06,978] [INFO] [timer.py:260:stop] epoch=2/micro_step=2012/global_step=6670, RunningAvgSamplesPerSec=4.7498632540327215, CurrSamplesPerSec=4.743083169332207, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 2, Total Step: 6679, Loss: 0.03030356392264366 +[2024-01-22 13:25:14,262] [INFO] [logging.py:96:log_dist] [Rank 0] step=6680, skipped=0, lr=[9.789859201729257e-08, 9.789859201729257e-08], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 13:25:14,263] [INFO] [timer.py:260:stop] epoch=2/micro_step=2022/global_step=6680, RunningAvgSamplesPerSec=4.749873641370053, CurrSamplesPerSec=4.758013011340407, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 2, Total Step: 6689, Loss: 0.009983907453715801 +[2024-01-22 13:26:21,764] [INFO] [logging.py:96:log_dist] [Rank 0] step=6690, skipped=0, lr=[9.163430579685384e-08, 9.163430579685384e-08], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 13:26:21,764] [INFO] [timer.py:260:stop] epoch=2/micro_step=2032/global_step=6690, RunningAvgSamplesPerSec=4.749861092524607, CurrSamplesPerSec=4.7365914731511145, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 2, Total Step: 6699, Loss: 0.04569818079471588 +[2024-01-22 13:27:29,192] [INFO] [logging.py:96:log_dist] [Rank 0] step=6700, skipped=0, lr=[8.557619687893481e-08, 8.557619687893481e-08], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 13:27:29,193] [INFO] [timer.py:260:stop] epoch=2/micro_step=2042/global_step=6700, RunningAvgSamplesPerSec=4.749856291672931, CurrSamplesPerSec=4.755207637484496, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 2, Total Step: 6709, Loss: 0.011522277258336544 +[2024-01-22 13:28:36,620] [INFO] [logging.py:96:log_dist] [Rank 0] step=6710, skipped=0, lr=[7.972439132313048e-08, 7.972439132313048e-08], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 13:28:36,620] [INFO] [timer.py:260:stop] epoch=2/micro_step=2052/global_step=6710, RunningAvgSamplesPerSec=4.749851579644002, CurrSamplesPerSec=4.740956927242741, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 2, Total Step: 6719, Loss: 0.013115708716213703 +[2024-01-22 13:29:43,921] [INFO] [logging.py:96:log_dist] [Rank 0] step=6720, skipped=0, lr=[7.407901089619086e-08, 7.407901089619086e-08], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 13:29:43,922] [INFO] [timer.py:260:stop] epoch=2/micro_step=2062/global_step=6720, RunningAvgSamplesPerSec=4.749860091468141, CurrSamplesPerSec=4.7626534652847, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 2, Total Step: 6729, Loss: 0.012674546800553799 +[2024-01-22 13:30:51,352] [INFO] [logging.py:96:log_dist] [Rank 0] step=6730, skipped=0, lr=[6.864017306948523e-08, 6.864017306948523e-08], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 13:30:51,353] [INFO] [timer.py:260:stop] epoch=2/micro_step=2072/global_step=6730, RunningAvgSamplesPerSec=4.749855046082718, CurrSamplesPerSec=4.738374695845881, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 2, Total Step: 6739, Loss: 0.017529362812638283 +[2024-01-22 13:31:58,700] [INFO] [logging.py:96:log_dist] [Rank 0] step=6740, skipped=0, lr=[6.340799101656525e-08, 6.340799101656525e-08], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 13:31:58,700] [INFO] [timer.py:260:stop] epoch=2/micro_step=2082/global_step=6740, RunningAvgSamplesPerSec=4.7498587113873345, CurrSamplesPerSec=4.734753136580115, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 2, Total Step: 6749, Loss: 0.04535684362053871 +[2024-01-22 13:33:06,160] [INFO] [logging.py:96:log_dist] [Rank 0] step=6750, skipped=0, lr=[5.838257361080124e-08, 5.838257361080124e-08], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 13:33:06,161] [INFO] [timer.py:260:stop] epoch=2/micro_step=2092/global_step=6750, RunningAvgSamplesPerSec=4.74985060730558, CurrSamplesPerSec=4.764520131538764, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 2, Total Step: 6759, Loss: 0.01673661544919014 +[2024-01-22 13:34:13,558] [INFO] [logging.py:96:log_dist] [Rank 0] step=6760, skipped=0, lr=[5.356402542312289e-08, 5.356402542312289e-08], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 13:34:13,559] [INFO] [timer.py:260:stop] epoch=2/micro_step=2102/global_step=6760, RunningAvgSamplesPerSec=4.749849009055887, CurrSamplesPerSec=4.7527959976686756, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 2, Total Step: 6769, Loss: 0.011836251243948936 +[2024-01-22 13:35:20,862] [INFO] [logging.py:96:log_dist] [Rank 0] step=6770, skipped=0, lr=[4.8952446719839896e-08, 4.8952446719839896e-08], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 13:35:20,862] [INFO] [timer.py:260:stop] epoch=2/micro_step=2112/global_step=6770, RunningAvgSamplesPerSec=4.749857261255933, CurrSamplesPerSec=4.734996339329368, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 2, Total Step: 6779, Loss: 0.023913053795695305 +[2024-01-22 13:36:28,304] [INFO] [logging.py:96:log_dist] [Rank 0] step=6780, skipped=0, lr=[4.454793346055697e-08, 4.454793346055697e-08], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 13:36:28,305] [INFO] [timer.py:260:stop] epoch=2/micro_step=2122/global_step=6780, RunningAvgSamplesPerSec=4.749850997092475, CurrSamplesPerSec=4.744003216164862, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 2, Total Step: 6789, Loss: 0.02063714899122715 +[2024-01-22 13:37:35,609] [INFO] [logging.py:96:log_dist] [Rank 0] step=6790, skipped=0, lr=[4.035057729617764e-08, 4.035057729617764e-08], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 13:37:35,610] [INFO] [timer.py:260:stop] epoch=2/micro_step=2132/global_step=6790, RunningAvgSamplesPerSec=4.749859071990497, CurrSamplesPerSec=4.7654897994174865, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 2, Total Step: 6799, Loss: 0.07796330749988556 +[2024-01-22 13:38:43,001] [INFO] [logging.py:96:log_dist] [Rank 0] step=6800, skipped=0, lr=[3.6360465566994685e-08, 3.6360465566994685e-08], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 13:38:43,001] [INFO] [timer.py:260:stop] epoch=2/micro_step=2142/global_step=6800, RunningAvgSamplesPerSec=4.749858190191804, CurrSamplesPerSec=4.736259357464128, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 2, Total Step: 6809, Loss: 0.015995467081665993 +[2024-01-22 13:39:50,585] [INFO] [logging.py:96:log_dist] [Rank 0] step=6810, skipped=0, lr=[3.257768130087713e-08, 3.257768130087713e-08], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 13:39:50,586] [INFO] [timer.py:260:stop] epoch=2/micro_step=2152/global_step=6810, RunningAvgSamplesPerSec=4.749837242595963, CurrSamplesPerSec=4.724007557976893, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 2, Total Step: 6819, Loss: 0.02341858111321926 +[2024-01-22 13:40:57,863] [INFO] [logging.py:96:log_dist] [Rank 0] step=6820, skipped=0, lr=[2.9002303211537186e-08, 2.9002303211537186e-08], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 13:40:57,863] [INFO] [timer.py:260:stop] epoch=2/micro_step=2162/global_step=6820, RunningAvgSamplesPerSec=4.749848132574506, CurrSamplesPerSec=4.744808885464342, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 2, Total Step: 6829, Loss: 0.025314100086688995 +[2024-01-22 13:42:05,226] [INFO] [logging.py:96:log_dist] [Rank 0] step=6830, skipped=0, lr=[2.5634405696896013e-08, 2.5634405696896013e-08], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 13:42:05,226] [INFO] [timer.py:260:stop] epoch=2/micro_step=2172/global_step=6830, RunningAvgSamplesPerSec=4.749850170821703, CurrSamplesPerSec=4.744952304503118, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 2, Total Step: 6839, Loss: 0.01324329525232315 +[2024-01-22 13:43:12,539] [INFO] [logging.py:96:log_dist] [Rank 0] step=6840, skipped=0, lr=[2.2474058837536062e-08, 2.2474058837536062e-08], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 13:43:12,540] [INFO] [timer.py:260:stop] epoch=2/micro_step=2182/global_step=6840, RunningAvgSamplesPerSec=4.749857264889896, CurrSamplesPerSec=4.75075470212154, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 2, Total Step: 6849, Loss: 0.03438987955451012 +[2024-01-22 13:44:19,937] [INFO] [logging.py:96:log_dist] [Rank 0] step=6850, skipped=0, lr=[1.9521328395237794e-08, 1.9521328395237794e-08], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 13:44:19,937] [INFO] [timer.py:260:stop] epoch=2/micro_step=2192/global_step=6850, RunningAvgSamplesPerSec=4.749855722077773, CurrSamplesPerSec=4.762269527019365, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 2, Total Step: 6859, Loss: 0.017284264788031578 +[2024-01-22 13:45:27,323] [INFO] [logging.py:96:log_dist] [Rank 0] step=6860, skipped=0, lr=[1.677627581161745e-08, 1.677627581161745e-08], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 13:45:27,323] [INFO] [timer.py:260:stop] epoch=2/micro_step=2202/global_step=6860, RunningAvgSamplesPerSec=4.749855342600888, CurrSamplesPerSec=4.75642448890606, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 2, Total Step: 6869, Loss: 0.01239666435867548 +[2024-01-22 13:46:34,699] [INFO] [logging.py:96:log_dist] [Rank 0] step=6870, skipped=0, lr=[1.4238958206845844e-08, 1.4238958206845844e-08], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 13:46:34,699] [INFO] [timer.py:260:stop] epoch=2/micro_step=2212/global_step=6870, RunningAvgSamplesPerSec=4.749856003138953, CurrSamplesPerSec=4.743626805472348, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 2, Total Step: 6879, Loss: 0.018919533118605614 +[2024-01-22 13:47:42,020] [INFO] [logging.py:96:log_dist] [Rank 0] step=6880, skipped=0, lr=[1.190942837846043e-08, 1.190942837846043e-08], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 13:47:42,020] [INFO] [timer.py:260:stop] epoch=2/micro_step=2222/global_step=6880, RunningAvgSamplesPerSec=4.749862387991642, CurrSamplesPerSec=4.763369459179837, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 2, Total Step: 6889, Loss: 0.013424936681985855 +[2024-01-22 13:48:49,430] [INFO] [logging.py:96:log_dist] [Rank 0] step=6890, skipped=0, lr=[9.78773480026396e-09, 9.78773480026396e-09], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 13:48:49,430] [INFO] [timer.py:260:stop] epoch=2/micro_step=2232/global_step=6890, RunningAvgSamplesPerSec=4.74985955070619, CurrSamplesPerSec=4.757437574234314, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 2, Total Step: 6899, Loss: 0.014224093407392502 +[2024-01-22 13:49:56,888] [INFO] [logging.py:96:log_dist] [Rank 0] step=6900, skipped=0, lr=[7.873921621319725e-09, 7.873921621319725e-09], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 13:49:56,888] [INFO] [timer.py:260:stop] epoch=2/micro_step=2242/global_step=6900, RunningAvgSamplesPerSec=4.749851811826726, CurrSamplesPerSec=4.723632318652578, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 2, Total Step: 6909, Loss: 0.025186197832226753 +[2024-01-22 13:51:04,270] [INFO] [logging.py:96:log_dist] [Rank 0] step=6910, skipped=0, lr=[6.168028665028969e-09, 6.168028665028969e-09], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 13:51:04,271] [INFO] [timer.py:260:stop] epoch=2/micro_step=2252/global_step=6910, RunningAvgSamplesPerSec=4.749851803686121, CurrSamplesPerSec=4.750112764886, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 2, Total Step: 6919, Loss: 0.044912852346897125 +[2024-01-22 13:52:11,633] [INFO] [logging.py:96:log_dist] [Rank 0] step=6920, skipped=0, lr=[4.67009142830932e-09, 4.67009142830932e-09], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 13:52:11,634] [INFO] [timer.py:260:stop] epoch=2/micro_step=2262/global_step=6920, RunningAvgSamplesPerSec=4.749853812251362, CurrSamplesPerSec=4.760776778111237, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 2, Total Step: 6929, Loss: 0.022356605157256126 +[2024-01-22 13:53:19,126] [INFO] [logging.py:96:log_dist] [Rank 0] step=6930, skipped=0, lr=[3.380141080844279e-09, 3.380141080844279e-09], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 13:53:19,127] [INFO] [timer.py:260:stop] epoch=2/micro_step=2272/global_step=6930, RunningAvgSamplesPerSec=4.749842580506619, CurrSamplesPerSec=4.738956741017033, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 2, Total Step: 6939, Loss: 0.015853822231292725 +[2024-01-22 13:54:26,418] [INFO] [logging.py:96:log_dist] [Rank 0] step=6940, skipped=0, lr=[2.2982044644481726e-09, 2.2982044644481726e-09], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 13:54:26,419] [INFO] [timer.py:260:stop] epoch=2/micro_step=2282/global_step=6940, RunningAvgSamplesPerSec=4.749851778581356, CurrSamplesPerSec=4.749549321288105, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 2, Total Step: 6949, Loss: 0.07117908447980881 +[2024-01-22 13:55:33,731] [INFO] [logging.py:96:log_dist] [Rank 0] step=6950, skipped=0, lr=[1.4243040924954988e-09, 1.4243040924954988e-09], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 13:55:33,732] [INFO] [timer.py:260:stop] epoch=2/micro_step=2292/global_step=6950, RunningAvgSamplesPerSec=4.749858812704008, CurrSamplesPerSec=4.754382939049742, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 2, Total Step: 6959, Loss: 0.01550787128508091 +[2024-01-22 13:56:41,160] [INFO] [logging.py:96:log_dist] [Rank 0] step=6960, skipped=0, lr=[7.584581494635146e-10, 7.584581494635146e-10], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 13:56:41,161] [INFO] [timer.py:260:stop] epoch=2/micro_step=2302/global_step=6960, RunningAvgSamplesPerSec=4.749854138624553, CurrSamplesPerSec=4.738448970167778, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 2, Total Step: 6969, Loss: 0.019912857562303543 +[2024-01-22 13:57:48,483] [INFO] [logging.py:96:log_dist] [Rank 0] step=6970, skipped=0, lr=[3.0068049054254864e-10, 3.0068049054254864e-10], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 13:57:48,484] [INFO] [timer.py:260:stop] epoch=2/micro_step=2312/global_step=6970, RunningAvgSamplesPerSec=4.749860152627796, CurrSamplesPerSec=4.743173515270913, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +Epoch: 2, Total Step: 6979, Loss: 0.009952123276889324 +[2024-01-22 13:58:55,851] [INFO] [logging.py:96:log_dist] [Rank 0] step=6980, skipped=0, lr=[5.098064135733438e-11, 5.098064135733438e-11], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 13:58:55,852] [INFO] [timer.py:260:stop] epoch=2/micro_step=2322/global_step=6980, RunningAvgSamplesPerSec=4.749861661720875, CurrSamplesPerSec=4.765377959594081, MemAllocated=26.25GB, MaxMemAllocated=40.69GB +***** Evaluating perplexity, Epoch 3/3 ***** +ppl: 1.0079808235168457 +eval loss: 0.007949198596179485 +saving the final model ... +[2024-01-22 13:59:47,531] [INFO] [launch.py:347:main] Process 26112 exits successfully. +[2024-01-22 13:59:47,533] [INFO] [launch.py:347:main] Process 26110 exits successfully. +[2024-01-22 13:59:47,533] [INFO] [launch.py:347:main] Process 26114 exits successfully. +[2024-01-22 13:59:47,533] [INFO] [launch.py:347:main] Process 26111 exits successfully. +[2024-01-22 13:59:47,534] [INFO] [launch.py:347:main] Process 26113 exits successfully. +[2024-01-22 13:59:48,535] [INFO] [launch.py:347:main] Process 26109 exits successfully. +[2024-01-22 13:59:48,535] [INFO] [launch.py:347:main] Process 26108 exits successfully. +[2024-01-22 14:00:00,547] [INFO] [launch.py:347:main] Process 26107 exits successfully.