Feature Extraction
Transformers
Safetensors
Chinese
internlm2
custom_code
RioLee committed on
Commit
519268e
1 Parent(s): ff5cb56

Delete logs.txt

Files changed (1)
  1. logs.txt +0 -981
logs.txt DELETED
@@ -1,981 +0,0 @@
1
- [2024-02-01 14:20:07,768] [INFO] [real_accelerator.py:191:get_accelerator] Setting ds_accelerator to cuda (auto detect)
2
- [2024-02-01 14:20:09,368] [WARNING] [runner.py:202:fetch_hostfile] Unable to find hostfile, will proceed with training with local resources only.
3
- [2024-02-01 14:20:09,369] [INFO] [runner.py:568:main] cmd = /home/lirenhao/anaconda3/envs/llama_factory/bin/python -u -m deepspeed.launcher.launch --world_info=eyJsb2NhbGhvc3QiOiBbMCwgMSwgMiwgM119 --master_addr=127.0.0.1 --master_port=2345 --enable_each_rank_log=None /home/lirenhao/projects/LLaMA-Factory/src/train_bash.py --deepspeed ds_config.json --stage sft --model_name_or_path /home/lirenhao/pretrained_models/internlm2-chat-7b/ --do_train --dataset cpsycoun --template intern2 --finetuning_type full --lora_target wqkv --output_dir /home/lirenhao/projects/LLaMA-Factory/output/9f100e26-d997-46e8-afee-721977a16ca9 --overwrite_cache --overwrite_output_dir --per_device_train_batch_size 4 --gradient_accumulation_steps 28 --lr_scheduler_type cosine --logging_steps 10 --save_steps 21 --learning_rate 1e-6 --num_train_epochs 9.0 --plot_loss --fp16
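
Note: the opaque --world_info value in the launcher command above is just base64-encoded JSON; a two-line check (an illustrative snippet, not part of the log) decodes it to the same mapping DeepSpeed prints as WORLD INFO DICT a few lines below.

    import base64
    # decodes to b'{"localhost": [0, 1, 2, 3]}' -- i.e. four local GPU ranks
    print(base64.b64decode("eyJsb2NhbGhvc3QiOiBbMCwgMSwgMiwgM119"))
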
4
- [2024-02-01 14:20:12,819] [INFO] [real_accelerator.py:191:get_accelerator] Setting ds_accelerator to cuda (auto detect)
5
- [2024-02-01 14:20:14,435] [INFO] [launch.py:145:main] WORLD INFO DICT: {'localhost': [0, 1, 2, 3]}
6
- [2024-02-01 14:20:14,436] [INFO] [launch.py:151:main] nnodes=1, num_local_procs=4, node_rank=0
7
- [2024-02-01 14:20:14,436] [INFO] [launch.py:162:main] global_rank_mapping=defaultdict(<class 'list'>, {'localhost': [0, 1, 2, 3]})
8
- [2024-02-01 14:20:14,436] [INFO] [launch.py:163:main] dist_world_size=4
9
- [2024-02-01 14:20:14,436] [INFO] [launch.py:165:main] Setting CUDA_VISIBLE_DEVICES=0,1,2,3
10
- [2024-02-01 14:20:19,797] [INFO] [real_accelerator.py:191:get_accelerator] Setting ds_accelerator to cuda (auto detect)
11
- [2024-02-01 14:20:20,069] [INFO] [real_accelerator.py:191:get_accelerator] Setting ds_accelerator to cuda (auto detect)
12
- [2024-02-01 14:20:20,128] [INFO] [real_accelerator.py:191:get_accelerator] Setting ds_accelerator to cuda (auto detect)
13
- [2024-02-01 14:20:20,157] [INFO] [real_accelerator.py:191:get_accelerator] Setting ds_accelerator to cuda (auto detect)
14
- [2024-02-01 14:20:22,839] [INFO] [comm.py:637:init_distributed] cdb=None
15
- [2024-02-01 14:20:23,347] [INFO] [comm.py:637:init_distributed] cdb=None
16
- [2024-02-01 14:20:23,364] [INFO] [comm.py:637:init_distributed] cdb=None
17
- [2024-02-01 14:20:23,375] [INFO] [comm.py:637:init_distributed] cdb=None
18
- [2024-02-01 14:20:23,376] [INFO] [comm.py:668:init_distributed] Initializing TorchBackend in DeepSpeed with backend nccl
19
- 02/01/2024 14:20:24 - INFO - llmtuner.hparams.parser - Process rank: 2, device: cuda:2, n_gpu: 1
20
- distributed training: True, compute dtype: torch.float16
21
- 02/01/2024 14:20:24 - INFO - llmtuner.hparams.parser - Training/evaluation parameters Seq2SeqTrainingArguments(
22
- _n_gpu=1,
23
- adafactor=False,
24
- adam_beta1=0.9,
25
- adam_beta2=0.999,
26
- adam_epsilon=1e-08,
27
- auto_find_batch_size=False,
28
- bf16=False,
29
- bf16_full_eval=False,
30
- data_seed=None,
31
- dataloader_drop_last=False,
32
- dataloader_num_workers=0,
33
- dataloader_persistent_workers=False,
34
- dataloader_pin_memory=True,
35
- ddp_backend=None,
36
- ddp_broadcast_buffers=None,
37
- ddp_bucket_cap_mb=None,
38
- ddp_find_unused_parameters=None,
39
- ddp_timeout=1800,
40
- debug=[],
41
- deepspeed=ds_config.json,
42
- disable_tqdm=False,
43
- dispatch_batches=None,
44
- do_eval=False,
45
- do_predict=False,
46
- do_train=True,
47
- eval_accumulation_steps=None,
48
- eval_delay=0,
49
- eval_steps=None,
50
- evaluation_strategy=no,
51
- fp16=True,
52
- fp16_backend=auto,
53
- fp16_full_eval=False,
54
- fp16_opt_level=O1,
55
- fsdp=[],
56
- fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_grad_ckpt': False},
57
- fsdp_min_num_params=0,
58
- fsdp_transformer_layer_cls_to_wrap=None,
59
- full_determinism=False,
60
- generation_config=None,
61
- generation_max_length=None,
62
- generation_num_beams=None,
63
- gradient_accumulation_steps=28,
64
- gradient_checkpointing=False,
65
- gradient_checkpointing_kwargs=None,
66
- greater_is_better=None,
67
- group_by_length=False,
68
- half_precision_backend=auto,
69
- hub_always_push=False,
70
- hub_model_id=None,
71
- hub_private_repo=False,
72
- hub_strategy=every_save,
73
- hub_token=<HUB_TOKEN>,
74
- ignore_data_skip=False,
75
- include_inputs_for_metrics=False,
76
- include_num_input_tokens_seen=False,
77
- include_tokens_per_second=False,
78
- jit_mode_eval=False,
79
- label_names=None,
80
- label_smoothing_factor=0.0,
81
- learning_rate=1e-06,
82
- length_column_name=length,
83
- load_best_model_at_end=False,
84
- local_rank=2,
85
- log_level=passive,
86
- log_level_replica=warning,
87
- log_on_each_node=True,
88
- logging_dir=/home/lirenhao/projects/LLaMA-Factory/output/9f100e26-d997-46e8-afee-721977a16ca9/runs/Feb01_14-20-22_siat-a100-4-02,
89
- logging_first_step=False,
90
- logging_nan_inf_filter=True,
91
- logging_steps=10,
92
- logging_strategy=steps,
93
- lr_scheduler_kwargs={},
94
- lr_scheduler_type=cosine,
95
- max_grad_norm=1.0,
96
- max_steps=-1,
97
- metric_for_best_model=None,
98
- mp_parameters=,
99
- neftune_noise_alpha=None,
100
- no_cuda=False,
101
- num_train_epochs=9.0,
102
- optim=adamw_torch,
103
- optim_args=None,
104
- output_dir=/home/lirenhao/projects/LLaMA-Factory/output/9f100e26-d997-46e8-afee-721977a16ca9,
105
- overwrite_output_dir=True,
106
- past_index=-1,
107
- per_device_eval_batch_size=8,
108
- per_device_train_batch_size=4,
109
- predict_with_generate=False,
110
- prediction_loss_only=False,
111
- push_to_hub=False,
112
- push_to_hub_model_id=None,
113
- push_to_hub_organization=None,
114
- push_to_hub_token=<PUSH_TO_HUB_TOKEN>,
115
- ray_scope=last,
116
- remove_unused_columns=True,
117
- report_to=[],
118
- resume_from_checkpoint=None,
119
- run_name=/home/lirenhao/projects/LLaMA-Factory/output/9f100e26-d997-46e8-afee-721977a16ca9,
120
- save_on_each_node=False,
121
- save_only_model=False,
122
- save_safetensors=True,
123
- save_steps=21,
124
- save_strategy=steps,
125
- save_total_limit=None,
126
- seed=42,
127
- skip_memory_metrics=True,
128
- sortish_sampler=False,
129
- split_batches=False,
130
- tf32=None,
131
- torch_compile=False,
132
- torch_compile_backend=None,
133
- torch_compile_mode=None,
134
- torchdynamo=None,
135
- tpu_metrics_debug=False,
136
- tpu_num_cores=None,
137
- use_cpu=False,
138
- use_ipex=False,
139
- use_legacy_prediction_loop=False,
140
- use_mps_device=False,
141
- warmup_ratio=0.0,
142
- warmup_steps=0,
143
- weight_decay=0.0,
144
- )
145
- 02/01/2024 14:20:24 - INFO - llmtuner.hparams.parser - Process rank: 0, device: cuda:0, n_gpu: 1
146
- distributed training: True, compute dtype: torch.float16
147
- 02/01/2024 14:20:24 - INFO - llmtuner.hparams.parser - Training/evaluation parameters Seq2SeqTrainingArguments(
148
- _n_gpu=1,
149
- adafactor=False,
150
- adam_beta1=0.9,
151
- adam_beta2=0.999,
152
- adam_epsilon=1e-08,
153
- auto_find_batch_size=False,
154
- bf16=False,
155
- bf16_full_eval=False,
156
- data_seed=None,
157
- dataloader_drop_last=False,
158
- dataloader_num_workers=0,
159
- dataloader_persistent_workers=False,
160
- dataloader_pin_memory=True,
161
- ddp_backend=None,
162
- ddp_broadcast_buffers=None,
163
- ddp_bucket_cap_mb=None,
164
- ddp_find_unused_parameters=None,
165
- ddp_timeout=1800,
166
- debug=[],
167
- deepspeed=ds_config.json,
168
- disable_tqdm=False,
169
- dispatch_batches=None,
170
- do_eval=False,
171
- do_predict=False,
172
- do_train=True,
173
- eval_accumulation_steps=None,
174
- eval_delay=0,
175
- eval_steps=None,
176
- evaluation_strategy=no,
177
- fp16=True,
178
- fp16_backend=auto,
179
- fp16_full_eval=False,
180
- fp16_opt_level=O1,
181
- fsdp=[],
182
- fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_grad_ckpt': False},
183
- fsdp_min_num_params=0,
184
- fsdp_transformer_layer_cls_to_wrap=None,
185
- full_determinism=False,
186
- generation_config=None,
187
- generation_max_length=None,
188
- generation_num_beams=None,
189
- gradient_accumulation_steps=28,
190
- gradient_checkpointing=False,
191
- gradient_checkpointing_kwargs=None,
192
- greater_is_better=None,
193
- group_by_length=False,
194
- half_precision_backend=auto,
195
- hub_always_push=False,
196
- hub_model_id=None,
197
- hub_private_repo=False,
198
- hub_strategy=every_save,
199
- hub_token=<HUB_TOKEN>,
200
- ignore_data_skip=False,
201
- include_inputs_for_metrics=False,
202
- include_num_input_tokens_seen=False,
203
- include_tokens_per_second=False,
204
- jit_mode_eval=False,
205
- label_names=None,
206
- label_smoothing_factor=0.0,
207
- learning_rate=1e-06,
208
- length_column_name=length,
209
- load_best_model_at_end=False,
210
- local_rank=0,
211
- log_level=passive,
212
- log_level_replica=warning,
213
- log_on_each_node=True,
214
- logging_dir=/home/lirenhao/projects/LLaMA-Factory/output/9f100e26-d997-46e8-afee-721977a16ca9/runs/Feb01_14-20-23_siat-a100-4-02,
215
- logging_first_step=False,
216
- logging_nan_inf_filter=True,
217
- logging_steps=10,
218
- logging_strategy=steps,
219
- lr_scheduler_kwargs={},
220
- lr_scheduler_type=cosine,
221
- max_grad_norm=1.0,
222
- max_steps=-1,
223
- metric_for_best_model=None,
224
- mp_parameters=,
225
- neftune_noise_alpha=None,
226
- no_cuda=False,
227
- num_train_epochs=9.0,
228
- optim=adamw_torch,
229
- optim_args=None,
230
- output_dir=/home/lirenhao/projects/LLaMA-Factory/output/9f100e26-d997-46e8-afee-721977a16ca9,
231
- overwrite_output_dir=True,
232
- past_index=-1,
233
- per_device_eval_batch_size=8,
234
- per_device_train_batch_size=4,
235
- predict_with_generate=False,
236
- prediction_loss_only=False,
237
- push_to_hub=False,
238
- push_to_hub_model_id=None,
239
- push_to_hub_organization=None,
240
- push_to_hub_token=<PUSH_TO_HUB_TOKEN>,
241
- ray_scope=last,
242
- remove_unused_columns=True,
243
- report_to=[],
244
- resume_from_checkpoint=None,
245
- run_name=/home/lirenhao/projects/LLaMA-Factory/output/9f100e26-d997-46e8-afee-721977a16ca9,
246
- save_on_each_node=False,
247
- save_only_model=False,
248
- save_safetensors=True,
249
- save_steps=21,
250
- save_strategy=steps,
251
- save_total_limit=None,
252
- seed=42,
253
- skip_memory_metrics=True,
254
- sortish_sampler=False,
255
- split_batches=False,
256
- tf32=None,
257
- torch_compile=False,
258
- torch_compile_backend=None,
259
- torch_compile_mode=None,
260
- torchdynamo=None,
261
- tpu_metrics_debug=False,
262
- tpu_num_cores=None,
263
- use_cpu=False,
264
- use_ipex=False,
265
- use_legacy_prediction_loop=False,
266
- use_mps_device=False,
267
- warmup_ratio=0.0,
268
- warmup_steps=0,
269
- weight_decay=0.0,
270
- )
271
- [INFO|tokenization_utils_base.py:2025] 2024-02-01 14:20:24,513 >> loading file ./tokenizer.model
272
- [INFO|tokenization_utils_base.py:2025] 2024-02-01 14:20:24,513 >> loading file added_tokens.json
273
- [INFO|tokenization_utils_base.py:2025] 2024-02-01 14:20:24,513 >> loading file special_tokens_map.json
274
- [INFO|tokenization_utils_base.py:2025] 2024-02-01 14:20:24,513 >> loading file tokenizer_config.json
275
- [INFO|tokenization_utils_base.py:2025] 2024-02-01 14:20:24,513 >> loading file tokenizer.json
276
- [INFO|configuration_utils.py:727] 2024-02-01 14:20:24,850 >> loading configuration file /home/lirenhao/pretrained_models/internlm2-chat-7b/config.json
277
- [INFO|configuration_utils.py:727] 2024-02-01 14:20:24,852 >> loading configuration file /home/lirenhao/pretrained_models/internlm2-chat-7b/config.json
278
- [INFO|configuration_utils.py:792] 2024-02-01 14:20:24,854 >> Model config InternLM2Config {
279
- "_name_or_path": "/home/lirenhao/pretrained_models/internlm2-chat-7b/",
280
- "architectures": [
281
- "InternLM2ForCausalLM"
282
- ],
283
- "attn_implementation": "eager",
284
- "auto_map": {
285
- "AutoConfig": "configuration_internlm2.InternLM2Config",
286
- "AutoModel": "modeling_internlm2.InternLM2ForCausalLM",
287
- "AutoModelForCausalLM": "modeling_internlm2.InternLM2ForCausalLM"
288
- },
289
- "bias": false,
290
- "bos_token_id": 1,
291
- "eos_token_id": 2,
292
- "hidden_act": "silu",
293
- "hidden_size": 4096,
294
- "initializer_range": 0.02,
295
- "intermediate_size": 14336,
296
- "max_position_embeddings": 32768,
297
- "model_type": "internlm2",
298
- "num_attention_heads": 32,
299
- "num_hidden_layers": 32,
300
- "num_key_value_heads": 8,
301
- "pad_token_id": 2,
302
- "rms_norm_eps": 1e-05,
303
- "rope_scaling": {
304
- "factor": 2.0,
305
- "type": "dynamic"
306
- },
307
- "rope_theta": 1000000,
308
- "tie_word_embeddings": false,
309
- "torch_dtype": "float16",
310
- "transformers_version": "4.37.1",
311
- "use_cache": true,
312
- "vocab_size": 92544
313
- }
314
-
315
- [INFO|modeling_utils.py:3475] 2024-02-01 14:20:24,903 >> loading weights file /home/lirenhao/pretrained_models/internlm2-chat-7b/pytorch_model.bin.index.json
316
- [INFO|modeling_utils.py:1428] 2024-02-01 14:20:24,903 >> Instantiating InternLM2ForCausalLM model under default dtype torch.float16.
317
- [INFO|configuration_utils.py:826] 2024-02-01 14:20:24,905 >> Generate config GenerationConfig {
318
- "bos_token_id": 1,
319
- "eos_token_id": 2,
320
- "pad_token_id": 2
321
- }
322
-
323
- 02/01/2024 14:20:24 - INFO - llmtuner.hparams.parser - Process rank: 1, device: cuda:1, n_gpu: 1
324
- distributed training: True, compute dtype: torch.float16
325
- 02/01/2024 14:20:24 - INFO - llmtuner.hparams.parser - Training/evaluation parameters Seq2SeqTrainingArguments(
326
- _n_gpu=1,
327
- adafactor=False,
328
- adam_beta1=0.9,
329
- adam_beta2=0.999,
330
- adam_epsilon=1e-08,
331
- auto_find_batch_size=False,
332
- bf16=False,
333
- bf16_full_eval=False,
334
- data_seed=None,
335
- dataloader_drop_last=False,
336
- dataloader_num_workers=0,
337
- dataloader_persistent_workers=False,
338
- dataloader_pin_memory=True,
339
- ddp_backend=None,
340
- ddp_broadcast_buffers=None,
341
- ddp_bucket_cap_mb=None,
342
- ddp_find_unused_parameters=None,
343
- ddp_timeout=1800,
344
- debug=[],
345
- deepspeed=ds_config.json,
346
- disable_tqdm=False,
347
- dispatch_batches=None,
348
- do_eval=False,
349
- do_predict=False,
350
- do_train=True,
351
- eval_accumulation_steps=None,
352
- eval_delay=0,
353
- eval_steps=None,
354
- evaluation_strategy=no,
355
- fp16=True,
356
- fp16_backend=auto,
357
- fp16_full_eval=False,
358
- fp16_opt_level=O1,
359
- fsdp=[],
360
- fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_grad_ckpt': False},
361
- fsdp_min_num_params=0,
362
- fsdp_transformer_layer_cls_to_wrap=None,
363
- full_determinism=False,
364
- generation_config=None,
365
- generation_max_length=None,
366
- generation_num_beams=None,
367
- gradient_accumulation_steps=28,
368
- gradient_checkpointing=False,
369
- gradient_checkpointing_kwargs=None,
370
- greater_is_better=None,
371
- group_by_length=False,
372
- half_precision_backend=auto,
373
- hub_always_push=False,
374
- hub_model_id=None,
375
- hub_private_repo=False,
376
- hub_strategy=every_save,
377
- hub_token=<HUB_TOKEN>,
378
- ignore_data_skip=False,
379
- include_inputs_for_metrics=False,
380
- include_num_input_tokens_seen=False,
381
- include_tokens_per_second=False,
382
- jit_mode_eval=False,
383
- label_names=None,
384
- label_smoothing_factor=0.0,
385
- learning_rate=1e-06,
386
- length_column_name=length,
387
- load_best_model_at_end=False,
388
- local_rank=1,
389
- log_level=passive,
390
- log_level_replica=warning,
391
- log_on_each_node=True,
392
- logging_dir=/home/lirenhao/projects/LLaMA-Factory/output/9f100e26-d997-46e8-afee-721977a16ca9/runs/Feb01_14-20-23_siat-a100-4-02,
393
- logging_first_step=False,
394
- logging_nan_inf_filter=True,
395
- logging_steps=10,
396
- logging_strategy=steps,
397
- lr_scheduler_kwargs={},
398
- lr_scheduler_type=cosine,
399
- max_grad_norm=1.0,
400
- max_steps=-1,
401
- metric_for_best_model=None,
402
- mp_parameters=,
403
- neftune_noise_alpha=None,
404
- no_cuda=False,
405
- num_train_epochs=9.0,
406
- optim=adamw_torch,
407
- optim_args=None,
408
- output_dir=/home/lirenhao/projects/LLaMA-Factory/output/9f100e26-d997-46e8-afee-721977a16ca9,
409
- overwrite_output_dir=True,
410
- past_index=-1,
411
- per_device_eval_batch_size=8,
412
- per_device_train_batch_size=4,
413
- predict_with_generate=False,
414
- prediction_loss_only=False,
415
- push_to_hub=False,
416
- push_to_hub_model_id=None,
417
- push_to_hub_organization=None,
418
- push_to_hub_token=<PUSH_TO_HUB_TOKEN>,
419
- ray_scope=last,
420
- remove_unused_columns=True,
421
- report_to=[],
422
- resume_from_checkpoint=None,
423
- run_name=/home/lirenhao/projects/LLaMA-Factory/output/9f100e26-d997-46e8-afee-721977a16ca9,
424
- save_on_each_node=False,
425
- save_only_model=False,
426
- save_safetensors=True,
427
- save_steps=21,
428
- save_strategy=steps,
429
- save_total_limit=None,
430
- seed=42,
431
- skip_memory_metrics=True,
432
- sortish_sampler=False,
433
- split_batches=False,
434
- tf32=None,
435
- torch_compile=False,
436
- torch_compile_backend=None,
437
- torch_compile_mode=None,
438
- torchdynamo=None,
439
- tpu_metrics_debug=False,
440
- tpu_num_cores=None,
441
- use_cpu=False,
442
- use_ipex=False,
443
- use_legacy_prediction_loop=False,
444
- use_mps_device=False,
445
- warmup_ratio=0.0,
446
- warmup_steps=0,
447
- weight_decay=0.0,
448
- )
449
- 02/01/2024 14:20:24 - INFO - llmtuner.hparams.parser - Process rank: 3, device: cuda:3, n_gpu: 1
450
- distributed training: True, compute dtype: torch.float16
451
- 02/01/2024 14:20:24 - INFO - llmtuner.hparams.parser - Training/evaluation parameters Seq2SeqTrainingArguments(
452
- _n_gpu=1,
453
- adafactor=False,
454
- adam_beta1=0.9,
455
- adam_beta2=0.999,
456
- adam_epsilon=1e-08,
457
- auto_find_batch_size=False,
458
- bf16=False,
459
- bf16_full_eval=False,
460
- data_seed=None,
461
- dataloader_drop_last=False,
462
- dataloader_num_workers=0,
463
- dataloader_persistent_workers=False,
464
- dataloader_pin_memory=True,
465
- ddp_backend=None,
466
- ddp_broadcast_buffers=None,
467
- ddp_bucket_cap_mb=None,
468
- ddp_find_unused_parameters=None,
469
- ddp_timeout=1800,
470
- debug=[],
471
- deepspeed=ds_config.json,
472
- disable_tqdm=False,
473
- dispatch_batches=None,
474
- do_eval=False,
475
- do_predict=False,
476
- do_train=True,
477
- eval_accumulation_steps=None,
478
- eval_delay=0,
479
- eval_steps=None,
480
- evaluation_strategy=no,
481
- fp16=True,
482
- fp16_backend=auto,
483
- fp16_full_eval=False,
484
- fp16_opt_level=O1,
485
- fsdp=[],
486
- fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_grad_ckpt': False},
487
- fsdp_min_num_params=0,
488
- fsdp_transformer_layer_cls_to_wrap=None,
489
- full_determinism=False,
490
- generation_config=None,
491
- generation_max_length=None,
492
- generation_num_beams=None,
493
- gradient_accumulation_steps=28,
494
- gradient_checkpointing=False,
495
- gradient_checkpointing_kwargs=None,
496
- greater_is_better=None,
497
- group_by_length=False,
498
- half_precision_backend=auto,
499
- hub_always_push=False,
500
- hub_model_id=None,
501
- hub_private_repo=False,
502
- hub_strategy=every_save,
503
- hub_token=<HUB_TOKEN>,
504
- ignore_data_skip=False,
505
- include_inputs_for_metrics=False,
506
- include_num_input_tokens_seen=False,
507
- include_tokens_per_second=False,
508
- jit_mode_eval=False,
509
- label_names=None,
510
- label_smoothing_factor=0.0,
511
- learning_rate=1e-06,
512
- length_column_name=length,
513
- load_best_model_at_end=False,
514
- local_rank=3,
515
- log_level=passive,
516
- log_level_replica=warning,
517
- log_on_each_node=True,
518
- logging_dir=/home/lirenhao/projects/LLaMA-Factory/output/9f100e26-d997-46e8-afee-721977a16ca9/runs/Feb01_14-20-23_siat-a100-4-02,
519
- logging_first_step=False,
520
- logging_nan_inf_filter=True,
521
- logging_steps=10,
522
- logging_strategy=steps,
523
- lr_scheduler_kwargs={},
524
- lr_scheduler_type=cosine,
525
- max_grad_norm=1.0,
526
- max_steps=-1,
527
- metric_for_best_model=None,
528
- mp_parameters=,
529
- neftune_noise_alpha=None,
530
- no_cuda=False,
531
- num_train_epochs=9.0,
532
- optim=adamw_torch,
533
- optim_args=None,
534
- output_dir=/home/lirenhao/projects/LLaMA-Factory/output/9f100e26-d997-46e8-afee-721977a16ca9,
535
- overwrite_output_dir=True,
536
- past_index=-1,
537
- per_device_eval_batch_size=8,
538
- per_device_train_batch_size=4,
539
- predict_with_generate=False,
540
- prediction_loss_only=False,
541
- push_to_hub=False,
542
- push_to_hub_model_id=None,
543
- push_to_hub_organization=None,
544
- push_to_hub_token=<PUSH_TO_HUB_TOKEN>,
545
- ray_scope=last,
546
- remove_unused_columns=True,
547
- report_to=[],
548
- resume_from_checkpoint=None,
549
- run_name=/home/lirenhao/projects/LLaMA-Factory/output/9f100e26-d997-46e8-afee-721977a16ca9,
550
- save_on_each_node=False,
551
- save_only_model=False,
552
- save_safetensors=True,
553
- save_steps=21,
554
- save_strategy=steps,
555
- save_total_limit=None,
556
- seed=42,
557
- skip_memory_metrics=True,
558
- sortish_sampler=False,
559
- split_batches=False,
560
- tf32=None,
561
- torch_compile=False,
562
- torch_compile_backend=None,
563
- torch_compile_mode=None,
564
- torchdynamo=None,
565
- tpu_metrics_debug=False,
566
- tpu_num_cores=None,
567
- use_cpu=False,
568
- use_ipex=False,
569
- use_legacy_prediction_loop=False,
570
- use_mps_device=False,
571
- warmup_ratio=0.0,
572
- warmup_steps=0,
573
- weight_decay=0.0,
574
- )
575
-
576
- return self.fget.__get__(instance, owner)()
577
-
578
- return self.fget.__get__(instance, owner)()
579
-
580
- return self.fget.__get__(instance, owner)()
581
-
582
- return self.fget.__get__(instance, owner)()
583
-
584
- 02/01/2024 14:20:35 - INFO - llmtuner.model.patcher - Gradient checkpointing enabled.
585
- 02/01/2024 14:20:35 - INFO - llmtuner.model.adapter - Fine-tuning method: Full
586
-
587
- 02/01/2024 14:20:36 - INFO - llmtuner.model.patcher - Gradient checkpointing enabled.
588
- 02/01/2024 14:20:36 - INFO - llmtuner.model.adapter - Fine-tuning method: Full
589
-
590
- [INFO|modeling_utils.py:4352] 2024-02-01 14:20:36,242 >> All model checkpoint weights were used when initializing InternLM2ForCausalLM.
591
-
592
- [INFO|modeling_utils.py:4360] 2024-02-01 14:20:36,242 >> All the weights of InternLM2ForCausalLM were initialized from the model checkpoint at /home/lirenhao/pretrained_models/internlm2-chat-7b/.
593
- If your task is similar to the task the model of the checkpoint was trained on, you can already use InternLM2ForCausalLM for predictions without further training.
594
- [INFO|configuration_utils.py:779] 2024-02-01 14:20:36,247 >> loading configuration file /home/lirenhao/pretrained_models/internlm2-chat-7b/generation_config.json
595
- [INFO|configuration_utils.py:826] 2024-02-01 14:20:36,248 >> Generate config GenerationConfig {
596
- "bos_token_id": 1,
597
- "eos_token_id": 2,
598
- "pad_token_id": 2
599
- }
600
-
601
- 02/01/2024 14:20:36 - INFO - llmtuner.model.patcher - Gradient checkpointing enabled.
602
- 02/01/2024 14:20:36 - INFO - llmtuner.model.adapter - Fine-tuning method: Full
603
-
604
- 02/01/2024 14:20:36 - INFO - llmtuner.model.patcher - Gradient checkpointing enabled.
605
- 02/01/2024 14:20:36 - INFO - llmtuner.model.adapter - Fine-tuning method: Full
606
- 02/01/2024 14:20:47 - INFO - llmtuner.model.loader - trainable params: 7737708544 || all params: 7737708544 || trainable%: 100.0000
607
- 02/01/2024 14:20:48 - INFO - llmtuner.data.template - Add <|im_end|> to stop words.
608
- 02/01/2024 14:20:48 - INFO - llmtuner.model.loader - trainable params: 7737708544 || all params: 7737708544 || trainable%: 100.0000
609
- 02/01/2024 14:20:48 - INFO - llmtuner.model.loader - trainable params: 7737708544 || all params: 7737708544 || trainable%: 100.0000
610
- 02/01/2024 14:20:49 - INFO - llmtuner.data.template - Add <|im_end|> to stop words.
611
- 02/01/2024 14:20:49 - WARNING - llmtuner.data.utils - Checksum failed: missing SHA-1 hash value in dataset_info.json.
612
- 02/01/2024 14:20:49 - INFO - llmtuner.data.template - Add <|im_end|> to stop words.
613
- 02/01/2024 14:20:49 - INFO - llmtuner.model.loader - trainable params: 7737708544 || all params: 7737708544 || trainable%: 100.0000
614
- 02/01/2024 14:20:49 - INFO - llmtuner.data.template - Add <|im_end|> to stop words.
615
- Using custom data configuration default-7bf826ddf73c2f44
616
- Loading Dataset Infos from /home/lirenhao/anaconda3/envs/llama_factory/lib/python3.10/site-packages/datasets/packaged_modules/json
617
- Overwrite dataset info from restored data version if exists.
618
- Loading Dataset info from /home/lirenhao/.cache/huggingface/datasets/json/default-7bf826ddf73c2f44/0.0.0/8bb11242116d547c741b2e8a1f18598ffdd40a1d4f2a2872c7a28b697434bc96
619
- Found cached dataset json (/home/lirenhao/.cache/huggingface/datasets/json/default-7bf826ddf73c2f44/0.0.0/8bb11242116d547c741b2e8a1f18598ffdd40a1d4f2a2872c7a28b697434bc96)
620
- Loading Dataset info from /home/lirenhao/.cache/huggingface/datasets/json/default-7bf826ddf73c2f44/0.0.0/8bb11242116d547c741b2e8a1f18598ffdd40a1d4f2a2872c7a28b697434bc96
621
-
622
-
623
- 02/01/2024 14:20:53 - WARNING - llmtuner.data.utils - Checksum failed: missing SHA-1 hash value in dataset_info.json.
624
- 02/01/2024 14:20:53 - WARNING - llmtuner.data.utils - Checksum failed: missing SHA-1 hash value in dataset_info.json.
625
- 02/01/2024 14:20:53 - WARNING - llmtuner.data.utils - Checksum failed: missing SHA-1 hash value in dataset_info.json.
626
-
627
-
628
-
629
- Caching processed dataset at /home/lirenhao/.cache/huggingface/datasets/json/default-7bf826ddf73c2f44/0.0.0/8bb11242116d547c741b2e8a1f18598ffdd40a1d4f2a2872c7a28b697434bc96/cache-7cecb244118aac13.arrow
630
-
631
- input_ids:
632
- [1, 92543, 9081, 364, 2770, 657, 589, 15358, 17993, 6843, 963, 505, 4576, 11146, 451, 60628, 60384, 60721, 62442, 60752, 4452, 285, 4576, 11146, 451, 60628, 60384, 60721, 62442, 60752, 313, 505, 395, 7659, 1813, 4287, 1762, 560, 505, 8020, 684, 36956, 15358, 31288, 451, 68589, 76659, 71581, 699, 1226, 505, 6342, 442, 517, 11100, 328, 10894, 328, 454, 51978, 756, 285, 4576, 11146, 451, 60628, 60384, 60721, 62442, 60752, 313, 777, 3696, 454, 19187, 19829, 4563, 435, 410, 4287, 12032, 684, 410, 1341, 1893, 569, 6519, 454, 262, 69093, 281, 92542, 364, 92543, 1008, 364, 85064, 60703, 60353, 68856, 68306, 61860, 62703, 69516, 68765, 68984, 60362, 60353, 60376, 68678, 60427, 69944, 60355, 92542, 364, 92543, 525, 11353, 364, 73406, 68865, 68364, 69377, 60353, 86839, 70004, 68364, 69516, 69461, 71677, 68287, 60353, 69029, 68831, 68287, 60355, 68931, 69702, 75326, 71838, 60403, 61860, 62703, 77797, 68540, 60355, 364, 92543, 1008, 364, 74820, 68399, 69088, 60677, 68540, 60353, 61032, 71155, 69059, 60355, 92542, 364, 92543, 525, 11353, 364, 91781, 60353, 73161, 80540, 60415, 82098, 60355, 72010, 71404, 60353, 60403, 76153, 68912, 60381, 74112, 61076, 60504, 364, 92543, 1008, 364, 68856, 68306, 68912, 68326, 75848, 68595, 60353, 69972, 71645, 68473, 68585, 60353, 60404, 68965, 61716, 60418, 68273, 60353, 70124, 70698, 60363, 60355, 68389, 60363, 69667, 68306, 68303, 60353, 61214, 68310, 68758, 68261, 70623, 60355, 92542, 364, 92543, 525, 11353, 364, 72010, 69030, 71711, 61076, 60504, 68522, 60353, 86004, 71645, 68629, 68804, 68592, 69095, 60504, 364, 92543, 1008, 364, 84386, 68268, 68315, 75835, 79506, 60353, 60404, 68965, 72245, 68306, 69377, 60355, 60363, 69836, 60427, 70681, 60353, 69281, 91457, 71102, 62600, 62792, 60425, 60355, 92542, 364, 92543, 525, 11353, 364, 82967, 69068, 87160, 68261, 60504, 68319, 60353, 60403, 70868, 69962, 60871, 69893, 60366, 73603, 68261, 60504, 364, 92543, 1008, 364, 68678, 70219, 92396, 84863, 73603, 68252, 71869, 76758, 60353, 68252, 78650, 68306, 71645, 60355, 60363, 70802, 68626, 71010, 73382, 69893, 60353, 60499, 69361, 61032, 68678, 72415, 60355, 92542, 364, 92543, 525, 11353, 364, 72010, 82120, 68381, 72415, 70860, 69209, 61076, 60504, 68522, 60353, 73880, 60359, 75493, 60359, 72415, 60504, 364, 92543, 1008, 364, 74212, 60353, 60363, 73408, 69836, 73880, 60381, 72415, 60355, 69097, 60353, 88720, 60382, 71343, 68254, 70861, 68892, 60459, 71356, 60586, 60355, 92542, 364, 92543, 525, 11353, 364, 68374, 69209, 73175, 68364, 77514, 61076, 60504, 68522, 60353, 68364, 70033, 60359, 69441, 60359, 68273, 60504, 364, 92543, 1008, 364, 88554, 60355, 60363, 68848, 70033, 81269, 60353, 68965, 60520, 80959, 60355, 68389, 60363, 79837, 80665, 60353, 70465, 70802, 70133, 60355, 92542, 364, 92543, 525, 11353, 364, 76273, 68374, 60353, 69060, 71958, 60353, 68364, 69516, 70848, 69715, 60354, 60355, 81425, 68831, 68637, 60353, 80698, 74131, 73382, 79309, 60355, 364, 92543, 1008, 364, 68369, 61076, 60504, 60462, 69735, 91900, 60827, 60504, 92542, 364, 92543, 525, 11353, 364, 68400, 60353, 69897, 68505, 68364, 69209, 60353, 68908, 69116, 60381, 82567, 69290, 60355, 68265, 60353, 73161, 72826, 68288, 69418, 68304, 68747, 60353, 69068, 60381, 71645, 60359, 68303, 82409, 68615, 69715, 60355, 364, 92543, 1008, 364, 82700, 69460, 70417, 60355, 86492, 60577, 69353, 68301, 60827, 60504, 92542, 364, 92543, 525, 11353, 364, 60577, 68505, 68364, 69209, 68301, 60355, 75630, 82261, 68300, 60353, 68848, 68427, 69836, 73880, 68319, 72415, 68269, 60353, 69095, 68540, 74465, 60504, 2]
633
- inputs:
634
- <s> <|im_start|> system
635
- You are an AI assistant whose name is InternLM (书生·浦语).
636
- - InternLM (书生·浦语) is a conversational language model that is developed by Shanghai AI Laboratory (上海人工智能实验室). It is designed to be helpful, honest, and harmless.
637
- - InternLM (书生·浦语) can understand and communicate fluently in the language chosen by the user such as English and 中文.<|im_end|>
638
- <|im_start|> user
639
- 心理咨询师,我觉得我的胸闷症状越来越严重了,这让我很害怕。<|im_end|>
640
- <|im_start|> assistant
641
- 我能理解你的感受,首先我们要明确你的症状并不是生理问题,而是心理问题。我们可以尝试找出引发你胸闷的心理原因。
642
- <|im_start|> user
643
- 可是我一直都在找原因,却找不到答案。<|im_end|>
644
- <|im_start|> assistant
645
- 不要着急,我们会一步一步地解决这个问题。你能告诉我,你生活中的压力和困扰吗?
646
- <|im_start|> user
647
- 我觉得我的压力主要来自于家庭,我和丈夫关系不好,他总是忙于工作,很少关心我。而且我担心我的孩子,怕他们出了什么意外。<|im_end|>
648
- <|im_start|> assistant
649
- 你能详细说说吗?比如,你和丈夫之间的问题具体是什么?
650
- <|im_start|> user
651
- 我们经常因为一些小事争吵,他总是忽略我的感受。我感到很孤独,就像被困在一个牢笼里。<|im_end|>
652
- <|im_start|> assistant
653
- 这种感觉让你想起了什么?或者,你觉得自己在这段婚姻中失去了什么?
654
- <|im_start|> user
655
- 让我想想……我觉得我失去了一个温馨的家,一个关心我的丈夫。我一直在努力维持这段婚姻,但现实却让我失望。<|im_end|>
656
- <|im_start|> assistant
657
- 你能体会到这种失望带来的情绪吗?比如,伤心、愤怒、失望?
658
- <|im_start|> user
659
- 是的,我经常会感到伤心和失望。有时候,我甚至会怀疑自己的人生是不是选错了路。<|im_end|>
660
- <|im_start|> assistant
661
- 这些情绪会影响你的日常生活吗?比如,你的睡眠、饮食、工作?
662
- <|im_start|> user
663
- 肯定的。我最近睡眠很差,总是做噩梦。而且我吃得也不好,体重一直在下降。<|im_end|>
664
- <|im_start|> assistant
665
- 了解到这些,我想告诉你,你的症状是可以改善的。我们可以通过心理治疗,帮助你走出这段困境。
666
- <|im_start|> user
667
- 真的吗?那我要如何做呢?<|im_end|>
668
- <|im_start|> assistant
669
- 首先,我们要了解你的情绪,学会面对和接纳它们。然后,我们会教你怎么表达自己的需求,让你和丈夫、孩子之间的关系得到改善。
670
- <|im_start|> user
671
- 听起来很有道理。那我们从哪里开始呢?<|im_end|>
672
- <|im_start|> assistant
673
- 从了解你的情绪开始。试着回想一下,最近一次感到伤心或者失望的时候,是什么原因导致的?</s>
674
- label_ids:
675
- [-100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, 73406, 68865, 68364, 69377, 60353, 86839, 70004, 68364, 69516, 69461, 71677, 68287, 60353, 69029, 68831, 68287, 60355, 68931, 69702, 75326, 71838, 60403, 61860, 62703, 77797, 68540, 60355, 2, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, 91781, 60353, 73161, 80540, 60415, 82098, 60355, 72010, 71404, 60353, 60403, 76153, 68912, 60381, 74112, 61076, 60504, 2, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, 72010, 69030, 71711, 61076, 60504, 68522, 60353, 86004, 71645, 68629, 68804, 68592, 69095, 60504, 2, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, 82967, 69068, 87160, 68261, 60504, 68319, 60353, 60403, 70868, 69962, 60871, 69893, 60366, 73603, 68261, 60504, 2, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, 72010, 82120, 68381, 72415, 70860, 69209, 61076, 60504, 68522, 60353, 73880, 60359, 75493, 60359, 72415, 60504, 2, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, 68374, 69209, 73175, 68364, 77514, 61076, 60504, 68522, 60353, 68364, 70033, 60359, 69441, 60359, 68273, 60504, 2, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, 76273, 68374, 60353, 69060, 71958, 60353, 68364, 69516, 70848, 69715, 60354, 60355, 81425, 68831, 68637, 60353, 80698, 74131, 73382, 79309, 60355, 2, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, 68400, 60353, 69897, 68505, 68364, 69209, 60353, 68908, 69116, 60381, 82567, 69290, 60355, 68265, 60353, 73161, 72826, 68288, 69418, 68304, 68747, 60353, 69068, 60381, 71645, 60359, 68303, 82409, 68615, 69715, 60355, 2, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, 60577, 68505, 68364, 69209, 68301, 60355, 75630, 82261, 68300, 60353, 68848, 68427, 69836, 73880, 68319, 72415, 68269, 60353, 69095, 68540, 74465, 60504, 2]
676
- labels:
677
- 我能理解你的感受,首先我们要明确你的症状并不是生理问题,而是心理问题。我们可以尝试找出引发你胸闷的心理原因。</s> 不要着急,我们会一步一步地解决这个问题。你能告诉我,你生活中的压力和困扰吗?</s> 你能详细说说吗?比如,你和丈夫之间的问题具体是什么?</s> 这种感觉让你想起了什么?或者,你觉得自己在这段婚姻中失去了什么?</s> 你能体会到这种失望带来的情绪吗?比如,伤心、愤怒、失望?</s> 这些情绪会影响你的日常生活吗?比如,你的睡眠、饮食、工作?</s> 了解到这些,我想告诉你,你的症状是可以改善的。我们可以通过心理治疗,帮助你走出这段困境。</s> 首先,我们要了解你的情绪,学会面对和接纳它们。然后,我们会教你怎么表达自己的需求,让你和丈夫、孩子之间的关系得到改善。</s> 从了解你的情绪开始。试着回想一下,最近一次感到伤心或者失望的时候,是什么原因导致的?</s>
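
Note: the dump above shows the intern2 chat template (ChatML-style <|im_start|>/<|im_end|> markers) and how label_ids keep only assistant tokens, masking everything else with -100. A minimal sketch of that masking, using hypothetical render_turn/build_example helpers rather than llmtuner's real code:

    IGNORE_INDEX = -100  # loss is skipped wherever label_ids holds this value

    def render_turn(role: str, text: str) -> str:
        return f"<|im_start|>{role}\n{text}<|im_end|>\n"

    def build_example(tokenizer, dialog):
        input_ids, labels = [tokenizer.bos_token_id], [IGNORE_INDEX]
        for role, text in dialog:
            ids = tokenizer.encode(render_turn(role, text), add_special_tokens=False)
            input_ids += ids
            # only assistant replies are supervised; system/user turns stay masked
            labels += ids if role == "assistant" else [IGNORE_INDEX] * len(ids)
        return input_ids, labels
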
678
- [INFO|training_args.py:1828] 2024-02-01 14:21:08,098 >> PyTorch: setting up devices
679
-
680
- warnings.warn(
681
- Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
682
- [INFO|trainer.py:571] 2024-02-01 14:21:08,153 >> Using auto half precision backend
683
- [2024-02-01 14:21:08,351] [INFO] [logging.py:96:log_dist] [Rank 0] DeepSpeed info: version=0.13.1, git-hash=unknown, git-branch=unknown
684
-
685
- /home/lirenhao/anaconda3/envs/llama_factory/lib/python3.10/site-packages/transformers/training_args.py:1741: FutureWarning: `--push_to_hub_token` is deprecated and will be removed in version 5 of 🤗 Transformers. Use `--hub_token` instead.
686
- warnings.warn(
687
-
688
-
689
- /home/lirenhao/anaconda3/envs/llama_factory/lib/python3.10/site-packages/transformers/training_args.py:1741: FutureWarning: `--push_to_hub_token` is deprecated and will be removed in version 5 of 🤗 Transformers. Use `--hub_token` instead.
690
- warnings.warn(
691
- /home/lirenhao/anaconda3/envs/llama_factory/lib/python3.10/site-packages/transformers/training_args.py:1741: FutureWarning: `--push_to_hub_token` is deprecated and will be removed in version 5 of 🤗 Transformers. Use `--hub_token` instead.
692
- warnings.warn(
693
- [2024-02-01 14:21:41,776] [INFO] [logging.py:96:log_dist] [Rank 0] DeepSpeed Flops Profiler Enabled: False
694
- [2024-02-01 14:21:41,778] [INFO] [logging.py:96:log_dist] [Rank 0] Using client Optimizer as basic optimizer
695
- [2024-02-01 14:21:41,778] [INFO] [logging.py:96:log_dist] [Rank 0] Removing param_group that has no 'params' in the basic Optimizer
696
- [2024-02-01 14:21:41,794] [INFO] [logging.py:96:log_dist] [Rank 0] DeepSpeed Basic Optimizer = AdamW
697
- [2024-02-01 14:21:41,794] [INFO] [utils.py:56:is_zero_supported_optimizer] Checking ZeRO support for optimizer=AdamW type=<class 'torch.optim.adamw.AdamW'>
698
- [2024-02-01 14:21:41,794] [INFO] [logging.py:96:log_dist] [Rank 0] Creating torch.float16 ZeRO stage 2 optimizer
699
- [2024-02-01 14:21:41,795] [INFO] [stage_1_and_2.py:143:__init__] Reduce bucket size 500000000
700
- [2024-02-01 14:21:41,795] [INFO] [stage_1_and_2.py:144:__init__] Allgather bucket size 500000000
701
- [2024-02-01 14:21:41,795] [INFO] [stage_1_and_2.py:145:__init__] CPU Offload: False
702
- [2024-02-01 14:21:41,795] [INFO] [stage_1_and_2.py:146:__init__] Round robin gradient partitioning: False
703
- [2024-02-01 14:22:01,253] [INFO] [utils.py:791:see_memory_usage] Before initializing optimizer states
704
- [2024-02-01 14:22:01,254] [INFO] [utils.py:792:see_memory_usage] MA 22.12 GB Max_MA 25.72 GB CA 25.85 GB Max_CA 26 GB
705
- [2024-02-01 14:22:01,254] [INFO] [utils.py:799:see_memory_usage] CPU Virtual Memory: used = 119.45 GB, percent = 12.4%
706
- [2024-02-01 14:22:01,614] [INFO] [utils.py:791:see_memory_usage] After initializing optimizer states
707
- [2024-02-01 14:22:01,615] [INFO] [utils.py:792:see_memory_usage] MA 36.53 GB Max_MA 50.95 GB CA 54.68 GB Max_CA 55 GB
708
- [2024-02-01 14:22:01,615] [INFO] [utils.py:799:see_memory_usage] CPU Virtual Memory: used = 109.91 GB, percent = 11.4%
709
- [2024-02-01 14:22:01,615] [INFO] [stage_1_and_2.py:533:__init__] optimizer state initialized
710
- [2024-02-01 14:22:01,876] [INFO] [utils.py:791:see_memory_usage] After initializing ZeRO optimizer
711
- [2024-02-01 14:22:01,877] [INFO] [utils.py:792:see_memory_usage] MA 36.53 GB Max_MA 36.53 GB CA 54.68 GB Max_CA 55 GB
712
- [2024-02-01 14:22:01,878] [INFO] [utils.py:799:see_memory_usage] CPU Virtual Memory: used = 101.72 GB, percent = 10.5%
713
- [2024-02-01 14:22:01,881] [INFO] [logging.py:96:log_dist] [Rank 0] DeepSpeed Final Optimizer = AdamW
714
- [2024-02-01 14:22:01,881] [INFO] [logging.py:96:log_dist] [Rank 0] DeepSpeed using client LR scheduler
715
- [2024-02-01 14:22:01,881] [INFO] [logging.py:96:log_dist] [Rank 0] DeepSpeed LR Scheduler = None
716
- [2024-02-01 14:22:01,881] [INFO] [logging.py:96:log_dist] [Rank 0] step=0, skipped=0, lr=[1e-06], mom=[(0.9, 0.999)]
717
- [2024-02-01 14:22:01,883] [INFO] [config.py:984:print] DeepSpeedEngine configuration:
718
- [2024-02-01 14:22:01,883] [INFO] [config.py:988:print] activation_checkpointing_config {
719
- "partition_activations": false,
720
- "contiguous_memory_optimization": false,
721
- "cpu_checkpointing": false,
722
- "number_checkpoints": null,
723
- "synchronize_checkpoint_boundary": false,
724
- "profile": false
725
- }
726
- [2024-02-01 14:22:01,883] [INFO] [config.py:988:print] aio_config ................... {'block_size': 1048576, 'queue_depth': 8, 'thread_count': 1, 'single_submit': False, 'overlap_events': True}
727
- [2024-02-01 14:22:01,883] [INFO] [config.py:988:print] amp_enabled .................. False
728
- [2024-02-01 14:22:01,883] [INFO] [config.py:988:print] amp_params ................... False
729
- [2024-02-01 14:22:01,884] [INFO] [config.py:988:print] autotuning_config ............ {
730
- "enabled": false,
731
- "start_step": null,
732
- "end_step": null,
733
- "metric_path": null,
734
- "arg_mappings": null,
735
- "metric": "throughput",
736
- "model_info": null,
737
- "results_dir": "autotuning_results",
738
- "exps_dir": "autotuning_exps",
739
- "overwrite": true,
740
- "fast": true,
741
- "start_profile_step": 3,
742
- "end_profile_step": 5,
743
- "tuner_type": "gridsearch",
744
- "tuner_early_stopping": 5,
745
- "tuner_num_trials": 50,
746
- "model_info_path": null,
747
- "mp_size": 1,
748
- "max_train_batch_size": null,
749
- "min_train_batch_size": 1,
750
- "max_train_micro_batch_size_per_gpu": 1.024000e+03,
751
- "min_train_micro_batch_size_per_gpu": 1,
752
- "num_tuning_micro_batch_sizes": 3
753
- }
754
- [2024-02-01 14:22:01,884] [INFO] [config.py:988:print] bfloat16_enabled ............. False
755
- [2024-02-01 14:22:01,884] [INFO] [config.py:988:print] checkpoint_parallel_write_pipeline False
756
- [2024-02-01 14:22:01,884] [INFO] [config.py:988:print] checkpoint_tag_validation_enabled True
757
- [2024-02-01 14:22:01,884] [INFO] [config.py:988:print] checkpoint_tag_validation_fail False
758
- [2024-02-01 14:22:01,884] [INFO] [config.py:988:print] comms_config ................. <deepspeed.comm.config.DeepSpeedCommsConfig object at 0x7f7f6152d840>
759
- [2024-02-01 14:22:01,884] [INFO] [config.py:988:print] communication_data_type ...... None
760
- [2024-02-01 14:22:01,884] [INFO] [config.py:988:print] compression_config ........... {'weight_quantization': {'shared_parameters': {'enabled': False, 'quantizer_kernel': False, 'schedule_offset': 0, 'quantize_groups': 1, 'quantize_verbose': False, 'quantization_type': 'symmetric', 'quantize_weight_in_forward': False, 'rounding': 'nearest', 'fp16_mixed_quantize': False, 'quantize_change_ratio': 0.001}, 'different_groups': {}}, 'activation_quantization': {'shared_parameters': {'enabled': False, 'quantization_type': 'symmetric', 'range_calibration': 'dynamic', 'schedule_offset': 1000}, 'different_groups': {}}, 'sparse_pruning': {'shared_parameters': {'enabled': False, 'method': 'l1', 'schedule_offset': 1000}, 'different_groups': {}}, 'row_pruning': {'shared_parameters': {'enabled': False, 'method': 'l1', 'schedule_offset': 1000}, 'different_groups': {}}, 'head_pruning': {'shared_parameters': {'enabled': False, 'method': 'topk', 'schedule_offset': 1000}, 'different_groups': {}}, 'channel_pruning': {'shared_parameters': {'enabled': False, 'method': 'l1', 'schedule_offset': 1000}, 'different_groups': {}}, 'layer_reduction': {'enabled': False}}
761
- [2024-02-01 14:22:01,884] [INFO] [config.py:988:print] curriculum_enabled_legacy .... False
762
- [2024-02-01 14:22:01,884] [INFO] [config.py:988:print] curriculum_params_legacy ..... False
763
- [2024-02-01 14:22:01,884] [INFO] [config.py:988:print] data_efficiency_config ....... {'enabled': False, 'seed': 1234, 'data_sampling': {'enabled': False, 'num_epochs': 1000, 'num_workers': 0, 'curriculum_learning': {'enabled': False}}, 'data_routing': {'enabled': False, 'random_ltd': {'enabled': False, 'layer_token_lr_schedule': {'enabled': False}}}}
764
- [2024-02-01 14:22:01,884] [INFO] [config.py:988:print] data_efficiency_enabled ...... False
765
- [2024-02-01 14:22:01,884] [INFO] [config.py:988:print] dataloader_drop_last ......... False
766
- [2024-02-01 14:22:01,884] [INFO] [config.py:988:print] disable_allgather ............ False
767
- [2024-02-01 14:22:01,884] [INFO] [config.py:988:print] dump_state ................... False
768
- [2024-02-01 14:22:01,884] [INFO] [config.py:988:print] dynamic_loss_scale_args ...... {'init_scale': 65536, 'scale_window': 1000, 'delayed_shift': 2, 'consecutive_hysteresis': False, 'min_scale': 1}
769
- [2024-02-01 14:22:01,884] [INFO] [config.py:988:print] eigenvalue_enabled ........... False
770
- [2024-02-01 14:22:01,884] [INFO] [config.py:988:print] eigenvalue_gas_boundary_resolution 1
771
- [2024-02-01 14:22:01,884] [INFO] [config.py:988:print] eigenvalue_layer_name ........ bert.encoder.layer
772
- [2024-02-01 14:22:01,884] [INFO] [config.py:988:print] eigenvalue_layer_num ......... 0
773
- [2024-02-01 14:22:01,884] [INFO] [config.py:988:print] eigenvalue_max_iter .......... 100
774
- [2024-02-01 14:22:01,884] [INFO] [config.py:988:print] eigenvalue_stability ......... 1e-06
775
- [2024-02-01 14:22:01,884] [INFO] [config.py:988:print] eigenvalue_tol ............... 0.01
776
- [2024-02-01 14:22:01,884] [INFO] [config.py:988:print] eigenvalue_verbose ........... False
777
- [2024-02-01 14:22:01,885] [INFO] [config.py:988:print] elasticity_enabled ........... False
778
- [2024-02-01 14:22:01,885] [INFO] [config.py:988:print] flops_profiler_config ........ {
779
- "enabled": false,
780
- "recompute_fwd_factor": 0.0,
781
- "profile_step": 1,
782
- "module_depth": -1,
783
- "top_modules": 1,
784
- "detailed": true,
785
- "output_file": null
786
- }
787
- [2024-02-01 14:22:01,885] [INFO] [config.py:988:print] fp16_auto_cast ............... False
788
- [2024-02-01 14:22:01,885] [INFO] [config.py:988:print] fp16_enabled ................. True
789
- [2024-02-01 14:22:01,885] [INFO] [config.py:988:print] fp16_master_weights_and_gradients False
790
- [2024-02-01 14:22:01,885] [INFO] [config.py:988:print] global_rank .................. 0
791
- [2024-02-01 14:22:01,885] [INFO] [config.py:988:print] grad_accum_dtype ............. None
792
- [2024-02-01 14:22:01,885] [INFO] [config.py:988:print] gradient_accumulation_steps .. 28
793
- [2024-02-01 14:22:01,885] [INFO] [config.py:988:print] gradient_clipping ............ 1.0
794
- [2024-02-01 14:22:01,885] [INFO] [config.py:988:print] gradient_predivide_factor .... 1.0
795
- [2024-02-01 14:22:01,885] [INFO] [config.py:988:print] graph_harvesting ............. False
796
- [2024-02-01 14:22:01,885] [INFO] [config.py:988:print] hybrid_engine ................ enabled=False max_out_tokens=512 inference_tp_size=1 release_inference_cache=False pin_parameters=True tp_gather_partition_size=8
797
- [2024-02-01 14:22:01,885] [INFO] [config.py:988:print] initial_dynamic_scale ........ 65536
798
- [2024-02-01 14:22:01,885] [INFO] [config.py:988:print] load_universal_checkpoint .... False
799
- [2024-02-01 14:22:01,885] [INFO] [config.py:988:print] loss_scale ................... 0
800
- [2024-02-01 14:22:01,885] [INFO] [config.py:988:print] memory_breakdown ............. False
801
- [2024-02-01 14:22:01,885] [INFO] [config.py:988:print] mics_hierarchial_params_gather False
802
- [2024-02-01 14:22:01,885] [INFO] [config.py:988:print] mics_shard_size .............. -1
803
- [2024-02-01 14:22:01,885] [INFO] [config.py:988:print] monitor_config ............... tensorboard=TensorBoardConfig(enabled=False, output_path='', job_name='DeepSpeedJobName') wandb=WandbConfig(enabled=False, group=None, team=None, project='deepspeed') csv_monitor=CSVConfig(enabled=False, output_path='', job_name='DeepSpeedJobName') enabled=False
804
- [2024-02-01 14:22:01,885] [INFO] [config.py:988:print] nebula_config ................ {
805
- "enabled": false,
806
- "persistent_storage_path": null,
807
- "persistent_time_interval": 100,
808
- "num_of_version_in_retention": 2,
809
- "enable_nebula_load": true,
810
- "load_path": null
811
- }
812
- [2024-02-01 14:22:01,885] [INFO] [config.py:988:print] optimizer_legacy_fusion ...... False
813
- [2024-02-01 14:22:01,885] [INFO] [config.py:988:print] optimizer_name ............... None
814
- [2024-02-01 14:22:01,886] [INFO] [config.py:988:print] optimizer_params ............. None
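
Note: the InternLM2Config above fully determines the parameter count the log reports later ("trainable params: 7737708544"). A back-of-the-envelope check (my own arithmetic, not log output):

    h, inter, L, V = 4096, 14336, 32, 92544
    n_heads, n_kv, head_dim = 32, 8, 4096 // 32            # grouped-query attention
    attn = h * (n_heads + 2 * n_kv) * head_dim + h * h     # packed wqkv + wo, bias=False
    mlp = 3 * h * inter                                    # gate, up and down projections
    norms = 2 * h                                          # two RMSNorms per layer
    total = 2 * V * h + L * (attn + mlp + norms) + h       # embeddings + untied lm head + final norm
    assert total == 7_737_708_544
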
815
- [2024-02-01 14:22:01,886] [INFO] [config.py:988:print] pipeline ..................... {'stages': 'auto', 'partition': 'best', 'seed_layers': False, 'activation_checkpoint_interval': 0, 'pipe_partitioned': True, 'grad_partitioned': True}
816
- [2024-02-01 14:22:01,886] [INFO] [config.py:988:print] pld_enabled .................. False
817
- [2024-02-01 14:22:01,886] [INFO] [config.py:988:print] pld_params ................... False
818
- [2024-02-01 14:22:01,886] [INFO] [config.py:988:print] prescale_gradients ........... False
819
- [2024-02-01 14:22:01,886] [INFO] [config.py:988:print] scheduler_name ............... None
820
- [2024-02-01 14:22:01,886] [INFO] [config.py:988:print] scheduler_params ............. None
821
- [2024-02-01 14:22:01,886] [INFO] [config.py:988:print] seq_parallel_communication_data_type torch.float32
822
- [2024-02-01 14:22:01,886] [INFO] [config.py:988:print] sparse_attention ............. None
823
- [2024-02-01 14:22:01,886] [INFO] [config.py:988:print] sparse_gradients_enabled ..... False
824
- [2024-02-01 14:22:01,886] [INFO] [config.py:988:print] steps_per_print .............. inf
825
- [2024-02-01 14:22:01,886] [INFO] [config.py:988:print] train_batch_size ............. 448
826
- [2024-02-01 14:22:01,886] [INFO] [config.py:988:print] train_micro_batch_size_per_gpu 4
827
- [2024-02-01 14:22:01,886] [INFO] [config.py:988:print] use_data_before_expert_parallel_ False
828
- [2024-02-01 14:22:01,886] [INFO] [config.py:988:print] use_node_local_storage ....... False
829
- [2024-02-01 14:22:01,886] [INFO] [config.py:988:print] wall_clock_breakdown ......... False
830
- [2024-02-01 14:22:01,886] [INFO] [config.py:988:print] weight_quantization_config ... None
831
- [2024-02-01 14:22:01,886] [INFO] [config.py:988:print] world_size ................... 4
832
- [2024-02-01 14:22:01,886] [INFO] [config.py:988:print] zero_allow_untested_optimizer True
833
- [2024-02-01 14:22:01,886] [INFO] [config.py:988:print] zero_config .................. stage=2 contiguous_gradients=True reduce_scatter=True reduce_bucket_size=500000000 use_multi_rank_bucket_allreduce=True allgather_partitions=True allgather_bucket_size=500000000 overlap_comm=False load_from_fp32_weights=True elastic_checkpoint=False offload_param=None offload_optimizer=None sub_group_size=1,000,000,000 cpu_offload_param=None cpu_offload_use_pin_memory=None cpu_offload=None prefetch_bucket_size=50,000,000 param_persistence_threshold=100,000 model_persistence_threshold=sys.maxsize max_live_parameters=1,000,000,000 max_reuse_distance=1,000,000,000 gather_16bit_weights_on_model_save=False stage3_gather_fp16_weights_on_model_save=False ignore_unused_parameters=True legacy_stage1=False round_robin_gradients=False zero_hpz_partition_size=1 zero_quantized_weights=False zero_quantized_nontrainable_weights=False zero_quantized_gradients=False mics_shard_size=-1 mics_hierarchical_params_gather=False memory_efficient_linear=True pipeline_loading_checkpoint=False override_module_apply=True
834
- [2024-02-01 14:22:01,886] [INFO] [config.py:988:print] zero_enabled ................. True
835
- [2024-02-01 14:22:01,886] [INFO] [config.py:988:print] zero_force_ds_cpu_optimizer .. True
836
- [2024-02-01 14:22:01,886] [INFO] [config.py:988:print] zero_optimization_stage ...... 2
837
- [2024-02-01 14:22:01,887] [INFO] [config.py:974:print_user_config] json = {
838
- "train_batch_size": 448,
839
- "train_micro_batch_size_per_gpu": 4,
840
- "gradient_accumulation_steps": 28,
841
- "gradient_clipping": 1.0,
842
- "zero_allow_untested_optimizer": true,
843
- "fp16": {
844
- "enabled": true,
845
- "loss_scale": 0,
846
- "initial_scale_power": 16,
847
- "loss_scale_window": 1000,
848
- "hysteresis": 2,
849
- "min_loss_scale": 1
850
- },
851
- "zero_optimization": {
852
- "stage": 2,
853
- "allgather_partitions": true,
854
- "allgather_bucket_size": 5.000000e+08,
855
- "reduce_scatter": true,
856
- "reduce_bucket_size": 5.000000e+08,
857
- "overlap_comm": false,
858
- "contiguous_gradients": true
859
- },
860
- "steps_per_print": inf,
861
- "bf16": {
862
- "enabled": false
863
- }
864
- }
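
Note: a minimal sketch of consuming this exact JSON with the plain DeepSpeed API (LLaMA-Factory itself lets the HF Trainer do this wiring via --deepspeed ds_config.json); the toy model is a stand-in, and the script must run under the launcher, e.g. deepspeed --num_gpus 4 train.py:

    import deepspeed
    import torch
    model = torch.nn.Linear(8, 8)  # stand-in for InternLM2ForCausalLM
    engine, optimizer, _, _ = deepspeed.initialize(
        model=model, model_parameters=model.parameters(), config="ds_config.json")
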
865
- [INFO|trainer.py:1721] 2024-02-01 14:22:01,887 >> ***** Running training *****
866
- [INFO|trainer.py:1722] 2024-02-01 14:22:01,887 >> Num examples = 3,134
867
- [INFO|trainer.py:1723] 2024-02-01 14:22:01,887 >> Num Epochs = 9
868
- [INFO|trainer.py:1724] 2024-02-01 14:22:01,887 >> Instantaneous batch size per device = 4
869
- [INFO|trainer.py:1727] 2024-02-01 14:22:01,887 >> Total train batch size (w. parallel, distributed & accumulation) = 448
870
- [INFO|trainer.py:1728] 2024-02-01 14:22:01,887 >> Gradient Accumulation steps = 28
871
- [INFO|trainer.py:1729] 2024-02-01 14:22:01,887 >> Total optimization steps = 63
872
- [INFO|trainer.py:1730] 2024-02-01 14:22:01,889 >> Number of trainable parameters = 7,737,708,544
873
-
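
Note: the run summary above is internally consistent; checking the arithmetic (mine, not logged output):

    import math
    micro, gpus, accum = 4, 4, 28
    train_batch = micro * gpus * accum                 # 4 * 4 * 28 = 448
    steps_per_epoch = math.ceil(3134 / train_batch)    # 3,134 examples -> 7 steps
    assert train_batch == 448 and 9 * steps_per_epoch == 63  # 9 epochs -> 63 steps
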
874
-   0%| | 0/63 [00:00<?, ?it/s]/home/lirenhao/anaconda3/envs/llama_factory/lib/python3.10/site-packages/torch/utils/checkpoint.py:429: UserWarning: torch.utils.checkpoint: please pass in use_reentrant=True or use_reentrant=False explicitly. The default value of use_reentrant will be updated to be False in the future. To maintain current behavior, pass use_reentrant=True. It is recommended that you use use_reentrant=False. Refer to docs for more details on the differences between the two variants.
875
- warnings.warn(
876
- /home/lirenhao/anaconda3/envs/llama_factory/lib/python3.10/site-packages/torch/utils/checkpoint.py:429: UserWarning: torch.utils.checkpoint: please pass in use_reentrant=True or use_reentrant=False explicitly. The default value of use_reentrant will be updated to be False in the future. To maintain current behavior, pass use_reentrant=True. It is recommended that you use use_reentrant=False. Refer to docs for more details on the differences between the two variants.
877
- warnings.warn(
878
- /home/lirenhao/anaconda3/envs/llama_factory/lib/python3.10/site-packages/torch/utils/checkpoint.py:429: UserWarning: torch.utils.checkpoint: please pass in use_reentrant=True or use_reentrant=False explicitly. The default value of use_reentrant will be updated to be False in the future. To maintain current behavior, pass use_reentrant=True. It is recommended that you use use_reentrant=False. Refer to docs for more details on the differences between the two variants.
879
- warnings.warn(
880
- /home/lirenhao/anaconda3/envs/llama_factory/lib/python3.10/site-packages/torch/utils/checkpoint.py:429: UserWarning: torch.utils.checkpoint: please pass in use_reentrant=True or use_reentrant=False explicitly. The default value of use_reentrant will be updated to be False in the future. To maintain current behavior, pass use_reentrant=True. It is recommended that you use use_reentrant=False. Refer to docs for more details on the differences between the two variants.
881
- warnings.warn(
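
Note: the repeated use_reentrant warnings come from gradient checkpointing being enabled without an explicit choice. One way to opt in explicitly (assuming the transformers >= 4.35 API, which the logged 4.37.1 has) would be:

    from transformers import AutoModelForCausalLM
    model = AutoModelForCausalLM.from_pretrained(
        "/home/lirenhao/pretrained_models/internlm2-chat-7b/", trust_remote_code=True)
    model.gradient_checkpointing_enable(
        gradient_checkpointing_kwargs={"use_reentrant": False})
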
882
- /home/lirenhao/anaconda3/envs/llama_factory/lib/python3.10/site-packages/deepspeed/runtime/zero/stage_1_and_2.py:1968: UserWarning: The torch.cuda.*DtypeTensor constructors are no longer recommended. It's best to use methods such as torch.tensor(data, dtype=*, device='cuda') to create tensors. (Triggered internally at ../torch/csrc/tensor/python_tensor.cpp:83.)
883
- overflow_gpu = get_accelerator().ByteTensor([overflow])
884
- /home/lirenhao/anaconda3/envs/llama_factory/lib/python3.10/site-packages/deepspeed/runtime/zero/stage_1_and_2.py:1968: UserWarning: The torch.cuda.*DtypeTensor constructors are no longer recommended. It's best to use methods such as torch.tensor(data, dtype=*, device='cuda') to create tensors. (Triggered internally at ../torch/csrc/tensor/python_tensor.cpp:83.)
885
- overflow_gpu = get_accelerator().ByteTensor([overflow])
886
- /home/lirenhao/anaconda3/envs/llama_factory/lib/python3.10/site-packages/deepspeed/runtime/zero/stage_1_and_2.py:1968: UserWarning: The torch.cuda.*DtypeTensor constructors are no longer recommended. It's best to use methods such as torch.tensor(data, dtype=*, device='cuda') to create tensors. (Triggered internally at ../torch/csrc/tensor/python_tensor.cpp:83.)
887
- overflow_gpu = get_accelerator().ByteTensor([overflow])
888
- /home/lirenhao/anaconda3/envs/llama_factory/lib/python3.10/site-packages/deepspeed/runtime/zero/stage_1_and_2.py:1968: UserWarning: The torch.cuda.*DtypeTensor constructors are no longer recommended. It's best to use methods such as torch.tensor(data, dtype=*, device='cuda') to create tensors. (Triggered internally at ../torch/csrc/tensor/python_tensor.cpp:83.)
889
- overflow_gpu = get_accelerator().ByteTensor([overflow])
890
-
891
-   2%|▏ | 1/63 [00:44<45:32, 44.06s/it]
892
-   3%|▎ | 2/63 [01:23<42:01, 41.33s/it]
893
-   5%|▍ | 3/63 [02:04<41:04, 41.08s/it]
894
-   6%|▋ | 4/63 [02:43<39:37, 40.30s/it]
895
-   8%|▊ | 5/63 [03:22<38:23, 39.72s/it][2024-02-01 14:26:04,941] [INFO] [loss_scaler.py:190:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 65536, but hysteresis is 2. Reducing hysteresis to 1
896
- 
897
-  10%|▉ | 6/63 [04:03<38:08, 40.15s/it][2024-02-01 14:26:44,502] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 65536, reducing to 32768
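
Note: the two OVERFLOW lines above show fp16 dynamic loss scaling at work: the first overflow consumes one unit of hysteresis (2 -> 1) without touching the scale, the second halves the scale (65536 -> 32768). A simplified sketch of that update rule (DeepSpeed's real loss scaler also re-grows the scale after a stable window):

    class DynamicLossScaler:
        def __init__(self, init_scale=65536, hysteresis=2):
            self.cur_scale, self.hysteresis = init_scale, hysteresis

        def update_scale(self, overflow: bool):
            if overflow:                      # the optimizer step is skipped either way
                if self.hysteresis > 1:
                    self.hysteresis -= 1      # tolerate this overflow, keep the scale
                else:
                    self.cur_scale = max(self.cur_scale / 2, 1)  # halve the scale
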
898
-
899
  11%|█ | 7/63 [04:42<37:17, 39.96s/it]
900
  13%|█▎ | 8/63 [05:21<36:23, 39.71s/it]
901
  14%|█▍ | 9/63 [06:01<35:39, 39.62s/it]
902
  16%|█▌ | 10/63 [06:41<35:16, 39.93s/it]
903
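The two OVERFLOW lines above are DeepSpeed's dynamic fp16 loss scaler: on an overflow it skips the optimizer step, and once the hysteresis budget is used up it halves the scale. A minimal sketch of that behavior (an illustration under those assumptions, not DeepSpeed's actual loss_scaler.py):

    class DynamicLossScaler:
        """Toy dynamic loss scaler reproducing the two log messages above."""

        def __init__(self, init_scale=65536, hysteresis=2, scale_factor=2):
            self.scale = init_scale
            self.hysteresis = hysteresis      # overflows tolerated before shrinking the scale
            self.cur_hysteresis = hysteresis
            self.scale_factor = scale_factor

        def update(self, overflow):
            if overflow:
                if self.cur_hysteresis > 1:
                    # step 5: "hysteresis is 2. Reducing hysteresis to 1"
                    self.cur_hysteresis -= 1
                else:
                    # step 6: "Attempted loss scale: 65536, reducing to 32768"
                    self.scale /= self.scale_factor
                return False                  # gradients overflowed: skip this step
            self.cur_hysteresis = self.hysteresis
            return True

    scaler = DynamicLossScaler()
    assert scaler.update(True) is False and scaler.scale == 65536  # hysteresis absorbs the 1st overflow
    assert scaler.update(True) is False and scaler.scale == 32768  # 2nd overflow halves the scale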
 17%|█▋ | 11/63 [07:20<34:22, 39.65s/it]
 19%|█▉ | 12/63 [08:00<33:47, 39.76s/it]
 21%|██ | 13/63 [08:39<32:56, 39.53s/it]
 22%|██▏ | 14/63 [09:20<32:32, 39.85s/it]
 24%|██▍ | 15/63 [09:59<31:45, 39.69s/it]
 25%|██▌ | 16/63 [10:38<30:47, 39.31s/it]
 27%|██▋ | 17/63 [11:19<30:31, 39.82s/it]
 29%|██▊ | 18/63 [11:58<29:51, 39.81s/it]
 30%|███ | 19/63 [12:39<29:15, 39.89s/it]
 32%|███▏ | 20/63 [13:19<28:42, 40.06s/it]
 33%|███▎ | 21/63 [13:59<27:59, 39.99s/it][INFO|trainer.py:2926] 2024-02-01 14:36:12,897 >> Saving model checkpoint to /home/lirenhao/projects/LLaMA-Factory/output/9f100e26-d997-46e8-afee-721977a16ca9/tmp-checkpoint-21
- [INFO|configuration_utils.py:473] 2024-02-01 14:36:12,902 >> Configuration saved in /home/lirenhao/projects/LLaMA-Factory/output/9f100e26-d997-46e8-afee-721977a16ca9/tmp-checkpoint-21/config.json
- [INFO|configuration_utils.py:594] 2024-02-01 14:36:12,903 >> Configuration saved in /home/lirenhao/projects/LLaMA-Factory/output/9f100e26-d997-46e8-afee-721977a16ca9/tmp-checkpoint-21/generation_config.json
- [INFO|modeling_utils.py:2503] 2024-02-01 14:36:40,422 >> The model is bigger than the maximum size per checkpoint (5GB) and is going to be split in 4 checkpoint shards. You can find where each parameters has been saved in the index located at /home/lirenhao/projects/LLaMA-Factory/output/9f100e26-d997-46e8-afee-721977a16ca9/tmp-checkpoint-21/model.safetensors.index.json.
- [INFO|tokenization_utils_base.py:2433] 2024-02-01 14:36:40,424 >> tokenizer config file saved in /home/lirenhao/projects/LLaMA-Factory/output/9f100e26-d997-46e8-afee-721977a16ca9/tmp-checkpoint-21/tokenizer_config.json
- [INFO|tokenization_utils_base.py:2442] 2024-02-01 14:36:40,424 >> Special tokens file saved in /home/lirenhao/projects/LLaMA-Factory/output/9f100e26-d997-46e8-afee-721977a16ca9/tmp-checkpoint-21/special_tokens_map.json
- [2024-02-01 14:36:41,670] [INFO] [logging.py:96:log_dist] [Rank 0] [Torch] Checkpoint global_step21 is about to be saved!
- /home/lirenhao/anaconda3/envs/llama_factory/lib/python3.10/site-packages/torch/nn/modules/module.py:1879: UserWarning: Positional args are being deprecated, use kwargs instead. Refer to https://pytorch.org/docs/master/generated/torch.nn.Module.html#torch.nn.Module.state_dict for details.
- warnings.warn(
- [2024-02-01 14:36:41,683] [INFO] [logging.py:96:log_dist] [Rank 0] Saving model checkpoint: /home/lirenhao/projects/LLaMA-Factory/output/9f100e26-d997-46e8-afee-721977a16ca9/tmp-checkpoint-21/global_step21/mp_rank_00_model_states.pt
- [2024-02-01 14:36:41,684] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /home/lirenhao/projects/LLaMA-Factory/output/9f100e26-d997-46e8-afee-721977a16ca9/tmp-checkpoint-21/global_step21/mp_rank_00_model_states.pt...
- [2024-02-01 14:37:17,058] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /home/lirenhao/projects/LLaMA-Factory/output/9f100e26-d997-46e8-afee-721977a16ca9/tmp-checkpoint-21/global_step21/mp_rank_00_model_states.pt.
- [2024-02-01 14:37:17,061] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /home/lirenhao/projects/LLaMA-Factory/output/9f100e26-d997-46e8-afee-721977a16ca9/tmp-checkpoint-21/global_step21/zero_pp_rank_0_mp_rank_00_optim_states.pt...
- [2024-02-01 14:38:15,362] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /home/lirenhao/projects/LLaMA-Factory/output/9f100e26-d997-46e8-afee-721977a16ca9/tmp-checkpoint-21/global_step21/zero_pp_rank_0_mp_rank_00_optim_states.pt.
- [2024-02-01 14:38:15,363] [INFO] [engine.py:3477:_save_zero_checkpoint] zero checkpoint saved /home/lirenhao/projects/LLaMA-Factory/output/9f100e26-d997-46e8-afee-721977a16ca9/tmp-checkpoint-21/global_step21/zero_pp_rank_0_mp_rank_00_optim_states.pt
- [2024-02-01 14:38:15,363] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint global_step21 is ready now!
- /home/lirenhao/anaconda3/envs/llama_factory/lib/python3.10/site-packages/torch/utils/checkpoint.py:429: UserWarning: torch.utils.checkpoint: please pass in use_reentrant=True or use_reentrant=False explicitly. The default value of use_reentrant will be updated to be False in the future. To maintain current behavior, pass use_reentrant=True. It is recommended that you use use_reentrant=False. Refer to docs for more details on the differences between the two variants.
- warnings.warn(
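Each tmp-checkpoint-N directory above holds both sharded safetensors weights and a DeepSpeed global_stepN folder with the ZeRO model/optimizer states. A hedged sketch of consolidating those ZeRO shards into a single fp32 state dict, assuming DeepSpeed's documented zero_to_fp32 helper (verify the import against your installed DeepSpeed version; the output filename is a hypothetical choice):

    import torch
    from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint

    ckpt_dir = "/home/lirenhao/projects/LLaMA-Factory/output/9f100e26-d997-46e8-afee-721977a16ca9/tmp-checkpoint-21"
    # Reads global_step21/* and merges the ZeRO partitions into one fp32 state dict on CPU.
    state_dict = get_fp32_state_dict_from_zero_checkpoint(ckpt_dir)
    torch.save(state_dict, "consolidated_fp32.bin")  # hypothetical output name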
 35%|███▍ | 22/63 [16:52<54:35, 79.88s/it]
 37%|███▋ | 23/63 [17:32<45:23, 68.08s/it]
 38%|███▊ | 24/63 [18:15<39:17, 60.44s/it]
 40%|███▉ | 25/63 [18:54<34:12, 54.02s/it]
 41%|████▏ | 26/63 [19:33<30:33, 49.54s/it]
 43%|████▎ | 27/63 [20:12<27:49, 46.38s/it]
 44%|████▍ | 28/63 [20:51<25:45, 44.17s/it]
 46%|████▌ | 29/63 [21:31<24:19, 42.92s/it]
 48%|████▊ | 30/63 [22:11<23:07, 42.06s/it]
 49%|████▉ | 31/63 [22:52<22:17, 41.80s/it]
 51%|█████ | 32/63 [23:32<21:11, 41.02s/it]
 52%|█████▏ | 33/63 [24:10<20:06, 40.20s/it]
 54%|█████▍ | 34/63 [24:49<19:18, 39.96s/it]
 56%|█████▌ | 35/63 [25:30<18:43, 40.11s/it]
 57%|█████▋ | 36/63 [26:10<18:03, 40.13s/it]
 59%|█████▊ | 37/63 [26:49<17:12, 39.70s/it]
 60%|██████ | 38/63 [27:29<16:36, 39.88s/it]
 62%|██████▏ | 39/63 [28:08<15:48, 39.51s/it]
 63%|██████▎ | 40/63 [28:46<15:04, 39.34s/it]
 65%|██████▌ | 41/63 [29:27<14:36, 39.84s/it]
 67%|██████▋ | 42/63 [30:07<13:57, 39.87s/it][INFO|trainer.py:2926] 2024-02-01 14:52:21,426 >> Saving model checkpoint to /home/lirenhao/projects/LLaMA-Factory/output/9f100e26-d997-46e8-afee-721977a16ca9/tmp-checkpoint-42
- [INFO|configuration_utils.py:473] 2024-02-01 14:52:21,431 >> Configuration saved in /home/lirenhao/projects/LLaMA-Factory/output/9f100e26-d997-46e8-afee-721977a16ca9/tmp-checkpoint-42/config.json
- [INFO|configuration_utils.py:594] 2024-02-01 14:52:21,432 >> Configuration saved in /home/lirenhao/projects/LLaMA-Factory/output/9f100e26-d997-46e8-afee-721977a16ca9/tmp-checkpoint-42/generation_config.json
- [INFO|modeling_utils.py:2503] 2024-02-01 14:52:48,702 >> The model is bigger than the maximum size per checkpoint (5GB) and is going to be split in 4 checkpoint shards. You can find where each parameters has been saved in the index located at /home/lirenhao/projects/LLaMA-Factory/output/9f100e26-d997-46e8-afee-721977a16ca9/tmp-checkpoint-42/model.safetensors.index.json.
- [INFO|tokenization_utils_base.py:2433] 2024-02-01 14:52:48,704 >> tokenizer config file saved in /home/lirenhao/projects/LLaMA-Factory/output/9f100e26-d997-46e8-afee-721977a16ca9/tmp-checkpoint-42/tokenizer_config.json
- [INFO|tokenization_utils_base.py:2442] 2024-02-01 14:52:48,704 >> Special tokens file saved in /home/lirenhao/projects/LLaMA-Factory/output/9f100e26-d997-46e8-afee-721977a16ca9/tmp-checkpoint-42/special_tokens_map.json
- [2024-02-01 14:52:49,843] [INFO] [logging.py:96:log_dist] [Rank 0] [Torch] Checkpoint global_step42 is about to be saved!
- /home/lirenhao/anaconda3/envs/llama_factory/lib/python3.10/site-packages/torch/nn/modules/module.py:1879: UserWarning: Positional args are being deprecated, use kwargs instead. Refer to https://pytorch.org/docs/master/generated/torch.nn.Module.html#torch.nn.Module.state_dict for details.
- warnings.warn(
- [2024-02-01 14:52:49,856] [INFO] [logging.py:96:log_dist] [Rank 0] Saving model checkpoint: /home/lirenhao/projects/LLaMA-Factory/output/9f100e26-d997-46e8-afee-721977a16ca9/tmp-checkpoint-42/global_step42/mp_rank_00_model_states.pt
- [2024-02-01 14:52:49,856] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /home/lirenhao/projects/LLaMA-Factory/output/9f100e26-d997-46e8-afee-721977a16ca9/tmp-checkpoint-42/global_step42/mp_rank_00_model_states.pt...
- [2024-02-01 14:53:25,041] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /home/lirenhao/projects/LLaMA-Factory/output/9f100e26-d997-46e8-afee-721977a16ca9/tmp-checkpoint-42/global_step42/mp_rank_00_model_states.pt.
- [2024-02-01 14:53:25,044] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /home/lirenhao/projects/LLaMA-Factory/output/9f100e26-d997-46e8-afee-721977a16ca9/tmp-checkpoint-42/global_step42/zero_pp_rank_0_mp_rank_00_optim_states.pt...
- [2024-02-01 14:54:24,364] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /home/lirenhao/projects/LLaMA-Factory/output/9f100e26-d997-46e8-afee-721977a16ca9/tmp-checkpoint-42/global_step42/zero_pp_rank_0_mp_rank_00_optim_states.pt.
- [2024-02-01 14:54:24,364] [INFO] [engine.py:3477:_save_zero_checkpoint] zero checkpoint saved /home/lirenhao/projects/LLaMA-Factory/output/9f100e26-d997-46e8-afee-721977a16ca9/tmp-checkpoint-42/global_step42/zero_pp_rank_0_mp_rank_00_optim_states.pt
- [2024-02-01 14:54:24,364] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint global_step42 is ready now!
- /home/lirenhao/anaconda3/envs/llama_factory/lib/python3.10/site-packages/torch/utils/checkpoint.py:429: UserWarning: torch.utils.checkpoint: please pass in use_reentrant=True or use_reentrant=False explicitly. The default value of use_reentrant will be updated to be False in the future. To maintain current behavior, pass use_reentrant=True. It is recommended that you use use_reentrant=False. Refer to docs for more details on the differences between the two variants.
- warnings.warn(
 68%|██████▊ | 43/63 [33:01<26:37, 79.86s/it]
 70%|██████▉ | 44/63 [33:41<21:31, 67.96s/it]
 71%|███████▏ | 45/63 [34:20<17:47, 59.29s/it]
 73%|███████▎ | 46/63 [35:01<15:13, 53.75s/it]
 75%|███████▍ | 47/63 [35:41<13:13, 49.58s/it]
 76%|███████▌ | 48/63 [36:21<11:40, 46.71s/it]
 78%|███████▊ | 49/63 [37:00<10:25, 44.69s/it]
 79%|███████▉ | 50/63 [37:42<09:26, 43.60s/it]
 81%|████████ | 51/63 [38:20<08:24, 42.07s/it]
 83%|████████▎ | 52/63 [39:00<07:34, 41.29s/it]
 84%|████████▍ | 53/63 [39:41<06:52, 41.22s/it]
 86%|████████▌ | 54/63 [40:21<06:07, 40.87s/it]
 87%|████████▋ | 55/63 [41:00<05:22, 40.28s/it]
 89%|████████▉ | 56/63 [41:38<04:39, 39.88s/it]
 90%|█████████ | 57/63 [42:18<03:58, 39.78s/it]
 92%|█████████▏| 58/63 [42:59<03:20, 40.10s/it]
 94%|█████████▎| 59/63 [43:39<02:40, 40.24s/it]
 95%|█████████▌| 60/63 [44:19<02:00, 40.04s/it]
 97%|█████████▋| 61/63 [44:58<01:19, 39.67s/it]
 98%|█████████▊| 62/63 [45:38<00:39, 39.80s/it]
- [INFO|configuration_utils.py:473] 2024-02-01 15:08:30,328 >> Configuration saved in /home/lirenhao/projects/LLaMA-Factory/output/9f100e26-d997-46e8-afee-721977a16ca9/tmp-checkpoint-63/config.json
- [INFO|configuration_utils.py:594] 2024-02-01 15:08:30,329 >> Configuration saved in /home/lirenhao/projects/LLaMA-Factory/output/9f100e26-d997-46e8-afee-721977a16ca9/tmp-checkpoint-63/generation_config.json
- [INFO|modeling_utils.py:2503] 2024-02-01 15:08:57,391 >> The model is bigger than the maximum size per checkpoint (5GB) and is going to be split in 4 checkpoint shards. You can find where each parameters has been saved in the index located at /home/lirenhao/projects/LLaMA-Factory/output/9f100e26-d997-46e8-afee-721977a16ca9/tmp-checkpoint-63/model.safetensors.index.json.
- [INFO|tokenization_utils_base.py:2433] 2024-02-01 15:08:57,393 >> tokenizer config file saved in /home/lirenhao/projects/LLaMA-Factory/output/9f100e26-d997-46e8-afee-721977a16ca9/tmp-checkpoint-63/tokenizer_config.json
- [INFO|tokenization_utils_base.py:2442] 2024-02-01 15:08:57,393 >> Special tokens file saved in /home/lirenhao/projects/LLaMA-Factory/output/9f100e26-d997-46e8-afee-721977a16ca9/tmp-checkpoint-63/special_tokens_map.json
- [2024-02-01 15:08:58,595] [INFO] [logging.py:96:log_dist] [Rank 0] [Torch] Checkpoint global_step63 is about to be saved!
- /home/lirenhao/anaconda3/envs/llama_factory/lib/python3.10/site-packages/torch/nn/modules/module.py:1879: UserWarning: Positional args are being deprecated, use kwargs instead. Refer to https://pytorch.org/docs/master/generated/torch.nn.Module.html#torch.nn.Module.state_dict for details.
- warnings.warn(
- [2024-02-01 15:08:58,608] [INFO] [logging.py:96:log_dist] [Rank 0] Saving model checkpoint: /home/lirenhao/projects/LLaMA-Factory/output/9f100e26-d997-46e8-afee-721977a16ca9/tmp-checkpoint-63/global_step63/mp_rank_00_model_states.pt
- [2024-02-01 15:08:58,608] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /home/lirenhao/projects/LLaMA-Factory/output/9f100e26-d997-46e8-afee-721977a16ca9/tmp-checkpoint-63/global_step63/mp_rank_00_model_states.pt...
- [2024-02-01 15:09:33,948] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /home/lirenhao/projects/LLaMA-Factory/output/9f100e26-d997-46e8-afee-721977a16ca9/tmp-checkpoint-63/global_step63/mp_rank_00_model_states.pt.
- [2024-02-01 15:09:33,951] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /home/lirenhao/projects/LLaMA-Factory/output/9f100e26-d997-46e8-afee-721977a16ca9/tmp-checkpoint-63/global_step63/zero_pp_rank_0_mp_rank_00_optim_states.pt...
- [2024-02-01 15:10:31,865] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /home/lirenhao/projects/LLaMA-Factory/output/9f100e26-d997-46e8-afee-721977a16ca9/tmp-checkpoint-63/global_step63/zero_pp_rank_0_mp_rank_00_optim_states.pt.
- [2024-02-01 15:10:31,866] [INFO] [engine.py:3477:_save_zero_checkpoint] zero checkpoint saved /home/lirenhao/projects/LLaMA-Factory/output/9f100e26-d997-46e8-afee-721977a16ca9/tmp-checkpoint-63/global_step63/zero_pp_rank_0_mp_rank_00_optim_states.pt
- [2024-02-01 15:10:31,866] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint global_step63 is ready now!
- [INFO|trainer.py:1962] 2024-02-01 15:10:32,863 >>
- Training completed. Do not forget to share your model on huggingface.co/models =)
- [INFO|trainer.py:2926] 2024-02-01 15:10:44,639 >> Saving model checkpoint to /home/lirenhao/projects/LLaMA-Factory/output/9f100e26-d997-46e8-afee-721977a16ca9
- [INFO|configuration_utils.py:473] 2024-02-01 15:10:44,787 >> Configuration saved in /home/lirenhao/projects/LLaMA-Factory/output/9f100e26-d997-46e8-afee-721977a16ca9/config.json
- [INFO|configuration_utils.py:594] 2024-02-01 15:10:44,788 >> Configuration saved in /home/lirenhao/projects/LLaMA-Factory/output/9f100e26-d997-46e8-afee-721977a16ca9/generation_config.json
- [2024-02-01 15:10:49,742] [INFO] [launch.py:347:main] Process 3771596 exits successfully.
- [2024-02-01 15:10:49,742] [INFO] [launch.py:347:main] Process 3771597 exits successfully.
- [2024-02-01 15:10:49,742] [INFO] [launch.py:347:main] Process 3771598 exits successfully.
- [INFO|modeling_utils.py:2503] 2024-02-01 15:11:12,707 >> The model is bigger than the maximum size per checkpoint (5GB) and is going to be split in 4 checkpoint shards. You can find where each parameters has been saved in the index located at /home/lirenhao/projects/LLaMA-Factory/output/9f100e26-d997-46e8-afee-721977a16ca9/model.safetensors.index.json.
- [INFO|tokenization_utils_base.py:2433] 2024-02-01 15:11:12,709 >> tokenizer config file saved in /home/lirenhao/projects/LLaMA-Factory/output/9f100e26-d997-46e8-afee-721977a16ca9/tokenizer_config.json
- [INFO|tokenization_utils_base.py:2442] 2024-02-01 15:11:12,709 >> Special tokens file saved in /home/lirenhao/projects/LLaMA-Factory/output/9f100e26-d997-46e8-afee-721977a16ca9/special_tokens_map.json
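The final model lands in the output directory as four safetensors shards plus model.safetensors.index.json, which transformers resolves automatically. A hedged loading sketch (trust_remote_code=True is an assumption here, since internlm2-style models ship custom modeling code):

    from transformers import AutoModelForCausalLM, AutoTokenizer

    output_dir = "/home/lirenhao/projects/LLaMA-Factory/output/9f100e26-d997-46e8-afee-721977a16ca9"
    tokenizer = AutoTokenizer.from_pretrained(output_dir, trust_remote_code=True)
    # from_pretrained reads model.safetensors.index.json and loads all four shards.
    model = AutoModelForCausalLM.from_pretrained(output_dir, trust_remote_code=True)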
- ***** train metrics *****
- epoch = 9.0
- train_loss = 1.4982
- train_runtime = 0:48:30.97
- train_samples_per_second = 9.69
- train_steps_per_second = 0.022
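A quick consistency check on these metrics, derivable from the log alone (63 optimizer steps over the run):

    # train_runtime 0:48:30.97 expressed in seconds
    runtime_s = 48 * 60 + 30.97                 # 2910.97 s
    steps = 63
    print(round(steps / runtime_s, 3))          # 0.022 -> matches train_steps_per_second
    # Samples per optimizer step implied by the reported throughput:
    print(round(9.69 * runtime_s / steps))      # ~448 samples per step (effective global batch)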
- Figure saved: /home/lirenhao/projects/LLaMA-Factory/output/9f100e26-d997-46e8-afee-721977a16ca9/training_loss.png
- 02/01/2024 15:11:14 - WARNING - llmtuner.extras.ploting - No metric eval_loss to plot.
- [INFO|modelcard.py:452] 2024-02-01 15:11:14,095 >> Dropping the following result as it does not have all the necessary fields:
- {'task': {'name': 'Causal Language Modeling', 'type': 'text-generation'}}
- [2024-02-01 15:11:17,773] [INFO] [launch.py:347:main] Process 3771595 exits successfully.